aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c1
-rw-r--r--fs/9p/fid.c3
-rw-r--r--fs/9p/v9fs.c42
-rw-r--r--fs/9p/v9fs.h23
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/vfs_dentry.c1
-rw-r--r--fs/9p/vfs_dir.c14
-rw-r--r--fs/9p/vfs_file.c27
-rw-r--r--fs/9p/vfs_inode.c92
-rw-r--r--fs/9p/vfs_super.c4
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/adfs.h2
-rw-r--r--fs/adfs/inode.c5
-rw-r--r--fs/adfs/super.c1
-rw-r--r--fs/affs/affs.h5
-rw-r--r--fs/affs/bitmap.c3
-rw-r--r--fs/affs/inode.c3
-rw-r--r--fs/affs/namei.c7
-rw-r--r--fs/affs/super.c32
-rw-r--r--fs/affs/symlink.c7
-rw-r--r--fs/afs/cache.c1
-rw-r--r--fs/afs/cmservice.c1
-rw-r--r--fs/afs/dir.c1
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/afs/fsclient.c1
-rw-r--r--fs/afs/inode.c1
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/mntpt.c2
-rw-r--r--fs/afs/rxrpc.c1
-rw-r--r--fs/afs/security.c5
-rw-r--r--fs/afs/super.c1
-rw-r--r--fs/afs/vlclient.c1
-rw-r--r--fs/afs/vlocation.c1
-rw-r--r--fs/afs/vnode.c1
-rw-r--r--fs/afs/write.c29
-rw-r--r--fs/aio.c94
-rw-r--r--fs/anon_inodes.c37
-rw-r--r--fs/attr.c13
-rw-r--r--fs/autofs/root.c1
-rw-r--r--fs/autofs4/autofs_i.h31
-rw-r--r--fs/autofs4/dev-ioctl.c12
-rw-r--r--fs/autofs4/expire.c2
-rw-r--r--fs/autofs4/inode.c63
-rw-r--r--fs/autofs4/root.c219
-rw-r--r--fs/befs/datastream.c1
-rw-r--r--fs/befs/linuxvfs.c1
-rw-r--r--fs/bfs/inode.c48
-rw-r--r--fs/binfmt_aout.c68
-rw-r--r--fs/binfmt_elf.c207
-rw-r--r--fs/binfmt_elf_fdpic.c229
-rw-r--r--fs/binfmt_em86.c1
-rw-r--r--fs/binfmt_flat.c9
-rw-r--r--fs/binfmt_script.c1
-rw-r--r--fs/binfmt_som.c3
-rw-r--r--fs/bio-integrity.c4
-rw-r--r--fs/bio.c39
-rw-r--r--fs/block_dev.c19
-rw-r--r--fs/btrfs/acl.c48
-rw-r--r--fs/btrfs/async-thread.c1
-rw-r--r--fs/btrfs/compression.c1
-rw-r--r--fs/btrfs/ctree.c1
-rw-r--r--fs/btrfs/ctree.h3
-rw-r--r--fs/btrfs/delayed-ref.c1
-rw-r--r--fs/btrfs/disk-io.c1
-rw-r--r--fs/btrfs/extent-tree.c1
-rw-r--r--fs/btrfs/extent_io.c1
-rw-r--r--fs/btrfs/extent_map.c3
-rw-r--r--fs/btrfs/file-item.c1
-rw-r--r--fs/btrfs/file.c5
-rw-r--r--fs/btrfs/free-space-cache.c1
-rw-r--r--fs/btrfs/inode.c5
-rw-r--r--fs/btrfs/ioctl.c1
-rw-r--r--fs/btrfs/locking.c1
-rw-r--r--fs/btrfs/ordered-data.c1
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/relocation.c1
-rw-r--r--fs/btrfs/super.c1
-rw-r--r--fs/btrfs/sysfs.c4
-rw-r--r--fs/btrfs/transaction.c1
-rw-r--r--fs/btrfs/tree-log.c1
-rw-r--r--fs/btrfs/volumes.c1
-rw-r--r--fs/buffer.c15
-rw-r--r--fs/cachefiles/bind.c11
-rw-r--r--fs/cachefiles/daemon.c4
-rw-r--r--fs/cachefiles/interface.c1
-rw-r--r--fs/cachefiles/namei.c13
-rw-r--r--fs/cachefiles/rdwr.c3
-rw-r--r--fs/cachefiles/xattr.c1
-rw-r--r--fs/ceph/Kconfig27
-rw-r--r--fs/ceph/Makefile39
-rw-r--r--fs/ceph/README20
-rw-r--r--fs/ceph/addr.c1195
-rw-r--r--fs/ceph/armor.c99
-rw-r--r--fs/ceph/auth.c258
-rw-r--r--fs/ceph/auth.h84
-rw-r--r--fs/ceph/auth_none.c122
-rw-r--r--fs/ceph/auth_none.h28
-rw-r--r--fs/ceph/auth_x.c680
-rw-r--r--fs/ceph/auth_x.h49
-rw-r--r--fs/ceph/auth_x_protocol.h90
-rw-r--r--fs/ceph/buffer.c81
-rw-r--r--fs/ceph/buffer.h39
-rw-r--r--fs/ceph/caps.c2933
-rw-r--r--fs/ceph/ceph_debug.h37
-rw-r--r--fs/ceph/ceph_frag.c21
-rw-r--r--fs/ceph/ceph_frag.h109
-rw-r--r--fs/ceph/ceph_fs.c74
-rw-r--r--fs/ceph/ceph_fs.h650
-rw-r--r--fs/ceph/ceph_hash.c118
-rw-r--r--fs/ceph/ceph_hash.h13
-rw-r--r--fs/ceph/ceph_strings.c176
-rw-r--r--fs/ceph/crush/crush.c151
-rw-r--r--fs/ceph/crush/crush.h180
-rw-r--r--fs/ceph/crush/hash.c149
-rw-r--r--fs/ceph/crush/hash.h17
-rw-r--r--fs/ceph/crush/mapper.c596
-rw-r--r--fs/ceph/crush/mapper.h20
-rw-r--r--fs/ceph/crypto.c409
-rw-r--r--fs/ceph/crypto.h48
-rw-r--r--fs/ceph/debugfs.c484
-rw-r--r--fs/ceph/decode.h194
-rw-r--r--fs/ceph/dir.c1223
-rw-r--r--fs/ceph/export.c224
-rw-r--r--fs/ceph/file.c938
-rw-r--r--fs/ceph/inode.c1766
-rw-r--r--fs/ceph/ioctl.c160
-rw-r--r--fs/ceph/ioctl.h40
-rw-r--r--fs/ceph/mds_client.c3043
-rw-r--r--fs/ceph/mds_client.h335
-rw-r--r--fs/ceph/mdsmap.c174
-rw-r--r--fs/ceph/mdsmap.h54
-rw-r--r--fs/ceph/messenger.c2240
-rw-r--r--fs/ceph/messenger.h255
-rw-r--r--fs/ceph/mon_client.c835
-rw-r--r--fs/ceph/mon_client.h119
-rw-r--r--fs/ceph/msgpool.c186
-rw-r--r--fs/ceph/msgpool.h27
-rw-r--r--fs/ceph/msgr.h158
-rw-r--r--fs/ceph/osd_client.c1550
-rw-r--r--fs/ceph/osd_client.h166
-rw-r--r--fs/ceph/osdmap.c1024
-rw-r--r--fs/ceph/osdmap.h125
-rw-r--r--fs/ceph/pagelist.c55
-rw-r--r--fs/ceph/pagelist.h54
-rw-r--r--fs/ceph/rados.h374
-rw-r--r--fs/ceph/snap.c907
-rw-r--r--fs/ceph/super.c1031
-rw-r--r--fs/ceph/super.h902
-rw-r--r--fs/ceph/types.h29
-rw-r--r--fs/ceph/xattr.c845
-rw-r--r--fs/cifs/CHANGES9
-rw-r--r--fs/cifs/README2
-rw-r--r--fs/cifs/asn1.c2
-rw-r--r--fs/cifs/cifs_dfs_ref.c6
-rw-r--r--fs/cifs/cifs_spnego.c1
-rw-r--r--fs/cifs/cifs_unicode.c1
-rw-r--r--fs/cifs/cifsacl.c1
-rw-r--r--fs/cifs/cifsencrypt.c1
-rw-r--r--fs/cifs/cifsfs.c6
-rw-r--r--fs/cifs/cifsfs.h5
-rw-r--r--fs/cifs/cifsglob.h7
-rw-r--r--fs/cifs/cifspdu.h8
-rw-r--r--fs/cifs/cifsproto.h13
-rw-r--r--fs/cifs/cifssmb.c498
-rw-r--r--fs/cifs/connect.c52
-rw-r--r--fs/cifs/dir.c5
-rw-r--r--fs/cifs/dns_resolve.c1
-rw-r--r--fs/cifs/export.c2
-rw-r--r--fs/cifs/file.c18
-rw-r--r--fs/cifs/inode.c317
-rw-r--r--fs/cifs/link.c1
-rw-r--r--fs/cifs/misc.c2
-rw-r--r--fs/cifs/readdir.c9
-rw-r--r--fs/cifs/sess.c12
-rw-r--r--fs/cifs/smbdes.c2
-rw-r--r--fs/cifs/smbencrypt.c1
-rw-r--r--fs/cifs/transport.c1
-rw-r--r--fs/cifs/xattr.c9
-rw-r--r--fs/coda/dir.c1
-rw-r--r--fs/coda/file.c1
-rw-r--r--fs/coda/inode.c1
-rw-r--r--fs/coda/sysctl.c10
-rw-r--r--fs/coda/upcall.c1
-rw-r--r--fs/compat.c21
-rw-r--r--fs/compat_binfmt_elf.c2
-rw-r--r--fs/compat_ioctl.c1548
-rw-r--r--fs/configfs/inode.c1
-rw-r--r--fs/configfs/mount.c1
-rw-r--r--fs/configfs/symlink.c5
-rw-r--r--fs/dcache.c71
-rw-r--r--fs/debugfs/inode.c75
-rw-r--r--fs/devpts/inode.c17
-rw-r--r--fs/direct-io.c175
-rw-r--r--fs/dlm/ast.c74
-rw-r--r--fs/dlm/ast.h4
-rw-r--r--fs/dlm/config.c25
-rw-r--r--fs/dlm/debug_fs.c5
-rw-r--r--fs/dlm/dir.c7
-rw-r--r--fs/dlm/dlm_internal.h11
-rw-r--r--fs/dlm/lock.c127
-rw-r--r--fs/dlm/lockspace.c31
-rw-r--r--fs/dlm/lowcomms.c7
-rw-r--r--fs/dlm/member.c10
-rw-r--r--fs/dlm/memory.c6
-rw-r--r--fs/dlm/netlink.c3
-rw-r--r--fs/dlm/plock.c9
-rw-r--r--fs/dlm/rcom.c2
-rw-r--r--fs/dlm/requestqueue.c2
-rw-r--r--fs/dlm/user.c23
-rw-r--r--fs/dlm/user.h4
-rw-r--r--fs/ecryptfs/crypto.c5
-rw-r--r--fs/ecryptfs/dentry.c3
-rw-r--r--fs/ecryptfs/file.c18
-rw-r--r--fs/ecryptfs/inode.c165
-rw-r--r--fs/ecryptfs/keystore.c1
-rw-r--r--fs/ecryptfs/kthread.c1
-rw-r--r--fs/ecryptfs/main.c14
-rw-r--r--fs/ecryptfs/messaging.c1
-rw-r--r--fs/ecryptfs/miscdev.c1
-rw-r--r--fs/ecryptfs/mmap.c1
-rw-r--r--fs/ecryptfs/super.c1
-rw-r--r--fs/eventfd.c92
-rw-r--r--fs/eventpoll.c6
-rw-r--r--fs/exec.c155
-rw-r--r--fs/exofs/Kbuild2
-rw-r--r--fs/exofs/common.h112
-rw-r--r--fs/exofs/exofs.h140
-rw-r--r--fs/exofs/inode.c547
-rw-r--r--fs/exofs/ios.c823
-rw-r--r--fs/exofs/osd.c125
-rw-r--r--fs/exofs/pnfs.h45
-rw-r--r--fs/exofs/super.c417
-rw-r--r--fs/exportfs/expfs.c2
-rw-r--r--fs/ext2/acl.c79
-rw-r--r--fs/ext2/balloc.c13
-rw-r--r--fs/ext2/dir.c6
-rw-r--r--fs/ext2/ext2.h5
-rw-r--r--fs/ext2/file.c26
-rw-r--r--fs/ext2/ialloc.c14
-rw-r--r--fs/ext2/inode.c24
-rw-r--r--fs/ext2/namei.c51
-rw-r--r--fs/ext2/super.c208
-rw-r--r--fs/ext2/xattr.c21
-rw-r--r--fs/ext2/xattr_security.c17
-rw-r--r--fs/ext2/xattr_trusted.c16
-rw-r--r--fs/ext2/xattr_user.c25
-rw-r--r--fs/ext2/xip.c5
-rw-r--r--fs/ext3/acl.c74
-rw-r--r--fs/ext3/balloc.c12
-rw-r--r--fs/ext3/file.c7
-rw-r--r--fs/ext3/ialloc.c20
-rw-r--r--fs/ext3/inode.c75
-rw-r--r--fs/ext3/namei.c52
-rw-r--r--fs/ext3/resize.c37
-rw-r--r--fs/ext3/super.c707
-rw-r--r--fs/ext3/xattr.c60
-rw-r--r--fs/ext3/xattr_security.c21
-rw-r--r--fs/ext3/xattr_trusted.c18
-rw-r--r--fs/ext3/xattr_user.c25
-rw-r--r--fs/ext4/Kconfig11
-rw-r--r--fs/ext4/acl.c74
-rw-r--r--fs/ext4/balloc.c81
-rw-r--r--fs/ext4/block_validity.c9
-rw-r--r--fs/ext4/dir.c14
-rw-r--r--fs/ext4/ext4.h148
-rw-r--r--fs/ext4/ext4_extents.h3
-rw-r--r--fs/ext4/ext4_jbd2.c86
-rw-r--r--fs/ext4/ext4_jbd2.h68
-rw-r--r--fs/ext4/extents.c394
-rw-r--r--fs/ext4/file.c13
-rw-r--r--fs/ext4/fsync.c68
-rw-r--r--fs/ext4/ialloc.c52
-rw-r--r--fs/ext4/inode.c965
-rw-r--r--fs/ext4/ioctl.c41
-rw-r--r--fs/ext4/mballoc.c179
-rw-r--r--fs/ext4/mballoc.h10
-rw-r--r--fs/ext4/migrate.c63
-rw-r--r--fs/ext4/move_extent.c313
-rw-r--r--fs/ext4/namei.c124
-rw-r--r--fs/ext4/resize.c104
-rw-r--r--fs/ext4/super.c537
-rw-r--r--fs/ext4/xattr.c112
-rw-r--r--fs/ext4/xattr_security.c21
-rw-r--r--fs/ext4/xattr_trusted.c20
-rw-r--r--fs/ext4/xattr_user.c25
-rw-r--r--fs/fat/cache.c1
-rw-r--r--fs/fat/fat.h3
-rw-r--r--fs/fat/fatent.c25
-rw-r--r--fs/fat/inode.c19
-rw-r--r--fs/fat/misc.c57
-rw-r--r--fs/fat/namei_vfat.c33
-rw-r--r--fs/fcntl.c104
-rw-r--r--fs/fifo.c1
-rw-r--r--fs/file.c4
-rw-r--r--fs/file_table.c53
-rw-r--r--fs/filesystems.c2
-rw-r--r--fs/freevxfs/vxfs_subr.c1
-rw-r--r--fs/fs-writeback.c69
-rw-r--r--fs/fscache/Kconfig1
-rw-r--r--fs/fscache/object-list.c3
-rw-r--r--fs/fscache/object.c6
-rw-r--r--fs/fscache/operation.c5
-rw-r--r--fs/fscache/page.c2
-rw-r--r--fs/fuse/cuse.c1
-rw-r--r--fs/fuse/dev.c30
-rw-r--r--fs/fuse/file.c3
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/generic_acl.c159
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/gfs2/acl.c361
-rw-r--r--fs/gfs2/acl.h24
-rw-r--r--fs/gfs2/aops.c24
-rw-r--r--fs/gfs2/bmap.c3
-rw-r--r--fs/gfs2/dentry.c1
-rw-r--r--fs/gfs2/dir.c34
-rw-r--r--fs/gfs2/export.c1
-rw-r--r--fs/gfs2/file.c40
-rw-r--r--fs/gfs2/glock.c110
-rw-r--r--fs/gfs2/glock.h12
-rw-r--r--fs/gfs2/glops.c22
-rw-r--r--fs/gfs2/incore.h14
-rw-r--r--fs/gfs2/inode.c15
-rw-r--r--fs/gfs2/lock_dlm.c17
-rw-r--r--fs/gfs2/log.c5
-rw-r--r--fs/gfs2/lops.c8
-rw-r--r--fs/gfs2/main.c28
-rw-r--r--fs/gfs2/meta_io.c46
-rw-r--r--fs/gfs2/meta_io.h12
-rw-r--r--fs/gfs2/ops_fstype.c174
-rw-r--r--fs/gfs2/ops_inode.c118
-rw-r--r--fs/gfs2/quota.c400
-rw-r--r--fs/gfs2/quota.h5
-rw-r--r--fs/gfs2/recovery.c2
-rw-r--r--fs/gfs2/rgrp.c22
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/gfs2/super.c143
-rw-r--r--fs/gfs2/super.h4
-rw-r--r--fs/gfs2/sys.c37
-rw-r--r--fs/gfs2/util.c2
-rw-r--r--fs/gfs2/util.h1
-rw-r--r--fs/gfs2/xattr.c156
-rw-r--r--fs/gfs2/xattr.h15
-rw-r--r--fs/hfs/bnode.c1
-rw-r--r--fs/hfs/btree.c1
-rw-r--r--fs/hfs/catalog.c4
-rw-r--r--fs/hfs/dir.c11
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfs/mdb.c1
-rw-r--r--fs/hfs/super.c8
-rw-r--r--fs/hfsplus/options.c1
-rw-r--r--fs/hfsplus/super.c3
-rw-r--r--fs/hostfs/hostfs_kern.c1
-rw-r--r--fs/hpfs/anode.c2
-rw-r--r--fs/hpfs/buffer.c1
-rw-r--r--fs/hpfs/dentry.c14
-rw-r--r--fs/hpfs/dir.c15
-rw-r--r--fs/hpfs/dnode.c21
-rw-r--r--fs/hpfs/ea.c7
-rw-r--r--fs/hpfs/hpfs_fn.h30
-rw-r--r--fs/hpfs/inode.c5
-rw-r--r--fs/hpfs/map.c6
-rw-r--r--fs/hpfs/name.c21
-rw-r--r--fs/hpfs/namei.c75
-rw-r--r--fs/hpfs/super.c18
-rw-r--r--fs/hppfs/hppfs.c20
-rw-r--r--fs/hugetlbfs/inode.c17
-rw-r--r--fs/inode.c40
-rw-r--r--fs/internal.h10
-rw-r--r--fs/ioprio.c1
-rw-r--r--fs/isofs/compress.c533
-rw-r--r--fs/isofs/dir.c1
-rw-r--r--fs/isofs/export.c2
-rw-r--r--fs/isofs/namei.c1
-rw-r--r--fs/isofs/rock.c3
-rw-r--r--fs/jbd/commit.c11
-rw-r--r--fs/jbd/journal.c2
-rw-r--r--fs/jbd/recovery.c1
-rw-r--r--fs/jbd/transaction.c45
-rw-r--r--fs/jbd2/checkpoint.c16
-rw-r--r--fs/jbd2/commit.c38
-rw-r--r--fs/jbd2/journal.c149
-rw-r--r--fs/jbd2/recovery.c1
-rw-r--r--fs/jbd2/transaction.c43
-rw-r--r--fs/jffs2/acl.c65
-rw-r--r--fs/jffs2/compr.c2
-rw-r--r--fs/jffs2/compr_lzo.c1
-rw-r--r--fs/jffs2/compr_zlib.c1
-rw-r--r--fs/jffs2/debug.c1
-rw-r--r--fs/jffs2/file.c1
-rw-r--r--fs/jffs2/gc.c3
-rw-r--r--fs/jffs2/nodelist.c1
-rw-r--r--fs/jffs2/nodemgmt.c1
-rw-r--r--fs/jffs2/readinode.c6
-rw-r--r--fs/jffs2/security.c18
-rw-r--r--fs/jffs2/summary.c2
-rw-r--r--fs/jffs2/symlink.c1
-rw-r--r--fs/jffs2/write.c1
-rw-r--r--fs/jffs2/xattr.c8
-rw-r--r--fs/jffs2/xattr_trusted.c18
-rw-r--r--fs/jffs2/xattr_user.c18
-rw-r--r--fs/jfs/acl.c27
-rw-r--r--fs/jfs/file.c31
-rw-r--r--fs/jfs/inode.c14
-rw-r--r--fs/jfs/jfs_acl.h7
-rw-r--r--fs/jfs/jfs_dmap.c5
-rw-r--r--fs/jfs/jfs_dtree.c29
-rw-r--r--fs/jfs/jfs_extent.c16
-rw-r--r--fs/jfs/jfs_imap.c1
-rw-r--r--fs/jfs/jfs_inode.c8
-rw-r--r--fs/jfs/jfs_inode.h3
-rw-r--r--fs/jfs/jfs_logmgr.c1
-rw-r--r--fs/jfs/jfs_metapage.c1
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_unicode.h1
-rw-r--r--fs/jfs/jfs_xtree.c21
-rw-r--r--fs/jfs/namei.c23
-rw-r--r--fs/jfs/super.c9
-rw-r--r--fs/jfs/xattr.c18
-rw-r--r--fs/libfs.c79
-rw-r--r--fs/lockd/clntlock.c1
-rw-r--r--fs/lockd/clntproc.c1
-rw-r--r--fs/lockd/host.c2
-rw-r--r--fs/lockd/mon.c13
-rw-r--r--fs/lockd/svc.c29
-rw-r--r--fs/lockd/svc4proc.c5
-rw-r--r--fs/lockd/svclock.c1
-rw-r--r--fs/lockd/svcproc.c5
-rw-r--r--fs/lockd/svcsubs.c1
-rw-r--r--fs/locks.c7
-rw-r--r--fs/logfs/Kconfig17
-rw-r--r--fs/logfs/Makefile13
-rw-r--r--fs/logfs/compr.c95
-rw-r--r--fs/logfs/dev_bdev.c333
-rw-r--r--fs/logfs/dev_mtd.c254
-rw-r--r--fs/logfs/dir.c827
-rw-r--r--fs/logfs/file.c263
-rw-r--r--fs/logfs/gc.c731
-rw-r--r--fs/logfs/inode.c418
-rw-r--r--fs/logfs/journal.c891
-rw-r--r--fs/logfs/logfs.h725
-rw-r--r--fs/logfs/logfs_abi.h629
-rw-r--r--fs/logfs/readwrite.c2258
-rw-r--r--fs/logfs/segment.c936
-rw-r--r--fs/logfs/super.c650
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/minix/itree_v1.c1
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namei.c972
-rw-r--r--fs/namespace.c83
-rw-r--r--fs/ncpfs/dir.c1
-rw-r--r--fs/ncpfs/file.c1
-rw-r--r--fs/ncpfs/ioctl.c3
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/ncpfs/sock.c1
-rw-r--r--fs/ncpfs/symlink.c1
-rw-r--r--fs/nfs/Kconfig5
-rw-r--r--fs/nfs/cache_lib.c1
-rw-r--r--fs/nfs/callback.c15
-rw-r--r--fs/nfs/callback.h24
-rw-r--r--fs/nfs/callback_proc.c224
-rw-r--r--fs/nfs/callback_xdr.c137
-rw-r--r--fs/nfs/client.c63
-rw-r--r--fs/nfs/delegation.c78
-rw-r--r--fs/nfs/delegation.h13
-rw-r--r--fs/nfs/dir.c72
-rw-r--r--fs/nfs/direct.c4
-rw-r--r--fs/nfs/dns_resolve.c23
-rw-r--r--fs/nfs/file.c39
-rw-r--r--fs/nfs/fscache.c10
-rw-r--r--fs/nfs/inode.c109
-rw-r--r--fs/nfs/internal.h56
-rw-r--r--fs/nfs/iostat.h28
-rw-r--r--fs/nfs/mount_clnt.c2
-rw-r--r--fs/nfs/namespace.c1
-rw-r--r--fs/nfs/nfs2xdr.c3
-rw-r--r--fs/nfs/nfs3acl.c1
-rw-r--r--fs/nfs/nfs3proc.c10
-rw-r--r--fs/nfs/nfs3xdr.c1
-rw-r--r--fs/nfs/nfs4_fs.h21
-rw-r--r--fs/nfs/nfs4namespace.c1
-rw-r--r--fs/nfs/nfs4proc.c822
-rw-r--r--fs/nfs/nfs4renewd.c24
-rw-r--r--fs/nfs/nfs4state.c317
-rw-r--r--fs/nfs/nfs4xdr.c154
-rw-r--r--fs/nfs/pagelist.c40
-rw-r--r--fs/nfs/proc.c42
-rw-r--r--fs/nfs/read.c12
-rw-r--r--fs/nfs/super.c145
-rw-r--r--fs/nfs/symlink.c3
-rw-r--r--fs/nfs/sysctl.c24
-rw-r--r--fs/nfs/unlink.c2
-rw-r--r--fs/nfs/write.c265
-rw-r--r--fs/nfs_common/nfsacl.c1
-rw-r--r--fs/nfsctl.c7
-rw-r--r--fs/nfsd/auth.c12
-rw-r--r--fs/nfsd/cache.h83
-rw-r--r--fs/nfsd/export.c56
-rw-r--r--fs/nfsd/lockd.c10
-rw-r--r--fs/nfsd/nfs2acl.c28
-rw-r--r--fs/nfsd/nfs3acl.c16
-rw-r--r--fs/nfsd/nfs3proc.c20
-rw-r--r--fs/nfsd/nfs3xdr.c15
-rw-r--r--fs/nfsd/nfs4acl.c11
-rw-r--r--fs/nfsd/nfs4callback.c25
-rw-r--r--fs/nfsd/nfs4idmap.c18
-rw-r--r--fs/nfsd/nfs4proc.c20
-rw-r--r--fs/nfsd/nfs4recover.c21
-rw-r--r--fs/nfsd/nfs4state.c91
-rw-r--r--fs/nfsd/nfs4xdr.c43
-rw-r--r--fs/nfsd/nfscache.c12
-rw-r--r--fs/nfsd/nfsctl.c76
-rw-r--r--fs/nfsd/nfsd.h338
-rw-r--r--fs/nfsd/nfsfh.c102
-rw-r--r--fs/nfsd/nfsfh.h208
-rw-r--r--fs/nfsd/nfsproc.c22
-rw-r--r--fs/nfsd/nfssvc.c22
-rw-r--r--fs/nfsd/nfsxdr.c12
-rw-r--r--fs/nfsd/state.h408
-rw-r--r--fs/nfsd/stats.c11
-rw-r--r--fs/nfsd/vfs.c303
-rw-r--r--fs/nfsd/vfs.h101
-rw-r--r--fs/nfsd/xdr.h173
-rw-r--r--fs/nfsd/xdr3.h344
-rw-r--r--fs/nfsd/xdr4.h562
-rw-r--r--fs/nilfs2/alloc.c109
-rw-r--r--fs/nilfs2/alloc.h23
-rw-r--r--fs/nilfs2/bmap.c12
-rw-r--r--fs/nilfs2/btnode.c77
-rw-r--r--fs/nilfs2/btnode.h6
-rw-r--r--fs/nilfs2/btree.c106
-rw-r--r--fs/nilfs2/btree.h22
-rw-r--r--fs/nilfs2/cpfile.c57
-rw-r--r--fs/nilfs2/cpfile.h3
-rw-r--r--fs/nilfs2/dat.c52
-rw-r--r--fs/nilfs2/dat.h3
-rw-r--r--fs/nilfs2/dir.c40
-rw-r--r--fs/nilfs2/direct.c17
-rw-r--r--fs/nilfs2/gcdat.c3
-rw-r--r--fs/nilfs2/gcinode.c11
-rw-r--r--fs/nilfs2/ifile.c35
-rw-r--r--fs/nilfs2/ifile.h2
-rw-r--r--fs/nilfs2/inode.c8
-rw-r--r--fs/nilfs2/ioctl.c69
-rw-r--r--fs/nilfs2/mdt.c57
-rw-r--r--fs/nilfs2/mdt.h25
-rw-r--r--fs/nilfs2/namei.c96
-rw-r--r--fs/nilfs2/nilfs.h4
-rw-r--r--fs/nilfs2/page.c5
-rw-r--r--fs/nilfs2/recovery.c76
-rw-r--r--fs/nilfs2/segbuf.c208
-rw-r--r--fs/nilfs2/segbuf.h51
-rw-r--r--fs/nilfs2/segment.c501
-rw-r--r--fs/nilfs2/segment.h8
-rw-r--r--fs/nilfs2/sufile.c205
-rw-r--r--fs/nilfs2/sufile.h14
-rw-r--r--fs/nilfs2/super.c110
-rw-r--r--fs/nilfs2/the_nilfs.c195
-rw-r--r--fs/nilfs2/the_nilfs.h14
-rw-r--r--fs/notify/fsnotify.c1
-rw-r--r--fs/notify/inode_mark.c1
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c84
-rw-r--r--fs/ntfs/ChangeLog1702
-rw-r--r--fs/ntfs/aops.c1
-rw-r--r--fs/ntfs/attrib.c1
-rw-r--r--fs/ntfs/compress.c3
-rw-r--r--fs/ntfs/dir.c3
-rw-r--r--fs/ntfs/file.c7
-rw-r--r--fs/ntfs/index.c2
-rw-r--r--fs/ntfs/inode.c8
-rw-r--r--fs/ntfs/inode.h4
-rw-r--r--fs/ntfs/logfile.c2
-rw-r--r--fs/ntfs/mft.c1
-rw-r--r--fs/ntfs/namei.c1
-rw-r--r--fs/ntfs/super.c33
-rw-r--r--fs/ntfs/sysctl.c4
-rw-r--r--fs/ocfs2/Kconfig10
-rw-r--r--fs/ocfs2/Makefile8
-rw-r--r--fs/ocfs2/acl.c169
-rw-r--r--fs/ocfs2/acl.h22
-rw-r--r--fs/ocfs2/alloc.c34
-rw-r--r--fs/ocfs2/alloc.h5
-rw-r--r--fs/ocfs2/aops.c52
-rw-r--r--fs/ocfs2/blockcheck.c2
-rw-r--r--fs/ocfs2/buffer_head_io.c3
-rw-r--r--fs/ocfs2/cluster/heartbeat.c13
-rw-r--r--fs/ocfs2/cluster/masklog.c3
-rw-r--r--fs/ocfs2/cluster/masklog.h7
-rw-r--r--fs/ocfs2/cluster/netdebug.c8
-rw-r--r--fs/ocfs2/cluster/nodemanager.c52
-rw-r--r--fs/ocfs2/cluster/nodemanager.h7
-rw-r--r--fs/ocfs2/cluster/quorum.c17
-rw-r--r--fs/ocfs2/cluster/tcp.c14
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h4
-rw-r--r--fs/ocfs2/dir.c39
-rw-r--r--fs/ocfs2/dlm/Makefile3
-rw-r--r--fs/ocfs2/dlm/dlmapi.h2
-rw-r--r--fs/ocfs2/dlm/dlmast.c3
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c3
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c2
-rw-r--r--fs/ocfs2/dlm/dlmlock.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c44
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c165
-rw-r--r--fs/ocfs2/dlm/dlmthread.c1
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c9
-rw-r--r--fs/ocfs2/dlmfs/Makefile5
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c (renamed from fs/ocfs2/dlm/dlmfs.c)127
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.c (renamed from fs/ocfs2/dlm/dlmfsver.c)0
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.h (renamed from fs/ocfs2/dlm/dlmfsver.h)0
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c (renamed from fs/ocfs2/dlm/userdlm.c)308
-rw-r--r--fs/ocfs2/dlmfs/userdlm.h (renamed from fs/ocfs2/dlm/userdlm.h)16
-rw-r--r--fs/ocfs2/dlmglue.c373
-rw-r--r--fs/ocfs2/export.c2
-rw-r--r--fs/ocfs2/extent_map.c30
-rw-r--r--fs/ocfs2/file.c60
-rw-r--r--fs/ocfs2/heartbeat.c1
-rw-r--r--fs/ocfs2/inode.c26
-rw-r--r--fs/ocfs2/ioctl.c14
-rw-r--r--fs/ocfs2/ioctl.h6
-rw-r--r--fs/ocfs2/journal.c4
-rw-r--r--fs/ocfs2/localalloc.c12
-rw-r--r--fs/ocfs2/locks.c2
-rw-r--r--fs/ocfs2/mmap.c1
-rw-r--r--fs/ocfs2/namei.c86
-rw-r--r--fs/ocfs2/ocfs2.h46
-rw-r--r--fs/ocfs2/ocfs2_fs.h70
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h79
-rw-r--r--fs/ocfs2/ocfs2_lockingver.h2
-rw-r--r--fs/ocfs2/quota.h4
-rw-r--r--fs/ocfs2/quota_global.c8
-rw-r--r--fs/ocfs2/quota_local.c5
-rw-r--r--fs/ocfs2/refcounttree.c174
-rw-r--r--fs/ocfs2/stack_o2cb.c50
-rw-r--r--fs/ocfs2/stack_user.c52
-rw-r--r--fs/ocfs2/stackglue.c113
-rw-r--r--fs/ocfs2/stackglue.h95
-rw-r--r--fs/ocfs2/suballoc.c300
-rw-r--r--fs/ocfs2/suballoc.h6
-rw-r--r--fs/ocfs2/super.c107
-rw-r--r--fs/ocfs2/symlink.c12
-rw-r--r--fs/ocfs2/sysfile.c1
-rw-r--r--fs/ocfs2/uptodate.c4
-rw-r--r--fs/ocfs2/xattr.c2264
-rw-r--r--fs/ocfs2/xattr.h2
-rw-r--r--fs/omfs/bitmap.c2
-rw-r--r--fs/omfs/inode.c11
-rw-r--r--fs/open.c53
-rw-r--r--fs/partitions/check.c16
-rw-r--r--fs/partitions/efi.c31
-rw-r--r--fs/partitions/efi.h8
-rw-r--r--fs/partitions/msdos.c85
-rw-r--r--fs/pipe.c45
-rw-r--r--fs/pnode.c28
-rw-r--r--fs/pnode.h5
-rw-r--r--fs/proc/array.c132
-rw-r--r--fs/proc/base.c119
-rw-r--r--fs/proc/generic.c60
-rw-r--r--fs/proc/inode.c32
-rw-r--r--fs/proc/internal.h10
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/proc/kmsg.c14
-rw-r--r--fs/proc/nommu.c1
-rw-r--r--fs/proc/page.c45
-rw-r--r--fs/proc/proc_devtree.c49
-rw-r--r--fs/proc/proc_net.c1
-rw-r--r--fs/proc/proc_sysctl.c4
-rw-r--r--fs/proc/root.c6
-rw-r--r--fs/proc/stat.c20
-rw-r--r--fs/proc/task_mmu.c149
-rw-r--r--fs/proc/task_nommu.c9
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/qnx4/bitmap.c26
-rw-r--r--fs/qnx4/dir.c6
-rw-r--r--fs/qnx4/inode.c51
-rw-r--r--fs/qnx4/namei.c6
-rw-r--r--fs/quota/Kconfig15
-rw-r--r--fs/quota/Makefile2
-rw-r--r--fs/quota/compat.c118
-rw-r--r--fs/quota/dquot.c765
-rw-r--r--fs/quota/netlink.c96
-rw-r--r--fs/quota/quota.c642
-rw-r--r--fs/quota/quota_v1.c2
-rw-r--r--fs/quota/quota_v2.c170
-rw-r--r--fs/quota/quotaio_v2.h19
-rw-r--r--fs/ramfs/file-nommu.c29
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/read_write.c4
-rw-r--r--fs/reiserfs/Makefile6
-rw-r--r--fs/reiserfs/bitmap.c19
-rw-r--r--fs/reiserfs/dir.c11
-rw-r--r--fs/reiserfs/do_balan.c17
-rw-r--r--fs/reiserfs/file.c4
-rw-r--r--fs/reiserfs/fix_node.c22
-rw-r--r--fs/reiserfs/inode.c164
-rw-r--r--fs/reiserfs/ioctl.c80
-rw-r--r--fs/reiserfs/journal.c164
-rw-r--r--fs/reiserfs/lock.c97
-rw-r--r--fs/reiserfs/namei.c51
-rw-r--r--fs/reiserfs/prints.c4
-rw-r--r--fs/reiserfs/procfs.c65
-rw-r--r--fs/reiserfs/resize.c2
-rw-r--r--fs/reiserfs/stree.c73
-rw-r--r--fs/reiserfs/super.c76
-rw-r--r--fs/reiserfs/xattr.c85
-rw-r--r--fs/reiserfs/xattr_acl.c72
-rw-r--r--fs/reiserfs/xattr_security.c24
-rw-r--r--fs/reiserfs/xattr_trusted.c21
-rw-r--r--fs/reiserfs/xattr_user.c21
-rw-r--r--fs/romfs/super.c1
-rw-r--r--fs/select.c19
-rw-r--r--fs/seq_file.c130
-rw-r--r--fs/signalfd.c3
-rw-r--r--fs/smbfs/file.c1
-rw-r--r--fs/smbfs/smbiod.c1
-rw-r--r--fs/smbfs/symlink.c1
-rw-r--r--fs/splice.c25
-rw-r--r--fs/squashfs/Makefile2
-rw-r--r--fs/squashfs/block.c76
-rw-r--r--fs/squashfs/cache.c1
-rw-r--r--fs/squashfs/decompressor.c68
-rw-r--r--fs/squashfs/decompressor.h55
-rw-r--r--fs/squashfs/dir.c1
-rw-r--r--fs/squashfs/export.c1
-rw-r--r--fs/squashfs/file.c1
-rw-r--r--fs/squashfs/fragment.c1
-rw-r--r--fs/squashfs/id.c1
-rw-r--r--fs/squashfs/inode.c1
-rw-r--r--fs/squashfs/namei.c1
-rw-r--r--fs/squashfs/squashfs.h8
-rw-r--r--fs/squashfs/squashfs_fs.h6
-rw-r--r--fs/squashfs/squashfs_fs_sb.h40
-rw-r--r--fs/squashfs/super.c49
-rw-r--r--fs/squashfs/symlink.c2
-rw-r--r--fs/squashfs/zlib_wrapper.c151
-rw-r--r--fs/stack.c71
-rw-r--r--fs/stat.c10
-rw-r--r--fs/super.c24
-rw-r--r--fs/sync.c83
-rw-r--r--fs/sysfs/bin.c56
-rw-r--r--fs/sysfs/dir.c532
-rw-r--r--fs/sysfs/file.c88
-rw-r--r--fs/sysfs/inode.c205
-rw-r--r--fs/sysfs/mount.c5
-rw-r--r--fs/sysfs/symlink.c50
-rw-r--r--fs/sysfs/sysfs.h37
-rw-r--r--fs/sysv/inode.c10
-rw-r--r--fs/sysv/sysv.h2
-rw-r--r--fs/timerfd.c3
-rw-r--r--fs/ubifs/commit.c1
-rw-r--r--fs/ubifs/debug.c12
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c24
-rw-r--r--fs/ubifs/gc.c97
-rw-r--r--fs/ubifs/io.c1
-rw-r--r--fs/ubifs/lpt.c1
-rw-r--r--fs/ubifs/lpt_commit.c1
-rw-r--r--fs/ubifs/recovery.c3
-rw-r--r--fs/ubifs/sb.c1
-rw-r--r--fs/ubifs/super.c29
-rw-r--r--fs/ubifs/tnc.c1
-rw-r--r--fs/ubifs/ubifs.h1
-rw-r--r--fs/ubifs/xattr.c1
-rw-r--r--fs/udf/balloc.c88
-rw-r--r--fs/udf/dir.c4
-rw-r--r--fs/udf/file.c29
-rw-r--r--fs/udf/ialloc.c14
-rw-r--r--fs/udf/inode.c70
-rw-r--r--fs/udf/namei.c75
-rw-r--r--fs/udf/partition.c1
-rw-r--r--fs/udf/super.c32
-rw-r--r--fs/udf/symlink.c11
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/udf/unicode.c1
-rw-r--r--fs/ufs/balloc.c24
-rw-r--r--fs/ufs/dir.c14
-rw-r--r--fs/ufs/file.c3
-rw-r--r--fs/ufs/ialloc.c11
-rw-r--r--fs/ufs/inode.c9
-rw-r--r--fs/ufs/namei.c26
-rw-r--r--fs/ufs/super.c61
-rw-r--r--fs/ufs/truncate.c10
-rw-r--r--fs/ufs/ufs.h6
-rw-r--r--fs/ufs/ufs_fs.h15
-rw-r--r--fs/xattr.c28
-rw-r--r--fs/xattr_acl.c4
-rw-r--r--fs/xfs/Makefile11
-rw-r--r--fs/xfs/linux-2.6/kmem.c57
-rw-r--r--fs/xfs/linux-2.6/kmem.h21
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c73
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c392
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c542
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h95
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c20
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c856
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c5
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.h12
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c24
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c922
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.h77
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c19
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c349
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h7
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c347
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h4
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c62
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.c59
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h1503
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h5
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c98
-rw-r--r--fs/xfs/quota/xfs_dquot.c157
-rw-r--r--fs/xfs/quota/xfs_dquot.h23
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c99
-rw-r--r--fs/xfs/quota/xfs_dquot_item.h4
-rw-r--r--fs/xfs/quota/xfs_qm.c80
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c10
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c49
-rw-r--r--fs/xfs/support/debug.h18
-rw-r--r--fs/xfs/support/ktrace.c323
-rw-r--r--fs/xfs/support/ktrace.h85
-rw-r--r--fs/xfs/xfs.h16
-rw-r--r--fs/xfs/xfs_acl.h7
-rw-r--r--fs/xfs/xfs_ag.h30
-rw-r--r--fs/xfs/xfs_alloc.c354
-rw-r--r--fs/xfs/xfs_alloc.h27
-rw-r--r--fs/xfs/xfs_alloc_btree.c10
-rw-r--r--fs/xfs/xfs_attr.c171
-rw-r--r--fs/xfs/xfs_attr.h14
-rw-r--r--fs/xfs/xfs_attr_leaf.c46
-rw-r--r--fs/xfs/xfs_attr_sf.h42
-rw-r--r--fs/xfs/xfs_bmap.c1163
-rw-r--r--fs/xfs/xfs_bmap.h58
-rw-r--r--fs/xfs/xfs_bmap_btree.c11
-rw-r--r--fs/xfs/xfs_bmap_btree.h15
-rw-r--r--fs/xfs/xfs_btree.c9
-rw-r--r--fs/xfs/xfs_btree_trace.h17
-rw-r--r--fs/xfs/xfs_buf_item.c159
-rw-r--r--fs/xfs/xfs_buf_item.h20
-rw-r--r--fs/xfs/xfs_da_btree.c7
-rw-r--r--fs/xfs/xfs_da_btree.h12
-rw-r--r--fs/xfs/xfs_dfrag.c151
-rw-r--r--fs/xfs/xfs_dfrag.h3
-rw-r--r--fs/xfs/xfs_dir2.c16
-rw-r--r--fs/xfs/xfs_dir2.h4
-rw-r--r--fs/xfs/xfs_dir2_block.c29
-rw-r--r--fs/xfs/xfs_dir2_leaf.c23
-rw-r--r--fs/xfs/xfs_dir2_node.c29
-rw-r--r--fs/xfs/xfs_dir2_node.h2
-rw-r--r--fs/xfs/xfs_dir2_sf.c28
-rw-r--r--fs/xfs/xfs_dir2_trace.c216
-rw-r--r--fs/xfs/xfs_dir2_trace.h72
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_filestream.c50
-rw-r--r--fs/xfs/xfs_filestream.h36
-rw-r--r--fs/xfs/xfs_fs.h3
-rw-r--r--fs/xfs/xfs_fsops.c69
-rw-r--r--fs/xfs/xfs_ialloc.c64
-rw-r--r--fs/xfs/xfs_iget.c160
-rw-r--r--fs/xfs/xfs_inode.c278
-rw-r--r--fs/xfs/xfs_inode.h96
-rw-r--r--fs/xfs/xfs_inode_item.c146
-rw-r--r--fs/xfs/xfs_inode_item.h12
-rw-r--r--fs/xfs/xfs_iomap.c94
-rw-r--r--fs/xfs/xfs_iomap.h8
-rw-r--r--fs/xfs/xfs_itable.c14
-rw-r--r--fs/xfs/xfs_log.c670
-rw-r--r--fs/xfs/xfs_log.h35
-rw-r--r--fs/xfs/xfs_log_priv.h25
-rw-r--r--fs/xfs/xfs_log_recover.c271
-rw-r--r--fs/xfs/xfs_log_recover.h23
-rw-r--r--fs/xfs/xfs_mount.c268
-rw-r--r--fs/xfs/xfs_mount.h58
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_mru_cache.h1
-rw-r--r--fs/xfs/xfs_quota.h17
-rw-r--r--fs/xfs/xfs_rename.c1
-rw-r--r--fs/xfs/xfs_rtalloc.c3
-rw-r--r--fs/xfs/xfs_rw.c174
-rw-r--r--fs/xfs/xfs_rw.h33
-rw-r--r--fs/xfs/xfs_trans.c16
-rw-r--r--fs/xfs/xfs_trans.h54
-rw-r--r--fs/xfs/xfs_trans_ail.c34
-rw-r--r--fs/xfs/xfs_trans_buf.c302
-rw-r--r--fs/xfs/xfs_types.h4
-rw-r--r--fs/xfs/xfs_vnodeops.c294
-rw-r--r--fs/xfs/xfs_vnodeops.h26
893 files changed, 65545 insertions, 23671 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index e777961939f3..0dbe0d139ac2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/file.h> 24#include <linux/file.h>
25#include <linux/slab.h>
25#include <linux/stat.h> 26#include <linux/stat.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 14d944204571..82ee460e534d 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -24,6 +24,7 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/slab.h>
27#include <linux/sched.h> 28#include <linux/sched.h>
28#include <linux/idr.h> 29#include <linux/idr.h>
29#include <net/9p/9p.h> 30#include <net/9p/9p.h>
@@ -151,7 +152,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
151 if (access == V9FS_ACCESS_SINGLE) 152 if (access == V9FS_ACCESS_SINGLE)
152 return ERR_PTR(-EPERM); 153 return ERR_PTR(-EPERM);
153 154
154 if (v9fs_extended(v9ses)) 155 if (v9fs_proto_dotu(v9ses))
155 uname = NULL; 156 uname = NULL;
156 else 157 else
157 uname = v9ses->uname; 158 uname = v9ses->uname;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index cf62b05e296a..cb57d3326182 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/parser.h> 30#include <linux/parser.h>
31#include <linux/idr.h> 31#include <linux/idr.h>
32#include <linux/slab.h>
32#include <net/9p/9p.h> 33#include <net/9p/9p.h>
33#include <net/9p/client.h> 34#include <net/9p/client.h>
34#include <net/9p/transport.h> 35#include <net/9p/transport.h>
@@ -84,7 +85,7 @@ static const match_table_t tokens = {
84 85
85static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) 86static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
86{ 87{
87 char *options; 88 char *options, *tmp_options;
88 substring_t args[MAX_OPT_ARGS]; 89 substring_t args[MAX_OPT_ARGS];
89 char *p; 90 char *p;
90 int option = 0; 91 int option = 0;
@@ -102,9 +103,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
102 if (!opts) 103 if (!opts)
103 return 0; 104 return 0;
104 105
105 options = kstrdup(opts, GFP_KERNEL); 106 tmp_options = kstrdup(opts, GFP_KERNEL);
106 if (!options) 107 if (!tmp_options) {
108 ret = -ENOMEM;
107 goto fail_option_alloc; 109 goto fail_option_alloc;
110 }
111 options = tmp_options;
108 112
109 while ((p = strsep(&options, ",")) != NULL) { 113 while ((p = strsep(&options, ",")) != NULL) {
110 int token; 114 int token;
@@ -159,8 +163,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
159 break; 163 break;
160 case Opt_cache: 164 case Opt_cache:
161 s = match_strdup(&args[0]); 165 s = match_strdup(&args[0]);
162 if (!s) 166 if (!s) {
163 goto fail_option_alloc; 167 ret = -ENOMEM;
168 P9_DPRINTK(P9_DEBUG_ERROR,
169 "problem allocating copy of cache arg\n");
170 goto free_and_return;
171 }
164 172
165 if (strcmp(s, "loose") == 0) 173 if (strcmp(s, "loose") == 0)
166 v9ses->cache = CACHE_LOOSE; 174 v9ses->cache = CACHE_LOOSE;
@@ -173,8 +181,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
173 181
174 case Opt_access: 182 case Opt_access:
175 s = match_strdup(&args[0]); 183 s = match_strdup(&args[0]);
176 if (!s) 184 if (!s) {
177 goto fail_option_alloc; 185 ret = -ENOMEM;
186 P9_DPRINTK(P9_DEBUG_ERROR,
187 "problem allocating copy of access arg\n");
188 goto free_and_return;
189 }
178 190
179 v9ses->flags &= ~V9FS_ACCESS_MASK; 191 v9ses->flags &= ~V9FS_ACCESS_MASK;
180 if (strcmp(s, "user") == 0) 192 if (strcmp(s, "user") == 0)
@@ -194,13 +206,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
194 continue; 206 continue;
195 } 207 }
196 } 208 }
197 kfree(options);
198 return ret;
199 209
210free_and_return:
211 kfree(tmp_options);
200fail_option_alloc: 212fail_option_alloc:
201 P9_DPRINTK(P9_DEBUG_ERROR, 213 return ret;
202 "failed to allocate copy of option argument\n");
203 return -ENOMEM;
204} 214}
205 215
206/** 216/**
@@ -232,7 +242,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
232 list_add(&v9ses->slist, &v9fs_sessionlist); 242 list_add(&v9ses->slist, &v9fs_sessionlist);
233 spin_unlock(&v9fs_sessionlist_lock); 243 spin_unlock(&v9fs_sessionlist_lock);
234 244
235 v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER; 245 v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
236 strcpy(v9ses->uname, V9FS_DEFUSER); 246 strcpy(v9ses->uname, V9FS_DEFUSER);
237 strcpy(v9ses->aname, V9FS_DEFANAME); 247 strcpy(v9ses->aname, V9FS_DEFANAME);
238 v9ses->uid = ~0; 248 v9ses->uid = ~0;
@@ -253,13 +263,13 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
253 goto error; 263 goto error;
254 } 264 }
255 265
256 if (!v9ses->clnt->dotu) 266 if (!p9_is_proto_dotu(v9ses->clnt))
257 v9ses->flags &= ~V9FS_EXTENDED; 267 v9ses->flags &= ~V9FS_PROTO_2000U;
258 268
259 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; 269 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
260 270
261 /* for legacy mode, fall back to V9FS_ACCESS_ANY */ 271 /* for legacy mode, fall back to V9FS_ACCESS_ANY */
262 if (!v9fs_extended(v9ses) && 272 if (!v9fs_proto_dotu(v9ses) &&
263 ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { 273 ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
264 274
265 v9ses->flags &= ~V9FS_ACCESS_MASK; 275 v9ses->flags &= ~V9FS_ACCESS_MASK;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 019f4ccb70c1..6b801d1ddf4b 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -23,7 +23,8 @@
23 23
24/** 24/**
25 * enum p9_session_flags - option flags for each 9P session 25 * enum p9_session_flags - option flags for each 9P session
26 * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions 26 * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
27 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
27 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy 28 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
28 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) 29 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
29 * @V9FS_ACCESS_ANY: use a single attach for all users 30 * @V9FS_ACCESS_ANY: use a single attach for all users
@@ -32,11 +33,12 @@
32 * Session flags reflect options selected by users at mount time 33 * Session flags reflect options selected by users at mount time
33 */ 34 */
34enum p9_session_flags { 35enum p9_session_flags {
35 V9FS_EXTENDED = 0x01, 36 V9FS_PROTO_2000U = 0x01,
36 V9FS_ACCESS_SINGLE = 0x02, 37 V9FS_PROTO_2000L = 0x02,
37 V9FS_ACCESS_USER = 0x04, 38 V9FS_ACCESS_SINGLE = 0x04,
38 V9FS_ACCESS_ANY = 0x06, 39 V9FS_ACCESS_USER = 0x08,
39 V9FS_ACCESS_MASK = 0x06, 40 V9FS_ACCESS_ANY = 0x0C,
41 V9FS_ACCESS_MASK = 0x0C,
40}; 42};
41 43
42/* possible values of ->cache */ 44/* possible values of ->cache */
@@ -121,7 +123,12 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
121 return (inode->i_sb->s_fs_info); 123 return (inode->i_sb->s_fs_info);
122} 124}
123 125
124static inline int v9fs_extended(struct v9fs_session_info *v9ses) 126static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
125{ 127{
126 return v9ses->flags & V9FS_EXTENDED; 128 return v9ses->flags & V9FS_PROTO_2000U;
129}
130
131static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
132{
133 return v9ses->flags & V9FS_PROTO_2000L;
127} 134}
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 3a7560e35865..ed835836e0dc 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -60,3 +60,4 @@ void v9fs_dentry_release(struct dentry *);
60int v9fs_uflags2omode(int uflags, int extended); 60int v9fs_uflags2omode(int uflags, int extended);
61 61
62ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); 62ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
63void v9fs_blank_wstat(struct p9_wstat *wstat);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d74325295b1e..cbf4e50f3933 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,6 +34,7 @@
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/idr.h> 35#include <linux/idr.h>
36#include <linux/sched.h> 36#include <linux/sched.h>
37#include <linux/slab.h>
37#include <net/9p/9p.h> 38#include <net/9p/9p.h>
38#include <net/9p/client.h> 39#include <net/9p/client.h>
39 40
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 15cce53bf61e..909711f57c0d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,6 +32,7 @@
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/inet.h> 33#include <linux/inet.h>
34#include <linux/idr.h> 34#include <linux/idr.h>
35#include <linux/slab.h>
35#include <net/9p/9p.h> 36#include <net/9p/9p.h>
36#include <net/9p/client.h> 37#include <net/9p/client.h>
37 38
@@ -76,6 +77,15 @@ static inline int dt_type(struct p9_wstat *mistat)
76 return rettype; 77 return rettype;
77} 78}
78 79
80static void p9stat_init(struct p9_wstat *stbuf)
81{
82 stbuf->name = NULL;
83 stbuf->uid = NULL;
84 stbuf->gid = NULL;
85 stbuf->muid = NULL;
86 stbuf->extension = NULL;
87}
88
79/** 89/**
80 * v9fs_dir_readdir - read a directory 90 * v9fs_dir_readdir - read a directory
81 * @filp: opened file structure 91 * @filp: opened file structure
@@ -131,11 +141,11 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
131 rdir->head = 0; 141 rdir->head = 0;
132 rdir->tail = err; 142 rdir->tail = err;
133 } 143 }
134
135 while (rdir->head < rdir->tail) { 144 while (rdir->head < rdir->tail) {
145 p9stat_init(&st);
136 err = p9stat_read(rdir->buf + rdir->head, 146 err = p9stat_read(rdir->buf + rdir->head,
137 buflen - rdir->head, &st, 147 buflen - rdir->head, &st,
138 fid->clnt->dotu); 148 fid->clnt->proto_version);
139 if (err) { 149 if (err) {
140 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); 150 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
141 err = -EIO; 151 err = -EIO;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3902bf43a088..df52d488d2a6 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
61 61
62 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); 62 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
63 v9ses = v9fs_inode2v9ses(inode); 63 v9ses = v9fs_inode2v9ses(inode);
64 omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses)); 64 omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
65 fid = file->private_data; 65 fid = file->private_data;
66 if (!fid) { 66 if (!fid) {
67 fid = v9fs_fid_clone(file->f_path.dentry); 67 fid = v9fs_fid_clone(file->f_path.dentry);
@@ -77,7 +77,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
77 i_size_write(inode, 0); 77 i_size_write(inode, 0);
78 inode->i_blocks = 0; 78 inode->i_blocks = 0;
79 } 79 }
80 if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) 80 if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
81 generic_file_llseek(file, 0, SEEK_END); 81 generic_file_llseek(file, 0, SEEK_END);
82 } 82 }
83 83
@@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
114 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); 114 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
115 115
116 /* No mandatory locks */ 116 /* No mandatory locks */
117 if (__mandatory_lock(inode)) 117 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
118 return -ENOLCK; 118 return -ENOLCK;
119 119
120 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { 120 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
@@ -215,7 +215,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
215 struct p9_fid *fid; 215 struct p9_fid *fid;
216 struct p9_client *clnt; 216 struct p9_client *clnt;
217 struct inode *inode = filp->f_path.dentry->d_inode; 217 struct inode *inode = filp->f_path.dentry->d_inode;
218 int origin = *offset; 218 loff_t origin = *offset;
219 unsigned long pg_start, pg_end; 219 unsigned long pg_start, pg_end;
220 220
221 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, 221 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
@@ -257,6 +257,23 @@ v9fs_file_write(struct file *filp, const char __user * data,
257 return total; 257 return total;
258} 258}
259 259
260static int v9fs_file_fsync(struct file *filp, struct dentry *dentry,
261 int datasync)
262{
263 struct p9_fid *fid;
264 struct p9_wstat wstat;
265 int retval;
266
267 P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp,
268 dentry, datasync);
269
270 fid = filp->private_data;
271 v9fs_blank_wstat(&wstat);
272
273 retval = p9_client_wstat(fid, &wstat);
274 return retval;
275}
276
260static const struct file_operations v9fs_cached_file_operations = { 277static const struct file_operations v9fs_cached_file_operations = {
261 .llseek = generic_file_llseek, 278 .llseek = generic_file_llseek,
262 .read = do_sync_read, 279 .read = do_sync_read,
@@ -266,6 +283,7 @@ static const struct file_operations v9fs_cached_file_operations = {
266 .release = v9fs_dir_release, 283 .release = v9fs_dir_release,
267 .lock = v9fs_file_lock, 284 .lock = v9fs_file_lock,
268 .mmap = generic_file_readonly_mmap, 285 .mmap = generic_file_readonly_mmap,
286 .fsync = v9fs_file_fsync,
269}; 287};
270 288
271const struct file_operations v9fs_file_operations = { 289const struct file_operations v9fs_file_operations = {
@@ -276,4 +294,5 @@ const struct file_operations v9fs_file_operations = {
276 .release = v9fs_dir_release, 294 .release = v9fs_dir_release,
277 .lock = v9fs_file_lock, 295 .lock = v9fs_file_lock,
278 .mmap = generic_file_readonly_mmap, 296 .mmap = generic_file_readonly_mmap,
297 .fsync = v9fs_file_fsync,
279}; 298};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 18f74ec4dce9..63c2b5af268a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,6 +34,7 @@
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/idr.h> 35#include <linux/idr.h>
36#include <linux/sched.h> 36#include <linux/sched.h>
37#include <linux/slab.h>
37#include <net/9p/9p.h> 38#include <net/9p/9p.h>
38#include <net/9p/client.h> 39#include <net/9p/client.h>
39 40
@@ -60,7 +61,7 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
60 res = mode & 0777; 61 res = mode & 0777;
61 if (S_ISDIR(mode)) 62 if (S_ISDIR(mode))
62 res |= P9_DMDIR; 63 res |= P9_DMDIR;
63 if (v9fs_extended(v9ses)) { 64 if (v9fs_proto_dotu(v9ses)) {
64 if (S_ISLNK(mode)) 65 if (S_ISLNK(mode))
65 res |= P9_DMSYMLINK; 66 res |= P9_DMSYMLINK;
66 if (v9ses->nodev == 0) { 67 if (v9ses->nodev == 0) {
@@ -102,21 +103,21 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
102 103
103 if ((mode & P9_DMDIR) == P9_DMDIR) 104 if ((mode & P9_DMDIR) == P9_DMDIR)
104 res |= S_IFDIR; 105 res |= S_IFDIR;
105 else if ((mode & P9_DMSYMLINK) && (v9fs_extended(v9ses))) 106 else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses)))
106 res |= S_IFLNK; 107 res |= S_IFLNK;
107 else if ((mode & P9_DMSOCKET) && (v9fs_extended(v9ses)) 108 else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses))
108 && (v9ses->nodev == 0)) 109 && (v9ses->nodev == 0))
109 res |= S_IFSOCK; 110 res |= S_IFSOCK;
110 else if ((mode & P9_DMNAMEDPIPE) && (v9fs_extended(v9ses)) 111 else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses))
111 && (v9ses->nodev == 0)) 112 && (v9ses->nodev == 0))
112 res |= S_IFIFO; 113 res |= S_IFIFO;
113 else if ((mode & P9_DMDEVICE) && (v9fs_extended(v9ses)) 114 else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
114 && (v9ses->nodev == 0)) 115 && (v9ses->nodev == 0))
115 res |= S_IFBLK; 116 res |= S_IFBLK;
116 else 117 else
117 res |= S_IFREG; 118 res |= S_IFREG;
118 119
119 if (v9fs_extended(v9ses)) { 120 if (v9fs_proto_dotu(v9ses)) {
120 if ((mode & P9_DMSETUID) == P9_DMSETUID) 121 if ((mode & P9_DMSETUID) == P9_DMSETUID)
121 res |= S_ISUID; 122 res |= S_ISUID;
122 123
@@ -176,7 +177,7 @@ int v9fs_uflags2omode(int uflags, int extended)
176 * 177 *
177 */ 178 */
178 179
179static void 180void
180v9fs_blank_wstat(struct p9_wstat *wstat) 181v9fs_blank_wstat(struct p9_wstat *wstat)
181{ 182{
182 wstat->type = ~0; 183 wstat->type = ~0;
@@ -265,7 +266,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
265 case S_IFBLK: 266 case S_IFBLK:
266 case S_IFCHR: 267 case S_IFCHR:
267 case S_IFSOCK: 268 case S_IFSOCK:
268 if (!v9fs_extended(v9ses)) { 269 if (!v9fs_proto_dotu(v9ses)) {
269 P9_DPRINTK(P9_DEBUG_ERROR, 270 P9_DPRINTK(P9_DEBUG_ERROR,
270 "special files without extended mode\n"); 271 "special files without extended mode\n");
271 err = -EINVAL; 272 err = -EINVAL;
@@ -278,7 +279,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
278 inode->i_fop = &v9fs_file_operations; 279 inode->i_fop = &v9fs_file_operations;
279 break; 280 break;
280 case S_IFLNK: 281 case S_IFLNK:
281 if (!v9fs_extended(v9ses)) { 282 if (!v9fs_proto_dotu(v9ses)) {
282 P9_DPRINTK(P9_DEBUG_ERROR, 283 P9_DPRINTK(P9_DEBUG_ERROR,
283 "extended modes used w/o 9P2000.u\n"); 284 "extended modes used w/o 9P2000.u\n");
284 err = -EINVAL; 285 err = -EINVAL;
@@ -288,7 +289,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
288 break; 289 break;
289 case S_IFDIR: 290 case S_IFDIR:
290 inc_nlink(inode); 291 inc_nlink(inode);
291 if (v9fs_extended(v9ses)) 292 if (v9fs_proto_dotu(v9ses))
292 inode->i_op = &v9fs_dir_inode_operations_ext; 293 inode->i_op = &v9fs_dir_inode_operations_ext;
293 else 294 else
294 inode->i_op = &v9fs_dir_inode_operations; 295 inode->i_op = &v9fs_dir_inode_operations;
@@ -575,7 +576,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
575 flags = O_RDWR; 576 flags = O_RDWR;
576 577
577 fid = v9fs_create(v9ses, dir, dentry, NULL, perm, 578 fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
578 v9fs_uflags2omode(flags, v9fs_extended(v9ses))); 579 v9fs_uflags2omode(flags,
580 v9fs_proto_dotu(v9ses)));
579 if (IS_ERR(fid)) { 581 if (IS_ERR(fid)) {
580 err = PTR_ERR(fid); 582 err = PTR_ERR(fid);
581 fid = NULL; 583 fid = NULL;
@@ -858,7 +860,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
858 if (iattr->ia_valid & ATTR_SIZE) 860 if (iattr->ia_valid & ATTR_SIZE)
859 wstat.length = iattr->ia_size; 861 wstat.length = iattr->ia_size;
860 862
861 if (v9fs_extended(v9ses)) { 863 if (v9fs_proto_dotu(v9ses)) {
862 if (iattr->ia_valid & ATTR_UID) 864 if (iattr->ia_valid & ATTR_UID)
863 wstat.n_uid = iattr->ia_uid; 865 wstat.n_uid = iattr->ia_uid;
864 866
@@ -886,6 +888,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
886 struct super_block *sb) 888 struct super_block *sb)
887{ 889{
888 char ext[32]; 890 char ext[32];
891 char tag_name[14];
892 unsigned int i_nlink;
889 struct v9fs_session_info *v9ses = sb->s_fs_info; 893 struct v9fs_session_info *v9ses = sb->s_fs_info;
890 894
891 inode->i_nlink = 1; 895 inode->i_nlink = 1;
@@ -897,11 +901,26 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
897 inode->i_uid = v9ses->dfltuid; 901 inode->i_uid = v9ses->dfltuid;
898 inode->i_gid = v9ses->dfltgid; 902 inode->i_gid = v9ses->dfltgid;
899 903
900 if (v9fs_extended(v9ses)) { 904 if (v9fs_proto_dotu(v9ses)) {
901 inode->i_uid = stat->n_uid; 905 inode->i_uid = stat->n_uid;
902 inode->i_gid = stat->n_gid; 906 inode->i_gid = stat->n_gid;
903 } 907 }
904 908 if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) {
909 if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) {
910 /*
911 * Hadlink support got added later to
912 * to the .u extension. So there can be
913 * server out there that doesn't support
914 * this even with .u extension. So check
915 * for non NULL stat->extension
916 */
917 strncpy(ext, stat->extension, sizeof(ext));
918 /* HARDLINKCOUNT %u */
919 sscanf(ext, "%13s %u", tag_name, &i_nlink);
920 if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
921 inode->i_nlink = i_nlink;
922 }
923 }
905 inode->i_mode = p9mode2unixmode(v9ses, stat->mode); 924 inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
906 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { 925 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
907 char type = 0; 926 char type = 0;
@@ -976,7 +995,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
976 if (IS_ERR(fid)) 995 if (IS_ERR(fid))
977 return PTR_ERR(fid); 996 return PTR_ERR(fid);
978 997
979 if (!v9fs_extended(v9ses)) 998 if (!v9fs_proto_dotu(v9ses))
980 return -EBADF; 999 return -EBADF;
981 1000
982 st = p9_client_stat(fid); 1001 st = p9_client_stat(fid);
@@ -1001,44 +1020,6 @@ done:
1001} 1020}
1002 1021
1003/** 1022/**
1004 * v9fs_vfs_readlink - read a symlink's location
1005 * @dentry: dentry for symlink
1006 * @buffer: buffer to load symlink location into
1007 * @buflen: length of buffer
1008 *
1009 */
1010
1011static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
1012 int buflen)
1013{
1014 int retval;
1015 int ret;
1016 char *link = __getname();
1017
1018 if (unlikely(!link))
1019 return -ENOMEM;
1020
1021 if (buflen > PATH_MAX)
1022 buflen = PATH_MAX;
1023
1024 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
1025 dentry);
1026
1027 retval = v9fs_readlink(dentry, link, buflen);
1028
1029 if (retval > 0) {
1030 if ((ret = copy_to_user(buffer, link, retval)) != 0) {
1031 P9_DPRINTK(P9_DEBUG_ERROR,
1032 "problem copying to user: %d\n", ret);
1033 retval = ret;
1034 }
1035 }
1036
1037 __putname(link);
1038 return retval;
1039}
1040
1041/**
1042 * v9fs_vfs_follow_link - follow a symlink path 1023 * v9fs_vfs_follow_link - follow a symlink path
1043 * @dentry: dentry for symlink 1024 * @dentry: dentry for symlink
1044 * @nd: nameidata 1025 * @nd: nameidata
@@ -1104,7 +1085,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1104 struct p9_fid *fid; 1085 struct p9_fid *fid;
1105 1086
1106 v9ses = v9fs_inode2v9ses(dir); 1087 v9ses = v9fs_inode2v9ses(dir);
1107 if (!v9fs_extended(v9ses)) { 1088 if (!v9fs_proto_dotu(v9ses)) {
1108 P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); 1089 P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
1109 return -EPERM; 1090 return -EPERM;
1110 } 1091 }
@@ -1230,7 +1211,6 @@ static const struct inode_operations v9fs_dir_inode_operations_ext = {
1230 .rmdir = v9fs_vfs_rmdir, 1211 .rmdir = v9fs_vfs_rmdir,
1231 .mknod = v9fs_vfs_mknod, 1212 .mknod = v9fs_vfs_mknod,
1232 .rename = v9fs_vfs_rename, 1213 .rename = v9fs_vfs_rename,
1233 .readlink = v9fs_vfs_readlink,
1234 .getattr = v9fs_vfs_getattr, 1214 .getattr = v9fs_vfs_getattr,
1235 .setattr = v9fs_vfs_setattr, 1215 .setattr = v9fs_vfs_setattr,
1236}; 1216};
@@ -1253,7 +1233,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
1253}; 1233};
1254 1234
1255static const struct inode_operations v9fs_symlink_inode_operations = { 1235static const struct inode_operations v9fs_symlink_inode_operations = {
1256 .readlink = v9fs_vfs_readlink, 1236 .readlink = generic_readlink,
1257 .follow_link = v9fs_vfs_follow_link, 1237 .follow_link = v9fs_vfs_follow_link,
1258 .put_link = v9fs_vfs_put_link, 1238 .put_link = v9fs_vfs_put_link,
1259 .getattr = v9fs_vfs_getattr, 1239 .getattr = v9fs_vfs_getattr,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 14a86448572c..a271549d9e21 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,7 @@
37#include <linux/mount.h> 37#include <linux/mount.h>
38#include <linux/idr.h> 38#include <linux/idr.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h>
40#include <net/9p/9p.h> 41#include <net/9p/9p.h>
41#include <net/9p/client.h> 42#include <net/9p/client.h>
42 43
@@ -188,7 +189,8 @@ static void v9fs_kill_super(struct super_block *s)
188 189
189 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 190 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
190 191
191 v9fs_dentry_release(s->s_root); /* clunk root */ 192 if (s->s_root)
193 v9fs_dentry_release(s->s_root); /* clunk root */
192 194
193 kill_anon_super(s); 195 kill_anon_super(s);
194 196
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d44efad7a5..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -177,6 +177,7 @@ source "fs/efs/Kconfig"
177source "fs/jffs2/Kconfig" 177source "fs/jffs2/Kconfig"
178# UBIFS File system configuration 178# UBIFS File system configuration
179source "fs/ubifs/Kconfig" 179source "fs/ubifs/Kconfig"
180source "fs/logfs/Kconfig"
180source "fs/cramfs/Kconfig" 181source "fs/cramfs/Kconfig"
181source "fs/squashfs/Kconfig" 182source "fs/squashfs/Kconfig"
182source "fs/freevxfs/Kconfig" 183source "fs/freevxfs/Kconfig"
@@ -234,6 +235,7 @@ config NFS_COMMON
234 235
235source "net/sunrpc/Kconfig" 236source "net/sunrpc/Kconfig"
236source "fs/smbfs/Kconfig" 237source "fs/smbfs/Kconfig"
238source "fs/ceph/Kconfig"
237source "fs/cifs/Kconfig" 239source "fs/cifs/Kconfig"
238source "fs/ncpfs/Kconfig" 240source "fs/ncpfs/Kconfig"
239source "fs/coda/Kconfig" 241source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index af6d04700d9c..97f340f14ba2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
99obj-$(CONFIG_UFS_FS) += ufs/ 99obj-$(CONFIG_UFS_FS) += ufs/
100obj-$(CONFIG_EFS_FS) += efs/ 100obj-$(CONFIG_EFS_FS) += efs/
101obj-$(CONFIG_JFFS2_FS) += jffs2/ 101obj-$(CONFIG_JFFS2_FS) += jffs2/
102obj-$(CONFIG_LOGFS) += logfs/
102obj-$(CONFIG_UBIFS_FS) += ubifs/ 103obj-$(CONFIG_UBIFS_FS) += ubifs/
103obj-$(CONFIG_AFFS_FS) += affs/ 104obj-$(CONFIG_AFFS_FS) += affs/
104obj-$(CONFIG_ROMFS_FS) += romfs/ 105obj-$(CONFIG_ROMFS_FS) += romfs/
@@ -124,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
124obj-$(CONFIG_BTRFS_FS) += btrfs/ 125obj-$(CONFIG_BTRFS_FS) += btrfs/
125obj-$(CONFIG_GFS2_FS) += gfs2/ 126obj-$(CONFIG_GFS2_FS) += gfs2/
126obj-$(CONFIG_EXOFS_FS) += exofs/ 127obj-$(CONFIG_EXOFS_FS) += exofs/
128obj-$(CONFIG_CEPH_FS) += ceph/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 9cc18775b832..2ff622f6f547 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -121,7 +121,7 @@ struct adfs_discmap {
121 121
122/* Inode stuff */ 122/* Inode stuff */
123struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); 123struct inode *adfs_iget(struct super_block *sb, struct object_info *obj);
124int adfs_write_inode(struct inode *inode,int unused); 124int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
125int adfs_notify_change(struct dentry *dentry, struct iattr *attr); 125int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
126 126
127/* map.c */ 127/* map.c */
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 3f57ce4bee5d..0f5e30978135 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/writeback.h>
12#include "adfs.h" 13#include "adfs.h"
13 14
14/* 15/*
@@ -360,7 +361,7 @@ out:
360 * The adfs-specific inode data has already been updated by 361 * The adfs-specific inode data has already been updated by
361 * adfs_notify_change() 362 * adfs_notify_change()
362 */ 363 */
363int adfs_write_inode(struct inode *inode, int wait) 364int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
364{ 365{
365 struct super_block *sb = inode->i_sb; 366 struct super_block *sb = inode->i_sb;
366 struct object_info obj; 367 struct object_info obj;
@@ -375,7 +376,7 @@ int adfs_write_inode(struct inode *inode, int wait)
375 obj.attr = ADFS_I(inode)->attr; 376 obj.attr = ADFS_I(inode)->attr;
376 obj.size = inode->i_size; 377 obj.size = inode->i_size;
377 378
378 ret = adfs_dir_update(sb, &obj, wait); 379 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
379 unlock_kernel(); 380 unlock_kernel();
380 return ret; 381 return ret;
381} 382}
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 6910a98bd73c..4a3af7075c1d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -13,6 +13,7 @@
13#include <linux/parser.h> 13#include <linux/parser.h>
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/slab.h>
16#include <linux/smp_lock.h> 17#include <linux/smp_lock.h>
17#include <linux/statfs.h> 18#include <linux/statfs.h>
18#include "adfs.h" 19#include "adfs.h"
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index e511dc621a2e..861dae68ac12 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -106,8 +106,8 @@ struct affs_sb_info {
	u32 s_last_bmap;
	struct buffer_head *s_bmap_bh;
	char *s_prefix;			/* Prefix for volumes and assigns. */
-	int s_prefix_len;		/* Length of prefix. */
	char s_volume[32];		/* Volume prefix for absolute symlinks. */
+	spinlock_t symlink_lock;	/* protects the previous two */
 };
 
 #define SF_INTL		0x0001		/* International filesystem. */
@@ -175,7 +175,8 @@ extern void affs_delete_inode(struct inode *inode);
 extern void affs_clear_inode(struct inode *inode);
 extern struct inode		*affs_iget(struct super_block *sb,
					unsigned long ino);
-extern int			 affs_write_inode(struct inode *inode, int);
+extern int			 affs_write_inode(struct inode *inode,
					struct writeback_control *wbc);
 extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type);
 
 /* file.c */
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index dc5ef14bdc1c..3e262711ae06 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -7,6 +7,7 @@
  * block allocation, deallocation, calculation of free space.
  */
 
+#include <linux/slab.h>
 #include "affs.h"
 
 /* This is, of course, shamelessly stolen from fs/minix */
@@ -128,7 +129,7 @@ err_range:
 /*
  * Allocate a block in the given allocation zone.
  * Since we have to byte-swap the bitmap on little-endian
- * machines, this is rather expensive. Therefor we will
+ * machines, this is rather expensive. Therefore we will
  * preallocate up to 16 blocks from the same word, if
  * possible. We are not doing preallocations in the
  * header zone, though.
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3c4ec7d864c4..f4b2a4ee4f91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
  * (C) 1991  Linus Torvalds - minix filesystem
  */
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
@@ -166,7 +167,7 @@ bad_inode:
 }
 
 int
-affs_write_inode(struct inode *inode, int unused)
+affs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
	struct super_block *sb = inode->i_sb;
	struct buffer_head *bh;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 960d336ec694..d70bbbac6b7b 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -341,10 +341,13 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
	p  = (char *)AFFS_HEAD(bh)->table;
	lc = '/';
	if (*symname == '/') {
+		struct affs_sb_info *sbi = AFFS_SB(sb);
		while (*symname == '/')
			symname++;
-		while (AFFS_SB(sb)->s_volume[i])	/* Cannot overflow */
-			*p++ = AFFS_SB(sb)->s_volume[i++];
+		spin_lock(&sbi->symlink_lock);
+		while (sbi->s_volume[i])	/* Cannot overflow */
+			*p++ = sbi->s_volume[i++];
+		spin_unlock(&sbi->symlink_lock);
	}
	while (i < maxlen && (c = *symname++)) {
		if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 104fdcb3a7fc..16a3e4765f68 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
@@ -203,7 +204,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
		switch (token) {
		case Opt_bs:
			if (match_int(&args[0], &n))
-				return -EINVAL;
+				return 0;
			if (n != 512 && n != 1024 && n != 2048
			    && n != 4096) {
				printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
@@ -213,7 +214,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
			break;
		case Opt_mode:
			if (match_octal(&args[0], &option))
-				return 1;
+				return 0;
			*mode = option & 0777;
			*mount_opts |= SF_SETMODE;
			break;
@@ -221,8 +222,6 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
			*mount_opts |= SF_MUFS;
			break;
		case Opt_prefix:
-			/* Free any previous prefix */
-			kfree(*prefix);
			*prefix = match_strdup(&args[0]);
			if (!*prefix)
				return 0;
@@ -233,21 +232,21 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
			break;
		case Opt_reserved:
			if (match_int(&args[0], reserved))
-				return 1;
+				return 0;
			break;
		case Opt_root:
			if (match_int(&args[0], root))
-				return 1;
+				return 0;
			break;
		case Opt_setgid:
			if (match_int(&args[0], &option))
-				return 1;
+				return 0;
			*gid = option;
			*mount_opts |= SF_SETGID;
			break;
		case Opt_setuid:
			if (match_int(&args[0], &option))
-				return -EINVAL;
+				return 0;
			*uid = option;
			*mount_opts |= SF_SETUID;
			break;
@@ -311,11 +310,14 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
		return -ENOMEM;
	sb->s_fs_info = sbi;
	mutex_init(&sbi->s_bmlock);
+	spin_lock_init(&sbi->symlink_lock);
 
	if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
				&blocksize,&sbi->s_prefix,
				sbi->s_volume, &mount_flags)) {
		printk(KERN_ERR "AFFS: Error parsing options\n");
+		kfree(sbi->s_prefix);
+		kfree(sbi);
		return -EINVAL;
	}
	/* N.B. after this point s_prefix must be released */
@@ -516,14 +518,18 @@ affs_remount(struct super_block *sb, int *flags, char *data)
	unsigned long		 mount_flags;
	int			 res = 0;
	char			*new_opts = kstrdup(data, GFP_KERNEL);
+	char			 volume[32];
+	char			*prefix = NULL;
 
	pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
 
	*flags |= MS_NODIRATIME;
 
+	memcpy(volume, sbi->s_volume, 32);
	if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block,
-			   &blocksize, &sbi->s_prefix, sbi->s_volume,
+			   &blocksize, &prefix, volume,
			   &mount_flags)) {
+		kfree(prefix);
		kfree(new_opts);
		return -EINVAL;
	}
@@ -534,6 +540,14 @@ affs_remount(struct super_block *sb, int *flags, char *data)
	sbi->s_mode  = mode;
	sbi->s_uid   = uid;
	sbi->s_gid   = gid;
+	/* protect against readers */
+	spin_lock(&sbi->symlink_lock);
+	if (prefix) {
+		kfree(sbi->s_prefix);
+		sbi->s_prefix = prefix;
+	}
+	memcpy(sbi->s_volume, volume, 32);
+	spin_unlock(&sbi->symlink_lock);
 
	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
		unlock_kernel();
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 41782539c907..ee00f08c4f53 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -20,7 +20,6 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
	int			 i, j;
	char			 c;
	char			 lc;
-	char			*pf;
 
	pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino);
 
@@ -32,11 +31,15 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
	j  = 0;
	lf = (struct slink_front *)bh->b_data;
	lc = 0;
-	pf = AFFS_SB(inode->i_sb)->s_prefix ? AFFS_SB(inode->i_sb)->s_prefix : "/";
 
	if (strchr(lf->symname,':')) {	/* Handle assign or volume name */
+		struct affs_sb_info *sbi = AFFS_SB(inode->i_sb);
+		char *pf;
+		spin_lock(&sbi->symlink_lock);
+		pf = sbi->s_prefix ? sbi->s_prefix : "/";
		while (i < 1023 && (c = pf[i]))
			link[i++] = c;
+		spin_unlock(&sbi->symlink_lock);
		while (i < 1023 && lf->symname[j] != ':')
			link[i++] = lf->symname[j++];
		if (i < 1023)
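
The affs changes above form one pattern: affs_remount() stages the new prefix and volume in locals and only publishes them under symlink_lock, while the symlink reader takes the same lock around its copy, so no reader ever sees a half-updated pair. A condensed sketch of that stage-then-publish discipline (illustrative only; the cfg type and names are hypothetical):

	struct cfg {
		spinlock_t lock;	/* protects prefix and volume */
		char *prefix;
		char volume[32];
	};

	static void cfg_publish(struct cfg *c, char *new_prefix, const char *new_volume)
	{
		/* build the new values first, then swap in one short critical section */
		spin_lock(&c->lock);
		if (new_prefix) {
			kfree(c->prefix);
			c->prefix = new_prefix;	/* takes ownership */
		}
		memcpy(c->volume, new_volume, sizeof(c->volume));
		spin_unlock(&c->lock);
	}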
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index e2b1d3f16519..0fb315dd4d2a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/slab.h>
 #include <linux/sched.h>
 #include "internal.h"
 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index eb765489164f..a3bcec75c54a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/ip.h>
 #include "internal.h"
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 88067f36e5e7..adc1cb771b57 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 39b301662f22..0df9bc2b724d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -12,10 +12,10 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 023b95b0d9d7..4bd0218473a9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/circ_buf.h>
 #include "internal.h"
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c048f0658751..d00b312e3110 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -16,7 +16,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6ece2a13bf71..c54dad4e6063 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -733,7 +733,6 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
			struct page *page, void *fsdata);
 extern int afs_writepage(struct page *, struct writeback_control *);
 extern int afs_writepages(struct address_space *, struct writeback_control *);
-extern int afs_write_inode(struct inode *, int);
 extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
 extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
			      unsigned long, loff_t);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5ffb570cd3a8..5e813a816ce4 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -12,11 +12,11 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bde3f19c0995..67cf810e0fd6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/slab.h>
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include <rxrpc/packet.h>
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
	if (!permits)
		goto out_unlock;
 
-	memcpy(permits->permits, xpermits->permits,
-	       count * sizeof(struct afs_permit));
+	if (xpermits)
+		memcpy(permits->permits, xpermits->permits,
+		       count * sizeof(struct afs_permit));
 
	_debug("key %x access %x",
	       key_serial(key), vnode->status.caller_access);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e1ea1c240b6a..14f6431598ad 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,6 @@ struct file_system_type afs_fs_type = {
 static const struct super_operations afs_super_ops = {
	.statfs		= afs_statfs,
	.alloc_inode	= afs_alloc_inode,
-	.write_inode	= afs_write_inode,
	.destroy_inode	= afs_destroy_inode,
	.clear_inode	= afs_clear_inode,
	.put_super	= afs_put_super,
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 36c1306e09e0..340afd0cd182 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6e689208def2..9ac260d1361d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 2f05c4fc2a70..25cf4c3f4ff7 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c63a3c8beb73..3bed54a294d4 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -585,27 +585,6 @@ int afs_writepages(struct address_space *mapping,
 }
 
 /*
- * write an inode back
- */
-int afs_write_inode(struct inode *inode, int sync)
-{
-	struct afs_vnode *vnode = AFS_FS_I(inode);
-	int ret;
-
-	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
-
-	ret = 0;
-	if (sync) {
-		ret = filemap_fdatawait(inode->i_mapping);
-		if (ret < 0)
-			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
  * completion of write to server
  */
 void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
@@ -671,7 +650,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
	ssize_t result;
	size_t count = iov_length(iov, nr_segs);
-	int ret;
 
	_enter("{%x.%u},{%zu},%lu,",
	       vnode->fid.vid, vnode->fid.vnode, count, nr_segs);
@@ -691,13 +669,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
		return result;
	}
 
-	/* return error values for O_SYNC and IS_SYNC() */
-	if (IS_SYNC(&vnode->vfs_inode) || iocb->ki_filp->f_flags & O_SYNC) {
-		ret = afs_fsync(iocb->ki_filp, dentry, 1);
-		if (ret < 0)
-			result = ret;
-	}
-
	_leave(" = %zd", result);
	return result;
 }
diff --git a/fs/aio.c b/fs/aio.c
index 02a2c9340573..1cf12b3dd83a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -15,6 +15,7 @@
 #include <linux/aio_abi.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 #include <linux/uio.h>
 
 #define DEBUG	0
@@ -32,6 +33,9 @@
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/eventfd.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/hash.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -60,6 +64,14 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
+#define AIO_BATCH_HASH_BITS	3 /* allocated on-stack, so don't go crazy */
+#define AIO_BATCH_HASH_SIZE	(1 << AIO_BATCH_HASH_BITS)
+struct aio_batch_entry {
+	struct hlist_node list;
+	struct address_space *mapping;
+};
+mempool_t *abe_pool;
+
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);
 
@@ -73,6 +85,8 @@ static int __init aio_setup(void)
	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
	aio_wq = create_workqueue("aio");
+	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
+	BUG_ON(!abe_pool);
 
	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
@@ -697,10 +711,8 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
	 */
	ret = retry(iocb);
 
-	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
-		BUG_ON(!list_empty(&iocb->ki_wait.task_list));
+	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED)
		aio_complete(iocb, ret, 0);
-	}
 out:
	spin_lock_irq(&ctx->ctx_lock);
 
@@ -852,13 +864,6 @@ static void try_queue_kicked_iocb(struct kiocb *iocb)
	unsigned long flags;
	int run = 0;
 
-	/* We're supposed to be the only path putting the iocb back on the run
-	 * list. If we find that the iocb is *back* on a wait queue already
-	 * than retry has happened before we could queue the iocb. This also
-	 * means that the retry could have completed and freed our iocb, no
-	 * good. */
-	BUG_ON((!list_empty(&iocb->ki_wait.task_list)));
-
	spin_lock_irqsave(&ctx->ctx_lock, flags);
	/* set this inside the lock so that we can't race with aio_run_iocb()
	 * testing it and putting the iocb on the run list under the lock */
@@ -872,7 +877,7 @@ static void try_queue_kicked_iocb(struct kiocb *iocb)
 /*
  * kick_iocb:
  *      Called typically from a wait queue callback context
- *      (aio_wake_function) to trigger a retry of the iocb.
+ *      to trigger a retry of the iocb.
  *      The retry is usually executed by aio workqueue
  *      threads (See aio_kick_handler).
  */
@@ -1506,33 +1511,44 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
	return 0;
 }
 
-/*
- * aio_wake_function:
- *	wait queue callback function for aio notification,
- *	Simply triggers a retry of the operation via kick_iocb.
- *
- *	This callback is specified in the wait queue entry in
- *	a kiocb.
- *
- * Note:
- * This routine is executed with the wait queue lock held.
- * Since kick_iocb acquires iocb->ctx->ctx_lock, it nests
- * the ioctx lock inside the wait queue lock. This is safe
- * because this callback isn't used for wait queues which
- * are nested inside ioctx lock (i.e. ctx->wait)
- */
-static int aio_wake_function(wait_queue_t *wait, unsigned mode,
-			     int sync, void *key)
+static void aio_batch_add(struct address_space *mapping,
+			  struct hlist_head *batch_hash)
+{
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos;
+	unsigned bucket;
+
+	bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
+	hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
+		if (abe->mapping == mapping)
+			return;
+	}
+
+	abe = mempool_alloc(abe_pool, GFP_KERNEL);
+	BUG_ON(!igrab(mapping->host));
+	abe->mapping = mapping;
+	hlist_add_head(&abe->list, &batch_hash[bucket]);
+	return;
+}
+
+static void aio_batch_free(struct hlist_head *batch_hash)
 {
-	struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait);
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos, *n;
+	int i;
 
-	list_del_init(&wait->task_list);
-	kick_iocb(iocb);
-	return 1;
+	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
+			blk_run_address_space(abe->mapping);
+			iput(abe->mapping->host);
+			hlist_del(&abe->list);
+			mempool_free(abe, abe_pool);
+		}
+	}
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb)
+			 struct iocb *iocb, struct hlist_head *batch_hash)
 {
	struct kiocb *req;
	struct file *file;
@@ -1592,8 +1608,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
	req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
	req->ki_opcode = iocb->aio_lio_opcode;
-	init_waitqueue_func_entry(&req->ki_wait, aio_wake_function);
-	INIT_LIST_HEAD(&req->ki_wait.task_list);
 
	ret = aio_setup_iocb(req);
 
@@ -1608,6 +1622,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
		;
	}
	spin_unlock_irq(&ctx->ctx_lock);
+	if (req->ki_opcode == IOCB_CMD_PREAD ||
+	    req->ki_opcode == IOCB_CMD_PREADV ||
+	    req->ki_opcode == IOCB_CMD_PWRITE ||
+	    req->ki_opcode == IOCB_CMD_PWRITEV)
+		aio_batch_add(file->f_mapping, batch_hash);
+
	aio_put_req(req);	/* drop extra ref to req */
	return 0;
 
@@ -1635,6 +1655,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
	struct kioctx *ctx;
	long ret = 0;
	int i;
+	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
 
	if (unlikely(nr < 0))
		return -EINVAL;
@@ -1666,10 +1687,11 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
			break;
		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp);
+		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
		if (ret)
			break;
	}
+	aio_batch_free(batch_hash);
 
	put_ioctx(ctx);
	return i ? i : ret;
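
The io_submit() change is the visible half of the batching: each queued read/write records its address_space at most once in the on-stack hash, and only after the whole batch has been queued does aio_batch_free() run the request queue once per distinct mapping instead of once per iocb. The resulting call shape, condensed for illustration (not a drop-in implementation):

	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
	long submitted;

	for (submitted = 0; submitted < nr; submitted++) {
		/* copy in the iocb and queue it; PREAD/PREADV/PWRITE/PWRITEV
		 * requests also call aio_batch_add(file->f_mapping, batch_hash),
		 * which dedupes by pointer via hash_ptr() */
		if (io_submit_one(ctx, user_iocb, &tmp, batch_hash))
			break;
	}
	/* one blk_run_address_space() + iput() per mapping seen above */
	aio_batch_free(batch_hash);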
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 2ca7a7cafdbf..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -12,7 +12,6 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -35,14 +34,13 @@ static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
			     mnt);
 }
 
-static int anon_inodefs_delete_dentry(struct dentry *dentry)
+/*
+ * anon_inodefs_dname() is called from d_path().
+ */
+static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
 {
-	/*
-	 * We faked vfs to believe the dentry was hashed when we created it.
-	 * Now we restore the flag so that dput() will work correctly.
-	 */
-	dentry->d_flags |= DCACHE_UNHASHED;
-	return 1;
+	return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s",
+				dentry->d_name.name);
 }
 
 static struct file_system_type anon_inode_fs_type = {
@@ -51,7 +49,7 @@ static struct file_system_type anon_inode_fs_type = {
	.kill_sb	= kill_anon_super,
 };
 static const struct dentry_operations anon_inodefs_dentry_operations = {
-	.d_delete	= anon_inodefs_delete_dentry,
+	.d_dname	= anon_inodefs_dname,
 };
 
 /*
@@ -88,7 +86,7 @@ struct file *anon_inode_getfile(const char *name,
				void *priv, int flags)
 {
	struct qstr this;
-	struct dentry *dentry;
+	struct path path;
	struct file *file;
	int error;
 
@@ -106,10 +104,11 @@ struct file *anon_inode_getfile(const char *name,
	this.name = name;
	this.len = strlen(name);
	this.hash = 0;
-	dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
-	if (!dentry)
+	path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+	if (!path.dentry)
		goto err_module;
 
+	path.mnt = mntget(anon_inode_mnt);
	/*
	 * We know the anon_inode inode count is always greater than zero,
	 * so we can avoid doing an igrab() and we can use an open-coded
@@ -117,27 +116,24 @@ struct file *anon_inode_getfile(const char *name,
	 */
	atomic_inc(&anon_inode_inode->i_count);
 
-	dentry->d_op = &anon_inodefs_dentry_operations;
-	/* Do not publish this dentry inside the global dentry hash table */
-	dentry->d_flags &= ~DCACHE_UNHASHED;
-	d_instantiate(dentry, anon_inode_inode);
+	path.dentry->d_op = &anon_inodefs_dentry_operations;
+	d_instantiate(path.dentry, anon_inode_inode);
 
	error = -ENFILE;
-	file = alloc_file(anon_inode_mnt, dentry,
-			  FMODE_READ | FMODE_WRITE, fops);
+	file = alloc_file(&path, OPEN_FMODE(flags), fops);
	if (!file)
		goto err_dput;
	file->f_mapping = anon_inode_inode->i_mapping;
 
	file->f_pos = 0;
-	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
+	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
	file->f_version = 0;
	file->private_data = priv;
 
	return file;
 
 err_dput:
-	dput(dentry);
+	path_put(&path);
 err_module:
	module_put(fops->owner);
	return ERR_PTR(error);
@@ -212,6 +208,7 @@ static struct inode *anon_inode_mkinode(void)
	inode->i_mode = S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
+	inode->i_flags |= S_PRIVATE;
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	return inode;
 }
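
anon_inodefs now names its dentries on demand through ->d_dname instead of keeping fake-hashed dentries, the same trick pipefs and sockfs use: d_path() invokes the callback and dynamic_dname() formats the synthetic name into the caller's buffer. A sketch for a similar pseudo filesystem (the foofs names are hypothetical):

	static char *foofs_dname(struct dentry *dentry, char *buffer, int buflen)
	{
		/* dynamic_dname() writes "foofs:[<ino>]" into buffer for d_path() */
		return dynamic_dname(dentry, buffer, buflen, "foofs:[%lu]",
					dentry->d_inode->i_ino);
	}

	static const struct dentry_operations foofs_dentry_operations = {
		.d_dname	= foofs_dname,
	};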
diff --git a/fs/attr.c b/fs/attr.c
index 96d394bdaddf..0815e93bb487 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -12,7 +12,6 @@
 #include <linux/capability.h>
 #include <linux/fsnotify.h>
 #include <linux/fcntl.h>
-#include <linux/quotaops.h>
 #include <linux/security.h>
 
 /* Taken over from the old code... */
@@ -82,7 +81,7 @@ int inode_newsize_ok(const struct inode *inode, loff_t offset)
	if (inode->i_size < offset) {
		unsigned long limit;
 
-		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		limit = rlimit(RLIMIT_FSIZE);
		if (limit != RLIM_INFINITY && offset > limit)
			goto out_sig;
		if (offset > inode->i_sb->s_maxbytes)
@@ -212,14 +211,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
		error = inode->i_op->setattr(dentry, attr);
	} else {
		error = inode_change_ok(inode, attr);
-		if (!error) {
-			if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-			    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
-				error = vfs_dq_transfer(inode, attr) ?
-					-EDQUOT : 0;
-			if (!error)
-				error = inode_setattr(inode, attr);
-		}
+		if (!error)
+			error = inode_setattr(inode, attr);
	}
 
	if (ia_valid & ATTR_SIZE)
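
rlimit() is a small accessor introduced for exactly this kind of call site; it reads the current task's soft limit. Roughly what it expands to, as an open-coded sketch matching the line it replaces (illustrative; the real helper lives in linux/sched.h):

	static inline unsigned long rlimit_cur_sketch(unsigned int limit)
	{
		/* equivalent of rlimit(limit) for the current task */
		return current->signal->rlim[limit].rlim_cur;
	}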
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401cea0a1..8713c7cfbc79 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -13,6 +13,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/smp_lock.h>
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 8f7cdde41733..3d283abf67d7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -75,6 +75,8 @@ struct autofs_info {
	struct completion expire_complete;
 
	struct list_head active;
+	int active_count;
+
	struct list_head expiring;
 
	struct autofs_sb_info *sbi;
@@ -95,6 +97,7 @@ struct autofs_info {
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
 #define AUTOFS_INF_MOUNTPOINT	(1<<1) /* mountpoint status for direct expire */
+#define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
 
 struct autofs_wait_queue {
	wait_queue_head_t queue;
@@ -161,7 +164,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
 {
	struct autofs_info *inf = autofs4_dentry_ino(dentry);
 
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
+	if (inf->flags & AUTOFS_INF_PENDING)
		return 1;
 
	if (inf->flags & AUTOFS_INF_EXPIRING)
@@ -264,5 +267,31 @@ out:
	return ret;
 }
 
+static inline void autofs4_add_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (list_empty(&ino->expiring))
+			list_add(&ino->expiring, &sbi->expiring_list);
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static inline void autofs4_del_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (!list_empty(&ino->expiring))
+			list_del_init(&ino->expiring);
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
 void autofs4_dentry_release(struct dentry *);
 extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 00bf8fcb245f..d29b7f6df862 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -22,6 +22,7 @@
 #include <linux/magic.h>
 #include <linux/dcache.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include "autofs_i.h"
 
@@ -544,10 +545,9 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
			goto out;
		devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
		err = 0;
-		if (path.dentry->d_inode &&
-		    path.mnt->mnt_root == path.dentry) {
+		if (path.mnt->mnt_root == path.dentry) {
			err = 1;
-			magic = path.dentry->d_inode->i_sb->s_magic;
+			magic = path.mnt->mnt_sb->s_magic;
		}
	} else {
		dev_t dev = sbi->sb->s_dev;
@@ -560,10 +560,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
		err = have_submounts(path.dentry);
 
-		if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
-			if (follow_down(&path))
-				magic = path.mnt->mnt_sb->s_magic;
-		}
+		if (follow_down(&path))
+			magic = path.mnt->mnt_sb->s_magic;
	}
 
	param->ismountpoint.out.devid = devid;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3da18d453488..a796c9417fb1 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -27,7 +27,7 @@ static inline int autofs4_can_expire(struct dentry *dentry,
		return 0;
 
	/* No point expiring a pending mount */
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
+	if (ino->flags & AUTOFS_INF_PENDING)
		return 0;
 
	if (!do_now) {
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 69c8142da838..821b2b955dac 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -49,6 +49,7 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
		ino->dentry = NULL;
		ino->size = 0;
		INIT_LIST_HEAD(&ino->active);
+		ino->active_count = 0;
		INIT_LIST_HEAD(&ino->expiring);
		atomic_set(&ino->count, 0);
	}
@@ -95,63 +96,6 @@ void autofs4_free_ino(struct autofs_info *ino)
	kfree(ino);
 }
 
-/*
- * Deal with the infamous "Busy inodes after umount ..." message.
- *
- * Clean up the dentry tree. This happens with autofs if the user
- * space program goes away due to a SIGKILL, SIGSEGV etc.
- */
-static void autofs4_force_release(struct autofs_sb_info *sbi)
-{
-	struct dentry *this_parent = sbi->sb->s_root;
-	struct list_head *next;
-
-	if (!sbi->sb->s_root)
-		return;
-
-	spin_lock(&dcache_lock);
-repeat:
-	next = this_parent->d_subdirs.next;
-resume:
-	while (next != &this_parent->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
-
-		/* Negative dentry - don`t care */
-		if (!simple_positive(dentry)) {
-			next = next->next;
-			continue;
-		}
-
-		if (!list_empty(&dentry->d_subdirs)) {
-			this_parent = dentry;
-			goto repeat;
-		}
-
-		next = next->next;
-		spin_unlock(&dcache_lock);
-
-		DPRINTK("dentry %p %.*s",
-			dentry, (int)dentry->d_name.len, dentry->d_name.name);
-
-		dput(dentry);
-		spin_lock(&dcache_lock);
-	}
-
-	if (this_parent != sbi->sb->s_root) {
-		struct dentry *dentry = this_parent;
-
-		next = this_parent->d_u.d_child.next;
-		this_parent = this_parent->d_parent;
-		spin_unlock(&dcache_lock);
-		DPRINTK("parent dentry %p %.*s",
-			dentry, (int)dentry->d_name.len, dentry->d_name.name);
-		dput(dentry);
-		spin_lock(&dcache_lock);
-		goto resume;
-	}
-	spin_unlock(&dcache_lock);
-}
-
 void autofs4_kill_sb(struct super_block *sb)
 {
	struct autofs_sb_info *sbi = autofs4_sbi(sb);
@@ -168,15 +112,12 @@ void autofs4_kill_sb(struct super_block *sb)
	/* Free wait queues, close pipe */
	autofs4_catatonic_mode(sbi);
 
-	/* Clean up and release dangling references */
-	autofs4_force_release(sbi);
-
	sb->s_fs_info = NULL;
	kfree(sbi);
 
 out_kill_sb:
	DPRINTK("shutting down");
-	kill_anon_super(sb);
+	kill_litter_super(sb);
 }
 
 static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
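
Dropping the hand-rolled autofs4_force_release() works because kill_litter_super() already performs the equivalent dentry-tree teardown through d_genocide(). Its in-tree definition is short (sketch of the fs/super.c helper of this era, for reference):

	void kill_litter_super(struct super_block *sb)
	{
		/* drop the references pinning dentries left behind by the
		 * daemon, then tear the anonymous superblock down */
		if (sb->s_root)
			d_genocide(sb->s_root);
		kill_anon_super(sb);
	}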
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index b96a3c57359d..109a6c606d92 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -15,6 +15,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include "autofs_i.h"
@@ -72,6 +73,46 @@ const struct inode_operations autofs4_dir_inode_operations = {
	.rmdir		= autofs4_dir_rmdir,
 };
 
+static void autofs4_add_active(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		if (!ino->active_count) {
+			if (list_empty(&ino->active))
+				list_add(&ino->active, &sbi->active_list);
+		}
+		ino->active_count++;
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static void autofs4_del_active(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		spin_lock(&sbi->lookup_lock);
+		ino->active_count--;
+		if (!ino->active_count) {
+			if (!list_empty(&ino->active))
+				list_del_init(&ino->active);
+		}
+		spin_unlock(&sbi->lookup_lock);
+	}
+	return;
+}
+
+static unsigned int autofs4_need_mount(unsigned int flags)
+{
+	unsigned int res = 0;
+	if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
+		res = 1;
+	return res;
+}
+
 static int autofs4_dir_open(struct inode *inode, struct file *file)
 {
	struct dentry *dentry = file->f_path.dentry;
@@ -93,7 +134,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
	 * it.
	 */
	spin_lock(&dcache_lock);
-	if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
+	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
		spin_unlock(&dcache_lock);
		return -ENOENT;
	}
@@ -126,32 +167,32 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 
	/* Turn this into a real negative dentry? */
	if (status == -ENOENT) {
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-		spin_unlock(&dentry->d_lock);
+		spin_lock(&sbi->fs_lock);
+		ino->flags &= ~AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
		return status;
	} else if (status) {
		/* Return a negative dentry, but leave it "pending" */
		return status;
	}
	/* Trigger mount for path component or follow link */
-	} else if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
-		   flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) ||
+	} else if (ino->flags & AUTOFS_INF_PENDING ||
+		   autofs4_need_mount(flags) ||
		   current->link_count) {
		DPRINTK("waiting for mount name=%.*s",
			dentry->d_name.len, dentry->d_name.name);
 
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-		spin_unlock(&dentry->d_lock);
+		spin_lock(&sbi->fs_lock);
+		ino->flags |= AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
 
		DPRINTK("mount done status=%d", status);
 
		if (status) {
-			spin_lock(&dentry->d_lock);
-			dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-			spin_unlock(&dentry->d_lock);
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
			return status;
		}
	}
@@ -160,9 +201,9 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
	if (ino)
		ino->last_used = jiffies;
 
-	spin_lock(&dentry->d_lock);
-	dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-	spin_unlock(&dentry->d_lock);
+	spin_lock(&sbi->fs_lock);
+	ino->flags &= ~AUTOFS_INF_PENDING;
+	spin_unlock(&sbi->fs_lock);
 
	return 0;
 }
@@ -202,19 +243,24 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
	autofs4_expire_wait(dentry);
 
	/* We trigger a mount for almost all flags */
-	lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS);
-	if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING))
+	lookup_type = autofs4_need_mount(nd->flags);
+	spin_lock(&sbi->fs_lock);
+	spin_lock(&dcache_lock);
+	if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
+		spin_unlock(&dcache_lock);
+		spin_unlock(&sbi->fs_lock);
		goto follow;
+	}
 
	/*
	 * If the dentry contains directories then it is an autofs
	 * multi-mount with no root mount offset. So don't try to
	 * mount it again.
	 */
-	spin_lock(&dcache_lock);
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
-	    (!d_mountpoint(dentry) && __simple_empty(dentry))) {
+	if (ino->flags & AUTOFS_INF_PENDING ||
+	    (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
		spin_unlock(&dcache_lock);
+		spin_unlock(&sbi->fs_lock);
 
		status = try_to_fill_dentry(dentry, 0);
		if (status)
@@ -223,6 +269,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
		goto follow;
	}
	spin_unlock(&dcache_lock);
+	spin_unlock(&sbi->fs_lock);
 follow:
	/*
	 * If there is no root mount it must be an autofs
@@ -294,8 +341,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
	/* Check for a non-mountpoint directory with no contents */
	spin_lock(&dcache_lock);
	if (S_ISDIR(dentry->d_inode->i_mode) &&
-	    !d_mountpoint(dentry) &&
-	    __simple_empty(dentry)) {
+	    !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
		DPRINTK("dentry=%p %.*s, emptydir",
			dentry, dentry->d_name.len, dentry->d_name.name);
		spin_unlock(&dcache_lock);
@@ -359,8 +405,11 @@ static const struct dentry_operations autofs4_dentry_operations = {
	.d_release	= autofs4_dentry_release,
 };
 
-static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct dentry *parent = dentry->d_parent;
+	struct qstr *name = &dentry->d_name;
	unsigned int len = name->len;
	unsigned int hash = name->hash;
	const unsigned char *str = name->name;
@@ -371,23 +420,23 @@ static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct d
	head = &sbi->active_list;
	list_for_each(p, head) {
		struct autofs_info *ino;
-		struct dentry *dentry;
+		struct dentry *active;
		struct qstr *qstr;
 
		ino = list_entry(p, struct autofs_info, active);
-		dentry = ino->dentry;
+		active = ino->dentry;
 
-		spin_lock(&dentry->d_lock);
+		spin_lock(&active->d_lock);
 
		/* Already gone? */
-		if (atomic_read(&dentry->d_count) == 0)
+		if (atomic_read(&active->d_count) == 0)
			goto next;
 
-		qstr = &dentry->d_name;
+		qstr = &active->d_name;
 
-		if (dentry->d_name.hash != hash)
+		if (active->d_name.hash != hash)
			goto next;
-		if (dentry->d_parent != parent)
+		if (active->d_parent != parent)
			goto next;
 
		if (qstr->len != len)
@@ -395,15 +444,15 @@ static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct d
		if (memcmp(qstr->name, str, len))
			goto next;
 
-		if (d_unhashed(dentry)) {
-			dget(dentry);
-			spin_unlock(&dentry->d_lock);
+		if (d_unhashed(active)) {
+			dget(active);
+			spin_unlock(&active->d_lock);
			spin_unlock(&sbi->lookup_lock);
			spin_unlock(&dcache_lock);
-			return dentry;
+			return active;
		}
 next:
-		spin_unlock(&dentry->d_lock);
+		spin_unlock(&active->d_lock);
	}
	spin_unlock(&sbi->lookup_lock);
	spin_unlock(&dcache_lock);
@@ -411,8 +460,11 @@ next:
	return NULL;
 }
 
-static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct dentry *parent = dentry->d_parent;
+	struct qstr *name = &dentry->d_name;
	unsigned int len = name->len;
	unsigned int hash = name->hash;
	const unsigned char *str = name->name;
@@ -423,23 +475,23 @@ static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct
	head = &sbi->expiring_list;
	list_for_each(p, head) {
		struct autofs_info *ino;
-		struct dentry *dentry;
+		struct dentry *expiring;
		struct qstr *qstr;
 
		ino = list_entry(p, struct autofs_info, expiring);
-		dentry = ino->dentry;
+		expiring = ino->dentry;
 
-		spin_lock(&dentry->d_lock);
+		spin_lock(&expiring->d_lock);
 
		/* Bad luck, we've already been dentry_iput */
-		if (!dentry->d_inode)
+		if (!expiring->d_inode)
			goto next;
 
-		qstr = &dentry->d_name;
+		qstr = &expiring->d_name;
 
-		if (dentry->d_name.hash != hash)
+		if (expiring->d_name.hash != hash)
			goto next;
-		if (dentry->d_parent != parent)
+		if (expiring->d_parent != parent)
			goto next;
 
		if (qstr->len != len)
@@ -447,15 +499,15 @@ static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct
		if (memcmp(qstr->name, str, len))
			goto next;
 
-		if (d_unhashed(dentry)) {
-			dget(dentry);
-			spin_unlock(&dentry->d_lock);
+		if (d_unhashed(expiring)) {
+			dget(expiring);
+			spin_unlock(&expiring->d_lock);
			spin_unlock(&sbi->lookup_lock);
			spin_unlock(&dcache_lock);
-			return dentry;
+			return expiring;
		}
 next:
-		spin_unlock(&dentry->d_lock);
+		spin_unlock(&expiring->d_lock);
	}
	spin_unlock(&sbi->lookup_lock);
	spin_unlock(&dcache_lock);
@@ -468,7 +520,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 {
	struct autofs_sb_info *sbi;
	struct autofs_info *ino;
-	struct dentry *expiring, *unhashed;
+	struct dentry *expiring, *active;
	int oz_mode;
 
	DPRINTK("name = %.*s",
@@ -484,10 +536,11 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
-	if (unhashed)
-		dentry = unhashed;
-	else {
+	active = autofs4_lookup_active(dentry);
+	if (active) {
+		dentry = active;
+		ino = autofs4_dentry_ino(dentry);
+	} else {
		/*
		 * Mark the dentry incomplete but don't hash it. We do this
		 * to serialize our inode creation operations (symlink and
@@ -513,36 +566,28 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
		dentry->d_fsdata = ino;
		ino->dentry = dentry;
 
-		spin_lock(&sbi->lookup_lock);
-		list_add(&ino->active, &sbi->active_list);
-		spin_unlock(&sbi->lookup_lock);
+		autofs4_add_active(dentry);
 
		d_instantiate(dentry, NULL);
	}
 
	if (!oz_mode) {
		mutex_unlock(&dir->i_mutex);
-		expiring = autofs4_lookup_expiring(sbi,
-						   dentry->d_parent,
-						   &dentry->d_name);
+		expiring = autofs4_lookup_expiring(dentry);
		if (expiring) {
			/*
			 * If we are racing with expire the request might not
			 * be quite complete but the directory has been removed
			 * so it must have been successful, so just wait for it.
			 */
-			ino = autofs4_dentry_ino(expiring);
			autofs4_expire_wait(expiring);
-			spin_lock(&sbi->lookup_lock);
-			if (!list_empty(&ino->expiring))
-				list_del_init(&ino->expiring);
-			spin_unlock(&sbi->lookup_lock);
+			autofs4_del_expiring(expiring);
			dput(expiring);
		}
 
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-		spin_unlock(&dentry->d_lock);
+		spin_lock(&sbi->fs_lock);
+		ino->flags |= AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
		if (dentry->d_op && dentry->d_op->d_revalidate)
			(dentry->d_op->d_revalidate)(dentry, nd);
		mutex_lock(&dir->i_mutex);
@@ -552,22 +597,22 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
	 * If we are still pending, check if we had to handle
	 * a signal. If so we can force a restart..
	 */
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
+	if (ino->flags & AUTOFS_INF_PENDING) {
		/* See if we were interrupted */
		if (signal_pending(current)) {
			sigset_t *sigset = &current->pending.signal;
			if (sigismember (sigset, SIGKILL) ||
			    sigismember (sigset, SIGQUIT) ||
			    sigismember (sigset, SIGINT)) {
-				if (unhashed)
-					dput(unhashed);
+				if (active)
+					dput(active);
				return ERR_PTR(-ERESTARTNOINTR);
			}
		}
		if (!oz_mode) {
-			spin_lock(&dentry->d_lock);
-			dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-			spin_unlock(&dentry->d_lock);
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
		}
	}
 
@@ -592,14 +637,14 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
		else
			dentry = ERR_PTR(-ENOENT);
 
-		if (unhashed)
-			dput(unhashed);
+		if (active)
+			dput(active);
 
		return dentry;
	}
 
-	if (unhashed)
-		return unhashed;
+	if (active)
+		return active;
 
	return NULL;
 }
@@ -624,10 +669,7 @@ static int autofs4_dir_symlink(struct inode *dir,
624 if (!ino) 669 if (!ino)
625 return -ENOMEM; 670 return -ENOMEM;
626 671
627 spin_lock(&sbi->lookup_lock); 672 autofs4_del_active(dentry);
628 if (!list_empty(&ino->active))
629 list_del_init(&ino->active);
630 spin_unlock(&sbi->lookup_lock);
631 673
632 ino->size = strlen(symname); 674 ino->size = strlen(symname);
633 cp = kmalloc(ino->size + 1, GFP_KERNEL); 675 cp = kmalloc(ino->size + 1, GFP_KERNEL);
@@ -705,10 +747,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
705 dir->i_mtime = CURRENT_TIME; 747 dir->i_mtime = CURRENT_TIME;
706 748
707 spin_lock(&dcache_lock); 749 spin_lock(&dcache_lock);
708 spin_lock(&sbi->lookup_lock); 750 autofs4_add_expiring(dentry);
709 if (list_empty(&ino->expiring))
710 list_add(&ino->expiring, &sbi->expiring_list);
711 spin_unlock(&sbi->lookup_lock);
712 spin_lock(&dentry->d_lock); 751 spin_lock(&dentry->d_lock);
713 __d_drop(dentry); 752 __d_drop(dentry);
714 spin_unlock(&dentry->d_lock); 753 spin_unlock(&dentry->d_lock);
@@ -734,10 +773,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
734 spin_unlock(&dcache_lock); 773 spin_unlock(&dcache_lock);
735 return -ENOTEMPTY; 774 return -ENOTEMPTY;
736 } 775 }
737 spin_lock(&sbi->lookup_lock); 776 autofs4_add_expiring(dentry);
738 if (list_empty(&ino->expiring))
739 list_add(&ino->expiring, &sbi->expiring_list);
740 spin_unlock(&sbi->lookup_lock);
741 spin_lock(&dentry->d_lock); 777 spin_lock(&dentry->d_lock);
742 __d_drop(dentry); 778 __d_drop(dentry);
743 spin_unlock(&dentry->d_lock); 779 spin_unlock(&dentry->d_lock);
@@ -775,10 +811,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
775 if (!ino) 811 if (!ino)
776 return -ENOMEM; 812 return -ENOMEM;
777 813
778 spin_lock(&sbi->lookup_lock); 814 autofs4_del_active(dentry);
779 if (!list_empty(&ino->active))
780 list_del_init(&ino->active);
781 spin_unlock(&sbi->lookup_lock);
782 815
783 inode = autofs4_get_inode(dir->i_sb, ino); 816 inode = autofs4_get_inode(dir->i_sb, ino);
784 if (!inode) { 817 if (!inode) {
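
Reviewer note: the autofs4/root.c hunks above replace four open-coded list manipulations with helpers (autofs4_add_active, autofs4_del_active, autofs4_add_expiring, autofs4_del_expiring) and narrow autofs4_lookup_active()/autofs4_lookup_expiring() to take only the dentry. The helper bodies are not part of this section; below is a minimal sketch of two of them, inferred directly from the sequences they replace. autofs4_sbi() and autofs4_dentry_ino() are the usual autofs_i.h accessors, and the exact placement of these helpers is an assumption.

    /* Sketch only: bodies inferred from the removed open-coded sequences. */
    static void autofs4_add_active(struct dentry *dentry)
    {
            struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
            struct autofs_info *ino = autofs4_dentry_ino(dentry);

            if (ino) {
                    spin_lock(&sbi->lookup_lock);
                    list_add(&ino->active, &sbi->active_list);
                    spin_unlock(&sbi->lookup_lock);
            }
    }

    static void autofs4_del_expiring(struct dentry *dentry)
    {
            struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
            struct autofs_info *ino = autofs4_dentry_ino(dentry);

            if (ino) {
                    spin_lock(&sbi->lookup_lock);
                    if (!list_empty(&ino->expiring))
                            list_del_init(&ino->expiring);
                    spin_unlock(&sbi->lookup_lock);
            }
    }

The add/del pairs presumably keep the list_empty()/!list_empty() guards visible in the removed unlink/rmdir code, so a repeated add or del stays harmless.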
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index e3287d0d1a58..59096b5e0fc7 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -11,7 +11,6 @@
11 */ 11 */
12 12
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
15#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
16#include <linux/string.h> 15#include <linux/string.h>
17 16
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 33baf27fac78..34ddda888e63 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -873,6 +873,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
873 brelse(bh); 873 brelse(bh);
874 874
875 unacquire_priv_sbp: 875 unacquire_priv_sbp:
876 kfree(befs_sb->mount_opts.iocharset);
876 kfree(sb->s_fs_info); 877 kfree(sb->s_fs_info);
877 878
878 unacquire_none: 879 unacquire_none:
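
Reviewer note: the added kfree() closes a memory leak on the befs_fill_super() failure path: mount_opts.iocharset is presumably duplicated during option parsing (the parse site is not in this section) and was never freed when setup failed afterwards. A reduced illustration of the pattern being fixed, with hypothetical names:

    #include <linux/slab.h>
    #include <linux/string.h>

    /* Hypothetical reduction: a string duplicated early must be freed on
     * every later failure path, not only on unmount. */
    static int example_fill(const char *opt)
    {
            char *iocharset = kstrdup(opt, GFP_KERNEL);
            void *priv;

            if (!iocharset)
                    return -ENOMEM;

            priv = kzalloc(64, GFP_KERNEL);
            if (!priv) {
                    kfree(iocharset);  /* the analogue of the added kfree() */
                    return -ENOMEM;
            }

            kfree(priv);
            kfree(iocharset);
            return 0;
    }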
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 6f60336c6628..f22a7d3dc362 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
15#include <linux/smp_lock.h> 15#include <linux/smp_lock.h>
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include <linux/vfs.h> 17#include <linux/vfs.h>
18#include <linux/writeback.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include "bfs.h" 20#include "bfs.h"
20 21
@@ -98,7 +99,7 @@ error:
98 return ERR_PTR(-EIO); 99 return ERR_PTR(-EIO);
99} 100}
100 101
101static int bfs_write_inode(struct inode *inode, int wait) 102static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
102{ 103{
103 struct bfs_sb_info *info = BFS_SB(inode->i_sb); 104 struct bfs_sb_info *info = BFS_SB(inode->i_sb);
104 unsigned int ino = (u16)inode->i_ino; 105 unsigned int ino = (u16)inode->i_ino;
@@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait)
147 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); 148 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
148 149
149 mark_buffer_dirty(bh); 150 mark_buffer_dirty(bh);
150 if (wait) { 151 if (wbc->sync_mode == WB_SYNC_ALL) {
151 sync_dirty_buffer(bh); 152 sync_dirty_buffer(bh);
152 if (buffer_req(bh) && !buffer_uptodate(bh)) 153 if (buffer_req(bh) && !buffer_uptodate(bh))
153 err = -EIO; 154 err = -EIO;
@@ -353,35 +354,35 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
353 struct inode *inode; 354 struct inode *inode;
354 unsigned i, imap_len; 355 unsigned i, imap_len;
355 struct bfs_sb_info *info; 356 struct bfs_sb_info *info;
356 long ret = -EINVAL; 357 int ret = -EINVAL;
357 unsigned long i_sblock, i_eblock, i_eoff, s_size; 358 unsigned long i_sblock, i_eblock, i_eoff, s_size;
358 359
359 info = kzalloc(sizeof(*info), GFP_KERNEL); 360 info = kzalloc(sizeof(*info), GFP_KERNEL);
360 if (!info) 361 if (!info)
361 return -ENOMEM; 362 return -ENOMEM;
363 mutex_init(&info->bfs_lock);
362 s->s_fs_info = info; 364 s->s_fs_info = info;
363 365
364 sb_set_blocksize(s, BFS_BSIZE); 366 sb_set_blocksize(s, BFS_BSIZE);
365 367
366 bh = sb_bread(s, 0); 368 info->si_sbh = sb_bread(s, 0);
367 if(!bh) 369 if (!info->si_sbh)
368 goto out; 370 goto out;
369 bfs_sb = (struct bfs_super_block *)bh->b_data; 371 bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data;
370 if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { 372 if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
371 if (!silent) 373 if (!silent)
372 printf("No BFS filesystem on %s (magic=%08x)\n", 374 printf("No BFS filesystem on %s (magic=%08x)\n",
373 s->s_id, le32_to_cpu(bfs_sb->s_magic)); 375 s->s_id, le32_to_cpu(bfs_sb->s_magic));
374 goto out; 376 goto out1;
375 } 377 }
376 if (BFS_UNCLEAN(bfs_sb, s) && !silent) 378 if (BFS_UNCLEAN(bfs_sb, s) && !silent)
377 printf("%s is unclean, continuing\n", s->s_id); 379 printf("%s is unclean, continuing\n", s->s_id);
378 380
379 s->s_magic = BFS_MAGIC; 381 s->s_magic = BFS_MAGIC;
380 info->si_sbh = bh;
381 382
382 if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { 383 if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
383 printf("Superblock is corrupted\n"); 384 printf("Superblock is corrupted\n");
384 goto out; 385 goto out1;
385 } 386 }
386 387
387 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / 388 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
@@ -390,7 +391,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
390 imap_len = (info->si_lasti / 8) + 1; 391 imap_len = (info->si_lasti / 8) + 1;
391 info->si_imap = kzalloc(imap_len, GFP_KERNEL); 392 info->si_imap = kzalloc(imap_len, GFP_KERNEL);
392 if (!info->si_imap) 393 if (!info->si_imap)
393 goto out; 394 goto out1;
394 for (i = 0; i < BFS_ROOT_INO; i++) 395 for (i = 0; i < BFS_ROOT_INO; i++)
395 set_bit(i, info->si_imap); 396 set_bit(i, info->si_imap);
396 397
@@ -398,15 +399,13 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
398 inode = bfs_iget(s, BFS_ROOT_INO); 399 inode = bfs_iget(s, BFS_ROOT_INO);
399 if (IS_ERR(inode)) { 400 if (IS_ERR(inode)) {
400 ret = PTR_ERR(inode); 401 ret = PTR_ERR(inode);
401 kfree(info->si_imap); 402 goto out2;
402 goto out;
403 } 403 }
404 s->s_root = d_alloc_root(inode); 404 s->s_root = d_alloc_root(inode);
405 if (!s->s_root) { 405 if (!s->s_root) {
406 iput(inode); 406 iput(inode);
407 ret = -ENOMEM; 407 ret = -ENOMEM;
408 kfree(info->si_imap); 408 goto out2;
409 goto out;
410 } 409 }
411 410
412 info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; 411 info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
@@ -419,10 +418,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
419 bh = sb_bread(s, info->si_blocks - 1); 418 bh = sb_bread(s, info->si_blocks - 1);
420 if (!bh) { 419 if (!bh) {
421 printf("Last block not available: %lu\n", info->si_blocks - 1); 420 printf("Last block not available: %lu\n", info->si_blocks - 1);
422 iput(inode);
423 ret = -EIO; 421 ret = -EIO;
424 kfree(info->si_imap); 422 goto out3;
425 goto out;
426 } 423 }
427 brelse(bh); 424 brelse(bh);
428 425
@@ -459,11 +456,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
459 printf("Inode 0x%08x corrupted\n", i); 456 printf("Inode 0x%08x corrupted\n", i);
460 457
461 brelse(bh); 458 brelse(bh);
462 s->s_root = NULL; 459 ret = -EIO;
463 kfree(info->si_imap); 460 goto out3;
464 kfree(info);
465 s->s_fs_info = NULL;
466 return -EIO;
467 } 461 }
468 462
469 if (!di->i_ino) { 463 if (!di->i_ino) {
@@ -483,11 +477,17 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
483 s->s_dirt = 1; 477 s->s_dirt = 1;
484 } 478 }
485 dump_imap("read_super", s); 479 dump_imap("read_super", s);
486 mutex_init(&info->bfs_lock);
487 return 0; 480 return 0;
488 481
482out3:
483 dput(s->s_root);
484 s->s_root = NULL;
485out2:
486 kfree(info->si_imap);
487out1:
488 brelse(info->si_sbh);
489out: 489out:
490 brelse(bh); 490 mutex_destroy(&info->bfs_lock);
491 kfree(info); 491 kfree(info);
492 s->s_fs_info = NULL; 492 s->s_fs_info = NULL;
493 return ret; 493 return ret;
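
Reviewer note: the bfs_fill_super() hunks above consolidate the scattered error handling into one unwind ladder, releasing resources strictly in reverse order of acquisition. Summarized from the new code:

    /*
     *  acquisition                        matching unwind
     *  -------------------------------    ---------------------------------
     *  mutex_init(&info->bfs_lock)        out:  mutex_destroy(), kfree(info)
     *  info->si_sbh = sb_bread(s, 0)      out1: brelse(info->si_sbh)
     *  info->si_imap = kzalloc(...)       out2: kfree(info->si_imap)
     *  s->s_root = d_alloc_root(inode)    out3: dput(s->s_root)
     */

Also worth noting: the superblock buffer now goes straight into info->si_sbh instead of a local bh, which is what lets the ladder release it without extra bookkeeping.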
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index b639dcf7c778..f96eff04e11a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -20,10 +20,11 @@
20#include <linux/fcntl.h> 20#include <linux/fcntl.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/user.h> 22#include <linux/user.h>
23#include <linux/slab.h>
24#include <linux/binfmts.h> 23#include <linux/binfmts.h>
25#include <linux/personality.h> 24#include <linux/personality.h>
26#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/coredump.h>
27#include <linux/slab.h>
27 28
28#include <asm/system.h> 29#include <asm/system.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
@@ -32,7 +33,7 @@
32 33
33static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); 34static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
34static int load_aout_library(struct file*); 35static int load_aout_library(struct file*);
35static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); 36static int aout_core_dump(struct coredump_params *cprm);
36 37
37static struct linux_binfmt aout_format = { 38static struct linux_binfmt aout_format = {
38 .module = THIS_MODULE, 39 .module = THIS_MODULE,
@@ -60,26 +61,6 @@ static int set_brk(unsigned long start, unsigned long end)
60} 61}
61 62
62/* 63/*
63 * These are the only things you should do on a core-file: use only these
64 * macros to write out all the necessary info.
65 */
66
67static int dump_write(struct file *file, const void *addr, int nr)
68{
69 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
70}
71
72#define DUMP_WRITE(addr, nr) \
73 if (!dump_write(file, (void *)(addr), (nr))) \
74 goto end_coredump;
75
76#define DUMP_SEEK(offset) \
77if (file->f_op->llseek) { \
78 if (file->f_op->llseek(file,(offset),0) != (offset)) \
79 goto end_coredump; \
80} else file->f_pos = (offset)
81
82/*
83 * Routine writes a core dump image in the current directory. 64 * Routine writes a core dump image in the current directory.
84 * Currently only a stub-function. 65 * Currently only a stub-function.
85 * 66 *
@@ -89,18 +70,21 @@ if (file->f_op->llseek) { \
89 * dumping of the process results in another error.. 70 * dumping of the process results in another error..
90 */ 71 */
91 72
92static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) 73static int aout_core_dump(struct coredump_params *cprm)
93{ 74{
75 struct file *file = cprm->file;
94 mm_segment_t fs; 76 mm_segment_t fs;
95 int has_dumped = 0; 77 int has_dumped = 0;
96 unsigned long dump_start, dump_size; 78 void __user *dump_start;
79 int dump_size;
97 struct user dump; 80 struct user dump;
98#ifdef __alpha__ 81#ifdef __alpha__
99# define START_DATA(u) (u.start_data) 82# define START_DATA(u) ((void __user *)u.start_data)
100#else 83#else
101# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code) 84# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
85 u.start_code))
102#endif 86#endif
103# define START_STACK(u) (u.start_stack) 87# define START_STACK(u) ((void __user *)u.start_stack)
104 88
105 fs = get_fs(); 89 fs = get_fs();
106 set_fs(KERNEL_DS); 90 set_fs(KERNEL_DS);
@@ -108,47 +92,52 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u
108 current->flags |= PF_DUMPCORE; 92 current->flags |= PF_DUMPCORE;
109 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); 93 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
110 dump.u_ar0 = offsetof(struct user, regs); 94 dump.u_ar0 = offsetof(struct user, regs);
111 dump.signal = signr; 95 dump.signal = cprm->signr;
112 aout_dump_thread(regs, &dump); 96 aout_dump_thread(cprm->regs, &dump);
113 97
114/* If the size of the dump file exceeds the rlimit, then see what would happen 98/* If the size of the dump file exceeds the rlimit, then see what would happen
115 if we wrote the stack, but not the data area. */ 99 if we wrote the stack, but not the data area. */
116 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) 100 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit)
117 dump.u_dsize = 0; 101 dump.u_dsize = 0;
118 102
119/* Make sure we have enough room to write the stack and data areas. */ 103/* Make sure we have enough room to write the stack and data areas. */
120 if ((dump.u_ssize + 1) * PAGE_SIZE > limit) 104 if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
121 dump.u_ssize = 0; 105 dump.u_ssize = 0;
122 106
123/* make sure we actually have a data and stack area to dump */ 107/* make sure we actually have a data and stack area to dump */
124 set_fs(USER_DS); 108 set_fs(USER_DS);
125 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 109 if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
126 dump.u_dsize = 0; 110 dump.u_dsize = 0;
127 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 111 if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
128 dump.u_ssize = 0; 112 dump.u_ssize = 0;
129 113
130 set_fs(KERNEL_DS); 114 set_fs(KERNEL_DS);
131/* struct user */ 115/* struct user */
132 DUMP_WRITE(&dump,sizeof(dump)); 116 if (!dump_write(file, &dump, sizeof(dump)))
117 goto end_coredump;
133/* Now dump all of the user data. Include malloced stuff as well */ 118/* Now dump all of the user data. Include malloced stuff as well */
134 DUMP_SEEK(PAGE_SIZE); 119 if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
120 goto end_coredump;
135/* now we start writing out the user space info */ 121/* now we start writing out the user space info */
136 set_fs(USER_DS); 122 set_fs(USER_DS);
137/* Dump the data area */ 123/* Dump the data area */
138 if (dump.u_dsize != 0) { 124 if (dump.u_dsize != 0) {
139 dump_start = START_DATA(dump); 125 dump_start = START_DATA(dump);
140 dump_size = dump.u_dsize << PAGE_SHIFT; 126 dump_size = dump.u_dsize << PAGE_SHIFT;
141 DUMP_WRITE(dump_start,dump_size); 127 if (!dump_write(file, dump_start, dump_size))
128 goto end_coredump;
142 } 129 }
143/* Now prepare to dump the stack area */ 130/* Now prepare to dump the stack area */
144 if (dump.u_ssize != 0) { 131 if (dump.u_ssize != 0) {
145 dump_start = START_STACK(dump); 132 dump_start = START_STACK(dump);
146 dump_size = dump.u_ssize << PAGE_SHIFT; 133 dump_size = dump.u_ssize << PAGE_SHIFT;
147 DUMP_WRITE(dump_start,dump_size); 134 if (!dump_write(file, dump_start, dump_size))
135 goto end_coredump;
148 } 136 }
 149/* Finally dump the task struct. Not be used by gdb, but could be useful */ 137/* Finally dump the task struct. Not be used by gdb, but could be useful */
 150 set_fs(KERNEL_DS); 138 set_fs(KERNEL_DS);
 151 DUMP_WRITE(current,sizeof(*current)); 139 if (!dump_write(file, current, sizeof(*current)))
140 goto end_coredump;
152end_coredump: 141end_coredump:
153 set_fs(fs); 142 set_fs(fs);
154 return has_dumped; 143 return has_dumped;
@@ -246,7 +235,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
246 * size limits imposed on them by creating programs with large 235 * size limits imposed on them by creating programs with large
247 * arrays in the data or bss. 236 * arrays in the data or bss.
248 */ 237 */
249 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 238 rlim = rlimit(RLIMIT_DATA);
250 if (rlim >= RLIM_INFINITY) 239 if (rlim >= RLIM_INFINITY)
251 rlim = ~0; 240 rlim = ~0;
252 if (ex.a_data + ex.a_bss > rlim) 241 if (ex.a_data + ex.a_bss > rlim)
@@ -263,6 +252,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
263#else 252#else
264 set_personality(PER_LINUX); 253 set_personality(PER_LINUX);
265#endif 254#endif
255 setup_new_exec(bprm);
266 256
267 current->mm->end_code = ex.a_text + 257 current->mm->end_code = ex.a_text +
268 (current->mm->start_code = N_TXTADDR(ex)); 258 (current->mm->start_code = N_TXTADDR(ex));
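
Reviewer note: every ->core_dump() conversion in this series (a.out here, ELF and ELF-FDPIC below) collapses the old four-argument signature into a single parameter block. The fields referenced in these hunks (signr, regs, file, limit, mm_flags) imply a structure roughly like the sketch below; treat it as inferred, not as the verbatim declaration from linux/binfmts.h:

    /* Inferred shape of the parameter block passed to ->core_dump(). */
    struct coredump_params {
            long signr;             /* signal that triggered the dump    */
            struct pt_regs *regs;   /* register state at dump time       */
            struct file *file;      /* core file being written           */
            unsigned long limit;    /* RLIMIT_CORE ceiling               */
            unsigned long mm_flags; /* snapshot of mm->flags (see below) */
    };

Snapshotting mm_flags once in the caller replaces the per-dumper "mm_flags = current->mm->flags" copies removed in the ELF hunks, preserving the old guarantee that program headers and segment bodies are generated against the same flags.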
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b9b3bb51b1e4..535e763ab1a6 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,6 +31,7 @@
31#include <linux/random.h> 31#include <linux/random.h>
32#include <linux/elf.h> 32#include <linux/elf.h>
33#include <linux/utsname.h> 33#include <linux/utsname.h>
34#include <linux/coredump.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include <asm/param.h> 36#include <asm/param.h>
36#include <asm/page.h> 37#include <asm/page.h>
@@ -44,8 +45,8 @@ static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
44 * If we don't support core dumping, then supply a NULL so we 45 * If we don't support core dumping, then supply a NULL so we
45 * don't even try. 46 * don't even try.
46 */ 47 */
47#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 48#ifdef CONFIG_ELF_CORE
48static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); 49static int elf_core_dump(struct coredump_params *cprm);
49#else 50#else
50#define elf_core_dump NULL 51#define elf_core_dump NULL
51#endif 52#endif
@@ -662,27 +663,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
662 if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') 663 if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
663 goto out_free_interp; 664 goto out_free_interp;
664 665
665 /*
666 * The early SET_PERSONALITY here is so that the lookup
667 * for the interpreter happens in the namespace of the
668 * to-be-execed image. SET_PERSONALITY can select an
669 * alternate root.
670 *
671 * However, SET_PERSONALITY is NOT allowed to switch
672 * this task into the new images's memory mapping
673 * policy - that is, TASK_SIZE must still evaluate to
674 * that which is appropriate to the execing application.
675 * This is because exit_mmap() needs to have TASK_SIZE
676 * evaluate to the size of the old image.
677 *
678 * So if (say) a 64-bit application is execing a 32-bit
679 * application it is the architecture's responsibility
680 * to defer changing the value of TASK_SIZE until the
681 * switch really is going to happen - do this in
682 * flush_thread(). - akpm
683 */
684 SET_PERSONALITY(loc->elf_ex);
685
686 interpreter = open_exec(elf_interpreter); 666 interpreter = open_exec(elf_interpreter);
687 retval = PTR_ERR(interpreter); 667 retval = PTR_ERR(interpreter);
688 if (IS_ERR(interpreter)) 668 if (IS_ERR(interpreter))
@@ -730,9 +710,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
730 /* Verify the interpreter has a valid arch */ 710 /* Verify the interpreter has a valid arch */
731 if (!elf_check_arch(&loc->interp_elf_ex)) 711 if (!elf_check_arch(&loc->interp_elf_ex))
732 goto out_free_dentry; 712 goto out_free_dentry;
733 } else {
734 /* Executables without an interpreter also need a personality */
735 SET_PERSONALITY(loc->elf_ex);
736 } 713 }
737 714
738 /* Flush all traces of the currently running executable */ 715 /* Flush all traces of the currently running executable */
@@ -752,7 +729,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
752 729
753 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) 730 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
754 current->flags |= PF_RANDOMIZE; 731 current->flags |= PF_RANDOMIZE;
755 arch_pick_mmap_layout(current->mm); 732
733 setup_new_exec(bprm);
756 734
757 /* Do this so that we can load the interpreter, if need be. We will 735 /* Do this so that we can load the interpreter, if need be. We will
758 change some of these later */ 736 change some of these later */
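
Reviewer note: arch_pick_mmap_layout() disappears here because setup_new_exec() takes it over; the large comment block and both early SET_PERSONALITY() calls removed above rely on the flush_old_exec()/setup_new_exec() split done elsewhere in this release. The resulting loader ordering, with the personality hunk itself not shown in this section, is assumed to be:

    /*
     * Assumed ordering after this change:
     *
     *     flush_old_exec(bprm);          point of no return
     *     SET_PERSONALITY(loc->elf_ex);  once, for both interpreter cases
     *     setup_new_exec(bprm);          absorbs arch_pick_mmap_layout()
     */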
@@ -767,7 +745,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
767 745
768 current->mm->start_stack = bprm->p; 746 current->mm->start_stack = bprm->p;
769 747
770 /* Now we do a little grungy work by mmaping the ELF image into 748 /* Now we do a little grungy work by mmapping the ELF image into
771 the correct location in memory. */ 749 the correct location in memory. */
772 for(i = 0, elf_ppnt = elf_phdata; 750 for(i = 0, elf_ppnt = elf_phdata;
773 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { 751 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
@@ -1101,48 +1079,13 @@ out:
1101 return error; 1079 return error;
1102} 1080}
1103 1081
1104/* 1082#ifdef CONFIG_ELF_CORE
1105 * Note that some platforms still use traditional core dumps and not
1106 * the ELF core dump. Each platform can select it as appropriate.
1107 */
1108#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
1109
1110/* 1083/*
1111 * ELF core dumper 1084 * ELF core dumper
1112 * 1085 *
1113 * Modelled on fs/exec.c:aout_core_dump() 1086 * Modelled on fs/exec.c:aout_core_dump()
1114 * Jeremy Fitzhardinge <jeremy@sw.oz.au> 1087 * Jeremy Fitzhardinge <jeremy@sw.oz.au>
1115 */ 1088 */
1116/*
1117 * These are the only things you should do on a core-file: use only these
1118 * functions to write out all the necessary info.
1119 */
1120static int dump_write(struct file *file, const void *addr, int nr)
1121{
1122 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
1123}
1124
1125static int dump_seek(struct file *file, loff_t off)
1126{
1127 if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
1128 if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
1129 return 0;
1130 } else {
1131 char *buf = (char *)get_zeroed_page(GFP_KERNEL);
1132 if (!buf)
1133 return 0;
1134 while (off > 0) {
1135 unsigned long n = off;
1136 if (n > PAGE_SIZE)
1137 n = PAGE_SIZE;
1138 if (!dump_write(file, buf, n))
1139 return 0;
1140 off -= n;
1141 }
1142 free_page((unsigned long)buf);
1143 }
1144 return 1;
1145}
1146 1089
1147/* 1090/*
1148 * Decide what to dump of a segment, part, all or none. 1091 * Decide what to dump of a segment, part, all or none.
@@ -1277,10 +1220,6 @@ static int writenote(struct memelfnote *men, struct file *file,
1277} 1220}
1278#undef DUMP_WRITE 1221#undef DUMP_WRITE
1279 1222
1280#define DUMP_WRITE(addr, nr) \
1281 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
1282 goto end_coredump;
1283
1284static void fill_elf_header(struct elfhdr *elf, int segs, 1223static void fill_elf_header(struct elfhdr *elf, int segs,
1285 u16 machine, u32 flags, u8 osabi) 1224 u16 machine, u32 flags, u8 osabi)
1286{ 1225{
@@ -1899,6 +1838,34 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
1899 return gate_vma; 1838 return gate_vma;
1900} 1839}
1901 1840
1841static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
1842 elf_addr_t e_shoff, int segs)
1843{
1844 elf->e_shoff = e_shoff;
1845 elf->e_shentsize = sizeof(*shdr4extnum);
1846 elf->e_shnum = 1;
1847 elf->e_shstrndx = SHN_UNDEF;
1848
1849 memset(shdr4extnum, 0, sizeof(*shdr4extnum));
1850
1851 shdr4extnum->sh_type = SHT_NULL;
1852 shdr4extnum->sh_size = elf->e_shnum;
1853 shdr4extnum->sh_link = elf->e_shstrndx;
1854 shdr4extnum->sh_info = segs;
1855}
1856
1857static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma,
1858 unsigned long mm_flags)
1859{
1860 struct vm_area_struct *vma;
1861 size_t size = 0;
1862
1863 for (vma = first_vma(current, gate_vma); vma != NULL;
1864 vma = next_vma(vma, gate_vma))
1865 size += vma_dump_size(vma, mm_flags);
1866 return size;
1867}
1868
1902/* 1869/*
1903 * Actual dumper 1870 * Actual dumper
1904 * 1871 *
@@ -1906,7 +1873,7 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
1906 * and then they are actually written out. If we run out of core limit 1873 * and then they are actually written out. If we run out of core limit
1907 * we just truncate. 1874 * we just truncate.
1908 */ 1875 */
1909static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) 1876static int elf_core_dump(struct coredump_params *cprm)
1910{ 1877{
1911 int has_dumped = 0; 1878 int has_dumped = 0;
1912 mm_segment_t fs; 1879 mm_segment_t fs;
@@ -1915,8 +1882,11 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1915 struct vm_area_struct *vma, *gate_vma; 1882 struct vm_area_struct *vma, *gate_vma;
1916 struct elfhdr *elf = NULL; 1883 struct elfhdr *elf = NULL;
1917 loff_t offset = 0, dataoff, foffset; 1884 loff_t offset = 0, dataoff, foffset;
1918 unsigned long mm_flags;
1919 struct elf_note_info info; 1885 struct elf_note_info info;
1886 struct elf_phdr *phdr4note = NULL;
1887 struct elf_shdr *shdr4extnum = NULL;
1888 Elf_Half e_phnum;
1889 elf_addr_t e_shoff;
1920 1890
1921 /* 1891 /*
1922 * We no longer stop all VM operations. 1892 * We no longer stop all VM operations.
@@ -1939,20 +1909,25 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1939 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. 1909 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
1940 */ 1910 */
1941 segs = current->mm->map_count; 1911 segs = current->mm->map_count;
1942#ifdef ELF_CORE_EXTRA_PHDRS 1912 segs += elf_core_extra_phdrs();
1943 segs += ELF_CORE_EXTRA_PHDRS;
1944#endif
1945 1913
1946 gate_vma = get_gate_vma(current); 1914 gate_vma = get_gate_vma(current);
1947 if (gate_vma != NULL) 1915 if (gate_vma != NULL)
1948 segs++; 1916 segs++;
1949 1917
1918 /* for notes section */
1919 segs++;
1920
1921 /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
1922 * this, kernel supports extended numbering. Have a look at
1923 * include/linux/elf.h for further information. */
1924 e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
1925
1950 /* 1926 /*
1951 * Collect all the non-memory information about the process for the 1927 * Collect all the non-memory information about the process for the
1952 * notes. This also sets up the file header. 1928 * notes. This also sets up the file header.
1953 */ 1929 */
1954 if (!fill_note_info(elf, segs + 1, /* including notes section */ 1930 if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs))
1955 &info, signr, regs))
1956 goto cleanup; 1931 goto cleanup;
1957 1932
1958 has_dumped = 1; 1933 has_dumped = 1;
@@ -1961,31 +1936,47 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1961 fs = get_fs(); 1936 fs = get_fs();
1962 set_fs(KERNEL_DS); 1937 set_fs(KERNEL_DS);
1963 1938
1964 DUMP_WRITE(elf, sizeof(*elf));
1965 offset += sizeof(*elf); /* Elf header */ 1939 offset += sizeof(*elf); /* Elf header */
1966 offset += (segs + 1) * sizeof(struct elf_phdr); /* Program headers */ 1940 offset += segs * sizeof(struct elf_phdr); /* Program headers */
1967 foffset = offset; 1941 foffset = offset;
1968 1942
1969 /* Write notes phdr entry */ 1943 /* Write notes phdr entry */
1970 { 1944 {
1971 struct elf_phdr phdr;
1972 size_t sz = get_note_info_size(&info); 1945 size_t sz = get_note_info_size(&info);
1973 1946
1974 sz += elf_coredump_extra_notes_size(); 1947 sz += elf_coredump_extra_notes_size();
1975 1948
1976 fill_elf_note_phdr(&phdr, sz, offset); 1949 phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
1950 if (!phdr4note)
1951 goto end_coredump;
1952
1953 fill_elf_note_phdr(phdr4note, sz, offset);
1977 offset += sz; 1954 offset += sz;
1978 DUMP_WRITE(&phdr, sizeof(phdr));
1979 } 1955 }
1980 1956
1981 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); 1957 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
1982 1958
1983 /* 1959 offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags);
1984 * We must use the same mm->flags while dumping core to avoid 1960 offset += elf_core_extra_data_size();
1985 * inconsistency between the program headers and bodies, otherwise an 1961 e_shoff = offset;
1986 * unusable core file can be generated. 1962
1987 */ 1963 if (e_phnum == PN_XNUM) {
1988 mm_flags = current->mm->flags; 1964 shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
1965 if (!shdr4extnum)
1966 goto end_coredump;
1967 fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
1968 }
1969
1970 offset = dataoff;
1971
1972 size += sizeof(*elf);
1973 if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
1974 goto end_coredump;
1975
1976 size += sizeof(*phdr4note);
1977 if (size > cprm->limit
1978 || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
1979 goto end_coredump;
1989 1980
1990 /* Write program headers for segments dump */ 1981 /* Write program headers for segments dump */
1991 for (vma = first_vma(current, gate_vma); vma != NULL; 1982 for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -1996,7 +1987,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1996 phdr.p_offset = offset; 1987 phdr.p_offset = offset;
1997 phdr.p_vaddr = vma->vm_start; 1988 phdr.p_vaddr = vma->vm_start;
1998 phdr.p_paddr = 0; 1989 phdr.p_paddr = 0;
1999 phdr.p_filesz = vma_dump_size(vma, mm_flags); 1990 phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags);
2000 phdr.p_memsz = vma->vm_end - vma->vm_start; 1991 phdr.p_memsz = vma->vm_end - vma->vm_start;
2001 offset += phdr.p_filesz; 1992 offset += phdr.p_filesz;
2002 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; 1993 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -2006,22 +1997,24 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
2006 phdr.p_flags |= PF_X; 1997 phdr.p_flags |= PF_X;
2007 phdr.p_align = ELF_EXEC_PAGESIZE; 1998 phdr.p_align = ELF_EXEC_PAGESIZE;
2008 1999
2009 DUMP_WRITE(&phdr, sizeof(phdr)); 2000 size += sizeof(phdr);
2001 if (size > cprm->limit
2002 || !dump_write(cprm->file, &phdr, sizeof(phdr)))
2003 goto end_coredump;
2010 } 2004 }
2011 2005
2012#ifdef ELF_CORE_WRITE_EXTRA_PHDRS 2006 if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
2013 ELF_CORE_WRITE_EXTRA_PHDRS; 2007 goto end_coredump;
2014#endif
2015 2008
2016 /* write out the notes section */ 2009 /* write out the notes section */
2017 if (!write_note_info(&info, file, &foffset)) 2010 if (!write_note_info(&info, cprm->file, &foffset))
2018 goto end_coredump; 2011 goto end_coredump;
2019 2012
2020 if (elf_coredump_extra_notes_write(file, &foffset)) 2013 if (elf_coredump_extra_notes_write(cprm->file, &foffset))
2021 goto end_coredump; 2014 goto end_coredump;
2022 2015
2023 /* Align to page */ 2016 /* Align to page */
2024 if (!dump_seek(file, dataoff - foffset)) 2017 if (!dump_seek(cprm->file, dataoff - foffset))
2025 goto end_coredump; 2018 goto end_coredump;
2026 2019
2027 for (vma = first_vma(current, gate_vma); vma != NULL; 2020 for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -2029,7 +2022,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
2029 unsigned long addr; 2022 unsigned long addr;
2030 unsigned long end; 2023 unsigned long end;
2031 2024
2032 end = vma->vm_start + vma_dump_size(vma, mm_flags); 2025 end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags);
2033 2026
2034 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { 2027 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
2035 struct page *page; 2028 struct page *page;
@@ -2038,32 +2031,42 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
2038 page = get_dump_page(addr); 2031 page = get_dump_page(addr);
2039 if (page) { 2032 if (page) {
2040 void *kaddr = kmap(page); 2033 void *kaddr = kmap(page);
2041 stop = ((size += PAGE_SIZE) > limit) || 2034 stop = ((size += PAGE_SIZE) > cprm->limit) ||
2042 !dump_write(file, kaddr, PAGE_SIZE); 2035 !dump_write(cprm->file, kaddr,
2036 PAGE_SIZE);
2043 kunmap(page); 2037 kunmap(page);
2044 page_cache_release(page); 2038 page_cache_release(page);
2045 } else 2039 } else
2046 stop = !dump_seek(file, PAGE_SIZE); 2040 stop = !dump_seek(cprm->file, PAGE_SIZE);
2047 if (stop) 2041 if (stop)
2048 goto end_coredump; 2042 goto end_coredump;
2049 } 2043 }
2050 } 2044 }
2051 2045
2052#ifdef ELF_CORE_WRITE_EXTRA_DATA 2046 if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
2053 ELF_CORE_WRITE_EXTRA_DATA; 2047 goto end_coredump;
2054#endif 2048
2049 if (e_phnum == PN_XNUM) {
2050 size += sizeof(*shdr4extnum);
2051 if (size > cprm->limit
2052 || !dump_write(cprm->file, shdr4extnum,
2053 sizeof(*shdr4extnum)))
2054 goto end_coredump;
2055 }
2055 2056
2056end_coredump: 2057end_coredump:
2057 set_fs(fs); 2058 set_fs(fs);
2058 2059
2059cleanup: 2060cleanup:
2060 free_note_info(&info); 2061 free_note_info(&info);
2062 kfree(shdr4extnum);
2063 kfree(phdr4note);
2061 kfree(elf); 2064 kfree(elf);
2062out: 2065out:
2063 return has_dumped; 2066 return has_dumped;
2064} 2067}
2065 2068
2066#endif /* USE_ELF_CORE_DUMP */ 2069#endif /* CONFIG_ELF_CORE */
2067 2070
2068static int __init init_elf_binfmt(void) 2071static int __init init_elf_binfmt(void)
2069{ 2072{
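
Reviewer note: the e_phnum logic above implements ELF extended program-header numbering: e_phnum is a 16-bit field, so when the real segment count would exceed PN_XNUM (0xffff) the header stores PN_XNUM and the true count is parked in sh_info of a single all-null section header, built by fill_extnum_info() and written at e_shoff. A consumer-side sketch of the convention, with an invented helper name:

    #include <linux/elf.h>

    /* Hypothetical reader helper: recover the real phdr count from a core. */
    static unsigned int elf_real_phnum(const struct elfhdr *elf,
                                       const struct elf_shdr *shdr0)
    {
            if (elf->e_phnum != PN_XNUM)
                    return elf->e_phnum;
            /* fill_extnum_info() stored the overflowed count here: */
            return shdr0->sh_info;
    }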
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 38502c67987c..7ab23e006e4c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -34,6 +34,7 @@
34#include <linux/elf.h> 34#include <linux/elf.h>
35#include <linux/elf-fdpic.h> 35#include <linux/elf-fdpic.h>
36#include <linux/elfcore.h> 36#include <linux/elfcore.h>
37#include <linux/coredump.h>
37 38
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include <asm/param.h> 40#include <asm/param.h>
@@ -75,14 +76,14 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *,
75static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, 76static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *,
76 struct file *, struct mm_struct *); 77 struct file *, struct mm_struct *);
77 78
78#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 79#ifdef CONFIG_ELF_CORE
79static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit); 80static int elf_fdpic_core_dump(struct coredump_params *cprm);
80#endif 81#endif
81 82
82static struct linux_binfmt elf_fdpic_format = { 83static struct linux_binfmt elf_fdpic_format = {
83 .module = THIS_MODULE, 84 .module = THIS_MODULE,
84 .load_binary = load_elf_fdpic_binary, 85 .load_binary = load_elf_fdpic_binary,
85#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 86#ifdef CONFIG_ELF_CORE
86 .core_dump = elf_fdpic_core_dump, 87 .core_dump = elf_fdpic_core_dump,
87#endif 88#endif
88 .min_coredump = ELF_EXEC_PAGESIZE, 89 .min_coredump = ELF_EXEC_PAGESIZE,
@@ -171,6 +172,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
171#ifdef ELF_FDPIC_PLAT_INIT 172#ifdef ELF_FDPIC_PLAT_INIT
172 unsigned long dynaddr; 173 unsigned long dynaddr;
173#endif 174#endif
175#ifndef CONFIG_MMU
176 unsigned long stack_prot;
177#endif
174 struct file *interpreter = NULL; /* to shut gcc up */ 178 struct file *interpreter = NULL; /* to shut gcc up */
175 char *interpreter_name = NULL; 179 char *interpreter_name = NULL;
176 int executable_stack; 180 int executable_stack;
@@ -316,6 +320,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
316 * defunct, deceased, etc. after this point we have to exit via 320 * defunct, deceased, etc. after this point we have to exit via
317 * error_kill */ 321 * error_kill */
318 set_personality(PER_LINUX_FDPIC); 322 set_personality(PER_LINUX_FDPIC);
323 if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
324 current->personality |= READ_IMPLIES_EXEC;
325
326 setup_new_exec(bprm);
327
319 set_binfmt(&elf_fdpic_format); 328 set_binfmt(&elf_fdpic_format);
320 329
321 current->mm->start_code = 0; 330 current->mm->start_code = 0;
@@ -377,10 +386,15 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
377 if (stack_size < PAGE_SIZE * 2) 386 if (stack_size < PAGE_SIZE * 2)
378 stack_size = PAGE_SIZE * 2; 387 stack_size = PAGE_SIZE * 2;
379 388
389 stack_prot = PROT_READ | PROT_WRITE;
390 if (executable_stack == EXSTACK_ENABLE_X ||
391 (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
392 stack_prot |= PROT_EXEC;
393
380 down_write(&current->mm->mmap_sem); 394 down_write(&current->mm->mmap_sem);
381 current->mm->start_brk = do_mmap(NULL, 0, stack_size, 395 current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot,
382 PROT_READ | PROT_WRITE | PROT_EXEC, 396 MAP_PRIVATE | MAP_ANONYMOUS |
383 MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, 397 MAP_UNINITIALIZED | MAP_GROWSDOWN,
384 0); 398 0);
385 399
386 if (IS_ERR_VALUE(current->mm->start_brk)) { 400 if (IS_ERR_VALUE(current->mm->start_brk)) {
@@ -1200,27 +1214,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1200 * 1214 *
1201 * Modelled on fs/binfmt_elf.c core dumper 1215 * Modelled on fs/binfmt_elf.c core dumper
1202 */ 1216 */
1203#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 1217#ifdef CONFIG_ELF_CORE
1204
1205/*
1206 * These are the only things you should do on a core-file: use only these
1207 * functions to write out all the necessary info.
1208 */
1209static int dump_write(struct file *file, const void *addr, int nr)
1210{
1211 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
1212}
1213
1214static int dump_seek(struct file *file, loff_t off)
1215{
1216 if (file->f_op->llseek) {
1217 if (file->f_op->llseek(file, off, SEEK_SET) != off)
1218 return 0;
1219 } else {
1220 file->f_pos = off;
1221 }
1222 return 1;
1223}
1224 1218
1225/* 1219/*
1226 * Decide whether a segment is worth dumping; default is yes to be 1220 * Decide whether a segment is worth dumping; default is yes to be
@@ -1300,34 +1294,35 @@ static int notesize(struct memelfnote *en)
1300 1294
1301/* #define DEBUG */ 1295/* #define DEBUG */
1302 1296
1303#define DUMP_WRITE(addr, nr) \ 1297#define DUMP_WRITE(addr, nr, foffset) \
1304 do { if (!dump_write(file, (addr), (nr))) return 0; } while(0) 1298 do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
1305#define DUMP_SEEK(off) \
1306 do { if (!dump_seek(file, (off))) return 0; } while(0)
1307 1299
1308static int writenote(struct memelfnote *men, struct file *file) 1300static int alignfile(struct file *file, loff_t *foffset)
1309{ 1301{
1310 struct elf_note en; 1302 static const char buf[4] = { 0, };
1303 DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
1304 return 1;
1305}
1311 1306
1307static int writenote(struct memelfnote *men, struct file *file,
1308 loff_t *foffset)
1309{
1310 struct elf_note en;
1312 en.n_namesz = strlen(men->name) + 1; 1311 en.n_namesz = strlen(men->name) + 1;
1313 en.n_descsz = men->datasz; 1312 en.n_descsz = men->datasz;
1314 en.n_type = men->type; 1313 en.n_type = men->type;
1315 1314
1316 DUMP_WRITE(&en, sizeof(en)); 1315 DUMP_WRITE(&en, sizeof(en), foffset);
1317 DUMP_WRITE(men->name, en.n_namesz); 1316 DUMP_WRITE(men->name, en.n_namesz, foffset);
1318 /* XXX - cast from long long to long to avoid need for libgcc.a */ 1317 if (!alignfile(file, foffset))
1319 DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ 1318 return 0;
1320 DUMP_WRITE(men->data, men->datasz); 1319 DUMP_WRITE(men->data, men->datasz, foffset);
1321 DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ 1320 if (!alignfile(file, foffset))
1321 return 0;
1322 1322
1323 return 1; 1323 return 1;
1324} 1324}
1325#undef DUMP_WRITE 1325#undef DUMP_WRITE
1326#undef DUMP_SEEK
1327
1328#define DUMP_WRITE(addr, nr) \
1329 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
1330 goto end_coredump;
1331 1326
1332static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) 1327static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
1333{ 1328{
@@ -1379,7 +1374,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
1379 1374
1380/* 1375/*
1381 * fill up all the fields in prstatus from the given task struct, except 1376 * fill up all the fields in prstatus from the given task struct, except
1382 * registers which need to be filled up seperately. 1377 * registers which need to be filled up separately.
1383 */ 1378 */
1384static void fill_prstatus(struct elf_prstatus *prstatus, 1379static void fill_prstatus(struct elf_prstatus *prstatus,
1385 struct task_struct *p, long signr) 1380 struct task_struct *p, long signr)
@@ -1510,6 +1505,22 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
1510 return sz; 1505 return sz;
1511} 1506}
1512 1507
1508static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
1509 elf_addr_t e_shoff, int segs)
1510{
1511 elf->e_shoff = e_shoff;
1512 elf->e_shentsize = sizeof(*shdr4extnum);
1513 elf->e_shnum = 1;
1514 elf->e_shstrndx = SHN_UNDEF;
1515
1516 memset(shdr4extnum, 0, sizeof(*shdr4extnum));
1517
1518 shdr4extnum->sh_type = SHT_NULL;
1519 shdr4extnum->sh_size = elf->e_shnum;
1520 shdr4extnum->sh_link = elf->e_shstrndx;
1521 shdr4extnum->sh_info = segs;
1522}
1523
1513/* 1524/*
1514 * dump the segments for an MMU process 1525 * dump the segments for an MMU process
1515 */ 1526 */
@@ -1538,7 +1549,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1538 err = -EIO; 1549 err = -EIO;
1539 kunmap(page); 1550 kunmap(page);
1540 page_cache_release(page); 1551 page_cache_release(page);
1541 } else if (!dump_seek(file, file->f_pos + PAGE_SIZE)) 1552 } else if (!dump_seek(file, PAGE_SIZE))
1542 err = -EFBIG; 1553 err = -EFBIG;
1543 if (err) 1554 if (err)
1544 goto out; 1555 goto out;
@@ -1574,6 +1585,17 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1574} 1585}
1575#endif 1586#endif
1576 1587
1588static size_t elf_core_vma_data_size(unsigned long mm_flags)
1589{
1590 struct vm_area_struct *vma;
1591 size_t size = 0;
1592
1593 for (vma = current->mm->mmap; vma; vma = vma->vm_next)
1594 if (maydump(vma, mm_flags))
1595 size += vma->vm_end - vma->vm_start;
1596 return size;
1597}
1598
1577/* 1599/*
1578 * Actual dumper 1600 * Actual dumper
1579 * 1601 *
@@ -1581,8 +1603,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1581 * and then they are actually written out. If we run out of core limit 1603 * and then they are actually written out. If we run out of core limit
1582 * we just truncate. 1604 * we just truncate.
1583 */ 1605 */
1584static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, 1606static int elf_fdpic_core_dump(struct coredump_params *cprm)
1585 struct file *file, unsigned long limit)
1586{ 1607{
1587#define NUM_NOTES 6 1608#define NUM_NOTES 6
1588 int has_dumped = 0; 1609 int has_dumped = 0;
@@ -1592,7 +1613,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1592 int i; 1613 int i;
1593 struct vm_area_struct *vma; 1614 struct vm_area_struct *vma;
1594 struct elfhdr *elf = NULL; 1615 struct elfhdr *elf = NULL;
1595 loff_t offset = 0, dataoff; 1616 loff_t offset = 0, dataoff, foffset;
1596 int numnote; 1617 int numnote;
1597 struct memelfnote *notes = NULL; 1618 struct memelfnote *notes = NULL;
1598 struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ 1619 struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */
@@ -1605,7 +1626,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1605#endif 1626#endif
1606 int thread_status_size = 0; 1627 int thread_status_size = 0;
1607 elf_addr_t *auxv; 1628 elf_addr_t *auxv;
1608 unsigned long mm_flags; 1629 struct elf_phdr *phdr4note = NULL;
1630 struct elf_shdr *shdr4extnum = NULL;
1631 Elf_Half e_phnum;
1632 elf_addr_t e_shoff;
1609 1633
1610 /* 1634 /*
1611 * We no longer stop all VM operations. 1635 * We no longer stop all VM operations.
@@ -1641,7 +1665,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1641 goto cleanup; 1665 goto cleanup;
1642#endif 1666#endif
1643 1667
1644 if (signr) { 1668 if (cprm->signr) {
1645 struct core_thread *ct; 1669 struct core_thread *ct;
1646 struct elf_thread_status *tmp; 1670 struct elf_thread_status *tmp;
1647 1671
@@ -1660,22 +1684,28 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1660 int sz; 1684 int sz;
1661 1685
1662 tmp = list_entry(t, struct elf_thread_status, list); 1686 tmp = list_entry(t, struct elf_thread_status, list);
1663 sz = elf_dump_thread_status(signr, tmp); 1687 sz = elf_dump_thread_status(cprm->signr, tmp);
1664 thread_status_size += sz; 1688 thread_status_size += sz;
1665 } 1689 }
1666 } 1690 }
1667 1691
1668 /* now collect the dump for the current */ 1692 /* now collect the dump for the current */
1669 fill_prstatus(prstatus, current, signr); 1693 fill_prstatus(prstatus, current, cprm->signr);
1670 elf_core_copy_regs(&prstatus->pr_reg, regs); 1694 elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
1671 1695
1672 segs = current->mm->map_count; 1696 segs = current->mm->map_count;
1673#ifdef ELF_CORE_EXTRA_PHDRS 1697 segs += elf_core_extra_phdrs();
1674 segs += ELF_CORE_EXTRA_PHDRS; 1698
1675#endif 1699 /* for notes section */
1700 segs++;
1701
1702 /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
1703 * this, kernel supports extended numbering. Have a look at
1704 * include/linux/elf.h for further information. */
1705 e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
1676 1706
1677 /* Set up header */ 1707 /* Set up header */
1678 fill_elf_fdpic_header(elf, segs + 1); /* including notes section */ 1708 fill_elf_fdpic_header(elf, e_phnum);
1679 1709
1680 has_dumped = 1; 1710 has_dumped = 1;
1681 current->flags |= PF_DUMPCORE; 1711 current->flags |= PF_DUMPCORE;
@@ -1702,7 +1732,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1702 1732
1703 /* Try to dump the FPU. */ 1733 /* Try to dump the FPU. */
1704 if ((prstatus->pr_fpvalid = 1734 if ((prstatus->pr_fpvalid =
1705 elf_core_copy_task_fpregs(current, regs, fpu))) 1735 elf_core_copy_task_fpregs(current, cprm->regs, fpu)))
1706 fill_note(notes + numnote++, 1736 fill_note(notes + numnote++,
1707 "CORE", NT_PRFPREG, sizeof(*fpu), fpu); 1737 "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
1708#ifdef ELF_CORE_COPY_XFPREGS 1738#ifdef ELF_CORE_COPY_XFPREGS
@@ -1714,13 +1744,12 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1714 fs = get_fs(); 1744 fs = get_fs();
1715 set_fs(KERNEL_DS); 1745 set_fs(KERNEL_DS);
1716 1746
1717 DUMP_WRITE(elf, sizeof(*elf));
1718 offset += sizeof(*elf); /* Elf header */ 1747 offset += sizeof(*elf); /* Elf header */
1719 offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ 1748 offset += segs * sizeof(struct elf_phdr); /* Program headers */
1749 foffset = offset;
1720 1750
1721 /* Write notes phdr entry */ 1751 /* Write notes phdr entry */
1722 { 1752 {
1723 struct elf_phdr phdr;
1724 int sz = 0; 1753 int sz = 0;
1725 1754
1726 for (i = 0; i < numnote; i++) 1755 for (i = 0; i < numnote; i++)
@@ -1728,20 +1757,38 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1728 1757
1729 sz += thread_status_size; 1758 sz += thread_status_size;
1730 1759
1731 fill_elf_note_phdr(&phdr, sz, offset); 1760 phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
1761 if (!phdr4note)
1762 goto end_coredump;
1763
1764 fill_elf_note_phdr(phdr4note, sz, offset);
1732 offset += sz; 1765 offset += sz;
1733 DUMP_WRITE(&phdr, sizeof(phdr));
1734 } 1766 }
1735 1767
1736 /* Page-align dumped data */ 1768 /* Page-align dumped data */
1737 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); 1769 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
1738 1770
1739 /* 1771 offset += elf_core_vma_data_size(cprm->mm_flags);
1740 * We must use the same mm->flags while dumping core to avoid 1772 offset += elf_core_extra_data_size();
1741 * inconsistency between the program headers and bodies, otherwise an 1773 e_shoff = offset;
1742 * unusable core file can be generated. 1774
1743 */ 1775 if (e_phnum == PN_XNUM) {
1744 mm_flags = current->mm->flags; 1776 shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
1777 if (!shdr4extnum)
1778 goto end_coredump;
1779 fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
1780 }
1781
1782 offset = dataoff;
1783
1784 size += sizeof(*elf);
1785 if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
1786 goto end_coredump;
1787
1788 size += sizeof(*phdr4note);
1789 if (size > cprm->limit
1790 || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
1791 goto end_coredump;
1745 1792
1746 /* write program headers for segments dump */ 1793 /* write program headers for segments dump */
1747 for (vma = current->mm->mmap; vma; vma = vma->vm_next) { 1794 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
@@ -1754,7 +1801,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1754 phdr.p_offset = offset; 1801 phdr.p_offset = offset;
1755 phdr.p_vaddr = vma->vm_start; 1802 phdr.p_vaddr = vma->vm_start;
1756 phdr.p_paddr = 0; 1803 phdr.p_paddr = 0;
1757 phdr.p_filesz = maydump(vma, mm_flags) ? sz : 0; 1804 phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0;
1758 phdr.p_memsz = sz; 1805 phdr.p_memsz = sz;
1759 offset += phdr.p_filesz; 1806 offset += phdr.p_filesz;
1760 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; 1807 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -1764,16 +1811,18 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1764 phdr.p_flags |= PF_X; 1811 phdr.p_flags |= PF_X;
1765 phdr.p_align = ELF_EXEC_PAGESIZE; 1812 phdr.p_align = ELF_EXEC_PAGESIZE;
1766 1813
1767 DUMP_WRITE(&phdr, sizeof(phdr)); 1814 size += sizeof(phdr);
1815 if (size > cprm->limit
1816 || !dump_write(cprm->file, &phdr, sizeof(phdr)))
1817 goto end_coredump;
1768 } 1818 }
1769 1819
1770#ifdef ELF_CORE_WRITE_EXTRA_PHDRS 1820 if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
1771 ELF_CORE_WRITE_EXTRA_PHDRS; 1821 goto end_coredump;
1772#endif
1773 1822
1774 /* write out the notes section */ 1823 /* write out the notes section */
1775 for (i = 0; i < numnote; i++) 1824 for (i = 0; i < numnote; i++)
1776 if (!writenote(notes + i, file)) 1825 if (!writenote(notes + i, cprm->file, &foffset))
1777 goto end_coredump; 1826 goto end_coredump;
1778 1827
1779 /* write out the thread status notes section */ 1828 /* write out the thread status notes section */
@@ -1782,25 +1831,33 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1782 list_entry(t, struct elf_thread_status, list); 1831 list_entry(t, struct elf_thread_status, list);
1783 1832
1784 for (i = 0; i < tmp->num_notes; i++) 1833 for (i = 0; i < tmp->num_notes; i++)
1785 if (!writenote(&tmp->notes[i], file)) 1834 if (!writenote(&tmp->notes[i], cprm->file, &foffset))
1786 goto end_coredump; 1835 goto end_coredump;
1787 } 1836 }
1788 1837
1789 if (!dump_seek(file, dataoff)) 1838 if (!dump_seek(cprm->file, dataoff - foffset))
1790 goto end_coredump; 1839 goto end_coredump;
1791 1840
1792 if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) 1841 if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit,
1842 cprm->mm_flags) < 0)
1793 goto end_coredump; 1843 goto end_coredump;
1794 1844
1795#ifdef ELF_CORE_WRITE_EXTRA_DATA 1845 if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
1796 ELF_CORE_WRITE_EXTRA_DATA; 1846 goto end_coredump;
1797#endif
1798 1847
1799 if (file->f_pos != offset) { 1848 if (e_phnum == PN_XNUM) {
1849 size += sizeof(*shdr4extnum);
1850 if (size > cprm->limit
1851 || !dump_write(cprm->file, shdr4extnum,
1852 sizeof(*shdr4extnum)))
1853 goto end_coredump;
1854 }
1855
1856 if (cprm->file->f_pos != offset) {
1800 /* Sanity check */ 1857 /* Sanity check */
1801 printk(KERN_WARNING 1858 printk(KERN_WARNING
1802 "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", 1859 "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n",
1803 file->f_pos, offset); 1860 cprm->file->f_pos, offset);
1804 } 1861 }
1805 1862
1806end_coredump: 1863end_coredump:
@@ -1812,7 +1869,7 @@ cleanup:
1812 list_del(tmp); 1869 list_del(tmp);
1813 kfree(list_entry(tmp, struct elf_thread_status, list)); 1870 kfree(list_entry(tmp, struct elf_thread_status, list));
1814 } 1871 }
1815 1872 kfree(phdr4note);
1816 kfree(elf); 1873 kfree(elf);
1817 kfree(prstatus); 1874 kfree(prstatus);
1818 kfree(psinfo); 1875 kfree(psinfo);
@@ -1825,4 +1882,4 @@ cleanup:
1825#undef NUM_NOTES 1882#undef NUM_NOTES
1826} 1883}
1827 1884
1828#endif /* USE_ELF_CORE_DUMP */ 1885#endif /* CONFIG_ELF_CORE */
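
Reviewer note: two bookkeeping changes run through the FDPIC dumper above. First, dump_seek() is now called with a relative count (see the elf_fdpic_dump_segments hunk), so writenote()/alignfile() thread an explicit logical position *foffset instead of trusting file->f_pos; plausibly this is what keeps the dump correct when the target cannot seek, though the patch does not say so. Second, the whole file layout is computed before anything is written, so the PN_XNUM section header has a known home at e_shoff. A schematic of that arithmetic, using the names from the hunks:

    #include <linux/elf.h>
    #include <linux/kernel.h>

    /* Sketch: up-front layout accounting; notes_size stands in for the
     * summed note sizes, vma_data/extra_data for the segment payloads. */
    static loff_t example_core_layout(int segs, size_t notes_size,
                                      size_t vma_data, size_t extra_data,
                                      loff_t *dataoff, loff_t *e_shoff)
    {
            loff_t offset = sizeof(struct elfhdr);      /* ELF header      */

            offset += segs * sizeof(struct elf_phdr);   /* program headers */
            offset += notes_size;                       /* PT_NOTE payload */
            *dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
            offset += vma_data + extra_data;            /* segment bodies  */
            *e_shoff = offset;                          /* extnum shdr     */
            return offset;
    }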
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 32fb00b52cd0..b8e8b0acf9bd 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/stat.h> 13#include <linux/stat.h>
14#include <linux/slab.h>
15#include <linux/binfmts.h> 14#include <linux/binfmts.h>
16#include <linux/elf.h> 15#include <linux/elf.h>
17#include <linux/init.h> 16#include <linux/init.h>
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index a2796651e756..e0e769bdca59 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -87,7 +87,7 @@ static int load_flat_shared_library(int id, struct lib_info *p);
87#endif 87#endif
88 88
89static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); 89static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
90static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); 90static int flat_core_dump(struct coredump_params *cprm);
91 91
92static struct linux_binfmt flat_format = { 92static struct linux_binfmt flat_format = {
93 .module = THIS_MODULE, 93 .module = THIS_MODULE,
@@ -102,10 +102,10 @@ static struct linux_binfmt flat_format = {
102 * Currently only a stub-function. 102 * Currently only a stub-function.
103 */ 103 */
104 104
105static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) 105static int flat_core_dump(struct coredump_params *cprm)
106{ 106{
107 printk("Process %s:%d received signr %d and should have core dumped\n", 107 printk("Process %s:%d received signr %d and should have core dumped\n",
108 current->comm, current->pid, (int) signr); 108 current->comm, current->pid, (int) cprm->signr);
109 return(1); 109 return(1);
110} 110}
111 111
@@ -501,7 +501,7 @@ static int load_flat_file(struct linux_binprm * bprm,
501 * size limits imposed on them by creating programs with large 501 * size limits imposed on them by creating programs with large
502 * arrays in the data or bss. 502 * arrays in the data or bss.
503 */ 503 */
504 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 504 rlim = rlimit(RLIMIT_DATA);
505 if (rlim >= RLIM_INFINITY) 505 if (rlim >= RLIM_INFINITY)
506 rlim = ~0; 506 rlim = ~0;
507 if (data_len + bss_len > rlim) { 507 if (data_len + bss_len > rlim) {
@@ -519,6 +519,7 @@ static int load_flat_file(struct linux_binprm * bprm,
519 519
520 /* OK, This is the point of no return */ 520 /* OK, This is the point of no return */
521 set_personality(PER_LINUX_32BIT); 521 set_personality(PER_LINUX_32BIT);
522 setup_new_exec(bprm);
522 } 523 }
523 524
524 /* 525 /*
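
The rlimit() accessor used in the hunk above replaces open-coded dereferences of current->signal->rlim[...].rlim_cur. A minimal sketch of the resulting bounds check, assuming the stock rlimit() helper from <linux/sched.h>; check_data_size() itself is illustrative, not a function in this file:

        static int check_data_size(unsigned long data_len, unsigned long bss_len)
        {
                unsigned long rlim = rlimit(RLIMIT_DATA);  /* current task's soft limit */

                if (rlim >= RLIM_INFINITY)
                        rlim = ~0;                         /* "unlimited" caps at ULONG_MAX */
                if (data_len + bss_len > rlim)
                        return -ENOMEM;                    /* refuse oversized data+bss */
                return 0;
        }
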
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 08343505e184..aca9d55afb22 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -8,7 +8,6 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/string.h> 9#include <linux/string.h>
10#include <linux/stat.h> 10#include <linux/stat.h>
11#include <linux/slab.h>
12#include <linux/binfmts.h> 11#include <linux/binfmts.h>
13#include <linux/init.h> 12#include <linux/init.h>
14#include <linux/file.h> 13#include <linux/file.h>
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index eff74b9c9e77..cc8560f6c9b0 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -43,7 +43,7 @@ static int load_som_library(struct file *);
43 * don't even try. 43 * don't even try.
44 */ 44 */
45#if 0 45#if 0
46static int som_core_dump(long signr, struct pt_regs *regs, unsigned long limit); 46static int som_core_dump(struct coredump_params *cprm);
47#else 47#else
48#define som_core_dump NULL 48#define som_core_dump NULL
49#endif 49#endif
@@ -227,6 +227,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
227 /* OK, This is the point of no return */ 227 /* OK, This is the point of no return */
228 current->flags &= ~PF_FORKNOEXEC; 228 current->flags &= ~PF_FORKNOEXEC;
229 current->personality = PER_HPUX; 229 current->personality = PER_HPUX;
230 setup_new_exec(bprm);
230 231
231 /* Set the task size for HP-UX processes such that 232 /* Set the task size for HP-UX processes such that
232 * the gateway page is outside the address space. 233 * the gateway page is outside the address space.
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 49a34e7f7306..612a5c38d3c1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -24,6 +24,7 @@
24#include <linux/mempool.h> 24#include <linux/mempool.h>
25#include <linux/bio.h> 25#include <linux/bio.h>
26#include <linux/workqueue.h> 26#include <linux/workqueue.h>
27#include <linux/slab.h>
27 28
28struct integrity_slab { 29struct integrity_slab {
29 struct kmem_cache *slab; 30 struct kmem_cache *slab;
@@ -61,7 +62,7 @@ static inline unsigned int vecs_to_idx(unsigned int nr)
61 62
62static inline int use_bip_pool(unsigned int idx) 63static inline int use_bip_pool(unsigned int idx)
63{ 64{
64 if (idx == BIOVEC_NR_POOLS) 65 if (idx == BIOVEC_MAX_IDX)
65 return 1; 66 return 1;
66 67
67 return 0; 68 return 0;
@@ -95,6 +96,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
95 96
96 /* Use mempool if lower order alloc failed or max vecs were requested */ 97 /* Use mempool if lower order alloc failed or max vecs were requested */
97 if (bip == NULL) { 98 if (bip == NULL) {
99 idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */
98 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); 100 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
99 101
100 if (unlikely(bip == NULL)) { 102 if (unlikely(bip == NULL)) {
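
The one-line fix above matters because the free path keys off the slab index recorded at allocation time. A sketch of that pairing, assuming the file's integrity_slab table is named bip_slab and the payload records its index in bip->bip_slab (both as upstream); the helper itself is illustrative:

        static void bip_free_sketch(struct bio_integrity_payload *bip,
                                    struct bio_set *bs)
        {
                /* BIOVEC_MAX_IDX marks "came from the mempool"; anything else
                 * is a kmem_cache index.  Without forcing idx on the fallback
                 * path, a mempool object would be handed to kmem_cache_free(). */
                if (use_bip_pool(bip->bip_slab))
                        mempool_free(bip, bs->bio_integrity_pool);
                else
                        kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
        }
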
diff --git a/fs/bio.c b/fs/bio.c
index 12da5db8682c..e1f922184b45 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -78,7 +78,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
78 78
79 i = 0; 79 i = 0;
80 while (i < bio_slab_nr) { 80 while (i < bio_slab_nr) {
81 struct bio_slab *bslab = &bio_slabs[i]; 81 bslab = &bio_slabs[i];
82 82
83 if (!bslab->slab && entry == -1) 83 if (!bslab->slab && entry == -1)
84 entry = i; 84 entry = i;
@@ -264,15 +264,14 @@ EXPORT_SYMBOL(bio_init);
264 * bio_alloc_bioset - allocate a bio for I/O 264 * bio_alloc_bioset - allocate a bio for I/O
265 * @gfp_mask: the GFP_ mask given to the slab allocator 265 * @gfp_mask: the GFP_ mask given to the slab allocator
266 * @nr_iovecs: number of iovecs to pre-allocate 266 * @nr_iovecs: number of iovecs to pre-allocate
267 * @bs: the bio_set to allocate from. If %NULL, just use kmalloc 267 * @bs: the bio_set to allocate from.
268 * 268 *
269 * Description: 269 * Description:
270 * bio_alloc_bioset will first try its own mempool to satisfy the allocation. 270 * bio_alloc_bioset will try its own mempool to satisfy the allocation.
271 * If %__GFP_WAIT is set then we will block on the internal pool waiting 271 * If %__GFP_WAIT is set then we will block on the internal pool waiting
272 * for a &struct bio to become free. If a %NULL @bs is passed in, we will 272 * for a &struct bio to become free.
273 * fall back to just using @kmalloc to allocate the required memory.
274 * 273 *
275 * Note that the caller must set ->bi_destructor on succesful return 274 * Note that the caller must set ->bi_destructor on successful return
276 * of a bio, to do the appropriate freeing of the bio once the reference 275 * of a bio, to do the appropriate freeing of the bio once the reference
277 * count drops to zero. 276 * count drops to zero.
278 **/ 277 **/
@@ -507,10 +506,8 @@ int bio_get_nr_vecs(struct block_device *bdev)
507 int nr_pages; 506 int nr_pages;
508 507
509 nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; 508 nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
510 if (nr_pages > queue_max_phys_segments(q)) 509 if (nr_pages > queue_max_segments(q))
511 nr_pages = queue_max_phys_segments(q); 510 nr_pages = queue_max_segments(q);
512 if (nr_pages > queue_max_hw_segments(q))
513 nr_pages = queue_max_hw_segments(q);
514 511
515 return nr_pages; 512 return nr_pages;
516} 513}
@@ -542,13 +539,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
542 539
543 if (page == prev->bv_page && 540 if (page == prev->bv_page &&
544 offset == prev->bv_offset + prev->bv_len) { 541 offset == prev->bv_offset + prev->bv_len) {
542 unsigned int prev_bv_len = prev->bv_len;
545 prev->bv_len += len; 543 prev->bv_len += len;
546 544
547 if (q->merge_bvec_fn) { 545 if (q->merge_bvec_fn) {
548 struct bvec_merge_data bvm = { 546 struct bvec_merge_data bvm = {
547 /* prev_bvec is already charged in
548 bi_size, discharge it in order to
549 simulate merging updated prev_bvec
550 as new bvec. */
549 .bi_bdev = bio->bi_bdev, 551 .bi_bdev = bio->bi_bdev,
550 .bi_sector = bio->bi_sector, 552 .bi_sector = bio->bi_sector,
551 .bi_size = bio->bi_size, 553 .bi_size = bio->bi_size - prev_bv_len,
552 .bi_rw = bio->bi_rw, 554 .bi_rw = bio->bi_rw,
553 }; 555 };
554 556
@@ -570,8 +572,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
570 * make this too complex. 572 * make this too complex.
571 */ 573 */
572 574
573 while (bio->bi_phys_segments >= queue_max_phys_segments(q) 575 while (bio->bi_phys_segments >= queue_max_segments(q)) {
574 || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
575 576
576 if (retried_segments) 577 if (retried_segments)
577 return 0; 578 return 0;
@@ -1393,6 +1394,18 @@ void bio_check_pages_dirty(struct bio *bio)
1393 } 1394 }
1394} 1395}
1395 1396
1397#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1398void bio_flush_dcache_pages(struct bio *bi)
1399{
1400 int i;
1401 struct bio_vec *bvec;
1402
1403 bio_for_each_segment(bvec, bi, i)
1404 flush_dcache_page(bvec->bv_page);
1405}
1406EXPORT_SYMBOL(bio_flush_dcache_pages);
1407#endif
1408
1396/** 1409/**
1397 * bio_endio - end I/O on a bio 1410 * bio_endio - end I/O on a bio
1398 * @bio: bio 1411 * @bio: bio
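
The prev_bv_len bookkeeping above exists because prev->bv_len has already been grown (and charged to bio->bi_size) by the time ->merge_bvec_fn runs. A sketch of the corrected probe, using the stock bvec_merge_data layout; the wrapper function is illustrative:

        static int probe_merge_sketch(struct request_queue *q, struct bio *bio,
                                      struct bio_vec *prev, unsigned int prev_bv_len)
        {
                struct bvec_merge_data bvm = {
                        .bi_bdev   = bio->bi_bdev,
                        .bi_sector = bio->bi_sector,
                        /* discharge the old bvec so the driver is asked about
                         * the merged bvec as if it were newly added */
                        .bi_size   = bio->bi_size - prev_bv_len,
                        .bi_rw     = bio->bi_rw,
                };

                /* the driver must accept the whole merged bvec */
                return q->merge_bvec_fn(q, &bvm, prev) >= prev->bv_len;
        }
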
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8bed0557d88c..d11d0289f3d2 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -246,7 +246,8 @@ struct super_block *freeze_bdev(struct block_device *bdev)
246 if (!sb) 246 if (!sb)
247 goto out; 247 goto out;
248 if (sb->s_flags & MS_RDONLY) { 248 if (sb->s_flags & MS_RDONLY) {
249 deactivate_locked_super(sb); 249 sb->s_frozen = SB_FREEZE_TRANS;
250 up_write(&sb->s_umount);
250 mutex_unlock(&bdev->bd_fsfreeze_mutex); 251 mutex_unlock(&bdev->bd_fsfreeze_mutex);
251 return sb; 252 return sb;
252 } 253 }
@@ -307,7 +308,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
307 BUG_ON(sb->s_bdev != bdev); 308 BUG_ON(sb->s_bdev != bdev);
308 down_write(&sb->s_umount); 309 down_write(&sb->s_umount);
309 if (sb->s_flags & MS_RDONLY) 310 if (sb->s_flags & MS_RDONLY)
310 goto out_deactivate; 311 goto out_unfrozen;
311 312
312 if (sb->s_op->unfreeze_fs) { 313 if (sb->s_op->unfreeze_fs) {
313 error = sb->s_op->unfreeze_fs(sb); 314 error = sb->s_op->unfreeze_fs(sb);
@@ -321,11 +322,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
321 } 322 }
322 } 323 }
323 324
325out_unfrozen:
324 sb->s_frozen = SB_UNFROZEN; 326 sb->s_frozen = SB_UNFROZEN;
325 smp_wmb(); 327 smp_wmb();
326 wake_up(&sb->s_wait_unfrozen); 328 wake_up(&sb->s_wait_unfrozen);
327 329
328out_deactivate:
329 if (sb) 330 if (sb)
330 deactivate_locked_super(sb); 331 deactivate_locked_super(sb);
331out_unlock: 332out_unlock:
@@ -405,7 +406,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
405 406
406static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) 407static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
407{ 408{
408 return sync_blockdev(I_BDEV(filp->f_mapping->host)); 409 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
410 int error;
411
412 error = sync_blockdev(bdev);
413 if (error)
414 return error;
415
416 error = blkdev_issue_flush(bdev, NULL);
417 if (error == -EOPNOTSUPP)
418 error = 0;
419 return error;
409} 420}
410 421
411/* 422/*
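
block_fsync() now follows sync_blockdev() with an explicit cache flush so that fsync(2) on a raw block device actually reaches stable storage. -EOPNOTSUPP is squashed because a device without a volatile write cache may legitimately not implement flush; that is not an fsync failure. A sketch of just that squashing, assuming blkdev_issue_flush() as used in the hunk:

        static int flush_if_supported(struct block_device *bdev)
        {
                int err = blkdev_issue_flush(bdev, NULL);

                /* no flush support usually means no volatile cache: success */
                return (err == -EOPNOTSUPP) ? 0 : err;
        }
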
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index da3133c69830..6ef7b26724ec 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
22#include <linux/posix_acl_xattr.h> 22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h> 23#include <linux/posix_acl.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25 26
26#include "ctree.h" 27#include "ctree.h"
27#include "btrfs_inode.h" 28#include "btrfs_inode.h"
@@ -73,13 +74,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
73 return acl; 74 return acl;
74} 75}
75 76
76static int btrfs_xattr_get_acl(struct inode *inode, int type, 77static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
77 void *value, size_t size) 78 void *value, size_t size, int type)
78{ 79{
79 struct posix_acl *acl; 80 struct posix_acl *acl;
80 int ret = 0; 81 int ret = 0;
81 82
82 acl = btrfs_get_acl(inode, type); 83 acl = btrfs_get_acl(dentry->d_inode, type);
83 84
84 if (IS_ERR(acl)) 85 if (IS_ERR(acl))
85 return PTR_ERR(acl); 86 return PTR_ERR(acl);
@@ -153,8 +154,8 @@ out:
153 return ret; 154 return ret;
154} 155}
155 156
156static int btrfs_xattr_set_acl(struct inode *inode, int type, 157static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
157 const void *value, size_t size) 158 const void *value, size_t size, int flags, int type)
158{ 159{
159 int ret; 160 int ret;
160 struct posix_acl *acl = NULL; 161 struct posix_acl *acl = NULL;
@@ -169,38 +170,13 @@ static int btrfs_xattr_set_acl(struct inode *inode, int type,
169 } 170 }
170 } 171 }
171 172
172 ret = btrfs_set_acl(NULL, inode, acl, type); 173 ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
173 174
174 posix_acl_release(acl); 175 posix_acl_release(acl);
175 176
176 return ret; 177 return ret;
177} 178}
178 179
179
180static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
181 void *value, size_t size)
182{
183 return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
184}
185
186static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
187 const void *value, size_t size, int flags)
188{
189 return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
190}
191
192static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
193 void *value, size_t size)
194{
195 return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
196}
197
198static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
199 const void *value, size_t size, int flags)
200{
201 return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
202}
203
204int btrfs_check_acl(struct inode *inode, int mask) 180int btrfs_check_acl(struct inode *inode, int mask)
205{ 181{
206 struct posix_acl *acl; 182 struct posix_acl *acl;
@@ -308,14 +284,16 @@ int btrfs_acl_chmod(struct inode *inode)
308 284
309struct xattr_handler btrfs_xattr_acl_default_handler = { 285struct xattr_handler btrfs_xattr_acl_default_handler = {
310 .prefix = POSIX_ACL_XATTR_DEFAULT, 286 .prefix = POSIX_ACL_XATTR_DEFAULT,
311 .get = btrfs_xattr_acl_default_get, 287 .flags = ACL_TYPE_DEFAULT,
312 .set = btrfs_xattr_acl_default_set, 288 .get = btrfs_xattr_acl_get,
289 .set = btrfs_xattr_acl_set,
313}; 290};
314 291
315struct xattr_handler btrfs_xattr_acl_access_handler = { 292struct xattr_handler btrfs_xattr_acl_access_handler = {
316 .prefix = POSIX_ACL_XATTR_ACCESS, 293 .prefix = POSIX_ACL_XATTR_ACCESS,
317 .get = btrfs_xattr_acl_access_get, 294 .flags = ACL_TYPE_ACCESS,
318 .set = btrfs_xattr_acl_access_set, 295 .get = btrfs_xattr_acl_get,
296 .set = btrfs_xattr_acl_set,
319}; 297};
320 298
321#else /* CONFIG_BTRFS_FS_POSIX_ACL */ 299#else /* CONFIG_BTRFS_FS_POSIX_ACL */
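
The four per-type callbacks collapse into one get/set pair because the generic xattr code now passes handler->flags through as the 'type' argument, and the handlers receive a dentry rather than an inode. A sketch of the resulting shape, with my_get_acl() standing in as a hypothetical filesystem ACL lookup (illustrative, not btrfs's exact body):

        static int acl_get_sketch(struct dentry *dentry, const char *name,
                                  void *value, size_t size, int type)
        {
                /* 'type' arrives from handler->flags: ACL_TYPE_ACCESS or
                 * ACL_TYPE_DEFAULT, depending on which xattr was read */
                struct posix_acl *acl = my_get_acl(dentry->d_inode, type);
                int ret;

                if (IS_ERR(acl))
                        return PTR_ERR(acl);
                if (acl == NULL)
                        return -ENODATA;
                ret = posix_acl_to_xattr(acl, value, size);
                posix_acl_release(acl);
                return ret;
        }
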
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..462859a30141 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/kthread.h> 19#include <linux/kthread.h>
20#include <linux/slab.h>
20#include <linux/list.h> 21#include <linux/list.h>
21#include <linux/spinlock.h> 22#include <linux/spinlock.h>
22#include <linux/freezer.h> 23#include <linux/freezer.h>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1d54c5308df5..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,6 +31,7 @@
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/writeback.h> 32#include <linux/writeback.h>
33#include <linux/bit_spinlock.h> 33#include <linux/bit_spinlock.h>
34#include <linux/slab.h>
34#include "compat.h" 35#include "compat.h"
35#include "ctree.h" 36#include "ctree.h"
36#include "disk-io.h" 37#include "disk-io.h"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index babf7fbaec84..6795a713b205 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "disk-io.h" 22#include "disk-io.h"
22#include "transaction.h" 23#include "transaction.h"
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ae8c40922c54..746a7248678e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
26#include <linux/completion.h> 26#include <linux/completion.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h>
29#include <asm/kmap_types.h> 30#include <asm/kmap_types.h>
30#include "extent_io.h" 31#include "extent_io.h"
31#include "extent_map.h" 32#include "extent_map.h"
@@ -2327,7 +2328,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2327int btrfs_readpage(struct file *file, struct page *page); 2328int btrfs_readpage(struct file *file, struct page *page);
2328void btrfs_delete_inode(struct inode *inode); 2329void btrfs_delete_inode(struct inode *inode);
2329void btrfs_put_inode(struct inode *inode); 2330void btrfs_put_inode(struct inode *inode);
2330int btrfs_write_inode(struct inode *inode, int wait); 2331int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2331void btrfs_dirty_inode(struct inode *inode); 2332void btrfs_dirty_inode(struct inode *inode);
2332struct inode *btrfs_alloc_inode(struct super_block *sb); 2333struct inode *btrfs_alloc_inode(struct super_block *sb);
2333void btrfs_destroy_inode(struct inode *inode); 2334void btrfs_destroy_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..902ce507c4e3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "delayed-ref.h" 23#include "delayed-ref.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6632e5c4c8bb..e7b8f2c89ccb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h>
30#include "compat.h" 31#include "compat.h"
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 101041d4d2b2..9e23ffea7f54 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h>
25#include "compat.h" 26#include "compat.h"
26#include "hash.h" 27#include "hash.h"
27#include "ctree.h" 28#include "ctree.h"
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fc742e59815e..d2d03684fab2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/bio.h> 3#include <linux/bio.h>
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h> 5#include <linux/pagemap.h>
7#include <linux/page-flags.h> 6#include <linux/page-flags.h>
8#include <linux/module.h> 7#include <linux/module.h>
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 5a01f35507dd..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
1#include <linux/err.h> 1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h> 2#include <linux/slab.h>
4#include <linux/module.h> 3#include <linux/module.h>
5#include <linux/spinlock.h> 4#include <linux/spinlock.h>
@@ -242,7 +241,7 @@ out:
242 * Insert @em into @tree or perform a simple forward/backward merge with 241 * Insert @em into @tree or perform a simple forward/backward merge with
243 * existing mappings. The extent_map struct passed in will be inserted 242 * existing mappings. The extent_map struct passed in will be inserted
244 * into the tree directly, with an additional reference taken, or a 243 * into the tree directly, with an additional reference taken, or a
 245 * reference dropped if the merge attempt was sucessfull. 244 * reference dropped if the merge attempt was successful.
246 */ 245 */
247int add_extent_mapping(struct extent_map_tree *tree, 246int add_extent_mapping(struct extent_map_tree *tree,
248 struct extent_map *em) 247 struct extent_map *em)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..54a255065aa3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include "ctree.h" 23#include "ctree.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d146dde7efb6..29ff749ff4ca 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/statfs.h> 29#include <linux/statfs.h>
30#include <linux/compat.h> 30#include <linux/compat.h>
31#include <linux/slab.h>
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
33#include "transaction.h" 34#include "transaction.h"
@@ -839,7 +840,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
839 unsigned long last_index; 840 unsigned long last_index;
840 int will_write; 841 int will_write;
841 842
842 will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) || 843 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
843 (file->f_flags & O_DIRECT)); 844 (file->f_flags & O_DIRECT));
844 845
845 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, 846 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
@@ -1006,7 +1007,7 @@ out_nolock:
1006 if (err) 1007 if (err)
1007 num_written = err; 1008 num_written = err;
1008 1009
1009 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 1010 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1010 trans = btrfs_start_transaction(root, 1); 1011 trans = btrfs_start_transaction(root, 1);
1011 ret = btrfs_log_dentry_safe(trans, root, 1012 ret = btrfs_log_dentry_safe(trans, root,
1012 file->f_dentry); 1013 file->f_dentry);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dd831ed31eea..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/math64.h> 22#include <linux/math64.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "free-space-cache.h" 24#include "free-space-cache.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a85b90c86cb0..2bfdc641d4e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/xattr.h> 36#include <linux/xattr.h>
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h>
39#include "compat.h" 40#include "compat.h"
40#include "ctree.h" 41#include "ctree.h"
41#include "disk-io.h" 42#include "disk-io.h"
@@ -3943,7 +3944,7 @@ err:
3943 return ret; 3944 return ret;
3944} 3945}
3945 3946
3946int btrfs_write_inode(struct inode *inode, int wait) 3947int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3947{ 3948{
3948 struct btrfs_root *root = BTRFS_I(inode)->root; 3949 struct btrfs_root *root = BTRFS_I(inode)->root;
3949 struct btrfs_trans_handle *trans; 3950 struct btrfs_trans_handle *trans;
@@ -3952,7 +3953,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
3952 if (root->fs_info->btree_inode == inode) 3953 if (root->fs_info->btree_inode == inode)
3953 return 0; 3954 return 0;
3954 3955
3955 if (wait) { 3956 if (wbc->sync_mode == WB_SYNC_ALL) {
3956 trans = btrfs_join_transaction(root, 1); 3957 trans = btrfs_join_transaction(root, 1);
3957 btrfs_set_trans_block_group(trans, inode); 3958 btrfs_set_trans_block_group(trans, inode);
3958 ret = btrfs_commit_transaction(trans, root); 3959 ret = btrfs_commit_transaction(trans, root);
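
->write_inode loses its 'int wait' flag here; integrity writeback is now signalled through wbc->sync_mode. A skeletal sketch of the new contract, with the two helpers being hypothetical placeholders rather than btrfs functions:

        static int write_inode_sketch(struct inode *inode,
                                      struct writeback_control *wbc)
        {
                if (wbc->sync_mode == WB_SYNC_ALL)
                        return commit_inode_now(inode);   /* hypothetical: sync commit */

                queue_inode_for_writeback(inode);         /* hypothetical: async path */
                return 0;
        }
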
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2b7dd88fc54f..e84ef60ffe35 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
39#include <linux/security.h> 39#include <linux/security.h>
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h>
42#include "compat.h" 43#include "compat.h"
43#include "ctree.h" 44#include "ctree.h"
44#include "disk-io.h" 45#include "disk-io.h"
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/spinlock.h> 20#include <linux/spinlock.h>
22#include <linux/page-flags.h> 21#include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c99882b9763..a127c0ebb2dc 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/gfp.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "ref-cache.h" 23#include "ref-cache.h"
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0b23942cbc0d..e558dd941ded 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "disk-io.h" 26#include "disk-io.h"
26#include "transaction.h" 27#include "transaction.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d11b12fc086b..1866dff0538e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a240b6fa81df..4ce16ef702a3 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj)
164 complete(&root->kobj_unregister); 164 complete(&root->kobj_unregister);
165} 165}
166 166
167static struct sysfs_ops btrfs_super_attr_ops = { 167static const struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show, 168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store, 169 .store = btrfs_super_attr_store,
170}; 170};
171 171
172static struct sysfs_ops btrfs_root_attr_ops = { 172static const struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show, 173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store, 174 .store = btrfs_root_attr_store,
175}; 175};
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 01cebd661997..2cb116099b90 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1255fcc8ade5..af57dd2b43d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "disk-io.h" 23#include "disk-io.h"
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9bf1f581b872..aa7dc36dac78 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/random.h> 23#include <linux/random.h>
diff --git a/fs/buffer.c b/fs/buffer.c
index 6fa530256bfd..c9c266db0624 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2893,7 +2893,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2893 2893
2894 /* 2894 /*
2895 * The page straddles i_size. It must be zeroed out on each and every 2895 * The page straddles i_size. It must be zeroed out on each and every
2896 * writepage invokation because it may be mmapped. "A file is mapped 2896 * writepage invocation because it may be mmapped. "A file is mapped
2897 * in multiples of the page size. For a file that is not a multiple of 2897 * in multiples of the page size. For a file that is not a multiple of
2898 * the page size, the remaining memory is zeroed when mapped, and 2898 * the page size, the remaining memory is zeroed when mapped, and
2899 * writes to that region are not written out to the file." 2899 * writes to that region are not written out to the file."
@@ -3265,7 +3265,7 @@ static void recalc_bh_state(void)
3265 3265
3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3267{ 3267{
3268 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); 3268 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3269 if (ret) { 3269 if (ret) {
3270 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3270 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3271 get_cpu_var(bh_accounting).nr++; 3271 get_cpu_var(bh_accounting).nr++;
@@ -3352,15 +3352,6 @@ int bh_submit_read(struct buffer_head *bh)
3352} 3352}
3353EXPORT_SYMBOL(bh_submit_read); 3353EXPORT_SYMBOL(bh_submit_read);
3354 3354
3355static void
3356init_buffer_head(void *data)
3357{
3358 struct buffer_head *bh = data;
3359
3360 memset(bh, 0, sizeof(*bh));
3361 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3362}
3363
3364void __init buffer_init(void) 3355void __init buffer_init(void)
3365{ 3356{
3366 int nrpages; 3357 int nrpages;
@@ -3369,7 +3360,7 @@ void __init buffer_init(void)
3369 sizeof(struct buffer_head), 0, 3360 sizeof(struct buffer_head), 0,
3370 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 3361 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3371 SLAB_MEM_SPREAD), 3362 SLAB_MEM_SPREAD),
3372 init_buffer_head); 3363 NULL);
3373 3364
3374 /* 3365 /*
3375 * Limit the bh occupancy to 10% of ZONE_NORMAL 3366 * Limit the bh occupancy to 10% of ZONE_NORMAL
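
Dropping init_buffer_head() trades a slab constructor (which requires objects to be returned to the cache fully "constructed") for per-allocation zeroing. A sketch of the allocation side after the change, assuming a cache created with a NULL ctor as in buffer_init(); the wrapper name is illustrative:

        struct buffer_head *alloc_bh_sketch(struct kmem_cache *cachep, gfp_t gfp)
        {
                struct buffer_head *bh = kmem_cache_zalloc(cachep, gfp);

                if (bh)
                        INIT_LIST_HEAD(&bh->b_assoc_buffers); /* per-alloc init */
                return bh;
        }
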
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 3797e0077b35..2906077ac798 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -84,7 +84,7 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
84static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) 84static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
85{ 85{
86 struct cachefiles_object *fsdef; 86 struct cachefiles_object *fsdef;
87 struct nameidata nd; 87 struct path path;
88 struct kstatfs stats; 88 struct kstatfs stats;
89 struct dentry *graveyard, *cachedir, *root; 89 struct dentry *graveyard, *cachedir, *root;
90 const struct cred *saved_cred; 90 const struct cred *saved_cred;
@@ -114,15 +114,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
114 _debug("- fsdef %p", fsdef); 114 _debug("- fsdef %p", fsdef);
115 115
116 /* look up the directory at the root of the cache */ 116 /* look up the directory at the root of the cache */
117 memset(&nd, 0, sizeof(nd)); 117 ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path);
118
119 ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
120 if (ret < 0) 118 if (ret < 0)
121 goto error_open_root; 119 goto error_open_root;
122 120
123 cache->mnt = mntget(nd.path.mnt); 121 cache->mnt = path.mnt;
124 root = dget(nd.path.dentry); 122 root = path.dentry;
125 path_put(&nd.path);
126 123
127 /* check parameters */ 124 /* check parameters */
128 ret = -EOPNOTSUPP; 125 ret = -EOPNOTSUPP;
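
kern_path() hands back a struct path whose vfsmount and dentry references the caller owns outright, so the nameidata memset and the mntget/dget/path_put shuffle disappear. A sketch of the idiom under that assumption; the helper is illustrative:

        static int open_cache_root_sketch(const char *rootdirname,
                                          struct vfsmount **mntp,
                                          struct dentry **rootp)
        {
                struct path path;
                int ret = kern_path(rootdirname, LOOKUP_DIRECTORY, &path);

                if (ret < 0)
                        return ret;
                *mntp  = path.mnt;     /* keep the references kern_path took... */
                *rootp = path.dentry;  /* ...no extra get/put round trip needed */
                return 0;
        }
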
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 4618516dd994..c2413561ea75 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -21,6 +21,7 @@
21#include <linux/mount.h> 21#include <linux/mount.h>
22#include <linux/statfs.h> 22#include <linux/statfs.h>
23#include <linux/ctype.h> 23#include <linux/ctype.h>
24#include <linux/string.h>
24#include <linux/fs_struct.h> 25#include <linux/fs_struct.h>
25#include "internal.h" 26#include "internal.h"
26 27
@@ -257,8 +258,7 @@ static ssize_t cachefiles_daemon_write(struct file *file,
257 if (args == data) 258 if (args == data)
258 goto error; 259 goto error;
259 *args = '\0'; 260 *args = '\0';
260 for (args++; isspace(*args); args++) 261 args = skip_spaces(++args);
261 continue;
262 } 262 }
263 263
264 /* run the appropriate command handler */ 264 /* run the appropriate command handler */
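
skip_spaces() (from <linux/string.h>) returns a pointer past any leading whitespace, replacing the open-coded isspace() loop. A sketch of the keyword/argument split as it reads after the cleanup; the helper is illustrative:

        static char *split_args_sketch(char *data)
        {
                char *args = strchr(data, ' ');

                if (!args)
                        return NULL;       /* keyword only, no arguments */
                *args = '\0';              /* terminate the keyword */
                return skip_spaces(++args);
        }
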
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 27089311fbea..37fe101a4e0d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <linux/mount.h> 13#include <linux/mount.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
14#include "internal.h" 15#include "internal.h"
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 14ac4806e291..d5db84a1ee0d 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -19,6 +19,7 @@
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/security.h> 21#include <linux/security.h>
22#include <linux/slab.h>
22#include "internal.h" 23#include "internal.h"
23 24
24#define CACHEFILES_KEYBUF_SIZE 512 25#define CACHEFILES_KEYBUF_SIZE 512
@@ -348,7 +349,17 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
348 dir = dget_parent(object->dentry); 349 dir = dget_parent(object->dentry);
349 350
350 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 351 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
351 ret = cachefiles_bury_object(cache, dir, object->dentry); 352
353 /* we need to check that our parent is _still_ our parent - it may have
354 * been renamed */
355 if (dir == object->dentry->d_parent) {
356 ret = cachefiles_bury_object(cache, dir, object->dentry);
357 } else {
358 /* it got moved, presumably by cachefilesd culling it, so it's
359 * no longer in the key path and we can ignore it */
360 mutex_unlock(&dir->d_inode->i_mutex);
361 ret = 0;
362 }
352 363
353 dput(dir); 364 dput(dir);
354 _leave(" = %d", ret); 365 _leave(" = %d", ret);
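
The new parent re-check closes a race: between lookup and taking the parent's i_mutex, cachefilesd may cull the object, renaming the dentry into the graveyard, so the directory returned by dget_parent() can stop being the dentry's parent. A sketch of the pattern; my_bury_object() is a hypothetical stand-in that, like cachefiles_bury_object(), drops the lock itself:

        static int delete_under_parent_sketch(struct dentry *victim)
        {
                struct dentry *dir = dget_parent(victim);
                int ret = 0;

                mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
                if (dir == victim->d_parent)
                        ret = my_bury_object(dir, victim);    /* drops i_mutex */
                else
                        mutex_unlock(&dir->d_inode->i_mutex); /* culled; no-op */
                dput(dir);
                return ret;
        }
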
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index a6c8c6fe8df9..0f0d41fbb03f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -10,8 +10,8 @@
10 */ 10 */
11 11
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/slab.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/ima.h>
15#include "internal.h" 15#include "internal.h"
16 16
17/* 17/*
@@ -923,7 +923,6 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
923 if (IS_ERR(file)) { 923 if (IS_ERR(file)) {
924 ret = PTR_ERR(file); 924 ret = PTR_ERR(file);
925 } else { 925 } else {
926 ima_counts_get(file);
927 ret = -EIO; 926 ret = -EIO;
928 if (file->f_op->write) { 927 if (file->f_op->write) {
929 pos = (loff_t) page->index << PAGE_SHIFT; 928 pos = (loff_t) page->index << PAGE_SHIFT;
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index f3e7a0bf068b..e18b183b47e1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -16,6 +16,7 @@
16#include <linux/fsnotify.h> 16#include <linux/fsnotify.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/xattr.h> 18#include <linux/xattr.h>
19#include <linux/slab.h>
19#include "internal.h" 20#include "internal.h"
20 21
21static const char cachefiles_xattr_cache[] = 22static const char cachefiles_xattr_cache[] =
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
 5 select CRYPTO_AES
6 help
7 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely
9 scalable file system designed to provide high performance,
10 reliable access to petabytes of storage.
11
12 More information at http://ceph.newdream.net/.
13
14 If unsure, say N.
15
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
 22 line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22#Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..aa3cd7cc3e40
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1195 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/slab.h>
9#include <linux/pagevec.h>
10#include <linux/task_io_accounting_ops.h>
11
12#include "super.h"
13#include "osd_client.h"
14
15/*
16 * Ceph address space ops.
17 *
18 * There are a few funny things going on here.
19 *
20 * The page->private field is used to reference a struct
21 * ceph_snap_context for _every_ dirty page. This indicates which
22 * snapshot the page was logically dirtied in, and thus which snap
23 * context needs to be associated with the osd write during writeback.
24 *
25 * Similarly, struct ceph_inode_info maintains a set of counters to
 26 * count dirty pages on the inode. In the absence of snapshots,
27 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
28 *
29 * When a snapshot is taken (that is, when the client receives
30 * notification that a snapshot was taken), each inode with caps and
31 * with dirty pages (dirty pages implies there is a cap) gets a new
32 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
33 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
34 * moved to capsnap->dirty. (Unless a sync write is currently in
35 * progress. In that case, the capsnap is said to be "pending", new
36 * writes cannot start, and the capsnap isn't "finalized" until the
37 * write completes (or fails) and a final size/mtime for the inode for
38 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
39 *
40 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
41 * we look for the first capsnap in i_cap_snaps and write out pages in
42 * that snap context _only_. Then we move on to the next capsnap,
43 * eventually reaching the "live" or "head" context (i.e., pages that
44 * are not yet snapped) and are writing the most recently dirtied
45 * pages.
46 *
47 * Invalidate and so forth must take care to ensure the dirty page
48 * accounting is preserved.
49 */
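        /*
         * A miniature of the accounting described above (illustrative
         * summary, in sketch form; names mirror struct ceph_inode_info):
         *
         *   dirty a page:   ci->i_wrbuffer_ref++; ci->i_wrbuffer_ref_head++;
         *   take snapshot:  capsnap->dirty = ci->i_wrbuffer_ref_head;
         *                   ci->i_wrbuffer_ref_head = 0;
         *   writeback:      flush pages of the oldest capsnap with
         *                   dirty_pages != 0 first, then the head context.
         */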
50
51#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
52#define CONGESTION_OFF_THRESH(congestion_kb) \
53 (CONGESTION_ON_THRESH(congestion_kb) - \
54 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
55
56
57
58/*
59 * Dirty a page. Optimistically adjust accounting, on the assumption
60 * that we won't race with invalidate. If we do, readjust.
61 */
62static int ceph_set_page_dirty(struct page *page)
63{
64 struct address_space *mapping = page->mapping;
65 struct inode *inode;
66 struct ceph_inode_info *ci;
67 int undo = 0;
68 struct ceph_snap_context *snapc;
69
70 if (unlikely(!mapping))
71 return !TestSetPageDirty(page);
72
73 if (TestSetPageDirty(page)) {
74 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
75 mapping->host, page, page->index);
76 return 0;
77 }
78
79 inode = mapping->host;
80 ci = ceph_inode(inode);
81
82 /*
83 * Note that we're grabbing a snapc ref here without holding
84 * any locks!
85 */
86 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
87
88 /* dirty the head */
89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0)
91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0)
94 igrab(inode);
95 ++ci->i_wrbuffer_ref;
96 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
97 "snapc %p seq %lld (%d snaps)\n",
98 mapping->host, page, page->index,
99 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
100 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
101 snapc, snapc->seq, snapc->num_snaps);
102 spin_unlock(&inode->i_lock);
103
104 /* now adjust page */
105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page));
108
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY);
117
118 /*
119 * Reference snap context in page->private. Also set
120 * PagePrivate so that we get invalidatepage callback.
121 */
122 page->private = (unsigned long)snapc;
123 SetPagePrivate(page);
124 } else {
125 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
126 undo = 1;
127 }
128
129 spin_unlock_irq(&mapping->tree_lock);
130
131 if (undo)
132 /* whoops, we failed to dirty the page */
133 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
134
135 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
136
137 BUG_ON(!PageDirty(page));
138 return 1;
139}
140
141/*
142 * If we are truncating the full page (i.e. offset == 0), adjust the
143 * dirty page counters appropriately. Only called if there is private
144 * data on the page.
145 */
146static void ceph_invalidatepage(struct page *page, unsigned long offset)
147{
148 struct inode *inode;
149 struct ceph_inode_info *ci;
150 struct ceph_snap_context *snapc = (void *)page->private;
151
152 BUG_ON(!PageLocked(page));
153 BUG_ON(!page->private);
154 BUG_ON(!PagePrivate(page));
155 BUG_ON(!page->mapping);
156
157 inode = page->mapping->host;
158
159 /*
160 * We can get non-dirty pages here due to races between
161 * set_page_dirty and truncate_complete_page; just spit out a
162 * warning, in case we end up with accounting problems later.
163 */
164 if (!PageDirty(page))
165 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
166
167 if (offset == 0)
168 ClearPageChecked(page);
169
170 ci = ceph_inode(inode);
171 if (offset == 0) {
172 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
173 inode, page, page->index, offset);
174 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
175 ceph_put_snap_context(snapc);
176 page->private = 0;
177 ClearPagePrivate(page);
178 } else {
179 dout("%p invalidatepage %p idx %lu partial dirty page\n",
180 inode, page, page->index);
181 }
182}
183
184/* just a sanity check */
185static int ceph_releasepage(struct page *page, gfp_t g)
186{
187 struct inode *inode = page->mapping ? page->mapping->host : NULL;
188 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
189 WARN_ON(PageDirty(page));
190 WARN_ON(page->private);
191 WARN_ON(PagePrivate(page));
192 return 0;
193}
194
195/*
196 * read a single page, without unlocking it.
197 */
198static int readpage_nounlock(struct file *filp, struct page *page)
199{
200 struct inode *inode = filp->f_dentry->d_inode;
201 struct ceph_inode_info *ci = ceph_inode(inode);
202 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
203 int err = 0;
204 u64 len = PAGE_CACHE_SIZE;
205
206 dout("readpage inode %p file %p page %p index %lu\n",
207 inode, filp, page, page->index);
208 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
209 page->index << PAGE_CACHE_SHIFT, &len,
210 ci->i_truncate_seq, ci->i_truncate_size,
211 &page, 1);
212 if (err == -ENOENT)
213 err = 0;
214 if (err < 0) {
215 SetPageError(page);
216 goto out;
217 } else if (err < PAGE_CACHE_SIZE) {
218 /* zero fill remainder of page */
219 zero_user_segment(page, err, PAGE_CACHE_SIZE);
220 }
221 SetPageUptodate(page);
222
223out:
224 return err < 0 ? err : 0;
225}
226
227static int ceph_readpage(struct file *filp, struct page *page)
228{
229 int r = readpage_nounlock(filp, page);
230 unlock_page(page);
231 return r;
232}
233
234/*
235 * Build a vector of contiguous pages from the provided page list.
236 */
237static struct page **page_vector_from_list(struct list_head *page_list,
238 unsigned *nr_pages)
239{
240 struct page **pages;
241 struct page *page;
242 int next_index, contig_pages = 0;
243
244 /* build page vector */
245 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
246 if (!pages)
247 return ERR_PTR(-ENOMEM);
248
249 BUG_ON(list_empty(page_list));
250 next_index = list_entry(page_list->prev, struct page, lru)->index;
251 list_for_each_entry_reverse(page, page_list, lru) {
252 if (page->index == next_index) {
253 dout("readpages page %d %p\n", contig_pages, page);
254 pages[contig_pages] = page;
255 contig_pages++;
256 next_index++;
257 } else {
258 break;
259 }
260 }
261 *nr_pages = contig_pages;
262 return pages;
263}
264
265/*
266 * Read multiple pages. Leave pages we don't read + unlock in page_list;
267 * the caller (VM) cleans them up.
268 */
269static int ceph_readpages(struct file *file, struct address_space *mapping,
270 struct list_head *page_list, unsigned nr_pages)
271{
272 struct inode *inode = file->f_dentry->d_inode;
273 struct ceph_inode_info *ci = ceph_inode(inode);
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0;
276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset;
279 u64 len;
280
281 dout("readpages %p file %p nr_pages %d\n",
282 inode, file, nr_pages);
283
284 pages = page_vector_from_list(page_list, &nr_pages);
285 if (IS_ERR(pages))
286 return PTR_ERR(pages);
287
288 /* guess read extent */
289 offset = pages[0]->index << PAGE_CACHE_SHIFT;
290 len = nr_pages << PAGE_CACHE_SHIFT;
291 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
292 offset, &len,
293 ci->i_truncate_seq, ci->i_truncate_size,
294 pages, nr_pages);
295 if (rc == -ENOENT)
296 rc = 0;
297 if (rc < 0)
298 goto out;
299
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page =
305 list_entry(page_list->prev, struct page, lru);
306
307 list_del(&page->lru);
308
309 if (rc < (int)PAGE_CACHE_SIZE) {
310 /* zero (remainder of) page */
311 int s = rc < 0 ? 0 : rc;
312 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 }
314
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page);
319 continue;
320 }
321 dout("readpages %p adding %p idx %lu\n", inode, page,
322 page->index);
323 flush_dcache_page(page);
324 SetPageUptodate(page);
325 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0)
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0;
331
332out:
333 kfree(pages);
334 return rc;
335}
336
337/*
338 * Get ref for the oldest snapc for an inode with dirty data... that is, the
339 * only snap context we are allowed to write back.
340 *
341 * Caller holds i_lock.
342 */
343static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
344 u64 *snap_size)
345{
346 struct ceph_inode_info *ci = ceph_inode(inode);
347 struct ceph_snap_context *snapc = NULL;
348 struct ceph_cap_snap *capsnap = NULL;
349
350 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
351 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
352 capsnap->context, capsnap->dirty_pages);
353 if (capsnap->dirty_pages) {
354 snapc = ceph_get_snap_context(capsnap->context);
355 if (snap_size)
356 *snap_size = capsnap->size;
357 break;
358 }
359 }
360 if (!snapc && ci->i_snap_realm) {
361 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
362 dout(" head snapc %p has %d dirty pages\n",
363 snapc, ci->i_wrbuffer_ref_head);
364 }
365 return snapc;
366}
367
368static struct ceph_snap_context *get_oldest_context(struct inode *inode,
369 u64 *snap_size)
370{
371 struct ceph_snap_context *snapc = NULL;
372
373 spin_lock(&inode->i_lock);
374 snapc = __get_oldest_context(inode, snap_size);
375 spin_unlock(&inode->i_lock);
376 return snapc;
377}
378
379/*
380 * Write a single page, but leave the page locked.
381 *
382 * If we get a write error, set the page error bit, but still adjust the
383 * dirty page accounting (i.e., page is no longer dirty).
384 */
385static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
386{
387 struct inode *inode;
388 struct ceph_inode_info *ci;
389 struct ceph_client *client;
390 struct ceph_osd_client *osdc;
391 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
392 int len = PAGE_CACHE_SIZE;
393 loff_t i_size;
394 int err = 0;
395 struct ceph_snap_context *snapc;
396 u64 snap_size = 0;
397 long writeback_stat;
398
399 dout("writepage %p idx %lu\n", page, page->index);
400
401 if (!page->mapping || !page->mapping->host) {
402 dout("writepage %p - no mapping\n", page);
403 return -EFAULT;
404 }
405 inode = page->mapping->host;
406 ci = ceph_inode(inode);
407 client = ceph_inode_to_client(inode);
408 osdc = &client->osdc;
409
410 /* verify this is a writeable snap context */
411 snapc = (void *)page->private;
412 if (snapc == NULL) {
413 dout("writepage %p page %p not dirty?\n", inode, page);
414 goto out;
415 }
416 if (snapc != get_oldest_context(inode, &snap_size)) {
417 dout("writepage %p page %p snapc %p not writeable - noop\n",
418 inode, page, (void *)page->private);
419 /* we should only noop if called by kswapd */
420 WARN_ON((current->flags & PF_MEMALLOC) == 0);
421 goto out;
422 }
423
424 /* is this a partial page at end of file? */
425 if (snap_size)
426 i_size = snap_size;
427 else
428 i_size = i_size_read(inode);
429 if (i_size < page_off + len)
430 len = i_size - page_off;
431
432 dout("writepage %p page %p index %lu on %llu~%u\n",
433 inode, page, page->index, page_off, len);
434
435 writeback_stat = atomic_long_inc_return(&client->writeback_count);
436 if (writeback_stat >
437 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
438 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
439
440 set_page_writeback(page);
441 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
442 &ci->i_layout, snapc,
443 page_off, len,
444 ci->i_truncate_seq, ci->i_truncate_size,
445 &inode->i_mtime,
446 &page, 1, 0, 0, true);
447 if (err < 0) {
448 dout("writepage setting page/mapping error %d %p\n", err, page);
449 SetPageError(page);
450 mapping_set_error(&inode->i_data, err);
451 if (wbc)
452 wbc->pages_skipped++;
453 } else {
454 dout("writepage cleaned page %p\n", page);
455 err = 0; /* vfs expects us to return 0 */
456 }
457 page->private = 0;
458 ClearPagePrivate(page);
459 end_page_writeback(page);
460 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
461 ceph_put_snap_context(snapc);
462out:
463 return err;
464}
465
466static int ceph_writepage(struct page *page, struct writeback_control *wbc)
467{
468 int err;
469 struct inode *inode = page->mapping->host;
470 BUG_ON(!inode);
471 igrab(inode);
472 err = writepage_nounlock(page, wbc);
473 unlock_page(page);
474 iput(inode);
475 return err;
476}
477
478
479/*
480 * lame release_pages helper. release_pages() isn't exported to
481 * modules.
482 */
483static void ceph_release_pages(struct page **pages, int num)
484{
485 struct pagevec pvec;
486 int i;
487
488 pagevec_init(&pvec, 0);
489 for (i = 0; i < num; i++) {
490 if (pagevec_add(&pvec, pages[i]) == 0)
491 pagevec_release(&pvec);
492 }
493 pagevec_release(&pvec);
494}
495
496
497/*
498 * async writeback completion handler.
499 *
500 * If we get an error, set the mapping error bit, but not the individual
501 * page error bits.
502 */
503static void writepages_finish(struct ceph_osd_request *req,
504 struct ceph_msg *msg)
505{
506 struct inode *inode = req->r_inode;
507 struct ceph_osd_reply_head *replyhead;
508 struct ceph_osd_op *op;
509 struct ceph_inode_info *ci = ceph_inode(inode);
510 unsigned wrote;
511 struct page *page;
512 int i;
513 struct ceph_snap_context *snapc = req->r_snapc;
514 struct address_space *mapping = inode->i_mapping;
515 struct writeback_control *wbc = req->r_wbc;
516 __s32 rc = -EIO;
517 u64 bytes = 0;
518 struct ceph_client *client = ceph_inode_to_client(inode);
519 long writeback_stat;
520 unsigned issued = __ceph_caps_issued(ci, NULL);
521
522 /* parse reply */
523 replyhead = msg->front.iov_base;
524 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
525 op = (void *)(replyhead + 1);
526 rc = le32_to_cpu(replyhead->result);
527 bytes = le64_to_cpu(op->extent.length);
528
529 if (rc >= 0) {
530 /*
531 * Assume we wrote the pages we originally sent. The
532 * osd might reply with fewer pages if our writeback
533 * raced with a truncation and was adjusted at the osd,
534 * so don't believe the reply.
535 */
536 wrote = req->r_num_pages;
537 } else {
538 wrote = 0;
539 mapping_set_error(mapping, rc);
540 }
541 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
542 inode, rc, bytes, wrote);
543
544 /* clean all pages */
545 for (i = 0; i < req->r_num_pages; i++) {
546 page = req->r_pages[i];
547 BUG_ON(!page);
548 WARN_ON(!PageUptodate(page));
549
550 writeback_stat =
551 atomic_long_dec_return(&client->writeback_count);
552 if (writeback_stat <
553 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
554 clear_bdi_congested(&client->backing_dev_info,
555 BLK_RW_ASYNC);
556
557 if (i >= wrote) {
558 dout("inode %p skipping page %p\n", inode, page);
559 wbc->pages_skipped++;
560 }
561 page->private = 0;
562 ClearPagePrivate(page);
563 ceph_put_snap_context(snapc);
564 dout("unlocking %d %p\n", i, page);
565 end_page_writeback(page);
566
567 /*
568 * We lost the cache cap, need to truncate the page before
569 * it is unlocked, otherwise we'd truncate it later in the
570 * page truncation thread, possibly losing some data that
571 * raced its way in
572 */
573 if ((issued & CEPH_CAP_FILE_CACHE) == 0)
574 generic_error_remove_page(inode->i_mapping, page);
575
576 unlock_page(page);
577 }
578 dout("%p wrote+cleaned %d pages\n", inode, wrote);
579 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
580
581 ceph_release_pages(req->r_pages, req->r_num_pages);
582 if (req->r_pages_from_pool)
583 mempool_free(req->r_pages,
584 ceph_client(inode->i_sb)->wb_pagevec_pool);
585 else
586 kfree(req->r_pages);
587 ceph_osdc_put_request(req);
588}
589
590/*
591 * allocate a page vec, either directly or, if necessary, via the
592 * mempool. we avoid the mempool when we can because req->r_num_pages
593 * may be less than the maximum write size.
594 */
595static void alloc_page_vec(struct ceph_client *client,
596 struct ceph_osd_request *req)
597{
598 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
599 GFP_NOFS);
600 if (!req->r_pages) {
601 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
602 req->r_pages_from_pool = 1;
603 WARN_ON(!req->r_pages);
604 }
605}
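/*
 * The pool alloc_page_vec() falls back on must be sized for the
 * largest possible request.  A plausible mount-time setup, shown only
 * as a sketch (the real call lives in super.c, and the reserve count
 * of 10 is an assumption):
 */
#if 0
client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
	(client->mount_args->wsize >> PAGE_CACHE_SHIFT) *
	sizeof(struct page *));
#endif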
606
607/*
608 * initiate async writeback
609 */
610static int ceph_writepages_start(struct address_space *mapping,
611 struct writeback_control *wbc)
612{
613 struct inode *inode = mapping->host;
614 struct backing_dev_info *bdi = mapping->backing_dev_info;
615 struct ceph_inode_info *ci = ceph_inode(inode);
616 struct ceph_client *client;
617 pgoff_t index, start, end;
618 int range_whole = 0;
619 int should_loop = 1;
620 pgoff_t max_pages = 0, max_pages_ever = 0;
621 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
622 struct pagevec pvec;
623 int done = 0;
624 int rc = 0;
625 unsigned wsize = 1 << inode->i_blkbits;
626 struct ceph_osd_request *req = NULL;
627 int do_sync;
628 u64 snap_size = 0;
629
630 /*
631 * Include a 'sync' in the OSD request if this is a data
632 * integrity write (e.g., O_SYNC write or fsync()), or if our
633 * cap is being revoked.
634 */
635 do_sync = wbc->sync_mode == WB_SYNC_ALL;
636 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
637 do_sync = 1;
638 dout("writepages_start %p dosync=%d (mode=%s)\n",
639 inode, do_sync,
640 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
641 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
642
643 client = ceph_inode_to_client(inode);
644 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
645 pr_warning("writepage_start %p on forced umount\n", inode);
646 return -EIO; /* we're in a forced umount, don't write! */
647 }
648 if (client->mount_args->wsize && client->mount_args->wsize < wsize)
649 wsize = client->mount_args->wsize;
650 if (wsize < PAGE_CACHE_SIZE)
651 wsize = PAGE_CACHE_SIZE;
652 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
653
654 pagevec_init(&pvec, 0);
655
656 /* bail early if the bdi is congested and the caller can't block */
657 if (wbc->nonblocking && bdi_write_congested(bdi)) {
658 dout(" writepages congested\n");
659 wbc->encountered_congestion = 1;
660 goto out_final;
661 }
662
663 /* where to start/end? */
664 if (wbc->range_cyclic) {
665 start = mapping->writeback_index; /* Start from prev offset */
666 end = -1;
667 dout(" cyclic, start at %lu\n", start);
668 } else {
669 start = wbc->range_start >> PAGE_CACHE_SHIFT;
670 end = wbc->range_end >> PAGE_CACHE_SHIFT;
671 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
672 range_whole = 1;
673 should_loop = 0;
674 dout(" not cyclic, %lu to %lu\n", start, end);
675 }
676 index = start;
677
678retry:
679 /* find oldest snap context with dirty data */
680 ceph_put_snap_context(snapc);
681 snapc = get_oldest_context(inode, &snap_size);
682 if (!snapc) {
683 /* hmm, why does writepages get called when there
684 is no dirty data? */
685 dout(" no snap context with dirty data?\n");
686 goto out;
687 }
688 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
689 snapc, snapc->seq, snapc->num_snaps);
690 if (last_snapc && snapc != last_snapc) {
691 /* if we switched to a newer snapc, restart our scan at the
692 * start of the original file range. */
693 dout(" snapc differs from last pass, restarting at %lu\n",
694 index);
695 index = start;
696 }
697 last_snapc = snapc;
698
699 while (!done && index <= end) {
700 unsigned i;
701 int first;
702 pgoff_t next;
703 int pvec_pages, locked_pages;
704 struct page *page;
705 int want;
706 u64 offset, len;
707 struct ceph_osd_request_head *reqhead;
708 struct ceph_osd_op *op;
709 long writeback_stat;
710
711 next = 0;
712 locked_pages = 0;
713 max_pages = max_pages_ever;
714
715get_more_pages:
716 first = -1;
717 want = min(end - index,
718 min((pgoff_t)PAGEVEC_SIZE,
719 max_pages - (pgoff_t)locked_pages) - 1)
720 + 1;
721 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
722 PAGECACHE_TAG_DIRTY,
723 want);
724 dout("pagevec_lookup_tag got %d\n", pvec_pages);
725 if (!pvec_pages && !locked_pages)
726 break;
727 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
728 page = pvec.pages[i];
729 dout("? %p idx %lu\n", page, page->index);
730 if (locked_pages == 0)
731 lock_page(page); /* first page */
732 else if (!trylock_page(page))
733 break;
734
735 /* only dirty pages, or our accounting breaks */
736 if (unlikely(!PageDirty(page)) ||
737 unlikely(page->mapping != mapping)) {
738 dout("!dirty or !mapping %p\n", page);
739 unlock_page(page);
740 break;
741 }
742 if (!wbc->range_cyclic && page->index > end) {
743 dout("end of range %p\n", page);
744 done = 1;
745 unlock_page(page);
746 break;
747 }
748 if (next && (page->index != next)) {
749 dout("not consecutive %p\n", page);
750 unlock_page(page);
751 break;
752 }
753 if (wbc->sync_mode != WB_SYNC_NONE) {
754 dout("waiting on writeback %p\n", page);
755 wait_on_page_writeback(page);
756 }
757 if ((snap_size && page_offset(page) > snap_size) ||
758 (!snap_size &&
759 page_offset(page) > i_size_read(inode))) {
760 dout("%p page eof %llu\n", page, snap_size ?
761 snap_size : i_size_read(inode));
762 done = 1;
763 unlock_page(page);
764 break;
765 }
766 if (PageWriteback(page)) {
767 dout("%p under writeback\n", page);
768 unlock_page(page);
769 break;
770 }
771
772 /* only if matching snap context */
773 if (snapc != (void *)page->private) {
774 dout("page snapc %p != oldest %p\n",
775 (void *)page->private, snapc);
776 unlock_page(page);
777 if (!locked_pages)
778 continue; /* keep looking for snap */
779 break;
780 }
781
782 if (!clear_page_dirty_for_io(page)) {
783 dout("%p !clear_page_dirty_for_io\n", page);
784 unlock_page(page);
785 break;
786 }
787
788 /* ok */
789 if (locked_pages == 0) {
790 /* prepare async write request */
791 offset = (u64)page->index << PAGE_CACHE_SHIFT;
792 len = wsize;
793 req = ceph_osdc_new_request(&client->osdc,
794 &ci->i_layout,
795 ceph_vino(inode),
796 offset, &len,
797 CEPH_OSD_OP_WRITE,
798 CEPH_OSD_FLAG_WRITE |
799 CEPH_OSD_FLAG_ONDISK,
800 snapc, do_sync,
801 ci->i_truncate_seq,
802 ci->i_truncate_size,
803 &inode->i_mtime, true, 1);
804 max_pages = req->r_num_pages;
805
806 alloc_page_vec(client, req);
807 req->r_callback = writepages_finish;
808 req->r_inode = inode;
809 req->r_wbc = wbc;
810 }
811
812 /* note position of first page in pvec */
813 if (first < 0)
814 first = i;
815 dout("%p will write page %p idx %lu\n",
816 inode, page, page->index);
817
818 writeback_stat = atomic_long_inc_return(&client->writeback_count);
819 if (writeback_stat >
820 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
821 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
822
823 set_page_writeback(page);
824 req->r_pages[locked_pages] = page;
825 locked_pages++;
826 next = page->index + 1;
827 }
828
829 /* did we get anything? */
830 if (!locked_pages)
831 goto release_pvec_pages;
832 if (i) {
833 int j;
834 BUG_ON(!locked_pages || first < 0);
835
836 if (pvec_pages && i == pvec_pages &&
837 locked_pages < max_pages) {
838 dout("reached end pvec, trying for more\n");
839 pagevec_reinit(&pvec);
840 goto get_more_pages;
841 }
842
843 /* shift unused pages over in the pvec... we
844 * will need to release them below. */
845 for (j = i; j < pvec_pages; j++) {
846 dout(" pvec leftover page %p\n",
847 pvec.pages[j]);
848 pvec.pages[j-i+first] = pvec.pages[j];
849 }
850 pvec.nr -= i-first;
851 }
852
853 /* submit the write */
854 offset = (u64)req->r_pages[0]->index << PAGE_CACHE_SHIFT;
855 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
856 (u64)locked_pages << PAGE_CACHE_SHIFT);
857 dout("writepages got %d pages at %llu~%llu\n",
858 locked_pages, offset, len);
859
860 /* revise final length, page count */
861 req->r_num_pages = locked_pages;
862 reqhead = req->r_request->front.iov_base;
863 op = (void *)(reqhead + 1);
864 op->extent.length = cpu_to_le64(len);
865 op->payload_len = cpu_to_le32(len);
866 req->r_request->hdr.data_len = cpu_to_le32(len);
867
868 ceph_osdc_start_request(&client->osdc, req, true);
869 req = NULL;
870
871 /* continue? */
872 index = next;
873 wbc->nr_to_write -= locked_pages;
874 if (wbc->nr_to_write <= 0)
875 done = 1;
876
877release_pvec_pages:
878 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
879 pvec.nr ? pvec.pages[0] : NULL);
880 pagevec_release(&pvec);
881
882 if (locked_pages && !done)
883 goto retry;
884 }
885
886 if (should_loop && !done) {
887 /* more to do; loop back to beginning of file */
888 dout("writepages looping back to beginning of file\n");
889 should_loop = 0;
890 index = 0;
891 goto retry;
892 }
893
894 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
895 mapping->writeback_index = index;
896
897out:
898 if (req)
899 ceph_osdc_put_request(req);
900 if (rc > 0)
901 rc = 0; /* vfs expects us to return 0 */
902 ceph_put_snap_context(snapc);
903 dout("writepages done, rc = %d\n", rc);
904out_final:
905 return rc;
906}
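/*
 * Worked example of the sizing logic above, assuming the default file
 * layout of 4 MB objects (i_blkbits = 22) and 4 KB pages:
 *
 *   wsize          = 1 << 22                   = 4194304 bytes
 *   max_pages_ever = wsize >> PAGE_CACHE_SHIFT = 1024 pages
 *
 * so each async OSD write gathers at most 1024 consecutive dirty
 * pages; a smaller mount-time wsize only shrinks that bound, and it
 * is never allowed to drop below a single page.
 */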
907
908
909
910/*
911 * See if a given @snapc is either writeable, or already written.
912 */
913static int context_is_writeable_or_written(struct inode *inode,
914 struct ceph_snap_context *snapc)
915{
916 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
917 int ret = !oldest || snapc->seq <= oldest->seq;
918 ceph_put_snap_context(oldest); /* get_oldest_context took a ref */
919 return ret;
920}
919
920/*
921 * We are only allowed to write into/dirty the page if the page is
922 * clean, or already dirty within the same snap context.
923 *
924 * called with page locked.
925 * return success with page locked,
926 * or any failure (incl -EAGAIN) with page unlocked.
927 */
928static int ceph_update_writeable_page(struct file *file,
929 loff_t pos, unsigned len,
930 struct page *page)
931{
932 struct inode *inode = file->f_dentry->d_inode;
933 struct ceph_inode_info *ci = ceph_inode(inode);
934 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
935 loff_t page_off = pos & PAGE_CACHE_MASK;
936 int pos_in_page = pos & ~PAGE_CACHE_MASK;
937 int end_in_page = pos_in_page + len;
938 loff_t i_size;
939 struct ceph_snap_context *snapc;
940 int r;
941
942retry_locked:
943 /* wait for writeback (writepages holds the page lock now, but that may change) */
944 wait_on_page_writeback(page);
945
946 /* check snap context */
947 BUG_ON(!ci->i_snap_realm);
948 down_read(&mdsc->snap_rwsem);
949 BUG_ON(!ci->i_snap_realm->cached_context);
950 if (page->private &&
951 (void *)page->private != ci->i_snap_realm->cached_context) {
952 /*
953 * this page is already dirty in another (older) snap
954 * context! is it writeable now?
955 */
956 snapc = get_oldest_context(inode, NULL);
957 up_read(&mdsc->snap_rwsem);
958
959 if (snapc != (void *)page->private) {
960 dout(" page %p snapc %p not current or oldest\n",
961 page, (void *)page->private);
962 /*
963 * queue for writeback, and wait for snapc to
964 * be writeable or written
965 */
966 snapc = ceph_get_snap_context((void *)page->private);
967 unlock_page(page);
968 ceph_queue_writeback(inode);
969 r = wait_event_interruptible(ci->i_cap_wq,
970 context_is_writeable_or_written(inode, snapc));
971 ceph_put_snap_context(snapc);
972 if (r == -ERESTARTSYS)
973 return r;
974 return -EAGAIN;
975 }
976
977 /* yay, writeable, do it now (without dropping page lock) */
978 dout(" page %p snapc %p not current, but oldest\n",
979 page, snapc);
980 if (!clear_page_dirty_for_io(page))
981 goto retry_locked;
982 r = writepage_nounlock(page, NULL);
983 if (r < 0)
984 goto fail_nosnap;
985 goto retry_locked;
986 }
987
988 if (PageUptodate(page)) {
989 dout(" page %p already uptodate\n", page);
990 return 0;
991 }
992
993 /* full page? */
994 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
995 return 0;
996
997 /* past end of file? */
998 i_size = inode->i_size; /* caller holds i_mutex */
999
1000 if (i_size + len > inode->i_sb->s_maxbytes) {
1001 /* file is too big */
1002 r = -EINVAL;
1003 goto fail;
1004 }
1005
1006 if (page_off >= i_size ||
1007 (pos_in_page == 0 && (pos+len) >= i_size &&
1008 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1009 dout(" zeroing %p 0 - %d and %d - %d\n",
1010 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1011 zero_user_segments(page,
1012 0, pos_in_page,
1013 end_in_page, PAGE_CACHE_SIZE);
1014 return 0;
1015 }
1016
1017 /* we need to read it. */
1018 up_read(&mdsc->snap_rwsem);
1019 r = readpage_nounlock(file, page);
1020 if (r < 0)
1021 goto fail_nosnap;
1022 goto retry_locked;
1023
1024fail:
1025 up_read(&mdsc->snap_rwsem);
1026fail_nosnap:
1027 unlock_page(page);
1028 return r;
1029}
1030
1031/*
1032 * We are only allowed to write into/dirty the page if the page is
1033 * clean, or already dirty within the same snap context.
1034 */
1035static int ceph_write_begin(struct file *file, struct address_space *mapping,
1036 loff_t pos, unsigned len, unsigned flags,
1037 struct page **pagep, void **fsdata)
1038{
1039 struct inode *inode = file->f_dentry->d_inode;
1040 struct page *page;
1041 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1042 int r;
1043
1044 do {
1045 /* get a page */
1046 page = grab_cache_page_write_begin(mapping, index, 0);
1047 if (!page)
1048 return -ENOMEM;
1049 *pagep = page;
1050
1051 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1052 inode, page, (int)pos, (int)len);
1053
1054 r = ceph_update_writeable_page(file, pos, len, page);
1055 } while (r == -EAGAIN);
1056
1057 return r;
1058}
1059
1060/*
1061 * we don't do anything in here that simple_write_end doesn't do
1062 * except adjust dirty page accounting and drop read lock on
1063 * mdsc->snap_rwsem.
1064 */
1065static int ceph_write_end(struct file *file, struct address_space *mapping,
1066 loff_t pos, unsigned len, unsigned copied,
1067 struct page *page, void *fsdata)
1068{
1069 struct inode *inode = file->f_dentry->d_inode;
1070 struct ceph_client *client = ceph_inode_to_client(inode);
1071 struct ceph_mds_client *mdsc = &client->mdsc;
1072 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1073 int check_cap = 0;
1074
1075 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1076 inode, page, (int)pos, (int)copied, (int)len);
1077
1078 /* zero the stale part of the page if we did a short copy */
1079 if (copied < len)
1080 zero_user_segment(page, from+copied, from+len);
1081
1082 /* did file size increase? */
1083 /* no need for i_size_read(); the caller holds i_mutex */
1084 if (pos+copied > inode->i_size)
1085 check_cap = ceph_inode_set_size(inode, pos+copied);
1086
1087 if (!PageUptodate(page))
1088 SetPageUptodate(page);
1089
1090 set_page_dirty(page);
1091
1092 unlock_page(page);
1093 up_read(&mdsc->snap_rwsem);
1094 page_cache_release(page);
1095
1096 if (check_cap)
1097 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1098
1099 return copied;
1100}
1101
1102/*
1103 * we set .direct_IO to indicate direct io is supported, but since we
1104 * intercept O_DIRECT reads and writes early, this function should
1105 * never get called.
1106 */
1107static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1108 const struct iovec *iov,
1109 loff_t pos, unsigned long nr_segs)
1110{
1111 WARN_ON(1);
1112 return -EINVAL;
1113}
1114
1115const struct address_space_operations ceph_aops = {
1116 .readpage = ceph_readpage,
1117 .readpages = ceph_readpages,
1118 .writepage = ceph_writepage,
1119 .writepages = ceph_writepages_start,
1120 .write_begin = ceph_write_begin,
1121 .write_end = ceph_write_end,
1122 .set_page_dirty = ceph_set_page_dirty,
1123 .invalidatepage = ceph_invalidatepage,
1124 .releasepage = ceph_releasepage,
1125 .direct_IO = ceph_direct_io,
1126};
1127
1128
1129/*
1130 * vm ops
1131 */
1132
1133/*
1134 * Reuse the write_begin path (ceph_update_writeable_page) for simplicity.
1135 */
1136static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1137{
1138 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1139 struct page *page = vmf->page;
1140 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1141 loff_t off = (loff_t)page->index << PAGE_CACHE_SHIFT;
1142 loff_t size, len;
1143 int ret;
1144
1145 size = i_size_read(inode);
1146 if (off + PAGE_CACHE_SIZE <= size)
1147 len = PAGE_CACHE_SIZE;
1148 else
1149 len = size & ~PAGE_CACHE_MASK;
1150
1151 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1152 off, len, page, page->index);
1153
1154 lock_page(page);
1155
1156 ret = VM_FAULT_NOPAGE;
1157 if ((off > size) ||
1158 (page->mapping != inode->i_mapping))
1159 goto out;
1160
1161 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1162 if (ret == 0) {
1163 /* success. we'll keep the page locked. */
1164 set_page_dirty(page);
1165 up_read(&mdsc->snap_rwsem);
1166 ret = VM_FAULT_LOCKED;
1167 } else {
1168 if (ret == -ENOMEM)
1169 ret = VM_FAULT_OOM;
1170 else
1171 ret = VM_FAULT_SIGBUS;
1172 }
1173out:
1174 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1175 if (ret != VM_FAULT_LOCKED)
1176 unlock_page(page);
1177 return ret;
1178}
1179
1180static const struct vm_operations_struct ceph_vmops = {
1181 .fault = filemap_fault,
1182 .page_mkwrite = ceph_page_mkwrite,
1183};
1184
1185int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1186{
1187 struct address_space *mapping = file->f_mapping;
1188
1189 if (!mapping->a_ops->readpage)
1190 return -ENOEXEC;
1191 file_accessed(file);
1192 vma->vm_ops = &ceph_vmops;
1193 vma->vm_flags |= VM_CAN_NONLINEAR;
1194 return 0;
1195}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
4/*
5 * base64 encode/decode.
6 */
7
8static const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9
10static int encode_bits(int c)
11{
12 return pem_key[c];
13}
14
15static int decode_bits(char c)
16{
17 if (c >= 'A' && c <= 'Z')
18 return c - 'A';
19 if (c >= 'a' && c <= 'z')
20 return c - 'a' + 26;
21 if (c >= '0' && c <= '9')
22 return c - '0' + 52;
23 if (c == '+')
24 return 62;
25 if (c == '/')
26 return 63;
27 if (c == '=')
28 return 0; /* just non-negative, please */
29 return -EINVAL;
30}
31
32int ceph_armor(char *dst, const char *src, const char *end)
33{
34 int olen = 0;
35 int line = 0;
36
37 while (src < end) {
38 unsigned char a, b, c;
39
40 a = *src++;
41 *dst++ = encode_bits(a >> 2);
42 if (src < end) {
43 b = *src++;
44 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
45 if (src < end) {
46 c = *src++;
47 *dst++ = encode_bits(((b & 15) << 2) |
48 (c >> 6));
49 *dst++ = encode_bits(c & 63);
50 } else {
51 *dst++ = encode_bits((b & 15) << 2);
52 *dst++ = '=';
53 }
54 } else {
55 *dst++ = encode_bits(((a & 3) << 4));
56 *dst++ = '=';
57 *dst++ = '=';
58 }
59 olen += 4;
60 line += 4;
61 if (line == 64) {
62 line = 0;
63 *(dst++) = '\n';
64 olen++;
65 }
66 }
67 return olen;
68}
69
70int ceph_unarmor(char *dst, const char *src, const char *end)
71{
72 int olen = 0;
73
74 while (src < end) {
75 int a, b, c, d;
76
77 if (src < end && src[0] == '\n')
78 src++;
79 if (src + 4 > end)
80 return -EINVAL;
81 a = decode_bits(src[0]);
82 b = decode_bits(src[1]);
83 c = decode_bits(src[2]);
84 d = decode_bits(src[3]);
85 if (a < 0 || b < 0 || c < 0 || d < 0)
86 return -EINVAL;
87
88 *dst++ = (a << 2) | (b >> 4);
89 if (src[2] == '=')
90 return olen + 1;
91 *dst++ = ((b & 15) << 4) | (c >> 2);
92 if (src[3] == '=')
93 return olen + 2;
94 *dst++ = ((c & 3) << 6) | d;
95 olen += 3;
96 src += 4;
97 }
98 return olen;
99}
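/*
 * Example round trip through the two helpers above, as a sketch with a
 * hypothetical caller (not part of this file).  ceph_armor() returns
 * the encoded length and inserts a '\n' after every 64 output bytes.
 */
#if 0
static void armor_example(void)
{
	const char src[] = "Cat";	/* 0x43 0x61 0x74 */
	char enc[8], dec[4];
	int elen, dlen;

	elen = ceph_armor(enc, src, src + 3);
	/* elen == 4; enc now holds "Q2F0" */
	dlen = ceph_unarmor(dec, enc, enc + elen);
	/* dlen == 3; dec now holds "Cat" again */
}
#endif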
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..f6394b94b866
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,258 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h>
6
7#include "types.h"
8#include "auth_none.h"
9#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12
13#include "messenger.h"
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
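/*
 * The resulting wire format, e.g. for the name "admin" (illustrative):
 *
 *   le32 CEPH_ENTITY_TYPE_CLIENT | le32 5 | 'a' 'd' 'm' 'i' 'n'
 *
 * i.e. exactly the 2*sizeof(u32) + strlen(name) bytes checked against
 * the buffer end before encoding.
 */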
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
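/*
 * Sketch of the hello payload built above, following the monitor
 * request header (layout reconstructed from the encoding calls):
 *
 *   le32  0                  protocol: none chosen yet
 *   le32  len                covers everything below
 *   u8    1                  struct version
 *   le32  num                number of supported protocols
 *   le32  proto[num]         CEPH_AUTH_NONE, CEPH_AUTH_CEPHX
 *   ...                      entity name (see ceph_entity_name_encode)
 *   le64  global_id          0 until the monitor assigns one
 */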
135
136int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building request\n", ret);
153 return ret;
154 }
155 dout(" built request %d bytes\n", ret);
156 ceph_encode_32(&p, ret);
157 return p + ret - msg_buf;
158}
159
160/*
161 * Handle auth message from monitor.
162 */
163int ceph_handle_auth_reply(struct ceph_auth_client *ac,
164 void *buf, size_t len,
165 void *reply_buf, size_t reply_len)
166{
167 void *p = buf;
168 void *end = buf + len;
169 int protocol;
170 s32 result;
171 u64 global_id;
172 void *payload, *payload_end;
173 int payload_len;
174 char *result_msg;
175 int result_msg_len;
176 int ret = -EINVAL;
177
178 dout("handle_auth_reply %p %p\n", p, end);
179 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
180 protocol = ceph_decode_32(&p);
181 result = ceph_decode_32(&p);
182 global_id = ceph_decode_64(&p);
183 payload_len = ceph_decode_32(&p);
184 payload = p;
185 p += payload_len;
186 ceph_decode_need(&p, end, sizeof(u32), bad);
187 result_msg_len = ceph_decode_32(&p);
188 result_msg = p;
189 p += result_msg_len;
190 if (p != end)
191 goto bad;
192
193 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
194 result_msg, global_id, payload_len);
195
196 payload_end = payload + payload_len;
197
198 if (global_id && ac->global_id != global_id) {
199 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
200 ac->global_id = global_id;
201 }
202
203 if (ac->negotiating) {
204 /* server does not support our protocols? */
205 if (!protocol && result < 0) {
206 ret = result;
207 goto out;
208 }
209 /* set up (new) protocol handler? */
210 if (ac->protocol && ac->protocol != protocol) {
211 ac->ops->destroy(ac);
212 ac->protocol = 0;
213 ac->ops = NULL;
214 }
215 if (ac->protocol != protocol) {
216 ret = ceph_auth_init_protocol(ac, protocol);
217 if (ret) {
218 pr_err("error %d on auth protocol %d init\n",
219 ret, protocol);
220 goto out;
221 }
222 }
223
224 ac->negotiating = false;
225 }
226
227 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
228 if (ret == -EAGAIN) {
229 return ceph_build_auth_request(ac, reply_buf, reply_len);
230 } else if (ret) {
231 pr_err("authentication error %d\n", ret);
232 return ret;
233 }
234 return 0;
235
236bad:
237 pr_err("failed to decode auth msg\n");
238out:
239 return ret;
240}
241
242int ceph_build_auth(struct ceph_auth_client *ac,
243 void *msg_buf, size_t msg_len)
244{
245 if (!ac->protocol)
246 return ceph_auth_build_hello(ac, msg_buf, msg_len);
247 BUG_ON(!ac->ops);
248 if (!ac->ops->is_authenticated(ac))
249 return ceph_build_auth_request(ac, msg_buf, msg_len);
250 return 0;
251}
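/*
 * Putting the pieces together, a caller drives the monitor handshake
 * roughly like this (pseudo-flow; send/recv stand in for the
 * messenger layer):
 *
 *   len = ceph_build_auth(ac, buf, sizeof(buf));  // hello first time
 *   send(buf, len);
 *   recv(reply, &rlen);
 *   len = ceph_handle_auth_reply(ac, reply, rlen, buf, sizeof(buf));
 *   if (len > 0)
 *           send(buf, len);   // protocol-specific request; repeat
 *   ...until ceph_auth_is_authenticated(ac) returns true.
 */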
252
253int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
254{
255 if (!ac->ops)
256 return 0;
257 return ac->ops->is_authenticated(ac);
258}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 /*
19 * true if we are authenticated and can connect to
20 * services.
21 */
22 int (*is_authenticated)(struct ceph_auth_client *ac);
23
24 /*
25 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request.
28 */
29 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
30 int (*handle_reply)(struct ceph_auth_client *ac, int result,
31 void *buf, void *end);
32
33 /*
34 * Create authorizer for connecting to a service, and verify
35 * the response to authenticate the service.
36 */
37 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
38 struct ceph_authorizer **a,
39 void **buf, size_t *len,
40 void **reply_buf, size_t *reply_len);
41 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
42 struct ceph_authorizer *a, size_t len);
43 void (*destroy_authorizer)(struct ceph_auth_client *ac,
44 struct ceph_authorizer *a);
45 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
46 int peer_type);
47
48 /* reset when we (re)connect to a monitor */
49 void (*reset)(struct ceph_auth_client *ac);
50
51 void (*destroy)(struct ceph_auth_client *ac);
52};
53
54struct ceph_auth_client {
55 u32 protocol; /* CEPH_AUTH_* */
56 void *private; /* for use by protocol implementation */
57 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
58
59 bool negotiating; /* true if negotiating protocol */
60 const char *name; /* entity name */
61 u64 global_id; /* our unique id in system */
62 const char *secret; /* our secret key */
63 unsigned want_keys; /* which services we want */
64};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..8cd9e3af07f7
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,122 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34/*
35 * the generic auth code decodes the global_id, and we carry no actual
36 * authentication state, so nothing happens here.
37 */
38static int handle_reply(struct ceph_auth_client *ac, int result,
39 void *buf, void *end)
40{
41 struct ceph_auth_none_info *xi = ac->private;
42
43 xi->starting = false;
44 return result;
45}
46
47/*
48 * build an 'authorizer' with our entity_name and global_id. we can
49 * reuse a single static copy since it is identical for all services
50 * we connect to.
51 */
52static int ceph_auth_none_create_authorizer(
53 struct ceph_auth_client *ac, int peer_type,
54 struct ceph_authorizer **a,
55 void **buf, size_t *len,
56 void **reply_buf, size_t *reply_len)
57{
58 struct ceph_auth_none_info *ai = ac->private;
59 struct ceph_none_authorizer *au = &ai->au;
60 void *p, *end;
61 int ret;
62
63 if (!ai->built_authorizer) {
64 p = au->buf;
65 end = p + sizeof(au->buf);
66 ceph_encode_8(&p, 1);
67 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
68 if (ret < 0)
69 goto bad;
70 ceph_decode_need(&p, end, sizeof(u64), bad2);
71 ceph_encode_64(&p, ac->global_id);
72 au->buf_len = p - (void *)au->buf;
73 ai->built_authorizer = true;
74 dout("built authorizer len %d\n", au->buf_len);
75 }
76
77 *a = (struct ceph_authorizer *)au;
78 *buf = au->buf;
79 *len = au->buf_len;
80 *reply_buf = au->reply_buf;
81 *reply_len = sizeof(au->reply_buf);
82 return 0;
83
84bad2:
85 ret = -ERANGE;
86bad:
87 return ret;
88}
89
90static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
91 struct ceph_authorizer *a)
92{
93 /* nothing to do */
94}
95
96static const struct ceph_auth_client_ops ceph_auth_none_ops = {
97 .reset = reset,
98 .destroy = destroy,
99 .is_authenticated = is_authenticated,
100 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
103};
104
105int ceph_auth_none_init(struct ceph_auth_client *ac)
106{
107 struct ceph_auth_none_info *xi;
108
109 dout("ceph_auth_none_init %p\n", ac);
110 xi = kzalloc(sizeof(*xi), GFP_NOFS);
111 if (!xi)
112 return -ENOMEM;
113
114 xi->starting = true;
115 xi->built_authorizer = false;
116
117 ac->protocol = CEPH_AUTH_NONE;
118 ac->private = xi;
119 ac->ops = &ceph_auth_none_ops;
120 return 0;
121}
122
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include "auth.h"
5
6/*
7 * null security mode.
8 *
9 * we use a single static authorizer that simply encodes our entity name
10 * and global id.
11 */
12
13struct ceph_none_authorizer {
14 char buf[128];
15 int buf_len;
16 char reply_buf[0];
17};
18
19struct ceph_auth_none_info {
20 bool starting;
21 bool built_authorizer;
22 struct ceph_none_authorizer au; /* we only need one; it's static */
23};
24
25extern int ceph_auth_none_init(struct ceph_auth_client *ac);
26
27#endif
28
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..d9001a4dc8cc
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,680 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15static struct kmem_cache *ceph_x_ticketbuf_cachep;
16
17#define TEMP_TICKET_BUF_LEN 256
18
19static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
20
21static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
22{
23 struct ceph_x_info *xi = ac->private;
24 int need;
25
26 ceph_x_validate_tickets(ac, &need);
27 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
28 ac->want_keys, need, xi->have_keys);
29 return (ac->want_keys & xi->have_keys) == ac->want_keys;
30}
31
32static int ceph_x_encrypt_buflen(int ilen)
33{
34 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
35 sizeof(u32);
36}
37
38static int ceph_x_encrypt(struct ceph_crypto_key *secret,
39 void *ibuf, int ilen, void *obuf, size_t olen)
40{
41 struct ceph_x_encrypt_header head = {
42 .struct_v = 1,
43 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
44 };
45 size_t len = olen - sizeof(u32);
46 int ret;
47
48 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
49 &head, sizeof(head), ibuf, ilen);
50 if (ret)
51 return ret;
52 ceph_encode_32(&obuf, len);
53 return len + sizeof(u32);
54}
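/*
 * ceph_x_encrypt_buflen() above is a conservative bound on what
 * ceph_x_encrypt() produces: the u32 length prefix, the encrypt
 * header, the payload, and up to one cipher block of padding.
 * Assuming the 16-byte AES block used by the cephx crypto code, the
 * 16-byte struct ceph_x_challenge_blob encrypted in
 * ceph_x_build_request() below comes out at
 *
 *   4 + round_up(9 + 16, 16) = 36 bytes
 *
 * comfortably inside the caller's 40-byte tmp_enc buffer (the generic
 * bound, 9 + 16 + 16 + 4 = 45, is looser than this payload needs).
 */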
55
56static int ceph_x_decrypt(struct ceph_crypto_key *secret,
57 void **p, void *end, void *obuf, size_t olen)
58{
59 struct ceph_x_encrypt_header head;
60 size_t head_len = sizeof(head);
61 int len, ret;
62
63 len = ceph_decode_32(p);
64 if (*p + len > end)
65 return -EINVAL;
66
67 dout("ceph_x_decrypt len %d\n", len);
68 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
69 *p, len);
70 if (ret)
71 return ret;
72 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
73 return -EPERM;
74 *p += len;
75 return olen;
76}
77
78/*
79 * get existing (or insert new) ticket handler
80 */
81static struct ceph_x_ticket_handler *get_ticket_handler(
82 struct ceph_auth_client *ac, int service)
83{
84 struct ceph_x_ticket_handler *th;
85 struct ceph_x_info *xi = ac->private;
86 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
87
88 while (*p) {
89 parent = *p;
90 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
91 if (service < th->service)
92 p = &(*p)->rb_left;
93 else if (service > th->service)
94 p = &(*p)->rb_right;
95 else
96 return th;
97 }
98
99 /* add it */
100 th = kzalloc(sizeof(*th), GFP_NOFS);
101 if (!th)
102 return ERR_PTR(-ENOMEM);
103 th->service = service;
104 rb_link_node(&th->node, parent, p);
105 rb_insert_color(&th->node, &xi->ticket_handlers);
106 return th;
107}
108
109static void remove_ticket_handler(struct ceph_auth_client *ac,
110 struct ceph_x_ticket_handler *th)
111{
112 struct ceph_x_info *xi = ac->private;
113
114 dout("remove_ticket_handler %p %d\n", th, th->service);
115 rb_erase(&th->node, &xi->ticket_handlers);
116 ceph_crypto_key_destroy(&th->session_key);
117 if (th->ticket_blob)
118 ceph_buffer_put(th->ticket_blob);
119 kfree(th);
120}
121
122static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
123 struct ceph_crypto_key *secret,
124 void *buf, void *end)
125{
126 struct ceph_x_info *xi = ac->private;
127 int num;
128 void *p = buf;
129 int ret;
130 char *dbuf;
131 char *ticket_buf;
132 u8 struct_v;
133
134 dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS);
135 if (!dbuf)
136 return -ENOMEM;
137
138 ret = -ENOMEM;
139 ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
140 GFP_NOFS);
141 if (!ticket_buf)
142 goto out_dbuf;
143
144 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
145 struct_v = ceph_decode_8(&p);
146 if (struct_v != 1)
147 goto bad;
148 num = ceph_decode_32(&p);
149 dout("%d tickets\n", num);
150 while (num--) {
151 int type;
152 u8 struct_v;
153 struct ceph_x_ticket_handler *th;
154 void *dp, *dend;
155 int dlen;
156 char is_enc;
157 struct timespec validity;
158 struct ceph_crypto_key old_key;
159 void *tp, *tpend;
160 struct ceph_timespec new_validity;
161 struct ceph_crypto_key new_session_key;
162 struct ceph_buffer *new_ticket_blob;
163 unsigned long new_expires, new_renew_after;
164 u64 new_secret_id;
165
166 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
167
168 type = ceph_decode_32(&p);
169 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
170
171 struct_v = ceph_decode_8(&p);
172 if (struct_v != 1)
173 goto bad;
174
175 th = get_ticket_handler(ac, type);
176 if (IS_ERR(th)) {
177 ret = PTR_ERR(th);
178 goto out;
179 }
180
181 /* blob for me */
182 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
183 TEMP_TICKET_BUF_LEN);
184 if (dlen <= 0) {
185 ret = dlen;
186 goto out;
187 }
188 dout(" decrypted %d bytes\n", dlen);
189 dend = dbuf + dlen;
190 dp = dbuf;
191
192 struct_v = ceph_decode_8(&dp);
193 if (struct_v != 1)
194 goto bad;
195
196 memcpy(&old_key, &th->session_key, sizeof(old_key));
197 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
198 if (ret)
199 goto out;
200
201 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
202 ceph_decode_timespec(&validity, &new_validity);
203 new_expires = get_seconds() + validity.tv_sec;
204 new_renew_after = new_expires - (validity.tv_sec / 4);
205 dout(" expires=%lu renew_after=%lu\n", new_expires,
206 new_renew_after);
207
208 /* ticket blob for service */
209 ceph_decode_8_safe(&p, end, is_enc, bad);
210 tp = ticket_buf;
211 if (is_enc) {
212 /* encrypted */
213 dout(" encrypted ticket\n");
214 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
215 TEMP_TICKET_BUF_LEN);
216 if (dlen < 0) {
217 ret = dlen;
218 goto out;
219 }
220 dlen = ceph_decode_32(&tp);
221 } else {
222 /* unencrypted */
223 ceph_decode_32_safe(&p, end, dlen, bad);
224 ceph_decode_need(&p, end, dlen, bad);
225 ceph_decode_copy(&p, ticket_buf, dlen);
226 }
227 tpend = tp + dlen;
228 dout(" ticket blob is %d bytes\n", dlen);
229 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
230 struct_v = ceph_decode_8(&tp);
231 new_secret_id = ceph_decode_64(&tp);
232 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
233 if (ret)
234 goto out;
235
236 /* all is well, update our ticket */
237 ceph_crypto_key_destroy(&th->session_key);
238 if (th->ticket_blob)
239 ceph_buffer_put(th->ticket_blob);
240 th->session_key = new_session_key;
241 th->ticket_blob = new_ticket_blob;
242 th->validity = new_validity;
243 th->secret_id = new_secret_id;
244 th->expires = new_expires;
245 th->renew_after = new_renew_after;
246 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
247 type, ceph_entity_type_name(type), th->secret_id,
248 (int)th->ticket_blob->vec.iov_len);
249 xi->have_keys |= th->service;
250 }
251
252 ret = 0;
253out:
254 kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
255out_dbuf:
256 kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
257 return ret;
258
259bad:
260 ret = -EINVAL;
261 goto out;
262}
263
264static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
265 struct ceph_x_ticket_handler *th,
266 struct ceph_x_authorizer *au)
267{
268 int maxlen;
269 struct ceph_x_authorize_a *msg_a;
270 struct ceph_x_authorize_b msg_b;
271 void *p, *end;
272 int ret;
273 int ticket_blob_len =
274 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
275
276 dout("build_authorizer for %s %p\n",
277 ceph_entity_type_name(th->service), au);
278
279 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
280 ceph_x_encrypt_buflen(ticket_blob_len);
281 dout(" need len %d\n", maxlen);
282 if (au->buf && au->buf->alloc_len < maxlen) {
283 ceph_buffer_put(au->buf);
284 au->buf = NULL;
285 }
286 if (!au->buf) {
287 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
288 if (!au->buf)
289 return -ENOMEM;
290 }
291 au->service = th->service;
292
293 msg_a = au->buf->vec.iov_base;
294 msg_a->struct_v = 1;
295 msg_a->global_id = cpu_to_le64(ac->global_id);
296 msg_a->service_id = cpu_to_le32(th->service);
297 msg_a->ticket_blob.struct_v = 1;
298 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
299 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
300 if (ticket_blob_len) {
301 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
302 th->ticket_blob->vec.iov_len);
303 }
304 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
305 le64_to_cpu(msg_a->ticket_blob.secret_id));
306
307 p = msg_a + 1;
308 p += ticket_blob_len;
309 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
310
311 get_random_bytes(&au->nonce, sizeof(au->nonce));
312 msg_b.struct_v = 1;
313 msg_b.nonce = cpu_to_le64(au->nonce);
314 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
315 p, end - p);
316 if (ret < 0)
317 goto out_buf;
318 p += ret;
319 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
320 dout(" built authorizer nonce %llx len %d\n", au->nonce,
321 (int)au->buf->vec.iov_len);
322 BUG_ON(au->buf->vec.iov_len > maxlen);
323 return 0;
324
325out_buf:
326 ceph_buffer_put(au->buf);
327 au->buf = NULL;
328 return ret;
329}
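/*
 * The finished authorizer buffer is therefore, in order (see
 * auth_x_protocol.h for the struct layouts):
 *
 *   struct ceph_x_authorize_a   global_id, service_id, ticket blob hdr
 *   <ticket blob bytes>         opaque to us; decrypted by the service
 *   <encrypted authorize_b>     random nonce under the session key
 *
 * The service proves it holds the session key by replying with
 * nonce + 1, which ceph_x_verify_authorizer_reply() checks below.
 */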
330
331static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
332 void **p, void *end)
333{
334 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
335 ceph_encode_8(p, 1);
336 ceph_encode_64(p, th->secret_id);
337 if (th->ticket_blob) {
338 const char *buf = th->ticket_blob->vec.iov_base;
339 u32 len = th->ticket_blob->vec.iov_len;
340
341 ceph_encode_32_safe(p, end, len, bad);
342 ceph_encode_copy_safe(p, end, buf, len, bad);
343 } else {
344 ceph_encode_32_safe(p, end, 0, bad);
345 }
346
347 return 0;
348bad:
349 return -ERANGE;
350}
351
352static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
353{
354 int want = ac->want_keys;
355 struct ceph_x_info *xi = ac->private;
356 int service;
357
358 *pneed = ac->want_keys & ~(xi->have_keys);
359
360 for (service = 1; service <= want; service <<= 1) {
361 struct ceph_x_ticket_handler *th;
362
363 if (!(ac->want_keys & service))
364 continue;
365
366 if (*pneed & service)
367 continue;
368
369 th = get_ticket_handler(ac, service);
370
371 if (!th) {
372 *pneed |= service;
373 continue;
374 }
375
376 if (get_seconds() >= th->renew_after)
377 *pneed |= service;
378 if (get_seconds() >= th->expires)
379 xi->have_keys &= ~service;
380 }
381}
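/*
 * want_keys/have_keys are bitmasks of CEPH_ENTITY_TYPE_* service bits,
 * which is why the loop above steps through one power of two at a
 * time.  Illustrative example (bit assignments per ceph_fs.h): with
 * want = MON|MDS|OSD and only the MDS ticket past renew_after, *pneed
 * comes back with just the MDS bit set, so the next
 * CEPHX_GET_PRINCIPAL_SESSION_KEY request refreshes only that ticket.
 */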
382
383
384static int ceph_x_build_request(struct ceph_auth_client *ac,
385 void *buf, void *end)
386{
387 struct ceph_x_info *xi = ac->private;
388 int need;
389 struct ceph_x_request_header *head = buf;
390 int ret;
391 struct ceph_x_ticket_handler *th =
392 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
393
394 ceph_x_validate_tickets(ac, &need);
395
396 dout("build_request want %x have %x need %x\n",
397 ac->want_keys, xi->have_keys, need);
398
399 if (need & CEPH_ENTITY_TYPE_AUTH) {
400 struct ceph_x_authenticate *auth = (void *)(head + 1);
401 void *p = auth + 1;
402 struct ceph_x_challenge_blob tmp;
403 char tmp_enc[40];
404 u64 *u;
405
406 if (p > end)
407 return -ERANGE;
408
409 dout(" get_auth_session_key\n");
410 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
411
412 /* encrypt and hash */
413 get_random_bytes(&auth->client_challenge, sizeof(u64));
414 tmp.client_challenge = auth->client_challenge;
415 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
416 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
417 tmp_enc, sizeof(tmp_enc));
418 if (ret < 0)
419 return ret;
420
421 auth->struct_v = 1;
422 auth->key = 0;
423 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
424 auth->key ^= *u;
425 dout(" server_challenge %llx client_challenge %llx key %llx\n",
426 xi->server_challenge, le64_to_cpu(auth->client_challenge),
427 le64_to_cpu(auth->key));
428
429 /* now encode the old ticket, if it exists */
430 ret = ceph_x_encode_ticket(th, &p, end);
431 if (ret < 0)
432 return ret;
433
434 return p - buf;
435 }
436
437 if (need) {
438 void *p = head + 1;
439 struct ceph_x_service_ticket_request *req;
440
441 if (p > end)
442 return -ERANGE;
443 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
444
445 BUG_ON(!th);
446 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
447 if (ret)
448 return ret;
449 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
450 xi->auth_authorizer.buf->vec.iov_len);
451
452 req = p;
453 req->keys = cpu_to_le32(need);
454 p += sizeof(*req);
455 return p - buf;
456 }
457
458 return 0;
459}
460
461static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
462 void *buf, void *end)
463{
464 struct ceph_x_info *xi = ac->private;
465 struct ceph_x_reply_header *head = buf;
466 struct ceph_x_ticket_handler *th;
467 int len = end - buf;
468 int op;
469 int ret;
470
471 if (result)
472 return result; /* XXX hmm? */
473
474 if (xi->starting) {
475 /* it's a hello */
476 struct ceph_x_server_challenge *sc = buf;
477
478 if (len != sizeof(*sc))
479 return -EINVAL;
480 xi->server_challenge = le64_to_cpu(sc->server_challenge);
481 dout("handle_reply got server challenge %llx\n",
482 xi->server_challenge);
483 xi->starting = false;
484 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
485 return -EAGAIN;
486 }
487
488 op = le32_to_cpu(head->op);
489 result = le32_to_cpu(head->result);
490 dout("handle_reply op %d result %d\n", op, result);
491 switch (op) {
492 case CEPHX_GET_AUTH_SESSION_KEY:
493 /* verify auth key */
494 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
495 buf + sizeof(*head), end);
496 break;
497
498 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
499 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
500 BUG_ON(!th);
501 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
502 buf + sizeof(*head), end);
503 break;
504
505 default:
506 return -EINVAL;
507 }
508 if (ret)
509 return ret;
510 if (ac->want_keys == xi->have_keys)
511 return 0;
512 return -EAGAIN;
513}
514
515static int ceph_x_create_authorizer(
516 struct ceph_auth_client *ac, int peer_type,
517 struct ceph_authorizer **a,
518 void **buf, size_t *len,
519 void **reply_buf, size_t *reply_len)
520{
521 struct ceph_x_authorizer *au;
522 struct ceph_x_ticket_handler *th;
523 int ret;
524
525 th = get_ticket_handler(ac, peer_type);
526 if (IS_ERR(th))
527 return PTR_ERR(th);
528
529 au = kzalloc(sizeof(*au), GFP_NOFS);
530 if (!au)
531 return -ENOMEM;
532
533 ret = ceph_x_build_authorizer(ac, th, au);
534 if (ret) {
535 kfree(au);
536 return ret;
537 }
538
539 *a = (struct ceph_authorizer *)au;
540 *buf = au->buf->vec.iov_base;
541 *len = au->buf->vec.iov_len;
542 *reply_buf = au->reply_buf;
543 *reply_len = sizeof(au->reply_buf);
544 return 0;
545}
546
547static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
548 struct ceph_authorizer *a, size_t len)
549{
550 struct ceph_x_authorizer *au = (void *)a;
551 struct ceph_x_ticket_handler *th;
552 int ret = 0;
553 struct ceph_x_authorize_reply reply;
554 void *p = au->reply_buf;
555 void *end = p + sizeof(au->reply_buf);
556
557 th = get_ticket_handler(ac, au->service);
558 if (!th)
559 return -EIO; /* hrm! */
560 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
561 if (ret < 0)
562 return ret;
563 if (ret != sizeof(reply))
564 return -EPERM;
565
566 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
567 ret = -EPERM;
568 else
569 ret = 0;
570 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
571 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
572 return ret;
573}
574
575static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
576 struct ceph_authorizer *a)
577{
578 struct ceph_x_authorizer *au = (void *)a;
579
580 ceph_buffer_put(au->buf);
581 kfree(au);
582}
583
584
585static void ceph_x_reset(struct ceph_auth_client *ac)
586{
587 struct ceph_x_info *xi = ac->private;
588
589 dout("reset\n");
590 xi->starting = true;
591 xi->server_challenge = 0;
592}
593
594static void ceph_x_destroy(struct ceph_auth_client *ac)
595{
596 struct ceph_x_info *xi = ac->private;
597 struct rb_node *p;
598
599 dout("ceph_x_destroy %p\n", ac);
600 ceph_crypto_key_destroy(&xi->secret);
601
602 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
603 struct ceph_x_ticket_handler *th =
604 rb_entry(p, struct ceph_x_ticket_handler, node);
605 remove_ticket_handler(ac, th);
606 }
607
608 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
609
610 kfree(ac->private);
611 ac->private = NULL;
612}
613
614static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
615 int peer_type)
616{
617 struct ceph_x_ticket_handler *th;
618
619 th = get_ticket_handler(ac, peer_type);
620 if (th && !IS_ERR(th))
621 remove_ticket_handler(ac, th);
622}
623
624
625static const struct ceph_auth_client_ops ceph_x_ops = {
626 .is_authenticated = ceph_x_is_authenticated,
627 .build_request = ceph_x_build_request,
628 .handle_reply = ceph_x_handle_reply,
629 .create_authorizer = ceph_x_create_authorizer,
630 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
631 .destroy_authorizer = ceph_x_destroy_authorizer,
632 .invalidate_authorizer = ceph_x_invalidate_authorizer,
633 .reset = ceph_x_reset,
634 .destroy = ceph_x_destroy,
635};
636
637
638int ceph_x_init(struct ceph_auth_client *ac)
639{
640 struct ceph_x_info *xi;
641 int ret;
642
643 dout("ceph_x_init %p\n", ac);
644 xi = kzalloc(sizeof(*xi), GFP_NOFS);
645 if (!xi)
646 return -ENOMEM;
647
648 ret = -ENOMEM;
649 ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
650 TEMP_TICKET_BUF_LEN, 8,
651 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
652 NULL);
653 if (!ceph_x_ticketbuf_cachep)
654 goto done_nomem;
655 ret = -EINVAL;
656 if (!ac->secret) {
657 pr_err("no secret set (for auth_x protocol)\n");
658 goto done_nomem;
659 }
660
661 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
662 if (ret)
663 goto done_nomem;
664
665 xi->starting = true;
666 xi->ticket_handlers = RB_ROOT;
667
668 ac->protocol = CEPH_AUTH_CEPHX;
669 ac->private = xi;
670 ac->ops = &ceph_x_ops;
671 return 0;
672
673done_nomem:
674 kfree(xi);
675 if (ceph_x_ticketbuf_cachep)
676 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
677 return ret;
678}
679
680
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..c67535d70aa6
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,81 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{
68 size_t len;
69
70 ceph_decode_need(p, end, sizeof(u32), bad);
71 len = ceph_decode_32(p);
72 dout("decode_buffer len %d\n", (int)len);
73 ceph_decode_need(p, end, len, bad);
74 *b = ceph_buffer_new(len, GFP_NOFS);
75 if (!*b)
76 return -ENOMEM;
77 ceph_decode_copy(p, (*b)->vec.iov_base, len);
78 return 0;
79bad:
80 return -EINVAL;
81}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
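/*
 * Illustrative sketch (not part of this patch): typical lifecycle of the
 * refcounted buffer above.  buffer_demo() is a hypothetical caller; the
 * final ceph_buffer_put() drops the last kref and frees the memory via
 * ceph_buffer_release().
 */
static int buffer_demo(const void *data, size_t len)
{
	struct ceph_buffer *b, *extra;

	b = ceph_buffer_new(len, GFP_NOFS);	/* kref starts at 1 */
	if (!b)
		return -ENOMEM;
	memcpy(b->vec.iov_base, data, len);

	extra = ceph_buffer_get(b);	/* e.g. handed to an outgoing msg */
	ceph_buffer_put(b);		/* original owner is done */
	ceph_buffer_put(extra);		/* final put frees the buffer */
	return 0;
}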
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..3710e077a857
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2933 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/slab.h>
7#include <linux/vmalloc.h>
8#include <linux/wait.h>
9#include <linux/writeback.h>
10
11#include "super.h"
12#include "decode.h"
13#include "messenger.h"
14
15/*
16 * Capability management
17 *
18 * The Ceph metadata servers control client access to inode metadata
19 * and file data by issuing capabilities, granting clients permission
20 * to read and/or write both inode field and file data to OSDs
21 * (storage nodes). Each capability consists of a set of bits
22 * indicating which operations are allowed.
23 *
24 * If the client holds a *_SHARED cap, the client has a coherent value
25 * that can be safely read from the cached inode.
26 *
27 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
28 * client is allowed to change inode attributes (e.g., file size,
29 * mtime), note its dirty state in the ceph_cap, and asynchronously
30 * flush that metadata change to the MDS.
31 *
32 * In the event of a conflicting operation (perhaps by another
33 * client), the MDS will revoke the conflicting client capabilities.
34 *
35 * In order for a client to cache an inode, it must hold a capability
36 * with at least one MDS server. When inodes are released, release
37 * notifications are batched and periodically sent en masse to the MDS
38 * cluster to release server state.
39 */
40
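/*
 * Illustrative sketch (not part of this patch): the *_SHARED rule above
 * in practice -- before trusting a cached inode field, a caller checks
 * that the matching SHARED bit is currently issued, here via
 * __ceph_caps_issued_mask() (defined later in this file).  The helpers
 * read_xattrs_cached() and fetch_xattrs_from_mds() are hypothetical.
 */
static int xattr_demo(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int cached;

	spin_lock(&inode->i_lock);
	cached = __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1);
	spin_unlock(&inode->i_lock);

	if (cached)
		return read_xattrs_cached(ci);		/* hypothetical */
	return fetch_xattrs_from_mds(inode);		/* hypothetical */
}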
41
42/*
43 * Generate readable cap strings for debugging output.
44 */
45#define MAX_CAP_STR 20
46static char cap_str[MAX_CAP_STR][40];
47static DEFINE_SPINLOCK(cap_str_lock);
48static int last_cap_str;
49
50static char *gcap_string(char *s, int c)
51{
52 if (c & CEPH_CAP_GSHARED)
53 *s++ = 's';
54 if (c & CEPH_CAP_GEXCL)
55 *s++ = 'x';
56 if (c & CEPH_CAP_GCACHE)
57 *s++ = 'c';
58 if (c & CEPH_CAP_GRD)
59 *s++ = 'r';
60 if (c & CEPH_CAP_GWR)
61 *s++ = 'w';
62 if (c & CEPH_CAP_GBUFFER)
63 *s++ = 'b';
64 if (c & CEPH_CAP_GLAZYIO)
65 *s++ = 'l';
66 return s;
67}
68
69const char *ceph_cap_string(int caps)
70{
71 int i;
72 char *s;
73 int c;
74
75 spin_lock(&cap_str_lock);
76 i = last_cap_str++;
77 if (last_cap_str == MAX_CAP_STR)
78 last_cap_str = 0;
79 spin_unlock(&cap_str_lock);
80
81 s = cap_str[i];
82
83 if (caps & CEPH_CAP_PIN)
84 *s++ = 'p';
85
86 c = (caps >> CEPH_CAP_SAUTH) & 3;
87 if (c) {
88 *s++ = 'A';
89 s = gcap_string(s, c);
90 }
91
92 c = (caps >> CEPH_CAP_SLINK) & 3;
93 if (c) {
94 *s++ = 'L';
95 s = gcap_string(s, c);
96 }
97
98 c = (caps >> CEPH_CAP_SXATTR) & 3;
99 if (c) {
100 *s++ = 'X';
101 s = gcap_string(s, c);
102 }
103
104 c = caps >> CEPH_CAP_SFILE;
105 if (c) {
106 *s++ = 'F';
107 s = gcap_string(s, c);
108 }
109
110 if (s == cap_str[i])
111 *s++ = '-';
112 *s = 0;
113 return cap_str[i];
114}
115
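/*
 * For example, caps == CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
 * CEPH_CAP_FILE_RD renders as "pFsr": 'p' for the pin, 'F' introducing
 * the FILE group, then 's' and 'r' from gcap_string() for the group's
 * SHARED and RD bits.
 */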
116/*
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_cap objects,
120 * referenced by struct ceph_cap_reservation contexts. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context; see the usage sketch after ceph_reservation_status() below.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{
137 INIT_LIST_HEAD(&caps_list);
138 spin_lock_init(&caps_list_lock);
139}
140
141void ceph_caps_finalize(void)
142{
143 struct ceph_cap *cap;
144
145 spin_lock(&caps_list_lock);
146 while (!list_empty(&caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
148 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap);
150 }
151 caps_total_count = 0;
152 caps_avail_count = 0;
153 caps_use_count = 0;
154 caps_reserve_count = 0;
155 caps_min_count = 0;
156 spin_unlock(&caps_list_lock);
157}
158
159void ceph_adjust_min_caps(int delta)
160{
161 spin_lock(&caps_list_lock);
162 caps_min_count += delta;
163 BUG_ON(caps_min_count < 0);
164 spin_unlock(&caps_list_lock);
165}
166
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
168{
169 int i;
170 struct ceph_cap *cap;
171 int have;
172 int alloc = 0;
173 LIST_HEAD(newcaps);
174 int ret = 0;
175
176 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177
178 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock);
180 if (caps_avail_count >= need)
181 have = need;
182 else
183 have = caps_avail_count;
184 caps_avail_count -= have;
185 caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
187 caps_avail_count);
188 spin_unlock(&caps_list_lock);
189
190 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
192 if (!cap) {
193 ret = -ENOMEM;
194 goto out_alloc_count;
195 }
196 list_add(&cap->caps_item, &newcaps);
197 alloc++;
198 }
199 BUG_ON(have + alloc != need);
200
201 spin_lock(&caps_list_lock);
202 caps_total_count += alloc;
203 caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list);
205
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
207 caps_avail_count);
208 spin_unlock(&caps_list_lock);
209
210 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count,
213 caps_avail_count);
214 return 0;
215
216out_alloc_count:
217 /* we didn't manage to reserve as much as we needed */
218 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
219 ctx, need, have);
220 return ret;
221}
222
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
224{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) {
227 spin_lock(&caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count;
231 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count,
234 caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
236 caps_avail_count);
237 spin_unlock(&caps_list_lock);
238 }
239 return 0;
240}
241
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
243{
244 struct ceph_cap *cap = NULL;
245
246 /* temporary, until we do something about cap import/export */
247 if (!ctx)
248 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249
250 spin_lock(&caps_list_lock);
251 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
252 ctx, ctx->count, caps_total_count, caps_use_count,
253 caps_reserve_count, caps_avail_count);
254 BUG_ON(!ctx->count);
255 BUG_ON(ctx->count > caps_reserve_count);
256 BUG_ON(list_empty(&caps_list));
257
258 ctx->count--;
259 caps_reserve_count--;
260 caps_use_count++;
261
262 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
263 list_del(&cap->caps_item);
264
265 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
266 caps_avail_count);
267 spin_unlock(&caps_list_lock);
268 return cap;
269}
270
271void ceph_put_cap(struct ceph_cap *cap)
272{
273 spin_lock(&caps_list_lock);
274 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
275 cap, caps_total_count, caps_use_count,
276 caps_reserve_count, caps_avail_count);
277 caps_use_count--;
278 /*
279 * Keep some preallocated caps around (ceph_min_count), to
280 * avoid lots of free/alloc churn.
281 */
282 if (caps_avail_count >= caps_reserve_count + caps_min_count) {
283 caps_total_count--;
284 kmem_cache_free(ceph_cap_cachep, cap);
285 } else {
286 caps_avail_count++;
287 list_add(&cap->caps_item, &caps_list);
288 }
289
290 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
291 caps_avail_count);
292 spin_unlock(&caps_list_lock);
293}
294
295void ceph_reservation_status(struct ceph_client *client,
296 int *total, int *avail, int *used, int *reserved,
297 int *min)
298{
299 if (total)
300 *total = caps_total_count;
301 if (avail)
302 *avail = caps_avail_count;
303 if (used)
304 *used = caps_use_count;
305 if (reserved)
306 *reserved = caps_reserve_count;
307 if (min)
308 *min = caps_min_count;
309}
310
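/*
 * Illustrative sketch (not part of this patch): the expected reservation
 * call pattern around an MDS reply.  reservation_demo() is hypothetical;
 * 'need' is however many caps the reply might install, and error
 * handling is trimmed.
 */
static int reservation_demo(int need)
{
	struct ceph_cap_reservation ctx = { 0 };
	struct ceph_cap *cap;

	if (ceph_reserve_caps(&ctx, need) < 0)
		return -ENOMEM;		/* nothing was reserved */

	cap = get_cap(&ctx);	/* cannot fail: memory was preallocated */
	/* ... install cap on an inode; ceph_put_cap(cap) returns it ... */

	ceph_unreserve_caps(&ctx);	/* hand back unused reservations */
	return 0;
}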
311/*
312 * Find ceph_cap for given mds, if any.
313 *
314 * Called with i_lock held.
315 */
316static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
317{
318 struct ceph_cap *cap;
319 struct rb_node *n = ci->i_caps.rb_node;
320
321 while (n) {
322 cap = rb_entry(n, struct ceph_cap, ci_node);
323 if (mds < cap->mds)
324 n = n->rb_left;
325 else if (mds > cap->mds)
326 n = n->rb_right;
327 else
328 return cap;
329 }
330 return NULL;
331}
332
333/*
334 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
335 * -1.
336 */
337static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
338{
339 struct ceph_cap *cap;
340 int mds = -1;
341 struct rb_node *p;
342
343 /* prefer mds with WR|WRBUFFER|EXCL caps */
344 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
345 cap = rb_entry(p, struct ceph_cap, ci_node);
346 mds = cap->mds;
347 if (mseq)
348 *mseq = cap->mseq;
349 if (cap->issued & (CEPH_CAP_FILE_WR |
350 CEPH_CAP_FILE_BUFFER |
351 CEPH_CAP_FILE_EXCL))
352 break;
353 }
354 return mds;
355}
356
357int ceph_get_cap_mds(struct inode *inode)
358{
359 int mds;
360 spin_lock(&inode->i_lock);
361 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
362 spin_unlock(&inode->i_lock);
363 return mds;
364}
365
366/*
367 * Called under i_lock.
368 */
369static void __insert_cap_node(struct ceph_inode_info *ci,
370 struct ceph_cap *new)
371{
372 struct rb_node **p = &ci->i_caps.rb_node;
373 struct rb_node *parent = NULL;
374 struct ceph_cap *cap = NULL;
375
376 while (*p) {
377 parent = *p;
378 cap = rb_entry(parent, struct ceph_cap, ci_node);
379 if (new->mds < cap->mds)
380 p = &(*p)->rb_left;
381 else if (new->mds > cap->mds)
382 p = &(*p)->rb_right;
383 else
384 BUG();
385 }
386
387 rb_link_node(&new->ci_node, parent, p);
388 rb_insert_color(&new->ci_node, &ci->i_caps);
389}
390
391/*
392 * (re)set cap hold timeouts, which control the delayed release
393 * of unused caps back to the MDS. Should be called on cap use.
394 */
395static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
396 struct ceph_inode_info *ci)
397{
398 struct ceph_mount_args *ma = mdsc->client->mount_args;
399
400 ci->i_hold_caps_min = round_jiffies(jiffies +
401 ma->caps_wanted_delay_min * HZ);
402 ci->i_hold_caps_max = round_jiffies(jiffies +
403 ma->caps_wanted_delay_max * HZ);
404 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
405 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
406}
407
408/*
409 * (Re)queue cap at the end of the delayed cap release list.
410 *
411 * If I_FLUSH is set, leave the inode at the front of the list.
412 *
413 * Caller holds i_lock
414 * -> we take mdsc->cap_delay_lock
415 */
416static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
417 struct ceph_inode_info *ci)
418{
419 __cap_set_timeouts(mdsc, ci);
420 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
421 ci->i_ceph_flags, ci->i_hold_caps_max);
422 if (!mdsc->stopping) {
423 spin_lock(&mdsc->cap_delay_lock);
424 if (!list_empty(&ci->i_cap_delay_list)) {
425 if (ci->i_ceph_flags & CEPH_I_FLUSH)
426 goto no_change;
427 list_del_init(&ci->i_cap_delay_list);
428 }
429 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
430no_change:
431 spin_unlock(&mdsc->cap_delay_lock);
432 }
433}
434
435/*
436 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
437 * indicating we should send a cap message to flush dirty metadata
438 * asap, and move to the front of the delayed cap list.
439 */
440static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
441 struct ceph_inode_info *ci)
442{
443 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
444 spin_lock(&mdsc->cap_delay_lock);
445 ci->i_ceph_flags |= CEPH_I_FLUSH;
446 if (!list_empty(&ci->i_cap_delay_list))
447 list_del_init(&ci->i_cap_delay_list);
448 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
449 spin_unlock(&mdsc->cap_delay_lock);
450}
451
452/*
453 * Cancel delayed work on cap.
454 *
455 * Caller must hold i_lock.
456 */
457static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
458 struct ceph_inode_info *ci)
459{
460 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
461 if (list_empty(&ci->i_cap_delay_list))
462 return;
463 spin_lock(&mdsc->cap_delay_lock);
464 list_del_init(&ci->i_cap_delay_list);
465 spin_unlock(&mdsc->cap_delay_lock);
466}
467
468/*
469 * Common issue checks for add_cap, handle_cap_grant.
470 */
471static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
472 unsigned issued)
473{
474 unsigned had = __ceph_caps_issued(ci, NULL);
475
476 /*
477 * Each time we receive FILE_CACHE anew, we increment
478 * i_rdcache_gen.
479 */
480 if ((issued & CEPH_CAP_FILE_CACHE) &&
481 (had & CEPH_CAP_FILE_CACHE) == 0)
482 ci->i_rdcache_gen++;
483
484 /*
485 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
486 * don't know what happened to this directory while we didn't
487 * have the cap.
488 */
489 if ((issued & CEPH_CAP_FILE_SHARED) &&
490 (had & CEPH_CAP_FILE_SHARED) == 0) {
491 ci->i_shared_gen++;
492 if (S_ISDIR(ci->vfs_inode.i_mode)) {
493 dout(" marking %p NOT complete\n", &ci->vfs_inode);
494 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
495 }
496 }
497}
498
499/*
500 * Add a capability under the given MDS session.
501 *
502 * Caller should hold session snap_rwsem (read) and s_mutex.
503 *
504 * @fmode is the open file mode, if we are opening a file, otherwise
505 * it is < 0. (This is so we can atomically add the cap and add an
506 * open file reference to it.)
507 */
508int ceph_add_cap(struct inode *inode,
509 struct ceph_mds_session *session, u64 cap_id,
510 int fmode, unsigned issued, unsigned wanted,
511 unsigned seq, unsigned mseq, u64 realmino, int flags,
512 struct ceph_cap_reservation *caps_reservation)
513{
514 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
515 struct ceph_inode_info *ci = ceph_inode(inode);
516 struct ceph_cap *new_cap = NULL;
517 struct ceph_cap *cap;
518 int mds = session->s_mds;
519 int actual_wanted;
520
521 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
522 session->s_mds, cap_id, ceph_cap_string(issued), seq);
523
524 /*
525 * If we are opening the file, include file mode wanted bits
526 * in wanted.
527 */
528 if (fmode >= 0)
529 wanted |= ceph_caps_for_mode(fmode);
530
531retry:
532 spin_lock(&inode->i_lock);
533 cap = __get_cap_for_mds(ci, mds);
534 if (!cap) {
535 if (new_cap) {
536 cap = new_cap;
537 new_cap = NULL;
538 } else {
539 spin_unlock(&inode->i_lock);
540 new_cap = get_cap(caps_reservation);
541 if (new_cap == NULL)
542 return -ENOMEM;
543 goto retry;
544 }
545
546 cap->issued = 0;
547 cap->implemented = 0;
548 cap->mds = mds;
549 cap->mds_wanted = 0;
550
551 cap->ci = ci;
552 __insert_cap_node(ci, cap);
553
554 /* clear out old exporting info? (i.e. on cap import) */
555 if (ci->i_cap_exporting_mds == mds) {
556 ci->i_cap_exporting_issued = 0;
557 ci->i_cap_exporting_mseq = 0;
558 ci->i_cap_exporting_mds = -1;
559 }
560
561 /* add to session cap list */
562 cap->session = session;
563 spin_lock(&session->s_cap_lock);
564 list_add_tail(&cap->session_caps, &session->s_caps);
565 session->s_nr_caps++;
566 spin_unlock(&session->s_cap_lock);
567 }
568
569 if (!ci->i_snap_realm) {
570 /*
571 * add this inode to the appropriate snap realm
572 */
573 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
574 realmino);
575 if (realm) {
576 ceph_get_snap_realm(mdsc, realm);
577 spin_lock(&realm->inodes_with_caps_lock);
578 ci->i_snap_realm = realm;
579 list_add(&ci->i_snap_realm_item,
580 &realm->inodes_with_caps);
581 spin_unlock(&realm->inodes_with_caps_lock);
582 } else {
583 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
584 realmino);
585 }
586 }
587
588 __check_cap_issue(ci, cap, issued);
589
590 /*
591 * If we are issued caps we don't want, or the mds' wanted
592 * value appears to be off, queue a check so we'll release
593 * later and/or update the mds wanted value.
594 */
595 actual_wanted = __ceph_caps_wanted(ci);
596 if ((wanted & ~actual_wanted) ||
597 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
598 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
599 ceph_cap_string(issued), ceph_cap_string(wanted),
600 ceph_cap_string(actual_wanted));
601 __cap_delay_requeue(mdsc, ci);
602 }
603
604 if (flags & CEPH_CAP_FLAG_AUTH)
605 ci->i_auth_cap = cap;
606 else if (ci->i_auth_cap == cap)
607 ci->i_auth_cap = NULL;
608
609 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
610 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
611 ceph_cap_string(issued|cap->issued), seq, mds);
612 cap->cap_id = cap_id;
613 cap->issued = issued;
614 cap->implemented |= issued;
615 cap->mds_wanted |= wanted;
616 cap->seq = seq;
617 cap->issue_seq = seq;
618 cap->mseq = mseq;
619 cap->cap_gen = session->s_cap_gen;
620
621 if (fmode >= 0)
622 __ceph_get_fmode(ci, fmode);
623 spin_unlock(&inode->i_lock);
624 wake_up(&ci->i_cap_wq);
625 return 0;
626}
627
628/*
629 * Return true if cap has not timed out and belongs to the current
630 * generation of the MDS session (i.e. has not gone 'stale' due to
631 * us losing touch with the mds).
632 */
633static int __cap_is_valid(struct ceph_cap *cap)
634{
635 unsigned long ttl;
636 u32 gen;
637
638 spin_lock(&cap->session->s_cap_lock);
639 gen = cap->session->s_cap_gen;
640 ttl = cap->session->s_cap_ttl;
641 spin_unlock(&cap->session->s_cap_lock);
642
643 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
644 dout("__cap_is_valid %p cap %p issued %s "
645 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
646 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
647 return 0;
648 }
649
650 return 1;
651}
652
653/*
654 * Return set of valid cap bits issued to us. Note that caps time
655 * out, and may be invalidated in bulk if the client session times out
656 * and session->s_cap_gen is bumped.
657 */
658int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
659{
660 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
661 struct ceph_cap *cap;
662 struct rb_node *p;
663
664 if (implemented)
665 *implemented = 0;
666 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
667 cap = rb_entry(p, struct ceph_cap, ci_node);
668 if (!__cap_is_valid(cap))
669 continue;
670 dout("__ceph_caps_issued %p cap %p issued %s\n",
671 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
672 have |= cap->issued;
673 if (implemented)
674 *implemented |= cap->implemented;
675 }
676 return have;
677}
678
679/*
680 * Get cap bits issued by caps other than @ocap
681 */
682int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
683{
684 int have = ci->i_snap_caps;
685 struct ceph_cap *cap;
686 struct rb_node *p;
687
688 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
689 cap = rb_entry(p, struct ceph_cap, ci_node);
690 if (cap == ocap)
691 continue;
692 if (!__cap_is_valid(cap))
693 continue;
694 have |= cap->issued;
695 }
696 return have;
697}
698
699/*
700 * Move a cap to the end of the LRU (oldest caps at list head, newest
701 * at list tail).
702 */
703static void __touch_cap(struct ceph_cap *cap)
704{
705 struct ceph_mds_session *s = cap->session;
706
707 spin_lock(&s->s_cap_lock);
708 if (s->s_cap_iterator == NULL) {
709 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
710 s->s_mds);
711 list_move_tail(&cap->session_caps, &s->s_caps);
712 } else {
713 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
714 &cap->ci->vfs_inode, cap, s->s_mds);
715 }
716 spin_unlock(&s->s_cap_lock);
717}
718
719/*
720 * Check if we hold the given mask. If so, move the cap(s) to the
721 * most-recently-used end of their respective LRUs. (This is the preferred way for
722 * callers to check for caps they want.)
723 */
724int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
725{
726 struct ceph_cap *cap;
727 struct rb_node *p;
728 int have = ci->i_snap_caps;
729
730 if ((have & mask) == mask) {
731 dout("__ceph_caps_issued_mask %p snap issued %s"
732 " (mask %s)\n", &ci->vfs_inode,
733 ceph_cap_string(have),
734 ceph_cap_string(mask));
735 return 1;
736 }
737
738 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
739 cap = rb_entry(p, struct ceph_cap, ci_node);
740 if (!__cap_is_valid(cap))
741 continue;
742 if ((cap->issued & mask) == mask) {
743 dout("__ceph_caps_issued_mask %p cap %p issued %s"
744 " (mask %s)\n", &ci->vfs_inode, cap,
745 ceph_cap_string(cap->issued),
746 ceph_cap_string(mask));
747 if (touch)
748 __touch_cap(cap);
749 return 1;
750 }
751
752 /* does a combination of caps satisfy mask? */
753 have |= cap->issued;
754 if ((have & mask) == mask) {
755 dout("__ceph_caps_issued_mask %p combo issued %s"
756 " (mask %s)\n", &ci->vfs_inode,
757 ceph_cap_string(cap->issued),
758 ceph_cap_string(mask));
759 if (touch) {
760 struct rb_node *q;
761
762			/* touch this + preceding caps */
763 __touch_cap(cap);
764 for (q = rb_first(&ci->i_caps); q != p;
765 q = rb_next(q)) {
766 cap = rb_entry(q, struct ceph_cap,
767 ci_node);
768 if (!__cap_is_valid(cap))
769 continue;
770 __touch_cap(cap);
771 }
772 }
773 return 1;
774 }
775 }
776
777 return 0;
778}
779
780/*
781 * Return true if mask caps are currently being revoked by an MDS.
782 */
783int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
784{
785 struct inode *inode = &ci->vfs_inode;
786 struct ceph_cap *cap;
787 struct rb_node *p;
788 int ret = 0;
789
790 spin_lock(&inode->i_lock);
791 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
792 cap = rb_entry(p, struct ceph_cap, ci_node);
793 if (__cap_is_valid(cap) &&
794 (cap->implemented & ~cap->issued & mask)) {
795 ret = 1;
796 break;
797 }
798 }
799 spin_unlock(&inode->i_lock);
800 dout("ceph_caps_revoking %p %s = %d\n", inode,
801 ceph_cap_string(mask), ret);
802 return ret;
803}
804
805int __ceph_caps_used(struct ceph_inode_info *ci)
806{
807 int used = 0;
808 if (ci->i_pin_ref)
809 used |= CEPH_CAP_PIN;
810 if (ci->i_rd_ref)
811 used |= CEPH_CAP_FILE_RD;
812 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
813 used |= CEPH_CAP_FILE_CACHE;
814 if (ci->i_wr_ref)
815 used |= CEPH_CAP_FILE_WR;
816 if (ci->i_wrbuffer_ref)
817 used |= CEPH_CAP_FILE_BUFFER;
818 return used;
819}
820
821/*
822 * Caps wanted by virtue of open file modes.
823 */
824int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
825{
826 int want = 0;
827 int mode;
828 for (mode = 0; mode < 4; mode++)
829 if (ci->i_nr_by_mode[mode])
830 want |= ceph_caps_for_mode(mode);
831 return want;
832}
833
834/*
835 * Return caps we have registered with the MDS(s) as 'wanted'.
836 */
837int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
838{
839 struct ceph_cap *cap;
840 struct rb_node *p;
841 int mds_wanted = 0;
842
843 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
844 cap = rb_entry(p, struct ceph_cap, ci_node);
845 if (!__cap_is_valid(cap))
846 continue;
847 mds_wanted |= cap->mds_wanted;
848 }
849 return mds_wanted;
850}
851
852/*
853 * called under i_lock
854 */
855static int __ceph_is_any_caps(struct ceph_inode_info *ci)
856{
857 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
858}
859
860/*
861 * caller should hold i_lock.
862 * caller will not hold session s_mutex if called from destroy_inode.
863 */
864void __ceph_remove_cap(struct ceph_cap *cap)
865{
866 struct ceph_mds_session *session = cap->session;
867 struct ceph_inode_info *ci = cap->ci;
868 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
869
870 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
871
872 /* remove from inode list */
873 rb_erase(&cap->ci_node, &ci->i_caps);
874 cap->ci = NULL;
875 if (ci->i_auth_cap == cap)
876 ci->i_auth_cap = NULL;
877
878 /* remove from session list */
879 spin_lock(&session->s_cap_lock);
880 if (session->s_cap_iterator == cap) {
881 /* not yet, we are iterating over this very cap */
882 dout("__ceph_remove_cap delaying %p removal from session %p\n",
883 cap, cap->session);
884 } else {
885 list_del_init(&cap->session_caps);
886 session->s_nr_caps--;
887 cap->session = NULL;
888 }
889 spin_unlock(&session->s_cap_lock);
890
891 if (cap->session == NULL)
892 ceph_put_cap(cap);
893
894 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
895 struct ceph_snap_realm *realm = ci->i_snap_realm;
896 spin_lock(&realm->inodes_with_caps_lock);
897 list_del_init(&ci->i_snap_realm_item);
898 ci->i_snap_realm_counter++;
899 ci->i_snap_realm = NULL;
900 spin_unlock(&realm->inodes_with_caps_lock);
901 ceph_put_snap_realm(mdsc, realm);
902 }
903 if (!__ceph_is_any_real_caps(ci))
904 __cap_delay_cancel(mdsc, ci);
905}
906
907/*
908 * Build and send a cap message to the given MDS.
909 *
910 * Caller should be holding s_mutex.
911 */
912static int send_cap_msg(struct ceph_mds_session *session,
913 u64 ino, u64 cid, int op,
914 int caps, int wanted, int dirty,
915 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
916 u64 size, u64 max_size,
917 struct timespec *mtime, struct timespec *atime,
918 u64 time_warp_seq,
919 uid_t uid, gid_t gid, mode_t mode,
920 u64 xattr_version,
921 struct ceph_buffer *xattrs_buf,
922 u64 follows)
923{
924 struct ceph_mds_caps *fc;
925 struct ceph_msg *msg;
926
927 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
928 " seq %u/%u mseq %u follows %lld size %llu/%llu"
929 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
930 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
931 ceph_cap_string(dirty),
932 seq, issue_seq, mseq, follows, size, max_size,
933 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
934
935 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
936 if (IS_ERR(msg))
937 return PTR_ERR(msg);
938
939 msg->hdr.tid = cpu_to_le64(flush_tid);
940
941 fc = msg->front.iov_base;
942 memset(fc, 0, sizeof(*fc));
943
944 fc->cap_id = cpu_to_le64(cid);
945 fc->op = cpu_to_le32(op);
946 fc->seq = cpu_to_le32(seq);
947 fc->issue_seq = cpu_to_le32(issue_seq);
948 fc->migrate_seq = cpu_to_le32(mseq);
949 fc->caps = cpu_to_le32(caps);
950 fc->wanted = cpu_to_le32(wanted);
951 fc->dirty = cpu_to_le32(dirty);
952 fc->ino = cpu_to_le64(ino);
953 fc->snap_follows = cpu_to_le64(follows);
954
955 fc->size = cpu_to_le64(size);
956 fc->max_size = cpu_to_le64(max_size);
957 if (mtime)
958 ceph_encode_timespec(&fc->mtime, mtime);
959 if (atime)
960 ceph_encode_timespec(&fc->atime, atime);
961 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
962
963 fc->uid = cpu_to_le32(uid);
964 fc->gid = cpu_to_le32(gid);
965 fc->mode = cpu_to_le32(mode);
966
967 fc->xattr_version = cpu_to_le64(xattr_version);
968 if (xattrs_buf) {
969 msg->middle = ceph_buffer_get(xattrs_buf);
970 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
971 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
972 }
973
974 ceph_con_send(&session->s_con, msg);
975 return 0;
976}
977
978/*
979 * Queue cap releases when an inode is dropped from our cache. Since
980 * the inode is about to be destroyed, there is no need for i_lock.
981 */
982void ceph_queue_caps_release(struct inode *inode)
983{
984 struct ceph_inode_info *ci = ceph_inode(inode);
985 struct rb_node *p;
986
987 p = rb_first(&ci->i_caps);
988 while (p) {
989 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
990 struct ceph_mds_session *session = cap->session;
991 struct ceph_msg *msg;
992 struct ceph_mds_cap_release *head;
993 struct ceph_mds_cap_item *item;
994
995 spin_lock(&session->s_cap_lock);
996 BUG_ON(!session->s_num_cap_releases);
997 msg = list_first_entry(&session->s_cap_releases,
998 struct ceph_msg, list_head);
999
1000 dout(" adding %p release to mds%d msg %p (%d left)\n",
1001 inode, session->s_mds, msg, session->s_num_cap_releases);
1002
1003 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1004 head = msg->front.iov_base;
1005 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1006 item = msg->front.iov_base + msg->front.iov_len;
1007 item->ino = cpu_to_le64(ceph_ino(inode));
1008 item->cap_id = cpu_to_le64(cap->cap_id);
1009 item->migrate_seq = cpu_to_le32(cap->mseq);
1010 item->seq = cpu_to_le32(cap->issue_seq);
1011
1012 session->s_num_cap_releases--;
1013
1014 msg->front.iov_len += sizeof(*item);
1015 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1016 dout(" release msg %p full\n", msg);
1017 list_move_tail(&msg->list_head,
1018 &session->s_cap_releases_done);
1019 } else {
1020 dout(" release msg %p at %d/%d (%d)\n", msg,
1021 (int)le32_to_cpu(head->num),
1022 (int)CEPH_CAPS_PER_RELEASE,
1023 (int)msg->front.iov_len);
1024 }
1025 spin_unlock(&session->s_cap_lock);
1026 p = rb_next(p);
1027 __ceph_remove_cap(cap);
1028 }
1029}
1030
1031/*
1032 * Send a cap msg on the given inode. Update our caps state, then
1033 * drop i_lock and send the message.
1034 *
1035 * Make note of max_size reported/requested from mds, revoked caps
1036 * that have now been implemented.
1037 *
1038 * Make a half-hearted attempt to invalidate the page cache if we are
1039 * dropping RDCACHE. Note that this will leave behind locked pages
1040 * that we'll then need to deal with elsewhere.
1041 *
1042 * Return non-zero if delayed release, or we experienced an error
1043 * such that the caller should requeue + retry later.
1044 *
1045 * called with i_lock, then drops it.
1046 * caller should hold snap_rwsem (read), s_mutex.
1047 */
1048static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1049 int op, int used, int want, int retain, int flushing,
1050 unsigned *pflush_tid)
1051 __releases(cap->ci->vfs_inode->i_lock)
1052{
1053 struct ceph_inode_info *ci = cap->ci;
1054 struct inode *inode = &ci->vfs_inode;
1055 u64 cap_id = cap->cap_id;
1056 int held, revoking, dropping, keep;
1057 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1058 u64 size, max_size;
1059 struct timespec mtime, atime;
1060 int wake = 0;
1061 mode_t mode;
1062 uid_t uid;
1063 gid_t gid;
1064 struct ceph_mds_session *session;
1065 u64 xattr_version = 0;
1066 int delayed = 0;
1067 u64 flush_tid = 0;
1068 int i;
1069 int ret;
1070
1071 held = cap->issued | cap->implemented;
1072 revoking = cap->implemented & ~cap->issued;
1073 retain &= ~revoking;
1074 dropping = cap->issued & ~retain;
1075
1076 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1077 inode, cap, cap->session,
1078 ceph_cap_string(held), ceph_cap_string(held & retain),
1079 ceph_cap_string(revoking));
1080 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1081
1082 session = cap->session;
1083
1084 /* don't release wanted unless we've waited a bit. */
1085 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1086 time_before(jiffies, ci->i_hold_caps_min)) {
1087 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1088 ceph_cap_string(cap->issued),
1089 ceph_cap_string(cap->issued & retain),
1090 ceph_cap_string(cap->mds_wanted),
1091 ceph_cap_string(want));
1092 want |= cap->mds_wanted;
1093 retain |= cap->issued;
1094 delayed = 1;
1095 }
1096 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1097
1098 cap->issued &= retain; /* drop bits we don't want */
1099 if (cap->implemented & ~cap->issued) {
1100 /*
1101 * Wake up any waiters on wanted -> needed transition.
1102 * This is due to the weird transition from buffered
1103 * to sync IO... we need to flush dirty pages _before_
1104 * allowing sync writes to avoid reordering.
1105 */
1106 wake = 1;
1107 }
1108 cap->implemented &= cap->issued | used;
1109 cap->mds_wanted = want;
1110
1111 if (flushing) {
1112 /*
1113 * assign a tid for flush operations so we can avoid
1114 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1115 * clean type races. track latest tid for every bit
1116 * so we can handle flush AxFw, flush Fw, and have the
1117 * first ack clean Ax.
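		 * (e.g. flush Ax+Fw as tid 5, then flush Fw again as
		 * tid 6: the ack for tid 5 may clean Ax, but not Fw,
		 * whose latest tid is now 6.)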
1118 */
1119 flush_tid = ++ci->i_cap_flush_last_tid;
1120 if (pflush_tid)
1121 *pflush_tid = flush_tid;
1122 dout(" cap_flush_tid %d\n", (int)flush_tid);
1123 for (i = 0; i < CEPH_CAP_BITS; i++)
1124 if (flushing & (1 << i))
1125 ci->i_cap_flush_tid[i] = flush_tid;
1126 }
1127
1128 keep = cap->implemented;
1129 seq = cap->seq;
1130 issue_seq = cap->issue_seq;
1131 mseq = cap->mseq;
1132 size = inode->i_size;
1133 ci->i_reported_size = size;
1134 max_size = ci->i_wanted_max_size;
1135 ci->i_requested_max_size = max_size;
1136 mtime = inode->i_mtime;
1137 atime = inode->i_atime;
1138 time_warp_seq = ci->i_time_warp_seq;
1139 follows = ci->i_snap_realm->cached_context->seq;
1140 uid = inode->i_uid;
1141 gid = inode->i_gid;
1142 mode = inode->i_mode;
1143
1144 if (dropping & CEPH_CAP_XATTR_EXCL) {
1145 __ceph_build_xattrs_blob(ci);
1146 xattr_version = ci->i_xattrs.version + 1;
1147 }
1148
1149 spin_unlock(&inode->i_lock);
1150
1151 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1152 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1153 size, max_size, &mtime, &atime, time_warp_seq,
1154 uid, gid, mode,
1155 xattr_version,
1156 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1157 follows);
1158 if (ret < 0) {
1159 dout("error sending cap msg, must requeue %p\n", inode);
1160 delayed = 1;
1161 }
1162
1163 if (wake)
1164 wake_up(&ci->i_cap_wq);
1165
1166 return delayed;
1167}
1168
1169/*
1170 * When a snapshot is taken, clients accumulate dirty metadata on
1171 * inodes with capabilities in ceph_cap_snaps to describe the file
1172 * state at the time the snapshot was taken. This must be flushed
1173 * asynchronously back to the MDS once sync writes complete and dirty
1174 * data is written out.
1175 *
1176 * Called under i_lock. Takes s_mutex as needed.
1177 */
1178void __ceph_flush_snaps(struct ceph_inode_info *ci,
1179 struct ceph_mds_session **psession)
1180{
1181 struct inode *inode = &ci->vfs_inode;
1182 int mds;
1183 struct ceph_cap_snap *capsnap;
1184 u32 mseq;
1185 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1186 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1187 session->s_mutex */
1188 u64 next_follows = 0; /* keep track of how far we've gotten through the
1189 i_cap_snaps list, and skip these entries next time
1190 around to avoid an infinite loop */
1191
1192 if (psession)
1193 session = *psession;
1194
1195 dout("__flush_snaps %p\n", inode);
1196retry:
1197 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1198		/* avoid an infinite loop after retry */
1199 if (capsnap->follows < next_follows)
1200 continue;
1201 /*
1202 * we need to wait for sync writes to complete and for dirty
1203 * pages to be written out.
1204 */
1205 if (capsnap->dirty_pages || capsnap->writing)
1206 continue;
1207
1208 /* pick mds, take s_mutex */
1209 mds = __ceph_get_cap_mds(ci, &mseq);
1210 if (session && session->s_mds != mds) {
1211 dout("oops, wrong session %p mutex\n", session);
1212 mutex_unlock(&session->s_mutex);
1213 ceph_put_mds_session(session);
1214 session = NULL;
1215 }
1216 if (!session) {
1217 spin_unlock(&inode->i_lock);
1218 mutex_lock(&mdsc->mutex);
1219 session = __ceph_lookup_mds_session(mdsc, mds);
1220 mutex_unlock(&mdsc->mutex);
1221 if (session) {
1222 dout("inverting session/ino locks on %p\n",
1223 session);
1224 mutex_lock(&session->s_mutex);
1225 }
1226 /*
1227 * if session == NULL, we raced against a cap
1228 * deletion. retry, and we'll get a better
1229 * @mds value next time.
1230 */
1231 spin_lock(&inode->i_lock);
1232 goto retry;
1233 }
1234
1235 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1236 atomic_inc(&capsnap->nref);
1237 if (!list_empty(&capsnap->flushing_item))
1238 list_del_init(&capsnap->flushing_item);
1239 list_add_tail(&capsnap->flushing_item,
1240 &session->s_cap_snaps_flushing);
1241 spin_unlock(&inode->i_lock);
1242
1243 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1244 inode, capsnap, next_follows, capsnap->size);
1245 send_cap_msg(session, ceph_vino(inode).ino, 0,
1246 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1247 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1248 capsnap->size, 0,
1249 &capsnap->mtime, &capsnap->atime,
1250 capsnap->time_warp_seq,
1251 capsnap->uid, capsnap->gid, capsnap->mode,
1252 0, NULL,
1253 capsnap->follows);
1254
1255 next_follows = capsnap->follows + 1;
1256 ceph_put_cap_snap(capsnap);
1257
1258 spin_lock(&inode->i_lock);
1259 goto retry;
1260 }
1261
1262 /* we flushed them all; remove this inode from the queue */
1263 spin_lock(&mdsc->snap_flush_lock);
1264 list_del_init(&ci->i_snap_flush_item);
1265 spin_unlock(&mdsc->snap_flush_lock);
1266
1267 if (psession)
1268 *psession = session;
1269 else if (session) {
1270 mutex_unlock(&session->s_mutex);
1271 ceph_put_mds_session(session);
1272 }
1273}
1274
1275static void ceph_flush_snaps(struct ceph_inode_info *ci)
1276{
1277 struct inode *inode = &ci->vfs_inode;
1278
1279 spin_lock(&inode->i_lock);
1280 __ceph_flush_snaps(ci, NULL);
1281 spin_unlock(&inode->i_lock);
1282}
1283
1284/*
1285 * Mark caps dirty. If inode is newly dirty, add to the global dirty
1286 * list.
1287 */
1288void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1289{
1290 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
1291 struct inode *inode = &ci->vfs_inode;
1292 int was = ci->i_dirty_caps;
1293 int dirty = 0;
1294
1295 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1296 ceph_cap_string(mask), ceph_cap_string(was),
1297 ceph_cap_string(was | mask));
1298 ci->i_dirty_caps |= mask;
1299 if (was == 0) {
1300 dout(" inode %p now dirty\n", &ci->vfs_inode);
1301 BUG_ON(!list_empty(&ci->i_dirty_item));
1302 spin_lock(&mdsc->cap_dirty_lock);
1303 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1304 spin_unlock(&mdsc->cap_dirty_lock);
1305 if (ci->i_flushing_caps == 0) {
1306 igrab(inode);
1307 dirty |= I_DIRTY_SYNC;
1308 }
1309 }
1310 BUG_ON(list_empty(&ci->i_dirty_item));
1311 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1312 (mask & CEPH_CAP_FILE_BUFFER))
1313 dirty |= I_DIRTY_DATASYNC;
1314 if (dirty)
1315 __mark_inode_dirty(inode, dirty);
1316 __cap_delay_requeue(mdsc, ci);
1317}
1318
1319/*
1320 * Add dirty inode to the flushing list. Assign it a seq number so we
1321 * can wait for caps to flush without starving.
1322 *
1323 * Called under i_lock.
1324 */
1325static int __mark_caps_flushing(struct inode *inode,
1326 struct ceph_mds_session *session)
1327{
1328 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1329 struct ceph_inode_info *ci = ceph_inode(inode);
1330 int flushing;
1331
1332 BUG_ON(ci->i_dirty_caps == 0);
1333 BUG_ON(list_empty(&ci->i_dirty_item));
1334
1335 flushing = ci->i_dirty_caps;
1336 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1337 ceph_cap_string(flushing),
1338 ceph_cap_string(ci->i_flushing_caps),
1339 ceph_cap_string(ci->i_flushing_caps | flushing));
1340 ci->i_flushing_caps |= flushing;
1341 ci->i_dirty_caps = 0;
1342 dout(" inode %p now !dirty\n", inode);
1343
1344 spin_lock(&mdsc->cap_dirty_lock);
1345 list_del_init(&ci->i_dirty_item);
1346
1347 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1348 if (list_empty(&ci->i_flushing_item)) {
1349 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1350 mdsc->num_cap_flushing++;
1351 dout(" inode %p now flushing seq %lld\n", inode,
1352 ci->i_cap_flush_seq);
1353 } else {
1354 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1355 dout(" inode %p now flushing (more) seq %lld\n", inode,
1356 ci->i_cap_flush_seq);
1357 }
1358 spin_unlock(&mdsc->cap_dirty_lock);
1359
1360 return flushing;
1361}
1362
1363/*
1364 * try to invalidate mapping pages without blocking.
1365 */
1366static int mapping_is_empty(struct address_space *mapping)
1367{
1368 struct page *page = find_get_page(mapping, 0);
1369
1370 if (!page)
1371 return 1;
1372
1373 put_page(page);
1374 return 0;
1375}
1376
1377static int try_nonblocking_invalidate(struct inode *inode)
1378{
1379 struct ceph_inode_info *ci = ceph_inode(inode);
1380 u32 invalidating_gen = ci->i_rdcache_gen;
1381
1382 spin_unlock(&inode->i_lock);
1383 invalidate_mapping_pages(&inode->i_data, 0, -1);
1384 spin_lock(&inode->i_lock);
1385
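	/*
	 * The invalidate only counts as a success if the mapping is
	 * empty AND i_rdcache_gen was not bumped while i_lock was
	 * dropped; a bumped gen means FILE_CACHE was re-issued and
	 * pages may have been re-added behind our back.
	 */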
1386 if (mapping_is_empty(&inode->i_data) &&
1387 invalidating_gen == ci->i_rdcache_gen) {
1388 /* success. */
1389 dout("try_nonblocking_invalidate %p success\n", inode);
1390 ci->i_rdcache_gen = 0;
1391 ci->i_rdcache_revoking = 0;
1392 return 0;
1393 }
1394 dout("try_nonblocking_invalidate %p failed\n", inode);
1395 return -1;
1396}
1397
1398/*
1399 * Swiss army knife function to examine currently used and wanted
1400 * versus held caps. Release, flush, ack revoked caps to mds as
1401 * appropriate.
1402 *
1403 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1404 * cap release further.
1405 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1406 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1407 * further delay.
1408 */
1409void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1410 struct ceph_mds_session *session)
1411 __releases(session->s_mutex)
1412{
1413 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1414 struct ceph_mds_client *mdsc = &client->mdsc;
1415 struct inode *inode = &ci->vfs_inode;
1416 struct ceph_cap *cap;
1417 int file_wanted, used;
1418 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1419 int issued, implemented, want, retain, revoking, flushing = 0;
1420 int mds = -1; /* keep track of how far we've gone through i_caps list
1421 to avoid an infinite loop on retry */
1422 struct rb_node *p;
1423 int tried_invalidate = 0;
1424 int delayed = 0, sent = 0, force_requeue = 0, num;
1425 int queue_invalidate = 0;
1426 int is_delayed = flags & CHECK_CAPS_NODELAY;
1427
1428 /* if we are unmounting, flush any unused caps immediately. */
1429 if (mdsc->stopping)
1430 is_delayed = 1;
1431
1432 spin_lock(&inode->i_lock);
1433
1434 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1435 flags |= CHECK_CAPS_FLUSH;
1436
1437 /* flush snaps first time around only */
1438 if (!list_empty(&ci->i_cap_snaps))
1439 __ceph_flush_snaps(ci, &session);
1440 goto retry_locked;
1441retry:
1442 spin_lock(&inode->i_lock);
1443retry_locked:
1444 file_wanted = __ceph_caps_file_wanted(ci);
1445 used = __ceph_caps_used(ci);
1446 want = file_wanted | used;
1447 issued = __ceph_caps_issued(ci, &implemented);
1448 revoking = implemented & ~issued;
1449
1450 retain = want | CEPH_CAP_PIN;
1451 if (!mdsc->stopping && inode->i_nlink > 0) {
1452 if (want) {
1453 retain |= CEPH_CAP_ANY; /* be greedy */
1454 } else {
1455 retain |= CEPH_CAP_ANY_SHARED;
1456 /*
1457 * keep RD only if we didn't have the file open RW,
1458 * because then the mds would revoke it anyway to
1459 * journal max_size=0.
1460 */
1461 if (ci->i_max_size == 0)
1462 retain |= CEPH_CAP_ANY_RD;
1463 }
1464 }
1465
1466 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1467 " issued %s revoking %s retain %s %s%s%s\n", inode,
1468 ceph_cap_string(file_wanted),
1469 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1470 ceph_cap_string(ci->i_flushing_caps),
1471 ceph_cap_string(issued), ceph_cap_string(revoking),
1472 ceph_cap_string(retain),
1473 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1474 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1475 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1476
1477 /*
1478	 * If we no longer need to hold on to our old caps, and we may
1479 * have cached pages, but don't want them, then try to invalidate.
1480 * If we fail, it's because pages are locked.... try again later.
1481 */
1482 if ((!is_delayed || mdsc->stopping) &&
1483 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1484 ci->i_rdcache_gen && /* may have cached pages */
1485 (file_wanted == 0 || /* no open files */
1486 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
1487 !tried_invalidate) {
1488 dout("check_caps trying to invalidate on %p\n", inode);
1489 if (try_nonblocking_invalidate(inode) < 0) {
1490 if (revoking & CEPH_CAP_FILE_CACHE) {
1491 dout("check_caps queuing invalidate\n");
1492 queue_invalidate = 1;
1493 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1494 } else {
1495 dout("check_caps failed to invalidate pages\n");
1496 /* we failed to invalidate pages. check these
1497 caps again later. */
1498 force_requeue = 1;
1499 __cap_set_timeouts(mdsc, ci);
1500 }
1501 }
1502 tried_invalidate = 1;
1503 goto retry_locked;
1504 }
1505
1506 num = 0;
1507 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1508 cap = rb_entry(p, struct ceph_cap, ci_node);
1509 num++;
1510
1511 /* avoid looping forever */
1512 if (mds >= cap->mds ||
1513 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1514 continue;
1515
1516 /* NOTE: no side-effects allowed, until we take s_mutex */
1517
1518 revoking = cap->implemented & ~cap->issued;
1519 if (revoking)
1520 dout(" mds%d revoking %s\n", cap->mds,
1521 ceph_cap_string(revoking));
1522
1523 if (cap == ci->i_auth_cap &&
1524 (cap->issued & CEPH_CAP_FILE_WR)) {
1525 /* request larger max_size from MDS? */
1526 if (ci->i_wanted_max_size > ci->i_max_size &&
1527 ci->i_wanted_max_size > ci->i_requested_max_size) {
1528 dout("requesting new max_size\n");
1529 goto ack;
1530 }
1531
1532 /* approaching file_max? */
1533 if ((inode->i_size << 1) >= ci->i_max_size &&
1534 (ci->i_reported_size << 1) < ci->i_max_size) {
1535 dout("i_size approaching max_size\n");
1536 goto ack;
1537 }
1538 }
1539 /* flush anything dirty? */
1540 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1541 ci->i_dirty_caps) {
1542 dout("flushing dirty caps\n");
1543 goto ack;
1544 }
1545
1546 /* completed revocation? going down and there are no caps? */
1547 if (revoking && (revoking & used) == 0) {
1548 dout("completed revocation of %s\n",
1549 ceph_cap_string(cap->implemented & ~cap->issued));
1550 goto ack;
1551 }
1552
1553 /* want more caps from mds? */
1554 if (want & ~(cap->mds_wanted | cap->issued))
1555 goto ack;
1556
1557 /* things we might delay */
1558 if ((cap->issued & ~retain) == 0 &&
1559 cap->mds_wanted == want)
1560 continue; /* nope, all good */
1561
1562 if (is_delayed)
1563 goto ack;
1564
1565 /* delay? */
1566 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1567 time_before(jiffies, ci->i_hold_caps_max)) {
1568 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1569 ceph_cap_string(cap->issued),
1570 ceph_cap_string(cap->issued & retain),
1571 ceph_cap_string(cap->mds_wanted),
1572 ceph_cap_string(want));
1573 delayed++;
1574 continue;
1575 }
1576
1577ack:
1578 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1579 dout(" skipping %p I_NOFLUSH set\n", inode);
1580 continue;
1581 }
1582
1583 if (session && session != cap->session) {
1584 dout("oops, wrong session %p mutex\n", session);
1585 mutex_unlock(&session->s_mutex);
1586 session = NULL;
1587 }
1588 if (!session) {
1589 session = cap->session;
1590 if (mutex_trylock(&session->s_mutex) == 0) {
1591 dout("inverting session/ino locks on %p\n",
1592 session);
1593 spin_unlock(&inode->i_lock);
1594 if (took_snap_rwsem) {
1595 up_read(&mdsc->snap_rwsem);
1596 took_snap_rwsem = 0;
1597 }
1598 mutex_lock(&session->s_mutex);
1599 goto retry;
1600 }
1601 }
1602 /* take snap_rwsem after session mutex */
1603 if (!took_snap_rwsem) {
1604 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1605 dout("inverting snap/in locks on %p\n",
1606 inode);
1607 spin_unlock(&inode->i_lock);
1608 down_read(&mdsc->snap_rwsem);
1609 took_snap_rwsem = 1;
1610 goto retry;
1611 }
1612 took_snap_rwsem = 1;
1613 }
1614
1615 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1616 flushing = __mark_caps_flushing(inode, session);
1617
1618 mds = cap->mds; /* remember mds, so we don't repeat */
1619 sent++;
1620
1621 /* __send_cap drops i_lock */
1622 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1623 retain, flushing, NULL);
1624 goto retry; /* retake i_lock and restart our cap scan. */
1625 }
1626
1627 /*
1628 * Reschedule delayed caps release if we delayed anything,
1629 * otherwise cancel.
1630 */
1631 if (delayed && is_delayed)
1632 force_requeue = 1; /* __send_cap delayed release; requeue */
1633 if (!delayed && !is_delayed)
1634 __cap_delay_cancel(mdsc, ci);
1635 else if (!is_delayed || force_requeue)
1636 __cap_delay_requeue(mdsc, ci);
1637
1638 spin_unlock(&inode->i_lock);
1639
1640 if (queue_invalidate)
1641 ceph_queue_invalidate(inode);
1642
1643 if (session)
1644 mutex_unlock(&session->s_mutex);
1645 if (took_snap_rwsem)
1646 up_read(&mdsc->snap_rwsem);
1647}
1648
1649/*
1650 * Try to flush dirty caps back to the auth mds.
1651 */
1652static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1653 unsigned *flush_tid)
1654{
1655 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1656 struct ceph_inode_info *ci = ceph_inode(inode);
1657 int unlock_session = session ? 0 : 1;
1658 int flushing = 0;
1659
1660retry:
1661 spin_lock(&inode->i_lock);
1662 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1663 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1664 goto out;
1665 }
1666 if (ci->i_dirty_caps && ci->i_auth_cap) {
1667 struct ceph_cap *cap = ci->i_auth_cap;
1668 int used = __ceph_caps_used(ci);
1669 int want = __ceph_caps_wanted(ci);
1670 int delayed;
1671
1672 if (!session) {
1673 spin_unlock(&inode->i_lock);
1674 session = cap->session;
1675 mutex_lock(&session->s_mutex);
1676 goto retry;
1677 }
1678 BUG_ON(session != cap->session);
1679 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1680 goto out;
1681
1682 flushing = __mark_caps_flushing(inode, session);
1683
1684 /* __send_cap drops i_lock */
1685 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1686 cap->issued | cap->implemented, flushing,
1687 flush_tid);
1688 if (!delayed)
1689 goto out_unlocked;
1690
1691 spin_lock(&inode->i_lock);
1692 __cap_delay_requeue(mdsc, ci);
1693 }
1694out:
1695 spin_unlock(&inode->i_lock);
1696out_unlocked:
1697 if (session && unlock_session)
1698 mutex_unlock(&session->s_mutex);
1699 return flushing;
1700}
1701
1702/*
1703 * Return true if we've flushed caps through the given flush_tid.
1704 */
1705static int caps_are_flushed(struct inode *inode, unsigned tid)
1706{
1707 struct ceph_inode_info *ci = ceph_inode(inode);
1708 int dirty, i, ret = 1;
1709
1710 spin_lock(&inode->i_lock);
1711 dirty = __ceph_caps_dirty(ci);
1712 for (i = 0; i < CEPH_CAP_BITS; i++)
1713 if ((ci->i_flushing_caps & (1 << i)) &&
1714 ci->i_cap_flush_tid[i] <= tid) {
1715 /* still flushing this bit */
1716 ret = 0;
1717 break;
1718 }
1719 spin_unlock(&inode->i_lock);
1720 return ret;
1721}
1722
1723/*
1724 * Wait on any unsafe replies for the given inode. First wait on the
1725 * newest request, and make that the upper bound. Then, if there are
1726 * more requests, keep waiting on the oldest as long as it is still older
1727 * than the original request.
1728 */
1729static void sync_write_wait(struct inode *inode)
1730{
1731 struct ceph_inode_info *ci = ceph_inode(inode);
1732 struct list_head *head = &ci->i_unsafe_writes;
1733 struct ceph_osd_request *req;
1734 u64 last_tid;
1735
1736 spin_lock(&ci->i_unsafe_lock);
1737 if (list_empty(head))
1738 goto out;
1739
1740 /* set upper bound as _last_ entry in chain */
1741 req = list_entry(head->prev, struct ceph_osd_request,
1742 r_unsafe_item);
1743 last_tid = req->r_tid;
1744
1745 do {
1746 ceph_osdc_get_request(req);
1747 spin_unlock(&ci->i_unsafe_lock);
1748 dout("sync_write_wait on tid %llu (until %llu)\n",
1749 req->r_tid, last_tid);
1750 wait_for_completion(&req->r_safe_completion);
1751 spin_lock(&ci->i_unsafe_lock);
1752 ceph_osdc_put_request(req);
1753
1754 /*
1755 * from here on look at first entry in chain, since we
1756 * only want to wait for anything older than last_tid
1757 */
1758 if (list_empty(head))
1759 break;
1760 req = list_entry(head->next, struct ceph_osd_request,
1761 r_unsafe_item);
1762 } while (req->r_tid < last_tid);
1763out:
1764 spin_unlock(&ci->i_unsafe_lock);
1765}
1766
1767int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
1768{
1769 struct inode *inode = dentry->d_inode;
1770 struct ceph_inode_info *ci = ceph_inode(inode);
1771 unsigned flush_tid;
1772 int ret;
1773 int dirty;
1774
1775 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1776 sync_write_wait(inode);
1777
1778 ret = filemap_write_and_wait(inode->i_mapping);
1779 if (ret < 0)
1780 return ret;
1781
1782 dirty = try_flush_caps(inode, NULL, &flush_tid);
1783 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1784
1785 /*
1786 * only wait on non-file metadata writeback (the mds
1787 * can recover size and mtime, so we don't need to
1788 * wait for that)
1789 */
1790 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1791 dout("fsync waiting for flush_tid %u\n", flush_tid);
1792 ret = wait_event_interruptible(ci->i_cap_wq,
1793 caps_are_flushed(inode, flush_tid));
1794 }
1795
1796 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1797 return ret;
1798}
1799
1800/*
1801 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1802 * queue inode for flush but don't do so immediately, because we can
1803 * get by with fewer MDS messages if we wait for data writeback to
1804 * complete first.
1805 */
1806int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1807{
1808 struct ceph_inode_info *ci = ceph_inode(inode);
1809 unsigned flush_tid;
1810 int err = 0;
1811 int dirty;
1812 int wait = wbc->sync_mode == WB_SYNC_ALL;
1813
1814 dout("write_inode %p wait=%d\n", inode, wait);
1815 if (wait) {
1816 dirty = try_flush_caps(inode, NULL, &flush_tid);
1817 if (dirty)
1818 err = wait_event_interruptible(ci->i_cap_wq,
1819 caps_are_flushed(inode, flush_tid));
1820 } else {
1821 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1822
1823 spin_lock(&inode->i_lock);
1824 if (__ceph_caps_dirty(ci))
1825 __cap_delay_requeue_front(mdsc, ci);
1826 spin_unlock(&inode->i_lock);
1827 }
1828 return err;
1829}
1830
1831/*
1832 * After a recovering MDS goes active, we need to resend any caps
1833 * we were flushing.
1834 *
1835 * Caller holds session->s_mutex.
1836 */
1837static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1838 struct ceph_mds_session *session)
1839{
1840 struct ceph_cap_snap *capsnap;
1841
1842 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1843 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1844 flushing_item) {
1845 struct ceph_inode_info *ci = capsnap->ci;
1846 struct inode *inode = &ci->vfs_inode;
1847 struct ceph_cap *cap;
1848
1849 spin_lock(&inode->i_lock);
1850 cap = ci->i_auth_cap;
1851 if (cap && cap->session == session) {
1852 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1853 cap, capsnap);
1854 __ceph_flush_snaps(ci, &session);
1855 } else {
1856 pr_err("%p auth cap %p not mds%d ???\n", inode,
1857 cap, session->s_mds);
1858 spin_unlock(&inode->i_lock);
1859 }
1860 }
1861}
1862
1863void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1864 struct ceph_mds_session *session)
1865{
1866 struct ceph_inode_info *ci;
1867
1868 kick_flushing_capsnaps(mdsc, session);
1869
1870 dout("kick_flushing_caps mds%d\n", session->s_mds);
1871 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1872 struct inode *inode = &ci->vfs_inode;
1873 struct ceph_cap *cap;
1874 int delayed = 0;
1875
1876 spin_lock(&inode->i_lock);
1877 cap = ci->i_auth_cap;
1878 if (cap && cap->session == session) {
1879 dout("kick_flushing_caps %p cap %p %s\n", inode,
1880 cap, ceph_cap_string(ci->i_flushing_caps));
1881 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1882 __ceph_caps_used(ci),
1883 __ceph_caps_wanted(ci),
1884 cap->issued | cap->implemented,
1885 ci->i_flushing_caps, NULL);
1886 if (delayed) {
1887 spin_lock(&inode->i_lock);
1888 __cap_delay_requeue(mdsc, ci);
1889 spin_unlock(&inode->i_lock);
1890 }
1891 } else {
1892 pr_err("%p auth cap %p not mds%d ???\n", inode,
1893 cap, session->s_mds);
1894 spin_unlock(&inode->i_lock);
1895 }
1896 }
1897}
1898
1899
1900/*
1901 * Take references to capabilities we hold, so that we don't release
1902 * them to the MDS prematurely.
1903 *
1904 * Protected by i_lock.
1905 */
1906static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1907{
1908 if (got & CEPH_CAP_PIN)
1909 ci->i_pin_ref++;
1910 if (got & CEPH_CAP_FILE_RD)
1911 ci->i_rd_ref++;
1912 if (got & CEPH_CAP_FILE_CACHE)
1913 ci->i_rdcache_ref++;
1914 if (got & CEPH_CAP_FILE_WR)
1915 ci->i_wr_ref++;
1916 if (got & CEPH_CAP_FILE_BUFFER) {
1917 if (ci->i_wrbuffer_ref == 0)
1918 igrab(&ci->vfs_inode);
1919 ci->i_wrbuffer_ref++;
1920 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1921 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1922 }
1923}
1924
1925/*
1926 * Try to grab cap references. Specify those refs we @want, and the
1927 * minimal set we @need. Also include the larger offset we are writing
1928 * to (when applicable), and check against max_size here as well.
1929 * Note that caller is responsible for ensuring max_size increases are
1930 * requested from the MDS.
1931 */
1932static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1933 int *got, loff_t endoff, int *check_max, int *err)
1934{
1935 struct inode *inode = &ci->vfs_inode;
1936 int ret = 0;
1937 int have, implemented;
1938 int file_wanted;
1939
1940 dout("get_cap_refs %p need %s want %s\n", inode,
1941 ceph_cap_string(need), ceph_cap_string(want));
1942 spin_lock(&inode->i_lock);
1943
1944 /* make sure file is actually open */
1945 file_wanted = __ceph_caps_file_wanted(ci);
1946 if ((file_wanted & need) == 0) {
1947 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1948 ceph_cap_string(need), ceph_cap_string(file_wanted));
1949 *err = -EBADF;
1950 ret = 1;
1951 goto out;
1952 }
1953
1954 if (need & CEPH_CAP_FILE_WR) {
1955 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1956 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1957 inode, endoff, ci->i_max_size);
1958 if (endoff > ci->i_wanted_max_size) {
1959 *check_max = 1;
1960 ret = 1;
1961 }
1962 goto out;
1963 }
1964 /*
1965 * If a sync write is in progress, we must wait, so that we
1966 * can get a final snapshot value for size+mtime.
1967 */
1968 if (__ceph_have_pending_cap_snap(ci)) {
1969 dout("get_cap_refs %p cap_snap_pending\n", inode);
1970 goto out;
1971 }
1972 }
1973 have = __ceph_caps_issued(ci, &implemented);
1974
1975 /*
1976 * disallow writes while a truncate is pending
1977 */
1978 if (ci->i_truncate_pending)
1979 have &= ~CEPH_CAP_FILE_WR;
1980
1981 if ((have & need) == need) {
1982 /*
1983 * Look at (implemented & ~have & not) so that we keep waiting
1984 * on transition from wanted -> needed caps. This is needed
1985 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
1986 * going before a prior buffered writeback happens.
1987 */
1988 int not = want & ~(have & need);
1989 int revoking = implemented & ~have;
1990 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
1991 inode, ceph_cap_string(have), ceph_cap_string(not),
1992 ceph_cap_string(revoking));
1993 if ((revoking & not) == 0) {
1994 *got = need | (have & want);
1995 __take_cap_refs(ci, *got);
1996 ret = 1;
1997 }
1998 } else {
1999 dout("get_cap_refs %p have %s needed %s\n", inode,
2000 ceph_cap_string(have), ceph_cap_string(need));
2001 }
2002out:
2003 spin_unlock(&inode->i_lock);
2004 dout("get_cap_refs %p ret %d got %s\n", inode,
2005 ret, ceph_cap_string(*got));
2006 return ret;
2007}
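The grant test above reduces to a pure function over bitmasks: take references only when every needed bit is issued and none of the extra wanted bits is mid-revocation (implemented but no longer issued). A hedged standalone restatement, using toy cap values rather than the real CEPH_CAP_* encoding:

#include <stdio.h>

/* toy cap bits, standing in for the CEPH_CAP_FILE_* values */
#define RD    1
#define CACHE 2

static int can_take_refs(int have, int implemented, int need, int want,
			 int *got)
{
	int not, revoking;

	if ((have & need) != need)
		return 0;                       /* wait: needed cap missing */
	not = want & ~(have & need);            /* extra bits we would take */
	revoking = implemented & ~have;         /* bits mid-revocation */
	if (revoking & not)
		return 0;                       /* wait out the revoke */
	*got = need | (have & want);
	return 1;
}

int main(void)
{
	int got = 0;

	/* CACHE is implemented but no longer issued: a read that only
	 * needs RD still succeeds, one that also wants CACHE must wait */
	printf("%d\n", can_take_refs(RD, RD | CACHE, RD, RD, &got));         /* 1 */
	printf("%d\n", can_take_refs(RD, RD | CACHE, RD, RD | CACHE, &got)); /* 0 */
	return 0;
}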
2008
2009/*
2010 * Check the offset we are writing up to against our current
2011 * max_size. If necessary, tell the MDS we want to write to
2012 * a larger offset.
2013 */
2014static void check_max_size(struct inode *inode, loff_t endoff)
2015{
2016 struct ceph_inode_info *ci = ceph_inode(inode);
2017 int check = 0;
2018
2019 /* do we need to explicitly request a larger max_size? */
2020 spin_lock(&inode->i_lock);
2021 if ((endoff >= ci->i_max_size ||
2022 endoff > (inode->i_size << 1)) &&
2023 endoff > ci->i_wanted_max_size) {
2024 dout("write %p at large endoff %llu, req max_size\n",
2025 inode, endoff);
2026 ci->i_wanted_max_size = endoff;
2027 check = 1;
2028 }
2029 spin_unlock(&inode->i_lock);
2030 if (check)
2031 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2032}
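Stripped of the locking, the predicate is: ask the MDS when the write endpoint reaches max_size or more than doubles i_size, provided we have not already requested at least that much. A standalone sketch (identifiers ending in _demo are invented for illustration):

#include <stdio.h>

typedef long long loff_t_demo;   /* stand-in for loff_t */

/* should we ask the MDS for a larger max_size? mirrors check_max_size() */
static int need_max_size_request(loff_t_demo endoff, loff_t_demo max_size,
				 loff_t_demo i_size, loff_t_demo wanted_max)
{
	return (endoff >= max_size || endoff > (i_size << 1)) &&
		endoff > wanted_max;
}

int main(void)
{
	/* writing at 3 MB with a 4 MB max_size and a 1 MB file: the
	 * offset more than doubles i_size, so request early */
	printf("%d\n", need_max_size_request(3 << 20, 4 << 20,
					     1 << 20, 0));   /* 1 */
	/* small append well under max_size: no request needed */
	printf("%d\n", need_max_size_request(1 << 10, 4 << 20,
					     1 << 20, 0));   /* 0 */
	return 0;
}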
2033
2034/*
2035 * Wait for caps, and take cap references. If we can't get a WR cap
2036 * due to a small max_size, make sure we check_max_size (and possibly
2037 * ask the mds) so we don't get hung up indefinitely.
2038 */
2039int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2040 loff_t endoff)
2041{
2042 int check_max, ret, err;
2043
2044retry:
2045 if (endoff > 0)
2046 check_max_size(&ci->vfs_inode, endoff);
2047 check_max = 0;
2048 err = 0;
2049 ret = wait_event_interruptible(ci->i_cap_wq,
2050 try_get_cap_refs(ci, need, want,
2051 got, endoff,
2052 &check_max, &err));
2053 if (err)
2054 ret = err;
2055 if (check_max)
2056 goto retry;
2057 return ret;
2058}
2059
2060/*
2061 * Take cap refs. Caller must already know we hold at least one ref
2062 * on the caps in question or we don't know this is safe.
2063 */
2064void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2065{
2066 spin_lock(&ci->vfs_inode.i_lock);
2067 __take_cap_refs(ci, caps);
2068 spin_unlock(&ci->vfs_inode.i_lock);
2069}
2070
2071/*
2072 * Release cap refs.
2073 *
2074 * If we released the last ref on any given cap, call ceph_check_caps
2075 * to release (or schedule a release).
2076 *
2077 * If we are releasing a WR cap (from a sync write), finalize any affected
2078 * cap_snap, and wake up any waiters.
2079 */
2080void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2081{
2082 struct inode *inode = &ci->vfs_inode;
2083 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2084 struct ceph_cap_snap *capsnap;
2085
2086 spin_lock(&inode->i_lock);
2087 if (had & CEPH_CAP_PIN)
2088 --ci->i_pin_ref;
2089 if (had & CEPH_CAP_FILE_RD)
2090 if (--ci->i_rd_ref == 0)
2091 last++;
2092 if (had & CEPH_CAP_FILE_CACHE)
2093 if (--ci->i_rdcache_ref == 0)
2094 last++;
2095 if (had & CEPH_CAP_FILE_BUFFER) {
2096 if (--ci->i_wrbuffer_ref == 0) {
2097 last++;
2098 put++;
2099 }
2100 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2101 inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2102 }
2103 if (had & CEPH_CAP_FILE_WR)
2104 if (--ci->i_wr_ref == 0) {
2105 last++;
2106 if (!list_empty(&ci->i_cap_snaps)) {
2107 capsnap = list_first_entry(&ci->i_cap_snaps,
2108 struct ceph_cap_snap,
2109 ci_item);
2110 if (capsnap->writing) {
2111 capsnap->writing = 0;
2112 flushsnaps =
2113 __ceph_finish_cap_snap(ci,
2114 capsnap);
2115 wake = 1;
2116 }
2117 }
2118 }
2119 spin_unlock(&inode->i_lock);
2120
2121 dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
2122 last ? "last" : "");
2123
2124 if (last && !flushsnaps)
2125 ceph_check_caps(ci, 0, NULL);
2126 else if (flushsnaps)
2127 ceph_flush_snaps(ci);
2128 if (wake)
2129 wake_up(&ci->i_cap_wq);
2130 if (put)
2131 iput(inode);
2132}
2133
2134/*
2135 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2136 * context. Adjust per-snap dirty page accounting as appropriate.
2137 * Once all dirty data for a cap_snap is flushed, flush snapped file
2138 * metadata back to the MDS. If we dropped the last ref, call
2139 * ceph_check_caps.
2140 */
2141void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2142 struct ceph_snap_context *snapc)
2143{
2144 struct inode *inode = &ci->vfs_inode;
2145 int last = 0;
2146 int last_snap = 0;
2147 int found = 0;
2148 struct ceph_cap_snap *capsnap = NULL;
2149
2150 spin_lock(&inode->i_lock);
2151 ci->i_wrbuffer_ref -= nr;
2152 last = !ci->i_wrbuffer_ref;
2153
2154 if (ci->i_head_snapc == snapc) {
2155 ci->i_wrbuffer_ref_head -= nr;
2156 if (!ci->i_wrbuffer_ref_head) {
2157 ceph_put_snap_context(ci->i_head_snapc);
2158 ci->i_head_snapc = NULL;
2159 }
2160 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2161 inode,
2162 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2163 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2164 last ? " LAST" : "");
2165 } else {
2166 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2167 if (capsnap->context == snapc) {
2168 found = 1;
2169 capsnap->dirty_pages -= nr;
2170 last_snap = !capsnap->dirty_pages;
2171 break;
2172 }
2173 }
2174 BUG_ON(!found);
2175 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2176 " snap %lld %d/%d -> %d/%d %s%s\n",
2177 inode, capsnap, capsnap->context->seq,
2178 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2179 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2180 last ? " (wrbuffer last)" : "",
2181 last_snap ? " (capsnap last)" : "");
2182 }
2183
2184 spin_unlock(&inode->i_lock);
2185
2186 if (last) {
2187 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2188 iput(inode);
2189 } else if (last_snap) {
2190 ceph_flush_snaps(ci);
2191 wake_up(&ci->i_cap_wq);
2192 }
2193}
2194
2195/*
2196 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2197 * actually be a revocation if it specifies a smaller cap set.)
2198 *
2199 * caller holds s_mutex and i_lock, we drop both.
2200 *
2201 * check_caps outcome (the function itself returns void):
2202 * 0 - ok, nothing further to do
2203 * 1 - check_caps on auth cap only (writeback)
2204 * 2 - check_caps (ack revoke)
2205 */
2206static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2207 struct ceph_mds_session *session,
2208 struct ceph_cap *cap,
2209 struct ceph_buffer *xattr_buf)
2210 __releases(inode->i_lock)
2211 __releases(session->s_mutex)
2212{
2213 struct ceph_inode_info *ci = ceph_inode(inode);
2214 int mds = session->s_mds;
2215 int seq = le32_to_cpu(grant->seq);
2216 int newcaps = le32_to_cpu(grant->caps);
2217 int issued, implemented, used, wanted, dirty;
2218 u64 size = le64_to_cpu(grant->size);
2219 u64 max_size = le64_to_cpu(grant->max_size);
2220 struct timespec mtime, atime, ctime;
2221 int check_caps = 0;
2222 int wake = 0;
2223 int writeback = 0;
2224 int revoked_rdcache = 0;
2225 int queue_invalidate = 0;
2226
2227 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2228 inode, cap, mds, seq, ceph_cap_string(newcaps));
2229 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2230 inode->i_size);
2231
2232 /*
2233 * If CACHE is being revoked, and we have no dirty buffers,
2234 * try to invalidate (once). (If there are dirty buffers, we
2235 * will invalidate _after_ writeback.)
2236 */
2237 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2238 !ci->i_wrbuffer_ref) {
2239 if (try_nonblocking_invalidate(inode) == 0) {
2240 revoked_rdcache = 1;
2241 } else {
2242 /* there were locked pages.. invalidate later
2243 in a separate thread. */
2244 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2245 queue_invalidate = 1;
2246 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2247 }
2248 }
2249 }
2250
2251 /* side effects now are allowed */
2252
2253 issued = __ceph_caps_issued(ci, &implemented);
2254 issued |= implemented | __ceph_caps_dirty(ci);
2255
2256 cap->cap_gen = session->s_cap_gen;
2257
2258 __check_cap_issue(ci, cap, newcaps);
2259
2260 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2261 inode->i_mode = le32_to_cpu(grant->mode);
2262 inode->i_uid = le32_to_cpu(grant->uid);
2263 inode->i_gid = le32_to_cpu(grant->gid);
2264 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2265 inode->i_uid, inode->i_gid);
2266 }
2267
2268 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2269 inode->i_nlink = le32_to_cpu(grant->nlink);
2270
2271 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2272 int len = le32_to_cpu(grant->xattr_len);
2273 u64 version = le64_to_cpu(grant->xattr_version);
2274
2275 if (version > ci->i_xattrs.version) {
2276 dout(" got new xattrs v%llu on %p len %d\n",
2277 version, inode, len);
2278 if (ci->i_xattrs.blob)
2279 ceph_buffer_put(ci->i_xattrs.blob);
2280 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2281 ci->i_xattrs.version = version;
2282 }
2283 }
2284
2285 /* size/ctime/mtime/atime? */
2286 ceph_fill_file_size(inode, issued,
2287 le32_to_cpu(grant->truncate_seq),
2288 le64_to_cpu(grant->truncate_size), size);
2289 ceph_decode_timespec(&mtime, &grant->mtime);
2290 ceph_decode_timespec(&atime, &grant->atime);
2291 ceph_decode_timespec(&ctime, &grant->ctime);
2292 ceph_fill_file_time(inode, issued,
2293 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2294 &atime);
2295
2296 /* max size increase? */
2297 if (max_size != ci->i_max_size) {
2298 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2299 ci->i_max_size = max_size;
2300 if (max_size >= ci->i_wanted_max_size) {
2301 ci->i_wanted_max_size = 0; /* reset */
2302 ci->i_requested_max_size = 0;
2303 }
2304 wake = 1;
2305 }
2306
2307 /* check cap bits */
2308 wanted = __ceph_caps_wanted(ci);
2309 used = __ceph_caps_used(ci);
2310 dirty = __ceph_caps_dirty(ci);
2311 dout(" my wanted = %s, used = %s, dirty %s\n",
2312 ceph_cap_string(wanted),
2313 ceph_cap_string(used),
2314 ceph_cap_string(dirty));
2315 if (wanted != le32_to_cpu(grant->wanted)) {
2316 dout("mds wanted %s -> %s\n",
2317 ceph_cap_string(le32_to_cpu(grant->wanted)),
2318 ceph_cap_string(wanted));
2319 grant->wanted = cpu_to_le32(wanted);
2320 }
2321
2322 cap->seq = seq;
2323
2324 /* file layout may have changed */
2325 ci->i_layout = grant->layout;
2326
2327 /* revocation, grant, or no-op? */
2328 if (cap->issued & ~newcaps) {
2329 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2330 ceph_cap_string(newcaps));
2331 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2332 writeback = 1; /* will delay ack */
2333 else if (dirty & ~newcaps)
2334 check_caps = 1; /* initiate writeback in check_caps */
2335 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2336 revoked_rdcache)
2337 check_caps = 2; /* send revoke ack in check_caps */
2338 cap->issued = newcaps;
2339 cap->implemented |= newcaps;
2340 } else if (cap->issued == newcaps) {
2341 dout("caps unchanged: %s -> %s\n",
2342 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2343 } else {
2344 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2345 ceph_cap_string(newcaps));
2346 cap->issued = newcaps;
2347 cap->implemented |= newcaps; /* add bits only, to
2348 * avoid stepping on a
2349 * pending revocation */
2350 wake = 1;
2351 }
2352 BUG_ON(cap->issued & ~cap->implemented);
2353
2354 spin_unlock(&inode->i_lock);
2355 if (writeback)
2356 /*
2357 * queue inode for writeback: we can't actually call
2358 * filemap_write_and_wait, etc. from message handler
2359 * context.
2360 */
2361 ceph_queue_writeback(inode);
2362 if (queue_invalidate)
2363 ceph_queue_invalidate(inode);
2364 if (wake)
2365 wake_up(&ci->i_cap_wq);
2366
2367 if (check_caps == 1)
2368 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2369 session);
2370 else if (check_caps == 2)
2371 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2372 else
2373 mutex_unlock(&session->s_mutex);
2374}
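Condensed, the revocation branch picks one of three follow-ups: defer behind data writeback, trigger a metadata flush via check_caps, or ack immediately once nothing cached blocks it. A hypothetical userspace reduction (the enum, helper, and the two cap constants are illustrative toy values, not the kernel's encodings):

#include <stdio.h>

enum ack { ACK_NONE, ACK_WRITEBACK, ACK_CHECK_FLUSH, ACK_CHECK_REVOKE };

#define FILE_BUFFER 0x20   /* toy stand-in for CEPH_CAP_FILE_BUFFER */
#define FILE_CACHE  0x04   /* toy stand-in for CEPH_CAP_FILE_CACHE */

static enum ack classify_revoke(int issued, int newcaps, int used,
				int dirty, int revoked_rdcache)
{
	if (!(issued & ~newcaps))
		return ACK_NONE;                /* grant or no-op, not a revoke */
	if ((used & ~newcaps) & FILE_BUFFER)
		return ACK_WRITEBACK;           /* flush buffered data first */
	if (dirty & ~newcaps)
		return ACK_CHECK_FLUSH;         /* write back dirty metadata */
	if (!((used & ~newcaps) & FILE_CACHE) || revoked_rdcache)
		return ACK_CHECK_REVOKE;        /* can ack the revoke now */
	return ACK_NONE;                        /* wait for cache to drop */
}

int main(void)
{
	/* BUFFER is being revoked while in use: defer behind writeback */
	printf("%d\n", classify_revoke(FILE_BUFFER, 0, FILE_BUFFER, 0, 0)); /* 1 */
	return 0;
}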
2375
2376/*
2377 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2378 * MDS has been safely committed.
2379 */
2380static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2381 struct ceph_mds_caps *m,
2382 struct ceph_mds_session *session,
2383 struct ceph_cap *cap)
2384 __releases(inode->i_lock)
2385{
2386 struct ceph_inode_info *ci = ceph_inode(inode);
2387 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
2388 unsigned seq = le32_to_cpu(m->seq);
2389 int dirty = le32_to_cpu(m->dirty);
2390 int cleaned = 0;
2391 int drop = 0;
2392 int i;
2393
2394 for (i = 0; i < CEPH_CAP_BITS; i++)
2395 if ((dirty & (1 << i)) &&
2396 flush_tid == ci->i_cap_flush_tid[i])
2397 cleaned |= 1 << i;
2398
2399 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2400 " flushing %s -> %s\n",
2401 inode, session->s_mds, seq, ceph_cap_string(dirty),
2402 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2403 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2404
2405 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2406 goto out;
2407
2408 ci->i_flushing_caps &= ~cleaned;
2409
2410 spin_lock(&mdsc->cap_dirty_lock);
2411 if (ci->i_flushing_caps == 0) {
2412 list_del_init(&ci->i_flushing_item);
2413 if (!list_empty(&session->s_cap_flushing))
2414 dout(" mds%d still flushing cap on %p\n",
2415 session->s_mds,
2416 &list_entry(session->s_cap_flushing.next,
2417 struct ceph_inode_info,
2418 i_flushing_item)->vfs_inode);
2419 mdsc->num_cap_flushing--;
2420 wake_up(&mdsc->cap_flushing_wq);
2421 dout(" inode %p now !flushing\n", inode);
2422
2423 if (ci->i_dirty_caps == 0) {
2424 dout(" inode %p now clean\n", inode);
2425 BUG_ON(!list_empty(&ci->i_dirty_item));
2426 drop = 1;
2427 } else {
2428 BUG_ON(list_empty(&ci->i_dirty_item));
2429 }
2430 }
2431 spin_unlock(&mdsc->cap_dirty_lock);
2432 wake_up(&ci->i_cap_wq);
2433
2434out:
2435 spin_unlock(&inode->i_lock);
2436 if (drop)
2437 iput(inode);
2438}
2439
2440/*
2441 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2442 * throw away our cap_snap.
2443 *
2444 * Caller holds s_mutex.
2445 */
2446static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2447 struct ceph_mds_caps *m,
2448 struct ceph_mds_session *session)
2449{
2450 struct ceph_inode_info *ci = ceph_inode(inode);
2451 u64 follows = le64_to_cpu(m->snap_follows);
2452 struct ceph_cap_snap *capsnap;
2453 int drop = 0;
2454
2455 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2456 inode, ci, session->s_mds, follows);
2457
2458 spin_lock(&inode->i_lock);
2459 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2460 if (capsnap->follows == follows) {
2461 if (capsnap->flush_tid != flush_tid) {
2462 dout(" cap_snap %p follows %lld tid %lld !="
2463 " %lld\n", capsnap, follows,
2464 flush_tid, capsnap->flush_tid);
2465 break;
2466 }
2467 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2468 dout(" removing cap_snap %p follows %lld\n",
2469 capsnap, follows);
2470 ceph_put_snap_context(capsnap->context);
2471 list_del(&capsnap->ci_item);
2472 list_del(&capsnap->flushing_item);
2473 ceph_put_cap_snap(capsnap);
2474 drop = 1;
2475 break;
2476 } else {
2477 dout(" skipping cap_snap %p follows %lld\n",
2478 capsnap, capsnap->follows);
2479 }
2480 }
2481 spin_unlock(&inode->i_lock);
2482 if (drop)
2483 iput(inode);
2484}
2485
2486/*
2487 * Handle TRUNC from MDS, indicating file truncation.
2488 *
2489 * caller holds s_mutex.
2490 */
2491static void handle_cap_trunc(struct inode *inode,
2492 struct ceph_mds_caps *trunc,
2493 struct ceph_mds_session *session)
2494 __releases(inode->i_lock)
2495{
2496 struct ceph_inode_info *ci = ceph_inode(inode);
2497 int mds = session->s_mds;
2498 int seq = le32_to_cpu(trunc->seq);
2499 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2500 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2501 u64 size = le64_to_cpu(trunc->size);
2502 int implemented = 0;
2503 int dirty = __ceph_caps_dirty(ci);
2504 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2505 int queue_trunc = 0;
2506
2507 issued |= implemented | dirty;
2508
2509 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2510 inode, mds, seq, truncate_size, truncate_seq);
2511 queue_trunc = ceph_fill_file_size(inode, issued,
2512 truncate_seq, truncate_size, size);
2513 spin_unlock(&inode->i_lock);
2514
2515 if (queue_trunc)
2516 ceph_queue_vmtruncate(inode);
2517}
2518
2519/*
2520 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2521 * different one. If this is the most recent migration we've seen (as
2522 * indicated by mseq), make note of the migrating cap bits for the
2523 * duration (until we see the corresponding IMPORT).
2524 *
2525 * caller holds s_mutex
2526 */
2527static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2528 struct ceph_mds_session *session)
2529{
2530 struct ceph_inode_info *ci = ceph_inode(inode);
2531 int mds = session->s_mds;
2532 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2533 struct ceph_cap *cap = NULL, *t;
2534 struct rb_node *p;
2535 int remember = 1;
2536
2537 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2538 inode, ci, mds, mseq);
2539
2540 spin_lock(&inode->i_lock);
2541
2542 /* make sure we haven't seen a higher mseq */
2543 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2544 t = rb_entry(p, struct ceph_cap, ci_node);
2545 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2546 dout(" higher mseq on cap from mds%d\n",
2547 t->session->s_mds);
2548 remember = 0;
2549 }
2550 if (t->session->s_mds == mds)
2551 cap = t;
2552 }
2553
2554 if (cap) {
2555 if (remember) {
2556 /* make note */
2557 ci->i_cap_exporting_mds = mds;
2558 ci->i_cap_exporting_mseq = mseq;
2559 ci->i_cap_exporting_issued = cap->issued;
2560 }
2561 __ceph_remove_cap(cap);
2562 }
2563 /* else, we already released it */
2564
2565 spin_unlock(&inode->i_lock);
2566}
2567
2568/*
2569 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2570 * clean them up.
2571 *
2572 * caller holds s_mutex.
2573 */
2574static void handle_cap_import(struct ceph_mds_client *mdsc,
2575 struct inode *inode, struct ceph_mds_caps *im,
2576 struct ceph_mds_session *session,
2577 void *snaptrace, int snaptrace_len)
2578{
2579 struct ceph_inode_info *ci = ceph_inode(inode);
2580 int mds = session->s_mds;
2581 unsigned issued = le32_to_cpu(im->caps);
2582 unsigned wanted = le32_to_cpu(im->wanted);
2583 unsigned seq = le32_to_cpu(im->seq);
2584 unsigned mseq = le32_to_cpu(im->migrate_seq);
2585 u64 realmino = le64_to_cpu(im->realm);
2586 u64 cap_id = le64_to_cpu(im->cap_id);
2587
2588 if (ci->i_cap_exporting_mds >= 0 &&
2589 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2590 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2591 " - cleared exporting from mds%d\n",
2592 inode, ci, mds, mseq,
2593 ci->i_cap_exporting_mds);
2594 ci->i_cap_exporting_issued = 0;
2595 ci->i_cap_exporting_mseq = 0;
2596 ci->i_cap_exporting_mds = -1;
2597 } else {
2598 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2599 inode, ci, mds, mseq);
2600 }
2601
2602 down_write(&mdsc->snap_rwsem);
2603 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2604 false);
2605 downgrade_write(&mdsc->snap_rwsem);
2606 ceph_add_cap(inode, session, cap_id, -1,
2607 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2608 NULL /* no caps context */);
2609 try_flush_caps(inode, session, NULL);
2610 up_read(&mdsc->snap_rwsem);
2611}
2612
2613/*
2614 * Handle a caps message from the MDS.
2615 *
2616 * Identify the appropriate session, inode, and call the right handler
2617 * based on the cap op.
2618 */
2619void ceph_handle_caps(struct ceph_mds_session *session,
2620 struct ceph_msg *msg)
2621{
2622 struct ceph_mds_client *mdsc = session->s_mdsc;
2623 struct super_block *sb = mdsc->client->sb;
2624 struct inode *inode;
2625 struct ceph_cap *cap;
2626 struct ceph_mds_caps *h;
2627 int mds = session->s_mds;
2628 int op;
2629 u32 seq;
2630 struct ceph_vino vino;
2631 u64 cap_id;
2632 u64 size, max_size;
2633 u64 tid;
2634 void *snaptrace;
2635
2636 dout("handle_caps from mds%d\n", mds);
2637
2638 /* decode */
2639 tid = le64_to_cpu(msg->hdr.tid);
2640 if (msg->front.iov_len < sizeof(*h))
2641 goto bad;
2642 h = msg->front.iov_base;
2643 snaptrace = h + 1;
2644 op = le32_to_cpu(h->op);
2645 vino.ino = le64_to_cpu(h->ino);
2646 vino.snap = CEPH_NOSNAP;
2647 cap_id = le64_to_cpu(h->cap_id);
2648 seq = le32_to_cpu(h->seq);
2649 size = le64_to_cpu(h->size);
2650 max_size = le64_to_cpu(h->max_size);
2651
2652 mutex_lock(&session->s_mutex);
2653 session->s_seq++;
2654 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2655 (unsigned)seq);
2656
2657 /* lookup ino */
2658 inode = ceph_find_inode(sb, vino);
2659 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2660 vino.snap, inode);
2661 if (!inode) {
2662 dout(" i don't have ino %llx\n", vino.ino);
2663 goto done;
2664 }
2665
2666 /* these will work even if we don't have a cap yet */
2667 switch (op) {
2668 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2669 handle_cap_flushsnap_ack(inode, tid, h, session);
2670 goto done;
2671
2672 case CEPH_CAP_OP_EXPORT:
2673 handle_cap_export(inode, h, session);
2674 goto done;
2675
2676 case CEPH_CAP_OP_IMPORT:
2677 handle_cap_import(mdsc, inode, h, session,
2678 snaptrace, le32_to_cpu(h->snap_trace_len));
2679 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2680 session);
2681 goto done_unlocked;
2682 }
2683
2684 /* the rest require a cap */
2685 spin_lock(&inode->i_lock);
2686 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2687 if (!cap) {
2688 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
2689 inode, ceph_ino(inode), ceph_snap(inode), mds);
2690 spin_unlock(&inode->i_lock);
2691 goto done;
2692 }
2693
2694 /* note that each of these drops i_lock for us */
2695 switch (op) {
2696 case CEPH_CAP_OP_REVOKE:
2697 case CEPH_CAP_OP_GRANT:
2698 handle_cap_grant(inode, h, session, cap, msg->middle);
2699 goto done_unlocked;
2700
2701 case CEPH_CAP_OP_FLUSH_ACK:
2702 handle_cap_flush_ack(inode, tid, h, session, cap);
2703 break;
2704
2705 case CEPH_CAP_OP_TRUNC:
2706 handle_cap_trunc(inode, h, session);
2707 break;
2708
2709 default:
2710 spin_unlock(&inode->i_lock);
2711 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2712 ceph_cap_op_name(op));
2713 }
2714
2715done:
2716 mutex_unlock(&session->s_mutex);
2717done_unlocked:
2718 if (inode)
2719 iput(inode);
2720 return;
2721
2722bad:
2723 pr_err("ceph_handle_caps: corrupt message\n");
2724 ceph_msg_dump(msg);
2725 return;
2726}
2727
2728/*
2729 * Delayed work handler to process end of delayed cap release LRU list.
2730 */
2731void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2732{
2733 struct ceph_inode_info *ci;
2734 int flags = CHECK_CAPS_NODELAY;
2735
2736 dout("check_delayed_caps\n");
2737 while (1) {
2738 spin_lock(&mdsc->cap_delay_lock);
2739 if (list_empty(&mdsc->cap_delay_list))
2740 break;
2741 ci = list_first_entry(&mdsc->cap_delay_list,
2742 struct ceph_inode_info,
2743 i_cap_delay_list);
2744 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2745 time_before(jiffies, ci->i_hold_caps_max))
2746 break;
2747 list_del_init(&ci->i_cap_delay_list);
2748 spin_unlock(&mdsc->cap_delay_lock);
2749 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2750 ceph_check_caps(ci, flags, NULL);
2751 }
2752 spin_unlock(&mdsc->cap_delay_lock);
2753}
2754
2755/*
2756 * Flush all dirty caps to the mds
2757 */
2758void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2759{
2760 struct ceph_inode_info *ci, *nci = NULL;
2761 struct inode *inode, *ninode = NULL;
2762 struct list_head *p, *n;
2763
2764 dout("flush_dirty_caps\n");
2765 spin_lock(&mdsc->cap_dirty_lock);
2766 list_for_each_safe(p, n, &mdsc->cap_dirty) {
2767 if (nci) {
2768 ci = nci;
2769 inode = ninode;
2770 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2771 dout("flush_dirty_caps inode %p (was next inode)\n",
2772 inode);
2773 } else {
2774 ci = list_entry(p, struct ceph_inode_info,
2775 i_dirty_item);
2776 inode = igrab(&ci->vfs_inode);
2777 BUG_ON(!inode);
2778 dout("flush_dirty_caps inode %p\n", inode);
2779 }
2780 if (n != &mdsc->cap_dirty) {
2781 nci = list_entry(n, struct ceph_inode_info,
2782 i_dirty_item);
2783 ninode = igrab(&nci->vfs_inode);
2784 BUG_ON(!ninode);
2785 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2786 dout("flush_dirty_caps next inode %p, noflush\n",
2787 ninode);
2788 } else {
2789 nci = NULL;
2790 ninode = NULL;
2791 }
2792 spin_unlock(&mdsc->cap_dirty_lock);
2793 if (inode) {
2794 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2795 NULL);
2796 iput(inode);
2797 }
2798 spin_lock(&mdsc->cap_dirty_lock);
2799 }
2800 spin_unlock(&mdsc->cap_dirty_lock);
2801}
2802
2803/*
2804 * Drop open file reference. If we were the last open file,
2805 * we may need to release capabilities to the MDS (or schedule
2806 * their delayed release).
2807 */
2808void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2809{
2810 struct inode *inode = &ci->vfs_inode;
2811 int last = 0;
2812
2813 spin_lock(&inode->i_lock);
2814 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2815 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2816 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2817 if (--ci->i_nr_by_mode[fmode] == 0)
2818 last++;
2819 spin_unlock(&inode->i_lock);
2820
2821 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2822 ceph_check_caps(ci, 0, NULL);
2823}
2824
2825/*
2826 * Helpers for embedding cap and dentry lease releases into mds
2827 * requests.
2828 *
2829 * @force is used by dentry_release (below) to force inclusion of a
2830 * record for the directory inode, even when there aren't any caps to
2831 * drop.
2832 */
2833int ceph_encode_inode_release(void **p, struct inode *inode,
2834 int mds, int drop, int unless, int force)
2835{
2836 struct ceph_inode_info *ci = ceph_inode(inode);
2837 struct ceph_cap *cap;
2838 struct ceph_mds_request_release *rel = *p;
2839 int ret = 0;
2840 int used = 0;
2841
2842 spin_lock(&inode->i_lock);
2843 used = __ceph_caps_used(ci);
2844
2845 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
2846 mds, ceph_cap_string(used), ceph_cap_string(drop),
2847 ceph_cap_string(unless));
2848
2849 /* only drop unused caps */
2850 drop &= ~used;
2851
2852 cap = __get_cap_for_mds(ci, mds);
2853 if (cap && __cap_is_valid(cap)) {
2854 if (force ||
2855 ((cap->issued & drop) &&
2856 (cap->issued & unless) == 0)) {
2857 if ((cap->issued & drop) &&
2858 (cap->issued & unless) == 0) {
2859 dout("encode_inode_release %p cap %p %s -> "
2860 "%s\n", inode, cap,
2861 ceph_cap_string(cap->issued),
2862 ceph_cap_string(cap->issued & ~drop));
2863 cap->issued &= ~drop;
2864 cap->implemented &= ~drop;
2865 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2866 int wanted = __ceph_caps_wanted(ci);
2867 dout(" wanted %s -> %s (act %s)\n",
2868 ceph_cap_string(cap->mds_wanted),
2869 ceph_cap_string(cap->mds_wanted &
2870 ~wanted),
2871 ceph_cap_string(wanted));
2872 cap->mds_wanted &= wanted;
2873 }
2874 } else {
2875 dout("encode_inode_release %p cap %p %s"
2876 " (force)\n", inode, cap,
2877 ceph_cap_string(cap->issued));
2878 }
2879
2880 rel->ino = cpu_to_le64(ceph_ino(inode));
2881 rel->cap_id = cpu_to_le64(cap->cap_id);
2882 rel->seq = cpu_to_le32(cap->seq);
2883 rel->issue_seq = cpu_to_le32(cap->issue_seq);
2884 rel->mseq = cpu_to_le32(cap->mseq);
2885 rel->caps = cpu_to_le32(cap->issued);
2886 rel->wanted = cpu_to_le32(cap->mds_wanted);
2887 rel->dname_len = 0;
2888 rel->dname_seq = 0;
2889 *p += sizeof(*rel);
2890 ret = 1;
2891 } else {
2892 dout("encode_inode_release %p cap %p %s\n",
2893 inode, cap, ceph_cap_string(cap->issued));
2894 }
2895 }
2896 spin_unlock(&inode->i_lock);
2897 return ret;
2898}
2899
2900int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2901 int mds, int drop, int unless)
2902{
2903 struct inode *dir = dentry->d_parent->d_inode;
2904 struct ceph_mds_request_release *rel = *p;
2905 struct ceph_dentry_info *di = ceph_dentry(dentry);
2906 int force = 0;
2907 int ret;
2908
2909 /*
2910 * force a record for the directory caps if we have a dentry lease.
2911 * this is racy (can't take i_lock and d_lock together), but it
2912 * doesn't have to be perfect; the mds will revoke anything we don't
2913 * release.
2914 */
2915 spin_lock(&dentry->d_lock);
2916 if (di->lease_session && di->lease_session->s_mds == mds)
2917 force = 1;
2918 spin_unlock(&dentry->d_lock);
2919
2920 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2921
2922 spin_lock(&dentry->d_lock);
2923 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2924 dout("encode_dentry_release %p mds%d seq %d\n",
2925 dentry, mds, (int)di->lease_seq);
2926 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2927 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2928 *p += dentry->d_name.len;
2929 rel->dname_seq = cpu_to_le32(di->lease_seq);
2930 }
2931 spin_unlock(&dentry->d_lock);
2932 return ret;
2933}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
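As a concrete illustration: with CONFIG_CEPH_FS_PRETTYDEBUG enabled (plus DEBUG or CONFIG_DYNAMIC_DEBUG), the fsync trace at caps.c line 1775 above,

	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");

compiles to a pr_debug() whose output carries a right-aligned filename:lineno prefix along the lines of "      caps.c:1775 : fsync ...". Without CONFIG_CEPH_FS_PRETTYDEBUG it is simply pr_debug() with a leading space; with the option but neither DEBUG nor CONFIG_DYNAMIC_DEBUG it collapses to the never-executed printk, kept only so the compiler still type-checks the format string.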
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
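A quick way to sanity-check the encoding described at the top of this header is a userspace program mirroring the make/bits/value/mask helpers (re-stated locally so it builds outside the kernel tree; the behavior, not the names, is what is borrowed):

#include <stdio.h>

typedef unsigned int u32;

static u32 frag_make(u32 b, u32 v)
{
	return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu);
}

static u32 frag_bits(u32 f)  { return f >> 24; }
static u32 frag_value(u32 f) { return f & 0xffffffu; }
static u32 frag_mask(u32 f)
{
	return (0xffffffu << (24 - frag_bits(f))) & 0xffffffu;
}

int main(void)
{
	/* a one-bit split: frag 1/0 covers the lower half of the
	 * number space, frag 1/1 the upper half */
	u32 left  = frag_make(1, 0x000000);
	u32 right = frag_make(1, 0x800000);

	printf("left  = %08x (bits=%u value=%06x)\n",
	       left, frag_bits(left), frag_value(left));
	printf("right = %08x (bits=%u value=%06x)\n",
	       right, frag_bits(right), frag_value(right));
	/* 0x400000 has its top value bit clear, so it falls in 'left' */
	printf("contains: %d\n",
	       (0x400000u & frag_mask(left)) == frag_value(left));
	return 0;
}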
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
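Worked example of these checks: su = 65536, sc = 4, os = 4194304 is valid (both sizes are 64k multiples, 4194304 is exactly 64 stripe units, and the count is nonzero), whereas su = 98304 (96k) fails the increment test, since 98304 & (65536 - 1) leaves a remainder of 32768.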
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
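Tracing the two helpers together: open flags O_WRONLY|O_APPEND keep O_WRONLY through the append check, mask down to O_WRONLY under O_ACCMODE, and resolve to CEPH_FILE_MODE_WR; plain O_RDONLY (0) falls through every test to CEPH_FILE_MODE_RD, for which ceph_caps_for_mode() asks only for PIN plus FILE_SHARED, FILE_RD and FILE_CACHE.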
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific message types or high-level
34 * protocols change, bump the affected components. we rev
35 * internal cluster protocols separately from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
211 * - these are bitmasks.. we can compose them
212 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
252 * & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
344 __le16 num_releases; /* # include cap/lease release records */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
434#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
439/* capability bits */
440#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
441
442/* generic cap bits */
443#define CEPH_CAP_GSHARED 1 /* client can read */
444#define CEPH_CAP_GEXCL 2 /* client can read and update */
445#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
446#define CEPH_CAP_GRD 8 /* (file) client can read */
447#define CEPH_CAP_GWR 16 /* (file) client can write */
448#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
449#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
450#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
451
452/* per-lock shift */
453#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
457
458#define CEPH_CAP_BITS 16
459
460/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
462#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
463#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
464#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
465#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
466#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
467#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
468#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
469#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
470#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
471#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
472#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
476
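The composed values are just the generic bits shifted into each lock's field: CEPH_CAP_AUTH_EXCL is GEXCL (2) << SAUTH (2) = 0x8, and CEPH_CAP_FILE_WR is GWR (16) << SFILE (8) = 0x1000. A standalone sketch that prints a few of them (defines copied from this header):

#include <stdio.h>

#define CEPH_CAP_GEXCL   2
#define CEPH_CAP_GRD     8
#define CEPH_CAP_GWR     16
#define CEPH_CAP_GBUFFER 32

#define CEPH_CAP_SAUTH 2
#define CEPH_CAP_SFILE 8

int main(void)
{
	printf("AUTH_EXCL   = 0x%x\n", CEPH_CAP_GEXCL << CEPH_CAP_SAUTH);   /* 0x8 */
	printf("FILE_RD     = 0x%x\n", CEPH_CAP_GRD << CEPH_CAP_SFILE);     /* 0x800 */
	printf("FILE_WR     = 0x%x\n", CEPH_CAP_GWR << CEPH_CAP_SFILE);     /* 0x1000 */
	printf("FILE_BUFFER = 0x%x\n", CEPH_CAP_GBUFFER << CEPH_CAP_SFILE); /* 0x2000 */
	return 0;
}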
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
515
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
631 __le32 num_split_realms; /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
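A small usage sketch of the dispatcher above (editor's illustration; the caller is hypothetical):

/* editor's sketch: hashing a dentry name with both supported algorithms */
static void example_str_hash(void)
{
	const char *name = "somefile";
	unsigned l = ceph_str_hash(CEPH_STR_HASH_LINUX, name, 8);
	unsigned r = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, 8);

	/* an unrecognized type yields (unsigned)-1 */
	(void)l;
	(void)r;
}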
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
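All the weight fields above are 16.16 fixed point ("16-bit fixed point" meaning 16 fractional bits), so 0x10000 represents a weight of 1.0. A hedged pair of conversion helpers, with names invented by the editor:

/* editor's sketch: 16.16 fixed-point weight conversions */
static inline __u32 example_weight_from_milli(__u32 milli)
{
	return (__u32)(((__u64)milli << 16) / 1000);	/* 1000 -> 0x10000 */
}

static inline __u32 example_weight_to_percent(__u32 w)
{
	return (w * 100) >> 16;	/* same scaling the debugfs osdmap dump uses */
}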
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers that identify the parent bucket of each
156 * device or bucket in the hierarchy. If an item appears more
157 * than once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
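A usage sketch (editor's illustration) showing how the mapper consumes these hashes as 16-bit uniform draws:

/* editor's sketch: a 16-bit uniform draw, as bucket_list_choose() takes */
static __u32 example_draw16(__u32 x, __u32 item, __u32 r)
{
	return crush_hash32_3(CRUSH_HASH_RJENKINS1, x, item, r) & 0xffff;
}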
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
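In words: w is a 16-bit uniform draw scaled by the cumulative weight sum_weights[i], so item i (walking from the tail) wins with probability item_weights[i] / sum_weights[i]. The acceptance test in isolation, as an editor's sketch:

/* editor's sketch: the scaled-draw acceptance test used above */
static int example_wins_draw(__u32 draw16, __u32 item_w, __u32 sum_w)
{
	/* draw16 is uniform in [0, 0xffff]; scale into [0, sum_w) */
	return (((__u64)draw16 * sum_w) >> 16) < item_w;
}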
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
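These helpers walk an implicit binary tree stored in a flat array: leaves live at odd indices and map to items via n >> 1, and an interior node n at height h has children n -/+ (1 << (h-1)). A worked example (editor's reading, consistent with bucket_tree_choose() below):

/* editor's sketch: a 4-item tree with num_nodes = 8 */
static void example_tree_layout(void)
{
	int n = 8 >> 1;		/* root = 4 */
	int l = left(n);	/* 4 - (1 << 1) = 2 */
	int r = right(n);	/* 4 + (1 << 1) = 6 */

	/* terminal (odd) nodes 1,3,5,7 map to items 0..3 via n >> 1 */
	(void)l;
	(void)r;
}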
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" (failed, fully offloaded)
263 * of the cluster
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000) /* full weight: 1.0 in 16.16 fixed point */
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
309
310 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */
312 ftotal = 0;
313 skip_rep = 0;
314 do {
315 retry_descent = 0;
316 in = bucket; /* initial bucket */
317
318 /* choose through intervening buckets */
319 flocal = 0;
320 do {
321 collide = 0;
322 retry_bucket = 0;
323 r = rep;
324 if (in->alg == CRUSH_BUCKET_UNIFORM) {
325 /* be careful */
326 if (firstn || numrep >= in->size)
327 /* r' = r + f_total */
328 r += ftotal;
329 else if (in->size % numrep == 0)
330 /* r'=r+(n+1)*f_local */
331 r += (numrep+1) *
332 (flocal+ftotal);
333 else
334 /* r' = r + n*f_local */
335 r += numrep * (flocal+ftotal);
336 } else {
337 if (firstn)
338 /* r' = r + f_total */
339 r += ftotal;
340 else
341 /* r' = r + n*f_local */
342 r += numrep * (flocal+ftotal);
343 }
344
345 /* bucket choose */
346 if (in->size == 0) {
347 reject = 1;
348 goto reject;
349 }
350 if (flocal >= (in->size>>1) &&
351 flocal > orig_tries)
352 item = bucket_perm_choose(in, x, r);
353 else
354 item = crush_bucket_choose(in, x, r);
355 BUG_ON(item >= map->max_devices);
356
357 /* desired type? */
358 if (item < 0)
359 itemtype = map->buckets[-1-item]->type;
360 else
361 itemtype = 0;
362 dprintk(" item %d type %d\n", item, itemtype);
363
364 /* keep going? */
365 if (itemtype != type) {
366 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item];
369 retry_bucket = 1; continue; /* descend into the sub-bucket */
370 }
371
372 /* collision? */
373 for (i = 0; i < outpos; i++) {
374 if (out[i] == item) {
375 collide = 1;
376 break;
377 }
378 }
379
380 if (recurse_to_leaf &&
381 item < 0 &&
382 crush_choose(map, map->buckets[-1-item],
383 weight,
384 x, outpos+1, 0,
385 out2, outpos,
386 firstn, 0, NULL) <= outpos) {
387 reject = 1;
388 } else {
389 /* out? */
390 if (itemtype == 0)
391 reject = is_out(map, weight,
392 item, x);
393 else
394 reject = 0;
395 }
396
397reject:
398 if (reject || collide) {
399 ftotal++;
400 flocal++;
401
402 if (collide && flocal < 3)
403 /* retry locally a few times */
404 retry_bucket = 1;
405 else if (flocal < in->size + orig_tries)
406 /* exhaustive bucket search */
407 retry_bucket = 1;
408 else if (ftotal < 20)
409 /* then retry descent */
410 retry_descent = 1;
411 else
412 /* else give up */
413 skip_rep = 1;
414 dprintk(" reject %d collide %d "
415 "ftotal %d flocal %d\n",
416 reject, collide, ftotal,
417 flocal);
418 }
419 } while (retry_bucket);
420 } while (retry_descent);
421
422 if (skip_rep) {
423 dprintk("skip rep\n");
424 continue;
425 }
426
427 dprintk("choose got %d\n", item);
428 out[outpos] = item;
429 outpos++;
430 }
431
432 dprintk("choose returns %d\n", outpos);
433 return outpos;
434}
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
446int crush_do_rule(struct crush_map *map,
447 int ruleno, int x, int *result, int result_max,
448 int force, __u32 *weight)
449{
450 int result_len;
451 int force_context[CRUSH_MAX_DEPTH];
452 int force_pos = -1;
453 int a[CRUSH_MAX_SET];
454 int b[CRUSH_MAX_SET];
455 int c[CRUSH_MAX_SET];
456 int recurse_to_leaf;
457 int *w;
458 int wsize = 0;
459 int *o;
460 int osize;
461 int *tmp;
462 struct crush_rule *rule;
463 int step;
464 int i, j;
465 int numrep;
466 int firstn;
467 int rc = -1;
468
469 BUG_ON(ruleno >= map->max_rules);
470
471 rule = map->rules[ruleno];
472 result_len = 0;
473 w = a;
474 o = b;
475
476 /*
477 * determine hierarchical context of force, if any. note
478 * that this may or may not correspond to the specific types
479 * referenced by the crush rule.
480 */
481 if (force >= 0) {
482 if (force >= map->max_devices ||
483 map->device_parents[force] == 0) {
484 /*dprintk("CRUSH: forcefed device dne\n");*/
485 rc = -1; /* force fed device dne */
486 goto out;
487 }
488 if (!is_out(map, weight, force, x)) {
489 while (1) {
490 force_context[++force_pos] = force;
491 if (force >= 0)
492 force = map->device_parents[force];
493 else
494 force = map->bucket_parents[-1-force];
495 if (force == 0)
496 break;
497 }
498 }
499 }
500
501 for (step = 0; step < rule->len; step++) {
502 firstn = 0;
503 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE:
505 w[0] = rule->steps[step].arg1;
506 if (force_pos >= 0) {
507 BUG_ON(force_context[force_pos] != w[0]);
508 force_pos--;
509 }
510 wsize = 1;
511 break;
512
513 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
514 case CRUSH_RULE_CHOOSE_FIRSTN:
515 firstn = 1;
516 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
517 case CRUSH_RULE_CHOOSE_INDEP:
518 BUG_ON(wsize == 0);
519
520 recurse_to_leaf =
521 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
523 rule->steps[step].op ==
524 CRUSH_RULE_CHOOSE_LEAF_INDEP;
525
526 /* reset output */
527 osize = 0;
528
529 for (i = 0; i < wsize; i++) {
530 /*
531 * see the CRUSH_CHOOSE_N, CRUSH_CHOOSE_N_MINUS macros.
532 * basically, numrep <= 0 means relative to
533 * the provided result_max
534 */
535 numrep = rule->steps[step].arg1;
536 if (numrep <= 0) {
537 numrep += result_max;
538 if (numrep <= 0)
539 continue;
540 }
541 j = 0;
542 if (osize == 0 && force_pos >= 0) {
543 /* skip any intermediate types */
544 while (force_pos &&
545 force_context[force_pos] < 0 &&
546 rule->steps[step].arg2 !=
547 map->buckets[-1 -
548 force_context[force_pos]]->type)
549 force_pos--;
550 o[osize] = force_context[force_pos];
551 if (recurse_to_leaf)
552 c[osize] = force_context[0];
553 j++;
554 force_pos--;
555 }
556 osize += crush_choose(map,
557 map->buckets[-1-w[i]],
558 weight,
559 x, numrep,
560 rule->steps[step].arg2,
561 o+osize, j,
562 firstn,
563 recurse_to_leaf, c+osize);
564 }
565
566 if (recurse_to_leaf)
567 /* copy final _leaf_ values to output set */
568 memcpy(o, c, osize*sizeof(*o));
569
570 /* swap o and w arrays */
571 tmp = o;
572 o = w;
573 w = tmp;
574 wsize = osize;
575 break;
576
577
578 case CRUSH_RULE_EMIT:
579 for (i = 0; i < wsize && result_len < result_max; i++) {
580 result[result_len] = w[i];
581 result_len++;
582 }
583 wsize = 0;
584 break;
585
586 default:
587 BUG_ON(1);
588 }
589 }
590 rc = result_len;
591
592out:
593 return rc;
594}
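A hedged end-to-end sketch of how a caller ties crush_find_rule() and crush_do_rule() together; the ruleset and type values are placeholders invented for illustration:

/* editor's sketch: map input x to up to two devices */
static int example_map(struct crush_map *map, int x, __u32 *weights)
{
	int result[CRUSH_MAX_SET];
	int ruleno = crush_find_rule(map, 0 /* ruleset */, 1 /* type */, 2);

	if (ruleno < 0)
		return -1;
	return crush_do_rule(map, ruleno, x, result, 2,
			     -1 /* no forced device */, weights);
}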
595
596
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding a rule and then mapping an input to
6 * an output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int ruleset, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..f704b3b62424
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,409 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78const u8 *aes_iv = "cephsageyudagreg";
79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
81 const void *src, size_t src_len)
82{
83 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
85 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
86 int ret;
87 void *iv;
88 int ivsize;
89 size_t zero_padding = (0x10 - (src_len & 0x0f));
90 char pad[16];
91
92 if (IS_ERR(tfm))
93 return PTR_ERR(tfm);
94
95 memset(pad, zero_padding, zero_padding);
96
97 *dst_len = src_len + zero_padding;
98
99 crypto_blkcipher_setkey((void *)tfm, key, key_len);
100 sg_init_table(sg_in, 2);
101 sg_set_buf(&sg_in[0], src, src_len);
102 sg_set_buf(&sg_in[1], pad, zero_padding);
103 sg_init_table(sg_out, 1);
104 sg_set_buf(sg_out, dst, *dst_len);
105 iv = crypto_blkcipher_crt(tfm)->iv;
106 ivsize = crypto_blkcipher_ivsize(tfm);
107
108 memcpy(iv, aes_iv, ivsize);
109 /*
110 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
111 key, key_len, 1);
112 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
113 src, src_len, 1);
114 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
115 pad, zero_padding, 1);
116 */
117 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
118 src_len + zero_padding);
119 crypto_free_blkcipher(tfm);
120 if (ret < 0)
121 pr_err("ceph_aes_crypt failed %d\n", ret);
122 /*
123 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
124 dst, *dst_len, 1);
125 */
126 return ret; /* 0 on success, else error from the crypto layer */
127}
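The cipher runs over src plus a PKCS#7-style pad: the pad length is 16 - (src_len mod 16), always 1..16, and every pad byte holds that length, which is how the decrypt paths below recover it. Stated standalone (editor's sketch):

/* editor's sketch: the padding rule shared by the encrypt/decrypt paths */
static size_t example_pad_len(size_t src_len)
{
	return 0x10 - (src_len & 0x0f);		/* always 1..16 */
}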
128
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
130 const void *src1, size_t src1_len,
131 const void *src2, size_t src2_len)
132{
133 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
135 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
136 int ret;
137 void *iv;
138 int ivsize;
139 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
140 char pad[16];
141
142 if (IS_ERR(tfm))
143 return PTR_ERR(tfm);
144
145 memset(pad, zero_padding, zero_padding);
146
147 *dst_len = src1_len + src2_len + zero_padding;
148
149 crypto_blkcipher_setkey((void *)tfm, key, key_len);
150 sg_init_table(sg_in, 3);
151 sg_set_buf(&sg_in[0], src1, src1_len);
152 sg_set_buf(&sg_in[1], src2, src2_len);
153 sg_set_buf(&sg_in[2], pad, zero_padding);
154 sg_init_table(sg_out, 1);
155 sg_set_buf(sg_out, dst, *dst_len);
156 iv = crypto_blkcipher_crt(tfm)->iv;
157 ivsize = crypto_blkcipher_ivsize(tfm);
158
159 memcpy(iv, aes_iv, ivsize);
160 /*
161 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
162 key, key_len, 1);
163 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
164 src1, src1_len, 1);
165 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
166 src2, src2_len, 1);
167 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
168 pad, zero_padding, 1);
169 */
170 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
171 src1_len + src2_len + zero_padding);
172 crypto_free_blkcipher(tfm);
173 if (ret < 0)
174 pr_err("ceph_aes_encrypt2 failed %d\n", ret);
175 /*
176 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
177 dst, *dst_len, 1);
178 */
179 return ret; /* 0 on success, else error from the crypto layer */
180}
181
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
183 const void *src, size_t src_len)
184{
185 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
187 struct blkcipher_desc desc = { .tfm = tfm };
188 char pad[16];
189 void *iv;
190 int ivsize;
191 int ret;
192 int last_byte;
193
194 if (IS_ERR(tfm))
195 return PTR_ERR(tfm);
196
197 crypto_blkcipher_setkey((void *)tfm, key, key_len);
198 sg_init_table(sg_in, 1);
199 sg_init_table(sg_out, 2);
200 sg_set_buf(sg_in, src, src_len);
201 sg_set_buf(&sg_out[0], dst, *dst_len);
202 sg_set_buf(&sg_out[1], pad, sizeof(pad));
203
204 iv = crypto_blkcipher_crt(tfm)->iv;
205 ivsize = crypto_blkcipher_ivsize(tfm);
206
207 memcpy(iv, aes_iv, ivsize);
208
209 /*
210 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
211 key, key_len, 1);
212 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
213 src, src_len, 1);
214 */
215
216 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
217 crypto_free_blkcipher(tfm);
218 if (ret < 0) {
219 pr_err("ceph_aes_decrypt failed %d\n", ret);
220 return ret;
221 }
222
223 if (src_len <= *dst_len)
224 last_byte = ((char *)dst)[src_len - 1];
225 else
226 last_byte = pad[src_len - *dst_len - 1];
227 if (last_byte > 0 && last_byte <= 16 && src_len >= last_byte) {
228 *dst_len = src_len - last_byte;
229 } else {
230 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
231 last_byte, (int)src_len);
232 return -EPERM; /* bad padding */
233 }
234 /*
235 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
236 dst, *dst_len, 1);
237 */
238 return 0;
239}
240
241int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len)
245{
246 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
248 struct blkcipher_desc desc = { .tfm = tfm };
249 char pad[16];
250 void *iv;
251 int ivsize;
252 int ret;
253 int last_byte;
254
255 if (IS_ERR(tfm))
256 return PTR_ERR(tfm);
257
258 sg_init_table(sg_in, 1);
259 sg_set_buf(sg_in, src, src_len);
260 sg_init_table(sg_out, 3);
261 sg_set_buf(&sg_out[0], dst1, *dst1_len);
262 sg_set_buf(&sg_out[1], dst2, *dst2_len);
263 sg_set_buf(&sg_out[2], pad, sizeof(pad));
264
265 crypto_blkcipher_setkey((void *)tfm, key, key_len);
266 iv = crypto_blkcipher_crt(tfm)->iv;
267 ivsize = crypto_blkcipher_ivsize(tfm);
268
269 memcpy(iv, aes_iv, ivsize);
270
271 /*
272 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
273 key, key_len, 1);
274 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
275 src, src_len, 1);
276 */
277
278 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
279 crypto_free_blkcipher(tfm);
280 if (ret < 0) {
281 pr_err("ceph_aes_decrypt2 failed %d\n", ret);
282 return ret;
283 }
284
285 if (src_len <= *dst1_len)
286 last_byte = ((char *)dst1)[src_len - 1];
287 else if (src_len <= *dst1_len + *dst2_len)
288 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
289 else
290 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
291 if (last_byte > 0 && last_byte <= 16 && src_len >= last_byte) {
292 src_len -= last_byte;
293 } else {
294 pr_err("ceph_aes_decrypt2 got bad padding %d on src len %d\n",
295 last_byte, (int)src_len);
296 return -EPERM; /* bad padding */
297 }
298
299 if (src_len < *dst1_len) {
300 *dst1_len = src_len;
301 *dst2_len = 0;
302 } else {
303 *dst2_len = src_len - *dst1_len;
304 }
305 /*
306 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
307 dst1, *dst1_len, 1);
308 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
309 dst2, *dst2_len, 1);
310 */
311
312 return 0;
313}
314
315
316int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
317 const void *src, size_t src_len)
318{
319 switch (secret->type) {
320 case CEPH_CRYPTO_NONE:
321 if (*dst_len < src_len)
322 return -ERANGE;
323 memcpy(dst, src, src_len);
324 *dst_len = src_len;
325 return 0;
326
327 case CEPH_CRYPTO_AES:
328 return ceph_aes_decrypt(secret->key, secret->len, dst,
329 dst_len, src, src_len);
330
331 default:
332 return -EINVAL;
333 }
334}
335
336int ceph_decrypt2(struct ceph_crypto_key *secret,
337 void *dst1, size_t *dst1_len,
338 void *dst2, size_t *dst2_len,
339 const void *src, size_t src_len)
340{
341 size_t t;
342
343 switch (secret->type) {
344 case CEPH_CRYPTO_NONE:
345 if (*dst1_len + *dst2_len < src_len)
346 return -ERANGE;
347 t = min(*dst1_len, src_len);
348 memcpy(dst1, src, t);
349 *dst1_len = t;
350 src += t;
351 src_len -= t;
352 if (src_len) {
353 t = min(*dst2_len, src_len);
354 memcpy(dst2, src, t);
355 *dst2_len = t;
356 }
357 return 0;
358
359 case CEPH_CRYPTO_AES:
360 return ceph_aes_decrypt2(secret->key, secret->len,
361 dst1, dst1_len, dst2, dst2_len,
362 src, src_len);
363
364 default:
365 return -EINVAL;
366 }
367}
368
369int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
370 const void *src, size_t src_len)
371{
372 switch (secret->type) {
373 case CEPH_CRYPTO_NONE:
374 if (*dst_len < src_len)
375 return -ERANGE;
376 memcpy(dst, src, src_len);
377 *dst_len = src_len;
378 return 0;
379
380 case CEPH_CRYPTO_AES:
381 return ceph_aes_encrypt(secret->key, secret->len, dst,
382 dst_len, src, src_len);
383
384 default:
385 return -EINVAL;
386 }
387}
388
389int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
390 const void *src1, size_t src1_len,
391 const void *src2, size_t src2_len)
392{
393 switch (secret->type) {
394 case CEPH_CRYPTO_NONE:
395 if (*dst_len < src1_len + src2_len)
396 return -ERANGE;
397 memcpy(dst, src1, src1_len);
398 memcpy(dst + src1_len, src2, src2_len);
399 *dst_len = src1_len + src2_len;
400 return 0;
401
402 case CEPH_CRYPTO_AES:
403 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
404 src1, src1_len, src2, src2_len);
405
406 default:
407 return -EINVAL;
408 }
409}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..f7048da92acc
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,484 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53
54static int mdsmap_show(struct seq_file *s, void *p)
55{
56 int i;
57 struct ceph_client *client = s->private;
58
59 if (client->mdsc.mdsmap == NULL)
60 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state;
71
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state));
74 }
75 return 0;
76}
77
78static int osdmap_show(struct seq_file *s, void *p)
79{
80 int i;
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
131 seq_printf(s, "%lld statfs\n", req->tid);
132 }
133
134 mutex_unlock(&monc->mutex);
135 return 0;
136}
137
138static int mdsc_show(struct seq_file *s, void *p)
139{
140 struct ceph_client *client = s->private;
141 struct ceph_mds_client *mdsc = &client->mdsc;
142 struct ceph_mds_request *req;
143 struct rb_node *rp;
144 int pathlen;
145 u64 pathbase;
146 char *path;
147
148 mutex_lock(&mdsc->mutex);
149 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
150 req = rb_entry(rp, struct ceph_mds_request, r_node);
151
152 if (req->r_request)
153 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
154 else
155 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
156
157 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
158
159 if (req->r_got_unsafe)
160 seq_printf(s, "\t(unsafe)");
161 else
162 seq_printf(s, "\t");
163
164 if (req->r_inode) {
165 seq_printf(s, " #%llx", ceph_ino(req->r_inode));
166 } else if (req->r_dentry) {
167 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
168 &pathbase, 0);
169 spin_lock(&req->r_dentry->d_lock);
170 seq_printf(s, " #%llx/%.*s (%s)",
171 ceph_ino(req->r_dentry->d_parent->d_inode),
172 req->r_dentry->d_name.len,
173 req->r_dentry->d_name.name,
174 path ? path : "");
175 spin_unlock(&req->r_dentry->d_lock);
176 kfree(path);
177 } else if (req->r_path1) {
178 seq_printf(s, " #%llx/%s", req->r_ino1.ino,
179 req->r_path1);
180 }
181
182 if (req->r_old_dentry) {
183 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
184 &pathbase, 0);
185 spin_lock(&req->r_old_dentry->d_lock);
186 seq_printf(s, " #%llx/%.*s (%s)",
187 ceph_ino(req->r_old_dentry->d_parent->d_inode),
188 req->r_old_dentry->d_name.len,
189 req->r_old_dentry->d_name.name,
190 path ? path : "");
191 spin_unlock(&req->r_old_dentry->d_lock);
192 kfree(path);
193 } else if (req->r_path2) {
194 if (req->r_ino2.ino)
195 seq_printf(s, " #%llx/%s", req->r_ino2.ino,
196 req->r_path2);
197 else
198 seq_printf(s, " %s", req->r_path2);
199 }
200
201 seq_printf(s, "\n");
202 }
203 mutex_unlock(&mdsc->mutex);
204
205 return 0;
206}
207
208static int osdc_show(struct seq_file *s, void *pp)
209{
210 struct ceph_client *client = s->private;
211 struct ceph_osd_client *osdc = &client->osdc;
212 struct rb_node *p;
213
214 mutex_lock(&osdc->request_mutex);
215 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
216 struct ceph_osd_request *req;
217 struct ceph_osd_request_head *head;
218 struct ceph_osd_op *op;
219 int num_ops;
220 int opcode, olen;
221 int i;
222
223 req = rb_entry(p, struct ceph_osd_request, r_node);
224
225 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
226 req->r_osd ? req->r_osd->o_osd : -1,
227 le32_to_cpu(req->r_pgid.pool),
228 le16_to_cpu(req->r_pgid.ps));
229
230 head = req->r_request->front.iov_base;
231 op = (void *)(head + 1);
232
233 num_ops = le16_to_cpu(head->num_ops);
234 olen = le32_to_cpu(head->object_len);
235 seq_printf(s, "%.*s", olen,
236 (const char *)(head->ops + num_ops));
237
238 if (req->r_reassert_version.epoch)
239 seq_printf(s, "\t%u'%llu",
240 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
241 le64_to_cpu(req->r_reassert_version.version));
242 else
243 seq_printf(s, "\t");
244
245 for (i = 0; i < num_ops; i++) {
246 opcode = le16_to_cpu(op->op);
247 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
248 op++;
249 }
250
251 seq_printf(s, "\n");
252 }
253 mutex_unlock(&osdc->request_mutex);
254 return 0;
255}
256
257static int caps_show(struct seq_file *s, void *p)
258{
259	struct ceph_client *client = s->private;
260 int total, avail, used, reserved, min;
261
262 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
263 seq_printf(s, "total\t\t%d\n"
264 "avail\t\t%d\n"
265 "used\t\t%d\n"
266 "reserved\t%d\n"
267		   "min\t\t%d\n",
268 total, avail, used, reserved, min);
269 return 0;
270}
271
272static int dentry_lru_show(struct seq_file *s, void *ptr)
273{
274 struct ceph_client *client = s->private;
275 struct ceph_mds_client *mdsc = &client->mdsc;
276 struct ceph_dentry_info *di;
277
278 spin_lock(&mdsc->dentry_lru_lock);
279 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
280 struct dentry *dentry = di->dentry;
281 seq_printf(s, "%p %p\t%.*s\n",
282 di, dentry, dentry->d_name.len, dentry->d_name.name);
283 }
284 spin_unlock(&mdsc->dentry_lru_lock);
285
286 return 0;
287}
288
289#define DEFINE_SHOW_FUNC(name) \
290static int name##_open(struct inode *inode, struct file *file) \
291{ \
292 struct seq_file *sf; \
293 int ret; \
294 \
295 ret = single_open(file, name, NULL); \
296 sf = file->private_data; \
297 sf->private = inode->i_private; \
298 return ret; \
299} \
300 \
301static const struct file_operations name##_fops = { \
302 .open = name##_open, \
303 .read = seq_read, \
304 .llseek = seq_lseek, \
305 .release = single_release, \
306};
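For reference, DEFINE_SHOW_FUNC(monmap_show) expands to roughly the following: a single_open() wrapper that copies the inode's i_private (the ceph_client passed to debugfs_create_file() below) into the seq_file, plus the matching file_operations:

	static int monmap_show_open(struct inode *inode, struct file *file)
	{
		struct seq_file *sf;
		int ret;

		ret = single_open(file, monmap_show, NULL);
		sf = file->private_data;
		sf->private = inode->i_private;	/* the struct ceph_client */
		return ret;
	}

	static const struct file_operations monmap_show_fops = {
		.open		= monmap_show_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};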
307
308DEFINE_SHOW_FUNC(monmap_show)
309DEFINE_SHOW_FUNC(mdsmap_show)
310DEFINE_SHOW_FUNC(osdmap_show)
311DEFINE_SHOW_FUNC(monc_show)
312DEFINE_SHOW_FUNC(mdsc_show)
313DEFINE_SHOW_FUNC(osdc_show)
314DEFINE_SHOW_FUNC(dentry_lru_show)
315DEFINE_SHOW_FUNC(caps_show)
316
317static int congestion_kb_set(void *data, u64 val)
318{
319 struct ceph_client *client = (struct ceph_client *)data;
320
321 if (client)
322 client->mount_args->congestion_kb = (int)val;
323
324 return 0;
325}
326
327static int congestion_kb_get(void *data, u64 *val)
328{
329 struct ceph_client *client = (struct ceph_client *)data;
330
331 if (client)
332 *val = (u64)client->mount_args->congestion_kb;
333
334 return 0;
335}
336
337
338DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
339 congestion_kb_set, "%llu\n");
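DEFINE_SIMPLE_ATTRIBUTE is the stock libfs helper for exactly this single-u64 get/set pattern; per include/linux/fs.h of this era it expands to roughly the following (sketch, slightly simplified), with inode->i_private -- the ceph_client -- handed to the callbacks by simple_attr_open():

	static int congestion_kb_fops_open(struct inode *inode, struct file *file)
	{
		return simple_attr_open(inode, file, congestion_kb_get,
					congestion_kb_set, "%llu\n");
	}

	static const struct file_operations congestion_kb_fops = {
		.owner	 = THIS_MODULE,
		.open	 = congestion_kb_fops_open,
		.release = simple_attr_release,
		.read	 = simple_attr_read,
		.write	 = simple_attr_write,
	};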
340
341int __init ceph_debugfs_init(void)
342{
343 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
344 if (!ceph_debugfs_dir)
345 return -ENOMEM;
346 return 0;
347}
348
349void ceph_debugfs_cleanup(void)
350{
351 debugfs_remove(ceph_debugfs_dir);
352}
353
354int ceph_debugfs_client_init(struct ceph_client *client)
355{
356 int ret = 0;
357 char name[80];
358
359 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
360 PR_FSID(&client->fsid), client->monc.auth->global_id);
361
362 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
363 if (!client->debugfs_dir)
364 goto out;
365
366 client->monc.debugfs_file = debugfs_create_file("monc",
367 0600,
368 client->debugfs_dir,
369 client,
370 &monc_show_fops);
371 if (!client->monc.debugfs_file)
372 goto out;
373
374 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
375 0600,
376 client->debugfs_dir,
377 client,
378 &mdsc_show_fops);
379 if (!client->mdsc.debugfs_file)
380 goto out;
381
382 client->osdc.debugfs_file = debugfs_create_file("osdc",
383 0600,
384 client->debugfs_dir,
385 client,
386 &osdc_show_fops);
387 if (!client->osdc.debugfs_file)
388 goto out;
389
390 client->debugfs_monmap = debugfs_create_file("monmap",
391 0600,
392 client->debugfs_dir,
393 client,
394 &monmap_show_fops);
395 if (!client->debugfs_monmap)
396 goto out;
397
398 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
399 0600,
400 client->debugfs_dir,
401 client,
402 &mdsmap_show_fops);
403 if (!client->debugfs_mdsmap)
404 goto out;
405
406 client->debugfs_osdmap = debugfs_create_file("osdmap",
407 0600,
408 client->debugfs_dir,
409 client,
410 &osdmap_show_fops);
411 if (!client->debugfs_osdmap)
412 goto out;
413
414 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
415 0600,
416 client->debugfs_dir,
417 client,
418 &dentry_lru_show_fops);
419 if (!client->debugfs_dentry_lru)
420 goto out;
421
422 client->debugfs_caps = debugfs_create_file("caps",
423 0400,
424 client->debugfs_dir,
425 client,
426 &caps_show_fops);
427 if (!client->debugfs_caps)
428 goto out;
429
430 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
431 0600,
432 client->debugfs_dir,
433 client,
434 &congestion_kb_fops);
435 if (!client->debugfs_congestion_kb)
436 goto out;
437
438 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
439 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
440 name);
441
442 return 0;
443
444out:
445 ceph_debugfs_client_cleanup(client);
446 return ret;
447}
448
449void ceph_debugfs_client_cleanup(struct ceph_client *client)
450{
451 debugfs_remove(client->debugfs_bdi);
452 debugfs_remove(client->debugfs_caps);
453 debugfs_remove(client->debugfs_dentry_lru);
454 debugfs_remove(client->debugfs_osdmap);
455 debugfs_remove(client->debugfs_mdsmap);
456 debugfs_remove(client->debugfs_monmap);
457 debugfs_remove(client->osdc.debugfs_file);
458 debugfs_remove(client->mdsc.debugfs_file);
459 debugfs_remove(client->monc.debugfs_file);
460 debugfs_remove(client->debugfs_congestion_kb);
461 debugfs_remove(client->debugfs_dir);
462}
463
464#else /* CONFIG_DEBUG_FS */
465
466int __init ceph_debugfs_init(void)
467{
468 return 0;
469}
470
471void ceph_debugfs_cleanup(void)
472{
473}
474
475int ceph_debugfs_client_init(struct ceph_client *client)
476{
477 return 0;
478}
479
480void ceph_debugfs_client_cleanup(struct ceph_client *client)
481{
482}
483
484#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
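A hypothetical consumer, to show the intended usage: each *_safe macro bounds-checks against end and jumps to the caller-supplied label on truncated input, so a decoder reads top to bottom with a single error path.

	/* hypothetical: decode a { u32 len; u8 payload[len]; } blob into buf */
	static int decode_blob(void **p, void *end, void *buf, size_t buflen)
	{
		u32 len;

		ceph_decode_32_safe(p, end, len, bad);
		if (len > buflen)
			return -ERANGE;
		ceph_decode_copy_safe(p, end, buf, len, bad);
		return len;

	bad:
		return -EINVAL;		/* input truncated */
	}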
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
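	/* 512 is AF_INET (2) byte-swapped: catch a family that was
	 * swapped once too many (or never swapped on the wire) */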
107 WARN_ON(a->in_addr.ss_family == 512);
108}
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
133static inline void ceph_encode_copy(void **p, const void *s, int len)
134{
135 memcpy(*p, s, len);
136 *p += len;
137}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
164
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
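And the mirror-image producer, again hypothetical, using the bounds-checked encode macros:

	/* hypothetical: encode a { u32 len; u8 payload[len]; } blob */
	static int encode_blob(void **p, void *end, const void *buf, u32 len)
	{
		ceph_encode_32_safe(p, end, len, bad);
		ceph_encode_copy_safe(p, end, buf, len, bad);
		return 0;

	bad:
		return -ERANGE;		/* destination buffer too small */
	}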
192
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..7261dc6c2ead
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1223 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/slab.h>
7#include <linux/sched.h>
8
9#include "super.h"
10
11/*
12 * Directory operations: readdir, lookup, create, link, unlink,
13 * rename, etc.
14 */
15
16/*
17 * Ceph MDS operations are specified in terms of a base ino and
18 * relative path. Thus, the client can specify an operation on a
19 * specific inode (e.g., a getattr due to fstat(2)), or as a path
20 * relative to, say, the root directory.
21 *
22 * Normally, we limit ourselves to strict inode ops (no path component)
23 * or dentry operations (a single path component relative to an ino). The
24 * exception to this is open_root_dentry(), which will open the mount
25 * point by name.
26 */
27
28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops;
31
32/*
33 * Initialize ceph dentry state.
34 */
35int ceph_init_dentry(struct dentry *dentry)
36{
37 struct ceph_dentry_info *di;
38
39 if (dentry->d_fsdata)
40 return 0;
41
42 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
43 dentry->d_op = &ceph_dentry_ops;
44 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
45 dentry->d_op = &ceph_snapdir_dentry_ops;
46 else
47 dentry->d_op = &ceph_snap_dentry_ops;
48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
50 if (!di)
51 return -ENOMEM; /* oh well */
52
53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */
55 goto out_unlock;
56 di->dentry = dentry;
57 di->lease_session = NULL;
58 dentry->d_fsdata = di;
59 dentry->d_time = jiffies;
60 ceph_dentry_lru_add(dentry);
61out_unlock:
62 spin_unlock(&dentry->d_lock);
63 return 0;
64}
65
66
67
68/*
69 * for readdir, we encode the directory frag and offset within that
70 * frag into f_pos.
71 */
72static unsigned fpos_frag(loff_t p)
73{
74 return p >> 32;
75}
76static unsigned fpos_off(loff_t p)
77{
78 return p & 0xffffffff;
79}
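The packing counterpart, ceph_make_fpos(), used throughout readdir below, is defined elsewhere in this patch (super.h); as a sketch, it is just the inverse of the two helpers above:

	/* sketch: frag in the high 32 bits, offset within the frag in the low 32 */
	static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
	{
		return ((loff_t)frag << 32) | (loff_t)off;
	}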
80
81/*
82 * When possible, we try to satisfy a readdir by peeking at the
83 * dcache. We make this work by carefully ordering dentries on
84 * d_u.d_child when we initially get results back from the MDS, and
85 * falling back to a "normal" sync readdir if any dentries in the dir
86 * are dropped.
87 *
88 * I_COMPLETE indicates we have all dentries in the dir. It is
89 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
90 * the MDS if/when the directory is modified).
91 */
92static int __dcache_readdir(struct file *filp,
93 void *dirent, filldir_t filldir)
94{
95 struct inode *inode = filp->f_dentry->d_inode;
96 struct ceph_file_info *fi = filp->private_data;
97 struct dentry *parent = filp->f_dentry;
98 struct inode *dir = parent->d_inode;
99 struct list_head *p;
100 struct dentry *dentry, *last;
101 struct ceph_dentry_info *di;
102 int err = 0;
103
104 /* claim ref on last dentry we returned */
105 last = fi->dentry;
106 fi->dentry = NULL;
107
108 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
109 last);
110
111 spin_lock(&dcache_lock);
112
113 /* start at beginning? */
114 if (filp->f_pos == 2 || (last &&
115 filp->f_pos < ceph_dentry(last)->offset)) {
116 if (list_empty(&parent->d_subdirs))
117 goto out_unlock;
118 p = parent->d_subdirs.prev;
119 dout(" initial p %p/%p\n", p->prev, p->next);
120 } else {
121 p = last->d_u.d_child.prev;
122 }
123
124more:
125 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry);
127 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
129 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) {
131 fi->at_end = 1;
132 goto out_unlock;
133 }
134 if (!d_unhashed(dentry) && dentry->d_inode &&
135 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
136 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
137 filp->f_pos <= di->offset)
138 break;
139 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
140 dentry->d_name.len, dentry->d_name.name, di->offset,
141 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
142 !dentry->d_inode ? " null" : "");
143 p = p->prev;
144 dentry = list_entry(p, struct dentry, d_u.d_child);
145 di = ceph_dentry(dentry);
146 }
147
148 atomic_inc(&dentry->d_count);
149 spin_unlock(&dcache_lock);
150 spin_unlock(&inode->i_lock);
151
152 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
153 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
154 filp->f_pos = di->offset;
155 err = filldir(dirent, dentry->d_name.name,
156 dentry->d_name.len, di->offset,
157 dentry->d_inode->i_ino,
158 dentry->d_inode->i_mode >> 12);
159
160 if (last) {
161 if (err < 0) {
162 /* remember our position */
163 fi->dentry = last;
164 fi->next_offset = di->offset;
165 } else {
166 dput(last);
167 }
168 last = NULL;
169 }
170
171 spin_lock(&inode->i_lock);
172 spin_lock(&dcache_lock);
173
174 if (err < 0)
175 goto out_unlock;
176
177 last = dentry;
178
179 p = p->prev;
180 filp->f_pos++;
181
182 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
183 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
184 goto more;
185 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
186 err = -EAGAIN;
187
188out_unlock:
189 spin_unlock(&dcache_lock);
190
191 if (last) {
192 spin_unlock(&inode->i_lock);
193 dput(last);
194 spin_lock(&inode->i_lock);
195 }
196
197 return err;
198}
199
200/*
201 * make note of the last dentry we read, so we can
202 * continue at the same lexicographical point,
203 * regardless of what dir changes take place on the
204 * server.
205 */
206static int note_last_dentry(struct ceph_file_info *fi, const char *name,
207 int len)
208{
209 kfree(fi->last_name);
210 fi->last_name = kmalloc(len+1, GFP_NOFS);
211 if (!fi->last_name)
212 return -ENOMEM;
213 memcpy(fi->last_name, name, len);
214 fi->last_name[len] = 0;
215 dout("note_last_dentry '%s'\n", fi->last_name);
216 return 0;
217}
218
219static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
220{
221 struct ceph_file_info *fi = filp->private_data;
222 struct inode *inode = filp->f_dentry->d_inode;
223 struct ceph_inode_info *ci = ceph_inode(inode);
224 struct ceph_client *client = ceph_inode_to_client(inode);
225 struct ceph_mds_client *mdsc = &client->mdsc;
226 unsigned frag = fpos_frag(filp->f_pos);
227 int off = fpos_off(filp->f_pos);
228 int err;
229 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir;
232
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end)
235 return 0;
236
237 /* always start with . and .. */
238 if (filp->f_pos == 0) {
239 /* note dir version at start of readdir so we can tell
240 * if any dentries get dropped */
241 fi->dir_release_count = ci->i_release_count;
242
243 dout("readdir off 0 -> '.'\n");
244 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
245 inode->i_ino, inode->i_mode >> 12) < 0)
246 return 0;
247 filp->f_pos = 1;
248 off = 1;
249 }
250 if (filp->f_pos == 1) {
251 dout("readdir off 1 -> '..'\n");
252 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
253 filp->f_dentry->d_parent->d_inode->i_ino,
254 inode->i_mode >> 12) < 0)
255 return 0;
256 filp->f_pos = 2;
257 off = 2;
258 }
259
260 /* can we use the dcache? */
261 spin_lock(&inode->i_lock);
262 if ((filp->f_pos == 2 || fi->dentry) &&
263 !ceph_test_opt(client, NOASYNCREADDIR) &&
264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 err = __dcache_readdir(filp, dirent, filldir);
267 if (err != -EAGAIN) {
268 spin_unlock(&inode->i_lock);
269 return err;
270 }
271 }
272 spin_unlock(&inode->i_lock);
273 if (fi->dentry) {
274 err = note_last_dentry(fi, fi->dentry->d_name.name,
275 fi->dentry->d_name.len);
276 if (err)
277 return err;
278 dput(fi->dentry);
279 fi->dentry = NULL;
280 }
281
282 /* proceed with a normal readdir */
283
284more:
285 /* do we have the correct frag content buffered? */
286 if (fi->frag != frag || fi->last_readdir == NULL) {
287 struct ceph_mds_request *req;
288 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
289 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
290
291 /* discard old result, if any */
292 if (fi->last_readdir) {
293 ceph_mdsc_put_request(fi->last_readdir);
294 fi->last_readdir = NULL;
295 }
296
297 /* requery frag tree, as the frag topology may have changed */
298 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
299
300 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
301 ceph_vinop(inode), frag, fi->last_name);
302 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
303 if (IS_ERR(req))
304 return PTR_ERR(req);
305 req->r_inode = igrab(inode);
306 req->r_dentry = dget(filp->f_dentry);
307 /* hints to request -> mds selection code */
308 req->r_direct_mode = USE_AUTH_MDS;
309 req->r_direct_hash = ceph_frag_value(frag);
310 req->r_direct_is_hash = true;
311 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
312 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
315 req->r_num_caps = max_entries;
316 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) {
318 ceph_mdsc_put_request(req);
319 return err;
320 }
321 dout("readdir got and parsed readdir result=%d"
322 " on frag %x, end=%d, complete=%d\n", err, frag,
323 (int)req->r_reply_info.dir_end,
324 (int)req->r_reply_info.dir_complete);
325
326 if (!req->r_did_prepopulate) {
327			dout("readdir !did_prepopulate\n");
328 fi->dir_release_count--; /* preclude I_COMPLETE */
329 }
330
331 /* note next offset and last dentry name */
332 fi->offset = fi->next_offset;
333 fi->last_readdir = req;
334
335 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name);
337 fi->last_name = NULL;
338 fi->next_offset = 0;
339 } else {
340 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi,
342 rinfo->dir_dname[rinfo->dir_nr-1],
343 rinfo->dir_dname_len[rinfo->dir_nr-1]);
344 if (err)
345 return err;
346 fi->next_offset += rinfo->dir_nr;
347 }
348 }
349
350 rinfo = &fi->last_readdir->r_reply_info;
351 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
352 rinfo->dir_nr, off, fi->offset);
353 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
354 u64 pos = ceph_make_fpos(frag, off);
355 struct ceph_mds_reply_inode *in =
356 rinfo->dir_in[off - fi->offset].in;
357 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
358 off, off - fi->offset, rinfo->dir_nr, pos,
359 rinfo->dir_dname_len[off - fi->offset],
360 rinfo->dir_dname[off - fi->offset], in);
361 BUG_ON(!in);
362 ftype = le32_to_cpu(in->mode) >> 12;
363 if (filldir(dirent,
364 rinfo->dir_dname[off - fi->offset],
365 rinfo->dir_dname_len[off - fi->offset],
366 pos,
367 le64_to_cpu(in->ino),
368 ftype) < 0) {
369 dout("filldir stopping us...\n");
370 return 0;
371 }
372 off++;
373 filp->f_pos = pos + 1;
374 }
375
376 if (fi->last_name) {
377 ceph_mdsc_put_request(fi->last_readdir);
378 fi->last_readdir = NULL;
379 goto more;
380 }
381
382 /* more frags? */
383 if (!ceph_frag_is_rightmost(frag)) {
384 frag = ceph_frag_next(frag);
385 off = 0;
386 filp->f_pos = ceph_make_fpos(frag, off);
387 dout("readdir next frag is %x\n", frag);
388 goto more;
389 }
390 fi->at_end = 1;
391
392 /*
393 * if dir_release_count still matches the dir, no dentries
394 * were released during the whole readdir, and we should have
395 * the complete dir contents in our cache.
396 */
397 spin_lock(&inode->i_lock);
398 if (ci->i_release_count == fi->dir_release_count) {
399 dout(" marking %p complete\n", inode);
400 ci->i_ceph_flags |= CEPH_I_COMPLETE;
401 ci->i_max_offset = filp->f_pos;
402 }
403 spin_unlock(&inode->i_lock);
404
405 dout("readdir %p filp %p done.\n", inode, filp);
406 return 0;
407}
408
409static void reset_readdir(struct ceph_file_info *fi)
410{
411 if (fi->last_readdir) {
412 ceph_mdsc_put_request(fi->last_readdir);
413 fi->last_readdir = NULL;
414 }
415 kfree(fi->last_name);
416 fi->next_offset = 2; /* compensate for . and .. */
417 if (fi->dentry) {
418 dput(fi->dentry);
419 fi->dentry = NULL;
420 }
421 fi->at_end = 0;
422}
423
424static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
425{
426 struct ceph_file_info *fi = file->private_data;
427 struct inode *inode = file->f_mapping->host;
428 loff_t old_offset = offset;
429 loff_t retval;
430
431 mutex_lock(&inode->i_mutex);
432 switch (origin) {
433 case SEEK_END:
434 offset += inode->i_size + 2; /* FIXME */
435 break;
436 case SEEK_CUR:
437 offset += file->f_pos;
438 }
439 retval = -EINVAL;
440 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
441 if (offset != file->f_pos) {
442 file->f_pos = offset;
443 file->f_version = 0;
444 fi->at_end = 0;
445 }
446 retval = offset;
447
448 /*
449 * discard buffered readdir content on seekdir(0), or
450 * seek to new frag, or seek prior to current chunk.
451 */
452 if (offset == 0 ||
453 fpos_frag(offset) != fpos_frag(old_offset) ||
454 fpos_off(offset) < fi->offset) {
455 dout("dir_llseek dropping %p content\n", file);
456 reset_readdir(fi);
457 }
458
459		/* knock dir_release_count out of sync on a forward seek
460		 * (precludes marking the dir I_COMPLETE) */
460 if (offset > old_offset)
461 fi->dir_release_count--;
462 }
463 mutex_unlock(&inode->i_mutex);
464 return retval;
465}
466
467/*
468 * Process result of a lookup/open request.
469 *
470 * Mainly, make sure we return the final req->r_dentry (if it already
471 * existed) in place of the original VFS-provided dentry when they
472 * differ.
473 *
474 * Gracefully handle the case where the MDS replies with -ENOENT and
475 * no trace (which it may do, at its discretion, e.g., if it doesn't
476 * care to issue a lease on the negative dentry).
477 */
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err)
480{
481 struct ceph_client *client = ceph_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode;
483
484 /* .snap dir? */
485 if (err == -ENOENT &&
486 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
487 strcmp(dentry->d_name.name,
488 client->mount_args->snapdir_name) == 0) {
489 struct inode *inode = ceph_get_snapdir(parent);
490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
492 d_add(dentry, inode);
493 err = 0;
494 }
495
496 if (err == -ENOENT) {
497 /* no trace? */
498 err = 0;
499 if (!req->r_reply_info.head->is_dentry) {
500 dout("ENOENT and no trace, dentry %p inode %p\n",
501 dentry, dentry->d_inode);
502 if (dentry->d_inode) {
503 d_drop(dentry);
504 err = -ENOENT;
505 } else {
506 d_add(dentry, NULL);
507 }
508 }
509 }
510 if (err)
511 dentry = ERR_PTR(err);
512 else if (dentry != req->r_dentry)
513 dentry = dget(req->r_dentry); /* we got spliced */
514 else
515 dentry = NULL;
516 return dentry;
517}
518
519static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
520{
521 return ceph_ino(inode) == CEPH_INO_ROOT &&
522 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
523}
524
525/*
526 * Look up a single dir entry. If there is a lookup intent, inform
527 * the MDS so that it gets our 'caps wanted' value in a single op.
528 */
529static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
530 struct nameidata *nd)
531{
532 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
533 struct ceph_mds_client *mdsc = &client->mdsc;
534 struct ceph_mds_request *req;
535 int op;
536 int err;
537
538 dout("lookup %p dentry %p '%.*s'\n",
539 dir, dentry, dentry->d_name.len, dentry->d_name.name);
540
541 if (dentry->d_name.len > NAME_MAX)
542 return ERR_PTR(-ENAMETOOLONG);
543
544 err = ceph_init_dentry(dentry);
545 if (err < 0)
546 return ERR_PTR(err);
547
548 /* open (but not create!) intent? */
549 if (nd &&
550 (nd->flags & LOOKUP_OPEN) &&
551 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
552 !(nd->intent.open.flags & O_CREAT)) {
553 int mode = nd->intent.open.create_mode & ~current->fs->umask;
554 return ceph_lookup_open(dir, dentry, nd, mode, 1);
555 }
556
557 /* can we conclude ENOENT locally? */
558 if (dentry->d_inode == NULL) {
559 struct ceph_inode_info *ci = ceph_inode(dir);
560 struct ceph_dentry_info *di = ceph_dentry(dentry);
561
562 spin_lock(&dir->i_lock);
563 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
564 if (strncmp(dentry->d_name.name,
565 client->mount_args->snapdir_name,
566 dentry->d_name.len) &&
567 !is_root_ceph_dentry(dir, dentry) &&
568 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
569 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
570 di->offset = ci->i_max_offset++;
571 spin_unlock(&dir->i_lock);
572 dout(" dir %p complete, -ENOENT\n", dir);
573 d_add(dentry, NULL);
574 di->lease_shared_gen = ci->i_shared_gen;
575 return NULL;
576 }
577 spin_unlock(&dir->i_lock);
578 }
579
580 op = ceph_snap(dir) == CEPH_SNAPDIR ?
581 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
582 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
583 if (IS_ERR(req))
584 return ERR_PTR(PTR_ERR(req));
585 req->r_dentry = dget(dentry);
586 req->r_num_caps = 2;
587 /* we only need inode linkage */
588 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
589 req->r_locked_dir = dir;
590 err = ceph_mdsc_do_request(mdsc, NULL, req);
591 dentry = ceph_finish_lookup(req, dentry, err);
592 ceph_mdsc_put_request(req); /* will dput(dentry) */
593 dout("lookup result=%p\n", dentry);
594 return dentry;
595}
596
597/*
598 * If we do a create but get no trace back from the MDS, follow up with
599 * a lookup (the VFS expects us to link up the provided dentry).
600 */
601int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
602{
603 struct dentry *result = ceph_lookup(dir, dentry, NULL);
604
605 if (result && !IS_ERR(result)) {
606 /*
607 * We created the item, then did a lookup, and found
608 * it was already linked to another inode we already
609 * had in our cache (and thus got spliced). Link our
610 * dentry to that inode, but don't hash it, just in
611 * case the VFS wants to dereference it.
612 */
613 BUG_ON(!result->d_inode);
614 d_instantiate(dentry, result->d_inode);
615 return 0;
616 }
617 return PTR_ERR(result);
618}
619
620static int ceph_mknod(struct inode *dir, struct dentry *dentry,
621 int mode, dev_t rdev)
622{
623 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
624 struct ceph_mds_client *mdsc = &client->mdsc;
625 struct ceph_mds_request *req;
626 int err;
627
628 if (ceph_snap(dir) != CEPH_NOSNAP)
629 return -EROFS;
630
631 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
632 dir, dentry, mode, rdev);
633 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
634 if (IS_ERR(req)) {
635 d_drop(dentry);
636 return PTR_ERR(req);
637 }
638 req->r_dentry = dget(dentry);
639 req->r_num_caps = 2;
640 req->r_locked_dir = dir;
641 req->r_args.mknod.mode = cpu_to_le32(mode);
642 req->r_args.mknod.rdev = cpu_to_le32(rdev);
643 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
644 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
645 err = ceph_mdsc_do_request(mdsc, dir, req);
646 if (!err && !req->r_reply_info.head->is_dentry)
647 err = ceph_handle_notrace_create(dir, dentry);
648 ceph_mdsc_put_request(req);
649 if (err)
650 d_drop(dentry);
651 return err;
652}
653
654static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
655 struct nameidata *nd)
656{
657 dout("create in dir %p dentry %p name '%.*s'\n",
658 dir, dentry, dentry->d_name.len, dentry->d_name.name);
659
660 if (ceph_snap(dir) != CEPH_NOSNAP)
661 return -EROFS;
662
663 if (nd) {
664 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
665 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
666 /* hrm, what should i do here if we get aliased? */
667 if (IS_ERR(dentry))
668 return PTR_ERR(dentry);
669 return 0;
670 }
671
672 /* fall back to mknod */
673 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
674}
675
676static int ceph_symlink(struct inode *dir, struct dentry *dentry,
677 const char *dest)
678{
679 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
680 struct ceph_mds_client *mdsc = &client->mdsc;
681 struct ceph_mds_request *req;
682 int err;
683
684 if (ceph_snap(dir) != CEPH_NOSNAP)
685 return -EROFS;
686
687 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
688 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
689 if (IS_ERR(req)) {
690 d_drop(dentry);
691 return PTR_ERR(req);
692 }
693 req->r_dentry = dget(dentry);
694 req->r_num_caps = 2;
695 req->r_path2 = kstrdup(dest, GFP_NOFS);
696 req->r_locked_dir = dir;
697 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
698 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
699 err = ceph_mdsc_do_request(mdsc, dir, req);
700 if (!err && !req->r_reply_info.head->is_dentry)
701 err = ceph_handle_notrace_create(dir, dentry);
702 ceph_mdsc_put_request(req);
703 if (err)
704 d_drop(dentry);
705 return err;
706}
707
708static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
709{
710 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
711 struct ceph_mds_client *mdsc = &client->mdsc;
712 struct ceph_mds_request *req;
713 int err = -EROFS;
714 int op;
715
716 if (ceph_snap(dir) == CEPH_SNAPDIR) {
717 /* mkdir .snap/foo is a MKSNAP */
718 op = CEPH_MDS_OP_MKSNAP;
719 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
720 dentry->d_name.len, dentry->d_name.name, dentry);
721 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
722 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
723 op = CEPH_MDS_OP_MKDIR;
724 } else {
725 goto out;
726 }
727 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
728 if (IS_ERR(req)) {
729 err = PTR_ERR(req);
730 goto out;
731 }
732
733 req->r_dentry = dget(dentry);
734 req->r_num_caps = 2;
735 req->r_locked_dir = dir;
736 req->r_args.mkdir.mode = cpu_to_le32(mode);
737 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
738 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
739 err = ceph_mdsc_do_request(mdsc, dir, req);
740 if (!err && !req->r_reply_info.head->is_dentry)
741 err = ceph_handle_notrace_create(dir, dentry);
742 ceph_mdsc_put_request(req);
743out:
744 if (err < 0)
745 d_drop(dentry);
746 return err;
747}
748
749static int ceph_link(struct dentry *old_dentry, struct inode *dir,
750 struct dentry *dentry)
751{
752 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
753 struct ceph_mds_client *mdsc = &client->mdsc;
754 struct ceph_mds_request *req;
755 int err;
756
757 if (ceph_snap(dir) != CEPH_NOSNAP)
758 return -EROFS;
759
760 dout("link in dir %p old_dentry %p dentry %p\n", dir,
761 old_dentry, dentry);
762 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
763 if (IS_ERR(req)) {
764 d_drop(dentry);
765 return PTR_ERR(req);
766 }
767 req->r_dentry = dget(dentry);
768 req->r_num_caps = 2;
769 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
770 req->r_locked_dir = dir;
771 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
772 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
773 err = ceph_mdsc_do_request(mdsc, dir, req);
774 if (err)
775 d_drop(dentry);
776 else if (!req->r_reply_info.head->is_dentry)
777 d_instantiate(dentry, igrab(old_dentry->d_inode));
778 ceph_mdsc_put_request(req);
779 return err;
780}
781
782/*
783 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
784 * looks like the link count will hit 0, drop any other caps (other
785 * than PIN) we don't specifically want (due to the file still being
786 * open).
787 */
788static int drop_caps_for_unlink(struct inode *inode)
789{
790 struct ceph_inode_info *ci = ceph_inode(inode);
791 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
792
793 spin_lock(&inode->i_lock);
794 if (inode->i_nlink == 1) {
795 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
796 ci->i_ceph_flags |= CEPH_I_NODELAY;
797 }
798 spin_unlock(&inode->i_lock);
799 return drop;
800}
801
802/*
803 * rmdir and unlink differ only by the metadata op code
804 */
805static int ceph_unlink(struct inode *dir, struct dentry *dentry)
806{
807 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
808 struct ceph_mds_client *mdsc = &client->mdsc;
809 struct inode *inode = dentry->d_inode;
810 struct ceph_mds_request *req;
811 int err = -EROFS;
812 int op;
813
814 if (ceph_snap(dir) == CEPH_SNAPDIR) {
815 /* rmdir .snap/foo is RMSNAP */
816 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
817 dentry->d_name.name, dentry);
818 op = CEPH_MDS_OP_RMSNAP;
819 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
820 dout("unlink/rmdir dir %p dn %p inode %p\n",
821 dir, dentry, inode);
822 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
823 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
824 } else
825 goto out;
826 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
827 if (IS_ERR(req)) {
828 err = PTR_ERR(req);
829 goto out;
830 }
831 req->r_dentry = dget(dentry);
832 req->r_num_caps = 2;
833 req->r_locked_dir = dir;
834 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
835 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
836 req->r_inode_drop = drop_caps_for_unlink(inode);
837 err = ceph_mdsc_do_request(mdsc, dir, req);
838 if (!err && !req->r_reply_info.head->is_dentry)
839 d_delete(dentry);
840 ceph_mdsc_put_request(req);
841out:
842 return err;
843}
844
845static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
846 struct inode *new_dir, struct dentry *new_dentry)
847{
848 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
849 struct ceph_mds_client *mdsc = &client->mdsc;
850 struct ceph_mds_request *req;
851 int err;
852
853 if (ceph_snap(old_dir) != ceph_snap(new_dir))
854 return -EXDEV;
855 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
856 ceph_snap(new_dir) != CEPH_NOSNAP)
857 return -EROFS;
858 dout("rename dir %p dentry %p to dir %p dentry %p\n",
859 old_dir, old_dentry, new_dir, new_dentry);
860 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
861 if (IS_ERR(req))
862 return PTR_ERR(req);
863 req->r_dentry = dget(new_dentry);
864 req->r_num_caps = 2;
865 req->r_old_dentry = dget(old_dentry);
866 req->r_locked_dir = new_dir;
867 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
868 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
869 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
870 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
871 /* release LINK_RDCACHE on source inode (mds will lock it) */
872 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
873 if (new_dentry->d_inode)
874 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
875 err = ceph_mdsc_do_request(mdsc, old_dir, req);
876 if (!err && !req->r_reply_info.head->is_dentry) {
877 /*
878 * Normally d_move() is done by fill_trace (called by
879 * do_request, above). If there is no trace, we need
880 * to do it here.
881 */
882 d_move(old_dentry, new_dentry);
883 }
884 ceph_mdsc_put_request(req);
885 return err;
886}
887
888
889/*
890 * Check if dentry lease is valid. If not, delete the lease. Try to
891 * renew if the lease is more than half up.
892 */
893static int dentry_lease_is_valid(struct dentry *dentry)
894{
895 struct ceph_dentry_info *di;
896 struct ceph_mds_session *s;
897 int valid = 0;
898 u32 gen;
899 unsigned long ttl;
900 struct ceph_mds_session *session = NULL;
901 struct inode *dir = NULL;
902 u32 seq = 0;
903
904 spin_lock(&dentry->d_lock);
905 di = ceph_dentry(dentry);
906 if (di && di->lease_session) {
907 s = di->lease_session;
908 spin_lock(&s->s_cap_lock);
909 gen = s->s_cap_gen;
910 ttl = s->s_cap_ttl;
911 spin_unlock(&s->s_cap_lock);
912
913 if (di->lease_gen == gen &&
914 time_before(jiffies, dentry->d_time) &&
915 time_before(jiffies, ttl)) {
916 valid = 1;
917 if (di->lease_renew_after &&
918 time_after(jiffies, di->lease_renew_after)) {
919 /* we should renew */
920 dir = dentry->d_parent->d_inode;
921 session = ceph_get_mds_session(s);
922 seq = di->lease_seq;
923 di->lease_renew_after = 0;
924 di->lease_renew_from = jiffies;
925 }
926 }
927 }
928 spin_unlock(&dentry->d_lock);
929
930 if (session) {
931 ceph_mdsc_lease_send_msg(session, dir, dentry,
932 CEPH_MDS_LEASE_RENEW, seq);
933 ceph_put_mds_session(session);
934 }
935 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
936 return valid;
937}
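The renew-at-half-life threshold checked above (lease_renew_after) is armed by the MDS client's lease handling, which is not part of this file; a hypothetical sketch of that arithmetic, just to make the check concrete:

	/* hypothetical: arm renewal at the lease's halfway point */
	static void arm_lease_renewal(struct ceph_dentry_info *di,
				      unsigned long duration_jiffies)
	{
		di->lease_renew_after = jiffies + duration_jiffies / 2;
		di->lease_renew_from = 0;
	}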
938
939/*
940 * Check if directory-wide content lease/cap is valid.
941 */
942static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
943{
944 struct ceph_inode_info *ci = ceph_inode(dir);
945 struct ceph_dentry_info *di = ceph_dentry(dentry);
946 int valid = 0;
947
948 spin_lock(&dir->i_lock);
949 if (ci->i_shared_gen == di->lease_shared_gen)
950 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
951 spin_unlock(&dir->i_lock);
952 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
953 dir, (unsigned)ci->i_shared_gen, dentry,
954 (unsigned)di->lease_shared_gen, valid);
955 return valid;
956}
957
958/*
959 * Check if cached dentry can be trusted.
960 */
961static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
962{
963 struct inode *dir = dentry->d_parent->d_inode;
964
965 dout("d_revalidate %p '%.*s' inode %p\n", dentry,
966 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
967
968 /* always trust cached snapped dentries, snapdir dentry */
969 if (ceph_snap(dir) != CEPH_NOSNAP) {
970 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
971 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
972 goto out_touch;
973 }
974 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
975 goto out_touch;
976
977 if (dentry_lease_is_valid(dentry) ||
978 dir_lease_is_valid(dir, dentry))
979 goto out_touch;
980
981 dout("d_revalidate %p invalid\n", dentry);
982 d_drop(dentry);
983 return 0;
984out_touch:
985 ceph_dentry_lru_touch(dentry);
986 return 1;
987}
988
989/*
990 * When a dentry is released, clear the dir I_COMPLETE if it was part
991 * of the current dir gen.
992 */
993static void ceph_dentry_release(struct dentry *dentry)
994{
995 struct ceph_dentry_info *di = ceph_dentry(dentry);
996 struct inode *parent_inode = dentry->d_parent->d_inode;
997
998 if (parent_inode) {
999 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1000
1001 spin_lock(&parent_inode->i_lock);
1002 if (ci->i_shared_gen == di->lease_shared_gen) {
1003 dout(" clearing %p complete (d_release)\n",
1004 parent_inode);
1005 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1006 ci->i_release_count++;
1007 }
1008 spin_unlock(&parent_inode->i_lock);
1009 }
1010 if (di) {
1011 ceph_dentry_lru_del(dentry);
1012 if (di->lease_session)
1013 ceph_put_mds_session(di->lease_session);
1014 kmem_cache_free(ceph_dentry_cachep, di);
1015 dentry->d_fsdata = NULL;
1016 }
1017}
1018
1019static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1020 struct nameidata *nd)
1021{
1022 /*
1023 * Eventually, we'll want to revalidate snapped metadata
1024 * too... probably...
1025 */
1026 return 1;
1027}
1028
1029
1030
1031/*
1032 * read() on a dir. This weird interface hack only works if mounted
1033 * with '-o dirstat'.
1034 */
1035static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1036 loff_t *ppos)
1037{
1038 struct ceph_file_info *cf = file->private_data;
1039 struct inode *inode = file->f_dentry->d_inode;
1040 struct ceph_inode_info *ci = ceph_inode(inode);
1041 int left;
1042
1043 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
1044 return -EISDIR;
1045
1046 if (!cf->dir_info) {
1047 cf->dir_info = kmalloc(1024, GFP_NOFS);
1048 if (!cf->dir_info)
1049 return -ENOMEM;
1050 cf->dir_info_len =
1051 sprintf(cf->dir_info,
1052 "entries: %20lld\n"
1053 " files: %20lld\n"
1054 " subdirs: %20lld\n"
1055 "rentries: %20lld\n"
1056 " rfiles: %20lld\n"
1057 " rsubdirs: %20lld\n"
1058 "rbytes: %20lld\n"
1059 "rctime: %10ld.%09ld\n",
1060 ci->i_files + ci->i_subdirs,
1061 ci->i_files,
1062 ci->i_subdirs,
1063 ci->i_rfiles + ci->i_rsubdirs,
1064 ci->i_rfiles,
1065 ci->i_rsubdirs,
1066 ci->i_rbytes,
1067 (long)ci->i_rctime.tv_sec,
1068 (long)ci->i_rctime.tv_nsec);
1069 }
1070
1071 if (*ppos >= cf->dir_info_len)
1072 return 0;
1073 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1074 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1075 if (left == size)
1076 return -EFAULT;
1077 *ppos += (size - left);
1078 return size - left;
1079}
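For illustration, the buffer built above renders roughly as follows (numbers hypothetical; the r-prefixed stats are recursive totals over the whole subtree):

	entries:                   42
	 files:                    30
	 subdirs:                  12
	rentries:                1205
	 rfiles:                 1100
	 rsubdirs:                105
	rbytes:              73400320
	rctime: 1268000000.000000000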
1080
1081/*
1082 * an fsync() on a dir will wait for any uncommitted directory
1083 * operations to commit.
1084 */
1085static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
1086 int datasync)
1087{
1088 struct inode *inode = dentry->d_inode;
1089 struct ceph_inode_info *ci = ceph_inode(inode);
1090 struct list_head *head = &ci->i_unsafe_dirops;
1091 struct ceph_mds_request *req;
1092 u64 last_tid;
1093 int ret = 0;
1094
1095 dout("dir_fsync %p\n", inode);
1096 spin_lock(&ci->i_unsafe_lock);
1097 if (list_empty(head))
1098 goto out;
1099
1100 req = list_entry(head->prev,
1101 struct ceph_mds_request, r_unsafe_dir_item);
1102 last_tid = req->r_tid;
1103
1104 do {
1105 ceph_mdsc_get_request(req);
1106 spin_unlock(&ci->i_unsafe_lock);
1107 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1108 inode, req->r_tid, last_tid);
1109 if (req->r_timeout) {
1110 ret = wait_for_completion_timeout(
1111 &req->r_safe_completion, req->r_timeout);
1112 if (ret > 0)
1113 ret = 0;
1114 else if (ret == 0)
1115 ret = -EIO; /* timed out */
1116 } else {
1117 wait_for_completion(&req->r_safe_completion);
1118 }
1119 spin_lock(&ci->i_unsafe_lock);
1120 ceph_mdsc_put_request(req);
1121
1122 if (ret || list_empty(head))
1123 break;
1124 req = list_entry(head->next,
1125 struct ceph_mds_request, r_unsafe_dir_item);
1126 } while (req->r_tid < last_tid);
1127out:
1128 spin_unlock(&ci->i_unsafe_lock);
1129 return ret;
1130}
1131
1132/*
1133 * We maintain a private dentry LRU.
1134 *
1135 * FIXME: this needs to be changed to a per-mds lru to be useful.
1136 */
1137void ceph_dentry_lru_add(struct dentry *dn)
1138{
1139 struct ceph_dentry_info *di = ceph_dentry(dn);
1140 struct ceph_mds_client *mdsc;
1141
1142 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1143 dn->d_name.len, dn->d_name.name);
1144 if (di) {
1145 mdsc = &ceph_client(dn->d_sb)->mdsc;
1146 spin_lock(&mdsc->dentry_lru_lock);
1147 list_add_tail(&di->lru, &mdsc->dentry_lru);
1148 mdsc->num_dentry++;
1149 spin_unlock(&mdsc->dentry_lru_lock);
1150 }
1151}
1152
1153void ceph_dentry_lru_touch(struct dentry *dn)
1154{
1155 struct ceph_dentry_info *di = ceph_dentry(dn);
1156 struct ceph_mds_client *mdsc;
1157
1158 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1159 dn->d_name.len, dn->d_name.name);
1160 if (di) {
1161 mdsc = &ceph_client(dn->d_sb)->mdsc;
1162 spin_lock(&mdsc->dentry_lru_lock);
1163 list_move_tail(&di->lru, &mdsc->dentry_lru);
1164 spin_unlock(&mdsc->dentry_lru_lock);
1165 }
1166}
1167
1168void ceph_dentry_lru_del(struct dentry *dn)
1169{
1170 struct ceph_dentry_info *di = ceph_dentry(dn);
1171 struct ceph_mds_client *mdsc;
1172
1173 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1174 dn->d_name.len, dn->d_name.name);
1175 if (di) {
1176 mdsc = &ceph_client(dn->d_sb)->mdsc;
1177 spin_lock(&mdsc->dentry_lru_lock);
1178 list_del_init(&di->lru);
1179 mdsc->num_dentry--;
1180 spin_unlock(&mdsc->dentry_lru_lock);
1181 }
1182}
1183
1184const struct file_operations ceph_dir_fops = {
1185 .read = ceph_read_dir,
1186 .readdir = ceph_readdir,
1187 .llseek = ceph_dir_llseek,
1188 .open = ceph_open,
1189 .release = ceph_release,
1190 .unlocked_ioctl = ceph_ioctl,
1191 .fsync = ceph_dir_fsync,
1192};
1193
1194const struct inode_operations ceph_dir_iops = {
1195 .lookup = ceph_lookup,
1196 .permission = ceph_permission,
1197 .getattr = ceph_getattr,
1198 .setattr = ceph_setattr,
1199 .setxattr = ceph_setxattr,
1200 .getxattr = ceph_getxattr,
1201 .listxattr = ceph_listxattr,
1202 .removexattr = ceph_removexattr,
1203 .mknod = ceph_mknod,
1204 .symlink = ceph_symlink,
1205 .mkdir = ceph_mkdir,
1206 .link = ceph_link,
1207 .unlink = ceph_unlink,
1208 .rmdir = ceph_unlink,
1209 .rename = ceph_rename,
1210 .create = ceph_create,
1211};
1212
1213struct dentry_operations ceph_dentry_ops = {
1214 .d_revalidate = ceph_d_revalidate,
1215 .d_release = ceph_dentry_release,
1216};
1217
1218struct dentry_operations ceph_snapdir_dentry_ops = {
1219 .d_revalidate = ceph_snapdir_d_revalidate,
1220};
1221
1222struct dentry_operations ceph_snap_dentry_ops = {
1223};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..9d67572fb328
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,224 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <linux/slab.h>
5#include <asm/unaligned.h>
6
7#include "super.h"
8
9/*
10 * NFS export support
11 *
12 * NFS re-export of a ceph mount is, at present, only semireliable.
13 * The basic issue is that the Ceph architecture doesn't lend itself
14 * well to generating filehandles that will remain valid forever.
15 *
16 * So, we do our best. If you're lucky, your inode will be in the
17 * client's cache. If it's not, and you have a connectable fh, then
18 * the MDS server may be able to find it for you. Otherwise, you get
19 * ESTALE.
20 *
21 * There are ways to make this more reliable, but in the non-connectable fh
22 * case, we won't ever work perfectly, and in the connectable case,
23 * some changes are needed on the MDS side to work better.
24 */
25
26/*
27 * Basic fh
28 */
29struct ceph_nfs_fh {
30 u64 ino;
31} __attribute__ ((packed));
32
33/*
34 * Larger 'connectable' fh that includes parent ino and name hash.
35 * Use this whenever possible, as it works more reliably.
36 */
37struct ceph_nfs_confh {
38 u64 ino, parent_ino;
39 u32 parent_name_hash;
40} __attribute__ ((packed));
41
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable)
44{
45 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode;
49 int type;
50
51 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL;
54
55 if (*max_len >= sizeof(*cfh)) {
56 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh);
61 type = 2;
62	} else if (*max_len >= sizeof(*fh)) {
63 if (connectable)
64 return -ENOSPC;
65 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh);
68 type = 1;
69 } else {
70 return -ENOSPC;
71 }
72 return type;
73}
74
75/*
76 * convert regular fh to dentry
77 *
78 * FIXME: we should try harder by querying the mds for the ino.
79 */
80static struct dentry *__fh_to_dentry(struct super_block *sb,
81 struct ceph_nfs_fh *fh)
82{
83 struct inode *inode;
84 struct dentry *dentry;
85 struct ceph_vino vino;
86 int err;
87
88 dout("__fh_to_dentry %llx\n", fh->ino);
89 vino.ino = fh->ino;
90 vino.snap = CEPH_NOSNAP;
91 inode = ceph_find_inode(sb, vino);
92 if (!inode)
93 return ERR_PTR(-ESTALE);
94
95 dentry = d_obtain_alias(inode);
96 if (!dentry) {
97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98 fh->ino, inode);
99 iput(inode);
100 return ERR_PTR(-ENOMEM);
101 }
102 err = ceph_init_dentry(dentry);
103
104 if (err < 0) {
105 iput(inode);
106 return ERR_PTR(err);
107 }
108 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
109 return dentry;
110}
111
112/*
113 * convert connectable fh to dentry
114 */
115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh)
117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
119 struct inode *inode;
120 struct dentry *dentry;
121 struct ceph_vino vino;
122 int err;
123
124 dout("__cfh_to_dentry %llx (%llx/%x)\n",
125 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
126
127 vino.ino = cfh->ino;
128 vino.snap = CEPH_NOSNAP;
129 inode = ceph_find_inode(sb, vino);
130 if (!inode) {
131 struct ceph_mds_request *req;
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS);
135 if (IS_ERR(req))
136 return ERR_PTR(PTR_ERR(req));
137
138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino;
140 req->r_ino2.snap = CEPH_NOSNAP;
141 req->r_path2 = kmalloc(16, GFP_NOFS);
142 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
143 req->r_num_caps = 1;
144 err = ceph_mdsc_do_request(mdsc, NULL, req);
145 ceph_mdsc_put_request(req);
146 inode = ceph_find_inode(sb, vino);
147 if (!inode)
148 return ERR_PTR(err ? err : -ESTALE);
149 }
150
151 dentry = d_obtain_alias(inode);
152 if (!dentry) {
153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154 cfh->ino, inode);
155 iput(inode);
156 return ERR_PTR(-ENOMEM);
157 }
158 err = ceph_init_dentry(dentry);
159 if (err < 0) {
160 iput(inode);
161 return ERR_PTR(err);
162 }
163 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
164 return dentry;
165}
166
167static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
168 int fh_len, int fh_type)
169{
170 if (fh_type == 1)
171 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
172 else
173 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
174}
175
176/*
177 * get parent, if possible.
178 *
179 * FIXME: we could do better by querying the mds to discover the
180 * parent.
181 */
182static struct dentry *ceph_fh_to_parent(struct super_block *sb,
183 struct fid *fid,
184 int fh_len, int fh_type)
185{
186 struct ceph_nfs_confh *cfh = (void *)fid->raw;
187 struct ceph_vino vino;
188 struct inode *inode;
189 struct dentry *dentry;
190 int err;
191
192 if (fh_type == 1)
193 return ERR_PTR(-ESTALE);
194
195 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
196 cfh->parent_name_hash);
197
198 vino.ino = cfh->ino;
199 vino.snap = CEPH_NOSNAP;
200 inode = ceph_find_inode(sb, vino);
201 if (!inode)
202 return ERR_PTR(-ESTALE);
203
204 dentry = d_obtain_alias(inode);
205 if (!dentry) {
206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207 cfh->ino, inode);
208 iput(inode);
209 return ERR_PTR(-ENOMEM);
210 }
211 err = ceph_init_dentry(dentry);
212 if (err < 0) {
213 iput(inode);
214 return ERR_PTR(err);
215 }
216 dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
217 return dentry;
218}
219
220const struct export_operations ceph_export_ops = {
221 .encode_fh = ceph_encode_fh,
222 .fh_to_dentry = ceph_fh_to_dentry,
223 .fh_to_parent = ceph_fh_to_parent,
224};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..4add3d5da2c1
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,938 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/slab.h>
5#include <linux/file.h>
6#include <linux/namei.h>
7#include <linux/writeback.h>
8
9#include "super.h"
10#include "mds_client.h"
11
12/*
13 * Ceph file operations
14 *
15 * Implement basic open/close functionality, and implement
16 * read/write.
17 *
18 * We implement three modes of file I/O:
19 * - buffered uses the generic_file_aio_{read,write} helpers
20 *
21 * - synchronous is used when there is multi-client read/write
22 * sharing, avoids the page cache, and synchronously waits for an
23 * ack from the OSD.
24 *
25 * - direct io takes the variant of the sync path that references
26 * user pages directly.
27 *
28 * fsync() flushes and waits on dirty pages, but just queues metadata
29 * for writeback: since the MDS can recover size and mtime there is no
30 * need to wait for MDS acknowledgement.
31 */
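/*
 * Editorial sketch (an assumption, not in the original commit): the
 * buffered-vs-synchronous decision made in ceph_aio_read() and
 * ceph_aio_write() below reduces to a predicate like this, where
 * 'cache_cap' is CEPH_CAP_FILE_CACHE for reads or
 * CEPH_CAP_FILE_BUFFER for writes.
 */
static inline bool ceph_use_sync_io_sketch(int got, int cache_cap,
					   unsigned int f_flags,
					   unsigned long s_flags)
{
	return (got & cache_cap) == 0 ||	/* MDS didn't grant caching */
	       (f_flags & O_DIRECT) ||		/* caller asked for direct io */
	       (s_flags & MS_SYNCHRONOUS);	/* filesystem mounted sync */
}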
32
33
34/*
35 * Prepare an open request. Preallocate ceph_cap to avoid an
36 * inopportune ENOMEM later.
37 */
38static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{
41 struct ceph_client *client = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc;
43 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
46
47 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
48 want_auth = USE_AUTH_MDS;
49
50 req = ceph_mdsc_create_request(mdsc, op, want_auth);
51 if (IS_ERR(req))
52 goto out;
53 req->r_fmode = ceph_flags_to_mode(flags);
54 req->r_args.open.flags = cpu_to_le32(flags);
55 req->r_args.open.mode = cpu_to_le32(create_mode);
56 req->r_args.open.preferred = cpu_to_le32(-1);
57out:
58 return req;
59}
60
61/*
62 * initialize private struct file data.
63 * if we fail, clean up by dropping fmode reference on the ceph_inode
64 */
65static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
66{
67 struct ceph_file_info *cf;
68 int ret = 0;
69
70 switch (inode->i_mode & S_IFMT) {
71 case S_IFREG:
72 case S_IFDIR:
73 dout("init_file %p %p 0%o (regular)\n", inode, file,
74 inode->i_mode);
75 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
76 if (cf == NULL) {
77 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
78 return -ENOMEM;
79 }
80 cf->fmode = fmode;
81 cf->next_offset = 2;
82 file->private_data = cf;
83 BUG_ON(inode->i_fop->release != ceph_release);
84 break;
85
86 case S_IFLNK:
87 dout("init_file %p %p 0%o (symlink)\n", inode, file,
88 inode->i_mode);
89 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
90 break;
91
92 default:
93 dout("init_file %p %p 0%o (special)\n", inode, file,
94 inode->i_mode);
95 /*
96 * we need to drop the open ref now, since we don't
97 * have .release set to ceph_release.
98 */
99 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
100 BUG_ON(inode->i_fop->release == ceph_release);
101
102 /* call the proper open fop */
103 ret = inode->i_fop->open(inode, file);
104 }
105 return ret;
106}
107
108/*
109 * If the filp already has private_data, that means the file was
110 * already opened by intent during lookup, and we do nothing.
111 *
112 * If we already have the requisite capabilities, we can satisfy
113 * the open request locally (no need to request new caps from the
114 * MDS). We do, however, need to inform the MDS (asynchronously)
115 * if our wanted caps set expands.
116 */
117int ceph_open(struct inode *inode, struct file *file)
118{
119 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc;
122 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
125 int err;
126 int flags, fmode, wanted;
127
128 if (cf) {
129 dout("open file %p is already opened\n", file);
130 return 0;
131 }
132
133 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
134 flags = file->f_flags & ~(O_CREAT|O_EXCL);
135 if (S_ISDIR(inode->i_mode))
136 flags = O_DIRECTORY; /* mds likes to know */
137
138 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
139 ceph_vinop(inode), file, flags, file->f_flags);
140 fmode = ceph_flags_to_mode(flags);
141 wanted = ceph_caps_for_mode(fmode);
142
143 /* snapped files are read-only */
144 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
145 return -EROFS;
146
147 /* trivially open snapdir */
148 if (ceph_snap(inode) == CEPH_SNAPDIR) {
149 spin_lock(&inode->i_lock);
150 __ceph_get_fmode(ci, fmode);
151 spin_unlock(&inode->i_lock);
152 return ceph_init_file(inode, file, fmode);
153 }
154
155 /*
156 * No need to block if we have any caps. Update wanted set
157 * asynchronously.
158 */
159 spin_lock(&inode->i_lock);
160 if (__ceph_is_any_real_caps(ci)) {
161 int mds_wanted = __ceph_caps_mds_wanted(ci);
162 int issued = __ceph_caps_issued(ci, NULL);
163
164 dout("open %p fmode %d want %s issued %s using existing\n",
165 inode, fmode, ceph_cap_string(wanted),
166 ceph_cap_string(issued));
167 __ceph_get_fmode(ci, fmode);
168 spin_unlock(&inode->i_lock);
169
170 /* adjust wanted? */
171 if ((issued & wanted) != wanted &&
172 (mds_wanted & wanted) != wanted &&
173 ceph_snap(inode) != CEPH_SNAPDIR)
174 ceph_check_caps(ci, 0, NULL);
175
176 return ceph_init_file(inode, file, fmode);
177 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
178 (ci->i_snap_caps & wanted) == wanted) {
179 __ceph_get_fmode(ci, fmode);
180 spin_unlock(&inode->i_lock);
181 return ceph_init_file(inode, file, fmode);
182 }
183 spin_unlock(&inode->i_lock);
184
185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
186 req = prepare_open_request(inode->i_sb, flags, 0);
187 if (IS_ERR(req)) {
188 err = PTR_ERR(req);
189 goto out;
190 }
191 req->r_inode = igrab(inode);
192 req->r_num_caps = 1;
193 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
194 if (!err)
195 err = ceph_init_file(inode, file, req->r_fmode);
196 ceph_mdsc_put_request(req);
197 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
198out:
199 return err;
200}
201
202
203/*
204 * Do a lookup + open with a single request.
205 *
206 * If this succeeds, but some subsequent check in the vfs
207 * may_open() fails, the struct file gets cleaned up (i.e.
208 * ceph_release gets called). So fear not!
209 */
210/*
211 * flags
212 * path_lookup_open -> LOOKUP_OPEN
213 * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
214 */
215struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode,
217 int locked_dir)
218{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc;
221 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req;
224 int err;
225 int flags = nd->intent.open.flags - 1; /* silly vfs! */
226
227 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
228 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
229
230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req))
233 return ERR_PTR(PTR_ERR(req));
234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2;
236 if (flags & O_CREAT) {
237 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
238 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
239 }
240 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
241 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
242 dentry = ceph_finish_lookup(req, dentry, err);
243 if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
244 err = ceph_handle_notrace_create(dir, dentry);
245 if (!err)
246 err = ceph_init_file(req->r_dentry->d_inode, file,
247 req->r_fmode);
248 ceph_mdsc_put_request(req);
249 dout("ceph_lookup_open result=%p\n", dentry);
250 return dentry;
251}
252
253int ceph_release(struct inode *inode, struct file *file)
254{
255 struct ceph_inode_info *ci = ceph_inode(inode);
256 struct ceph_file_info *cf = file->private_data;
257
258 dout("release inode %p file %p\n", inode, file);
259 ceph_put_fmode(ci, cf->fmode);
260 if (cf->last_readdir)
261 ceph_mdsc_put_request(cf->last_readdir);
262 kfree(cf->last_name);
263 kfree(cf->dir_info);
264 dput(cf->dentry);
265 kmem_cache_free(ceph_file_cachep, cf);
266
267 /* wake up anyone waiting for caps on this inode */
268 wake_up(&ci->i_cap_wq);
269 return 0;
270}
271
272/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector of new pages
319 */
320static struct page **alloc_page_vector(int num_pages)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
365
366/*
367 * copy data from a page vector into a user pointer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
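/*
 * Worked example (editorial): with 4 KB pages, zeroing off=1000,
 * len=10000 touches three pages: the head of page 0 from byte 1000
 * to 4096, all of page 1, and the tail of page 2 up to byte 2808
 * (1000 + 10000 - 2 * 4096).
 */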
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.)
432 *
433 * If we get a short result from the OSD, check against i_size; we need to
434 * only return a short read to the caller if we hit EOF.
435 */
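/*
 * Worked example (editorial, assuming the default 4 MB object size):
 * a 6 MB read at offset 3 MB crosses an object boundary, so the first
 * ceph_osdc_readpages() call is clipped to 1 MB (hit_stripe) and we
 * loop back ('goto more') for the remaining 5 MB.  A short OSD reply
 * that still lies within i_size is zero-filled; a short reply at the
 * end of the file sets *checkeof so the caller can re-verify i_size
 * with the MDS.
 */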
436static int striped_read(struct inode *inode,
437 u64 off, u64 len,
438 struct page **pages, int num_pages,
439 int *checkeof)
440{
441 struct ceph_client *client = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
445 int left, pages_left;
446 int read;
447 struct page **page_pos;
448 int ret;
449 bool hit_stripe, was_short;
450
451 /*
452 * we may need to do multiple reads. not atomic, unfortunately.
453 */
454 pos = off;
455 left = len;
456 page_pos = pages;
457 pages_left = num_pages;
458 read = 0;
459
460more:
461 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq,
465 ci->i_truncate_size,
466 page_pos, pages_left);
467 hit_stripe = this_len < left;
468 was_short = ret >= 0 && ret < this_len;
469 if (ret == -ENOENT)
470 ret = 0;
471 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
472 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
473
474 if (ret > 0) {
475 int didpages =
476 ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
477
478 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read,
481 pos - off - read, pages);
482 }
483 pos += ret;
484 read = pos - off;
485 left -= ret;
486 page_pos += didpages;
487 pages_left -= didpages;
488
489 /* hit stripe? */
490 if (left && hit_stripe)
491 goto more;
492 }
493
494 if (was_short) {
495 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) {
497 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read,
499 pages);
500 read = len;
501 goto out;
502 }
503
504 /* check i_size */
505 *checkeof = 1;
506 }
507
508out:
509 if (ret >= 0)
510 ret = read;
511 dout("striped_read returns %d\n", ret);
512 return ret;
513}
514
515/*
516 * Completely synchronous read and write methods. Direct from __user
517 * buffer to osd, or directly to user pages (if O_DIRECT).
518 *
519 * If the read spans object boundary, just do multiple reads.
520 */
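/*
 * Editorial sketch: calc_pages_for() (from the osd client) is assumed
 * to count the pages an extent touches after rounding it out to page
 * boundaries, conceptually:
 */
static inline int calc_pages_for_sketch(u64 off, u64 len)
{
	return ((off + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
		(off >> PAGE_CACHE_SHIFT);
}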
521static ssize_t ceph_sync_read(struct file *file, char __user *data,
522 unsigned len, loff_t *poff, int *checkeof)
523{
524 struct inode *inode = file->f_dentry->d_inode;
525 struct page **pages;
526 u64 off = *poff;
527 int num_pages = calc_pages_for(off, len);
528 int ret;
529
530 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532
533 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len);
535
536 /*
537 * flush any page cache pages in this range. this
538 * will make concurrent normal and O_DIRECT io slow,
539 * but it will at least behave sensibly when they are
540 * in sequence.
541 */
542 } else {
543 pages = alloc_page_vector(num_pages);
544 }
545 if (IS_ERR(pages))
546 return PTR_ERR(pages);
547
548 ret = filemap_write_and_wait(inode->i_mapping);
549 if (ret < 0)
550 goto done;
551
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0)
557 *poff = off + ret;
558
559done:
560 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages);
562 else
563 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret);
565 return ret;
566}
567
568/*
569 * Write commit callback, called if we requested both an ACK and
570 * ONDISK commit reply from the OSD.
571 */
572static void sync_write_commit(struct ceph_osd_request *req,
573 struct ceph_msg *msg)
574{
575 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
576
577 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
578 spin_lock(&ci->i_unsafe_lock);
579 list_del_init(&req->r_unsafe_item);
580 spin_unlock(&ci->i_unsafe_lock);
581 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
582}
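/*
 * Editorial note: for buffered (non-O_SYNC, non-O_DIRECT) writers,
 * ceph_sync_write() below requests both an ACK and an ONDISK commit
 * from the OSD; ceph_osdc_wait_request() is then assumed to return
 * once the ACK arrives, while the sync_write_commit() callback above
 * drops the unsafe-write tracking when the ONDISK reply lands.
 */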
583
584/*
585 * Synchronous write, straight from __user pointer or user pages (if
586 * O_DIRECT).
587 *
588 * If write spans object boundary, just do multiple writes. (For a
589 * correct atomic write, we should e.g. take write locks on all
590 * objects, rollback on failure, etc.)
591 */
592static ssize_t ceph_sync_write(struct file *file, const char __user *data,
593 size_t left, loff_t *offset)
594{
595 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req;
599 struct page **pages;
600 int num_pages;
601 long long unsigned pos;
602 u64 len;
603 int written = 0;
604 int flags;
605 int do_sync = 0;
606 int check_caps = 0;
607 int ret;
608 struct timespec mtime = CURRENT_TIME;
609
610 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
611 return -EROFS;
612
613 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
614 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
615
616 if (file->f_flags & O_APPEND)
617 pos = i_size_read(inode);
618 else
619 pos = *offset;
620
621 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
622 if (ret < 0)
623 return ret;
624
625 ret = invalidate_inode_pages2_range(inode->i_mapping,
626 pos >> PAGE_CACHE_SHIFT,
627 (pos + left) >> PAGE_CACHE_SHIFT);
628 if (ret < 0)
629 dout("invalidate_inode_pages2_range returned %d\n", ret);
630
631 flags = CEPH_OSD_FLAG_ORDERSNAP |
632 CEPH_OSD_FLAG_ONDISK |
633 CEPH_OSD_FLAG_WRITE;
634 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
635 flags |= CEPH_OSD_FLAG_ACK;
636 else
637 do_sync = 1;
638
639 /*
640 * we may need to do multiple writes here if we span an object
641 * boundary. this isn't atomic, unfortunately. :(
642 */
643more:
644 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context,
649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2);
652 if (IS_ERR(req))
653 return PTR_ERR(req);
654
655 num_pages = calc_pages_for(pos, len);
656
657 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages);
661 goto out;
662 }
663
664 /*
665 * throw out any page cache pages in this range. this
666 * may block.
667 */
668 truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
669 } else {
670 pages = alloc_page_vector(num_pages);
671 if (IS_ERR(pages)) {
672 ret = PTR_ERR(pages);
673 goto out;
674 }
675 ret = copy_user_to_page_vector(pages, data, pos, len);
676 if (ret < 0) {
677 ceph_release_page_vector(pages, num_pages);
678 goto out;
679 }
680
681 if ((file->f_flags & O_SYNC) == 0) {
682 /* get a second commit callback */
683 req->r_safe_callback = sync_write_commit;
684 req->r_own_pages = 1;
685 }
686 }
687 req->r_pages = pages;
688 req->r_num_pages = num_pages;
689 req->r_inode = inode;
690
691 ret = ceph_osdc_start_request(&client->osdc, req, false);
692 if (!ret) {
693 if (req->r_safe_callback) {
694 /*
695 * Add to inode unsafe list only after we
696 * start_request so that a tid has been assigned.
697 */
698 spin_lock(&ci->i_unsafe_lock);
699 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
700 spin_unlock(&ci->i_unsafe_lock);
701 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
702 }
703 ret = ceph_osdc_wait_request(&client->osdc, req);
704 }
705
706 if (file->f_flags & O_DIRECT)
707 put_page_vector(pages, num_pages);
708 else if (file->f_flags & O_SYNC)
709 ceph_release_page_vector(pages, num_pages);
710
711out:
712 ceph_osdc_put_request(req);
713 if (ret == 0) {
714 pos += len;
715 written += len;
716 left -= len;
717 if (left)
718 goto more;
719
720 ret = written;
721 *offset = pos;
722 if (pos > i_size_read(inode))
723 check_caps = ceph_inode_set_size(inode, pos);
724 if (check_caps)
725 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
726 NULL);
727 }
728 return ret;
729}
730
731/*
732 * Wrap generic_file_aio_read with checks for cap bits on the inode.
733 * Atomically grab references, so that those bits are not released
734 * back to the MDS mid-read.
735 *
736 * Hmm, the sync read case isn't actually async... should it be?
737 */
738static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
739 unsigned long nr_segs, loff_t pos)
740{
741 struct file *filp = iocb->ki_filp;
742 loff_t *ppos = &iocb->ki_pos;
743 size_t len = iov->iov_len;
744 struct inode *inode = filp->f_dentry->d_inode;
745 struct ceph_inode_info *ci = ceph_inode(inode);
746 void *base = iov->iov_base;
747 ssize_t ret;
748 int got = 0;
749 int checkeof = 0, read = 0;
750
751 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
752 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
753again:
754 __ceph_do_pending_vmtruncate(inode);
755 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
756 &got, -1);
757 if (ret < 0)
758 goto out;
759 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
760 inode, ceph_vinop(inode), pos, (unsigned)len,
761 ceph_cap_string(got));
762
763 if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
764 (iocb->ki_filp->f_flags & O_DIRECT) ||
765 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
766 /* hmm, this isn't really async... */
767 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
768 else
769 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
770
771out:
772 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
773 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
774 ceph_put_cap_refs(ci, got);
775
776 if (checkeof && ret >= 0) {
777 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
778
779 /* hit EOF or hole? */
780 if (statret == 0 && *ppos < inode->i_size) {
781 dout("aio_read sync_read hit hole, reading more\n");
782 read += ret;
783 base += ret;
784 len -= ret;
785 checkeof = 0;
786 goto again;
787 }
788 }
789 if (ret >= 0)
790 ret += read;
791
792 return ret;
793}
794
795/*
796 * Take cap references to avoid releasing caps to MDS mid-write.
797 *
798 * If we are synchronous, and write with an old snap context, the OSD
799 * may return EOLDSNAPC. In that case, retry the write, _after_
800 * dropping our cap refs and allowing the pending snap to logically
801 * complete _before_ this write occurs.
802 *
803 * If we are near ENOSPC, write synchronously.
804 */
805static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
806 unsigned long nr_segs, loff_t pos)
807{
808 struct file *file = iocb->ki_filp;
809 struct inode *inode = file->f_dentry->d_inode;
810 struct ceph_inode_info *ci = ceph_inode(inode);
811 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
812 loff_t endoff = pos + iov->iov_len;
813 int got = 0;
814 int ret, err;
815
816 if (ceph_snap(inode) != CEPH_NOSNAP)
817 return -EROFS;
818
819retry_snap:
820 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
821 return -ENOSPC;
822 __ceph_do_pending_vmtruncate(inode);
823 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
824 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
825 inode->i_size);
826 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
827 &got, endoff);
828 if (ret < 0)
829 goto out;
830
831 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
832 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
833 ceph_cap_string(got));
834
835 if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
836 (iocb->ki_filp->f_flags & O_DIRECT) ||
837 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
838 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
839 &iocb->ki_pos);
840 } else {
841 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
842
843 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
844 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
845 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
846 err = vfs_fsync_range(file, file->f_path.dentry,
847 pos, pos + ret - 1, 1);
848 if (err < 0)
849 ret = err;
850 }
851 }
852 if (ret >= 0) {
853 spin_lock(&inode->i_lock);
854 __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
855 spin_unlock(&inode->i_lock);
856 }
857
858out:
859 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
860 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
861 ceph_cap_string(got));
862 ceph_put_cap_refs(ci, got);
863
864 if (ret == -EOLDSNAPC) {
865 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
866 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
867 goto retry_snap;
868 }
869
870 return ret;
871}
872
873/*
874 * llseek. be sure to verify file size on SEEK_END.
875 */
876static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
877{
878 struct inode *inode = file->f_mapping->host;
879 int ret;
880
881 mutex_lock(&inode->i_mutex);
882 __ceph_do_pending_vmtruncate(inode);
883 switch (origin) {
884 case SEEK_END:
885 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
886 if (ret < 0) {
887 offset = ret;
888 goto out;
889 }
890 offset += inode->i_size;
891 break;
892 case SEEK_CUR:
893 /*
894 * Here we special-case the lseek(fd, 0, SEEK_CUR)
895 * position-querying operation. Avoid rewriting the "same"
896 * f_pos value back to the file because a concurrent read(),
897 * write() or lseek() might have altered it.
898 */
899 if (offset == 0) {
900 offset = file->f_pos;
901 goto out;
902 }
903 offset += file->f_pos;
904 break;
905 }
906
907 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
908 offset = -EINVAL;
909 goto out;
910 }
911
912 /* Special lock needed here? */
913 if (offset != file->f_pos) {
914 file->f_pos = offset;
915 file->f_version = 0;
916 }
917
918out:
919 mutex_unlock(&inode->i_mutex);
920 return offset;
921}
922
923const struct file_operations ceph_file_fops = {
924 .open = ceph_open,
925 .release = ceph_release,
926 .llseek = ceph_llseek,
927 .read = do_sync_read,
928 .write = do_sync_write,
929 .aio_read = ceph_aio_read,
930 .aio_write = ceph_aio_write,
931 .mmap = ceph_mmap,
932 .fsync = ceph_fsync,
933 .splice_read = generic_file_splice_read,
934 .splice_write = generic_file_splice_write,
935 .unlocked_ioctl = ceph_ioctl,
936 .compat_ioctl = ceph_ioctl,
937};
938
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..aca82d55cc53
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1766 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
59 * get/construct snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode));
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
83const struct inode_operations ceph_file_iops = {
84 .permission = ceph_permission,
85 .setattr = ceph_setattr,
86 .getattr = ceph_getattr,
87 .setxattr = ceph_setxattr,
88 .getxattr = ceph_getxattr,
89 .listxattr = ceph_listxattr,
90 .removexattr = ceph_removexattr,
91};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
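/*
 * Editorial sketch (simplified, with no locking or delegation
 * copy-out): how a lookup walks the recorded splits from the root
 * frag down to the leaf containing a hash value 'v', as
 * ceph_choose_frag() below does.  Assumes the ceph_frag_* helpers
 * from ceph_frag.h.
 */
static u32 ceph_frag_descend_sketch(struct ceph_inode_info *ci, u32 v)
{
	u32 t = ceph_frag_make(0, 0);	/* root frag covers everything */
	struct ceph_inode_frag *frag;

	while ((frag = __ceph_find_frag(ci, t)) != NULL && frag->split_by) {
		unsigned i, nway = 1 << frag->split_by;

		for (i = 0; i < nway; i++) {
			u32 n = ceph_frag_make_child(t, frag->split_by, i);

			if (ceph_frag_contains_value(n, v)) {
				t = n;	/* descend into matching child */
				break;
			}
		}
	}
	return t;			/* leaf frag containing v */
}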
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
172u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
173 struct ceph_inode_frag *pfrag,
174 int *found)
175{
176 u32 t = ceph_frag_make(0, 0);
177 struct ceph_inode_frag *frag;
178 unsigned nway, i;
179 u32 n;
180
181 if (found)
182 *found = 0;
183
184 mutex_lock(&ci->i_fragtree_mutex);
185 while (1) {
186 WARN_ON(!ceph_frag_contains_value(t, v));
187 frag = __ceph_find_frag(ci, t);
188 if (!frag)
189 break; /* t is a leaf */
190 if (frag->split_by == 0) {
191 if (pfrag)
192 memcpy(pfrag, frag, sizeof(*pfrag));
193 if (found)
194 *found = 1;
195 break;
196 }
197
198 /* choose child */
199 nway = 1 << frag->split_by;
200 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
201 frag->split_by, nway);
202 for (i = 0; i < nway; i++) {
203 n = ceph_frag_make_child(t, frag->split_by, i);
204 if (ceph_frag_contains_value(n, v)) {
205 t = n;
206 break;
207 }
208 }
209 BUG_ON(i == nway);
210 }
211 dout("choose_frag(%x) = %x\n", v, t);
212
213 mutex_unlock(&ci->i_fragtree_mutex);
214 return t;
215}
216
217/*
218 * Process dirfrag (delegation) info from the mds. Include leaf
219 * fragment in tree ONLY if ndist > 0. Otherwise, only
220 * branches/splits are included in the i_fragtree.
221 */
222static int ceph_fill_dirfrag(struct inode *inode,
223 struct ceph_mds_reply_dirfrag *dirinfo)
224{
225 struct ceph_inode_info *ci = ceph_inode(inode);
226 struct ceph_inode_frag *frag;
227 u32 id = le32_to_cpu(dirinfo->frag);
228 int mds = le32_to_cpu(dirinfo->auth);
229 int ndist = le32_to_cpu(dirinfo->ndist);
230 int i;
231 int err = 0;
232
233 mutex_lock(&ci->i_fragtree_mutex);
234 if (ndist == 0) {
235 /* no delegation info needed. */
236 frag = __ceph_find_frag(ci, id);
237 if (!frag)
238 goto out;
239 if (frag->split_by == 0) {
240 /* tree leaf, remove */
241 dout("fill_dirfrag removed %llx.%llx frag %x"
242 " (no ref)\n", ceph_vinop(inode), id);
243 rb_erase(&frag->node, &ci->i_fragtree);
244 kfree(frag);
245 } else {
246 /* tree branch, keep and clear */
247 dout("fill_dirfrag cleared %llx.%llx frag %x"
248 " referral\n", ceph_vinop(inode), id);
249 frag->mds = -1;
250 frag->ndist = 0;
251 }
252 goto out;
253 }
254
255
256 /* find/add this frag to store mds delegation info */
257 frag = __get_or_create_frag(ci, id);
258 if (IS_ERR(frag)) {
259 /* this is not the end of the world; we can continue
260 with bad/inaccurate delegation info */
261 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
262 ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
263 err = -ENOMEM;
264 goto out;
265 }
266
267 frag->mds = mds;
268 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
269 for (i = 0; i < frag->ndist; i++)
270 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
271 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
272 ceph_vinop(inode), frag->frag, frag->ndist);
273
274out:
275 mutex_unlock(&ci->i_fragtree_mutex);
276 return err;
277}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
283struct inode *ceph_alloc_inode(struct super_block *sb)
284{
285 struct ceph_inode_info *ci;
286 int i;
287
288 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
289 if (!ci)
290 return NULL;
291
292 dout("alloc_inode %p\n", &ci->vfs_inode);
293
294 ci->i_version = 0;
295 ci->i_time_warp_seq = 0;
296 ci->i_ceph_flags = 0;
297 ci->i_release_count = 0;
298 ci->i_symlink = NULL;
299
300 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex);
302
303 ci->i_xattrs.blob = NULL;
304 ci->i_xattrs.prealloc_blob = NULL;
305 ci->i_xattrs.dirty = false;
306 ci->i_xattrs.index = RB_ROOT;
307 ci->i_xattrs.count = 0;
308 ci->i_xattrs.names_size = 0;
309 ci->i_xattrs.vals_size = 0;
310 ci->i_xattrs.version = 0;
311 ci->i_xattrs.index_version = 0;
312
313 ci->i_caps = RB_ROOT;
314 ci->i_auth_cap = NULL;
315 ci->i_dirty_caps = 0;
316 ci->i_flushing_caps = 0;
317 INIT_LIST_HEAD(&ci->i_dirty_item);
318 INIT_LIST_HEAD(&ci->i_flushing_item);
319 ci->i_cap_flush_seq = 0;
320 ci->i_cap_flush_last_tid = 0;
321 memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
322 init_waitqueue_head(&ci->i_cap_wq);
323 ci->i_hold_caps_min = 0;
324 ci->i_hold_caps_max = 0;
325 INIT_LIST_HEAD(&ci->i_cap_delay_list);
326 ci->i_cap_exporting_mds = 0;
327 ci->i_cap_exporting_mseq = 0;
328 ci->i_cap_exporting_issued = 0;
329 INIT_LIST_HEAD(&ci->i_cap_snaps);
330 ci->i_head_snapc = NULL;
331 ci->i_snap_caps = 0;
332
333 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
334 ci->i_nr_by_mode[i] = 0;
335
336 ci->i_truncate_seq = 0;
337 ci->i_truncate_size = 0;
338 ci->i_truncate_pending = 0;
339
340 ci->i_max_size = 0;
341 ci->i_reported_size = 0;
342 ci->i_wanted_max_size = 0;
343 ci->i_requested_max_size = 0;
344
345 ci->i_pin_ref = 0;
346 ci->i_rd_ref = 0;
347 ci->i_rdcache_ref = 0;
348 ci->i_wr_ref = 0;
349 ci->i_wrbuffer_ref = 0;
350 ci->i_wrbuffer_ref_head = 0;
351 ci->i_shared_gen = 0;
352 ci->i_rdcache_gen = 0;
353 ci->i_rdcache_revoking = 0;
354
355 INIT_LIST_HEAD(&ci->i_unsafe_writes);
356 INIT_LIST_HEAD(&ci->i_unsafe_dirops);
357 spin_lock_init(&ci->i_unsafe_lock);
358
359 ci->i_snap_realm = NULL;
360 INIT_LIST_HEAD(&ci->i_snap_realm_item);
361 INIT_LIST_HEAD(&ci->i_snap_flush_item);
362
363 INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
364 INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
365
366 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
367
368 return &ci->vfs_inode;
369}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 /*
382 * we may still have a snap_realm reference if there are stray
383 * caps in i_cap_exporting_issued or i_snap_caps.
384 */
385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389
390 dout(" dropping residual ref to snap realm %p\n", realm);
391 spin_lock(&realm->inodes_with_caps_lock);
392 list_del_init(&ci->i_snap_realm_item);
393 spin_unlock(&realm->inodes_with_caps_lock);
394 ceph_put_snap_realm(mdsc, realm);
395 }
396
397 kfree(ci->i_symlink);
398 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
399 frag = rb_entry(n, struct ceph_inode_frag, node);
400 rb_erase(n, &ci->i_fragtree);
401 kfree(frag);
402 }
403
404 __ceph_destroy_xattrs(ci);
405 if (ci->i_xattrs.blob)
406 ceph_buffer_put(ci->i_xattrs.blob);
407 if (ci->i_xattrs.prealloc_blob)
408 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
409
410 kmem_cache_free(ceph_inode_cachep, ci);
411}
412
413
414/*
415 * Helpers to fill in size, ctime, mtime, and atime. We have to be
416 * careful because either the client or MDS may have more up to date
417 * info, depending on which capabilities are held, and whether
418 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
419 * and size are monotonically increasing, except when utimes() or
420 * truncate() increments the corresponding _seq values.)
421 */
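/*
 * Editorial note: ceph_seq_cmp() (from super.h) is assumed to be a
 * wraparound-safe sequence comparison, conceptually
 * (__s32)a - (__s32)b, so "newer" still wins when a 32-bit seq wraps.
 */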
422int ceph_fill_file_size(struct inode *inode, int issued,
423 u32 truncate_seq, u64 truncate_size, u64 size)
424{
425 struct ceph_inode_info *ci = ceph_inode(inode);
426 int queue_trunc = 0;
427
428 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
429 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
430 dout("size %lld -> %llu\n", inode->i_size, size);
431 inode->i_size = size;
432 inode->i_blocks = (size + (1<<9) - 1) >> 9;
433 ci->i_reported_size = size;
434 if (truncate_seq != ci->i_truncate_seq) {
435 dout("truncate_seq %u -> %u\n",
436 ci->i_truncate_seq, truncate_seq);
437 ci->i_truncate_seq = truncate_seq;
438 /*
439 * If we hold relevant caps, or in the case where we're
440 * not the only client referencing this file and we
441 * don't hold those caps, then we need to check whether
442 * the file is either opened or mmaped
443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) ||
447 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++;
450 queue_trunc = 1;
451 }
452 }
453 }
454 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
455 ci->i_truncate_size != truncate_size) {
456 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
457 truncate_size);
458 ci->i_truncate_size = truncate_size;
459 }
460 return queue_trunc;
461}
462
463void ceph_fill_file_time(struct inode *inode, int issued,
464 u64 time_warp_seq, struct timespec *ctime,
465 struct timespec *mtime, struct timespec *atime)
466{
467 struct ceph_inode_info *ci = ceph_inode(inode);
468 int warn = 0;
469
470 if (issued & (CEPH_CAP_FILE_EXCL|
471 CEPH_CAP_FILE_WR|
472 CEPH_CAP_FILE_BUFFER)) {
473 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
474 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
475 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
476 ctime->tv_sec, ctime->tv_nsec);
477 inode->i_ctime = *ctime;
478 }
479 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
480 /* the MDS did a utimes() */
481 dout("mtime %ld.%09ld -> %ld.%09ld "
482 "tw %d -> %d\n",
483 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
484 mtime->tv_sec, mtime->tv_nsec,
485 ci->i_time_warp_seq, (int)time_warp_seq);
486
487 inode->i_mtime = *mtime;
488 inode->i_atime = *atime;
489 ci->i_time_warp_seq = time_warp_seq;
490 } else if (time_warp_seq == ci->i_time_warp_seq) {
491 /* nobody did utimes(); take the max */
492 if (timespec_compare(mtime, &inode->i_mtime) > 0) {
493 dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
494 inode->i_mtime.tv_sec,
495 inode->i_mtime.tv_nsec,
496 mtime->tv_sec, mtime->tv_nsec);
497 inode->i_mtime = *mtime;
498 }
499 if (timespec_compare(atime, &inode->i_atime) > 0) {
500 dout("atime %ld.%09ld -> %ld.%09ld inc\n",
501 inode->i_atime.tv_sec,
502 inode->i_atime.tv_nsec,
503 atime->tv_sec, atime->tv_nsec);
504 inode->i_atime = *atime;
505 }
506 } else if (issued & CEPH_CAP_FILE_EXCL) {
507 /* we did a utimes(); ignore mds values */
508 } else {
509 warn = 1;
510 }
511 } else {
512 /* we have no write caps; whatever the MDS says is true */
513 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
514 inode->i_ctime = *ctime;
515 inode->i_mtime = *mtime;
516 inode->i_atime = *atime;
517 ci->i_time_warp_seq = time_warp_seq;
518 } else {
519 warn = 1;
520 }
521 }
522 if (warn) /* time_warp_seq shouldn't go backwards */
523 dout("%p mds time_warp_seq %llu < %u\n",
524 inode, time_warp_seq, ci->i_time_warp_seq);
525}
526
527/*
528 * Populate an inode based on info from mds. May be called on new or
529 * existing inodes.
530 */
531static int fill_inode(struct inode *inode,
532 struct ceph_mds_reply_info_in *iinfo,
533 struct ceph_mds_reply_dirfrag *dirinfo,
534 struct ceph_mds_session *session,
535 unsigned long ttl_from, int cap_fmode,
536 struct ceph_cap_reservation *caps_reservation)
537{
538 struct ceph_mds_reply_inode *info = iinfo->in;
539 struct ceph_inode_info *ci = ceph_inode(inode);
540 int i;
541 int issued, implemented;
542 struct timespec mtime, atime, ctime;
543 u32 nsplits;
544 struct ceph_buffer *xattr_blob = NULL;
545 int err = 0;
546 int queue_trunc = 0;
547
548 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
549 inode, ceph_vinop(inode), le64_to_cpu(info->version),
550 ci->i_version);
551
552 /*
553 * prealloc xattr data, if it looks like we'll need it. only
554 * if len > 4 (meaning there are actually xattrs; the first 4
555 * bytes are the xattr count).
556 */
557 if (iinfo->xattr_len > 4) {
558 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
559 if (!xattr_blob)
560 pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
561 iinfo->xattr_len);
562 }
563
564 spin_lock(&inode->i_lock);
565
566 /*
567 * provided version will be odd if inode value is projected,
568 * even if stable. skip the update if we have a newer info
569 * (e.g., due to inode info racing from multiple MDSs), or if
570 * we are getting projected (unstable) inode info.
571 */
572 if (le64_to_cpu(info->version) > 0 &&
573 (ci->i_version & ~1) > le64_to_cpu(info->version))
574 goto no_change;
575
576 issued = __ceph_caps_issued(ci, &implemented);
577 issued |= implemented | __ceph_caps_dirty(ci);
578
579 /* update inode */
580 ci->i_version = le64_to_cpu(info->version);
581 inode->i_version++;
582 inode->i_rdev = le32_to_cpu(info->rdev);
583
584 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
585 inode->i_mode = le32_to_cpu(info->mode);
586 inode->i_uid = le32_to_cpu(info->uid);
587 inode->i_gid = le32_to_cpu(info->gid);
588 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
589 inode->i_uid, inode->i_gid);
590 }
591
592 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
593 inode->i_nlink = le32_to_cpu(info->nlink);
594
595 /* be careful with mtime, atime, size */
596 ceph_decode_timespec(&atime, &info->atime);
597 ceph_decode_timespec(&mtime, &info->mtime);
598 ceph_decode_timespec(&ctime, &info->ctime);
599 queue_trunc = ceph_fill_file_size(inode, issued,
600 le32_to_cpu(info->truncate_seq),
601 le64_to_cpu(info->truncate_size),
602 le64_to_cpu(info->size));
603 ceph_fill_file_time(inode, issued,
604 le32_to_cpu(info->time_warp_seq),
605 &ctime, &mtime, &atime);
606
607 ci->i_max_size = le64_to_cpu(info->max_size);
608 ci->i_layout = info->layout;
609 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
610
611 /* xattrs */
612 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
613 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
614 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
615 if (ci->i_xattrs.blob)
616 ceph_buffer_put(ci->i_xattrs.blob);
617 ci->i_xattrs.blob = xattr_blob;
618 if (xattr_blob)
619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 }
623
624 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info;
627
628 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO:
630 case S_IFBLK:
631 case S_IFCHR:
632 case S_IFSOCK:
633 init_special_inode(inode, inode->i_mode, inode->i_rdev);
634 inode->i_op = &ceph_file_iops;
635 break;
636 case S_IFREG:
637 inode->i_op = &ceph_file_iops;
638 inode->i_fop = &ceph_file_fops;
639 break;
640 case S_IFLNK:
641 inode->i_op = &ceph_symlink_iops;
642 if (!ci->i_symlink) {
643 int symlen = iinfo->symlink_len;
644 char *sym;
645
646 BUG_ON(symlen != inode->i_size);
647 spin_unlock(&inode->i_lock);
648
649 err = -ENOMEM;
650 sym = kmalloc(symlen+1, GFP_NOFS);
651 if (!sym)
652 goto out;
653 memcpy(sym, iinfo->symlink, symlen);
654 sym[symlen] = 0;
655
656 spin_lock(&inode->i_lock);
657 if (!ci->i_symlink)
658 ci->i_symlink = sym;
659 else
660 kfree(sym); /* lost a race */
661 }
662 break;
663 case S_IFDIR:
664 inode->i_op = &ceph_dir_iops;
665 inode->i_fop = &ceph_dir_fops;
666
667 ci->i_files = le64_to_cpu(info->files);
668 ci->i_subdirs = le64_to_cpu(info->subdirs);
669 ci->i_rbytes = le64_to_cpu(info->rbytes);
670 ci->i_rfiles = le64_to_cpu(info->rfiles);
671 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
672 ceph_decode_timespec(&ci->i_rctime, &info->rctime);
673
674 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
678 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2;
681 }
682
683 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes;
686 break;
687 default:
688 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
689 ceph_vinop(inode), inode->i_mode);
690 }
691
692no_change:
693 spin_unlock(&inode->i_lock);
694
695 /* queue truncate if we saw i_size decrease */
696 if (queue_trunc)
697 ceph_queue_vmtruncate(inode);
698
699 /* populate frag tree */
700 /* FIXME: move me up, if/when version reflects fragtree changes */
701 nsplits = le32_to_cpu(info->fragtree.nsplits);
702 mutex_lock(&ci->i_fragtree_mutex);
703 for (i = 0; i < nsplits; i++) {
704 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
705 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
706
707 if (IS_ERR(frag))
708 continue;
709 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
710 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
711 }
712 mutex_unlock(&ci->i_fragtree_mutex);
713
714 /* were we issued a capability? */
715 if (info->cap.caps) {
716 if (ceph_snap(inode) == CEPH_NOSNAP) {
717 ceph_add_cap(inode, session,
718 le64_to_cpu(info->cap.cap_id),
719 cap_fmode,
720 le32_to_cpu(info->cap.caps),
721 le32_to_cpu(info->cap.wanted),
722 le32_to_cpu(info->cap.seq),
723 le32_to_cpu(info->cap.mseq),
724 le64_to_cpu(info->cap.realm),
725 info->cap.flags,
726 caps_reservation);
727 } else {
728 spin_lock(&inode->i_lock);
729 dout(" %p got snap_caps %s\n", inode,
730 ceph_cap_string(le32_to_cpu(info->cap.caps)));
731 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
732 if (cap_fmode >= 0)
733 __ceph_get_fmode(ci, cap_fmode);
734 spin_unlock(&inode->i_lock);
735 }
736 }
737
738 /* update delegation info? */
739 if (dirinfo)
740 ceph_fill_dirfrag(inode, dirinfo);
741
742 err = 0;
743
744out:
745 if (xattr_blob)
746 ceph_buffer_put(xattr_blob);
747 return err;
748}
749
750/*
751 * caller should hold session s_mutex.
752 */
753static void update_dentry_lease(struct dentry *dentry,
754 struct ceph_mds_reply_lease *lease,
755 struct ceph_mds_session *session,
756 unsigned long from_time)
757{
758 struct ceph_dentry_info *di = ceph_dentry(dentry);
759 long unsigned duration = le32_to_cpu(lease->duration_ms);
760 long unsigned ttl = from_time + (duration * HZ) / 1000;
761 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
762 struct inode *dir;
763
764 /* only track leases on regular dentries */
765 if (dentry->d_op != &ceph_dentry_ops)
766 return;
767
768 spin_lock(&dentry->d_lock);
769 dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
770 dentry, le16_to_cpu(lease->mask), duration, ttl);
771
772 /* make lease_rdcache_gen match directory */
773 dir = dentry->d_parent->d_inode;
774 di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
775
776 if (lease->mask == 0)
777 goto out_unlock;
778
779 if (di->lease_gen == session->s_cap_gen &&
780 time_before(ttl, dentry->d_time))
781 goto out_unlock; /* we already have a newer lease. */
782
783 if (di->lease_session && di->lease_session != session)
784 goto out_unlock;
785
786 ceph_dentry_lru_touch(dentry);
787
788 if (!di->lease_session)
789 di->lease_session = ceph_get_mds_session(session);
790 di->lease_gen = session->s_cap_gen;
791 di->lease_seq = le32_to_cpu(lease->seq);
792 di->lease_renew_after = half_ttl;
793 di->lease_renew_from = 0;
794 dentry->d_time = ttl;
795out_unlock:
796 spin_unlock(&dentry->d_lock);
797 return;
798}
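/*
 * Worked example (editorial): with HZ=1000 and a 30000 ms lease, the
 * ttl computed above lands 30000 jiffies past from_time, and
 * lease_renew_after sits at the half-way point, 15000 jiffies past
 * from_time.
 */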
799
800/*
801 * splice a dentry to an inode.
802 * caller must hold directory i_mutex for this to be safe.
803 *
804 * we will only rehash the resulting dentry if @prehash is
805 * true; @prehash will be set to false (for the benefit of
806 * the caller) if we fail.
807 */
808static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
809 bool *prehash)
810{
811 struct dentry *realdn;
812
813 /* dn must be unhashed */
814 if (!d_unhashed(dn))
815 d_drop(dn);
816 realdn = d_materialise_unique(dn, in);
817 if (IS_ERR(realdn)) {
818 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
819 dn, in, ceph_vinop(in));
820 if (prehash)
821 *prehash = false; /* don't rehash on error */
822 dn = realdn; /* note realdn contains the error */
823 goto out;
824 } else if (realdn) {
825 dout("dn %p (%d) spliced with %p (%d) "
826 "inode %p ino %llx.%llx\n",
827 dn, atomic_read(&dn->d_count),
828 realdn, atomic_read(&realdn->d_count),
829 realdn->d_inode, ceph_vinop(realdn->d_inode));
830 dput(dn);
831 dn = realdn;
832 } else {
833 BUG_ON(!ceph_dentry(dn));
834
835 dout("dn %p attached to %p ino %llx.%llx\n",
836 dn, dn->d_inode, ceph_vinop(dn->d_inode));
837 }
838 if ((!prehash || *prehash) && d_unhashed(dn))
839 d_rehash(dn);
840out:
841 return dn;
842}
843
844/*
845 * Set dentry's directory position based on the current dir's max, and
846 * order it in d_subdirs, so that dcache_readdir behaves.
847 */
848static void ceph_set_dentry_offset(struct dentry *dn)
849{
850 struct dentry *dir = dn->d_parent;
851 struct inode *inode = dn->d_parent->d_inode;
852 struct ceph_dentry_info *di;
853
854 BUG_ON(!inode);
855
856 di = ceph_dentry(dn);
857
858 spin_lock(&inode->i_lock);
859 di->offset = ceph_inode(inode)->i_max_offset++;
860 spin_unlock(&inode->i_lock);
861
862 spin_lock(&dcache_lock);
863 spin_lock(&dn->d_lock);
864 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
865 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
866 dn->d_u.d_child.prev, dn->d_u.d_child.next);
867 spin_unlock(&dn->d_lock);
868 spin_unlock(&dcache_lock);
869}
870
871/*
872 * Incorporate results into the local cache. This is either just
873 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
874 * after a lookup).
875 *
876 * A reply may contain
877 * a directory inode along with a dentry.
878 * and/or a target inode
879 *
880 * Called with snap_rwsem (read).
881 */
882int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
883 struct ceph_mds_session *session)
884{
885 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
886 struct inode *in = NULL;
887 struct ceph_mds_reply_inode *ininfo;
888 struct ceph_vino vino;
889 int i = 0;
890 int err = 0;
891
892 dout("fill_trace %p is_dentry %d is_target %d\n", req,
893 rinfo->head->is_dentry, rinfo->head->is_target);
894
895#if 0
896 /*
897 * Debugging hook:
898 *
899 * If we resend completed ops to a recovering mds, we get no
900 * trace. Since that is very rare, pretend this is the case
901 * to ensure the 'no trace' handlers in the callers behave.
902 *
903 * Fill in inodes unconditionally to avoid breaking cap
904 * invariants.
905 */
906 if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
907 pr_info("fill_trace faking empty trace on %lld %s\n",
908 req->r_tid, ceph_mds_op_name(rinfo->head->op));
909 if (rinfo->head->is_dentry) {
910 rinfo->head->is_dentry = 0;
911 err = fill_inode(req->r_locked_dir,
912 &rinfo->diri, rinfo->dirfrag,
913 session, req->r_request_started, -1);
914 }
915 if (rinfo->head->is_target) {
916 rinfo->head->is_target = 0;
917 ininfo = rinfo->targeti.in;
918 vino.ino = le64_to_cpu(ininfo->ino);
919 vino.snap = le64_to_cpu(ininfo->snapid);
920 in = ceph_get_inode(sb, vino);
921 err = fill_inode(in, &rinfo->targeti, NULL,
922 session, req->r_request_started,
923 req->r_fmode);
924 iput(in);
925 }
926 }
927#endif
928
929 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
930 dout("fill_trace reply is empty!\n");
931 if (rinfo->head->result == 0 && req->r_locked_dir) {
932 struct ceph_inode_info *ci =
933 ceph_inode(req->r_locked_dir);
934 dout(" clearing %p complete (empty trace)\n",
935 req->r_locked_dir);
936 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
937 ci->i_release_count++;
938 }
939 return 0;
940 }
941
942 if (rinfo->head->is_dentry) {
943 struct inode *dir = req->r_locked_dir;
944
945 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
946 session, req->r_request_started, -1,
947 &req->r_caps_reservation);
948 if (err < 0)
949 return err;
950 }
951
952 if (rinfo->head->is_dentry && !req->r_aborted) {
953 /*
954 * lookup link rename : null -> possibly existing inode
955 * mknod symlink mkdir : null -> new inode
956 * unlink : linked -> null
957 */
958 struct inode *dir = req->r_locked_dir;
959 struct dentry *dn = req->r_dentry;
960 bool have_dir_cap, have_lease;
961
962 BUG_ON(!dn);
963 BUG_ON(!dir);
964 BUG_ON(dn->d_parent->d_inode != dir);
965 BUG_ON(ceph_ino(dir) !=
966 le64_to_cpu(rinfo->diri.in->ino));
967 BUG_ON(ceph_snap(dir) !=
968 le64_to_cpu(rinfo->diri.in->snapid));
969
970 /* do we have a lease on the whole dir? */
971 have_dir_cap =
972 (le32_to_cpu(rinfo->diri.in->cap.caps) &
973 CEPH_CAP_FILE_SHARED);
974
975 /* do we have a dn lease? */
976 have_lease = have_dir_cap ||
977 (le16_to_cpu(rinfo->dlease->mask) &
978 CEPH_LOCK_DN);
979
980 if (!have_lease)
981 dout("fill_trace no dentry lease or dir cap\n");
982
983 /* rename? */
984 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
985 dout(" src %p '%.*s' dst %p '%.*s'\n",
986 req->r_old_dentry,
987 req->r_old_dentry->d_name.len,
988 req->r_old_dentry->d_name.name,
989 dn, dn->d_name.len, dn->d_name.name);
990 dout("fill_trace doing d_move %p -> %p\n",
991 req->r_old_dentry, dn);
992 d_move(req->r_old_dentry, dn);
993 dout(" src %p '%.*s' dst %p '%.*s'\n",
994 req->r_old_dentry,
995 req->r_old_dentry->d_name.len,
996 req->r_old_dentry->d_name.name,
997 dn, dn->d_name.len, dn->d_name.name);
998 /* ensure target dentry is invalidated, despite
999 rehashing bug in vfs_rename_dir */
1000 dn->d_time = jiffies;
1001 ceph_dentry(dn)->lease_shared_gen = 0;
1002 /* take overwritten dentry's readdir offset */
1003 ceph_dentry(req->r_old_dentry)->offset =
1004 ceph_dentry(dn)->offset;
1005 dn = req->r_old_dentry; /* use old_dentry */
1006 in = dn->d_inode;
1007 }
1008
1009 /* null dentry? */
1010 if (!rinfo->head->is_target) {
1011 dout("fill_trace null dentry\n");
1012 if (dn->d_inode) {
1013 dout("d_delete %p\n", dn);
1014 d_delete(dn);
1015 } else {
1016 dout("d_instantiate %p NULL\n", dn);
1017 d_instantiate(dn, NULL);
1018 if (have_lease && d_unhashed(dn))
1019 d_rehash(dn);
1020 update_dentry_lease(dn, rinfo->dlease,
1021 session,
1022 req->r_request_started);
1023 }
1024 goto done;
1025 }
1026
1027 /* attach proper inode */
1028 ininfo = rinfo->targeti.in;
1029 vino.ino = le64_to_cpu(ininfo->ino);
1030 vino.snap = le64_to_cpu(ininfo->snapid);
1031 if (!dn->d_inode) {
1032 in = ceph_get_inode(sb, vino);
1033 if (IS_ERR(in)) {
1034 pr_err("fill_trace bad get_inode "
1035 "%llx.%llx\n", vino.ino, vino.snap);
1036 err = PTR_ERR(in);
1037 d_delete(dn);
1038 goto done;
1039 }
1040 dn = splice_dentry(dn, in, &have_lease);
1041 if (IS_ERR(dn)) {
1042 err = PTR_ERR(dn);
1043 goto done;
1044 }
1045 req->r_dentry = dn; /* may have spliced */
1046 ceph_set_dentry_offset(dn);
1047 igrab(in);
1048 } else if (ceph_ino(in) == vino.ino &&
1049 ceph_snap(in) == vino.snap) {
1050 igrab(in);
1051 } else {
1052 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1053 dn, in, ceph_ino(in), ceph_snap(in),
1054 vino.ino, vino.snap);
1055 have_lease = false;
1056 in = NULL;
1057 }
1058
1059 if (have_lease)
1060 update_dentry_lease(dn, rinfo->dlease, session,
1061 req->r_request_started);
1062 dout(" final dn %p\n", dn);
1063 i++;
1064 } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1065 req->r_op == CEPH_MDS_OP_MKSNAP) {
1066 struct dentry *dn = req->r_dentry;
1067
1068 /* fill out a snapdir LOOKUPSNAP dentry */
1069 BUG_ON(!dn);
1070 BUG_ON(!req->r_locked_dir);
1071 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
1072 ininfo = rinfo->targeti.in;
1073 vino.ino = le64_to_cpu(ininfo->ino);
1074 vino.snap = le64_to_cpu(ininfo->snapid);
1075 in = ceph_get_inode(sb, vino);
1076 if (IS_ERR(in)) {
1077			pr_err("fill_trace get_inode badness %llx.%llx\n",
1078 vino.ino, vino.snap);
1079 err = PTR_ERR(in);
1080 d_delete(dn);
1081 goto done;
1082 }
1083 dout(" linking snapped dir %p to dn %p\n", in, dn);
1084 dn = splice_dentry(dn, in, NULL);
1085 if (IS_ERR(dn)) {
1086 err = PTR_ERR(dn);
1087 goto done;
1088 }
1089 ceph_set_dentry_offset(dn);
1090 req->r_dentry = dn; /* may have spliced */
1091 igrab(in);
1092 rinfo->head->is_dentry = 1; /* fool notrace handlers */
1093 }
1094
1095 if (rinfo->head->is_target) {
1096 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1097 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1098
1099 if (in == NULL || ceph_ino(in) != vino.ino ||
1100 ceph_snap(in) != vino.snap) {
1101 in = ceph_get_inode(sb, vino);
1102 if (IS_ERR(in)) {
1103 err = PTR_ERR(in);
1104 goto done;
1105 }
1106 }
1107 req->r_target_inode = in;
1108
1109 err = fill_inode(in,
1110 &rinfo->targeti, NULL,
1111 session, req->r_request_started,
1112 (le32_to_cpu(rinfo->head->result) == 0) ?
1113 req->r_fmode : -1,
1114 &req->r_caps_reservation);
1115 if (err < 0) {
1116 pr_err("fill_inode badness %p %llx.%llx\n",
1117 in, ceph_vinop(in));
1118 goto done;
1119 }
1120 }
1121
1122done:
1123 dout("fill_trace done err=%d\n", err);
1124 return err;
1125}
1126
1127/*
1128 * Prepopulate our cache with readdir results, leases, etc.
1129 */
1130int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1131 struct ceph_mds_session *session)
1132{
1133 struct dentry *parent = req->r_dentry;
1134 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1135 struct qstr dname;
1136 struct dentry *dn;
1137 struct inode *in;
1138 int err = 0, i;
1139 struct inode *snapdir = NULL;
1140 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1141 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1142 struct ceph_dentry_info *di;
1143
1144 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1145 snapdir = ceph_get_snapdir(parent->d_inode);
1146 parent = d_find_alias(snapdir);
1147 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1148 rinfo->dir_nr, parent);
1149 } else {
1150 dout("readdir_prepopulate %d items under dn %p\n",
1151 rinfo->dir_nr, parent);
1152 if (rinfo->dir_dir)
1153 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1154 }
1155
1156 for (i = 0; i < rinfo->dir_nr; i++) {
1157 struct ceph_vino vino;
1158
1159 dname.name = rinfo->dir_dname[i];
1160 dname.len = rinfo->dir_dname_len[i];
1161 dname.hash = full_name_hash(dname.name, dname.len);
1162
1163 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1164 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1165
1166retry_lookup:
1167 dn = d_lookup(parent, &dname);
1168 dout("d_lookup on parent=%p name=%.*s got %p\n",
1169 parent, dname.len, dname.name, dn);
1170
1171 if (!dn) {
1172 dn = d_alloc(parent, &dname);
1173 dout("d_alloc %p '%.*s' = %p\n", parent,
1174 dname.len, dname.name, dn);
1175 if (dn == NULL) {
1176 dout("d_alloc badness\n");
1177 err = -ENOMEM;
1178 goto out;
1179 }
1180 err = ceph_init_dentry(dn);
1181 if (err < 0)
1182 goto out;
1183 } else if (dn->d_inode &&
1184 (ceph_ino(dn->d_inode) != vino.ino ||
1185 ceph_snap(dn->d_inode) != vino.snap)) {
1186 dout(" dn %p points to wrong inode %p\n",
1187 dn, dn->d_inode);
1188 d_delete(dn);
1189 dput(dn);
1190 goto retry_lookup;
1191 } else {
1192 /* reorder parent's d_subdirs */
1193 spin_lock(&dcache_lock);
1194 spin_lock(&dn->d_lock);
1195 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1196 spin_unlock(&dn->d_lock);
1197 spin_unlock(&dcache_lock);
1198 }
1199
1200 di = dn->d_fsdata;
1201 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1202
1203 /* inode */
1204 if (dn->d_inode) {
1205 in = dn->d_inode;
1206 } else {
1207 in = ceph_get_inode(parent->d_sb, vino);
1208			if (IS_ERR(in)) {
1209				dout("new_inode badness\n");
1210				d_delete(dn);
1211				dput(dn);
1212				err = PTR_ERR(in);
1213 goto out;
1214 }
1215 dn = splice_dentry(dn, in, NULL);
1216 }
1217
1218 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1219 req->r_request_started, -1,
1220 &req->r_caps_reservation) < 0) {
1221 pr_err("fill_inode badness on %p\n", in);
1222 dput(dn);
1223 continue;
1224 }
1225 update_dentry_lease(dn, rinfo->dir_dlease[i],
1226 req->r_session, req->r_request_started);
1227 dput(dn);
1228 }
1229 req->r_did_prepopulate = true;
1230
1231out:
1232 if (snapdir) {
1233 iput(snapdir);
1234 dput(parent);
1235 }
1236 dout("readdir_prepopulate done\n");
1237 return err;
1238}
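
/*
 * Note on di->offset above: ceph_make_fpos() (see super.h) packs the
 * readdir position into a single loff_t, roughly
 * ((loff_t)frag << 32) | off, so positions within one frag stay
 * ordered and distinct frags never collide.
 */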
1239
1240int ceph_inode_set_size(struct inode *inode, loff_t size)
1241{
1242 struct ceph_inode_info *ci = ceph_inode(inode);
1243 int ret = 0;
1244
1245 spin_lock(&inode->i_lock);
1246 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1247 inode->i_size = size;
1248 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1249
1250 /* tell the MDS if we are approaching max_size */
1251 if ((size << 1) >= ci->i_max_size &&
1252 (ci->i_reported_size << 1) < ci->i_max_size)
1253 ret = 1;
1254
1255 spin_unlock(&inode->i_lock);
1256 return ret;
1257}
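
/*
 * Worked example of the check above: with i_max_size = 4MB, the
 * (size << 1) >= i_max_size test fires once size reaches 2MB, but
 * only while the last reported size is still below that halfway
 * mark -- so we ask the MDS about max_size exactly once per crossing.
 */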
1258
1259/*
1260 * Write back inode data in a worker thread. (This can't be done
1261 * in the message handler context.)
1262 */
1263void ceph_queue_writeback(struct inode *inode)
1264{
1265 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1266 &ceph_inode(inode)->i_wb_work)) {
1267 dout("ceph_queue_writeback %p\n", inode);
1268 igrab(inode);
1269 } else {
1270 dout("ceph_queue_writeback %p failed\n", inode);
1271 }
1272}
1273
1274static void ceph_writeback_work(struct work_struct *work)
1275{
1276 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1277 i_wb_work);
1278 struct inode *inode = &ci->vfs_inode;
1279
1280 dout("writeback %p\n", inode);
1281 filemap_fdatawrite(&inode->i_data);
1282 iput(inode);
1283}
1284
1285/*
1286 * queue an async invalidation
1287 */
1288void ceph_queue_invalidate(struct inode *inode)
1289{
1290 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1291 &ceph_inode(inode)->i_pg_inv_work)) {
1292 dout("ceph_queue_invalidate %p\n", inode);
1293 igrab(inode);
1294 } else {
1295 dout("ceph_queue_invalidate %p failed\n", inode);
1296 }
1297}
1298
1299/*
1300 * invalidate any pages that are not dirty or under writeback. this
1301 * includes pages that are clean and mapped.
1302 */
1303static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1304{
1305 struct pagevec pvec;
1306 pgoff_t next = 0;
1307 int i;
1308
1309 pagevec_init(&pvec, 0);
1310 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1311 for (i = 0; i < pagevec_count(&pvec); i++) {
1312 struct page *page = pvec.pages[i];
1313 pgoff_t index;
1314 int skip_page =
1315 (PageDirty(page) || PageWriteback(page));
1316
1317 if (!skip_page)
1318 skip_page = !trylock_page(page);
1319
1320 /*
1321 * We really shouldn't be looking at the ->index of an
1322 * unlocked page. But we're not allowed to lock these
1323 * pages. So we rely upon nobody altering the ->index
1324 * of this (pinned-by-us) page.
1325 */
1326 index = page->index;
1327 if (index > next)
1328 next = index;
1329 next++;
1330
1331 if (skip_page)
1332 continue;
1333
1334 generic_error_remove_page(mapping, page);
1335 unlock_page(page);
1336 }
1337 pagevec_release(&pvec);
1338 cond_resched();
1339 }
1340}
1341
1342/*
1343 * Invalidate inode pages in a worker thread. (This can't be done
1344 * in the message handler context.)
1345 */
1346static void ceph_invalidate_work(struct work_struct *work)
1347{
1348 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1349 i_pg_inv_work);
1350 struct inode *inode = &ci->vfs_inode;
1351 u32 orig_gen;
1352 int check = 0;
1353
1354 spin_lock(&inode->i_lock);
1355 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1356 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1357 if (ci->i_rdcache_gen == 0 ||
1358 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1359 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1360 /* nevermind! */
1361 ci->i_rdcache_revoking = 0;
1362 spin_unlock(&inode->i_lock);
1363 goto out;
1364 }
1365 orig_gen = ci->i_rdcache_gen;
1366 spin_unlock(&inode->i_lock);
1367
1368 ceph_invalidate_nondirty_pages(inode->i_mapping);
1369
1370 spin_lock(&inode->i_lock);
1371 if (orig_gen == ci->i_rdcache_gen) {
1372 dout("invalidate_pages %p gen %d successful\n", inode,
1373 ci->i_rdcache_gen);
1374 ci->i_rdcache_gen = 0;
1375 ci->i_rdcache_revoking = 0;
1376 check = 1;
1377 } else {
1378 dout("invalidate_pages %p gen %d raced, gen now %d\n",
1379 inode, orig_gen, ci->i_rdcache_gen);
1380 }
1381 spin_unlock(&inode->i_lock);
1382
1383 if (check)
1384 ceph_check_caps(ci, 0, NULL);
1385out:
1386 iput(inode);
1387}
1388
1389
1390/*
1391 * called by trunc_wq; take i_mutex ourselves
1392 *
1393 * We also truncate in a separate thread.
1394 */
1395static void ceph_vmtruncate_work(struct work_struct *work)
1396{
1397 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1398 i_vmtruncate_work);
1399 struct inode *inode = &ci->vfs_inode;
1400
1401 dout("vmtruncate_work %p\n", inode);
1402 mutex_lock(&inode->i_mutex);
1403 __ceph_do_pending_vmtruncate(inode);
1404 mutex_unlock(&inode->i_mutex);
1405 iput(inode);
1406}
1407
1408/*
1409 * Queue an async vmtruncate. If we fail to queue work, we will handle
1410 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1411 */
1412void ceph_queue_vmtruncate(struct inode *inode)
1413{
1414 struct ceph_inode_info *ci = ceph_inode(inode);
1415
1416 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1417 &ci->i_vmtruncate_work)) {
1418 dout("ceph_queue_vmtruncate %p\n", inode);
1419 igrab(inode);
1420 } else {
1421 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1422 inode, ci->i_truncate_pending);
1423 }
1424}
1425
1426/*
1427 * called with i_mutex held.
1428 *
1429 * Make sure any pending truncation is applied before doing anything
1430 * that may depend on it.
1431 */
1432void __ceph_do_pending_vmtruncate(struct inode *inode)
1433{
1434 struct ceph_inode_info *ci = ceph_inode(inode);
1435 u64 to;
1436 int wrbuffer_refs, wake = 0;
1437
1438retry:
1439 spin_lock(&inode->i_lock);
1440 if (ci->i_truncate_pending == 0) {
1441 dout("__do_pending_vmtruncate %p none pending\n", inode);
1442 spin_unlock(&inode->i_lock);
1443 return;
1444 }
1445
1446 /*
1447 * make sure any dirty snapped pages are flushed before we
1448 * possibly truncate them.. so write AND block!
1449 */
1450 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1451 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1452 inode);
1453 spin_unlock(&inode->i_lock);
1454 filemap_write_and_wait_range(&inode->i_data, 0,
1455 inode->i_sb->s_maxbytes);
1456 goto retry;
1457 }
1458
1459 to = ci->i_truncate_size;
1460 wrbuffer_refs = ci->i_wrbuffer_ref;
1461 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1462 ci->i_truncate_pending, to);
1463 spin_unlock(&inode->i_lock);
1464
1465 truncate_inode_pages(inode->i_mapping, to);
1466
1467 spin_lock(&inode->i_lock);
1468 ci->i_truncate_pending--;
1469 if (ci->i_truncate_pending == 0)
1470 wake = 1;
1471 spin_unlock(&inode->i_lock);
1472
1473 if (wrbuffer_refs == 0)
1474 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1475 if (wake)
1476 wake_up(&ci->i_cap_wq);
1477}
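
/*
 * Usage sketch: the caller must already hold i_mutex, as the VFS does
 * when it invokes ceph_setattr (below), which flushes any pending
 * truncation on entry and again after talking to the MDS:
 *
 *	mutex_lock(&inode->i_mutex);
 *	__ceph_do_pending_vmtruncate(inode);
 *	mutex_unlock(&inode->i_mutex);
 */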
1478
1479
1480/*
1481 * symlinks
1482 */
1483static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1484{
1485 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1486 nd_set_link(nd, ci->i_symlink);
1487 return NULL;
1488}
1489
1490static const struct inode_operations ceph_symlink_iops = {
1491 .readlink = generic_readlink,
1492 .follow_link = ceph_sym_follow_link,
1493};
1494
1495/*
1496 * setattr
1497 */
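/*
 * For each attribute below: if we hold the relevant EXCL cap, apply
 * the change locally and mark that cap dirty; otherwise encode the
 * change into a SETATTR request for the MDS and release the
 * corresponding SHARED caps.
 */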
1498int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1499{
1500 struct inode *inode = dentry->d_inode;
1501 struct ceph_inode_info *ci = ceph_inode(inode);
1502 struct inode *parent_inode = dentry->d_parent->d_inode;
1503 const unsigned int ia_valid = attr->ia_valid;
1504 struct ceph_mds_request *req;
1505 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
1506 int issued;
1507 int release = 0, dirtied = 0;
1508 int mask = 0;
1509 int err = 0;
1510
1511 if (ceph_snap(inode) != CEPH_NOSNAP)
1512 return -EROFS;
1513
1514 __ceph_do_pending_vmtruncate(inode);
1515
1516 err = inode_change_ok(inode, attr);
1517 if (err != 0)
1518 return err;
1519
1520 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
1521 USE_AUTH_MDS);
1522 if (IS_ERR(req))
1523 return PTR_ERR(req);
1524
1525 spin_lock(&inode->i_lock);
1526 issued = __ceph_caps_issued(ci, NULL);
1527 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1528
1529 if (ia_valid & ATTR_UID) {
1530 dout("setattr %p uid %d -> %d\n", inode,
1531 inode->i_uid, attr->ia_uid);
1532 if (issued & CEPH_CAP_AUTH_EXCL) {
1533 inode->i_uid = attr->ia_uid;
1534 dirtied |= CEPH_CAP_AUTH_EXCL;
1535 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1536 attr->ia_uid != inode->i_uid) {
1537 req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
1538 mask |= CEPH_SETATTR_UID;
1539 release |= CEPH_CAP_AUTH_SHARED;
1540 }
1541 }
1542 if (ia_valid & ATTR_GID) {
1543 dout("setattr %p gid %d -> %d\n", inode,
1544 inode->i_gid, attr->ia_gid);
1545 if (issued & CEPH_CAP_AUTH_EXCL) {
1546 inode->i_gid = attr->ia_gid;
1547 dirtied |= CEPH_CAP_AUTH_EXCL;
1548 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1549 attr->ia_gid != inode->i_gid) {
1550 req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
1551 mask |= CEPH_SETATTR_GID;
1552 release |= CEPH_CAP_AUTH_SHARED;
1553 }
1554 }
1555 if (ia_valid & ATTR_MODE) {
1556 dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
1557 attr->ia_mode);
1558 if (issued & CEPH_CAP_AUTH_EXCL) {
1559 inode->i_mode = attr->ia_mode;
1560 dirtied |= CEPH_CAP_AUTH_EXCL;
1561 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1562 attr->ia_mode != inode->i_mode) {
1563 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1564 mask |= CEPH_SETATTR_MODE;
1565 release |= CEPH_CAP_AUTH_SHARED;
1566 }
1567 }
1568
1569 if (ia_valid & ATTR_ATIME) {
1570 dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
1571 inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
1572 attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
1573 if (issued & CEPH_CAP_FILE_EXCL) {
1574 ci->i_time_warp_seq++;
1575 inode->i_atime = attr->ia_atime;
1576 dirtied |= CEPH_CAP_FILE_EXCL;
1577 } else if ((issued & CEPH_CAP_FILE_WR) &&
1578 timespec_compare(&inode->i_atime,
1579 &attr->ia_atime) < 0) {
1580 inode->i_atime = attr->ia_atime;
1581 dirtied |= CEPH_CAP_FILE_WR;
1582 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1583 !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
1584 ceph_encode_timespec(&req->r_args.setattr.atime,
1585 &attr->ia_atime);
1586 mask |= CEPH_SETATTR_ATIME;
1587 release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
1588 CEPH_CAP_FILE_WR;
1589 }
1590 }
1591 if (ia_valid & ATTR_MTIME) {
1592 dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
1593 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
1594 attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
1595 if (issued & CEPH_CAP_FILE_EXCL) {
1596 ci->i_time_warp_seq++;
1597 inode->i_mtime = attr->ia_mtime;
1598 dirtied |= CEPH_CAP_FILE_EXCL;
1599 } else if ((issued & CEPH_CAP_FILE_WR) &&
1600 timespec_compare(&inode->i_mtime,
1601 &attr->ia_mtime) < 0) {
1602 inode->i_mtime = attr->ia_mtime;
1603 dirtied |= CEPH_CAP_FILE_WR;
1604 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1605 !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
1606 ceph_encode_timespec(&req->r_args.setattr.mtime,
1607 &attr->ia_mtime);
1608 mask |= CEPH_SETATTR_MTIME;
1609 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1610 CEPH_CAP_FILE_WR;
1611 }
1612 }
1613 if (ia_valid & ATTR_SIZE) {
1614 dout("setattr %p size %lld -> %lld\n", inode,
1615 inode->i_size, attr->ia_size);
1616 if (attr->ia_size > inode->i_sb->s_maxbytes) {
1617 err = -EINVAL;
1618 goto out;
1619 }
1620 if ((issued & CEPH_CAP_FILE_EXCL) &&
1621 attr->ia_size > inode->i_size) {
1622 inode->i_size = attr->ia_size;
1623 inode->i_blocks =
1624 (attr->ia_size + (1 << 9) - 1) >> 9;
1625 inode->i_ctime = attr->ia_ctime;
1626 ci->i_reported_size = attr->ia_size;
1627 dirtied |= CEPH_CAP_FILE_EXCL;
1628 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1629 attr->ia_size != inode->i_size) {
1630 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
1631 req->r_args.setattr.old_size =
1632 cpu_to_le64(inode->i_size);
1633 mask |= CEPH_SETATTR_SIZE;
1634 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1635 CEPH_CAP_FILE_WR;
1636 }
1637 }
1638
1639 /* these do nothing */
1640 if (ia_valid & ATTR_CTIME) {
1641 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
1642 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
1643 dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
1644 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
1645 attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
1646 only ? "ctime only" : "ignored");
1647 inode->i_ctime = attr->ia_ctime;
1648 if (only) {
1649 /*
1650			 * if the kernel wants to dirty ctime but nothing else,
1651			 * we need to choose a cap to dirty under, or do
1652			 * an almost-no-op setattr
1653 */
1654 if (issued & CEPH_CAP_AUTH_EXCL)
1655 dirtied |= CEPH_CAP_AUTH_EXCL;
1656 else if (issued & CEPH_CAP_FILE_EXCL)
1657 dirtied |= CEPH_CAP_FILE_EXCL;
1658 else if (issued & CEPH_CAP_XATTR_EXCL)
1659 dirtied |= CEPH_CAP_XATTR_EXCL;
1660 else
1661 mask |= CEPH_SETATTR_CTIME;
1662 }
1663 }
1664 if (ia_valid & ATTR_FILE)
1665 dout("setattr %p ATTR_FILE ... hrm!\n", inode);
1666
1667 if (dirtied) {
1668 __ceph_mark_dirty_caps(ci, dirtied);
1669 inode->i_ctime = CURRENT_TIME;
1670 }
1671
1672 release &= issued;
1673 spin_unlock(&inode->i_lock);
1674
1675 if (mask) {
1676 req->r_inode = igrab(inode);
1677 req->r_inode_drop = release;
1678 req->r_args.setattr.mask = cpu_to_le32(mask);
1679 req->r_num_caps = 1;
1680 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1681 }
1682 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1683 ceph_cap_string(dirtied), mask);
1684
1685 ceph_mdsc_put_request(req);
1686 __ceph_do_pending_vmtruncate(inode);
1687 return err;
1688out:
1689 spin_unlock(&inode->i_lock);
1690 ceph_mdsc_put_request(req);
1691 return err;
1692}
1693
1694/*
1695 * Verify that we have a lease on the given mask. If not,
1696 * do a getattr against an mds.
1697 */
1698int ceph_do_getattr(struct inode *inode, int mask)
1699{
1700 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1701 struct ceph_mds_client *mdsc = &client->mdsc;
1702 struct ceph_mds_request *req;
1703 int err;
1704
1705 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1706 dout("do_getattr inode %p SNAPDIR\n", inode);
1707 return 0;
1708 }
1709
1710 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1711 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1712 return 0;
1713
1714 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1715 if (IS_ERR(req))
1716 return PTR_ERR(req);
1717 req->r_inode = igrab(inode);
1718 req->r_num_caps = 1;
1719 req->r_args.getattr.mask = cpu_to_le32(mask);
1720 err = ceph_mdsc_do_request(mdsc, NULL, req);
1721 ceph_mdsc_put_request(req);
1722 dout("do_getattr result=%d\n", err);
1723 return err;
1724}
1725
1726
1727/*
1728 * Check inode permissions. We verify we have a valid value for
1729 * the AUTH cap, then call the generic handler.
1730 */
1731int ceph_permission(struct inode *inode, int mask)
1732{
1733 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1734
1735 if (!err)
1736 err = generic_permission(inode, mask, NULL);
1737 return err;
1738}
1739
1740/*
1741 * Get all attributes. Hopefully someday we'll have a statlite()
1742 * and can limit the fields we require to be accurate.
1743 */
1744int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1745 struct kstat *stat)
1746{
1747 struct inode *inode = dentry->d_inode;
1748 struct ceph_inode_info *ci = ceph_inode(inode);
1749 int err;
1750
1751 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1752 if (!err) {
1753 generic_fillattr(inode, stat);
1754 stat->ino = inode->i_ino;
1755 if (ceph_snap(inode) != CEPH_NOSNAP)
1756 stat->dev = ceph_snap(inode);
1757 else
1758 stat->dev = 0;
1759 if (S_ISDIR(inode->i_mode)) {
1760 stat->size = ci->i_rbytes;
1761 stat->blocks = 0;
1762 stat->blksize = 65536;
1763 }
1764 }
1765 return err;
1766}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
35
36static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{
38 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l;
43 int err, i;
44
45 /* copy and validate */
46 if (copy_from_user(&l, arg, sizeof(l)))
47 return -EFAULT;
48
49 if ((l.object_size & ~PAGE_MASK) ||
50 (l.stripe_unit & ~PAGE_MASK) ||
51 !l.stripe_unit ||
52 (l.object_size &&
53 (unsigned)l.object_size % (unsigned)l.stripe_unit))
54 return -EINVAL;
55
56 /* make sure it's a valid data pool */
57 if (l.data_pool > 0) {
58 mutex_lock(&mdsc->mutex);
59 err = -EINVAL;
60 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
61 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
62 err = 0;
63 break;
64 }
65 mutex_unlock(&mdsc->mutex);
66 if (err)
67 return err;
68 }
69
70 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
71 USE_AUTH_MDS);
72 if (IS_ERR(req))
73 return PTR_ERR(req);
74 req->r_inode = igrab(inode);
75 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
76
77 req->r_args.setlayout.layout.fl_stripe_unit =
78 cpu_to_le32(l.stripe_unit);
79 req->r_args.setlayout.layout.fl_stripe_count =
80 cpu_to_le32(l.stripe_count);
81 req->r_args.setlayout.layout.fl_object_size =
82 cpu_to_le32(l.object_size);
83 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
84 req->r_args.setlayout.layout.fl_pg_preferred =
85 cpu_to_le32(l.preferred_osd);
86
87 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
88 ceph_mdsc_put_request(req);
89 return err;
90}
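
/*
 * Example of the validation above (assuming 4K pages): object_size
 * and stripe_unit must be page multiples, stripe_unit nonzero, and a
 * nonzero object_size a multiple of stripe_unit -- so object_size=4MB
 * with stripe_unit=1MB is accepted, stripe_unit=1MB+512 is not.
 */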
91
92/*
93 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset.
95 */
96static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
97{
98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
102 u64 len = 1, olen;
103 u64 tmp;
104 struct ceph_object_layout ol;
105 struct ceph_pg pgid;
106
107 /* copy and validate */
108 if (copy_from_user(&dl, arg, sizeof(dl)))
109 return -EFAULT;
110
111 down_read(&osdc->map_sem);
112 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
113 &dl.object_no, &dl.object_offset, &olen);
114 dl.file_offset -= dl.object_offset;
115 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
116 dl.block_size = ceph_file_layout_su(ci->i_layout);
117
118 /* block_offset = object_offset % block_size */
119 tmp = dl.object_offset;
120 dl.block_offset = do_div(tmp, dl.block_size);
121
122 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
123 ceph_ino(inode), dl.object_no);
124 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
125 osdc->osdmap);
126
127 pgid = ol.ol_pgid;
128 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
129 if (dl.osd >= 0) {
130 struct ceph_entity_addr *a =
131 ceph_osd_addr(osdc->osdmap, dl.osd);
132 if (a)
133 memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
134 } else {
135 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
136 }
137 up_read(&osdc->map_sem);
138
139 /* send result back to user */
140 if (copy_to_user(arg, &dl, sizeof(dl)))
141 return -EFAULT;
142
143 return 0;
144}
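
/*
 * Worked example (assuming the simple default layout, stripe_count=1,
 * stripe_unit == object_size == 4MB): file_offset 5MB maps to
 * object_no 1 at object_offset 1MB; the adjustment above then sets
 * dl.file_offset to 4MB (the start of that object) and
 * dl.block_offset to 1MB % 4MB = 1MB.
 */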
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
1#ifndef FS_CEPH_IOCTL_H
2#define FS_CEPH_IOCTL_H
3
4#include <linux/ioctl.h>
5#include <linux/types.h>
6
7#define CEPH_IOCTL_MAGIC 0x97
8
9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout {
11 __u64 stripe_unit, stripe_count, object_size;
12 __u64 data_pool;
13 __s64 preferred_osd;
14};
15
16#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout)
20
21/*
22 * Extract the object name, and the identity and address of the OSD,
23 * for a given file offset.
24 */
25struct ceph_ioctl_dataloc {
26 __u64 file_offset; /* in+out: file offset */
27 __u64 object_offset; /* out: offset in object */
28 __u64 object_no; /* out: object # */
29 __u64 object_size; /* out: object size */
30 char object_name[64]; /* out: object name */
31 __u64 block_offset; /* out: offset in block */
32 __u64 block_size; /* out: block length */
33 __s64 osd; /* out: osd # */
34 struct sockaddr_storage osd_addr; /* out: osd address */
35};
36
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc)
39
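/*
 * Hypothetical userspace sketch (not part of this interface): fetch
 * the layout of an open file on a mounted ceph filesystem.
 *
 *	struct ceph_ioctl_layout l;
 *	if (ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) == 0)
 *		printf("stripe_unit %llu object_size %llu\n",
 *		       (unsigned long long)l.stripe_unit,
 *		       (unsigned long long)l.object_size);
 */
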
40#endif
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..60a9a4ae47be
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3043 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/slab.h>
5#include <linux/sched.h>
6
7#include "mds_client.h"
8#include "mon_client.h"
9#include "super.h"
10#include "messenger.h"
11#include "decode.h"
12#include "auth.h"
13#include "pagelist.h"
14
15/*
16 * A cluster of MDS (metadata server) daemons is responsible for
17 * managing the file system namespace (the directory hierarchy and
18 * inodes) and for coordinating shared access to storage. Metadata is
19 * partitioned hierarchically across a number of servers, and that
20 * partition varies over time as the cluster adjusts the distribution
21 * in order to balance load.
22 *
23 * The MDS client is primarily responsible for managing synchronous
24 * metadata requests for operations like open, unlink, and so forth.
25 * If there is an MDS failure, we find out about it when we (possibly
26 * request and) receive a new MDS map, and can resubmit affected
27 * requests.
28 *
29 * For the most part, though, we take advantage of a lossless
30 * communications channel to the MDS, and do not need to worry about
31 * timing out or resubmitting requests.
32 *
33 * We maintain a stateful "session" with each MDS we interact with.
34 * Within each session, we send periodic heartbeat messages to ensure
35 * any capabilities or leases we have been issued remain valid. If
36 * the session times out and goes stale, our leases and capabilities
37 * are no longer valid.
38 */
39
40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head);
42
43static const struct ceph_connection_operations mds_con_ops;
44
45
46/*
47 * mds reply parsing
48 */
49
50/*
51 * parse individual inode info
52 */
53static int parse_reply_info_in(void **p, void *end,
54 struct ceph_mds_reply_info_in *info)
55{
56 int err = -EIO;
57
58 info->in = *p;
59 *p += sizeof(struct ceph_mds_reply_inode) +
60 sizeof(*info->in->fragtree.splits) *
61 le32_to_cpu(info->in->fragtree.nsplits);
62
63 ceph_decode_32_safe(p, end, info->symlink_len, bad);
64 ceph_decode_need(p, end, info->symlink_len, bad);
65 info->symlink = *p;
66 *p += info->symlink_len;
67
68 ceph_decode_32_safe(p, end, info->xattr_len, bad);
69 ceph_decode_need(p, end, info->xattr_len, bad);
70 info->xattr_data = *p;
71 *p += info->xattr_len;
72 return 0;
73bad:
74 return err;
75}
76
77/*
78 * parse a normal reply, which may contain a (dir+)dentry and/or a
79 * target inode.
80 */
81static int parse_reply_info_trace(void **p, void *end,
82 struct ceph_mds_reply_info_parsed *info)
83{
84 int err;
85
86 if (info->head->is_dentry) {
87 err = parse_reply_info_in(p, end, &info->diri);
88 if (err < 0)
89 goto out_bad;
90
91 if (unlikely(*p + sizeof(*info->dirfrag) > end))
92 goto bad;
93 info->dirfrag = *p;
94 *p += sizeof(*info->dirfrag) +
95 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
96 if (unlikely(*p > end))
97 goto bad;
98
99 ceph_decode_32_safe(p, end, info->dname_len, bad);
100 ceph_decode_need(p, end, info->dname_len, bad);
101 info->dname = *p;
102 *p += info->dname_len;
103 info->dlease = *p;
104 *p += sizeof(*info->dlease);
105 }
106
107 if (info->head->is_target) {
108 err = parse_reply_info_in(p, end, &info->targeti);
109 if (err < 0)
110 goto out_bad;
111 }
112
113 if (unlikely(*p != end))
114 goto bad;
115 return 0;
116
117bad:
118 err = -EIO;
119out_bad:
120 pr_err("problem parsing mds trace %d\n", err);
121 return err;
122}
123
124/*
125 * parse readdir results
126 */
127static int parse_reply_info_dir(void **p, void *end,
128 struct ceph_mds_reply_info_parsed *info)
129{
130 u32 num, i = 0;
131 int err;
132
133 info->dir_dir = *p;
134 if (*p + sizeof(*info->dir_dir) > end)
135 goto bad;
136 *p += sizeof(*info->dir_dir) +
137 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
138 if (*p > end)
139 goto bad;
140
141 ceph_decode_need(p, end, sizeof(num) + 2, bad);
142 num = ceph_decode_32(p);
143 info->dir_end = ceph_decode_8(p);
144 info->dir_complete = ceph_decode_8(p);
145 if (num == 0)
146 goto done;
147
148 /* alloc large array */
149 info->dir_nr = num;
150 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
151 sizeof(*info->dir_dname) +
152 sizeof(*info->dir_dname_len) +
153 sizeof(*info->dir_dlease),
154 GFP_NOFS);
155 if (info->dir_in == NULL) {
156 err = -ENOMEM;
157 goto out_bad;
158 }
159 info->dir_dname = (void *)(info->dir_in + num);
160 info->dir_dname_len = (void *)(info->dir_dname + num);
161 info->dir_dlease = (void *)(info->dir_dname_len + num);
162
163 while (num) {
164 /* dentry */
165 ceph_decode_need(p, end, sizeof(u32)*2, bad);
166 info->dir_dname_len[i] = ceph_decode_32(p);
167 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
168 info->dir_dname[i] = *p;
169 *p += info->dir_dname_len[i];
170 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
171 info->dir_dname[i]);
172 info->dir_dlease[i] = *p;
173 *p += sizeof(struct ceph_mds_reply_lease);
174
175 /* inode */
176 err = parse_reply_info_in(p, end, &info->dir_in[i]);
177 if (err < 0)
178 goto out_bad;
179 i++;
180 num--;
181 }
182
183done:
184 if (*p != end)
185 goto bad;
186 return 0;
187
188bad:
189 err = -EIO;
190out_bad:
191 pr_err("problem parsing dir contents %d\n", err);
192 return err;
193}
194
195/*
196 * parse entire mds reply
197 */
198static int parse_reply_info(struct ceph_msg *msg,
199 struct ceph_mds_reply_info_parsed *info)
200{
201 void *p, *end;
202 u32 len;
203 int err;
204
205 info->head = msg->front.iov_base;
206 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
207 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
208
209 /* trace */
210 ceph_decode_32_safe(&p, end, len, bad);
211 if (len > 0) {
212 err = parse_reply_info_trace(&p, p+len, info);
213 if (err < 0)
214 goto out_bad;
215 }
216
217 /* dir content */
218 ceph_decode_32_safe(&p, end, len, bad);
219 if (len > 0) {
220 err = parse_reply_info_dir(&p, p+len, info);
221 if (err < 0)
222 goto out_bad;
223 }
224
225 /* snap blob */
226 ceph_decode_32_safe(&p, end, len, bad);
227 info->snapblob_len = len;
228 info->snapblob = p;
229 p += len;
230
231 if (p != end)
232 goto bad;
233 return 0;
234
235bad:
236 err = -EIO;
237out_bad:
238 pr_err("mds parse_reply err %d\n", err);
239 return err;
240}
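
/*
 * For reference, the reply layout decoded above is:
 *
 *	struct ceph_mds_reply_head
 *	u32 trace_len,    trace_len bytes    (parse_reply_info_trace)
 *	u32 dir_len,      dir_len bytes      (parse_reply_info_dir)
 *	u32 snapblob_len, snapblob_len bytes (kept as an opaque blob)
 */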
241
242static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
243{
244 kfree(info->dir_in);
245}
246
247
248/*
249 * sessions
250 */
251static const char *session_state_name(int s)
252{
253 switch (s) {
254 case CEPH_MDS_SESSION_NEW: return "new";
255 case CEPH_MDS_SESSION_OPENING: return "opening";
256 case CEPH_MDS_SESSION_OPEN: return "open";
257 case CEPH_MDS_SESSION_HUNG: return "hung";
258 case CEPH_MDS_SESSION_CLOSING: return "closing";
259 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
260 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
261 default: return "???";
262 }
263}
264
265static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
266{
267 if (atomic_inc_not_zero(&s->s_ref)) {
268 dout("mdsc get_session %p %d -> %d\n", s,
269 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
270 return s;
271 } else {
272		dout("mdsc get_session %p 0 -- FAIL\n", s);
273 return NULL;
274 }
275}
276
277void ceph_put_mds_session(struct ceph_mds_session *s)
278{
279 dout("mdsc put_session %p %d -> %d\n", s,
280 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
281 if (atomic_dec_and_test(&s->s_ref)) {
282 if (s->s_authorizer)
283 s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
284 s->s_mdsc->client->monc.auth, s->s_authorizer);
285 kfree(s);
286 }
287}
288
289/*
290 * called under mdsc->mutex
291 */
292struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
293 int mds)
294{
295 struct ceph_mds_session *session;
296
297 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
298 return NULL;
299 session = mdsc->sessions[mds];
300 dout("lookup_mds_session %p %d\n", session,
301 atomic_read(&session->s_ref));
302 get_session(session);
303 return session;
304}
305
306static bool __have_session(struct ceph_mds_client *mdsc, int mds)
307{
308 if (mds >= mdsc->max_sessions)
309 return false;
310 return mdsc->sessions[mds];
311}
312
313static int __verify_registered_session(struct ceph_mds_client *mdsc,
314 struct ceph_mds_session *s)
315{
316 if (s->s_mds >= mdsc->max_sessions ||
317 mdsc->sessions[s->s_mds] != s)
318 return -ENOENT;
319 return 0;
320}
321
322/*
323 * create+register a new session for given mds.
324 * called under mdsc->mutex.
325 */
326static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
327 int mds)
328{
329 struct ceph_mds_session *s;
330
331 s = kzalloc(sizeof(*s), GFP_NOFS);
332 if (!s)
333 return ERR_PTR(-ENOMEM);
334 s->s_mdsc = mdsc;
335 s->s_mds = mds;
336 s->s_state = CEPH_MDS_SESSION_NEW;
337 s->s_ttl = 0;
338 s->s_seq = 0;
339 mutex_init(&s->s_mutex);
340
341 ceph_con_init(mdsc->client->msgr, &s->s_con);
342 s->s_con.private = s;
343 s->s_con.ops = &mds_con_ops;
344 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
345 s->s_con.peer_name.num = cpu_to_le64(mds);
346
347 spin_lock_init(&s->s_cap_lock);
348 s->s_cap_gen = 0;
349 s->s_cap_ttl = 0;
350 s->s_renew_requested = 0;
351 s->s_renew_seq = 0;
352 INIT_LIST_HEAD(&s->s_caps);
353 s->s_nr_caps = 0;
354 s->s_trim_caps = 0;
355 atomic_set(&s->s_ref, 1);
356 INIT_LIST_HEAD(&s->s_waiting);
357 INIT_LIST_HEAD(&s->s_unsafe);
358 s->s_num_cap_releases = 0;
359 s->s_cap_iterator = NULL;
360 INIT_LIST_HEAD(&s->s_cap_releases);
361 INIT_LIST_HEAD(&s->s_cap_releases_done);
362 INIT_LIST_HEAD(&s->s_cap_flushing);
363 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
364
365 dout("register_session mds%d\n", mds);
366 if (mds >= mdsc->max_sessions) {
367 int newmax = 1 << get_count_order(mds+1);
368 struct ceph_mds_session **sa;
369
370 dout("register_session realloc to %d\n", newmax);
371 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
372 if (sa == NULL)
373 goto fail_realloc;
374 if (mdsc->sessions) {
375 memcpy(sa, mdsc->sessions,
376 mdsc->max_sessions * sizeof(void *));
377 kfree(mdsc->sessions);
378 }
379 mdsc->sessions = sa;
380 mdsc->max_sessions = newmax;
381 }
382 mdsc->sessions[mds] = s;
383 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
384
385 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
386
387 return s;
388
389fail_realloc:
390 kfree(s);
391 return ERR_PTR(-ENOMEM);
392}
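
/*
 * Example of the growth above: registering mds5 when max_sessions is
 * 4 reallocates to 1 << get_count_order(6) == 8 slots, so the table
 * always grows in powers of two.
 */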
393
394/*
395 * called under mdsc->mutex
396 */
397static void __unregister_session(struct ceph_mds_client *mdsc,
398 struct ceph_mds_session *s)
399{
400 dout("__unregister_session mds%d %p\n", s->s_mds, s);
401 BUG_ON(mdsc->sessions[s->s_mds] != s);
402 mdsc->sessions[s->s_mds] = NULL;
403 ceph_con_close(&s->s_con);
404 ceph_put_mds_session(s);
405}
406
407/*
408 * drop session refs in request.
409 *
410 * should be last request ref, or hold mdsc->mutex
411 */
412static void put_request_session(struct ceph_mds_request *req)
413{
414 if (req->r_session) {
415 ceph_put_mds_session(req->r_session);
416 req->r_session = NULL;
417 }
418}
419
420void ceph_mdsc_release_request(struct kref *kref)
421{
422 struct ceph_mds_request *req = container_of(kref,
423 struct ceph_mds_request,
424 r_kref);
425 if (req->r_request)
426 ceph_msg_put(req->r_request);
427 if (req->r_reply) {
428 ceph_msg_put(req->r_reply);
429 destroy_reply_info(&req->r_reply_info);
430 }
431 if (req->r_inode) {
432 ceph_put_cap_refs(ceph_inode(req->r_inode),
433 CEPH_CAP_PIN);
434 iput(req->r_inode);
435 }
436 if (req->r_locked_dir)
437 ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
438 CEPH_CAP_PIN);
439 if (req->r_target_inode)
440 iput(req->r_target_inode);
441 if (req->r_dentry)
442 dput(req->r_dentry);
443 if (req->r_old_dentry) {
444 ceph_put_cap_refs(
445 ceph_inode(req->r_old_dentry->d_parent->d_inode),
446 CEPH_CAP_PIN);
447 dput(req->r_old_dentry);
448 }
449 kfree(req->r_path1);
450 kfree(req->r_path2);
451 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation);
453 kfree(req);
454}
455
456/*
457 * lookup request, bump ref if found.
458 *
459 * called under mdsc->mutex.
460 */
461static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
462 u64 tid)
463{
464 struct ceph_mds_request *req;
465 struct rb_node *n = mdsc->request_tree.rb_node;
466
467 while (n) {
468 req = rb_entry(n, struct ceph_mds_request, r_node);
469 if (tid < req->r_tid)
470 n = n->rb_left;
471 else if (tid > req->r_tid)
472 n = n->rb_right;
473 else {
474 ceph_mdsc_get_request(req);
475 return req;
476 }
477 }
478 return NULL;
479}
480
481static void __insert_request(struct ceph_mds_client *mdsc,
482 struct ceph_mds_request *new)
483{
484 struct rb_node **p = &mdsc->request_tree.rb_node;
485 struct rb_node *parent = NULL;
486 struct ceph_mds_request *req = NULL;
487
488 while (*p) {
489 parent = *p;
490 req = rb_entry(parent, struct ceph_mds_request, r_node);
491 if (new->r_tid < req->r_tid)
492 p = &(*p)->rb_left;
493 else if (new->r_tid > req->r_tid)
494 p = &(*p)->rb_right;
495 else
496 BUG();
497 }
498
499 rb_link_node(&new->r_node, parent, p);
500 rb_insert_color(&new->r_node, &mdsc->request_tree);
501}
502
503/*
504 * Register an in-flight request, and assign a tid. Link to the
505 * directory we are modifying (if any).
506 *
507 * Called under mdsc->mutex.
508 */
509static void __register_request(struct ceph_mds_client *mdsc,
510 struct ceph_mds_request *req,
511 struct inode *dir)
512{
513 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req);
519
520 if (dir) {
521 struct ceph_inode_info *ci = ceph_inode(dir);
522
523 spin_lock(&ci->i_unsafe_lock);
524 req->r_unsafe_dir = dir;
525 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
526 spin_unlock(&ci->i_unsafe_lock);
527 }
528}
529
530static void __unregister_request(struct ceph_mds_client *mdsc,
531 struct ceph_mds_request *req)
532{
533 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
534 rb_erase(&req->r_node, &mdsc->request_tree);
535 RB_CLEAR_NODE(&req->r_node);
536
537 if (req->r_unsafe_dir) {
538 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
539
540 spin_lock(&ci->i_unsafe_lock);
541 list_del_init(&req->r_unsafe_dir_item);
542 spin_unlock(&ci->i_unsafe_lock);
543 }
544
545 ceph_mdsc_put_request(req);
546}
547
548/*
549 * Choose mds to send request to next. If there is a hint set in the
550 * request (e.g., due to a prior forward hint from the mds), use that.
551 * Otherwise, consult frag tree and/or caps to identify the
552 * appropriate mds. If all else fails, choose randomly.
553 *
554 * Called under mdsc->mutex.
555 */
556static int __choose_mds(struct ceph_mds_client *mdsc,
557 struct ceph_mds_request *req)
558{
559 struct inode *inode;
560 struct ceph_inode_info *ci;
561 struct ceph_cap *cap;
562 int mode = req->r_direct_mode;
563 int mds = -1;
564 u32 hash = req->r_direct_hash;
565 bool is_hash = req->r_direct_is_hash;
566
567 /*
568 * is there a specific mds we should try? ignore hint if we have
569 * no session and the mds is not up (active or recovering).
570 */
571 if (req->r_resend_mds >= 0 &&
572 (__have_session(mdsc, req->r_resend_mds) ||
573 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
574 dout("choose_mds using resend_mds mds%d\n",
575 req->r_resend_mds);
576 return req->r_resend_mds;
577 }
578
579 if (mode == USE_RANDOM_MDS)
580 goto random;
581
582 inode = NULL;
583 if (req->r_inode) {
584 inode = req->r_inode;
585 } else if (req->r_dentry) {
586 if (req->r_dentry->d_inode) {
587 inode = req->r_dentry->d_inode;
588 } else {
589 inode = req->r_dentry->d_parent->d_inode;
590 hash = req->r_dentry->d_name.hash;
591 is_hash = true;
592 }
593 }
594 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
595 (int)hash, mode);
596 if (!inode)
597 goto random;
598 ci = ceph_inode(inode);
599
600 if (is_hash && S_ISDIR(inode->i_mode)) {
601 struct ceph_inode_frag frag;
602 int found;
603
604 ceph_choose_frag(ci, hash, &frag, &found);
605 if (found) {
606 if (mode == USE_ANY_MDS && frag.ndist > 0) {
607 u8 r;
608
609 /* choose a random replica */
610 get_random_bytes(&r, 1);
611 r %= frag.ndist;
612 mds = frag.dist[r];
613 dout("choose_mds %p %llx.%llx "
614 "frag %u mds%d (%d/%d)\n",
615 inode, ceph_vinop(inode),
616 frag.frag, frag.mds,
617 (int)r, frag.ndist);
618 return mds;
619 }
620
621 /* since this file/dir wasn't known to be
622			 * replicated, we want to look for the
623 * authoritative mds. */
624 mode = USE_AUTH_MDS;
625 if (frag.mds >= 0) {
626 /* choose auth mds */
627 mds = frag.mds;
628 dout("choose_mds %p %llx.%llx "
629 "frag %u mds%d (auth)\n",
630 inode, ceph_vinop(inode), frag.frag, mds);
631 return mds;
632 }
633 }
634 }
635
636 spin_lock(&inode->i_lock);
637 cap = NULL;
638 if (mode == USE_AUTH_MDS)
639 cap = ci->i_auth_cap;
640 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
641 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
642 if (!cap) {
643 spin_unlock(&inode->i_lock);
644 goto random;
645 }
646 mds = cap->session->s_mds;
647 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
648 inode, ceph_vinop(inode), mds,
649 cap == ci->i_auth_cap ? "auth " : "", cap);
650 spin_unlock(&inode->i_lock);
651 return mds;
652
653random:
654 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
655 dout("choose_mds chose random mds%d\n", mds);
656 return mds;
657}
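
/*
 * Summary of the selection order implemented above:
 *
 *	1. explicit resend hint (r_resend_mds), if usable
 *	2. dir frag tree: a random replica (USE_ANY_MDS) or the auth mds
 *	3. the session holding our auth cap (or any cap) on the inode
 *	4. a random mds from the mdsmap
 */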
658
659
660/*
661 * session messages
662 */
663static struct ceph_msg *create_session_msg(u32 op, u64 seq)
664{
665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h;
667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
669 if (IS_ERR(msg)) {
670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg));
672 }
673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op);
675 h->seq = cpu_to_le64(seq);
676 return msg;
677}
678
679/*
680 * send session open request.
681 *
682 * called under mdsc->mutex
683 */
684static int __open_session(struct ceph_mds_client *mdsc,
685 struct ceph_mds_session *session)
686{
687 struct ceph_msg *msg;
688 int mstate;
689 int mds = session->s_mds;
690 int err = 0;
691
692 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
694 dout("open_session to mds%d (%s)\n", mds,
695 ceph_mds_state_name(mstate));
696 session->s_state = CEPH_MDS_SESSION_OPENING;
697 session->s_renew_requested = jiffies;
698
699 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) {
702 err = PTR_ERR(msg);
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg);
706
707out:
708	return err;
709}
710
711/*
712 * session caps
713 */
714
715/*
716 * Free preallocated cap messages assigned to this session
717 */
718static void cleanup_cap_releases(struct ceph_mds_session *session)
719{
720 struct ceph_msg *msg;
721
722 spin_lock(&session->s_cap_lock);
723 while (!list_empty(&session->s_cap_releases)) {
724 msg = list_first_entry(&session->s_cap_releases,
725 struct ceph_msg, list_head);
726 list_del_init(&msg->list_head);
727 ceph_msg_put(msg);
728 }
729 while (!list_empty(&session->s_cap_releases_done)) {
730 msg = list_first_entry(&session->s_cap_releases_done,
731 struct ceph_msg, list_head);
732 list_del_init(&msg->list_head);
733 ceph_msg_put(msg);
734 }
735 spin_unlock(&session->s_cap_lock);
736}
737
738/*
739 * Helper to safely iterate over all caps associated with a session.
740 *
741 * caller must hold session s_mutex
742 */
743static int iterate_session_caps(struct ceph_mds_session *session,
744 int (*cb)(struct inode *, struct ceph_cap *,
745 void *), void *arg)
746{
747 struct list_head *p;
748 struct ceph_cap *cap;
749 struct inode *inode, *last_inode = NULL;
750 struct ceph_cap *old_cap = NULL;
751 int ret;
752
753 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
754 spin_lock(&session->s_cap_lock);
755 p = session->s_caps.next;
756 while (p != &session->s_caps) {
757 cap = list_entry(p, struct ceph_cap, session_caps);
758 inode = igrab(&cap->ci->vfs_inode);
759 if (!inode) {
760 p = p->next;
761 continue;
762 }
763 session->s_cap_iterator = cap;
764 spin_unlock(&session->s_cap_lock);
765
766 if (last_inode) {
767 iput(last_inode);
768 last_inode = NULL;
769 }
770 if (old_cap) {
771 ceph_put_cap(old_cap);
772 old_cap = NULL;
773 }
774
775 ret = cb(inode, cap, arg);
776 last_inode = inode;
777
778 spin_lock(&session->s_cap_lock);
779 p = p->next;
780 if (cap->ci == NULL) {
781 dout("iterate_session_caps finishing cap %p removal\n",
782 cap);
783 BUG_ON(cap->session != session);
784 list_del_init(&cap->session_caps);
785 session->s_nr_caps--;
786 cap->session = NULL;
787 old_cap = cap; /* put_cap it w/o locks held */
788 }
789 if (ret < 0)
790 goto out;
791 }
792 ret = 0;
793out:
794 session->s_cap_iterator = NULL;
795 spin_unlock(&session->s_cap_lock);
796
797 if (last_inode)
798 iput(last_inode);
799 if (old_cap)
800 ceph_put_cap(old_cap);
801
802 return ret;
803}
804
805static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
806 void *arg)
807{
808 struct ceph_inode_info *ci = ceph_inode(inode);
809 dout("removing cap %p, ci is %p, inode is %p\n",
810 cap, ci, &ci->vfs_inode);
811 ceph_remove_cap(cap);
812 return 0;
813}
814
815/*
816 * caller must hold session s_mutex
817 */
818static void remove_session_caps(struct ceph_mds_session *session)
819{
820 dout("remove_session_caps on %p\n", session);
821 iterate_session_caps(session, remove_session_caps_cb, NULL);
822 BUG_ON(session->s_nr_caps > 0);
823 cleanup_cap_releases(session);
824}
825
826/*
827 * wake up any threads waiting on this session's caps. if the cap is
828 * old (didn't get renewed on the client reconnect), remove it now.
829 *
830 * caller must hold s_mutex.
831 */
832static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
833 void *arg)
834{
835 struct ceph_inode_info *ci = ceph_inode(inode);
836
837 wake_up(&ci->i_cap_wq);
838 if (arg) {
839 spin_lock(&inode->i_lock);
840 ci->i_wanted_max_size = 0;
841 ci->i_requested_max_size = 0;
842 spin_unlock(&inode->i_lock);
843 }
844 return 0;
845}
846
847static void wake_up_session_caps(struct ceph_mds_session *session,
848 int reconnect)
849{
850 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
851 iterate_session_caps(session, wake_up_session_cb,
852 (void *)(unsigned long)reconnect);
853}
854
855/*
856 * Send periodic message to MDS renewing all currently held caps. The
857 * ack will reset the expiration for all caps from this session.
858 *
859 * caller holds s_mutex
860 */
861static int send_renew_caps(struct ceph_mds_client *mdsc,
862 struct ceph_mds_session *session)
863{
864 struct ceph_msg *msg;
865 int state;
866
867 if (time_after_eq(jiffies, session->s_cap_ttl) &&
868 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
869 pr_info("mds%d caps stale\n", session->s_mds);
870 session->s_renew_requested = jiffies;
871
872 /* do not try to renew caps until a recovering mds has reconnected
873 * with its clients. */
874 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
875 if (state < CEPH_MDS_STATE_RECONNECT) {
876 dout("send_renew_caps ignoring mds%d (%s)\n",
877 session->s_mds, ceph_mds_state_name(state));
878 return 0;
879 }
880
881 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
882 ceph_mds_state_name(state));
883 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
884 ++session->s_renew_seq);
885 if (IS_ERR(msg))
886 return PTR_ERR(msg);
887 ceph_con_send(&session->s_con, msg);
888 return 0;
889}
890
891/*
892 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
893 *
894 * Called under session->s_mutex
895 */
896static void renewed_caps(struct ceph_mds_client *mdsc,
897 struct ceph_mds_session *session, int is_renew)
898{
899 int was_stale;
900 int wake = 0;
901
902 spin_lock(&session->s_cap_lock);
903 was_stale = is_renew && (session->s_cap_ttl == 0 ||
904 time_after_eq(jiffies, session->s_cap_ttl));
905
906 session->s_cap_ttl = session->s_renew_requested +
907 mdsc->mdsmap->m_session_timeout*HZ;
908
909 if (was_stale) {
910 if (time_before(jiffies, session->s_cap_ttl)) {
911 pr_info("mds%d caps renewed\n", session->s_mds);
912 wake = 1;
913 } else {
914 pr_info("mds%d caps still stale\n", session->s_mds);
915 }
916 }
917 dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
918 session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
919 time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
920 spin_unlock(&session->s_cap_lock);
921
922 if (wake)
923 wake_up_session_caps(session, 0);
924}
925
926/*
927 * send a session close request
928 */
929static int request_close_session(struct ceph_mds_client *mdsc,
930 struct ceph_mds_session *session)
931{
932 struct ceph_msg *msg;
933 int err = 0;
934
935 dout("request_close_session mds%d state %s seq %lld\n",
936 session->s_mds, session_state_name(session->s_state),
937 session->s_seq);
938 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
939 if (IS_ERR(msg))
940 err = PTR_ERR(msg);
941 else
942 ceph_con_send(&session->s_con, msg);
943 return err;
944}
945
946/*
947 * Called with s_mutex held.
948 */
949static int __close_session(struct ceph_mds_client *mdsc,
950 struct ceph_mds_session *session)
951{
952 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
953 return 0;
954 session->s_state = CEPH_MDS_SESSION_CLOSING;
955 return request_close_session(mdsc, session);
956}
957
958/*
959 * Trim old(er) caps.
960 *
961 * Because we can't cache an inode without one or more caps, we do
962 * this indirectly: if a cap is unused, we prune its aliases, at which
963 * point the inode will hopefully get dropped too.
964 *
965 * Yes, this is a bit sloppy. Our only real goal here is to respond to
966 * memory pressure from the MDS, though, so it needn't be perfect.
967 */
968static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
969{
970 struct ceph_mds_session *session = arg;
971 struct ceph_inode_info *ci = ceph_inode(inode);
972 int used, oissued, mine;
973
974 if (session->s_trim_caps <= 0)
975 return -1;
976
977 spin_lock(&inode->i_lock);
978 mine = cap->issued | cap->implemented;
979 used = __ceph_caps_used(ci);
980 oissued = __ceph_caps_issued_other(ci, cap);
981
982 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
983 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
984 ceph_cap_string(used));
985 if (ci->i_dirty_caps)
986 goto out; /* dirty caps */
987 if ((used & ~oissued) & mine)
988 goto out; /* we need these caps */
989
990 session->s_trim_caps--;
991 if (oissued) {
992 /* we aren't the only cap.. just remove us */
993 __ceph_remove_cap(cap);
994 } else {
995 /* try to drop referring dentries */
996 spin_unlock(&inode->i_lock);
997 d_prune_aliases(inode);
998 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
999 inode, cap, atomic_read(&inode->i_count));
1000 return 0;
1001 }
1002
1003out:
1004 spin_unlock(&inode->i_lock);
1005 return 0;
1006}
1007
1008/*
1009 * Trim session cap count down to some max number.
1010 */
1011static int trim_caps(struct ceph_mds_client *mdsc,
1012 struct ceph_mds_session *session,
1013 int max_caps)
1014{
1015 int trim_caps = session->s_nr_caps - max_caps;
1016
1017 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1018 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1019 if (trim_caps > 0) {
1020 session->s_trim_caps = trim_caps;
1021 iterate_session_caps(session, trim_caps_cb, session);
1022 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1023 session->s_mds, session->s_nr_caps, max_caps,
1024 trim_caps - session->s_trim_caps);
1025 session->s_trim_caps = 0;
1026 }
1027 return 0;
1028}
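/*
 * End-to-end sketch of the trim path (hypothetical numbers): an MDS
 * under memory pressure sends CEPH_SESSION_RECALL_STATE with, say,
 * max_caps = 5000.  handle_session() below then calls
 *
 *	trim_caps(mdsc, session, 5000);
 *
 * and if s_nr_caps is 5800, trim_caps_cb() walks the session's caps
 * until roughly 800 have been dropped.  Dirty or in-use caps are
 * skipped, so the target is best effort by design.
 */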
1029
1030/*
1031 * Allocate cap_release messages. If there is a partially full message
1032 * in the queue, try to allocate enough to cover its remainder, so that
1033 * we can send it immediately.
1034 *
1035 * Called under s_mutex.
1036 */
1037static int add_cap_releases(struct ceph_mds_client *mdsc,
1038 struct ceph_mds_session *session,
1039 int extra)
1040{
1041 struct ceph_msg *msg;
1042 struct ceph_mds_cap_release *head;
1043 int err = -ENOMEM;
1044
1045 if (extra < 0)
1046 extra = mdsc->client->mount_args->cap_release_safety;
1047
1048 spin_lock(&session->s_cap_lock);
1049
1050 if (!list_empty(&session->s_cap_releases)) {
1051 msg = list_first_entry(&session->s_cap_releases,
1052 struct ceph_msg,
1053 list_head);
1054 head = msg->front.iov_base;
1055 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1056 }
1057
1058 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1059 spin_unlock(&session->s_cap_lock);
1060 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1061 0, 0, NULL);
1062 if (!msg)
1063 goto out_unlocked;
1064 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1065 (int)msg->front.iov_len);
1066 head = msg->front.iov_base;
1067 head->num = cpu_to_le32(0);
1068 msg->front.iov_len = sizeof(*head);
1069 spin_lock(&session->s_cap_lock);
1070 list_add(&msg->list_head, &session->s_cap_releases);
1071 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1072 }
1073
1074 if (!list_empty(&session->s_cap_releases)) {
1075 msg = list_first_entry(&session->s_cap_releases,
1076 struct ceph_msg,
1077 list_head);
1078 head = msg->front.iov_base;
1079 if (head->num) {
1080 dout(" queueing non-full %p (%d)\n", msg,
1081 le32_to_cpu(head->num));
1082 list_move_tail(&msg->list_head,
1083 &session->s_cap_releases_done);
1084 session->s_num_cap_releases -=
1085 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1086 }
1087 }
1088 err = 0;
1089 spin_unlock(&session->s_cap_lock);
1090out_unlocked:
1091 return err;
1092}
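/*
 * Sizing sketch for the preallocation above: each CAPRELEASE message
 * is a single PAGE_CACHE_SIZE front holding one ceph_mds_cap_release
 * header plus up to CEPH_CAPS_PER_RELEASE cap items, which (assuming
 * the usual definition of that constant) is roughly
 *
 *	(PAGE_CACHE_SIZE - sizeof(struct ceph_mds_cap_release)) /
 *		sizeof(struct ceph_mds_cap_item)
 *
 * items per message.  The loop keeps allocating whole messages until
 * s_num_cap_releases can absorb every held cap plus @extra.
 */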
1093
1094/*
1095 * flush all dirty inode data to disk.
1096 *
1097 * returns true if we've flushed through want_flush_seq
1098 */
1099static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1100{
1101 int mds, ret = 1;
1102
1103 dout("check_cap_flush want %lld\n", want_flush_seq);
1104 mutex_lock(&mdsc->mutex);
1105 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1106 struct ceph_mds_session *session = mdsc->sessions[mds];
1107
1108 if (!session)
1109 continue;
1110 get_session(session);
1111 mutex_unlock(&mdsc->mutex);
1112
1113 mutex_lock(&session->s_mutex);
1114 if (!list_empty(&session->s_cap_flushing)) {
1115 struct ceph_inode_info *ci =
1116 list_entry(session->s_cap_flushing.next,
1117 struct ceph_inode_info,
1118 i_flushing_item);
1119 struct inode *inode = &ci->vfs_inode;
1120
1121 spin_lock(&inode->i_lock);
1122 if (ci->i_cap_flush_seq <= want_flush_seq) {
1123 dout("check_cap_flush still flushing %p "
1124 "seq %lld <= %lld to mds%d\n", inode,
1125 ci->i_cap_flush_seq, want_flush_seq,
1126 session->s_mds);
1127 ret = 0;
1128 }
1129 spin_unlock(&inode->i_lock);
1130 }
1131 mutex_unlock(&session->s_mutex);
1132 ceph_put_mds_session(session);
1133
1134 if (!ret)
1135 return ret;
1136 mutex_lock(&mdsc->mutex);
1137 }
1138
1139 mutex_unlock(&mdsc->mutex);
1140 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1141 return ret;
1142}
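/*
 * This is the wait_event() condition used by ceph_mdsc_sync() below:
 * the sync path sleeps on cap_flushing_wq until no session has an
 * in-flight cap flush with seq <= want_flush_seq.
 */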
1143
1144/*
1145 * called under s_mutex
1146 */
1147static void send_cap_releases(struct ceph_mds_client *mdsc,
1148 struct ceph_mds_session *session)
1149{
1150 struct ceph_msg *msg;
1151
1152 dout("send_cap_releases mds%d\n", session->s_mds);
1153 while (1) {
1154 spin_lock(&session->s_cap_lock);
1155 if (list_empty(&session->s_cap_releases_done))
1156 break;
1157 msg = list_first_entry(&session->s_cap_releases_done,
1158 struct ceph_msg, list_head);
1159 list_del_init(&msg->list_head);
1160 spin_unlock(&session->s_cap_lock);
1161 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1162 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1163 ceph_con_send(&session->s_con, msg);
1164 }
1165 spin_unlock(&session->s_cap_lock);
1166}
1167
1168/*
1169 * requests
1170 */
1171
1172/*
1173 * Create an mds request.
1174 */
1175struct ceph_mds_request *
1176ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1177{
1178 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1179
1180 if (!req)
1181 return ERR_PTR(-ENOMEM);
1182
1183 req->r_started = jiffies;
1184 req->r_resend_mds = -1;
1185 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1186 req->r_fmode = -1;
1187 kref_init(&req->r_kref);
1188 INIT_LIST_HEAD(&req->r_wait);
1189 init_completion(&req->r_completion);
1190 init_completion(&req->r_safe_completion);
1191 INIT_LIST_HEAD(&req->r_unsafe_item);
1192
1193 req->r_op = op;
1194 req->r_direct_mode = mode;
1195 return req;
1196}
1197
1198/*
1199 * Return the oldest (lowest tid) request in the request tree, or NULL
1200 * if none.  __get_oldest_tid() below returns its tid, or 0 if none.
1201 * called under mdsc->mutex.
1202 */
1203static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1204{
1205 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1206 return NULL;
1207 return rb_entry(rb_first(&mdsc->request_tree),
1208 struct ceph_mds_request, r_node);
1209}
1210
1211static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1212{
1213 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1214
1215 if (req)
1216 return req->r_tid;
1217 return 0;
1218}
1219
1220/*
1221 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1222 * on build_path_from_dentry in fs/cifs/dir.c.
1223 *
1224 * If @stop_on_nosnap, generate path relative to the first non-snapped
1225 * inode.
1226 *
1227 * Encode hidden .snap dirs as a double /, i.e.
1228 * foo/.snap/bar -> foo//bar
1229 */
1230char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1231 int stop_on_nosnap)
1232{
1233 struct dentry *temp;
1234 char *path;
1235 int len, pos;
1236
1237 if (dentry == NULL)
1238 return ERR_PTR(-EINVAL);
1239
1240retry:
1241 len = 0;
1242 for (temp = dentry; !IS_ROOT(temp);) {
1243 struct inode *inode = temp->d_inode;
1244 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1245 len++; /* slash only */
1246 else if (stop_on_nosnap && inode &&
1247 ceph_snap(inode) == CEPH_NOSNAP)
1248 break;
1249 else
1250 len += 1 + temp->d_name.len;
1251 temp = temp->d_parent;
1252 if (temp == NULL) {
1253 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1254 return ERR_PTR(-EINVAL);
1255 }
1256 }
1257 if (len)
1258 len--; /* no leading '/' */
1259
1260 path = kmalloc(len+1, GFP_NOFS);
1261 if (path == NULL)
1262 return ERR_PTR(-ENOMEM);
1263 pos = len;
1264 path[pos] = 0; /* trailing null */
1265 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1266 struct inode *inode = temp->d_inode;
1267
1268 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1269 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1270 pos, temp);
1271 } else if (stop_on_nosnap && inode &&
1272 ceph_snap(inode) == CEPH_NOSNAP) {
1273 break;
1274 } else {
1275 pos -= temp->d_name.len;
1276 if (pos < 0)
1277 break;
1278 strncpy(path + pos, temp->d_name.name,
1279 temp->d_name.len);
1280 dout("build_path_dentry path+%d: %p '%.*s'\n",
1281 pos, temp, temp->d_name.len, path + pos);
1282 }
1283 if (pos)
1284 path[--pos] = '/';
1285 temp = temp->d_parent;
1286 if (temp == NULL) {
1287 pr_err("build_path_dentry corrupt dentry\n");
1288 kfree(path);
1289 return ERR_PTR(-EINVAL);
1290 }
1291 }
1292 if (pos != 0) {
1293 pr_err("build_path_dentry did not end path lookup where "
1294 "expected, namelen is %d, pos is %d\n", len, pos);
1295 /* presumably this is only possible if racing with a
1296 rename of one of the parent directories (we can not
1297 lock the dentries above us to prevent this, but
1298 retrying should be harmless) */
1299 kfree(path);
1300 goto retry;
1301 }
1302
1303 *base = ceph_ino(temp->d_inode);
1304 *plen = len;
1305 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1306 dentry, atomic_read(&dentry->d_count), *base, len, path);
1307 return path;
1308}
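/*
 * Worked example (hypothetical tree): for a dentry at /a/.snap/b/c
 * with stop_on_nosnap = 0, the first pass computes len = 2 ("/c") +
 * 2 ("/b") + 1 (snapdir slash) + 2 ("/a") - 1 = 6, and the backward
 * fill then yields "a//b/c", with *base set to the root inode's ino.
 * The double slash is the encoded .snap component described above.
 */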
1309
1310static int build_dentry_path(struct dentry *dentry,
1311 const char **ppath, int *ppathlen, u64 *pino,
1312 int *pfreepath)
1313{
1314 char *path;
1315
1316 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1317 *pino = ceph_ino(dentry->d_parent->d_inode);
1318 *ppath = dentry->d_name.name;
1319 *ppathlen = dentry->d_name.len;
1320 return 0;
1321 }
1322 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1323 if (IS_ERR(path))
1324 return PTR_ERR(path);
1325 *ppath = path;
1326 *pfreepath = 1;
1327 return 0;
1328}
1329
1330static int build_inode_path(struct inode *inode,
1331 const char **ppath, int *ppathlen, u64 *pino,
1332 int *pfreepath)
1333{
1334 struct dentry *dentry;
1335 char *path;
1336
1337 if (ceph_snap(inode) == CEPH_NOSNAP) {
1338 *pino = ceph_ino(inode);
1339 *ppathlen = 0;
1340 return 0;
1341 }
1342 dentry = d_find_alias(inode);
1343 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1344 dput(dentry);
1345 if (IS_ERR(path))
1346 return PTR_ERR(path);
1347 *ppath = path;
1348 *pfreepath = 1;
1349 return 0;
1350}
1351
1352/*
1353 * request arguments may be specified via an inode *, a dentry *, or
1354 * an explicit ino+path.
1355 */
1356static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1357 const char *rpath, u64 rino,
1358 const char **ppath, int *pathlen,
1359 u64 *ino, int *freepath)
1360{
1361 int r = 0;
1362
1363 if (rinode) {
1364 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1365 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1366 ceph_snap(rinode));
1367 } else if (rdentry) {
1368 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1369 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1370 *ppath);
1371 } else if (rpath) {
1372 *ino = rino;
1373 *ppath = rpath;
1374 *pathlen = strlen(rpath);
1375 dout(" path %.*s\n", *pathlen, rpath);
1376 }
1377
1378 return r;
1379}
1380
1381/*
1382 * called under mdsc->mutex
1383 */
1384static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1385 struct ceph_mds_request *req,
1386 int mds)
1387{
1388 struct ceph_msg *msg;
1389 struct ceph_mds_request_head *head;
1390 const char *path1 = NULL;
1391 const char *path2 = NULL;
1392 u64 ino1 = 0, ino2 = 0;
1393 int pathlen1 = 0, pathlen2 = 0;
1394 int freepath1 = 0, freepath2 = 0;
1395 int len;
1396 u16 releases;
1397 void *p, *end;
1398 int ret;
1399
1400 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1401 req->r_path1, req->r_ino1.ino,
1402 &path1, &pathlen1, &ino1, &freepath1);
1403 if (ret < 0) {
1404 msg = ERR_PTR(ret);
1405 goto out;
1406 }
1407
1408 ret = set_request_path_attr(NULL, req->r_old_dentry,
1409 req->r_path2, req->r_ino2.ino,
1410 &path2, &pathlen2, &ino2, &freepath2);
1411 if (ret < 0) {
1412 msg = ERR_PTR(ret);
1413 goto out_free1;
1414 }
1415
1416 len = sizeof(*head) +
1417 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1418
1419 /* calculate (max) length for cap releases */
1420 len += sizeof(struct ceph_mds_request_release) *
1421 (!!req->r_inode_drop + !!req->r_dentry_drop +
1422 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1423 if (req->r_dentry_drop)
1424 len += req->r_dentry->d_name.len;
1425 if (req->r_old_dentry_drop)
1426 len += req->r_old_dentry->d_name.len;
1427
1428 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1429 if (IS_ERR(msg))
1430 goto out_free2;
1431
1432 msg->hdr.tid = cpu_to_le64(req->r_tid);
1433
1434 head = msg->front.iov_base;
1435 p = msg->front.iov_base + sizeof(*head);
1436 end = msg->front.iov_base + msg->front.iov_len;
1437
1438 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1439 head->op = cpu_to_le32(req->r_op);
1440 head->caller_uid = cpu_to_le32(current_fsuid());
1441 head->caller_gid = cpu_to_le32(current_fsgid());
1442 head->args = req->r_args;
1443
1444 ceph_encode_filepath(&p, end, ino1, path1);
1445 ceph_encode_filepath(&p, end, ino2, path2);
1446
1447 /* cap releases */
1448 releases = 0;
1449 if (req->r_inode_drop)
1450 releases += ceph_encode_inode_release(&p,
1451 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1452 mds, req->r_inode_drop, req->r_inode_unless, 0);
1453 if (req->r_dentry_drop)
1454 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1455 mds, req->r_dentry_drop, req->r_dentry_unless);
1456 if (req->r_old_dentry_drop)
1457 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1458 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1459 if (req->r_old_inode_drop)
1460 releases += ceph_encode_inode_release(&p,
1461 req->r_old_dentry->d_inode,
1462 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1463 head->num_releases = cpu_to_le16(releases);
1464
1465 BUG_ON(p > end);
1466 msg->front.iov_len = p - msg->front.iov_base;
1467 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1468
1469 msg->pages = req->r_pages;
1470 msg->nr_pages = req->r_num_pages;
1471 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1472 msg->hdr.data_off = cpu_to_le16(0);
1473
1474out_free2:
1475 if (freepath2)
1476 kfree((char *)path2);
1477out_free1:
1478 if (freepath1)
1479 kfree((char *)path1);
1480out:
1481 return msg;
1482}
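/*
 * Resulting front layout (a sketch inferred from the encoding order
 * and length calculation above, not a formal wire spec):
 *
 *	struct ceph_mds_request_head
 *	filepath1:  u8 version, le64 ino1, le32 len, path1 bytes
 *	filepath2:  u8 version, le64 ino2, le32 len, path2 bytes
 *	0-4 cap release records (inode, dentry, old inode, old dentry)
 *
 * head->num_releases records how many release records follow.
 */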
1483
1484/*
1485 * Called under mdsc->mutex on the error path, and with no mutex held
1486 * on success.
1487 */
1488static void complete_request(struct ceph_mds_client *mdsc,
1489 struct ceph_mds_request *req)
1490{
1491 if (req->r_callback)
1492 req->r_callback(mdsc, req);
1493 else
1494 complete(&req->r_completion);
1495}
1496
1497/*
1498 * called under mdsc->mutex
1499 */
1500static int __prepare_send_request(struct ceph_mds_client *mdsc,
1501 struct ceph_mds_request *req,
1502 int mds)
1503{
1504 struct ceph_mds_request_head *rhead;
1505 struct ceph_msg *msg;
1506 int flags = 0;
1507
1508 req->r_mds = mds;
1509 req->r_attempts++;
1510 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1511 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1512
1513 if (req->r_request) {
1514 ceph_msg_put(req->r_request);
1515 req->r_request = NULL;
1516 }
1517 msg = create_request_message(mdsc, req, mds);
1518 if (IS_ERR(msg)) {
1519 req->r_reply = ERR_PTR(PTR_ERR(msg));
1520 complete_request(mdsc, req);
1521 return -PTR_ERR(msg);
1522 }
1523 req->r_request = msg;
1524
1525 rhead = msg->front.iov_base;
1526 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1527 if (req->r_got_unsafe)
1528 flags |= CEPH_MDS_FLAG_REPLAY;
1529 if (req->r_locked_dir)
1530 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1531 rhead->flags = cpu_to_le32(flags);
1532 rhead->num_fwd = req->r_num_fwd;
1533 rhead->num_retry = req->r_attempts - 1;
1534
1535 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1536
1537 if (req->r_target_inode && req->r_got_unsafe)
1538 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1539 else
1540 rhead->ino = 0;
1541 return 0;
1542}
1543
1544/*
1545 * send request, or put it on the appropriate wait list.
1546 */
1547static int __do_request(struct ceph_mds_client *mdsc,
1548 struct ceph_mds_request *req)
1549{
1550 struct ceph_mds_session *session = NULL;
1551 int mds = -1;
1552 int err = -EAGAIN;
1553
1554 if (req->r_reply)
1555 goto out;
1556
1557 if (req->r_timeout &&
1558 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1559 dout("do_request timed out\n");
1560 err = -EIO;
1561 goto finish;
1562 }
1563
1564 mds = __choose_mds(mdsc, req);
1565 if (mds < 0 ||
1566 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1567 dout("do_request no mds or not active, waiting for map\n");
1568 list_add(&req->r_wait, &mdsc->waiting_for_map);
1569 goto out;
1570 }
1571
1572 /* get, open session */
1573 session = __ceph_lookup_mds_session(mdsc, mds);
1574 if (!session) {
1575 session = register_session(mdsc, mds);
1576 if (IS_ERR(session)) {
1577 err = PTR_ERR(session);
1578 goto finish;
1579 }
1580 }
1581 dout("do_request mds%d session %p state %s\n", mds, session,
1582 session_state_name(session->s_state));
1583 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1584 session->s_state != CEPH_MDS_SESSION_HUNG) {
1585 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1586 session->s_state == CEPH_MDS_SESSION_CLOSING)
1587 __open_session(mdsc, session);
1588 list_add(&req->r_wait, &session->s_waiting);
1589 goto out_session;
1590 }
1591
1592 /* send request */
1593 req->r_session = get_session(session);
1594 req->r_resend_mds = -1; /* forget any previous mds hint */
1595
1596 if (req->r_request_started == 0) /* note request start time */
1597 req->r_request_started = jiffies;
1598
1599 err = __prepare_send_request(mdsc, req, mds);
1600 if (!err) {
1601 ceph_msg_get(req->r_request);
1602 ceph_con_send(&session->s_con, req->r_request);
1603 }
1604
1605out_session:
1606 ceph_put_mds_session(session);
1607out:
1608 return err;
1609
1610finish:
1611 req->r_reply = ERR_PTR(err);
1612 complete_request(mdsc, req);
1613 goto out;
1614}
1615
1616/*
1617 * called under mdsc->mutex
1618 */
1619static void __wake_requests(struct ceph_mds_client *mdsc,
1620 struct list_head *head)
1621{
1622 struct ceph_mds_request *req, *nreq;
1623
1624 list_for_each_entry_safe(req, nreq, head, r_wait) {
1625 list_del_init(&req->r_wait);
1626 __do_request(mdsc, req);
1627 }
1628}
1629
1630/*
1631 * Wake up threads with requests pending for @mds, so that they can
1632 * resubmit their requests to a possibly different mds. If @all is set,
1633 * wake them up if their requests have been forwarded to @mds, too.
1634 */
1635static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1636{
1637 struct ceph_mds_request *req;
1638 struct rb_node *p;
1639
1640 dout("kick_requests mds%d\n", mds);
1641 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1642 req = rb_entry(p, struct ceph_mds_request, r_node);
1643 if (req->r_got_unsafe)
1644 continue;
1645 if (req->r_session &&
1646 req->r_session->s_mds == mds) {
1647 dout(" kicking tid %llu\n", req->r_tid);
1648 put_request_session(req);
1649 __do_request(mdsc, req);
1650 }
1651 }
1652}
1653
1654void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
1655 struct ceph_mds_request *req)
1656{
1657 dout("submit_request on %p\n", req);
1658 mutex_lock(&mdsc->mutex);
1659 __register_request(mdsc, req, NULL);
1660 __do_request(mdsc, req);
1661 mutex_unlock(&mdsc->mutex);
1662}
1663
1664/*
1665 * Synchronously perform an mds request.  Take care of all of the
1666 * session setup, forwarding, retry details.
1667 */
1668int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1669 struct inode *dir,
1670 struct ceph_mds_request *req)
1671{
1672 int err;
1673
1674 dout("do_request on %p\n", req);
1675
1676 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1677 if (req->r_inode)
1678 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1679 if (req->r_locked_dir)
1680 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1681 if (req->r_old_dentry)
1682 ceph_get_cap_refs(
1683 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1684 CEPH_CAP_PIN);
1685
1686 /* issue */
1687 mutex_lock(&mdsc->mutex);
1688 __register_request(mdsc, req, dir);
1689 __do_request(mdsc, req);
1690
1691 /* wait */
1692 if (!req->r_reply) {
1693 mutex_unlock(&mdsc->mutex);
1694 if (req->r_timeout) {
1695 err = (long)wait_for_completion_interruptible_timeout(
1696 &req->r_completion, req->r_timeout);
1697 if (err == 0)
1698 req->r_reply = ERR_PTR(-EIO);
1699 else if (err < 0)
1700 req->r_reply = ERR_PTR(err);
1701 } else {
1702 err = wait_for_completion_interruptible(
1703 &req->r_completion);
1704 if (err)
1705 req->r_reply = ERR_PTR(err);
1706 }
1707 mutex_lock(&mdsc->mutex);
1708 }
1709
1710 if (IS_ERR(req->r_reply)) {
1711 err = PTR_ERR(req->r_reply);
1712 req->r_reply = NULL;
1713
1714 if (err == -ERESTARTSYS) {
1715 /* aborted */
1716 req->r_aborted = true;
1717
1718 if (req->r_locked_dir &&
1719 (req->r_op & CEPH_MDS_OP_WRITE)) {
1720 struct ceph_inode_info *ci =
1721 ceph_inode(req->r_locked_dir);
1722
1723 dout("aborted, clearing I_COMPLETE on %p\n",
1724 req->r_locked_dir);
1725 spin_lock(&req->r_locked_dir->i_lock);
1726 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1727 ci->i_release_count++;
1728 spin_unlock(&req->r_locked_dir->i_lock);
1729 }
1730 } else {
1731 /* clean up this request */
1732 __unregister_request(mdsc, req);
1733 if (!list_empty(&req->r_unsafe_item))
1734 list_del_init(&req->r_unsafe_item);
1735 complete(&req->r_safe_completion);
1736 }
1737 } else if (req->r_err) {
1738 err = req->r_err;
1739 } else {
1740 err = le32_to_cpu(req->r_reply_info.head->result);
1741 }
1742 mutex_unlock(&mdsc->mutex);
1743
1744 dout("do_request %p done, result %d\n", req, err);
1745 return err;
1746}
1747
1748/*
1749 * Handle mds reply.
1750 *
1751 * We take the session mutex and parse and process the reply immediately.
1752 * This preserves the logical ordering of replies, capabilities, etc., sent
1753 * by the MDS as they are applied to our local cache.
1754 */
1755static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1756{
1757 struct ceph_mds_client *mdsc = session->s_mdsc;
1758 struct ceph_mds_request *req;
1759 struct ceph_mds_reply_head *head = msg->front.iov_base;
1760 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1761 u64 tid;
1762 int err, result;
1763 int mds = session->s_mds;
1764
1765 if (msg->front.iov_len < sizeof(*head)) {
1766 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1767 ceph_msg_dump(msg);
1768 return;
1769 }
1770
1771 /* get request, session */
1772 tid = le64_to_cpu(msg->hdr.tid);
1773 mutex_lock(&mdsc->mutex);
1774 req = __lookup_request(mdsc, tid);
1775 if (!req) {
1776 dout("handle_reply on unknown tid %llu\n", tid);
1777 mutex_unlock(&mdsc->mutex);
1778 return;
1779 }
1780 dout("handle_reply %p\n", req);
1781
1782 /* correct session? */
1783 if (req->r_session != session) {
1784 pr_err("mdsc_handle_reply got %llu on session mds%d"
1785 " not mds%d\n", tid, session->s_mds,
1786 req->r_session ? req->r_session->s_mds : -1);
1787 mutex_unlock(&mdsc->mutex);
1788 goto out;
1789 }
1790
1791 /* dup? */
1792 if ((req->r_got_unsafe && !head->safe) ||
1793 (req->r_got_safe && head->safe)) {
1794 pr_warning("got a dup %s reply on %llu from mds%d\n",
1795 head->safe ? "safe" : "unsafe", tid, mds);
1796 mutex_unlock(&mdsc->mutex);
1797 goto out;
1798 }
1799
1800 result = le32_to_cpu(head->result);
1801
1802 /*
1803 * Tolerate 2 consecutive ESTALEs from the same mds.
1804 * FIXME: we should be looking at the cap migrate_seq.
1805 */
1806 if (result == -ESTALE) {
1807 req->r_direct_mode = USE_AUTH_MDS;
1808 req->r_num_stale++;
1809 if (req->r_num_stale <= 2) {
1810 __do_request(mdsc, req);
1811 mutex_unlock(&mdsc->mutex);
1812 goto out;
1813 }
1814 } else {
1815 req->r_num_stale = 0;
1816 }
1817
1818 if (head->safe) {
1819 req->r_got_safe = true;
1820 __unregister_request(mdsc, req);
1821 complete(&req->r_safe_completion);
1822
1823 if (req->r_got_unsafe) {
1824 /*
1825 * We already handled the unsafe response, now do the
1826 * cleanup. No need to examine the response; the MDS
1827 * doesn't include any result info in the safe
1828 * response. And even if it did, there is nothing
1829 * useful we could do with a revised return value.
1830 */
1831 dout("got safe reply %llu, mds%d\n", tid, mds);
1832 list_del_init(&req->r_unsafe_item);
1833
1834 /* last unsafe request during umount? */
1835 if (mdsc->stopping && !__get_oldest_req(mdsc))
1836 complete(&mdsc->safe_umount_waiters);
1837 mutex_unlock(&mdsc->mutex);
1838 goto out;
1839 }
1840 }
1841
1842 BUG_ON(req->r_reply);
1843
1844 if (!head->safe) {
1845 req->r_got_unsafe = true;
1846 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1847 }
1848
1849 dout("handle_reply tid %lld result %d\n", tid, result);
1850 rinfo = &req->r_reply_info;
1851 err = parse_reply_info(msg, rinfo);
1852 mutex_unlock(&mdsc->mutex);
1853
1854 mutex_lock(&session->s_mutex);
1855 if (err < 0) {
1856 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1857 ceph_msg_dump(msg);
1858 goto out_err;
1859 }
1860
1861 /* snap trace */
1862 if (rinfo->snapblob_len) {
1863 down_write(&mdsc->snap_rwsem);
1864 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1865 rinfo->snapblob + rinfo->snapblob_len,
1866 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1867 downgrade_write(&mdsc->snap_rwsem);
1868 } else {
1869 down_read(&mdsc->snap_rwsem);
1870 }
1871
1872 /* insert trace into our cache */
1873 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1874 if (err == 0) {
1875 if (result == 0 && rinfo->dir_nr)
1876 ceph_readdir_prepopulate(req, req->r_session);
1877 ceph_unreserve_caps(&req->r_caps_reservation);
1878 }
1879
1880 up_read(&mdsc->snap_rwsem);
1881out_err:
1882 if (err) {
1883 req->r_err = err;
1884 } else {
1885 req->r_reply = msg;
1886 ceph_msg_get(msg);
1887 }
1888
1889 add_cap_releases(mdsc, req->r_session, -1);
1890 mutex_unlock(&session->s_mutex);
1891
1892 /* kick calling process */
1893 complete_request(mdsc, req);
1894out:
1895 ceph_mdsc_put_request(req);
1896 return;
1897}
1898
1899
1900
1901/*
1902 * handle mds notification that our request has been forwarded.
1903 */
1904static void handle_forward(struct ceph_mds_client *mdsc,
1905 struct ceph_mds_session *session,
1906 struct ceph_msg *msg)
1907{
1908 struct ceph_mds_request *req;
1909 u64 tid = le64_to_cpu(msg->hdr.tid);
1910 u32 next_mds;
1911 u32 fwd_seq;
1912 int err = -EINVAL;
1913 void *p = msg->front.iov_base;
1914 void *end = p + msg->front.iov_len;
1915
1916 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1917 next_mds = ceph_decode_32(&p);
1918 fwd_seq = ceph_decode_32(&p);
1919
1920 mutex_lock(&mdsc->mutex);
1921 req = __lookup_request(mdsc, tid);
1922 if (!req) {
1923 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1924 goto out; /* dup reply? */
1925 }
1926
1927 if (fwd_seq <= req->r_num_fwd) {
1928 dout("forward %llu to mds%d - old seq %d <= %d\n",
1929 tid, next_mds, req->r_num_fwd, fwd_seq);
1930 } else {
1931 /* resend. forward race not possible; mds would drop */
1932 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1933 req->r_num_fwd = fwd_seq;
1934 req->r_resend_mds = next_mds;
1935 put_request_session(req);
1936 __do_request(mdsc, req);
1937 }
1938 ceph_mdsc_put_request(req);
1939out:
1940 mutex_unlock(&mdsc->mutex);
1941 return;
1942
1943bad:
1944 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1945}
1946
1947/*
1948 * handle a mds session control message
1949 */
1950static void handle_session(struct ceph_mds_session *session,
1951 struct ceph_msg *msg)
1952{
1953 struct ceph_mds_client *mdsc = session->s_mdsc;
1954 u32 op;
1955 u64 seq;
1956 int mds = session->s_mds;
1957 struct ceph_mds_session_head *h = msg->front.iov_base;
1958 int wake = 0;
1959
1960 /* decode */
1961 if (msg->front.iov_len != sizeof(*h))
1962 goto bad;
1963 op = le32_to_cpu(h->op);
1964 seq = le64_to_cpu(h->seq);
1965
1966 mutex_lock(&mdsc->mutex);
1967 if (op == CEPH_SESSION_CLOSE)
1968 __unregister_session(mdsc, session);
1969 /* FIXME: this ttl calculation is generous */
1970 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1971 mutex_unlock(&mdsc->mutex);
1972
1973 mutex_lock(&session->s_mutex);
1974
1975 dout("handle_session mds%d %s %p state %s seq %llu\n",
1976 mds, ceph_session_op_name(op), session,
1977 session_state_name(session->s_state), seq);
1978
1979 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1980 session->s_state = CEPH_MDS_SESSION_OPEN;
1981 pr_info("mds%d came back\n", session->s_mds);
1982 }
1983
1984 switch (op) {
1985 case CEPH_SESSION_OPEN:
1986 session->s_state = CEPH_MDS_SESSION_OPEN;
1987 renewed_caps(mdsc, session, 0);
1988 wake = 1;
1989 if (mdsc->stopping)
1990 __close_session(mdsc, session);
1991 break;
1992
1993 case CEPH_SESSION_RENEWCAPS:
1994 if (session->s_renew_seq == seq)
1995 renewed_caps(mdsc, session, 1);
1996 break;
1997
1998 case CEPH_SESSION_CLOSE:
1999 remove_session_caps(session);
2000 wake = 1; /* for good measure */
2001 complete(&mdsc->session_close_waiters);
2002 kick_requests(mdsc, mds, 0); /* cur only */
2003 break;
2004
2005 case CEPH_SESSION_STALE:
2006 pr_info("mds%d caps went stale, renewing\n",
2007 session->s_mds);
2008 spin_lock(&session->s_cap_lock);
2009 session->s_cap_gen++;
2010 session->s_cap_ttl = 0;
2011 spin_unlock(&session->s_cap_lock);
2012 send_renew_caps(mdsc, session);
2013 break;
2014
2015 case CEPH_SESSION_RECALL_STATE:
2016 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2017 break;
2018
2019 default:
2020 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2021 WARN_ON(1);
2022 }
2023
2024 mutex_unlock(&session->s_mutex);
2025 if (wake) {
2026 mutex_lock(&mdsc->mutex);
2027 __wake_requests(mdsc, &session->s_waiting);
2028 mutex_unlock(&mdsc->mutex);
2029 }
2030 return;
2031
2032bad:
2033 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2034 (int)msg->front.iov_len);
2035 ceph_msg_dump(msg);
2036 return;
2037}
2038
2039
2040/*
2041 * called under session->mutex.
2042 */
2043static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2044 struct ceph_mds_session *session)
2045{
2046 struct ceph_mds_request *req, *nreq;
2047 int err;
2048
2049 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2050
2051 mutex_lock(&mdsc->mutex);
2052 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2053 err = __prepare_send_request(mdsc, req, session->s_mds);
2054 if (!err) {
2055 ceph_msg_get(req->r_request);
2056 ceph_con_send(&session->s_con, req->r_request);
2057 }
2058 }
2059 mutex_unlock(&mdsc->mutex);
2060}
2061
2062/*
2063 * Encode information about a cap for a reconnect with the MDS.
2064 */
2065static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2066 void *arg)
2067{
2068 struct ceph_mds_cap_reconnect rec;
2069 struct ceph_inode_info *ci;
2070 struct ceph_pagelist *pagelist = arg;
2071 char *path;
2072 int pathlen, err;
2073 u64 pathbase;
2074 struct dentry *dentry;
2075
2076 ci = cap->ci;
2077
2078 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2079 inode, ceph_vinop(inode), cap, cap->cap_id,
2080 ceph_cap_string(cap->issued));
2081 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2082 if (err)
2083 return err;
2084
2085 dentry = d_find_alias(inode);
2086 if (dentry) {
2087 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2088 if (IS_ERR(path)) {
2089 err = PTR_ERR(path);
2090 BUG_ON(err);
2091 }
2092 } else {
2093 path = NULL;
2094 pathlen = 0;
2095 }
2096 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2097 if (err)
2098 goto out;
2099
2100 spin_lock(&inode->i_lock);
2101 cap->seq = 0; /* reset cap seq */
2102 cap->issue_seq = 0; /* and issue_seq */
2103 rec.cap_id = cpu_to_le64(cap->cap_id);
2104 rec.pathbase = cpu_to_le64(pathbase);
2105 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2106 rec.issued = cpu_to_le32(cap->issued);
2107 rec.size = cpu_to_le64(inode->i_size);
2108 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2109 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2110 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2111 spin_unlock(&inode->i_lock);
2112
2113 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2114
2115out:
2116 kfree(path);
2117 dput(dentry);
2118 return err;
2119}
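/*
 * Per-cap reconnect record appended above (sketch): le64 ino, a
 * length-prefixed path string, then the ceph_mds_cap_reconnect struct
 * with cap_id, wanted/issued masks, size, mtime/atime, snaprealm and
 * pathbase.  One record is emitted for every cap the client holds, so
 * the pagelist can span many pages for a busy client.
 */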
2120
2121
2122/*
2123 * If an MDS fails and recovers, clients need to reconnect in order to
2124 * reestablish shared state. This includes all caps issued through
2125 * this session _and_ the snap_realm hierarchy. Because it's not
2126 * clear which snap realms the mds cares about, we send everything we
2127 * know about.. that ensures we'll then get any new info the
2128 * recovering MDS might have.
2129 *
2130 * This is a relatively heavyweight operation, but it's rare.
2131 *
2132 * called with mdsc->mutex held.
2133 */
2134static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2135{
2136 struct ceph_mds_session *session = NULL;
2137 struct ceph_msg *reply;
2138 struct rb_node *p;
2139 int err;
2140 struct ceph_pagelist *pagelist;
2141
2142 pr_info("reconnect to recovering mds%d\n", mds);
2143
2144 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2145 if (!pagelist)
2146 goto fail_nopagelist;
2147 ceph_pagelist_init(pagelist);
2148
2149 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2150 if (IS_ERR(reply)) {
2151 err = PTR_ERR(reply);
2152 goto fail_nomsg;
2153 }
2154
2155 /* find session */
2156 session = __ceph_lookup_mds_session(mdsc, mds);
2157 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2158
2159 if (session) {
2160 mutex_lock(&session->s_mutex);
2161
2162 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2163 session->s_seq = 0;
2164
2165 ceph_con_open(&session->s_con,
2166 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2167
2168 /* replay unsafe requests */
2169 replay_unsafe_requests(mdsc, session);
2170 } else {
2171 dout("no session for mds%d, will send short reconnect\n",
2172 mds);
2173 }
2174
2175 down_read(&mdsc->snap_rwsem);
2176
2177 if (!session)
2178 goto send;
2179 dout("session %p state %s\n", session,
2180 session_state_name(session->s_state));
2181
2182 /* traverse this session's caps */
2183 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2184 if (err)
2185 goto fail;
2186 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2187 if (err < 0)
2188 goto out;
2189
2190 /*
2191 * snaprealms. we provide mds with the ino, seq (version), and
2192 * parent for all of our realms. If the mds has any newer info,
2193 * it will tell us.
2194 */
2195 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2196 struct ceph_snap_realm *realm =
2197 rb_entry(p, struct ceph_snap_realm, node);
2198 struct ceph_mds_snaprealm_reconnect sr_rec;
2199
2200 dout(" adding snap realm %llx seq %lld parent %llx\n",
2201 realm->ino, realm->seq, realm->parent_ino);
2202 sr_rec.ino = cpu_to_le64(realm->ino);
2203 sr_rec.seq = cpu_to_le64(realm->seq);
2204 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2205 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2206 if (err)
2207 goto fail;
2208 }
2209
2210send:
2211 reply->pagelist = pagelist;
2212 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2213 reply->nr_pages = calc_pages_for(0, pagelist->length);
2214 ceph_con_send(&session->s_con, reply);
2215
2216 if (session) {
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 __wake_requests(mdsc, &session->s_waiting);
2219 }
2220
2221out:
2222 up_read(&mdsc->snap_rwsem);
2223 if (session) {
2224 mutex_unlock(&session->s_mutex);
2225 ceph_put_mds_session(session);
2226 }
2227 mutex_lock(&mdsc->mutex);
2228 return;
2229
2230fail:
2231 ceph_msg_put(reply);
2232fail_nomsg:
2233 ceph_pagelist_release(pagelist);
2234 kfree(pagelist);
2235fail_nopagelist:
2236 pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
2237 goto out;
2238}
2239
2240
2241/*
2242 * compare old and new mdsmaps, kicking requests
2243 * and closing out old connections as necessary
2244 *
2245 * called under mdsc->mutex.
2246 */
2247static void check_new_map(struct ceph_mds_client *mdsc,
2248 struct ceph_mdsmap *newmap,
2249 struct ceph_mdsmap *oldmap)
2250{
2251 int i;
2252 int oldstate, newstate;
2253 struct ceph_mds_session *s;
2254
2255 dout("check_new_map new %u old %u\n",
2256 newmap->m_epoch, oldmap->m_epoch);
2257
2258 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2259 if (mdsc->sessions[i] == NULL)
2260 continue;
2261 s = mdsc->sessions[i];
2262 oldstate = ceph_mdsmap_get_state(oldmap, i);
2263 newstate = ceph_mdsmap_get_state(newmap, i);
2264
2265 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2266 i, ceph_mds_state_name(oldstate),
2267 ceph_mds_state_name(newstate),
2268 session_state_name(s->s_state));
2269
2270 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2271 ceph_mdsmap_get_addr(newmap, i),
2272 sizeof(struct ceph_entity_addr))) {
2273 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2274 /* the session never opened, just close it
2275 * out now */
2276 __wake_requests(mdsc, &s->s_waiting);
2277 __unregister_session(mdsc, s);
2278 } else {
2279 /* just close it */
2280 mutex_unlock(&mdsc->mutex);
2281 mutex_lock(&s->s_mutex);
2282 mutex_lock(&mdsc->mutex);
2283 ceph_con_close(&s->s_con);
2284 mutex_unlock(&s->s_mutex);
2285 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2286 }
2287
2288 /* kick any requests waiting on the recovering mds */
2289 kick_requests(mdsc, i, 1);
2290 } else if (oldstate == newstate) {
2291 continue; /* nothing new with this mds */
2292 }
2293
2294 /*
2295 * send reconnect?
2296 */
2297 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2298 newstate >= CEPH_MDS_STATE_RECONNECT)
2299 send_mds_reconnect(mdsc, i);
2300
2301 /*
2302 * kick requests on any mds that has gone active.
2303 *
2304 * kick requests on cur or forwarder: we may have sent
2305 * the request to mds1, mds1 told us it forwarded it
2306 * to mds2, but then we learn mds1 failed and can't be
2307 * sure it successfully forwarded our request before
2308 * it died.
2309 */
2310 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2311 newstate >= CEPH_MDS_STATE_ACTIVE) {
2312 pr_info("mds%d reconnect completed\n", s->s_mds);
2313 kick_requests(mdsc, i, 1);
2314 ceph_kick_flushing_caps(mdsc, s);
2315 wake_up_session_caps(s, 1);
2316 }
2317 }
2318}
2319
2320
2321
2322/*
2323 * leases
2324 */
2325
2326/*
2327 * caller must hold session s_mutex, dentry->d_lock
2328 */
2329void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2330{
2331 struct ceph_dentry_info *di = ceph_dentry(dentry);
2332
2333 ceph_put_mds_session(di->lease_session);
2334 di->lease_session = NULL;
2335}
2336
2337static void handle_lease(struct ceph_mds_client *mdsc,
2338 struct ceph_mds_session *session,
2339 struct ceph_msg *msg)
2340{
2341 struct super_block *sb = mdsc->client->sb;
2342 struct inode *inode;
2343 struct ceph_inode_info *ci;
2344 struct dentry *parent, *dentry;
2345 struct ceph_dentry_info *di;
2346 int mds = session->s_mds;
2347 struct ceph_mds_lease *h = msg->front.iov_base;
2348 struct ceph_vino vino;
2349 int mask;
2350 struct qstr dname;
2351 int release = 0;
2352
2353 dout("handle_lease from mds%d\n", mds);
2354
2355 /* decode */
2356 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2357 goto bad;
2358 vino.ino = le64_to_cpu(h->ino);
2359 vino.snap = CEPH_NOSNAP;
2360 mask = le16_to_cpu(h->mask);
2361 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2362 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2363 if (dname.len != get_unaligned_le32(h+1))
2364 goto bad;
2365
2366 mutex_lock(&session->s_mutex);
2367 session->s_seq++;
2368
2369 /* lookup inode */
2370 inode = ceph_find_inode(sb, vino);
2371 dout("handle_lease '%s', mask %d, ino %llx %p\n",
2372 ceph_lease_op_name(h->action), mask, vino.ino, inode);
2373 if (inode == NULL) {
2374 dout("handle_lease no inode %llx\n", vino.ino);
2375 goto release;
2376 }
2377 ci = ceph_inode(inode);
2378
2379 /* dentry */
2380 parent = d_find_alias(inode);
2381 if (!parent) {
2382 dout("no parent dentry on inode %p\n", inode);
2383 WARN_ON(1);
2384 goto release; /* hrm... */
2385 }
2386 dname.hash = full_name_hash(dname.name, dname.len);
2387 dentry = d_lookup(parent, &dname);
2388 dput(parent);
2389 if (!dentry)
2390 goto release;
2391
2392 spin_lock(&dentry->d_lock);
2393 di = ceph_dentry(dentry);
2394 switch (h->action) {
2395 case CEPH_MDS_LEASE_REVOKE:
2396 if (di && di->lease_session == session) {
2397 h->seq = cpu_to_le32(di->lease_seq);
2398 __ceph_mdsc_drop_dentry_lease(dentry);
2399 }
2400 release = 1;
2401 break;
2402
2403 case CEPH_MDS_LEASE_RENEW:
2404 if (di && di->lease_session == session &&
2405 di->lease_gen == session->s_cap_gen &&
2406 di->lease_renew_from &&
2407 di->lease_renew_after == 0) {
2408 unsigned long duration =
2409 le32_to_cpu(h->duration_ms) * HZ / 1000;
2410
2411 di->lease_seq = le32_to_cpu(h->seq);
2412 dentry->d_time = di->lease_renew_from + duration;
2413 di->lease_renew_after = di->lease_renew_from +
2414 (duration >> 1);
2415 di->lease_renew_from = 0;
2416 }
2417 break;
2418 }
2419 spin_unlock(&dentry->d_lock);
2420 dput(dentry);
2421
2422 if (!release)
2423 goto out;
2424
2425release:
2426 /* let's just reuse the same message */
2427 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2428 ceph_msg_get(msg);
2429 ceph_con_send(&session->s_con, msg);
2430
2431out:
2432 iput(inode);
2433 mutex_unlock(&session->s_mutex);
2434 return;
2435
2436bad:
2437 pr_err("corrupt lease message\n");
2438 ceph_msg_dump(msg);
2439}
2440
2441void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2442 struct inode *inode,
2443 struct dentry *dentry, char action,
2444 u32 seq)
2445{
2446 struct ceph_msg *msg;
2447 struct ceph_mds_lease *lease;
2448 int len = sizeof(*lease) + sizeof(u32);
2449 int dnamelen = 0;
2450
2451 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2452 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2453 dnamelen = dentry->d_name.len;
2454 len += dnamelen;
2455
2456 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2457 if (IS_ERR(msg))
2458 return;
2459 lease = msg->front.iov_base;
2460 lease->action = action;
2461 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2462 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2463 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2464 lease->seq = cpu_to_le32(seq);
2465 put_unaligned_le32(dnamelen, lease + 1);
2466 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2467
2468 /*
2469 * if this is a preemptive lease RELEASE, no need to
2470 * flush request stream, since the actual request will
2471 * soon follow.
2472 */
2473 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2474
2475 ceph_con_send(&session->s_con, msg);
2476}
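/*
 * Message layout, as assembled above: struct ceph_mds_lease followed
 * by a le32 name length and the dentry name bytes, for a total front
 * of sizeof(*lease) + 4 + d_name.len.
 */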
2477
2478/*
2479 * Preemptively release a lease we expect to invalidate anyway.
2480 * Both @inode and @dentry are required; @mask must be CEPH_LOCK_DN.
2481 */
2482void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2483 struct dentry *dentry, int mask)
2484{
2485 struct ceph_dentry_info *di;
2486 struct ceph_mds_session *session;
2487 u32 seq;
2488
2489 BUG_ON(inode == NULL);
2490 BUG_ON(dentry == NULL);
2491 BUG_ON(mask != CEPH_LOCK_DN);
2492
2493 /* is dentry lease valid? */
2494 spin_lock(&dentry->d_lock);
2495 di = ceph_dentry(dentry);
2496 if (!di || !di->lease_session ||
2497 di->lease_session->s_mds < 0 ||
2498 di->lease_gen != di->lease_session->s_cap_gen ||
2499 !time_before(jiffies, dentry->d_time)) {
2500 dout("lease_release inode %p dentry %p -- "
2501 "no lease on %d\n",
2502 inode, dentry, mask);
2503 spin_unlock(&dentry->d_lock);
2504 return;
2505 }
2506
2507 /* we do have a lease on this dentry; note mds and seq */
2508 session = ceph_get_mds_session(di->lease_session);
2509 seq = di->lease_seq;
2510 __ceph_mdsc_drop_dentry_lease(dentry);
2511 spin_unlock(&dentry->d_lock);
2512
2513 dout("lease_release inode %p dentry %p mask %d to mds%d\n",
2514 inode, dentry, mask, session->s_mds);
2515 ceph_mdsc_lease_send_msg(session, inode, dentry,
2516 CEPH_MDS_LEASE_RELEASE, seq);
2517 ceph_put_mds_session(session);
2518}
2519
2520/*
2521 * drop all leases (and dentry refs) in preparation for umount
2522 */
2523static void drop_leases(struct ceph_mds_client *mdsc)
2524{
2525 int i;
2526
2527 dout("drop_leases\n");
2528 mutex_lock(&mdsc->mutex);
2529 for (i = 0; i < mdsc->max_sessions; i++) {
2530 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2531 if (!s)
2532 continue;
2533 mutex_unlock(&mdsc->mutex);
2534 mutex_lock(&s->s_mutex);
2535 mutex_unlock(&s->s_mutex);
2536 ceph_put_mds_session(s);
2537 mutex_lock(&mdsc->mutex);
2538 }
2539 mutex_unlock(&mdsc->mutex);
2540}
2541
2542
2543
2544/*
2545 * delayed work -- periodically trim expired leases, renew caps with mds
2546 */
2547static void schedule_delayed(struct ceph_mds_client *mdsc)
2548{
2549 int delay = 5;
2550 unsigned hz = round_jiffies_relative(HZ * delay);
2551 schedule_delayed_work(&mdsc->delayed_work, hz);
2552}
2553
2554static void delayed_work(struct work_struct *work)
2555{
2556 int i;
2557 struct ceph_mds_client *mdsc =
2558 container_of(work, struct ceph_mds_client, delayed_work.work);
2559 int renew_interval;
2560 int renew_caps;
2561
2562 dout("mdsc delayed_work\n");
2563 ceph_check_delayed_caps(mdsc);
2564
2565 mutex_lock(&mdsc->mutex);
2566 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2567 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2568 mdsc->last_renew_caps);
2569 if (renew_caps)
2570 mdsc->last_renew_caps = jiffies;
2571
2572 for (i = 0; i < mdsc->max_sessions; i++) {
2573 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2574 if (s == NULL)
2575 continue;
2576 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2577 dout("resending session close request for mds%d\n",
2578 s->s_mds);
2579 request_close_session(mdsc, s);
2580 ceph_put_mds_session(s);
2581 continue;
2582 }
2583 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2584 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2585 s->s_state = CEPH_MDS_SESSION_HUNG;
2586 pr_info("mds%d hung\n", s->s_mds);
2587 }
2588 }
2589 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2590 /* this mds is failed or recovering, just wait */
2591 ceph_put_mds_session(s);
2592 continue;
2593 }
2594 mutex_unlock(&mdsc->mutex);
2595
2596 mutex_lock(&s->s_mutex);
2597 if (renew_caps)
2598 send_renew_caps(mdsc, s);
2599 else
2600 ceph_con_keepalive(&s->s_con);
2601 add_cap_releases(mdsc, s, -1);
2602 send_cap_releases(mdsc, s);
2603 mutex_unlock(&s->s_mutex);
2604 ceph_put_mds_session(s);
2605
2606 mutex_lock(&mdsc->mutex);
2607 }
2608 mutex_unlock(&mdsc->mutex);
2609
2610 schedule_delayed(mdsc);
2611}
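/*
 * Timing sketch (illustrative): with the 5 second tick from
 * schedule_delayed() and a 60 second m_session_timeout, renew_interval
 * is 15 seconds, so roughly every third run sends RENEWCAPS while the
 * remaining runs just issue keepalives and push out queued cap
 * releases.
 */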
2612
2613
2614int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2615{
2616 mdsc->client = client;
2617 mutex_init(&mdsc->mutex);
2618 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2619 init_completion(&mdsc->safe_umount_waiters);
2620 init_completion(&mdsc->session_close_waiters);
2621 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2622 mdsc->sessions = NULL;
2623 mdsc->max_sessions = 0;
2624 mdsc->stopping = 0;
2625 init_rwsem(&mdsc->snap_rwsem);
2626 mdsc->snap_realms = RB_ROOT;
2627 INIT_LIST_HEAD(&mdsc->snap_empty);
2628 spin_lock_init(&mdsc->snap_empty_lock);
2629 mdsc->last_tid = 0;
2630 mdsc->request_tree = RB_ROOT;
2631 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2632 mdsc->last_renew_caps = jiffies;
2633 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2634 spin_lock_init(&mdsc->cap_delay_lock);
2635 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2636 spin_lock_init(&mdsc->snap_flush_lock);
2637 mdsc->cap_flush_seq = 0;
2638 INIT_LIST_HEAD(&mdsc->cap_dirty);
2639 mdsc->num_cap_flushing = 0;
2640 spin_lock_init(&mdsc->cap_dirty_lock);
2641 init_waitqueue_head(&mdsc->cap_flushing_wq);
2642 spin_lock_init(&mdsc->dentry_lru_lock);
2643 INIT_LIST_HEAD(&mdsc->dentry_lru);
2644 return 0;
2645}
2646
2647/*
2648 * Wait for safe replies on open mds requests. If we time out, drop
2649 * all requests from the tree to avoid dangling dentry refs.
2650 */
2651static void wait_requests(struct ceph_mds_client *mdsc)
2652{
2653 struct ceph_mds_request *req;
2654 struct ceph_client *client = mdsc->client;
2655
2656 mutex_lock(&mdsc->mutex);
2657 if (__get_oldest_req(mdsc)) {
2658 mutex_unlock(&mdsc->mutex);
2659
2660 dout("wait_requests waiting for requests\n");
2661 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2662 client->mount_args->mount_timeout * HZ);
2663
2664 /* tear down remaining requests */
2665 mutex_lock(&mdsc->mutex);
2666 while ((req = __get_oldest_req(mdsc))) {
2667 dout("wait_requests timed out on tid %llu\n",
2668 req->r_tid);
2669 __unregister_request(mdsc, req);
2670 }
2671 }
2672 mutex_unlock(&mdsc->mutex);
2673 dout("wait_requests done\n");
2674}
2675
2676/*
2677 * called before mount is ro, and before dentries are torn down.
2678 * (hmm, does this still race with new lookups?)
2679 */
2680void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2681{
2682 dout("pre_umount\n");
2683 mdsc->stopping = 1;
2684
2685 drop_leases(mdsc);
2686 ceph_flush_dirty_caps(mdsc);
2687 wait_requests(mdsc);
2688}
2689
2690/*
2691 * wait for all write mds requests to flush.
2692 */
2693static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2694{
2695 struct ceph_mds_request *req = NULL, *nextreq;
2696 struct rb_node *n;
2697
2698 mutex_lock(&mdsc->mutex);
2699 dout("wait_unsafe_requests want %lld\n", want_tid);
2700restart:
2701 req = __get_oldest_req(mdsc);
2702 while (req && req->r_tid <= want_tid) {
2703 /* find next request */
2704 n = rb_next(&req->r_node);
2705 if (n)
2706 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
2707 else
2708 nextreq = NULL;
2709 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2710 /* write op */
2711 ceph_mdsc_get_request(req);
2712 if (nextreq)
2713 ceph_mdsc_get_request(nextreq);
2714 mutex_unlock(&mdsc->mutex);
2715 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2716 req->r_tid, want_tid);
2717 wait_for_completion(&req->r_safe_completion);
2718 mutex_lock(&mdsc->mutex);
2719 ceph_mdsc_put_request(req);
2720 if (!nextreq)
2721 break; /* next dne before, so we're done! */
2722 if (RB_EMPTY_NODE(&nextreq->r_node)) {
2723 /* next request was removed from tree */
2724 ceph_mdsc_put_request(nextreq);
2725 goto restart;
2726 }
2727 ceph_mdsc_put_request(nextreq); /* won't go away */
2728 }
2729 req = nextreq;
2730 }
2731 mutex_unlock(&mdsc->mutex);
2732 dout("wait_unsafe_requests done\n");
2733}
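/*
 * Note on the get/put dance above: holding a ref on nextreq keeps its
 * rb_node valid across the unlocked wait; if the node was unlinked in
 * the meantime (RB_EMPTY_NODE), the scan restarts from the oldest
 * request instead of following a stale pointer.
 */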
2734
2735void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2736{
2737 u64 want_tid, want_flush;
2738
2739 dout("sync\n");
2740 mutex_lock(&mdsc->mutex);
2741 want_tid = mdsc->last_tid;
2742 want_flush = mdsc->cap_flush_seq;
2743 mutex_unlock(&mdsc->mutex);
2744 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2745
2746 ceph_flush_dirty_caps(mdsc);
2747
2748 wait_unsafe_requests(mdsc, want_tid);
2749 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2750}
2751
2752
2753/*
2754 * called after sb is ro.
2755 */
2756void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2757{
2758 struct ceph_mds_session *session;
2759 int i;
2760 int n;
2761 struct ceph_client *client = mdsc->client;
2762 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2763
2764 dout("close_sessions\n");
2765
2766 mutex_lock(&mdsc->mutex);
2767
2768 /* close sessions */
2769 started = jiffies;
2770 while (time_before(jiffies, started + timeout)) {
2771 dout("closing sessions\n");
2772 n = 0;
2773 for (i = 0; i < mdsc->max_sessions; i++) {
2774 session = __ceph_lookup_mds_session(mdsc, i);
2775 if (!session)
2776 continue;
2777 mutex_unlock(&mdsc->mutex);
2778 mutex_lock(&session->s_mutex);
2779 __close_session(mdsc, session);
2780 mutex_unlock(&session->s_mutex);
2781 ceph_put_mds_session(session);
2782 mutex_lock(&mdsc->mutex);
2783 n++;
2784 }
2785 if (n == 0)
2786 break;
2787
2788 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2789 break;
2790
2791 dout("waiting for sessions to close\n");
2792 mutex_unlock(&mdsc->mutex);
2793 wait_for_completion_timeout(&mdsc->session_close_waiters,
2794 timeout);
2795 mutex_lock(&mdsc->mutex);
2796 }
2797
2798 /* tear down remaining sessions */
2799 for (i = 0; i < mdsc->max_sessions; i++) {
2800 if (mdsc->sessions[i]) {
2801 session = get_session(mdsc->sessions[i]);
2802 __unregister_session(mdsc, session);
2803 mutex_unlock(&mdsc->mutex);
2804 mutex_lock(&session->s_mutex);
2805 remove_session_caps(session);
2806 mutex_unlock(&session->s_mutex);
2807 ceph_put_mds_session(session);
2808 mutex_lock(&mdsc->mutex);
2809 }
2810 }
2811
2812 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2813
2814 mutex_unlock(&mdsc->mutex);
2815
2816 ceph_cleanup_empty_realms(mdsc);
2817
2818 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2819
2820 dout("stopped\n");
2821}
2822
2823void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2824{
2825 dout("stop\n");
2826 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2827 if (mdsc->mdsmap)
2828 ceph_mdsmap_destroy(mdsc->mdsmap);
2829 kfree(mdsc->sessions);
2830}
2831
2832
2833/*
2834 * handle mds map update.
2835 */
2836void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2837{
2838 u32 epoch;
2839 u32 maplen;
2840 void *p = msg->front.iov_base;
2841 void *end = p + msg->front.iov_len;
2842 struct ceph_mdsmap *newmap, *oldmap;
2843 struct ceph_fsid fsid;
2844 int err = -EINVAL;
2845
2846 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2847 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2848 if (ceph_check_fsid(mdsc->client, &fsid) < 0)
2849 return;
2850 epoch = ceph_decode_32(&p);
2851 maplen = ceph_decode_32(&p);
2852 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2853
2854 /* do we need it? */
2855 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
2856 mutex_lock(&mdsc->mutex);
2857 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2858 dout("handle_map epoch %u <= our %u\n",
2859 epoch, mdsc->mdsmap->m_epoch);
2860 mutex_unlock(&mdsc->mutex);
2861 return;
2862 }
2863
2864 newmap = ceph_mdsmap_decode(&p, end);
2865 if (IS_ERR(newmap)) {
2866 err = PTR_ERR(newmap);
2867 goto bad_unlock;
2868 }
2869
2870 /* swap into place */
2871 if (mdsc->mdsmap) {
2872 oldmap = mdsc->mdsmap;
2873 mdsc->mdsmap = newmap;
2874 check_new_map(mdsc, newmap, oldmap);
2875 ceph_mdsmap_destroy(oldmap);
2876 } else {
2877 mdsc->mdsmap = newmap; /* first mds map */
2878 }
2879 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
2880
2881 __wake_requests(mdsc, &mdsc->waiting_for_map);
2882
2883 mutex_unlock(&mdsc->mutex);
2884 schedule_delayed(mdsc);
2885 return;
2886
2887bad_unlock:
2888 mutex_unlock(&mdsc->mutex);
2889bad:
2890 pr_err("error decoding mdsmap %d\n", err);
2891 return;
2892}
2893
2894static struct ceph_connection *con_get(struct ceph_connection *con)
2895{
2896 struct ceph_mds_session *s = con->private;
2897
2898 if (get_session(s)) {
2899 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2900 return con;
2901 }
2902 dout("mdsc con_get %p FAIL\n", s);
2903 return NULL;
2904}
2905
2906static void con_put(struct ceph_connection *con)
2907{
2908 struct ceph_mds_session *s = con->private;
2909
2910 	dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
2911 	ceph_put_mds_session(s); /* may free s; don't touch it after this */
2912}
2913
2914/*
2915 * if the client is unresponsive for long enough, the mds will kill
2916 * the session entirely.
2917 */
2918static void peer_reset(struct ceph_connection *con)
2919{
2920 struct ceph_mds_session *s = con->private;
2921
2922 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
2923 s->s_mds);
2924}
2925
2926static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2927{
2928 struct ceph_mds_session *s = con->private;
2929 struct ceph_mds_client *mdsc = s->s_mdsc;
2930 int type = le16_to_cpu(msg->hdr.type);
2931
2932 mutex_lock(&mdsc->mutex);
2933 if (__verify_registered_session(mdsc, s) < 0) {
2934 mutex_unlock(&mdsc->mutex);
2935 goto out;
2936 }
2937 mutex_unlock(&mdsc->mutex);
2938
2939 switch (type) {
2940 case CEPH_MSG_MDS_MAP:
2941 ceph_mdsc_handle_map(mdsc, msg);
2942 break;
2943 case CEPH_MSG_CLIENT_SESSION:
2944 handle_session(s, msg);
2945 break;
2946 case CEPH_MSG_CLIENT_REPLY:
2947 handle_reply(s, msg);
2948 break;
2949 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2950 handle_forward(mdsc, s, msg);
2951 break;
2952 case CEPH_MSG_CLIENT_CAPS:
2953 ceph_handle_caps(s, msg);
2954 break;
2955 case CEPH_MSG_CLIENT_SNAP:
2956 ceph_handle_snap(mdsc, s, msg);
2957 break;
2958 case CEPH_MSG_CLIENT_LEASE:
2959 handle_lease(mdsc, s, msg);
2960 break;
2961
2962 default:
2963 pr_err("received unknown message type %d %s\n", type,
2964 ceph_msg_type_name(type));
2965 }
2966out:
2967 ceph_msg_put(msg);
2968}
2969
2970/*
2971 * authentication
2972 */
2973static int get_authorizer(struct ceph_connection *con,
2974 void **buf, int *len, int *proto,
2975 void **reply_buf, int *reply_len, int force_new)
2976{
2977 struct ceph_mds_session *s = con->private;
2978 struct ceph_mds_client *mdsc = s->s_mdsc;
2979 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2980 int ret = 0;
2981
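2982 	/* on an auth retry the messenger sets force_new: throw away the
2983 	 * rejected authorizer so a fresh one is built below */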
2982 if (force_new && s->s_authorizer) {
2983 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2984 s->s_authorizer = NULL;
2985 }
2986 if (s->s_authorizer == NULL) {
2987 if (ac->ops->create_authorizer) {
2988 ret = ac->ops->create_authorizer(
2989 ac, CEPH_ENTITY_TYPE_MDS,
2990 &s->s_authorizer,
2991 &s->s_authorizer_buf,
2992 &s->s_authorizer_buf_len,
2993 &s->s_authorizer_reply_buf,
2994 &s->s_authorizer_reply_buf_len);
2995 if (ret)
2996 return ret;
2997 }
2998 }
2999
3000 *proto = ac->protocol;
3001 *buf = s->s_authorizer_buf;
3002 *len = s->s_authorizer_buf_len;
3003 *reply_buf = s->s_authorizer_reply_buf;
3004 *reply_len = s->s_authorizer_reply_buf_len;
3005 return 0;
3006}
3007
3008
3009static int verify_authorizer_reply(struct ceph_connection *con, int len)
3010{
3011 struct ceph_mds_session *s = con->private;
3012 struct ceph_mds_client *mdsc = s->s_mdsc;
3013 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3014
3015 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3016}
3017
3018static int invalidate_authorizer(struct ceph_connection *con)
3019{
3020 struct ceph_mds_session *s = con->private;
3021 struct ceph_mds_client *mdsc = s->s_mdsc;
3022 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3023
3024 if (ac->ops->invalidate_authorizer)
3025 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3026
3027 return ceph_monc_validate_auth(&mdsc->client->monc);
3028}
3029
3030static const struct ceph_connection_operations mds_con_ops = {
3031 .get = con_get,
3032 .put = con_put,
3033 .dispatch = dispatch,
3034 .get_authorizer = get_authorizer,
3035 .verify_authorizer_reply = verify_authorizer_reply,
3036 .invalidate_authorizer = invalidate_authorizer,
3037 .peer_reset = peer_reset,
3038};
3039
3040
3041
3042
3043/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
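18 * (an indented lock is acquired while already holding the lock it
19 *  is listed under)
20 *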
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
28
29struct ceph_client;
30struct ceph_cap;
31
32/*
33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload.
35 */
36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in;
38 u32 symlink_len;
39 char *symlink;
40 u32 xattr_len;
41 char *xattr_data;
42};
43
44/*
45 * parsed info about an mds reply, including information about the
46 * target inode and/or its parent directory and dentry, and directory
47 * contents (for readdir results).
48 */
49struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head;
51
52 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname;
55 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease;
57
58 struct ceph_mds_reply_dirfrag *dir_dir;
59 int dir_nr;
60 char **dir_dname;
61 u32 *dir_dname_len;
62 struct ceph_mds_reply_lease **dir_dlease;
63 struct ceph_mds_reply_info_in *dir_in;
64 u8 dir_complete, dir_end;
65
66 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */
68 void *snapblob;
69 int snapblob_len;
70};
71
72
73/*
74 * cap releases are batched and sent to the MDS en masse.
75 */
76#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
77 sizeof(struct ceph_mds_cap_release)) / \
78 sizeof(struct ceph_mds_cap_item))
79
80
81/*
82 * state associated with each MDS<->client session
83 */
84enum {
85 CEPH_MDS_SESSION_NEW = 1,
86 CEPH_MDS_SESSION_OPENING = 2,
87 CEPH_MDS_SESSION_OPEN = 3,
88 CEPH_MDS_SESSION_HUNG = 4,
89 CEPH_MDS_SESSION_CLOSING = 5,
90 CEPH_MDS_SESSION_RESTARTING = 6,
91 CEPH_MDS_SESSION_RECONNECTING = 7,
92};
93
94struct ceph_mds_session {
95 struct ceph_mds_client *s_mdsc;
96 int s_mds;
97 int s_state;
98 unsigned long s_ttl; /* time until mds kills us */
99 u64 s_seq; /* incoming msg seq # */
100 struct mutex s_mutex; /* serialize session messages */
101
102 struct ceph_connection s_con;
103
104 struct ceph_authorizer *s_authorizer;
105 void *s_authorizer_buf, *s_authorizer_reply_buf;
106 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
107
108 /* protected by s_cap_lock */
109 spinlock_t s_cap_lock;
110 u32 s_cap_gen; /* inc each time we get mds stale msg */
111 unsigned long s_cap_ttl; /* when session caps expire */
112 struct list_head s_caps; /* all caps issued by this session */
113 int s_nr_caps, s_trim_caps;
114 int s_num_cap_releases;
115 struct list_head s_cap_releases; /* waiting cap_release messages */
116 struct list_head s_cap_releases_done; /* ready to send */
117 struct ceph_cap *s_cap_iterator;
118
119 /* protected by mutex */
120 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
121 struct list_head s_cap_snaps_flushing;
122 unsigned long s_renew_requested; /* last time we sent a renew req */
123 u64 s_renew_seq;
124
125 atomic_t s_ref;
126 struct list_head s_waiting; /* waiting requests */
127 struct list_head s_unsafe; /* unsafe requests */
128};
129
130/*
131 * modes of choosing which MDS to send a request to
132 */
133enum {
134 USE_ANY_MDS,
135 USE_RANDOM_MDS,
136 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
137};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
148/*
149 * an in-flight mds request
150 */
151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */
153 struct rb_node r_node;
154
155 int r_op; /* mds op code */
156 int r_mds;
157
158 /* operation on what? */
159 struct inode *r_inode; /* arg1 */
160 struct dentry *r_dentry; /* arg1 */
161 struct dentry *r_old_dentry; /* arg2: rename from or link from */
162 char *r_path1, *r_path2;
163 struct ceph_vino r_ino1, r_ino2;
164
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */
167
168 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */
170
171 /* for choosing which mds to send this request to */
172 int r_direct_mode;
173 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
174 bool r_direct_is_hash; /* true if r_direct_hash is valid */
175
176 /* data payload is used for xattr ops */
177 struct page **r_pages;
178 int r_num_pages;
179 int r_data_len;
180
181 /* what caps shall we drop? */
182 int r_inode_drop, r_inode_unless;
183 int r_dentry_drop, r_dentry_unless;
184 int r_old_dentry_drop, r_old_dentry_unless;
185 struct inode *r_old_inode;
186 int r_old_inode_drop, r_old_inode_unless;
187
188 struct ceph_msg *r_request; /* original request */
189 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err;
192 bool r_aborted;
193
194 unsigned long r_timeout; /* optional. jiffies */
195 unsigned long r_started; /* start time to measure timeout against */
196 unsigned long r_request_started; /* start time for mds request only,
197 used to measure lease durations */
198
199 /* link unsafe requests to parent directory, for fsync */
200 struct inode *r_unsafe_dir;
201 struct list_head r_unsafe_dir_item;
202
203 struct ceph_mds_session *r_session;
204
205 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any*/
209
210 struct kref r_kref;
211 struct list_head r_wait;
212 struct completion r_completion;
213 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe;
217
218 bool r_did_prepopulate;
219 u32 r_readdir_offset;
220
221 struct ceph_cap_reservation r_caps_reservation;
222 int r_num_caps;
223};
224
225/*
226 * mds client state
227 */
228struct ceph_mds_client {
229 struct ceph_client *client;
230 struct mutex mutex; /* all nested structures */
231
232 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters;
234 struct list_head waiting_for_map;
235
236 	struct ceph_mds_session **sessions; /* NULL if no session with that mds */
237 	int max_sessions; /* len of sessions array */
238 int stopping; /* true if shutting down */
239
240 /*
241 * snap_rwsem will cover cap linkage into snaprealms, and
242 	 * realm snap contexts.  (Later, we can do per-realm snap
243 	 * context locks.)  The empty list contains realms with no
244 * references (implying they contain no inodes with caps) that
245 * should be destroyed.
246 */
247 struct rw_semaphore snap_rwsem;
248 struct rb_root snap_realms;
249 struct list_head snap_empty;
250 spinlock_t snap_empty_lock; /* protect snap_empty */
251
252 u64 last_tid; /* most recent mds request */
253 struct rb_root request_tree; /* pending mds requests */
254 struct delayed_work delayed_work; /* delayed work */
255 unsigned long last_renew_caps; /* last time we renewed our caps */
256 struct list_head cap_delay_list; /* caps with delayed release */
257 spinlock_t cap_delay_lock; /* protects cap_delay_list */
258 struct list_head snap_flush_list; /* cap_snaps ready to flush */
259 spinlock_t snap_flush_lock;
260
261 u64 cap_flush_seq;
262 struct list_head cap_dirty; /* inodes with dirty caps */
263 int num_cap_flushing; /* # caps we are flushing */
264 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq;
266
267#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file;
269#endif
270
271 spinlock_t dentry_lru_lock;
272 struct list_head dentry_lru;
273 int num_dentry;
274};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
281static inline struct ceph_mds_session *
282ceph_get_mds_session(struct ceph_mds_session *s)
283{
284 atomic_inc(&s->s_ref);
285 return s;
286}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
311static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
312{
313 kref_get(&req->r_kref);
314}
315extern void ceph_mdsc_release_request(struct kref *kref);
316static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
317{
318 kref_put(&req->r_kref, ceph_mdsc_release_request);
319}
320
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
23 	u8 r;  /* must be unsigned: a negative value breaks the modulo below */
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
32 /* pick */
33 get_random_bytes(&r, 1);
34 n = r % n;
35 	/* walk to the n'th up mds, skipping over any that are down */
36 	for (i = 0; n > 0 || m->m_info[i].state <= 0; i++)
37 		if (m->m_info[i].state > 0)
38 			n--;
39
40 return i;
41}
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
126 			if (num_export_targets) {
127 				m->m_info[mds].export_targets =
128 					kcalloc(num_export_targets, sizeof(u32),
129 						GFP_NOFS);
130 				if (m->m_info[mds].export_targets == NULL)
131 					goto badmem;
132 				for (j = 0; j < num_export_targets; j++)
133 					m->m_info[mds].export_targets[j] =
134 						ceph_decode_32(&pexport_targets);
133 } else {
134 m->m_info[mds].export_targets = NULL;
135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(-EINVAL);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 u32 *export_targets;
17};
18
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
25 	u32 m_max_mds; /* size of m_info array */
26 struct ceph_mds_info *m_info;
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools;
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..8f1715ffbe4b
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2240 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
27
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33
34static void queue_con(struct ceph_connection *con);
35static void con_work(struct work_struct *);
36static void ceph_fault(struct ceph_connection *con);
37
38const char *ceph_name_type_str(int t)
39{
40 switch (t) {
41 case CEPH_ENTITY_TYPE_MON: return "mon";
42 case CEPH_ENTITY_TYPE_MDS: return "mds";
43 case CEPH_ENTITY_TYPE_OSD: return "osd";
44 case CEPH_ENTITY_TYPE_CLIENT: return "client";
45 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
46 default: return "???";
47 }
48}
49
50/*
51 * nicely render a sockaddr as a string.
52 */
53#define MAX_ADDR_STR 20
54static char addr_str[MAX_ADDR_STR][60]; /* fits a full IPv6 addr + port */
55static DEFINE_SPINLOCK(addr_str_lock);
56static int last_addr_str;
57
58const char *pr_addr(const struct sockaddr_storage *ss)
59{
60 int i;
61 char *s;
62 struct sockaddr_in *in4 = (void *)ss;
63 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
64 struct sockaddr_in6 *in6 = (void *)ss;
65
66 spin_lock(&addr_str_lock);
67 i = last_addr_str++;
68 if (last_addr_str == MAX_ADDR_STR)
69 last_addr_str = 0;
70 spin_unlock(&addr_str_lock);
71 s = addr_str[i];
72
73 switch (ss->ss_family) {
74 case AF_INET:
75 sprintf(s, "%u.%u.%u.%u:%u",
76 (unsigned int)quad[0],
77 (unsigned int)quad[1],
78 (unsigned int)quad[2],
79 (unsigned int)quad[3],
80 (unsigned int)ntohs(in4->sin_port));
81 break;
82
83 case AF_INET6:
84 		sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
85 			ntohs(in6->sin6_addr.s6_addr16[0]),
86 			ntohs(in6->sin6_addr.s6_addr16[1]),
87 			ntohs(in6->sin6_addr.s6_addr16[2]),
88 			ntohs(in6->sin6_addr.s6_addr16[3]),
89 			ntohs(in6->sin6_addr.s6_addr16[4]),
90 			ntohs(in6->sin6_addr.s6_addr16[5]),
91 			ntohs(in6->sin6_addr.s6_addr16[6]),
92 			ntohs(in6->sin6_addr.s6_addr16[7]),
93 			(unsigned int)ntohs(in6->sin6_port));
94 break;
95
96 default:
97 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
98 }
99
100 return s;
101}
102
103static void encode_my_addr(struct ceph_messenger *msgr)
104{
105 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
106 ceph_encode_addr(&msgr->my_enc_addr);
107}
108
109/*
110 * work queue for all reading and writing to/from the socket.
111 */
112struct workqueue_struct *ceph_msgr_wq;
113
114int __init ceph_msgr_init(void)
115{
116 	/* create_workqueue() returns NULL on failure, not ERR_PTR */
117 	ceph_msgr_wq = create_workqueue("ceph-msgr");
118 	if (!ceph_msgr_wq) {
119 		pr_err("msgr_init failed to create workqueue\n");
120 		return -ENOMEM;
121 	}
122 	return 0;
123}
125
126void ceph_msgr_exit(void)
127{
128 destroy_workqueue(ceph_msgr_wq);
129}
130
131/*
132 * socket callback functions
133 */
134
135/* data available on socket, or listen socket received a connect */
136static void ceph_data_ready(struct sock *sk, int count_unused)
137{
138 struct ceph_connection *con =
139 (struct ceph_connection *)sk->sk_user_data;
140 if (sk->sk_state != TCP_CLOSE_WAIT) {
141 dout("ceph_data_ready on %p state = %lu, queueing work\n",
142 con, con->state);
143 queue_con(con);
144 }
145}
146
147/* socket has buffer space for writing */
148static void ceph_write_space(struct sock *sk)
149{
150 struct ceph_connection *con =
151 (struct ceph_connection *)sk->sk_user_data;
152
153 /* only queue to workqueue if there is data we want to write. */
154 if (test_bit(WRITE_PENDING, &con->state)) {
155 dout("ceph_write_space %p queueing write work\n", con);
156 queue_con(con);
157 } else {
158 dout("ceph_write_space %p nothing to write\n", con);
159 }
160
161 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
162 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
163}
164
165/* socket's state has changed */
166static void ceph_state_change(struct sock *sk)
167{
168 struct ceph_connection *con =
169 (struct ceph_connection *)sk->sk_user_data;
170
171 dout("ceph_state_change %p state = %lu sk_state = %u\n",
172 con, con->state, sk->sk_state);
173
174 if (test_bit(CLOSED, &con->state))
175 return;
176
177 switch (sk->sk_state) {
178 case TCP_CLOSE:
179 dout("ceph_state_change TCP_CLOSE\n");
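180 		/* fall through */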
180 case TCP_CLOSE_WAIT:
181 dout("ceph_state_change TCP_CLOSE_WAIT\n");
182 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
183 if (test_bit(CONNECTING, &con->state))
184 con->error_msg = "connection failed";
185 else
186 con->error_msg = "socket closed";
187 queue_con(con);
188 }
189 break;
190 case TCP_ESTABLISHED:
191 dout("ceph_state_change TCP_ESTABLISHED\n");
192 queue_con(con);
193 break;
194 }
195}
196
197/*
198 * set up socket callbacks
199 */
200static void set_sock_callbacks(struct socket *sock,
201 struct ceph_connection *con)
202{
203 struct sock *sk = sock->sk;
204 sk->sk_user_data = (void *)con;
205 sk->sk_data_ready = ceph_data_ready;
206 sk->sk_write_space = ceph_write_space;
207 sk->sk_state_change = ceph_state_change;
208}
209
210
211/*
212 * socket helpers
213 */
214
215/*
216 * initiate connection to a remote socket.
217 */
218static struct socket *ceph_tcp_connect(struct ceph_connection *con)
219{
220 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
221 struct socket *sock;
222 int ret;
223
224 BUG_ON(con->sock);
225 	ret = sock_create_kern(paddr->sa_family, SOCK_STREAM, IPPROTO_TCP,
226 			       &sock);
226 if (ret)
227 return ERR_PTR(ret);
228 con->sock = sock;
229 sock->sk->sk_allocation = GFP_NOFS;
230
231 set_sock_callbacks(sock, con);
232
233 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
234
235 	ret = sock->ops->connect(sock, paddr, sizeof(con->peer_addr.in_addr),
236 				 O_NONBLOCK);
236 if (ret == -EINPROGRESS) {
237 dout("connect %s EINPROGRESS sk_state = %u\n",
238 pr_addr(&con->peer_addr.in_addr),
239 sock->sk->sk_state);
240 ret = 0;
241 }
242 if (ret < 0) {
243 pr_err("connect %s error %d\n",
244 pr_addr(&con->peer_addr.in_addr), ret);
245 sock_release(sock);
246 con->sock = NULL;
247 con->error_msg = "connect error";
248 }
249
250 if (ret < 0)
251 return ERR_PTR(ret);
252 return sock;
253}
254
255static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
256{
257 struct kvec iov = {buf, len};
258 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
259
260 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
261}
262
263/*
264 * write something. @more is true if caller will be sending more data
265 * shortly.
266 */
267static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
268 size_t kvlen, size_t len, int more)
269{
270 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
271
272 if (more)
273 msg.msg_flags |= MSG_MORE;
274 else
275 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
276
277 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
278}
279
280
281/*
282 * Shutdown/close the socket for the given connection.
283 */
284static int con_close_socket(struct ceph_connection *con)
285{
286 int rc;
287
288 dout("con_close_socket on %p sock %p\n", con, con->sock);
289 if (!con->sock)
290 return 0;
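291 	/* mark SOCK_CLOSED so the state_change callback, which may fire
292 	 * during shutdown, knows this close is deliberate */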
291 set_bit(SOCK_CLOSED, &con->state);
292 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
293 sock_release(con->sock);
294 con->sock = NULL;
295 clear_bit(SOCK_CLOSED, &con->state);
296 return rc;
297}
298
299/*
300 * Reset a connection. Discard all incoming and outgoing messages
301 * and clear *_seq state.
302 */
303static void ceph_msg_remove(struct ceph_msg *msg)
304{
305 list_del_init(&msg->list_head);
306 ceph_msg_put(msg);
307}
308static void ceph_msg_remove_list(struct list_head *head)
309{
310 while (!list_empty(head)) {
311 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
312 list_head);
313 ceph_msg_remove(msg);
314 }
315}
316
317static void reset_connection(struct ceph_connection *con)
318{
319 /* reset connection, out_queue, msg_ and connect_seq */
320 /* discard existing out_queue and msg_seq */
321 ceph_msg_remove_list(&con->out_queue);
322 ceph_msg_remove_list(&con->out_sent);
323
324 if (con->in_msg) {
325 ceph_msg_put(con->in_msg);
326 con->in_msg = NULL;
327 }
328
329 con->connect_seq = 0;
330 con->out_seq = 0;
331 if (con->out_msg) {
332 ceph_msg_put(con->out_msg);
333 con->out_msg = NULL;
334 }
335 con->in_seq = 0;
336}
337
338/*
339 * mark a peer down. drop any open connections.
340 */
341void ceph_con_close(struct ceph_connection *con)
342{
343 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
344 set_bit(CLOSED, &con->state); /* in case there's queued work */
345 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
346 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
347 clear_bit(KEEPALIVE_PENDING, &con->state);
348 clear_bit(WRITE_PENDING, &con->state);
349 mutex_lock(&con->mutex);
350 reset_connection(con);
351 cancel_delayed_work(&con->work);
352 mutex_unlock(&con->mutex);
353 queue_con(con);
354}
355
356/*
357 * Reopen a closed connection, with a new peer address.
358 */
359void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
360{
361 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
362 set_bit(OPENING, &con->state);
363 clear_bit(CLOSED, &con->state);
364 memcpy(&con->peer_addr, addr, sizeof(*addr));
365 con->delay = 0; /* reset backoff memory */
366 queue_con(con);
367}
368
369/*
370 * return true if this connection ever successfully opened
371 */
372bool ceph_con_opened(struct ceph_connection *con)
373{
374 return con->connect_seq > 0;
375}
376
377/*
378 * generic get/put
379 */
380struct ceph_connection *ceph_con_get(struct ceph_connection *con)
381{
382 dout("con_get %p nref = %d -> %d\n", con,
383 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
384 if (atomic_inc_not_zero(&con->nref))
385 return con;
386 return NULL;
387}
388
389void ceph_con_put(struct ceph_connection *con)
390{
391 dout("con_put %p nref = %d -> %d\n", con,
392 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
393 BUG_ON(atomic_read(&con->nref) == 0);
394 if (atomic_dec_and_test(&con->nref)) {
395 BUG_ON(con->sock);
396 kfree(con);
397 }
398}
399
400/*
401 * initialize a new connection.
402 */
403void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
404{
405 dout("con_init %p\n", con);
406 memset(con, 0, sizeof(*con));
407 atomic_set(&con->nref, 1);
408 con->msgr = msgr;
409 mutex_init(&con->mutex);
410 INIT_LIST_HEAD(&con->out_queue);
411 INIT_LIST_HEAD(&con->out_sent);
412 INIT_DELAYED_WORK(&con->work, con_work);
413}
414
415
416/*
417 * We maintain a global counter to order connection attempts. Get
418 * a unique seq greater than @gt.
419 */
420static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
421{
422 u32 ret;
423
424 spin_lock(&msgr->global_seq_lock);
425 if (msgr->global_seq < gt)
426 msgr->global_seq = gt;
427 ret = ++msgr->global_seq;
428 spin_unlock(&msgr->global_seq_lock);
429 return ret;
430}
431
432
433/*
434 * Prepare footer for currently outgoing message, and finish things
435 * off.  Assumes out_kvec* are already valid; we just add on to the end.
436 */
437static void prepare_write_message_footer(struct ceph_connection *con, int v)
438{
439 struct ceph_msg *m = con->out_msg;
440
441 dout("prepare_write_message_footer %p\n", con);
442 con->out_kvec_is_msg = true;
443 con->out_kvec[v].iov_base = &m->footer;
444 con->out_kvec[v].iov_len = sizeof(m->footer);
445 con->out_kvec_bytes += sizeof(m->footer);
446 con->out_kvec_left++;
447 con->out_more = m->more_to_follow;
448 con->out_msg_done = true;
449}
450
451/*
452 * Prepare headers for the next outgoing message.
453 */
454static void prepare_write_message(struct ceph_connection *con)
455{
456 struct ceph_msg *m;
457 int v = 0;
458
459 con->out_kvec_bytes = 0;
460 con->out_kvec_is_msg = true;
461 con->out_msg_done = false;
462
463 /* Sneak an ack in there first? If we can get it into the same
464 * TCP packet that's a good thing. */
465 if (con->in_seq > con->in_seq_acked) {
466 con->in_seq_acked = con->in_seq;
467 con->out_kvec[v].iov_base = &tag_ack;
468 con->out_kvec[v++].iov_len = 1;
469 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
470 con->out_kvec[v].iov_base = &con->out_temp_ack;
471 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
472 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
473 }
474
475 m = list_first_entry(&con->out_queue,
476 struct ceph_msg, list_head);
477 con->out_msg = m;
478 if (test_bit(LOSSYTX, &con->state)) {
479 list_del_init(&m->list_head);
480 } else {
481 /* put message on sent list */
482 ceph_msg_get(m);
483 list_move_tail(&m->list_head, &con->out_sent);
484 }
485
486 m->hdr.seq = cpu_to_le64(++con->out_seq);
487
488 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
489 m, con->out_seq, le16_to_cpu(m->hdr.type),
490 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
491 le32_to_cpu(m->hdr.data_len),
492 m->nr_pages);
493 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
494
495 /* tag + hdr + front + middle */
496 con->out_kvec[v].iov_base = &tag_msg;
497 con->out_kvec[v++].iov_len = 1;
498 con->out_kvec[v].iov_base = &m->hdr;
499 con->out_kvec[v++].iov_len = sizeof(m->hdr);
500 con->out_kvec[v++] = m->front;
501 if (m->middle)
502 con->out_kvec[v++] = m->middle->vec;
503 con->out_kvec_left = v;
504 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
505 (m->middle ? m->middle->vec.iov_len : 0);
506 con->out_kvec_cur = con->out_kvec;
507
508 /* fill in crc (except data pages), footer */
509 con->out_msg->hdr.crc =
510 cpu_to_le32(crc32c(0, (void *)&m->hdr,
511 sizeof(m->hdr) - sizeof(m->hdr.crc)));
512 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
513 con->out_msg->footer.front_crc =
514 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
515 if (m->middle)
516 con->out_msg->footer.middle_crc =
517 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
518 m->middle->vec.iov_len));
519 else
520 con->out_msg->footer.middle_crc = 0;
521 con->out_msg->footer.data_crc = 0;
522 dout("prepare_write_message front_crc %u data_crc %u\n",
523 le32_to_cpu(con->out_msg->footer.front_crc),
524 le32_to_cpu(con->out_msg->footer.middle_crc));
525
526 /* is there a data payload? */
527 if (le32_to_cpu(m->hdr.data_len) > 0) {
528 /* initialize page iterator */
529 con->out_msg_pos.page = 0;
530 con->out_msg_pos.page_pos =
531 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
532 con->out_msg_pos.data_pos = 0;
533 con->out_msg_pos.did_page_crc = 0;
534 con->out_more = 1; /* data + footer will follow */
535 } else {
536 /* no, queue up footer too and be done */
537 prepare_write_message_footer(con, v);
538 }
539
540 set_bit(WRITE_PENDING, &con->state);
541}
542
543/*
544 * Prepare an ack.
545 */
546static void prepare_write_ack(struct ceph_connection *con)
547{
548 dout("prepare_write_ack %p %llu -> %llu\n", con,
549 con->in_seq_acked, con->in_seq);
550 con->in_seq_acked = con->in_seq;
551
552 con->out_kvec[0].iov_base = &tag_ack;
553 con->out_kvec[0].iov_len = 1;
554 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
555 con->out_kvec[1].iov_base = &con->out_temp_ack;
556 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
557 con->out_kvec_left = 2;
558 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
559 con->out_kvec_cur = con->out_kvec;
560 con->out_more = 1; /* more will follow.. eventually.. */
561 set_bit(WRITE_PENDING, &con->state);
562}
563
564/*
565 * Prepare to write keepalive byte.
566 */
567static void prepare_write_keepalive(struct ceph_connection *con)
568{
569 dout("prepare_write_keepalive %p\n", con);
570 con->out_kvec[0].iov_base = &tag_keepalive;
571 con->out_kvec[0].iov_len = 1;
572 con->out_kvec_left = 1;
573 con->out_kvec_bytes = 1;
574 con->out_kvec_cur = con->out_kvec;
575 set_bit(WRITE_PENDING, &con->state);
576}
577
578/*
579 * Connection negotiation.
580 */
581
582static void prepare_connect_authorizer(struct ceph_connection *con)
583{
584 void *auth_buf;
585 int auth_len = 0;
586 int auth_protocol = 0;
587
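588 	/* drop con->mutex across the callback: building an authorizer may
589 	 * block or take the auth client's own locks */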
588 mutex_unlock(&con->mutex);
589 if (con->ops->get_authorizer)
590 con->ops->get_authorizer(con, &auth_buf, &auth_len,
591 &auth_protocol, &con->auth_reply_buf,
592 &con->auth_reply_buf_len,
593 con->auth_retry);
594 mutex_lock(&con->mutex);
595
596 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
597 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
598
599 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
600 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
601 con->out_kvec_left++;
602 con->out_kvec_bytes += auth_len;
603}
604
605/*
606 * We connected to a peer and are saying hello.
607 */
608static void prepare_write_banner(struct ceph_messenger *msgr,
609 struct ceph_connection *con)
610{
611 int len = strlen(CEPH_BANNER);
612
613 con->out_kvec[0].iov_base = CEPH_BANNER;
614 con->out_kvec[0].iov_len = len;
615 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
616 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
617 con->out_kvec_left = 2;
618 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
619 con->out_kvec_cur = con->out_kvec;
620 con->out_more = 0;
621 set_bit(WRITE_PENDING, &con->state);
622}
623
624static void prepare_write_connect(struct ceph_messenger *msgr,
625 struct ceph_connection *con,
626 int after_banner)
627{
628 unsigned global_seq = get_global_seq(con->msgr, 0);
629 int proto;
630
631 switch (con->peer_name.type) {
632 case CEPH_ENTITY_TYPE_MON:
633 proto = CEPH_MONC_PROTOCOL;
634 break;
635 case CEPH_ENTITY_TYPE_OSD:
636 proto = CEPH_OSDC_PROTOCOL;
637 break;
638 case CEPH_ENTITY_TYPE_MDS:
639 proto = CEPH_MDSC_PROTOCOL;
640 break;
641 default:
642 BUG();
643 }
644
645 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
646 con->connect_seq, global_seq, proto);
647
648 	con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
649 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
650 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
651 con->out_connect.global_seq = cpu_to_le32(global_seq);
652 con->out_connect.protocol_version = cpu_to_le32(proto);
653 con->out_connect.flags = 0;
654
655 if (!after_banner) {
656 con->out_kvec_left = 0;
657 con->out_kvec_bytes = 0;
658 }
659 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
660 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
661 con->out_kvec_left++;
662 con->out_kvec_bytes += sizeof(con->out_connect);
663 con->out_kvec_cur = con->out_kvec;
664 con->out_more = 0;
665 set_bit(WRITE_PENDING, &con->state);
666
667 prepare_connect_authorizer(con);
668}
669
670
671/*
672 * write as much of pending kvecs to the socket as we can.
673 * 1 -> done
674 * 0 -> socket full, but more to do
675 * <0 -> error
676 */
677static int write_partial_kvec(struct ceph_connection *con)
678{
679 int ret;
680
681 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
682 while (con->out_kvec_bytes > 0) {
683 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
684 con->out_kvec_left, con->out_kvec_bytes,
685 con->out_more);
686 if (ret <= 0)
687 goto out;
688 con->out_kvec_bytes -= ret;
689 if (con->out_kvec_bytes == 0)
690 break; /* done */
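691 		/* advance the kvec cursor past whatever was fully sent,
692 		 * trimming any partially sent entry */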
691 while (ret > 0) {
692 if (ret >= con->out_kvec_cur->iov_len) {
693 ret -= con->out_kvec_cur->iov_len;
694 con->out_kvec_cur++;
695 con->out_kvec_left--;
696 } else {
697 con->out_kvec_cur->iov_len -= ret;
698 con->out_kvec_cur->iov_base += ret;
699 ret = 0;
700 break;
701 }
702 }
703 }
704 con->out_kvec_left = 0;
705 con->out_kvec_is_msg = false;
706 ret = 1;
707out:
708 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
709 con->out_kvec_bytes, con->out_kvec_left, ret);
710 return ret; /* done! */
711}
712
713/*
714 * Write as much message data payload as we can. If we finish, queue
715 * up the footer.
716 * 1 -> done, footer is now queued in out_kvec[].
717 * 0 -> socket full, but more to do
718 * <0 -> error
719 */
720static int write_partial_msg_pages(struct ceph_connection *con)
721{
722 struct ceph_msg *msg = con->out_msg;
723 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
724 size_t len;
725 	int crc = !con->msgr->nocrc; /* compute data crc unless 'nocrc' set */
726 int ret;
727
728 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
729 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
730 con->out_msg_pos.page_pos);
731
732 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
733 struct page *page = NULL;
734 void *kaddr = NULL;
735
736 /*
737 * if we are calculating the data crc (the default), we need
738 * to map the page. if our pages[] has been revoked, use the
739 * zero page.
740 */
741 if (msg->pages) {
742 page = msg->pages[con->out_msg_pos.page];
743 if (crc)
744 kaddr = kmap(page);
745 } else if (msg->pagelist) {
746 page = list_first_entry(&msg->pagelist->head,
747 struct page, lru);
748 if (crc)
749 kaddr = kmap(page);
750 } else {
751 page = con->msgr->zero_page;
752 if (crc)
753 kaddr = page_address(con->msgr->zero_page);
754 }
755 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
756 (int)(data_len - con->out_msg_pos.data_pos));
757 if (crc && !con->out_msg_pos.did_page_crc) {
758 void *base = kaddr + con->out_msg_pos.page_pos;
759 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
760
761 BUG_ON(kaddr == NULL);
762 con->out_msg->footer.data_crc =
763 cpu_to_le32(crc32c(tmpcrc, base, len));
764 con->out_msg_pos.did_page_crc = 1;
765 }
766
767 ret = kernel_sendpage(con->sock, page,
768 con->out_msg_pos.page_pos, len,
769 MSG_DONTWAIT | MSG_NOSIGNAL |
770 MSG_MORE);
771
772 if (crc && (msg->pages || msg->pagelist))
773 kunmap(page);
774
775 if (ret <= 0)
776 goto out;
777
778 con->out_msg_pos.data_pos += ret;
779 con->out_msg_pos.page_pos += ret;
780 if (ret == len) {
781 con->out_msg_pos.page_pos = 0;
782 con->out_msg_pos.page++;
783 con->out_msg_pos.did_page_crc = 0;
784 if (msg->pagelist)
785 list_move_tail(&page->lru,
786 &msg->pagelist->head);
787 }
788 }
789
790 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
791
792 /* prepare and queue up footer, too */
793 if (!crc)
794 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
795 con->out_kvec_bytes = 0;
796 con->out_kvec_left = 0;
797 con->out_kvec_cur = con->out_kvec;
798 prepare_write_message_footer(con, 0);
799 ret = 1;
800out:
801 return ret;
802}
803
804/*
805 * write some zeros
806 */
807static int write_partial_skip(struct ceph_connection *con)
808{
809 int ret;
810
811 while (con->out_skip > 0) {
812 struct kvec iov = {
813 .iov_base = page_address(con->msgr->zero_page),
814 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
815 };
816
817 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
818 if (ret <= 0)
819 goto out;
820 con->out_skip -= ret;
821 }
822 ret = 1;
823out:
824 return ret;
825}
826
827/*
828 * Prepare to read connection handshake, or an ack.
829 */
830static void prepare_read_banner(struct ceph_connection *con)
831{
832 dout("prepare_read_banner %p\n", con);
833 con->in_base_pos = 0;
834}
835
836static void prepare_read_connect(struct ceph_connection *con)
837{
838 dout("prepare_read_connect %p\n", con);
839 con->in_base_pos = 0;
840}
841
842static void prepare_read_ack(struct ceph_connection *con)
843{
844 dout("prepare_read_ack %p\n", con);
845 con->in_base_pos = 0;
846}
847
848static void prepare_read_tag(struct ceph_connection *con)
849{
850 dout("prepare_read_tag %p\n", con);
851 con->in_base_pos = 0;
852 con->in_tag = CEPH_MSGR_TAG_READY;
853}
854
855/*
856 * Prepare to read a message.
857 */
858static int prepare_read_message(struct ceph_connection *con)
859{
860 dout("prepare_read_message %p\n", con);
861 BUG_ON(con->in_msg != NULL);
862 con->in_base_pos = 0;
863 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
864 return 0;
865}
866
867
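868/*
869 * Read up to @size bytes of @object from the socket.  @*to is advanced
870 * by @size, so a sequence of read_partial() calls can share
871 * con->in_base_pos as a single progress cursor and resume cleanly after
872 * a short read.  Returns 1 when @object is complete, else the (<= 0)
873 * result of the last recvmsg.
874 */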
868static int read_partial(struct ceph_connection *con,
869 int *to, int size, void *object)
870{
871 *to += size;
872 while (con->in_base_pos < *to) {
873 int left = *to - con->in_base_pos;
874 int have = size - left;
875 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
876 if (ret <= 0)
877 return ret;
878 con->in_base_pos += ret;
879 }
880 return 1;
881}
882
883
884/*
885 * Read all or part of the connect-side handshake on a new connection
886 */
887static int read_partial_banner(struct ceph_connection *con)
888{
889 int ret, to = 0;
890
891 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
892
893 /* peer's banner */
894 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
895 if (ret <= 0)
896 goto out;
897 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
898 &con->actual_peer_addr);
899 if (ret <= 0)
900 goto out;
901 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
902 &con->peer_addr_for_me);
903 if (ret <= 0)
904 goto out;
905out:
906 return ret;
907}
908
909static int read_partial_connect(struct ceph_connection *con)
910{
911 int ret, to = 0;
912
913 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
914
915 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
916 if (ret <= 0)
917 goto out;
918 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
919 con->auth_reply_buf);
920 if (ret <= 0)
921 goto out;
922
923 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
924 con, (int)con->in_reply.tag,
925 le32_to_cpu(con->in_reply.connect_seq),
926 le32_to_cpu(con->in_reply.global_seq));
927out:
928 return ret;
929}
931
932/*
933 * Verify the hello banner looks okay.
934 */
935static int verify_hello(struct ceph_connection *con)
936{
937 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
938 pr_err("connect to %s got bad banner\n",
939 pr_addr(&con->peer_addr.in_addr));
940 con->error_msg = "protocol error, bad banner";
941 return -1;
942 }
943 return 0;
944}
945
946static bool addr_is_blank(struct sockaddr_storage *ss)
947{
948 switch (ss->ss_family) {
949 case AF_INET:
950 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
951 case AF_INET6:
952 return
953 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
954 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
955 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
956 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
957 }
958 return false;
959}
960
961static int addr_port(struct sockaddr_storage *ss)
962{
963 switch (ss->ss_family) {
964 case AF_INET:
965 return ntohs(((struct sockaddr_in *)ss)->sin_port);
966 case AF_INET6:
967 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
968 }
969 return 0;
970}
971
972static void addr_set_port(struct sockaddr_storage *ss, int p)
973{
974 switch (ss->ss_family) {
975 	case AF_INET:
976 		((struct sockaddr_in *)ss)->sin_port = htons(p);
977 		break;
978 	case AF_INET6:
979 		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
980 		break;
981 	}
980}
981
982/*
983 * Parse an ip[:port] list into an addr array. Use the default
984 * monitor port if a port isn't specified.
985 */
986int ceph_parse_ips(const char *c, const char *end,
987 struct ceph_entity_addr *addr,
988 int max_count, int *count)
989{
990 int i;
991 const char *p = c;
992
993 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
994 for (i = 0; i < max_count; i++) {
995 const char *ipend;
996 struct sockaddr_storage *ss = &addr[i].in_addr;
997 struct sockaddr_in *in4 = (void *)ss;
998 struct sockaddr_in6 *in6 = (void *)ss;
999 int port;
1000
1001 memset(ss, 0, sizeof(*ss));
1002 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1003 ',', &ipend)) {
1004 ss->ss_family = AF_INET;
1005 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1006 ',', &ipend)) {
1007 ss->ss_family = AF_INET6;
1008 } else {
1009 goto bad;
1010 }
1011 p = ipend;
1012
1013 /* port? */
1014 if (p < end && *p == ':') {
1015 port = 0;
1016 p++;
1017 while (p < end && *p >= '0' && *p <= '9') {
1018 port = (port * 10) + (*p - '0');
1019 p++;
1020 }
1021 if (port > 65535 || port == 0)
1022 goto bad;
1023 } else {
1024 port = CEPH_MON_PORT;
1025 }
1026
1027 addr_set_port(ss, port);
1028
1029 dout("parse_ips got %s\n", pr_addr(ss));
1030
1031 if (p == end)
1032 break;
1033 if (*p != ',')
1034 goto bad;
1035 p++;
1036 }
1037
1038 if (p != end)
1039 goto bad;
1040
1041 if (count)
1042 *count = i + 1;
1043 return 0;
1044
1045bad:
1046 	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1047 return -EINVAL;
1048}
1049
1050static int process_banner(struct ceph_connection *con)
1051{
1052 dout("process_banner on %p\n", con);
1053
1054 if (verify_hello(con) < 0)
1055 return -1;
1056
1057 ceph_decode_addr(&con->actual_peer_addr);
1058 ceph_decode_addr(&con->peer_addr_for_me);
1059
1060 /*
1061 	 * Make sure the other end is who we wanted.  Note that the other
1062 	 * end may not yet know its IP address, so if it's 0.0.0.0, give
1063 	 * it the benefit of the doubt.
1064 */
1065 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1066 sizeof(con->peer_addr)) != 0 &&
1067 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1068 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1069 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1070 pr_addr(&con->peer_addr.in_addr),
1071 le64_to_cpu(con->peer_addr.nonce),
1072 pr_addr(&con->actual_peer_addr.in_addr),
1073 le64_to_cpu(con->actual_peer_addr.nonce));
1074 con->error_msg = "wrong peer at address";
1075 return -1;
1076 }
1077
1078 /*
1079 * did we learn our address?
1080 */
1081 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1082 int port = addr_port(&con->msgr->inst.addr.in_addr);
1083
1084 memcpy(&con->msgr->inst.addr.in_addr,
1085 &con->peer_addr_for_me.in_addr,
1086 sizeof(con->peer_addr_for_me.in_addr));
1087 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1088 encode_my_addr(con->msgr);
1089 dout("process_banner learned my addr is %s\n",
1090 pr_addr(&con->msgr->inst.addr.in_addr));
1091 }
1092
1093 set_bit(NEGOTIATING, &con->state);
1094 prepare_read_connect(con);
1095 return 0;
1096}
1097
1098static void fail_protocol(struct ceph_connection *con)
1099{
1100 reset_connection(con);
1101 set_bit(CLOSED, &con->state); /* in case there's queued work */
1102
1103 mutex_unlock(&con->mutex);
1104 if (con->ops->bad_proto)
1105 con->ops->bad_proto(con);
1106 mutex_lock(&con->mutex);
1107}
1108
1109static int process_connect(struct ceph_connection *con)
1110{
1111 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1112 u64 req_feat = CEPH_FEATURE_REQUIRED;
1113 u64 server_feat = le64_to_cpu(con->in_reply.features);
1114
1115 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1116
1117 switch (con->in_reply.tag) {
1118 case CEPH_MSGR_TAG_FEATURES:
1119 pr_err("%s%lld %s feature set mismatch,"
1120 " my %llx < server's %llx, missing %llx\n",
1121 ENTITY_NAME(con->peer_name),
1122 pr_addr(&con->peer_addr.in_addr),
1123 sup_feat, server_feat, server_feat & ~sup_feat);
1124 con->error_msg = "missing required protocol features";
1125 fail_protocol(con);
1126 return -1;
1127
1128 case CEPH_MSGR_TAG_BADPROTOVER:
1129 pr_err("%s%lld %s protocol version mismatch,"
1130 " my %d != server's %d\n",
1131 ENTITY_NAME(con->peer_name),
1132 pr_addr(&con->peer_addr.in_addr),
1133 le32_to_cpu(con->out_connect.protocol_version),
1134 le32_to_cpu(con->in_reply.protocol_version));
1135 con->error_msg = "protocol version mismatch";
1136 fail_protocol(con);
1137 return -1;
1138
1139 case CEPH_MSGR_TAG_BADAUTHORIZER:
1140 con->auth_retry++;
1141 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1142 con->auth_retry);
1143 if (con->auth_retry == 2) {
1144 con->error_msg = "connect authorization failure";
1145 reset_connection(con);
1146 set_bit(CLOSED, &con->state);
1147 return -1;
1148 }
1149 con->auth_retry = 1;
1150 prepare_write_connect(con->msgr, con, 0);
1151 prepare_read_connect(con);
1152 break;
1153
1154 case CEPH_MSGR_TAG_RESETSESSION:
1155 /*
1156 * If we connected with a large connect_seq but the peer
1157 * has no record of a session with us (no connection, or
1158 * connect_seq == 0), they will send RESETSESION to indicate
1159 * that they must have reset their session, and may have
1160 * dropped messages.
1161 */
1162 dout("process_connect got RESET peer seq %u\n",
1163 le32_to_cpu(con->in_connect.connect_seq));
1164 pr_err("%s%lld %s connection reset\n",
1165 ENTITY_NAME(con->peer_name),
1166 pr_addr(&con->peer_addr.in_addr));
1167 reset_connection(con);
1168 prepare_write_connect(con->msgr, con, 0);
1169 prepare_read_connect(con);
1170
1171 /* Tell ceph about it. */
1172 mutex_unlock(&con->mutex);
1173 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1174 if (con->ops->peer_reset)
1175 con->ops->peer_reset(con);
1176 mutex_lock(&con->mutex);
1177 break;
1178
1179 case CEPH_MSGR_TAG_RETRY_SESSION:
1180 /*
1181 * If we sent a smaller connect_seq than the peer has, try
1182 * again with a larger value.
1183 */
1184 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1185 le32_to_cpu(con->out_connect.connect_seq),
1186 le32_to_cpu(con->in_connect.connect_seq));
1187 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1188 prepare_write_connect(con->msgr, con, 0);
1189 prepare_read_connect(con);
1190 break;
1191
1192 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1193 /*
1194 * If we sent a smaller global_seq than the peer has, try
1195 * again with a larger value.
1196 */
1197 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1198 con->peer_global_seq,
1199 le32_to_cpu(con->in_connect.global_seq));
1200 get_global_seq(con->msgr,
1201 le32_to_cpu(con->in_connect.global_seq));
1202 prepare_write_connect(con->msgr, con, 0);
1203 prepare_read_connect(con);
1204 break;
1205
1206 case CEPH_MSGR_TAG_READY:
1207 if (req_feat & ~server_feat) {
1208 pr_err("%s%lld %s protocol feature mismatch,"
1209 " my required %llx > server's %llx, need %llx\n",
1210 ENTITY_NAME(con->peer_name),
1211 pr_addr(&con->peer_addr.in_addr),
1212 req_feat, server_feat, req_feat & ~server_feat);
1213 con->error_msg = "missing required protocol features";
1214 fail_protocol(con);
1215 return -1;
1216 }
1217 clear_bit(CONNECTING, &con->state);
1218 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1219 con->connect_seq++;
1220 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1221 con->peer_global_seq,
1222 le32_to_cpu(con->in_reply.connect_seq),
1223 con->connect_seq);
1224 WARN_ON(con->connect_seq !=
1225 le32_to_cpu(con->in_reply.connect_seq));
1226
1227 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1228 set_bit(LOSSYTX, &con->state);
1229
1230 prepare_read_tag(con);
1231 break;
1232
1233 case CEPH_MSGR_TAG_WAIT:
1234 /*
1235 * If there is a connection race (we are opening
1236 * connections to each other), one of us may just have
1237 * to WAIT. This shouldn't happen if we are the
1238 * client.
1239 */
1240 pr_err("process_connect peer connecting WAIT\n");
1241
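1242 		/* fall through: treat WAIT as a connect error and retry */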
1242 default:
1243 pr_err("connect protocol error, will retry\n");
1244 con->error_msg = "protocol error, garbage tag during connect";
1245 return -1;
1246 }
1247 return 0;
1248}
1249
1250
1251/*
1252 * read (part of) an ack
1253 */
1254static int read_partial_ack(struct ceph_connection *con)
1255{
1256 int to = 0;
1257
1258 return read_partial(con, &to, sizeof(con->in_temp_ack),
1259 &con->in_temp_ack);
1260}
1261
1262
1263/*
1264 * We can finally discard anything that's been acked.
1265 */
1266static void process_ack(struct ceph_connection *con)
1267{
1268 struct ceph_msg *m;
1269 u64 ack = le64_to_cpu(con->in_temp_ack);
1270 u64 seq;
1271
1272 while (!list_empty(&con->out_sent)) {
1273 m = list_first_entry(&con->out_sent, struct ceph_msg,
1274 list_head);
1275 seq = le64_to_cpu(m->hdr.seq);
1276 if (seq > ack)
1277 break;
1278 dout("got ack for seq %llu type %d at %p\n", seq,
1279 le16_to_cpu(m->hdr.type), m);
1280 ceph_msg_remove(m);
1281 }
1282 prepare_read_tag(con);
1283}
1284
1285
1286
1287
1288static int read_partial_message_section(struct ceph_connection *con,
1289 struct kvec *section, unsigned int sec_len,
1290 u32 *crc)
1291{
1292 int left;
1293 int ret;
1294
1295 BUG_ON(!section);
1296
1297 while (section->iov_len < sec_len) {
1298 BUG_ON(section->iov_base == NULL);
1299 left = sec_len - section->iov_len;
1300 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1301 section->iov_len, left);
1302 if (ret <= 0)
1303 return ret;
1304 section->iov_len += ret;
1305 if (section->iov_len == sec_len)
1306 *crc = crc32c(0, section->iov_base,
1307 section->iov_len);
1308 }
1309
1310 return 1;
1311}
1312
1313static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1314 struct ceph_msg_header *hdr,
1315 int *skip);
1316/*
1317 * read (part of) a message.
1318 */
1319static int read_partial_message(struct ceph_connection *con)
1320{
1321 struct ceph_msg *m = con->in_msg;
1322 void *p;
1323 int ret;
1324 int to, left;
1325 unsigned front_len, middle_len, data_len, data_off;
1326 int datacrc = con->msgr->nocrc;
1327 int skip;
1328
1329 dout("read_partial_message con %p msg %p\n", con, m);
1330
1331 /* header */
1332 while (con->in_base_pos < sizeof(con->in_hdr)) {
1333 left = sizeof(con->in_hdr) - con->in_base_pos;
1334 ret = ceph_tcp_recvmsg(con->sock,
1335 (char *)&con->in_hdr + con->in_base_pos,
1336 left);
1337 if (ret <= 0)
1338 return ret;
1339 con->in_base_pos += ret;
1340 if (con->in_base_pos == sizeof(con->in_hdr)) {
1341 u32 crc = crc32c(0, (void *)&con->in_hdr,
1342 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1343 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1344 pr_err("read_partial_message bad hdr crc %u"
1345 " != expected %u\n",
1346 crc, le32_to_cpu(con->in_hdr.crc));
1347 return -EBADMSG;
1348 }
1349 }
1350 }
1351 front_len = le32_to_cpu(con->in_hdr.front_len);
1352 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1353 return -EIO;
1354 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1355 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1356 return -EIO;
1357 data_len = le32_to_cpu(con->in_hdr.data_len);
1358 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1359 return -EIO;
1360 data_off = le16_to_cpu(con->in_hdr.data_off);
1361
1362 /* allocate message? */
1363 if (!con->in_msg) {
1364 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1365 con->in_hdr.front_len, con->in_hdr.data_len);
1366 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1367 if (skip) {
1368 /* skip this message */
1369 dout("alloc_msg returned NULL, skipping message\n");
1370 con->in_base_pos = -front_len - middle_len - data_len -
1371 sizeof(m->footer);
1372 con->in_tag = CEPH_MSGR_TAG_READY;
1373 return 0;
1374 }
1375 if (IS_ERR(con->in_msg)) {
1376 ret = PTR_ERR(con->in_msg);
1377 con->in_msg = NULL;
1378 con->error_msg =
1379 "error allocating memory for incoming message";
1380 return ret;
1381 }
1382 m = con->in_msg;
1383 m->front.iov_len = 0; /* haven't read it yet */
1384 if (m->middle)
1385 m->middle->vec.iov_len = 0;
1386
1387 con->in_msg_pos.page = 0;
1388 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1389 con->in_msg_pos.data_pos = 0;
1390 }
1391
1392 /* front */
1393 ret = read_partial_message_section(con, &m->front, front_len,
1394 &con->in_front_crc);
1395 if (ret <= 0)
1396 return ret;
1397
1398 /* middle */
1399 if (m->middle) {
1400 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1401 &con->in_middle_crc);
1402 if (ret <= 0)
1403 return ret;
1404 }
1405
1406 /* (page) data */
1407 while (con->in_msg_pos.data_pos < data_len) {
1408 left = min((int)(data_len - con->in_msg_pos.data_pos),
1409 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1410 BUG_ON(m->pages == NULL);
1411 p = kmap(m->pages[con->in_msg_pos.page]);
1412 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1413 left);
1414 if (ret > 0 && datacrc)
1415 con->in_data_crc =
1416 crc32c(con->in_data_crc,
1417 p + con->in_msg_pos.page_pos, ret);
1418 kunmap(m->pages[con->in_msg_pos.page]);
1419 if (ret <= 0)
1420 return ret;
1421 con->in_msg_pos.data_pos += ret;
1422 con->in_msg_pos.page_pos += ret;
1423 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1424 con->in_msg_pos.page_pos = 0;
1425 con->in_msg_pos.page++;
1426 }
1427 }
1428
1429 /* footer */
1430 to = sizeof(m->hdr) + sizeof(m->footer);
1431 while (con->in_base_pos < to) {
1432 left = to - con->in_base_pos;
1433 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1434 (con->in_base_pos - sizeof(m->hdr)),
1435 left);
1436 if (ret <= 0)
1437 return ret;
1438 con->in_base_pos += ret;
1439 }
1440 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1441 m, front_len, m->footer.front_crc, middle_len,
1442 m->footer.middle_crc, data_len, m->footer.data_crc);
1443
1444 /* crc ok? */
1445 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1446 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1447 m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
1448 return -EBADMSG;
1449 }
1450 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1451 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1452 m, con->in_middle_crc, le32_to_cpu(m->footer.middle_crc));
1453 return -EBADMSG;
1454 }
1455 if (datacrc &&
1456 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1457 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1458 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1459 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1460 return -EBADMSG;
1461 }
1462
1463 return 1; /* done! */
1464}
1465
1466/*
1467 * Process message. This happens in the worker thread. The callback should
1468 * be careful not to do anything that waits on other incoming messages or it
1469 * may deadlock.
1470 */
1471static void process_message(struct ceph_connection *con)
1472{
1473 struct ceph_msg *msg;
1474
1475 msg = con->in_msg;
1476 con->in_msg = NULL;
1477
1478 /* if first message, set peer_name */
1479 if (con->peer_name.type == 0)
1480 con->peer_name = msg->hdr.src.name;
1481
1482 con->in_seq++;
1483 mutex_unlock(&con->mutex);
1484
1485 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1486 msg, le64_to_cpu(msg->hdr.seq),
1487 ENTITY_NAME(msg->hdr.src.name),
1488 le16_to_cpu(msg->hdr.type),
1489 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1490 le32_to_cpu(msg->hdr.front_len),
1491 le32_to_cpu(msg->hdr.data_len),
1492 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1493 con->ops->dispatch(con, msg);
1494
1495 mutex_lock(&con->mutex);
1496 prepare_read_tag(con);
1497}
1498
1499
1500/*
1501 * Write something to the socket. Called in a worker thread when the
1502 * socket appears to be writeable and we have something ready to send.
1503 */
1504static int try_write(struct ceph_connection *con)
1505{
1506 struct ceph_messenger *msgr = con->msgr;
1507 int ret = 1;
1508
1509 dout("try_write start %p state %lu nref %d\n", con, con->state,
1510 atomic_read(&con->nref));
1511
1512 mutex_lock(&con->mutex);
1513more:
1514 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1515
1516 /* open the socket first? */
1517 if (con->sock == NULL) {
1518 /*
1519 * if we were STANDBY and are reconnecting _this_
1520 * connection, bump connect_seq now. Always bump
1521 * global_seq.
1522 */
1523 if (test_and_clear_bit(STANDBY, &con->state))
1524 con->connect_seq++;
1525
1526 prepare_write_banner(msgr, con);
1527 prepare_write_connect(msgr, con, 1);
1528 prepare_read_banner(con);
1529 set_bit(CONNECTING, &con->state);
1530 clear_bit(NEGOTIATING, &con->state);
1531
1532 BUG_ON(con->in_msg);
1533 con->in_tag = CEPH_MSGR_TAG_READY;
1534 dout("try_write initiating connect on %p new state %lu\n",
1535 con, con->state);
1536 con->sock = ceph_tcp_connect(con);
1537 if (IS_ERR(con->sock)) {
1538 con->sock = NULL;
1539 con->error_msg = "connect error";
1540 ret = -1;
1541 goto out;
1542 }
1543 }
1544
1545more_kvec:
1546 /* kvec data queued? */
1547 if (con->out_skip) {
1548 ret = write_partial_skip(con);
1549 if (ret <= 0)
1550 goto done;
1551 if (ret < 0) {
1552 dout("try_write write_partial_skip err %d\n", ret);
1553 goto done;
1554 }
1555 }
1556 if (con->out_kvec_left) {
1557 ret = write_partial_kvec(con);
1558 if (ret <= 0)
1559 goto done;
1560 }
1561
1562 /* msg pages? */
1563 if (con->out_msg) {
1564 if (con->out_msg_done) {
1565 ceph_msg_put(con->out_msg);
1566 con->out_msg = NULL; /* we're done with this one */
1567 goto do_next;
1568 }
1569
1570 ret = write_partial_msg_pages(con);
1571 if (ret == 1)
1572 goto more_kvec; /* we need to send the footer, too! */
1573 if (ret == 0)
1574 goto done;
1575 if (ret < 0) {
1576 dout("try_write write_partial_msg_pages err %d\n",
1577 ret);
1578 goto done;
1579 }
1580 }
1581
1582do_next:
1583 if (!test_bit(CONNECTING, &con->state)) {
1584 /* is anything else pending? */
1585 if (!list_empty(&con->out_queue)) {
1586 prepare_write_message(con);
1587 goto more;
1588 }
1589 if (con->in_seq > con->in_seq_acked) {
1590 prepare_write_ack(con);
1591 goto more;
1592 }
1593 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1594 prepare_write_keepalive(con);
1595 goto more;
1596 }
1597 }
1598
1599 /* Nothing to do! */
1600 clear_bit(WRITE_PENDING, &con->state);
1601 dout("try_write nothing else to write.\n");
1602done:
1603 ret = 0;
1604out:
1605 mutex_unlock(&con->mutex);
1606 dout("try_write done on %p\n", con);
1607 return ret;
1608}
1609
1610
1611
1612/*
1613 * Read what we can from the socket.
1614 */
1615static int try_read(struct ceph_connection *con)
1616{
1617 struct ceph_messenger *msgr;
1618 int ret = -1;
1619
1620 if (!con->sock)
1621 return 0;
1622
1623 if (test_bit(STANDBY, &con->state))
1624 return 0;
1625
1626 dout("try_read start on %p\n", con);
1627 msgr = con->msgr;
1628
1629 mutex_lock(&con->mutex);
1630
1631more:
1632 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1633 con->in_base_pos);
1634 if (test_bit(CONNECTING, &con->state)) {
1635 if (!test_bit(NEGOTIATING, &con->state)) {
1636 dout("try_read connecting\n");
1637 ret = read_partial_banner(con);
1638 if (ret <= 0)
1639 goto done;
1640 if (process_banner(con) < 0) {
1641 ret = -1;
1642 goto out;
1643 }
1644 }
1645 ret = read_partial_connect(con);
1646 if (ret <= 0)
1647 goto done;
1648 if (process_connect(con) < 0) {
1649 ret = -1;
1650 goto out;
1651 }
1652 goto more;
1653 }
1654
1655 if (con->in_base_pos < 0) {
1656 /*
1657 * skipping + discarding content.
1658 *
1659 * FIXME: there must be a better way to do this!
1660 */
1661 static char buf[1024];
1662 int skip = min(1024, -con->in_base_pos);
1663 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1664 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1665 if (ret <= 0)
1666 goto done;
1667 con->in_base_pos += ret;
1668 if (con->in_base_pos)
1669 goto more;
1670 }
1671 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1672 /*
1673 * what's next?
1674 */
1675 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1676 if (ret <= 0)
1677 goto done;
1678 dout("try_read got tag %d\n", (int)con->in_tag);
1679 switch (con->in_tag) {
1680 case CEPH_MSGR_TAG_MSG:
1681 prepare_read_message(con);
1682 break;
1683 case CEPH_MSGR_TAG_ACK:
1684 prepare_read_ack(con);
1685 break;
1686 case CEPH_MSGR_TAG_CLOSE:
1687 set_bit(CLOSED, &con->state); /* fixme */
1688 goto done;
1689 default:
1690 goto bad_tag;
1691 }
1692 }
1693 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1694 ret = read_partial_message(con);
1695 if (ret <= 0) {
1696 switch (ret) {
1697 case -EBADMSG:
1698 con->error_msg = "bad crc";
1699 ret = -EIO;
1700 goto out;
1701 case -EIO:
1702 con->error_msg = "io error";
1703 goto out;
1704 default:
1705 goto done;
1706 }
1707 }
1708 if (con->in_tag == CEPH_MSGR_TAG_READY)
1709 goto more;
1710 process_message(con);
1711 goto more;
1712 }
1713 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1714 ret = read_partial_ack(con);
1715 if (ret <= 0)
1716 goto done;
1717 process_ack(con);
1718 goto more;
1719 }
1720
1721done:
1722 ret = 0;
1723out:
1724 mutex_unlock(&con->mutex);
1725 dout("try_read done on %p\n", con);
1726 return ret;
1727
1728bad_tag:
1729 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1730 con->error_msg = "protocol error, garbage tag";
1731 ret = -1;
1732 goto out;
1733}
1734
1735
1736/*
1737 * Atomically queue work on a connection. Bump @con reference to
1738 * avoid races with connection teardown.
1739 *
1740 * There is some trickery going on with QUEUED and BUSY because we
1741 * only want a _single_ thread operating on each connection at any
1742 * point in time, but we want to use all available CPUs.
1743 *
1744 * The worker thread only proceeds if it can atomically set BUSY. It
1745 * clears QUEUED and does its thing. When it thinks it's done, it
1746 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1747 * (tries again to set BUSY).
1748 *
1749 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1750 * try to queue work. If that fails (the work is already queued, or
1751 * BUSY is set because the worker is already running), we give up but
1752 * leave QUEUED set so that the worker thread will loop if necessary.
1753 */
1754static void queue_con(struct ceph_connection *con)
1755{
1756 if (test_bit(DEAD, &con->state)) {
1757 dout("queue_con %p ignoring: DEAD\n",
1758 con);
1759 return;
1760 }
1761
1762 if (!con->ops->get(con)) {
1763 dout("queue_con %p ref count 0\n", con);
1764 return;
1765 }
1766
1767 set_bit(QUEUED, &con->state);
1768 if (test_bit(BUSY, &con->state)) {
1769 dout("queue_con %p - already BUSY\n", con);
1770 con->ops->put(con);
1771 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1772 dout("queue_con %p - already queued\n", con);
1773 con->ops->put(con);
1774 } else {
1775 dout("queue_con %p\n", con);
1776 }
1777}
1778
1779/*
1780 * Do some work on a connection. Drop a connection ref when we're done.
1781 */
1782static void con_work(struct work_struct *work)
1783{
1784 struct ceph_connection *con = container_of(work, struct ceph_connection,
1785 work.work);
1786 int backoff = 0;
1787
1788more:
1789 if (test_and_set_bit(BUSY, &con->state) != 0) {
1790 dout("con_work %p BUSY already set\n", con);
1791 goto out;
1792 }
1793 dout("con_work %p start, clearing QUEUED\n", con);
1794 clear_bit(QUEUED, &con->state);
1795
1796 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1797 dout("con_work CLOSED\n");
1798 con_close_socket(con);
1799 goto done;
1800 }
1801 if (test_and_clear_bit(OPENING, &con->state)) {
1802 /* reopen w/ new peer */
1803 dout("con_work OPENING\n");
1804 con_close_socket(con);
1805 }
1806
1807 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1808 try_read(con) < 0 ||
1809 try_write(con) < 0) {
1810 backoff = 1;
1811 ceph_fault(con); /* error/fault path */
1812 }
1813
1814done:
1815 clear_bit(BUSY, &con->state);
1816 dout("con->state=%lu\n", con->state);
1817 if (test_bit(QUEUED, &con->state)) {
1818 if (!backoff || test_bit(OPENING, &con->state)) {
1819 dout("con_work %p QUEUED reset, looping\n", con);
1820 goto more;
1821 }
1822 dout("con_work %p QUEUED reset, but just faulted\n", con);
1823 clear_bit(QUEUED, &con->state);
1824 }
1825 dout("con_work %p done\n", con);
1826
1827out:
1828 con->ops->put(con);
1829}
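/*
 * Sketch of the QUEUED/BUSY handshake implemented by queue_con() and
 * con_work() above, reduced to the bitops involved. The example_*
 * helpers are illustrative only (not part of this patch); the function
 * pointers stand in for the workqueue and for the read/write work.
 */
static void example_queue(unsigned long *state, void (*schedule_fn)(void))
{
	set_bit(QUEUED, state);			/* record the request first */
	if (!test_bit(BUSY, state))
		schedule_fn();			/* kick a worker */
	/* else: the running worker will notice QUEUED and loop */
}

static void example_worker(unsigned long *state, void (*work_fn)(void))
{
	while (test_and_set_bit(BUSY, state) == 0) {
		clear_bit(QUEUED, state);	/* consume this request */
		work_fn();			/* do the actual I/O */
		clear_bit(BUSY, state);
		if (!test_bit(QUEUED, state))
			return;			/* nothing re-queued; done */
		/* QUEUED reappeared; try to retake BUSY and go again */
	}
}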
1830
1831
1832/*
1833 * Generic error/fault handler. A retry mechanism is used with
1834 * exponential backoff
1835 */
1836static void ceph_fault(struct ceph_connection *con)
1837{
1838 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1839 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1840 dout("fault %p state %lu to peer %s\n",
1841 con, con->state, pr_addr(&con->peer_addr.in_addr));
1842
1843 if (test_bit(LOSSYTX, &con->state)) {
1844 dout("fault on LOSSYTX channel\n");
1845 goto out;
1846 }
1847
1848 mutex_lock(&con->mutex);
1849 if (test_bit(CLOSED, &con->state))
1850 goto out_unlock;
1851
1852 con_close_socket(con);
1853
1854 if (con->in_msg) {
1855 ceph_msg_put(con->in_msg);
1856 con->in_msg = NULL;
1857 }
1858
1859 /* Requeue anything that hasn't been acked */
1860 list_splice_init(&con->out_sent, &con->out_queue);
1861
1862 /* If there are no messages in the queue, place the connection
1863 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1864 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1865 dout("fault setting STANDBY\n");
1866 set_bit(STANDBY, &con->state);
1867 } else {
1868 /* retry after a delay. */
1869 if (con->delay == 0)
1870 con->delay = BASE_DELAY_INTERVAL;
1871 else if (con->delay < MAX_DELAY_INTERVAL)
1872 con->delay *= 2;
1873 dout("fault queueing %p delay %lu\n", con, con->delay);
1874 con->ops->get(con);
1875 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1876 round_jiffies_relative(con->delay)) == 0)
1877 con->ops->put(con);
1878 }
1879
1880out_unlock:
1881 mutex_unlock(&con->mutex);
1882out:
1883 /*
1884 * in case we faulted due to authentication, invalidate our
1885 * current tickets so that we can get new ones.
1886 */
1887 if (con->auth_retry && con->ops->invalidate_authorizer) {
1888 dout("calling invalidate_authorizer()\n");
1889 con->ops->invalidate_authorizer(con);
1890 }
1891
1892 if (con->ops->fault)
1893 con->ops->fault(con);
1894}
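/*
 * Backoff schedule used above (illustrative): BASE_DELAY_INTERVAL is
 * HZ/2 and MAX_DELAY_INTERVAL is 5*60*HZ, so consecutive faults wait
 * roughly 0.5s, 1s, 2s, 4s, ... A compressed, roughly equivalent form
 * (this one caps exactly at the maximum, whereas the open-coded logic
 * above may overshoot it once before settling):
 *
 *	delay = delay ? min(delay * 2, (unsigned long)MAX_DELAY_INTERVAL)
 *		      : BASE_DELAY_INTERVAL;
 */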
1895
1896
1897
1898/*
1899 * create a new messenger instance
1900 */
1901struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1902{
1903 struct ceph_messenger *msgr;
1904
1905 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1906 if (msgr == NULL)
1907 return ERR_PTR(-ENOMEM);
1908
1909 spin_lock_init(&msgr->global_seq_lock);
1910
1911 /* the zero page is needed if a request is "canceled" while the message
1912 * is being written over the socket */
1913 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1914 if (!msgr->zero_page) {
1915 kfree(msgr);
1916 return ERR_PTR(-ENOMEM);
1917 }
1918 kmap(msgr->zero_page);
1919
1920 if (myaddr)
1921 msgr->inst.addr = *myaddr;
1922
1923 /* select a random nonce */
1924 msgr->inst.addr.type = 0;
1925 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1926 encode_my_addr(msgr);
1927
1928 dout("messenger_create %p\n", msgr);
1929 return msgr;
1930}
1931
1932void ceph_messenger_destroy(struct ceph_messenger *msgr)
1933{
1934 dout("destroy %p\n", msgr);
1935 kunmap(msgr->zero_page);
1936 __free_page(msgr->zero_page);
1937 kfree(msgr);
1938 dout("destroyed messenger %p\n", msgr);
1939}
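/*
 * Example usage (sketch; the surrounding client code is hypothetical,
 * not part of this patch):
 *
 *	struct ceph_messenger *msgr = ceph_messenger_create(NULL);
 *
 *	if (IS_ERR(msgr))
 *		return PTR_ERR(msgr);
 *	...attach connections with ceph_con_init()/ceph_con_open()...
 *	ceph_messenger_destroy(msgr);
 *
 * Passing NULL for myaddr leaves the local address unset apart from
 * the random nonce; a caller may instead supply a specific address.
 */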
1940
1941/*
1942 * Queue up an outgoing message on the given connection.
1943 */
1944void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1945{
1946 if (test_bit(CLOSED, &con->state)) {
1947 dout("con_send %p closed, dropping %p\n", con, msg);
1948 ceph_msg_put(msg);
1949 return;
1950 }
1951
1952 /* set src+dst */
1953 msg->hdr.src.name = con->msgr->inst.name;
1954 msg->hdr.src.addr = con->msgr->my_enc_addr;
1955 msg->hdr.orig_src = msg->hdr.src;
1956
1957 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1958
1959 /* queue */
1960 mutex_lock(&con->mutex);
1961 BUG_ON(!list_empty(&msg->list_head));
1962 list_add_tail(&msg->list_head, &con->out_queue);
1963 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
1964 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
1965 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1966 le32_to_cpu(msg->hdr.front_len),
1967 le32_to_cpu(msg->hdr.middle_len),
1968 le32_to_cpu(msg->hdr.data_len));
1969 mutex_unlock(&con->mutex);
1970
1971 /* if there wasn't anything waiting to send before, queue
1972 * new work */
1973 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
1974 queue_con(con);
1975}
1976
1977/*
1978 * Revoke a message that was previously queued for send
1979 */
1980void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
1981{
1982 mutex_lock(&con->mutex);
1983 if (!list_empty(&msg->list_head)) {
1984 dout("con_revoke %p msg %p\n", con, msg);
1985 list_del_init(&msg->list_head);
1986 ceph_msg_put(msg);
1987 msg->hdr.seq = 0;
1988 if (con->out_msg == msg) {
1989 ceph_msg_put(con->out_msg);
1990 con->out_msg = NULL;
1991 }
1992 if (con->out_kvec_is_msg) {
1993 con->out_skip = con->out_kvec_bytes;
1994 con->out_kvec_is_msg = false;
1995 }
1996 } else {
1997 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
1998 }
1999 mutex_unlock(&con->mutex);
2000}
2001
2002/*
2003 * Revoke a message that we may be reading data into
2004 */
2005void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2006{
2007 mutex_lock(&con->mutex);
2008 if (con->in_msg && con->in_msg == msg) {
2009 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2010 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2011 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2012
2013 /* skip rest of message */
2014 dout("con_revoke_message %p msg %p revoked\n", con, msg);
2015 con->in_base_pos = con->in_base_pos -
2016 sizeof(struct ceph_msg_header) -
2017 front_len -
2018 middle_len -
2019 data_len -
2020 sizeof(struct ceph_msg_footer);
2021 ceph_msg_put(con->in_msg);
2022 con->in_msg = NULL;
2023 con->in_tag = CEPH_MSGR_TAG_READY;
2024 } else {
2025 dout("con_revoke_message %p msg %p pages %p no-op\n",
2026 con, con->in_msg, msg);
2027 }
2028 mutex_unlock(&con->mutex);
2029}
2030
2031/*
2032 * Queue a keepalive byte to ensure the tcp connection is alive.
2033 */
2034void ceph_con_keepalive(struct ceph_connection *con)
2035{
2036 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2037 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2038 queue_con(con);
2039}
2040
2041
2042/*
2043 * construct a new message with given type, size
2044 * the new msg has a ref count of 1.
2045 */
2046struct ceph_msg *ceph_msg_new(int type, int front_len,
2047 int page_len, int page_off, struct page **pages)
2048{
2049 struct ceph_msg *m;
2050
2051 m = kmalloc(sizeof(*m), GFP_NOFS);
2052 if (m == NULL)
2053 goto out;
2054 kref_init(&m->kref);
2055 INIT_LIST_HEAD(&m->list_head);
2056
2057 m->hdr.type = cpu_to_le16(type);
2058 m->hdr.front_len = cpu_to_le32(front_len);
2059 m->hdr.middle_len = 0;
2060 m->hdr.data_len = cpu_to_le32(page_len);
2061 m->hdr.data_off = cpu_to_le16(page_off);
2062 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2063 m->footer.front_crc = 0;
2064 m->footer.middle_crc = 0;
2065 m->footer.data_crc = 0;
2066 m->front_max = front_len;
2067 m->front_is_vmalloc = false;
2068 m->more_to_follow = false;
2069 m->pool = NULL;
2070
2071 /* front */
2072 if (front_len) {
2073 if (front_len > PAGE_CACHE_SIZE) {
2074 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2075 PAGE_KERNEL);
2076 m->front_is_vmalloc = true;
2077 } else {
2078 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2079 }
2080 if (m->front.iov_base == NULL) {
2081 pr_err("msg_new can't allocate %d bytes\n",
2082 front_len);
2083 goto out2;
2084 }
2085 } else {
2086 m->front.iov_base = NULL;
2087 }
2088 m->front.iov_len = front_len;
2089
2090 /* middle */
2091 m->middle = NULL;
2092
2093 /* data */
2094 m->nr_pages = calc_pages_for(page_off, page_len);
2095 m->pages = pages;
2096 m->pagelist = NULL;
2097
2098 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2099 m->nr_pages);
2100 return m;
2101
2102out2:
2103 ceph_msg_put(m);
2104out:
2105 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2106 return ERR_PTR(-ENOMEM);
2107}
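/*
 * Example (sketch, not from this patch): allocate a message with a
 * 128-byte front and no data pages, then queue it on an established
 * connection. The connection drops the queued reference once the
 * message has been sent (see try_write()).
 *
 *	struct ceph_msg *m;
 *
 *	m = ceph_msg_new(CEPH_MSG_STATFS, 128, 0, 0, NULL);
 *	if (IS_ERR(m))
 *		return PTR_ERR(m);
 *	(fill m->front.iov_base with the request payload)
 *	ceph_con_send(con, m);
 */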
2108
2109/*
2110 * Allocate "middle" portion of a message, if it is needed and wasn't
2111 * allocated by alloc_msg. This allows us to read a small fixed-size
2112 * per-type header in the front and then gracefully fail (i.e.,
2113 * propagate the error to the caller based on info in the front) when
2114 * the middle is too large.
2115 */
2116static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2117{
2118 int type = le16_to_cpu(msg->hdr.type);
2119 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2120
2121 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2122 ceph_msg_type_name(type), middle_len);
2123 BUG_ON(!middle_len);
2124 BUG_ON(msg->middle);
2125
2126 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2127 if (!msg->middle)
2128 return -ENOMEM;
2129 return 0;
2130}
2131
2132/*
2133 * Generic message allocator, for incoming messages.
2134 */
2135static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2136 struct ceph_msg_header *hdr,
2137 int *skip)
2138{
2139 int type = le16_to_cpu(hdr->type);
2140 int front_len = le32_to_cpu(hdr->front_len);
2141 int middle_len = le32_to_cpu(hdr->middle_len);
2142 struct ceph_msg *msg = NULL;
2143 int ret;
2144
2145 if (con->ops->alloc_msg) {
2146 mutex_unlock(&con->mutex);
2147 msg = con->ops->alloc_msg(con, hdr, skip);
2148 mutex_lock(&con->mutex);
2149 if (IS_ERR(msg))
2150 return msg;
2151
2152 if (*skip)
2153 return NULL;
2154 }
2155 if (!msg) {
2156 *skip = 0;
2157 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2158 if (IS_ERR(msg)) {
2159 pr_err("unable to allocate msg type %d len %d\n",
2160 type, front_len);
2161 return ERR_PTR(-ENOMEM);
2162 }
2163 }
2164 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2165
2166 if (middle_len) {
2167 ret = ceph_alloc_middle(con, msg);
2168
2169 if (ret < 0) {
2170 ceph_msg_put(msg);
2171 return ERR_PTR(ret);
2172 }
2173 }
2174
2175 return msg;
2176}
2177
2178
2179/*
2180 * Free a generically kmalloc'd message.
2181 */
2182void ceph_msg_kfree(struct ceph_msg *m)
2183{
2184 dout("msg_kfree %p\n", m);
2185 if (m->front_is_vmalloc)
2186 vfree(m->front.iov_base);
2187 else
2188 kfree(m->front.iov_base);
2189 kfree(m);
2190}
2191
2192/*
2193 * Drop a msg ref. Destroy as needed.
2194 */
2195void ceph_msg_last_put(struct kref *kref)
2196{
2197 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2198
2199 dout("ceph_msg_put last one on %p\n", m);
2200 WARN_ON(!list_empty(&m->list_head));
2201
2202 /* drop middle, data, if any */
2203 if (m->middle) {
2204 ceph_buffer_put(m->middle);
2205 m->middle = NULL;
2206 }
2207 m->nr_pages = 0;
2208 m->pages = NULL;
2209
2210 if (m->pagelist) {
2211 ceph_pagelist_release(m->pagelist);
2212 kfree(m->pagelist);
2213 m->pagelist = NULL;
2214 }
2215
2216 if (m->pool)
2217 ceph_msgpool_put(m->pool, m);
2218 else
2219 ceph_msg_kfree(m);
2220}
2221
2222void ceph_msg_dump(struct ceph_msg *msg)
2223{
2224 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2225 msg->front_max, msg->nr_pages);
2226 print_hex_dump(KERN_DEBUG, "header: ",
2227 DUMP_PREFIX_OFFSET, 16, 1,
2228 &msg->hdr, sizeof(msg->hdr), true);
2229 print_hex_dump(KERN_DEBUG, " front: ",
2230 DUMP_PREFIX_OFFSET, 16, 1,
2231 msg->front.iov_base, msg->front.iov_len, true);
2232 if (msg->middle)
2233 print_hex_dump(KERN_DEBUG, "middle: ",
2234 DUMP_PREFIX_OFFSET, 16, 1,
2235 msg->middle->vec.iov_base,
2236 msg->middle->vec.iov_len, true);
2237 print_hex_dump(KERN_DEBUG, "footer: ",
2238 DUMP_PREFIX_OFFSET, 16, 1,
2239 &msg->footer, sizeof(msg->footer), true);
2240}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..a343dae73cdc
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,255 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%lld */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
56
57struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */
59 struct ceph_entity_addr my_enc_addr;
60 struct page *zero_page; /* used in certain error cases */
61
62 bool nocrc;
63
64 /*
65 * the global_seq counts connections I (attempt to) initiate
66 * in order to disambiguate certain connect race conditions.
67 */
68 u32 global_seq;
69 spinlock_t global_seq_lock;
70};
71
72/*
73 * a single message. it contains a header (src, dest, message type, etc.),
74 * footer (crc values, mainly), a "front" message body, and possibly a
75 * data payload (stored in some number of pages).
76 */
77struct ceph_msg {
78 struct ceph_msg_header hdr; /* header */
79 struct ceph_msg_footer footer; /* footer */
80 struct kvec front; /* unaligned blobs of message */
81 struct ceph_buffer *middle;
82 struct page **pages; /* data payload. NOT OWNER. */
83 unsigned nr_pages; /* size of page array */
84 struct ceph_pagelist *pagelist; /* instead of pages */
85 struct list_head list_head;
86 struct kref kref;
87 bool front_is_vmalloc;
88 bool more_to_follow;
89 int front_max;
90
91 struct ceph_msgpool *pool;
92};
93
94struct ceph_msg_pos {
95 int page, page_pos; /* which page; offset in page */
96 int data_pos; /* offset in data payload */
97 int did_page_crc; /* true if we've calculated crc for current page */
98};
99
100/* ceph connection fault delay defaults, for exponential backoff */
101#define BASE_DELAY_INTERVAL (HZ/2)
102#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
103
104/*
105 * ceph_connection state bit flags
106 *
107 * QUEUED and BUSY are used together to ensure that only a single
108 * thread is currently opening, reading or writing data to the socket.
109 */
110#define LOSSYTX 0 /* we can close channel or drop messages on errors */
111#define CONNECTING 1
112#define NEGOTIATING 2
113#define KEEPALIVE_PENDING 3
114#define WRITE_PENDING 4 /* we have data ready to send */
115#define QUEUED 5 /* there is work queued on this connection */
116#define BUSY 6 /* work is being done */
117#define STANDBY 8 /* no outgoing messages, socket closed. we keep
118 * the ceph_connection around to maintain shared
119 * state with the peer. */
120#define CLOSED 10 /* we've closed the connection */
121#define SOCK_CLOSED 11 /* socket state changed to closed */
122#define OPENING 13 /* open connection w/ (possibly new) peer */
123#define DEAD 14 /* dead, about to kfree */
124
125/*
126 * A single connection with another host.
127 *
128 * We maintain a queue of outgoing messages, and some session state to
129 * ensure that we can preserve the lossless, ordered delivery of
130 * messages in the case of a TCP disconnect.
131 */
132struct ceph_connection {
133 void *private;
134 atomic_t nref;
135
136 const struct ceph_connection_operations *ops;
137
138 struct ceph_messenger *msgr;
139 struct socket *sock;
140 unsigned long state; /* connection state (see flags above) */
141 const char *error_msg; /* error message, if any */
142
143 struct ceph_entity_addr peer_addr; /* peer address */
144 struct ceph_entity_name peer_name; /* peer name */
145 struct ceph_entity_addr peer_addr_for_me;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection (client-side) */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 u64 out_seq_sent; /* last message sent */
161 bool out_keepalive_pending;
162
163 u64 in_seq, in_seq_acked; /* last message received, acked */
164
165 /* connection negotiation temps */
166 char in_banner[CEPH_BANNER_MAX_LEN];
167 union {
168 struct { /* outgoing connection */
169 struct ceph_msg_connect out_connect;
170 struct ceph_msg_connect_reply in_reply;
171 };
172 struct { /* incoming */
173 struct ceph_msg_connect in_connect;
174 struct ceph_msg_connect_reply out_reply;
175 };
176 };
177 struct ceph_entity_addr actual_peer_addr;
178
179 /* message out temps */
180 struct ceph_msg *out_msg; /* sending message (== tail of
181 out_sent) */
182 bool out_msg_done;
183 struct ceph_msg_pos out_msg_pos;
184
185 struct kvec out_kvec[8], /* sending header/footer data */
186 *out_kvec_cur;
187 int out_kvec_left; /* kvec's left in out_kvec */
188 int out_skip; /* skip this many bytes */
189 int out_kvec_bytes; /* total bytes left */
190 bool out_kvec_is_msg; /* kvec refers to out_msg */
191 int out_more; /* there is more data after the kvecs */
192 __le64 out_temp_ack; /* for writing an ack */
193
194 /* message in temps */
195 struct ceph_msg_header in_hdr;
196 struct ceph_msg *in_msg;
197 struct ceph_msg_pos in_msg_pos;
198 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
199
200 char in_tag; /* protocol control byte */
201 int in_base_pos; /* bytes read */
202 __le64 in_temp_ack; /* for reading an ack */
203
204 struct delayed_work work; /* send|recv work */
205 unsigned long delay; /* current delay interval */
206};
207
208
209extern const char *pr_addr(const struct sockaddr_storage *ss);
210extern int ceph_parse_ips(const char *c, const char *end,
211 struct ceph_entity_addr *addr,
212 int max_count, int *count);
213
214
215extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len,
237 int page_len, int page_off,
238 struct page **pages);
239extern void ceph_msg_kfree(struct ceph_msg *m);
240
241
242static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
243{
244 kref_get(&msg->kref);
245 return msg;
246}
247extern void ceph_msg_last_put(struct kref *kref);
248static inline void ceph_msg_put(struct ceph_msg *msg)
249{
250 kref_put(&msg->kref, ceph_msg_last_put);
251}
252
253extern void ceph_msg_dump(struct ceph_msg *msg);
254
255#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..8fdc011ca956
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,835 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 if (num_mon >= CEPH_MAX_MON)
60 goto bad;
61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
62
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
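/*
 * Monmap wire format decoded above (sketch):
 *
 *	u32 len | u16 version | fsid | u32 epoch | u32 num_mon
 *	| num_mon * struct ceph_entity_inst
 *
 * All integers are little-endian; the addresses are fixed up with
 * ceph_decode_addr() after the bulk copy.
 */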
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth);
109}
110
111/*
112 * Close monitor session, if any.
113 */
114static void __close_session(struct ceph_mon_client *monc)
115{
116 if (monc->con) {
117 dout("__close_session closing mon%d\n", monc->cur_mon);
118 ceph_con_revoke(monc->con, monc->m_auth);
119 ceph_con_close(monc->con);
120 monc->cur_mon = -1;
121 monc->pending_auth = 0;
122 ceph_auth_reset(monc->auth);
123 }
124}
125
126/*
127 * Open a session with a (new) monitor.
128 */
129static int __open_session(struct ceph_mon_client *monc)
130{
131 u8 r;
132 int ret;
133
134 if (monc->cur_mon < 0) {
135 get_random_bytes(&r, 1);
136 monc->cur_mon = r % monc->monmap->num_mon;
137 dout("open_session num=%d r=%d -> mon%d\n",
138 monc->monmap->num_mon, r, monc->cur_mon);
139 monc->sub_sent = 0;
140 monc->sub_renew_after = jiffies; /* i.e., expired */
141 monc->want_next_osdmap = !!monc->want_next_osdmap;
142
143 dout("open_session mon%d opening\n", monc->cur_mon);
144 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
145 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
146 ceph_con_open(monc->con,
147 &monc->monmap->mon_inst[monc->cur_mon].addr);
148
149 /* initiate authentication handshake */
150 ret = ceph_auth_build_hello(monc->auth,
151 monc->m_auth->front.iov_base,
152 monc->m_auth->front_max);
153 __send_prepared_auth_request(monc, ret);
154 } else {
155 dout("open_session mon%d already open\n", monc->cur_mon);
156 }
157 return 0;
158}
159
160static bool __sub_expired(struct ceph_mon_client *monc)
161{
162 return time_after_eq(jiffies, monc->sub_renew_after);
163}
164
165/*
166 * Reschedule delayed work timer.
167 */
168static void __schedule_delayed(struct ceph_mon_client *monc)
169{
170 unsigned delay;
171
172 if (monc->cur_mon < 0 || __sub_expired(monc))
173 delay = 10 * HZ;
174 else
175 delay = 20 * HZ;
176 dout("__schedule_delayed after %u\n", delay);
177 schedule_delayed_work(&monc->delayed_work, delay);
178}
179
180/*
181 * Send subscribe request for mdsmap and/or osdmap.
182 */
183static void __send_subscribe(struct ceph_mon_client *monc)
184{
185 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
186 (unsigned)monc->sub_sent, __sub_expired(monc),
187 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg;
191 struct ceph_mon_subscribe_item *i;
192 void *p, *end;
193
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base;
199 end = p + msg->front.iov_len;
200
201 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap);
203 if (monc->want_next_osdmap) {
204 dout("__send_subscribe to 'osdmap' %u\n",
205 (unsigned)monc->have_osdmap);
206 ceph_encode_32(&p, 3);
207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1;
211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */
213 } else {
214 ceph_encode_32(&p, 2);
215 }
216 ceph_encode_string(&p, end, "mdsmap", 6);
217 i = p;
218 i->have = cpu_to_le64(monc->have_mdsmap);
219 i->onetime = 0;
220 p += sizeof(*i);
221 ceph_encode_string(&p, end, "monmap", 6);
222 i = p;
223 i->have = 0;
224 i->onetime = 0;
225 p += sizeof(*i);
226
227 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg);
230
231 monc->sub_sent = jiffies | 1; /* never 0 */
232 }
233}
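/*
 * Subscribe payload built above (sketch): a u32 item count followed by
 * (length-prefixed map name, ceph_mon_subscribe_item) pairs, e.g. when
 * a new osdmap is wanted:
 *
 *	u32 3 | "osdmap" {have_osdmap, onetime=1}
 *	      | "mdsmap" {have_mdsmap, onetime=0}
 *	      | "monmap" {0, onetime=0}
 */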
234
235static void handle_subscribe_ack(struct ceph_mon_client *monc,
236 struct ceph_msg *msg)
237{
238 unsigned seconds;
239 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
240
241 if (msg->front.iov_len < sizeof(*h))
242 goto bad;
243 seconds = le32_to_cpu(h->duration);
244
245 mutex_lock(&monc->mutex);
246 if (monc->hunting) {
247 pr_info("mon%d %s session established\n",
248 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
249 monc->hunting = false;
250 }
251 dout("handle_subscribe_ack after %d seconds\n", seconds);
252 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
253 monc->sub_sent = 0;
254 mutex_unlock(&monc->mutex);
255 return;
256bad:
257 pr_err("got corrupt subscribe-ack msg\n");
258 ceph_msg_dump(msg);
259}
260
261/*
262 * Keep track of which maps we have
263 */
264int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
265{
266 mutex_lock(&monc->mutex);
267 monc->have_mdsmap = got;
268 mutex_unlock(&monc->mutex);
269 return 0;
270}
271
272int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
273{
274 mutex_lock(&monc->mutex);
275 monc->have_osdmap = got;
276 monc->want_next_osdmap = 0;
277 mutex_unlock(&monc->mutex);
278 return 0;
279}
280
281/*
282 * Register interest in the next osdmap
283 */
284void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
285{
286 dout("request_next_osdmap have %u\n", monc->have_osdmap);
287 mutex_lock(&monc->mutex);
288 if (!monc->want_next_osdmap)
289 monc->want_next_osdmap = 1;
290 if (monc->want_next_osdmap < 2)
291 __send_subscribe(monc);
292 mutex_unlock(&monc->mutex);
293}
294
295/*
296 * Ensure we have an open session with a monitor.
297 */
298int ceph_monc_open_session(struct ceph_mon_client *monc)
299{
300 if (!monc->con) {
301 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
302 if (!monc->con)
303 return -ENOMEM;
304 ceph_con_init(monc->client->msgr, monc->con);
305 monc->con->private = monc;
306 monc->con->ops = &mon_con_ops;
307 }
308
309 mutex_lock(&monc->mutex);
310 __open_session(monc);
311 __schedule_delayed(monc);
312 mutex_unlock(&monc->mutex);
313 return 0;
314}
315
316/*
317 * The monitor responds with a mount ack to indicate mount success. The
318 * included client ticket allows the client to talk to MDSs and OSDs.
319 */
320static void ceph_monc_handle_map(struct ceph_mon_client *monc,
321 struct ceph_msg *msg)
322{
323 struct ceph_client *client = monc->client;
324 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
325 void *p, *end;
326
327 mutex_lock(&monc->mutex);
328
329 dout("handle_monmap\n");
330 p = msg->front.iov_base;
331 end = p + msg->front.iov_len;
332
333 monmap = ceph_monmap_decode(p, end);
334 if (IS_ERR(monmap)) {
335 pr_err("problem decoding monmap, %d\n",
336 (int)PTR_ERR(monmap));
337 goto out;
338 }
339
340 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
341 kfree(monmap);
342 goto out;
343 }
344
345 client->monc.monmap = monmap;
346 kfree(old);
347
348out:
349 mutex_unlock(&monc->mutex);
350 wake_up(&client->auth_wq);
351}
352
353/*
354 * statfs
355 */
356static struct ceph_mon_statfs_request *__lookup_statfs(
357 struct ceph_mon_client *monc, u64 tid)
358{
359 struct ceph_mon_statfs_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node;
361
362 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node);
364 if (tid < req->tid)
365 n = n->rb_left;
366 else if (tid > req->tid)
367 n = n->rb_right;
368 else
369 return req;
370 }
371 return NULL;
372}
373
374static void __insert_statfs(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new)
376{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node;
378 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL;
380
381 while (*p) {
382 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
384 if (new->tid < req->tid)
385 p = &(*p)->rb_left;
386 else if (new->tid > req->tid)
387 p = &(*p)->rb_right;
388 else
389 BUG();
390 }
391
392 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree);
394}
395
396static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg)
398{
399 struct ceph_mon_statfs_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid;
402
403 if (msg->front.iov_len != sizeof(*reply))
404 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407
408 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid);
410 if (req) {
411 *req->buf = reply->st;
412 req->result = 0;
413 }
414 mutex_unlock(&monc->mutex);
415 if (req)
416 complete(&req->completion);
417 return;
418
419bad:
420 pr_err("corrupt statfs reply, no tid\n");
421 ceph_msg_dump(msg);
422}
423
424/*
425 * (re)send a statfs request
426 */
427static int send_statfs(struct ceph_mon_client *monc,
428 struct ceph_mon_statfs_request *req)
429{
430 struct ceph_msg *msg;
431 struct ceph_mon_statfs *h;
432
433 dout("send_statfs tid %llu\n", req->tid);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
435 if (IS_ERR(msg))
436 return PTR_ERR(msg);
437 req->request = msg;
438 msg->hdr.tid = cpu_to_le64(req->tid);
439 h = msg->front.iov_base;
440 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463
464 /* register request */
465 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid;
467 req.last_attempt = jiffies;
468 req.delay = BASE_DELAY_INTERVAL;
469 __insert_statfs(monc, &req);
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex);
472
473 /* send request and wait */
474 err = send_statfs(monc, &req);
475 if (!err)
476 err = wait_for_completion_interruptible(&req.completion);
477
478 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree);
480 monc->num_statfs_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex);
483
484 if (!err)
485 err = req.result;
486 return err;
487}
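/*
 * Example caller (sketch, shaped like a filesystem ->statfs hook; the
 * client/kstatfs glue here is hypothetical, not part of this file):
 *
 *	struct ceph_statfs st;
 *	int err = ceph_monc_do_statfs(&client->monc, &st);
 *
 *	if (err < 0)
 *		return err;
 *	(translate the __le64 fields of st into the caller's kstatfs)
 */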
488
489/*
490 * Resend pending statfs requests.
491 */
492static void __resend_statfs(struct ceph_mon_client *monc)
493{
494 struct ceph_mon_statfs_request *req;
495 struct rb_node *p;
496
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node);
499 send_statfs(monc, req);
500 }
501}
502
503/*
504 * Delayed work. If we haven't mounted yet, retry. Otherwise,
505 * renew/retry subscription as needed (in case it is timing out, or we
506 * got an ENOMEM). And keep the monitor connection alive.
507 */
508static void delayed_work(struct work_struct *work)
509{
510 struct ceph_mon_client *monc =
511 container_of(work, struct ceph_mon_client, delayed_work.work);
512
513 dout("monc delayed_work\n");
514 mutex_lock(&monc->mutex);
515 if (monc->hunting) {
516 __close_session(monc);
517 __open_session(monc); /* continue hunting */
518 } else {
519 ceph_con_keepalive(monc->con);
520
521 __validate_auth(monc);
522
523 if (monc->auth->ops->is_authenticated(monc->auth))
524 __send_subscribe(monc);
525 }
526 __schedule_delayed(monc);
527 mutex_unlock(&monc->mutex);
528}
529
530/*
531 * On startup, we build a temporary monmap populated with the IPs
532 * provided by mount(2).
533 */
534static int build_initial_monmap(struct ceph_mon_client *monc)
535{
536 struct ceph_mount_args *args = monc->client->mount_args;
537 struct ceph_entity_addr *mon_addr = args->mon_addr;
538 int num_mon = args->num_mon;
539 int i;
540
541 /* build initial monmap */
542 monc->monmap = kzalloc(sizeof(*monc->monmap) +
543 num_mon*sizeof(monc->monmap->mon_inst[0]),
544 GFP_KERNEL);
545 if (!monc->monmap)
546 return -ENOMEM;
547 for (i = 0; i < num_mon; i++) {
548 monc->monmap->mon_inst[i].addr = mon_addr[i];
549 monc->monmap->mon_inst[i].addr.nonce = 0;
550 monc->monmap->mon_inst[i].name.type =
551 CEPH_ENTITY_TYPE_MON;
552 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
553 }
554 monc->monmap->num_mon = num_mon;
555 monc->have_fsid = false;
556
557 /* release addr memory */
558 kfree(args->mon_addr);
559 args->mon_addr = NULL;
560 args->num_mon = 0;
561 return 0;
562}
563
564int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
565{
566 int err = 0;
567
568 dout("init\n");
569 memset(monc, 0, sizeof(*monc));
570 monc->client = cl;
571 monc->monmap = NULL;
572 mutex_init(&monc->mutex);
573
574 err = build_initial_monmap(monc);
575 if (err)
576 goto out;
577
578 monc->con = NULL;
579
580 /* authentication */
581 monc->auth = ceph_auth_init(cl->mount_args->name,
582 cl->mount_args->secret);
583 if (IS_ERR(monc->auth))
584 return PTR_ERR(monc->auth);
585 monc->auth->want_keys =
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588
589 /* msg pools */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
591 sizeof(struct ceph_mon_subscribe_ack), 1, false);
592 if (err < 0)
593 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
595 sizeof(struct ceph_mon_statfs_reply), 0, false);
596 if (err < 0)
597 goto out_pool1;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
599 if (err < 0)
600 goto out_pool2;
601
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
603 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) {
605 err = PTR_ERR(monc->m_auth);
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609
610 monc->cur_mon = -1;
611 monc->hunting = true;
612 monc->sub_renew_after = jiffies;
613 monc->sub_sent = 0;
614
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0;
618 monc->last_tid = 0;
619
620 monc->have_mdsmap = 0;
621 monc->have_osdmap = 0;
622 monc->want_next_osdmap = 1;
623 return 0;
624
625out_pool3:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
627out_pool2:
628 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
629out_pool1:
630 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
631out_monmap:
632 kfree(monc->monmap);
633out:
634 return err;
635}
636
637void ceph_monc_stop(struct ceph_mon_client *monc)
638{
639 dout("stop\n");
640 cancel_delayed_work_sync(&monc->delayed_work);
641
642 mutex_lock(&monc->mutex);
643 __close_session(monc);
644 if (monc->con) {
645 monc->con->private = NULL;
646 monc->con->ops->put(monc->con);
647 monc->con = NULL;
648 }
649 mutex_unlock(&monc->mutex);
650
651 ceph_auth_destroy(monc->auth);
652
653 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
657
658 kfree(monc->monmap);
659}
660
661static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg)
663{
664 int ret;
665
666 mutex_lock(&monc->mutex);
667 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len,
670 monc->m_auth->front.iov_base,
671 monc->m_auth->front_max);
672 if (ret < 0) {
673 monc->client->auth_err = ret;
674 wake_up(&monc->client->auth_wq);
675 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n");
679
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681 monc->client->msgr->inst.name.num = cpu_to_le64(monc->auth->global_id);
682
683 __send_subscribe(monc);
684 __resend_statfs(monc);
685 }
686 mutex_unlock(&monc->mutex);
687}
688
689static int __validate_auth(struct ceph_mon_client *monc)
690{
691 int ret;
692
693 if (monc->pending_auth)
694 return 0;
695
696 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
697 monc->m_auth->front_max);
698 if (ret <= 0)
699 return ret; /* either an error, or no need to authenticate */
700 __send_prepared_auth_request(monc, ret);
701 return 0;
702}
703
704int ceph_monc_validate_auth(struct ceph_mon_client *monc)
705{
706 int ret;
707
708 mutex_lock(&monc->mutex);
709 ret = __validate_auth(monc);
710 mutex_unlock(&monc->mutex);
711 return ret;
712}
713
714/*
715 * handle incoming message
716 */
717static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
718{
719 struct ceph_mon_client *monc = con->private;
720 int type = le16_to_cpu(msg->hdr.type);
721
722 if (!monc)
723 return;
724
725 switch (type) {
726 case CEPH_MSG_AUTH_REPLY:
727 handle_auth_reply(monc, msg);
728 break;
729
730 case CEPH_MSG_MON_SUBSCRIBE_ACK:
731 handle_subscribe_ack(monc, msg);
732 break;
733
734 case CEPH_MSG_STATFS_REPLY:
735 handle_statfs_reply(monc, msg);
736 break;
737
738 case CEPH_MSG_MON_MAP:
739 ceph_monc_handle_map(monc, msg);
740 break;
741
742 case CEPH_MSG_MDS_MAP:
743 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
744 break;
745
746 case CEPH_MSG_OSD_MAP:
747 ceph_osdc_handle_map(&monc->client->osdc, msg);
748 break;
749
750 default:
751 pr_err("received unknown message type %d %s\n", type,
752 ceph_msg_type_name(type));
753 }
754 ceph_msg_put(msg);
755}
756
757/*
758 * Allocate memory for incoming message
759 */
760static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
761 struct ceph_msg_header *hdr,
762 int *skip)
763{
764 struct ceph_mon_client *monc = con->private;
765 int type = le16_to_cpu(hdr->type);
766 int front_len = le32_to_cpu(hdr->front_len);
767 struct ceph_msg *m = NULL;
768
769 *skip = 0;
770
771 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
774 break;
775 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
777 break;
778 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
780 break;
781 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL);
785 break;
786 }
787
788 if (!m) {
789 pr_info("alloc_msg unknown type %d\n", type);
790 *skip = 1;
791 }
792 return m;
793}
794
795/*
796 * If the monitor connection resets, pick a new monitor and resubmit
797 * any pending requests.
798 */
799static void mon_fault(struct ceph_connection *con)
800{
801 struct ceph_mon_client *monc = con->private;
802
803 if (!monc)
804 return;
805
806 dout("mon_fault\n");
807 mutex_lock(&monc->mutex);
808 if (!con->private)
809 goto out;
810
811 if (monc->con && !monc->hunting)
812 pr_info("mon%d %s session lost, "
813 "hunting for new mon\n", monc->cur_mon,
814 pr_addr(&monc->con->peer_addr.in_addr));
815
816 __close_session(monc);
817 if (!monc->hunting) {
818 /* start hunting */
819 monc->hunting = true;
820 __open_session(monc);
821 } else {
822 /* already hunting, let's wait a bit */
823 __schedule_delayed(monc);
824 }
825out:
826 mutex_unlock(&monc->mutex);
827}
828
829static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get,
831 .put = ceph_con_put,
832 .dispatch = dispatch,
833 .fault = mon_fault,
834 .alloc_msg = mon_alloc_msg,
835};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * statfs() is done a bit differently because we need to get data back
44 * to the caller
45 */
46struct ceph_mon_statfs_request {
47 u64 tid;
48 struct rb_node node;
49 int result;
50 struct ceph_statfs *buf;
51 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */
54};
55
56struct ceph_mon_client {
57 struct ceph_client *client;
58 struct ceph_monmap *monmap;
59
60 struct mutex mutex;
61 struct delayed_work delayed_work;
62
63 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth;
65 int pending_auth;
66
67 bool hunting;
68 int cur_mon; /* last monitor i contacted */
69 unsigned long sub_sent, sub_renew_after;
70 struct ceph_connection *con;
71 bool have_fsid;
72
73 /* msg pools */
74 struct ceph_msgpool msgpool_subscribe_ack;
75 struct ceph_msgpool msgpool_statfs_reply;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid;
82
83 /* mds/osd map */
84 int want_next_osdmap; /* 1 = want, 2 = want+asked */
85 u32 have_osdmap, have_mdsmap;
86
87#ifdef CONFIG_DEBUG_FS
88 struct dentry *debugfs_file;
89#endif
90};
91
92extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
93extern int ceph_monmap_contains(struct ceph_monmap *m,
94 struct ceph_entity_addr *addr);
95
96extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
97extern void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
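/*
 * Illustrative pairing of the two calls above (a sketch, not part of
 * the original header):
 *
 *	ceph_monc_request_next_osdmap(monc);
 *	...
 *	(later, after receiving and decoding a map of epoch e)
 *	ceph_monc_got_osdmap(monc, e);
 */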
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
 13 * conditions at unexpected times. We use a few different
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
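/*
 * Minimal usage sketch (illustrative only; the sizes and the blocking
 * flag here are made up, not taken from any caller in this patch):
 *
 *	struct ceph_msgpool pool;
 *	struct ceph_msg *msg;
 *
 *	if (ceph_msgpool_init(&pool, 512, 4, true) < 0)  (four 512-byte msgs)
 *		return -ENOMEM;
 *	msg = ceph_msgpool_get(&pool, 0);        (0 means pool->front_len)
 *	...
 *	ceph_msgpool_put(&pool, msg);            (back to the pool, not freed)
 *	ceph_msgpool_destroy(&pool);
 */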
31
32
33/*
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{
38 struct ceph_msg *msg;
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
83void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{
85 dout("msgpool_destroy %p\n", pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90}
91
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
93{
94 int ret;
95
96 spin_lock(&pool->lock);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
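/*
 * Get a message from the pool. A blocking pool first tries a fresh
 * allocation (mempool_t behavior) and otherwise sleeps until a message
 * comes back via ceph_msgpool_put(); a non-blocking pool falls back to
 * a fresh allocation and may fail with ERR_PTR(-ENOMEM).
 */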
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
164
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{
167 spin_lock(&pool->lock);
168 if (pool->num < pool->min) {
169 /* reset msg front_len; user may have changed it */
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172
173 kref_set(&msg->kref, 1); /* retake a single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include "messenger.h"
5
6/*
7 * we use memory pools for preallocating messages we may receive, to
8 * avoid unexpected OOM conditions.
9 */
10struct ceph_msgpool {
11 spinlock_t lock;
12 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
27#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * tcp connection banner. include a protocol version. and adjust
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns -1, 0, or 1.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20
55
56#define CEPH_ENTITY_TYPE_ANY 0xFF
57
58extern const char *ceph_entity_type_name(int type);
59
60/*
61 * entity_addr -- network address
62 */
63struct ceph_entity_addr {
64 __le32 type;
65 __le32 nonce; /* unique id for process (e.g. pid) */
66 struct sockaddr_storage in_addr;
67} __attribute__ ((packed));
68
69struct ceph_entity_inst {
70 struct ceph_entity_name name;
71 struct ceph_entity_addr addr;
72} __attribute__ ((packed));
73
74
75/* used by message exchange protocol */
76#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
77#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
78#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
79 incoming connection */
80#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
81 with higher cseq */
82#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
83 with higher gseq */
84#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
85#define CEPH_MSGR_TAG_MSG 7 /* message */
86#define CEPH_MSGR_TAG_ACK 8 /* message ack */
87#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
88#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
89#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
90#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
91
92
93/*
94 * connection negotiation
95 */
96struct ceph_msg_connect {
97 __le64 features; /* supported feature bits */
98 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
99 __le32 global_seq; /* count connections initiated by this host */
100 __le32 connect_seq; /* count connections initiated in this session */
101 __le32 protocol_version;
102 __le32 authorizer_protocol;
103 __le32 authorizer_len;
104 __u8 flags; /* CEPH_MSG_CONNECT_* */
105} __attribute__ ((packed));
106
107struct ceph_msg_connect_reply {
108 __u8 tag;
109 __le64 features; /* feature bits for this session */
110 __le32 global_seq;
111 __le32 connect_seq;
112 __le32 protocol_version;
113 __le32 authorizer_len;
114 __u8 flags;
115} __attribute__ ((packed));
116
117#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
118
119
120/*
121 * message header
122 */
123struct ceph_msg_header {
124 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128 __le16 version; /* version of message encoding */
129
130 __le32 front_len; /* bytes in main payload */
131 __le32 middle_len;/* bytes in middle payload */
132 __le32 data_len; /* bytes of data payload */
133 __le16 data_off; /* sender: include full offset;
134 receiver: mask against ~PAGE_MASK */
135
136 struct ceph_entity_inst src, orig_src;
137 __le32 reserved;
138 __le32 crc; /* header crc32c */
139} __attribute__ ((packed));
140
141#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196
144#define CEPH_MSG_PRIO_HIGHEST 255
145
146/*
147 * follows data payload
148 */
149struct ceph_msg_footer {
150 __le32 front_crc, middle_crc, data_crc;
151 __u8 flags;
152} __attribute__ ((packed));
153
154#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
155#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
156
157
158#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..c7b4dedaace6
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1550 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
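/*
 * Worked example (illustrative; assumes a trivial layout of 4 MB
 * objects with stripe unit == object size and stripe count 1): a
 * request at off=6 MB with *plen=4 MB maps to bno=1, objoff=2 MB,
 * and *plen is shortened to 2 MB at the object boundary. The caller
 * is expected to resubmit the remaining 2 MB against the next object.
 */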
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg));
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg));
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
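/*
 * Find the registered request with the smallest tid >= @tid, or NULL
 * if there is none.
 */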
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref))
365 kfree(osd);
366}
367
368/*
369 * remove an osd from our map
370 */
371static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
372{
373 dout("__remove_osd %p\n", osd);
374 BUG_ON(!list_empty(&osd->o_requests));
375 rb_erase(&osd->o_node, &osdc->osds);
376 list_del_init(&osd->o_osd_lru);
377 ceph_con_close(&osd->o_con);
378 put_osd(osd);
379}
380
381static void __move_osd_to_lru(struct ceph_osd_client *osdc,
382 struct ceph_osd *osd)
383{
384 dout("__move_osd_to_lru %p\n", osd);
385 BUG_ON(!list_empty(&osd->o_osd_lru));
386 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
387 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
388}
389
390static void __remove_osd_from_lru(struct ceph_osd *osd)
391{
392 dout("__remove_osd_from_lru %p\n", osd);
393 if (!list_empty(&osd->o_osd_lru))
394 list_del_init(&osd->o_osd_lru);
395}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
401 dout("__remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
411/*
412 * reset osd connect
413 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{
416 struct ceph_osd_request *req;
417 int ret = 0;
418
419 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
420 if (list_empty(&osd->o_requests)) {
421 __remove_osd(osdc, osd);
422 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
423 &osd->o_con.peer_addr,
424 sizeof(osd->o_con.peer_addr)) == 0 &&
425 !ceph_con_opened(&osd->o_con)) {
426 dout(" osd addr hasn't changed and connection never opened,"
427 " letting msgr retry");
 428 /* touch each r_stamp for handle_timeout()'s benefit */
429 list_for_each_entry(req, &osd->o_requests, r_osd_item)
430 req->r_stamp = jiffies;
431 ret = -EAGAIN;
432 } else {
433 ceph_con_close(&osd->o_con);
434 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
435 osd->o_incarnation++;
436 }
437 return ret;
438}
439
440static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
441{
442 struct rb_node **p = &osdc->osds.rb_node;
443 struct rb_node *parent = NULL;
444 struct ceph_osd *osd = NULL;
445
446 while (*p) {
447 parent = *p;
448 osd = rb_entry(parent, struct ceph_osd, o_node);
449 if (new->o_osd < osd->o_osd)
450 p = &(*p)->rb_left;
451 else if (new->o_osd > osd->o_osd)
452 p = &(*p)->rb_right;
453 else
454 BUG();
455 }
456
457 rb_link_node(&new->o_node, parent, p);
458 rb_insert_color(&new->o_node, &osdc->osds);
459}
460
461static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
462{
463 struct ceph_osd *osd;
464 struct rb_node *n = osdc->osds.rb_node;
465
466 while (n) {
467 osd = rb_entry(n, struct ceph_osd, o_node);
468 if (o < osd->o_osd)
469 n = n->rb_left;
470 else if (o > osd->o_osd)
471 n = n->rb_right;
472 else
473 return osd;
474 }
475 return NULL;
476}
477
478static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
479{
480 schedule_delayed_work(&osdc->timeout_work,
481 osdc->client->mount_args->osd_keepalive_timeout * HZ);
482}
483
484static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
485{
486 cancel_delayed_work(&osdc->timeout_work);
487}
488
489/*
490 * Register request, assign tid. If this is the first request, set up
491 * the timeout event.
492 */
493static void register_request(struct ceph_osd_client *osdc,
494 struct ceph_osd_request *req)
495{
496 mutex_lock(&osdc->request_mutex);
497 req->r_tid = ++osdc->last_tid;
498 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
499 INIT_LIST_HEAD(&req->r_req_lru_item);
500
501 dout("register_request %p tid %lld\n", req, req->r_tid);
502 __insert_request(osdc, req);
503 ceph_osdc_get_request(req);
504 osdc->num_requests++;
505
506 if (osdc->num_requests == 1) {
507 dout(" first request, scheduling timeout\n");
508 __schedule_osd_timeout(osdc);
509 }
510 mutex_unlock(&osdc->request_mutex);
511}
512
513/*
514 * called under osdc->request_mutex
515 */
516static void __unregister_request(struct ceph_osd_client *osdc,
517 struct ceph_osd_request *req)
518{
519 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
520 rb_erase(&req->r_node, &osdc->requests);
521 osdc->num_requests--;
522
523 if (req->r_osd) {
524 /* make sure the original request isn't in flight. */
525 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
526
527 list_del_init(&req->r_osd_item);
528 if (list_empty(&req->r_osd->o_requests))
529 __move_osd_to_lru(osdc, req->r_osd);
530 req->r_osd = NULL;
531 }
532
533 ceph_osdc_put_request(req);
534
535 list_del_init(&req->r_req_lru_item);
536 if (osdc->num_requests == 0) {
537 dout(" no requests, canceling timeout\n");
538 __cancel_osd_timeout(osdc);
539 }
540}
541
542/*
543 * Cancel a previously queued request message
544 */
545static void __cancel_request(struct ceph_osd_request *req)
546{
547 if (req->r_sent) {
548 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
549 req->r_sent = 0;
550 }
551 list_del_init(&req->r_req_lru_item);
552}
553
554/*
555 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
556 * (as needed), and set the request r_osd appropriately. If there is
557 * no up osd, set r_osd to NULL.
558 *
559 * Return 0 if unchanged, 1 if changed, or negative on error.
560 *
561 * Caller should hold map_sem for read and request_mutex.
562 */
563static int __map_osds(struct ceph_osd_client *osdc,
564 struct ceph_osd_request *req)
565{
566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
567 struct ceph_pg pgid;
568 int o = -1;
569 int err;
570
571 dout("map_osds %p tid %lld\n", req, req->r_tid);
572 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
573 &req->r_file_layout, osdc->osdmap);
574 if (err)
575 return err;
576 pgid = reqhead->layout.ol_pgid;
577 req->r_pgid = pgid;
578
579 o = ceph_calc_pg_primary(osdc->osdmap, pgid);
580
581 if ((req->r_osd && req->r_osd->o_osd == o &&
582 req->r_sent >= req->r_osd->o_incarnation) ||
583 (req->r_osd == NULL && o == -1))
584 return 0; /* no change */
585
586 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
587 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
588 req->r_osd ? req->r_osd->o_osd : -1);
589
590 if (req->r_osd) {
591 __cancel_request(req);
592 list_del_init(&req->r_osd_item);
593 req->r_osd = NULL;
594 }
595
596 req->r_osd = __lookup_osd(osdc, o);
597 if (!req->r_osd && o >= 0) {
598 err = -ENOMEM;
599 req->r_osd = create_osd(osdc);
600 if (!req->r_osd)
601 goto out;
602
603 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
604 req->r_osd->o_osd = o;
605 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
606 __insert_osd(osdc, req->r_osd);
607
608 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
609 }
610
611 if (req->r_osd) {
612 __remove_osd_from_lru(req->r_osd);
613 list_add(&req->r_osd_item, &req->r_osd->o_requests);
614 }
615 err = 1; /* osd changed */
616
617out:
618 return err;
619}
620
621/*
622 * caller should hold map_sem (for read) and request_mutex
623 */
624static int __send_request(struct ceph_osd_client *osdc,
625 struct ceph_osd_request *req)
626{
627 struct ceph_osd_request_head *reqhead;
628 int err;
629
630 err = __map_osds(osdc, req);
631 if (err < 0)
632 return err;
633 if (req->r_osd == NULL) {
634 dout("send_request %p no up osds in pg\n", req);
635 ceph_monc_request_next_osdmap(&osdc->client->monc);
636 return 0;
637 }
638
639 dout("send_request %p tid %llu to osd%d flags %d\n",
640 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
641
642 reqhead = req->r_request->front.iov_base;
643 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
644 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
645 reqhead->reassert_version = req->r_reassert_version;
646
647 req->r_stamp = jiffies;
648 list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
649
650 ceph_msg_get(req->r_request); /* send consumes a ref */
651 ceph_con_send(&req->r_osd->o_con, req->r_request);
652 req->r_sent = req->r_osd->o_incarnation;
653 return 0;
654}
655
656/*
 657 * Timeout callback, called every N seconds when 1 or more osd
 658 * requests have been active for more than N seconds. When this
 659 * happens, we ping all osds whose requests have timed out to
 660 * ensure any communications channel reset is detected. Reset the
 661 * request timeouts another N seconds in the future as we go.
 662 * Reschedule the timeout event another N seconds in the future
 663 * (unless there are no open requests).
664 */
665static void handle_timeout(struct work_struct *work)
666{
667 struct ceph_osd_client *osdc =
668 container_of(work, struct ceph_osd_client, timeout_work.work);
669 struct ceph_osd_request *req, *last_req = NULL;
670 struct ceph_osd *osd;
671 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
672 unsigned long keepalive =
673 osdc->client->mount_args->osd_keepalive_timeout * HZ;
674 unsigned long last_stamp = 0;
675 struct rb_node *p;
676 struct list_head slow_osds;
677
678 dout("timeout\n");
679 down_read(&osdc->map_sem);
680
681 ceph_monc_request_next_osdmap(&osdc->client->monc);
682
683 mutex_lock(&osdc->request_mutex);
684 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
685 req = rb_entry(p, struct ceph_osd_request, r_node);
686
687 if (req->r_resend) {
688 int err;
689
690 dout("osdc resending prev failed %lld\n", req->r_tid);
691 err = __send_request(osdc, req);
692 if (err)
693 dout("osdc failed again on %lld\n", req->r_tid);
694 else
695 req->r_resend = false;
696 continue;
697 }
698 }
699
700 /*
701 * reset osds that appear to be _really_ unresponsive. this
702 * is a failsafe measure.. we really shouldn't be getting to
703 * this point if the system is working properly. the monitors
704 * should mark the osd as failed and we should find out about
705 * it from an updated osd map.
706 */
707 while (!list_empty(&osdc->req_lru)) {
708 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
709 r_req_lru_item);
710
711 if (time_before(jiffies, req->r_stamp + timeout))
712 break;
713
714 BUG_ON(req == last_req && req->r_stamp == last_stamp);
715 last_req = req;
716 last_stamp = req->r_stamp;
717
718 osd = req->r_osd;
719 BUG_ON(!osd);
720 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
721 req->r_tid, osd->o_osd);
722 __kick_requests(osdc, osd);
723 }
724
725 /*
726 * ping osds that are a bit slow. this ensures that if there
727 * is a break in the TCP connection we will notice, and reopen
728 * a connection with that osd (from the fault callback).
729 */
730 INIT_LIST_HEAD(&slow_osds);
731 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
732 if (time_before(jiffies, req->r_stamp + keepalive))
733 break;
734
735 osd = req->r_osd;
736 BUG_ON(!osd);
737 dout(" tid %llu is slow, will send keepalive on osd%d\n",
738 req->r_tid, osd->o_osd);
739 list_move_tail(&osd->o_keepalive_item, &slow_osds);
740 }
741 while (!list_empty(&slow_osds)) {
742 osd = list_entry(slow_osds.next, struct ceph_osd,
743 o_keepalive_item);
744 list_del_init(&osd->o_keepalive_item);
745 ceph_con_keepalive(&osd->o_con);
746 }
747
748 __schedule_osd_timeout(osdc);
749 mutex_unlock(&osdc->request_mutex);
750
751 up_read(&osdc->map_sem);
752}
753
754static void handle_osds_timeout(struct work_struct *work)
755{
756 struct ceph_osd_client *osdc =
757 container_of(work, struct ceph_osd_client,
758 osds_timeout_work.work);
759 unsigned long delay =
760 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
761
762 dout("osds timeout\n");
763 down_read(&osdc->map_sem);
764 remove_old_osds(osdc, 0);
765 up_read(&osdc->map_sem);
766
767 schedule_delayed_work(&osdc->osds_timeout_work,
768 round_jiffies_relative(delay));
769}
770
771/*
772 * handle osd op reply. either call the callback if it is specified,
773 * or do the completion to wake up the waiting thread.
774 */
775static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
776 struct ceph_connection *con)
777{
778 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
779 struct ceph_osd_request *req;
780 u64 tid;
781 int numops, object_len, flags;
782
783 tid = le64_to_cpu(msg->hdr.tid);
784 if (msg->front.iov_len < sizeof(*rhead))
785 goto bad;
786 numops = le32_to_cpu(rhead->num_ops);
787 object_len = le32_to_cpu(rhead->object_len);
788 if (msg->front.iov_len != sizeof(*rhead) + object_len +
789 numops * sizeof(struct ceph_osd_op))
790 goto bad;
791 dout("handle_reply %p tid %llu\n", msg, tid);
792
793 /* lookup */
794 mutex_lock(&osdc->request_mutex);
795 req = __lookup_request(osdc, tid);
796 if (req == NULL) {
797 dout("handle_reply tid %llu dne\n", tid);
798 mutex_unlock(&osdc->request_mutex);
799 return;
800 }
801 ceph_osdc_get_request(req);
802 flags = le32_to_cpu(rhead->flags);
803
804 /*
805 * if this connection filled our message, drop our reference now, to
806 * avoid a (safe but slower) revoke later.
807 */
808 if (req->r_con_filling_msg == con && req->r_reply == msg) {
809 dout(" dropping con_filling_msg ref %p\n", con);
810 req->r_con_filling_msg = NULL;
811 ceph_con_put(con);
812 }
813
814 if (!req->r_got_reply) {
815 unsigned bytes;
816
817 req->r_result = le32_to_cpu(rhead->result);
818 bytes = le32_to_cpu(msg->hdr.data_len);
819 dout("handle_reply result %d bytes %d\n", req->r_result,
820 bytes);
821 if (req->r_result == 0)
822 req->r_result = bytes;
823
824 /* in case this is a write and we need to replay, */
825 req->r_reassert_version = rhead->reassert_version;
826
827 req->r_got_reply = 1;
828 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
829 dout("handle_reply tid %llu dup ack\n", tid);
830 mutex_unlock(&osdc->request_mutex);
831 goto done;
832 }
833
834 dout("handle_reply tid %llu flags %d\n", tid, flags);
835
836 /* either this is a read, or we got the safe response */
837 if ((flags & CEPH_OSD_FLAG_ONDISK) ||
838 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
839 __unregister_request(osdc, req);
840
841 mutex_unlock(&osdc->request_mutex);
842
843 if (req->r_callback)
844 req->r_callback(req, msg);
845 else
846 complete(&req->r_completion);
847
848 if (flags & CEPH_OSD_FLAG_ONDISK) {
849 if (req->r_safe_callback)
850 req->r_safe_callback(req, msg);
851 complete(&req->r_safe_completion); /* fsync waiter */
852 }
853
854done:
855 ceph_osdc_put_request(req);
856 return;
857
858bad:
859 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
860 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
861 (int)sizeof(*rhead));
862 ceph_msg_dump(msg);
863}
864
865
866static int __kick_requests(struct ceph_osd_client *osdc,
867 struct ceph_osd *kickosd)
868{
869 struct ceph_osd_request *req;
870 struct rb_node *p, *n;
871 int needmap = 0;
872 int err;
873
874 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
875 if (kickosd) {
876 err = __reset_osd(osdc, kickosd);
877 if (err == -EAGAIN)
878 return 1;
879 } else {
880 for (p = rb_first(&osdc->osds); p; p = n) {
881 struct ceph_osd *osd =
882 rb_entry(p, struct ceph_osd, o_node);
883
884 n = rb_next(p);
885 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
886 memcmp(&osd->o_con.peer_addr,
887 ceph_osd_addr(osdc->osdmap,
888 osd->o_osd),
889 sizeof(struct ceph_entity_addr)) != 0)
890 __reset_osd(osdc, osd);
891 }
892 }
893
894 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
895 req = rb_entry(p, struct ceph_osd_request, r_node);
896
897 if (req->r_resend) {
898 dout(" r_resend set on tid %llu\n", req->r_tid);
899 __cancel_request(req);
900 goto kick;
901 }
902 if (req->r_osd && kickosd == req->r_osd) {
903 __cancel_request(req);
904 goto kick;
905 }
906
907 err = __map_osds(osdc, req);
908 if (err == 0)
909 continue; /* no change */
910 if (err < 0) {
911 /*
912 * FIXME: really, we should set the request
913 * error and fail if this isn't a 'nofail'
914 * request, but that's a fair bit more
915 * complicated to do. So retry!
916 */
917 dout(" setting r_resend on %llu\n", req->r_tid);
918 req->r_resend = true;
919 continue;
920 }
921 if (req->r_osd == NULL) {
922 dout("tid %llu maps to no valid osd\n", req->r_tid);
923 needmap++; /* request a newer map */
924 continue;
925 }
926
927kick:
928 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
929 req->r_osd ? req->r_osd->o_osd : -1);
930 req->r_flags |= CEPH_OSD_FLAG_RETRY;
931 err = __send_request(osdc, req);
932 if (err) {
933 dout(" setting r_resend on %llu\n", req->r_tid);
934 req->r_resend = true;
935 }
936 }
937
938 return needmap;
939}
940
941/*
942 * Resubmit osd requests whose osd or osd address has changed. Request
943 * a new osd map if osds are down, or we are otherwise unable to determine
944 * how to direct a request.
945 *
946 * Close connections to down osds.
947 *
948 * If @who is specified, resubmit requests for that specific osd.
949 *
950 * Caller should hold map_sem for read and request_mutex.
951 */
952static void kick_requests(struct ceph_osd_client *osdc,
953 struct ceph_osd *kickosd)
954{
955 int needmap;
956
957 mutex_lock(&osdc->request_mutex);
958 needmap = __kick_requests(osdc, kickosd);
959 mutex_unlock(&osdc->request_mutex);
960
961 if (needmap) {
962 dout("%d requests for down osds, need new map\n", needmap);
963 ceph_monc_request_next_osdmap(&osdc->client->monc);
964 }
965
966}
967/*
968 * Process updated osd map.
969 *
970 * The message contains any number of incremental and full maps, normally
971 * indicating some sort of topology change in the cluster. Kick requests
972 * off to different OSDs as needed.
973 */
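/*
 * Front payload layout, as implied by the decoding below:
 *
 *	ceph_fsid  fsid
 *	u32        n_inc,  then n_inc  x { u32 epoch; u32 len; u8 data[len]; }
 *	u32        n_full, then n_full x { u32 epoch; u32 len; u8 data[len]; }
 */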
974void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
975{
976 void *p, *end, *next;
977 u32 nr_maps, maplen;
978 u32 epoch;
979 struct ceph_osdmap *newmap = NULL, *oldmap;
980 int err;
981 struct ceph_fsid fsid;
982
983 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
984 p = msg->front.iov_base;
985 end = p + msg->front.iov_len;
986
987 /* verify fsid */
988 ceph_decode_need(&p, end, sizeof(fsid), bad);
989 ceph_decode_copy(&p, &fsid, sizeof(fsid));
990 if (ceph_check_fsid(osdc->client, &fsid) < 0)
991 return;
992
993 down_write(&osdc->map_sem);
994
995 /* incremental maps */
996 ceph_decode_32_safe(&p, end, nr_maps, bad);
997 dout(" %d inc maps\n", nr_maps);
998 while (nr_maps > 0) {
999 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1000 epoch = ceph_decode_32(&p);
1001 maplen = ceph_decode_32(&p);
1002 ceph_decode_need(&p, end, maplen, bad);
1003 next = p + maplen;
1004 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1005 dout("applying incremental map %u len %d\n",
1006 epoch, maplen);
1007 newmap = osdmap_apply_incremental(&p, next,
1008 osdc->osdmap,
1009 osdc->client->msgr);
1010 if (IS_ERR(newmap)) {
1011 err = PTR_ERR(newmap);
1012 goto bad;
1013 }
1014 BUG_ON(!newmap);
1015 if (newmap != osdc->osdmap) {
1016 ceph_osdmap_destroy(osdc->osdmap);
1017 osdc->osdmap = newmap;
1018 }
1019 } else {
1020 dout("ignoring incremental map %u len %d\n",
1021 epoch, maplen);
1022 }
1023 p = next;
1024 nr_maps--;
1025 }
1026 if (newmap)
1027 goto done;
1028
1029 /* full maps */
1030 ceph_decode_32_safe(&p, end, nr_maps, bad);
1031 dout(" %d full maps\n", nr_maps);
1032 while (nr_maps) {
1033 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1034 epoch = ceph_decode_32(&p);
1035 maplen = ceph_decode_32(&p);
1036 ceph_decode_need(&p, end, maplen, bad);
1037 if (nr_maps > 1) {
1038 dout("skipping non-latest full map %u len %d\n",
1039 epoch, maplen);
1040 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1041 dout("skipping full map %u len %d, "
1042 "older than our %u\n", epoch, maplen,
1043 osdc->osdmap->epoch);
1044 } else {
1045 dout("taking full map %u len %d\n", epoch, maplen);
1046 newmap = osdmap_decode(&p, p+maplen);
1047 if (IS_ERR(newmap)) {
1048 err = PTR_ERR(newmap);
1049 goto bad;
1050 }
1051 BUG_ON(!newmap);
1052 oldmap = osdc->osdmap;
1053 osdc->osdmap = newmap;
1054 if (oldmap)
1055 ceph_osdmap_destroy(oldmap);
1056 }
1057 p += maplen;
1058 nr_maps--;
1059 }
1060
1061done:
1062 downgrade_write(&osdc->map_sem);
1063 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1064 if (newmap)
1065 kick_requests(osdc, NULL);
1066 up_read(&osdc->map_sem);
1067 return;
1068
1069bad:
1070 pr_err("osdc handle_map corrupt msg\n");
1071 ceph_msg_dump(msg);
1072 up_write(&osdc->map_sem);
1073 return;
1074}
1075
1076
1077/*
1078 * A read request prepares specific pages that data is to be read into.
1079 * When a message is being read off the wire, we call prepare_pages to
1080 * find those pages.
1081 * 0 = success, -1 failure.
1082 */
1083static int __prepare_pages(struct ceph_connection *con,
1084 struct ceph_msg_header *hdr,
1085 struct ceph_osd_request *req,
1086 u64 tid,
1087 struct ceph_msg *m)
1088{
1089 struct ceph_osd *osd = con->private;
1090 struct ceph_osd_client *osdc;
1091 int ret = -1;
1092 int data_len = le32_to_cpu(hdr->data_len);
1093 unsigned data_off = le16_to_cpu(hdr->data_off);
1094
1095 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1096
1097 if (!osd)
1098 return -1;
1099
1100 osdc = osd->o_osdc;
1101
1102 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1103 tid, req->r_num_pages, want);
1104 if (unlikely(req->r_num_pages < want))
1105 goto out;
1106 m->pages = req->r_pages;
1107 m->nr_pages = req->r_num_pages;
1108 ret = 0; /* success */
1109out:
1110 BUG_ON(ret < 0 || m->nr_pages < want);
1111
1112 return ret;
1113}
1114
1115/*
1116 * Register request, send initial attempt.
1117 */
1118int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1119 struct ceph_osd_request *req,
1120 bool nofail)
1121{
1122 int rc = 0;
1123
1124 req->r_request->pages = req->r_pages;
1125 req->r_request->nr_pages = req->r_num_pages;
1126
1127 register_request(osdc, req);
1128
1129 down_read(&osdc->map_sem);
1130 mutex_lock(&osdc->request_mutex);
1131 /*
1132 * a racing kick_requests() may have sent the message for us
1133 * while we dropped request_mutex above, so only send now if
 1134 * the request still hasn't been touched yet.
1135 */
1136 if (req->r_sent == 0) {
1137 rc = __send_request(osdc, req);
1138 if (rc) {
1139 if (nofail) {
1140 dout("osdc_start_request failed send, "
1141 " marking %lld\n", req->r_tid);
1142 req->r_resend = true;
1143 rc = 0;
1144 } else {
1145 __unregister_request(osdc, req);
1146 }
1147 }
1148 }
1149 mutex_unlock(&osdc->request_mutex);
1150 up_read(&osdc->map_sem);
1151 return rc;
1152}
1153
1154/*
1155 * wait for a request to complete
1156 */
1157int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1158 struct ceph_osd_request *req)
1159{
1160 int rc;
1161
1162 rc = wait_for_completion_interruptible(&req->r_completion);
1163 if (rc < 0) {
1164 mutex_lock(&osdc->request_mutex);
1165 __cancel_request(req);
1166 __unregister_request(osdc, req);
1167 mutex_unlock(&osdc->request_mutex);
1168 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1169 return rc;
1170 }
1171
1172 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1173 return req->r_result;
1174}
1175
1176/*
1177 * sync - wait for all in-flight requests to flush. avoid starvation.
1178 */
1179void ceph_osdc_sync(struct ceph_osd_client *osdc)
1180{
1181 struct ceph_osd_request *req;
1182 u64 last_tid, next_tid = 0;
1183
1184 mutex_lock(&osdc->request_mutex);
1185 last_tid = osdc->last_tid;
1186 while (1) {
1187 req = __lookup_request_ge(osdc, next_tid);
1188 if (!req)
1189 break;
1190 if (req->r_tid > last_tid)
1191 break;
1192
1193 next_tid = req->r_tid + 1;
1194 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1195 continue;
1196
1197 ceph_osdc_get_request(req);
1198 mutex_unlock(&osdc->request_mutex);
1199 dout("sync waiting on tid %llu (last is %llu)\n",
1200 req->r_tid, last_tid);
1201 wait_for_completion(&req->r_safe_completion);
1202 mutex_lock(&osdc->request_mutex);
1203 ceph_osdc_put_request(req);
1204 }
1205 mutex_unlock(&osdc->request_mutex);
1206 dout("sync done (thru tid %llu)\n", last_tid);
1207}
1208
1209/*
1210 * init, shutdown
1211 */
1212int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1213{
1214 int err;
1215
1216 dout("init\n");
1217 osdc->client = client;
1218 osdc->osdmap = NULL;
1219 init_rwsem(&osdc->map_sem);
1220 init_completion(&osdc->map_waiters);
1221 osdc->last_requested_map = 0;
1222 mutex_init(&osdc->request_mutex);
1223 osdc->last_tid = 0;
1224 osdc->osds = RB_ROOT;
1225 INIT_LIST_HEAD(&osdc->osd_lru);
1226 osdc->requests = RB_ROOT;
1227 INIT_LIST_HEAD(&osdc->req_lru);
1228 osdc->num_requests = 0;
1229 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1230 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1231
1232 schedule_delayed_work(&osdc->osds_timeout_work,
1233 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1234
1235 err = -ENOMEM;
1236 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1237 sizeof(struct ceph_osd_request));
1238 if (!osdc->req_mempool)
1239 goto out;
1240
1241 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
1242 if (err < 0)
1243 goto out_mempool;
1244 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1245 OSD_OPREPLY_FRONT_LEN, 10, true);
1246 if (err < 0)
1247 goto out_msgpool;
1248 return 0;
1249
1250out_msgpool:
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252out_mempool:
1253 mempool_destroy(osdc->req_mempool);
1254out:
1255 return err;
1256}
1257
1258void ceph_osdc_stop(struct ceph_osd_client *osdc)
1259{
1260 cancel_delayed_work_sync(&osdc->timeout_work);
1261 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1262 if (osdc->osdmap) {
1263 ceph_osdmap_destroy(osdc->osdmap);
1264 osdc->osdmap = NULL;
1265 }
1266 remove_old_osds(osdc, 1);
1267 mempool_destroy(osdc->req_mempool);
1268 ceph_msgpool_destroy(&osdc->msgpool_op);
1269 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1270}
1271
1272/*
1273 * Read some contiguous pages. If we cross a stripe boundary, shorten
1274 * *plen. Return number of bytes read, or error.
1275 */
1276int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1277 struct ceph_vino vino, struct ceph_file_layout *layout,
1278 u64 off, u64 *plen,
1279 u32 truncate_seq, u64 truncate_size,
1280 struct page **pages, int num_pages)
1281{
1282 struct ceph_osd_request *req;
1283 int rc = 0;
1284
1285 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1286 vino.snap, off, *plen);
1287 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1288 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1289 NULL, 0, truncate_seq, truncate_size, NULL,
1290 false, 1);
1291 if (IS_ERR(req))
1292 return PTR_ERR(req);
1293
1294 /* it may be a short read due to an object boundary */
1295 req->r_pages = pages;
1296 num_pages = calc_pages_for(off, *plen);
1297 req->r_num_pages = num_pages;
1298
1299 dout("readpages final extent is %llu~%llu (%d pages)\n",
1300 off, *plen, req->r_num_pages);
1301
1302 rc = ceph_osdc_start_request(osdc, req, false);
1303 if (!rc)
1304 rc = ceph_osdc_wait_request(osdc, req);
1305
1306 ceph_osdc_put_request(req);
1307 dout("readpages result %d\n", rc);
1308 return rc;
1309}
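/*
 * Illustrative caller sketch (assumes a valid osdc, a vino/layout
 * taken from an inode, and a caller-owned page vector; not code from
 * this patch):
 *
 *	u64 len = want;
 *	int rc = ceph_osdc_readpages(osdc, vino, layout, off, &len,
 *				     truncate_seq, truncate_size,
 *				     pages, nr_pages);
 *	if (rc >= 0)
 *		rc bytes were read (possibly fewer than want if the
 *		extent crossed an object boundary)
 */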
1310
1311/*
1312 * do a synchronous write on N pages
1313 */
1314int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1315 struct ceph_file_layout *layout,
1316 struct ceph_snap_context *snapc,
1317 u64 off, u64 len,
1318 u32 truncate_seq, u64 truncate_size,
1319 struct timespec *mtime,
1320 struct page **pages, int num_pages,
1321 int flags, int do_sync, bool nofail)
1322{
1323 struct ceph_osd_request *req;
1324 int rc = 0;
1325
1326 BUG_ON(vino.snap != CEPH_NOSNAP);
1327 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1328 CEPH_OSD_OP_WRITE,
1329 flags | CEPH_OSD_FLAG_ONDISK |
1330 CEPH_OSD_FLAG_WRITE,
1331 snapc, do_sync,
1332 truncate_seq, truncate_size, mtime,
1333 nofail, 1);
1334 if (IS_ERR(req))
1335 return PTR_ERR(req);
1336
1337 /* it may be a short write due to an object boundary */
1338 req->r_pages = pages;
1339 req->r_num_pages = calc_pages_for(off, len);
1340 dout("writepages %llu~%llu (%d pages)\n", off, len,
1341 req->r_num_pages);
1342
1343 rc = ceph_osdc_start_request(osdc, req, nofail);
1344 if (!rc)
1345 rc = ceph_osdc_wait_request(osdc, req);
1346
1347 ceph_osdc_put_request(req);
1348 if (rc == 0)
1349 rc = len;
1350 dout("writepages result %d\n", rc);
1351 return rc;
1352}
1353
1354/*
1355 * handle incoming message
1356 */
1357static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1358{
1359 struct ceph_osd *osd = con->private;
1360 struct ceph_osd_client *osdc;
1361 int type = le16_to_cpu(msg->hdr.type);
1362
1363 if (!osd)
1364 return;
1365 osdc = osd->o_osdc;
1366
1367 switch (type) {
1368 case CEPH_MSG_OSD_MAP:
1369 ceph_osdc_handle_map(osdc, msg);
1370 break;
1371 case CEPH_MSG_OSD_OPREPLY:
1372 handle_reply(osdc, msg, con);
1373 break;
1374
1375 default:
1376 pr_err("received unknown message type %d %s\n", type,
1377 ceph_msg_type_name(type));
1378 }
1379 ceph_msg_put(msg);
1380}
1381
1382/*
1383 * lookup and return message for incoming reply
1384 */
1385static struct ceph_msg *get_reply(struct ceph_connection *con,
1386 struct ceph_msg_header *hdr,
1387 int *skip)
1388{
1389 struct ceph_osd *osd = con->private;
1390 struct ceph_osd_client *osdc = osd->o_osdc;
1391 struct ceph_msg *m;
1392 struct ceph_osd_request *req;
1393 int front = le32_to_cpu(hdr->front_len);
1394 int data_len = le32_to_cpu(hdr->data_len);
1395 u64 tid;
1396 int err;
1397
1398 tid = le64_to_cpu(hdr->tid);
1399 mutex_lock(&osdc->request_mutex);
1400 req = __lookup_request(osdc, tid);
1401 if (!req) {
1402 *skip = 1;
1403 m = NULL;
1404 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1405 osd->o_osd);
1406 goto out;
1407 }
1408
1409 if (req->r_con_filling_msg) {
1410 dout("get_reply revoking msg %p from old con %p\n",
1411 req->r_reply, req->r_con_filling_msg);
1412 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1413 ceph_con_put(req->r_con_filling_msg);
1414 }
1415
1416 if (front > req->r_reply->front.iov_len) {
1417 pr_warning("get_reply front %d > preallocated %d\n",
1418 front, (int)req->r_reply->front.iov_len);
1419 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1420 if (IS_ERR(m))
1421 goto out;
1422 ceph_msg_put(req->r_reply);
1423 req->r_reply = m;
1424 }
1425 m = ceph_msg_get(req->r_reply);
1426
1427 *skip = 0;
1428 if (data_len > 0) {
1429 err = __prepare_pages(con, hdr, req, tid, m);
1430 if (err < 0) {
1431 *skip = 1; /* preserve the error skip */
1432 ceph_msg_put(m);
1433 m = ERR_PTR(err);
1434 }
1435 }
1436 req->r_con_filling_msg = ceph_con_get(con);
1437 dout("get_reply tid %lld %p\n", tid, m);
1438
1439out:
1440 mutex_unlock(&osdc->request_mutex);
1441 return m;
1442
1443}
1444
1445static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1446 struct ceph_msg_header *hdr,
1447 int *skip)
1448{
1449 struct ceph_osd *osd = con->private;
1450 int type = le16_to_cpu(hdr->type);
1451 int front = le32_to_cpu(hdr->front_len);
1452
1453 switch (type) {
1454 case CEPH_MSG_OSD_MAP:
1455 return ceph_msg_new(type, front, 0, 0, NULL);
1456 case CEPH_MSG_OSD_OPREPLY:
1457 return get_reply(con, hdr, skip);
1458 default:
1459 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1460 osd->o_osd);
1461 *skip = 1;
1462 return NULL;
1463 }
1464}
1465
1466/*
1467 * Wrappers to refcount containing ceph_osd struct
1468 */
1469static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1470{
1471 struct ceph_osd *osd = con->private;
1472 if (get_osd(osd))
1473 return con;
1474 return NULL;
1475}
1476
1477static void put_osd_con(struct ceph_connection *con)
1478{
1479 struct ceph_osd *osd = con->private;
1480 put_osd(osd);
1481}
1482
1483/*
1484 * authentication
1485 */
1486static int get_authorizer(struct ceph_connection *con,
1487 void **buf, int *len, int *proto,
1488 void **reply_buf, int *reply_len, int force_new)
1489{
1490 struct ceph_osd *o = con->private;
1491 struct ceph_osd_client *osdc = o->o_osdc;
1492 struct ceph_auth_client *ac = osdc->client->monc.auth;
1493 int ret = 0;
1494
1495 if (force_new && o->o_authorizer) {
1496 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1497 o->o_authorizer = NULL;
1498 }
1499 if (o->o_authorizer == NULL) {
1500 ret = ac->ops->create_authorizer(
1501 ac, CEPH_ENTITY_TYPE_OSD,
1502 &o->o_authorizer,
1503 &o->o_authorizer_buf,
1504 &o->o_authorizer_buf_len,
1505 &o->o_authorizer_reply_buf,
1506 &o->o_authorizer_reply_buf_len);
1507 if (ret)
1508 return ret;
1509 }
1510
1511 *proto = ac->protocol;
1512 *buf = o->o_authorizer_buf;
1513 *len = o->o_authorizer_buf_len;
1514 *reply_buf = o->o_authorizer_reply_buf;
1515 *reply_len = o->o_authorizer_reply_buf_len;
1516 return 0;
1517}
1518
1519
1520static int verify_authorizer_reply(struct ceph_connection *con, int len)
1521{
1522 struct ceph_osd *o = con->private;
1523 struct ceph_osd_client *osdc = o->o_osdc;
1524 struct ceph_auth_client *ac = osdc->client->monc.auth;
1525
1526 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1527}
1528
1529static int invalidate_authorizer(struct ceph_connection *con)
1530{
1531 struct ceph_osd *o = con->private;
1532 struct ceph_osd_client *osdc = o->o_osdc;
1533 struct ceph_auth_client *ac = osdc->client->monc.auth;
1534
1535 if (ac->ops->invalidate_authorizer)
1536 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1537
1538 return ceph_monc_validate_auth(&osdc->client->monc);
1539}
1540
1541static const struct ceph_connection_operations osd_con_ops = {
1542 .get = get_osd_con,
1543 .put = put_osd_con,
1544 .dispatch = dispatch,
1545 .get_authorizer = get_authorizer,
1546 .verify_authorizer_reply = verify_authorizer_reply,
1547 .invalidate_authorizer = invalidate_authorizer,
1548 .alloc_msg = alloc_msg,
1549 .fault = osd_reset,
1550};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..b0759911e7c3
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51
52 struct ceph_connection *r_con_filling_msg;
53
54 struct ceph_msg *r_request, *r_reply;
55 int r_result;
56 int r_flags; /* any additional flags for the osd */
57 u32 r_sent; /* >0 if r_request is sending/sent */
58 int r_got_reply;
59
60 struct ceph_osd_client *r_osdc;
61 struct kref r_kref;
62 bool r_mempool;
63 struct completion r_completion, r_safe_completion;
64 ceph_osdc_callback_t r_callback, r_safe_callback;
65 struct ceph_eversion r_reassert_version;
66 struct list_head r_unsafe_item;
67
68 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70
71 char r_oid[40]; /* object name */
72 int r_oid_len;
73 unsigned long r_stamp; /* send OR check time */
74 bool r_resend; /* msg send failed, needs retry */
75
76 struct ceph_file_layout r_file_layout;
77 struct ceph_snap_context *r_snapc; /* snap context for writes */
78 unsigned r_num_pages; /* size of page array (follows) */
79 struct page **r_pages; /* pages for data payload */
80 int r_pages_from_pool;
81 int r_own_pages; /* if true, i own page list */
82};
83
84struct ceph_osd_client {
85 struct ceph_client *client;
86
87 struct ceph_osdmap *osdmap; /* current map */
88 struct rw_semaphore map_sem;
89 struct completion map_waiters;
90 u64 last_requested_map;
91
92 struct mutex request_mutex;
93 struct rb_root osds; /* osds */
94 struct list_head osd_lru; /* idle osds */
95 u64 timeout_tid; /* tid of timeout triggering rq */
96 u64 last_tid; /* tid of last request */
97 struct rb_root requests; /* pending requests */
98 struct list_head req_lru; /* pending requests lru */
99 int num_requests;
100 struct delayed_work timeout_work;
101 struct delayed_work osds_timeout_work;
102#ifdef CONFIG_DEBUG_FS
103 struct dentry *debugfs_file;
104#endif
105
106 mempool_t *req_mempool;
107
108 struct ceph_msgpool msgpool_op;
109 struct ceph_msgpool msgpool_op_reply;
110};
111
112extern int ceph_osdc_init(struct ceph_osd_client *osdc,
113 struct ceph_client *client);
114extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
115
116extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
117 struct ceph_msg *msg);
118extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
119 struct ceph_msg *msg);
120
121extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
122 struct ceph_file_layout *layout,
123 struct ceph_vino vino,
124 u64 offset, u64 *len, int op, int flags,
125 struct ceph_snap_context *snapc,
126 int do_sync, u32 truncate_seq,
127 u64 truncate_size,
128 struct timespec *mtime,
129 bool use_mempool, int num_reply);
130
131static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
132{
133 kref_get(&req->r_kref);
134}
135extern void ceph_osdc_release_request(struct kref *kref);
136static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
137{
138 kref_put(&req->r_kref, ceph_osdc_release_request);
139}
140
141extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
142 struct ceph_osd_request *req,
143 bool nofail);
144extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
145 struct ceph_osd_request *req);
146extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
147
148extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
149 struct ceph_vino vino,
150 struct ceph_file_layout *layout,
151 u64 off, u64 *plen,
152 u32 truncate_seq, u64 truncate_size,
153 struct page **pages, int nr_pages);
154
155extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
156 struct ceph_vino vino,
157 struct ceph_file_layout *layout,
158 struct ceph_snap_context *sc,
159 u64 off, u64 len,
160 u32 truncate_seq, u64 truncate_size,
161 struct timespec *mtime,
162 struct page **pages, int nr_pages,
163 int flags, int do_sync, bool nofail);
164
165#endif
166
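For orientation, the synchronous request lifecycle implied by the declarations above (and used by ceph_osdc_readpages/writepages in osd_client.c) looks roughly like the sketch below. The wrapper name is hypothetical and error handling is abbreviated; the caller is assumed to have built req with ceph_osdc_new_request().

	static int example_sync_request(struct ceph_osd_client *osdc,
					struct ceph_osd_request *req)
	{
		int rc;

		rc = ceph_osdc_start_request(osdc, req, false);	/* submit */
		if (!rc)
			rc = ceph_osdc_wait_request(osdc, req);	/* block for reply */
		ceph_osdc_put_request(req);			/* drop our ref */
		return rc;
	}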
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..21c6623c4b07
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1024 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
27 int n = strlen(str);
28 snprintf(str + n, len - n, "%sup", flag ? ", " : "");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
51 * the foo_mask is the smallest value 2^n-1 that is >= foo.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
316
317
318/*
319 * osd map
320 */
321void ceph_osdmap_destroy(struct ceph_osdmap *map)
322{
323 dout("osdmap_destroy %p\n", map);
324 if (map->crush)
325 crush_destroy(map->crush);
326 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
327 struct ceph_pg_mapping *pg =
328 rb_entry(rb_first(&map->pg_temp),
329 struct ceph_pg_mapping, node);
330 rb_erase(&pg->node, &map->pg_temp);
331 kfree(pg);
332 }
333 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
334 struct ceph_pg_pool_info *pi =
335 rb_entry(rb_first(&map->pg_pools),
336 struct ceph_pg_pool_info, node);
337 rb_erase(&pi->node, &map->pg_pools);
338 kfree(pi);
339 }
340 kfree(map->osd_state);
341 kfree(map->osd_weight);
342 kfree(map->osd_addr);
343 kfree(map);
344}
345
346/*
347 * adjust max osd value. reallocate arrays.
348 */
349static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
350{
351 u8 *state;
352 struct ceph_entity_addr *addr;
353 u32 *weight;
354
355 state = kcalloc(max, sizeof(*state), GFP_NOFS);
356 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
357 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
358 if (state == NULL || addr == NULL || weight == NULL) {
359 kfree(state);
360 kfree(addr);
361 kfree(weight);
362 return -ENOMEM;
363 }
364
365 /* copy old? */
366 if (map->osd_state) {
367 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
368 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
369 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
370 kfree(map->osd_state);
371 kfree(map->osd_addr);
372 kfree(map->osd_weight);
373 }
374
375 map->osd_state = state;
376 map->osd_weight = weight;
377 map->osd_addr = addr;
378 map->max_osd = max;
379 return 0;
380}
381
382/*
383 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
384 * to a set of osds)
385 */
386static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
387{
388 u64 a = *(u64 *)&l;
389 u64 b = *(u64 *)&r;
390
391 if (a < b)
392 return -1;
393 if (a > b)
394 return 1;
395 return 0;
396}
397
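Note that pgid_cmp() reinterprets struct ceph_pg as a raw u64, which is only sound because the struct (defined in rados.h below) packs __le16 + __le16 + __le32 into exactly eight bytes. A compile-time assertion would document that assumption; a sketch, with hypothetical placement inside some function in this file:

	/* struct ceph_pg must fit in one u64 for pgid_cmp() to be valid */
	BUILD_BUG_ON(sizeof(struct ceph_pg) != sizeof(u64));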
398static int __insert_pg_mapping(struct ceph_pg_mapping *new,
399 struct rb_root *root)
400{
401 struct rb_node **p = &root->rb_node;
402 struct rb_node *parent = NULL;
403 struct ceph_pg_mapping *pg = NULL;
404 int c;
405
406 while (*p) {
407 parent = *p;
408 pg = rb_entry(parent, struct ceph_pg_mapping, node);
409 c = pgid_cmp(new->pgid, pg->pgid);
410 if (c < 0)
411 p = &(*p)->rb_left;
412 else if (c > 0)
413 p = &(*p)->rb_right;
414 else
415 return -EEXIST;
416 }
417
418 rb_link_node(&new->node, parent, p);
419 rb_insert_color(&new->node, root);
420 return 0;
421}
422
423static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
424 struct ceph_pg pgid)
425{
426 struct rb_node *n = root->rb_node;
427 struct ceph_pg_mapping *pg;
428 int c;
429
430 while (n) {
431 pg = rb_entry(n, struct ceph_pg_mapping, node);
432 c = pgid_cmp(pgid, pg->pgid);
433 if (c < 0)
434 n = n->rb_left;
435 else if (c > 0)
436 n = n->rb_right;
437 else
438 return pg;
439 }
440 return NULL;
441}
442
443/*
444 * rbtree of pg pool info
445 */
446static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
447{
448 struct rb_node **p = &root->rb_node;
449 struct rb_node *parent = NULL;
450 struct ceph_pg_pool_info *pi = NULL;
451
452 while (*p) {
453 parent = *p;
454 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
455 if (new->id < pi->id)
456 p = &(*p)->rb_left;
457 else if (new->id > pi->id)
458 p = &(*p)->rb_right;
459 else
460 return -EEXIST;
461 }
462
463 rb_link_node(&new->node, parent, p);
464 rb_insert_color(&new->node, root);
465 return 0;
466}
467
468static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
469{
470 struct ceph_pg_pool_info *pi;
471 struct rb_node *n = root->rb_node;
472
473 while (n) {
474 pi = rb_entry(n, struct ceph_pg_pool_info, node);
475 if (id < pi->id)
476 n = n->rb_left;
477 else if (id > pi->id)
478 n = n->rb_right;
479 else
480 return pi;
481 }
482 return NULL;
483}
484
485static void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
486{
487 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
488 calc_pg_masks(pi);
489 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
490 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
491}
492
493/*
494 * decode a full map.
495 */
496struct ceph_osdmap *osdmap_decode(void **p, void *end)
497{
498 struct ceph_osdmap *map;
499 u16 version;
500 u32 len, max, i;
501 u8 ev;
502 int err = -EINVAL;
503 void *start = *p;
504 struct ceph_pg_pool_info *pi;
505
506 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
507
508 map = kzalloc(sizeof(*map), GFP_NOFS);
509 if (map == NULL)
510 return ERR_PTR(-ENOMEM);
511 map->pg_temp = RB_ROOT;
512
513 ceph_decode_16_safe(p, end, version, bad);
514 if (version > CEPH_OSDMAP_VERSION) {
515 pr_warning("got unknown v %d > %d of osdmap\n", version,
516 CEPH_OSDMAP_VERSION);
517 goto bad;
518 }
519
520 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
521 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
522 map->epoch = ceph_decode_32(p);
523 ceph_decode_copy(p, &map->created, sizeof(map->created));
524 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
525
526 ceph_decode_32_safe(p, end, max, bad);
527 while (max--) {
528 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
529 pi = kmalloc(sizeof(*pi), GFP_NOFS);
530 if (!pi)
531 goto bad;
532 pi->id = ceph_decode_32(p);
533 ev = ceph_decode_8(p); /* encoding version */
534 if (ev > CEPH_PG_POOL_VERSION) {
535 pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION);
536 kfree(pi); /* not yet in the pg_pools tree */
537 goto bad;
538 }
539 __decode_pool(p, pi);
540 __insert_pg_pool(&map->pg_pools, pi);
541 }
542 ceph_decode_32_safe(p, end, map->pool_max, bad);
543
544 ceph_decode_32_safe(p, end, map->flags, bad);
545
546 max = ceph_decode_32(p);
547
548 /* (re)alloc osd arrays */
549 err = osdmap_set_max_osd(map, max);
550 if (err < 0)
551 goto bad;
552 dout("osdmap_decode max_osd = %d\n", map->max_osd);
553
554 /* osds */
555 err = -EINVAL;
556 ceph_decode_need(p, end, 3*sizeof(u32) +
557 map->max_osd*(1 + sizeof(*map->osd_weight) +
558 sizeof(*map->osd_addr)), bad);
559 *p += 4; /* skip length field (should match max) */
560 ceph_decode_copy(p, map->osd_state, map->max_osd);
561
562 *p += 4; /* skip length field (should match max) */
563 for (i = 0; i < map->max_osd; i++)
564 map->osd_weight[i] = ceph_decode_32(p);
565
566 *p += 4; /* skip length field (should match max) */
567 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
568 for (i = 0; i < map->max_osd; i++)
569 ceph_decode_addr(&map->osd_addr[i]);
570
571 /* pg_temp */
572 ceph_decode_32_safe(p, end, len, bad);
573 for (i = 0; i < len; i++) {
574 int n, j;
575 struct ceph_pg pgid;
576 struct ceph_pg_mapping *pg;
577
578 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
579 ceph_decode_copy(p, &pgid, sizeof(pgid));
580 n = ceph_decode_32(p);
581 ceph_decode_need(p, end, n * sizeof(u32), bad);
582 err = -ENOMEM;
583 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
584 if (!pg)
585 goto bad;
586 pg->pgid = pgid;
587 pg->len = n;
588 for (j = 0; j < n; j++)
589 pg->osds[j] = ceph_decode_32(p);
590
591 err = __insert_pg_mapping(pg, &map->pg_temp);
592 if (err)
593 goto bad;
594 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
595 }
596
597 /* crush */
598 ceph_decode_32_safe(p, end, len, bad);
599 dout("osdmap_decode crush len %d from off 0x%x\n", len,
600 (int)(*p - start));
601 ceph_decode_need(p, end, len, bad);
602 map->crush = crush_decode(*p, end);
603 *p += len;
604 if (IS_ERR(map->crush)) {
605 err = PTR_ERR(map->crush);
606 map->crush = NULL;
607 goto bad;
608 }
609
610 /* ignore the rest of the map */
611 *p = end;
612
613 dout("osdmap_decode done %p %p\n", *p, end);
614 return map;
615
616bad:
617 dout("osdmap_decode fail\n");
618 ceph_osdmap_destroy(map);
619 return ERR_PTR(err);
620}
621
622/*
623 * decode and apply an incremental map update.
624 */
625struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
626 struct ceph_osdmap *map,
627 struct ceph_messenger *msgr)
628{
629 struct crush_map *newcrush = NULL;
630 struct ceph_fsid fsid;
631 u32 epoch = 0;
632 struct ceph_timespec modified;
633 u32 len, pool;
634 __s32 new_pool_max, new_flags, max;
635 void *start = *p;
636 int err = -EINVAL;
637 u16 version;
638 struct rb_node *rbp;
639
640 ceph_decode_16_safe(p, end, version, bad);
641 if (version > CEPH_OSDMAP_INC_VERSION) {
642 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
643 CEPH_OSDMAP_INC_VERSION);
644 goto bad;
645 }
646
647 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
648 bad);
649 ceph_decode_copy(p, &fsid, sizeof(fsid));
650 epoch = ceph_decode_32(p);
651 BUG_ON(epoch != map->epoch+1);
652 ceph_decode_copy(p, &modified, sizeof(modified));
653 new_pool_max = ceph_decode_32(p);
654 new_flags = ceph_decode_32(p);
655
656 /* full map? */
657 ceph_decode_32_safe(p, end, len, bad);
658 if (len > 0) {
659 dout("apply_incremental full map len %d, %p to %p\n",
660 len, *p, end);
661 return osdmap_decode(p, min(*p+len, end));
662 }
663
664 /* new crush? */
665 ceph_decode_32_safe(p, end, len, bad);
666 if (len > 0) {
667 dout("apply_incremental new crush map len %d, %p to %p\n",
668 len, *p, end);
669 newcrush = crush_decode(*p, min(*p+len, end));
670 if (IS_ERR(newcrush))
671 return ERR_CAST(newcrush);
672 }
673
674 /* new flags? */
675 if (new_flags >= 0)
676 map->flags = new_flags;
677 if (new_pool_max >= 0)
678 map->pool_max = new_pool_max;
679
680 ceph_decode_need(p, end, 5*sizeof(u32), bad);
681
682 /* new max? */
683 max = ceph_decode_32(p);
684 if (max >= 0) {
685 err = osdmap_set_max_osd(map, max);
686 if (err < 0)
687 goto bad;
688 }
689
690 map->epoch++;
691 map->modified = modified;
692 if (newcrush) {
693 if (map->crush)
694 crush_destroy(map->crush);
695 map->crush = newcrush;
696 newcrush = NULL;
697 }
698
699 /* new_pool */
700 ceph_decode_32_safe(p, end, len, bad);
701 while (len--) {
702 __u8 ev;
703 struct ceph_pg_pool_info *pi;
704
705 ceph_decode_32_safe(p, end, pool, bad);
706 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
707 ev = ceph_decode_8(p); /* encoding version */
708 if (ev > CEPH_PG_POOL_VERSION) {
709 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
710 ev, CEPH_PG_POOL_VERSION);
711 goto bad;
712 }
713 pi = __lookup_pg_pool(&map->pg_pools, pool);
714 if (!pi) {
715 pi = kmalloc(sizeof(*pi), GFP_NOFS);
716 if (!pi) {
717 err = -ENOMEM;
718 goto bad;
719 }
720 pi->id = pool;
721 __insert_pg_pool(&map->pg_pools, pi);
722 }
723 __decode_pool(p, pi);
724 }
725
726 /* old_pool */
727 ceph_decode_32_safe(p, end, len, bad);
728 while (len--) {
729 struct ceph_pg_pool_info *pi;
730
731 ceph_decode_32_safe(p, end, pool, bad);
732 pi = __lookup_pg_pool(&map->pg_pools, pool);
733 if (pi) {
734 rb_erase(&pi->node, &map->pg_pools);
735 kfree(pi);
736 }
737 }
738
739 /* new_up */
740 err = -EINVAL;
741 ceph_decode_32_safe(p, end, len, bad);
742 while (len--) {
743 u32 osd;
744 struct ceph_entity_addr addr;
745 ceph_decode_32_safe(p, end, osd, bad);
746 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
747 ceph_decode_addr(&addr);
748 pr_info("osd%d up\n", osd);
749 BUG_ON(osd >= map->max_osd);
750 map->osd_state[osd] |= CEPH_OSD_UP;
751 map->osd_addr[osd] = addr;
752 }
753
754 /* new_down */
755 ceph_decode_32_safe(p, end, len, bad);
756 while (len--) {
757 u32 osd;
758 ceph_decode_32_safe(p, end, osd, bad);
759 (*p)++; /* clean flag */
760 pr_info("osd%d down\n", osd);
761 if (osd < map->max_osd)
762 map->osd_state[osd] &= ~CEPH_OSD_UP;
763 }
764
765 /* new_weight */
766 ceph_decode_32_safe(p, end, len, bad);
767 while (len--) {
768 u32 osd, off;
769 ceph_decode_need(p, end, sizeof(u32)*2, bad);
770 osd = ceph_decode_32(p);
771 off = ceph_decode_32(p);
772 pr_info("osd%d weight 0x%x %s\n", osd, off,
773 off == CEPH_OSD_IN ? "(in)" :
774 (off == CEPH_OSD_OUT ? "(out)" : ""));
775 if (osd < map->max_osd)
776 map->osd_weight[osd] = off;
777 }
778
779 /* new_pg_temp */
780 rbp = rb_first(&map->pg_temp);
781 ceph_decode_32_safe(p, end, len, bad);
782 while (len--) {
783 struct ceph_pg_mapping *pg;
784 int j;
785 struct ceph_pg pgid;
786 u32 pglen;
787 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
788 ceph_decode_copy(p, &pgid, sizeof(pgid));
789 pglen = ceph_decode_32(p);
790
791 /* remove stale existing entries up to this pgid (both sorted) */
792 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
793 node)->pgid, pgid) <= 0) {
794 struct rb_node *cur = rbp;
795 rbp = rb_next(rbp);
796 dout(" removed pg_temp %llx\n",
797 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
798 node)->pgid);
799 rb_erase(cur, &map->pg_temp);
800 }
801
802 if (pglen) {
803 /* insert */
804 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
805 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
806 if (!pg) {
807 err = -ENOMEM;
808 goto bad;
809 }
810 pg->pgid = pgid;
811 pg->len = pglen;
812 for (j = 0; j < pglen; j++)
813 pg->osds[j] = ceph_decode_32(p);
814 err = __insert_pg_mapping(pg, &map->pg_temp);
815 if (err)
816 goto bad;
817 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
818 pglen);
819 }
820 }
821 while (rbp) {
822 struct rb_node *cur = rbp;
823 rbp = rb_next(rbp);
824 dout(" removed pg_temp %llx\n",
825 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
826 node)->pgid);
827 rb_erase(cur, &map->pg_temp);
828 }
829
830 /* ignore the rest */
831 *p = end;
832 return map;
833
834bad:
835 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
836 epoch, (int)(*p - start), *p, start, end);
837 print_hex_dump(KERN_DEBUG, "osdmap: ",
838 DUMP_PREFIX_OFFSET, 16, 1,
839 start, end - start, true);
840 if (newcrush)
841 crush_destroy(newcrush);
842 return ERR_PTR(err);
843}
844
845
846
847
848/*
849 * calculate file layout from given offset, length.
850 * fill in correct oid, logical length, and object extent
851 * offset, length.
852 *
853 * for now, we write only a single su, until we can
854 * pass a stride back to the caller.
855 */
856void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
857 u64 off, u64 *plen,
858 u64 *ono,
859 u64 *oxoff, u64 *oxlen)
860{
861 u32 osize = le32_to_cpu(layout->fl_object_size);
862 u32 su = le32_to_cpu(layout->fl_stripe_unit);
863 u32 sc = le32_to_cpu(layout->fl_stripe_count);
864 u32 bl, stripeno, stripepos, objsetno;
865 u32 su_per_object;
866 u64 t, su_offset;
867
868 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
869 osize, su);
870 su_per_object = osize / su;
871 dout("osize %u / su %u = su_per_object %u\n", osize, su,
872 su_per_object);
873
874 BUG_ON((su & ~PAGE_MASK) != 0);
875 /* bl = *off / su; */
876 t = off;
877 do_div(t, su);
878 bl = t;
879 dout("off %llu / su %u = bl %u\n", off, su, bl);
880
881 stripeno = bl / sc;
882 stripepos = bl % sc;
883 objsetno = stripeno / su_per_object;
884
885 *ono = objsetno * sc + stripepos;
886 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
887
888 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
889 t = off;
890 su_offset = do_div(t, su);
891 *oxoff = su_offset + (stripeno % su_per_object) * su;
892
893 /*
894 * Calculate the length of the extent being written to the selected
895 * object. This is the minimum of the full length requested (plen) or
896 * the remainder of the current stripe being written to.
897 */
898 *oxlen = min_t(u64, *plen, su - su_offset);
899 *plen = *oxlen;
900
901 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
902}
903
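To make the striping arithmetic above concrete, here is a user-space re-derivation with hypothetical layout values (64 KB stripe unit, stripe count 2, 256 KB objects, so su_per_object = 4); the variable names mirror the function above.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long su = 65536, sc = 2, osize = 262144;
		unsigned long long su_per_object = osize / su;		/* 4 */
		unsigned long long off = 3 * su + 1000;			/* byte 197608 */
		unsigned long long bl = off / su;			/* stripe block 3 */
		unsigned long long stripeno = bl / sc;			/* 1 */
		unsigned long long stripepos = bl % sc;			/* 1 */
		unsigned long long objsetno = stripeno / su_per_object;	/* 0 */
		unsigned long long ono = objsetno * sc + stripepos;	/* object 1 */
		unsigned long long oxoff = off % su +
			(stripeno % su_per_object) * su;		/* 66536 */

		printf("ono=%llu oxoff=%llu\n", ono, oxoff);
		return 0;
	}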
904/*
905 * calculate an object layout (i.e. pgid) from an oid,
906 * file_layout, and osdmap
907 */
908int ceph_calc_object_layout(struct ceph_object_layout *ol,
909 const char *oid,
910 struct ceph_file_layout *fl,
911 struct ceph_osdmap *osdmap)
912{
913 unsigned num, num_mask;
914 struct ceph_pg pgid;
915 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
916 int poolid = le32_to_cpu(fl->fl_pg_pool);
917 struct ceph_pg_pool_info *pool;
918 unsigned ps;
919
920 BUG_ON(!osdmap);
921
922 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
923 if (!pool)
924 return -EIO;
925 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
926 if (preferred >= 0) {
927 ps += preferred;
928 num = le32_to_cpu(pool->v.lpg_num);
929 num_mask = pool->lpg_num_mask;
930 } else {
931 num = le32_to_cpu(pool->v.pg_num);
932 num_mask = pool->pg_num_mask;
933 }
934
935 pgid.ps = cpu_to_le16(ps);
936 pgid.preferred = cpu_to_le16(preferred);
937 pgid.pool = fl->fl_pg_pool;
938 if (preferred >= 0)
939 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
940 (int)preferred);
941 else
942 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
943
944 ol->ol_pgid = pgid;
945 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
946 return 0;
947}
948
949/*
950 * Calculate raw osd vector for the given pgid. Return pointer to osd
951 * array, or NULL on failure.
952 */
953static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
954 int *osds, int *num)
955{
956 struct ceph_pg_mapping *pg;
957 struct ceph_pg_pool_info *pool;
958 int ruleno;
959 unsigned poolid, ps, pps;
960 int preferred;
961
962 /* pg_temp? */
963 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
964 if (pg) {
965 *num = pg->len;
966 return pg->osds;
967 }
968
969 /* crush */
970 poolid = le32_to_cpu(pgid.pool);
971 ps = le16_to_cpu(pgid.ps);
972 preferred = (s16)le16_to_cpu(pgid.preferred);
973
974 /* don't forcefeed bad device ids to crush */
975 if (preferred >= osdmap->max_osd ||
976 preferred >= osdmap->crush->max_devices)
977 preferred = -1;
978
979 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
980 if (!pool)
981 return NULL;
982 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
983 pool->v.type, pool->v.size);
984 if (ruleno < 0) {
985 pr_err("no crush rule pool %d type %d size %d\n",
986 poolid, pool->v.type, pool->v.size);
987 return NULL;
988 }
989
990 if (preferred >= 0)
991 pps = ceph_stable_mod(ps,
992 le32_to_cpu(pool->v.lpgp_num),
993 pool->lpgp_num_mask);
994 else
995 pps = ceph_stable_mod(ps,
996 le32_to_cpu(pool->v.pgp_num),
997 pool->pgp_num_mask);
998 pps += poolid;
999 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1000 min_t(int, pool->v.size, *num),
1001 preferred, osdmap->osd_weight);
1002 return osds;
1003}
1004
1005/*
1006 * Return primary osd for given pgid, or -1 if none.
1007 */
1008int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1009{
1010 int rawosds[10], *osds;
1011 int i, num = ARRAY_SIZE(rawosds);
1012
1013 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1014 if (!osds)
1015 return -1;
1016
1017 /* primary is first up osd */
1018 for (i = 0; i < num; i++)
1019 if (ceph_osd_is_up(osdmap, osds[i]))
1020 return osds[i];
1021 
1022 /* no osd in the result set is up */
1023 return -1;
1024}
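Putting the last two helpers together, a caller that wants the primary osd for a named object would first compute the object's placement group, then ask for that pg's first up osd. A minimal sketch; the wrapper name is hypothetical:

	static int example_primary_for_object(struct ceph_osdmap *map,
					      const char *oid,
					      struct ceph_file_layout *fl)
	{
		struct ceph_object_layout ol;
		int err;

		err = ceph_calc_object_layout(&ol, oid, fl, map);
		if (err)
			return err;
		return ceph_calc_pg_primary(map, ol.ol_pgid);	/* -1 if none up */
	}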
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..1fb55afb2642
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,125 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26};
27
28struct ceph_pg_mapping {
29 struct rb_node node;
30 struct ceph_pg pgid;
31 int len;
32 int osds[];
33};
34
35struct ceph_osdmap {
36 struct ceph_fsid fsid;
37 u32 epoch;
38 u32 mkfs_epoch;
39 struct ceph_timespec created, modified;
40
41 u32 flags; /* CEPH_OSDMAP_* */
42
43 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
44 u8 *osd_state; /* CEPH_OSD_* */
45 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
46 struct ceph_entity_addr *osd_addr;
47
48 struct rb_root pg_temp;
49 struct rb_root pg_pools;
50 u32 pool_max;
51
52 /* the CRUSH map specifies the mapping of placement groups to
53 * the list of osds that store+replicate them. */
54 struct crush_map *crush;
55};
56
57/*
58 * file layout helpers
59 */
60#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
61#define ceph_file_layout_stripe_count(l) \
62 ((__s32)le32_to_cpu((l).fl_stripe_count))
63#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
64#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
65#define ceph_file_layout_object_su(l) \
66 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
67#define ceph_file_layout_pg_preferred(l) \
68 ((__s32)le32_to_cpu((l).fl_pg_preferred))
69#define ceph_file_layout_pg_pool(l) \
70 ((__s32)le32_to_cpu((l).fl_pg_pool))
71
72static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
73{
74 return le32_to_cpu(l->fl_stripe_unit) *
75 le32_to_cpu(l->fl_stripe_count);
76}
77
78/* "period" == bytes before i start on a new set of objects */
79static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
80{
81 return le32_to_cpu(l->fl_object_size) *
82 le32_to_cpu(l->fl_stripe_count);
83}
84
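For the hypothetical layout used earlier (64 KB stripe unit, stripe count 2, 256 KB objects): stripe_width = 65536 * 2 = 128 KB of file data per stripe row, and period = 262144 * 2 = 512 KB of file data before the striping pattern starts over on a fresh set of 2 objects.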
85
86static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
87{
88 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
89}
90
91static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
92{
93 return map && (map->flags & flag);
94}
95
96extern char *ceph_osdmap_state_str(char *str, int len, int state);
97
98static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
99 int osd)
100{
101 if (osd >= map->max_osd)
102 return NULL;
103 return &map->osd_addr[osd];
104}
105
106extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
107extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
108 struct ceph_osdmap *map,
109 struct ceph_messenger *msgr);
110extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
111
112/* calculate mapping of a file extent to an object */
113extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
114 u64 off, u64 *plen,
115 u64 *ono, u64 *oxoff, u64 *oxlen);
116
117/* calculate mapping of object to a placement group */
118extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
119 const char *oid,
120 struct ceph_file_layout *fl,
121 struct ceph_osdmap *osdmap);
122extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
123 struct ceph_pg pgid);
124
125#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..5f8dbf7c745a
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,55 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8int ceph_pagelist_release(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail);
12 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page,
14 lru);
15 list_del(&page->lru);
16 __free_page(page);
17 }
18 return 0;
19}
20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{
23 struct page *page = alloc_page(GFP_NOFS);
24 if (!page)
25 return -ENOMEM;
26 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail);
30 pl->mapped_tail = kmap(page);
31 return 0;
32}
33
34int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
35{
36 while (pl->room < len) {
37 size_t bit = pl->room;
38 int ret;
39
40 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
41 buf, bit);
42 pl->length += bit;
43 pl->room -= bit;
44 buf += bit;
45 len -= bit;
46 ret = ceph_pagelist_addpage(pl);
47 if (ret)
48 return ret;
49 }
50
51 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
52 pl->length += len;
53 pl->room -= len;
54 return 0;
55}
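A usage sketch of the pagelist API, assuming a kernel-context caller; the function name is hypothetical, and the encode helpers come from pagelist.h below:

	#include "pagelist.h"

	static int example_build_pagelist(void)
	{
		struct ceph_pagelist pl;
		int err;

		ceph_pagelist_init(&pl);			/* empty, no pages */
		err = ceph_pagelist_encode_32(&pl, 42);		/* appends a __le32 */
		if (!err)
			err = ceph_pagelist_encode_string(&pl, "foo", 3);
		ceph_pagelist_release(&pl);			/* unmaps tail, frees pages */
		return err;
	}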
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..26ac8b89a676
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,374 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 4
15#define CEPH_OSDMAP_VERSION 4
16
17/*
18 * fs id
19 */
20struct ceph_fsid {
21 unsigned char fsid[16];
22};
23
24static inline int ceph_fsid_compare(const struct ceph_fsid *a,
25 const struct ceph_fsid *b)
26{
27 return memcmp(a, b, sizeof(*a));
28}
29
30/*
31 * ino, object, etc.
32 */
33typedef __le64 ceph_snapid_t;
34#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
35#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
36#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
37
38struct ceph_timespec {
39 __le32 tv_sec;
40 __le32 tv_nsec;
41} __attribute__ ((packed));
42
43
44/*
45 * object layout - how objects are mapped into PGs
46 */
47#define CEPH_OBJECT_LAYOUT_HASH 1
48#define CEPH_OBJECT_LAYOUT_LINEAR 2
49#define CEPH_OBJECT_LAYOUT_HASHINO 3
50
51/*
52 * pg layout -- how PGs are mapped onto (sets of) OSDs
53 */
54#define CEPH_PG_LAYOUT_CRUSH 0
55#define CEPH_PG_LAYOUT_HASH 1
56#define CEPH_PG_LAYOUT_LINEAR 2
57#define CEPH_PG_LAYOUT_HYBRID 3
58
59
60/*
61 * placement group.
62 * we encode this into one __le64.
63 */
64struct ceph_pg {
65 __le16 preferred; /* preferred primary osd */
66 __le16 ps; /* placement seed */
67 __le32 pool; /* object pool */
68} __attribute__ ((packed));
69
70/*
71 * pg_pool is a set of pgs storing a pool of objects
72 *
73 * pg_num -- base number of pseudorandomly placed pgs
74 *
75 * pgp_num -- effective number when calculating pg placement. this
76 * is used for pg_num increases. new pgs result in data being "split"
77 * into new pgs. for this to proceed smoothly, new pgs are initially
78 * colocated with their parents; that is, pgp_num doesn't increase
79 * until the new pgs have successfully split. only _then_ are the new
80 * pgs placed independently.
81 *
82 * lpg_num -- localized pg count (per device). replicas are randomly
83 * selected.
84 *
85 * lpgp_num -- as above.
86 */
87#define CEPH_PG_TYPE_REP 1
88#define CEPH_PG_TYPE_RAID4 2
89#define CEPH_PG_POOL_VERSION 2
90struct ceph_pg_pool {
91 __u8 type; /* CEPH_PG_TYPE_* */
92 __u8 size; /* number of osds in each pg */
93 __u8 crush_ruleset; /* crush placement rule */
94 __u8 object_hash; /* hash mapping object name to ps */
95 __le32 pg_num, pgp_num; /* number of pg's */
96 __le32 lpg_num, lpgp_num; /* number of localized pg's */
97 __le32 last_change; /* most recent epoch changed */
98 __le64 snap_seq; /* seq for per-pool snapshot */
99 __le32 snap_epoch; /* epoch of last snap */
100 __le32 num_snaps;
101 __le32 num_removed_snap_intervals;
102 __le64 uid;
103} __attribute__ ((packed));
104
105/*
106 * stable_mod func is used to control number of placement groups.
107 * similar to straight-up modulo, but produces a stable mapping as b
108 * increases over time. b is the number of bins, and bmask is the
109 * containing power of 2 minus 1.
110 *
111 * b <= bmask and bmask=(2**n)-1
112 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
113 */
114static inline int ceph_stable_mod(int x, int b, int bmask)
115{
116 if ((x & bmask) < b)
117 return x & bmask;
118 else
119 return x & (bmask >> 1);
120}
121
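A quick user-space check of the mapping for b = 12 bins (bmask = 15): values whose low four bits fall below 12 map directly, and the rest fold into the lower half, so bins that already existed at a smaller b keep their contents.

	#include <assert.h>

	static int stable_mod(int x, int b, int bmask)
	{
		return ((x & bmask) < b) ? (x & bmask) : (x & (bmask >> 1));
	}

	int main(void)
	{
		assert(stable_mod(5, 12, 15) == 5);	/* 5 & 15 = 5 < 12 */
		assert(stable_mod(13, 12, 15) == 5);	/* 13 & 15 = 13 >= 12 -> 13 & 7 */
		assert(stable_mod(9, 12, 15) == 9);	/* lands in a newly added bin */
		return 0;
	}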
122/*
123 * object layout - how a given object should be stored.
124 */
125struct ceph_object_layout {
126 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
127 __le32 ol_stripe_unit; /* for per-object parity, if any */
128} __attribute__ ((packed));
129
130/*
131 * compound epoch+version, used by storage layer to serialize mutations
132 */
133struct ceph_eversion {
134 __le32 epoch;
135 __le64 version;
136} __attribute__ ((packed));
137
138/*
139 * osd map bits
140 */
141
142/* status bits */
143#define CEPH_OSD_EXISTS 1
144#define CEPH_OSD_UP 2
145
146/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
147#define CEPH_OSD_IN 0x10000
148#define CEPH_OSD_OUT 0
149
150
151/*
152 * osd map flag bits
153 */
154#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
155#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
156#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
157#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
158#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
159
160/*
161 * osd ops
162 */
163#define CEPH_OSD_OP_MODE 0xf000
164#define CEPH_OSD_OP_MODE_RD 0x1000
165#define CEPH_OSD_OP_MODE_WR 0x2000
166#define CEPH_OSD_OP_MODE_RMW 0x3000
167#define CEPH_OSD_OP_MODE_SUB 0x4000
168
169#define CEPH_OSD_OP_TYPE 0x0f00
170#define CEPH_OSD_OP_TYPE_LOCK 0x0100
171#define CEPH_OSD_OP_TYPE_DATA 0x0200
172#define CEPH_OSD_OP_TYPE_ATTR 0x0300
173#define CEPH_OSD_OP_TYPE_EXEC 0x0400
174#define CEPH_OSD_OP_TYPE_PG 0x0500
175
176enum {
177 /** data **/
178 /* read */
179 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
180 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
181
182 /* fancy read */
183 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
184
185 /* write */
186 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
187 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
188 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
189 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
190 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
191
192 /* fancy write */
193 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
194 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
195 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
196 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
197
198 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
199 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
200 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
201
202 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
203
204 /** attrs **/
205 /* read */
206 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
207 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
208
209 /* write */
210 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
213 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
214
215 /** subop **/
216 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
217 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
218 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
219 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
220 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
221
222 /** lock **/
223 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
224 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
225 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
226 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
227 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
228 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
229
230 /** exec **/
231 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
232
233 /** pg **/
234 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
235};
236
237static inline int ceph_osd_op_type_lock(int op)
238{
239 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
240}
241static inline int ceph_osd_op_type_data(int op)
242{
243 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
244}
245static inline int ceph_osd_op_type_attr(int op)
246{
247 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
248}
249static inline int ceph_osd_op_type_exec(int op)
250{
251 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
252}
253static inline int ceph_osd_op_type_pg(int op)
254{
255 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
256}
257
258static inline int ceph_osd_op_mode_subop(int op)
259{
260 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
261}
262static inline int ceph_osd_op_mode_read(int op)
263{
264 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
265}
266static inline int ceph_osd_op_mode_modify(int op)
267{
268 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
269}
270
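A compile-and-run check of how the mask helpers above pick an opcode apart, with the relevant constant values copied from the definitions earlier in this file:

	#include <assert.h>

	int main(void)
	{
		enum {
			MODE = 0xf000, MODE_WR = 0x2000,
			TYPE = 0x0f00, TYPE_DATA = 0x0200,
		};
		int op = MODE_WR | TYPE_DATA | 1;	/* CEPH_OSD_OP_WRITE */

		assert((op & MODE) == MODE_WR);		/* ceph_osd_op_mode_modify() */
		assert((op & TYPE) == TYPE_DATA);	/* ceph_osd_op_type_data() */
		return 0;
	}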
271#define CEPH_OSD_TMAP_HDR 'h'
272#define CEPH_OSD_TMAP_SET 's'
273#define CEPH_OSD_TMAP_RM 'r'
274
275extern const char *ceph_osd_op_name(int op);
276
277
278/*
279 * osd op flags
280 *
281 * An op may be READ, WRITE, or READ|WRITE.
282 */
283enum {
284 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
285 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
286 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
287 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
288 CEPH_OSD_FLAG_READ = 16, /* op may read */
289 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
290 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
291 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
292 CEPH_OSD_FLAG_BALANCE_READS = 256,
293 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
294 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
295 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
296};
297
298enum {
299 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
300};
301
302#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
303#define EBLACKLISTED ESHUTDOWN /* blacklisted */
304
305/*
306 * an individual object operation. each may be accompanied by some data
307 * payload
308 */
309struct ceph_osd_op {
310 __le16 op; /* CEPH_OSD_OP_* */
311 __le32 flags; /* CEPH_OSD_FLAG_* */
312 union {
313 struct {
314 __le64 offset, length;
315 __le64 truncate_size;
316 __le32 truncate_seq;
317 } __attribute__ ((packed)) extent;
318 struct {
319 __le32 name_len;
320 __le32 value_len;
321 } __attribute__ ((packed)) xattr;
322 struct {
323 __u8 class_len;
324 __u8 method_len;
325 __u8 argc;
326 __le32 indata_len;
327 } __attribute__ ((packed)) cls;
328 struct {
329 __le64 cookie, count;
330 } __attribute__ ((packed)) pgls;
331 };
332 __le32 payload_len;
333} __attribute__ ((packed));
334
335/*
336 * osd request message header. each request may include multiple
337 * ceph_osd_op object operations.
338 */
339struct ceph_osd_request_head {
340 __le32 client_inc; /* client incarnation */
341 struct ceph_object_layout layout; /* pgid */
342 __le32 osdmap_epoch; /* client's osdmap epoch */
343
344 __le32 flags;
345
346 struct ceph_timespec mtime; /* for mutations only */
347 struct ceph_eversion reassert_version; /* if we are replaying op */
348
349 __le32 object_len; /* length of object name */
350
351 __le64 snapid; /* snapid to read */
352 __le64 snap_seq; /* writer's snap context */
353 __le32 num_snaps;
354
355 __le16 num_ops;
356 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
357} __attribute__ ((packed));
358
359struct ceph_osd_reply_head {
360 __le32 client_inc; /* client incarnation */
361 __le32 flags;
362 struct ceph_object_layout layout;
363 __le32 osdmap_epoch;
364 struct ceph_eversion reassert_version; /* for replaying uncommitted */
365
366 __le32 result; /* result code */
367
368 __le32 object_len; /* length of object name */
369 __le32 num_ops;
370 struct ceph_osd_op ops[]; /* ops[], object */
371} __attribute__ ((packed));
372
373
374#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..e6f9bc57d472
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,907 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4#include <linux/slab.h>
5
6#include "super.h"
7#include "decode.h"
8
9/*
10 * Snapshots in ceph are driven in large part by cooperation from the
11 * client. In contrast to local file systems or file servers that
12 * implement snapshots at a single point in the system, ceph's
13 * distributed access to storage requires clients to help decide
14 * whether a write logically occurs before or after a recently created
15 * snapshot.
16 *
17 * This provides a perfect instantaneous client-wide snapshot. Between
18 * clients, however, snapshots may appear to be applied at slightly
19 * different points in time, depending on delays in delivering the
20 * snapshot notification.
21 *
22 * Snapshots are _not_ file system-wide. Instead, each snapshot
23 * applies to the subtree nested beneath some directory. This
24 * effectively divides the hierarchy into multiple "realms," where all
25 * of the files contained by each realm share the same set of
26 * snapshots. An individual realm's snap set contains snapshots
27 * explicitly created on that realm, as well as any snaps in its
28 * parent's snap set _after_ the point at which the parent became its
29 * parent (due to, say, a rename). Similarly, snaps taken while a prior
30 * parent was the parent are included.
31 *
32 * The client is spared most of this detail, fortunately... it need only
33 * maintain a hierarchy of realms reflecting the current parent/child
34 * realm relationship, and for each realm an explicit list of snaps
35 * inherited from prior parents.
36 *
37 * A snap_realm struct is maintained for realms containing every inode
38 * with an open cap in the system. (The needed snap realm information is
39 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
40 * version number is used to ensure that as realm parameters change (new
41 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
42 *
43 * The realm hierarchy drives the generation of a 'snap context' for each
44 * realm, which simply lists the resulting set of snaps for the realm. This
45 * is attached to any writes sent to OSDs.
46 */
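/*
 * An illustrative example (snap ids and paths are hypothetical):
 * suppose realm /a has snaps {9, 5}, and realm /a/b was attached to it
 * with parent_since = 8 and then had snap 12 taken directly on it.
 * The snap context for /a/b contains its own snap plus any parent
 * snaps at or after parent_since, newest first:
 *
 *	/a    -> seq  9, snaps {9, 5}
 *	/a/b  -> seq 12, snaps {12, 9}    (snap 5 predates parent_since)
 *
 * This is exactly what build_snap_context() below computes.
 */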
47/*
48 * Unfortunately error handling is a bit mixed here. If we get a snap
49 * update, but don't have enough memory to update our realm hierarchy,
50 * it's not clear what we can do about it (besides complaining to the
51 * console).
52 */
53
54
55/*
56 * increase ref count for the realm
57 *
58 * caller must hold snap_rwsem for write.
59 */
60void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
61 struct ceph_snap_realm *realm)
62{
63 dout("get_realm %p %d -> %d\n", realm,
64 atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
65 /*
66 * since we _only_ increment realm refs or empty the empty
67 * list with snap_rwsem held, adjusting the empty list here is
68 * safe. we do need to protect against concurrent empty list
69 * additions, however.
70 */
71 if (atomic_read(&realm->nref) == 0) {
72 spin_lock(&mdsc->snap_empty_lock);
73 list_del_init(&realm->empty_item);
74 spin_unlock(&mdsc->snap_empty_lock);
75 }
76
77 atomic_inc(&realm->nref);
78}
79
80static void __insert_snap_realm(struct rb_root *root,
81 struct ceph_snap_realm *new)
82{
83 struct rb_node **p = &root->rb_node;
84 struct rb_node *parent = NULL;
85 struct ceph_snap_realm *r = NULL;
86
87 while (*p) {
88 parent = *p;
89 r = rb_entry(parent, struct ceph_snap_realm, node);
90 if (new->ino < r->ino)
91 p = &(*p)->rb_left;
92 else if (new->ino > r->ino)
93 p = &(*p)->rb_right;
94 else
95 BUG();
96 }
97
98 rb_link_node(&new->node, parent, p);
99 rb_insert_color(&new->node, root);
100}
101
102/*
103 * create and get the realm rooted at @ino and bump its ref count.
104 *
105 * caller must hold snap_rwsem for write.
106 */
107static struct ceph_snap_realm *ceph_create_snap_realm(
108 struct ceph_mds_client *mdsc,
109 u64 ino)
110{
111 struct ceph_snap_realm *realm;
112
113 realm = kzalloc(sizeof(*realm), GFP_NOFS);
114 if (!realm)
115 return ERR_PTR(-ENOMEM);
116
117 atomic_set(&realm->nref, 0); /* tree does not take a ref */
118 realm->ino = ino;
119 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm);
125 dout("create_snap_realm %llx %p\n", realm->ino, realm);
126 return realm;
127}
128
129/*
130 * lookup the realm rooted at @ino.
131 *
132 * caller must hold snap_rwsem for write.
133 */
134struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
135 u64 ino)
136{
137 struct rb_node *n = mdsc->snap_realms.rb_node;
138 struct ceph_snap_realm *r;
139
140 while (n) {
141 r = rb_entry(n, struct ceph_snap_realm, node);
142 if (ino < r->ino)
143 n = n->rb_left;
144 else if (ino > r->ino)
145 n = n->rb_right;
146 else {
147 dout("lookup_snap_realm %llx %p\n", r->ino, r);
148 return r;
149 }
150 }
151 return NULL;
152}
153
154static void __put_snap_realm(struct ceph_mds_client *mdsc,
155 struct ceph_snap_realm *realm);
156
157/*
158 * called with snap_rwsem (write)
159 */
160static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
161 struct ceph_snap_realm *realm)
162{
163 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
164
165 rb_erase(&realm->node, &mdsc->snap_realms);
166
167 if (realm->parent) {
168 list_del_init(&realm->child_item);
169 __put_snap_realm(mdsc, realm->parent);
170 }
171
172 kfree(realm->prior_parent_snaps);
173 kfree(realm->snaps);
174 ceph_put_snap_context(realm->cached_context);
175 kfree(realm);
176}
177
178/*
179 * caller holds snap_rwsem (write)
180 */
181static void __put_snap_realm(struct ceph_mds_client *mdsc,
182 struct ceph_snap_realm *realm)
183{
184 dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
185 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
186 if (atomic_dec_and_test(&realm->nref))
187 __destroy_snap_realm(mdsc, realm);
188}
189
190/*
191 * caller needn't hold any locks
192 */
193void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
194 struct ceph_snap_realm *realm)
195{
196 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
197 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
198 if (!atomic_dec_and_test(&realm->nref))
199 return;
200
201 if (down_write_trylock(&mdsc->snap_rwsem)) {
202 __destroy_snap_realm(mdsc, realm);
203 up_write(&mdsc->snap_rwsem);
204 } else {
205 spin_lock(&mdsc->snap_empty_lock);
206		list_add(&realm->empty_item, &mdsc->snap_empty);
207 spin_unlock(&mdsc->snap_empty_lock);
208 }
209}
210
211/*
212 * Clean up any realms whose ref counts have dropped to zero. Note
213 * that this does not include realms who were created but not yet
214 * used.
215 *
216 * Called under snap_rwsem (write)
217 */
218static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
219{
220 struct ceph_snap_realm *realm;
221
222 spin_lock(&mdsc->snap_empty_lock);
223 while (!list_empty(&mdsc->snap_empty)) {
224 realm = list_first_entry(&mdsc->snap_empty,
225 struct ceph_snap_realm, empty_item);
226 list_del(&realm->empty_item);
227 spin_unlock(&mdsc->snap_empty_lock);
228 __destroy_snap_realm(mdsc, realm);
229 spin_lock(&mdsc->snap_empty_lock);
230 }
231 spin_unlock(&mdsc->snap_empty_lock);
232}
233
234void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
235{
236 down_write(&mdsc->snap_rwsem);
237 __cleanup_empty_realms(mdsc);
238 up_write(&mdsc->snap_rwsem);
239}
240
241/*
242 * adjust the parent realm of a given @realm. adjust child list, and parent
243 * pointers, and ref counts appropriately.
244 *
245 * return 1 if parent was changed, 0 if unchanged, <0 on error.
246 *
247 * caller must hold snap_rwsem for write.
248 */
249static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
250 struct ceph_snap_realm *realm,
251 u64 parentino)
252{
253 struct ceph_snap_realm *parent;
254
255 if (realm->parent_ino == parentino)
256 return 0;
257
258 parent = ceph_lookup_snap_realm(mdsc, parentino);
259 if (!parent) {
260 parent = ceph_create_snap_realm(mdsc, parentino);
261 if (IS_ERR(parent))
262 return PTR_ERR(parent);
263 }
264 dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
265 realm->ino, realm, realm->parent_ino, realm->parent,
266 parentino, parent);
267 if (realm->parent) {
268 list_del_init(&realm->child_item);
269 ceph_put_snap_realm(mdsc, realm->parent);
270 }
271 realm->parent_ino = parentino;
272 realm->parent = parent;
273 ceph_get_snap_realm(mdsc, parent);
274 list_add(&realm->child_item, &parent->children);
275 return 1;
276}
277
278
279static int cmpu64_rev(const void *a, const void *b)
280{
281 if (*(u64 *)a < *(u64 *)b)
282 return 1;
283 if (*(u64 *)a > *(u64 *)b)
284 return -1;
285 return 0;
286}
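/*
 * Note that cmpu64_rev sorts _descending_; e.g., sort() with this
 * comparator turns {3, 12, 7} into {12, 7, 3}.  Snap vectors are kept
 * newest-first throughout.
 */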
287
288/*
289 * build the snap context for a given realm.
290 */
291static int build_snap_context(struct ceph_snap_realm *realm)
292{
293 struct ceph_snap_realm *parent = realm->parent;
294 struct ceph_snap_context *snapc;
295 int err = 0;
296 int i;
297 int num = realm->num_prior_parent_snaps + realm->num_snaps;
298
299 /*
300 * build parent context, if it hasn't been built.
301 * conservatively estimate that all parent snaps might be
302 * included by us.
303 */
304 if (parent) {
305 if (!parent->cached_context) {
306 err = build_snap_context(parent);
307 if (err)
308 goto fail;
309 }
310 num += parent->cached_context->num_snaps;
311 }
312
313	/* do i actually need to update?  not if my context seq
314	   matches the realm seq, and my parent's does too.  (this works
315	   because rebuild_snap_realms() works _downward_ in the
316	   hierarchy after each update.) */
317 if (realm->cached_context &&
318 realm->cached_context->seq == realm->seq &&
319 (!parent ||
320 realm->cached_context->seq >= parent->cached_context->seq)) {
321 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
322 " (unchanged)\n",
323 realm->ino, realm, realm->cached_context,
324 realm->cached_context->seq,
325 realm->cached_context->num_snaps);
326 return 0;
327 }
328
329 /* alloc new snap context */
330 err = -ENOMEM;
331 if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
332 goto fail;
333 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
334 if (!snapc)
335 goto fail;
336 atomic_set(&snapc->nref, 1);
337
338 /* build (reverse sorted) snap vector */
339 num = 0;
340 snapc->seq = realm->seq;
341 if (parent) {
342		/* include any of parent's snaps occurring _after_ my
343 parent became my parent */
344 for (i = 0; i < parent->cached_context->num_snaps; i++)
345 if (parent->cached_context->snaps[i] >=
346 realm->parent_since)
347 snapc->snaps[num++] =
348 parent->cached_context->snaps[i];
349 if (parent->cached_context->seq > snapc->seq)
350 snapc->seq = parent->cached_context->seq;
351 }
352 memcpy(snapc->snaps + num, realm->snaps,
353 sizeof(u64)*realm->num_snaps);
354 num += realm->num_snaps;
355 memcpy(snapc->snaps + num, realm->prior_parent_snaps,
356 sizeof(u64)*realm->num_prior_parent_snaps);
357 num += realm->num_prior_parent_snaps;
358
359 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
360 snapc->num_snaps = num;
361 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
362 realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
363
364 if (realm->cached_context)
365 ceph_put_snap_context(realm->cached_context);
366 realm->cached_context = snapc;
367 return 0;
368
369fail:
370 /*
371 * if we fail, clear old (incorrect) cached_context... hopefully
372 * we'll have better luck building it later
373 */
374 if (realm->cached_context) {
375 ceph_put_snap_context(realm->cached_context);
376 realm->cached_context = NULL;
377 }
378 pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
379 realm, err);
380 return err;
381}
382
383/*
384 * rebuild snap context for the given realm and all of its children.
385 */
386static void rebuild_snap_realms(struct ceph_snap_realm *realm)
387{
388 struct ceph_snap_realm *child;
389
390 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
391 build_snap_context(realm);
392
393 list_for_each_entry(child, &realm->children, child_item)
394 rebuild_snap_realms(child);
395}
396
397
398/*
399 * helper to allocate and decode an array of snapids. free prior
400 * instance, if any.
401 */
402static int dup_array(u64 **dst, __le64 *src, int num)
403{
404 int i;
405
406 kfree(*dst);
407 if (num) {
408 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
409 if (!*dst)
410 return -ENOMEM;
411 for (i = 0; i < num; i++)
412 (*dst)[i] = get_unaligned_le64(src + i);
413 } else {
414 *dst = NULL;
415 }
416 return 0;
417}
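/*
 * Example use, as in ceph_update_snap_trace() below, copying a decoded
 * (possibly unaligned) little-endian snap vector into the realm:
 *
 *	err = dup_array(&realm->snaps, snaps, le32_to_cpu(ri->num_snaps));
 *	if (err < 0)
 *		goto fail;
 */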
418
419
420/*
421 * When a snapshot is applied, the size/mtime inode metadata is queued
422 * in a ceph_cap_snap (one for each snapshot) until writeback
423 * completes and the metadata can be flushed back to the MDS.
424 *
425 * However, if a (sync) write is currently in-progress when we apply
426 * the snapshot, we have to wait until the write succeeds or fails
427 * (and a final size/mtime is known). In this case the
428 * cap_snap->writing = 1, and is said to be "pending." When the write
429 * finishes, we __ceph_finish_cap_snap().
430 *
431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
432 * change).
433 */
434void ceph_queue_cap_snap(struct ceph_inode_info *ci,
435 struct ceph_snap_context *snapc)
436{
437 struct inode *inode = &ci->vfs_inode;
438 struct ceph_cap_snap *capsnap;
439 int used;
440
441 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
442 if (!capsnap) {
443 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
444 return;
445 }
446
447 spin_lock(&inode->i_lock);
448 used = __ceph_caps_used(ci);
449 if (__ceph_have_pending_cap_snap(ci)) {
450 /* there is no point in queuing multiple "pending" cap_snaps,
451 as no new writes are allowed to start when pending, so any
452 writes in progress now were started before the previous
453 cap_snap. lucky us. */
454 dout("queue_cap_snap %p snapc %p seq %llu used %d"
455 " already pending\n", inode, snapc, snapc->seq, used);
456 kfree(capsnap);
457 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
458 igrab(inode);
459
460 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item);
463 INIT_LIST_HEAD(&capsnap->flushing_item);
464
465 capsnap->follows = snapc->seq - 1;
466 capsnap->context = ceph_get_snap_context(snapc);
467 capsnap->issued = __ceph_caps_issued(ci, NULL);
468 capsnap->dirty = __ceph_caps_dirty(ci);
469
470 capsnap->mode = inode->i_mode;
471 capsnap->uid = inode->i_uid;
472 capsnap->gid = inode->i_gid;
473
474 /* fixme? */
475 capsnap->xattr_blob = NULL;
476 capsnap->xattr_len = 0;
477
478		/* dirty page count moved from _head to this cap_snap;
479		   all subsequent page dirties occur _after_ this
480		   snapshot. */
481 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
482 ci->i_wrbuffer_ref_head = 0;
483 ceph_put_snap_context(ci->i_head_snapc);
484 ci->i_head_snapc = NULL;
485 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
486
487 if (used & CEPH_CAP_FILE_WR) {
488 dout("queue_cap_snap %p cap_snap %p snapc %p"
489 " seq %llu used WR, now pending\n", inode,
490 capsnap, snapc, snapc->seq);
491 capsnap->writing = 1;
492 } else {
493 /* note mtime, size NOW. */
494 __ceph_finish_cap_snap(ci, capsnap);
495 }
496 } else {
497 dout("queue_cap_snap %p nothing dirty|writing\n", inode);
498 kfree(capsnap);
499 }
500
501 spin_unlock(&inode->i_lock);
502}
503
504/*
505 * Finalize the size, mtime for a cap_snap... that is, settle on final values
506 * to be used for the snapshot, to be flushed back to the mds.
507 *
508 * If capsnap can now be flushed, add to snap_flush list, and return 1.
509 *
510 * Caller must hold i_lock.
511 */
512int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
513 struct ceph_cap_snap *capsnap)
514{
515 struct inode *inode = &ci->vfs_inode;
516 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
517
518 BUG_ON(capsnap->writing);
519 capsnap->size = inode->i_size;
520 capsnap->mtime = inode->i_mtime;
521 capsnap->atime = inode->i_atime;
522 capsnap->ctime = inode->i_ctime;
523 capsnap->time_warp_seq = ci->i_time_warp_seq;
524 if (capsnap->dirty_pages) {
525 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
526 "still has %d dirty pages\n", inode, capsnap,
527 capsnap->context, capsnap->context->seq,
528 capsnap->size, capsnap->dirty_pages);
529 return 0;
530 }
531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
532 inode, capsnap, capsnap->context,
533 capsnap->context->seq, capsnap->size);
534
535 spin_lock(&mdsc->snap_flush_lock);
536 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
537 spin_unlock(&mdsc->snap_flush_lock);
538 return 1; /* caller may want to ceph_flush_snaps */
539}
540
541
542/*
543 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
544 * the snap realm parameters from a given realm and all of its ancestors,
545 * up to the root.
546 *
547 * Caller must hold snap_rwsem for write.
548 */
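/*
 * Sketch of the trace layout this parses, as inferred from the decode
 * logic below: one record per realm, ordered from the given realm up
 * to the root, each encoded as
 *
 *	struct ceph_mds_snap_realm ri;
 *	__le64 snaps[ri.num_snaps];
 *	__le64 prior_parent_snaps[ri.num_prior_parent_snaps];
 *
 * We loop (via "more:") until p reaches e, the end of the payload.
 */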
549int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
550 void *p, void *e, bool deletion)
551{
552 struct ceph_mds_snap_realm *ri; /* encoded */
553 __le64 *snaps; /* encoded */
554 __le64 *prior_parent_snaps; /* encoded */
555 struct ceph_snap_realm *realm;
556 int invalidate = 0;
557 int err = -ENOMEM;
558
559 dout("update_snap_trace deletion=%d\n", deletion);
560more:
561 ceph_decode_need(&p, e, sizeof(*ri), bad);
562 ri = p;
563 p += sizeof(*ri);
564 ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
565 le32_to_cpu(ri->num_prior_parent_snaps)), bad);
566 snaps = p;
567 p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
568 prior_parent_snaps = p;
569 p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
570
571 realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
572 if (!realm) {
573 realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
574 if (IS_ERR(realm)) {
575 err = PTR_ERR(realm);
576 goto fail;
577 }
578 }
579
580 if (le64_to_cpu(ri->seq) > realm->seq) {
581 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
582 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
583 /*
584 * if the realm seq has changed, queue a cap_snap for every
585 * inode with open caps. we do this _before_ we update
586 * the realm info so that we prepare for writeback under the
587 * _previous_ snap context.
588 *
589 * ...unless it's a snap deletion!
590 */
591 if (!deletion) {
592 struct ceph_inode_info *ci;
593 struct inode *lastinode = NULL;
594
595 spin_lock(&realm->inodes_with_caps_lock);
596 list_for_each_entry(ci, &realm->inodes_with_caps,
597 i_snap_realm_item) {
598 struct inode *inode = igrab(&ci->vfs_inode);
599 if (!inode)
600 continue;
601 spin_unlock(&realm->inodes_with_caps_lock);
602 if (lastinode)
603 iput(lastinode);
604 lastinode = inode;
605 ceph_queue_cap_snap(ci, realm->cached_context);
606 spin_lock(&realm->inodes_with_caps_lock);
607 }
608 spin_unlock(&realm->inodes_with_caps_lock);
609 if (lastinode)
610 iput(lastinode);
611 dout("update_snap_trace cap_snaps queued\n");
612 }
613
614 } else {
615 dout("update_snap_trace %llx %p seq %lld unchanged\n",
616 realm->ino, realm, realm->seq);
617 }
618
619 /* ensure the parent is correct */
620 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
621 if (err < 0)
622 goto fail;
623 invalidate += err;
624
625 if (le64_to_cpu(ri->seq) > realm->seq) {
626 /* update realm parameters, snap lists */
627 realm->seq = le64_to_cpu(ri->seq);
628 realm->created = le64_to_cpu(ri->created);
629 realm->parent_since = le64_to_cpu(ri->parent_since);
630
631 realm->num_snaps = le32_to_cpu(ri->num_snaps);
632 err = dup_array(&realm->snaps, snaps, realm->num_snaps);
633 if (err < 0)
634 goto fail;
635
636 realm->num_prior_parent_snaps =
637 le32_to_cpu(ri->num_prior_parent_snaps);
638 err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
639 realm->num_prior_parent_snaps);
640 if (err < 0)
641 goto fail;
642
643 invalidate = 1;
644 } else if (!realm->cached_context) {
645 invalidate = 1;
646 }
647
648 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
649 realm, invalidate, p, e);
650
651 if (p < e)
652 goto more;
653
654 /* invalidate when we reach the _end_ (root) of the trace */
655 if (invalidate)
656 rebuild_snap_realms(realm);
657
658 __cleanup_empty_realms(mdsc);
659 return 0;
660
661bad:
662 err = -EINVAL;
663fail:
664 pr_err("update_snap_trace error %d\n", err);
665 return err;
666}
667
668
669/*
670 * Send any cap_snaps that are queued for flush. Try to carry
671 * s_mutex across multiple snap flushes to avoid locking overhead.
672 *
673 * Caller holds no locks.
674 */
675static void flush_snaps(struct ceph_mds_client *mdsc)
676{
677 struct ceph_inode_info *ci;
678 struct inode *inode;
679 struct ceph_mds_session *session = NULL;
680
681 dout("flush_snaps\n");
682 spin_lock(&mdsc->snap_flush_lock);
683 while (!list_empty(&mdsc->snap_flush_list)) {
684 ci = list_first_entry(&mdsc->snap_flush_list,
685 struct ceph_inode_info, i_snap_flush_item);
686 inode = &ci->vfs_inode;
687 igrab(inode);
688 spin_unlock(&mdsc->snap_flush_lock);
689 spin_lock(&inode->i_lock);
690 __ceph_flush_snaps(ci, &session);
691 spin_unlock(&inode->i_lock);
692 iput(inode);
693 spin_lock(&mdsc->snap_flush_lock);
694 }
695 spin_unlock(&mdsc->snap_flush_lock);
696
697 if (session) {
698 mutex_unlock(&session->s_mutex);
699 ceph_put_mds_session(session);
700 }
701 dout("flush_snaps done\n");
702}
703
704
705/*
706 * Handle a snap notification from the MDS.
707 *
708 * This can take two basic forms: the simplest is just a snap creation
709 * or deletion notification on an existing realm. This should update the
710 * realm and its children.
711 *
712 * The more difficult case is realm creation, due to snap creation at a
713 * new point in the file hierarchy, or due to a rename that moves a file or
714 * directory into another realm.
715 */
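/*
 * Sketch of the message layout this handler decodes, as inferred from
 * the code below:
 *
 *	struct ceph_mds_snap_head h;
 *	__le64 split_inos[h.num_split_inos];
 *	__le64 split_realms[h.num_split_realms];
 *	<snap trace>  (handled by ceph_update_snap_trace())
 *
 * The split arrays are only meaningful for CEPH_SNAP_OP_SPLIT.
 */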
716void ceph_handle_snap(struct ceph_mds_client *mdsc,
717 struct ceph_mds_session *session,
718 struct ceph_msg *msg)
719{
720 struct super_block *sb = mdsc->client->sb;
721 int mds = session->s_mds;
722 u64 split;
723 int op;
724 int trace_len;
725 struct ceph_snap_realm *realm = NULL;
726 void *p = msg->front.iov_base;
727 void *e = p + msg->front.iov_len;
728 struct ceph_mds_snap_head *h;
729 int num_split_inos, num_split_realms;
730 __le64 *split_inos = NULL, *split_realms = NULL;
731 int i;
732 int locked_rwsem = 0;
733
734 /* decode */
735 if (msg->front.iov_len < sizeof(*h))
736 goto bad;
737 h = p;
738 op = le32_to_cpu(h->op);
739 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
740 * existing realm */
741 num_split_inos = le32_to_cpu(h->num_split_inos);
742 num_split_realms = le32_to_cpu(h->num_split_realms);
743 trace_len = le32_to_cpu(h->trace_len);
744 p += sizeof(*h);
745
746 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
747 ceph_snap_op_name(op), split, trace_len);
748
749 mutex_lock(&session->s_mutex);
750 session->s_seq++;
751 mutex_unlock(&session->s_mutex);
752
753 down_write(&mdsc->snap_rwsem);
754 locked_rwsem = 1;
755
756 if (op == CEPH_SNAP_OP_SPLIT) {
757 struct ceph_mds_snap_realm *ri;
758
759 /*
760 * A "split" breaks part of an existing realm off into
761 * a new realm. The MDS provides a list of inodes
762 * (with caps) and child realms that belong to the new
763 * child.
764 */
765 split_inos = p;
766 p += sizeof(u64) * num_split_inos;
767 split_realms = p;
768 p += sizeof(u64) * num_split_realms;
769 ceph_decode_need(&p, e, sizeof(*ri), bad);
770 /* we will peek at realm info here, but will _not_
771 * advance p, as the realm update will occur below in
772 * ceph_update_snap_trace. */
773 ri = p;
774
775 realm = ceph_lookup_snap_realm(mdsc, split);
776 if (!realm) {
777 realm = ceph_create_snap_realm(mdsc, split);
778 if (IS_ERR(realm))
779 goto out;
780 }
781 ceph_get_snap_realm(mdsc, realm);
782
783 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
784 for (i = 0; i < num_split_inos; i++) {
785 struct ceph_vino vino = {
786 .ino = le64_to_cpu(split_inos[i]),
787 .snap = CEPH_NOSNAP,
788 };
789 struct inode *inode = ceph_find_inode(sb, vino);
790 struct ceph_inode_info *ci;
791
792 if (!inode)
793 continue;
794 ci = ceph_inode(inode);
795
796 spin_lock(&inode->i_lock);
797 if (!ci->i_snap_realm)
798 goto skip_inode;
799 /*
800 * If this inode belongs to a realm that was
801 * created after our new realm, we experienced
802			 * a race (due to another split notification
803 * arriving from a different MDS). So skip
804 * this inode.
805 */
806 if (ci->i_snap_realm->created >
807 le64_to_cpu(ri->created)) {
808 dout(" leaving %p in newer realm %llx %p\n",
809 inode, ci->i_snap_realm->ino,
810 ci->i_snap_realm);
811 goto skip_inode;
812 }
813 dout(" will move %p to split realm %llx %p\n",
814 inode, realm->ino, realm);
815 /*
816 * Remove the inode from the realm's inode
817 * list, but don't add it to the new realm
818 * yet. We don't want the cap_snap to be
819 * queued (again) by ceph_update_snap_trace()
820 * below. Queue it _now_, under the old context.
821 */
822 spin_lock(&realm->inodes_with_caps_lock);
823 list_del_init(&ci->i_snap_realm_item);
824 spin_unlock(&realm->inodes_with_caps_lock);
825 spin_unlock(&inode->i_lock);
826
827 ceph_queue_cap_snap(ci,
828 ci->i_snap_realm->cached_context);
829
830 iput(inode);
831 continue;
832
833skip_inode:
834 spin_unlock(&inode->i_lock);
835 iput(inode);
836 }
837
838 /* we may have taken some of the old realm's children. */
839 for (i = 0; i < num_split_realms; i++) {
840 struct ceph_snap_realm *child =
841 ceph_lookup_snap_realm(mdsc,
842 le64_to_cpu(split_realms[i]));
843 if (!child)
844 continue;
845 adjust_snap_realm_parent(mdsc, child, realm->ino);
846 }
847 }
848
849 /*
850 * update using the provided snap trace. if we are deleting a
851 * snap, we can avoid queueing cap_snaps.
852 */
853 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY);
855
856 if (op == CEPH_SNAP_OP_SPLIT) {
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (!ci->i_snap_realm)
873 goto split_skip_inode;
874 ceph_put_snap_realm(mdsc, ci->i_snap_realm);
875 spin_lock(&realm->inodes_with_caps_lock);
876 list_add(&ci->i_snap_realm_item,
877 &realm->inodes_with_caps);
878 ci->i_snap_realm = realm;
879 spin_unlock(&realm->inodes_with_caps_lock);
880 ceph_get_snap_realm(mdsc, realm);
881split_skip_inode:
882 spin_unlock(&inode->i_lock);
883 iput(inode);
884 }
885
886 /* we took a reference when we created the realm, above */
887 ceph_put_snap_realm(mdsc, realm);
888 }
889
890 __cleanup_empty_realms(mdsc);
891
892 up_write(&mdsc->snap_rwsem);
893
894 flush_snaps(mdsc);
895 return;
896
897bad:
898 pr_err("corrupt snap message from mds%d\n", mds);
899 ceph_msg_dump(msg);
900out:
901 if (locked_rwsem)
902 up_write(&mdsc->snap_rwsem);
903 return;
904}
905
906
907
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..75d02eaa1279
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1031 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/slab.h>
15#include <linux/statfs.h>
16#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19
20#include "decode.h"
21#include "super.h"
22#include "mon_client.h"
23#include "auth.h"
24
25/*
26 * Ceph superblock operations
27 *
28 * Handle the basics of mounting, unmounting.
29 */
30
31
32/*
33 * find filename portion of a path (/foo/bar/baz -> baz)
34 */
35const char *ceph_file_part(const char *s, int len)
36{
37 const char *e = s + len;
38
39 while (e != s && *(e-1) != '/')
40 e--;
41 return e;
42}
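/*
 * For example, ceph_file_part("/foo/bar/baz", 12) points at "baz"; a
 * string containing no '/' is returned unchanged.
 */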
43
44
45/*
46 * super ops
47 */
48static void ceph_put_super(struct super_block *s)
49{
50 struct ceph_client *cl = ceph_client(s);
51
52 dout("put_super\n");
53 ceph_mdsc_close_sessions(&cl->mdsc);
54 return;
55}
56
57static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
58{
59 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
60 struct ceph_monmap *monmap = client->monc.monmap;
61 struct ceph_statfs st;
62 u64 fsid;
63 int err;
64
65 dout("statfs\n");
66 err = ceph_monc_do_statfs(&client->monc, &st);
67 if (err < 0)
68 return err;
69
70 /* fill in kstatfs */
71 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
72
73 /*
74 * express utilization in terms of large blocks to avoid
75 * overflow on 32-bit machines.
76 */
77 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
78 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
79 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
80 (CEPH_BLOCK_SHIFT-10);
81 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
82
83 buf->f_files = le64_to_cpu(st.num_objects);
84 buf->f_ffree = -1;
85 buf->f_namelen = PATH_MAX;
86 buf->f_frsize = PAGE_CACHE_SIZE;
87
88 /* leave fsid little-endian, regardless of host endianness */
89 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
90 buf->f_fsid.val[0] = fsid & 0xffffffff;
91 buf->f_fsid.val[1] = fsid >> 32;
92
93 return 0;
94}
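/*
 * Worked example of the conversion above (numbers are illustrative):
 * with CEPH_BLOCK_SHIFT = 20, f_bsize is 1 MB and st.kb is shifted
 * right by CEPH_BLOCK_SHIFT - 10 = 10, so a cluster reporting
 * st.kb = 1048576 (1 GB) yields f_blocks = 1024 one-megabyte blocks.
 * Even petabyte-scale volumes then fit comfortably in 32 bits.
 */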
95
96
97static int ceph_syncfs(struct super_block *sb, int wait)
98{
99 dout("sync_fs %d\n", wait);
100 ceph_osdc_sync(&ceph_client(sb)->osdc);
101 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
102 dout("sync_fs %d done\n", wait);
103 return 0;
104}
105
106
107/**
108 * ceph_show_options - Show mount options in /proc/mounts
109 * @m: seq_file to write to
110 * @mnt: mount descriptor
111 */
112static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
113{
114 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
115 struct ceph_mount_args *args = client->mount_args;
116
117 if (args->flags & CEPH_OPT_FSID)
118		seq_printf(m, ",fsidmajor=%llu,fsidminor=%llu",
119 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
120 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
121 if (args->flags & CEPH_OPT_NOSHARE)
122 seq_puts(m, ",noshare");
123 if (args->flags & CEPH_OPT_DIRSTAT)
124 seq_puts(m, ",dirstat");
125 if ((args->flags & CEPH_OPT_RBYTES) == 0)
126 seq_puts(m, ",norbytes");
127 if (args->flags & CEPH_OPT_NOCRC)
128 seq_puts(m, ",nocrc");
129 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
130 seq_puts(m, ",noasyncreaddir");
131 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
132 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
133 if (args->name)
134 seq_printf(m, ",name=%s", args->name);
135 if (args->secret)
136 seq_puts(m, ",secret=<hidden>");
137 return 0;
138}
139
140/*
141 * caches
142 */
143struct kmem_cache *ceph_inode_cachep;
144struct kmem_cache *ceph_cap_cachep;
145struct kmem_cache *ceph_dentry_cachep;
146struct kmem_cache *ceph_file_cachep;
147
148static void ceph_inode_init_once(void *foo)
149{
150 struct ceph_inode_info *ci = foo;
151 inode_init_once(&ci->vfs_inode);
152}
153
154static int default_congestion_kb(void)
155{
156 int congestion_kb;
157
158 /*
159 * Copied from NFS
160 *
161 * congestion size, scale with available memory.
162 *
163 * 64MB: 8192k
164 * 128MB: 11585k
165 * 256MB: 16384k
166 * 512MB: 23170k
167 * 1GB: 32768k
168 * 2GB: 46340k
169 * 4GB: 65536k
170 * 8GB: 92681k
171 * 16GB: 131072k
172 *
173 * This allows larger machines to have larger/more transfers.
174 * Limit the default to 256M
175 */
176 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
177 if (congestion_kb > 256*1024)
178 congestion_kb = 256*1024;
179
180 return congestion_kb;
181}
182
183static int __init init_caches(void)
184{
185 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
186 sizeof(struct ceph_inode_info),
187 __alignof__(struct ceph_inode_info),
188 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
189 ceph_inode_init_once);
190 if (ceph_inode_cachep == NULL)
191 return -ENOMEM;
192
193 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
194 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
195 if (ceph_cap_cachep == NULL)
196 goto bad_cap;
197
198 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
199 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
200 if (ceph_dentry_cachep == NULL)
201 goto bad_dentry;
202
203 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
204 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
205 if (ceph_file_cachep == NULL)
206 goto bad_file;
207
208 return 0;
209
210bad_file:
211 kmem_cache_destroy(ceph_dentry_cachep);
212bad_dentry:
213 kmem_cache_destroy(ceph_cap_cachep);
214bad_cap:
215 kmem_cache_destroy(ceph_inode_cachep);
216 return -ENOMEM;
217}
218
219static void destroy_caches(void)
220{
221 kmem_cache_destroy(ceph_inode_cachep);
222 kmem_cache_destroy(ceph_cap_cachep);
223 kmem_cache_destroy(ceph_dentry_cachep);
224 kmem_cache_destroy(ceph_file_cachep);
225}
226
227
228/*
229 * ceph_umount_begin - initiate forced umount.  Tear down the
230 * mount, skipping steps that may hang while waiting for server(s).
231 */
232static void ceph_umount_begin(struct super_block *sb)
233{
234 struct ceph_client *client = ceph_sb_to_client(sb);
235
236 dout("ceph_umount_begin - starting forced umount\n");
237 if (!client)
238 return;
239 client->mount_state = CEPH_MOUNT_SHUTDOWN;
240 return;
241}
242
243static const struct super_operations ceph_super_ops = {
244 .alloc_inode = ceph_alloc_inode,
245 .destroy_inode = ceph_destroy_inode,
246 .write_inode = ceph_write_inode,
247 .sync_fs = ceph_syncfs,
248 .put_super = ceph_put_super,
249 .show_options = ceph_show_options,
250 .statfs = ceph_statfs,
251 .umount_begin = ceph_umount_begin,
252};
253
254
255const char *ceph_msg_type_name(int type)
256{
257 switch (type) {
258 case CEPH_MSG_SHUTDOWN: return "shutdown";
259 case CEPH_MSG_PING: return "ping";
260 case CEPH_MSG_AUTH: return "auth";
261 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
262 case CEPH_MSG_MON_MAP: return "mon_map";
263 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
264 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
265 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
266 case CEPH_MSG_STATFS: return "statfs";
267 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
268 case CEPH_MSG_MDS_MAP: return "mds_map";
269 case CEPH_MSG_CLIENT_SESSION: return "client_session";
270 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
271 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
272 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
273 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
274 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
275 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
276 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
277 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
278 case CEPH_MSG_OSD_MAP: return "osd_map";
279 case CEPH_MSG_OSD_OP: return "osd_op";
280 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
281 default: return "unknown";
282 }
283}
284
285
286/*
287 * mount options
288 */
289enum {
290 Opt_fsidmajor,
291 Opt_fsidminor,
292 Opt_monport,
293 Opt_wsize,
294 Opt_rsize,
295 Opt_osdtimeout,
296 Opt_osdkeepalivetimeout,
297 Opt_mount_timeout,
298 Opt_osd_idle_ttl,
299 Opt_caps_wanted_delay_min,
300 Opt_caps_wanted_delay_max,
301 Opt_readdir_max_entries,
302 Opt_congestion_kb,
303 Opt_last_int,
304 /* int args above */
305 Opt_snapdirname,
306 Opt_name,
307 Opt_secret,
308 Opt_last_string,
309 /* string args above */
310 Opt_ip,
311 Opt_noshare,
312 Opt_dirstat,
313 Opt_nodirstat,
314 Opt_rbytes,
315 Opt_norbytes,
316 Opt_nocrc,
317 Opt_noasyncreaddir,
318};
319
320static match_table_t arg_tokens = {
321 {Opt_fsidmajor, "fsidmajor=%ld"},
322 {Opt_fsidminor, "fsidminor=%ld"},
323 {Opt_monport, "monport=%d"},
324 {Opt_wsize, "wsize=%d"},
325 {Opt_rsize, "rsize=%d"},
326 {Opt_osdtimeout, "osdtimeout=%d"},
327 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
328 {Opt_mount_timeout, "mount_timeout=%d"},
329 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
330 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
331 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
332 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
333 {Opt_congestion_kb, "write_congestion_kb=%d"},
334 /* int args above */
335 {Opt_snapdirname, "snapdirname=%s"},
336 {Opt_name, "name=%s"},
337 {Opt_secret, "secret=%s"},
338 /* string args above */
339 {Opt_ip, "ip=%s"},
340 {Opt_noshare, "noshare"},
341 {Opt_dirstat, "dirstat"},
342 {Opt_nodirstat, "nodirstat"},
343 {Opt_rbytes, "rbytes"},
344 {Opt_norbytes, "norbytes"},
345 {Opt_nocrc, "nocrc"},
346 {Opt_noasyncreaddir, "noasyncreaddir"},
347 {-1, NULL}
348};
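/*
 * An illustrative mount invocation exercising these tokens (addresses
 * and values are made up):
 *
 *	mount -t ceph 1.2.3.4:6789:/ /mnt/ceph \
 *		-o rsize=524288,snapdirname=.snapshot,noasyncreaddir
 *
 * The device string carries the monitor address(es) and server path;
 * the -o string is what parse_mount_args() below receives as @options.
 */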
349
350
351static struct ceph_mount_args *parse_mount_args(int flags, char *options,
352 const char *dev_name,
353 const char **path)
354{
355 struct ceph_mount_args *args;
356 const char *c;
357 int err = -ENOMEM;
358 substring_t argstr[MAX_OPT_ARGS];
359
360 args = kzalloc(sizeof(*args), GFP_KERNEL);
361 if (!args)
362 return ERR_PTR(-ENOMEM);
363 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
364 GFP_KERNEL);
365 if (!args->mon_addr)
366 goto out;
367
368 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
369
370 /* start with defaults */
371 args->sb_flags = flags;
372 args->flags = CEPH_OPT_DEFAULT;
373 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
374 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
375 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
376 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
377 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
378 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
379 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
380 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
381 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
382 args->max_readdir = 1024;
383 args->congestion_kb = default_congestion_kb();
384
385 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
386 err = -EINVAL;
387 if (!dev_name)
388 goto out;
389 *path = strstr(dev_name, ":/");
390 if (*path == NULL) {
391 pr_err("device name is missing path (no :/ in %s)\n",
392 dev_name);
393 goto out;
394 }
395
396 /* get mon ip(s) */
397 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
398 CEPH_MAX_MON, &args->num_mon);
399 if (err < 0)
400 goto out;
401
402 /* path on server */
403 *path += 2;
404 dout("server path '%s'\n", *path);
405
406 /* parse mount options */
407 while ((c = strsep(&options, ",")) != NULL) {
408 int token, intval, ret;
409 if (!*c)
410 continue;
411 err = -EINVAL;
412 token = match_token((char *)c, arg_tokens, argstr);
413 if (token < 0) {
414 pr_err("bad mount option at '%s'\n", c);
415 goto out;
416 }
417 if (token < Opt_last_int) {
418 ret = match_int(&argstr[0], &intval);
419 if (ret < 0) {
420 pr_err("bad mount option arg (not int) "
421 "at '%s'\n", c);
422 continue;
423 }
424 dout("got int token %d val %d\n", token, intval);
425 } else if (token > Opt_last_int && token < Opt_last_string) {
426 dout("got string token %d val %s\n", token,
427 argstr[0].from);
428 } else {
429 dout("got token %d\n", token);
430 }
431 switch (token) {
432 case Opt_fsidmajor:
433 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
434 break;
435 case Opt_fsidminor:
436 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
437 break;
438 case Opt_ip:
439 err = ceph_parse_ips(argstr[0].from,
440 argstr[0].to,
441 &args->my_addr,
442 1, NULL);
443 if (err < 0)
444 goto out;
445 args->flags |= CEPH_OPT_MYIP;
446 break;
447
448 case Opt_snapdirname:
449 kfree(args->snapdir_name);
450 args->snapdir_name = kstrndup(argstr[0].from,
451 argstr[0].to-argstr[0].from,
452 GFP_KERNEL);
453 break;
454 case Opt_name:
455 args->name = kstrndup(argstr[0].from,
456 argstr[0].to-argstr[0].from,
457 GFP_KERNEL);
458 break;
459 case Opt_secret:
460 args->secret = kstrndup(argstr[0].from,
461 argstr[0].to-argstr[0].from,
462 GFP_KERNEL);
463 break;
464
465 /* misc */
466 case Opt_wsize:
467 args->wsize = intval;
468 break;
469 case Opt_rsize:
470 args->rsize = intval;
471 break;
472 case Opt_osdtimeout:
473 args->osd_timeout = intval;
474 break;
475 case Opt_osdkeepalivetimeout:
476 args->osd_keepalive_timeout = intval;
477 break;
478 case Opt_mount_timeout:
479 args->mount_timeout = intval;
480 break;
481 case Opt_caps_wanted_delay_min:
482 args->caps_wanted_delay_min = intval;
483 break;
484 case Opt_caps_wanted_delay_max:
485 args->caps_wanted_delay_max = intval;
486 break;
487 case Opt_readdir_max_entries:
488 args->max_readdir = intval;
489 break;
490 case Opt_congestion_kb:
491 args->congestion_kb = intval;
492 break;
493
494 case Opt_noshare:
495 args->flags |= CEPH_OPT_NOSHARE;
496 break;
497
498 case Opt_dirstat:
499 args->flags |= CEPH_OPT_DIRSTAT;
500 break;
501 case Opt_nodirstat:
502 args->flags &= ~CEPH_OPT_DIRSTAT;
503 break;
504 case Opt_rbytes:
505 args->flags |= CEPH_OPT_RBYTES;
506 break;
507 case Opt_norbytes:
508 args->flags &= ~CEPH_OPT_RBYTES;
509 break;
510 case Opt_nocrc:
511 args->flags |= CEPH_OPT_NOCRC;
512 break;
513 case Opt_noasyncreaddir:
514 args->flags |= CEPH_OPT_NOASYNCREADDIR;
515 break;
516
517 default:
518 BUG_ON(token);
519 }
520 }
521 return args;
522
523out:
524 kfree(args->mon_addr);
525 kfree(args);
526 return ERR_PTR(err);
527}
528
529static void destroy_mount_args(struct ceph_mount_args *args)
530{
531 dout("destroy_mount_args %p\n", args);
532 kfree(args->snapdir_name);
533 args->snapdir_name = NULL;
534 kfree(args->name);
535 args->name = NULL;
536 kfree(args->secret);
537 args->secret = NULL;
538 kfree(args);
539}
540
541/*
542 * create a fresh client instance
543 */
544static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
545{
546 struct ceph_client *client;
547 int err = -ENOMEM;
548
549 client = kzalloc(sizeof(*client), GFP_KERNEL);
550 if (client == NULL)
551 return ERR_PTR(-ENOMEM);
552
553 mutex_init(&client->mount_mutex);
554
555 init_waitqueue_head(&client->auth_wq);
556
557 client->sb = NULL;
558 client->mount_state = CEPH_MOUNT_MOUNTING;
559 client->mount_args = args;
560
561 client->msgr = NULL;
562
563 client->auth_err = 0;
564 atomic_long_set(&client->writeback_count, 0);
565
566 err = bdi_init(&client->backing_dev_info);
567 if (err < 0)
568 goto fail;
569
570 err = -ENOMEM;
571 client->wb_wq = create_workqueue("ceph-writeback");
572 if (client->wb_wq == NULL)
573 goto fail_bdi;
574 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
575 if (client->pg_inv_wq == NULL)
576 goto fail_wb_wq;
577 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
578 if (client->trunc_wq == NULL)
579 goto fail_pg_inv_wq;
580
581 /* set up mempools */
582 err = -ENOMEM;
583 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
584 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
585 if (!client->wb_pagevec_pool)
586 goto fail_trunc_wq;
587
588 /* caps */
589 client->min_caps = args->max_readdir;
590 ceph_adjust_min_caps(client->min_caps);
591
592 /* subsystems */
593 err = ceph_monc_init(&client->monc, client);
594 if (err < 0)
595 goto fail_mempool;
596 err = ceph_osdc_init(&client->osdc, client);
597 if (err < 0)
598 goto fail_monc;
599 err = ceph_mdsc_init(&client->mdsc, client);
600 if (err < 0)
601 goto fail_osdc;
602 return client;
603
604fail_osdc:
605 ceph_osdc_stop(&client->osdc);
606fail_monc:
607 ceph_monc_stop(&client->monc);
608fail_mempool:
609 mempool_destroy(client->wb_pagevec_pool);
610fail_trunc_wq:
611 destroy_workqueue(client->trunc_wq);
612fail_pg_inv_wq:
613 destroy_workqueue(client->pg_inv_wq);
614fail_wb_wq:
615 destroy_workqueue(client->wb_wq);
616fail_bdi:
617 bdi_destroy(&client->backing_dev_info);
618fail:
619 kfree(client);
620 return ERR_PTR(err);
621}
622
623static void ceph_destroy_client(struct ceph_client *client)
624{
625 dout("destroy_client %p\n", client);
626
627 /* unmount */
628 ceph_mdsc_stop(&client->mdsc);
629 ceph_monc_stop(&client->monc);
630 ceph_osdc_stop(&client->osdc);
631
632 ceph_adjust_min_caps(-client->min_caps);
633
634 ceph_debugfs_client_cleanup(client);
635 destroy_workqueue(client->wb_wq);
636 destroy_workqueue(client->pg_inv_wq);
637 destroy_workqueue(client->trunc_wq);
638
639 if (client->msgr)
640 ceph_messenger_destroy(client->msgr);
641 mempool_destroy(client->wb_pagevec_pool);
642
643 destroy_mount_args(client->mount_args);
644
645 kfree(client);
646 dout("destroy_client %p done\n", client);
647}
648
649/*
650 * Initially learn our fsid, or verify an fsid matches.
651 */
652int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
653{
654 if (client->have_fsid) {
655 if (ceph_fsid_compare(&client->fsid, fsid)) {
656			pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT "\n",
657 PR_FSID(&client->fsid), PR_FSID(fsid));
658 return -1;
659 }
660 } else {
661 pr_info("client%lld fsid " FSID_FORMAT "\n",
662 client->monc.auth->global_id, PR_FSID(fsid));
663 memcpy(&client->fsid, fsid, sizeof(*fsid));
664 ceph_debugfs_client_init(client);
665 client->have_fsid = true;
666 }
667 return 0;
668}
669
670/*
671 * true if we have the mon map (and have thus joined the cluster)
672 */
673static int have_mon_map(struct ceph_client *client)
674{
675 return client->monc.monmap && client->monc.monmap->epoch;
676}
677
678/*
679 * Bootstrap mount by opening the root directory. Note the mount
680 * @started time from caller, and time out if this takes too long.
681 */
682static struct dentry *open_root_dentry(struct ceph_client *client,
683 const char *path,
684 unsigned long started)
685{
686 struct ceph_mds_client *mdsc = &client->mdsc;
687 struct ceph_mds_request *req = NULL;
688 int err;
689 struct dentry *root;
690
691 /* open dir */
692 dout("open_root_inode opening '%s'\n", path);
693 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
694 if (IS_ERR(req))
695		return ERR_CAST(req);
696 req->r_path1 = kstrdup(path, GFP_NOFS);
697 req->r_ino1.ino = CEPH_INO_ROOT;
698 req->r_ino1.snap = CEPH_NOSNAP;
699 req->r_started = started;
700 req->r_timeout = client->mount_args->mount_timeout * HZ;
701 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
702 req->r_num_caps = 2;
703 err = ceph_mdsc_do_request(mdsc, NULL, req);
704 if (err == 0) {
705 dout("open_root_inode success\n");
706 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
707 client->sb->s_root == NULL)
708 root = d_alloc_root(req->r_target_inode);
709 else
710 root = d_obtain_alias(req->r_target_inode);
711 req->r_target_inode = NULL;
712 dout("open_root_inode success, root dentry is %p\n", root);
713 } else {
714 root = ERR_PTR(err);
715 }
716 ceph_mdsc_put_request(req);
717 return root;
718}
719
720/*
721 * mount: join the ceph cluster, and open root directory.
722 */
723static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
724 const char *path)
725{
726 struct ceph_entity_addr *myaddr = NULL;
727 int err;
728 unsigned long timeout = client->mount_args->mount_timeout * HZ;
729 unsigned long started = jiffies; /* note the start time */
730 struct dentry *root;
731
732 dout("mount start\n");
733 mutex_lock(&client->mount_mutex);
734
735 /* initialize the messenger */
736 if (client->msgr == NULL) {
737 if (ceph_test_opt(client, MYIP))
738 myaddr = &client->mount_args->my_addr;
739 client->msgr = ceph_messenger_create(myaddr);
740 if (IS_ERR(client->msgr)) {
741 err = PTR_ERR(client->msgr);
742 client->msgr = NULL;
743 goto out;
744 }
745 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
746 }
747
748 /* open session, and wait for mon, mds, and osd maps */
749 err = ceph_monc_open_session(&client->monc);
750 if (err < 0)
751 goto out;
752
753 while (!have_mon_map(client)) {
754 err = -EIO;
755 if (timeout && time_after_eq(jiffies, started + timeout))
756 goto out;
757
758 /* wait */
759 dout("mount waiting for mon_map\n");
760 err = wait_event_interruptible_timeout(client->auth_wq,
761 have_mon_map(client) || (client->auth_err < 0),
762 timeout);
763 if (err == -EINTR || err == -ERESTARTSYS)
764 goto out;
765 if (client->auth_err < 0) {
766 err = client->auth_err;
767 goto out;
768 }
769 }
770
771 dout("mount opening root\n");
772 root = open_root_dentry(client, "", started);
773 if (IS_ERR(root)) {
774 err = PTR_ERR(root);
775 goto out;
776 }
777 if (client->sb->s_root)
778 dput(root);
779 else
780 client->sb->s_root = root;
781
782 if (path[0] == 0) {
783 dget(root);
784 } else {
785 dout("mount opening base mountpoint\n");
786 root = open_root_dentry(client, path, started);
787 if (IS_ERR(root)) {
788 err = PTR_ERR(root);
789 dput(client->sb->s_root);
790 client->sb->s_root = NULL;
791 goto out;
792 }
793 }
794
795 mnt->mnt_root = root;
796 mnt->mnt_sb = client->sb;
797
798 client->mount_state = CEPH_MOUNT_MOUNTED;
799 dout("mount success\n");
800 err = 0;
801
802out:
803 mutex_unlock(&client->mount_mutex);
804 return err;
805}
806
807static int ceph_set_super(struct super_block *s, void *data)
808{
809 struct ceph_client *client = data;
810 int ret;
811
812 dout("set_super %p data %p\n", s, data);
813
814 s->s_flags = client->mount_args->sb_flags;
815 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
816
817 s->s_fs_info = client;
818 client->sb = s;
819
820 s->s_op = &ceph_super_ops;
821 s->s_export_op = &ceph_export_ops;
822
823 s->s_time_gran = 1000; /* 1000 ns == 1 us */
824
825	ret = set_anon_super(s, NULL);  /* data arg is unused by set_anon_super */
826 if (ret != 0)
827 goto fail;
828
829 return ret;
830
831fail:
832 s->s_fs_info = NULL;
833 client->sb = NULL;
834 return ret;
835}
836
837/*
838 * share superblock if same fs AND options
839 */
840static int ceph_compare_super(struct super_block *sb, void *data)
841{
842 struct ceph_client *new = data;
843 struct ceph_mount_args *args = new->mount_args;
844 struct ceph_client *other = ceph_sb_to_client(sb);
845 int i;
846
847 dout("ceph_compare_super %p\n", sb);
848 if (args->flags & CEPH_OPT_FSID) {
849 if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
850 dout("fsid doesn't match\n");
851 return 0;
852 }
853 } else {
854 /* do we share (a) monitor? */
855 for (i = 0; i < new->monc.monmap->num_mon; i++)
856 if (ceph_monmap_contains(other->monc.monmap,
857 &new->monc.monmap->mon_inst[i].addr))
858 break;
859 if (i == new->monc.monmap->num_mon) {
860 dout("mon ip not part of monmap\n");
861 return 0;
862 }
863 dout("mon ip matches existing sb %p\n", sb);
864 }
865 if (args->sb_flags != other->mount_args->sb_flags) {
866 dout("flags differ\n");
867 return 0;
868 }
869 return 1;
870}
871
872/*
873 * construct our own bdi so we can control readahead, etc.
874 */
875static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
876{
877 int err;
878
879 sb->s_bdi = &client->backing_dev_info;
880
881 /* set ra_pages based on rsize mount option? */
882 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
883 client->backing_dev_info.ra_pages =
884 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
885			>> PAGE_CACHE_SHIFT;
886 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
887 return err;
888}
889
890static int ceph_get_sb(struct file_system_type *fs_type,
891 int flags, const char *dev_name, void *data,
892 struct vfsmount *mnt)
893{
894 struct super_block *sb;
895 struct ceph_client *client;
896 int err;
897 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
898 const char *path = NULL;
899 struct ceph_mount_args *args;
900
901 dout("ceph_get_sb\n");
902 args = parse_mount_args(flags, data, dev_name, &path);
903 if (IS_ERR(args)) {
904 err = PTR_ERR(args);
905 goto out_final;
906 }
907
908 /* create client (which we may/may not use) */
909 client = ceph_create_client(args);
910 if (IS_ERR(client)) {
911 err = PTR_ERR(client);
912 goto out_final;
913 }
914
915 if (client->mount_args->flags & CEPH_OPT_NOSHARE)
916 compare_super = NULL;
917 sb = sget(fs_type, compare_super, ceph_set_super, client);
918 if (IS_ERR(sb)) {
919 err = PTR_ERR(sb);
920 goto out;
921 }
922
923 if (ceph_client(sb) != client) {
924 ceph_destroy_client(client);
925 client = ceph_client(sb);
926 dout("get_sb got existing client %p\n", client);
927 } else {
928 dout("get_sb using new client %p\n", client);
929 err = ceph_register_bdi(sb, client);
930 if (err < 0)
931 goto out_splat;
932 }
933
934 err = ceph_mount(client, mnt, path);
935 if (err < 0)
936 goto out_splat;
937 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
938 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
939 return 0;
940
941out_splat:
942 ceph_mdsc_close_sessions(&client->mdsc);
943 up_write(&sb->s_umount);
944 deactivate_super(sb);
945 goto out_final;
946
947out:
948 ceph_destroy_client(client);
949out_final:
950 dout("ceph_get_sb fail %d\n", err);
951 return err;
952}
953
954static void ceph_kill_sb(struct super_block *s)
955{
956 struct ceph_client *client = ceph_sb_to_client(s);
957 dout("kill_sb %p\n", s);
958 ceph_mdsc_pre_umount(&client->mdsc);
959 kill_anon_super(s); /* will call put_super after sb is r/o */
960 if (s->s_bdi == &client->backing_dev_info)
961 bdi_unregister(&client->backing_dev_info);
962 bdi_destroy(&client->backing_dev_info);
963 ceph_destroy_client(client);
964}
965
966static struct file_system_type ceph_fs_type = {
967 .owner = THIS_MODULE,
968 .name = "ceph",
969 .get_sb = ceph_get_sb,
970 .kill_sb = ceph_kill_sb,
971 .fs_flags = FS_RENAME_DOES_D_MOVE,
972};
973
974#define _STRINGIFY(x) #x
975#define STRINGIFY(x) _STRINGIFY(x)
976
977static int __init init_ceph(void)
978{
979 int ret = 0;
980
981 ret = ceph_debugfs_init();
982 if (ret < 0)
983 goto out;
984
985 ret = ceph_msgr_init();
986 if (ret < 0)
987 goto out_debugfs;
988
989 ret = init_caches();
990 if (ret)
991 goto out_msgr;
992
993 ceph_caps_init();
994
995 ret = register_filesystem(&ceph_fs_type);
996 if (ret)
997 goto out_icache;
998
999 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
1000 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
1001 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
1002 return 0;
1003
1004out_icache:
1005 destroy_caches();
1006out_msgr:
1007 ceph_msgr_exit();
1008out_debugfs:
1009 ceph_debugfs_cleanup();
1010out:
1011 return ret;
1012}
1013
1014static void __exit exit_ceph(void)
1015{
1016 dout("exit_ceph\n");
1017 unregister_filesystem(&ceph_fs_type);
1018 ceph_caps_finalize();
1019 destroy_caches();
1020 ceph_msgr_exit();
1021 ceph_debugfs_cleanup();
1022}
1023
1024module_init(init_ceph);
1025module_exit(exit_ceph);
1026
1027MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1028MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1029MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1030MODULE_DESCRIPTION("Ceph filesystem for Linux");
1031MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..ca702c67bc66
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,902 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15#include <linux/slab.h>
16
17#include "types.h"
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24
25/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400
27
28/* large granularity for statfs utilization stats to facilitate
29 * large volume sizes on 32-bit machines. */
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32
33/*
34 * mount options
35 */
36#define CEPH_OPT_FSID (1<<0)
37#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
38#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
39#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
40#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
41#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
42#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
43
44#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
45
46#define ceph_set_opt(client, opt) \
47 (client)->mount_args->flags |= CEPH_OPT_##opt;
48#define ceph_test_opt(client, opt) \
49 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
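/*
 * Example (as used in ceph_mount() in super.c):
 *
 *	client->msgr->nocrc = ceph_test_opt(client, NOCRC);
 */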
50
51
52struct ceph_mount_args {
53 int sb_flags;
54 int num_mon;
55 struct ceph_entity_addr *mon_addr;
56 int flags;
57 int mount_timeout;
58 int osd_idle_ttl;
59 int caps_wanted_delay_min, caps_wanted_delay_max;
60 struct ceph_fsid fsid;
61 struct ceph_entity_addr my_addr;
62 int wsize;
63 int rsize; /* max readahead */
64 int max_readdir; /* max readdir size */
65	int congestion_kb;      /* writeback congestion threshold (kb) */
66 int osd_timeout;
67 int osd_keepalive_timeout;
68 char *snapdir_name; /* default ".snap" */
69 char *name;
70 char *secret;
71 int cap_release_safety;
72};
73
74/*
75 * defaults
76 */
77#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
78#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
79#define CEPH_OSD_KEEPALIVE_DEFAULT 5
80#define CEPH_OSD_IDLE_TTL_DEFAULT 60
81#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
82
83#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
84#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
85
86#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
87#define CEPH_AUTH_NAME_DEFAULT "guest"
88
89/*
90 * Delay telling the MDS we no longer want caps, in case we reopen
91 * the file. Delay a minimum amount of time, even if we send a cap
92 * message for some other reason.  Otherwise, take the opportunity to
93 * update the mds to avoid sending another message later.
94 */
95#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
96#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
97
98
99/* mount state */
100enum {
101 CEPH_MOUNT_MOUNTING,
102 CEPH_MOUNT_MOUNTED,
103 CEPH_MOUNT_UNMOUNTING,
104 CEPH_MOUNT_UNMOUNTED,
105 CEPH_MOUNT_SHUTDOWN,
106};
107
108/*
109 * subtract jiffies
110 */
111static inline unsigned long time_sub(unsigned long a, unsigned long b)
112{
113 BUG_ON(time_after(b, a));
114 return (long)a - (long)b;
115}
116
117/*
118 * per-filesystem client state
119 *
120 * possibly shared by multiple mount points, if they are
121 * mounting the same ceph filesystem/cluster.
122 */
123struct ceph_client {
124 struct ceph_fsid fsid;
125 bool have_fsid;
126
127 struct mutex mount_mutex; /* serialize mount attempts */
128 struct ceph_mount_args *mount_args;
129
130 struct super_block *sb;
131
132 unsigned long mount_state;
133 wait_queue_head_t auth_wq;
134
135 int auth_err;
136
137	int min_caps;                  /* min caps I added */
138
139 struct ceph_messenger *msgr; /* messenger instance */
140 struct ceph_mon_client monc;
141 struct ceph_mds_client mdsc;
142 struct ceph_osd_client osdc;
143
144 /* writeback */
145 mempool_t *wb_pagevec_pool;
146 struct workqueue_struct *wb_wq;
147 struct workqueue_struct *pg_inv_wq;
148 struct workqueue_struct *trunc_wq;
149 atomic_long_t writeback_count;
150
151 struct backing_dev_info backing_dev_info;
152
153#ifdef CONFIG_DEBUG_FS
154 struct dentry *debugfs_monmap;
155 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
156 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
157 struct dentry *debugfs_congestion_kb;
158 struct dentry *debugfs_bdi;
159#endif
160};
161
162static inline struct ceph_client *ceph_client(struct super_block *sb)
163{
164 return sb->s_fs_info;
165}
166
167
168/*
169 * File i/o capability. This tracks shared state with the metadata
170 * server that allows us to cache or writeback attributes or to read
171 * and write data. For any given inode, we should have one or more
172 * capabilities, one issued by each metadata server, and our
173 * cumulative access is the OR of all issued capabilities.
174 *
175 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
176 * session capability lists.
177 */
178struct ceph_cap {
179 struct ceph_inode_info *ci;
180 struct rb_node ci_node; /* per-ci cap tree */
181 struct ceph_mds_session *session;
182 struct list_head session_caps; /* per-session caplist */
183 int mds;
184 u64 cap_id; /* unique cap id (mds provided) */
185 int issued; /* latest, from the mds */
186 int implemented; /* implemented superset of issued (for revocation) */
187 int mds_wanted;
188 u32 seq, issue_seq, mseq;
189 u32 cap_gen; /* active/stale cycle */
190 unsigned long last_used;
191 struct list_head caps_item;
192};
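To make the OR rule above concrete, a minimal sketch of an issued-caps walk (locking is elided, and struct ceph_inode_info is only defined further down; the authoritative version is __ceph_caps_issued(), declared later in this header):

static int example_caps_issued(struct ceph_inode_info *ci)
{
	struct rb_node *p;
	int have = 0;

	/* cumulative access = OR of what every mds has issued us */
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p))
		have |= rb_entry(p, struct ceph_cap, ci_node)->issued;
	return have;
}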
193
194#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
195#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
196#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
197
198/*
199 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
200 * we first complete any in-process sync writes and writeback any dirty
201 * data before flushing the snapped state (tracked here) back to the MDS.
202 */
203struct ceph_cap_snap {
204 atomic_t nref;
205 struct ceph_inode_info *ci;
206 struct list_head ci_item, flushing_item;
207
208 u64 follows, flush_tid;
209 int issued, dirty;
210 struct ceph_snap_context *context;
211
212 mode_t mode;
213 uid_t uid;
214 gid_t gid;
215
216 void *xattr_blob;
217 int xattr_len;
218 u64 xattr_version;
219
220 u64 size;
221 struct timespec mtime, atime, ctime;
222 u64 time_warp_seq;
223 int writing; /* a sync write is still in progress */
224 int dirty_pages; /* dirty pages awaiting writeback */
225};
226
227static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
228{
229 if (atomic_dec_and_test(&capsnap->nref))
230 kfree(capsnap);
231}
232
233/*
234 * The frag tree describes how a directory is fragmented, potentially across
235 * multiple metadata servers. It is also used to indicate points where
236 * metadata authority is delegated, and whether/where metadata is replicated.
237 *
238 * A _leaf_ frag will be present in the i_fragtree IFF there is
239 * delegation info. That is, if mds >= 0 || ndist > 0.
240 */
241#define CEPH_MAX_DIRFRAG_REP 4
242
243struct ceph_inode_frag {
244 struct rb_node node;
245
246 /* fragtree state */
247 u32 frag;
248 int split_by; /* i.e. 2^(split_by) children */
249
250 /* delegation and replication info */
251 int mds; /* -1 if same authority as parent */
252 int ndist; /* >0 if replicated */
253 int dist[CEPH_MAX_DIRFRAG_REP];
254};
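For illustration, a simplified fragtree descent (locking elided; __ceph_find_frag() is declared further down, and ceph_frag_make(), ceph_frag_make_child() and ceph_frag_contains_value() are assumed to come from ceph_frag.h). The authoritative walk is ceph_choose_frag() in inode.c:

static u32 example_choose_frag(struct ceph_inode_info *ci, u32 v)
{
	u32 t = ceph_frag_make(0, 0);	/* root frag covers all values */
	struct ceph_inode_frag *frag;
	unsigned nway, i;

	while ((frag = __ceph_find_frag(ci, t)) && frag->split_by > 0) {
		nway = 1 << frag->split_by;
		/* children partition the parent, so exactly one matches */
		for (i = 0; i < nway; i++) {
			u32 n = ceph_frag_make_child(t, frag->split_by, i);
			if (ceph_frag_contains_value(n, v)) {
				t = n;
				break;
			}
		}
	}
	return t;	/* deepest frag known to contain v */
}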
255
256/*
257 * We cache inode xattrs as an encoded blob until they are first used,
258 * at which point we parse them into an rbtree.
259 */
260struct ceph_inode_xattr {
261 struct rb_node node;
262
263 const char *name;
264 int name_len;
265 const char *val;
266 int val_len;
267 int dirty;
268
269 int should_free_name;
270 int should_free_val;
271};
272
273struct ceph_inode_xattrs_info {
274 /*
275 * (still encoded) xattr blob. we avoid the overhead of parsing
276 * this until someone actually calls getxattr, etc.
277 *
278 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
279 * NULL means we don't know.
280 */
281 struct ceph_buffer *blob, *prealloc_blob;
282
283 struct rb_root index;
284 bool dirty;
285 int count;
286 int names_size;
287 int vals_size;
288 u64 version, index_version;
289};
290
291/*
292 * Ceph inode.
293 */
294#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
295#define CEPH_I_NODELAY 4 /* do not delay cap release */
296#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
297#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
298
299struct ceph_inode_info {
300 struct ceph_vino i_vino; /* ceph ino + snap */
301
302 u64 i_version;
303 u32 i_time_warp_seq;
304
305 unsigned i_ceph_flags;
306 unsigned long i_release_count;
307
308 struct ceph_file_layout i_layout;
309 char *i_symlink;
310
311 /* for dirs */
312 struct timespec i_rctime;
313 u64 i_rbytes, i_rfiles, i_rsubdirs;
314 u64 i_files, i_subdirs;
315 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
316
317 struct rb_root i_fragtree;
318 struct mutex i_fragtree_mutex;
319
320 struct ceph_inode_xattrs_info i_xattrs;
321
322 /* capabilities. protected _both_ by i_lock and cap->session's
323 * s_mutex. */
324 struct rb_root i_caps; /* cap list */
325 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
326 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
327 struct list_head i_dirty_item, i_flushing_item;
328 u64 i_cap_flush_seq;
329 /* we need to track cap writeback on a per-cap-bit basis, to allow
330 * overlapping, pipelined cap flushes to the mds. we can probably
331 * reduce the tid to 8 bits if we're concerned about inode size. */
332 u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
333 wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
334 unsigned long i_hold_caps_min; /* jiffies */
335 unsigned long i_hold_caps_max; /* jiffies */
336 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
337 int i_cap_exporting_mds; /* to handle cap migration between */
338 unsigned i_cap_exporting_mseq; /* mds's. */
339 unsigned i_cap_exporting_issued;
340 struct ceph_cap_reservation i_cap_migration_resv;
341 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
342 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
343 unsigned i_snap_caps; /* cap bits for snapped files */
344
345 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
346
347 u32 i_truncate_seq; /* last truncate to smaller size */
348 u64 i_truncate_size; /* and the size we last truncated down to */
349 int i_truncate_pending; /* still need to call vmtruncate */
350
351 u64 i_max_size; /* max file size authorized by mds */
352 u64 i_reported_size; /* (max_)size reported to or requested of mds */
353	u64 i_wanted_max_size;     /* offset we'd like to write to */
354 u64 i_requested_max_size; /* max_size we've requested */
355
356 /* held references to caps */
357 int i_pin_ref;
358 int i_rd_ref, i_rdcache_ref, i_wr_ref;
359 int i_wrbuffer_ref, i_wrbuffer_ref_head;
360 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
361 u32 i_rdcache_gen; /* we increment this each time we get
362 FILE_CACHE. If it's non-zero, we
363 _may_ have cached pages. */
364 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
365
366 struct list_head i_unsafe_writes; /* uncommitted sync writes */
367 struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
368 spinlock_t i_unsafe_lock;
369
370 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
371 int i_snap_realm_counter; /* snap realm (if caps) */
372 struct list_head i_snap_realm_item;
373 struct list_head i_snap_flush_item;
374
375 struct work_struct i_wb_work; /* writeback work */
376 struct work_struct i_pg_inv_work; /* page invalidation work */
377
378 struct work_struct i_vmtruncate_work;
379
380 struct inode vfs_inode; /* at end */
381};
382
383static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
384{
385 return container_of(inode, struct ceph_inode_info, vfs_inode);
386}
387
388static inline void ceph_i_clear(struct inode *inode, unsigned mask)
389{
390 struct ceph_inode_info *ci = ceph_inode(inode);
391
392 spin_lock(&inode->i_lock);
393 ci->i_ceph_flags &= ~mask;
394 spin_unlock(&inode->i_lock);
395}
396
397static inline void ceph_i_set(struct inode *inode, unsigned mask)
398{
399 struct ceph_inode_info *ci = ceph_inode(inode);
400
401 spin_lock(&inode->i_lock);
402 ci->i_ceph_flags |= mask;
403 spin_unlock(&inode->i_lock);
404}
405
406static inline bool ceph_i_test(struct inode *inode, unsigned mask)
407{
408 struct ceph_inode_info *ci = ceph_inode(inode);
409 bool r;
410
411 smp_mb();
412 r = (ci->i_ceph_flags & mask) == mask;
413 return r;
414}
415
416
417/* find a specific frag @f */
418extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
419 u32 f);
420
421/*
422 * choose fragment for value @v. copy frag content to pfrag, if leaf
423 * exists
424 */
425extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
426 struct ceph_inode_frag *pfrag,
427 int *found);
428
429/*
430 * Ceph dentry state
431 */
432struct ceph_dentry_info {
433 struct ceph_mds_session *lease_session;
434 u32 lease_gen, lease_shared_gen;
435 u32 lease_seq;
436 unsigned long lease_renew_after, lease_renew_from;
437 struct list_head lru;
438 struct dentry *dentry;
439 u64 time;
440 u64 offset;
441};
442
443static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
444{
445 return (struct ceph_dentry_info *)dentry->d_fsdata;
446}
447
448static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
449{
450 return ((loff_t)frag << 32) | (loff_t)off;
451}
452
453/*
454 * ino_t is <64 bits on many architectures, blech.
455 *
456 * don't include snap in ino hash, at least for now.
457 */
458static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
459{
460 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
461#if BITS_PER_LONG == 32
462 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
463 if (!ino)
464 ino = 1;
465#endif
466 return ino;
467}
468
469static inline int ceph_set_ino_cb(struct inode *inode, void *data)
470{
471 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
472 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
473 return 0;
474}
475
476static inline struct ceph_vino ceph_vino(struct inode *inode)
477{
478 return ceph_inode(inode)->i_vino;
479}
480
481/* for printf-style formatting */
482#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
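Illustrative use, matching the %llx.%llx convention the dout() messages in this series rely on:

	dout("inode %p is %llx.%llx\n", inode, ceph_vinop(inode));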
483
484static inline u64 ceph_ino(struct inode *inode)
485{
486 return ceph_inode(inode)->i_vino.ino;
487}
488static inline u64 ceph_snap(struct inode *inode)
489{
490 return ceph_inode(inode)->i_vino.snap;
491}
492
493static inline int ceph_ino_compare(struct inode *inode, void *data)
494{
495 struct ceph_vino *pvino = (struct ceph_vino *)data;
496 struct ceph_inode_info *ci = ceph_inode(inode);
497 return ci->i_vino.ino == pvino->ino &&
498 ci->i_vino.snap == pvino->snap;
499}
500
501static inline struct inode *ceph_find_inode(struct super_block *sb,
502 struct ceph_vino vino)
503{
504 ino_t t = ceph_vino_to_ino(vino);
505 return ilookup5(sb, t, ceph_ino_compare, &vino);
506}
507
508
509/*
510 * caps helpers
511 */
512static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
513{
514 return !RB_EMPTY_ROOT(&ci->i_caps);
515}
516
517extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
518extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
519extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
520 struct ceph_cap *cap);
521
522static inline int ceph_caps_issued(struct ceph_inode_info *ci)
523{
524 int issued;
525 spin_lock(&ci->vfs_inode.i_lock);
526 issued = __ceph_caps_issued(ci, NULL);
527 spin_unlock(&ci->vfs_inode.i_lock);
528 return issued;
529}
530
531static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
532 int touch)
533{
534 int r;
535 spin_lock(&ci->vfs_inode.i_lock);
536 r = __ceph_caps_issued_mask(ci, mask, touch);
537 spin_unlock(&ci->vfs_inode.i_lock);
538 return r;
539}
540
541static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
542{
543 return ci->i_dirty_caps | ci->i_flushing_caps;
544}
545extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
546
547extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
548extern int __ceph_caps_used(struct ceph_inode_info *ci);
549
550extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
551
552/*
553 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
554 */
555static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
556{
557 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
558 if (w & CEPH_CAP_FILE_BUFFER)
559 w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
560 return w;
561}
562
563/* what the mds thinks we want */
564extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
565
566extern void ceph_caps_init(void);
567extern void ceph_caps_finalize(void);
568extern void ceph_adjust_min_caps(int delta);
569extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
570extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
571extern void ceph_reservation_status(struct ceph_client *client,
572 int *total, int *avail, int *used,
573 int *reserved, int *min);
574
575static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
576{
577 return (struct ceph_client *)inode->i_sb->s_fs_info;
578}
579
580static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
581{
582 return (struct ceph_client *)sb->s_fs_info;
583}
584
585
586/*
587 * we keep buffered readdir results attached to file->private_data
588 */
589struct ceph_file_info {
590 int fmode; /* initialized on open */
591
592 /* readdir: position within the dir */
593 u32 frag;
594 struct ceph_mds_request *last_readdir;
595 int at_end;
596
597 /* readdir: position within a frag */
598 unsigned offset; /* offset of last chunk, adjusted for . and .. */
599 u64 next_offset; /* offset of next chunk (last_name's + 1) */
600 char *last_name; /* last entry in previous chunk */
601 struct dentry *dentry; /* next dentry (for dcache readdir) */
602 unsigned long dir_release_count;
603
604 /* used for -o dirstat read() on directory thing */
605 char *dir_info;
606 int dir_info_len;
607};
608
609
610
611/*
612 * snapshots
613 */
614
615/*
616 * A "snap context" is the set of existing snapshots when we
617 * write data. It is used by the OSD to guide its COW behavior.
618 *
619 * The ceph_snap_context is refcounted, and attached to each dirty
620 * page, indicating which context the dirty data belonged when it was
621 * page, indicating which context the dirty data belonged to when it was
622 */
623struct ceph_snap_context {
624 atomic_t nref;
625 u64 seq;
626 int num_snaps;
627 u64 snaps[];
628};
629
630static inline struct ceph_snap_context *
631ceph_get_snap_context(struct ceph_snap_context *sc)
632{
633 /*
634 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
635 atomic_read(&sc->nref)+1);
636 */
637 if (sc)
638 atomic_inc(&sc->nref);
639 return sc;
640}
641
642static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
643{
644 if (!sc)
645 return;
646 /*
647 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
648 atomic_read(&sc->nref)-1);
649 */
650 if (atomic_dec_and_test(&sc->nref)) {
651 /*printk(" deleting snap_context %p\n", sc);*/
652 kfree(sc);
653 }
654}
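A hedged sketch of constructing one of these, showing how the flexible snaps[] array shares the allocation (the helper name is made up; the authoritative construction is in snap.c):

static struct ceph_snap_context *example_alloc_snapc(int num, gfp_t gfp)
{
	struct ceph_snap_context *sc;

	/* one allocation covers the header plus the snaps[] array */
	sc = kzalloc(sizeof(*sc) + num * sizeof(u64), gfp);
	if (!sc)
		return NULL;
	atomic_set(&sc->nref, 1);
	sc->num_snaps = num;
	return sc;
}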
655
656/*
657 * A "snap realm" describes a subset of the file hierarchy sharing
658 * the same set of snapshots that apply to it. The realms themselves
659 * are organized into a hierarchy, such that children inherit (some of)
660 * the snapshots of their parents.
661 *
662 * All inodes within the realm that have capabilities are linked into a
663 * per-realm list.
664 */
665struct ceph_snap_realm {
666 u64 ino;
667 atomic_t nref;
668 struct rb_node node;
669
670 u64 created, seq;
671 u64 parent_ino;
672 u64 parent_since; /* snapid when our current parent became so */
673
674 u64 *prior_parent_snaps; /* snaps inherited from any parents we */
675 int num_prior_parent_snaps; /* had prior to parent_since */
676 u64 *snaps; /* snaps specific to this realm */
677 int num_snaps;
678
679 struct ceph_snap_realm *parent;
680 struct list_head children; /* list of child realms */
681 struct list_head child_item;
682
683 struct list_head empty_item; /* if i have ref==0 */
684
685 /* the current set of snaps for this realm */
686 struct ceph_snap_context *cached_context;
687
688 struct list_head inodes_with_caps;
689 spinlock_t inodes_with_caps_lock;
690};
691
692
693
694/*
695 * calculate the number of pages a given length and offset map onto,
696 * if we align the data.
697 */
698static inline int calc_pages_for(u64 off, u64 len)
699{
700 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
701 (off >> PAGE_CACHE_SHIFT);
702}
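Two worked examples, assuming 4 KB pages (PAGE_CACHE_SHIFT == 12):

/*
 * calc_pages_for(100, 5000):  bytes 100..5099 touch pages 0 and 1:
 *	((100 + 5000 + 4095) >> 12) - (100 >> 12) = 2 - 0 = 2
 * calc_pages_for(4096, 4096): exactly page 1:
 *	((4096 + 4096 + 4095) >> 12) - (4096 >> 12) = 2 - 1 = 1
 */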
703
704
705
706/* snap.c */
707struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
708 u64 ino);
709extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
710 struct ceph_snap_realm *realm);
711extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
712 struct ceph_snap_realm *realm);
713extern int ceph_update_snap_trace(struct ceph_mds_client *m,
714 void *p, void *e, bool deletion);
715extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session,
717 struct ceph_msg *msg);
718extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
719 struct ceph_snap_context *snapc);
720extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
721 struct ceph_cap_snap *capsnap);
722extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
723
724/*
725 * a cap_snap is "pending" if it is still awaiting an in-progress
726 * sync write (that may/may not still update size, mtime, etc.).
727 */
728static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
729{
730 return !list_empty(&ci->i_cap_snaps) &&
731 list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
732 ci_item)->writing;
733}
734
735
736/* super.c */
737extern struct kmem_cache *ceph_inode_cachep;
738extern struct kmem_cache *ceph_cap_cachep;
739extern struct kmem_cache *ceph_dentry_cachep;
740extern struct kmem_cache *ceph_file_cachep;
741
742extern const char *ceph_msg_type_name(int type);
743extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
744
745#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
746 "%02x%02x%02x%02x%02x%02x"
747#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
748 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
749 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
750 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
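Illustrative use (any struct ceph_fsid pointer works, e.g. &client->fsid):

	pr_info("ceph: mounted fsid " FSID_FORMAT "\n",
		PR_FSID(&client->fsid));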
751
752/* inode.c */
753extern const struct inode_operations ceph_file_iops;
754
755extern struct inode *ceph_alloc_inode(struct super_block *sb);
756extern void ceph_destroy_inode(struct inode *inode);
757
758extern struct inode *ceph_get_inode(struct super_block *sb,
759 struct ceph_vino vino);
760extern struct inode *ceph_get_snapdir(struct inode *parent);
761extern int ceph_fill_file_size(struct inode *inode, int issued,
762 u32 truncate_seq, u64 truncate_size, u64 size);
763extern void ceph_fill_file_time(struct inode *inode, int issued,
764 u64 time_warp_seq, struct timespec *ctime,
765 struct timespec *mtime, struct timespec *atime);
766extern int ceph_fill_trace(struct super_block *sb,
767 struct ceph_mds_request *req,
768 struct ceph_mds_session *session);
769extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
770 struct ceph_mds_session *session);
771
772extern int ceph_inode_holds_cap(struct inode *inode, int mask);
773
774extern int ceph_inode_set_size(struct inode *inode, loff_t size);
775extern void __ceph_do_pending_vmtruncate(struct inode *inode);
776extern void ceph_queue_vmtruncate(struct inode *inode);
777
778extern void ceph_queue_invalidate(struct inode *inode);
779extern void ceph_queue_writeback(struct inode *inode);
780
781extern int ceph_do_getattr(struct inode *inode, int mask);
782extern int ceph_permission(struct inode *inode, int mask);
783extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
784extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
785 struct kstat *stat);
786
787/* xattr.c */
788extern int ceph_setxattr(struct dentry *, const char *, const void *,
789 size_t, int);
790extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
791extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
792extern int ceph_removexattr(struct dentry *, const char *);
793extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
794extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
795
796/* caps.c */
797extern const char *ceph_cap_string(int c);
798extern void ceph_handle_caps(struct ceph_mds_session *session,
799 struct ceph_msg *msg);
800extern int ceph_add_cap(struct inode *inode,
801 struct ceph_mds_session *session, u64 cap_id,
802 int fmode, unsigned issued, unsigned wanted,
803 unsigned cap, unsigned seq, u64 realmino, int flags,
804 struct ceph_cap_reservation *caps_reservation);
805extern void __ceph_remove_cap(struct ceph_cap *cap);
806static inline void ceph_remove_cap(struct ceph_cap *cap)
807{
808 struct inode *inode = &cap->ci->vfs_inode;
809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 spin_unlock(&inode->i_lock);
812}
813extern void ceph_put_cap(struct ceph_cap *cap);
814
815extern void ceph_queue_caps_release(struct inode *inode);
816extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
817extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
818extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
819 struct ceph_mds_session *session);
820extern int ceph_get_cap_mds(struct inode *inode);
821extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
822extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
823extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
824 struct ceph_snap_context *snapc);
825extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
826 struct ceph_mds_session **psession);
827extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
828 struct ceph_mds_session *session);
829extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
830extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
831
832extern int ceph_encode_inode_release(void **p, struct inode *inode,
833 int mds, int drop, int unless, int force);
834extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
835 int mds, int drop, int unless);
836
837extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
838 int *got, loff_t endoff);
839
840/* for counting open files by mode */
841static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
842{
843 ci->i_nr_by_mode[mode]++;
844}
845extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
846
847/* addr.c */
848extern const struct address_space_operations ceph_aops;
849extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
850
851/* file.c */
852extern const struct file_operations ceph_file_fops;
853extern const struct address_space_operations ceph_aops;
854extern int ceph_open(struct inode *inode, struct file *file);
855extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
856 struct nameidata *nd, int mode,
857 int locked_dir);
858extern int ceph_release(struct inode *inode, struct file *filp);
859extern void ceph_release_page_vector(struct page **pages, int num_pages);
860
861/* dir.c */
862extern const struct file_operations ceph_dir_fops;
863extern const struct inode_operations ceph_dir_iops;
864extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
865 ceph_snapdir_dentry_ops;
866
867extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
868extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
869 struct dentry *dentry, int err);
870
871extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn);
874
875/*
876 * our d_ops vary depending on whether the inode is live,
877 * snapshotted (read-only), or a virtual ".snap" directory.
878 */
879int ceph_init_dentry(struct dentry *dentry);
880
881
882/* ioctl.c */
883extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
884
885/* export.c */
886extern const struct export_operations ceph_export_ops;
887
888/* debugfs.c */
889extern int ceph_debugfs_init(void);
890extern void ceph_debugfs_cleanup(void);
891extern int ceph_debugfs_client_init(struct ceph_client *client);
892extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
893
894static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
895{
896 if (dentry && dentry->d_parent)
897 return dentry->d_parent->d_inode;
898
899 return NULL;
900}
901
902#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..2845422907fc
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,845 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6#include <linux/slab.h>
7
8static bool ceph_is_valid_xattr(const char *name)
9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
14}
15
16/*
17 * These define virtual xattrs exposing the recursive directory
18 * statistics and layout metadata.
19 */
20struct ceph_vxattr_cb {
21 bool readonly;
22 char *name;
23 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
24 size_t size);
25};
26
27/* directories */
28
29static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
30 size_t size)
31{
32 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
33}
34
35static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
36 size_t size)
37{
38 return snprintf(val, size, "%lld", ci->i_files);
39}
40
41static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
42 size_t size)
43{
44 return snprintf(val, size, "%lld", ci->i_subdirs);
45}
46
47static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
48 size_t size)
49{
50 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
51}
52
53static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
54 size_t size)
55{
56 return snprintf(val, size, "%lld", ci->i_rfiles);
57}
58
59static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
60 size_t size)
61{
62 return snprintf(val, size, "%lld", ci->i_rsubdirs);
63}
64
65static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
66 size_t size)
67{
68 return snprintf(val, size, "%lld", ci->i_rbytes);
69}
70
71static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
72 size_t size)
73{
74 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
75 (long)ci->i_rctime.tv_nsec);
76}
77
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL }
88};
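From userspace these appear as ordinary extended attributes; a hedged sketch using the standard getxattr(2) call (the mount path is made up):

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[64];
	ssize_t n = getxattr("/mnt/ceph/some/dir", "user.ceph.dir.rbytes",
			     buf, sizeof(buf) - 1);

	if (n >= 0) {
		buf[n] = '\0';	/* the kernel returns the raw digits */
		printf("recursive bytes: %s\n", buf);
	}
	return 0;
}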
89
90/* files */
91
92static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
93 size_t size)
94{
95 int ret;
96
97 ret = snprintf(val, size,
98		"chunk_bytes=%llu\nstripe_count=%llu\nobject_size=%llu\n",
99 (unsigned long long)ceph_file_layout_su(ci->i_layout),
100 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
101 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
102 if (ceph_file_layout_pg_preferred(ci->i_layout))
103		ret += snprintf(val + ret, size - ret, "preferred_osd=%llu\n",
104 (unsigned long long)ceph_file_layout_pg_preferred(
105 ci->i_layout));
106 return ret;
107}
108
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout},
111	{ true, NULL, NULL }
112};
113
114static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
115{
116 if (S_ISDIR(inode->i_mode))
117 return ceph_dir_vxattrs;
118 else if (S_ISREG(inode->i_mode))
119 return ceph_file_vxattrs;
120 return NULL;
121}
122
123static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
124 const char *name)
125{
126 do {
127 if (strcmp(vxattr->name, name) == 0)
128 return vxattr;
129 vxattr++;
130 } while (vxattr->name);
131 return NULL;
132}
133
134static int __set_xattr(struct ceph_inode_info *ci,
135 const char *name, int name_len,
136 const char *val, int val_len,
137 int dirty,
138 int should_free_name, int should_free_val,
139 struct ceph_inode_xattr **newxattr)
140{
141 struct rb_node **p;
142 struct rb_node *parent = NULL;
143 struct ceph_inode_xattr *xattr = NULL;
144 int c;
145 int new = 0;
146
147 p = &ci->i_xattrs.index.rb_node;
148 while (*p) {
149 parent = *p;
150 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
151 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
152 if (c < 0)
153 p = &(*p)->rb_left;
154 else if (c > 0)
155 p = &(*p)->rb_right;
156 else {
157 if (name_len == xattr->name_len)
158 break;
159 else if (name_len < xattr->name_len)
160 p = &(*p)->rb_left;
161 else
162 p = &(*p)->rb_right;
163 }
164 xattr = NULL;
165 }
166
167 if (!xattr) {
168 new = 1;
169 xattr = *newxattr;
170 xattr->name = name;
171 xattr->name_len = name_len;
172 xattr->should_free_name = should_free_name;
173
174 ci->i_xattrs.count++;
175 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
176 } else {
177 kfree(*newxattr);
178 *newxattr = NULL;
179 if (xattr->should_free_val)
180 kfree((void *)xattr->val);
181
182 if (should_free_name) {
183 kfree((void *)name);
184 name = xattr->name;
185 }
186 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len;
188 }
189 if (!xattr) {
190		pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s\n",
191		       &ci->vfs_inode, ceph_vinop(&ci->vfs_inode),
192		       name);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len;
197 if (val)
198 xattr->val = val;
199 else
200 xattr->val = "";
201
202 xattr->val_len = val_len;
203 xattr->dirty = dirty;
204 xattr->should_free_val = (val && should_free_val);
205
206 if (new) {
207 rb_link_node(&xattr->node, parent, p);
208 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
209 dout("__set_xattr_val p=%p\n", p);
210 }
211
212 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
213 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
214
215 return 0;
216}
217
218static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 const char *name)
220{
221 struct rb_node **p;
222 struct rb_node *parent = NULL;
223 struct ceph_inode_xattr *xattr = NULL;
224 int c;
225
226 p = &ci->i_xattrs.index.rb_node;
227 while (*p) {
228 parent = *p;
229 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
230 c = strncmp(name, xattr->name, xattr->name_len);
231 if (c < 0)
232 p = &(*p)->rb_left;
233 else if (c > 0)
234 p = &(*p)->rb_right;
235 else {
236 dout("__get_xattr %s: found %.*s\n", name,
237 xattr->val_len, xattr->val);
238 return xattr;
239 }
240 }
241
242 dout("__get_xattr %s: not found\n", name);
243
244 return NULL;
245}
246
247static void __free_xattr(struct ceph_inode_xattr *xattr)
248{
249 BUG_ON(!xattr);
250
251 if (xattr->should_free_name)
252 kfree((void *)xattr->name);
253 if (xattr->should_free_val)
254 kfree((void *)xattr->val);
255
256 kfree(xattr);
257}
258
259static int __remove_xattr(struct ceph_inode_info *ci,
260 struct ceph_inode_xattr *xattr)
261{
262 if (!xattr)
263 return -EOPNOTSUPP;
264
265 rb_erase(&xattr->node, &ci->i_xattrs.index);
266
267 if (xattr->should_free_name)
268 kfree((void *)xattr->name);
269 if (xattr->should_free_val)
270 kfree((void *)xattr->val);
271
272 ci->i_xattrs.names_size -= xattr->name_len;
273 ci->i_xattrs.vals_size -= xattr->val_len;
274 ci->i_xattrs.count--;
275 kfree(xattr);
276
277 return 0;
278}
279
280static int __remove_xattr_by_name(struct ceph_inode_info *ci,
281 const char *name)
282{
283	struct ceph_inode_xattr *xattr;
284	int err;
285
286	xattr = __get_xattr(ci, name);
289 err = __remove_xattr(ci, xattr);
290 return err;
291}
292
293static char *__copy_xattr_names(struct ceph_inode_info *ci,
294 char *dest)
295{
296 struct rb_node *p;
297 struct ceph_inode_xattr *xattr = NULL;
298
299 p = rb_first(&ci->i_xattrs.index);
300 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
301
302 while (p) {
303 xattr = rb_entry(p, struct ceph_inode_xattr, node);
304 memcpy(dest, xattr->name, xattr->name_len);
305 dest[xattr->name_len] = '\0';
306
307 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
308 xattr->name_len, ci->i_xattrs.names_size);
309
310 dest += xattr->name_len + 1;
311 p = rb_next(p);
312 }
313
314 return dest;
315}
316
317void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
318{
319 struct rb_node *p, *tmp;
320 struct ceph_inode_xattr *xattr = NULL;
321
322 p = rb_first(&ci->i_xattrs.index);
323
324 dout("__ceph_destroy_xattrs p=%p\n", p);
325
326 while (p) {
327 xattr = rb_entry(p, struct ceph_inode_xattr, node);
328 tmp = p;
329 p = rb_next(tmp);
330 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
331 xattr->name_len, xattr->name);
332 rb_erase(tmp, &ci->i_xattrs.index);
333
334 __free_xattr(xattr);
335 }
336
337 ci->i_xattrs.names_size = 0;
338 ci->i_xattrs.vals_size = 0;
339 ci->i_xattrs.index_version = 0;
340 ci->i_xattrs.count = 0;
341 ci->i_xattrs.index = RB_ROOT;
342}
343
344static int __build_xattrs(struct inode *inode)
345{
346 u32 namelen;
347 u32 numattr = 0;
348 void *p, *end;
349 u32 len;
350 const char *name, *val;
351 struct ceph_inode_info *ci = ceph_inode(inode);
352 int xattr_version;
353 struct ceph_inode_xattr **xattrs = NULL;
354 int err = 0;
355 int i;
356
357 dout("__build_xattrs() len=%d\n",
358 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
359
360 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
361 return 0; /* already built */
362
363 __ceph_destroy_xattrs(ci);
364
365start:
366	/* update the internal xattr rb tree */
367 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
368 p = ci->i_xattrs.blob->vec.iov_base;
369 end = p + ci->i_xattrs.blob->vec.iov_len;
370 ceph_decode_32_safe(&p, end, numattr, bad);
371 xattr_version = ci->i_xattrs.version;
372 spin_unlock(&inode->i_lock);
373
374		xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
375				 GFP_NOFS);
376		err = -ENOMEM;
377		if (!xattrs)
378			goto bad_lock;
380 for (i = 0; i < numattr; i++) {
381 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
382 GFP_NOFS);
383 if (!xattrs[i])
384 goto bad_lock;
385 }
386
387 spin_lock(&inode->i_lock);
388 if (ci->i_xattrs.version != xattr_version) {
389 /* lost a race, retry */
390 for (i = 0; i < numattr; i++)
391 kfree(xattrs[i]);
392 kfree(xattrs);
393 goto start;
394 }
395 err = -EIO;
396 while (numattr--) {
397 ceph_decode_32_safe(&p, end, len, bad);
398 namelen = len;
399 name = p;
400 p += len;
401 ceph_decode_32_safe(&p, end, len, bad);
402 val = p;
403 p += len;
404
405 err = __set_xattr(ci, name, namelen, val, len,
406 0, 0, 0, &xattrs[numattr]);
407
408 if (err < 0)
409 goto bad;
410 }
411 kfree(xattrs);
412 }
413 ci->i_xattrs.index_version = ci->i_xattrs.version;
414 ci->i_xattrs.dirty = false;
415
416 return err;
417bad_lock:
418 spin_lock(&inode->i_lock);
419bad:
420 if (xattrs) {
421 for (i = 0; i < numattr; i++)
422 kfree(xattrs[i]);
423 kfree(xattrs);
424 }
425 ci->i_xattrs.names_size = 0;
426 return err;
427}
428
429static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
430 int val_size)
431{
432 /*
433	 * 4 bytes for the count, plus 4 length bytes for each xattr name
434	 * and 4 for each value
435 */
436 int size = 4 + ci->i_xattrs.count*(4 + 4) +
437 ci->i_xattrs.names_size +
438 ci->i_xattrs.vals_size;
439 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
440 ci->i_xattrs.count, ci->i_xattrs.names_size,
441 ci->i_xattrs.vals_size);
442
443 if (name_size)
444 size += 4 + 4 + name_size + val_size;
445
446 return size;
447}
448
449/*
450 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
451 * and swap into place.
452 */
453void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
454{
455 struct rb_node *p;
456 struct ceph_inode_xattr *xattr = NULL;
457 void *dest;
458
459 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
460 if (ci->i_xattrs.dirty) {
461 int need = __get_required_blob_size(ci, 0, 0);
462
463 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
464
465 p = rb_first(&ci->i_xattrs.index);
466 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
467
468 ceph_encode_32(&dest, ci->i_xattrs.count);
469 while (p) {
470 xattr = rb_entry(p, struct ceph_inode_xattr, node);
471
472 ceph_encode_32(&dest, xattr->name_len);
473 memcpy(dest, xattr->name, xattr->name_len);
474 dest += xattr->name_len;
475 ceph_encode_32(&dest, xattr->val_len);
476 memcpy(dest, xattr->val, xattr->val_len);
477 dest += xattr->val_len;
478
479 p = rb_next(p);
480 }
481
482 /* adjust buffer len; it may be larger than we need */
483 ci->i_xattrs.prealloc_blob->vec.iov_len =
484 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
485
486 if (ci->i_xattrs.blob)
487 ceph_buffer_put(ci->i_xattrs.blob);
488 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
489 ci->i_xattrs.prealloc_blob = NULL;
490 ci->i_xattrs.dirty = false;
491 }
492}
493
494ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
495 size_t size)
496{
497 struct inode *inode = dentry->d_inode;
498 struct ceph_inode_info *ci = ceph_inode(inode);
499 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
500 int err;
501 struct ceph_inode_xattr *xattr;
502 struct ceph_vxattr_cb *vxattr = NULL;
503
504 if (!ceph_is_valid_xattr(name))
505 return -ENODATA;
506
507 /* let's see if a virtual xattr was requested */
508 if (vxattrs)
509 vxattr = ceph_match_vxattr(vxattrs, name);
510
511 spin_lock(&inode->i_lock);
512 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
513 ci->i_xattrs.version, ci->i_xattrs.index_version);
514
515 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
516 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
517 goto get_xattr;
518 } else {
519 spin_unlock(&inode->i_lock);
520 /* get xattrs from mds (if we don't already have them) */
521 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
522 if (err)
523 return err;
524 }
525
526 spin_lock(&inode->i_lock);
527
528 if (vxattr && vxattr->readonly) {
529 err = vxattr->getxattr_cb(ci, value, size);
530 goto out;
531 }
532
533 err = __build_xattrs(inode);
534 if (err < 0)
535 goto out;
536
537get_xattr:
538 err = -ENODATA; /* == ENOATTR */
539 xattr = __get_xattr(ci, name);
540 if (!xattr) {
541 if (vxattr)
542 err = vxattr->getxattr_cb(ci, value, size);
543 goto out;
544 }
545
546 err = -ERANGE;
547 if (size && size < xattr->val_len)
548 goto out;
549
550 err = xattr->val_len;
551 if (size == 0)
552 goto out;
553
554 memcpy(value, xattr->val, xattr->val_len);
555
556out:
557 spin_unlock(&inode->i_lock);
558 return err;
559}
560
561ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
562{
563 struct inode *inode = dentry->d_inode;
564 struct ceph_inode_info *ci = ceph_inode(inode);
565 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
566 u32 vir_namelen = 0;
567 u32 namelen;
568 int err;
569 u32 len;
570 int i;
571
572 spin_lock(&inode->i_lock);
573 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
574 ci->i_xattrs.version, ci->i_xattrs.index_version);
575
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr;
579 } else {
580 spin_unlock(&inode->i_lock);
581 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
582 if (err)
583 return err;
584 }
585
586 spin_lock(&inode->i_lock);
587
588 err = __build_xattrs(inode);
589 if (err < 0)
590 goto out;
591
592list_xattr:
593 vir_namelen = 0;
594 /* include virtual dir xattrs */
595 if (vxattrs)
596 for (i = 0; vxattrs[i].name; i++)
597 vir_namelen += strlen(vxattrs[i].name) + 1;
598	/* plus 1 byte for each name's null terminator */
599 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
600 err = -ERANGE;
601 if (size && namelen > size)
602 goto out;
603
604 err = namelen;
605 if (size == 0)
606 goto out;
607
608 names = __copy_xattr_names(ci, names);
609
610 /* virtual xattr names, too */
611 if (vxattrs)
612 for (i = 0; vxattrs[i].name; i++) {
613 len = sprintf(names, "%s", vxattrs[i].name);
614 names += len + 1;
615 }
616
617out:
618 spin_unlock(&inode->i_lock);
619 return err;
620}
621
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags)
624{
625 struct ceph_client *client = ceph_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode;
629 struct ceph_mds_request *req;
630 struct ceph_mds_client *mdsc = &client->mdsc;
631 int err;
632 int i, nr_pages;
633 struct page **pages = NULL;
634 void *kaddr;
635
636 /* copy value into some pages */
637 nr_pages = calc_pages_for(0, size);
638 if (nr_pages) {
639 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
640 if (!pages)
641 return -ENOMEM;
642 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS);
645 if (!pages[i]) {
646 nr_pages = i;
647 goto out;
648 }
649			kaddr = kmap(pages[i]);
650			memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
651			       min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
652			kunmap(pages[i]);
652 }
653 }
654
655 dout("setxattr value=%.*s\n", (int)size, value);
656
657 /* do request */
658 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
659 USE_AUTH_MDS);
660 if (IS_ERR(req)) {
661 err = PTR_ERR(req);
662 goto out;
663 }
664 req->r_inode = igrab(inode);
665 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
666 req->r_num_caps = 1;
667 req->r_args.setxattr.flags = cpu_to_le32(flags);
668 req->r_path2 = kstrdup(name, GFP_NOFS);
669
670 req->r_pages = pages;
671 req->r_num_pages = nr_pages;
672 req->r_data_len = size;
673
674 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
675 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
676 ceph_mdsc_put_request(req);
677 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
678
679out:
680 if (pages) {
681 for (i = 0; i < nr_pages; i++)
682 __free_page(pages[i]);
683 kfree(pages);
684 }
685 return err;
686}
687
688int ceph_setxattr(struct dentry *dentry, const char *name,
689 const void *value, size_t size, int flags)
690{
691 struct inode *inode = dentry->d_inode;
692 struct ceph_inode_info *ci = ceph_inode(inode);
693 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
694 int err;
695 int name_len = strlen(name);
696 int val_len = size;
697 char *newname = NULL;
698 char *newval = NULL;
699 struct ceph_inode_xattr *xattr = NULL;
700 int issued;
701 int required_blob_size;
702
703 if (ceph_snap(inode) != CEPH_NOSNAP)
704 return -EROFS;
705
706 if (!ceph_is_valid_xattr(name))
707 return -EOPNOTSUPP;
708
709 if (vxattrs) {
710 struct ceph_vxattr_cb *vxattr =
711 ceph_match_vxattr(vxattrs, name);
712 if (vxattr && vxattr->readonly)
713 return -EOPNOTSUPP;
714 }
715
716 /* preallocate memory for xattr name, value, index node */
717 err = -ENOMEM;
718 newname = kmalloc(name_len + 1, GFP_NOFS);
719 if (!newname)
720 goto out;
721 memcpy(newname, name, name_len + 1);
722
723 if (val_len) {
724 newval = kmalloc(val_len + 1, GFP_NOFS);
725 if (!newval)
726 goto out;
727 memcpy(newval, value, val_len);
728 newval[val_len] = '\0';
729 }
730
731 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
732 if (!xattr)
733 goto out;
734
735 spin_lock(&inode->i_lock);
736retry:
737 issued = __ceph_caps_issued(ci, NULL);
738 if (!(issued & CEPH_CAP_XATTR_EXCL))
739 goto do_sync;
740 __build_xattrs(inode);
741
742 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
743
744 if (!ci->i_xattrs.prealloc_blob ||
745 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
746 struct ceph_buffer *blob = NULL;
747
748 spin_unlock(&inode->i_lock);
749		dout(" preallocating new blob size=%d\n", required_blob_size);
750 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
751 if (!blob)
752 goto out;
753 spin_lock(&inode->i_lock);
754 if (ci->i_xattrs.prealloc_blob)
755 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
756 ci->i_xattrs.prealloc_blob = blob;
757 goto retry;
758 }
759
760 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
761 err = __set_xattr(ci, newname, name_len, newval,
762 val_len, 1, 1, 1, &xattr);
763 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
764 ci->i_xattrs.dirty = true;
765 inode->i_ctime = CURRENT_TIME;
766 spin_unlock(&inode->i_lock);
767
768 return err;
769
770do_sync:
771 spin_unlock(&inode->i_lock);
772 err = ceph_sync_setxattr(dentry, name, value, size, flags);
773out:
774 kfree(newname);
775 kfree(newval);
776 kfree(xattr);
777 return err;
778}
779
780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{
782 struct ceph_client *client = ceph_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode;
786 struct ceph_mds_request *req;
787 int err;
788
789 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
790 USE_AUTH_MDS);
791 if (IS_ERR(req))
792 return PTR_ERR(req);
793 req->r_inode = igrab(inode);
794 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
795 req->r_num_caps = 1;
796 req->r_path2 = kstrdup(name, GFP_NOFS);
797
798 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
799 ceph_mdsc_put_request(req);
800 return err;
801}
802
803int ceph_removexattr(struct dentry *dentry, const char *name)
804{
805 struct inode *inode = dentry->d_inode;
806 struct ceph_inode_info *ci = ceph_inode(inode);
807 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
808 int issued;
809 int err;
810
811 if (ceph_snap(inode) != CEPH_NOSNAP)
812 return -EROFS;
813
814 if (!ceph_is_valid_xattr(name))
815 return -EOPNOTSUPP;
816
817 if (vxattrs) {
818 struct ceph_vxattr_cb *vxattr =
819 ceph_match_vxattr(vxattrs, name);
820 if (vxattr && vxattr->readonly)
821 return -EOPNOTSUPP;
822 }
823
824 spin_lock(&inode->i_lock);
825 __build_xattrs(inode);
826 issued = __ceph_caps_issued(ci, NULL);
827 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
828
829 if (!(issued & CEPH_CAP_XATTR_EXCL))
830 goto do_sync;
831
832 err = __remove_xattr_by_name(ceph_inode(inode), name);
833 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
834 ci->i_xattrs.dirty = true;
835 inode->i_ctime = CURRENT_TIME;
836
837 spin_unlock(&inode->i_lock);
838
839 return err;
840do_sync:
841 spin_unlock(&inode->i_lock);
842 err = ceph_send_removexattr(dentry, name);
843 return err;
844}
845
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 094ea65afc85..bc0025cdd1c9 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,8 @@
1Version 1.62
2------------
3Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
4to more strictly handle corrupt frames.
5
1Version 1.61 6Version 1.61
2------------ 7------------
3Fix append problem to Samba servers (files opened with O_APPEND could 8Fix append problem to Samba servers (files opened with O_APPEND could
@@ -5,7 +10,9 @@ have duplicated data). Fix oops in cifs_lookup. Workaround problem
5mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. 10mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
6Disable use of server inode numbers when server only 11Disable use of server inode numbers when server only
7partially supports them (e.g. for one server querying inode numbers on 12partially supports them (e.g. for one server querying inode numbers on
8FindFirst fails but QPathInfo queries works). 13FindFirst fails but QPathInfo queries works). Fix oops with dfs in
14cifs_put_smb_ses. Fix mmap to work on directio mounts (needed
15for OpenOffice when on forcedirectio mount e.g.)
9 16
10Version 1.60 17Version 1.60
11------------- 18-------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 79c1a93400be..a727b7cb075f 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -423,7 +423,7 @@ A partial list of the supported mount options follows:
423 source name to use to represent the client netbios machine 423 source name to use to represent the client netbios machine
424 name when doing the RFC1001 netbios session initialize. 424 name when doing the RFC1001 netbios session initialize.
425 direct Do not do inode data caching on files opened on this mount. 425 direct Do not do inode data caching on files opened on this mount.
426 This precludes mmaping files on this mount. In some cases 426 This precludes mmapping files on this mount. In some cases
427 with fast networks and little or no caching benefits on the 427 with fast networks and little or no caching benefits on the
428 client (e.g. when the application is doing large sequential 428 client (e.g. when the application is doing large sequential
429 reads bigger than page size without rereading the same data) 429 reads bigger than page size without rereading the same data)
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 20692fbfdb24..a20bea598933 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -136,7 +136,7 @@ asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
136 return 0; 136 return 0;
137 } 137 }
138 138
139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to lenght octet */ 139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ 140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */
141 *val = *(++(ctx->pointer)); /* value has enum value */ 141 *val = *(++(ctx->pointer)); /* value has enum value */
142 else 142 else
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index fea9e898c4ba..78e4d2a3a68b 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -15,6 +15,7 @@
15#include <linux/dcache.h> 15#include <linux/dcache.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/vfs.h> 19#include <linux/vfs.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include "cifsglob.h" 21#include "cifsglob.h"
@@ -54,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
54 * Extracts sharename form full UNC. 55 * Extracts sharename form full UNC.
55 * i.e. strips from UNC trailing path that is not part of share 56 * i.e. strips from UNC trailing path that is not part of share
56 * name and fixup missing '\' in the begining of DFS node refferal 57 * name and fixup missing '\' in the begining of DFS node refferal
57 * if neccessary. 58 * if necessary.
58 * Returns pointer to share name on success or ERR_PTR on error. 59 * Returns pointer to share name on success or ERR_PTR on error.
59 * Caller is responsible for freeing returned string. 60 * Caller is responsible for freeing returned string.
60 */ 61 */
@@ -269,7 +270,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
269 int err; 270 int err;
270 271
271 mntget(newmnt); 272 mntget(newmnt);
272 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist); 273 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
273 switch (err) { 274 switch (err) {
274 case 0: 275 case 0:
275 path_put(&nd->path); 276 path_put(&nd->path);
@@ -371,7 +372,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
371 if (IS_ERR(mnt)) 372 if (IS_ERR(mnt))
372 goto out_err; 373 goto out_err;
373 374
374 nd->path.mnt->mnt_flags |= MNT_SHRINKABLE;
375 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); 375 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
376 376
377out: 377out:
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8ec7736ce954..310d12f69a92 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h>
23#include <linux/string.h> 24#include <linux/string.h>
24#include <keys/user-type.h> 25#include <keys/user-type.h>
25#include <linux/key-type.h> 26#include <linux/key-type.h>
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 714a542cbafc..d07676bd76d2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -19,6 +19,7 @@
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/slab.h>
22#include "cifs_unicode.h" 23#include "cifs_unicode.h"
23#include "cifs_uniupr.h" 24#include "cifs_uniupr.h"
24#include "cifspdu.h" 25#include "cifspdu.h"
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7dfe0842a6f6..9b716d044bbd 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/slab.h>
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
27#include "cifsacl.h" 28#include "cifsacl.h"
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7efe1745494d..fbe986430d0c 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/slab.h>
23#include "cifspdu.h" 24#include "cifspdu.h"
24#include "cifsglob.h" 25#include "cifsglob.h"
25#include "cifs_debug.h" 26#include "cifs_debug.h"
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 29f1da761bbf..5183bc2a1916 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -312,6 +312,7 @@ cifs_alloc_inode(struct super_block *sb)
312 cifs_inode->clientCanCacheRead = false; 312 cifs_inode->clientCanCacheRead = false;
313 cifs_inode->clientCanCacheAll = false; 313 cifs_inode->clientCanCacheAll = false;
314 cifs_inode->delete_pending = false; 314 cifs_inode->delete_pending = false;
315 cifs_inode->invalid_mapping = false;
315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 316 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
316 cifs_inode->server_eof = 0; 317 cifs_inode->server_eof = 0;
317 318
@@ -638,7 +639,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
638 setting the revalidate time to zero */ 639 setting the revalidate time to zero */
639 CIFS_I(file->f_path.dentry->d_inode)->time = 0; 640 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
640 641
641 retval = cifs_revalidate(file->f_path.dentry); 642 retval = cifs_revalidate_file(file);
642 if (retval < 0) 643 if (retval < 0)
643 return (loff_t)retval; 644 return (loff_t)retval;
644 } 645 }
@@ -758,7 +759,7 @@ const struct file_operations cifs_file_ops = {
758}; 759};
759 760
760const struct file_operations cifs_file_direct_ops = { 761const struct file_operations cifs_file_direct_ops = {
761 /* no mmap, no aio, no readv - 762 /* no aio, no readv -
762 BB reevaluate whether they can be done with directio, no cache */ 763 BB reevaluate whether they can be done with directio, no cache */
763 .read = cifs_user_read, 764 .read = cifs_user_read,
764 .write = cifs_user_write, 765 .write = cifs_user_write,
@@ -767,6 +768,7 @@ const struct file_operations cifs_file_direct_ops = {
767 .lock = cifs_lock, 768 .lock = cifs_lock,
768 .fsync = cifs_fsync, 769 .fsync = cifs_fsync,
769 .flush = cifs_flush, 770 .flush = cifs_flush,
771 .mmap = cifs_file_mmap,
770 .splice_read = generic_file_splice_read, 772 .splice_read = generic_file_splice_read,
771#ifdef CONFIG_CIFS_POSIX 773#ifdef CONFIG_CIFS_POSIX
772 .unlocked_ioctl = cifs_ioctl, 774 .unlocked_ioctl = cifs_ioctl,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index ac2b24c192f8..7aa57ecdc437 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
61extern int cifs_rmdir(struct inode *, struct dentry *); 61extern int cifs_rmdir(struct inode *, struct dentry *);
62extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 62extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
63 struct dentry *); 63 struct dentry *);
64extern int cifs_revalidate(struct dentry *); 64extern int cifs_revalidate_file(struct file *filp);
65extern int cifs_revalidate_dentry(struct dentry *);
65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 66extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
66extern int cifs_setattr(struct dentry *, struct iattr *); 67extern int cifs_setattr(struct dentry *, struct iattr *);
67 68
@@ -113,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
113extern const struct export_operations cifs_export_ops; 114extern const struct export_operations cifs_export_ops;
114#endif /* EXPERIMENTAL */ 115#endif /* EXPERIMENTAL */
115 116
116#define CIFS_VERSION "1.61" 117#define CIFS_VERSION "1.62"
117#endif /* _CIFSFS_H */ 118#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5d0fde18039c..ecf0ffbe2b64 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
18 */ 18 */
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/slab.h>
21#include <linux/slow-work.h> 22#include <linux/slow-work.h>
22#include "cifs_fs_sb.h" 23#include "cifs_fs_sb.h"
23#include "cifsacl.h" 24#include "cifsacl.h"
@@ -39,7 +40,7 @@
39 40
40/* 41/*
41 * MAX_REQ is the maximum number of requests that WE will send 42 * MAX_REQ is the maximum number of requests that WE will send
42 * on one socket concurently. It also matches the most common 43 * on one socket concurrently. It also matches the most common
43 * value of max multiplex returned by servers. We may 44 * value of max multiplex returned by servers. We may
44 * eventually want to use the negotiated value (in case 45 * eventually want to use the negotiated value (in case
45 * future servers can handle more) when we are more confident that 46 * future servers can handle more) when we are more confident that
@@ -149,6 +150,7 @@ struct TCP_Server_Info {
149 bool svlocal:1; /* local server or remote */ 150 bool svlocal:1; /* local server or remote */
150 bool noblocksnd; /* use blocking sendmsg */ 151 bool noblocksnd; /* use blocking sendmsg */
151 bool noautotune; /* do not autotune send buf sizes */ 152 bool noautotune; /* do not autotune send buf sizes */
153 bool tcp_nodelay;
152 atomic_t inFlight; /* number of requests on the wire to server */ 154 atomic_t inFlight; /* number of requests on the wire to server */
153#ifdef CONFIG_CIFS_STATS2 155#ifdef CONFIG_CIFS_STATS2
154 atomic_t inSend; /* requests trying to send */ 156 atomic_t inSend; /* requests trying to send */
@@ -204,7 +206,7 @@ struct cifsUidInfo {
204struct cifsSesInfo { 206struct cifsSesInfo {
205 struct list_head smb_ses_list; 207 struct list_head smb_ses_list;
206 struct list_head tcon_list; 208 struct list_head tcon_list;
207 struct semaphore sesSem; 209 struct mutex session_mutex;
208#if 0 210#if 0
209 struct cifsUidInfo *uidInfo; /* pointer to user info */ 211 struct cifsUidInfo *uidInfo; /* pointer to user info */
210#endif 212#endif
@@ -388,6 +390,7 @@ struct cifsInodeInfo {
388 bool clientCanCacheRead:1; /* read oplock */ 390 bool clientCanCacheRead:1; /* read oplock */
389 bool clientCanCacheAll:1; /* read and writebehind oplock */ 391 bool clientCanCacheAll:1; /* read and writebehind oplock */
390 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 392 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
393 bool invalid_mapping:1; /* pagecache is invalid */
391 u64 server_eof; /* current file size on server */ 394 u64 server_eof; /* current file size on server */
392 u64 uniqueid; /* server inode number */ 395 u64 uniqueid; /* server inode number */
393 struct inode vfs_inode; 396 struct inode vfs_inode;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 2d07f890a842..14d036d8db11 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -415,10 +415,10 @@ struct smb_hdr {
415 __u8 WordCount; 415 __u8 WordCount;
416} __attribute__((packed)); 416} __attribute__((packed));
417/* given a pointer to an smb_hdr retrieve the value of byte count */ 417/* given a pointer to an smb_hdr retrieve the value of byte count */
418#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) 418#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
419#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) 419#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
420/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ 420/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
421#define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2) 421#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2)
422 422
423/* 423/*
424 * Computer Name Length (since Netbios name was length 16 with last byte 0x20) 424 * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
@@ -1227,7 +1227,7 @@ typedef struct smb_com_setattr_rsp {
1227/* empty wct response to setattr */ 1227/* empty wct response to setattr */
1228 1228
1229/*******************************************************/ 1229/*******************************************************/
1230/* NT Transact structure defintions follow */ 1230/* NT Transact structure definitions follow */
1231/* Currently only ioctl, acl (get security descriptor) */ 1231/* Currently only ioctl, acl (get security descriptor) */
1232/* and notify are implemented */ 1232/* and notify are implemented */
1233/*******************************************************/ 1233/*******************************************************/
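
The BCC()/BCC_LE()/pByteArea() hunk above is plain macro hygiene: every use of the argument is now wrapped as (smb_var). A hypothetical illustration, not from the patch, of why the old form only worked for a bare identifier:

	/* old expansion of BCC(psmb + 1) -- 'psmb' is made up here:
	 *	(*(__u16 *)((char *)psmb + 1 + sizeof(struct smb_hdr)
	 *			+ (2 * psmb + 1->WordCount)))
	 * the cast and the '->' capture only part of the argument, so any
	 * compound expression mis-parses or fails to compile; with the
	 * added parentheses the whole argument stays one operand:
	 *	(*(__u16 *)((char *)(psmb + 1) + sizeof(struct smb_hdr)
	 *			+ (2 * (psmb + 1)->WordCount)))
	 */
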
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5646727e33f5..39e47f46dea5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,10 +104,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
104extern struct inode *cifs_iget(struct super_block *sb, 104extern struct inode *cifs_iget(struct super_block *sb,
105 struct cifs_fattr *fattr); 105 struct cifs_fattr *fattr);
106 106
107extern int cifs_get_file_info(struct file *filp);
107extern int cifs_get_inode_info(struct inode **pinode, 108extern int cifs_get_inode_info(struct inode **pinode,
108 const unsigned char *search_path, 109 const unsigned char *search_path,
109 FILE_ALL_INFO *pfile_info, 110 FILE_ALL_INFO *pfile_info,
110 struct super_block *sb, int xid, const __u16 *pfid); 111 struct super_block *sb, int xid, const __u16 *pfid);
112extern int cifs_get_file_info_unix(struct file *filp);
111extern int cifs_get_inode_info_unix(struct inode **pinode, 113extern int cifs_get_inode_info_unix(struct inode **pinode,
112 const unsigned char *search_path, 114 const unsigned char *search_path,
113 struct super_block *sb, int xid); 115 struct super_block *sb, int xid);
@@ -142,6 +144,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
142extern int CIFSFindClose(const int, struct cifsTconInfo *tcon, 144extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
143 const __u16 search_handle); 145 const __u16 search_handle);
144 146
147extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
148 u16 netfid, FILE_ALL_INFO *pFindData);
145extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 149extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
146 const unsigned char *searchName, 150 const unsigned char *searchName,
147 FILE_ALL_INFO *findData, 151 FILE_ALL_INFO *findData,
@@ -152,6 +156,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
152 FILE_ALL_INFO *findData, 156 FILE_ALL_INFO *findData,
153 const struct nls_table *nls_codepage, int remap); 157 const struct nls_table *nls_codepage, int remap);
154 158
159extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
160 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
155extern int CIFSSMBUnixQPathInfo(const int xid, 161extern int CIFSSMBUnixQPathInfo(const int xid,
156 struct cifsTconInfo *tcon, 162 struct cifsTconInfo *tcon,
157 const unsigned char *searchName, 163 const unsigned char *searchName,
@@ -363,13 +369,10 @@ extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
363 __u32 filter, struct file *file, int multishot, 369 __u32 filter, struct file *file, int multishot,
364 const struct nls_table *nls_codepage); 370 const struct nls_table *nls_codepage);
365extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, 371extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
366 const unsigned char *searchName, char *EAData, 372 const unsigned char *searchName,
373 const unsigned char *ea_name, char *EAData,
367 size_t bufsize, const struct nls_table *nls_codepage, 374 size_t bufsize, const struct nls_table *nls_codepage,
368 int remap_special_chars); 375 int remap_special_chars);
369extern ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
370 const unsigned char *searchName, const unsigned char *ea_name,
371 unsigned char *ea_value, size_t buf_size,
372 const struct nls_table *nls_codepage, int remap_special_chars);
373extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, 376extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
374 const char *fileName, const char *ea_name, 377 const char *fileName, const char *ea_name,
375 const void *ea_value, const __u16 ea_value_len, 378 const void *ea_value, const __u16 ea_value_len,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 941441d3e386..3f4fbd670507 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -30,6 +30,7 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/vfs.h> 32#include <linux/vfs.h>
33#include <linux/slab.h>
33#include <linux/posix_acl_xattr.h> 34#include <linux/posix_acl_xattr.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include "cifspdu.h" 36#include "cifspdu.h"
@@ -170,19 +171,19 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
170 * need to prevent multiple threads trying to simultaneously 171 * need to prevent multiple threads trying to simultaneously
171 * reconnect the same SMB session 172 * reconnect the same SMB session
172 */ 173 */
173 down(&ses->sesSem); 174 mutex_lock(&ses->session_mutex);
174 if (ses->need_reconnect) 175 if (ses->need_reconnect)
175 rc = cifs_setup_session(0, ses, nls_codepage); 176 rc = cifs_setup_session(0, ses, nls_codepage);
176 177
177 /* do we need to reconnect tcon? */ 178 /* do we need to reconnect tcon? */
178 if (rc || !tcon->need_reconnect) { 179 if (rc || !tcon->need_reconnect) {
179 up(&ses->sesSem); 180 mutex_unlock(&ses->session_mutex);
180 goto out; 181 goto out;
181 } 182 }
182 183
183 mark_open_files_invalid(tcon); 184 mark_open_files_invalid(tcon);
184 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); 185 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
185 up(&ses->sesSem); 186 mutex_unlock(&ses->session_mutex);
186 cFYI(1, ("reconnect tcon rc = %d", rc)); 187 cFYI(1, ("reconnect tcon rc = %d", rc));
187 188
188 if (rc) 189 if (rc)
@@ -500,7 +501,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
500 } else if (pSMBr->hdr.WordCount == 13) { 501 } else if (pSMBr->hdr.WordCount == 13) {
501 cERROR(1, ("mount failed, cifs module not built " 502 cERROR(1, ("mount failed, cifs module not built "
502 "with CIFS_WEAK_PW_HASH support")); 503 "with CIFS_WEAK_PW_HASH support"));
503 rc = -EOPNOTSUPP; 504 rc = -EOPNOTSUPP;
504#endif /* WEAK_PW_HASH */ 505#endif /* WEAK_PW_HASH */
505 goto neg_err_exit; 506 goto neg_err_exit;
506 } else if (pSMBr->hdr.WordCount != 17) { 507 } else if (pSMBr->hdr.WordCount != 17) {
@@ -700,13 +701,13 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
700 if (!ses || !ses->server) 701 if (!ses || !ses->server)
701 return -EIO; 702 return -EIO;
702 703
703 down(&ses->sesSem); 704 mutex_lock(&ses->session_mutex);
704 if (ses->need_reconnect) 705 if (ses->need_reconnect)
705 goto session_already_dead; /* no need to send SMBlogoff if uid 706 goto session_already_dead; /* no need to send SMBlogoff if uid
706 already closed due to reconnect */ 707 already closed due to reconnect */
707 rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); 708 rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
708 if (rc) { 709 if (rc) {
709 up(&ses->sesSem); 710 mutex_unlock(&ses->session_mutex);
710 return rc; 711 return rc;
711 } 712 }
712 713
@@ -721,7 +722,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
721 pSMB->AndXCommand = 0xFF; 722 pSMB->AndXCommand = 0xFF;
722 rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0); 723 rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
723session_already_dead: 724session_already_dead:
724 up(&ses->sesSem); 725 mutex_unlock(&ses->session_mutex);
725 726
726 /* if session dead then we do not need to do ulogoff, 727 /* if session dead then we do not need to do ulogoff,
727 since server closed smb session, no sense reporting 728 since server closed smb session, no sense reporting
@@ -3230,8 +3231,72 @@ QInfRetry:
3230 return rc; 3231 return rc;
3231} 3232}
3232 3233
3234int
3235CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
3236 u16 netfid, FILE_ALL_INFO *pFindData)
3237{
3238 struct smb_t2_qfi_req *pSMB = NULL;
3239 struct smb_t2_qfi_rsp *pSMBr = NULL;
3240 int rc = 0;
3241 int bytes_returned;
3242 __u16 params, byte_count;
3243
3244QFileInfoRetry:
3245 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3246 (void **) &pSMBr);
3247 if (rc)
3248 return rc;
3249
3250 params = 2 /* level */ + 2 /* fid */;
3251 pSMB->t2.TotalDataCount = 0;
3252 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3253 /* BB find exact max data count below from sess structure BB */
3254 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3255 pSMB->t2.MaxSetupCount = 0;
3256 pSMB->t2.Reserved = 0;
3257 pSMB->t2.Flags = 0;
3258 pSMB->t2.Timeout = 0;
3259 pSMB->t2.Reserved2 = 0;
3260 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3261 Fid) - 4);
3262 pSMB->t2.DataCount = 0;
3263 pSMB->t2.DataOffset = 0;
3264 pSMB->t2.SetupCount = 1;
3265 pSMB->t2.Reserved3 = 0;
3266 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3267 byte_count = params + 1 /* pad */ ;
3268 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3269 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3270 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
3271 pSMB->Pad = 0;
3272 pSMB->Fid = netfid;
3273 pSMB->hdr.smb_buf_length += byte_count;
3274
3275 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3276 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3277 if (rc) {
3278 cFYI(1, ("Send error in QFileInfo = %d", rc));
3279 } else { /* decode response */
3280 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3233 3281
3282 if (rc) /* BB add auto retry on EOPNOTSUPP? */
3283 rc = -EIO;
3284 else if (pSMBr->ByteCount < 40)
3285 rc = -EIO; /* bad smb */
3286 else if (pFindData) {
3287 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3288 memcpy((char *) pFindData,
3289 (char *) &pSMBr->hdr.Protocol +
3290 data_offset, sizeof(FILE_ALL_INFO));
3291 } else
3292 rc = -ENOMEM;
3293 }
3294 cifs_buf_release(pSMB);
3295 if (rc == -EAGAIN)
3296 goto QFileInfoRetry;
3234 3297
3298 return rc;
3299}
3235 3300
3236int 3301int
3237CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 3302CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
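
CIFSSMBQFileInfo() above mirrors the path-based CIFSSMBQPathInfo() that follows, but drives TRANS2_QUERY_FILE_INFORMATION off an already-open netfid, so no path string has to be rebuilt or re-looked-up. A hedged sketch of the intended caller pattern -- the real consumer is cifs_get_file_info() in the fs/cifs/inode.c hunks further down; locals here are illustrative:

	FILE_ALL_INFO find_data;
	struct cifsFileInfo *cfile = filp->private_data;
	int rc;

	/* query attributes by open handle rather than by path */
	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
	if (rc == 0)
		cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
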
@@ -3335,6 +3400,75 @@ QPathInfoRetry:
3335} 3400}
3336 3401
3337int 3402int
3403CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
3404 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
3405{
3406 struct smb_t2_qfi_req *pSMB = NULL;
3407 struct smb_t2_qfi_rsp *pSMBr = NULL;
3408 int rc = 0;
3409 int bytes_returned;
3410 __u16 params, byte_count;
3411
3412UnixQFileInfoRetry:
3413 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3414 (void **) &pSMBr);
3415 if (rc)
3416 return rc;
3417
3418 params = 2 /* level */ + 2 /* fid */;
3419 pSMB->t2.TotalDataCount = 0;
3420 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3421 /* BB find exact max data count below from sess structure BB */
3422 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3423 pSMB->t2.MaxSetupCount = 0;
3424 pSMB->t2.Reserved = 0;
3425 pSMB->t2.Flags = 0;
3426 pSMB->t2.Timeout = 0;
3427 pSMB->t2.Reserved2 = 0;
3428 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3429 Fid) - 4);
3430 pSMB->t2.DataCount = 0;
3431 pSMB->t2.DataOffset = 0;
3432 pSMB->t2.SetupCount = 1;
3433 pSMB->t2.Reserved3 = 0;
3434 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3435 byte_count = params + 1 /* pad */ ;
3436 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3437 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3438 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
3439 pSMB->Pad = 0;
3440 pSMB->Fid = netfid;
3441 pSMB->hdr.smb_buf_length += byte_count;
3442
3443 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3444 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3445 if (rc) {
3446 cFYI(1, ("Send error in UnixQFileInfo = %d", rc));
3447 } else { /* decode response */
3448 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3449
3450 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3451 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
3452 "Unix Extensions can be disabled on mount "
3453 "by specifying the nosfu mount option."));
3454 rc = -EIO; /* bad smb */
3455 } else {
3456 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3457 memcpy((char *) pFindData,
3458 (char *) &pSMBr->hdr.Protocol +
3459 data_offset,
3460 sizeof(FILE_UNIX_BASIC_INFO));
3461 }
3462 }
3463
3464 cifs_buf_release(pSMB);
3465 if (rc == -EAGAIN)
3466 goto UnixQFileInfoRetry;
3467
3468 return rc;
3469}
3470
3471int
3338CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon, 3472CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
3339 const unsigned char *searchName, 3473 const unsigned char *searchName,
3340 FILE_UNIX_BASIC_INFO *pFindData, 3474 FILE_UNIX_BASIC_INFO *pFindData,
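
The Unix-extensions variant above repeats the same TRANS2 setup; the functional differences are confined to the information level and the minimum-response check:

	/* CIFSSMBQFileInfo:     SMB_QUERY_FILE_ALL_INFO,
	 *                       rejects replies with ByteCount < 40
	 * CIFSSMBUnixQFileInfo: SMB_QUERY_FILE_UNIX_BASIC,
	 *                       rejects ByteCount < sizeof(FILE_UNIX_BASIC_INFO) */
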
@@ -3886,7 +4020,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3886 goto parse_DFS_referrals_exit; 4020 goto parse_DFS_referrals_exit;
3887 } 4021 }
3888 4022
3889 /* collect neccessary data from referrals */ 4023 /* collect necessary data from referrals */
3890 for (i = 0; i < *num_of_nodes; i++) { 4024 for (i = 0; i < *num_of_nodes; i++) {
3891 char *temp; 4025 char *temp;
3892 int max_len; 4026 int max_len;
@@ -5269,22 +5403,34 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5269 cifs_buf_release(pSMB); 5403 cifs_buf_release(pSMB);
5270 return rc; 5404 return rc;
5271} 5405}
5406
5272#ifdef CONFIG_CIFS_XATTR 5407#ifdef CONFIG_CIFS_XATTR
5408/*
5409 * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
5410 * function used by listxattr and getxattr type calls. When ea_name is set,
5411 * it looks for that attribute name and stuffs that value into the EAData
5412 * buffer. When ea_name is NULL, it stuffs a list of attribute names into the
5413 * buffer. In both cases, the return value is either the length of the
5414 * resulting data or a negative error code. If EAData is a NULL pointer then
5415 * the data isn't copied to it, but the length is returned.
5416 */
5273ssize_t 5417ssize_t
5274CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, 5418CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
5275 const unsigned char *searchName, 5419 const unsigned char *searchName, const unsigned char *ea_name,
5276 char *EAData, size_t buf_size, 5420 char *EAData, size_t buf_size,
5277 const struct nls_table *nls_codepage, int remap) 5421 const struct nls_table *nls_codepage, int remap)
5278{ 5422{
5279 /* BB assumes one setup word */ 5423 /* BB assumes one setup word */
5280 TRANSACTION2_QPI_REQ *pSMB = NULL; 5424 TRANSACTION2_QPI_REQ *pSMB = NULL;
5281 TRANSACTION2_QPI_RSP *pSMBr = NULL; 5425 TRANSACTION2_QPI_RSP *pSMBr = NULL;
5282 int rc = 0; 5426 int rc = 0;
5283 int bytes_returned; 5427 int bytes_returned;
5284 int name_len; 5428 int list_len;
5429 struct fealist *ea_response_data;
5285 struct fea *temp_fea; 5430 struct fea *temp_fea;
5286 char *temp_ptr; 5431 char *temp_ptr;
5287 __u16 params, byte_count; 5432 char *end_of_smb;
5433 __u16 params, byte_count, data_offset;
5288 5434
5289 cFYI(1, ("In Query All EAs path %s", searchName)); 5435 cFYI(1, ("In Query All EAs path %s", searchName));
5290QAllEAsRetry: 5436QAllEAsRetry:
@@ -5294,22 +5440,22 @@ QAllEAsRetry:
5294 return rc; 5440 return rc;
5295 5441
5296 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5442 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5297 name_len = 5443 list_len =
5298 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 5444 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
5299 PATH_MAX, nls_codepage, remap); 5445 PATH_MAX, nls_codepage, remap);
5300 name_len++; /* trailing null */ 5446 list_len++; /* trailing null */
5301 name_len *= 2; 5447 list_len *= 2;
5302 } else { /* BB improve the check for buffer overruns BB */ 5448 } else { /* BB improve the check for buffer overruns BB */
5303 name_len = strnlen(searchName, PATH_MAX); 5449 list_len = strnlen(searchName, PATH_MAX);
5304 name_len++; /* trailing null */ 5450 list_len++; /* trailing null */
5305 strncpy(pSMB->FileName, searchName, name_len); 5451 strncpy(pSMB->FileName, searchName, list_len);
5306 } 5452 }
5307 5453
5308 params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; 5454 params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */;
5309 pSMB->TotalDataCount = 0; 5455 pSMB->TotalDataCount = 0;
5310 pSMB->MaxParameterCount = cpu_to_le16(2); 5456 pSMB->MaxParameterCount = cpu_to_le16(2);
5311 /* BB find exact max SMB PDU from sess structure BB */ 5457 /* BB find exact max SMB PDU from sess structure BB */
5312 pSMB->MaxDataCount = cpu_to_le16(4000); 5458 pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
5313 pSMB->MaxSetupCount = 0; 5459 pSMB->MaxSetupCount = 0;
5314 pSMB->Reserved = 0; 5460 pSMB->Reserved = 0;
5315 pSMB->Flags = 0; 5461 pSMB->Flags = 0;
@@ -5334,237 +5480,117 @@ QAllEAsRetry:
5334 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5480 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5335 if (rc) { 5481 if (rc) {
5336 cFYI(1, ("Send error in QueryAllEAs = %d", rc)); 5482 cFYI(1, ("Send error in QueryAllEAs = %d", rc));
5337 } else { /* decode response */ 5483 goto QAllEAsOut;
5338 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 5484 }
5339 5485
5340 /* BB also check enough total bytes returned */ 5486
5341 /* BB we need to improve the validity checking 5487 /* BB also check enough total bytes returned */
5342 of these trans2 responses */ 5488 /* BB we need to improve the validity checking
5343 if (rc || (pSMBr->ByteCount < 4)) 5489 of these trans2 responses */
5344 rc = -EIO; /* bad smb */ 5490
5345 /* else if (pFindData){ 5491 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
5346 memcpy((char *) pFindData, 5492 if (rc || (pSMBr->ByteCount < 4)) {
5347 (char *) &pSMBr->hdr.Protocol + 5493 rc = -EIO; /* bad smb */
5348 data_offset, kl); 5494 goto QAllEAsOut;
5349 }*/ else {
5350 /* check that length of list is not more than bcc */
5351 /* check that each entry does not go beyond length
5352 of list */
5353 /* check that each element of each entry does not
5354 go beyond end of list */
5355 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5356 struct fealist *ea_response_data;
5357 rc = 0;
5358 /* validate_trans2_offsets() */
5359 /* BB check if start of smb + data_offset > &bcc+ bcc */
5360 ea_response_data = (struct fealist *)
5361 (((char *) &pSMBr->hdr.Protocol) +
5362 data_offset);
5363 name_len = le32_to_cpu(ea_response_data->list_len);
5364 cFYI(1, ("ea length %d", name_len));
5365 if (name_len <= 8) {
5366 /* returned EA size zeroed at top of function */
5367 cFYI(1, ("empty EA list returned from server"));
5368 } else {
5369 /* account for ea list len */
5370 name_len -= 4;
5371 temp_fea = ea_response_data->list;
5372 temp_ptr = (char *)temp_fea;
5373 while (name_len > 0) {
5374 __u16 value_len;
5375 name_len -= 4;
5376 temp_ptr += 4;
5377 rc += temp_fea->name_len;
5378 /* account for prefix user. and trailing null */
5379 rc = rc + 5 + 1;
5380 if (rc < (int)buf_size) {
5381 memcpy(EAData, "user.", 5);
5382 EAData += 5;
5383 memcpy(EAData, temp_ptr,
5384 temp_fea->name_len);
5385 EAData += temp_fea->name_len;
5386 /* null terminate name */
5387 *EAData = 0;
5388 EAData = EAData + 1;
5389 } else if (buf_size == 0) {
5390 /* skip copy - calc size only */
5391 } else {
5392 /* stop before overrun buffer */
5393 rc = -ERANGE;
5394 break;
5395 }
5396 name_len -= temp_fea->name_len;
5397 temp_ptr += temp_fea->name_len;
5398 /* account for trailing null */
5399 name_len--;
5400 temp_ptr++;
5401 value_len =
5402 le16_to_cpu(temp_fea->value_len);
5403 name_len -= value_len;
5404 temp_ptr += value_len;
5405 /* BB check that temp_ptr is still
5406 within the SMB BB*/
5407
5408 /* no trailing null to account for
5409 in value len */
5410 /* go on to next EA */
5411 temp_fea = (struct fea *)temp_ptr;
5412 }
5413 }
5414 }
5415 } 5495 }
5416 cifs_buf_release(pSMB);
5417 if (rc == -EAGAIN)
5418 goto QAllEAsRetry;
5419 5496
5420 return (ssize_t)rc; 5497 /* check that length of list is not more than bcc */
5421} 5498 /* check that each entry does not go beyond length
5499 of list */
5500 /* check that each element of each entry does not
5501 go beyond end of list */
5502 /* validate_trans2_offsets() */
5503 /* BB check if start of smb + data_offset > &bcc+ bcc */
5422 5504
5423ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon, 5505 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5424 const unsigned char *searchName, const unsigned char *ea_name, 5506 ea_response_data = (struct fealist *)
5425 unsigned char *ea_value, size_t buf_size, 5507 (((char *) &pSMBr->hdr.Protocol) + data_offset);
5426 const struct nls_table *nls_codepage, int remap)
5427{
5428 TRANSACTION2_QPI_REQ *pSMB = NULL;
5429 TRANSACTION2_QPI_RSP *pSMBr = NULL;
5430 int rc = 0;
5431 int bytes_returned;
5432 int name_len;
5433 struct fea *temp_fea;
5434 char *temp_ptr;
5435 __u16 params, byte_count;
5436 5508
5437 cFYI(1, ("In Query EA path %s", searchName)); 5509 list_len = le32_to_cpu(ea_response_data->list_len);
5438QEARetry: 5510 cFYI(1, ("ea length %d", list_len));
5439 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5511 if (list_len <= 8) {
5440 (void **) &pSMBr); 5512 cFYI(1, ("empty EA list returned from server"));
5441 if (rc) 5513 goto QAllEAsOut;
5442 return rc; 5514 }
5443 5515
5444 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5516 /* make sure list_len doesn't go past end of SMB */
5445 name_len = 5517 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
5446 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 5518 if ((char *)ea_response_data + list_len > end_of_smb) {
5447 PATH_MAX, nls_codepage, remap); 5519 cFYI(1, ("EA list appears to go beyond SMB"));
5448 name_len++; /* trailing null */ 5520 rc = -EIO;
5449 name_len *= 2; 5521 goto QAllEAsOut;
5450 } else { /* BB improve the check for buffer overruns BB */
5451 name_len = strnlen(searchName, PATH_MAX);
5452 name_len++; /* trailing null */
5453 strncpy(pSMB->FileName, searchName, name_len);
5454 } 5522 }
5455 5523
5456 params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; 5524 /* account for ea list len */
5457 pSMB->TotalDataCount = 0; 5525 list_len -= 4;
5458 pSMB->MaxParameterCount = cpu_to_le16(2); 5526 temp_fea = ea_response_data->list;
5459 /* BB find exact max SMB PDU from sess structure BB */ 5527 temp_ptr = (char *)temp_fea;
5460 pSMB->MaxDataCount = cpu_to_le16(4000); 5528 while (list_len > 0) {
5461 pSMB->MaxSetupCount = 0; 5529 unsigned int name_len;
5462 pSMB->Reserved = 0; 5530 __u16 value_len;
5463 pSMB->Flags = 0; 5531
5464 pSMB->Timeout = 0; 5532 list_len -= 4;
5465 pSMB->Reserved2 = 0; 5533 temp_ptr += 4;
5466 pSMB->ParameterOffset = cpu_to_le16(offsetof( 5534 /* make sure we can read name_len and value_len */
5467 struct smb_com_transaction2_qpi_req, InformationLevel) - 4); 5535 if (list_len < 0) {
5468 pSMB->DataCount = 0; 5536 cFYI(1, ("EA entry goes beyond length of list"));
5469 pSMB->DataOffset = 0; 5537 rc = -EIO;
5470 pSMB->SetupCount = 1; 5538 goto QAllEAsOut;
5471 pSMB->Reserved3 = 0; 5539 }
5472 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
5473 byte_count = params + 1 /* pad */ ;
5474 pSMB->TotalParameterCount = cpu_to_le16(params);
5475 pSMB->ParameterCount = pSMB->TotalParameterCount;
5476 pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
5477 pSMB->Reserved4 = 0;
5478 pSMB->hdr.smb_buf_length += byte_count;
5479 pSMB->ByteCount = cpu_to_le16(byte_count);
5480 5540
5481 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5541 name_len = temp_fea->name_len;
5482 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5542 value_len = le16_to_cpu(temp_fea->value_len);
5483 if (rc) { 5543 list_len -= name_len + 1 + value_len;
5484 cFYI(1, ("Send error in Query EA = %d", rc)); 5544 if (list_len < 0) {
5485 } else { /* decode response */ 5545 cFYI(1, ("EA entry goes beyond length of list"));
5486 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 5546 rc = -EIO;
5547 goto QAllEAsOut;
5548 }
5487 5549
5488 /* BB also check enough total bytes returned */ 5550 if (ea_name) {
5489 /* BB we need to improve the validity checking 5551 if (strncmp(ea_name, temp_ptr, name_len) == 0) {
5490 of these trans2 responses */ 5552 temp_ptr += name_len + 1;
5491 if (rc || (pSMBr->ByteCount < 4)) 5553 rc = value_len;
5492 rc = -EIO; /* bad smb */ 5554 if (buf_size == 0)
5493 /* else if (pFindData){ 5555 goto QAllEAsOut;
5494 memcpy((char *) pFindData, 5556 if ((size_t)value_len > buf_size) {
5495 (char *) &pSMBr->hdr.Protocol + 5557 rc = -ERANGE;
5496 data_offset, kl); 5558 goto QAllEAsOut;
5497 }*/ else {
5498 /* check that length of list is not more than bcc */
5499 /* check that each entry does not go beyond length
5500 of list */
5501 /* check that each element of each entry does not
5502 go beyond end of list */
5503 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5504 struct fealist *ea_response_data;
5505 rc = -ENODATA;
5506 /* validate_trans2_offsets() */
5507 /* BB check if start of smb + data_offset > &bcc+ bcc*/
5508 ea_response_data = (struct fealist *)
5509 (((char *) &pSMBr->hdr.Protocol) +
5510 data_offset);
5511 name_len = le32_to_cpu(ea_response_data->list_len);
5512 cFYI(1, ("ea length %d", name_len));
5513 if (name_len <= 8) {
5514 /* returned EA size zeroed at top of function */
5515 cFYI(1, ("empty EA list returned from server"));
5516 } else {
5517 /* account for ea list len */
5518 name_len -= 4;
5519 temp_fea = ea_response_data->list;
5520 temp_ptr = (char *)temp_fea;
5521 /* loop through checking if we have a matching
5522 name and then return the associated value */
5523 while (name_len > 0) {
5524 __u16 value_len;
5525 name_len -= 4;
5526 temp_ptr += 4;
5527 value_len =
5528 le16_to_cpu(temp_fea->value_len);
5529 /* BB validate that value_len falls within SMB,
5530 even though maximum for name_len is 255 */
5531 if (memcmp(temp_fea->name, ea_name,
5532 temp_fea->name_len) == 0) {
5533 /* found a match */
5534 rc = value_len;
5535 /* account for prefix user. and trailing null */
5536 if (rc <= (int)buf_size) {
5537 memcpy(ea_value,
5538 temp_fea->name+temp_fea->name_len+1,
5539 rc);
5540 /* ea values, unlike ea
5541 names, are not null
5542 terminated */
5543 } else if (buf_size == 0) {
5544 /* skip copy - calc size only */
5545 } else {
5546 /* stop before overrun buffer */
5547 rc = -ERANGE;
5548 }
5549 break;
5550 }
5551 name_len -= temp_fea->name_len;
5552 temp_ptr += temp_fea->name_len;
5553 /* account for trailing null */
5554 name_len--;
5555 temp_ptr++;
5556 name_len -= value_len;
5557 temp_ptr += value_len;
5558 /* No trailing null to account for in
5559 value_len. Go on to next EA */
5560 temp_fea = (struct fea *)temp_ptr;
5561 } 5559 }
5560 memcpy(EAData, temp_ptr, value_len);
5561 goto QAllEAsOut;
5562 }
5563 } else {
5564 /* account for prefix user. and trailing null */
5565 rc += (5 + 1 + name_len);
5566 if (rc < (int) buf_size) {
5567 memcpy(EAData, "user.", 5);
5568 EAData += 5;
5569 memcpy(EAData, temp_ptr, name_len);
5570 EAData += name_len;
5571 /* null terminate name */
5572 *EAData = 0;
5573 ++EAData;
5574 } else if (buf_size == 0) {
5575 /* skip copy - calc size only */
5576 } else {
5577 /* stop before overrun buffer */
5578 rc = -ERANGE;
5579 break;
5562 } 5580 }
5563 } 5581 }
5582 temp_ptr += name_len + 1 + value_len;
5583 temp_fea = (struct fea *)temp_ptr;
5564 } 5584 }
5585
5586 /* didn't find the named attribute */
5587 if (ea_name)
5588 rc = -ENODATA;
5589
5590QAllEAsOut:
5565 cifs_buf_release(pSMB); 5591 cifs_buf_release(pSMB);
5566 if (rc == -EAGAIN) 5592 if (rc == -EAGAIN)
5567 goto QEARetry; 5593 goto QAllEAsRetry;
5568 5594
5569 return (ssize_t)rc; 5595 return (ssize_t)rc;
5570} 5596}
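
With CIFSSMBQueryEA() removed, both xattr entry points funnel through the one CIFSSMBQAllEAs() call whose contract is spelled out in the comment block above. A hedged caller-side sketch (buffer names are illustrative):

	ssize_t len;

	/* listxattr mode: ea_name == NULL returns "user."-prefixed,
	 * NUL-separated attribute names; buf_size == 0 just reports the
	 * length needed */
	len = CIFSSMBQAllEAs(xid, tcon, full_path, NULL,
			     list_buf, list_size, nls_codepage, remap);

	/* getxattr mode: ea_name set copies that attribute's value and
	 * returns its length; -ENODATA if absent, -ERANGE if the buffer
	 * is too small */
	len = CIFSSMBQAllEAs(xid, tcon, full_path, "SETFILEBITS",
			     value_buf, value_size, nls_codepage, remap);
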
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 63ea83ff687f..d9566bf8f917 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,6 +23,7 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/wait.h> 25#include <linux/wait.h>
26#include <linux/slab.h>
26#include <linux/pagemap.h> 27#include <linux/pagemap.h>
27#include <linux/ctype.h> 28#include <linux/ctype.h>
28#include <linux/utsname.h> 29#include <linux/utsname.h>
@@ -98,7 +99,7 @@ struct smb_vol {
98 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ 99 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
99 unsigned int rsize; 100 unsigned int rsize;
100 unsigned int wsize; 101 unsigned int wsize;
101 unsigned int sockopt; 102 bool sockopt_tcp_nodelay:1;
102 unsigned short int port; 103 unsigned short int port;
103 char *prepath; 104 char *prepath;
104}; 105};
@@ -1142,9 +1143,11 @@ cifs_parse_mount_options(char *options, const char *devname,
1142 simple_strtoul(value, &value, 0); 1143 simple_strtoul(value, &value, 0);
1143 } 1144 }
1144 } else if (strnicmp(data, "sockopt", 5) == 0) { 1145 } else if (strnicmp(data, "sockopt", 5) == 0) {
1145 if (value && *value) { 1146 if (!value || !*value) {
1146 vol->sockopt = 1147 cERROR(1, ("no socket option specified"));
1147 simple_strtoul(value, &value, 0); 1148 continue;
1149 } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
1150 vol->sockopt_tcp_nodelay = 1;
1148 } 1151 }
1149 } else if (strnicmp(data, "netbiosname", 4) == 0) { 1152 } else if (strnicmp(data, "netbiosname", 4) == 0) {
1150 if (!value || !*value || (*value == ' ')) { 1153 if (!value || !*value || (*value == ' ')) {
@@ -1514,6 +1517,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1514 1517
1515 tcp_ses->noblocksnd = volume_info->noblocksnd; 1518 tcp_ses->noblocksnd = volume_info->noblocksnd;
1516 tcp_ses->noautotune = volume_info->noautotune; 1519 tcp_ses->noautotune = volume_info->noautotune;
1520 tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
1517 atomic_set(&tcp_ses->inFlight, 0); 1521 atomic_set(&tcp_ses->inFlight, 0);
1518 init_waitqueue_head(&tcp_ses->response_q); 1522 init_waitqueue_head(&tcp_ses->response_q);
1519 init_waitqueue_head(&tcp_ses->request_q); 1523 init_waitqueue_head(&tcp_ses->request_q);
@@ -1764,6 +1768,7 @@ static int
1764ipv4_connect(struct TCP_Server_Info *server) 1768ipv4_connect(struct TCP_Server_Info *server)
1765{ 1769{
1766 int rc = 0; 1770 int rc = 0;
1771 int val;
1767 bool connected = false; 1772 bool connected = false;
1768 __be16 orig_port = 0; 1773 __be16 orig_port = 0;
1769 struct socket *socket = server->ssocket; 1774 struct socket *socket = server->ssocket;
@@ -1845,6 +1850,14 @@ ipv4_connect(struct TCP_Server_Info *server)
1845 socket->sk->sk_rcvbuf = 140 * 1024; 1850 socket->sk->sk_rcvbuf = 140 * 1024;
1846 } 1851 }
1847 1852
1853 if (server->tcp_nodelay) {
1854 val = 1;
1855 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
1856 (char *)&val, sizeof(val));
1857 if (rc)
1858 cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
1859 }
1860
1848 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx", 1861 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
1849 socket->sk->sk_sndbuf, 1862 socket->sk->sk_sndbuf,
1850 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo)); 1863 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo));
@@ -1916,6 +1929,7 @@ static int
1916ipv6_connect(struct TCP_Server_Info *server) 1929ipv6_connect(struct TCP_Server_Info *server)
1917{ 1930{
1918 int rc = 0; 1931 int rc = 0;
1932 int val;
1919 bool connected = false; 1933 bool connected = false;
1920 __be16 orig_port = 0; 1934 __be16 orig_port = 0;
1921 struct socket *socket = server->ssocket; 1935 struct socket *socket = server->ssocket;
@@ -1987,6 +2001,15 @@ ipv6_connect(struct TCP_Server_Info *server)
1987 */ 2001 */
1988 socket->sk->sk_rcvtimeo = 7 * HZ; 2002 socket->sk->sk_rcvtimeo = 7 * HZ;
1989 socket->sk->sk_sndtimeo = 5 * HZ; 2003 socket->sk->sk_sndtimeo = 5 * HZ;
2004
2005 if (server->tcp_nodelay) {
2006 val = 1;
2007 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2008 (char *)&val, sizeof(val));
2009 if (rc)
2010 cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
2011 }
2012
1990 server->ssocket = socket; 2013 server->ssocket = socket;
1991 2014
1992 return rc; 2015 return rc;
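
Both address families now honor the per-server tcp_nodelay bit set from the parsed mount option (e.g. -o sockopt=TCP_NODELAY); a kernel_setsockopt() failure is only logged via cFYI, presumably because falling back to Nagle-enabled behavior is harmless.
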
@@ -2287,12 +2310,12 @@ int
2287cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, 2310cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2288 char *mount_data_global, const char *devname) 2311 char *mount_data_global, const char *devname)
2289{ 2312{
2290 int rc = 0; 2313 int rc;
2291 int xid; 2314 int xid;
2292 struct smb_vol *volume_info; 2315 struct smb_vol *volume_info;
2293 struct cifsSesInfo *pSesInfo = NULL; 2316 struct cifsSesInfo *pSesInfo;
2294 struct cifsTconInfo *tcon = NULL; 2317 struct cifsTconInfo *tcon;
2295 struct TCP_Server_Info *srvTcp = NULL; 2318 struct TCP_Server_Info *srvTcp;
2296 char *full_path; 2319 char *full_path;
2297 char *mount_data = mount_data_global; 2320 char *mount_data = mount_data_global;
2298#ifdef CONFIG_CIFS_DFS_UPCALL 2321#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -2301,6 +2324,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2301 int referral_walks_count = 0; 2324 int referral_walks_count = 0;
2302try_mount_again: 2325try_mount_again:
2303#endif 2326#endif
2327 rc = 0;
2328 tcon = NULL;
2329 pSesInfo = NULL;
2330 srvTcp = NULL;
2304 full_path = NULL; 2331 full_path = NULL;
2305 2332
2306 xid = GetXid(); 2333 xid = GetXid();
@@ -2362,13 +2389,13 @@ try_mount_again:
2362 */ 2389 */
2363 cifs_put_tcp_session(srvTcp); 2390 cifs_put_tcp_session(srvTcp);
2364 2391
2365 down(&pSesInfo->sesSem); 2392 mutex_lock(&pSesInfo->session_mutex);
2366 if (pSesInfo->need_reconnect) { 2393 if (pSesInfo->need_reconnect) {
2367 cFYI(1, ("Session needs reconnect")); 2394 cFYI(1, ("Session needs reconnect"));
2368 rc = cifs_setup_session(xid, pSesInfo, 2395 rc = cifs_setup_session(xid, pSesInfo,
2369 cifs_sb->local_nls); 2396 cifs_sb->local_nls);
2370 } 2397 }
2371 up(&pSesInfo->sesSem); 2398 mutex_unlock(&pSesInfo->session_mutex);
2372 } else if (!rc) { 2399 } else if (!rc) {
2373 cFYI(1, ("Existing smb sess not found")); 2400 cFYI(1, ("Existing smb sess not found"));
2374 pSesInfo = sesInfoAlloc(); 2401 pSesInfo = sesInfoAlloc();
@@ -2411,12 +2438,12 @@ try_mount_again:
2411 } 2438 }
2412 pSesInfo->linux_uid = volume_info->linux_uid; 2439 pSesInfo->linux_uid = volume_info->linux_uid;
2413 pSesInfo->overrideSecFlg = volume_info->secFlg; 2440 pSesInfo->overrideSecFlg = volume_info->secFlg;
2414 down(&pSesInfo->sesSem); 2441 mutex_lock(&pSesInfo->session_mutex);
2415 2442
2416 /* BB FIXME need to pass vol->secFlgs BB */ 2443 /* BB FIXME need to pass vol->secFlgs BB */
2417 rc = cifs_setup_session(xid, pSesInfo, 2444 rc = cifs_setup_session(xid, pSesInfo,
2418 cifs_sb->local_nls); 2445 cifs_sb->local_nls);
2419 up(&pSesInfo->sesSem); 2446 mutex_unlock(&pSesInfo->session_mutex);
2420 } 2447 }
2421 2448
2422 /* search for existing tcon to this server share */ 2449 /* search for existing tcon to this server share */
@@ -2597,6 +2624,7 @@ remote_path_check:
2597 2624
2598 cleanup_volume_info(&volume_info); 2625 cleanup_volume_info(&volume_info);
2599 referral_walks_count++; 2626 referral_walks_count++;
2627 FreeXid(xid);
2600 goto try_mount_again; 2628 goto try_mount_again;
2601 } 2629 }
2602#else /* No DFS support, return error on mount */ 2630#else /* No DFS support, return error on mount */
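
A small leak fix rides along in cifs_mount(): each DFS referral walk jumps back to try_mount_again and calls GetXid() afresh, so the added FreeXid(xid) before the goto releases the xid taken on the previous pass. The rc/tcon/pSesInfo/srvTcp reinitialization moved under the label serves the same retry path, keeping stale pointers from a failed pass out of the next one.
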
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 1f42f772865a..e9f7ecc2714b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -214,7 +214,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
214 posix_flags |= SMB_O_EXCL; 214 posix_flags |= SMB_O_EXCL;
215 if (oflags & O_TRUNC) 215 if (oflags & O_TRUNC)
216 posix_flags |= SMB_O_TRUNC; 216 posix_flags |= SMB_O_TRUNC;
217 if (oflags & O_SYNC) 217 /* be safe and imply O_SYNC for O_DSYNC */
218 if (oflags & O_DSYNC)
218 posix_flags |= SMB_O_SYNC; 219 posix_flags |= SMB_O_SYNC;
219 if (oflags & O_DIRECTORY) 220 if (oflags & O_DIRECTORY)
220 posix_flags |= SMB_O_DIRECTORY; 221 posix_flags |= SMB_O_DIRECTORY;
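
The O_DSYNC test above (and the matching cifs_posix_convert_flags() hunk in fs/cifs/file.c below) tracks the 2.6.33 VFS rework in which O_SYNC became a composite flag:

	/* post-2.6.33 <asm-generic/fcntl.h>, shown for reference:
	 *	#define O_SYNC	(__O_SYNC | O_DSYNC)
	 * so (oflags & O_DSYNC) is also true for O_SYNC opens, and the
	 * "be safe" comment means a data-sync-only request still maps to
	 * SMB_O_SYNC, SMB having no weaker equivalent */
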
@@ -738,7 +739,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
738 int isValid = 1; 739 int isValid = 1;
739 740
740 if (direntry->d_inode) { 741 if (direntry->d_inode) {
741 if (cifs_revalidate(direntry)) 742 if (cifs_revalidate_dentry(direntry))
742 return 0; 743 return 0;
743 } else { 744 } else {
744 cFYI(1, ("neg dentry 0x%p name = %s", 745 cFYI(1, ("neg dentry 0x%p name = %s",
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..6f8a0e3fb25b 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,6 +23,7 @@
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */ 24 */
25 25
26#include <linux/slab.h>
26#include <keys/user-type.h> 27#include <keys/user-type.h>
27#include "dns_resolve.h" 28#include "dns_resolve.h"
28#include "cifsglob.h" 29#include "cifsglob.h"
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 75949d6a5f1b..6177f7cca16a 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -24,7 +24,7 @@
24 */ 24 */
25 25
26 /* 26 /*
27 * See Documentation/filesystems/Exporting 27 * See Documentation/filesystems/nfs/Exporting
28 * and examples in fs/exportfs 28 * and examples in fs/exportfs
29 * 29 *
30 * Since cifs is a network file system, an "fsid" must be included for 30 * Since cifs is a network file system, an "fsid" must be included for
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 429337eb7afe..058b390d3da8 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -31,6 +31,7 @@
31#include <linux/task_io_accounting_ops.h> 31#include <linux/task_io_accounting_ops.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h> 33#include <linux/mount.h>
34#include <linux/slab.h>
34#include <asm/div64.h> 35#include <asm/div64.h>
35#include "cifsfs.h" 36#include "cifsfs.h"
36#include "cifspdu.h" 37#include "cifspdu.h"
@@ -76,8 +77,10 @@ static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
76 reopening a file. They had their effect on the original open */ 77 reopening a file. They had their effect on the original open */
77 if (flags & O_APPEND) 78 if (flags & O_APPEND)
78 posix_flags |= (fmode_t)O_APPEND; 79 posix_flags |= (fmode_t)O_APPEND;
79 if (flags & O_SYNC) 80 if (flags & O_DSYNC)
80 posix_flags |= (fmode_t)O_SYNC; 81 posix_flags |= (fmode_t)O_DSYNC;
82 if (flags & __O_SYNC)
83 posix_flags |= (fmode_t)__O_SYNC;
81 if (flags & O_DIRECTORY) 84 if (flags & O_DIRECTORY)
82 posix_flags |= (fmode_t)O_DIRECTORY; 85 posix_flags |= (fmode_t)O_DIRECTORY;
83 if (flags & O_NOFOLLOW) 86 if (flags & O_NOFOLLOW)
@@ -217,8 +220,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
217 cFYI(1, ("inode unchanged on server")); 220 cFYI(1, ("inode unchanged on server"));
218 } else { 221 } else {
219 if (file->f_path.dentry->d_inode->i_mapping) { 222 if (file->f_path.dentry->d_inode->i_mapping) {
220 /* BB no need to lock inode until after invalidate 223 /* BB no need to lock inode until after invalidate
221 since namei code should already have it locked? */ 224 since namei code should already have it locked? */
222 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 225 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
223 if (rc != 0) 226 if (rc != 0)
224 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 227 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
@@ -1888,11 +1891,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1888 1891
1889int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) 1892int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1890{ 1893{
1891 struct dentry *dentry = file->f_path.dentry;
1892 int rc, xid; 1894 int rc, xid;
1893 1895
1894 xid = GetXid(); 1896 xid = GetXid();
1895 rc = cifs_revalidate(dentry); 1897 rc = cifs_revalidate_file(file);
1896 if (rc) { 1898 if (rc) {
1897 cFYI(1, ("Validation prior to mmap failed, error=%d", rc)); 1899 cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
1898 FreeXid(xid); 1900 FreeXid(xid);
@@ -2287,9 +2289,9 @@ cifs_oplock_break(struct slow_work *work)
2287 if (inode && S_ISREG(inode->i_mode)) { 2289 if (inode && S_ISREG(inode->i_mode)) {
2288#ifdef CONFIG_CIFS_EXPERIMENTAL 2290#ifdef CONFIG_CIFS_EXPERIMENTAL
2289 if (cinode->clientCanCacheAll == 0) 2291 if (cinode->clientCanCacheAll == 0)
2290 break_lease(inode, FMODE_READ); 2292 break_lease(inode, O_RDONLY);
2291 else if (cinode->clientCanCacheRead == 0) 2293 else if (cinode->clientCanCacheRead == 0)
2292 break_lease(inode, FMODE_WRITE); 2294 break_lease(inode, O_WRONLY);
2293#endif 2295#endif
2294 rc = filemap_fdatawrite(inode->i_mapping); 2296 rc = filemap_fdatawrite(inode->i_mapping);
2295 if (cinode->clientCanCacheRead == 0) { 2297 if (cinode->clientCanCacheRead == 0) {
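
The break_lease() hunk above follows the VFS switch of that helper from FMODE_* to O_*-style arguments: O_WRONLY asks for the break appropriate to an incoming writer, O_RDONLY for a reader. The old FMODE_* values only coincidentally resembled the expected bits, so the oplock-break path risked signalling the wrong lease-break class; note the call remains confined to CONFIG_CIFS_EXPERIMENTAL.
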
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cababd8a52df..35ec11716213 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <asm/div64.h> 25#include <asm/div64.h>
25#include "cifsfs.h" 26#include "cifsfs.h"
@@ -77,6 +78,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
77 } 78 }
78} 79}
79 80
81/* check inode attributes against fattr. If they don't match, tag the
82 * inode for cache invalidation
83 */
84static void
85cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
86{
87 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
88
89 cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid));
90
91 if (inode->i_state & I_NEW) {
92 cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid));
93 return;
94 }
95
96 /* don't bother with revalidation if we have an oplock */
97 if (cifs_i->clientCanCacheRead) {
98 cFYI(1, ("%s: inode %llu is oplocked", __func__,
99 cifs_i->uniqueid));
100 return;
101 }
102
103 /* revalidate if mtime or size have changed */
104 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
105 cifs_i->server_eof == fattr->cf_eof) {
106 cFYI(1, ("%s: inode %llu is unchanged", __func__,
107 cifs_i->uniqueid));
108 return;
109 }
110
111 cFYI(1, ("%s: invalidating inode %llu mapping", __func__,
112 cifs_i->uniqueid));
113 cifs_i->invalid_mapping = true;
114}
115
80/* populate an inode with info from a cifs_fattr struct */ 116/* populate an inode with info from a cifs_fattr struct */
81void 117void
82cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) 118cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
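
cifs_revalidate_cache() above is the producer half of the new invalid_mapping machinery introduced in cifsglob.h; a hedged map of how the pieces in this patch fit together:

	/* cifs_fattr_to_inode()
	 *   -> cifs_revalidate_cache()   compares cached mtime/server_eof
	 *                                against the fresh fattr; on mismatch
	 *                                sets cifs_i->invalid_mapping (skipped
	 *                                for I_NEW inodes and read-oplocked
	 *                                files)
	 * cifs_revalidate_file() / cifs_revalidate_dentry()
	 *   -> cifs_invalidate_mapping() writes back dirty pages, stashes any
	 *                                error in write_behind_rc, then calls
	 *                                invalidate_remote_inode() */
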
@@ -85,6 +121,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
85 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 121 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
86 unsigned long oldtime = cifs_i->time; 122 unsigned long oldtime = cifs_i->time;
87 123
124 cifs_revalidate_cache(inode, fattr);
125
88 inode->i_atime = fattr->cf_atime; 126 inode->i_atime = fattr->cf_atime;
89 inode->i_mtime = fattr->cf_mtime; 127 inode->i_mtime = fattr->cf_mtime;
90 inode->i_ctime = fattr->cf_ctime; 128 inode->i_ctime = fattr->cf_ctime;
@@ -111,6 +149,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
111 149
112 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; 150 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
113 151
152 cifs_i->server_eof = fattr->cf_eof;
114 /* 153 /*
115 * Can't safely change the file size here if the client is writing to 154 * Can't safely change the file size here if the client is writing to
116 * it due to potential races. 155 * it due to potential races.
@@ -230,6 +269,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
230 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; 269 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
231} 270}
232 271
272int cifs_get_file_info_unix(struct file *filp)
273{
274 int rc;
275 int xid;
276 FILE_UNIX_BASIC_INFO find_data;
277 struct cifs_fattr fattr;
278 struct inode *inode = filp->f_path.dentry->d_inode;
279 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
280 struct cifsTconInfo *tcon = cifs_sb->tcon;
281 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
282
283 xid = GetXid();
284 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
285 if (!rc) {
286 cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
287 } else if (rc == -EREMOTE) {
288 cifs_create_dfs_fattr(&fattr, inode->i_sb);
289 rc = 0;
290 }
291
292 cifs_fattr_to_inode(inode, &fattr);
293 FreeXid(xid);
294 return rc;
295}
296
233int cifs_get_inode_info_unix(struct inode **pinode, 297int cifs_get_inode_info_unix(struct inode **pinode,
234 const unsigned char *full_path, 298 const unsigned char *full_path,
235 struct super_block *sb, int xid) 299 struct super_block *sb, int xid)
@@ -366,7 +430,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
366 char ea_value[4]; 430 char ea_value[4];
367 __u32 mode; 431 __u32 mode;
368 432
369 rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS", 433 rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
370 ea_value, 4 /* size of buf */, cifs_sb->local_nls, 434 ea_value, 4 /* size of buf */, cifs_sb->local_nls,
371 cifs_sb->mnt_cifs_flags & 435 cifs_sb->mnt_cifs_flags &
372 CIFS_MOUNT_MAP_SPECIAL_CHR); 436 CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -431,6 +495,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
431 fattr->cf_gid = cifs_sb->mnt_gid; 495 fattr->cf_gid = cifs_sb->mnt_gid;
432} 496}
433 497
498int cifs_get_file_info(struct file *filp)
499{
500 int rc;
501 int xid;
502 FILE_ALL_INFO find_data;
503 struct cifs_fattr fattr;
504 struct inode *inode = filp->f_path.dentry->d_inode;
505 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
506 struct cifsTconInfo *tcon = cifs_sb->tcon;
507 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
508
509 xid = GetXid();
510 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
511 if (rc == -EOPNOTSUPP || rc == -EINVAL) {
512 /*
513 * FIXME: legacy server -- fall back to path-based call?
514 * for now, just skip revalidating and mark inode for
515 * immediate reval.
516 */
517 rc = 0;
518 CIFS_I(inode)->time = 0;
519 goto cgfi_exit;
520 } else if (rc == -EREMOTE) {
521 cifs_create_dfs_fattr(&fattr, inode->i_sb);
522 rc = 0;
523 } else if (rc)
524 goto cgfi_exit;
525
526 /*
527 * don't bother with SFU junk here -- just mark inode as needing
528 * revalidation.
529 */
530 cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
531 fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
532 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
533 cifs_fattr_to_inode(inode, &fattr);
534cgfi_exit:
535 FreeXid(xid);
536 return rc;
537}
538
434int cifs_get_inode_info(struct inode **pinode, 539int cifs_get_inode_info(struct inode **pinode,
435 const unsigned char *full_path, FILE_ALL_INFO *pfindData, 540 const unsigned char *full_path, FILE_ALL_INFO *pfindData,
436 struct super_block *sb, int xid, const __u16 *pfid) 541 struct super_block *sb, int xid, const __u16 *pfid)
@@ -914,8 +1019,8 @@ undo_setattr:
914/* 1019/*
915 * If dentry->d_inode is null (usually meaning the cached dentry 1020 * If dentry->d_inode is null (usually meaning the cached dentry
916 * is a negative dentry) then we would attempt a standard SMB delete, but 1021 * is a negative dentry) then we would attempt a standard SMB delete, but
917 * if that fails we can not attempt the fall back mechanisms on EACESS 1022 * if that fails we can not attempt the fall back mechanisms on EACCES
918 * but will return the EACESS to the caller. Note that the VFS does not call 1023 * but will return the EACCES to the caller. Note that the VFS does not call
919 * unlink on negative dentries currently. 1024 * unlink on negative dentries currently.
920 */ 1025 */
921int cifs_unlink(struct inode *dir, struct dentry *dentry) 1026int cifs_unlink(struct inode *dir, struct dentry *dentry)
@@ -1388,135 +1493,103 @@ cifs_rename_exit:
1388 return rc; 1493 return rc;
1389} 1494}
1390 1495
1391int cifs_revalidate(struct dentry *direntry) 1496static bool
1497cifs_inode_needs_reval(struct inode *inode)
1392{ 1498{
1393 int xid; 1499 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1394 int rc = 0, wbrc = 0;
1395 char *full_path;
1396 struct cifs_sb_info *cifs_sb;
1397 struct cifsInodeInfo *cifsInode;
1398 loff_t local_size;
1399 struct timespec local_mtime;
1400 bool invalidate_inode = false;
1401 1500
1402 if (direntry->d_inode == NULL) 1501 if (cifs_i->clientCanCacheRead)
1403 return -ENOENT; 1502 return false;
1404 1503
1405 cifsInode = CIFS_I(direntry->d_inode); 1504 if (!lookupCacheEnabled)
1505 return true;
1406 1506
1407 if (cifsInode == NULL) 1507 if (cifs_i->time == 0)
1408 return -ENOENT; 1508 return true;
1409 1509
1410 /* no sense revalidating inode info on file that no one can write */ 1510 /* FIXME: the actimeo should be tunable */
1411 if (CIFS_I(direntry->d_inode)->clientCanCacheRead) 1511 if (time_after_eq(jiffies, cifs_i->time + HZ))
1412 return rc; 1512 return true;
1513
1514 return false;
1515}
1516
1517/* check invalid_mapping flag and zap the cache if it's set */
1518static void
1519cifs_invalidate_mapping(struct inode *inode)
1520{
1521 int rc;
1522 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1523
1524 cifs_i->invalid_mapping = false;
1525
1526 /* write back any cached data */
1527 if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
1528 rc = filemap_write_and_wait(inode->i_mapping);
1529 if (rc)
1530 cifs_i->write_behind_rc = rc;
1531 }
1532 invalidate_remote_inode(inode);
1533}
1534
1535int cifs_revalidate_file(struct file *filp)
1536{
1537 int rc = 0;
1538 struct inode *inode = filp->f_path.dentry->d_inode;
1539
1540 if (!cifs_inode_needs_reval(inode))
1541 goto check_inval;
1542
1543 if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
1544 rc = cifs_get_file_info_unix(filp);
1545 else
1546 rc = cifs_get_file_info(filp);
1547
1548check_inval:
1549 if (CIFS_I(inode)->invalid_mapping)
1550 cifs_invalidate_mapping(inode);
1551
1552 return rc;
1553}
1554
1555/* revalidate a dentry's inode attributes */
1556int cifs_revalidate_dentry(struct dentry *dentry)
1557{
1558 int xid;
1559 int rc = 0;
1560 char *full_path = NULL;
1561 struct inode *inode = dentry->d_inode;
1562 struct super_block *sb = dentry->d_sb;
1563
1564 if (inode == NULL)
1565 return -ENOENT;
1413 1566
1414 xid = GetXid(); 1567 xid = GetXid();
1415 1568
1416 cifs_sb = CIFS_SB(direntry->d_sb); 1569 if (!cifs_inode_needs_reval(inode))
1570 goto check_inval;
1417 1571
1418 /* can not safely grab the rename sem here if rename calls revalidate 1572 /* can not safely grab the rename sem here if rename calls revalidate
1419 since that would deadlock */ 1573 since that would deadlock */
1420 full_path = build_path_from_dentry(direntry); 1574 full_path = build_path_from_dentry(dentry);
1421 if (full_path == NULL) { 1575 if (full_path == NULL) {
1422 rc = -ENOMEM; 1576 rc = -ENOMEM;
1423 FreeXid(xid); 1577 goto check_inval;
1424 return rc;
1425 }
1426 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1427 "jiffies %ld", full_path, direntry->d_inode,
1428 direntry->d_inode->i_count.counter, direntry,
1429 direntry->d_time, jiffies));
1430
1431 if (cifsInode->time == 0) {
1432 /* was set to zero previously to force revalidate */
1433 } else if (time_before(jiffies, cifsInode->time + HZ) &&
1434 lookupCacheEnabled) {
1435 if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
1436 (direntry->d_inode->i_nlink == 1)) {
1437 kfree(full_path);
1438 FreeXid(xid);
1439 return rc;
1440 } else {
1441 cFYI(1, ("Have to revalidate file due to hardlinks"));
1442 }
1443 }
1444
1445 /* save mtime and size */
1446 local_mtime = direntry->d_inode->i_mtime;
1447 local_size = direntry->d_inode->i_size;
1448
1449 if (cifs_sb->tcon->unix_ext) {
1450 rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
1451 direntry->d_sb, xid);
1452 if (rc) {
1453 cFYI(1, ("error on getting revalidate info %d", rc));
1454/* if (rc != -ENOENT)
1455 rc = 0; */ /* BB should we cache info on
1456 certain errors? */
1457 }
1458 } else {
1459 rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
1460 direntry->d_sb, xid, NULL);
1461 if (rc) {
1462 cFYI(1, ("error on getting revalidate info %d", rc));
1463/* if (rc != -ENOENT)
1464 rc = 0; */ /* BB should we cache info on
1465 certain errors? */
1466 }
1467 } 1578 }
1468 /* should we remap certain errors, access denied?, to zero */
1469 1579
1470 /* if not oplocked, we invalidate inode pages if mtime or file size 1580 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1471 had changed on server */ 1581 "jiffies %ld", full_path, inode, inode->i_count.counter,
1582 dentry, dentry->d_time, jiffies));
1472 1583
1473 if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) && 1584 if (CIFS_SB(sb)->tcon->unix_ext)
1474 (local_size == direntry->d_inode->i_size)) { 1585 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
1475 cFYI(1, ("cifs_revalidate - inode unchanged")); 1586 else
1476 } else { 1587 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
1477 /* file may have changed on server */ 1588 xid, NULL);
1478 if (cifsInode->clientCanCacheRead) {
1479 /* no need to invalidate inode pages since we were the
1480 only ones who could have modified the file and the
1481 server copy is staler than ours */
1482 } else {
1483 invalidate_inode = true;
1484 }
1485 }
1486 1589
1487 /* can not grab this sem since kernel filesys locking documentation 1590check_inval:
1488 indicates i_mutex may be taken by the kernel on lookup and rename 1591 if (CIFS_I(inode)->invalid_mapping)
1489 which could deadlock if we grab the i_mutex here as well */ 1592 cifs_invalidate_mapping(inode);
1490/* mutex_lock(&direntry->d_inode->i_mutex);*/
1491 /* need to write out dirty pages here */
1492 if (direntry->d_inode->i_mapping) {
1493 /* do we need to lock inode until after invalidate completes
1494 below? */
1495 wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
1496 if (wbrc)
1497 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1498 }
1499 if (invalidate_inode) {
1500 /* shrink_dcache not necessary now that cifs dentry ops
1501 are exported for negative dentries */
1502/* if (S_ISDIR(direntry->d_inode->i_mode))
1503 shrink_dcache_parent(direntry); */
1504 if (S_ISREG(direntry->d_inode->i_mode)) {
1505 if (direntry->d_inode->i_mapping) {
1506 wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
1507 if (wbrc)
1508 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1509 }
1510 /* may eventually have to do this for open files too */
1511 if (list_empty(&(cifsInode->openFileList))) {
1512 /* changed on server - flush read ahead pages */
1513 cFYI(1, ("Invalidating read ahead data on "
1514 "closed file"));
1515 invalidate_remote_inode(direntry->d_inode);
1516 }
1517 }
1518 }
1519/* mutex_unlock(&direntry->d_inode->i_mutex); */
1520 1593
1521 kfree(full_path); 1594 kfree(full_path);
1522 FreeXid(xid); 1595 FreeXid(xid);
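
The rewrite above splits the old monolithic cifs_revalidate() into a cheap "do we need to ask the server at all?" predicate plus a separate cache-zapping step. A runnable userspace analogue of cifs_inode_needs_reval() — the one-second timeout stands in for jiffies + HZ, and the field names are illustrative:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct cached_attrs {
        bool   can_cache_read;  /* cf. clientCanCacheRead (oplock held) */
        time_t fetched_at;      /* cf. cifs_i->time; 0 = force reval */
};

static bool needs_reval(const struct cached_attrs *a)
{
        if (a->can_cache_read)
                return false;           /* oplock: server can't change it */
        if (a->fetched_at == 0)
                return true;            /* explicitly poisoned */
        return time(NULL) >= a->fetched_at + 1;  /* 1s attribute timeout */
}

int main(void)
{
        struct cached_attrs a = { .can_cache_read = false,
                                  .fetched_at = time(NULL) };
        printf("fresh:    %d\n", needs_reval(&a));  /* 0 */
        a.fetched_at = 0;
        printf("poisoned: %d\n", needs_reval(&a));  /* 1 */
        return 0;
}
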
@@ -1526,7 +1599,7 @@ int cifs_revalidate(struct dentry *direntry)
1526int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1599int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1527 struct kstat *stat) 1600 struct kstat *stat)
1528{ 1601{
1529 int err = cifs_revalidate(dentry); 1602 int err = cifs_revalidate_dentry(dentry);
1530 if (!err) { 1603 if (!err) {
1531 generic_fillattr(dentry->d_inode, stat); 1604 generic_fillattr(dentry->d_inode, stat);
1532 stat->blksize = CIFS_MAX_MSGSIZE; 1605 stat->blksize = CIFS_MAX_MSGSIZE;
@@ -1762,8 +1835,18 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1762 CIFS_MOUNT_MAP_SPECIAL_CHR); 1835 CIFS_MOUNT_MAP_SPECIAL_CHR);
1763 } 1836 }
1764 1837
1765 if (!rc) 1838 if (!rc) {
1766 rc = inode_setattr(inode, attrs); 1839 rc = inode_setattr(inode, attrs);
1840
1841 /* force revalidate when any of these times are set since some
1842 of the fs types (eg ext3, fat) do not have fine enough
1843	   time granularity to match protocol, and we do not have
1844	   a way (yet) to query the server fs's time granularity (and
1845 whether it rounds times down).
1846 */
1847 if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME)))
1848 cifsInode->time = 0;
1849 }
1767out: 1850out:
1768 kfree(args); 1851 kfree(args);
1769 kfree(full_path); 1852 kfree(full_path);
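
The comment added in the hunk above is the whole story: a coarse-granularity server filesystem may round the timestamp we just set, so trusting the cached value would make the file look unchanged when the server now reports something else. A tiny illustrative demonstration of the mismatch, using FAT-style 2-second rounding as the assumed example:

#include <stdio.h>
#include <time.h>

int main(void)
{
        time_t requested = 1001;            /* odd second requested by client */
        time_t stored = requested & ~1L;    /* server rounds down to 1000 */
        printf("requested=%ld stored=%ld match=%d\n",
               (long)requested, (long)stored, requested == stored);
        return 0;                           /* match=0: cache must be refetched */
}
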
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fc1e0487eaee..c1a9d4236a8c 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/namei.h> 24#include <linux/namei.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d27d4ec6579b..d1474996a812 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -79,7 +79,7 @@ sesInfoAlloc(void)
79 ++ret_buf->ses_count; 79 ++ret_buf->ses_count;
80 INIT_LIST_HEAD(&ret_buf->smb_ses_list); 80 INIT_LIST_HEAD(&ret_buf->smb_ses_list);
81 INIT_LIST_HEAD(&ret_buf->tcon_list); 81 INIT_LIST_HEAD(&ret_buf->tcon_list);
82 init_MUTEX(&ret_buf->sesSem); 82 mutex_init(&ret_buf->session_mutex);
83 } 83 }
84 return ret_buf; 84 return ret_buf;
85} 85}
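
This one-liner is part of the tree-wide move away from binary semaphores used as mutexes. A kernel-style sketch of the conversion pattern (not buildable outside the tree; only sesSem/session_mutex appear in this hunk, the down/up call sites are assumed):

        /* before: a semaphore initialized to 1 acting as a mutex */
        struct semaphore sesSem;
        init_MUTEX(&ses->sesSem);
        down(&ses->sesSem);
        /* ... critical section ... */
        up(&ses->sesSem);

        /* after: a real mutex, visible to lockdep's ownership checks */
        struct mutex session_mutex;
        mutex_init(&ses->session_mutex);
        mutex_lock(&ses->session_mutex);
        /* ... critical section ... */
        mutex_unlock(&ses->session_mutex);
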
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f84062f9a985..18e0bc1fb593 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -22,6 +22,7 @@
22 */ 22 */
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/pagemap.h> 24#include <linux/pagemap.h>
25#include <linux/slab.h>
25#include <linux/stat.h> 26#include <linux/stat.h>
26#include "cifspdu.h" 27#include "cifspdu.h"
27#include "cifsglob.h" 28#include "cifsglob.h"
@@ -77,6 +78,11 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
77 78
78 cFYI(1, ("For %s", name->name)); 79 cFYI(1, ("For %s", name->name));
79 80
81 if (parent->d_op && parent->d_op->d_hash)
82 parent->d_op->d_hash(parent, name);
83 else
84 name->hash = full_name_hash(name->name, name->len);
85
80 dentry = d_lookup(parent, name); 86 dentry = d_lookup(parent, name);
81 if (dentry) { 87 if (dentry) {
82 /* FIXME: check for inode number changes? */ 88 /* FIXME: check for inode number changes? */
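
The point of hashing through parent->d_op->d_hash before d_lookup() is that a mount with custom dentry ops (e.g. a case-insensitive CIFS mount) hashes names through a normalizing function, so a qstr hashed with plain full_name_hash() would land in the wrong bucket. A runnable userspace analogue of the idea — djb2 here is purely illustrative, not the kernel's hash:

#include <ctype.h>
#include <stdio.h>

/* A case-insensitive "d_hash" lowercases before hashing so FOO and
 * foo land in the same hash bucket, as a nocase mount requires. */
static unsigned long djb2(const char *s, int fold)
{
        unsigned long h = 5381;
        for (; *s; s++)
                h = h * 33 + (fold ? tolower((unsigned char)*s)
                                   : (unsigned char)*s);
        return h;
}

int main(void)
{
        printf("default: %lu vs %lu\n", djb2("FOO", 0), djb2("foo", 0));
        printf("nocase:  %lu vs %lu\n", djb2("FOO", 1), djb2("foo", 1));
        return 0;
}
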
@@ -666,12 +672,11 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
666 min(len, max_len), nlt, 672 min(len, max_len), nlt,
667 cifs_sb->mnt_cifs_flags & 673 cifs_sb->mnt_cifs_flags &
668 CIFS_MOUNT_MAP_SPECIAL_CHR); 674 CIFS_MOUNT_MAP_SPECIAL_CHR);
675 pqst->len -= nls_nullsize(nlt);
669 } else { 676 } else {
670 pqst->name = filename; 677 pqst->name = filename;
671 pqst->len = len; 678 pqst->len = len;
672 } 679 }
673 pqst->hash = full_name_hash(pqst->name, pqst->len);
674/* cFYI(1, ("filldir on %s",pqst->name)); */
675 return rc; 680 return rc;
676} 681}
677 682
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7085a6275c4c..7c3fd7463f44 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,6 +29,7 @@
29#include "ntlmssp.h" 29#include "ntlmssp.h"
30#include "nterr.h" 30#include "nterr.h"
31#include <linux/utsname.h> 31#include <linux/utsname.h>
32#include <linux/slab.h>
32#include "cifs_spnego.h" 33#include "cifs_spnego.h"
33 34
34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
@@ -223,9 +224,9 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
223 /* null user mount */ 224 /* null user mount */
224 *bcc_ptr = 0; 225 *bcc_ptr = 0;
225 *(bcc_ptr+1) = 0; 226 *(bcc_ptr+1) = 0;
226 } else { /* 300 should be long enough for any conceivable user name */ 227 } else {
227 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName, 228 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName,
228 300, nls_cp); 229 MAX_USERNAME_SIZE, nls_cp);
229 } 230 }
230 bcc_ptr += 2 * bytes_ret; 231 bcc_ptr += 2 * bytes_ret;
231 bcc_ptr += 2; /* account for null termination */ 232 bcc_ptr += 2; /* account for null termination */
@@ -246,11 +247,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
246 /* copy user */ 247 /* copy user */
247 if (ses->userName == NULL) { 248 if (ses->userName == NULL) {
248 /* BB what about null user mounts - check that we do this BB */ 249 /* BB what about null user mounts - check that we do this BB */
249 } else { /* 300 should be long enough for any conceivable user name */ 250 } else {
250 strncpy(bcc_ptr, ses->userName, 300); 251 strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE);
251 } 252 }
252 /* BB improve check for overflow */ 253 bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE);
253 bcc_ptr += strnlen(ses->userName, 300);
254 *bcc_ptr = 0; 254 *bcc_ptr = 0;
255 bcc_ptr++; /* account for null termination */ 255 bcc_ptr++; /* account for null termination */
256 256
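
Replacing the magic 300 with MAX_USERNAME_SIZE also preserves a subtle idiom worth noting: strncpy() does not NUL-terminate a string that fills the whole bound, so the code advances by strnlen() and writes the terminator by hand. A runnable sketch of the same idiom (the value 256 for MAX_USERNAME_SIZE is an assumption for illustration):

#include <stdio.h>
#include <string.h>

#define MAX_USERNAME_SIZE 256   /* assumed value; the macro is from the hunk */

int main(void)
{
        char buf[MAX_USERNAME_SIZE + 1];
        const char *user = "Administrator";

        strncpy(buf, user, MAX_USERNAME_SIZE);          /* bounded copy */
        buf[strnlen(user, MAX_USERNAME_SIZE)] = '\0';   /* explicit terminator */
        printf("%s\n", buf);
        return 0;
}
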
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index 224a1f478966..b6b6dcb500bf 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -371,7 +371,7 @@ E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
371 smbhash(p24 + 16, c8, p21 + 14, 1); 371 smbhash(p24 + 16, c8, p21 + 14, 1);
372} 372}
373 373
374#if 0 /* currently unsued */ 374#if 0 /* currently unused */
375static void 375static void
376D_P16(unsigned char *p14, unsigned char *in, unsigned char *out) 376D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
377{ 377{
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 93fb09a99c69..192ea51af20f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -24,6 +24,7 @@
24*/ 24*/
25 25
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/slab.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/string.h> 29#include <linux/string.h>
29#include <linux/kernel.h> 30#include <linux/kernel.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 07b8e71544ee..ad081fe7eb18 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/gfp.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/net.h> 27#include <linux/net.h>
27#include <linux/delay.h> 28#include <linux/delay.h>
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a75afa3dd9e1..f555ce077d4f 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
@@ -244,7 +245,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
244 /* revalidate/getattr then populate from inode */ 245 /* revalidate/getattr then populate from inode */
245 } /* BB add else when above is implemented */ 246 } /* BB add else when above is implemented */
246 ea_name += 5; /* skip past user. prefix */ 247 ea_name += 5; /* skip past user. prefix */
247 rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, 248 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
248 buf_size, cifs_sb->local_nls, 249 buf_size, cifs_sb->local_nls,
249 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 250 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
250 } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { 251 } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
@@ -252,7 +253,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
252 goto get_ea_exit; 253 goto get_ea_exit;
253 254
254 ea_name += 4; /* skip past os2. prefix */ 255 ea_name += 4; /* skip past os2. prefix */
255 rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, 256 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
256 buf_size, cifs_sb->local_nls, 257 buf_size, cifs_sb->local_nls,
257 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 258 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
258 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, 259 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
@@ -364,8 +365,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
364 /* if proc/fs/cifs/streamstoxattr is set then 365 /* if proc/fs/cifs/streamstoxattr is set then
365 search server for EAs or streams to 366 search server for EAs or streams to
366 returns as xattrs */ 367 returns as xattrs */
367 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, data, buf_size, 368 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
368 cifs_sb->local_nls, 369 buf_size, cifs_sb->local_nls,
369 cifs_sb->mnt_cifs_flags & 370 cifs_sb->mnt_cifs_flags &
370 CIFS_MOUNT_MAP_SPECIAL_CHR); 371 CIFS_MOUNT_MAP_SPECIAL_CHR);
371 372
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4bb9d0a5decc..ccd98b0f2b0b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -12,6 +12,7 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/slab.h>
15#include <linux/file.h> 16#include <linux/file.h>
16#include <linux/stat.h> 17#include <linux/stat.h>
17#include <linux/errno.h> 18#include <linux/errno.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ffd42815fda1..4c813f2cdc52 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21 22
22#include <linux/coda.h> 23#include <linux/coda.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 830f51abb971..a1695dcadd99 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -18,6 +18,7 @@
18#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
19#include <linux/file.h> 19#include <linux/file.h>
20#include <linux/vfs.h> 20#include <linux/vfs.h>
21#include <linux/slab.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 43c96ce29614..c6405ce3c50e 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -17,28 +17,25 @@ static struct ctl_table_header *fs_table_header;
17 17
18static ctl_table coda_table[] = { 18static ctl_table coda_table[] = {
19 { 19 {
20 .ctl_name = CTL_UNNUMBERED,
21 .procname = "timeout", 20 .procname = "timeout",
22 .data = &coda_timeout, 21 .data = &coda_timeout,
23 .maxlen = sizeof(int), 22 .maxlen = sizeof(int),
24 .mode = 0644, 23 .mode = 0644,
25 .proc_handler = &proc_dointvec 24 .proc_handler = proc_dointvec
26 }, 25 },
27 { 26 {
28 .ctl_name = CTL_UNNUMBERED,
29 .procname = "hard", 27 .procname = "hard",
30 .data = &coda_hard, 28 .data = &coda_hard,
31 .maxlen = sizeof(int), 29 .maxlen = sizeof(int),
32 .mode = 0644, 30 .mode = 0644,
33 .proc_handler = &proc_dointvec 31 .proc_handler = proc_dointvec
34 }, 32 },
35 { 33 {
36 .ctl_name = CTL_UNNUMBERED,
37 .procname = "fake_statfs", 34 .procname = "fake_statfs",
38 .data = &coda_fake_statfs, 35 .data = &coda_fake_statfs,
39 .maxlen = sizeof(int), 36 .maxlen = sizeof(int),
40 .mode = 0600, 37 .mode = 0600,
41 .proc_handler = &proc_dointvec 38 .proc_handler = proc_dointvec
42 }, 39 },
43 {} 40 {}
44}; 41};
@@ -46,7 +43,6 @@ static ctl_table coda_table[] = {
46#ifdef CONFIG_SYSCTL 43#ifdef CONFIG_SYSCTL
47static ctl_table fs_table[] = { 44static ctl_table fs_table[] = {
48 { 45 {
49 .ctl_name = CTL_UNNUMBERED,
50 .procname = "coda", 46 .procname = "coda",
51 .mode = 0555, 47 .mode = 0555,
52 .child = coda_table 48 .child = coda_table
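
These coda hunks follow the kernel-wide removal of binary sysctl numbers: entries are now matched purely by procname, and proc_handler is assigned as a plain function name rather than through an explicit &. A kernel-style sketch of the resulting table shape (not buildable on its own; coda_timeout is the variable from the hunk):

        static ctl_table example_table[] = {
                {
                        .procname     = "timeout",      /* no .ctl_name field */
                        .data         = &coda_timeout,
                        .maxlen       = sizeof(int),
                        .mode         = 0644,
                        .proc_handler = proc_dointvec,  /* plain name, no '&' */
                },
                {}      /* empty sentinel still terminates the table */
        };
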
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c274d949179d..f09c5ed76f6c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -26,6 +26,7 @@
26#include <linux/stat.h> 26#include <linux/stat.h>
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/slab.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
31#include <linux/vfs.h> 32#include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 6c19040ffeef..4b6ed03cc478 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -38,8 +38,6 @@
38#include <linux/dirent.h> 38#include <linux/dirent.h>
39#include <linux/fsnotify.h> 39#include <linux/fsnotify.h>
40#include <linux/highuid.h> 40#include <linux/highuid.h>
41#include <linux/sunrpc/svc.h>
42#include <linux/nfsd/nfsd.h>
43#include <linux/nfsd/syscall.h> 41#include <linux/nfsd/syscall.h>
44#include <linux/personality.h> 42#include <linux/personality.h>
45#include <linux/rwsem.h> 43#include <linux/rwsem.h>
@@ -51,6 +49,7 @@
51#include <linux/mm.h> 49#include <linux/mm.h>
52#include <linux/eventpoll.h> 50#include <linux/eventpoll.h>
53#include <linux/fs_struct.h> 51#include <linux/fs_struct.h>
52#include <linux/slab.h>
54 53
55#include <asm/uaccess.h> 54#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
@@ -1797,6 +1796,24 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
1797 return ret; 1796 return ret;
1798} 1797}
1799 1798
1799struct compat_sel_arg_struct {
1800 compat_ulong_t n;
1801 compat_uptr_t inp;
1802 compat_uptr_t outp;
1803 compat_uptr_t exp;
1804 compat_uptr_t tvp;
1805};
1806
1807asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
1808{
1809 struct compat_sel_arg_struct a;
1810
1811 if (copy_from_user(&a, arg, sizeof(a)))
1812 return -EFAULT;
1813 return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
1814 compat_ptr(a.exp), compat_ptr(a.tvp));
1815}
1816
1800#ifdef HAVE_SET_RESTORE_SIGMASK 1817#ifdef HAVE_SET_RESTORE_SIGMASK
1801static long do_compat_pselect(int n, compat_ulong_t __user *inp, 1818static long do_compat_pselect(int n, compat_ulong_t __user *inp,
1802 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 1819 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
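
The new compat_sys_old_select() services the legacy select(2) ABI, where userspace passes one pointer to a five-slot argument block instead of five separate arguments; the handler copies the block in and widens each 32-bit pointer with compat_ptr(). A small userspace analogue of that calling convention — all names here are illustrative:

#include <stdio.h>

struct sel_arg_block {
        unsigned long n, inp, outp, exp, tvp;   /* five packed slots */
};

static long demo_old_select(const struct sel_arg_block *a)
{
        /* the kernel would unpack each slot and call the select path */
        return (long)a->n;
}

int main(void)
{
        struct sel_arg_block a = { .n = 8 };
        printf("nfds=%ld\n", demo_old_select(&a));
        return 0;
}
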
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 0adced2f296f..112e45a17e99 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -28,10 +28,12 @@
28 28
29#undef elfhdr 29#undef elfhdr
30#undef elf_phdr 30#undef elf_phdr
31#undef elf_shdr
31#undef elf_note 32#undef elf_note
32#undef elf_addr_t 33#undef elf_addr_t
33#define elfhdr elf32_hdr 34#define elfhdr elf32_hdr
34#define elf_phdr elf32_phdr 35#define elf_phdr elf32_phdr
36#define elf_shdr elf32_shdr
35#define elf_note elf32_note 37#define elf_note elf32_note
36#define elf_addr_t Elf32_Addr 38#define elf_addr_t Elf32_Addr
37 39
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d84e7058c298..c32a1b6a856b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,6 @@
23#include <linux/ioctl.h> 23#include <linux/ioctl.h>
24#include <linux/if.h> 24#include <linux/if.h>
25#include <linux/if_bridge.h> 25#include <linux/if_bridge.h>
26#include <linux/slab.h>
27#include <linux/raid/md_u.h> 26#include <linux/raid/md_u.h>
28#include <linux/kd.h> 27#include <linux/kd.h>
29#include <linux/route.h> 28#include <linux/route.h>
@@ -60,6 +59,7 @@
60#include <linux/i2c.h> 59#include <linux/i2c.h>
61#include <linux/i2c-dev.h> 60#include <linux/i2c-dev.h>
62#include <linux/atalk.h> 61#include <linux/atalk.h>
62#include <linux/gfp.h>
63 63
64#include <net/bluetooth/bluetooth.h> 64#include <net/bluetooth/bluetooth.h>
65#include <net/bluetooth/hci.h> 65#include <net/bluetooth/hci.h>
@@ -111,43 +111,40 @@
111#include <linux/dvb/frontend.h> 111#include <linux/dvb/frontend.h>
112#include <linux/dvb/video.h> 112#include <linux/dvb/video.h>
113 113
114#include <linux/sort.h>
115
114#ifdef CONFIG_SPARC 116#ifdef CONFIG_SPARC
115#include <asm/fbio.h> 117#include <asm/fbio.h>
116#endif 118#endif
117 119
118static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd, 120static int w_long(unsigned int fd, unsigned int cmd,
119 unsigned long arg, struct file *f) 121 compat_ulong_t __user *argp)
120{
121 return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
122}
123
124static int w_long(unsigned int fd, unsigned int cmd, unsigned long arg)
125{ 122{
126 mm_segment_t old_fs = get_fs(); 123 mm_segment_t old_fs = get_fs();
127 int err; 124 int err;
128 unsigned long val; 125 unsigned long val;
129 126
130 set_fs (KERNEL_DS); 127 set_fs (KERNEL_DS);
131 err = sys_ioctl(fd, cmd, (unsigned long)&val); 128 err = sys_ioctl(fd, cmd, (unsigned long)&val);
132 set_fs (old_fs); 129 set_fs (old_fs);
133 if (!err && put_user(val, (u32 __user *)compat_ptr(arg))) 130 if (!err && put_user(val, argp))
134 return -EFAULT; 131 return -EFAULT;
135 return err; 132 return err;
136} 133}
137 134
138static int rw_long(unsigned int fd, unsigned int cmd, unsigned long arg) 135static int rw_long(unsigned int fd, unsigned int cmd,
136 compat_ulong_t __user *argp)
139{ 137{
140 mm_segment_t old_fs = get_fs(); 138 mm_segment_t old_fs = get_fs();
141 u32 __user *argptr = compat_ptr(arg);
142 int err; 139 int err;
143 unsigned long val; 140 unsigned long val;
144 141
145 if(get_user(val, argptr)) 142 if(get_user(val, argp))
146 return -EFAULT; 143 return -EFAULT;
147 set_fs (KERNEL_DS); 144 set_fs (KERNEL_DS);
148 err = sys_ioctl(fd, cmd, (unsigned long)&val); 145 err = sys_ioctl(fd, cmd, (unsigned long)&val);
149 set_fs (old_fs); 146 set_fs (old_fs);
150 if (!err && put_user(val, argptr)) 147 if (!err && put_user(val, argp))
151 return -EFAULT; 148 return -EFAULT;
152 return err; 149 return err;
153} 150}
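
The running theme of this and the following compat_ioctl.c hunks is hoisting the compat_ptr() widening out of each handler and into the dispatcher, so every handler receives a typed __user pointer instead of re-deriving it from a raw unsigned long. A kernel-style sketch of the conversion (illustrative names; not buildable on its own):

        /* before: every handler widened the argument itself */
        static int handler(unsigned int fd, unsigned int cmd,
                           unsigned long arg)
        {
                struct foo32 __user *p = compat_ptr(arg);
                /* ... */
        }

        /* after: the dispatcher calls compat_ptr() once; the prototype
         * now documents what the handler actually expects */
        static int handler(unsigned int fd, unsigned int cmd,
                           struct foo32 __user *p)
        {
                /* ... */
        }
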
@@ -161,7 +158,8 @@ struct compat_video_event {
161 } u; 158 } u;
162}; 159};
163 160
164static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long arg) 161static int do_video_get_event(unsigned int fd, unsigned int cmd,
162 struct compat_video_event __user *up)
165{ 163{
166 struct video_event kevent; 164 struct video_event kevent;
167 mm_segment_t old_fs = get_fs(); 165 mm_segment_t old_fs = get_fs();
@@ -172,8 +170,6 @@ static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long a
172 set_fs(old_fs); 170 set_fs(old_fs);
173 171
174 if (!err) { 172 if (!err) {
175 struct compat_video_event __user *up = compat_ptr(arg);
176
177 err = put_user(kevent.type, &up->type); 173 err = put_user(kevent.type, &up->type);
178 err |= put_user(kevent.timestamp, &up->timestamp); 174 err |= put_user(kevent.timestamp, &up->timestamp);
179 err |= put_user(kevent.u.size.w, &up->u.size.w); 175 err |= put_user(kevent.u.size.w, &up->u.size.w);
@@ -192,15 +188,14 @@ struct compat_video_still_picture {
192 int32_t size; 188 int32_t size;
193}; 189};
194 190
195static int do_video_stillpicture(unsigned int fd, unsigned int cmd, unsigned long arg) 191static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
192 struct compat_video_still_picture __user *up)
196{ 193{
197 struct compat_video_still_picture __user *up;
198 struct video_still_picture __user *up_native; 194 struct video_still_picture __user *up_native;
199 compat_uptr_t fp; 195 compat_uptr_t fp;
200 int32_t size; 196 int32_t size;
201 int err; 197 int err;
202 198
203 up = (struct compat_video_still_picture __user *) arg;
204 err = get_user(fp, &up->iFrame); 199 err = get_user(fp, &up->iFrame);
205 err |= get_user(size, &up->size); 200 err |= get_user(size, &up->size);
206 if (err) 201 if (err)
@@ -224,14 +219,13 @@ struct compat_video_spu_palette {
224 compat_uptr_t palette; 219 compat_uptr_t palette;
225}; 220};
226 221
227static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned long arg) 222static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
223 struct compat_video_spu_palette __user *up)
228{ 224{
229 struct compat_video_spu_palette __user *up;
230 struct video_spu_palette __user *up_native; 225 struct video_spu_palette __user *up_native;
231 compat_uptr_t palp; 226 compat_uptr_t palp;
232 int length, err; 227 int length, err;
233 228
234 up = (struct compat_video_spu_palette __user *) arg;
235 err = get_user(palp, &up->palette); 229 err = get_user(palp, &up->palette);
236 err |= get_user(length, &up->length); 230 err |= get_user(length, &up->length);
237 231
@@ -246,428 +240,6 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned
246 return err; 240 return err;
247} 241}
248 242
249#ifdef CONFIG_NET
250static int do_siocgstamp(unsigned int fd, unsigned int cmd, unsigned long arg)
251{
252 struct compat_timeval __user *up = compat_ptr(arg);
253 struct timeval ktv;
254 mm_segment_t old_fs = get_fs();
255 int err;
256
257 set_fs(KERNEL_DS);
258 err = sys_ioctl(fd, cmd, (unsigned long)&ktv);
259 set_fs(old_fs);
260 if(!err) {
261 err = put_user(ktv.tv_sec, &up->tv_sec);
262 err |= __put_user(ktv.tv_usec, &up->tv_usec);
263 }
264 return err;
265}
266
267static int do_siocgstampns(unsigned int fd, unsigned int cmd, unsigned long arg)
268{
269 struct compat_timespec __user *up = compat_ptr(arg);
270 struct timespec kts;
271 mm_segment_t old_fs = get_fs();
272 int err;
273
274 set_fs(KERNEL_DS);
275 err = sys_ioctl(fd, cmd, (unsigned long)&kts);
276 set_fs(old_fs);
277 if (!err) {
278 err = put_user(kts.tv_sec, &up->tv_sec);
279 err |= __put_user(kts.tv_nsec, &up->tv_nsec);
280 }
281 return err;
282}
283
284struct ifmap32 {
285 compat_ulong_t mem_start;
286 compat_ulong_t mem_end;
287 unsigned short base_addr;
288 unsigned char irq;
289 unsigned char dma;
290 unsigned char port;
291};
292
293struct ifreq32 {
294#define IFHWADDRLEN 6
295#define IFNAMSIZ 16
296 union {
297 char ifrn_name[IFNAMSIZ]; /* if name, e.g. "en0" */
298 } ifr_ifrn;
299 union {
300 struct sockaddr ifru_addr;
301 struct sockaddr ifru_dstaddr;
302 struct sockaddr ifru_broadaddr;
303 struct sockaddr ifru_netmask;
304 struct sockaddr ifru_hwaddr;
305 short ifru_flags;
306 compat_int_t ifru_ivalue;
307 compat_int_t ifru_mtu;
308 struct ifmap32 ifru_map;
309 char ifru_slave[IFNAMSIZ]; /* Just fits the size */
310 char ifru_newname[IFNAMSIZ];
311 compat_caddr_t ifru_data;
312 /* XXXX? ifru_settings should be here */
313 } ifr_ifru;
314};
315
316struct ifconf32 {
317 compat_int_t ifc_len; /* size of buffer */
318 compat_caddr_t ifcbuf;
319};
320
321static int dev_ifname32(unsigned int fd, unsigned int cmd, unsigned long arg)
322{
323 struct ifreq __user *uifr;
324 int err;
325
326 uifr = compat_alloc_user_space(sizeof(struct ifreq));
327 if (copy_in_user(uifr, compat_ptr(arg), sizeof(struct ifreq32)))
328 return -EFAULT;
329
330 err = sys_ioctl(fd, SIOCGIFNAME, (unsigned long)uifr);
331 if (err)
332 return err;
333
334 if (copy_in_user(compat_ptr(arg), uifr, sizeof(struct ifreq32)))
335 return -EFAULT;
336
337 return 0;
338}
339
340static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
341{
342 struct ifconf32 ifc32;
343 struct ifconf ifc;
344 struct ifconf __user *uifc;
345 struct ifreq32 __user *ifr32;
346 struct ifreq __user *ifr;
347 unsigned int i, j;
348 int err;
349
350 if (copy_from_user(&ifc32, compat_ptr(arg), sizeof(struct ifconf32)))
351 return -EFAULT;
352
353 if (ifc32.ifcbuf == 0) {
354 ifc32.ifc_len = 0;
355 ifc.ifc_len = 0;
356 ifc.ifc_req = NULL;
357 uifc = compat_alloc_user_space(sizeof(struct ifconf));
358 } else {
359 size_t len =((ifc32.ifc_len / sizeof (struct ifreq32)) + 1) *
360 sizeof (struct ifreq);
361 uifc = compat_alloc_user_space(sizeof(struct ifconf) + len);
362 ifc.ifc_len = len;
363 ifr = ifc.ifc_req = (void __user *)(uifc + 1);
364 ifr32 = compat_ptr(ifc32.ifcbuf);
365 for (i = 0; i < ifc32.ifc_len; i += sizeof (struct ifreq32)) {
366 if (copy_in_user(ifr, ifr32, sizeof(struct ifreq32)))
367 return -EFAULT;
368 ifr++;
369 ifr32++;
370 }
371 }
372 if (copy_to_user(uifc, &ifc, sizeof(struct ifconf)))
373 return -EFAULT;
374
375 err = sys_ioctl (fd, SIOCGIFCONF, (unsigned long)uifc);
376 if (err)
377 return err;
378
379 if (copy_from_user(&ifc, uifc, sizeof(struct ifconf)))
380 return -EFAULT;
381
382 ifr = ifc.ifc_req;
383 ifr32 = compat_ptr(ifc32.ifcbuf);
384 for (i = 0, j = 0;
385 i + sizeof (struct ifreq32) <= ifc32.ifc_len && j < ifc.ifc_len;
386 i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) {
387 if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32)))
388 return -EFAULT;
389 ifr32++;
390 ifr++;
391 }
392
393 if (ifc32.ifcbuf == 0) {
394 /* Translate from 64-bit structure multiple to
395 * a 32-bit one.
396 */
397 i = ifc.ifc_len;
398 i = ((i / sizeof(struct ifreq)) * sizeof(struct ifreq32));
399 ifc32.ifc_len = i;
400 } else {
401 ifc32.ifc_len = i;
402 }
403 if (copy_to_user(compat_ptr(arg), &ifc32, sizeof(struct ifconf32)))
404 return -EFAULT;
405
406 return 0;
407}
408
409static int ethtool_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
410{
411 struct ifreq __user *ifr;
412 struct ifreq32 __user *ifr32;
413 u32 data;
414 void __user *datap;
415
416 ifr = compat_alloc_user_space(sizeof(*ifr));
417 ifr32 = compat_ptr(arg);
418
419 if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
420 return -EFAULT;
421
422 if (get_user(data, &ifr32->ifr_ifru.ifru_data))
423 return -EFAULT;
424
425 datap = compat_ptr(data);
426 if (put_user(datap, &ifr->ifr_ifru.ifru_data))
427 return -EFAULT;
428
429 return sys_ioctl(fd, cmd, (unsigned long) ifr);
430}
431
432static int bond_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
433{
434 struct ifreq kifr;
435 struct ifreq __user *uifr;
436 struct ifreq32 __user *ifr32 = compat_ptr(arg);
437 mm_segment_t old_fs;
438 int err;
439 u32 data;
440 void __user *datap;
441
442 switch (cmd) {
443 case SIOCBONDENSLAVE:
444 case SIOCBONDRELEASE:
445 case SIOCBONDSETHWADDR:
446 case SIOCBONDCHANGEACTIVE:
447 if (copy_from_user(&kifr, ifr32, sizeof(struct ifreq32)))
448 return -EFAULT;
449
450 old_fs = get_fs();
451 set_fs (KERNEL_DS);
452 err = sys_ioctl (fd, cmd, (unsigned long)&kifr);
453 set_fs (old_fs);
454
455 return err;
456 case SIOCBONDSLAVEINFOQUERY:
457 case SIOCBONDINFOQUERY:
458 uifr = compat_alloc_user_space(sizeof(*uifr));
459 if (copy_in_user(&uifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
460 return -EFAULT;
461
462 if (get_user(data, &ifr32->ifr_ifru.ifru_data))
463 return -EFAULT;
464
465 datap = compat_ptr(data);
466 if (put_user(datap, &uifr->ifr_ifru.ifru_data))
467 return -EFAULT;
468
469 return sys_ioctl (fd, cmd, (unsigned long)uifr);
470 default:
471 return -EINVAL;
472 };
473}
474
475static int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
476{
477 struct ifreq __user *u_ifreq64;
478 struct ifreq32 __user *u_ifreq32 = compat_ptr(arg);
479 char tmp_buf[IFNAMSIZ];
480 void __user *data64;
481 u32 data32;
482
483 if (copy_from_user(&tmp_buf[0], &(u_ifreq32->ifr_ifrn.ifrn_name[0]),
484 IFNAMSIZ))
485 return -EFAULT;
486 if (__get_user(data32, &u_ifreq32->ifr_ifru.ifru_data))
487 return -EFAULT;
488 data64 = compat_ptr(data32);
489
490 u_ifreq64 = compat_alloc_user_space(sizeof(*u_ifreq64));
491
492 /* Don't check these user accesses, just let that get trapped
493 * in the ioctl handler instead.
494 */
495 if (copy_to_user(&u_ifreq64->ifr_ifrn.ifrn_name[0], &tmp_buf[0],
496 IFNAMSIZ))
497 return -EFAULT;
498 if (__put_user(data64, &u_ifreq64->ifr_ifru.ifru_data))
499 return -EFAULT;
500
501 return sys_ioctl(fd, cmd, (unsigned long) u_ifreq64);
502}
503
504static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
505{
506 struct ifreq ifr;
507 struct ifreq32 __user *uifr32;
508 struct ifmap32 __user *uifmap32;
509 mm_segment_t old_fs;
510 int err;
511
512 uifr32 = compat_ptr(arg);
513 uifmap32 = &uifr32->ifr_ifru.ifru_map;
514 switch (cmd) {
515 case SIOCSIFMAP:
516 err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
517 err |= __get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
518 err |= __get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
519 err |= __get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
520 err |= __get_user(ifr.ifr_map.irq, &uifmap32->irq);
521 err |= __get_user(ifr.ifr_map.dma, &uifmap32->dma);
522 err |= __get_user(ifr.ifr_map.port, &uifmap32->port);
523 if (err)
524 return -EFAULT;
525 break;
526 case SIOCSHWTSTAMP:
527 if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
528 return -EFAULT;
529 ifr.ifr_data = compat_ptr(uifr32->ifr_ifru.ifru_data);
530 break;
531 default:
532 if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
533 return -EFAULT;
534 break;
535 }
536 old_fs = get_fs();
537 set_fs (KERNEL_DS);
538 err = sys_ioctl (fd, cmd, (unsigned long)&ifr);
539 set_fs (old_fs);
540 if (!err) {
541 switch (cmd) {
542 /* TUNSETIFF is defined as _IOW, it should be _IORW
543 * as the data is copied back to user space, but that
544 * cannot be fixed without breaking all existing apps.
545 */
546 case TUNSETIFF:
547 case TUNGETIFF:
548 case SIOCGIFFLAGS:
549 case SIOCGIFMETRIC:
550 case SIOCGIFMTU:
551 case SIOCGIFMEM:
552 case SIOCGIFHWADDR:
553 case SIOCGIFINDEX:
554 case SIOCGIFADDR:
555 case SIOCGIFBRDADDR:
556 case SIOCGIFDSTADDR:
557 case SIOCGIFNETMASK:
558 case SIOCGIFTXQLEN:
559 if (copy_to_user(uifr32, &ifr, sizeof(*uifr32)))
560 return -EFAULT;
561 break;
562 case SIOCGIFMAP:
563 err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
564 err |= __put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
565 err |= __put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
566 err |= __put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
567 err |= __put_user(ifr.ifr_map.irq, &uifmap32->irq);
568 err |= __put_user(ifr.ifr_map.dma, &uifmap32->dma);
569 err |= __put_user(ifr.ifr_map.port, &uifmap32->port);
570 if (err)
571 err = -EFAULT;
572 break;
573 }
574 }
575 return err;
576}
577
578struct rtentry32 {
579 u32 rt_pad1;
580 struct sockaddr rt_dst; /* target address */
581 struct sockaddr rt_gateway; /* gateway addr (RTF_GATEWAY) */
582 struct sockaddr rt_genmask; /* target network mask (IP) */
583 unsigned short rt_flags;
584 short rt_pad2;
585 u32 rt_pad3;
586 unsigned char rt_tos;
587 unsigned char rt_class;
588 short rt_pad4;
589 short rt_metric; /* +1 for binary compatibility! */
590 /* char * */ u32 rt_dev; /* forcing the device at add */
591 u32 rt_mtu; /* per route MTU/Window */
592 u32 rt_window; /* Window clamping */
593 unsigned short rt_irtt; /* Initial RTT */
594
595};
596
597struct in6_rtmsg32 {
598 struct in6_addr rtmsg_dst;
599 struct in6_addr rtmsg_src;
600 struct in6_addr rtmsg_gateway;
601 u32 rtmsg_type;
602 u16 rtmsg_dst_len;
603 u16 rtmsg_src_len;
604 u32 rtmsg_metric;
605 u32 rtmsg_info;
606 u32 rtmsg_flags;
607 s32 rtmsg_ifindex;
608};
609
610static int routing_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
611{
612 int ret;
613 void *r = NULL;
614 struct in6_rtmsg r6;
615 struct rtentry r4;
616 char devname[16];
617 u32 rtdev;
618 mm_segment_t old_fs = get_fs();
619
620 struct socket *mysock = sockfd_lookup(fd, &ret);
621
622 if (mysock && mysock->sk && mysock->sk->sk_family == AF_INET6) { /* ipv6 */
623 struct in6_rtmsg32 __user *ur6 = compat_ptr(arg);
624 ret = copy_from_user (&r6.rtmsg_dst, &(ur6->rtmsg_dst),
625 3 * sizeof(struct in6_addr));
626 ret |= __get_user (r6.rtmsg_type, &(ur6->rtmsg_type));
627 ret |= __get_user (r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
628 ret |= __get_user (r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
629 ret |= __get_user (r6.rtmsg_metric, &(ur6->rtmsg_metric));
630 ret |= __get_user (r6.rtmsg_info, &(ur6->rtmsg_info));
631 ret |= __get_user (r6.rtmsg_flags, &(ur6->rtmsg_flags));
632 ret |= __get_user (r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));
633
634 r = (void *) &r6;
635 } else { /* ipv4 */
636 struct rtentry32 __user *ur4 = compat_ptr(arg);
637 ret = copy_from_user (&r4.rt_dst, &(ur4->rt_dst),
638 3 * sizeof(struct sockaddr));
639 ret |= __get_user (r4.rt_flags, &(ur4->rt_flags));
640 ret |= __get_user (r4.rt_metric, &(ur4->rt_metric));
641 ret |= __get_user (r4.rt_mtu, &(ur4->rt_mtu));
642 ret |= __get_user (r4.rt_window, &(ur4->rt_window));
643 ret |= __get_user (r4.rt_irtt, &(ur4->rt_irtt));
644 ret |= __get_user (rtdev, &(ur4->rt_dev));
645 if (rtdev) {
646 ret |= copy_from_user (devname, compat_ptr(rtdev), 15);
647 r4.rt_dev = devname; devname[15] = 0;
648 } else
649 r4.rt_dev = NULL;
650
651 r = (void *) &r4;
652 }
653
654 if (ret) {
655 ret = -EFAULT;
656 goto out;
657 }
658
659 set_fs (KERNEL_DS);
660 ret = sys_ioctl (fd, cmd, (unsigned long) r);
661 set_fs (old_fs);
662
663out:
664 if (mysock)
665 sockfd_put(mysock);
666
667 return ret;
668}
669#endif
670
671#ifdef CONFIG_BLOCK 243#ifdef CONFIG_BLOCK
672typedef struct sg_io_hdr32 { 244typedef struct sg_io_hdr32 {
673 compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ 245 compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */
@@ -721,16 +293,21 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov
721 return 0; 293 return 0;
722} 294}
723 295
724static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) 296static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
297 sg_io_hdr32_t __user *sgio32)
725{ 298{
726 sg_io_hdr_t __user *sgio; 299 sg_io_hdr_t __user *sgio;
727 sg_io_hdr32_t __user *sgio32;
728 u16 iovec_count; 300 u16 iovec_count;
729 u32 data; 301 u32 data;
730 void __user *dxferp; 302 void __user *dxferp;
731 int err; 303 int err;
304 int interface_id;
305
306 if (get_user(interface_id, &sgio32->interface_id))
307 return -EFAULT;
308 if (interface_id != 'S')
309 return sys_ioctl(fd, cmd, (unsigned long)sgio32);
732 310
733 sgio32 = compat_ptr(arg);
734 if (get_user(iovec_count, &sgio32->iovec_count)) 311 if (get_user(iovec_count, &sgio32->iovec_count))
735 return -EFAULT; 312 return -EFAULT;
736 313
@@ -820,11 +397,11 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
820 int unused; 397 int unused;
821}; 398};
822 399
823static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg) 400static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct
401 compat_sg_req_info __user *o)
824{ 402{
825 int err, i; 403 int err, i;
826 sg_req_info_t __user *r; 404 sg_req_info_t __user *r;
827 struct compat_sg_req_info __user *o = (void __user *)arg;
828 r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); 405 r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
829 err = sys_ioctl(fd,cmd,(unsigned long)r); 406 err = sys_ioctl(fd,cmd,(unsigned long)r);
830 if (err < 0) 407 if (err < 0)
@@ -852,9 +429,9 @@ struct sock_fprog32 {
852#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) 429#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32)
853#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) 430#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32)
854 431
855static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) 432static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
433 struct sock_fprog32 __user *u_fprog32)
856{ 434{
857 struct sock_fprog32 __user *u_fprog32 = compat_ptr(arg);
858 struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); 435 struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
859 void __user *fptr64; 436 void __user *fptr64;
860 u32 fptr32; 437 u32 fptr32;
@@ -891,15 +468,14 @@ struct ppp_idle32 {
891}; 468};
892#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) 469#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32)
893 470
894static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg) 471static int ppp_gidle(unsigned int fd, unsigned int cmd,
472 struct ppp_idle32 __user *idle32)
895{ 473{
896 struct ppp_idle __user *idle; 474 struct ppp_idle __user *idle;
897 struct ppp_idle32 __user *idle32;
898 __kernel_time_t xmit, recv; 475 __kernel_time_t xmit, recv;
899 int err; 476 int err;
900 477
901 idle = compat_alloc_user_space(sizeof(*idle)); 478 idle = compat_alloc_user_space(sizeof(*idle));
902 idle32 = compat_ptr(arg);
903 479
904 err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); 480 err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle);
905 481
@@ -913,15 +489,14 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg)
913 return err; 489 return err;
914} 490}
915 491
916static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg) 492static int ppp_scompress(unsigned int fd, unsigned int cmd,
493 struct ppp_option_data32 __user *odata32)
917{ 494{
918 struct ppp_option_data __user *odata; 495 struct ppp_option_data __user *odata;
919 struct ppp_option_data32 __user *odata32;
920 __u32 data; 496 __u32 data;
921 void __user *datap; 497 void __user *datap;
922 498
923 odata = compat_alloc_user_space(sizeof(*odata)); 499 odata = compat_alloc_user_space(sizeof(*odata));
924 odata32 = compat_ptr(arg);
925 500
926 if (get_user(data, &odata32->ptr)) 501 if (get_user(data, &odata32->ptr))
927 return -EFAULT; 502 return -EFAULT;
@@ -937,35 +512,6 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg)
937 return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); 512 return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata);
938} 513}
939 514
940static int ppp_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
941{
942 int err;
943
944 switch (cmd) {
945 case PPPIOCGIDLE32:
946 err = ppp_gidle(fd, cmd, arg);
947 break;
948
949 case PPPIOCSCOMPRESS32:
950 err = ppp_scompress(fd, cmd, arg);
951 break;
952
953 default:
954 do {
955 static int count;
956 if (++count <= 20)
957 printk("ppp_ioctl: Unknown cmd fd(%d) "
958 "cmd(%08x) arg(%08x)\n",
959 (int)fd, (unsigned int)cmd, (unsigned int)arg);
960 } while(0);
961 err = -EINVAL;
962 break;
963 };
964
965 return err;
966}
967
968
969#ifdef CONFIG_BLOCK 515#ifdef CONFIG_BLOCK
970struct mtget32 { 516struct mtget32 {
971 compat_long_t mt_type; 517 compat_long_t mt_type;
@@ -983,7 +529,7 @@ struct mtpos32 {
983}; 529};
984#define MTIOCPOS32 _IOR('m', 3, struct mtpos32) 530#define MTIOCPOS32 _IOR('m', 3, struct mtpos32)
985 531
986static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) 532static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
987{ 533{
988 mm_segment_t old_fs = get_fs(); 534 mm_segment_t old_fs = get_fs();
989 struct mtget get; 535 struct mtget get;
@@ -999,19 +545,10 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
999 kcmd = MTIOCPOS; 545 kcmd = MTIOCPOS;
1000 karg = &pos; 546 karg = &pos;
1001 break; 547 break;
1002 case MTIOCGET32: 548 default: /* MTIOCGET32 */
1003 kcmd = MTIOCGET; 549 kcmd = MTIOCGET;
1004 karg = &get; 550 karg = &get;
1005 break; 551 break;
1006 default:
1007 do {
1008 static int count;
1009 if (++count <= 20)
1010 printk("mt_ioctl: Unknown cmd fd(%d) "
1011 "cmd(%08x) arg(%08x)\n",
1012 (int)fd, (unsigned int)cmd, (unsigned int)arg);
1013 } while(0);
1014 return -EINVAL;
1015 } 552 }
1016 set_fs (KERNEL_DS); 553 set_fs (KERNEL_DS);
1017 err = sys_ioctl (fd, kcmd, (unsigned long)karg); 554 err = sys_ioctl (fd, kcmd, (unsigned long)karg);
@@ -1020,11 +557,11 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1020 return err; 557 return err;
1021 switch (cmd) { 558 switch (cmd) {
1022 case MTIOCPOS32: 559 case MTIOCPOS32:
1023 upos32 = compat_ptr(arg); 560 upos32 = argp;
1024 err = __put_user(pos.mt_blkno, &upos32->mt_blkno); 561 err = __put_user(pos.mt_blkno, &upos32->mt_blkno);
1025 break; 562 break;
1026 case MTIOCGET32: 563 case MTIOCGET32:
1027 umget32 = compat_ptr(arg); 564 umget32 = argp;
1028 err = __put_user(get.mt_type, &umget32->mt_type); 565 err = __put_user(get.mt_type, &umget32->mt_type);
1029 err |= __put_user(get.mt_resid, &umget32->mt_resid); 566 err |= __put_user(get.mt_resid, &umget32->mt_resid);
1030 err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); 567 err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg);
@@ -1039,162 +576,8 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1039 576
1040#endif /* CONFIG_BLOCK */ 577#endif /* CONFIG_BLOCK */
1041 578
1042#ifdef CONFIG_VT 579static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
1043 580 compat_uid_t __user *argp)
1044static int vt_check(struct file *file)
1045{
1046 struct tty_struct *tty;
1047 struct inode *inode = file->f_path.dentry->d_inode;
1048 struct vc_data *vc;
1049
1050 if (file->f_op->unlocked_ioctl != tty_ioctl)
1051 return -EINVAL;
1052
1053 tty = (struct tty_struct *)file->private_data;
1054 if (tty_paranoia_check(tty, inode, "tty_ioctl"))
1055 return -EINVAL;
1056
1057 if (tty->ops->ioctl != vt_ioctl)
1058 return -EINVAL;
1059
1060 vc = (struct vc_data *)tty->driver_data;
1061 if (!vc_cons_allocated(vc->vc_num)) /* impossible? */
1062 return -ENOIOCTLCMD;
1063
1064 /*
1065 * To have permissions to do most of the vt ioctls, we either have
1066 * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG.
1067 */
1068 if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG))
1069 return 1;
1070 return 0;
1071}
1072
1073struct consolefontdesc32 {
1074 unsigned short charcount; /* characters in font (256 or 512) */
1075 unsigned short charheight; /* scan lines per character (1-32) */
1076 compat_caddr_t chardata; /* font data in expanded form */
1077};
1078
1079static int do_fontx_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file)
1080{
1081 struct consolefontdesc32 __user *user_cfd = compat_ptr(arg);
1082 struct console_font_op op;
1083 compat_caddr_t data;
1084 int i, perm;
1085
1086 perm = vt_check(file);
1087 if (perm < 0) return perm;
1088
1089 switch (cmd) {
1090 case PIO_FONTX:
1091 if (!perm)
1092 return -EPERM;
1093 op.op = KD_FONT_OP_SET;
1094 op.flags = 0;
1095 op.width = 8;
1096 if (get_user(op.height, &user_cfd->charheight) ||
1097 get_user(op.charcount, &user_cfd->charcount) ||
1098 get_user(data, &user_cfd->chardata))
1099 return -EFAULT;
1100 op.data = compat_ptr(data);
1101 return con_font_op(vc_cons[fg_console].d, &op);
1102 case GIO_FONTX:
1103 op.op = KD_FONT_OP_GET;
1104 op.flags = 0;
1105 op.width = 8;
1106 if (get_user(op.height, &user_cfd->charheight) ||
1107 get_user(op.charcount, &user_cfd->charcount) ||
1108 get_user(data, &user_cfd->chardata))
1109 return -EFAULT;
1110 if (!data)
1111 return 0;
1112 op.data = compat_ptr(data);
1113 i = con_font_op(vc_cons[fg_console].d, &op);
1114 if (i)
1115 return i;
1116 if (put_user(op.height, &user_cfd->charheight) ||
1117 put_user(op.charcount, &user_cfd->charcount) ||
1118 put_user((compat_caddr_t)(unsigned long)op.data,
1119 &user_cfd->chardata))
1120 return -EFAULT;
1121 return 0;
1122 }
1123 return -EINVAL;
1124}
1125
1126struct console_font_op32 {
1127 compat_uint_t op; /* operation code KD_FONT_OP_* */
1128 compat_uint_t flags; /* KD_FONT_FLAG_* */
1129 compat_uint_t width, height; /* font size */
1130 compat_uint_t charcount;
1131 compat_caddr_t data; /* font data with height fixed to 32 */
1132};
1133
1134static int do_kdfontop_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file)
1135{
1136 struct console_font_op op;
1137 struct console_font_op32 __user *fontop = compat_ptr(arg);
1138 int perm = vt_check(file), i;
1139 struct vc_data *vc;
1140
1141 if (perm < 0) return perm;
1142
1143 if (copy_from_user(&op, fontop, sizeof(struct console_font_op32)))
1144 return -EFAULT;
1145 if (!perm && op.op != KD_FONT_OP_GET)
1146 return -EPERM;
1147 op.data = compat_ptr(((struct console_font_op32 *)&op)->data);
1148 op.flags |= KD_FONT_FLAG_OLD;
1149 vc = ((struct tty_struct *)file->private_data)->driver_data;
1150 i = con_font_op(vc, &op);
1151 if (i)
1152 return i;
1153 ((struct console_font_op32 *)&op)->data = (unsigned long)op.data;
1154 if (copy_to_user(fontop, &op, sizeof(struct console_font_op32)))
1155 return -EFAULT;
1156 return 0;
1157}
1158
1159struct unimapdesc32 {
1160 unsigned short entry_ct;
1161 compat_caddr_t entries;
1162};
1163
1164static int do_unimap_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file)
1165{
1166 struct unimapdesc32 tmp;
1167 struct unimapdesc32 __user *user_ud = compat_ptr(arg);
1168 int perm = vt_check(file);
1169 struct vc_data *vc;
1170
1171 if (perm < 0)
1172 return perm;
1173 if (copy_from_user(&tmp, user_ud, sizeof tmp))
1174 return -EFAULT;
1175 if (tmp.entries)
1176 if (!access_ok(VERIFY_WRITE, compat_ptr(tmp.entries),
1177 tmp.entry_ct*sizeof(struct unipair)))
1178 return -EFAULT;
1179 vc = ((struct tty_struct *)file->private_data)->driver_data;
1180 switch (cmd) {
1181 case PIO_UNIMAP:
1182 if (!perm)
1183 return -EPERM;
1184 return con_set_unimap(vc, tmp.entry_ct,
1185 compat_ptr(tmp.entries));
1186 case GIO_UNIMAP:
1187 if (!perm && fg_console != vc->vc_num)
1188 return -EPERM;
1189 return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct),
1190 compat_ptr(tmp.entries));
1191 }
1192 return 0;
1193}
1194
1195#endif /* CONFIG_VT */
1196
1197static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long arg)
1198{ 581{
1199 mm_segment_t old_fs = get_fs(); 582 mm_segment_t old_fs = get_fs();
1200 __kernel_uid_t kuid; 583 __kernel_uid_t kuid;
@@ -1207,184 +590,15 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long a
1207 set_fs(old_fs); 590 set_fs(old_fs);
1208 591
1209 if (err >= 0) 592 if (err >= 0)
1210 err = put_user(kuid, (compat_uid_t __user *)compat_ptr(arg)); 593 err = put_user(kuid, argp);
1211
1212 return err;
1213}
1214
1215struct atmif_sioc32 {
1216 compat_int_t number;
1217 compat_int_t length;
1218 compat_caddr_t arg;
1219};
1220
1221struct atm_iobuf32 {
1222 compat_int_t length;
1223 compat_caddr_t buffer;
1224};
1225
1226#define ATM_GETLINKRATE32 _IOW('a', ATMIOC_ITF+1, struct atmif_sioc32)
1227#define ATM_GETNAMES32 _IOW('a', ATMIOC_ITF+3, struct atm_iobuf32)
1228#define ATM_GETTYPE32 _IOW('a', ATMIOC_ITF+4, struct atmif_sioc32)
1229#define ATM_GETESI32 _IOW('a', ATMIOC_ITF+5, struct atmif_sioc32)
1230#define ATM_GETADDR32 _IOW('a', ATMIOC_ITF+6, struct atmif_sioc32)
1231#define ATM_RSTADDR32 _IOW('a', ATMIOC_ITF+7, struct atmif_sioc32)
1232#define ATM_ADDADDR32 _IOW('a', ATMIOC_ITF+8, struct atmif_sioc32)
1233#define ATM_DELADDR32 _IOW('a', ATMIOC_ITF+9, struct atmif_sioc32)
1234#define ATM_GETCIRANGE32 _IOW('a', ATMIOC_ITF+10, struct atmif_sioc32)
1235#define ATM_SETCIRANGE32 _IOW('a', ATMIOC_ITF+11, struct atmif_sioc32)
1236#define ATM_SETESI32 _IOW('a', ATMIOC_ITF+12, struct atmif_sioc32)
1237#define ATM_SETESIF32 _IOW('a', ATMIOC_ITF+13, struct atmif_sioc32)
1238#define ATM_GETSTAT32 _IOW('a', ATMIOC_SARCOM+0, struct atmif_sioc32)
1239#define ATM_GETSTATZ32 _IOW('a', ATMIOC_SARCOM+1, struct atmif_sioc32)
1240#define ATM_GETLOOP32 _IOW('a', ATMIOC_SARCOM+2, struct atmif_sioc32)
1241#define ATM_SETLOOP32 _IOW('a', ATMIOC_SARCOM+3, struct atmif_sioc32)
1242#define ATM_QUERYLOOP32 _IOW('a', ATMIOC_SARCOM+4, struct atmif_sioc32)
1243
1244static struct {
1245 unsigned int cmd32;
1246 unsigned int cmd;
1247} atm_ioctl_map[] = {
1248 { ATM_GETLINKRATE32, ATM_GETLINKRATE },
1249 { ATM_GETNAMES32, ATM_GETNAMES },
1250 { ATM_GETTYPE32, ATM_GETTYPE },
1251 { ATM_GETESI32, ATM_GETESI },
1252 { ATM_GETADDR32, ATM_GETADDR },
1253 { ATM_RSTADDR32, ATM_RSTADDR },
1254 { ATM_ADDADDR32, ATM_ADDADDR },
1255 { ATM_DELADDR32, ATM_DELADDR },
1256 { ATM_GETCIRANGE32, ATM_GETCIRANGE },
1257 { ATM_SETCIRANGE32, ATM_SETCIRANGE },
1258 { ATM_SETESI32, ATM_SETESI },
1259 { ATM_SETESIF32, ATM_SETESIF },
1260 { ATM_GETSTAT32, ATM_GETSTAT },
1261 { ATM_GETSTATZ32, ATM_GETSTATZ },
1262 { ATM_GETLOOP32, ATM_GETLOOP },
1263 { ATM_SETLOOP32, ATM_SETLOOP },
1264 { ATM_QUERYLOOP32, ATM_QUERYLOOP }
1265};
1266
1267#define NR_ATM_IOCTL ARRAY_SIZE(atm_ioctl_map)
1268
1269static int do_atm_iobuf(unsigned int fd, unsigned int cmd, unsigned long arg)
1270{
1271 struct atm_iobuf __user *iobuf;
1272 struct atm_iobuf32 __user *iobuf32;
1273 u32 data;
1274 void __user *datap;
1275 int len, err;
1276
1277 iobuf = compat_alloc_user_space(sizeof(*iobuf));
1278 iobuf32 = compat_ptr(arg);
1279
1280 if (get_user(len, &iobuf32->length) ||
1281 get_user(data, &iobuf32->buffer))
1282 return -EFAULT;
1283 datap = compat_ptr(data);
1284 if (put_user(len, &iobuf->length) ||
1285 put_user(datap, &iobuf->buffer))
1286 return -EFAULT;
1287
1288 err = sys_ioctl(fd, cmd, (unsigned long)iobuf);
1289
1290 if (!err) {
1291 if (copy_in_user(&iobuf32->length, &iobuf->length,
1292 sizeof(int)))
1293 err = -EFAULT;
1294 }
1295
1296 return err;
1297}
1298
1299static int do_atmif_sioc(unsigned int fd, unsigned int cmd, unsigned long arg)
1300{
1301 struct atmif_sioc __user *sioc;
1302 struct atmif_sioc32 __user *sioc32;
1303 u32 data;
1304 void __user *datap;
1305 int err;
1306
1307 sioc = compat_alloc_user_space(sizeof(*sioc));
1308 sioc32 = compat_ptr(arg);
1309
1310 if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) ||
1311 get_user(data, &sioc32->arg))
1312 return -EFAULT;
1313 datap = compat_ptr(data);
1314 if (put_user(datap, &sioc->arg))
1315 return -EFAULT;
1316 594
1317 err = sys_ioctl(fd, cmd, (unsigned long) sioc);
1318
1319 if (!err) {
1320 if (copy_in_user(&sioc32->length, &sioc->length,
1321 sizeof(int)))
1322 err = -EFAULT;
1323 }
1324 return err; 595 return err;
1325} 596}
1326 597
1327static int do_atm_ioctl(unsigned int fd, unsigned int cmd32, unsigned long arg) 598static int ioc_settimeout(unsigned int fd, unsigned int cmd,
1328{ 599 compat_ulong_t __user *argp)
1329 int i;
1330 unsigned int cmd = 0;
1331
1332 switch (cmd32) {
1333 case SONET_GETSTAT:
1334 case SONET_GETSTATZ:
1335 case SONET_GETDIAG:
1336 case SONET_SETDIAG:
1337 case SONET_CLRDIAG:
1338 case SONET_SETFRAMING:
1339 case SONET_GETFRAMING:
1340 case SONET_GETFRSENSE:
1341 return do_atmif_sioc(fd, cmd32, arg);
1342 }
1343
1344 for (i = 0; i < NR_ATM_IOCTL; i++) {
1345 if (cmd32 == atm_ioctl_map[i].cmd32) {
1346 cmd = atm_ioctl_map[i].cmd;
1347 break;
1348 }
1349 }
1350 if (i == NR_ATM_IOCTL)
1351 return -EINVAL;
1352
1353 switch (cmd) {
1354 case ATM_GETNAMES:
1355 return do_atm_iobuf(fd, cmd, arg);
1356
1357 case ATM_GETLINKRATE:
1358 case ATM_GETTYPE:
1359 case ATM_GETESI:
1360 case ATM_GETADDR:
1361 case ATM_RSTADDR:
1362 case ATM_ADDADDR:
1363 case ATM_DELADDR:
1364 case ATM_GETCIRANGE:
1365 case ATM_SETCIRANGE:
1366 case ATM_SETESI:
1367 case ATM_SETESIF:
1368 case ATM_GETSTAT:
1369 case ATM_GETSTATZ:
1370 case ATM_GETLOOP:
1371 case ATM_SETLOOP:
1372 case ATM_QUERYLOOP:
1373 return do_atmif_sioc(fd, cmd, arg);
1374 }
1375
1376 return -EINVAL;
1377}
1378
1379static __used int
1380ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
1381{
1382 return -EINVAL;
1383}
1384
1385static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
1386{ 600{
1387 return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg); 601 return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp);
1388} 602}
1389 603
1390/* Bluetooth ioctls */ 604/* Bluetooth ioctls */
@@ -1442,15 +656,15 @@ static int set_raw32_request(struct raw_config_request *req, struct raw32_config
1442 return ret ? -EFAULT : 0; 656 return ret ? -EFAULT : 0;
1443} 657}
1444 658
1445static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg) 659static int raw_ioctl(unsigned fd, unsigned cmd,
660 struct raw32_config_request __user *user_req)
1446{ 661{
1447 int ret; 662 int ret;
1448 663
1449 switch (cmd) { 664 switch (cmd) {
1450 case RAW_SETBIND: 665 case RAW_SETBIND:
1451 case RAW_GETBIND: { 666 default: { /* RAW_GETBIND */
1452 struct raw_config_request req; 667 struct raw_config_request req;
1453 struct raw32_config_request __user *user_req = compat_ptr(arg);
1454 mm_segment_t oldfs = get_fs(); 668 mm_segment_t oldfs = get_fs();
1455 669
1456 if ((ret = get_raw32_request(&req, user_req))) 670 if ((ret = get_raw32_request(&req, user_req)))
@@ -1465,9 +679,6 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
1465 } 679 }
1466 break; 680 break;
1467 } 681 }
1468 default:
1469 ret = sys_ioctl(fd, cmd, arg);
1470 break;
1471 } 682 }
1472 return ret; 683 return ret;
1473} 684}
@@ -1495,11 +706,11 @@ struct serial_struct32 {
1495 compat_int_t reserved[1]; 706 compat_int_t reserved[1];
1496}; 707};
1497 708
1498static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg) 709static int serial_struct_ioctl(unsigned fd, unsigned cmd,
710 struct serial_struct32 __user *ss32)
1499{ 711{
1500 typedef struct serial_struct SS; 712 typedef struct serial_struct SS;
1501 typedef struct serial_struct32 SS32; 713 typedef struct serial_struct32 SS32;
1502 struct serial_struct32 __user *ss32 = compat_ptr(arg);
1503 int err; 714 int err;
1504 struct serial_struct ss; 715 struct serial_struct ss;
1505 mm_segment_t oldseg = get_fs(); 716 mm_segment_t oldseg = get_fs();
@@ -1537,96 +748,6 @@ static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
1537 return err; 748 return err;
1538} 749}
1539 750
1540struct usbdevfs_ctrltransfer32 {
1541 u8 bRequestType;
1542 u8 bRequest;
1543 u16 wValue;
1544 u16 wIndex;
1545 u16 wLength;
1546 u32 timeout; /* in milliseconds */
1547 compat_caddr_t data;
1548};
1549
1550#define USBDEVFS_CONTROL32 _IOWR('U', 0, struct usbdevfs_ctrltransfer32)
1551
1552static int do_usbdevfs_control(unsigned int fd, unsigned int cmd, unsigned long arg)
1553{
1554 struct usbdevfs_ctrltransfer32 __user *p32 = compat_ptr(arg);
1555 struct usbdevfs_ctrltransfer __user *p;
1556 __u32 udata;
1557 p = compat_alloc_user_space(sizeof(*p));
1558 if (copy_in_user(p, p32, (sizeof(*p32) - sizeof(compat_caddr_t))) ||
1559 get_user(udata, &p32->data) ||
1560 put_user(compat_ptr(udata), &p->data))
1561 return -EFAULT;
1562 return sys_ioctl(fd, USBDEVFS_CONTROL, (unsigned long)p);
1563}
1564
1565
1566struct usbdevfs_bulktransfer32 {
1567 compat_uint_t ep;
1568 compat_uint_t len;
1569 compat_uint_t timeout; /* in milliseconds */
1570 compat_caddr_t data;
1571};
1572
1573#define USBDEVFS_BULK32 _IOWR('U', 2, struct usbdevfs_bulktransfer32)
1574
1575static int do_usbdevfs_bulk(unsigned int fd, unsigned int cmd, unsigned long arg)
1576{
1577 struct usbdevfs_bulktransfer32 __user *p32 = compat_ptr(arg);
1578 struct usbdevfs_bulktransfer __user *p;
1579 compat_uint_t n;
1580 compat_caddr_t addr;
1581
1582 p = compat_alloc_user_space(sizeof(*p));
1583
1584 if (get_user(n, &p32->ep) || put_user(n, &p->ep) ||
1585 get_user(n, &p32->len) || put_user(n, &p->len) ||
1586 get_user(n, &p32->timeout) || put_user(n, &p->timeout) ||
1587 get_user(addr, &p32->data) || put_user(compat_ptr(addr), &p->data))
1588 return -EFAULT;
1589
1590 return sys_ioctl(fd, USBDEVFS_BULK, (unsigned long)p);
1591}
1592
1593
1594/*
1595 * USBDEVFS_SUBMITURB, USBDEVFS_REAPURB and USBDEVFS_REAPURBNDELAY
1596 * are handled in usbdevfs core. -Christopher Li
1597 */
1598
1599struct usbdevfs_disconnectsignal32 {
1600 compat_int_t signr;
1601 compat_caddr_t context;
1602};
1603
1604#define USBDEVFS_DISCSIGNAL32 _IOR('U', 14, struct usbdevfs_disconnectsignal32)
1605
1606static int do_usbdevfs_discsignal(unsigned int fd, unsigned int cmd, unsigned long arg)
1607{
1608 struct usbdevfs_disconnectsignal kdis;
1609 struct usbdevfs_disconnectsignal32 __user *udis;
1610 mm_segment_t old_fs;
1611 u32 uctx;
1612 int err;
1613
1614 udis = compat_ptr(arg);
1615
1616 if (get_user(kdis.signr, &udis->signr) ||
1617 __get_user(uctx, &udis->context))
1618 return -EFAULT;
1619
1620 kdis.context = compat_ptr(uctx);
1621
1622 old_fs = get_fs();
1623 set_fs(KERNEL_DS);
1624 err = sys_ioctl(fd, USBDEVFS_DISCSIGNAL, (unsigned long) &kdis);
1625 set_fs(old_fs);
1626
1627 return err;
1628}
1629
1630/* 751/*
1631 * I2C layer ioctls 752 * I2C layer ioctls
1632 */ 753 */
@@ -1655,9 +776,9 @@ struct i2c_rdwr_aligned {
1655 struct i2c_msg msgs[0]; 776 struct i2c_msg msgs[0];
1656}; 777};
1657 778
1658static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) 779static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
780 struct i2c_rdwr_ioctl_data32 __user *udata)
1659{ 781{
1660 struct i2c_rdwr_ioctl_data32 __user *udata = compat_ptr(arg);
1661 struct i2c_rdwr_aligned __user *tdata; 782 struct i2c_rdwr_aligned __user *tdata;
1662 struct i2c_msg __user *tmsgs; 783 struct i2c_msg __user *tmsgs;
1663 struct i2c_msg32 __user *umsgs; 784 struct i2c_msg32 __user *umsgs;
@@ -1691,10 +812,10 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long ar
1691 return sys_ioctl(fd, cmd, (unsigned long)tdata); 812 return sys_ioctl(fd, cmd, (unsigned long)tdata);
1692} 813}
1693 814
1694static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) 815static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
816 struct i2c_smbus_ioctl_data32 __user *udata)
1695{ 817{
1696 struct i2c_smbus_ioctl_data __user *tdata; 818 struct i2c_smbus_ioctl_data __user *tdata;
1697 struct i2c_smbus_ioctl_data32 __user *udata;
1698 compat_caddr_t datap; 819 compat_caddr_t datap;
1699 820
1700 tdata = compat_alloc_user_space(sizeof(*tdata)); 821 tdata = compat_alloc_user_space(sizeof(*tdata));
@@ -1703,7 +824,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
1703 if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata))) 824 if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata)))
1704 return -EFAULT; 825 return -EFAULT;
1705 826
1706 udata = compat_ptr(arg);
1707 if (!access_ok(VERIFY_READ, udata, sizeof(*udata))) 827 if (!access_ok(VERIFY_READ, udata, sizeof(*udata)))
1708 return -EFAULT; 828 return -EFAULT;
1709 829
@@ -1718,27 +838,12 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
1718 return sys_ioctl(fd, cmd, (unsigned long)tdata); 838 return sys_ioctl(fd, cmd, (unsigned long)tdata);
1719} 839}
1720 840
1721/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
1722 * for some operations; this forces use of the newer bridge-utils that
1723 * use compatible ioctls
1724 */
1725static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
1726{
1727 u32 tmp;
1728
1729 if (get_user(tmp, (u32 __user *) arg))
1730 return -EFAULT;
1731 if (tmp == BRCTL_GET_VERSION)
1732 return BRCTL_VERSION + 1;
1733 return -EINVAL;
1734}
1735
1736#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) 841#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
1737#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) 842#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t)
1738#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) 843#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
1739#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) 844#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t)
1740 845
1741static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) 846static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp)
1742{ 847{
1743 mm_segment_t oldfs = get_fs(); 848 mm_segment_t oldfs = get_fs();
1744 compat_ulong_t val32; 849 compat_ulong_t val32;
@@ -1756,29 +861,14 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
1756 if (ret) 861 if (ret)
1757 return ret; 862 return ret;
1758 val32 = kval; 863 val32 = kval;
1759 return put_user(val32, (unsigned int __user *)arg); 864 return put_user(val32, (unsigned int __user *)argp);
1760 case RTC_IRQP_SET32: 865 case RTC_IRQP_SET32:
1761 return sys_ioctl(fd, RTC_IRQP_SET, arg); 866 return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp);
1762 case RTC_EPOCH_SET32: 867 case RTC_EPOCH_SET32:
1763 return sys_ioctl(fd, RTC_EPOCH_SET, arg); 868 return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp);
1764 default:
1765 /* unreached */
1766 return -ENOIOCTLCMD;
1767 } 869 }
1768}
1769 870
1770static int 871 return -ENOIOCTLCMD;
1771lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1772{
1773 struct compat_timeval __user *tc = (struct compat_timeval __user *)arg;
1774 struct timeval __user *tn = compat_alloc_user_space(sizeof(struct timeval));
1775 struct timeval ts;
1776 if (get_user(ts.tv_sec, &tc->tv_sec) ||
1777 get_user(ts.tv_usec, &tc->tv_usec) ||
1778 put_user(ts.tv_sec, &tn->tv_sec) ||
1779 put_user(ts.tv_usec, &tn->tv_usec))
1780 return -EFAULT;
1781 return sys_ioctl(fd, cmd, (unsigned long)tn);
1782} 872}
1783 873
1784/* on ia32 l_start is on a 32-bit boundary */ 874/* on ia32 l_start is on a 32-bit boundary */
@@ -1798,9 +888,9 @@ struct space_resv_32 {
1798#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) 888#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32)
1799 889
1800/* just account for different alignment */ 890/* just account for different alignment */
1801static int compat_ioctl_preallocate(struct file *file, unsigned long arg) 891static int compat_ioctl_preallocate(struct file *file,
892 struct space_resv_32 __user *p32)
1802{ 893{
1803 struct space_resv_32 __user *p32 = compat_ptr(arg);
1804 struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); 894 struct space_resv __user *p = compat_alloc_user_space(sizeof(*p));
1805 895
1806 if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || 896 if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
@@ -1816,27 +906,13 @@ static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
1816} 906}
1817#endif 907#endif
1818 908
909/*
910 * simple reversible transform to make our table more evenly
911 * distributed after sorting.
912 */
913#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)
1819 914
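The transform is not merely reversible, it is an involution: both shifted terms move bits left by at least 17 places, so their composition vanishes in a 32-bit word, and applying XFORM twice yields the original command number. A quick user-space check of the round trip — a hypothetical test harness, not part of the patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* same transform as the kernel's XFORM() macro */
static uint32_t xform(uint32_t i)
{
        return (i ^ (i << 27) ^ (i << 17)) & 0xffffffff;
}

int main(void)
{
        uint32_t cmd;

        /* (i << 17) ^ (i << 27) is nilpotent in a 32-bit word, so
         * the transform undoes itself when applied a second time */
        for (cmd = 0; cmd < 0x100000; cmd++)
                assert(xform(xform(cmd)) == cmd);
        printf("XFORM is an involution on the tested range\n");
        return 0;
}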
1820typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int, 915#define COMPATIBLE_IOCTL(cmd) XFORM(cmd),
1821 unsigned long, struct file *);
1822
1823struct ioctl_trans {
1824 unsigned long cmd;
1825 ioctl_trans_handler_t handler;
1826 struct ioctl_trans *next;
1827};
1828
1829#define HANDLE_IOCTL(cmd,handler) \
1830 { (cmd), (ioctl_trans_handler_t)(handler) },
1831
1832/* pointer to compatible structure or no argument */
1833#define COMPATIBLE_IOCTL(cmd) \
1834 { (cmd), do_ioctl32_pointer },
1835
1836/* argument is an unsigned long integer, not a pointer */
1837#define ULONG_IOCTL(cmd) \
1838 { (cmd), (ioctl_trans_handler_t)sys_ioctl },
1839
1840/* ioctl should not be warned about even if it's not implemented. 916/* ioctl should not be warned about even if it's not implemented.
1841 Valid reasons to use this: 917 Valid reasons to use this:
1842 - It is implemented with ->compat_ioctl on some device, but programs 918 - It is implemented with ->compat_ioctl on some device, but programs
@@ -1846,7 +922,7 @@ struct ioctl_trans {
1846 Most other reasons are not valid. */ 922 Most other reasons are not valid. */
1847#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) 923#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd)
1848 924
1849static struct ioctl_trans ioctl_start[] = { 925static unsigned int ioctl_pointer[] = {
1850/* compatible ioctls first */ 926/* compatible ioctls first */
1851COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ 927COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */
1852COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ 928COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */
@@ -1857,7 +933,6 @@ COMPATIBLE_IOCTL(TCSETA)
1857COMPATIBLE_IOCTL(TCSETAW) 933COMPATIBLE_IOCTL(TCSETAW)
1858COMPATIBLE_IOCTL(TCSETAF) 934COMPATIBLE_IOCTL(TCSETAF)
1859COMPATIBLE_IOCTL(TCSBRK) 935COMPATIBLE_IOCTL(TCSBRK)
1860ULONG_IOCTL(TCSBRKP)
1861COMPATIBLE_IOCTL(TCXONC) 936COMPATIBLE_IOCTL(TCXONC)
1862COMPATIBLE_IOCTL(TCFLSH) 937COMPATIBLE_IOCTL(TCFLSH)
1863COMPATIBLE_IOCTL(TCGETS) 938COMPATIBLE_IOCTL(TCGETS)
@@ -1867,7 +942,7 @@ COMPATIBLE_IOCTL(TCSETSF)
1867COMPATIBLE_IOCTL(TIOCLINUX) 942COMPATIBLE_IOCTL(TIOCLINUX)
1868COMPATIBLE_IOCTL(TIOCSBRK) 943COMPATIBLE_IOCTL(TIOCSBRK)
1869COMPATIBLE_IOCTL(TIOCCBRK) 944COMPATIBLE_IOCTL(TIOCCBRK)
1870ULONG_IOCTL(TIOCMIWAIT) 945COMPATIBLE_IOCTL(TIOCGSID)
1871COMPATIBLE_IOCTL(TIOCGICOUNT) 946COMPATIBLE_IOCTL(TIOCGICOUNT)
1872/* Little t */ 947/* Little t */
1873COMPATIBLE_IOCTL(TIOCGETD) 948COMPATIBLE_IOCTL(TIOCGETD)
@@ -1889,7 +964,6 @@ COMPATIBLE_IOCTL(TIOCSTI)
1889COMPATIBLE_IOCTL(TIOCOUTQ) 964COMPATIBLE_IOCTL(TIOCOUTQ)
1890COMPATIBLE_IOCTL(TIOCSPGRP) 965COMPATIBLE_IOCTL(TIOCSPGRP)
1891COMPATIBLE_IOCTL(TIOCGPGRP) 966COMPATIBLE_IOCTL(TIOCGPGRP)
1892ULONG_IOCTL(TIOCSCTTY)
1893COMPATIBLE_IOCTL(TIOCGPTN) 967COMPATIBLE_IOCTL(TIOCGPTN)
1894COMPATIBLE_IOCTL(TIOCSPTLCK) 968COMPATIBLE_IOCTL(TIOCSPTLCK)
1895COMPATIBLE_IOCTL(TIOCSERGETLSR) 969COMPATIBLE_IOCTL(TIOCSERGETLSR)
@@ -1912,44 +986,11 @@ COMPATIBLE_IOCTL(FIGETBSZ)
1912/* 'X' - originally XFS but some now in the VFS */ 986/* 'X' - originally XFS but some now in the VFS */
1913COMPATIBLE_IOCTL(FIFREEZE) 987COMPATIBLE_IOCTL(FIFREEZE)
1914COMPATIBLE_IOCTL(FITHAW) 988COMPATIBLE_IOCTL(FITHAW)
1915/* RAID */
1916COMPATIBLE_IOCTL(RAID_VERSION)
1917COMPATIBLE_IOCTL(GET_ARRAY_INFO)
1918COMPATIBLE_IOCTL(GET_DISK_INFO)
1919COMPATIBLE_IOCTL(PRINT_RAID_DEBUG)
1920COMPATIBLE_IOCTL(RAID_AUTORUN)
1921COMPATIBLE_IOCTL(CLEAR_ARRAY)
1922COMPATIBLE_IOCTL(ADD_NEW_DISK)
1923ULONG_IOCTL(HOT_REMOVE_DISK)
1924COMPATIBLE_IOCTL(SET_ARRAY_INFO)
1925COMPATIBLE_IOCTL(SET_DISK_INFO)
1926COMPATIBLE_IOCTL(WRITE_RAID_INFO)
1927COMPATIBLE_IOCTL(UNPROTECT_ARRAY)
1928COMPATIBLE_IOCTL(PROTECT_ARRAY)
1929ULONG_IOCTL(HOT_ADD_DISK)
1930ULONG_IOCTL(SET_DISK_FAULTY)
1931COMPATIBLE_IOCTL(RUN_ARRAY)
1932COMPATIBLE_IOCTL(STOP_ARRAY)
1933COMPATIBLE_IOCTL(STOP_ARRAY_RO)
1934COMPATIBLE_IOCTL(RESTART_ARRAY_RW)
1935COMPATIBLE_IOCTL(GET_BITMAP_FILE)
1936ULONG_IOCTL(SET_BITMAP_FILE)
1937/* Big K */
1938COMPATIBLE_IOCTL(PIO_FONT)
1939COMPATIBLE_IOCTL(GIO_FONT)
1940COMPATIBLE_IOCTL(PIO_CMAP)
1941COMPATIBLE_IOCTL(GIO_CMAP)
1942ULONG_IOCTL(KDSIGACCEPT)
1943COMPATIBLE_IOCTL(KDGETKEYCODE) 989COMPATIBLE_IOCTL(KDGETKEYCODE)
1944COMPATIBLE_IOCTL(KDSETKEYCODE) 990COMPATIBLE_IOCTL(KDSETKEYCODE)
1945ULONG_IOCTL(KIOCSOUND)
1946ULONG_IOCTL(KDMKTONE)
1947COMPATIBLE_IOCTL(KDGKBTYPE) 991COMPATIBLE_IOCTL(KDGKBTYPE)
1948ULONG_IOCTL(KDSETMODE)
1949COMPATIBLE_IOCTL(KDGETMODE) 992COMPATIBLE_IOCTL(KDGETMODE)
1950ULONG_IOCTL(KDSKBMODE)
1951COMPATIBLE_IOCTL(KDGKBMODE) 993COMPATIBLE_IOCTL(KDGKBMODE)
1952ULONG_IOCTL(KDSKBMETA)
1953COMPATIBLE_IOCTL(KDGKBMETA) 994COMPATIBLE_IOCTL(KDGKBMETA)
1954COMPATIBLE_IOCTL(KDGKBENT) 995COMPATIBLE_IOCTL(KDGKBENT)
1955COMPATIBLE_IOCTL(KDSKBENT) 996COMPATIBLE_IOCTL(KDSKBENT)
@@ -1959,15 +1000,7 @@ COMPATIBLE_IOCTL(KDGKBDIACR)
1959COMPATIBLE_IOCTL(KDSKBDIACR) 1000COMPATIBLE_IOCTL(KDSKBDIACR)
1960COMPATIBLE_IOCTL(KDKBDREP) 1001COMPATIBLE_IOCTL(KDKBDREP)
1961COMPATIBLE_IOCTL(KDGKBLED) 1002COMPATIBLE_IOCTL(KDGKBLED)
1962ULONG_IOCTL(KDSKBLED)
1963COMPATIBLE_IOCTL(KDGETLED) 1003COMPATIBLE_IOCTL(KDGETLED)
1964ULONG_IOCTL(KDSETLED)
1965COMPATIBLE_IOCTL(GIO_SCRNMAP)
1966COMPATIBLE_IOCTL(PIO_SCRNMAP)
1967COMPATIBLE_IOCTL(GIO_UNISCRNMAP)
1968COMPATIBLE_IOCTL(PIO_UNISCRNMAP)
1969COMPATIBLE_IOCTL(PIO_FONTRESET)
1970COMPATIBLE_IOCTL(PIO_UNIMAPCLR)
1971#ifdef CONFIG_BLOCK 1004#ifdef CONFIG_BLOCK
1972/* Big S */ 1005/* Big S */
1973COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) 1006COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
@@ -1979,32 +1012,9 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
1979COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) 1012COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
1980COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) 1013COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
1981#endif 1014#endif
1982/* Big T */ 1015/* Big V (don't complain on serial console) */
1983COMPATIBLE_IOCTL(TUNSETNOCSUM) 1016IGNORE_IOCTL(VT_OPENQRY)
1984COMPATIBLE_IOCTL(TUNSETDEBUG) 1017IGNORE_IOCTL(VT_GETMODE)
1985COMPATIBLE_IOCTL(TUNSETPERSIST)
1986COMPATIBLE_IOCTL(TUNSETOWNER)
1987COMPATIBLE_IOCTL(TUNSETLINK)
1988COMPATIBLE_IOCTL(TUNSETGROUP)
1989COMPATIBLE_IOCTL(TUNGETFEATURES)
1990COMPATIBLE_IOCTL(TUNSETOFFLOAD)
1991COMPATIBLE_IOCTL(TUNSETTXFILTER)
1992COMPATIBLE_IOCTL(TUNGETSNDBUF)
1993COMPATIBLE_IOCTL(TUNSETSNDBUF)
1994/* Big V */
1995COMPATIBLE_IOCTL(VT_SETMODE)
1996COMPATIBLE_IOCTL(VT_GETMODE)
1997COMPATIBLE_IOCTL(VT_GETSTATE)
1998COMPATIBLE_IOCTL(VT_OPENQRY)
1999ULONG_IOCTL(VT_ACTIVATE)
2000ULONG_IOCTL(VT_WAITACTIVE)
2001ULONG_IOCTL(VT_RELDISP)
2002ULONG_IOCTL(VT_DISALLOCATE)
2003COMPATIBLE_IOCTL(VT_RESIZE)
2004COMPATIBLE_IOCTL(VT_RESIZEX)
2005COMPATIBLE_IOCTL(VT_LOCKSWITCH)
2006COMPATIBLE_IOCTL(VT_UNLOCKSWITCH)
2007COMPATIBLE_IOCTL(VT_GETHIFONTMASK)
2008/* Little p (/dev/rtc, /dev/envctrl, etc.) */ 1018/* Little p (/dev/rtc, /dev/envctrl, etc.) */
2009COMPATIBLE_IOCTL(RTC_AIE_ON) 1019COMPATIBLE_IOCTL(RTC_AIE_ON)
2010COMPATIBLE_IOCTL(RTC_AIE_OFF) 1020COMPATIBLE_IOCTL(RTC_AIE_OFF)
@@ -2032,36 +1042,15 @@ COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */
2032COMPATIBLE_IOCTL(MTIOCTOP) 1042COMPATIBLE_IOCTL(MTIOCTOP)
2033/* Socket level stuff */ 1043/* Socket level stuff */
2034COMPATIBLE_IOCTL(FIOQSIZE) 1044COMPATIBLE_IOCTL(FIOQSIZE)
2035COMPATIBLE_IOCTL(FIOSETOWN)
2036COMPATIBLE_IOCTL(SIOCSPGRP)
2037COMPATIBLE_IOCTL(FIOGETOWN)
2038COMPATIBLE_IOCTL(SIOCGPGRP)
2039COMPATIBLE_IOCTL(SIOCATMARK)
2040COMPATIBLE_IOCTL(SIOCSIFLINK)
2041COMPATIBLE_IOCTL(SIOCSIFENCAP)
2042COMPATIBLE_IOCTL(SIOCGIFENCAP)
2043COMPATIBLE_IOCTL(SIOCSIFNAME)
2044COMPATIBLE_IOCTL(SIOCSARP)
2045COMPATIBLE_IOCTL(SIOCGARP)
2046COMPATIBLE_IOCTL(SIOCDARP)
2047COMPATIBLE_IOCTL(SIOCSRARP)
2048COMPATIBLE_IOCTL(SIOCGRARP)
2049COMPATIBLE_IOCTL(SIOCDRARP)
2050COMPATIBLE_IOCTL(SIOCADDDLCI)
2051COMPATIBLE_IOCTL(SIOCDELDLCI)
2052COMPATIBLE_IOCTL(SIOCGMIIPHY)
2053COMPATIBLE_IOCTL(SIOCGMIIREG)
2054COMPATIBLE_IOCTL(SIOCSMIIREG)
2055COMPATIBLE_IOCTL(SIOCGIFVLAN)
2056COMPATIBLE_IOCTL(SIOCSIFVLAN)
2057COMPATIBLE_IOCTL(SIOCBRADDBR)
2058COMPATIBLE_IOCTL(SIOCBRDELBR)
2059#ifdef CONFIG_BLOCK 1045#ifdef CONFIG_BLOCK
1046/* loop */
1047IGNORE_IOCTL(LOOP_CLR_FD)
1048/* md calls this on random blockdevs */
1049IGNORE_IOCTL(RAID_VERSION)
2060/* SG stuff */ 1050/* SG stuff */
2061COMPATIBLE_IOCTL(SG_SET_TIMEOUT) 1051COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
2062COMPATIBLE_IOCTL(SG_GET_TIMEOUT) 1052COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
2063COMPATIBLE_IOCTL(SG_EMULATED_HOST) 1053COMPATIBLE_IOCTL(SG_EMULATED_HOST)
2064ULONG_IOCTL(SG_SET_TRANSFORM)
2065COMPATIBLE_IOCTL(SG_GET_TRANSFORM) 1054COMPATIBLE_IOCTL(SG_GET_TRANSFORM)
2066COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) 1055COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE)
2067COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) 1056COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE)
@@ -2115,8 +1104,6 @@ COMPATIBLE_IOCTL(PPPIOCGCHAN)
2115/* PPPOX */ 1104/* PPPOX */
2116COMPATIBLE_IOCTL(PPPOEIOCSFWD) 1105COMPATIBLE_IOCTL(PPPOEIOCSFWD)
2117COMPATIBLE_IOCTL(PPPOEIOCDFWD) 1106COMPATIBLE_IOCTL(PPPOEIOCDFWD)
2118/* LP */
2119COMPATIBLE_IOCTL(LPGETSTATUS)
2120/* ppdev */ 1107/* ppdev */
2121COMPATIBLE_IOCTL(PPSETMODE) 1108COMPATIBLE_IOCTL(PPSETMODE)
2122COMPATIBLE_IOCTL(PPRSTATUS) 1109COMPATIBLE_IOCTL(PPRSTATUS)
@@ -2298,8 +1285,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
2298COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) 1285COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
2299COMPATIBLE_IOCTL(OSS_GETVERSION) 1286COMPATIBLE_IOCTL(OSS_GETVERSION)
2300/* AUTOFS */ 1287/* AUTOFS */
2301ULONG_IOCTL(AUTOFS_IOC_READY)
2302ULONG_IOCTL(AUTOFS_IOC_FAIL)
2303COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC) 1288COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC)
2304COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER) 1289COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
2305COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE) 1290COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
@@ -2311,22 +1296,6 @@ COMPATIBLE_IOCTL(RAW_SETBIND)
2311COMPATIBLE_IOCTL(RAW_GETBIND) 1296COMPATIBLE_IOCTL(RAW_GETBIND)
2312/* SMB ioctls which do not need any translations */ 1297/* SMB ioctls which do not need any translations */
2313COMPATIBLE_IOCTL(SMB_IOC_NEWCONN) 1298COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
2314/* Little a */
2315COMPATIBLE_IOCTL(ATMSIGD_CTRL)
2316COMPATIBLE_IOCTL(ATMARPD_CTRL)
2317COMPATIBLE_IOCTL(ATMLEC_CTRL)
2318COMPATIBLE_IOCTL(ATMLEC_MCAST)
2319COMPATIBLE_IOCTL(ATMLEC_DATA)
2320COMPATIBLE_IOCTL(ATM_SETSC)
2321COMPATIBLE_IOCTL(SIOCSIFATMTCP)
2322COMPATIBLE_IOCTL(SIOCMKCLIP)
2323COMPATIBLE_IOCTL(ATMARP_MKIP)
2324COMPATIBLE_IOCTL(ATMARP_SETENTRY)
2325COMPATIBLE_IOCTL(ATMARP_ENCAP)
2326COMPATIBLE_IOCTL(ATMTCP_CREATE)
2327COMPATIBLE_IOCTL(ATMTCP_REMOVE)
2328COMPATIBLE_IOCTL(ATMMPC_CTRL)
2329COMPATIBLE_IOCTL(ATMMPC_DATA)
2330/* Watchdog */ 1299/* Watchdog */
2331COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) 1300COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
2332COMPATIBLE_IOCTL(WDIOC_GETSTATUS) 1301COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -2408,30 +1377,11 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
2408COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) 1377COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
2409COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) 1378COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
2410COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) 1379COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
2411/* USB */
2412COMPATIBLE_IOCTL(USBDEVFS_RESETEP)
2413COMPATIBLE_IOCTL(USBDEVFS_SETINTERFACE)
2414COMPATIBLE_IOCTL(USBDEVFS_SETCONFIGURATION)
2415COMPATIBLE_IOCTL(USBDEVFS_GETDRIVER)
2416COMPATIBLE_IOCTL(USBDEVFS_DISCARDURB)
2417COMPATIBLE_IOCTL(USBDEVFS_CLAIMINTERFACE)
2418COMPATIBLE_IOCTL(USBDEVFS_RELEASEINTERFACE)
2419COMPATIBLE_IOCTL(USBDEVFS_CONNECTINFO)
2420COMPATIBLE_IOCTL(USBDEVFS_HUB_PORTINFO)
2421COMPATIBLE_IOCTL(USBDEVFS_RESET)
2422COMPATIBLE_IOCTL(USBDEVFS_SUBMITURB32)
2423COMPATIBLE_IOCTL(USBDEVFS_REAPURB32)
2424COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32)
2425COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT)
2426/* NBD */ 1380/* NBD */
2427ULONG_IOCTL(NBD_SET_SOCK)
2428ULONG_IOCTL(NBD_SET_BLKSIZE)
2429ULONG_IOCTL(NBD_SET_SIZE)
2430COMPATIBLE_IOCTL(NBD_DO_IT) 1381COMPATIBLE_IOCTL(NBD_DO_IT)
2431COMPATIBLE_IOCTL(NBD_CLEAR_SOCK) 1382COMPATIBLE_IOCTL(NBD_CLEAR_SOCK)
2432COMPATIBLE_IOCTL(NBD_CLEAR_QUE) 1383COMPATIBLE_IOCTL(NBD_CLEAR_QUE)
2433COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) 1384COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
2434ULONG_IOCTL(NBD_SET_SIZE_BLOCKS)
2435COMPATIBLE_IOCTL(NBD_DISCONNECT) 1385COMPATIBLE_IOCTL(NBD_DISCONNECT)
2436/* i2c */ 1386/* i2c */
2437COMPATIBLE_IOCTL(I2C_SLAVE) 1387COMPATIBLE_IOCTL(I2C_SLAVE)
@@ -2531,131 +1481,13 @@ COMPATIBLE_IOCTL(JSIOCGAXES)
2531COMPATIBLE_IOCTL(JSIOCGBUTTONS) 1481COMPATIBLE_IOCTL(JSIOCGBUTTONS)
2532COMPATIBLE_IOCTL(JSIOCGNAME(0)) 1482COMPATIBLE_IOCTL(JSIOCGNAME(0))
2533 1483
2534/* now things that need handlers */
2535#ifdef CONFIG_NET
2536HANDLE_IOCTL(SIOCGIFNAME, dev_ifname32)
2537HANDLE_IOCTL(SIOCGIFCONF, dev_ifconf)
2538HANDLE_IOCTL(SIOCGIFFLAGS, dev_ifsioc)
2539HANDLE_IOCTL(SIOCSIFFLAGS, dev_ifsioc)
2540HANDLE_IOCTL(SIOCGIFMETRIC, dev_ifsioc)
2541HANDLE_IOCTL(SIOCSIFMETRIC, dev_ifsioc)
2542HANDLE_IOCTL(SIOCGIFMTU, dev_ifsioc)
2543HANDLE_IOCTL(SIOCSIFMTU, dev_ifsioc)
2544HANDLE_IOCTL(SIOCGIFMEM, dev_ifsioc)
2545HANDLE_IOCTL(SIOCSIFMEM, dev_ifsioc)
2546HANDLE_IOCTL(SIOCGIFHWADDR, dev_ifsioc)
2547HANDLE_IOCTL(SIOCSIFHWADDR, dev_ifsioc)
2548HANDLE_IOCTL(SIOCADDMULTI, dev_ifsioc)
2549HANDLE_IOCTL(SIOCDELMULTI, dev_ifsioc)
2550HANDLE_IOCTL(SIOCGIFINDEX, dev_ifsioc)
2551HANDLE_IOCTL(SIOCGIFMAP, dev_ifsioc)
2552HANDLE_IOCTL(SIOCSIFMAP, dev_ifsioc)
2553HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc)
2554HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc)
2555HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc)
2556HANDLE_IOCTL(SIOCSHWTSTAMP, dev_ifsioc)
2557
2558/* ioctls used by appletalk ddp.c */
2559HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc)
2560HANDLE_IOCTL(SIOCDIFADDR, dev_ifsioc)
2561HANDLE_IOCTL(SIOCSARP, dev_ifsioc)
2562HANDLE_IOCTL(SIOCDARP, dev_ifsioc)
2563
2564HANDLE_IOCTL(SIOCGIFBRDADDR, dev_ifsioc)
2565HANDLE_IOCTL(SIOCSIFBRDADDR, dev_ifsioc)
2566HANDLE_IOCTL(SIOCGIFDSTADDR, dev_ifsioc)
2567HANDLE_IOCTL(SIOCSIFDSTADDR, dev_ifsioc)
2568HANDLE_IOCTL(SIOCGIFNETMASK, dev_ifsioc)
2569HANDLE_IOCTL(SIOCSIFNETMASK, dev_ifsioc)
2570HANDLE_IOCTL(SIOCSIFPFLAGS, dev_ifsioc)
2571HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc)
2572HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc)
2573HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc)
2574HANDLE_IOCTL(TUNSETIFF, dev_ifsioc)
2575HANDLE_IOCTL(TUNGETIFF, dev_ifsioc)
2576HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl)
2577HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl)
2578HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl)
2579HANDLE_IOCTL(SIOCBONDSETHWADDR, bond_ioctl)
2580HANDLE_IOCTL(SIOCBONDSLAVEINFOQUERY, bond_ioctl)
2581HANDLE_IOCTL(SIOCBONDINFOQUERY, bond_ioctl)
2582HANDLE_IOCTL(SIOCBONDCHANGEACTIVE, bond_ioctl)
2583HANDLE_IOCTL(SIOCADDRT, routing_ioctl)
2584HANDLE_IOCTL(SIOCDELRT, routing_ioctl)
2585HANDLE_IOCTL(SIOCBRADDIF, dev_ifsioc)
2586HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
2587/* Note SIOCRTMSG is no longer implemented, so this is safe and the user would have seen just an -EINVAL anyway. */
2588HANDLE_IOCTL(SIOCRTMSG, ret_einval)
2589HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
2590HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
2591#endif
2592#ifdef CONFIG_BLOCK
2593HANDLE_IOCTL(SG_IO,sg_ioctl_trans)
2594HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans)
2595#endif
2596HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans)
2597HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans)
2598HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans)
2599HANDLE_IOCTL(PPPIOCSACTIVE32, ppp_sock_fprog_ioctl_trans)
2600#ifdef CONFIG_BLOCK
2601HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans)
2602HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans)
2603#endif
2604#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
2605HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout)
2606#ifdef CONFIG_VT
2607HANDLE_IOCTL(PIO_FONTX, do_fontx_ioctl)
2608HANDLE_IOCTL(GIO_FONTX, do_fontx_ioctl)
2609HANDLE_IOCTL(PIO_UNIMAP, do_unimap_ioctl)
2610HANDLE_IOCTL(GIO_UNIMAP, do_unimap_ioctl)
2611HANDLE_IOCTL(KDFONTOP, do_kdfontop_ioctl)
2612#endif
2613/* One SMB ioctl needs translations. */
2614#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
2615HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid)
2616HANDLE_IOCTL(ATM_GETLINKRATE32, do_atm_ioctl)
2617HANDLE_IOCTL(ATM_GETNAMES32, do_atm_ioctl)
2618HANDLE_IOCTL(ATM_GETTYPE32, do_atm_ioctl)
2619HANDLE_IOCTL(ATM_GETESI32, do_atm_ioctl)
2620HANDLE_IOCTL(ATM_GETADDR32, do_atm_ioctl)
2621HANDLE_IOCTL(ATM_RSTADDR32, do_atm_ioctl)
2622HANDLE_IOCTL(ATM_ADDADDR32, do_atm_ioctl)
2623HANDLE_IOCTL(ATM_DELADDR32, do_atm_ioctl)
2624HANDLE_IOCTL(ATM_GETCIRANGE32, do_atm_ioctl)
2625HANDLE_IOCTL(ATM_SETCIRANGE32, do_atm_ioctl)
2626HANDLE_IOCTL(ATM_SETESI32, do_atm_ioctl)
2627HANDLE_IOCTL(ATM_SETESIF32, do_atm_ioctl)
2628HANDLE_IOCTL(ATM_GETSTAT32, do_atm_ioctl)
2629HANDLE_IOCTL(ATM_GETSTATZ32, do_atm_ioctl)
2630HANDLE_IOCTL(ATM_GETLOOP32, do_atm_ioctl)
2631HANDLE_IOCTL(ATM_SETLOOP32, do_atm_ioctl)
2632HANDLE_IOCTL(ATM_QUERYLOOP32, do_atm_ioctl)
2633HANDLE_IOCTL(SONET_GETSTAT, do_atm_ioctl)
2634HANDLE_IOCTL(SONET_GETSTATZ, do_atm_ioctl)
2635HANDLE_IOCTL(SONET_GETDIAG, do_atm_ioctl)
2636HANDLE_IOCTL(SONET_SETDIAG, do_atm_ioctl)
2637HANDLE_IOCTL(SONET_CLRDIAG, do_atm_ioctl)
2638HANDLE_IOCTL(SONET_SETFRAMING, do_atm_ioctl)
2639HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
2640HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
2641/* block stuff */
2642#ifdef CONFIG_BLOCK
2643/* loop */
2644IGNORE_IOCTL(LOOP_CLR_FD)
2645/* Raw devices */
2646HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
2647HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
2648#endif
2649/* Serial */
2650HANDLE_IOCTL(TIOCGSERIAL, serial_struct_ioctl)
2651HANDLE_IOCTL(TIOCSSERIAL, serial_struct_ioctl)
2652#ifdef TIOCGLTC 1484#ifdef TIOCGLTC
2653COMPATIBLE_IOCTL(TIOCGLTC) 1485COMPATIBLE_IOCTL(TIOCGLTC)
2654COMPATIBLE_IOCTL(TIOCSLTC) 1486COMPATIBLE_IOCTL(TIOCSLTC)
2655#endif 1487#endif
2656#ifdef TIOCSTART 1488#ifdef TIOCSTART
2657/* 1489/*
2658 * For these two we have defintions in ioctls.h and/or termios.h on 1490 * For these two we have definitions in ioctls.h and/or termios.h on
2659 * some architectures but no actual implementation. Some applications 1491 * some architectures but no actual implementation. Some applications
2660 * like bash call them if they are defined in the headers, so we provide 1492 * like bash call them if they are defined in the headers, so we provide
2661 * entries here to avoid syslog message spew. 1493 * entries here to avoid syslog message spew.
@@ -2663,43 +1495,6 @@ COMPATIBLE_IOCTL(TIOCSLTC)
2663COMPATIBLE_IOCTL(TIOCSTART) 1495COMPATIBLE_IOCTL(TIOCSTART)
2664COMPATIBLE_IOCTL(TIOCSTOP) 1496COMPATIBLE_IOCTL(TIOCSTOP)
2665#endif 1497#endif
2666/* Usbdevfs */
2667HANDLE_IOCTL(USBDEVFS_CONTROL32, do_usbdevfs_control)
2668HANDLE_IOCTL(USBDEVFS_BULK32, do_usbdevfs_bulk)
2669HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal)
2670COMPATIBLE_IOCTL(USBDEVFS_IOCTL32)
2671/* i2c */
2672HANDLE_IOCTL(I2C_FUNCS, w_long)
2673HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl)
2674HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl)
2675/* bridge */
2676HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
2677HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
2678/* Not implemented in the native kernel */
2679IGNORE_IOCTL(SIOCGIFCOUNT)
2680HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl)
2681HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl)
2682HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl)
2683HANDLE_IOCTL(RTC_EPOCH_SET32, rtc_ioctl)
2684
2685/* dvb */
2686HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event)
2687HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture)
2688HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette)
2689
2690/* parport */
2691COMPATIBLE_IOCTL(LPTIME)
2692COMPATIBLE_IOCTL(LPCHAR)
2693COMPATIBLE_IOCTL(LPABORTOPEN)
2694COMPATIBLE_IOCTL(LPCAREFUL)
2695COMPATIBLE_IOCTL(LPWAIT)
2696COMPATIBLE_IOCTL(LPSETIRQ)
2697COMPATIBLE_IOCTL(LPGETSTATUS)
2698COMPATIBLE_IOCTL(LPGETSTATUS)
2699COMPATIBLE_IOCTL(LPRESET)
2700/* LPGETSTATS not implemented, but no kernels seem to compile it in anyway */
2701COMPATIBLE_IOCTL(LPGETFLAGS)
2702HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
2703 1498
2704/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, 1499/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
2705 but we don't want warnings on other file systems. So declare 1500 but we don't want warnings on other file systems. So declare
@@ -2727,12 +1522,108 @@ IGNORE_IOCTL(FBIOGCURSOR32)
2727#endif 1522#endif
2728}; 1523};
2729 1524
2730#define IOCTL_HASHSIZE 256 1525/*
2731static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE]; 1526 * Convert common ioctl arguments based on their command number
2732 1527 *
2733static inline unsigned long ioctl32_hash(unsigned long cmd) 1528 * Please do not add any code in here. Instead, implement
 1529 * a compat_ioctl operation in the place that handles the
1530 * ioctl for the native case.
1531 */
1532static long do_ioctl_trans(int fd, unsigned int cmd,
1533 unsigned long arg, struct file *file)
2734{ 1534{
2735 return (((cmd >> 6) ^ (cmd >> 4) ^ cmd)) % IOCTL_HASHSIZE; 1535 void __user *argp = compat_ptr(arg);
1536
1537 switch (cmd) {
1538 case PPPIOCGIDLE32:
1539 return ppp_gidle(fd, cmd, argp);
1540 case PPPIOCSCOMPRESS32:
1541 return ppp_scompress(fd, cmd, argp);
1542 case PPPIOCSPASS32:
1543 case PPPIOCSACTIVE32:
1544 return ppp_sock_fprog_ioctl_trans(fd, cmd, argp);
1545#ifdef CONFIG_BLOCK
1546 case SG_IO:
1547 return sg_ioctl_trans(fd, cmd, argp);
1548 case SG_GET_REQUEST_TABLE:
1549 return sg_grt_trans(fd, cmd, argp);
1550 case MTIOCGET32:
1551 case MTIOCPOS32:
1552 return mt_ioctl_trans(fd, cmd, argp);
1553 /* Raw devices */
1554 case RAW_SETBIND:
1555 case RAW_GETBIND:
1556 return raw_ioctl(fd, cmd, argp);
1557#endif
1558#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
1559 case AUTOFS_IOC_SETTIMEOUT32:
1560 return ioc_settimeout(fd, cmd, argp);
1561 /* One SMB ioctl needs translations. */
1562#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
1563 case SMB_IOC_GETMOUNTUID_32:
1564 return do_smb_getmountuid(fd, cmd, argp);
1565 /* Serial */
1566 case TIOCGSERIAL:
1567 case TIOCSSERIAL:
1568 return serial_struct_ioctl(fd, cmd, argp);
1569 /* i2c */
1570 case I2C_FUNCS:
1571 return w_long(fd, cmd, argp);
1572 case I2C_RDWR:
1573 return do_i2c_rdwr_ioctl(fd, cmd, argp);
1574 case I2C_SMBUS:
1575 return do_i2c_smbus_ioctl(fd, cmd, argp);
1576 /* Not implemented in the native kernel */
1577 case RTC_IRQP_READ32:
1578 case RTC_IRQP_SET32:
1579 case RTC_EPOCH_READ32:
1580 case RTC_EPOCH_SET32:
1581 return rtc_ioctl(fd, cmd, argp);
1582
1583 /* dvb */
1584 case VIDEO_GET_EVENT:
1585 return do_video_get_event(fd, cmd, argp);
1586 case VIDEO_STILLPICTURE:
1587 return do_video_stillpicture(fd, cmd, argp);
1588 case VIDEO_SET_SPU_PALETTE:
1589 return do_video_set_spu_palette(fd, cmd, argp);
1590 }
1591
1592 /*
1593 * These take an integer instead of a pointer as 'arg',
1594 * so we must not do a compat_ptr() translation.
1595 */
1596 switch (cmd) {
1597 /* Big T */
1598 case TCSBRKP:
1599 case TIOCMIWAIT:
1600 case TIOCSCTTY:
1601 /* RAID */
1602 case HOT_REMOVE_DISK:
1603 case HOT_ADD_DISK:
1604 case SET_DISK_FAULTY:
1605 case SET_BITMAP_FILE:
1606 /* Big K */
1607 case KDSIGACCEPT:
1608 case KIOCSOUND:
1609 case KDMKTONE:
1610 case KDSETMODE:
1611 case KDSKBMODE:
1612 case KDSKBMETA:
1613 case KDSKBLED:
1614 case KDSETLED:
1615 /* AUTOFS */
1616 case AUTOFS_IOC_READY:
1617 case AUTOFS_IOC_FAIL:
1618 /* NBD */
1619 case NBD_SET_SOCK:
1620 case NBD_SET_BLKSIZE:
1621 case NBD_SET_SIZE:
1622 case NBD_SET_SIZE_BLOCKS:
1623 return do_vfs_ioctl(file, fd, cmd, arg);
1624 }
1625
1626 return -ENOIOCTLCMD;
2736} 1627}
2737 1628
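The comment at the top of do_ioctl_trans() asks that new conversions live next to the native handler rather than in this file. For a driver whose ioctl arguments have the same layout for 32-bit and 64-bit callers, that usually amounts to a one-line ->compat_ioctl that fixes up the pointer representation. A sketch, with all foo_* names hypothetical:

#include <linux/compat.h>
#include <linux/fs.h>
#include <linux/module.h>

static long foo_unlocked_ioctl(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        /* native handler; real command dispatch elided */
        return -ENOTTY;
}

/* 32-bit callers pass a compat pointer; widen it and share the
 * native handler, since the argument layout is identical */
static long foo_compat_ioctl(struct file *file, unsigned int cmd,
                             unsigned long arg)
{
        return foo_unlocked_ioctl(file, cmd,
                                  (unsigned long)compat_ptr(arg));
}

static const struct file_operations foo_fops = {
        .owner          = THIS_MODULE,
        .unlocked_ioctl = foo_unlocked_ioctl,
        .compat_ioctl   = foo_compat_ioctl,
};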
2738static void compat_ioctl_error(struct file *filp, unsigned int fd, 1629static void compat_ioctl_error(struct file *filp, unsigned int fd,
@@ -2764,12 +1655,33 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
2764 free_page((unsigned long)path); 1655 free_page((unsigned long)path);
2765} 1656}
2766 1657
1658static int compat_ioctl_check_table(unsigned int xcmd)
1659{
1660 int i;
1661 const int max = ARRAY_SIZE(ioctl_pointer) - 1;
1662
1663 BUILD_BUG_ON(max >= (1 << 16));
1664
1665 /* guess initial offset into table, assuming a
1666 normalized distribution */
1667 i = ((xcmd >> 16) * max) >> 16;
1668
1669 /* do linear search up first, until greater or equal */
1670 while (ioctl_pointer[i] < xcmd && i < max)
1671 i++;
1672
1673 /* then do linear search down */
1674 while (ioctl_pointer[i] > xcmd && i > 0)
1675 i--;
1676
1677 return ioctl_pointer[i] == xcmd;
1678}
1679
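The lookup above relies on the table being sorted and on XFORM spreading the keys roughly evenly over the 32-bit range: the scaled guess ((xcmd >> 16) * max) >> 16 then lands close to the key's sorted position, and the two linear scans only walk a few slots. The BUILD_BUG_ON keeps the 32-bit multiply from overflowing. The same idea over a standalone sorted array — a user-space sketch with toy key values:

#include <stdint.h>
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* sorted table of (already transformed) command keys */
static const uint32_t table[] = {
        0x11111111, 0x33333333, 0x55555555,
        0x99999999, 0xbbbbbbbb, 0xdddddddd,
};

static int check_table(uint32_t xcmd)
{
        const int max = ARRAY_SIZE(table) - 1;
        /* guess an index, assuming evenly distributed keys */
        int i = ((xcmd >> 16) * max) >> 16;

        /* linear search up, then down, exactly as in the kernel */
        while (table[i] < xcmd && i < max)
                i++;
        while (table[i] > xcmd && i > 0)
                i--;
        return table[i] == xcmd;
}

int main(void)
{
        /* prints "1 0": first key is present, second is not */
        printf("%d %d\n", check_table(0x99999999), check_table(0x12345678));
        return 0;
}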
2767asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, 1680asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2768 unsigned long arg) 1681 unsigned long arg)
2769{ 1682{
2770 struct file *filp; 1683 struct file *filp;
2771 int error = -EBADF; 1684 int error = -EBADF;
2772 struct ioctl_trans *t;
2773 int fput_needed; 1685 int fput_needed;
2774 1686
2775 filp = fget_light(fd, &fput_needed); 1687 filp = fget_light(fd, &fput_needed);
@@ -2797,7 +1709,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2797#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 1709#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
2798 case FS_IOC_RESVSP_32: 1710 case FS_IOC_RESVSP_32:
2799 case FS_IOC_RESVSP64_32: 1711 case FS_IOC_RESVSP64_32:
2800 error = compat_ioctl_preallocate(filp, arg); 1712 error = compat_ioctl_preallocate(filp, compat_ptr(arg));
2801 goto out_fput; 1713 goto out_fput;
2802#else 1714#else
2803 case FS_IOC_RESVSP: 1715 case FS_IOC_RESVSP:
@@ -2826,18 +1738,11 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2826 break; 1738 break;
2827 } 1739 }
2828 1740
2829 for (t = ioctl32_hash_table[ioctl32_hash(cmd)]; t; t = t->next) { 1741 if (compat_ioctl_check_table(XFORM(cmd)))
2830 if (t->cmd == cmd) 1742 goto found_handler;
2831 goto found_handler;
2832 }
2833 1743
2834#ifdef CONFIG_NET 1744 error = do_ioctl_trans(fd, cmd, arg, filp);
2835 if (S_ISSOCK(filp->f_path.dentry->d_inode->i_mode) && 1745 if (error == -ENOIOCTLCMD) {
2836 cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
2837 error = siocdevprivate_ioctl(fd, cmd, arg);
2838 } else
2839#endif
2840 {
2841 static int count; 1746 static int count;
2842 1747
2843 if (++count <= 50) 1748 if (++count <= 50)
@@ -2848,13 +1753,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2848 goto out_fput; 1753 goto out_fput;
2849 1754
2850 found_handler: 1755 found_handler:
2851 if (t->handler) { 1756 arg = (unsigned long)compat_ptr(arg);
2852 lock_kernel();
2853 error = t->handler(fd, cmd, arg, filp);
2854 unlock_kernel();
2855 goto out_fput;
2856 }
2857
2858 do_ioctl: 1757 do_ioctl:
2859 error = do_vfs_ioctl(filp, fd, cmd, arg); 1758 error = do_vfs_ioctl(filp, fd, cmd, arg);
2860 out_fput: 1759 out_fput:
@@ -2863,35 +1762,22 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
2863 return error; 1762 return error;
2864} 1763}
2865 1764
2866static void ioctl32_insert_translation(struct ioctl_trans *trans) 1765static int __init init_sys32_ioctl_cmp(const void *p, const void *q)
2867{ 1766{
2868 unsigned long hash; 1767 unsigned int a, b;
2869 struct ioctl_trans *t; 1768 a = *(unsigned int *)p;
2870 1769 b = *(unsigned int *)q;
2871 hash = ioctl32_hash (trans->cmd); 1770 if (a > b)
2872 if (!ioctl32_hash_table[hash]) 1771 return 1;
2873 ioctl32_hash_table[hash] = trans; 1772 if (a < b)
2874 else { 1773 return -1;
2875 t = ioctl32_hash_table[hash]; 1774 return 0;
2876 while (t->next)
2877 t = t->next;
2878 trans->next = NULL;
2879 t->next = trans;
2880 }
2881} 1775}
2882 1776
2883static int __init init_sys32_ioctl(void) 1777static int __init init_sys32_ioctl(void)
2884{ 1778{
2885 int i; 1779 sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer),
2886 1780 init_sys32_ioctl_cmp, NULL);
2887 for (i = 0; i < ARRAY_SIZE(ioctl_start); i++) {
2888 if (ioctl_start[i].next) {
2889 printk("ioctl translation %d bad\n",i);
2890 return -1;
2891 }
2892
2893 ioctl32_insert_translation(&ioctl_start[i]);
2894 }
2895 return 0; 1781 return 0;
2896} 1782}
2897__initcall(init_sys32_ioctl); 1783__initcall(init_sys32_ioctl);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a2f746066c5d..c8af2d91174b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -34,6 +34,7 @@
34#include <linux/capability.h> 34#include <linux/capability.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/lockdep.h> 36#include <linux/lockdep.h>
37#include <linux/slab.h>
37 38
38#include <linux/configfs.h> 39#include <linux/configfs.h>
39#include "configfs_internal.h" 40#include "configfs_internal.h"
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8421cea7d8c7..8c8d64230c2d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -29,6 +29,7 @@
29#include <linux/mount.h> 29#include <linux/mount.h>
30#include <linux/pagemap.h> 30#include <linux/pagemap.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/slab.h>
32 33
33#include <linux/configfs.h> 34#include <linux/configfs.h>
34#include "configfs_internal.h" 35#include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index c8afa6b1d91d..0f3eb41d9201 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/slab.h>
30 31
31#include <linux/configfs.h> 32#include <linux/configfs.h>
32#include "configfs_internal.h" 33#include "configfs_internal.h"
@@ -121,8 +122,10 @@ static int get_target(const char *symname, struct path *path,
121 ret = -ENOENT; 122 ret = -ENOENT;
122 path_put(path); 123 path_put(path);
123 } 124 }
124 } else 125 } else {
125 ret = -EPERM; 126 ret = -EPERM;
127 path_put(path);
128 }
126 } 129 }
127 130
128 return ret; 131 return ret;
diff --git a/fs/dcache.c b/fs/dcache.c
index a100fa35a48f..f1358e5c3a59 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -257,6 +257,7 @@ kill_it:
257 if (dentry) 257 if (dentry)
258 goto repeat; 258 goto repeat;
259} 259}
260EXPORT_SYMBOL(dput);
260 261
261/** 262/**
262 * d_invalidate - invalidate a dentry 263 * d_invalidate - invalidate a dentry
@@ -314,6 +315,7 @@ int d_invalidate(struct dentry * dentry)
314 spin_unlock(&dcache_lock); 315 spin_unlock(&dcache_lock);
315 return 0; 316 return 0;
316} 317}
318EXPORT_SYMBOL(d_invalidate);
317 319
318/* This should be called _only_ with dcache_lock held */ 320/* This should be called _only_ with dcache_lock held */
319 321
@@ -328,6 +330,7 @@ struct dentry * dget_locked(struct dentry *dentry)
328{ 330{
329 return __dget_locked(dentry); 331 return __dget_locked(dentry);
330} 332}
333EXPORT_SYMBOL(dget_locked);
331 334
332/** 335/**
333 * d_find_alias - grab a hashed alias of inode 336 * d_find_alias - grab a hashed alias of inode
@@ -384,6 +387,7 @@ struct dentry * d_find_alias(struct inode *inode)
384 } 387 }
385 return de; 388 return de;
386} 389}
390EXPORT_SYMBOL(d_find_alias);
387 391
388/* 392/*
389 * Try to kill dentries associated with this inode. 393 * Try to kill dentries associated with this inode.
@@ -408,6 +412,7 @@ restart:
408 } 412 }
409 spin_unlock(&dcache_lock); 413 spin_unlock(&dcache_lock);
410} 414}
415EXPORT_SYMBOL(d_prune_aliases);
411 416
412/* 417/*
413 * Throw away a dentry - free the inode, dput the parent. This requires that 418 * Throw away a dentry - free the inode, dput the parent. This requires that
@@ -610,6 +615,7 @@ void shrink_dcache_sb(struct super_block * sb)
610{ 615{
611 __shrink_dcache_sb(sb, NULL, 0); 616 __shrink_dcache_sb(sb, NULL, 0);
612} 617}
618EXPORT_SYMBOL(shrink_dcache_sb);
613 619
614/* 620/*
615 * destroy a single subtree of dentries for unmount 621 * destroy a single subtree of dentries for unmount
@@ -792,6 +798,7 @@ positive:
792 spin_unlock(&dcache_lock); 798 spin_unlock(&dcache_lock);
793 return 1; 799 return 1;
794} 800}
801EXPORT_SYMBOL(have_submounts);
795 802
796/* 803/*
797 * Search the dentry child list for the specified parent, 804 * Search the dentry child list for the specified parent,
@@ -876,6 +883,7 @@ void shrink_dcache_parent(struct dentry * parent)
876 while ((found = select_parent(parent)) != 0) 883 while ((found = select_parent(parent)) != 0)
877 __shrink_dcache_sb(sb, &found, 0); 884 __shrink_dcache_sb(sb, &found, 0);
878} 885}
886EXPORT_SYMBOL(shrink_dcache_parent);
879 887
880/* 888/*
881 * Scan `nr' dentries and return the number which remain. 889 * Scan `nr' dentries and return the number which remain.
@@ -968,6 +976,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
968 976
969 return dentry; 977 return dentry;
970} 978}
979EXPORT_SYMBOL(d_alloc);
971 980
972struct dentry *d_alloc_name(struct dentry *parent, const char *name) 981struct dentry *d_alloc_name(struct dentry *parent, const char *name)
973{ 982{
@@ -978,6 +987,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
978 q.hash = full_name_hash(q.name, q.len); 987 q.hash = full_name_hash(q.name, q.len);
979 return d_alloc(parent, &q); 988 return d_alloc(parent, &q);
980} 989}
990EXPORT_SYMBOL(d_alloc_name);
981 991
982/* the caller must hold dcache_lock */ 992/* the caller must hold dcache_lock */
983static void __d_instantiate(struct dentry *dentry, struct inode *inode) 993static void __d_instantiate(struct dentry *dentry, struct inode *inode)
@@ -1011,6 +1021,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
1011 spin_unlock(&dcache_lock); 1021 spin_unlock(&dcache_lock);
1012 security_d_instantiate(entry, inode); 1022 security_d_instantiate(entry, inode);
1013} 1023}
1024EXPORT_SYMBOL(d_instantiate);
1014 1025
1015/** 1026/**
1016 * d_instantiate_unique - instantiate a non-aliased dentry 1027 * d_instantiate_unique - instantiate a non-aliased dentry
@@ -1107,6 +1118,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1107 } 1118 }
1108 return res; 1119 return res;
1109} 1120}
1121EXPORT_SYMBOL(d_alloc_root);
1110 1122
1111static inline struct hlist_head *d_hash(struct dentry *parent, 1123static inline struct hlist_head *d_hash(struct dentry *parent,
1112 unsigned long hash) 1124 unsigned long hash)
@@ -1210,7 +1222,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1210 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1222 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
1211 spin_unlock(&dcache_lock); 1223 spin_unlock(&dcache_lock);
1212 security_d_instantiate(new, inode); 1224 security_d_instantiate(new, inode);
1213 d_rehash(dentry);
1214 d_move(new, dentry); 1225 d_move(new, dentry);
1215 iput(inode); 1226 iput(inode);
1216 } else { 1227 } else {
@@ -1224,6 +1235,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1224 d_add(dentry, inode); 1235 d_add(dentry, inode);
1225 return new; 1236 return new;
1226} 1237}
1238EXPORT_SYMBOL(d_splice_alias);
1227 1239
1228/** 1240/**
1229 * d_add_ci - lookup or allocate new dentry with case-exact name 1241 * d_add_ci - lookup or allocate new dentry with case-exact name
@@ -1313,6 +1325,7 @@ err_out:
1313 iput(inode); 1325 iput(inode);
1314 return ERR_PTR(error); 1326 return ERR_PTR(error);
1315} 1327}
1328EXPORT_SYMBOL(d_add_ci);
1316 1329
1317/** 1330/**
1318 * d_lookup - search for a dentry 1331 * d_lookup - search for a dentry
@@ -1356,6 +1369,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1356 } while (read_seqretry(&rename_lock, seq)); 1369 } while (read_seqretry(&rename_lock, seq));
1357 return dentry; 1370 return dentry;
1358} 1371}
1372EXPORT_SYMBOL(d_lookup);
1359 1373
1360struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1374struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1361{ 1375{
@@ -1482,6 +1496,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
1482out: 1496out:
1483 return 0; 1497 return 0;
1484} 1498}
1499EXPORT_SYMBOL(d_validate);
1485 1500
1486/* 1501/*
1487 * When a file is deleted, we have two options: 1502 * When a file is deleted, we have two options:
@@ -1527,6 +1542,7 @@ void d_delete(struct dentry * dentry)
1527 1542
1528 fsnotify_nameremove(dentry, isdir); 1543 fsnotify_nameremove(dentry, isdir);
1529} 1544}
1545EXPORT_SYMBOL(d_delete);
1530 1546
1531static void __d_rehash(struct dentry * entry, struct hlist_head *list) 1547static void __d_rehash(struct dentry * entry, struct hlist_head *list)
1532{ 1548{
@@ -1555,6 +1571,7 @@ void d_rehash(struct dentry * entry)
1555 spin_unlock(&entry->d_lock); 1571 spin_unlock(&entry->d_lock);
1556 spin_unlock(&dcache_lock); 1572 spin_unlock(&dcache_lock);
1557} 1573}
1574EXPORT_SYMBOL(d_rehash);
1558 1575
1559/* 1576/*
1560 * When switching names, the actual string doesn't strictly have to 1577 * When switching names, the actual string doesn't strictly have to
@@ -1701,6 +1718,7 @@ void d_move(struct dentry * dentry, struct dentry * target)
1701 d_move_locked(dentry, target); 1718 d_move_locked(dentry, target);
1702 spin_unlock(&dcache_lock); 1719 spin_unlock(&dcache_lock);
1703} 1720}
1721EXPORT_SYMBOL(d_move);
1704 1722
1705/** 1723/**
1706 * d_ancestor - search for an ancestor 1724 * d_ancestor - search for an ancestor
@@ -1867,6 +1885,7 @@ shouldnt_be_hashed:
1867 spin_unlock(&dcache_lock); 1885 spin_unlock(&dcache_lock);
1868 BUG(); 1886 BUG();
1869} 1887}
1888EXPORT_SYMBOL_GPL(d_materialise_unique);
1870 1889
1871static int prepend(char **buffer, int *buflen, const char *str, int namelen) 1890static int prepend(char **buffer, int *buflen, const char *str, int namelen)
1872{ 1891{
@@ -2004,6 +2023,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
2004 path_put(&root); 2023 path_put(&root);
2005 return res; 2024 return res;
2006} 2025}
2026EXPORT_SYMBOL(d_path);
2007 2027
2008/* 2028/*
2009 * Helper function for dentry_operations.d_dname() members 2029 * Helper function for dentry_operations.d_dname() members
@@ -2170,6 +2190,30 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2170 return result; 2190 return result;
2171} 2191}
2172 2192
2193int path_is_under(struct path *path1, struct path *path2)
2194{
2195 struct vfsmount *mnt = path1->mnt;
2196 struct dentry *dentry = path1->dentry;
2197 int res;
2198 spin_lock(&vfsmount_lock);
2199 if (mnt != path2->mnt) {
2200 for (;;) {
2201 if (mnt->mnt_parent == mnt) {
2202 spin_unlock(&vfsmount_lock);
2203 return 0;
2204 }
2205 if (mnt->mnt_parent == path2->mnt)
2206 break;
2207 mnt = mnt->mnt_parent;
2208 }
2209 dentry = mnt->mnt_mountpoint;
2210 }
2211 res = is_subdir(dentry, path2->dentry);
2212 spin_unlock(&vfsmount_lock);
2213 return res;
2214}
2215EXPORT_SYMBOL(path_is_under);
2216
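path_is_under() extends is_subdir() across mount boundaries: it climbs path1's mount stack until it reaches path2's mount (returning 0 if it hits the root first), then runs the dentry ancestry test at the crossing point. A hedged sketch of a caller — the names are hypothetical, but this is the sort of containment check a pivot_root-style operation performs:

#include <linux/errno.h>
#include <linux/path.h>

/* path_is_under() declaration from this patch assumed in scope */

static int foo_check_under(struct path *child, struct path *ancestor)
{
        /* true when @child lives somewhere beneath @ancestor,
         * even if one or more mounts sit in between */
        if (!path_is_under(child, ancestor))
                return -EINVAL;
        return 0;
}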
2173void d_genocide(struct dentry *root) 2217void d_genocide(struct dentry *root)
2174{ 2218{
2175 struct dentry *this_parent = root; 2219 struct dentry *this_parent = root;
@@ -2227,6 +2271,7 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name)
2227 } 2271 }
2228 return ino; 2272 return ino;
2229} 2273}
2274EXPORT_SYMBOL(find_inode_number);
2230 2275
2231static __initdata unsigned long dhash_entries; 2276static __initdata unsigned long dhash_entries;
2232static int __init set_dhash_entries(char *str) 2277static int __init set_dhash_entries(char *str)
@@ -2296,6 +2341,7 @@ static void __init dcache_init(void)
2296 2341
2297/* SLAB cache for __getname() consumers */ 2342/* SLAB cache for __getname() consumers */
2298struct kmem_cache *names_cachep __read_mostly; 2343struct kmem_cache *names_cachep __read_mostly;
2344EXPORT_SYMBOL(names_cachep);
2299 2345
2300EXPORT_SYMBOL(d_genocide); 2346EXPORT_SYMBOL(d_genocide);
2301 2347
@@ -2325,26 +2371,3 @@ void __init vfs_caches_init(unsigned long mempages)
2325 bdev_cache_init(); 2371 bdev_cache_init();
2326 chrdev_init(); 2372 chrdev_init();
2327} 2373}
2328
2329EXPORT_SYMBOL(d_alloc);
2330EXPORT_SYMBOL(d_alloc_root);
2331EXPORT_SYMBOL(d_delete);
2332EXPORT_SYMBOL(d_find_alias);
2333EXPORT_SYMBOL(d_instantiate);
2334EXPORT_SYMBOL(d_invalidate);
2335EXPORT_SYMBOL(d_lookup);
2336EXPORT_SYMBOL(d_move);
2337EXPORT_SYMBOL_GPL(d_materialise_unique);
2338EXPORT_SYMBOL(d_path);
2339EXPORT_SYMBOL(d_prune_aliases);
2340EXPORT_SYMBOL(d_rehash);
2341EXPORT_SYMBOL(d_splice_alias);
2342EXPORT_SYMBOL(d_add_ci);
2343EXPORT_SYMBOL(d_validate);
2344EXPORT_SYMBOL(dget_locked);
2345EXPORT_SYMBOL(dput);
2346EXPORT_SYMBOL(find_inode_number);
2347EXPORT_SYMBOL(have_submounts);
2348EXPORT_SYMBOL(names_cachep);
2349EXPORT_SYMBOL(shrink_dcache_parent);
2350EXPORT_SYMBOL(shrink_dcache_sb);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index d22438ef7674..30a87b3dbcac 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,12 +27,15 @@
27#include <linux/fsnotify.h> 27#include <linux/fsnotify.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/magic.h> 29#include <linux/magic.h>
30#include <linux/slab.h>
30 31
31static struct vfsmount *debugfs_mount; 32static struct vfsmount *debugfs_mount;
32static int debugfs_mount_count; 33static int debugfs_mount_count;
33static bool debugfs_registered; 34static bool debugfs_registered;
34 35
35static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) 36static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev,
37 void *data, const struct file_operations *fops)
38
36{ 39{
37 struct inode *inode = new_inode(sb); 40 struct inode *inode = new_inode(sb);
38 41
@@ -44,14 +47,18 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
44 init_special_inode(inode, mode, dev); 47 init_special_inode(inode, mode, dev);
45 break; 48 break;
46 case S_IFREG: 49 case S_IFREG:
47 inode->i_fop = &debugfs_file_operations; 50 inode->i_fop = fops ? fops : &debugfs_file_operations;
51 inode->i_private = data;
48 break; 52 break;
49 case S_IFLNK: 53 case S_IFLNK:
50 inode->i_op = &debugfs_link_operations; 54 inode->i_op = &debugfs_link_operations;
55 inode->i_fop = fops;
56 inode->i_private = data;
51 break; 57 break;
52 case S_IFDIR: 58 case S_IFDIR:
53 inode->i_op = &simple_dir_inode_operations; 59 inode->i_op = &simple_dir_inode_operations;
54 inode->i_fop = &simple_dir_operations; 60 inode->i_fop = fops ? fops : &simple_dir_operations;
61 inode->i_private = data;
55 62
56 /* directory inodes start off with i_nlink == 2 63 /* directory inodes start off with i_nlink == 2
57 * (for "." entry) */ 64 * (for "." entry) */
@@ -64,7 +71,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
64 71
65/* SMP-safe */ 72/* SMP-safe */
66static int debugfs_mknod(struct inode *dir, struct dentry *dentry, 73static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
67 int mode, dev_t dev) 74 int mode, dev_t dev, void *data,
75 const struct file_operations *fops)
68{ 76{
69 struct inode *inode; 77 struct inode *inode;
70 int error = -EPERM; 78 int error = -EPERM;
@@ -72,7 +80,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
72 if (dentry->d_inode) 80 if (dentry->d_inode)
73 return -EEXIST; 81 return -EEXIST;
74 82
75 inode = debugfs_get_inode(dir->i_sb, mode, dev); 83 inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops);
76 if (inode) { 84 if (inode) {
77 d_instantiate(dentry, inode); 85 d_instantiate(dentry, inode);
78 dget(dentry); 86 dget(dentry);
@@ -81,12 +89,13 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
81 return error; 89 return error;
82} 90}
83 91
84static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 92static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
93 void *data, const struct file_operations *fops)
85{ 94{
86 int res; 95 int res;
87 96
88 mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; 97 mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
89 res = debugfs_mknod(dir, dentry, mode, 0); 98 res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
90 if (!res) { 99 if (!res) {
91 inc_nlink(dir); 100 inc_nlink(dir);
92 fsnotify_mkdir(dir, dentry); 101 fsnotify_mkdir(dir, dentry);
@@ -94,18 +103,20 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
94 return res; 103 return res;
95} 104}
96 105
97static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode) 106static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode,
107 void *data, const struct file_operations *fops)
98{ 108{
99 mode = (mode & S_IALLUGO) | S_IFLNK; 109 mode = (mode & S_IALLUGO) | S_IFLNK;
100 return debugfs_mknod(dir, dentry, mode, 0); 110 return debugfs_mknod(dir, dentry, mode, 0, data, fops);
101} 111}
102 112
103static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode) 113static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode,
114 void *data, const struct file_operations *fops)
104{ 115{
105 int res; 116 int res;
106 117
107 mode = (mode & S_IALLUGO) | S_IFREG; 118 mode = (mode & S_IALLUGO) | S_IFREG;
108 res = debugfs_mknod(dir, dentry, mode, 0); 119 res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
109 if (!res) 120 if (!res)
110 fsnotify_create(dir, dentry); 121 fsnotify_create(dir, dentry);
111 return res; 122 return res;
@@ -139,7 +150,9 @@ static struct file_system_type debug_fs_type = {
139 150
140static int debugfs_create_by_name(const char *name, mode_t mode, 151static int debugfs_create_by_name(const char *name, mode_t mode,
141 struct dentry *parent, 152 struct dentry *parent,
142 struct dentry **dentry) 153 struct dentry **dentry,
154 void *data,
155 const struct file_operations *fops)
143{ 156{
144 int error = 0; 157 int error = 0;
145 158
@@ -148,15 +161,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
148 * block. A pointer to that is in the struct vfsmount that we 161 * block. A pointer to that is in the struct vfsmount that we
149 * have around. 162 * have around.
150 */ 163 */
151 if (!parent) { 164 if (!parent)
152 if (debugfs_mount && debugfs_mount->mnt_sb) { 165 parent = debugfs_mount->mnt_sb->s_root;
153 parent = debugfs_mount->mnt_sb->s_root;
154 }
155 }
156 if (!parent) {
157 pr_debug("debugfs: Ah! can not find a parent!\n");
158 return -EFAULT;
159 }
160 166
161 *dentry = NULL; 167 *dentry = NULL;
162 mutex_lock(&parent->d_inode->i_mutex); 168 mutex_lock(&parent->d_inode->i_mutex);
@@ -164,13 +170,16 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
164 if (!IS_ERR(*dentry)) { 170 if (!IS_ERR(*dentry)) {
165 switch (mode & S_IFMT) { 171 switch (mode & S_IFMT) {
166 case S_IFDIR: 172 case S_IFDIR:
167 error = debugfs_mkdir(parent->d_inode, *dentry, mode); 173 error = debugfs_mkdir(parent->d_inode, *dentry, mode,
174 data, fops);
168 break; 175 break;
169 case S_IFLNK: 176 case S_IFLNK:
170 error = debugfs_link(parent->d_inode, *dentry, mode); 177 error = debugfs_link(parent->d_inode, *dentry, mode,
178 data, fops);
171 break; 179 break;
172 default: 180 default:
173 error = debugfs_create(parent->d_inode, *dentry, mode); 181 error = debugfs_create(parent->d_inode, *dentry, mode,
182 data, fops);
174 break; 183 break;
175 } 184 }
176 dput(*dentry); 185 dput(*dentry);
@@ -184,7 +193,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
184/** 193/**
185 * debugfs_create_file - create a file in the debugfs filesystem 194 * debugfs_create_file - create a file in the debugfs filesystem
186 * @name: a pointer to a string containing the name of the file to create. 195 * @name: a pointer to a string containing the name of the file to create.
187 * @mode: the permission that the file should have 196 * @mode: the permission that the file should have.
188 * @parent: a pointer to the parent dentry for this file. This should be a 197 * @parent: a pointer to the parent dentry for this file. This should be a
189 * directory dentry if set. If this paramater is NULL, then the 198 * directory dentry if set. If this paramater is NULL, then the
190 * file will be created in the root of the debugfs filesystem. 199 * file will be created in the root of the debugfs filesystem.
@@ -195,8 +204,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
195 * this file. 204 * this file.
196 * 205 *
197 * This is the basic "create a file" function for debugfs. It allows for a 206 * This is the basic "create a file" function for debugfs. It allows for a
198 * wide range of flexibility in createing a file, or a directory (if you 207 * wide range of flexibility in creating a file, or a directory (if you want
199 * want to create a directory, the debugfs_create_dir() function is 208 * to create a directory, the debugfs_create_dir() function is
200 * recommended to be used instead.) 209 * recommended to be used instead.)
201 * 210 *
202 * This function will return a pointer to a dentry if it succeeds. This 211 * This function will return a pointer to a dentry if it succeeds. This
@@ -221,19 +230,13 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
221 if (error) 230 if (error)
222 goto exit; 231 goto exit;
223 232
224 error = debugfs_create_by_name(name, mode, parent, &dentry); 233 error = debugfs_create_by_name(name, mode, parent, &dentry,
234 data, fops);
225 if (error) { 235 if (error) {
226 dentry = NULL; 236 dentry = NULL;
227 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 237 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
228 goto exit; 238 goto exit;
229 } 239 }
230
231 if (dentry->d_inode) {
232 if (data)
233 dentry->d_inode->i_private = data;
234 if (fops)
235 dentry->d_inode->i_fop = fops;
236 }
237exit: 240exit:
238 return dentry; 241 return dentry;
239} 242}
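These hunks plumb the caller's data and fops all the way down into debugfs_get_inode(), so i_private and i_fop are already set when d_instantiate() publishes the dentry. The block deleted at the end of debugfs_create_file() was the old approach: patch the inode after creation, which left a window in which a concurrent open() saw the default debugfs_file_operations and a NULL i_private. The defensive NULL-parent checks could also go, since this path only runs after simple_pin_fs() has succeeded and the mount root is therefore guaranteed. A minimal sketch of a caller, with example_state and example_fops as hypothetical placeholders:

	#include <linux/debugfs.h>
	#include <linux/fs.h>

	static u32 example_state;				/* hypothetical */
	static const struct file_operations example_fops;	/* hypothetical */

	static struct dentry *example_setup(void)
	{
		/* The file is fully wired up (fops + private data) the
		 * moment this returns; no post-creation fixup needed. */
		return debugfs_create_file("state", S_IRUGO, NULL,
					   &example_state, &example_fops);
	}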
@@ -494,7 +497,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
494 } 497 }
495 d_move(old_dentry, dentry); 498 d_move(old_dentry, dentry);
496 fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, 499 fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
497 old_dentry->d_name.name, S_ISDIR(old_dentry->d_inode->i_mode), 500 S_ISDIR(old_dentry->d_inode->i_mode),
498 NULL, old_dentry); 501 NULL, old_dentry);
499 fsnotify_oldname_free(old_name); 502 fsnotify_oldname_free(old_name);
500 unlock_rename(new_dir, old_dir); 503 unlock_rename(new_dir, old_dir);
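The debugfs_rename() hunk looks like collateral from a slimmed-down fsnotify_move(): after d_move() the helper can read the new name from old_dentry itself, so passing old_dentry->d_name.name separately appears to have become redundant tree-wide.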
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d5f8c96964be..0120247b41c0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -15,6 +15,7 @@
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19#include <linux/tty.h> 20#include <linux/tty.h>
20#include <linux/mutex.h> 21#include <linux/mutex.h>
@@ -517,11 +518,23 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
517 518
518struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) 519struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
519{ 520{
521 struct dentry *dentry;
522 struct tty_struct *tty;
523
520 BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); 524 BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
521 525
526 /* Ensure dentry has not been deleted by devpts_pty_kill() */
527 dentry = d_find_alias(pts_inode);
528 if (!dentry)
529 return NULL;
530
531 tty = NULL;
522 if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) 532 if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
523 return (struct tty_struct *)pts_inode->i_private; 533 tty = (struct tty_struct *)pts_inode->i_private;
524 return NULL; 534
535 dput(dentry);
536
537 return tty;
525} 538}
526 539
527void devpts_pty_kill(struct tty_struct *tty) 540void devpts_pty_kill(struct tty_struct *tty)
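devpts_get_tty() used to trust pts_inode->i_private unconditionally; pinning a dentry with d_find_alias() first means a pty that devpts_pty_kill() has already torn down yields NULL rather than a stale tty pointer. A hedged sketch of the pattern, not the exact devpts code:

	#include <linux/dcache.h>
	#include <linux/fs.h>

	static void *get_private_if_live(struct inode *inode)
	{
		/* d_find_alias() returns a referenced dentry for the
		 * inode, or NULL once every alias has been deleted. */
		struct dentry *dentry = d_find_alias(inode);
		void *priv;

		if (!dentry)
			return NULL;	/* node already removed */
		priv = inode->i_private;
		dput(dentry);
		return priv;
	}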
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87dc01a..e82adc2debb7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
53 * 53 *
54 * If blkfactor is zero then the user's request was aligned to the filesystem's 54 * If blkfactor is zero then the user's request was aligned to the filesystem's
55 * blocksize. 55 * blocksize.
56 *
57 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
58 * This determines whether we need to do the fancy locking which prevents
59 * direct-IO from being able to read uninitialised disk blocks. If its zero
60 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
61 * not held for the entire direct write (taken briefly, initially, during a
62 * direct read though, but its never held for the duration of a direct-IO).
63 */ 56 */
64 57
65struct dio { 58struct dio {
@@ -68,7 +61,7 @@ struct dio {
68 struct inode *inode; 61 struct inode *inode;
69 int rw; 62 int rw;
70 loff_t i_size; /* i_size when submitted */ 63 loff_t i_size; /* i_size when submitted */
71 int lock_type; /* doesn't change */ 64 int flags; /* doesn't change */
72 unsigned blkbits; /* doesn't change */ 65 unsigned blkbits; /* doesn't change */
73 unsigned blkfactor; /* When we're using an alignment which 66 unsigned blkfactor; /* When we're using an alignment which
74 is finer than the filesystem's soft 67 is finer than the filesystem's soft
@@ -104,6 +97,18 @@ struct dio {
104 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
105 sector_t cur_page_block; /* Where it starts */ 98 sector_t cur_page_block; /* Where it starts */
106 99
100 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */
102 unsigned long refcount; /* direct_io_worker() and bios */
103 struct bio *bio_list; /* singly linked via bi_private */
104 struct task_struct *waiter; /* waiting task (NULL if none) */
105
106 /* AIO related stuff */
107 struct kiocb *iocb; /* kiocb */
108 int is_async; /* is IO async ? */
109 int io_error; /* IO error in completion path */
110 ssize_t result; /* IO result */
111
107 /* 112 /*
108 * Page fetching state. These variables belong to dio_refill_pages(). 113 * Page fetching state. These variables belong to dio_refill_pages().
109 */ 114 */
@@ -115,22 +120,16 @@ struct dio {
115 * Page queue. These variables belong to dio_refill_pages() and 120 * Page queue. These variables belong to dio_refill_pages() and
116 * dio_get_page(). 121 * dio_get_page().
117 */ 122 */
118 struct page *pages[DIO_PAGES]; /* page buffer */
119 unsigned head; /* next page to process */ 123 unsigned head; /* next page to process */
120 unsigned tail; /* last valid page + 1 */ 124 unsigned tail; /* last valid page + 1 */
121 int page_errors; /* errno from get_user_pages() */ 125 int page_errors; /* errno from get_user_pages() */
122 126
123 /* BIO completion state */ 127 /*
124 spinlock_t bio_lock; /* protects BIO fields below */ 128 * pages[] (and any fields placed after it) are not zeroed out at
125 unsigned long refcount; /* direct_io_worker() and bios */ 129 * allocation time. Don't add new fields after pages[] unless you
126 struct bio *bio_list; /* singly linked via bi_private */ 130 * wish that they not be zeroed.
127 struct task_struct *waiter; /* waiting task (NULL if none) */ 131 */
128 132 struct page *pages[DIO_PAGES]; /* page buffer */
129 /* AIO related stuff */
130 struct kiocb *iocb; /* kiocb */
131 int is_async; /* is IO async ? */
132 int io_error; /* IO error in completion path */
133 ssize_t result; /* IO result */
134}; 133};
135 134
136/* 135/*
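The field reshuffle is not cosmetic: every field that must start life zeroed now sits in front of pages[], so the allocation site further down can switch from kzalloc() to kmalloc() plus a memset() bounded by offsetof(struct dio, pages), skipping the large page-pointer array whose zeroing, per the new comment, cost about half a percent in a database benchmark. A sketch of the idiom on a hypothetical struct:

	#include <linux/slab.h>
	#include <linux/stddef.h>
	#include <linux/string.h>

	struct example {			/* hypothetical layout */
		int state;			/* must start zeroed */
		void *cookie;			/* must start zeroed */
		struct page *pages[64];		/* deliberately last */
	};

	static struct example *example_alloc(void)
	{
		struct example *e = kmalloc(sizeof(*e), GFP_KERNEL);

		if (e)	/* clear only the fields before pages[] */
			memset(e, 0, offsetof(struct example, pages));
		return e;
	}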
@@ -240,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
240 if (dio->end_io && dio->result) 239 if (dio->end_io && dio->result)
241 dio->end_io(dio->iocb, offset, transferred, 240 dio->end_io(dio->iocb, offset, transferred,
242 dio->map_bh.b_private); 241 dio->map_bh.b_private);
243 if (dio->lock_type == DIO_LOCKING) 242
243 if (dio->flags & DIO_LOCKING)
244 /* lockdep: non-owner release */ 244 /* lockdep: non-owner release */
245 up_read_non_owner(&dio->inode->i_alloc_sem); 245 up_read_non_owner(&dio->inode->i_alloc_sem);
246 246
@@ -515,21 +515,24 @@ static int get_more_blocks(struct dio *dio)
515 map_bh->b_state = 0; 515 map_bh->b_state = 0;
516 map_bh->b_size = fs_count << dio->inode->i_blkbits; 516 map_bh->b_size = fs_count << dio->inode->i_blkbits;
517 517
518 /*
519 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
520 * forbid block creations: only overwrites are permitted.
521 * We will return early to the caller once we see an
522 * unmapped buffer head returned, and the caller will fall
523 * back to buffered I/O.
524 *
525 * Otherwise the decision is left to the get_blocks method,
526 * which may decide to handle it or also return an unmapped
527 * buffer head.
528 */
518 create = dio->rw & WRITE; 529 create = dio->rw & WRITE;
519 if (dio->lock_type == DIO_LOCKING) { 530 if (dio->flags & DIO_SKIP_HOLES) {
520 if (dio->block_in_file < (i_size_read(dio->inode) >> 531 if (dio->block_in_file < (i_size_read(dio->inode) >>
521 dio->blkbits)) 532 dio->blkbits))
522 create = 0; 533 create = 0;
523 } else if (dio->lock_type == DIO_NO_LOCKING) {
524 create = 0;
525 } 534 }
526 535
527 /*
528 * For writes inside i_size we forbid block creations: only
529 * overwrites are permitted. We fall back to buffered writes
530 * at a higher level for inside-i_size block-instantiating
531 * writes.
532 */
533 ret = (*dio->get_block)(dio->inode, fs_startblk, 536 ret = (*dio->get_block)(dio->inode, fs_startblk,
534 map_bh, create); 537 map_bh, create);
535 } 538 }
@@ -1028,9 +1031,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1028 if (dio->bio) 1031 if (dio->bio)
1029 dio_bio_submit(dio); 1032 dio_bio_submit(dio);
1030 1033
1031 /* All IO is now issued, send it on its way */
1032 blk_run_address_space(inode->i_mapping);
1033
1034 /* 1034 /*
1035 * It is possible that, we return short IO due to end of file. 1035 * It is possible that, we return short IO due to end of file.
1036 * In that case, we need to release all the pages we got hold on. 1036 * In that case, we need to release all the pages we got hold on.
@@ -1042,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1042 * we can let i_mutex go now that its achieved its purpose 1042 * we can let i_mutex go now that its achieved its purpose
1043 * of protecting us from looking up uninitialized blocks. 1043 * of protecting us from looking up uninitialized blocks.
1044 */ 1044 */
1045 if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) 1045 if (rw == READ && (dio->flags & DIO_LOCKING))
1046 mutex_unlock(&dio->inode->i_mutex); 1046 mutex_unlock(&dio->inode->i_mutex);
1047 1047
1048 /* 1048 /*
@@ -1057,8 +1057,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1057 ((rw & READ) || (dio->result == dio->size))) 1057 ((rw & READ) || (dio->result == dio->size)))
1058 ret = -EIOCBQUEUED; 1058 ret = -EIOCBQUEUED;
1059 1059
1060 if (ret != -EIOCBQUEUED) 1060 if (ret != -EIOCBQUEUED) {
1061 /* All IO is now issued, send it on its way */
1062 blk_run_address_space(inode->i_mapping);
1061 dio_await_completion(dio); 1063 dio_await_completion(dio);
1064 }
1062 1065
1063 /* 1066 /*
1064 * Sync will always be dropping the final ref and completing the 1067 * Sync will always be dropping the final ref and completing the
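Relocating blk_run_address_space() under the ret != -EIOCBQUEUED branch means the queue is only kicked when this thread is about to sleep in dio_await_completion(); a request that went fully async returns immediately and leaves unplugging to the block layer's normal plug handling. The switch from WRITE_ODIRECT to WRITE_ODIRECT_PLUG earlier in the file presumably marks the same policy change: O_DIRECT writes now stay plugged unless the submitter is actually waiting on them.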
@@ -1086,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1086 1089
1087/* 1090/*
1088 * This is a library function for use by filesystem drivers. 1091 * This is a library function for use by filesystem drivers.
1089 * The locking rules are governed by the dio_lock_type parameter.
1090 * 1092 *
1091 * DIO_NO_LOCKING (no locking, for raw block device access) 1093 * The locking rules are governed by the flags parameter:
1092 * For writes, i_mutex is not held on entry; it is never taken. 1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1093 * 1102 *
1094 * DIO_LOCKING (simple locking for regular files) 1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1095 * For writes we are called under i_mutex and return with i_mutex held, even 1104 * internal locking but rather rely on the filesystem to synchronize
1096 * though it is internally dropped. 1105 * direct I/O reads/writes versus each other and truncate.
1097 * For reads, i_mutex is not held on entry, but it is taken and dropped before 1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1098 * returning. 1107 * entry and are never taken.
1099 *
1100 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
1101 * uninitialised data, allowing parallel direct readers and writers)
1102 * For writes we are called without i_mutex, return without it, never touch it.
1103 * For reads we are called under i_mutex and return with i_mutex held, even
1104 * though it may be internally dropped.
1105 *
1106 * Additional i_alloc_sem locking requirements described inline below.
1107 */ 1108 */
1108ssize_t 1109ssize_t
1109__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1110 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1111 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1111 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1112 int dio_lock_type) 1113 int flags)
1113{ 1114{
1114 int seg; 1115 int seg;
1115 size_t size; 1116 size_t size;
@@ -1120,11 +1121,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1120 ssize_t retval = -EINVAL; 1121 ssize_t retval = -EINVAL;
1121 loff_t end = offset; 1122 loff_t end = offset;
1122 struct dio *dio; 1123 struct dio *dio;
1123 int release_i_mutex = 0;
1124 int acquire_i_mutex = 0;
1125 1124
1126 if (rw & WRITE) 1125 if (rw & WRITE)
1127 rw = WRITE_ODIRECT; 1126 rw = WRITE_ODIRECT_PLUG;
1128 1127
1129 if (bdev) 1128 if (bdev)
1130 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1129 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
@@ -1151,48 +1150,41 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1151 } 1150 }
1152 } 1151 }
1153 1152
1154 dio = kzalloc(sizeof(*dio), GFP_KERNEL); 1153 dio = kmalloc(sizeof(*dio), GFP_KERNEL);
1155 retval = -ENOMEM; 1154 retval = -ENOMEM;
1156 if (!dio) 1155 if (!dio)
1157 goto out; 1156 goto out;
1158
1159 /* 1157 /*
1160 * For block device access DIO_NO_LOCKING is used, 1158 * Believe it or not, zeroing out the page array caused a .5%
1161 * neither readers nor writers do any locking at all 1159 * performance regression in a database benchmark. So, we take
1162 * For regular files using DIO_LOCKING, 1160 * care to only zero out what's needed.
1163 * readers need to grab i_mutex and i_alloc_sem
1164 * writers need to grab i_alloc_sem only (i_mutex is already held)
1165 * For regular files using DIO_OWN_LOCKING,
1166 * neither readers nor writers take any locks here
1167 */ 1161 */
1168 dio->lock_type = dio_lock_type; 1162 memset(dio, 0, offsetof(struct dio, pages));
1169 if (dio_lock_type != DIO_NO_LOCKING) { 1163
1164 dio->flags = flags;
1165 if (dio->flags & DIO_LOCKING) {
1170 /* watch out for a 0 len io from a tricksy fs */ 1166 /* watch out for a 0 len io from a tricksy fs */
1171 if (rw == READ && end > offset) { 1167 if (rw == READ && end > offset) {
1172 struct address_space *mapping; 1168 struct address_space *mapping =
1169 iocb->ki_filp->f_mapping;
1173 1170
1174 mapping = iocb->ki_filp->f_mapping; 1171 /* will be released by direct_io_worker */
1175 if (dio_lock_type != DIO_OWN_LOCKING) { 1172 mutex_lock(&inode->i_mutex);
1176 mutex_lock(&inode->i_mutex);
1177 release_i_mutex = 1;
1178 }
1179 1173
1180 retval = filemap_write_and_wait_range(mapping, offset, 1174 retval = filemap_write_and_wait_range(mapping, offset,
1181 end - 1); 1175 end - 1);
1182 if (retval) { 1176 if (retval) {
1177 mutex_unlock(&inode->i_mutex);
1183 kfree(dio); 1178 kfree(dio);
1184 goto out; 1179 goto out;
1185 } 1180 }
1186
1187 if (dio_lock_type == DIO_OWN_LOCKING) {
1188 mutex_unlock(&inode->i_mutex);
1189 acquire_i_mutex = 1;
1190 }
1191 } 1181 }
1192 1182
1193 if (dio_lock_type == DIO_LOCKING) 1183 /*
1194 /* lockdep: not the owner will release it */ 1184 * Will be released at I/O completion, possibly in a
1195 down_read_non_owner(&inode->i_alloc_sem); 1185 * different thread.
1186 */
1187 down_read_non_owner(&inode->i_alloc_sem);
1196 } 1188 }
1197 1189
1198 /* 1190 /*
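With lock_type gone, callers compose behaviour from independent flags: DIO_LOCKING buys the i_mutex/i_alloc_sem choreography for filesystems that want it, DIO_SKIP_HOLES (tested in get_more_blocks() above) forbids block instantiation inside i_size so such writes fall back to buffered I/O, and raw block devices simply pass neither. A hedged sketch of a filesystem ->direct_IO method under that scheme; my_get_block is a stand-in for the filesystem's real mapping callback:

	#include <linux/fs.h>

	static int my_get_block(struct inode *inode, sector_t iblock,
				struct buffer_head *bh_result, int create);

	static ssize_t my_direct_IO(int rw, struct kiocb *iocb,
				    const struct iovec *iov, loff_t offset,
				    unsigned long nr_segs)
	{
		struct inode *inode = iocb->ki_filp->f_mapping->host;

		/* Dumb-locking filesystem: let direct-io.c take i_mutex
		 * and i_alloc_sem, and refuse to fill holes on writes. */
		return __blockdev_direct_IO(rw, iocb, inode,
					    inode->i_sb->s_bdev, iov, offset,
					    nr_segs, my_get_block, NULL,
					    DIO_LOCKING | DIO_SKIP_HOLES);
	}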
@@ -1210,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1210 /* 1202 /*
1211 * In case of error extending write may have instantiated a few 1203 * In case of error extending write may have instantiated a few
1212 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1204 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1213 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by 1205 *
1214 * it's own meaner. 1206 * NOTE: filesystems with their own locking have to handle this
1207 * on their own.
1215 */ 1208 */
1216 if (unlikely(retval < 0 && (rw & WRITE))) { 1209 if (flags & DIO_LOCKING) {
1217 loff_t isize = i_size_read(inode); 1210 if (unlikely((rw & WRITE) && retval < 0)) {
1218 1211 loff_t isize = i_size_read(inode);
1219 if (end > isize && dio_lock_type == DIO_LOCKING) 1212 if (end > isize)
1220 vmtruncate(inode, isize); 1213 vmtruncate(inode, isize);
1214 }
1221 } 1215 }
1222 1216
1223 if (rw == READ && dio_lock_type == DIO_LOCKING)
1224 release_i_mutex = 0;
1225
1226out: 1217out:
1227 if (release_i_mutex)
1228 mutex_unlock(&inode->i_mutex);
1229 else if (acquire_i_mutex)
1230 mutex_lock(&inode->i_mutex);
1231 return retval; 1218 return retval;
1232} 1219}
1233EXPORT_SYMBOL(__blockdev_direct_IO); 1220EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index dc2ad6008b2d..4314f0d48d85 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
33 spin_unlock(&ast_queue_lock); 33 spin_unlock(&ast_queue_lock);
34} 34}
35 35
36void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode) 36void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode)
37{ 37{
38 if (lkb->lkb_flags & DLM_IFL_USER) { 38 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type, bastmode); 39 dlm_user_add_ast(lkb, type, mode);
40 return; 40 return;
41 } 41 }
42 42
@@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { 44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
45 kref_get(&lkb->lkb_ref); 45 kref_get(&lkb->lkb_ref);
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 46 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 lkb->lkb_ast_first = type;
47 } 48 }
49
50 /* sanity check, this should not happen */
51
52 if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
53 log_print("repeat cast %d castmode %d lock %x %s",
54 mode, lkb->lkb_castmode,
55 lkb->lkb_id, lkb->lkb_resource->res_name);
56
48 lkb->lkb_ast_type |= type; 57 lkb->lkb_ast_type |= type;
49 if (bastmode) 58 if (type == AST_BAST)
50 lkb->lkb_bastmode = bastmode; 59 lkb->lkb_bastmode = mode;
60 else
61 lkb->lkb_castmode = mode;
51 spin_unlock(&ast_queue_lock); 62 spin_unlock(&ast_queue_lock);
52 63
53 set_bit(WAKE_ASTS, &astd_wakeflags); 64 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,9 +70,9 @@ static void process_asts(void)
59 struct dlm_ls *ls = NULL; 70 struct dlm_ls *ls = NULL;
60 struct dlm_rsb *r = NULL; 71 struct dlm_rsb *r = NULL;
61 struct dlm_lkb *lkb; 72 struct dlm_lkb *lkb;
62 void (*cast) (void *astparam); 73 void (*castfn) (void *astparam);
63 void (*bast) (void *astparam, int mode); 74 void (*bastfn) (void *astparam, int mode);
64 int type = 0, bastmode; 75 int type, first, bastmode, castmode, do_bast, do_cast, last_castmode;
65 76
66repeat: 77repeat:
67 spin_lock(&ast_queue_lock); 78 spin_lock(&ast_queue_lock);
@@ -75,17 +86,48 @@ repeat:
75 list_del(&lkb->lkb_astqueue); 86 list_del(&lkb->lkb_astqueue);
76 type = lkb->lkb_ast_type; 87 type = lkb->lkb_ast_type;
77 lkb->lkb_ast_type = 0; 88 lkb->lkb_ast_type = 0;
89 first = lkb->lkb_ast_first;
90 lkb->lkb_ast_first = 0;
78 bastmode = lkb->lkb_bastmode; 91 bastmode = lkb->lkb_bastmode;
79 92 castmode = lkb->lkb_castmode;
93 castfn = lkb->lkb_astfn;
94 bastfn = lkb->lkb_bastfn;
80 spin_unlock(&ast_queue_lock); 95 spin_unlock(&ast_queue_lock);
81 cast = lkb->lkb_astfn;
82 bast = lkb->lkb_bastfn;
83
84 if ((type & AST_COMP) && cast)
85 cast(lkb->lkb_astparam);
86 96
87 if ((type & AST_BAST) && bast) 97 do_cast = (type & AST_COMP) && castfn;
88 bast(lkb->lkb_astparam, bastmode); 98 do_bast = (type & AST_BAST) && bastfn;
99
100 /* Skip a bast if its blocking mode is compatible with the
101 granted mode of the preceding cast. */
102
103 if (do_bast) {
104 if (first == AST_COMP)
105 last_castmode = castmode;
106 else
107 last_castmode = lkb->lkb_castmode_done;
108 if (dlm_modes_compat(bastmode, last_castmode))
109 do_bast = 0;
110 }
111
112 if (first == AST_COMP) {
113 if (do_cast)
114 castfn(lkb->lkb_astparam);
115 if (do_bast)
116 bastfn(lkb->lkb_astparam, bastmode);
117 } else if (first == AST_BAST) {
118 if (do_bast)
119 bastfn(lkb->lkb_astparam, bastmode);
120 if (do_cast)
121 castfn(lkb->lkb_astparam);
122 } else {
123 log_error(ls, "bad ast_first %d ast_type %d",
124 first, type);
125 }
126
127 if (do_cast)
128 lkb->lkb_castmode_done = castmode;
129 if (do_bast)
130 lkb->lkb_bastmode_done = bastmode;
89 131
90 /* this removes the reference added by dlm_add_ast 132 /* this removes the reference added by dlm_add_ast
91 and may result in the lkb being freed */ 133 and may result in the lkb being freed */
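Two behavioural fixes hide in this rework. First, lkb_ast_first records whether the cast or the bast was queued first, and process_asts() now delivers them in that order instead of always cast-first. Second, a bast can be suppressed when its blocking mode is compatible with the most recently granted mode, because the requester it would nag is not actually blocked: for example, a queued bast for a PR request can be dropped if the cast delivered just before it granted this lock CR, since PR and CR do not conflict. The lkb_castmode_done/lkb_bastmode_done fields that carry the "last delivered" modes across queueings are added to struct dlm_lkb in the dlm_internal.h hunk below.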
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 1b5fc5f428fd..bcb1aaba519d 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode); 16void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
17void dlm_del_ast(struct dlm_lkb *lkb); 17void dlm_del_ast(struct dlm_lkb *lkb);
18 18
19void dlm_astd_wake(void); 19void dlm_astd_wake(void);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index fd9859f92fad..b54bca03d92f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,7 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/configfs.h> 16#include <linux/configfs.h>
17#include <linux/slab.h>
17#include <linux/in.h> 18#include <linux/in.h>
18#include <linux/in6.h> 19#include <linux/in6.h>
19#include <net/ipv6.h> 20#include <net/ipv6.h>
@@ -410,10 +411,10 @@ static struct config_group *make_cluster(struct config_group *g,
410 struct dlm_comms *cms = NULL; 411 struct dlm_comms *cms = NULL;
411 void *gps = NULL; 412 void *gps = NULL;
412 413
413 cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL); 414 cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
414 gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); 415 gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
415 sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL); 416 sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
416 cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL); 417 cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
417 418
418 if (!cl || !gps || !sps || !cms) 419 if (!cl || !gps || !sps || !cms)
419 goto fail; 420 goto fail;
@@ -482,9 +483,9 @@ static struct config_group *make_space(struct config_group *g, const char *name)
482 struct dlm_nodes *nds = NULL; 483 struct dlm_nodes *nds = NULL;
483 void *gps = NULL; 484 void *gps = NULL;
484 485
485 sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL); 486 sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
486 gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL); 487 gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
487 nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL); 488 nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
488 489
489 if (!sp || !gps || !nds) 490 if (!sp || !gps || !nds)
490 goto fail; 491 goto fail;
@@ -536,7 +537,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
536{ 537{
537 struct dlm_comm *cm; 538 struct dlm_comm *cm;
538 539
539 cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL); 540 cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS);
540 if (!cm) 541 if (!cm)
541 return ERR_PTR(-ENOMEM); 542 return ERR_PTR(-ENOMEM);
542 543
@@ -569,7 +570,7 @@ static struct config_item *make_node(struct config_group *g, const char *name)
569 struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); 570 struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
570 struct dlm_node *nd; 571 struct dlm_node *nd;
571 572
572 nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL); 573 nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS);
573 if (!nd) 574 if (!nd)
574 return ERR_PTR(-ENOMEM); 575 return ERR_PTR(-ENOMEM);
575 576
@@ -705,7 +706,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
705 if (cm->addr_count >= DLM_MAX_ADDR_COUNT) 706 if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
706 return -ENOSPC; 707 return -ENOSPC;
707 708
708 addr = kzalloc(sizeof(*addr), GFP_KERNEL); 709 addr = kzalloc(sizeof(*addr), GFP_NOFS);
709 if (!addr) 710 if (!addr)
710 return -ENOMEM; 711 return -ENOMEM;
711 712
@@ -868,7 +869,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
868 869
869 ids_count = sp->members_count; 870 ids_count = sp->members_count;
870 871
871 ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL); 872 ids = kcalloc(ids_count, sizeof(int), GFP_NOFS);
872 if (!ids) { 873 if (!ids) {
873 rv = -ENOMEM; 874 rv = -ENOMEM;
874 goto out; 875 goto out;
@@ -886,7 +887,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
886 if (!new_count) 887 if (!new_count)
887 goto out_ids; 888 goto out_ids;
888 889
889 new = kcalloc(new_count, sizeof(int), GFP_KERNEL); 890 new = kcalloc(new_count, sizeof(int), GFP_NOFS);
890 if (!new) { 891 if (!new) {
891 kfree(ids); 892 kfree(ids);
892 rv = -ENOMEM; 893 rv = -ENOMEM;
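From here on, the dlm hunks share one theme: every allocation on a lock-processing path becomes GFP_NOFS, and the per-lockspace ls_allocation field (which chose GFP_NOFS only when DLM_LSFL_FS was set; see the dlm_internal.h and lockspace.c hunks below) is deleted outright. The point is that an allocation made while cluster locks are held must not let memory reclaim recurse into a filesystem that may itself be waiting on one of those locks, so the code stops distinguishing and is always safe.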
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1c8bb8c3a82e..c6cf25158746 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/debugfs.h> 17#include <linux/debugfs.h>
18#include <linux/slab.h>
18 19
19#include "dlm_internal.h" 20#include "dlm_internal.h"
20#include "lock.h" 21#include "lock.h"
@@ -256,7 +257,7 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
256 lkb->lkb_status, 257 lkb->lkb_status,
257 lkb->lkb_grmode, 258 lkb->lkb_grmode,
258 lkb->lkb_rqmode, 259 lkb->lkb_rqmode,
259 lkb->lkb_highbast, 260 lkb->lkb_bastmode,
260 rsb_lookup, 261 rsb_lookup,
261 lkb->lkb_wait_type, 262 lkb->lkb_wait_type,
262 lkb->lkb_lvbseq, 263 lkb->lkb_lvbseq,
@@ -404,7 +405,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
404 if (bucket >= ls->ls_rsbtbl_size) 405 if (bucket >= ls->ls_rsbtbl_size)
405 return NULL; 406 return NULL;
406 407
407 ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL); 408 ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS);
408 if (!ri) 409 if (!ri)
409 return NULL; 410 return NULL;
410 if (n == 0) 411 if (n == 0)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index c4dfa1dcc86f..7b84c1dbc82e 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,8 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
49 spin_unlock(&ls->ls_recover_list_lock); 49 spin_unlock(&ls->ls_recover_list_lock);
50 50
51 if (!found) 51 if (!found)
52 de = kzalloc(sizeof(struct dlm_direntry) + len, 52 de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS);
53 ls->ls_allocation);
54 return de; 53 return de;
55} 54}
56 55
@@ -212,7 +211,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
212 211
213 dlm_dir_clear(ls); 212 dlm_dir_clear(ls);
214 213
215 last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation); 214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS);
216 if (!last_name) 215 if (!last_name)
217 goto out; 216 goto out;
218 217
@@ -323,7 +322,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
323 if (namelen > DLM_RESNAME_MAXLEN) 322 if (namelen > DLM_RESNAME_MAXLEN)
324 return -EINVAL; 323 return -EINVAL;
325 324
326 de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation); 325 de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS);
327 if (!de) 326 if (!de)
328 return -ENOMEM; 327 return -ENOMEM;
329 328
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d01ca0a711db..f632b58cd222 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -232,11 +232,17 @@ struct dlm_lkb {
232 int8_t lkb_status; /* granted, waiting, convert */ 232 int8_t lkb_status; /* granted, waiting, convert */
233 int8_t lkb_rqmode; /* requested lock mode */ 233 int8_t lkb_rqmode; /* requested lock mode */
234 int8_t lkb_grmode; /* granted lock mode */ 234 int8_t lkb_grmode; /* granted lock mode */
235 int8_t lkb_bastmode; /* requested mode */
236 int8_t lkb_highbast; /* highest mode bast sent for */ 235 int8_t lkb_highbast; /* highest mode bast sent for */
236
237 int8_t lkb_wait_type; /* type of reply waiting for */ 237 int8_t lkb_wait_type; /* type of reply waiting for */
238 int8_t lkb_wait_count; 238 int8_t lkb_wait_count;
239 int8_t lkb_ast_type; /* type of ast queued for */ 239 int8_t lkb_ast_type; /* type of ast queued for */
240 int8_t lkb_ast_first; /* type of first ast queued */
241
242 int8_t lkb_bastmode; /* req mode of queued bast */
243 int8_t lkb_castmode; /* gr mode of queued cast */
244 int8_t lkb_bastmode_done; /* last delivered bastmode */
245 int8_t lkb_castmode_done; /* last delivered castmode */
240 246
241 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 247 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
242 struct list_head lkb_statequeue; /* rsb g/c/w list */ 248 struct list_head lkb_statequeue; /* rsb g/c/w list */
@@ -473,7 +479,6 @@ struct dlm_ls {
473 int ls_low_nodeid; 479 int ls_low_nodeid;
474 int ls_total_weight; 480 int ls_total_weight;
475 int *ls_node_array; 481 int *ls_node_array;
476 gfp_t ls_allocation;
477 482
478 struct dlm_rsb ls_stub_rsb; /* for returning errors */ 483 struct dlm_rsb ls_stub_rsb; /* for returning errors */
479 struct dlm_lkb ls_stub_lkb; /* for returning errors */ 484 struct dlm_lkb ls_stub_lkb; /* for returning errors */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index eb507c453c5f..17903b491298 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -56,6 +56,7 @@
56 L: receive_xxxx_reply() <- R: send_xxxx_reply() 56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/ 57*/
58#include <linux/types.h> 58#include <linux/types.h>
59#include <linux/slab.h>
59#include "dlm_internal.h" 60#include "dlm_internal.h"
60#include <linux/dlm_device.h> 61#include <linux/dlm_device.h>
61#include "memory.h" 62#include "memory.h"
@@ -307,7 +308,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
307 lkb->lkb_lksb->sb_status = rv; 308 lkb->lkb_lksb->sb_status = rv;
308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; 309 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
309 310
310 dlm_add_ast(lkb, AST_COMP, 0); 311 dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
311} 312}
312 313
313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 314static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -320,10 +321,12 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
320{ 321{
321 lkb->lkb_time_bast = ktime_get(); 322 lkb->lkb_time_bast = ktime_get();
322 323
323 if (is_master_copy(lkb)) 324 if (is_master_copy(lkb)) {
325 lkb->lkb_bastmode = rqmode; /* printed by debugfs */
324 send_bast(r, lkb, rqmode); 326 send_bast(r, lkb, rqmode);
325 else 327 } else {
326 dlm_add_ast(lkb, AST_BAST, rqmode); 328 dlm_add_ast(lkb, AST_BAST, rqmode);
329 }
327} 330}
328 331
329/* 332/*
@@ -2280,20 +2283,30 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2280 if (can_be_queued(lkb)) { 2283 if (can_be_queued(lkb)) {
2281 error = -EINPROGRESS; 2284 error = -EINPROGRESS;
2282 add_lkb(r, lkb, DLM_LKSTS_WAITING); 2285 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2283 send_blocking_asts(r, lkb);
2284 add_timeout(lkb); 2286 add_timeout(lkb);
2285 goto out; 2287 goto out;
2286 } 2288 }
2287 2289
2288 error = -EAGAIN; 2290 error = -EAGAIN;
2289 if (force_blocking_asts(lkb))
2290 send_blocking_asts_all(r, lkb);
2291 queue_cast(r, lkb, -EAGAIN); 2291 queue_cast(r, lkb, -EAGAIN);
2292
2293 out: 2292 out:
2294 return error; 2293 return error;
2295} 2294}
2296 2295
2296static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2297 int error)
2298{
2299 switch (error) {
2300 case -EAGAIN:
2301 if (force_blocking_asts(lkb))
2302 send_blocking_asts_all(r, lkb);
2303 break;
2304 case -EINPROGRESS:
2305 send_blocking_asts(r, lkb);
2306 break;
2307 }
2308}
2309
2297static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) 2310static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2298{ 2311{
2299 int error = 0; 2312 int error = 0;
@@ -2304,7 +2317,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2304 if (can_be_granted(r, lkb, 1, &deadlk)) { 2317 if (can_be_granted(r, lkb, 1, &deadlk)) {
2305 grant_lock(r, lkb); 2318 grant_lock(r, lkb);
2306 queue_cast(r, lkb, 0); 2319 queue_cast(r, lkb, 0);
2307 grant_pending_locks(r);
2308 goto out; 2320 goto out;
2309 } 2321 }
2310 2322
@@ -2334,7 +2346,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2334 if (_can_be_granted(r, lkb, 1)) { 2346 if (_can_be_granted(r, lkb, 1)) {
2335 grant_lock(r, lkb); 2347 grant_lock(r, lkb);
2336 queue_cast(r, lkb, 0); 2348 queue_cast(r, lkb, 0);
2337 grant_pending_locks(r);
2338 goto out; 2349 goto out;
2339 } 2350 }
2340 /* else fall through and move to convert queue */ 2351 /* else fall through and move to convert queue */
@@ -2344,28 +2355,47 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2344 error = -EINPROGRESS; 2355 error = -EINPROGRESS;
2345 del_lkb(r, lkb); 2356 del_lkb(r, lkb);
2346 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 2357 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2347 send_blocking_asts(r, lkb);
2348 add_timeout(lkb); 2358 add_timeout(lkb);
2349 goto out; 2359 goto out;
2350 } 2360 }
2351 2361
2352 error = -EAGAIN; 2362 error = -EAGAIN;
2353 if (force_blocking_asts(lkb))
2354 send_blocking_asts_all(r, lkb);
2355 queue_cast(r, lkb, -EAGAIN); 2363 queue_cast(r, lkb, -EAGAIN);
2356
2357 out: 2364 out:
2358 return error; 2365 return error;
2359} 2366}
2360 2367
2368static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2369 int error)
2370{
2371 switch (error) {
2372 case 0:
2373 grant_pending_locks(r);
2374 /* grant_pending_locks also sends basts */
2375 break;
2376 case -EAGAIN:
2377 if (force_blocking_asts(lkb))
2378 send_blocking_asts_all(r, lkb);
2379 break;
2380 case -EINPROGRESS:
2381 send_blocking_asts(r, lkb);
2382 break;
2383 }
2384}
2385
2361static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) 2386static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2362{ 2387{
2363 remove_lock(r, lkb); 2388 remove_lock(r, lkb);
2364 queue_cast(r, lkb, -DLM_EUNLOCK); 2389 queue_cast(r, lkb, -DLM_EUNLOCK);
2365 grant_pending_locks(r);
2366 return -DLM_EUNLOCK; 2390 return -DLM_EUNLOCK;
2367} 2391}
2368 2392
2393static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2394 int error)
2395{
2396 grant_pending_locks(r);
2397}
2398
2369/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ 2399/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
2370 2400
2371static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) 2401static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2375,12 +2405,18 @@ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2375 error = revert_lock(r, lkb); 2405 error = revert_lock(r, lkb);
2376 if (error) { 2406 if (error) {
2377 queue_cast(r, lkb, -DLM_ECANCEL); 2407 queue_cast(r, lkb, -DLM_ECANCEL);
2378 grant_pending_locks(r);
2379 return -DLM_ECANCEL; 2408 return -DLM_ECANCEL;
2380 } 2409 }
2381 return 0; 2410 return 0;
2382} 2411}
2383 2412
2413static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2414 int error)
2415{
2416 if (error)
2417 grant_pending_locks(r);
2418}
2419
2384/* 2420/*
2385 * Four stage 3 varieties: 2421 * Four stage 3 varieties:
2386 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock() 2422 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
@@ -2402,11 +2438,15 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2402 goto out; 2438 goto out;
2403 } 2439 }
2404 2440
2405 if (is_remote(r)) 2441 if (is_remote(r)) {
2406 /* receive_request() calls do_request() on remote node */ 2442 /* receive_request() calls do_request() on remote node */
2407 error = send_request(r, lkb); 2443 error = send_request(r, lkb);
2408 else 2444 } else {
2409 error = do_request(r, lkb); 2445 error = do_request(r, lkb);
2446 /* for remote locks the request_reply is sent
2447 between do_request and do_request_effects */
2448 do_request_effects(r, lkb, error);
2449 }
2410 out: 2450 out:
2411 return error; 2451 return error;
2412} 2452}
@@ -2417,11 +2457,15 @@ static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2417{ 2457{
2418 int error; 2458 int error;
2419 2459
2420 if (is_remote(r)) 2460 if (is_remote(r)) {
2421 /* receive_convert() calls do_convert() on remote node */ 2461 /* receive_convert() calls do_convert() on remote node */
2422 error = send_convert(r, lkb); 2462 error = send_convert(r, lkb);
2423 else 2463 } else {
2424 error = do_convert(r, lkb); 2464 error = do_convert(r, lkb);
2465 /* for remote locks the convert_reply is sent
2466 between do_convert and do_convert_effects */
2467 do_convert_effects(r, lkb, error);
2468 }
2425 2469
2426 return error; 2470 return error;
2427} 2471}
@@ -2432,11 +2476,15 @@ static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2432{ 2476{
2433 int error; 2477 int error;
2434 2478
2435 if (is_remote(r)) 2479 if (is_remote(r)) {
2436 /* receive_unlock() calls do_unlock() on remote node */ 2480 /* receive_unlock() calls do_unlock() on remote node */
2437 error = send_unlock(r, lkb); 2481 error = send_unlock(r, lkb);
2438 else 2482 } else {
2439 error = do_unlock(r, lkb); 2483 error = do_unlock(r, lkb);
2484 /* for remote locks the unlock_reply is sent
2485 between do_unlock and do_unlock_effects */
2486 do_unlock_effects(r, lkb, error);
2487 }
2440 2488
2441 return error; 2489 return error;
2442} 2490}
@@ -2447,11 +2495,15 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2447{ 2495{
2448 int error; 2496 int error;
2449 2497
2450 if (is_remote(r)) 2498 if (is_remote(r)) {
2451 /* receive_cancel() calls do_cancel() on remote node */ 2499 /* receive_cancel() calls do_cancel() on remote node */
2452 error = send_cancel(r, lkb); 2500 error = send_cancel(r, lkb);
2453 else 2501 } else {
2454 error = do_cancel(r, lkb); 2502 error = do_cancel(r, lkb);
2503 /* for remote locks the cancel_reply is sent
2504 between do_cancel and do_cancel_effects */
2505 do_cancel_effects(r, lkb, error);
2506 }
2455 2507
2456 return error; 2508 return error;
2457} 2509}
@@ -2689,7 +2741,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
2689 pass into lowcomms_commit and a message buffer (mb) that we 2741 pass into lowcomms_commit and a message buffer (mb) that we
2690 write our data into */ 2742 write our data into */
2691 2743
2692 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); 2744 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
2693 if (!mh) 2745 if (!mh)
2694 return -ENOBUFS; 2746 return -ENOBUFS;
2695 2747
@@ -3191,6 +3243,7 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
3191 attach_lkb(r, lkb); 3243 attach_lkb(r, lkb);
3192 error = do_request(r, lkb); 3244 error = do_request(r, lkb);
3193 send_request_reply(r, lkb, error); 3245 send_request_reply(r, lkb, error);
3246 do_request_effects(r, lkb, error);
3194 3247
3195 unlock_rsb(r); 3248 unlock_rsb(r);
3196 put_rsb(r); 3249 put_rsb(r);
@@ -3226,15 +3279,19 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
3226 goto out; 3279 goto out;
3227 3280
3228 receive_flags(lkb, ms); 3281 receive_flags(lkb, ms);
3282
3229 error = receive_convert_args(ls, lkb, ms); 3283 error = receive_convert_args(ls, lkb, ms);
3230 if (error) 3284 if (error) {
3231 goto out_reply; 3285 send_convert_reply(r, lkb, error);
3286 goto out;
3287 }
3288
3232 reply = !down_conversion(lkb); 3289 reply = !down_conversion(lkb);
3233 3290
3234 error = do_convert(r, lkb); 3291 error = do_convert(r, lkb);
3235 out_reply:
3236 if (reply) 3292 if (reply)
3237 send_convert_reply(r, lkb, error); 3293 send_convert_reply(r, lkb, error);
3294 do_convert_effects(r, lkb, error);
3238 out: 3295 out:
3239 unlock_rsb(r); 3296 unlock_rsb(r);
3240 put_rsb(r); 3297 put_rsb(r);
@@ -3266,13 +3323,16 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
3266 goto out; 3323 goto out;
3267 3324
3268 receive_flags(lkb, ms); 3325 receive_flags(lkb, ms);
3326
3269 error = receive_unlock_args(ls, lkb, ms); 3327 error = receive_unlock_args(ls, lkb, ms);
3270 if (error) 3328 if (error) {
3271 goto out_reply; 3329 send_unlock_reply(r, lkb, error);
3330 goto out;
3331 }
3272 3332
3273 error = do_unlock(r, lkb); 3333 error = do_unlock(r, lkb);
3274 out_reply:
3275 send_unlock_reply(r, lkb, error); 3334 send_unlock_reply(r, lkb, error);
3335 do_unlock_effects(r, lkb, error);
3276 out: 3336 out:
3277 unlock_rsb(r); 3337 unlock_rsb(r);
3278 put_rsb(r); 3338 put_rsb(r);
@@ -3307,6 +3367,7 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
3307 3367
3308 error = do_cancel(r, lkb); 3368 error = do_cancel(r, lkb);
3309 send_cancel_reply(r, lkb, error); 3369 send_cancel_reply(r, lkb, error);
3370 do_cancel_effects(r, lkb, error);
3310 out: 3371 out:
3311 unlock_rsb(r); 3372 unlock_rsb(r);
3312 put_rsb(r); 3373 put_rsb(r);
@@ -4512,7 +4573,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4512 } 4573 }
4513 4574
4514 if (flags & DLM_LKF_VALBLK) { 4575 if (flags & DLM_LKF_VALBLK) {
4515 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); 4576 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
4516 if (!ua->lksb.sb_lvbptr) { 4577 if (!ua->lksb.sb_lvbptr) {
4517 kfree(ua); 4578 kfree(ua);
4518 __put_lkb(ls, lkb); 4579 __put_lkb(ls, lkb);
@@ -4582,7 +4643,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4582 ua = lkb->lkb_ua; 4643 ua = lkb->lkb_ua;
4583 4644
4584 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { 4645 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
4585 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); 4646 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
4586 if (!ua->lksb.sb_lvbptr) { 4647 if (!ua->lksb.sb_lvbptr) {
4587 error = -ENOMEM; 4648 error = -ENOMEM;
4588 goto out_put; 4649 goto out_put;
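The other substantial change in lock.c is mechanical but deliberate: each of do_request/do_convert/do_unlock/do_cancel is split into a result phase and a do_*_effects() phase, and the receive-side handlers send their reply between the two. The requesting node therefore learns its own outcome before the blocking asts or grant_pending_locks() traffic triggered by that outcome reaches anyone else; on the local path the two calls simply run back to back. Restated from receive_request() above:

	error = do_request(r, lkb);		/* grant, queue, or -EAGAIN */
	send_request_reply(r, lkb, error);	/* requester hears first */
	do_request_effects(r, lkb, error);	/* then basts/pending grants */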
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d489fcc86713..f994a7dfda85 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -148,7 +148,7 @@ static void lockspace_kobj_release(struct kobject *k)
148 kfree(ls); 148 kfree(ls);
149} 149}
150 150
151static struct sysfs_ops dlm_attr_ops = { 151static const struct sysfs_ops dlm_attr_ops = {
152 .show = dlm_attr_show, 152 .show = dlm_attr_show,
153 .store = dlm_attr_store, 153 .store = dlm_attr_store,
154}; 154};
@@ -191,6 +191,18 @@ static int do_uevent(struct dlm_ls *ls, int in)
191 return error; 191 return error;
192} 192}
193 193
194static int dlm_uevent(struct kset *kset, struct kobject *kobj,
195 struct kobj_uevent_env *env)
196{
197 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
198
199 add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name);
200 return 0;
201}
202
203static struct kset_uevent_ops dlm_uevent_ops = {
204 .uevent = dlm_uevent,
205};
194 206
195int __init dlm_lockspace_init(void) 207int __init dlm_lockspace_init(void)
196{ 208{
@@ -199,7 +211,7 @@ int __init dlm_lockspace_init(void)
199 INIT_LIST_HEAD(&lslist); 211 INIT_LIST_HEAD(&lslist);
200 spin_lock_init(&lslist_lock); 212 spin_lock_init(&lslist_lock);
201 213
202 dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj); 214 dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj);
203 if (!dlm_kset) { 215 if (!dlm_kset) {
204 printk(KERN_WARNING "%s: can not create kset\n", __func__); 216 printk(KERN_WARNING "%s: can not create kset\n", __func__);
205 return -ENOMEM; 217 return -ENOMEM;
@@ -430,7 +442,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
430 442
431 error = -ENOMEM; 443 error = -ENOMEM;
432 444
433 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); 445 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
434 if (!ls) 446 if (!ls)
435 goto out; 447 goto out;
436 memcpy(ls->ls_name, name, namelen); 448 memcpy(ls->ls_name, name, namelen);
@@ -443,11 +455,6 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
443 if (flags & DLM_LSFL_TIMEWARN) 455 if (flags & DLM_LSFL_TIMEWARN)
444 set_bit(LSFL_TIMEWARN, &ls->ls_flags); 456 set_bit(LSFL_TIMEWARN, &ls->ls_flags);
445 457
446 if (flags & DLM_LSFL_FS)
447 ls->ls_allocation = GFP_NOFS;
448 else
449 ls->ls_allocation = GFP_KERNEL;
450
451 /* ls_exflags are forced to match among nodes, and we don't 458 /* ls_exflags are forced to match among nodes, and we don't
452 need to require all nodes to have some flags set */ 459 need to require all nodes to have some flags set */
453 ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | 460 ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
@@ -456,7 +463,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
456 size = dlm_config.ci_rsbtbl_size; 463 size = dlm_config.ci_rsbtbl_size;
457 ls->ls_rsbtbl_size = size; 464 ls->ls_rsbtbl_size = size;
458 465
459 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); 466 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS);
460 if (!ls->ls_rsbtbl) 467 if (!ls->ls_rsbtbl)
461 goto out_lsfree; 468 goto out_lsfree;
462 for (i = 0; i < size; i++) { 469 for (i = 0; i < size; i++) {
@@ -468,7 +475,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
468 size = dlm_config.ci_lkbtbl_size; 475 size = dlm_config.ci_lkbtbl_size;
469 ls->ls_lkbtbl_size = size; 476 ls->ls_lkbtbl_size = size;
470 477
471 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); 478 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS);
472 if (!ls->ls_lkbtbl) 479 if (!ls->ls_lkbtbl)
473 goto out_rsbfree; 480 goto out_rsbfree;
474 for (i = 0; i < size; i++) { 481 for (i = 0; i < size; i++) {
@@ -480,7 +487,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
480 size = dlm_config.ci_dirtbl_size; 487 size = dlm_config.ci_dirtbl_size;
481 ls->ls_dirtbl_size = size; 488 ls->ls_dirtbl_size = size;
482 489
483 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); 490 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS);
484 if (!ls->ls_dirtbl) 491 if (!ls->ls_dirtbl)
485 goto out_lkbfree; 492 goto out_lkbfree;
486 for (i = 0; i < size; i++) { 493 for (i = 0; i < size; i++) {
@@ -527,7 +534,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
527 mutex_init(&ls->ls_requestqueue_mutex); 534 mutex_init(&ls->ls_requestqueue_mutex);
528 mutex_init(&ls->ls_clear_proc_locks); 535 mutex_init(&ls->ls_clear_proc_locks);
529 536
530 ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); 537 ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
531 if (!ls->ls_recover_buf) 538 if (!ls->ls_recover_buf)
532 goto out_dirfree; 539 goto out_dirfree;
533 540
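Giving the "dlm" kset a kset_uevent_ops means the online/offline uevents a lockspace emits now carry a LOCKSPACE=<name> environment variable, so the userspace listener (dlm_controld, presumably) can identify the lockspace without parsing the kobject path. The remaining lockspace.c hunks are the GFP_NOFS conversion, the removal of the ls_allocation setup it obsoletes, and a constification of the sysfs_ops table.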
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 70736eb4b516..c0d35c620526 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -51,6 +51,7 @@
51#include <linux/file.h> 51#include <linux/file.h>
52#include <linux/mutex.h> 52#include <linux/mutex.h>
53#include <linux/sctp.h> 53#include <linux/sctp.h>
54#include <linux/slab.h>
54#include <net/sctp/user.h> 55#include <net/sctp/user.h>
55#include <net/ipv6.h> 56#include <net/ipv6.h>
56 57
@@ -1060,7 +1061,7 @@ static void init_local(void)
1060 if (dlm_our_addr(&sas, i)) 1061 if (dlm_our_addr(&sas, i))
1061 break; 1062 break;
1062 1063
1063 addr = kmalloc(sizeof(*addr), GFP_KERNEL); 1064 addr = kmalloc(sizeof(*addr), GFP_NOFS);
1064 if (!addr) 1065 if (!addr)
1065 break; 1066 break;
1066 memcpy(addr, &sas, sizeof(*addr)); 1067 memcpy(addr, &sas, sizeof(*addr));
@@ -1099,7 +1100,7 @@ static int sctp_listen_for_all(void)
1099 struct sockaddr_storage localaddr; 1100 struct sockaddr_storage localaddr;
1100 struct sctp_event_subscribe subscribe; 1101 struct sctp_event_subscribe subscribe;
1101 int result = -EINVAL, num = 1, i, addr_len; 1102 int result = -EINVAL, num = 1, i, addr_len;
1102 struct connection *con = nodeid2con(0, GFP_KERNEL); 1103 struct connection *con = nodeid2con(0, GFP_NOFS);
1103 int bufsize = NEEDED_RMEM; 1104 int bufsize = NEEDED_RMEM;
1104 1105
1105 if (!con) 1106 if (!con)
@@ -1171,7 +1172,7 @@ out:
1171static int tcp_listen_for_all(void) 1172static int tcp_listen_for_all(void)
1172{ 1173{
1173 struct socket *sock = NULL; 1174 struct socket *sock = NULL;
1174 struct connection *con = nodeid2con(0, GFP_KERNEL); 1175 struct connection *con = nodeid2con(0, GFP_NOFS);
1175 int result = -EINVAL; 1176 int result = -EINVAL;
1176 1177
1177 if (!con) 1178 if (!con)
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b128775913b2..b12532e553f8 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -48,7 +48,7 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
48 struct dlm_member *memb; 48 struct dlm_member *memb;
49 int w, error; 49 int w, error;
50 50
51 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); 51 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
52 if (!memb) 52 if (!memb)
53 return -ENOMEM; 53 return -ENOMEM;
54 54
@@ -143,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls)
143 143
144 ls->ls_total_weight = total; 144 ls->ls_total_weight = total;
145 145
146 array = kmalloc(sizeof(int) * total, ls->ls_allocation); 146 array = kmalloc(sizeof(int) * total, GFP_NOFS);
147 if (!array) 147 if (!array)
148 return; 148 return;
149 149
@@ -226,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
226 continue; 226 continue;
227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); 227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
228 228
229 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); 229 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
230 if (!memb) 230 if (!memb)
231 return -ENOMEM; 231 return -ENOMEM;
232 memb->nodeid = rv->new[i]; 232 memb->nodeid = rv->new[i];
@@ -312,7 +312,7 @@ int dlm_ls_stop(struct dlm_ls *ls)
312 /* 312 /*
313 * This in_recovery lock does two things: 313 * This in_recovery lock does two things:
314 * 1) Keeps this function from returning until all threads are out 314 * 1) Keeps this function from returning until all threads are out
315 * of locking routines and locking is truely stopped. 315 * of locking routines and locking is truly stopped.
316 * 2) Keeps any new requests from being processed until it's unlocked 316 * 2) Keeps any new requests from being processed until it's unlocked
317 * when recovery is complete. 317 * when recovery is complete.
318 */ 318 */
@@ -341,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
341 int *ids = NULL, *new = NULL; 341 int *ids = NULL, *new = NULL;
342 int error, ids_count = 0, new_count = 0; 342 int error, ids_count = 0, new_count = 0;
343 343
344 rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation); 344 rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
345 if (!rv) 345 if (!rv)
346 return -ENOMEM; 346 return -ENOMEM;
347 347
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index c1775b84ebab..8e0d00db004f 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
39{ 39{
40 char *p; 40 char *p;
41 41
42 p = kzalloc(ls->ls_lvblen, ls->ls_allocation); 42 p = kzalloc(ls->ls_lvblen, GFP_NOFS);
43 return p; 43 return p;
44} 44}
45 45
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
57 57
58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); 58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
59 59
60 r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation); 60 r = kzalloc(sizeof(*r) + namelen, GFP_NOFS);
61 return r; 61 return r;
62} 62}
63 63
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
72{ 72{
73 struct dlm_lkb *lkb; 73 struct dlm_lkb *lkb;
74 74
75 lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation); 75 lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS);
76 return lkb; 76 return lkb;
77} 77}
78 78
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 55ea369f43a9..2c6ad518100d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -9,6 +9,7 @@
9#include <net/genetlink.h> 9#include <net/genetlink.h>
10#include <linux/dlm.h> 10#include <linux/dlm.h>
11#include <linux/dlm_netlink.h> 11#include <linux/dlm_netlink.h>
12#include <linux/gfp.h>
12 13
13#include "dlm_internal.h" 14#include "dlm_internal.h"
14 15
@@ -26,7 +27,7 @@ static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
26 struct sk_buff *skb; 27 struct sk_buff *skb;
27 void *data; 28 void *data;
28 29
29 skb = genlmsg_new(size, GFP_KERNEL); 30 skb = genlmsg_new(size, GFP_NOFS);
30 if (!skb) 31 if (!skb)
31 return -ENOMEM; 32 return -ENOMEM;
32 33
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 16f682e26c07..d45c02db6943 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,7 @@
11#include <linux/poll.h> 11#include <linux/poll.h>
12#include <linux/dlm.h> 12#include <linux/dlm.h>
13#include <linux/dlm_plock.h> 13#include <linux/dlm_plock.h>
14#include <linux/slab.h>
14 15
15#include "dlm_internal.h" 16#include "dlm_internal.h"
16#include "lockspace.h" 17#include "lockspace.h"
@@ -82,7 +83,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
82 if (!ls) 83 if (!ls)
83 return -EINVAL; 84 return -EINVAL;
84 85
85 xop = kzalloc(sizeof(*xop), GFP_KERNEL); 86 xop = kzalloc(sizeof(*xop), GFP_NOFS);
86 if (!xop) { 87 if (!xop) {
87 rv = -ENOMEM; 88 rv = -ENOMEM;
88 goto out; 89 goto out;
@@ -143,7 +144,7 @@ out:
143} 144}
144EXPORT_SYMBOL_GPL(dlm_posix_lock); 145EXPORT_SYMBOL_GPL(dlm_posix_lock);
145 146
146/* Returns failure iff a succesful lock operation should be canceled */ 147/* Returns failure iff a successful lock operation should be canceled */
147static int dlm_plock_callback(struct plock_op *op) 148static int dlm_plock_callback(struct plock_op *op)
148{ 149{
149 struct file *file; 150 struct file *file;
@@ -211,7 +212,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
211 if (!ls) 212 if (!ls)
212 return -EINVAL; 213 return -EINVAL;
213 214
214 op = kzalloc(sizeof(*op), GFP_KERNEL); 215 op = kzalloc(sizeof(*op), GFP_NOFS);
215 if (!op) { 216 if (!op) {
216 rv = -ENOMEM; 217 rv = -ENOMEM;
217 goto out; 218 goto out;
@@ -266,7 +267,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
266 if (!ls) 267 if (!ls)
267 return -EINVAL; 268 return -EINVAL;
268 269
269 op = kzalloc(sizeof(*op), GFP_KERNEL); 270 op = kzalloc(sizeof(*op), GFP_NOFS);
270 if (!op) { 271 if (!op) {
271 rv = -ENOMEM; 272 rv = -ENOMEM;
272 goto out; 273 goto out;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 67522c268c14..3c83a49a48a3 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
38 char *mb; 38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len; 39 int mb_len = sizeof(struct dlm_rcom) + len;
40 40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); 41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
42 if (!mh) { 42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS", 43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len); 44 to_nodeid, type, len);
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 7a2307c08911..a44fa22890e1 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
35 struct rq_entry *e; 35 struct rq_entry *e;
36 int length = ms->m_header.h_length - sizeof(struct dlm_message); 36 int length = ms->m_header.h_length - sizeof(struct dlm_message);
37 37
38 e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation); 38 e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
39 if (!e) { 39 if (!e) {
40 log_print("dlm_add_requestqueue: out of memory len %d", length); 40 log_print("dlm_add_requestqueue: out of memory len %d", length);
41 return; 41 return;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebce994ab0b7..8b6e73c47435 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/dlm.h> 18#include <linux/dlm.h>
19#include <linux/dlm_device.h> 19#include <linux/dlm_device.h>
20#include <linux/slab.h>
20 21
21#include "dlm_internal.h" 22#include "dlm_internal.h"
22#include "lockspace.h" 23#include "lockspace.h"
@@ -173,7 +174,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
173/* we could possibly check if the cancel of an orphan has resulted in the lkb 174/* we could possibly check if the cancel of an orphan has resulted in the lkb
174 being removed and then remove that lkb from the orphans list and free it */ 175 being removed and then remove that lkb from the orphans list and free it */
175 176
176void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode) 177void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
177{ 178{
178 struct dlm_ls *ls; 179 struct dlm_ls *ls;
179 struct dlm_user_args *ua; 180 struct dlm_user_args *ua;
@@ -206,8 +207,10 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
206 207
207 ast_type = lkb->lkb_ast_type; 208 ast_type = lkb->lkb_ast_type;
208 lkb->lkb_ast_type |= type; 209 lkb->lkb_ast_type |= type;
209 if (bastmode) 210 if (type == AST_BAST)
210 lkb->lkb_bastmode = bastmode; 211 lkb->lkb_bastmode = mode;
212 else
213 lkb->lkb_castmode = mode;
211 214
212 if (!ast_type) { 215 if (!ast_type) {
213 kref_get(&lkb->lkb_ref); 216 kref_get(&lkb->lkb_ref);
@@ -267,7 +270,7 @@ static int device_user_lock(struct dlm_user_proc *proc,
267 goto out; 270 goto out;
268 } 271 }
269 272
270 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); 273 ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
271 if (!ua) 274 if (!ua)
272 goto out; 275 goto out;
273 ua->proc = proc; 276 ua->proc = proc;
@@ -307,7 +310,7 @@ static int device_user_unlock(struct dlm_user_proc *proc,
307 if (!ls) 310 if (!ls)
308 return -ENOENT; 311 return -ENOENT;
309 312
310 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); 313 ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
311 if (!ua) 314 if (!ua)
312 goto out; 315 goto out;
313 ua->proc = proc; 316 ua->proc = proc;
@@ -352,7 +355,7 @@ static int dlm_device_register(struct dlm_ls *ls, char *name)
352 355
353 error = -ENOMEM; 356 error = -ENOMEM;
354 len = strlen(name) + strlen(name_prefix) + 2; 357 len = strlen(name) + strlen(name_prefix) + 2;
355 ls->ls_device.name = kzalloc(len, GFP_KERNEL); 358 ls->ls_device.name = kzalloc(len, GFP_NOFS);
356 if (!ls->ls_device.name) 359 if (!ls->ls_device.name)
357 goto fail; 360 goto fail;
358 361
@@ -520,7 +523,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
520#endif 523#endif
521 return -EINVAL; 524 return -EINVAL;
522 525
523 kbuf = kzalloc(count + 1, GFP_KERNEL); 526 kbuf = kzalloc(count + 1, GFP_NOFS);
524 if (!kbuf) 527 if (!kbuf)
525 return -ENOMEM; 528 return -ENOMEM;
526 529
@@ -546,7 +549,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
546 549
547 /* add 1 after namelen so that the name string is terminated */ 550 /* add 1 after namelen so that the name string is terminated */
548 kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1, 551 kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
549 GFP_KERNEL); 552 GFP_NOFS);
550 if (!kbuf) { 553 if (!kbuf) {
551 kfree(k32buf); 554 kfree(k32buf);
552 return -ENOMEM; 555 return -ENOMEM;
@@ -648,7 +651,7 @@ static int device_open(struct inode *inode, struct file *file)
648 if (!ls) 651 if (!ls)
649 return -ENOENT; 652 return -ENOENT;
650 653
651 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); 654 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS);
652 if (!proc) { 655 if (!proc) {
653 dlm_put_lockspace(ls); 656 dlm_put_lockspace(ls);
654 return -ENOMEM; 657 return -ENOMEM;
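The dlm_user_add_ast() change above generalizes the old bastmode parameter: the single mode argument now lands in lkb_bastmode for a blocking AST and in the new lkb_castmode for a completion AST, keyed on the ast type rather than on the mode being nonzero. A hedged sketch of the dispatch, with stand-in constants and fields for the dlm_internal.h definitions:

#define DEMO_AST_COMP 1		/* stand-in for AST_COMP */
#define DEMO_AST_BAST 2		/* stand-in for AST_BAST */

struct demo_lkb {
	int lkb_ast_type;
	int lkb_bastmode;	/* mode carried by a blocking ast */
	int lkb_castmode;	/* mode carried by a completion ast */
};

static void demo_add_ast(struct demo_lkb *lkb, int type, int mode)
{
	lkb->lkb_ast_type |= type;
	if (type == DEMO_AST_BAST)
		lkb->lkb_bastmode = mode;
	else
		lkb->lkb_castmode = mode;
}

The old code could not record a completion mode at all, since bastmode == 0 doubled as "this is a completion ast".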
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 1c9686492286..f196091dd7ff 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -9,7 +9,7 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode); 12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode);
13int dlm_user_init(void); 13int dlm_user_init(void);
14void dlm_user_exit(void); 14void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 15int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index fbb6e5eed697..efb2b9400391 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <linux/slab.h>
36#include <asm/unaligned.h> 37#include <asm/unaligned.h>
37#include "ecryptfs_kernel.h" 38#include "ecryptfs_kernel.h"
38 39
@@ -1748,7 +1749,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1748 char *cipher_name, size_t *key_size) 1749 char *cipher_name, size_t *key_size)
1749{ 1750{
1750 char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; 1751 char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
1751 char *full_alg_name; 1752 char *full_alg_name = NULL;
1752 int rc; 1753 int rc;
1753 1754
1754 *key_tfm = NULL; 1755 *key_tfm = NULL;
@@ -1763,7 +1764,6 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1763 if (rc) 1764 if (rc)
1764 goto out; 1765 goto out;
1765 *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); 1766 *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
1766 kfree(full_alg_name);
1767 if (IS_ERR(*key_tfm)) { 1767 if (IS_ERR(*key_tfm)) {
1768 rc = PTR_ERR(*key_tfm); 1768 rc = PTR_ERR(*key_tfm);
1769 printk(KERN_ERR "Unable to allocate crypto cipher with name " 1769 printk(KERN_ERR "Unable to allocate crypto cipher with name "
@@ -1786,6 +1786,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1786 goto out; 1786 goto out;
1787 } 1787 }
1788out: 1788out:
1789 kfree(full_alg_name);
1789 return rc; 1790 return rc;
1790} 1791}
1791 1792
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 2dda5ade75bc..906e803f7f79 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -26,6 +26,7 @@
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/fs_stack.h> 28#include <linux/fs_stack.h>
29#include <linux/slab.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
30 31
31/** 32/**
@@ -62,7 +63,7 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
62 struct inode *lower_inode = 63 struct inode *lower_inode =
63 ecryptfs_inode_to_lower(dentry->d_inode); 64 ecryptfs_inode_to_lower(dentry->d_inode);
64 65
65 fsstack_copy_attr_all(dentry->d_inode, lower_inode, NULL); 66 fsstack_copy_attr_all(dentry->d_inode, lower_inode);
66 } 67 }
67out: 68out:
68 return rc; 69 return rc;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 9e944057001b..e7440a6f5ebf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -25,6 +25,7 @@
25 25
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/poll.h> 27#include <linux/poll.h>
28#include <linux/slab.h>
28#include <linux/mount.h> 29#include <linux/mount.h>
29#include <linux/pagemap.h> 30#include <linux/pagemap.h>
30#include <linux/security.h> 31#include <linux/security.h>
@@ -158,7 +159,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
158 struct dentry *ecryptfs_dentry = file->f_path.dentry; 159 struct dentry *ecryptfs_dentry = file->f_path.dentry;
159 /* Private value of ecryptfs_dentry allocated in 160 /* Private value of ecryptfs_dentry allocated in
160 * ecryptfs_lookup() */ 161 * ecryptfs_lookup() */
161 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 162 struct dentry *lower_dentry;
162 struct ecryptfs_file_info *file_info; 163 struct ecryptfs_file_info *file_info;
163 164
164 mount_crypt_stat = &ecryptfs_superblock_to_private( 165 mount_crypt_stat = &ecryptfs_superblock_to_private(
@@ -191,13 +192,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
191 | ECRYPTFS_ENCRYPTED); 192 | ECRYPTFS_ENCRYPTED);
192 } 193 }
193 mutex_unlock(&crypt_stat->cs_mutex); 194 mutex_unlock(&crypt_stat->cs_mutex);
194 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
195 && !(file->f_flags & O_RDONLY)) {
196 rc = -EPERM;
197 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
198 "file must hence be opened RO\n", __func__);
199 goto out;
200 }
201 if (!ecryptfs_inode_to_private(inode)->lower_file) { 195 if (!ecryptfs_inode_to_private(inode)->lower_file) {
202 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 196 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
203 if (rc) { 197 if (rc) {
@@ -208,6 +202,13 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
208 goto out; 202 goto out;
209 } 203 }
210 } 204 }
205 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
206 && !(file->f_flags & O_RDONLY)) {
207 rc = -EPERM;
208 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
209 "file must hence be opened RO\n", __func__);
210 goto out;
211 }
211 ecryptfs_set_file_lower( 212 ecryptfs_set_file_lower(
212 file, ecryptfs_inode_to_private(inode)->lower_file); 213 file, ecryptfs_inode_to_private(inode)->lower_file);
213 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { 214 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
@@ -299,7 +300,6 @@ static int ecryptfs_ioctl(struct inode *inode, struct file *file,
299const struct file_operations ecryptfs_dir_fops = { 300const struct file_operations ecryptfs_dir_fops = {
300 .readdir = ecryptfs_readdir, 301 .readdir = ecryptfs_readdir,
301 .ioctl = ecryptfs_ioctl, 302 .ioctl = ecryptfs_ioctl,
302 .mmap = generic_file_mmap,
303 .open = ecryptfs_open, 303 .open = ecryptfs_open,
304 .flush = ecryptfs_flush, 304 .flush = ecryptfs_flush,
305 .release = ecryptfs_release, 305 .release = ecryptfs_release,
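The reordering in ecryptfs_open() is a NULL-dereference fix, not a cosmetic move: the read-only check inspects lower_file->f_flags, but lower_file may not exist until ecryptfs_init_persistent_file() has run, so the check has to follow the initialization. A sketch with stand-in types and a hypothetical init helper, mirroring the flag test the diff itself uses:

#include <linux/fs.h>

struct demo_priv {
	struct file *lower_file;
};

static int demo_init_lower(struct demo_priv *priv);	/* hypothetical */

static int demo_open_check(struct demo_priv *priv, struct file *file)
{
	if (!priv->lower_file) {
		int rc = demo_init_lower(priv);

		if (rc)
			return rc;
	}
	/* Safe only now: lower_file is guaranteed to be set. */
	if ((priv->lower_file->f_flags & O_RDONLY) &&
	    !(file->f_flags & O_RDONLY))
		return -EPERM;
	return 0;
}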
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 056fed62d0de..d3362faf3852 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/slab.h>
34#include <asm/unaligned.h> 35#include <asm/unaligned.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
@@ -282,7 +283,8 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
282 goto out; 283 goto out;
283 } 284 }
284 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, 285 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
285 ecryptfs_dir_inode->i_sb, 1); 286 ecryptfs_dir_inode->i_sb,
287 ECRYPTFS_INTERPOSE_FLAG_D_ADD);
286 if (rc) { 288 if (rc) {
287 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", 289 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
288 __func__, rc); 290 __func__, rc);
@@ -463,9 +465,6 @@ out_lock:
463 unlock_dir(lower_dir_dentry); 465 unlock_dir(lower_dir_dentry);
464 dput(lower_new_dentry); 466 dput(lower_new_dentry);
465 dput(lower_old_dentry); 467 dput(lower_old_dentry);
466 d_drop(lower_old_dentry);
467 d_drop(new_dentry);
468 d_drop(old_dentry);
469 return rc; 468 return rc;
470} 469}
471 470
@@ -614,6 +613,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
614 struct dentry *lower_new_dentry; 613 struct dentry *lower_new_dentry;
615 struct dentry *lower_old_dir_dentry; 614 struct dentry *lower_old_dir_dentry;
616 struct dentry *lower_new_dir_dentry; 615 struct dentry *lower_new_dir_dentry;
616 struct dentry *trap = NULL;
617 617
618 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); 618 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
619 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); 619 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
@@ -621,14 +621,24 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
621 dget(lower_new_dentry); 621 dget(lower_new_dentry);
622 lower_old_dir_dentry = dget_parent(lower_old_dentry); 622 lower_old_dir_dentry = dget_parent(lower_old_dentry);
623 lower_new_dir_dentry = dget_parent(lower_new_dentry); 623 lower_new_dir_dentry = dget_parent(lower_new_dentry);
624 lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); 624 trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
625 /* source should not be ancestor of target */
626 if (trap == lower_old_dentry) {
627 rc = -EINVAL;
628 goto out_lock;
629 }
630 /* target should not be ancestor of source */
631 if (trap == lower_new_dentry) {
632 rc = -ENOTEMPTY;
633 goto out_lock;
634 }
625 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, 635 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
626 lower_new_dir_dentry->d_inode, lower_new_dentry); 636 lower_new_dir_dentry->d_inode, lower_new_dentry);
627 if (rc) 637 if (rc)
628 goto out_lock; 638 goto out_lock;
629 fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode, NULL); 639 fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode);
630 if (new_dir != old_dir) 640 if (new_dir != old_dir)
631 fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode, NULL); 641 fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode);
632out_lock: 642out_lock:
633 unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); 643 unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
634 dput(lower_new_dentry->d_parent); 644 dput(lower_new_dentry->d_parent);
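lock_rename() returns the "trap" dentry, the point at which the two rename paths would collide when one directory lies inside the other. Comparing the trap against the source and target dentries, as the hunk above does on the lower filesystem, rejects the two degenerate topologies before vfs_rename() can be handed them. A condensed sketch of the contract:

#include <linux/fs.h>

static int demo_rename_lower(struct dentry *old_dir, struct dentry *old_dentry,
			     struct dentry *new_dir, struct dentry *new_dentry)
{
	struct dentry *trap;
	int rc;

	trap = lock_rename(old_dir, new_dir);
	if (trap == old_dentry) {	/* source is an ancestor of target */
		rc = -EINVAL;
		goto out;
	}
	if (trap == new_dentry) {	/* target is an ancestor of source */
		rc = -ENOTEMPTY;
		goto out;
	}
	rc = vfs_rename(old_dir->d_inode, old_dentry,
			new_dir->d_inode, new_dentry);
out:
	unlock_rename(old_dir, new_dir);
	return rc;
}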
@@ -715,31 +725,31 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
715 /* Released in ecryptfs_put_link(); only release here on error */ 725 /* Released in ecryptfs_put_link(); only release here on error */
716 buf = kmalloc(len, GFP_KERNEL); 726 buf = kmalloc(len, GFP_KERNEL);
717 if (!buf) { 727 if (!buf) {
718 rc = -ENOMEM; 728 buf = ERR_PTR(-ENOMEM);
719 goto out; 729 goto out;
720 } 730 }
721 old_fs = get_fs(); 731 old_fs = get_fs();
722 set_fs(get_ds()); 732 set_fs(get_ds());
723 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); 733 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
724 set_fs(old_fs); 734 set_fs(old_fs);
725 if (rc < 0) 735 if (rc < 0) {
726 goto out_free; 736 kfree(buf);
727 else 737 buf = ERR_PTR(rc);
738 } else
728 buf[rc] = '\0'; 739 buf[rc] = '\0';
729 rc = 0;
730 nd_set_link(nd, buf);
731 goto out;
732out_free:
733 kfree(buf);
734out: 740out:
735 return ERR_PTR(rc); 741 nd_set_link(nd, buf);
742 return NULL;
736} 743}
737 744
738static void 745static void
739ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) 746ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
740{ 747{
741 /* Free the char* */ 748 char *buf = nd_get_link(nd);
742 kfree(nd_get_link(nd)); 749 if (!IS_ERR(buf)) {
750 /* Free the char* */
751 kfree(buf);
752 }
743} 753}
744 754
745/** 755/**
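The follow_link() rewrite above adopts the nd_set_link() error protocol: the method always stores something in the nameidata, either the link buffer or an ERR_PTR-encoded error, returns NULL as its cookie, and put_link() frees the buffer only when it is not an error pointer. A sketch of the paired methods under that convention:

#include <linux/err.h>
#include <linux/limits.h>
#include <linux/namei.h>
#include <linux/slab.h>

static void *demo_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);

	if (!buf)
		buf = ERR_PTR(-ENOMEM);
	/* else: fill buf with the target path and NUL-terminate it */
	nd_set_link(nd, buf);
	return NULL;		/* no cookie needed by put_link */
}

static void demo_put_link(struct dentry *dentry, struct nameidata *nd,
			  void *cookie)
{
	char *buf = nd_get_link(nd);

	if (!IS_ERR(buf))
		kfree(buf);	/* only real buffers get freed */
}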
@@ -772,18 +782,23 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
772} 782}
773 783
774/** 784/**
775 * ecryptfs_truncate 785 * truncate_upper
776 * @dentry: The ecryptfs layer dentry 786 * @dentry: The ecryptfs layer dentry
777 * @new_length: The length to expand the file to 787 * @ia: Address of the ecryptfs inode's attributes
788 * @lower_ia: Address of the lower inode's attributes
778 * 789 *
779 * Function to handle truncations modifying the size of the file. Note 790 * Function to handle truncations modifying the size of the file. Note
780 * that the file sizes are interpolated. When expanding, we are simply 791 * that the file sizes are interpolated. When expanding, we are simply
781 * writing strings of 0's out. When truncating, we need to modify the 792 * writing strings of 0's out. When truncating, we truncate the upper
782 * underlying file size according to the page index interpolations. 793 * inode and update the lower_ia according to the page index
794 * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return,
795 * the caller must use lower_ia in a call to notify_change() to perform
796 * the truncation of the lower inode.
783 * 797 *
784 * Returns zero on success; non-zero otherwise 798 * Returns zero on success; non-zero otherwise
785 */ 799 */
786int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) 800static int truncate_upper(struct dentry *dentry, struct iattr *ia,
801 struct iattr *lower_ia)
787{ 802{
788 int rc = 0; 803 int rc = 0;
789 struct inode *inode = dentry->d_inode; 804 struct inode *inode = dentry->d_inode;
@@ -794,8 +809,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
794 loff_t lower_size_before_truncate; 809 loff_t lower_size_before_truncate;
795 loff_t lower_size_after_truncate; 810 loff_t lower_size_after_truncate;
796 811
797 if (unlikely((new_length == i_size))) 812 if (unlikely((ia->ia_size == i_size))) {
813 lower_ia->ia_valid &= ~ATTR_SIZE;
798 goto out; 814 goto out;
815 }
799 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 816 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
800 /* Set up a fake ecryptfs file, this is used to interface with 817 /* Set up a fake ecryptfs file, this is used to interface with
801 * the file in the underlying filesystem so that the 818 * the file in the underlying filesystem so that the
@@ -815,28 +832,30 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
815 &fake_ecryptfs_file, 832 &fake_ecryptfs_file,
816 ecryptfs_inode_to_private(dentry->d_inode)->lower_file); 833 ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
817 /* Switch on growing or shrinking file */ 834 /* Switch on growing or shrinking file */
818 if (new_length > i_size) { 835 if (ia->ia_size > i_size) {
819 char zero[] = { 0x00 }; 836 char zero[] = { 0x00 };
820 837
838 lower_ia->ia_valid &= ~ATTR_SIZE;
821 /* Write a single 0 at the last position of the file; 839 /* Write a single 0 at the last position of the file;
822 * this triggers code that will fill in 0's throughout 840 * this triggers code that will fill in 0's throughout
823 * the intermediate portion of the previous end of the 841 * the intermediate portion of the previous end of the
824 * file and the new and of the file */ 842 * file and the new and of the file */
825 rc = ecryptfs_write(&fake_ecryptfs_file, zero, 843 rc = ecryptfs_write(&fake_ecryptfs_file, zero,
826 (new_length - 1), 1); 844 (ia->ia_size - 1), 1);
827 } else { /* new_length < i_size_read(inode) */ 845 } else { /* ia->ia_size < i_size_read(inode) */
828 /* We're chopping off all the pages down do the page 846 /* We're chopping off all the pages down to the page
829 * in which new_length is located. Fill in the end of 847 * in which ia->ia_size is located. Fill in the end of
830 * that page from (new_length & ~PAGE_CACHE_MASK) to 848 * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to
831 * PAGE_CACHE_SIZE with zeros. */ 849 * PAGE_CACHE_SIZE with zeros. */
832 size_t num_zeros = (PAGE_CACHE_SIZE 850 size_t num_zeros = (PAGE_CACHE_SIZE
833 - (new_length & ~PAGE_CACHE_MASK)); 851 - (ia->ia_size & ~PAGE_CACHE_MASK));
834 852
835 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 853 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
836 rc = vmtruncate(inode, new_length); 854 rc = vmtruncate(inode, ia->ia_size);
837 if (rc) 855 if (rc)
838 goto out_free; 856 goto out_free;
839 rc = vmtruncate(lower_dentry->d_inode, new_length); 857 lower_ia->ia_size = ia->ia_size;
858 lower_ia->ia_valid |= ATTR_SIZE;
840 goto out_free; 859 goto out_free;
841 } 860 }
842 if (num_zeros) { 861 if (num_zeros) {
@@ -848,7 +867,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
848 goto out_free; 867 goto out_free;
849 } 868 }
850 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, 869 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt,
851 new_length, num_zeros); 870 ia->ia_size, num_zeros);
852 kfree(zeros_virt); 871 kfree(zeros_virt);
853 if (rc) { 872 if (rc) {
854 printk(KERN_ERR "Error attempting to zero out " 873 printk(KERN_ERR "Error attempting to zero out "
@@ -857,7 +876,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
857 goto out_free; 876 goto out_free;
858 } 877 }
859 } 878 }
860 vmtruncate(inode, new_length); 879 vmtruncate(inode, ia->ia_size);
861 rc = ecryptfs_write_inode_size_to_metadata(inode); 880 rc = ecryptfs_write_inode_size_to_metadata(inode);
862 if (rc) { 881 if (rc) {
863 printk(KERN_ERR "Problem with " 882 printk(KERN_ERR "Problem with "
@@ -870,10 +889,12 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
870 lower_size_before_truncate = 889 lower_size_before_truncate =
871 upper_size_to_lower_size(crypt_stat, i_size); 890 upper_size_to_lower_size(crypt_stat, i_size);
872 lower_size_after_truncate = 891 lower_size_after_truncate =
873 upper_size_to_lower_size(crypt_stat, new_length); 892 upper_size_to_lower_size(crypt_stat, ia->ia_size);
874 if (lower_size_after_truncate < lower_size_before_truncate) 893 if (lower_size_after_truncate < lower_size_before_truncate) {
875 vmtruncate(lower_dentry->d_inode, 894 lower_ia->ia_size = lower_size_after_truncate;
876 lower_size_after_truncate); 895 lower_ia->ia_valid |= ATTR_SIZE;
896 } else
897 lower_ia->ia_valid &= ~ATTR_SIZE;
877 } 898 }
878out_free: 899out_free:
879 if (ecryptfs_file_to_private(&fake_ecryptfs_file)) 900 if (ecryptfs_file_to_private(&fake_ecryptfs_file))
@@ -883,6 +904,33 @@ out:
883 return rc; 904 return rc;
884} 905}
885 906
907/**
908 * ecryptfs_truncate
909 * @dentry: The ecryptfs layer dentry
910 * @new_length: The length to expand the file to
911 *
912 * Simple function that handles the truncation of an eCryptfs inode and
913 * its corresponding lower inode.
914 *
915 * Returns zero on success; non-zero otherwise
916 */
917int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
918{
919 struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length };
920 struct iattr lower_ia = { .ia_valid = 0 };
921 int rc;
922
923 rc = truncate_upper(dentry, &ia, &lower_ia);
924 if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
925 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
926
927 mutex_lock(&lower_dentry->d_inode->i_mutex);
928 rc = notify_change(lower_dentry, &lower_ia);
929 mutex_unlock(&lower_dentry->d_inode->i_mutex);
930 }
931 return rc;
932}
933
886static int 934static int
887ecryptfs_permission(struct inode *inode, int mask) 935ecryptfs_permission(struct inode *inode, int mask)
888{ 936{
@@ -905,6 +953,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
905{ 953{
906 int rc = 0; 954 int rc = 0;
907 struct dentry *lower_dentry; 955 struct dentry *lower_dentry;
956 struct iattr lower_ia;
908 struct inode *inode; 957 struct inode *inode;
909 struct inode *lower_inode; 958 struct inode *lower_inode;
910 struct ecryptfs_crypt_stat *crypt_stat; 959 struct ecryptfs_crypt_stat *crypt_stat;
@@ -943,15 +992,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
943 } 992 }
944 } 993 }
945 mutex_unlock(&crypt_stat->cs_mutex); 994 mutex_unlock(&crypt_stat->cs_mutex);
995 memcpy(&lower_ia, ia, sizeof(lower_ia));
996 if (ia->ia_valid & ATTR_FILE)
997 lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file);
946 if (ia->ia_valid & ATTR_SIZE) { 998 if (ia->ia_valid & ATTR_SIZE) {
947 ecryptfs_printk(KERN_DEBUG, 999 rc = truncate_upper(dentry, ia, &lower_ia);
948 "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n",
949 ia->ia_valid, ATTR_SIZE);
950 rc = ecryptfs_truncate(dentry, ia->ia_size);
951 /* ecryptfs_truncate handles resizing of the lower file */
952 ia->ia_valid &= ~ATTR_SIZE;
953 ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n",
954 ia->ia_valid);
955 if (rc < 0) 1000 if (rc < 0)
956 goto out; 1001 goto out;
957 } 1002 }
@@ -960,14 +1005,29 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
960 * mode change is for clearing setuid/setgid bits. Allow lower fs 1005 * mode change is for clearing setuid/setgid bits. Allow lower fs
961 * to interpret this in its own way. 1006 * to interpret this in its own way.
962 */ 1007 */
963 if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) 1008 if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
964 ia->ia_valid &= ~ATTR_MODE; 1009 lower_ia.ia_valid &= ~ATTR_MODE;
965 1010
966 mutex_lock(&lower_dentry->d_inode->i_mutex); 1011 mutex_lock(&lower_dentry->d_inode->i_mutex);
967 rc = notify_change(lower_dentry, ia); 1012 rc = notify_change(lower_dentry, &lower_ia);
968 mutex_unlock(&lower_dentry->d_inode->i_mutex); 1013 mutex_unlock(&lower_dentry->d_inode->i_mutex);
969out: 1014out:
970 fsstack_copy_attr_all(inode, lower_inode, NULL); 1015 fsstack_copy_attr_all(inode, lower_inode);
1016 return rc;
1017}
1018
1019int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1020 struct kstat *stat)
1021{
1022 struct kstat lower_stat;
1023 int rc;
1024
1025 rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
1026 ecryptfs_dentry_to_lower(dentry), &lower_stat);
1027 if (!rc) {
1028 generic_fillattr(dentry->d_inode, stat);
1029 stat->blocks = lower_stat.blocks;
1030 }
971 return rc; 1031 return rc;
972} 1032}
973 1033
@@ -1100,6 +1160,7 @@ const struct inode_operations ecryptfs_dir_iops = {
1100const struct inode_operations ecryptfs_main_iops = { 1160const struct inode_operations ecryptfs_main_iops = {
1101 .permission = ecryptfs_permission, 1161 .permission = ecryptfs_permission,
1102 .setattr = ecryptfs_setattr, 1162 .setattr = ecryptfs_setattr,
1163 .getattr = ecryptfs_getattr,
1103 .setxattr = ecryptfs_setxattr, 1164 .setxattr = ecryptfs_setxattr,
1104 .getxattr = ecryptfs_getxattr, 1165 .getxattr = ecryptfs_getxattr,
1105 .listxattr = ecryptfs_listxattr, 1166 .listxattr = ecryptfs_listxattr,
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index a0a7847567e9..89c5476506ef 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -32,6 +32,7 @@
32#include <linux/random.h> 32#include <linux/random.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
37/** 38/**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e14cf7e588db..d8c3a373aafa 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/slab.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
27#include "ecryptfs_kernel.h" 28#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index c6ac85d6c701..af1a8f01ebac 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,7 +35,7 @@
35#include <linux/key.h> 35#include <linux/key.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/fs_stack.h> 37#include <linux/fs_stack.h>
38#include <linux/ima.h> 38#include <linux/slab.h>
39#include "ecryptfs_kernel.h" 39#include "ecryptfs_kernel.h"
40 40
41/** 41/**
@@ -119,7 +119,6 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
119 const struct cred *cred = current_cred(); 119 const struct cred *cred = current_cred();
120 struct ecryptfs_inode_info *inode_info = 120 struct ecryptfs_inode_info *inode_info =
121 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); 121 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
122 int opened_lower_file = 0;
123 int rc = 0; 122 int rc = 0;
124 123
125 mutex_lock(&inode_info->lower_file_mutex); 124 mutex_lock(&inode_info->lower_file_mutex);
@@ -136,12 +135,9 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
136 "for lower_dentry [0x%p] and lower_mnt [0x%p]; " 135 "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
137 "rc = [%d]\n", lower_dentry, lower_mnt, rc); 136 "rc = [%d]\n", lower_dentry, lower_mnt, rc);
138 inode_info->lower_file = NULL; 137 inode_info->lower_file = NULL;
139 } else 138 }
140 opened_lower_file = 1;
141 } 139 }
142 mutex_unlock(&inode_info->lower_file_mutex); 140 mutex_unlock(&inode_info->lower_file_mutex);
143 if (opened_lower_file)
144 ima_counts_get(inode_info->lower_file);
145 return rc; 141 return rc;
146} 142}
147 143
@@ -194,7 +190,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
194 init_special_inode(inode, lower_inode->i_mode, 190 init_special_inode(inode, lower_inode->i_mode,
195 lower_inode->i_rdev); 191 lower_inode->i_rdev);
196 dentry->d_op = &ecryptfs_dops; 192 dentry->d_op = &ecryptfs_dops;
197 fsstack_copy_attr_all(inode, lower_inode, NULL); 193 fsstack_copy_attr_all(inode, lower_inode);
198 /* This size will be overwritten for real files w/ headers and 194 /* This size will be overwritten for real files w/ headers and
199 * other metadata */ 195 * other metadata */
200 fsstack_copy_inode_size(inode, lower_inode); 196 fsstack_copy_inode_size(inode, lower_inode);
@@ -590,8 +586,8 @@ out:
590 * with as much information as it can before needing 586 * with as much information as it can before needing
591 * the lower filesystem. 587 * the lower filesystem.
592 * ecryptfs_read_super(): this accesses the lower filesystem and uses 588 * ecryptfs_read_super(): this accesses the lower filesystem and uses
593 * ecryptfs_interpolate to perform most of the linking 589 * ecryptfs_interpose to perform most of the linking
594 * ecryptfs_interpolate(): links the lower filesystem into ecryptfs 590 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
595 */ 591 */
596static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, 592static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
597 const char *dev_name, void *raw_data, 593 const char *dev_name, void *raw_data,
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..2d8dbce9d485 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,7 @@
20 * 02111-1307, USA. 20 * 02111-1307, USA.
21 */ 21 */
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/slab.h>
23#include <linux/user_namespace.h> 24#include <linux/user_namespace.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include "ecryptfs_kernel.h" 26#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 4ec8f61ccf5a..3745f612bcd4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -24,6 +24,7 @@
24#include <linux/random.h> 24#include <linux/random.h>
25#include <linux/miscdevice.h> 25#include <linux/miscdevice.h>
26#include <linux/poll.h> 26#include <linux/poll.h>
27#include <linux/slab.h>
27#include <linux/wait.h> 28#include <linux/wait.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..d491237c98e7 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..fcef41c1d2cf 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -26,6 +26,7 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h>
29#include <linux/seq_file.h> 30#include <linux/seq_file.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/file.h> 32#include <linux/file.h>
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8b47e4200e65..6bd3f76fdf88 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -11,6 +11,7 @@
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/list.h> 15#include <linux/list.h>
15#include <linux/spinlock.h> 16#include <linux/spinlock.h>
16#include <linux/anon_inodes.h> 17#include <linux/anon_inodes.h>
@@ -135,26 +136,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
135 return events; 136 return events;
136} 137}
137 138
138static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, 139static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
139 loff_t *ppos) 140{
141 *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
142 ctx->count -= *cnt;
143}
144
145/**
 146 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue.
147 * @ctx: [in] Pointer to eventfd context.
148 * @wait: [in] Wait queue to be removed.
 149 * @cnt: [out] Pointer to the 64-bit counter value.
150 *
151 * Returns zero if successful, or the following error codes:
152 *
153 * -EAGAIN : The operation would have blocked.
154 *
155 * This is used to atomically remove a wait queue entry from the eventfd wait
156 * queue head, and read/reset the counter value.
157 */
158int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
159 __u64 *cnt)
160{
161 unsigned long flags;
162
163 spin_lock_irqsave(&ctx->wqh.lock, flags);
164 eventfd_ctx_do_read(ctx, cnt);
165 __remove_wait_queue(&ctx->wqh, wait);
166 if (*cnt != 0 && waitqueue_active(&ctx->wqh))
167 wake_up_locked_poll(&ctx->wqh, POLLOUT);
168 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
169
170 return *cnt != 0 ? 0 : -EAGAIN;
171}
172EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
173
174/**
 175 * eventfd_ctx_read - Reads the eventfd counter or waits if it is zero.
176 * @ctx: [in] Pointer to eventfd context.
 177 * @no_wait: [in] Nonzero if the operation should not block.
 178 * @cnt: [out] Pointer to the 64-bit counter value.
179 *
180 * Returns zero if successful, or the following error codes:
181 *
182 * -EAGAIN : The operation would have blocked but @no_wait was nonzero.
183 * -ERESTARTSYS : A signal interrupted the wait operation.
184 *
185 * If @no_wait is zero, the function might sleep until the eventfd internal
186 * counter becomes greater than zero.
187 */
188ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
140{ 189{
141 struct eventfd_ctx *ctx = file->private_data;
142 ssize_t res; 190 ssize_t res;
143 __u64 ucnt = 0;
144 DECLARE_WAITQUEUE(wait, current); 191 DECLARE_WAITQUEUE(wait, current);
145 192
146 if (count < sizeof(ucnt))
147 return -EINVAL;
148 spin_lock_irq(&ctx->wqh.lock); 193 spin_lock_irq(&ctx->wqh.lock);
194 *cnt = 0;
149 res = -EAGAIN; 195 res = -EAGAIN;
150 if (ctx->count > 0) 196 if (ctx->count > 0)
151 res = sizeof(ucnt); 197 res = 0;
152 else if (!(file->f_flags & O_NONBLOCK)) { 198 else if (!no_wait) {
153 __add_wait_queue(&ctx->wqh, &wait); 199 __add_wait_queue(&ctx->wqh, &wait);
154 for (res = 0;;) { 200 for (;;) {
155 set_current_state(TASK_INTERRUPTIBLE); 201 set_current_state(TASK_INTERRUPTIBLE);
156 if (ctx->count > 0) { 202 if (ctx->count > 0) {
157 res = sizeof(ucnt); 203 res = 0;
158 break; 204 break;
159 } 205 }
160 if (signal_pending(current)) { 206 if (signal_pending(current)) {
@@ -168,18 +214,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
168 __remove_wait_queue(&ctx->wqh, &wait); 214 __remove_wait_queue(&ctx->wqh, &wait);
169 __set_current_state(TASK_RUNNING); 215 __set_current_state(TASK_RUNNING);
170 } 216 }
171 if (likely(res > 0)) { 217 if (likely(res == 0)) {
172 ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; 218 eventfd_ctx_do_read(ctx, cnt);
173 ctx->count -= ucnt;
174 if (waitqueue_active(&ctx->wqh)) 219 if (waitqueue_active(&ctx->wqh))
175 wake_up_locked_poll(&ctx->wqh, POLLOUT); 220 wake_up_locked_poll(&ctx->wqh, POLLOUT);
176 } 221 }
177 spin_unlock_irq(&ctx->wqh.lock); 222 spin_unlock_irq(&ctx->wqh.lock);
178 if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
179 return -EFAULT;
180 223
181 return res; 224 return res;
182} 225}
226EXPORT_SYMBOL_GPL(eventfd_ctx_read);
227
228static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
229 loff_t *ppos)
230{
231 struct eventfd_ctx *ctx = file->private_data;
232 ssize_t res;
233 __u64 cnt;
234
235 if (count < sizeof(cnt))
236 return -EINVAL;
237 res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
238 if (res < 0)
239 return res;
240
241 return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
242}
183 243
184static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, 244static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
185 loff_t *ppos) 245 loff_t *ppos)
@@ -339,7 +399,7 @@ struct file *eventfd_file_create(unsigned int count, int flags)
339 ctx->flags = flags; 399 ctx->flags = flags;
340 400
341 file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, 401 file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
342 flags & EFD_SHARED_FCNTL_FLAGS); 402 O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
343 if (IS_ERR(file)) 403 if (IS_ERR(file))
344 eventfd_free_ctx(ctx); 404 eventfd_free_ctx(ctx);
345 405
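The eventfd hunk exports the read path so other kernel code can consume a counter without going through a struct file; eventfd_ctx_remove_wait_queue() additionally serves callers that parked their own wait queue entry on the context. A hedged usage sketch; obtaining the ctx through something like eventfd_ctx_fdget() is an assumption about the surrounding API rather than something this diff shows:

#include <linux/eventfd.h>

/* Drain the counter without blocking; returns 0 when the counter was
 * read or was already zero, a negative errno otherwise. */
static int demo_drain(struct eventfd_ctx *ctx)
{
	__u64 cnt;
	ssize_t res = eventfd_ctx_read(ctx, 1 /* no_wait */, &cnt);

	if (res == -EAGAIN)
		return 0;	/* counter was zero, nothing pending */
	/* on success (res == 0), cnt holds the value read:
	 * 1 in EFD_SEMAPHORE mode, the whole counter otherwise */
	return (int)res;
}

The O_RDWR additions to anon_inode_getfile() here and anon_inode_getfd() in eventpoll.c make the f_mode of these pseudo-files reflect that they are genuinely read-write, rather than inheriting whatever the flag bits happened to imply.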
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 085c5c063420..bd056a5b4efc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -251,10 +251,10 @@ ctl_table epoll_table[] = {
251 .data = &max_user_watches, 251 .data = &max_user_watches,
252 .maxlen = sizeof(int), 252 .maxlen = sizeof(int),
253 .mode = 0644, 253 .mode = 0644,
254 .proc_handler = &proc_dointvec_minmax, 254 .proc_handler = proc_dointvec_minmax,
255 .extra1 = &zero, 255 .extra1 = &zero,
256 }, 256 },
257 { .ctl_name = 0 } 257 { }
258}; 258};
259#endif /* CONFIG_SYSCTL */ 259#endif /* CONFIG_SYSCTL */
260 260
@@ -1206,7 +1206,7 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
1206 * a file structure and a free file descriptor. 1206 * a file structure and a free file descriptor.
1207 */ 1207 */
1208 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1208 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1209 flags & O_CLOEXEC); 1209 O_RDWR | (flags & O_CLOEXEC));
1210 if (error < 0) 1210 if (error < 0)
1211 ep_free(ep); 1211 ep_free(ep);
1212 1212
diff --git a/fs/exec.c b/fs/exec.c
index ba112bd4a339..49cdaa19e5b9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -46,7 +46,6 @@
46#include <linux/proc_fs.h> 46#include <linux/proc_fs.h>
47#include <linux/mount.h> 47#include <linux/mount.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/ima.h>
50#include <linux/syscalls.h> 49#include <linux/syscalls.h>
51#include <linux/tsacct_kern.h> 50#include <linux/tsacct_kern.h>
52#include <linux/cn_proc.h> 51#include <linux/cn_proc.h>
@@ -196,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
196 * to work from. 195 * to work from.
197 */ 196 */
198 rlim = current->signal->rlim; 197 rlim = current->signal->rlim;
199 if (size > rlim[RLIMIT_STACK].rlim_cur / 4) { 198 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
200 put_page(page); 199 put_page(page);
201 return NULL; 200 return NULL;
202 } 201 }
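ACCESS_ONCE() pins the stack-limit check above to a single load: rlim_cur can be changed concurrently by setrlimit(), and without the annotation the compiler is free to reload it, letting the comparison and any later use observe different values. A sketch:

#include <linux/compiler.h>
#include <linux/resource.h>

static int demo_arg_too_big(struct rlimit *rlim, unsigned long size)
{
	/* one load; a racing setrlimit() cannot split the check */
	unsigned long cur = ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur);

	return size > cur / 4;
}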
@@ -247,6 +246,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
247 vma->vm_start = vma->vm_end - PAGE_SIZE; 246 vma->vm_start = vma->vm_end - PAGE_SIZE;
248 vma->vm_flags = VM_STACK_FLAGS; 247 vma->vm_flags = VM_STACK_FLAGS;
249 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain);
250 err = insert_vm_struct(mm, vma); 250 err = insert_vm_struct(mm, vma);
251 if (err) 251 if (err)
252 goto err; 252 goto err;
@@ -517,7 +517,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
517 /* 517 /*
518 * cover the whole range: [new_start, old_end) 518 * cover the whole range: [new_start, old_end)
519 */ 519 */
520 vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL); 520 if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
521 return -ENOMEM;
521 522
522 /* 523 /*
523 * move the page tables downwards, on failure we rely on 524 * move the page tables downwards, on failure we rely on
@@ -548,15 +549,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
548 tlb_finish_mmu(tlb, new_end, old_end); 549 tlb_finish_mmu(tlb, new_end, old_end);
549 550
550 /* 551 /*
551 * shrink the vma to just the new range. 552 * Shrink the vma to just the new range. Always succeeds.
552 */ 553 */
553 vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); 554 vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
554 555
555 return 0; 556 return 0;
556} 557}
557 558
558#define EXTRA_STACK_VM_PAGES 20 /* random */
559
560/* 559/*
561 * Finalizes the stack vm_area_struct. The flags and permissions are updated, 560 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
562 * the stack is optionally relocated, and some extra space is added. 561 * the stack is optionally relocated, and some extra space is added.
@@ -572,10 +571,13 @@ int setup_arg_pages(struct linux_binprm *bprm,
572 struct vm_area_struct *prev = NULL; 571 struct vm_area_struct *prev = NULL;
573 unsigned long vm_flags; 572 unsigned long vm_flags;
574 unsigned long stack_base; 573 unsigned long stack_base;
574 unsigned long stack_size;
575 unsigned long stack_expand;
576 unsigned long rlim_stack;
575 577
576#ifdef CONFIG_STACK_GROWSUP 578#ifdef CONFIG_STACK_GROWSUP
577 /* Limit stack size to 1GB */ 579 /* Limit stack size to 1GB */
578 stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; 580 stack_base = rlimit_max(RLIMIT_STACK);
579 if (stack_base > (1 << 30)) 581 if (stack_base > (1 << 30))
580 stack_base = 1 << 30; 582 stack_base = 1 << 30;
581 583
@@ -628,10 +630,23 @@ int setup_arg_pages(struct linux_binprm *bprm,
628 goto out_unlock; 630 goto out_unlock;
629 } 631 }
630 632
633 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
634 stack_size = vma->vm_end - vma->vm_start;
635 /*
636 * Align this down to a page boundary as expand_stack
637 * will align it up.
638 */
639 rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
631#ifdef CONFIG_STACK_GROWSUP 640#ifdef CONFIG_STACK_GROWSUP
632 stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; 641 if (stack_size + stack_expand > rlim_stack)
642 stack_base = vma->vm_start + rlim_stack;
643 else
644 stack_base = vma->vm_end + stack_expand;
633#else 645#else
634 stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE; 646 if (stack_size + stack_expand > rlim_stack)
647 stack_base = vma->vm_end - rlim_stack;
648 else
649 stack_base = vma->vm_start - stack_expand;
635#endif 650#endif
636 ret = expand_stack(vma, stack_base); 651 ret = expand_stack(vma, stack_base);
637 if (ret) 652 if (ret)
@@ -703,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm)
703 /* Notify parent that we're no longer interested in the old VM */ 718 /* Notify parent that we're no longer interested in the old VM */
704 tsk = current; 719 tsk = current;
705 old_mm = current->mm; 720 old_mm = current->mm;
721 sync_mm_rss(tsk, old_mm);
706 mm_release(tsk, old_mm); 722 mm_release(tsk, old_mm);
707 723
708 if (old_mm) { 724 if (old_mm) {
@@ -827,7 +843,9 @@ static int de_thread(struct task_struct *tsk)
827 attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); 843 attach_pid(tsk, PIDTYPE_PID, task_pid(leader));
828 transfer_pid(leader, tsk, PIDTYPE_PGID); 844 transfer_pid(leader, tsk, PIDTYPE_PGID);
829 transfer_pid(leader, tsk, PIDTYPE_SID); 845 transfer_pid(leader, tsk, PIDTYPE_SID);
846
830 list_replace_rcu(&leader->tasks, &tsk->tasks); 847 list_replace_rcu(&leader->tasks, &tsk->tasks);
848 list_replace_init(&leader->sibling, &tsk->sibling);
831 849
832 tsk->group_leader = tsk; 850 tsk->group_leader = tsk;
833 leader->group_leader = tsk; 851 leader->group_leader = tsk;
@@ -924,6 +942,15 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
924void set_task_comm(struct task_struct *tsk, char *buf) 942void set_task_comm(struct task_struct *tsk, char *buf)
925{ 943{
926 task_lock(tsk); 944 task_lock(tsk);
945
946 /*
947 * Threads may access current->comm without holding
948 * the task lock, so write the string carefully.
949 * Readers without a lock may see incomplete new
950 * names but are safe from non-terminating string reads.
951 */
952 memset(tsk->comm, 0, TASK_COMM_LEN);
953 wmb();
927 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 954 strlcpy(tsk->comm, buf, sizeof(tsk->comm));
928 task_unlock(tsk); 955 task_unlock(tsk);
929 perf_event_comm(tsk); 956 perf_event_comm(tsk);
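The memset-then-wmb sequence added to set_task_comm() is a publication ordering trick: zeroing the whole buffer before the copy guarantees lockless readers always find a NUL terminator, even if they race with the strlcpy(). A minimal sketch of the writer side, assuming wmb() is available from the arch barrier header of this era (asm/system.h):

#include <linux/string.h>
#include <asm/system.h>		/* wmb(); header location varies by version */

#define DEMO_COMM_LEN 16

static char demo_comm[DEMO_COMM_LEN];

/* Lockless readers may observe a truncated or empty name, but never a
 * buffer without a NUL, because the zeroing is ordered before the copy. */
static void demo_set_comm(const char *buf)
{
	memset(demo_comm, 0, DEMO_COMM_LEN);
	wmb();
	strlcpy(demo_comm, buf, DEMO_COMM_LEN);
}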
@@ -931,9 +958,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
931 958
932int flush_old_exec(struct linux_binprm * bprm) 959int flush_old_exec(struct linux_binprm * bprm)
933{ 960{
934 char * name; 961 int retval;
935 int i, ch, retval;
936 char tcomm[sizeof(current->comm)];
937 962
938 /* 963 /*
939 * Make sure we have a private signal table and that 964 * Make sure we have a private signal table and that
@@ -954,6 +979,25 @@ int flush_old_exec(struct linux_binprm * bprm)
954 979
955 bprm->mm = NULL; /* We're using it now */ 980 bprm->mm = NULL; /* We're using it now */
956 981
982 current->flags &= ~PF_RANDOMIZE;
983 flush_thread();
984 current->personality &= ~bprm->per_clear;
985
986 return 0;
987
988out:
989 return retval;
990}
991EXPORT_SYMBOL(flush_old_exec);
992
993void setup_new_exec(struct linux_binprm * bprm)
994{
995 int i, ch;
996 char * name;
997 char tcomm[sizeof(current->comm)];
998
999 arch_pick_mmap_layout(current->mm);
1000
957 /* This is the point of no return */ 1001 /* This is the point of no return */
958 current->sas_ss_sp = current->sas_ss_size = 0; 1002 current->sas_ss_sp = current->sas_ss_size = 0;
959 1003
@@ -975,9 +1019,6 @@ int flush_old_exec(struct linux_binprm * bprm)
975 tcomm[i] = '\0'; 1019 tcomm[i] = '\0';
976 set_task_comm(current, tcomm); 1020 set_task_comm(current, tcomm);
977 1021
978 current->flags &= ~PF_RANDOMIZE;
979 flush_thread();
980
981 /* Set the new mm task size. We have to do that late because it may 1022 /* Set the new mm task size. We have to do that late because it may
982 * depend on TIF_32BIT which is only updated in flush_thread() on 1023 * depend on TIF_32BIT which is only updated in flush_thread() on
983 * some architectures like powerpc 1024 * some architectures like powerpc
@@ -993,8 +1034,6 @@ int flush_old_exec(struct linux_binprm * bprm)
993 set_dumpable(current->mm, suid_dumpable); 1034 set_dumpable(current->mm, suid_dumpable);
994 } 1035 }
995 1036
996 current->personality &= ~bprm->per_clear;
997
998 /* 1037 /*
999 * Flush performance counters when crossing a 1038 * Flush performance counters when crossing a
1000 * security domain: 1039 * security domain:
@@ -1009,14 +1048,8 @@ int flush_old_exec(struct linux_binprm * bprm)
1009 1048
1010 flush_signal_handlers(current, 0); 1049 flush_signal_handlers(current, 0);
1011 flush_old_files(current->files); 1050 flush_old_files(current->files);
1012
1013 return 0;
1014
1015out:
1016 return retval;
1017} 1051}
1018 1052EXPORT_SYMBOL(setup_new_exec);
1019EXPORT_SYMBOL(flush_old_exec);
1020 1053
1021/* 1054/*
1022 * Prepare credentials and lock ->cred_guard_mutex. 1055 * Prepare credentials and lock ->cred_guard_mutex.
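The flush_old_exec()/setup_new_exec() split above gives every binfmt loader a clean two-phase contract: the first call may still fail and return with the old image intact, while the second runs past the point of no return and never fails. The loaders' new calling sequence, sketched:

#include <linux/binfmts.h>
#include <asm/ptrace.h>

static int demo_load_binary(struct linux_binprm *bprm, struct pt_regs *regs)
{
	int retval = flush_old_exec(bprm);

	if (retval)
		return retval;		/* old image still runnable */

	/* per-format personality and arch setup goes here */

	setup_new_exec(bprm);		/* irreversible from this point */

	/* map segments, set up the stack, start_thread(), ... */
	return 0;
}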
@@ -1209,9 +1242,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1209 retval = security_bprm_check(bprm); 1242 retval = security_bprm_check(bprm);
1210 if (retval) 1243 if (retval)
1211 return retval; 1244 return retval;
1212 retval = ima_bprm_check(bprm);
1213 if (retval)
1214 return retval;
1215 1245
1216 /* kernel module loader fixup */ 1246 /* kernel module loader fixup */
1217 /* so we don't try to load run modprobe in kernel space. */ 1247 /* so we don't try to load run modprobe in kernel space. */
@@ -1503,7 +1533,7 @@ static int format_corename(char *corename, long signr)
1503 /* core limit size */ 1533 /* core limit size */
1504 case 'c': 1534 case 'c':
1505 rc = snprintf(out_ptr, out_end - out_ptr, 1535 rc = snprintf(out_ptr, out_end - out_ptr,
1506 "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur); 1536 "%lu", rlimit(RLIMIT_CORE));
1507 if (rc > out_end - out_ptr) 1537 if (rc > out_end - out_ptr)
1508 goto out; 1538 goto out;
1509 out_ptr += rc; 1539 out_ptr += rc;
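
rlimit() replaces the open-coded signal->rlim dereference here and below. Assuming the accessor introduced alongside this series, its effect is equivalent to:

static inline unsigned long rlimit(unsigned int limit)
{
	/* current task's soft limit for the given resource */
	return current->signal->rlim[limit].rlim_cur;
}

The in-tree helper reads the value through a task_rlimit() indirection; the sketch only shows the resulting behaviour.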
@@ -1531,12 +1561,13 @@ out:
1531 return ispipe; 1561 return ispipe;
1532} 1562}
1533 1563
1534static int zap_process(struct task_struct *start) 1564static int zap_process(struct task_struct *start, int exit_code)
1535{ 1565{
1536 struct task_struct *t; 1566 struct task_struct *t;
1537 int nr = 0; 1567 int nr = 0;
1538 1568
1539 start->signal->flags = SIGNAL_GROUP_EXIT; 1569 start->signal->flags = SIGNAL_GROUP_EXIT;
1570 start->signal->group_exit_code = exit_code;
1540 start->signal->group_stop_count = 0; 1571 start->signal->group_stop_count = 0;
1541 1572
1542 t = start; 1573 t = start;
@@ -1561,8 +1592,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1561 spin_lock_irq(&tsk->sighand->siglock); 1592 spin_lock_irq(&tsk->sighand->siglock);
1562 if (!signal_group_exit(tsk->signal)) { 1593 if (!signal_group_exit(tsk->signal)) {
1563 mm->core_state = core_state; 1594 mm->core_state = core_state;
1564 tsk->signal->group_exit_code = exit_code; 1595 nr = zap_process(tsk, exit_code);
1565 nr = zap_process(tsk);
1566 } 1596 }
1567 spin_unlock_irq(&tsk->sighand->siglock); 1597 spin_unlock_irq(&tsk->sighand->siglock);
1568 if (unlikely(nr < 0)) 1598 if (unlikely(nr < 0))
@@ -1611,7 +1641,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1611 if (p->mm) { 1641 if (p->mm) {
1612 if (unlikely(p->mm == mm)) { 1642 if (unlikely(p->mm == mm)) {
1613 lock_task_sighand(p, &flags); 1643 lock_task_sighand(p, &flags);
1614 nr += zap_process(p); 1644 nr += zap_process(p, exit_code);
1615 unlock_task_sighand(p, &flags); 1645 unlock_task_sighand(p, &flags);
1616 } 1646 }
1617 break; 1647 break;
@@ -1718,14 +1748,19 @@ void set_dumpable(struct mm_struct *mm, int value)
1718 } 1748 }
1719} 1749}
1720 1750
1721int get_dumpable(struct mm_struct *mm) 1751static int __get_dumpable(unsigned long mm_flags)
1722{ 1752{
1723 int ret; 1753 int ret;
1724 1754
1725 ret = mm->flags & 0x3; 1755 ret = mm_flags & MMF_DUMPABLE_MASK;
1726 return (ret >= 2) ? 2 : ret; 1756 return (ret >= 2) ? 2 : ret;
1727} 1757}
1728 1758
1759int get_dumpable(struct mm_struct *mm)
1760{
1761 return __get_dumpable(mm->flags);
1762}
1763
1729static void wait_for_dump_helpers(struct file *file) 1764static void wait_for_dump_helpers(struct file *file)
1730{ 1765{
1731 struct pipe_inode_info *pipe; 1766 struct pipe_inode_info *pipe;
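
__get_dumpable() lets do_coredump() test a one-time snapshot of mm->flags (stored in cprm.mm_flags below) instead of re-reading a field that is not protected by any lock on this path, and it replaces the bare 0x3 with the named mask. The whole check amounts to:

	/* MMF_DUMPABLE_MASK covers the two dumpable bits of mm->flags */
	int dumpable = mm_flags & MMF_DUMPABLE_MASK;

	dumpable = (dumpable >= 2) ? 2 : dumpable;	/* 2 == suid-safe mode */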
@@ -1756,17 +1791,26 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1756 struct mm_struct *mm = current->mm; 1791 struct mm_struct *mm = current->mm;
1757 struct linux_binfmt * binfmt; 1792 struct linux_binfmt * binfmt;
1758 struct inode * inode; 1793 struct inode * inode;
1759 struct file * file;
1760 const struct cred *old_cred; 1794 const struct cred *old_cred;
1761 struct cred *cred; 1795 struct cred *cred;
1762 int retval = 0; 1796 int retval = 0;
1763 int flag = 0; 1797 int flag = 0;
1764 int ispipe = 0; 1798 int ispipe = 0;
1765 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
1766 char **helper_argv = NULL; 1799 char **helper_argv = NULL;
1767 int helper_argc = 0; 1800 int helper_argc = 0;
1768 int dump_count = 0; 1801 int dump_count = 0;
1769 static atomic_t core_dump_count = ATOMIC_INIT(0); 1802 static atomic_t core_dump_count = ATOMIC_INIT(0);
1803 struct coredump_params cprm = {
1804 .signr = signr,
1805 .regs = regs,
1806 .limit = rlimit(RLIMIT_CORE),
1807 /*
1808 * We must use the same mm->flags while dumping core to avoid
1809 * inconsistency of bit flags, since this flag is not protected
1810 * by any locks.
1811 */
1812 .mm_flags = mm->flags,
1813 };
1770 1814
1771 audit_core_dumps(signr); 1815 audit_core_dumps(signr);
1772 1816
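
struct coredump_params gathers what used to travel as separate arguments into do_coredump() and the dump helpers. Reconstructed from the initializer above and the cprm.file uses below (a sketch, not quoted from the header):

struct coredump_params {
	long signr;		/* signal that triggered the dump */
	struct pt_regs *regs;
	struct file *file;	/* filled in later by do_coredump() */
	unsigned long limit;	/* RLIMIT_CORE snapshot */
	unsigned long mm_flags;	/* mm->flags snapshot, see comment above */
};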
@@ -1784,7 +1828,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1784 /* 1828 /*
1785 * If another thread got here first, or we are not dumpable, bail out. 1829 * If another thread got here first, or we are not dumpable, bail out.
1786 */ 1830 */
1787 if (mm->core_state || !get_dumpable(mm)) { 1831 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1788 up_write(&mm->mmap_sem); 1832 up_write(&mm->mmap_sem);
1789 put_cred(cred); 1833 put_cred(cred);
1790 goto fail; 1834 goto fail;
@@ -1795,7 +1839,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1795 * process nor do we know its entire history. We only know it 1839 * process nor do we know its entire history. We only know it
1796 * was tainted so we dump it as root in mode 2. 1840 * was tainted so we dump it as root in mode 2.
1797 */ 1841 */
1798 if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ 1842 if (__get_dumpable(cprm.mm_flags) == 2) {
1843 /* Setuid core dump mode */
1799 flag = O_EXCL; /* Stop rewrite attacks */ 1844 flag = O_EXCL; /* Stop rewrite attacks */
1800 cred->fsuid = 0; /* Dump root private */ 1845 cred->fsuid = 0; /* Dump root private */
1801 } 1846 }
@@ -1822,15 +1867,15 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1822 ispipe = format_corename(corename, signr); 1867 ispipe = format_corename(corename, signr);
1823 unlock_kernel(); 1868 unlock_kernel();
1824 1869
1825 if ((!ispipe) && (core_limit < binfmt->min_coredump)) 1870 if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1826 goto fail_unlock; 1871 goto fail_unlock;
1827 1872
1828 if (ispipe) { 1873 if (ispipe) {
1829 if (core_limit == 0) { 1874 if (cprm.limit == 0) {
1830 /* 1875 /*
1831 * Normally core limits are irrelevant to pipes, since 1876 * Normally core limits are irrelevant to pipes, since
1832 * we're not writing to the file system, but we use 1877 * we're not writing to the file system, but we use
 1833 * core_limit of 0 here as a special value. Any 1878 * cprm.limit of 0 here as a special value. Any
1834 * non-zero limit gets set to RLIM_INFINITY below, but 1879 * non-zero limit gets set to RLIM_INFINITY below, but
1835 * a limit of 0 skips the dump. This is a consistent 1880 * a limit of 0 skips the dump. This is a consistent
1836 * way to catch recursive crashes. We can still crash 1881 * way to catch recursive crashes. We can still crash
@@ -1863,25 +1908,25 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1863 goto fail_dropcount; 1908 goto fail_dropcount;
1864 } 1909 }
1865 1910
1866 core_limit = RLIM_INFINITY; 1911 cprm.limit = RLIM_INFINITY;
1867 1912
1868 /* SIGPIPE can happen, but it's just never processed */ 1913 /* SIGPIPE can happen, but it's just never processed */
1869 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, 1914 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
1870 &file)) { 1915 &cprm.file)) {
1871 printk(KERN_INFO "Core dump to %s pipe failed\n", 1916 printk(KERN_INFO "Core dump to %s pipe failed\n",
1872 corename); 1917 corename);
1873 goto fail_dropcount; 1918 goto fail_dropcount;
1874 } 1919 }
1875 } else 1920 } else
1876 file = filp_open(corename, 1921 cprm.file = filp_open(corename,
1877 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1922 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1878 0600); 1923 0600);
1879 if (IS_ERR(file)) 1924 if (IS_ERR(cprm.file))
1880 goto fail_dropcount; 1925 goto fail_dropcount;
1881 inode = file->f_path.dentry->d_inode; 1926 inode = cprm.file->f_path.dentry->d_inode;
1882 if (inode->i_nlink > 1) 1927 if (inode->i_nlink > 1)
1883 goto close_fail; /* multiple links - don't dump */ 1928 goto close_fail; /* multiple links - don't dump */
1884 if (!ispipe && d_unhashed(file->f_path.dentry)) 1929 if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1885 goto close_fail; 1930 goto close_fail;
1886 1931
1887 /* AK: actually i see no reason to not allow this for named pipes etc., 1932 /* AK: actually i see no reason to not allow this for named pipes etc.,
@@ -1891,24 +1936,26 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1891 /* 1936 /*
 1892 * Don't allow local users to get cute and trick others into coredumping 1937 * Don't allow local users to get cute and trick others into coredumping
1893 * into their pre-created files: 1938 * into their pre-created files:
1939 * Note, this is not relevant for pipes
1894 */ 1940 */
1895 if (inode->i_uid != current_fsuid()) 1941 if (!ispipe && (inode->i_uid != current_fsuid()))
1896 goto close_fail; 1942 goto close_fail;
1897 if (!file->f_op) 1943 if (!cprm.file->f_op)
1898 goto close_fail; 1944 goto close_fail;
1899 if (!file->f_op->write) 1945 if (!cprm.file->f_op->write)
1900 goto close_fail; 1946 goto close_fail;
1901 if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0) 1947 if (!ispipe &&
1948 do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1902 goto close_fail; 1949 goto close_fail;
1903 1950
1904 retval = binfmt->core_dump(signr, regs, file, core_limit); 1951 retval = binfmt->core_dump(&cprm);
1905 1952
1906 if (retval) 1953 if (retval)
1907 current->signal->group_exit_code |= 0x80; 1954 current->signal->group_exit_code |= 0x80;
1908close_fail: 1955close_fail:
1909 if (ispipe && core_pipe_limit) 1956 if (ispipe && core_pipe_limit)
1910 wait_for_dump_helpers(file); 1957 wait_for_dump_helpers(cprm.file);
1911 filp_close(file, NULL); 1958 filp_close(cprm.file, NULL);
1912fail_dropcount: 1959fail_dropcount:
1913 if (dump_count) 1960 if (dump_count)
1914 atomic_dec(&core_dump_count); 1961 atomic_dec(&core_dump_count);
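
The binfmt->core_dump(&cprm) call above implies the matching hook change in struct linux_binfmt, applied to the individual binary-format dumpers elsewhere in this series:

	/* old */
	int (*core_dump)(long signr, struct pt_regs *regs,
			 struct file *file, unsigned long limit);
	/* new */
	int (*core_dump)(struct coredump_params *cprm);

Each dumper now reads cprm->signr, cprm->regs, cprm->file, cprm->limit and the snapshotted cprm->mm_flags instead.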
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index cc2d22db119c..2d0f757fda3e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,5 @@
12# Kbuild - Gets included from the Kernels Makefile and build system 12# Kbuild - Gets included from the Kernels Makefile and build system
13# 13#
14 14
15exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o 15exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o 16obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index c6718e4817fe..f0d520312d8b 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -49,11 +49,14 @@
49#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ 49#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
50#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ 50#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
51#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ 51#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
52#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
52#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ 53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
53 54
54/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
55# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) 56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
56# define EXOFS_ATTR_INODE_DATA 1 57# define EXOFS_ATTR_INODE_DATA 1
58# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
59# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
57 60
58/* 61/*
59 * The maximum number of files we can have is limited by the size of the 62 * The maximum number of files we can have is limited by the size of the
@@ -78,17 +81,67 @@ enum {
78#define EXOFS_SUPER_MAGIC 0x5DF5 81#define EXOFS_SUPER_MAGIC 0x5DF5
79 82
80/* 83/*
81 * The file system control block - stored in an object's data (mainly, the one 84 * The file system control block - stored in object EXOFS_SUPER_ID's data.
82 * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored 85 * This is where the in-memory superblock is stored on disk.
83 * on disk. Right now it just has a magic value, which is basically a sanity
84 * check on our ability to communicate with the object store.
85 */ 86 */
87enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
86struct exofs_fscb { 88struct exofs_fscb {
87 __le64 s_nextid; /* Highest object ID used */ 89 __le64 s_nextid; /* Highest object ID used */
88 __le32 s_numfiles; /* Number of files on fs */ 90 __le64 s_numfiles; /* Number of files on fs */
91 __le32 s_version; /* == EXOFS_FSCB_VER */
89 __le16 s_magic; /* Magic signature */ 92 __le16 s_magic; /* Magic signature */
90 __le16 s_newfs; /* Non-zero if this is a new fs */ 93 __le16 s_newfs; /* Non-zero if this is a new fs */
91}; 94
95 /* From here on it's a static part, only written by mkexofs */
 96 __le64 s_dev_table_oid; /* Reserved, not used */
97 __le64 s_dev_table_count; /* == 0 means no dev_table */
98} __packed;
99
100/*
101 * Describes the raid used in the FS. It is part of the device table.
 102 * This is taken from the pNFS-objects definition. In exofs we
 103 * use one raid policy throughout the filesystem. (NOTE: the funny
 104 * alignment at the beginning; we take care of it in exofs_device_table.)
105 */
106struct exofs_dt_data_map {
107 __le32 cb_num_comps;
108 __le64 cb_stripe_unit;
109 __le32 cb_group_width;
110 __le32 cb_group_depth;
111 __le32 cb_mirror_cnt;
112 __le32 cb_raid_algorithm;
113} __packed;
114
115/*
116 * This is an osd device information descriptor. It is a single entry in
117 * the exofs device table. It describes an osd target lun which
118 * contains data belonging to this FS. (Same partition_id on all devices)
119 */
120struct exofs_dt_device_info {
121 __le32 systemid_len;
122 u8 systemid[OSD_SYSTEMID_LEN];
123 __le64 long_name_offset; /* If !0 then offset-in-file */
124 __le32 osdname_len; /* */
 125 u8 osdname[44]; /* Embedded, usually an ASCII uuid */
126} __packed;
127
128/*
129 * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
 130 * It contains the raid used for this multi-device FS and an array of
131 * participating devices.
132 */
133struct exofs_device_table {
134 __le32 dt_version; /* == EXOFS_DT_VER */
135 struct exofs_dt_data_map dt_data_map; /* Raid policy to use */
136
 137 /* Reserved space for future use. Total including this:
138 * (8 * sizeof(le64))
139 */
140 __le64 __Resurved[4];
141
142 __le64 dt_num_devices; /* Array size */
143 struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */
144} __packed;
92 145
93/**************************************************************************** 146/****************************************************************************
94 * inode-related things 147 * inode-related things
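
A mount-time consumer of the new table reads object EXOFS_DEVTABLE_ID and walks dt_dev_table[]. A sketch of the size arithmetic (exofs_dt_bytes() is a hypothetical helper, not part of the patch; all on-disk fields are little-endian):

static inline size_t exofs_dt_bytes(const struct exofs_device_table *dt)
{
	u64 n = le64_to_cpu(dt->dt_num_devices);

	/* fixed header plus one exofs_dt_device_info per device */
	return sizeof(*dt) + n * sizeof(struct exofs_dt_device_info);
}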
@@ -155,22 +208,41 @@ enum {
155 (((name_len) + offsetof(struct exofs_dir_entry, name) + \ 208 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
156 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) 209 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
157 210
158/************************* 211/*
159 * function declarations * 212 * The on-disk (optional) layout structure.
160 *************************/ 213 * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
161/* osd.c */ 214 * attribute, attached to any inode, usually to a directory.
162void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], 215 */
163 const struct osd_obj_id *obj); 216
217enum exofs_inode_layout_gen_functions {
218 LAYOUT_MOVING_WINDOW = 0,
219 LAYOUT_IMPLICT = 1,
220};
164 221
165int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid); 222struct exofs_on_disk_inode_layout {
166static inline int exofs_check_ok(struct osd_request *or) 223 __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
224 __le16 pad;
225 union {
226 /* gen_func == LAYOUT_MOVING_WINDOW (default) */
227 struct exofs_layout_sliding_window {
228 __le32 num_devices; /* first n devices in global-table*/
229 } sliding_window __packed;
230
231 /* gen_func == LAYOUT_IMPLICT */
232 struct exofs_layout_implict_list {
233 struct exofs_dt_data_map data_map;
234 /* Variable array of size data_map.cb_num_comps. These
235 * are device indexes of the devices in the global table
236 */
237 __le32 dev_indexes[];
238 } implict __packed;
239 };
240} __packed;
241
242static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
167{ 243{
168 return exofs_check_ok_resid(or, NULL, NULL); 244 return sizeof(struct exofs_on_disk_inode_layout) +
245 max_devs * sizeof(__le32);
169} 246}
170int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
171int exofs_async_op(struct osd_request *or,
172 osd_req_done_fn *async_done, void *caller_context, u8 *cred);
173
174int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
175 247
176#endif /*ifndef __EXOFS_COM_H__*/ 248#endif /*ifndef __EXOFS_COM_H__*/
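
exofs_on_disk_inode_layout_size() is sized for the worst case, the LAYOUT_IMPLICT variant whose trailing dev_indexes[] holds one __le32 per component. Callers therefore reserve room for every device in the filesystem when fetching the attribute, as exofs_get_inode() does later in this patch:

	/* room for an implict list spanning all devices in the layout */
	attr.len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);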
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 5ec72e020b22..8442e353309f 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -30,13 +30,17 @@
30 * along with exofs; if not, write to the Free Software 30 * along with exofs; if not, write to the Free Software
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33#ifndef __EXOFS_H__
34#define __EXOFS_H__
33 35
34#include <linux/fs.h> 36#include <linux/fs.h>
35#include <linux/time.h> 37#include <linux/time.h>
36#include "common.h" 38#include "common.h"
37 39
38#ifndef __EXOFS_H__ 40/* FIXME: Remove once pnfs hits mainline
39#define __EXOFS_H__ 41 * #include <linux/exportfs/pnfs_osd_xdr.h>
42 */
43#include "pnfs.h"
40 44
41#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) 45#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
42 46
@@ -51,34 +55,110 @@
51/* u64 has problems with printk this will cast it to unsigned long long */ 55/* u64 has problems with printk this will cast it to unsigned long long */
52#define _LLU(x) (unsigned long long)(x) 56#define _LLU(x) (unsigned long long)(x)
53 57
58struct exofs_layout {
59 osd_id s_pid; /* partition ID of file system*/
60
61 /* Our way of looking at the data_map */
62 unsigned stripe_unit;
63 unsigned mirrors_p1;
64
65 unsigned group_width;
66 u64 group_depth;
67 unsigned group_count;
68
69 enum exofs_inode_layout_gen_functions lay_func;
70
71 unsigned s_numdevs; /* Num of devices in array */
72 struct osd_dev *s_ods[0]; /* Variable length */
73};
74
54/* 75/*
55 * our extension to the in-memory superblock 76 * our extension to the in-memory superblock
56 */ 77 */
57struct exofs_sb_info { 78struct exofs_sb_info {
58 struct osd_dev *s_dev; /* returned by get_osd_dev */ 79 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
59 osd_id s_pid; /* partition ID of file system*/
60 int s_timeout; /* timeout for OSD operations */ 80 int s_timeout; /* timeout for OSD operations */
61 uint64_t s_nextid; /* highest object ID used */ 81 uint64_t s_nextid; /* highest object ID used */
62 uint32_t s_numfiles; /* number of files on fs */ 82 uint32_t s_numfiles; /* number of files on fs */
63 spinlock_t s_next_gen_lock; /* spinlock for gen # update */ 83 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
64 u32 s_next_generation; /* next gen # to use */ 84 u32 s_next_generation; /* next gen # to use */
65 atomic_t s_curr_pending; /* number of pending commands */ 85 atomic_t s_curr_pending; /* number of pending commands */
66 uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */ 86 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
87
88 struct pnfs_osd_data_map data_map; /* Default raid to use
89 * FIXME: Needed ?
90 */
91/* struct exofs_layout dir_layout;*/ /* Default dir layout */
92 struct exofs_layout layout; /* Default files layout,
93 * contains the variable osd_dev
94 * array. Keep last */
95 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
67}; 96};
68 97
69/* 98/*
70 * our extension to the in-memory inode 99 * our extension to the in-memory inode
71 */ 100 */
72struct exofs_i_info { 101struct exofs_i_info {
102 struct inode vfs_inode; /* normal in-memory inode */
103 wait_queue_head_t i_wq; /* wait queue for inode */
73 unsigned long i_flags; /* various atomic flags */ 104 unsigned long i_flags; /* various atomic flags */
74 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ 105 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
75 uint32_t i_dir_start_lookup; /* which page to start lookup */ 106 uint32_t i_dir_start_lookup; /* which page to start lookup */
76 wait_queue_head_t i_wq; /* wait queue for inode */
77 uint64_t i_commit_size; /* the object's written length */ 107 uint64_t i_commit_size; /* the object's written length */
78 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ 108 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
79 struct inode vfs_inode; /* normal in-memory inode */
80}; 109};
81 110
111static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
112{
113 return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
114}
115
116struct exofs_io_state;
117typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
118
119struct exofs_io_state {
120 struct kref kref;
121
122 void *private;
123 exofs_io_done_fn done;
124
125 struct exofs_layout *layout;
126 struct osd_obj_id obj;
127 u8 *cred;
128
129 /* Global read/write IO*/
130 loff_t offset;
131 unsigned long length;
132 void *kern_buff;
133
134 struct page **pages;
135 unsigned nr_pages;
136 unsigned pgbase;
137 unsigned pages_consumed;
138
139 /* Attributes */
140 unsigned in_attr_len;
141 struct osd_attr *in_attr;
142 unsigned out_attr_len;
143 struct osd_attr *out_attr;
144
145 /* Variable array of size numdevs */
146 unsigned numdevs;
147 struct exofs_per_dev_state {
148 struct osd_request *or;
149 struct bio *bio;
150 loff_t offset;
151 unsigned length;
152 unsigned dev;
153 } per_dev[];
154};
155
156static inline unsigned exofs_io_state_size(unsigned numdevs)
157{
158 return sizeof(struct exofs_io_state) +
159 sizeof(struct exofs_per_dev_state) * numdevs;
160}
161
82/* 162/*
83 * our inode flags 163 * our inode flags
84 */ 164 */
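
exofs_get_io_state() pairs with exofs_io_state_size() so the state and its per_dev[] tail come from a single allocation. A minimal sketch of what the new ios.c presumably does (error handling trimmed, details unverified):

int exofs_get_io_state(struct exofs_layout *layout,
		       struct exofs_io_state **pios)
{
	struct exofs_io_state *ios;

	/* one per_dev slot for every device in the layout */
	ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
	if (unlikely(!ios))
		return -ENOMEM;

	ios->layout = layout;
	ios->obj.partition = layout->s_pid;
	*pios = ios;
	return 0;
}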
@@ -123,6 +203,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
123} 203}
124 204
125/* 205/*
206 * Given a layout, object_number and stripe_index return the associated global
207 * dev_index
208 */
209unsigned exofs_layout_od_id(struct exofs_layout *layout,
210 osd_id obj_no, unsigned layout_index);
211/*
126 * Maximum count of links to a file 212 * Maximum count of links to a file
127 */ 213 */
128#define EXOFS_LINK_MAX 32000 214#define EXOFS_LINK_MAX 32000
@@ -130,6 +216,43 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
130/************************* 216/*************************
131 * function declarations * 217 * function declarations *
132 *************************/ 218 *************************/
219
220/* ios.c */
221void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
222 const struct osd_obj_id *obj);
223int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
224 u64 offset, void *p, unsigned length);
225
226int exofs_get_io_state(struct exofs_layout *layout,
227 struct exofs_io_state **ios);
228void exofs_put_io_state(struct exofs_io_state *ios);
229
230int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
231
232int exofs_sbi_create(struct exofs_io_state *ios);
233int exofs_sbi_remove(struct exofs_io_state *ios);
234int exofs_sbi_write(struct exofs_io_state *ios);
235int exofs_sbi_read(struct exofs_io_state *ios);
236
237int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
238
239int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
240static inline int exofs_oi_write(struct exofs_i_info *oi,
241 struct exofs_io_state *ios)
242{
243 ios->obj.id = exofs_oi_objno(oi);
244 ios->cred = oi->i_cred;
245 return exofs_sbi_write(ios);
246}
247
248static inline int exofs_oi_read(struct exofs_i_info *oi,
249 struct exofs_io_state *ios)
250{
251 ios->obj.id = exofs_oi_objno(oi);
252 ios->cred = oi->i_cred;
253 return exofs_sbi_read(ios);
254}
255
133/* inode.c */ 256/* inode.c */
134void exofs_truncate(struct inode *inode); 257void exofs_truncate(struct inode *inode);
135int exofs_setattr(struct dentry *, struct iattr *); 258int exofs_setattr(struct dentry *, struct iattr *);
@@ -138,7 +261,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
138 struct page **pagep, void **fsdata); 261 struct page **pagep, void **fsdata);
139extern struct inode *exofs_iget(struct super_block *, unsigned long); 262extern struct inode *exofs_iget(struct super_block *, unsigned long);
140struct inode *exofs_new_inode(struct inode *, int); 263struct inode *exofs_new_inode(struct inode *, int);
141extern int exofs_write_inode(struct inode *, int); 264extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
142extern void exofs_delete_inode(struct inode *); 265extern void exofs_delete_inode(struct inode *);
143 266
144/* dir.c: */ 267/* dir.c: */
@@ -169,6 +292,7 @@ extern const struct file_operations exofs_file_operations;
169 292
170/* inode.c */ 293/* inode.c */
171extern const struct address_space_operations exofs_aops; 294extern const struct address_space_operations exofs_aops;
295extern const struct osd_attr g_attr_logical_length;
172 296
173/* namei.c */ 297/* namei.c */
174extern const struct inode_operations exofs_dir_inode_operations; 298extern const struct inode_operations exofs_dir_inode_operations;
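
With these inlines every object I/O in the patch takes the same shape. A usage sketch for a synchronous read into a kernel buffer (buf, off and len are placeholders):

	struct exofs_io_state *ios;
	int ret;

	ret = exofs_get_io_state(&sbi->layout, &ios);
	if (ret)
		return ret;

	ios->kern_buff = buf;
	ios->offset = off;
	ios->length = len;
	ret = exofs_oi_read(oi, ios);	/* fills in ios->obj.id and ios->cred */

	exofs_put_io_state(ios);
	return ret;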
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 6c10f7476699..76d2a79ef93e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -31,94 +31,117 @@
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33 33
34#include <linux/slab.h>
34#include <linux/writeback.h> 35#include <linux/writeback.h>
35#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
36#include <scsi/scsi_device.h> 37#include <scsi/scsi_device.h>
37 38
38#include "exofs.h" 39#include "exofs.h"
39 40
40#ifdef CONFIG_EXOFS_DEBUG 41#define EXOFS_DBGMSG2(M...) do {} while (0)
41# define EXOFS_DEBUG_OBJ_ISIZE 1 42
42#endif 43enum { BIO_MAX_PAGES_KMALLOC =
44 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
45 MAX_PAGES_KMALLOC =
46 PAGE_SIZE / sizeof(struct page *),
47};
43 48
44struct page_collect { 49struct page_collect {
45 struct exofs_sb_info *sbi; 50 struct exofs_sb_info *sbi;
46 struct request_queue *req_q;
47 struct inode *inode; 51 struct inode *inode;
48 unsigned expected_pages; 52 unsigned expected_pages;
53 struct exofs_io_state *ios;
49 54
50 struct bio *bio; 55 struct page **pages;
56 unsigned alloc_pages;
51 unsigned nr_pages; 57 unsigned nr_pages;
52 unsigned long length; 58 unsigned long length;
53 loff_t pg_first; /* keep 64bit also in 32-arches */ 59 loff_t pg_first; /* keep 64bit also in 32-arches */
54}; 60};
55 61
56static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
57 struct inode *inode) 63 struct inode *inode)
58{ 64{
59 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 65 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
60 66
61 pcol->sbi = sbi; 67 pcol->sbi = sbi;
62 pcol->req_q = osd_request_queue(sbi->s_dev);
63 pcol->inode = inode; 68 pcol->inode = inode;
64 pcol->expected_pages = expected_pages; 69 pcol->expected_pages = expected_pages;
65 70
66 pcol->bio = NULL; 71 pcol->ios = NULL;
72 pcol->pages = NULL;
73 pcol->alloc_pages = 0;
67 pcol->nr_pages = 0; 74 pcol->nr_pages = 0;
68 pcol->length = 0; 75 pcol->length = 0;
69 pcol->pg_first = -1; 76 pcol->pg_first = -1;
70
71 EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
72 expected_pages);
73} 77}
74 78
75static void _pcol_reset(struct page_collect *pcol) 79static void _pcol_reset(struct page_collect *pcol)
76{ 80{
77 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); 81 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
78 82
79 pcol->bio = NULL; 83 pcol->pages = NULL;
84 pcol->alloc_pages = 0;
80 pcol->nr_pages = 0; 85 pcol->nr_pages = 0;
81 pcol->length = 0; 86 pcol->length = 0;
82 pcol->pg_first = -1; 87 pcol->pg_first = -1;
83 EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n", 88 pcol->ios = NULL;
84 pcol->inode->i_ino, pcol->expected_pages);
85 89
86 /* this is probably the end of the loop but in writes 90 /* this is probably the end of the loop but in writes
87 * it might not end here. don't be left with nothing 91 * it might not end here. don't be left with nothing
88 */ 92 */
89 if (!pcol->expected_pages) 93 if (!pcol->expected_pages)
90 pcol->expected_pages = 128; 94 pcol->expected_pages = MAX_PAGES_KMALLOC;
91} 95}
92 96
93static int pcol_try_alloc(struct page_collect *pcol) 97static int pcol_try_alloc(struct page_collect *pcol)
94{ 98{
95 int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES); 99 unsigned pages = min_t(unsigned, pcol->expected_pages,
100 MAX_PAGES_KMALLOC);
101
102 if (!pcol->ios) { /* First time allocate io_state */
103 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
104
105 if (ret)
106 return ret;
107 }
108
109 /* TODO: easily support bio chaining */
110 pages = min_t(unsigned, pages,
111 pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
96 112
97 for (; pages; pages >>= 1) { 113 for (; pages; pages >>= 1) {
98 pcol->bio = bio_alloc(GFP_KERNEL, pages); 114 pcol->pages = kmalloc(pages * sizeof(struct page *),
99 if (likely(pcol->bio)) 115 GFP_KERNEL);
116 if (likely(pcol->pages)) {
117 pcol->alloc_pages = pages;
100 return 0; 118 return 0;
119 }
101 } 120 }
102 121
103 EXOFS_ERR("Failed to kcalloc expected_pages=%u\n", 122 EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
104 pcol->expected_pages); 123 pcol->expected_pages);
105 return -ENOMEM; 124 return -ENOMEM;
106} 125}
107 126
108static void pcol_free(struct page_collect *pcol) 127static void pcol_free(struct page_collect *pcol)
109{ 128{
110 bio_put(pcol->bio); 129 kfree(pcol->pages);
111 pcol->bio = NULL; 130 pcol->pages = NULL;
131
132 if (pcol->ios) {
133 exofs_put_io_state(pcol->ios);
134 pcol->ios = NULL;
135 }
112} 136}
113 137
114static int pcol_add_page(struct page_collect *pcol, struct page *page, 138static int pcol_add_page(struct page_collect *pcol, struct page *page,
115 unsigned len) 139 unsigned len)
116{ 140{
117 int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0); 141 if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
118 if (unlikely(len != added_len))
119 return -ENOMEM; 142 return -ENOMEM;
120 143
121 ++pcol->nr_pages; 144 pcol->pages[pcol->nr_pages++] = page;
122 pcol->length += len; 145 pcol->length += len;
123 return 0; 146 return 0;
124} 147}
@@ -161,32 +184,26 @@ static void update_write_page(struct page *page, int ret)
161/* Called at the end of reads, to optionally unlock pages and update their 184/* Called at the end of reads, to optionally unlock pages and update their
162 * status. 185 * status.
163 */ 186 */
164static int __readpages_done(struct osd_request *or, struct page_collect *pcol, 187static int __readpages_done(struct page_collect *pcol, bool do_unlock)
165 bool do_unlock)
166{ 188{
167 struct bio_vec *bvec;
168 int i; 189 int i;
169 u64 resid; 190 u64 resid;
170 u64 good_bytes; 191 u64 good_bytes;
171 u64 length = 0; 192 u64 length = 0;
172 int ret = exofs_check_ok_resid(or, &resid, NULL); 193 int ret = exofs_check_io(pcol->ios, &resid);
173
174 osd_end_request(or);
175 194
176 if (likely(!ret)) 195 if (likely(!ret))
177 good_bytes = pcol->length; 196 good_bytes = pcol->length;
178 else if (!resid)
179 good_bytes = 0;
180 else 197 else
181 good_bytes = pcol->length - resid; 198 good_bytes = pcol->length - resid;
182 199
183 EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx" 200 EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
184 " length=0x%lx nr_pages=%u\n", 201 " length=0x%lx nr_pages=%u\n",
185 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 202 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
186 pcol->nr_pages); 203 pcol->nr_pages);
187 204
188 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 205 for (i = 0; i < pcol->nr_pages; i++) {
189 struct page *page = bvec->bv_page; 206 struct page *page = pcol->pages[i];
190 struct inode *inode = page->mapping->host; 207 struct inode *inode = page->mapping->host;
191 int page_stat; 208 int page_stat;
192 209
@@ -198,38 +215,37 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
198 else 215 else
199 page_stat = ret; 216 page_stat = ret;
200 217
201 EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n", 218 EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n",
202 inode->i_ino, page->index, 219 inode->i_ino, page->index,
203 page_stat ? "bad_bytes" : "good_bytes"); 220 page_stat ? "bad_bytes" : "good_bytes");
204 221
205 ret = update_read_page(page, page_stat); 222 ret = update_read_page(page, page_stat);
206 if (do_unlock) 223 if (do_unlock)
207 unlock_page(page); 224 unlock_page(page);
208 length += bvec->bv_len; 225 length += PAGE_SIZE;
209 } 226 }
210 227
211 pcol_free(pcol); 228 pcol_free(pcol);
212 EXOFS_DBGMSG("readpages_done END\n"); 229 EXOFS_DBGMSG2("readpages_done END\n");
213 return ret; 230 return ret;
214} 231}
215 232
216/* callback of async reads */ 233/* callback of async reads */
217static void readpages_done(struct osd_request *or, void *p) 234static void readpages_done(struct exofs_io_state *ios, void *p)
218{ 235{
219 struct page_collect *pcol = p; 236 struct page_collect *pcol = p;
220 237
221 __readpages_done(or, pcol, true); 238 __readpages_done(pcol, true);
222 atomic_dec(&pcol->sbi->s_curr_pending); 239 atomic_dec(&pcol->sbi->s_curr_pending);
223 kfree(p); 240 kfree(pcol);
224} 241}
225 242
226static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) 243static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
227{ 244{
228 struct bio_vec *bvec;
229 int i; 245 int i;
230 246
231 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 247 for (i = 0; i < pcol->nr_pages; i++) {
232 struct page *page = bvec->bv_page; 248 struct page *page = pcol->pages[i];
233 249
234 if (rw == READ) 250 if (rw == READ)
235 update_read_page(page, ret); 251 update_read_page(page, ret);
@@ -238,36 +254,29 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
238 254
239 unlock_page(page); 255 unlock_page(page);
240 } 256 }
241 pcol_free(pcol);
242} 257}
243 258
244static int read_exec(struct page_collect *pcol, bool is_sync) 259static int read_exec(struct page_collect *pcol, bool is_sync)
245{ 260{
246 struct exofs_i_info *oi = exofs_i(pcol->inode); 261 struct exofs_i_info *oi = exofs_i(pcol->inode);
247 struct osd_obj_id obj = {pcol->sbi->s_pid, 262 struct exofs_io_state *ios = pcol->ios;
248 pcol->inode->i_ino + EXOFS_OBJ_OFF};
249 struct osd_request *or = NULL;
250 struct page_collect *pcol_copy = NULL; 263 struct page_collect *pcol_copy = NULL;
251 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
252 int ret; 264 int ret;
253 265
254 if (!pcol->bio) 266 if (!pcol->pages)
255 return 0; 267 return 0;
256 268
257 /* see comment in _readpage() about sync reads */ 269 /* see comment in _readpage() about sync reads */
258 WARN_ON(is_sync && (pcol->nr_pages != 1)); 270 WARN_ON(is_sync && (pcol->nr_pages != 1));
259 271
260 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); 272 ios->pages = pcol->pages;
261 if (unlikely(!or)) { 273 ios->nr_pages = pcol->nr_pages;
262 ret = -ENOMEM; 274 ios->length = pcol->length;
263 goto err; 275 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
264 }
265
266 osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
267 276
268 if (is_sync) { 277 if (is_sync) {
269 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred); 278 exofs_oi_read(oi, pcol->ios);
270 return __readpages_done(or, pcol, false); 279 return __readpages_done(pcol, false);
271 } 280 }
272 281
273 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 282 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -277,14 +286,16 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
277 } 286 }
278 287
279 *pcol_copy = *pcol; 288 *pcol_copy = *pcol;
280 ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred); 289 ios->done = readpages_done;
290 ios->private = pcol_copy;
291 ret = exofs_oi_read(oi, ios);
281 if (unlikely(ret)) 292 if (unlikely(ret))
282 goto err; 293 goto err;
283 294
284 atomic_inc(&pcol->sbi->s_curr_pending); 295 atomic_inc(&pcol->sbi->s_curr_pending);
285 296
286 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", 297 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
287 obj.id, _LLU(i_start), pcol->length); 298 ios->obj.id, _LLU(ios->offset), pcol->length);
288 299
289 /* pages ownership was passed to pcol_copy */ 300 /* pages ownership was passed to pcol_copy */
290 _pcol_reset(pcol); 301 _pcol_reset(pcol);
@@ -293,12 +304,10 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
293err: 304err:
294 if (!is_sync) 305 if (!is_sync)
295 _unlock_pcol_pages(pcol, ret, READ); 306 _unlock_pcol_pages(pcol, ret, READ);
296 else /* Pages unlocked by caller in sync mode only free bio */ 307
297 pcol_free(pcol); 308 pcol_free(pcol);
298 309
299 kfree(pcol_copy); 310 kfree(pcol_copy);
300 if (or)
301 osd_end_request(or);
302 return ret; 311 return ret;
303} 312}
304 313
@@ -361,7 +370,7 @@ try_again:
361 goto try_again; 370 goto try_again;
362 } 371 }
363 372
364 if (!pcol->bio) { 373 if (!pcol->pages) {
365 ret = pcol_try_alloc(pcol); 374 ret = pcol_try_alloc(pcol);
366 if (unlikely(ret)) 375 if (unlikely(ret))
367 goto fail; 376 goto fail;
@@ -370,12 +379,12 @@ try_again:
370 if (len != PAGE_CACHE_SIZE) 379 if (len != PAGE_CACHE_SIZE)
371 zero_user(page, len, PAGE_CACHE_SIZE - len); 380 zero_user(page, len, PAGE_CACHE_SIZE - len);
372 381
373 EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", 382 EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
374 inode->i_ino, page->index, len); 383 inode->i_ino, page->index, len);
375 384
376 ret = pcol_add_page(pcol, page, len); 385 ret = pcol_add_page(pcol, page, len);
377 if (ret) { 386 if (ret) {
378 EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p " 387 EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
379 "this_len=0x%zx nr_pages=%u length=0x%lx\n", 388 "this_len=0x%zx nr_pages=%u length=0x%lx\n",
380 page, len, pcol->nr_pages, pcol->length); 389 page, len, pcol->nr_pages, pcol->length);
381 390
@@ -419,9 +428,8 @@ static int _readpage(struct page *page, bool is_sync)
419 428
420 _pcol_init(&pcol, 1, page->mapping->host); 429 _pcol_init(&pcol, 1, page->mapping->host);
421 430
422 /* readpage_strip might call read_exec(,async) inside at several places 431 /* readpage_strip might call read_exec(,is_sync==false) at several
423 * but this is safe for is_async=0 since read_exec will not do anything 432 * places but not if we have a single page.
424 * when we have a single page.
425 */ 433 */
426 ret = readpage_strip(&pcol, page); 434 ret = readpage_strip(&pcol, page);
427 if (ret) { 435 if (ret) {
@@ -440,35 +448,30 @@ static int exofs_readpage(struct file *file, struct page *page)
440 return _readpage(page, false); 448 return _readpage(page, false);
441} 449}
442 450
443/* Callback for osd_write. All writes are asynchronouse */ 451/* Callback for osd_write. All writes are asynchronous */
444static void writepages_done(struct osd_request *or, void *p) 452static void writepages_done(struct exofs_io_state *ios, void *p)
445{ 453{
446 struct page_collect *pcol = p; 454 struct page_collect *pcol = p;
447 struct bio_vec *bvec;
448 int i; 455 int i;
449 u64 resid; 456 u64 resid;
450 u64 good_bytes; 457 u64 good_bytes;
451 u64 length = 0; 458 u64 length = 0;
459 int ret = exofs_check_io(ios, &resid);
452 460
453 int ret = exofs_check_ok_resid(or, NULL, &resid);
454
455 osd_end_request(or);
456 atomic_dec(&pcol->sbi->s_curr_pending); 461 atomic_dec(&pcol->sbi->s_curr_pending);
457 462
458 if (likely(!ret)) 463 if (likely(!ret))
459 good_bytes = pcol->length; 464 good_bytes = pcol->length;
460 else if (!resid)
461 good_bytes = 0;
462 else 465 else
463 good_bytes = pcol->length - resid; 466 good_bytes = pcol->length - resid;
464 467
465 EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx" 468 EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
466 " length=0x%lx nr_pages=%u\n", 469 " length=0x%lx nr_pages=%u\n",
467 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 470 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
468 pcol->nr_pages); 471 pcol->nr_pages);
469 472
470 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 473 for (i = 0; i < pcol->nr_pages; i++) {
471 struct page *page = bvec->bv_page; 474 struct page *page = pcol->pages[i];
472 struct inode *inode = page->mapping->host; 475 struct inode *inode = page->mapping->host;
473 int page_stat; 476 int page_stat;
474 477
@@ -482,37 +485,27 @@ static void writepages_done(struct osd_request *or, void *p)
482 485
483 update_write_page(page, page_stat); 486 update_write_page(page, page_stat);
484 unlock_page(page); 487 unlock_page(page);
485 EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n", 488 EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 inode->i_ino, page->index, page_stat); 489 inode->i_ino, page->index, page_stat);
487 490
488 length += bvec->bv_len; 491 length += PAGE_SIZE;
489 } 492 }
490 493
491 pcol_free(pcol); 494 pcol_free(pcol);
492 kfree(pcol); 495 kfree(pcol);
493 EXOFS_DBGMSG("writepages_done END\n"); 496 EXOFS_DBGMSG2("writepages_done END\n");
494} 497}
495 498
496static int write_exec(struct page_collect *pcol) 499static int write_exec(struct page_collect *pcol)
497{ 500{
498 struct exofs_i_info *oi = exofs_i(pcol->inode); 501 struct exofs_i_info *oi = exofs_i(pcol->inode);
499 struct osd_obj_id obj = {pcol->sbi->s_pid, 502 struct exofs_io_state *ios = pcol->ios;
500 pcol->inode->i_ino + EXOFS_OBJ_OFF};
501 struct osd_request *or = NULL;
502 struct page_collect *pcol_copy = NULL; 503 struct page_collect *pcol_copy = NULL;
503 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
504 int ret; 504 int ret;
505 505
506 if (!pcol->bio) 506 if (!pcol->pages)
507 return 0; 507 return 0;
508 508
509 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
510 if (unlikely(!or)) {
511 EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
512 ret = -ENOMEM;
513 goto err;
514 }
515
516 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 509 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
517 if (!pcol_copy) { 510 if (!pcol_copy) {
 518 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); 511
@@ -522,17 +515,22 @@ static int write_exec(struct page_collect *pcol)
522 515
523 *pcol_copy = *pcol; 516 *pcol_copy = *pcol;
524 517
525 pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ 518 ios->pages = pcol_copy->pages;
526 osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length); 519 ios->nr_pages = pcol_copy->nr_pages;
527 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred); 520 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
521 ios->length = pcol_copy->length;
522 ios->done = writepages_done;
523 ios->private = pcol_copy;
524
525 ret = exofs_oi_write(oi, ios);
528 if (unlikely(ret)) { 526 if (unlikely(ret)) {
 529 EXOFS_ERR("write_exec: exofs_async_op() Failed\n"); 527 EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
530 goto err; 528 goto err;
531 } 529 }
532 530
533 atomic_inc(&pcol->sbi->s_curr_pending); 531 atomic_inc(&pcol->sbi->s_curr_pending);
534 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", 532 EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
535 pcol->inode->i_ino, pcol->pg_first, _LLU(i_start), 533 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
536 pcol->length); 534 pcol->length);
537 /* pages ownership was passed to pcol_copy */ 535 /* pages ownership was passed to pcol_copy */
538 _pcol_reset(pcol); 536 _pcol_reset(pcol);
@@ -540,9 +538,9 @@ static int write_exec(struct page_collect *pcol)
540 538
541err: 539err:
542 _unlock_pcol_pages(pcol, ret, WRITE); 540 _unlock_pcol_pages(pcol, ret, WRITE);
541 pcol_free(pcol);
543 kfree(pcol_copy); 542 kfree(pcol_copy);
544 if (or) 543
545 osd_end_request(or);
546 return ret; 544 return ret;
547} 545}
548 546
@@ -586,6 +584,9 @@ static int writepage_strip(struct page *page,
586 if (PageError(page)) 584 if (PageError(page))
587 ClearPageError(page); 585 ClearPageError(page);
588 unlock_page(page); 586 unlock_page(page);
587 EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
588 "outside the limits\n",
589 inode->i_ino, page->index);
589 return 0; 590 return 0;
590 } 591 }
591 } 592 }
@@ -600,21 +601,24 @@ try_again:
600 ret = write_exec(pcol); 601 ret = write_exec(pcol);
601 if (unlikely(ret)) 602 if (unlikely(ret))
602 goto fail; 603 goto fail;
604
605 EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
606 inode->i_ino, page->index);
603 goto try_again; 607 goto try_again;
604 } 608 }
605 609
606 if (!pcol->bio) { 610 if (!pcol->pages) {
607 ret = pcol_try_alloc(pcol); 611 ret = pcol_try_alloc(pcol);
608 if (unlikely(ret)) 612 if (unlikely(ret))
609 goto fail; 613 goto fail;
610 } 614 }
611 615
612 EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", 616 EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
613 inode->i_ino, page->index, len); 617 inode->i_ino, page->index, len);
614 618
615 ret = pcol_add_page(pcol, page, len); 619 ret = pcol_add_page(pcol, page, len);
616 if (unlikely(ret)) { 620 if (unlikely(ret)) {
617 EXOFS_DBGMSG("Failed pcol_add_page " 621 EXOFS_DBGMSG2("Failed pcol_add_page "
618 "nr_pages=%u total_length=0x%lx\n", 622 "nr_pages=%u total_length=0x%lx\n",
619 pcol->nr_pages, pcol->length); 623 pcol->nr_pages, pcol->length);
620 624
@@ -634,6 +638,8 @@ try_again:
634 return 0; 638 return 0;
635 639
636fail: 640fail:
641 EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
642 inode->i_ino, page->index, ret);
637 set_bit(AS_EIO, &page->mapping->flags); 643 set_bit(AS_EIO, &page->mapping->flags);
638 unlock_page(page); 644 unlock_page(page);
639 return ret; 645 return ret;
@@ -652,14 +658,17 @@ static int exofs_writepages(struct address_space *mapping,
652 wbc->range_end >> PAGE_CACHE_SHIFT; 658 wbc->range_end >> PAGE_CACHE_SHIFT;
653 659
654 if (start || end) 660 if (start || end)
655 expected_pages = min(end - start + 1, 32L); 661 expected_pages = end - start + 1;
656 else 662 else
657 expected_pages = mapping->nrpages; 663 expected_pages = mapping->nrpages;
658 664
659 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx" 665 if (expected_pages < 32L)
660 " m->nrpages=%lu start=0x%lx end=0x%lx\n", 666 expected_pages = 32L;
667
668 EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
669 "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
661 mapping->host->i_ino, wbc->range_start, wbc->range_end, 670 mapping->host->i_ino, wbc->range_start, wbc->range_end,
662 mapping->nrpages, start, end); 671 mapping->nrpages, start, end, expected_pages);
663 672
664 _pcol_init(&pcol, expected_pages, mapping->host); 673 _pcol_init(&pcol, expected_pages, mapping->host);
665 674
@@ -731,13 +740,28 @@ static int exofs_write_begin_export(struct file *file,
731 fsdata); 740 fsdata);
732} 741}
733 742
743static int exofs_write_end(struct file *file, struct address_space *mapping,
744 loff_t pos, unsigned len, unsigned copied,
745 struct page *page, void *fsdata)
746{
747 struct inode *inode = mapping->host;
 748 /* According to the comment in simple_write_end(), i_mutex is held */
749 loff_t i_size = inode->i_size;
750 int ret;
751
 752 ret = simple_write_end(file, mapping, pos, len, copied, page, fsdata);
753 if (i_size != inode->i_size)
754 mark_inode_dirty(inode);
755 return ret;
756}
757
734const struct address_space_operations exofs_aops = { 758const struct address_space_operations exofs_aops = {
735 .readpage = exofs_readpage, 759 .readpage = exofs_readpage,
736 .readpages = exofs_readpages, 760 .readpages = exofs_readpages,
737 .writepage = exofs_writepage, 761 .writepage = exofs_writepage,
738 .writepages = exofs_writepages, 762 .writepages = exofs_writepages,
739 .write_begin = exofs_write_begin_export, 763 .write_begin = exofs_write_begin_export,
740 .write_end = simple_write_end, 764 .write_end = exofs_write_end,
741}; 765};
742 766
743/****************************************************************************** 767/******************************************************************************
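
The wrapper exists because simple_write_end() updates i_size and dirties the page but never dirties the inode itself; without the extra mark_inode_dirty() on a size change, exofs_write_inode() would not run and the object's size attribute on the OSD would go stale.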
@@ -771,19 +795,28 @@ static int exofs_get_block(struct inode *inode, sector_t iblock,
771const struct osd_attr g_attr_logical_length = ATTR_DEF( 795const struct osd_attr g_attr_logical_length = ATTR_DEF(
772 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); 796 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
773 797
798static int _do_truncate(struct inode *inode)
799{
800 struct exofs_i_info *oi = exofs_i(inode);
801 loff_t isize = i_size_read(inode);
802 int ret;
803
804 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
805
806 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
807
808 ret = exofs_oi_truncate(oi, (u64)isize);
809 EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize);
810 return ret;
811}
812
774/* 813/*
775 * Truncate a file to the specified size - all we have to do is set the size 814 * Truncate a file to the specified size - all we have to do is set the size
776 * attribute. We make sure the object exists first. 815 * attribute. We make sure the object exists first.
777 */ 816 */
778void exofs_truncate(struct inode *inode) 817void exofs_truncate(struct inode *inode)
779{ 818{
780 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
781 struct exofs_i_info *oi = exofs_i(inode); 819 struct exofs_i_info *oi = exofs_i(inode);
782 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
783 struct osd_request *or;
784 struct osd_attr attr;
785 loff_t isize = i_size_read(inode);
786 __be64 newsize;
787 int ret; 820 int ret;
788 821
789 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) 822 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
@@ -793,22 +826,6 @@ void exofs_truncate(struct inode *inode)
793 return; 826 return;
794 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 827 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
795 return; 828 return;
796 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
797
798 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
799
800 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
801 if (unlikely(!or)) {
802 EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
803 goto fail;
804 }
805
806 osd_req_set_attributes(or, &obj);
807
808 newsize = cpu_to_be64((u64)isize);
809 attr = g_attr_logical_length;
810 attr.val_ptr = &newsize;
811 osd_req_add_set_attr_list(or, &attr, 1);
812 829
813 /* if we are about to truncate an object, and it hasn't been 830 /* if we are about to truncate an object, and it hasn't been
814 * created yet, wait 831 * created yet, wait
@@ -816,8 +833,7 @@ void exofs_truncate(struct inode *inode)
816 if (unlikely(wait_obj_created(oi))) 833 if (unlikely(wait_obj_created(oi)))
817 goto fail; 834 goto fail;
818 835
819 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); 836 ret = _do_truncate(inode);
820 osd_end_request(or);
821 if (ret) 837 if (ret)
822 goto fail; 838 goto fail;
823 839
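
Truncation now funnels through exofs_oi_truncate() in ios.c instead of hand-building an OSD set-attr request, but conceptually it still just sets g_attr_logical_length, now once per device through the io_state layer. The attribute setup it replaces boils down to (sketch):

	__be64 newsize = cpu_to_be64((u64)isize);
	struct osd_attr attr = g_attr_logical_length;

	attr.val_ptr = &newsize;	/* then set on every device via the ios */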
@@ -845,67 +861,110 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
845 return error; 861 return error;
846} 862}
847 863
864static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
865 EXOFS_APAGE_FS_DATA,
866 EXOFS_ATTR_INODE_FILE_LAYOUT,
867 0);
868static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
869 EXOFS_APAGE_FS_DATA,
870 EXOFS_ATTR_INODE_DIR_LAYOUT,
871 0);
872
848/* 873/*
849 * Read an inode from the OSD, and return it as is. We also return the size 874 * Read the Linux inode info from the OSD, and return it as is. In exofs the
850 * attribute in the 'sanity' argument if we got compiled with debugging turned 875 * inode info is in an application specific page/attribute of the osd-object.
851 * on.
852 */ 876 */
853static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, 877static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
854 struct exofs_fcb *inode, uint64_t *sanity) 878 struct exofs_fcb *inode)
855{ 879{
856 struct exofs_sb_info *sbi = sb->s_fs_info; 880 struct exofs_sb_info *sbi = sb->s_fs_info;
857 struct osd_request *or; 881 struct osd_attr attrs[] = {
858 struct osd_attr attr; 882 [0] = g_attr_inode_data,
859 struct osd_obj_id obj = {sbi->s_pid, 883 [1] = g_attr_inode_file_layout,
860 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF}; 884 [2] = g_attr_inode_dir_layout,
885 };
886 struct exofs_io_state *ios;
887 struct exofs_on_disk_inode_layout *layout;
861 int ret; 888 int ret;
862 889
863 exofs_make_credential(oi->i_cred, &obj); 890 ret = exofs_get_io_state(&sbi->layout, &ios);
864 891 if (unlikely(ret)) {
865 or = osd_start_request(sbi->s_dev, GFP_KERNEL); 892 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
866 if (unlikely(!or)) { 893 return ret;
867 EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
868 return -ENOMEM;
869 } 894 }
870 osd_req_get_attributes(or, &obj);
871 895
872 /* we need the inode attribute */ 896 ios->obj.id = exofs_oi_objno(oi);
873 osd_req_add_get_attr_list(or, &g_attr_inode_data, 1); 897 exofs_make_credential(oi->i_cred, &ios->obj);
898 ios->cred = oi->i_cred;
874 899
875#ifdef EXOFS_DEBUG_OBJ_ISIZE 900 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
876 /* we get the size attributes to do a sanity check */ 901 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
877 osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
878#endif
879 902
880 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); 903 ios->in_attr = attrs;
881 if (ret) 904 ios->in_attr_len = ARRAY_SIZE(attrs);
905
906 ret = exofs_sbi_read(ios);
907 if (unlikely(ret)) {
908 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
909 _LLU(ios->obj.id), ret);
910 memset(inode, 0, sizeof(*inode));
911 inode->i_mode = 0040000 | (0777 & ~022);
 912 /* If the object is lost on the target we might as well enable its
 913 * deletion.
914 */
915 if ((ret == -ENOENT) || (ret == -EINVAL))
916 ret = 0;
882 goto out; 917 goto out;
918 }
883 919
884 attr = g_attr_inode_data; 920 ret = extract_attr_from_ios(ios, &attrs[0]);
885 ret = extract_attr_from_req(or, &attr);
886 if (ret) { 921 if (ret) {
887 EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n"); 922 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
888 goto out; 923 goto out;
889 } 924 }
925 WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
926 memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
890 927
891 WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE); 928 ret = extract_attr_from_ios(ios, &attrs[1]);
892 memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE); 929 if (ret) {
930 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
931 goto out;
932 }
933 if (attrs[1].len) {
934 layout = attrs[1].val_ptr;
935 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
936 EXOFS_ERR("%s: unsupported files layout %d\n",
937 __func__, layout->gen_func);
938 ret = -ENOTSUPP;
939 goto out;
940 }
941 }
893 942
894#ifdef EXOFS_DEBUG_OBJ_ISIZE 943 ret = extract_attr_from_ios(ios, &attrs[2]);
895 attr = g_attr_logical_length;
896 ret = extract_attr_from_req(or, &attr);
897 if (ret) { 944 if (ret) {
898 EXOFS_ERR("ERROR: extract attr from or failed\n"); 945 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
899 goto out; 946 goto out;
900 } 947 }
901 *sanity = get_unaligned_be64(attr.val_ptr); 948 if (attrs[2].len) {
902#endif 949 layout = attrs[2].val_ptr;
950 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
951 EXOFS_ERR("%s: unsupported meta-data layout %d\n",
952 __func__, layout->gen_func);
953 ret = -ENOTSUPP;
954 goto out;
955 }
956 }
903 957
904out: 958out:
905 osd_end_request(or); 959 exofs_put_io_state(ios);
906 return ret; 960 return ret;
907} 961}
908 962
963static void __oi_init(struct exofs_i_info *oi)
964{
965 init_waitqueue_head(&oi->i_wq);
966 oi->i_flags = 0;
967}
909/* 968/*
910 * Fill in an inode read from the OSD and set it up for use 969 * Fill in an inode read from the OSD and set it up for use
911 */ 970 */
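
Worth noting in the error path above: when the OSD read fails, exofs_get_inode() now synthesizes an empty inode (and clears ret for -ENOENT/-EINVAL) instead of failing the iget, so a lost object can still be unlinked. The mode literal decodes as:

	inode->i_mode = 0040000 | (0777 & ~022);	/* == S_IFDIR | 0755 */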
@@ -914,7 +973,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
914 struct exofs_i_info *oi; 973 struct exofs_i_info *oi;
915 struct exofs_fcb fcb; 974 struct exofs_fcb fcb;
916 struct inode *inode; 975 struct inode *inode;
917 uint64_t uninitialized_var(sanity);
918 int ret; 976 int ret;
919 977
920 inode = iget_locked(sb, ino); 978 inode = iget_locked(sb, ino);
@@ -923,13 +981,13 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	if (!(inode->i_state & I_NEW))
 		return inode;
 	oi = exofs_i(inode);
+	__oi_init(oi);
 
 	/* read the inode from the osd */
-	ret = exofs_get_inode(sb, oi, &fcb, &sanity);
+	ret = exofs_get_inode(sb, oi, &fcb);
 	if (ret)
 		goto bad_inode;
 
-	init_waitqueue_head(&oi->i_wq);
 	set_obj_created(oi);
 
 	/* copy stuff from on-disk struct to in-memory struct */
@@ -947,15 +1005,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	inode->i_blkbits = EXOFS_BLKSHIFT;
 	inode->i_generation = le32_to_cpu(fcb.i_generation);
 
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
-	if ((inode->i_size != sanity) &&
-		(!exofs_inode_is_fast_symlink(inode))) {
-		EXOFS_ERR("WARNING: Size of object from inode and "
-			  "attributes differ (%lld != %llu)\n",
-			  inode->i_size, _LLU(sanity));
-	}
-#endif
-
 	oi->i_dir_start_lookup = 0;
 
 	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
@@ -1020,23 +1069,30 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
  * set the obj_created flag so that other methods know that the object exists on
  * the OSD.
  */
-static void create_done(struct osd_request *or, void *p)
+static void create_done(struct exofs_io_state *ios, void *p)
 {
 	struct inode *inode = p;
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
 	int ret;
 
-	ret = exofs_check_ok(or);
-	osd_end_request(or);
+	ret = exofs_check_io(ios, NULL);
+	exofs_put_io_state(ios);
+
 	atomic_dec(&sbi->s_curr_pending);
 
 	if (unlikely(ret)) {
 		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
-			_LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF));
-		make_bad_inode(inode);
-	} else
-		set_obj_created(oi);
+			_LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
+		/* TODO: When the FS is corrupted, creation can fail because
+		 * the object already exists. Get rid of this asynchronous
+		 * creation: if the object exists, increment the obj counter
+		 * and try the next object, until we succeed. All these
+		 * dangling objects will be made into lost files by
+		 * chkfs.exofs
+		 */
+	}
+
+	set_obj_created(oi);
 
 	atomic_dec(&inode->i_count);
 	wake_up(&oi->i_wq);
@@ -1051,8 +1107,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	struct inode *inode;
 	struct exofs_i_info *oi;
 	struct exofs_sb_info *sbi;
-	struct osd_request *or;
-	struct osd_obj_id obj;
+	struct exofs_io_state *ios;
 	int ret;
 
 	sb = dir->i_sb;
@@ -1061,8 +1116,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 		return ERR_PTR(-ENOMEM);
 
 	oi = exofs_i(inode);
+	__oi_init(oi);
 
-	init_waitqueue_head(&oi->i_wq);
 	set_obj_2bcreated(oi);
 
 	sbi = sb->s_fs_info;
@@ -1089,28 +1144,28 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	mark_inode_dirty(inode);
 
-	obj.partition = sbi->s_pid;
-	obj.id = inode->i_ino + EXOFS_OBJ_OFF;
-	exofs_make_credential(oi->i_cred, &obj);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
-		return ERR_PTR(-ENOMEM);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
+		return ERR_PTR(ret);
 	}
 
-	osd_req_create_object(or, &obj);
+	ios->obj.id = exofs_oi_objno(oi);
+	exofs_make_credential(oi->i_cred, &ios->obj);
 
 	/* increment the refcount so that the inode will still be around when we
 	 * reach the callback
 	 */
 	atomic_inc(&inode->i_count);
 
-	ret = exofs_async_op(or, create_done, inode, oi->i_cred);
+	ios->done = create_done;
+	ios->private = inode;
+	ios->cred = oi->i_cred;
+	ret = exofs_sbi_create(ios);
 	if (ret) {
 		atomic_dec(&inode->i_count);
-		osd_end_request(or);
-		return ERR_PTR(-EIO);
+		exofs_put_io_state(ios);
+		return ERR_PTR(ret);
 	}
 	atomic_inc(&sbi->s_curr_pending);
 
@@ -1128,11 +1183,11 @@ struct updatei_args {
 /*
  * Callback function from exofs_update_inode().
  */
-static void updatei_done(struct osd_request *or, void *p)
+static void updatei_done(struct exofs_io_state *ios, void *p)
 {
 	struct updatei_args *args = p;
 
-	osd_end_request(or);
+	exofs_put_io_state(ios);
 
 	atomic_dec(&args->sbi->s_curr_pending);
 
@@ -1148,16 +1203,17 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct super_block *sb = inode->i_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or;
+	struct exofs_io_state *ios;
 	struct osd_attr attr;
 	struct exofs_fcb *fcb;
 	struct updatei_args *args;
 	int ret;
 
 	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
+	if (!args) {
+		EXOFS_DBGMSG("Failed kzalloc of args\n");
 		return -ENOMEM;
+	}
 
 	fcb = &args->fcb;
 
@@ -1186,18 +1242,16 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	} else
 		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
-		ret = -ENOMEM;
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		goto free_args;
 	}
 
-	osd_req_set_attributes(or, &obj);
-
 	attr = g_attr_inode_data;
 	attr.val_ptr = fcb;
-	osd_req_add_set_attr_list(or, &attr, 1);
+	ios->out_attr_len = 1;
+	ios->out_attr = &attr;
 
 	if (!obj_created(oi)) {
 		EXOFS_DBGMSG("!obj_created\n");
@@ -1206,43 +1260,42 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 		EXOFS_DBGMSG("wait_event done\n");
 	}
 
-	if (do_sync) {
-		ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
-		osd_end_request(or);
-		goto free_args;
-	} else {
+	if (!do_sync) {
 		args->sbi = sbi;
+		ios->done = updatei_done;
+		ios->private = args;
+	}
 
-		ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
-		if (ret) {
-			osd_end_request(or);
-			goto free_args;
-		}
+	ret = exofs_oi_write(oi, ios);
+	if (!do_sync && !ret) {
 		atomic_inc(&sbi->s_curr_pending);
 		goto out; /* deallocation in updatei_done */
 	}
 
+	exofs_put_io_state(ios);
 free_args:
 	kfree(args);
 out:
-	EXOFS_DBGMSG("ret=>%d\n", ret);
+	EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
+		     inode->i_ino, do_sync, ret);
 	return ret;
 }
 
-int exofs_write_inode(struct inode *inode, int wait)
+int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return exofs_update_inode(inode, wait);
+	return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
 
 /*
  * Callback function from exofs_delete_inode() - don't have much cleaning up to
  * do.
  */
-static void delete_done(struct osd_request *or, void *p)
+static void delete_done(struct exofs_io_state *ios, void *p)
 {
-	struct exofs_sb_info *sbi;
-	osd_end_request(or);
-	sbi = p;
+	struct exofs_sb_info *sbi = p;
+
+	exofs_put_io_state(ios);
+
 	atomic_dec(&sbi->s_curr_pending);
 }
 
@@ -1256,8 +1309,7 @@ void exofs_delete_inode(struct inode *inode)
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct super_block *sb = inode->i_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-	struct osd_request *or;
+	struct exofs_io_state *ios;
 	int ret;
 
 	truncate_inode_pages(&inode->i_data, 0);
@@ -1274,25 +1326,26 @@ void exofs_delete_inode(struct inode *inode)
 
 	clear_inode(inode);
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
 		return;
 	}
 
-	osd_req_remove_object(or, &obj);
-
 	/* if we are deleting an obj that hasn't been created yet, wait */
 	if (!obj_created(oi)) {
 		BUG_ON(!obj_2bcreated(oi));
 		wait_event(oi->i_wq, obj_created(oi));
 	}
 
-	ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
+	ios->obj.id = exofs_oi_objno(oi);
+	ios->done = delete_done;
+	ios->private = sbi;
+	ios->cred = oi->i_cred;
+	ret = exofs_sbi_remove(ios);
 	if (ret) {
-		EXOFS_ERR(
-		       "ERROR: @exofs_delete_inode exofs_async_op failed\n");
-		osd_end_request(or);
+		EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__);
+		exofs_put_io_state(ios);
 		return;
 	}
 	atomic_inc(&sbi->s_curr_pending);
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
new file mode 100644
index 000000000000..4337cad7777b
--- /dev/null
+++ b/fs/exofs/ios.c
@@ -0,0 +1,823 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com)
4 * Copyright (C) 2008, 2009
5 * Boaz Harrosh <bharrosh@panasas.com>
6 *
7 * This file is part of exofs.
8 *
9 * exofs is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation. Since it is based on ext2, and the only
12 * valid version of GPL for the Linux kernel is version 2, the only valid
13 * version of GPL for exofs is version 2.
14 *
15 * exofs is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with exofs; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25#include <linux/slab.h>
26#include <scsi/scsi_device.h>
27#include <asm/div64.h>
28
29#include "exofs.h"
30
31#define EXOFS_DBGMSG2(M...) do {} while (0)
32/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
33
34void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
35{
36 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
37}
38
39int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
40 u64 offset, void *p, unsigned length)
41{
42 struct osd_request *or = osd_start_request(od, GFP_KERNEL);
43/* struct osd_sense_info osi = {.key = 0};*/
44 int ret;
45
46 if (unlikely(!or)) {
47 EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
48 return -ENOMEM;
49 }
50 ret = osd_req_read_kern(or, obj, offset, p, length);
51 if (unlikely(ret)) {
52 EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
53 goto out;
54 }
55
56 ret = osd_finalize_request(or, 0, cred, NULL);
57 if (unlikely(ret)) {
 58		EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
59 goto out;
60 }
61
62 ret = osd_execute_request(or);
63 if (unlikely(ret))
64 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
65 /* osd_req_decode_sense(or, ret); */
66
67out:
68 osd_end_request(or);
69 return ret;
70}
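/* exofs_read_kern() above is the bootstrap path: a plain, single-device
 * synchronous read used while mounting (fscb, device table), before the
 * striping layout and the exofs_io_state machinery are available.
 */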
71
72int exofs_get_io_state(struct exofs_layout *layout,
73 struct exofs_io_state **pios)
74{
75 struct exofs_io_state *ios;
76
 77	/*TODO: Maybe use a kmem_cache per sbi of size
78 * exofs_io_state_size(layout->s_numdevs)
79 */
80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
81 if (unlikely(!ios)) {
 82		EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
83 exofs_io_state_size(layout->s_numdevs));
84 *pios = NULL;
85 return -ENOMEM;
86 }
87
88 ios->layout = layout;
89 ios->obj.partition = layout->s_pid;
90 *pios = ios;
91 return 0;
92}
93
94void exofs_put_io_state(struct exofs_io_state *ios)
95{
96 if (ios) {
97 unsigned i;
98
99 for (i = 0; i < ios->numdevs; i++) {
100 struct exofs_per_dev_state *per_dev = &ios->per_dev[i];
101
102 if (per_dev->or)
103 osd_end_request(per_dev->or);
104 if (per_dev->bio)
105 bio_put(per_dev->bio);
106 }
107
108 kfree(ios);
109 }
110}
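/* Typical calling convention, distilled from the callers in inode.c and
 * super.c (a sketch -- error handling and attribute setup abbreviated):
 *
 *	ret = exofs_get_io_state(&sbi->layout, &ios);
 *	ios->obj.id = exofs_oi_objno(oi);	// obj.partition preset above
 *	ios->cred = oi->i_cred;
 *	ios->in_attr = attrs;
 *	ios->in_attr_len = ARRAY_SIZE(attrs);
 *	ret = exofs_sbi_read(ios);		// synchronous: ios->done == NULL
 *	ret = extract_attr_from_ios(ios, &attrs[0]);
 *	exofs_put_io_state(ios);
 */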
111
112unsigned exofs_layout_od_id(struct exofs_layout *layout,
113 osd_id obj_no, unsigned layout_index)
114{
115/* switch (layout->lay_func) {
116 case LAYOUT_MOVING_WINDOW:
117 {*/
118 unsigned dev_mod = obj_no;
119
120 return (layout_index + dev_mod * layout->mirrors_p1) %
121 layout->s_numdevs;
122/* }
123 case LAYOUT_FUNC_IMPLICT:
124 return layout->devs[layout_index];
125 }*/
126}
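/* Example, with hypothetical numbers: for s_numdevs = 6 and mirrors_p1 = 2,
 * layout_index 0 of object 0 lands on device 0, of object 1 on device 2,
 * of object 2 on device 4, and object 3 wraps back to device 0 -- each
 * object slides its device window forward, hence the "moving window".
 */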
127
128static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
129 unsigned layout_index)
130{
131 return ios->layout->s_ods[
132 exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
133}
134
135static void _sync_done(struct exofs_io_state *ios, void *p)
136{
137 struct completion *waiting = p;
138
139 complete(waiting);
140}
141
142static void _last_io(struct kref *kref)
143{
144 struct exofs_io_state *ios = container_of(
145 kref, struct exofs_io_state, kref);
146
147 ios->done(ios, ios->private);
148}
149
150static void _done_io(struct osd_request *or, void *p)
151{
152 struct exofs_io_state *ios = p;
153
154 kref_put(&ios->kref, _last_io);
155}
156
157static int exofs_io_execute(struct exofs_io_state *ios)
158{
159 DECLARE_COMPLETION_ONSTACK(wait);
160 bool sync = (ios->done == NULL);
161 int i, ret;
162
163 if (sync) {
164 ios->done = _sync_done;
165 ios->private = &wait;
166 }
167
168 for (i = 0; i < ios->numdevs; i++) {
169 struct osd_request *or = ios->per_dev[i].or;
170 if (unlikely(!or))
171 continue;
172
173 ret = osd_finalize_request(or, 0, ios->cred, NULL);
174 if (unlikely(ret)) {
 175			EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
176 ret);
177 return ret;
178 }
179 }
180
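	/* One reference is taken per request submitted below, plus the one
	 * taken by kref_init for this thread; the final kref_put ensures
	 * ios->done fires exactly once, after the last completion (or
	 * immediately, if no request was started).
	 */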
181 kref_init(&ios->kref);
182
183 for (i = 0; i < ios->numdevs; i++) {
184 struct osd_request *or = ios->per_dev[i].or;
185 if (unlikely(!or))
186 continue;
187
188 kref_get(&ios->kref);
189 osd_execute_request_async(or, _done_io, ios);
190 }
191
192 kref_put(&ios->kref, _last_io);
193 ret = 0;
194
195 if (sync) {
196 wait_for_completion(&wait);
197 ret = exofs_check_io(ios, NULL);
198 }
199 return ret;
200}
201
202static void _clear_bio(struct bio *bio)
203{
204 struct bio_vec *bv;
205 unsigned i;
206
207 __bio_for_each_segment(bv, bio, i, 0) {
208 unsigned this_count = bv->bv_len;
209
210 if (likely(PAGE_SIZE == this_count))
211 clear_highpage(bv->bv_page);
212 else
213 zero_user(bv->bv_page, bv->bv_offset, this_count);
214 }
215}
216
217int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
218{
219 enum osd_err_priority acumulated_osd_err = 0;
220 int acumulated_lin_err = 0;
221 int i;
222
223 for (i = 0; i < ios->numdevs; i++) {
224 struct osd_sense_info osi;
225 struct osd_request *or = ios->per_dev[i].or;
226 int ret;
227
228 if (unlikely(!or))
229 continue;
230
231 ret = osd_req_decode_sense(or, &osi);
232 if (likely(!ret))
233 continue;
234
235 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
 236			/* start read offset is past the end of file */
 237			_clear_bio(ios->per_dev[i].bio);
 238			EXOFS_DBGMSG("start read offset past end of file "
239 "offset=0x%llx, length=0x%llx\n",
240 _LLU(ios->per_dev[i].offset),
241 _LLU(ios->per_dev[i].length));
242
243 continue; /* we recovered */
244 }
245
246 if (osi.osd_err_pri >= acumulated_osd_err) {
247 acumulated_osd_err = osi.osd_err_pri;
248 acumulated_lin_err = ret;
249 }
250 }
251
252 /* TODO: raid specific residual calculations */
253 if (resid) {
254 if (likely(!acumulated_lin_err))
255 *resid = 0;
256 else
257 *resid = ios->length;
258 }
259
260 return acumulated_lin_err;
261}
262
263/*
264 * L - logical offset into the file
265 *
266 * U - The number of bytes in a stripe within a group
267 *
268 * U = stripe_unit * group_width
269 *
270 * T - The number of bytes striped within a group of component objects
271 * (before advancing to the next group)
272 *
273 * T = stripe_unit * group_width * group_depth
274 *
275 * S - The number of bytes striped across all component objects
276 * before the pattern repeats
277 *
278 * S = stripe_unit * group_width * group_depth * group_count
279 *
280 * M - The "major" (i.e., across all components) stripe number
281 *
282 * M = L / S
283 *
284 * G - Counts the groups from the beginning of the major stripe
285 *
286 * G = (L - (M * S)) / T [or (L % S) / T]
287 *
288 * H - The byte offset within the group
289 *
290 * H = (L - (M * S)) % T [or (L % S) % T]
291 *
292 * N - The "minor" (i.e., across the group) stripe number
293 *
294 * N = H / U
295 *
 296 * C - The component index corresponding to L
297 *
298 * C = (H - (N * U)) / stripe_unit + G * group_width
299 * [or (L % U) / stripe_unit + G * group_width]
300 *
 301 * O - The component offset corresponding to L
302 *
303 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
304 */
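/* Worked example (hypothetical parameters, mirrors_p1 == 1): with
 * stripe_unit = 64K, group_width = 4, group_depth = 2, group_count = 2
 * we get U = 256K, T = 512K and S = 1M. For L = 768K:
 * M = 0, G = 1, H = 768K - 512K = 256K, N = 1, giving
 * C = 0/64K + 1*4 = 4 and O = 0 + 1*64K + 0 = 64K -- byte 768K of the
 * file lives at offset 64K of component 4, the first device of the
 * second group.
 */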
305struct _striping_info {
306 u64 obj_offset;
307 u64 group_length;
308 u64 total_group_length;
309 u64 Major;
310 unsigned dev;
311 unsigned unit_off;
312};
313
314static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
315 struct _striping_info *si)
316{
317 u32 stripe_unit = ios->layout->stripe_unit;
318 u32 group_width = ios->layout->group_width;
319 u64 group_depth = ios->layout->group_depth;
320
321 u32 U = stripe_unit * group_width;
322 u64 T = U * group_depth;
323 u64 S = T * ios->layout->group_count;
324 u64 M = div64_u64(file_offset, S);
325
326 /*
327 G = (L - (M * S)) / T
328 H = (L - (M * S)) % T
329 */
330 u64 LmodS = file_offset - M * S;
331 u32 G = div64_u64(LmodS, T);
332 u64 H = LmodS - G * T;
333
334 u32 N = div_u64(H, U);
335
336 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
337 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
338 si->dev *= ios->layout->mirrors_p1;
339
340 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
341
342 si->obj_offset = si->unit_off + (N * stripe_unit) +
343 (M * group_depth * stripe_unit);
344
345 si->group_length = T - H;
346 si->total_group_length = T;
347 si->Major = M;
348}
349
350static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
351 unsigned pgbase, struct exofs_per_dev_state *per_dev,
352 int cur_len)
353{
354 unsigned pg = *cur_pg;
355 struct request_queue *q =
356 osd_request_queue(exofs_ios_od(ios, per_dev->dev));
357
358 per_dev->length += cur_len;
359
360 if (per_dev->bio == NULL) {
361 unsigned pages_in_stripe = ios->layout->group_width *
362 (ios->layout->stripe_unit / PAGE_SIZE);
363 unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
364 ios->layout->group_width;
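		/* bio_size is this device's share of the pages, nr_pages /
		 * group_width, rounded up by one full stripe of pages so the
		 * bio never runs out of vector slots mid-stripe-unit.
		 */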
365
366 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
367 if (unlikely(!per_dev->bio)) {
 368			EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
369 bio_size);
370 return -ENOMEM;
371 }
372 }
373
374 while (cur_len > 0) {
375 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
376 unsigned added_len;
377
378 BUG_ON(ios->nr_pages <= pg);
379 cur_len -= pglen;
380
381 added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
382 pglen, pgbase);
383 if (unlikely(pglen != added_len))
384 return -ENOMEM;
385 pgbase = 0;
386 ++pg;
387 }
388 BUG_ON(cur_len);
389
390 *cur_pg = pg;
391 return 0;
392}
393
394static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
395 struct _striping_info *si, unsigned first_comp)
396{
397 unsigned stripe_unit = ios->layout->stripe_unit;
398 unsigned mirrors_p1 = ios->layout->mirrors_p1;
399 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
400 unsigned dev = si->dev;
401 unsigned first_dev = dev - (dev % devs_in_group);
402 unsigned comp = first_comp + (dev - first_dev);
403 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
404 unsigned cur_pg = ios->pages_consumed;
405 int ret = 0;
406
407 while (length) {
408 struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
409 unsigned cur_len, page_off = 0;
410
411 if (!per_dev->length) {
412 per_dev->dev = dev;
413 if (dev < si->dev) {
414 per_dev->offset = si->obj_offset + stripe_unit -
415 si->unit_off;
416 cur_len = stripe_unit;
417 } else if (dev == si->dev) {
418 per_dev->offset = si->obj_offset;
419 cur_len = stripe_unit - si->unit_off;
420 page_off = si->unit_off & ~PAGE_MASK;
421 BUG_ON(page_off && (page_off != ios->pgbase));
422 } else { /* dev > si->dev */
423 per_dev->offset = si->obj_offset - si->unit_off;
424 cur_len = stripe_unit;
425 }
426
427 if (max_comp < comp)
428 max_comp = comp;
429
430 dev += mirrors_p1;
431 dev = (dev % devs_in_group) + first_dev;
432 } else {
433 cur_len = stripe_unit;
434 }
435 if (cur_len >= length)
436 cur_len = length;
437
 438		ret = _add_stripe_unit(ios, &cur_pg, page_off, per_dev,
439 cur_len);
440 if (unlikely(ret))
441 goto out;
442
443 comp += mirrors_p1;
444 comp = (comp % devs_in_group) + first_comp;
445
446 length -= cur_len;
447 }
448out:
449 ios->numdevs = max_comp + mirrors_p1;
450 ios->pages_consumed = cur_pg;
451 return ret;
452}
453
454static int _prepare_for_striping(struct exofs_io_state *ios)
455{
456 u64 length = ios->length;
457 struct _striping_info si;
458 unsigned devs_in_group = ios->layout->group_width *
459 ios->layout->mirrors_p1;
460 unsigned first_comp = 0;
461 int ret = 0;
462
463 _calc_stripe_info(ios, ios->offset, &si);
464
465 if (!ios->pages) {
466 if (ios->kern_buff) {
467 struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
468
469 per_dev->offset = si.obj_offset;
470 per_dev->dev = si.dev;
471
472 /* no cross device without page array */
473 BUG_ON((ios->layout->group_width > 1) &&
474 (si.unit_off + ios->length >
475 ios->layout->stripe_unit));
476 }
477 ios->numdevs = ios->layout->mirrors_p1;
478 return 0;
479 }
480
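	/* Walk the I/O group by group: the first pass may start mid-group
	 * (si.group_length was set to T - H); every later pass covers a
	 * full group, advancing both the starting device window and the
	 * object offset before continuing.
	 */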
481 while (length) {
482 if (length < si.group_length)
483 si.group_length = length;
484
485 ret = _prepare_one_group(ios, si.group_length, &si, first_comp);
486 if (unlikely(ret))
487 goto out;
488
489 length -= si.group_length;
490
491 si.group_length = si.total_group_length;
492 si.unit_off = 0;
493 ++si.Major;
494 si.obj_offset = si.Major * ios->layout->stripe_unit *
495 ios->layout->group_depth;
496
497 si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
498 si.dev %= ios->layout->s_numdevs;
499
500 first_comp += devs_in_group;
501 first_comp %= ios->layout->s_numdevs;
502 }
503
504out:
505 return ret;
506}
507
508int exofs_sbi_create(struct exofs_io_state *ios)
509{
510 int i, ret;
511
512 for (i = 0; i < ios->layout->s_numdevs; i++) {
513 struct osd_request *or;
514
515 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
516 if (unlikely(!or)) {
517 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
518 ret = -ENOMEM;
519 goto out;
520 }
521 ios->per_dev[i].or = or;
522 ios->numdevs++;
523
524 osd_req_create_object(or, &ios->obj);
525 }
526 ret = exofs_io_execute(ios);
527
528out:
529 return ret;
530}
531
532int exofs_sbi_remove(struct exofs_io_state *ios)
533{
534 int i, ret;
535
536 for (i = 0; i < ios->layout->s_numdevs; i++) {
537 struct osd_request *or;
538
539 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
540 if (unlikely(!or)) {
541 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
542 ret = -ENOMEM;
543 goto out;
544 }
545 ios->per_dev[i].or = or;
546 ios->numdevs++;
547
548 osd_req_remove_object(or, &ios->obj);
549 }
550 ret = exofs_io_execute(ios);
551
552out:
553 return ret;
554}
555
556static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
557{
558 struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp];
559 unsigned dev = ios->per_dev[cur_comp].dev;
560 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
561 int ret = 0;
562
563 if (ios->pages && !master_dev->length)
564 return 0; /* Just an empty slot */
565
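	/* The first component of the mirror set owns the master bio; every
	 * other mirror gets a private clone pointing at the same pages, so
	 * identical data is written to each replica.
	 */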
566 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
567 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
568 struct osd_request *or;
569
570 or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL);
571 if (unlikely(!or)) {
572 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
573 ret = -ENOMEM;
574 goto out;
575 }
576 per_dev->or = or;
577 per_dev->offset = master_dev->offset;
578
579 if (ios->pages) {
580 struct bio *bio;
581
582 if (per_dev != master_dev) {
583 bio = bio_kmalloc(GFP_KERNEL,
584 master_dev->bio->bi_max_vecs);
585 if (unlikely(!bio)) {
586 EXOFS_DBGMSG(
587 "Faild to allocate BIO size=%u\n",
588 master_dev->bio->bi_max_vecs);
589 ret = -ENOMEM;
590 goto out;
591 }
592
593 __bio_clone(bio, master_dev->bio);
594 bio->bi_bdev = NULL;
595 bio->bi_next = NULL;
596 per_dev->length = master_dev->length;
597 per_dev->bio = bio;
598 per_dev->dev = dev;
599 } else {
600 bio = master_dev->bio;
601 /* FIXME: bio_set_dir() */
602 bio->bi_rw |= (1 << BIO_RW);
603 }
604
605 osd_req_write(or, &ios->obj, per_dev->offset, bio,
606 per_dev->length);
607 EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
608 "length=0x%llx dev=%d\n",
609 _LLU(ios->obj.id), _LLU(per_dev->offset),
610 _LLU(per_dev->length), dev);
611 } else if (ios->kern_buff) {
612 ret = osd_req_write_kern(or, &ios->obj, per_dev->offset,
613 ios->kern_buff, ios->length);
614 if (unlikely(ret))
615 goto out;
616 EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
617 "length=0x%llx dev=%d\n",
618 _LLU(ios->obj.id), _LLU(per_dev->offset),
619 _LLU(ios->length), dev);
620 } else {
621 osd_req_set_attributes(or, &ios->obj);
622 EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
623 _LLU(ios->obj.id), ios->out_attr_len, dev);
624 }
625
626 if (ios->out_attr)
627 osd_req_add_set_attr_list(or, ios->out_attr,
628 ios->out_attr_len);
629
630 if (ios->in_attr)
631 osd_req_add_get_attr_list(or, ios->in_attr,
632 ios->in_attr_len);
633 }
634
635out:
636 return ret;
637}
638
639int exofs_sbi_write(struct exofs_io_state *ios)
640{
641 int i;
642 int ret;
643
644 ret = _prepare_for_striping(ios);
645 if (unlikely(ret))
646 return ret;
647
648 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
649 ret = _sbi_write_mirror(ios, i);
650 if (unlikely(ret))
651 return ret;
652 }
653
654 ret = exofs_io_execute(ios);
655 return ret;
656}
657
658static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
659{
660 struct osd_request *or;
661 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
662 unsigned first_dev = (unsigned)ios->obj.id;
663
664 if (ios->pages && !per_dev->length)
665 return 0; /* Just an empty slot */
666
667 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
668 or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
669 if (unlikely(!or)) {
670 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
671 return -ENOMEM;
672 }
673 per_dev->or = or;
674
675 if (ios->pages) {
676 osd_req_read(or, &ios->obj, per_dev->offset,
677 per_dev->bio, per_dev->length);
678 EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
679 " dev=%d\n", _LLU(ios->obj.id),
680 _LLU(per_dev->offset), _LLU(per_dev->length),
681 first_dev);
682 } else if (ios->kern_buff) {
683 int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset,
684 ios->kern_buff, ios->length);
685 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
686 "length=0x%llx dev=%d ret=>%d\n",
687 _LLU(ios->obj.id), _LLU(per_dev->offset),
688 _LLU(ios->length), first_dev, ret);
689 if (unlikely(ret))
690 return ret;
691 } else {
692 osd_req_get_attributes(or, &ios->obj);
693 EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
694 _LLU(ios->obj.id), ios->in_attr_len, first_dev);
695 }
696 if (ios->out_attr)
697 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
698
699 if (ios->in_attr)
700 osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
701
702 return 0;
703}
704
705int exofs_sbi_read(struct exofs_io_state *ios)
706{
707 int i;
708 int ret;
709
710 ret = _prepare_for_striping(ios);
711 if (unlikely(ret))
712 return ret;
713
714 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
715 ret = _sbi_read_mirror(ios, i);
716 if (unlikely(ret))
717 return ret;
718 }
719
720 ret = exofs_io_execute(ios);
721 return ret;
722}
723
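/* Attributes are decoded from per_dev[0] only: an attribute-only ios
 * carries a single request on the first mirror (see _prepare_for_striping),
 * so that request holds the returned attribute list.
 */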
724int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
725{
726 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
727 void *iter = NULL;
728 int nelem;
729
730 do {
731 nelem = 1;
732 osd_req_decode_get_attr_list(ios->per_dev[0].or,
733 &cur_attr, &nelem, &iter);
734 if ((cur_attr.attr_page == attr->attr_page) &&
735 (cur_attr.attr_id == attr->attr_id)) {
736 attr->len = cur_attr.len;
737 attr->val_ptr = cur_attr.val_ptr;
738 return 0;
739 }
740 } while (iter);
741
742 return -EIO;
743}
744
745static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp,
746 struct osd_attr *attr)
747{
748 int last_comp = cur_comp + ios->layout->mirrors_p1;
749
750 for (; cur_comp < last_comp; ++cur_comp) {
751 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
752 struct osd_request *or;
753
754 or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL);
755 if (unlikely(!or)) {
756 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
757 return -ENOMEM;
758 }
759 per_dev->or = or;
760
761 osd_req_set_attributes(or, &ios->obj);
762 osd_req_add_set_attr_list(or, attr, 1);
763 }
764
765 return 0;
766}
767
768int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
769{
770 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
771 struct exofs_io_state *ios;
772 struct exofs_trunc_attr {
773 struct osd_attr attr;
774 __be64 newsize;
775 } *size_attrs;
776 struct _striping_info si;
777 int i, ret;
778
779 ret = exofs_get_io_state(&sbi->layout, &ios);
780 if (unlikely(ret))
781 return ret;
782
783 size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs),
784 GFP_KERNEL);
785 if (unlikely(!size_attrs)) {
786 ret = -ENOMEM;
787 goto out;
788 }
789
790 ios->obj.id = exofs_oi_objno(oi);
791 ios->cred = oi->i_cred;
792
793 ios->numdevs = ios->layout->s_numdevs;
794 _calc_stripe_info(ios, size, &si);
795
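	/* Place the new EOF within the stripe: components before the one
	 * holding the last byte keep a whole stripe_unit in the final row,
	 * the component holding it is cut mid-unit, and the components
	 * after it are truncated at the start of that row.
	 */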
796 for (i = 0; i < ios->layout->group_width; ++i) {
797 struct exofs_trunc_attr *size_attr = &size_attrs[i];
798 u64 obj_size;
799
800 if (i < si.dev)
801 obj_size = si.obj_offset +
802 ios->layout->stripe_unit - si.unit_off;
803 else if (i == si.dev)
804 obj_size = si.obj_offset;
805 else /* i > si.dev */
806 obj_size = si.obj_offset - si.unit_off;
807
808 size_attr->newsize = cpu_to_be64(obj_size);
809 size_attr->attr = g_attr_logical_length;
810 size_attr->attr.val_ptr = &size_attr->newsize;
811
812 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
813 &size_attr->attr);
814 if (unlikely(ret))
815 goto out;
816 }
817 ret = exofs_io_execute(ios);
818
819out:
820 kfree(size_attrs);
821 exofs_put_io_state(ios);
822 return ret;
823}
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
deleted file mode 100644
index 4372542df284..000000000000
--- a/fs/exofs/osd.c
+++ /dev/null
@@ -1,125 +0,0 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com)
4 * Copyright (C) 2008, 2009
5 * Boaz Harrosh <bharrosh@panasas.com>
6 *
7 * This file is part of exofs.
8 *
9 * exofs is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation. Since it is based on ext2, and the only
12 * valid version of GPL for the Linux kernel is version 2, the only valid
13 * version of GPL for exofs is version 2.
14 *
15 * exofs is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with exofs; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25#include <scsi/scsi_device.h>
26#include <scsi/osd_sense.h>
27
28#include "exofs.h"
29
30int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
31{
32 struct osd_sense_info osi;
33 int ret = osd_req_decode_sense(or, &osi);
34
35 if (ret) { /* translate to Linux codes */
36 if (osi.additional_code == scsi_invalid_field_in_cdb) {
37 if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
38 ret = -EFAULT;
39 if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
40 ret = -ENOENT;
41 else
42 ret = -EINVAL;
43 } else if (osi.additional_code == osd_quota_error)
44 ret = -ENOSPC;
45 else
46 ret = -EIO;
47 }
48
49 /* FIXME: should be include in osd_sense_info */
50 if (in_resid)
51 *in_resid = or->in.req ? or->in.req->resid_len : 0;
52
53 if (out_resid)
54 *out_resid = or->out.req ? or->out.req->resid_len : 0;
55
56 return ret;
57}
58
59void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
60{
61 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
62}
63
64/*
65 * Perform a synchronous OSD operation.
66 */
67int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
68{
69 int ret;
70
71 or->timeout = timeout;
72 ret = osd_finalize_request(or, 0, credential, NULL);
73 if (ret) {
74 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
75 return ret;
76 }
77
78 ret = osd_execute_request(or);
79
80 if (ret)
81 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
82 /* osd_req_decode_sense(or, ret); */
83 return ret;
84}
85
86/*
87 * Perform an asynchronous OSD operation.
88 */
89int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
90 void *caller_context, u8 *cred)
91{
92 int ret;
93
94 ret = osd_finalize_request(or, 0, cred, NULL);
95 if (ret) {
96 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
97 return ret;
98 }
99
100 ret = osd_execute_request_async(or, async_done, caller_context);
101
102 if (ret)
103 EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
104 return ret;
105}
106
107int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
108{
109 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
110 void *iter = NULL;
111 int nelem;
112
113 do {
114 nelem = 1;
115 osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
116 if ((cur_attr.attr_page == attr->attr_page) &&
117 (cur_attr.attr_id == attr->attr_id)) {
118 attr->len = cur_attr.len;
119 attr->val_ptr = cur_attr.val_ptr;
120 return 0;
121 }
122 } while (iter);
123
124 return -EIO;
125}
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
new file mode 100644
index 000000000000..c52e9888b8ab
--- /dev/null
+++ b/fs/exofs/pnfs.h
@@ -0,0 +1,45 @@
1/*
2 * Copyright (C) 2008, 2009
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of exofs.
6 *
7 * exofs is free software; you can redistribute it and/or modify it under the
8 * terms of the GNU General Public License version 2 as published by the Free
9 * Software Foundation.
10 *
11 */
12
13/* FIXME: Remove this file once pnfs hits mainline */
14
15#ifndef __EXOFS_PNFS_H__
16#define __EXOFS_PNFS_H__
17
18#if ! defined(__PNFS_OSD_XDR_H__)
19
20enum pnfs_iomode {
21 IOMODE_READ = 1,
22 IOMODE_RW = 2,
23 IOMODE_ANY = 3,
24};
25
26/* Layout Structure */
27enum pnfs_osd_raid_algorithm4 {
28 PNFS_OSD_RAID_0 = 1,
29 PNFS_OSD_RAID_4 = 2,
30 PNFS_OSD_RAID_5 = 3,
31 PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
32};
33
34struct pnfs_osd_data_map {
35 u32 odm_num_comps;
36 u64 odm_stripe_unit;
37 u32 odm_group_width;
38 u32 odm_group_depth;
39 u32 odm_mirror_cnt;
40 u32 odm_raid_algorithm;
41};
42
43#endif /* ! defined(__PNFS_OSD_XDR_H__) */
44
45#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f500dec3b59..18e57ea1e5b4 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
 #include <linux/vfs.h>
 #include <linux/random.h>
 #include <linux/exportfs.h>
+#include <linux/slab.h>
 
 #include "exofs.h"
 
@@ -203,49 +204,45 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 {
 	struct exofs_sb_info *sbi;
 	struct exofs_fscb *fscb;
-	struct osd_request *or;
-	struct osd_obj_id obj;
+	struct exofs_io_state *ios;
 	int ret = -ENOMEM;
 
-	fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
-	if (!fscb) {
-		EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
-		return -ENOMEM;
-	}
-
 	lock_super(sb);
 	sbi = sb->s_fs_info;
+	fscb = &sbi->s_fscb;
+
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (ret)
+		goto out;
+
+	/* Note: We only write the changing part of the fscb, i.e. up to the
+	 * fscb->s_dev_table_oid member. There is no read-modify-write
+	 * here.
+	 */
+	ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
+	memset(fscb, 0, ios->length);
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
 	fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
 	fscb->s_magic = cpu_to_le16(sb->s_magic);
 	fscb->s_newfs = 0;
+	fscb->s_version = EXOFS_FSCB_VER;
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
-		goto out;
-	}
-
-	obj.partition = sbi->s_pid;
-	obj.id = EXOFS_SUPER_ID;
-	ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
-	if (unlikely(ret)) {
-		EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
-		goto out;
-	}
+	ios->obj.id = EXOFS_SUPER_ID;
+	ios->offset = 0;
+	ios->kern_buff = fscb;
+	ios->cred = sbi->s_cred;
 
-	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
+	ret = exofs_sbi_write(ios);
 	if (unlikely(ret)) {
-		EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
+		EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
 		goto out;
 	}
 	sb->s_dirt = 0;
 
 out:
-	if (or)
-		osd_end_request(or);
+	EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
+	exofs_put_io_state(ios);
 	unlock_super(sb);
-	kfree(fscb);
 	return ret;
 }
 
@@ -257,6 +254,29 @@ static void exofs_write_super(struct super_block *sb)
 	sb->s_dirt = 0;
 }
 
+static void _exofs_print_device(const char *msg, const char *dev_path,
+				struct osd_dev *od, u64 pid)
+{
+	const struct osd_dev_info *odi = osduld_device_info(od);
+
+	printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
+	       msg, dev_path ?: "", odi->osdname, _LLU(pid));
+}
+
+void exofs_free_sbi(struct exofs_sb_info *sbi)
+{
+	while (sbi->layout.s_numdevs) {
+		int i = --sbi->layout.s_numdevs;
+		struct osd_dev *od = sbi->layout.s_ods[i];
+
+		if (od) {
+			sbi->layout.s_ods[i] = NULL;
+			osduld_put_device(od);
+		}
+	}
+	kfree(sbi);
+}
+
 /*
  * This function is called when the vfs is freeing the superblock. We just
  * need to free our own part.
@@ -279,11 +299,235 @@ static void exofs_put_super(struct super_block *sb)
 			msecs_to_jiffies(100));
 	}
 
-	osduld_put_device(sbi->s_dev);
-	kfree(sb->s_fs_info);
+	_exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);
+
+	exofs_free_sbi(sbi);
 	sb->s_fs_info = NULL;
 }
 
+static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
+				    struct exofs_device_table *dt)
+{
+	u64 stripe_length;
+
+	sbi->data_map.odm_num_comps =
+		le32_to_cpu(dt->dt_data_map.cb_num_comps);
+	sbi->data_map.odm_stripe_unit =
+		le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
+	sbi->data_map.odm_group_width =
+		le32_to_cpu(dt->dt_data_map.cb_group_width);
+	sbi->data_map.odm_group_depth =
+		le32_to_cpu(dt->dt_data_map.cb_group_depth);
+	sbi->data_map.odm_mirror_cnt =
+		le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
+	sbi->data_map.odm_raid_algorithm =
+		le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
+
+/* FIXME: Only raid0 for now. If not so, do not mount */
+	if (sbi->data_map.odm_num_comps != numdevs) {
+		EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
+			  sbi->data_map.odm_num_comps, numdevs);
+		return -EINVAL;
+	}
+	if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
+		EXOFS_ERR("Only RAID_0 for now\n");
+		return -EINVAL;
+	}
+	if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
+		EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
+			  numdevs, sbi->data_map.odm_mirror_cnt);
+		return -EINVAL;
+	}
+
+	if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
+		EXOFS_ERR("Stripe Unit(0x%llx)"
+			  " must be a multiple of PAGE_SIZE(0x%lx)\n",
+			  _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
+	sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
+
+	if (sbi->data_map.odm_group_width) {
+		sbi->layout.group_width = sbi->data_map.odm_group_width;
+		sbi->layout.group_depth = sbi->data_map.odm_group_depth;
+		if (!sbi->layout.group_depth) {
+			EXOFS_ERR("group_depth == 0 && group_width != 0\n");
+			return -EINVAL;
+		}
+		sbi->layout.group_count = sbi->data_map.odm_num_comps /
+						sbi->layout.mirrors_p1 /
+						sbi->data_map.odm_group_width;
+	} else {
+		if (sbi->data_map.odm_group_depth) {
+			printk(KERN_NOTICE "Warning: group_depth ignored "
+				"group_width == 0 && group_depth == %d\n",
+				sbi->data_map.odm_group_depth);
+			sbi->data_map.odm_group_depth = 0;
+		}
+		sbi->layout.group_width = sbi->data_map.odm_num_comps /
+						sbi->layout.mirrors_p1;
+		sbi->layout.group_depth = -1;
+		sbi->layout.group_count = 1;
+	}
+
+	stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
+	if (stripe_length >= (1ULL << 32)) {
+		EXOFS_ERR("Total Stripe length(0x%llx)"
+			  " >= 32bit is not supported\n", _LLU(stripe_length));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* @odi is valid only as long as @fscb_dev is valid */
+static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
+			    struct osd_dev_info *odi)
+{
+	odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
+	memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
+
+	odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
+	odi->osdname = dt_dev->osdname;
+
+	/* FIXME support long names. Will need a _put function */
+	if (dt_dev->long_name_offset)
+		return -EINVAL;
+
+	/* Make sure osdname is printable!
+	 * mkexofs should give us space for a null-terminator else the
+	 * device-table is invalid.
+	 */
+	if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
+		odi->osdname_len = sizeof(dt_dev->osdname) - 1;
+	dt_dev->osdname[odi->osdname_len] = 0;
+
+	/* If it's all zeros, something is bad: we read past end-of-obj */
+	return !(odi->systemid_len || odi->osdname_len);
+}
+
+static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
+				       unsigned table_count)
+{
+	struct exofs_sb_info *sbi = *psbi;
+	struct osd_dev *fscb_od;
+	struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
+				 .id = EXOFS_DEVTABLE_ID};
+	struct exofs_device_table *dt;
+	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
+			       sizeof(*dt);
+	unsigned numdevs, i;
+	int ret;
+
+	dt = kmalloc(table_bytes, GFP_KERNEL);
+	if (unlikely(!dt)) {
+		EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
+			  table_bytes);
+		return -ENOMEM;
+	}
+
+	fscb_od = sbi->layout.s_ods[0];
+	sbi->layout.s_ods[0] = NULL;
+	sbi->layout.s_numdevs = 0;
+	ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
+	if (unlikely(ret)) {
+		EXOFS_ERR("ERROR: reading device table\n");
+		goto out;
+	}
+
+	numdevs = le64_to_cpu(dt->dt_num_devices);
+	if (unlikely(!numdevs)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	WARN_ON(table_count != numdevs);
+
+	ret = _read_and_match_data_map(sbi, numdevs, dt);
+	if (unlikely(ret))
+		goto out;
+
+	if (likely(numdevs > 1)) {
+		unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]);
+
+		sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
+		if (unlikely(!sbi)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		memset(&sbi->layout.s_ods[1], 0,
+		       size - sizeof(sbi->layout.s_ods[0]));
+		*psbi = sbi;
+	}
+
+	for (i = 0; i < numdevs; i++) {
+		struct exofs_fscb fscb;
+		struct osd_dev_info odi;
+		struct osd_dev *od;
+
+		if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
+			EXOFS_ERR("ERROR: Read all-zeros device entry\n");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
+		       i, odi.osdname);
+
+		/* On all devices the device table is identical. The user can
+		 * specify any one of the participating devices on the command
+		 * line. We always keep them in device-table order.
+		 */
+		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
+			sbi->layout.s_ods[i] = fscb_od;
+			++sbi->layout.s_numdevs;
+			fscb_od = NULL;
+			continue;
+		}
+
+		od = osduld_info_lookup(&odi);
+		if (unlikely(IS_ERR(od))) {
+			ret = PTR_ERR(od);
+			EXOFS_ERR("ERROR: device requested is not found "
+				  "osd_name-%s =>%d\n", odi.osdname, ret);
+			goto out;
+		}
+
+		sbi->layout.s_ods[i] = od;
+		++sbi->layout.s_numdevs;
+
+		/* Read the fscb of the other devices to make sure the FS
+		 * partition is there.
+		 */
+		ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
+				      sizeof(fscb));
+		if (unlikely(ret)) {
+			EXOFS_ERR("ERROR: Malformed participating device "
+				  "error reading fscb osd_name-%s\n",
+				  odi.osdname);
+			goto out;
+		}
+
+		/* TODO: verify other information is correct and FS-uuid
+		 * matches. Benny, what did you say about device table
+		 * generation and old devices?
+		 */
+	}
+
+out:
+	kfree(dt);
+	if (unlikely(!ret && fscb_od)) {
+		EXOFS_ERR(
+		      "ERROR: Bad device-table, container device not present\n");
+		osduld_put_device(fscb_od);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
 /*
  * Read the superblock from the OSD and fill in the fields
  */
@@ -292,25 +536,32 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *root;
 	struct exofs_mountopt *opts = data;
 	struct exofs_sb_info *sbi;	/*extended info        */
+	struct osd_dev *od;		/* Master device        */
 	struct exofs_fscb fscb;		/*on-disk superblock info */
-	struct osd_request *or = NULL;
 	struct osd_obj_id obj;
+	unsigned table_count;
 	int ret;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
-	sb->s_fs_info = sbi;
 
 	/* use mount options to fill superblock */
-	sbi->s_dev = osduld_path_lookup(opts->dev_name);
-	if (IS_ERR(sbi->s_dev)) {
-		ret = PTR_ERR(sbi->s_dev);
-		sbi->s_dev = NULL;
+	od = osduld_path_lookup(opts->dev_name);
+	if (IS_ERR(od)) {
+		ret = PTR_ERR(od);
 		goto free_sbi;
 	}
 
-	sbi->s_pid = opts->pid;
+	/* Default layout in case we do not have a device-table */
+	sbi->layout.stripe_unit = PAGE_SIZE;
+	sbi->layout.mirrors_p1 = 1;
+	sbi->layout.group_width = 1;
+	sbi->layout.group_depth = -1;
+	sbi->layout.group_count = 1;
+	sbi->layout.s_ods[0] = od;
+	sbi->layout.s_numdevs = 1;
+	sbi->layout.s_pid = opts->pid;
 	sbi->s_timeout = opts->timeout;
 
 	/* fill in some other data by hand */
@@ -323,35 +574,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_bdev = NULL;
 	sb->s_dev = 0;
 
-	/* read data from on-disk superblock object */
-	obj.partition = sbi->s_pid;
+	obj.partition = sbi->layout.s_pid;
 	obj.id = EXOFS_SUPER_ID;
 	exofs_make_credential(sbi->s_cred, &obj);
 
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		if (!silent)
-			EXOFS_ERR(
-			       "exofs_fill_super: osd_start_request failed.\n");
-		ret = -ENOMEM;
-		goto free_sbi;
-	}
-	ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
-	if (unlikely(ret)) {
-		if (!silent)
-			EXOFS_ERR(
-			       "exofs_fill_super: osd_req_read_kern failed.\n");
-		ret = -ENOMEM;
-		goto free_sbi;
-	}
-
-	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
-	if (unlikely(ret)) {
-		if (!silent)
-			EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
-		ret = -EIO;
+	ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb));
+	if (unlikely(ret))
 		goto free_sbi;
-	}
 
 	sb->s_magic = le16_to_cpu(fscb.s_magic);
 	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
@@ -364,12 +593,26 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		ret = -EINVAL;
 		goto free_sbi;
 	}
+	if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
+		EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
+			  EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
+		ret = -EINVAL;
+		goto free_sbi;
+	}
 
 	/* start generation numbers from a random point */
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
+	table_count = le64_to_cpu(fscb.s_dev_table_count);
+	if (table_count) {
+		ret = exofs_read_lookup_dev_table(&sbi, table_count);
+		if (unlikely(ret))
+			goto free_sbi;
+	}
+
 	/* set up operation vectors */
+	sb->s_fs_info = sbi;
 	sb->s_op = &exofs_sops;
 	sb->s_export_op = &exofs_export_ops;
 	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
@@ -395,16 +638,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	ret = 0;
-out:
-	if (or)
-		osd_end_request(or);
-	return ret;
+	_exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);
+	return 0;
 
 free_sbi:
-	osduld_put_device(sbi->s_dev); /* NULL safe */
-	kfree(sbi);
-	goto out;
+	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
+		  opts->dev_name, sbi->layout.s_pid, ret);
+	exofs_free_sbi(sbi);
+	return ret;
 }
 
 /*
@@ -433,7 +675,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_obj_id obj = {sbi->s_pid, 0};
+	struct exofs_io_state *ios;
 	struct osd_attr attrs[] = {
 		ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
 			OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
@@ -442,32 +684,33 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	};
 	uint64_t capacity = ULLONG_MAX;
 	uint64_t used = ULLONG_MAX;
-	struct osd_request *or;
 	uint8_t cred_a[OSD_CAP_LEN];
 	int ret;
 
-	/* get used/capacity attributes */
-	exofs_make_credential(cred_a, &obj);
-
-	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-	if (unlikely(!or)) {
-		EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
-		return -ENOMEM;
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (ret) {
+		EXOFS_DBGMSG("exofs_get_io_state failed.\n");
+		return ret;
 	}
 
-	osd_req_get_attributes(or, &obj);
-	osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
-	ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
+	exofs_make_credential(cred_a, &ios->obj);
+	ios->cred = sbi->s_cred;
+	ios->in_attr = attrs;
+	ios->in_attr_len = ARRAY_SIZE(attrs);
+
+	ret = exofs_sbi_read(ios);
 	if (unlikely(ret))
 		goto out;
 
-	ret = extract_attr_from_req(or, &attrs[0]);
-	if (likely(!ret))
+	ret = extract_attr_from_ios(ios, &attrs[0]);
+	if (likely(!ret)) {
 		capacity = get_unaligned_be64(attrs[0].val_ptr);
-	else
+		if (unlikely(!capacity))
+			capacity = ULLONG_MAX;
+	} else
 		EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
 
-	ret = extract_attr_from_req(or, &attrs[1]);
+	ret = extract_attr_from_ios(ios, &attrs[1]);
 	if (likely(!ret))
 		used = get_unaligned_be64(attrs[1].val_ptr);
 	else
@@ -476,15 +719,15 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
476 /* fill in the stats buffer */ 719 /* fill in the stats buffer */
477 buf->f_type = EXOFS_SUPER_MAGIC; 720 buf->f_type = EXOFS_SUPER_MAGIC;
478 buf->f_bsize = EXOFS_BLKSIZE; 721 buf->f_bsize = EXOFS_BLKSIZE;
479 buf->f_blocks = (capacity >> EXOFS_BLKSHIFT); 722 buf->f_blocks = capacity >> 9;
480 buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT); 723 buf->f_bfree = (capacity - used) >> 9;
481 buf->f_bavail = buf->f_bfree; 724 buf->f_bavail = buf->f_bfree;
482 buf->f_files = sbi->s_numfiles; 725 buf->f_files = sbi->s_numfiles;
483 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; 726 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
484 buf->f_namelen = EXOFS_NAME_LEN; 727 buf->f_namelen = EXOFS_NAME_LEN;
485 728
486out: 729out:
487 osd_end_request(or); 730 exofs_put_io_state(ios);
488 return ret; 731 return ret;
489} 732}
490 733
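
The statfs conversion above replaces per-request OSD plumbing (osd_start_request/osd_end_request) with the exofs_io_state abstraction. A minimal sketch of the resulting read-attribute pattern, assuming only the fields and helpers visible in this hunk (exofs_get_io_state, exofs_sbi_read, extract_attr_from_ios, exofs_put_io_state); exofs_read_one_attr itself is a hypothetical helper, not part of the patch:

static int exofs_read_one_attr(struct exofs_sb_info *sbi,
			       struct osd_attr *attr, u64 *out)
{
	struct exofs_io_state *ios;
	uint8_t cred_a[OSD_CAP_LEN];
	int ret;

	ret = exofs_get_io_state(&sbi->layout, &ios);
	if (unlikely(ret))
		return ret;

	/* build the capability against the object the io_state targets */
	exofs_make_credential(cred_a, &ios->obj);
	ios->cred = sbi->s_cred;
	ios->in_attr = attr;
	ios->in_attr_len = 1;

	ret = exofs_sbi_read(ios);		/* issue the OSD read */
	if (likely(!ret))
		ret = extract_attr_from_ios(ios, attr);
	if (likely(!ret))
		*out = get_unaligned_be64(attr->val_ptr);

	exofs_put_io_state(ios);		/* release on every path */
	return ret;
}
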
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 197c7db583c7..e9e175949a63 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -6,7 +6,7 @@
6 * and for mapping back from file handles to dentries. 6 * and for mapping back from file handles to dentries.
7 * 7 *
8 * For details on why we do all the strange and hairy things in here 8 * For details on why we do all the strange and hairy things in here
9 * take a look at Documentation/filesystems/Exporting. 9 * take a look at Documentation/filesystems/nfs/Exporting.
10 */ 10 */
11#include <linux/exportfs.h> 11#include <linux/exportfs.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a63d44256a70..a99e54318c3d 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -339,12 +339,12 @@ ext2_acl_chmod(struct inode *inode)
339 * Extended attribut handlers 339 * Extended attribut handlers
340 */ 340 */
341static size_t 341static size_t
342ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size, 342ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size,
343 const char *name, size_t name_len) 343 const char *name, size_t name_len, int type)
344{ 344{
345 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); 345 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
346 346
347 if (!test_opt(inode->i_sb, POSIX_ACL)) 347 if (!test_opt(dentry->d_sb, POSIX_ACL))
348 return 0; 348 return 0;
349 if (list && size <= list_size) 349 if (list && size <= list_size)
350 memcpy(list, POSIX_ACL_XATTR_ACCESS, size); 350 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -352,12 +352,12 @@ ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size,
352} 352}
353 353
354static size_t 354static size_t
355ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size, 355ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size,
356 const char *name, size_t name_len) 356 const char *name, size_t name_len, int type)
357{ 357{
358 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); 358 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
359 359
360 if (!test_opt(inode->i_sb, POSIX_ACL)) 360 if (!test_opt(dentry->d_sb, POSIX_ACL))
361 return 0; 361 return 0;
362 if (list && size <= list_size) 362 if (list && size <= list_size)
363 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); 363 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -365,15 +365,18 @@ ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size,
365} 365}
366 366
367static int 367static int
368ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) 368ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
369 size_t size, int type)
369{ 370{
370 struct posix_acl *acl; 371 struct posix_acl *acl;
371 int error; 372 int error;
372 373
373 if (!test_opt(inode->i_sb, POSIX_ACL)) 374 if (strcmp(name, "") != 0)
375 return -EINVAL;
376 if (!test_opt(dentry->d_sb, POSIX_ACL))
374 return -EOPNOTSUPP; 377 return -EOPNOTSUPP;
375 378
376 acl = ext2_get_acl(inode, type); 379 acl = ext2_get_acl(dentry->d_inode, type);
377 if (IS_ERR(acl)) 380 if (IS_ERR(acl))
378 return PTR_ERR(acl); 381 return PTR_ERR(acl);
379 if (acl == NULL) 382 if (acl == NULL)
@@ -385,33 +388,17 @@ ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
385} 388}
386 389
387static int 390static int
388ext2_xattr_get_acl_access(struct inode *inode, const char *name, 391ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
389 void *buffer, size_t size) 392 size_t size, int flags, int type)
390{
391 if (strcmp(name, "") != 0)
392 return -EINVAL;
393 return ext2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
394}
395
396static int
397ext2_xattr_get_acl_default(struct inode *inode, const char *name,
398 void *buffer, size_t size)
399{
400 if (strcmp(name, "") != 0)
401 return -EINVAL;
402 return ext2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
403}
404
405static int
406ext2_xattr_set_acl(struct inode *inode, int type, const void *value,
407 size_t size)
408{ 393{
409 struct posix_acl *acl; 394 struct posix_acl *acl;
410 int error; 395 int error;
411 396
412 if (!test_opt(inode->i_sb, POSIX_ACL)) 397 if (strcmp(name, "") != 0)
398 return -EINVAL;
399 if (!test_opt(dentry->d_sb, POSIX_ACL))
413 return -EOPNOTSUPP; 400 return -EOPNOTSUPP;
414 if (!is_owner_or_cap(inode)) 401 if (!is_owner_or_cap(dentry->d_inode))
415 return -EPERM; 402 return -EPERM;
416 403
417 if (value) { 404 if (value) {
@@ -426,41 +413,25 @@ ext2_xattr_set_acl(struct inode *inode, int type, const void *value,
426 } else 413 } else
427 acl = NULL; 414 acl = NULL;
428 415
429 error = ext2_set_acl(inode, type, acl); 416 error = ext2_set_acl(dentry->d_inode, type, acl);
430 417
431release_and_out: 418release_and_out:
432 posix_acl_release(acl); 419 posix_acl_release(acl);
433 return error; 420 return error;
434} 421}
435 422
436static int
437ext2_xattr_set_acl_access(struct inode *inode, const char *name,
438 const void *value, size_t size, int flags)
439{
440 if (strcmp(name, "") != 0)
441 return -EINVAL;
442 return ext2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
443}
444
445static int
446ext2_xattr_set_acl_default(struct inode *inode, const char *name,
447 const void *value, size_t size, int flags)
448{
449 if (strcmp(name, "") != 0)
450 return -EINVAL;
451 return ext2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
452}
453
454struct xattr_handler ext2_xattr_acl_access_handler = { 423struct xattr_handler ext2_xattr_acl_access_handler = {
455 .prefix = POSIX_ACL_XATTR_ACCESS, 424 .prefix = POSIX_ACL_XATTR_ACCESS,
425 .flags = ACL_TYPE_ACCESS,
456 .list = ext2_xattr_list_acl_access, 426 .list = ext2_xattr_list_acl_access,
457 .get = ext2_xattr_get_acl_access, 427 .get = ext2_xattr_get_acl,
458 .set = ext2_xattr_set_acl_access, 428 .set = ext2_xattr_set_acl,
459}; 429};
460 430
461struct xattr_handler ext2_xattr_acl_default_handler = { 431struct xattr_handler ext2_xattr_acl_default_handler = {
462 .prefix = POSIX_ACL_XATTR_DEFAULT, 432 .prefix = POSIX_ACL_XATTR_DEFAULT,
433 .flags = ACL_TYPE_DEFAULT,
463 .list = ext2_xattr_list_acl_default, 434 .list = ext2_xattr_list_acl_default,
464 .get = ext2_xattr_get_acl_default, 435 .get = ext2_xattr_get_acl,
465 .set = ext2_xattr_set_acl_default, 436 .set = ext2_xattr_set_acl,
466}; 437};
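
The ext2 ACL rework above collapses four per-type wrapper functions into a single get and a single set callback, with the ACL type carried in the handler's new .flags field. A minimal sketch of the dispatch this enables, assuming only the handler shape shown in this hunk; demo_getxattr is illustrative, not the kernel's actual dispatcher:

static int demo_getxattr(struct xattr_handler *h, struct dentry *dentry,
			 const char *suffix, void *buf, size_t size)
{
	/* 'suffix' is the attribute name with the handler prefix
	 * stripped; the handler receives its own .flags as 'type',
	 * so one function serves both ACL prefixes */
	return h->get(dentry, suffix, buf, size, h->flags);
}
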
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 7f8d2e5a7ea6..3cf038c055d7 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -13,6 +13,7 @@
13 13
14#include "ext2.h" 14#include "ext2.h"
15#include <linux/quotaops.h> 15#include <linux/quotaops.h>
16#include <linux/slab.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
18#include <linux/capability.h> 19#include <linux/capability.h>
@@ -570,7 +571,7 @@ do_more:
570error_return: 571error_return:
571 brelse(bitmap_bh); 572 brelse(bitmap_bh);
572 release_blocks(sb, freed); 573 release_blocks(sb, freed);
573 vfs_dq_free_block(inode, freed); 574 dquot_free_block(inode, freed);
574} 575}
575 576
576/** 577/**
@@ -1236,6 +1237,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
1236 unsigned short windowsz = 0; 1237 unsigned short windowsz = 0;
1237 unsigned long ngroups; 1238 unsigned long ngroups;
1238 unsigned long num = *count; 1239 unsigned long num = *count;
1240 int ret;
1239 1241
1240 *errp = -ENOSPC; 1242 *errp = -ENOSPC;
1241 sb = inode->i_sb; 1243 sb = inode->i_sb;
@@ -1247,8 +1249,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
1247 /* 1249 /*
1248 * Check quota for allocation of this block. 1250 * Check quota for allocation of this block.
1249 */ 1251 */
1250 if (vfs_dq_alloc_block(inode, num)) { 1252 ret = dquot_alloc_block(inode, num);
1251 *errp = -EDQUOT; 1253 if (ret) {
1254 *errp = ret;
1252 return 0; 1255 return 0;
1253 } 1256 }
1254 1257
@@ -1409,7 +1412,7 @@ allocated:
1409 1412
1410 *errp = 0; 1413 *errp = 0;
1411 brelse(bitmap_bh); 1414 brelse(bitmap_bh);
1412 vfs_dq_free_block(inode, *count-num); 1415 dquot_free_block(inode, *count-num);
1413 *count = num; 1416 *count = num;
1414 return ret_block; 1417 return ret_block;
1415 1418
@@ -1420,7 +1423,7 @@ out:
1420 * Undo the block allocation 1423 * Undo the block allocation
1421 */ 1424 */
1422 if (!performed_allocation) 1425 if (!performed_allocation)
1423 vfs_dq_free_block(inode, *count); 1426 dquot_free_block(inode, *count);
1424 brelse(bitmap_bh); 1427 brelse(bitmap_bh);
1425 return 0; 1428 return 0;
1426} 1429}
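
The quota conversion above also changes the error convention: vfs_dq_alloc_block() returned non-zero on failure and callers hard-coded -EDQUOT, while dquot_alloc_block() returns an errno to propagate as-is. A hedged sketch of the resulting charge/undo pattern; demo_do_allocation is a hypothetical stand-in for ext2_new_blocks():

/* hypothetical allocator stub standing in for ext2_new_blocks() */
static int demo_do_allocation(struct inode *inode, unsigned long nr)
{
	return 0;
}

static int demo_charge_blocks(struct inode *inode, unsigned long nr)
{
	int ret = dquot_alloc_block(inode, nr);	/* 0 or -errno */

	if (ret)
		return ret;			/* e.g. -EDQUOT, unmodified */

	ret = demo_do_allocation(inode, nr);
	if (ret)
		dquot_free_block(inode, nr);	/* undo the quota charge */
	return ret;
}
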
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 6cde970b0a1a..7516957273ed 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -353,8 +353,8 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
353 * ext2_find_entry() 353 * ext2_find_entry()
354 * 354 *
355 * finds an entry in the specified directory with the wanted name. It 355 * finds an entry in the specified directory with the wanted name. It
356 * returns the page in which the entry was found, and the entry itself 356 * returns the page in which the entry was found (as a parameter - res_page),
357 * (as a parameter - res_dir). Page is returned mapped and unlocked. 357 * and the entry itself. Page is returned mapped and unlocked.
358 * Entry is guaranteed to be valid. 358 * Entry is guaranteed to be valid.
359 */ 359 */
360struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir, 360struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir,
@@ -721,5 +721,5 @@ const struct file_operations ext2_dir_operations = {
721#ifdef CONFIG_COMPAT 721#ifdef CONFIG_COMPAT
722 .compat_ioctl = ext2_compat_ioctl, 722 .compat_ioctl = ext2_compat_ioctl,
723#endif 723#endif
724 .fsync = simple_fsync, 724 .fsync = ext2_fsync,
725}; 725};
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 9a8a8e27a063..0b038e47ad2f 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -118,7 +118,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
118 118
119/* inode.c */ 119/* inode.c */
120extern struct inode *ext2_iget (struct super_block *, unsigned long); 120extern struct inode *ext2_iget (struct super_block *, unsigned long);
121extern int ext2_write_inode (struct inode *, int); 121extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_delete_inode (struct inode *); 122extern void ext2_delete_inode (struct inode *);
123extern int ext2_sync_inode (struct inode *); 123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
@@ -142,7 +142,7 @@ struct dentry *ext2_get_parent(struct dentry *child);
142/* super.c */ 142/* super.c */
143extern void ext2_error (struct super_block *, const char *, const char *, ...) 143extern void ext2_error (struct super_block *, const char *, const char *, ...)
144 __attribute__ ((format (printf, 3, 4))); 144 __attribute__ ((format (printf, 3, 4)));
145extern void ext2_warning (struct super_block *, const char *, const char *, ...) 145extern void ext2_msg(struct super_block *, const char *, const char *, ...)
146 __attribute__ ((format (printf, 3, 4))); 146 __attribute__ ((format (printf, 3, 4)));
147extern void ext2_update_dynamic_rev (struct super_block *sb); 147extern void ext2_update_dynamic_rev (struct super_block *sb);
148extern void ext2_write_super (struct super_block *); 148extern void ext2_write_super (struct super_block *);
@@ -155,6 +155,7 @@ extern void ext2_write_super (struct super_block *);
155extern const struct file_operations ext2_dir_operations; 155extern const struct file_operations ext2_dir_operations;
156 156
157/* file.c */ 157/* file.c */
158extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync);
158extern const struct inode_operations ext2_file_inode_operations; 159extern const struct inode_operations ext2_file_inode_operations;
159extern const struct file_operations ext2_file_operations; 160extern const struct file_operations ext2_file_operations;
160extern const struct file_operations ext2_xip_file_operations; 161extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a2f3afd1a1c1..5d198d0697fb 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -19,6 +19,8 @@
19 */ 19 */
20 20
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/pagemap.h>
23#include <linux/quotaops.h>
22#include "ext2.h" 24#include "ext2.h"
23#include "xattr.h" 25#include "xattr.h"
24#include "acl.h" 26#include "acl.h"
@@ -38,6 +40,22 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
38 return 0; 40 return 0;
39} 41}
40 42
43int ext2_fsync(struct file *file, struct dentry *dentry, int datasync)
44{
45 int ret;
46 struct super_block *sb = dentry->d_inode->i_sb;
47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
48
49 ret = simple_fsync(file, dentry, datasync);
50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
51 /* We don't really know where the IO error happened... */
52 ext2_error(sb, __func__,
53 "detected IO error when writing metadata buffers");
54 ret = -EIO;
55 }
56 return ret;
57}
58
41/* 59/*
42 * We have mostly NULL's here: the current defaults are ok for 60 * We have mostly NULL's here: the current defaults are ok for
43 * the ext2 filesystem. 61 * the ext2 filesystem.
@@ -53,9 +71,9 @@ const struct file_operations ext2_file_operations = {
53 .compat_ioctl = ext2_compat_ioctl, 71 .compat_ioctl = ext2_compat_ioctl,
54#endif 72#endif
55 .mmap = generic_file_mmap, 73 .mmap = generic_file_mmap,
56 .open = generic_file_open, 74 .open = dquot_file_open,
57 .release = ext2_release_file, 75 .release = ext2_release_file,
58 .fsync = simple_fsync, 76 .fsync = ext2_fsync,
59 .splice_read = generic_file_splice_read, 77 .splice_read = generic_file_splice_read,
60 .splice_write = generic_file_splice_write, 78 .splice_write = generic_file_splice_write,
61}; 79};
@@ -70,9 +88,9 @@ const struct file_operations ext2_xip_file_operations = {
70 .compat_ioctl = ext2_compat_ioctl, 88 .compat_ioctl = ext2_compat_ioctl,
71#endif 89#endif
72 .mmap = xip_file_mmap, 90 .mmap = xip_file_mmap,
73 .open = generic_file_open, 91 .open = dquot_file_open,
74 .release = ext2_release_file, 92 .release = ext2_release_file,
75 .fsync = simple_fsync, 93 .fsync = ext2_fsync,
76}; 94};
77#endif 95#endif
78 96
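
From userspace, the new ext2_fsync means a failed metadata writeback is no longer silently dropped: the AS_EIO bit on the block device mapping is folded into the fsync(2) return value. An illustrative userspace check (the path is hypothetical):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ext2/file", O_RDWR);

	if (fd < 0)
		return 1;
	if (fsync(fd) < 0 && errno == EIO)
		fprintf(stderr, "ext2 reported a metadata write error\n");
	close(fd);
	return 0;
}
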
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 15387c9c17d8..ad7d572ee8dc 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode)
121 if (!is_bad_inode(inode)) { 121 if (!is_bad_inode(inode)) {
122 /* Quota is already initialized in iput() */ 122 /* Quota is already initialized in iput() */
123 ext2_xattr_delete_inode(inode); 123 ext2_xattr_delete_inode(inode);
124 vfs_dq_free_inode(inode); 124 dquot_free_inode(inode);
125 vfs_dq_drop(inode); 125 dquot_drop(inode);
126 } 126 }
127 127
128 es = EXT2_SB(sb)->s_es; 128 es = EXT2_SB(sb)->s_es;
@@ -586,10 +586,10 @@ got:
586 goto fail_drop; 586 goto fail_drop;
587 } 587 }
588 588
589 if (vfs_dq_alloc_inode(inode)) { 589 dquot_initialize(inode);
590 err = -EDQUOT; 590 err = dquot_alloc_inode(inode);
591 if (err)
591 goto fail_drop; 592 goto fail_drop;
592 }
593 593
594 err = ext2_init_acl(inode, dir); 594 err = ext2_init_acl(inode, dir);
595 if (err) 595 if (err)
@@ -605,10 +605,10 @@ got:
605 return inode; 605 return inode;
606 606
607fail_free_drop: 607fail_free_drop:
608 vfs_dq_free_inode(inode); 608 dquot_free_inode(inode);
609 609
610fail_drop: 610fail_drop:
611 vfs_dq_drop(inode); 611 dquot_drop(inode);
612 inode->i_flags |= S_NOQUOTA; 612 inode->i_flags |= S_NOQUOTA;
613 inode->i_nlink = 0; 613 inode->i_nlink = 0;
614 unlock_new_inode(inode); 614 unlock_new_inode(inode);
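
The ialloc change above follows the new ordering rule: attach quota structures with dquot_initialize() before charging the inode, and propagate the errno from dquot_alloc_inode() instead of assuming -EDQUOT. A minimal sketch of that sequence (demo_charge_new_inode is illustrative):

static int demo_charge_new_inode(struct inode *inode)
{
	int err;

	dquot_initialize(inode);	/* attach dquots first */
	err = dquot_alloc_inode(inode);	/* 0 or -errno, e.g. -EDQUOT */
	if (err)
		dquot_drop(inode);	/* undo on failure */
	return err;
}
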
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index ade634076d0a..fc13cc119aad 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others");
41MODULE_DESCRIPTION("Second Extended Filesystem"); 41MODULE_DESCRIPTION("Second Extended Filesystem");
42MODULE_LICENSE("GPL"); 42MODULE_LICENSE("GPL");
43 43
44static int __ext2_write_inode(struct inode *inode, int do_sync);
45
44/* 46/*
45 * Test whether an inode is a fast symlink. 47 * Test whether an inode is a fast symlink.
46 */ 48 */
@@ -58,13 +60,15 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
58 */ 60 */
59void ext2_delete_inode (struct inode * inode) 61void ext2_delete_inode (struct inode * inode)
60{ 62{
63 if (!is_bad_inode(inode))
64 dquot_initialize(inode);
61 truncate_inode_pages(&inode->i_data, 0); 65 truncate_inode_pages(&inode->i_data, 0);
62 66
63 if (is_bad_inode(inode)) 67 if (is_bad_inode(inode))
64 goto no_delete; 68 goto no_delete;
65 EXT2_I(inode)->i_dtime = get_seconds(); 69 EXT2_I(inode)->i_dtime = get_seconds();
66 mark_inode_dirty(inode); 70 mark_inode_dirty(inode);
67 ext2_write_inode(inode, inode_needs_sync(inode)); 71 __ext2_write_inode(inode, inode_needs_sync(inode));
68 72
69 inode->i_size = 0; 73 inode->i_size = 0;
70 if (inode->i_blocks) 74 if (inode->i_blocks)
@@ -137,7 +141,8 @@ static int ext2_block_to_path(struct inode *inode,
137 int final = 0; 141 int final = 0;
138 142
139 if (i_block < 0) { 143 if (i_block < 0) {
140 ext2_warning (inode->i_sb, "ext2_block_to_path", "block < 0"); 144 ext2_msg(inode->i_sb, KERN_WARNING,
145 "warning: %s: block < 0", __func__);
141 } else if (i_block < direct_blocks) { 146 } else if (i_block < direct_blocks) {
142 offsets[n++] = i_block; 147 offsets[n++] = i_block;
143 final = direct_blocks; 148 final = direct_blocks;
@@ -157,7 +162,8 @@ static int ext2_block_to_path(struct inode *inode,
157 offsets[n++] = i_block & (ptrs - 1); 162 offsets[n++] = i_block & (ptrs - 1);
158 final = ptrs; 163 final = ptrs;
159 } else { 164 } else {
160 ext2_warning (inode->i_sb, "ext2_block_to_path", "block > big"); 165 ext2_msg(inode->i_sb, KERN_WARNING,
166 "warning: %s: block is too big", __func__);
161 } 167 }
162 if (boundary) 168 if (boundary)
163 *boundary = final - 1 - (i_block & (ptrs - 1)); 169 *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -1333,7 +1339,7 @@ bad_inode:
1333 return ERR_PTR(ret); 1339 return ERR_PTR(ret);
1334} 1340}
1335 1341
1336int ext2_write_inode(struct inode *inode, int do_sync) 1342static int __ext2_write_inode(struct inode *inode, int do_sync)
1337{ 1343{
1338 struct ext2_inode_info *ei = EXT2_I(inode); 1344 struct ext2_inode_info *ei = EXT2_I(inode);
1339 struct super_block *sb = inode->i_sb; 1345 struct super_block *sb = inode->i_sb;
@@ -1438,6 +1444,11 @@ int ext2_write_inode(struct inode *inode, int do_sync)
1438 return err; 1444 return err;
1439} 1445}
1440 1446
1447int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1448{
1449 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1450}
1451
1441int ext2_sync_inode(struct inode *inode) 1452int ext2_sync_inode(struct inode *inode)
1442{ 1453{
1443 struct writeback_control wbc = { 1454 struct writeback_control wbc = {
@@ -1455,9 +1466,12 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1455 error = inode_change_ok(inode, iattr); 1466 error = inode_change_ok(inode, iattr);
1456 if (error) 1467 if (error)
1457 return error; 1468 return error;
1469
1470 if (iattr->ia_valid & ATTR_SIZE)
1471 dquot_initialize(inode);
1458 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 1472 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
1459 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 1473 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
1460 error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0; 1474 error = dquot_transfer(inode, iattr);
1461 if (error) 1475 if (error)
1462 return error; 1476 return error;
1463 } 1477 }
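
With ->write_inode now taking a writeback_control, internal callers that need a guaranteed synchronous write construct one with WB_SYNC_ALL, as ext2_sync_inode does. A minimal sketch under that assumption:

static int demo_sync_inode(struct inode *inode)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 0,	/* metadata-only */
	};

	return sync_inode(inode, &wbc);
}
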
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index dd7175ce5606..71efb0e9a3f2 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/pagemap.h> 33#include <linux/pagemap.h>
34#include <linux/quotaops.h>
34#include "ext2.h" 35#include "ext2.h"
35#include "xattr.h" 36#include "xattr.h"
36#include "acl.h" 37#include "acl.h"
@@ -99,24 +100,27 @@ struct dentry *ext2_get_parent(struct dentry *child)
99 */ 100 */
100static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) 101static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd)
101{ 102{
102 struct inode * inode = ext2_new_inode (dir, mode); 103 struct inode *inode;
103 int err = PTR_ERR(inode); 104
104 if (!IS_ERR(inode)) { 105 dquot_initialize(dir);
105 inode->i_op = &ext2_file_inode_operations; 106
106 if (ext2_use_xip(inode->i_sb)) { 107 inode = ext2_new_inode(dir, mode);
107 inode->i_mapping->a_ops = &ext2_aops_xip; 108 if (IS_ERR(inode))
108 inode->i_fop = &ext2_xip_file_operations; 109 return PTR_ERR(inode);
109 } else if (test_opt(inode->i_sb, NOBH)) { 110
110 inode->i_mapping->a_ops = &ext2_nobh_aops; 111 inode->i_op = &ext2_file_inode_operations;
111 inode->i_fop = &ext2_file_operations; 112 if (ext2_use_xip(inode->i_sb)) {
112 } else { 113 inode->i_mapping->a_ops = &ext2_aops_xip;
113 inode->i_mapping->a_ops = &ext2_aops; 114 inode->i_fop = &ext2_xip_file_operations;
114 inode->i_fop = &ext2_file_operations; 115 } else if (test_opt(inode->i_sb, NOBH)) {
115 } 116 inode->i_mapping->a_ops = &ext2_nobh_aops;
116 mark_inode_dirty(inode); 117 inode->i_fop = &ext2_file_operations;
117 err = ext2_add_nondir(dentry, inode); 118 } else {
119 inode->i_mapping->a_ops = &ext2_aops;
120 inode->i_fop = &ext2_file_operations;
118 } 121 }
119 return err; 122 mark_inode_dirty(inode);
123 return ext2_add_nondir(dentry, inode);
120} 124}
121 125
122static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) 126static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
@@ -127,6 +131,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
127 if (!new_valid_dev(rdev)) 131 if (!new_valid_dev(rdev))
128 return -EINVAL; 132 return -EINVAL;
129 133
134 dquot_initialize(dir);
135
130 inode = ext2_new_inode (dir, mode); 136 inode = ext2_new_inode (dir, mode);
131 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
132 if (!IS_ERR(inode)) { 138 if (!IS_ERR(inode)) {
@@ -151,6 +157,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
151 if (l > sb->s_blocksize) 157 if (l > sb->s_blocksize)
152 goto out; 158 goto out;
153 159
160 dquot_initialize(dir);
161
154 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); 162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO);
155 err = PTR_ERR(inode); 163 err = PTR_ERR(inode);
156 if (IS_ERR(inode)) 164 if (IS_ERR(inode))
@@ -194,6 +202,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
194 if (inode->i_nlink >= EXT2_LINK_MAX) 202 if (inode->i_nlink >= EXT2_LINK_MAX)
195 return -EMLINK; 203 return -EMLINK;
196 204
205 dquot_initialize(dir);
206
197 inode->i_ctime = CURRENT_TIME_SEC; 207 inode->i_ctime = CURRENT_TIME_SEC;
198 inode_inc_link_count(inode); 208 inode_inc_link_count(inode);
199 atomic_inc(&inode->i_count); 209 atomic_inc(&inode->i_count);
@@ -216,6 +226,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
216 if (dir->i_nlink >= EXT2_LINK_MAX) 226 if (dir->i_nlink >= EXT2_LINK_MAX)
217 goto out; 227 goto out;
218 228
229 dquot_initialize(dir);
230
219 inode_inc_link_count(dir); 231 inode_inc_link_count(dir);
220 232
221 inode = ext2_new_inode (dir, S_IFDIR | mode); 233 inode = ext2_new_inode (dir, S_IFDIR | mode);
@@ -262,6 +274,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
262 struct page * page; 274 struct page * page;
263 int err = -ENOENT; 275 int err = -ENOENT;
264 276
277 dquot_initialize(dir);
278
265 de = ext2_find_entry (dir, &dentry->d_name, &page); 279 de = ext2_find_entry (dir, &dentry->d_name, &page);
266 if (!de) 280 if (!de)
267 goto out; 281 goto out;
@@ -304,6 +318,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
304 struct ext2_dir_entry_2 * old_de; 318 struct ext2_dir_entry_2 * old_de;
305 int err = -ENOENT; 319 int err = -ENOENT;
306 320
321 dquot_initialize(old_dir);
322 dquot_initialize(new_dir);
323
307 old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); 324 old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
308 if (!old_de) 325 if (!old_de)
309 goto out; 326 goto out;
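
The namei changes above all apply one rule: any operation that may charge or release quota against a directory calls dquot_initialize() on it before the first modification, and rename does so for both directories. A sketch of the rename case (demo_rename_like is illustrative):

static int demo_rename_like(struct inode *old_dir, struct inode *new_dir)
{
	/* both directories may gain or lose blocks, so both need
	 * their quota structures attached up front */
	dquot_initialize(old_dir);
	dquot_initialize(new_dir);

	/* ... perform the rename proper ... */
	return 0;
}
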
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1a9ffee47d56..42e4a303b675 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -58,27 +58,27 @@ void ext2_error (struct super_block * sb, const char * function,
58 } 58 }
59 59
60 va_start(args, fmt); 60 va_start(args, fmt);
61 printk(KERN_CRIT "EXT2-fs error (device %s): %s: ",sb->s_id, function); 61 printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function);
62 vprintk(fmt, args); 62 vprintk(fmt, args);
63 printk("\n"); 63 printk("\n");
64 va_end(args); 64 va_end(args);
65 65
66 if (test_opt(sb, ERRORS_PANIC)) 66 if (test_opt(sb, ERRORS_PANIC))
67 panic("EXT2-fs panic from previous error\n"); 67 panic("EXT2-fs: panic from previous error\n");
68 if (test_opt(sb, ERRORS_RO)) { 68 if (test_opt(sb, ERRORS_RO)) {
69 printk("Remounting filesystem read-only\n"); 69 ext2_msg(sb, KERN_CRIT,
70 "error: remounting filesystem read-only");
70 sb->s_flags |= MS_RDONLY; 71 sb->s_flags |= MS_RDONLY;
71 } 72 }
72} 73}
73 74
74void ext2_warning (struct super_block * sb, const char * function, 75void ext2_msg(struct super_block *sb, const char *prefix,
75 const char * fmt, ...) 76 const char *fmt, ...)
76{ 77{
77 va_list args; 78 va_list args;
78 79
79 va_start(args, fmt); 80 va_start(args, fmt);
80 printk(KERN_WARNING "EXT2-fs warning (device %s): %s: ", 81 printk("%sEXT2-fs (%s): ", prefix, sb->s_id);
81 sb->s_id, function);
82 vprintk(fmt, args); 82 vprintk(fmt, args);
83 printk("\n"); 83 printk("\n");
84 va_end(args); 84 va_end(args);
@@ -91,9 +91,9 @@ void ext2_update_dynamic_rev(struct super_block *sb)
91 if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV) 91 if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV)
92 return; 92 return;
93 93
94 ext2_warning(sb, __func__, 94 ext2_msg(sb, KERN_WARNING,
95 "updating to rev %d because of new feature flag, " 95 "warning: updating to rev %d because of "
96 "running e2fsck is recommended", 96 "new feature flag, running e2fsck is recommended",
97 EXT2_DYNAMIC_REV); 97 EXT2_DYNAMIC_REV);
98 98
99 es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO); 99 es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO);
@@ -194,6 +194,8 @@ static void destroy_inodecache(void)
194static void ext2_clear_inode(struct inode *inode) 194static void ext2_clear_inode(struct inode *inode)
195{ 195{
196 struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info; 196 struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
197
198 dquot_drop(inode);
197 ext2_discard_reservation(inode); 199 ext2_discard_reservation(inode);
198 EXT2_I(inode)->i_block_alloc_info = NULL; 200 EXT2_I(inode)->i_block_alloc_info = NULL;
199 if (unlikely(rsv)) 201 if (unlikely(rsv))
@@ -419,10 +421,10 @@ static const match_table_t tokens = {
419 {Opt_err, NULL} 421 {Opt_err, NULL}
420}; 422};
421 423
422static int parse_options (char * options, 424static int parse_options(char *options, struct super_block *sb)
423 struct ext2_sb_info *sbi)
424{ 425{
425 char * p; 426 char *p;
427 struct ext2_sb_info *sbi = EXT2_SB(sb);
426 substring_t args[MAX_OPT_ARGS]; 428 substring_t args[MAX_OPT_ARGS];
427 int option; 429 int option;
428 430
@@ -505,7 +507,8 @@ static int parse_options (char * options,
505#else 507#else
506 case Opt_user_xattr: 508 case Opt_user_xattr:
507 case Opt_nouser_xattr: 509 case Opt_nouser_xattr:
508 printk("EXT2 (no)user_xattr options not supported\n"); 510 ext2_msg(sb, KERN_INFO, "(no)user_xattr options"
511 "not supported");
509 break; 512 break;
510#endif 513#endif
511#ifdef CONFIG_EXT2_FS_POSIX_ACL 514#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -518,14 +521,15 @@ static int parse_options (char * options,
518#else 521#else
519 case Opt_acl: 522 case Opt_acl:
520 case Opt_noacl: 523 case Opt_noacl:
521 printk("EXT2 (no)acl options not supported\n"); 524 ext2_msg(sb, KERN_INFO,
525 "(no)acl options not supported");
522 break; 526 break;
523#endif 527#endif
524 case Opt_xip: 528 case Opt_xip:
525#ifdef CONFIG_EXT2_FS_XIP 529#ifdef CONFIG_EXT2_FS_XIP
526 set_opt (sbi->s_mount_opt, XIP); 530 set_opt (sbi->s_mount_opt, XIP);
527#else 531#else
528 printk("EXT2 xip option not supported\n"); 532 ext2_msg(sb, KERN_INFO, "xip option not supported");
529#endif 533#endif
530 break; 534 break;
531 535
@@ -542,19 +546,18 @@ static int parse_options (char * options,
542 case Opt_quota: 546 case Opt_quota:
543 case Opt_usrquota: 547 case Opt_usrquota:
544 case Opt_grpquota: 548 case Opt_grpquota:
545 printk(KERN_ERR 549 ext2_msg(sb, KERN_INFO,
546 "EXT2-fs: quota operations not supported.\n"); 550 "quota operations not supported");
547
548 break; 551 break;
549#endif 552#endif
550 553
551 case Opt_reservation: 554 case Opt_reservation:
552 set_opt(sbi->s_mount_opt, RESERVATION); 555 set_opt(sbi->s_mount_opt, RESERVATION);
553 printk("reservations ON\n"); 556 ext2_msg(sb, KERN_INFO, "reservations ON");
554 break; 557 break;
555 case Opt_noreservation: 558 case Opt_noreservation:
556 clear_opt(sbi->s_mount_opt, RESERVATION); 559 clear_opt(sbi->s_mount_opt, RESERVATION);
557 printk("reservations OFF\n"); 560 ext2_msg(sb, KERN_INFO, "reservations OFF");
558 break; 561 break;
559 case Opt_ignore: 562 case Opt_ignore:
560 break; 563 break;
@@ -573,34 +576,40 @@ static int ext2_setup_super (struct super_block * sb,
573 struct ext2_sb_info *sbi = EXT2_SB(sb); 576 struct ext2_sb_info *sbi = EXT2_SB(sb);
574 577
575 if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) { 578 if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) {
576 printk ("EXT2-fs warning: revision level too high, " 579 ext2_msg(sb, KERN_ERR,
577 "forcing read-only mode\n"); 580 "error: revision level too high, "
581 "forcing read-only mode");
578 res = MS_RDONLY; 582 res = MS_RDONLY;
579 } 583 }
580 if (read_only) 584 if (read_only)
581 return res; 585 return res;
582 if (!(sbi->s_mount_state & EXT2_VALID_FS)) 586 if (!(sbi->s_mount_state & EXT2_VALID_FS))
583 printk ("EXT2-fs warning: mounting unchecked fs, " 587 ext2_msg(sb, KERN_WARNING,
584 "running e2fsck is recommended\n"); 588 "warning: mounting unchecked fs, "
589 "running e2fsck is recommended");
585 else if ((sbi->s_mount_state & EXT2_ERROR_FS)) 590 else if ((sbi->s_mount_state & EXT2_ERROR_FS))
586 printk ("EXT2-fs warning: mounting fs with errors, " 591 ext2_msg(sb, KERN_WARNING,
587 "running e2fsck is recommended\n"); 592 "warning: mounting fs with errors, "
593 "running e2fsck is recommended");
588 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 594 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
589 le16_to_cpu(es->s_mnt_count) >= 595 le16_to_cpu(es->s_mnt_count) >=
590 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 596 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
591 printk ("EXT2-fs warning: maximal mount count reached, " 597 ext2_msg(sb, KERN_WARNING,
592 "running e2fsck is recommended\n"); 598 "warning: maximal mount count reached, "
599 "running e2fsck is recommended");
593 else if (le32_to_cpu(es->s_checkinterval) && 600 else if (le32_to_cpu(es->s_checkinterval) &&
594 (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) 601 (le32_to_cpu(es->s_lastcheck) +
595 printk ("EXT2-fs warning: checktime reached, " 602 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
596 "running e2fsck is recommended\n"); 603 ext2_msg(sb, KERN_WARNING,
604 "warning: checktime reached, "
605 "running e2fsck is recommended");
597 if (!le16_to_cpu(es->s_max_mnt_count)) 606 if (!le16_to_cpu(es->s_max_mnt_count))
598 es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); 607 es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
599 le16_add_cpu(&es->s_mnt_count, 1); 608 le16_add_cpu(&es->s_mnt_count, 1);
600 ext2_write_super(sb); 609 ext2_write_super(sb);
601 if (test_opt (sb, DEBUG)) 610 if (test_opt (sb, DEBUG))
602 printk ("[EXT II FS %s, %s, bs=%lu, fs=%lu, gc=%lu, " 611 ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
603 "bpg=%lu, ipg=%lu, mo=%04lx]\n", 612 "bpg=%lu, ipg=%lu, mo=%04lx]",
604 EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize, 613 EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize,
605 sbi->s_frag_size, 614 sbi->s_frag_size,
606 sbi->s_groups_count, 615 sbi->s_groups_count,
@@ -767,7 +776,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
767 */ 776 */
768 blocksize = sb_min_blocksize(sb, BLOCK_SIZE); 777 blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
769 if (!blocksize) { 778 if (!blocksize) {
770 printk ("EXT2-fs: unable to set blocksize\n"); 779 ext2_msg(sb, KERN_ERR, "error: unable to set blocksize");
771 goto failed_sbi; 780 goto failed_sbi;
772 } 781 }
773 782
@@ -783,7 +792,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
783 } 792 }
784 793
785 if (!(bh = sb_bread(sb, logic_sb_block))) { 794 if (!(bh = sb_bread(sb, logic_sb_block))) {
786 printk ("EXT2-fs: unable to read superblock\n"); 795 ext2_msg(sb, KERN_ERR, "error: unable to read superblock");
787 goto failed_sbi; 796 goto failed_sbi;
788 } 797 }
789 /* 798 /*
@@ -826,7 +835,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
826 835
827 set_opt(sbi->s_mount_opt, RESERVATION); 836 set_opt(sbi->s_mount_opt, RESERVATION);
828 837
829 if (!parse_options ((char *) data, sbi)) 838 if (!parse_options((char *) data, sb))
830 goto failed_mount; 839 goto failed_mount;
831 840
832 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 841 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -840,8 +849,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
840 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || 849 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
841 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || 850 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
842 EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U))) 851 EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U)))
843 printk("EXT2-fs warning: feature flags set on rev 0 fs, " 852 ext2_msg(sb, KERN_WARNING,
844 "running e2fsck is recommended\n"); 853 "warning: feature flags set on rev 0 fs, "
854 "running e2fsck is recommended");
845 /* 855 /*
846 * Check feature flags regardless of the revision level, since we 856 * Check feature flags regardless of the revision level, since we
847 * previously didn't change the revision level when setting the flags, 857 * previously didn't change the revision level when setting the flags,
@@ -849,16 +859,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
849 */ 859 */
850 features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP); 860 features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP);
851 if (features) { 861 if (features) {
852 printk("EXT2-fs: %s: couldn't mount because of " 862 ext2_msg(sb, KERN_ERR, "error: couldn't mount because of "
853 "unsupported optional features (%x).\n", 863 "unsupported optional features (%x)",
854 sb->s_id, le32_to_cpu(features)); 864 le32_to_cpu(features));
855 goto failed_mount; 865 goto failed_mount;
856 } 866 }
857 if (!(sb->s_flags & MS_RDONLY) && 867 if (!(sb->s_flags & MS_RDONLY) &&
858 (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){ 868 (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){
859 printk("EXT2-fs: %s: couldn't mount RDWR because of " 869 ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of "
860 "unsupported optional features (%x).\n", 870 "unsupported optional features (%x)",
861 sb->s_id, le32_to_cpu(features)); 871 le32_to_cpu(features));
862 goto failed_mount; 872 goto failed_mount;
863 } 873 }
864 874
@@ -866,7 +876,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
866 876
867 if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { 877 if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) {
868 if (!silent) 878 if (!silent)
869 printk("XIP: Unsupported blocksize\n"); 879 ext2_msg(sb, KERN_ERR,
880 "error: unsupported blocksize for xip");
870 goto failed_mount; 881 goto failed_mount;
871 } 882 }
872 883
@@ -875,7 +886,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
875 brelse(bh); 886 brelse(bh);
876 887
877 if (!sb_set_blocksize(sb, blocksize)) { 888 if (!sb_set_blocksize(sb, blocksize)) {
878 printk(KERN_ERR "EXT2-fs: blocksize too small for device.\n"); 889 ext2_msg(sb, KERN_ERR, "error: blocksize is too small");
879 goto failed_sbi; 890 goto failed_sbi;
880 } 891 }
881 892
@@ -883,14 +894,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
883 offset = (sb_block*BLOCK_SIZE) % blocksize; 894 offset = (sb_block*BLOCK_SIZE) % blocksize;
884 bh = sb_bread(sb, logic_sb_block); 895 bh = sb_bread(sb, logic_sb_block);
885 if(!bh) { 896 if(!bh) {
886 printk("EXT2-fs: Couldn't read superblock on " 897 ext2_msg(sb, KERN_ERR, "error: couldn't read "
887 "2nd try.\n"); 898 "superblock on 2nd try");
888 goto failed_sbi; 899 goto failed_sbi;
889 } 900 }
890 es = (struct ext2_super_block *) (((char *)bh->b_data) + offset); 901 es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
891 sbi->s_es = es; 902 sbi->s_es = es;
892 if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) { 903 if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) {
893 printk ("EXT2-fs: Magic mismatch, very weird !\n"); 904 ext2_msg(sb, KERN_ERR, "error: magic mismatch");
894 goto failed_mount; 905 goto failed_mount;
895 } 906 }
896 } 907 }
@@ -906,7 +917,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
906 if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) || 917 if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) ||
907 !is_power_of_2(sbi->s_inode_size) || 918 !is_power_of_2(sbi->s_inode_size) ||
908 (sbi->s_inode_size > blocksize)) { 919 (sbi->s_inode_size > blocksize)) {
909 printk ("EXT2-fs: unsupported inode size: %d\n", 920 ext2_msg(sb, KERN_ERR,
921 "error: unsupported inode size: %d",
910 sbi->s_inode_size); 922 sbi->s_inode_size);
911 goto failed_mount; 923 goto failed_mount;
912 } 924 }
@@ -943,29 +955,33 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
943 955
944 if (sb->s_blocksize != bh->b_size) { 956 if (sb->s_blocksize != bh->b_size) {
945 if (!silent) 957 if (!silent)
946 printk ("VFS: Unsupported blocksize on dev " 958 ext2_msg(sb, KERN_ERR, "error: unsupported blocksize");
947 "%s.\n", sb->s_id);
948 goto failed_mount; 959 goto failed_mount;
949 } 960 }
950 961
951 if (sb->s_blocksize != sbi->s_frag_size) { 962 if (sb->s_blocksize != sbi->s_frag_size) {
952 printk ("EXT2-fs: fragsize %lu != blocksize %lu (not supported yet)\n", 963 ext2_msg(sb, KERN_ERR,
964 "error: fragsize %lu != blocksize %lu"
965 "(not supported yet)",
953 sbi->s_frag_size, sb->s_blocksize); 966 sbi->s_frag_size, sb->s_blocksize);
954 goto failed_mount; 967 goto failed_mount;
955 } 968 }
956 969
957 if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { 970 if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
958 printk ("EXT2-fs: #blocks per group too big: %lu\n", 971 ext2_msg(sb, KERN_ERR,
972 "error: #blocks per group too big: %lu",
959 sbi->s_blocks_per_group); 973 sbi->s_blocks_per_group);
960 goto failed_mount; 974 goto failed_mount;
961 } 975 }
962 if (sbi->s_frags_per_group > sb->s_blocksize * 8) { 976 if (sbi->s_frags_per_group > sb->s_blocksize * 8) {
963 printk ("EXT2-fs: #fragments per group too big: %lu\n", 977 ext2_msg(sb, KERN_ERR,
978 "error: #fragments per group too big: %lu",
964 sbi->s_frags_per_group); 979 sbi->s_frags_per_group);
965 goto failed_mount; 980 goto failed_mount;
966 } 981 }
967 if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { 982 if (sbi->s_inodes_per_group > sb->s_blocksize * 8) {
968 printk ("EXT2-fs: #inodes per group too big: %lu\n", 983 ext2_msg(sb, KERN_ERR,
984 "error: #inodes per group too big: %lu",
969 sbi->s_inodes_per_group); 985 sbi->s_inodes_per_group);
970 goto failed_mount; 986 goto failed_mount;
971 } 987 }
@@ -979,13 +995,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
979 EXT2_DESC_PER_BLOCK(sb); 995 EXT2_DESC_PER_BLOCK(sb);
980 sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL); 996 sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
981 if (sbi->s_group_desc == NULL) { 997 if (sbi->s_group_desc == NULL) {
982 printk ("EXT2-fs: not enough memory\n"); 998 ext2_msg(sb, KERN_ERR, "error: not enough memory");
983 goto failed_mount; 999 goto failed_mount;
984 } 1000 }
985 bgl_lock_init(sbi->s_blockgroup_lock); 1001 bgl_lock_init(sbi->s_blockgroup_lock);
986 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); 1002 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
987 if (!sbi->s_debts) { 1003 if (!sbi->s_debts) {
988 printk ("EXT2-fs: not enough memory\n"); 1004 ext2_msg(sb, KERN_ERR, "error: not enough memory");
989 goto failed_mount_group_desc; 1005 goto failed_mount_group_desc;
990 } 1006 }
991 for (i = 0; i < db_count; i++) { 1007 for (i = 0; i < db_count; i++) {
@@ -994,12 +1010,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
994 if (!sbi->s_group_desc[i]) { 1010 if (!sbi->s_group_desc[i]) {
995 for (j = 0; j < i; j++) 1011 for (j = 0; j < i; j++)
996 brelse (sbi->s_group_desc[j]); 1012 brelse (sbi->s_group_desc[j]);
997 printk ("EXT2-fs: unable to read group descriptors\n"); 1013 ext2_msg(sb, KERN_ERR,
1014 "error: unable to read group descriptors");
998 goto failed_mount_group_desc; 1015 goto failed_mount_group_desc;
999 } 1016 }
1000 } 1017 }
1001 if (!ext2_check_descriptors (sb)) { 1018 if (!ext2_check_descriptors (sb)) {
1002 printk ("EXT2-fs: group descriptors corrupted!\n"); 1019 ext2_msg(sb, KERN_ERR, "group descriptors corrupted");
1003 goto failed_mount2; 1020 goto failed_mount2;
1004 } 1021 }
1005 sbi->s_gdb_count = db_count; 1022 sbi->s_gdb_count = db_count;
@@ -1032,7 +1049,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1032 ext2_count_dirs(sb)); 1049 ext2_count_dirs(sb));
1033 } 1050 }
1034 if (err) { 1051 if (err) {
1035 printk(KERN_ERR "EXT2-fs: insufficient memory\n"); 1052 ext2_msg(sb, KERN_ERR, "error: insufficient memory");
1036 goto failed_mount3; 1053 goto failed_mount3;
1037 } 1054 }
1038 /* 1055 /*
@@ -1048,27 +1065,28 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1048 } 1065 }
1049 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 1066 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1050 iput(root); 1067 iput(root);
1051 printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n"); 1068 ext2_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
1052 goto failed_mount3; 1069 goto failed_mount3;
1053 } 1070 }
1054 1071
1055 sb->s_root = d_alloc_root(root); 1072 sb->s_root = d_alloc_root(root);
1056 if (!sb->s_root) { 1073 if (!sb->s_root) {
1057 iput(root); 1074 iput(root);
1058 printk(KERN_ERR "EXT2-fs: get root inode failed\n"); 1075 ext2_msg(sb, KERN_ERR, "error: get root inode failed");
1059 ret = -ENOMEM; 1076 ret = -ENOMEM;
1060 goto failed_mount3; 1077 goto failed_mount3;
1061 } 1078 }
1062 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) 1079 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
1063 ext2_warning(sb, __func__, 1080 ext2_msg(sb, KERN_WARNING,
1064 "mounting ext3 filesystem as ext2"); 1081 "warning: mounting ext3 filesystem as ext2");
1065 ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY); 1082 ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1066 return 0; 1083 return 0;
1067 1084
1068cantfind_ext2: 1085cantfind_ext2:
1069 if (!silent) 1086 if (!silent)
1070 printk("VFS: Can't find an ext2 filesystem on dev %s.\n", 1087 ext2_msg(sb, KERN_ERR,
1071 sb->s_id); 1088 "error: can't find an ext2 filesystem on dev %s.",
1089 sb->s_id);
1072 goto failed_mount; 1090 goto failed_mount;
1073failed_mount3: 1091failed_mount3:
1074 percpu_counter_destroy(&sbi->s_freeblocks_counter); 1092 percpu_counter_destroy(&sbi->s_freeblocks_counter);
@@ -1089,9 +1107,30 @@ failed_sbi:
1089 return ret; 1107 return ret;
1090} 1108}
1091 1109
1110static void ext2_clear_super_error(struct super_block *sb)
1111{
1112 struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
1113
1114 if (buffer_write_io_error(sbh)) {
1115 /*
1116 * Oh, dear. A previous attempt to write the
1117 * superblock failed. This could happen because the
1118 * USB device was yanked out. Or it could happen to
1119 * be a transient write error and maybe the block will
1120 * be remapped. Nothing we can do but to retry the
1121 * write and hope for the best.
1122 */
1123 printk(KERN_ERR "EXT2-fs: %s previous I/O error to "
1124 "superblock detected", sb->s_id);
1125 clear_buffer_write_io_error(sbh);
1126 set_buffer_uptodate(sbh);
1127 }
1128}
1129
1092static void ext2_commit_super (struct super_block * sb, 1130static void ext2_commit_super (struct super_block * sb,
1093 struct ext2_super_block * es) 1131 struct ext2_super_block * es)
1094{ 1132{
1133 ext2_clear_super_error(sb);
1095 es->s_wtime = cpu_to_le32(get_seconds()); 1134 es->s_wtime = cpu_to_le32(get_seconds());
1096 mark_buffer_dirty(EXT2_SB(sb)->s_sbh); 1135 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
1097 sb->s_dirt = 0; 1136 sb->s_dirt = 0;
@@ -1099,6 +1138,7 @@ static void ext2_commit_super (struct super_block * sb,
1099 1138
1100static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) 1139static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
1101{ 1140{
1141 ext2_clear_super_error(sb);
1102 es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); 1142 es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
1103 es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); 1143 es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
1104 es->s_wtime = cpu_to_le32(get_seconds()); 1144 es->s_wtime = cpu_to_le32(get_seconds());
@@ -1121,8 +1161,24 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
1121static int ext2_sync_fs(struct super_block *sb, int wait) 1161static int ext2_sync_fs(struct super_block *sb, int wait)
1122{ 1162{
1123 struct ext2_super_block *es = EXT2_SB(sb)->s_es; 1163 struct ext2_super_block *es = EXT2_SB(sb)->s_es;
1164 struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
1124 1165
1125 lock_kernel(); 1166 lock_kernel();
1167 if (buffer_write_io_error(sbh)) {
1168 /*
1169 * Oh, dear. A previous attempt to write the
1170 * superblock failed. This could happen because the
1171 * USB device was yanked out. Or it could happen to
1172 * be a transient write error and maybe the block will
1173 * be remapped. Nothing we can do but to retry the
1174 * write and hope for the best.
1175 */
1176 ext2_msg(sb, KERN_ERR,
1177 "previous I/O error to superblock detected\n");
1178 clear_buffer_write_io_error(sbh);
1179 set_buffer_uptodate(sbh);
1180 }
1181
1126 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { 1182 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
1127 ext2_debug("setting valid to 0\n"); 1183 ext2_debug("setting valid to 0\n");
1128 es->s_state &= cpu_to_le16(~EXT2_VALID_FS); 1184 es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
@@ -1170,7 +1226,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1170 /* 1226 /*
1171 * Allow the "check" option to be passed as a remount option. 1227 * Allow the "check" option to be passed as a remount option.
1172 */ 1228 */
1173 if (!parse_options (data, sbi)) { 1229 if (!parse_options(data, sb)) {
1174 err = -EINVAL; 1230 err = -EINVAL;
1175 goto restore_opts; 1231 goto restore_opts;
1176 } 1232 }
@@ -1182,7 +1238,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1182 EXT2_MOUNT_XIP if not */ 1238 EXT2_MOUNT_XIP if not */
1183 1239
1184 if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { 1240 if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
1185 printk("XIP: Unsupported blocksize\n"); 1241 ext2_msg(sb, KERN_WARNING,
1242 "warning: unsupported blocksize for xip");
1186 err = -EINVAL; 1243 err = -EINVAL;
1187 goto restore_opts; 1244 goto restore_opts;
1188 } 1245 }
@@ -1191,8 +1248,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1191 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1248 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
1192 (old_mount_opt & EXT2_MOUNT_XIP)) && 1249 (old_mount_opt & EXT2_MOUNT_XIP)) &&
1193 invalidate_inodes(sb)) { 1250 invalidate_inodes(sb)) {
1194 ext2_warning(sb, __func__, "refusing change of xip flag " 1251 ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
1195 "with busy inodes while remounting"); 1252 "xip flag with busy inodes while remounting");
1196 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1253 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
1197 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; 1254 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1198 } 1255 }
@@ -1216,9 +1273,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1216 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, 1273 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
1217 ~EXT2_FEATURE_RO_COMPAT_SUPP); 1274 ~EXT2_FEATURE_RO_COMPAT_SUPP);
1218 if (ret) { 1275 if (ret) {
1219 printk("EXT2-fs: %s: couldn't remount RDWR because of " 1276 ext2_msg(sb, KERN_WARNING,
1220 "unsupported optional features (%x).\n", 1277 "warning: couldn't remount RDWR because of "
1221 sb->s_id, le32_to_cpu(ret)); 1278 "unsupported optional features (%x).",
1279 le32_to_cpu(ret));
1222 err = -EROFS; 1280 err = -EROFS;
1223 goto restore_opts; 1281 goto restore_opts;
1224 } 1282 }
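
The logging conversion above funnels everything through ext2_msg(), which takes a printk level as its prefix, stamps "EXT2-fs (<dev>): " itself, and appends the trailing newline, so format strings carry neither. A minimal usage sketch:

static void demo_report(struct super_block *sb, int nblocks)
{
	ext2_msg(sb, KERN_WARNING, "warning: only %d blocks left", nblocks);
	/* emits: "EXT2-fs (sda1): warning: only N blocks left\n" */
}
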
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 7913531ec6d5..e44dc92609be 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -60,6 +60,7 @@
60#include <linux/mbcache.h> 60#include <linux/mbcache.h>
61#include <linux/quotaops.h> 61#include <linux/quotaops.h>
62#include <linux/rwsem.h> 62#include <linux/rwsem.h>
63#include <linux/security.h>
63#include "ext2.h" 64#include "ext2.h"
64#include "xattr.h" 65#include "xattr.h"
65#include "acl.h" 66#include "acl.h"
@@ -249,8 +250,9 @@ cleanup:
249 * used / required on success. 250 * used / required on success.
250 */ 251 */
251static int 252static int
252ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) 253ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
253{ 254{
255 struct inode *inode = dentry->d_inode;
254 struct buffer_head *bh = NULL; 256 struct buffer_head *bh = NULL;
255 struct ext2_xattr_entry *entry; 257 struct ext2_xattr_entry *entry;
256 char *end; 258 char *end;
@@ -300,9 +302,10 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
300 ext2_xattr_handler(entry->e_name_index); 302 ext2_xattr_handler(entry->e_name_index);
301 303
302 if (handler) { 304 if (handler) {
303 size_t size = handler->list(inode, buffer, rest, 305 size_t size = handler->list(dentry, buffer, rest,
304 entry->e_name, 306 entry->e_name,
305 entry->e_name_len); 307 entry->e_name_len,
308 handler->flags);
306 if (buffer) { 309 if (buffer) {
307 if (size > rest) { 310 if (size > rest) {
308 error = -ERANGE; 311 error = -ERANGE;
@@ -330,7 +333,7 @@ cleanup:
330ssize_t 333ssize_t
331ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) 334ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
332{ 335{
333 return ext2_xattr_list(dentry->d_inode, buffer, size); 336 return ext2_xattr_list(dentry, buffer, size);
334} 337}
335 338
336/* 339/*
@@ -641,8 +644,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
641 the inode. */ 644 the inode. */
642 ea_bdebug(new_bh, "reusing block"); 645 ea_bdebug(new_bh, "reusing block");
643 646
644 error = -EDQUOT; 647 error = dquot_alloc_block(inode, 1);
645 if (vfs_dq_alloc_block(inode, 1)) { 648 if (error) {
646 unlock_buffer(new_bh); 649 unlock_buffer(new_bh);
647 goto cleanup; 650 goto cleanup;
648 } 651 }
@@ -699,7 +702,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
699 * as if nothing happened and cleanup the unused block */ 702 * as if nothing happened and cleanup the unused block */
700 if (error && error != -ENOSPC) { 703 if (error && error != -ENOSPC) {
701 if (new_bh && new_bh != old_bh) 704 if (new_bh && new_bh != old_bh)
702 vfs_dq_free_block(inode, 1); 705 dquot_free_block(inode, 1);
703 goto cleanup; 706 goto cleanup;
704 } 707 }
705 } else 708 } else
@@ -731,7 +734,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
731 le32_add_cpu(&HDR(old_bh)->h_refcount, -1); 734 le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
732 if (ce) 735 if (ce)
733 mb_cache_entry_release(ce); 736 mb_cache_entry_release(ce);
734 vfs_dq_free_block(inode, 1); 737 dquot_free_block(inode, 1);
735 mark_buffer_dirty(old_bh); 738 mark_buffer_dirty(old_bh);
736 ea_bdebug(old_bh, "refcount now=%d", 739 ea_bdebug(old_bh, "refcount now=%d",
737 le32_to_cpu(HDR(old_bh)->h_refcount)); 740 le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -794,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode)
794 mark_buffer_dirty(bh); 797 mark_buffer_dirty(bh);
795 if (IS_SYNC(inode)) 798 if (IS_SYNC(inode))
796 sync_dirty_buffer(bh); 799 sync_dirty_buffer(bh);
797 vfs_dq_free_block(inode, 1); 800 dquot_free_block(inode, 1);
798 } 801 }
799 EXT2_I(inode)->i_file_acl = 0; 802 EXT2_I(inode)->i_file_acl = 0;
800 803
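
The listxattr walk above now hands each handler the dentry plus its own .flags, matching the widened ->list signature. A minimal sketch of one dispatch step (demo_list_one is illustrative):

static size_t demo_list_one(struct xattr_handler *h, struct dentry *dentry,
			    char *buffer, size_t rest,
			    const char *name, size_t name_len)
{
	/* returns the bytes this handler would emit; 0 means skip */
	return h->list(dentry, buffer, rest, name, name_len, h->flags);
}
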
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 70c0dbdcdcb7..b118c6383c6d 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext2_fs.h> 10#include <linux/ext2_fs.h>
@@ -11,8 +12,8 @@
11#include "xattr.h" 12#include "xattr.h"
12 13
13static size_t 14static size_t
14ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, 15ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
15 const char *name, size_t name_len) 16 const char *name, size_t name_len, int type)
16{ 17{
17 const int prefix_len = XATTR_SECURITY_PREFIX_LEN; 18 const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
18 const size_t total_len = prefix_len + name_len + 1; 19 const size_t total_len = prefix_len + name_len + 1;
@@ -26,22 +27,22 @@ ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size,
26} 27}
27 28
28static int 29static int
29ext2_xattr_security_get(struct inode *inode, const char *name, 30ext2_xattr_security_get(struct dentry *dentry, const char *name,
30 void *buffer, size_t size) 31 void *buffer, size_t size, int type)
31{ 32{
32 if (strcmp(name, "") == 0) 33 if (strcmp(name, "") == 0)
33 return -EINVAL; 34 return -EINVAL;
34 return ext2_xattr_get(inode, EXT2_XATTR_INDEX_SECURITY, name, 35 return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name,
35 buffer, size); 36 buffer, size);
36} 37}
37 38
38static int 39static int
39ext2_xattr_security_set(struct inode *inode, const char *name, 40ext2_xattr_security_set(struct dentry *dentry, const char *name,
40 const void *value, size_t size, int flags) 41 const void *value, size_t size, int flags, int type)
41{ 42{
42 if (strcmp(name, "") == 0) 43 if (strcmp(name, "") == 0)
43 return -EINVAL; 44 return -EINVAL;
44 return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name, 45 return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name,
45 value, size, flags); 46 value, size, flags);
46} 47}
47 48
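
Every handler in this series moves to the same new shape: the generic xattr code passes the dentry instead of the bare inode, plus the handler's flags value in a trailing type argument. A sketch of a get callback in the new form (example_backend_get stands in for the filesystem's own lookup and is not a real kernel function):

    #include <linux/fs.h>
    #include <linux/string.h>
    #include <linux/xattr.h>

    static int example_xattr_get(struct dentry *dentry, const char *name,
                                 void *buffer, size_t size, int type)
    {
            /* an empty name means only the namespace prefix was given */
            if (strcmp(name, "") == 0)
                    return -EINVAL;
            /* the inode is still one dereference away when needed */
            return example_backend_get(dentry->d_inode, name, buffer, size);
    }
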
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index e8219f8eae9f..2a26d71f4771 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -13,8 +13,8 @@
13#include "xattr.h" 13#include "xattr.h"
14 14
15static size_t 15static size_t
16ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, 16ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
17 const char *name, size_t name_len) 17 const char *name, size_t name_len, int type)
18{ 18{
19 const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; 19 const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
20 const size_t total_len = prefix_len + name_len + 1; 20 const size_t total_len = prefix_len + name_len + 1;
@@ -31,22 +31,22 @@ ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
31} 31}
32 32
33static int 33static int
34ext2_xattr_trusted_get(struct inode *inode, const char *name, 34ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
35 void *buffer, size_t size) 35 void *buffer, size_t size, int type)
36{ 36{
37 if (strcmp(name, "") == 0) 37 if (strcmp(name, "") == 0)
38 return -EINVAL; 38 return -EINVAL;
39 return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name, 39 return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name,
40 buffer, size); 40 buffer, size);
41} 41}
42 42
43static int 43static int
44ext2_xattr_trusted_set(struct inode *inode, const char *name, 44ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
45 const void *value, size_t size, int flags) 45 const void *value, size_t size, int flags, int type)
46{ 46{
47 if (strcmp(name, "") == 0) 47 if (strcmp(name, "") == 0)
48 return -EINVAL; 48 return -EINVAL;
49 return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, 49 return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name,
50 value, size, flags); 50 value, size, flags);
51} 51}
52 52
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 92495d28c62f..3f6caf3684b4 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -12,13 +12,13 @@
12#include "xattr.h" 12#include "xattr.h"
13 13
14static size_t 14static size_t
15ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, 15ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
16 const char *name, size_t name_len) 16 const char *name, size_t name_len, int type)
17{ 17{
18 const size_t prefix_len = XATTR_USER_PREFIX_LEN; 18 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
19 const size_t total_len = prefix_len + name_len + 1; 19 const size_t total_len = prefix_len + name_len + 1;
20 20
21 if (!test_opt(inode->i_sb, XATTR_USER)) 21 if (!test_opt(dentry->d_sb, XATTR_USER))
22 return 0; 22 return 0;
23 23
24 if (list && total_len <= list_size) { 24 if (list && total_len <= list_size) {
@@ -30,27 +30,28 @@ ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size,
30} 30}
31 31
32static int 32static int
33ext2_xattr_user_get(struct inode *inode, const char *name, 33ext2_xattr_user_get(struct dentry *dentry, const char *name,
34 void *buffer, size_t size) 34 void *buffer, size_t size, int type)
35{ 35{
36 if (strcmp(name, "") == 0) 36 if (strcmp(name, "") == 0)
37 return -EINVAL; 37 return -EINVAL;
38 if (!test_opt(inode->i_sb, XATTR_USER)) 38 if (!test_opt(dentry->d_sb, XATTR_USER))
39 return -EOPNOTSUPP; 39 return -EOPNOTSUPP;
40 return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, buffer, size); 40 return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER,
41 name, buffer, size);
41} 42}
42 43
43static int 44static int
44ext2_xattr_user_set(struct inode *inode, const char *name, 45ext2_xattr_user_set(struct dentry *dentry, const char *name,
45 const void *value, size_t size, int flags) 46 const void *value, size_t size, int flags, int type)
46{ 47{
47 if (strcmp(name, "") == 0) 48 if (strcmp(name, "") == 0)
48 return -EINVAL; 49 return -EINVAL;
49 if (!test_opt(inode->i_sb, XATTR_USER)) 50 if (!test_opt(dentry->d_sb, XATTR_USER))
50 return -EOPNOTSUPP; 51 return -EOPNOTSUPP;
51 52
52 return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, 53 return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER,
53 value, size, flags); 54 name, value, size, flags);
54} 55}
55 56
56struct xattr_handler ext2_xattr_user_handler = { 57struct xattr_handler ext2_xattr_user_handler = {
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index c18fbf3e4068..322a56b2dfb1 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -69,8 +69,9 @@ void ext2_xip_verify_sb(struct super_block *sb)
69 if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && 69 if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
70 !sb->s_bdev->bd_disk->fops->direct_access) { 70 !sb->s_bdev->bd_disk->fops->direct_access) {
71 sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); 71 sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
72 ext2_warning(sb, __func__, 72 ext2_msg(sb, KERN_WARNING,
73 "ignoring xip option - not supported by bdev"); 73 "warning: ignoring xip option - "
74 "not supported by bdev");
74 } 75 }
75} 76}
76 77
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index c9b0df376b5f..82ba34158661 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -366,12 +366,12 @@ out:
366 * Extended attribute handlers 366 * Extended attribute handlers
367 */ 367 */
368static size_t 368static size_t
369ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, 369ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
370 const char *name, size_t name_len) 370 const char *name, size_t name_len, int type)
371{ 371{
372 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); 372 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
373 373
374 if (!test_opt(inode->i_sb, POSIX_ACL)) 374 if (!test_opt(dentry->d_sb, POSIX_ACL))
375 return 0; 375 return 0;
376 if (list && size <= list_len) 376 if (list && size <= list_len)
377 memcpy(list, POSIX_ACL_XATTR_ACCESS, size); 377 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -379,12 +379,12 @@ ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
379} 379}
380 380
381static size_t 381static size_t
382ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, 382ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
383 const char *name, size_t name_len) 383 const char *name, size_t name_len, int type)
384{ 384{
385 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); 385 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
386 386
387 if (!test_opt(inode->i_sb, POSIX_ACL)) 387 if (!test_opt(dentry->d_sb, POSIX_ACL))
388 return 0; 388 return 0;
389 if (list && size <= list_len) 389 if (list && size <= list_len)
390 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); 390 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -392,15 +392,18 @@ ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
392} 392}
393 393
394static int 394static int
395ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) 395ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
396 size_t size, int type)
396{ 397{
397 struct posix_acl *acl; 398 struct posix_acl *acl;
398 int error; 399 int error;
399 400
400 if (!test_opt(inode->i_sb, POSIX_ACL)) 401 if (strcmp(name, "") != 0)
402 return -EINVAL;
403 if (!test_opt(dentry->d_sb, POSIX_ACL))
401 return -EOPNOTSUPP; 404 return -EOPNOTSUPP;
402 405
403 acl = ext3_get_acl(inode, type); 406 acl = ext3_get_acl(dentry->d_inode, type);
404 if (IS_ERR(acl)) 407 if (IS_ERR(acl))
405 return PTR_ERR(acl); 408 return PTR_ERR(acl);
406 if (acl == NULL) 409 if (acl == NULL)
@@ -412,31 +415,16 @@ ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
412} 415}
413 416
414static int 417static int
415ext3_xattr_get_acl_access(struct inode *inode, const char *name, 418ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
416 void *buffer, size_t size) 419 size_t size, int flags, int type)
417{
418 if (strcmp(name, "") != 0)
419 return -EINVAL;
420 return ext3_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
421}
422
423static int
424ext3_xattr_get_acl_default(struct inode *inode, const char *name,
425 void *buffer, size_t size)
426{
427 if (strcmp(name, "") != 0)
428 return -EINVAL;
429 return ext3_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
430}
431
432static int
433ext3_xattr_set_acl(struct inode *inode, int type, const void *value,
434 size_t size)
435{ 420{
421 struct inode *inode = dentry->d_inode;
436 handle_t *handle; 422 handle_t *handle;
437 struct posix_acl *acl; 423 struct posix_acl *acl;
438 int error, retries = 0; 424 int error, retries = 0;
439 425
426 if (strcmp(name, "") != 0)
427 return -EINVAL;
440 if (!test_opt(inode->i_sb, POSIX_ACL)) 428 if (!test_opt(inode->i_sb, POSIX_ACL))
441 return -EOPNOTSUPP; 429 return -EOPNOTSUPP;
442 if (!is_owner_or_cap(inode)) 430 if (!is_owner_or_cap(inode))
@@ -468,34 +456,18 @@ release_and_out:
468 return error; 456 return error;
469} 457}
470 458
471static int
472ext3_xattr_set_acl_access(struct inode *inode, const char *name,
473 const void *value, size_t size, int flags)
474{
475 if (strcmp(name, "") != 0)
476 return -EINVAL;
477 return ext3_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
478}
479
480static int
481ext3_xattr_set_acl_default(struct inode *inode, const char *name,
482 const void *value, size_t size, int flags)
483{
484 if (strcmp(name, "") != 0)
485 return -EINVAL;
486 return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
487}
488
489struct xattr_handler ext3_xattr_acl_access_handler = { 459struct xattr_handler ext3_xattr_acl_access_handler = {
490 .prefix = POSIX_ACL_XATTR_ACCESS, 460 .prefix = POSIX_ACL_XATTR_ACCESS,
461 .flags = ACL_TYPE_ACCESS,
491 .list = ext3_xattr_list_acl_access, 462 .list = ext3_xattr_list_acl_access,
492 .get = ext3_xattr_get_acl_access, 463 .get = ext3_xattr_get_acl,
493 .set = ext3_xattr_set_acl_access, 464 .set = ext3_xattr_set_acl,
494}; 465};
495 466
496struct xattr_handler ext3_xattr_acl_default_handler = { 467struct xattr_handler ext3_xattr_acl_default_handler = {
497 .prefix = POSIX_ACL_XATTR_DEFAULT, 468 .prefix = POSIX_ACL_XATTR_DEFAULT,
469 .flags = ACL_TYPE_DEFAULT,
498 .list = ext3_xattr_list_acl_default, 470 .list = ext3_xattr_list_acl_default,
499 .get = ext3_xattr_get_acl_default, 471 .get = ext3_xattr_get_acl,
500 .set = ext3_xattr_set_acl_default, 472 .set = ext3_xattr_set_acl,
501}; 473};
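
The two-wrapper arrangement collapses because the new .flags field lets a single callback serve both ACL names: the generic xattr code forwards handler->flags as the type argument, so the ACL_TYPE_ACCESS/ACL_TYPE_DEFAULT dispatch the per-name wrappers used to do now comes for free. Roughly (example_* names illustrative):

    #include <linux/posix_acl.h>
    #include <linux/posix_acl_xattr.h>
    #include <linux/xattr.h>

    /* sketch: one get/set pair registered twice, told apart by .flags */
    struct xattr_handler example_acl_access_handler = {
            .prefix = POSIX_ACL_XATTR_ACCESS,
            .flags  = ACL_TYPE_ACCESS,      /* arrives as 'type' in callbacks */
            .list   = example_list_acl,
            .get    = example_get_acl,
            .set    = example_set_acl,
    };
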
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 27967f92e820..a177122a1b25 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/slab.h>
17#include <linux/jbd.h> 18#include <linux/jbd.h>
18#include <linux/ext3_fs.h> 19#include <linux/ext3_fs.h>
19#include <linux/ext3_jbd.h> 20#include <linux/ext3_jbd.h>
@@ -676,7 +677,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
676 } 677 }
677 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); 678 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
678 if (dquot_freed_blocks) 679 if (dquot_freed_blocks)
679 vfs_dq_free_block(inode, dquot_freed_blocks); 680 dquot_free_block(inode, dquot_freed_blocks);
680 return; 681 return;
681} 682}
682 683
@@ -1502,8 +1503,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1502 /* 1503 /*
1503 * Check quota for allocation of this block. 1504 * Check quota for allocation of this block.
1504 */ 1505 */
1505 if (vfs_dq_alloc_block(inode, num)) { 1506 err = dquot_alloc_block(inode, num);
1506 *errp = -EDQUOT; 1507 if (err) {
1508 *errp = err;
1507 return 0; 1509 return 0;
1508 } 1510 }
1509 1511
@@ -1713,7 +1715,7 @@ allocated:
1713 1715
1714 *errp = 0; 1716 *errp = 0;
1715 brelse(bitmap_bh); 1717 brelse(bitmap_bh);
1716 vfs_dq_free_block(inode, *count-num); 1718 dquot_free_block(inode, *count-num);
1717 *count = num; 1719 *count = num;
1718 return ret_block; 1720 return ret_block;
1719 1721
@@ -1728,7 +1730,7 @@ out:
1728 * Undo the block allocation 1730 * Undo the block allocation
1729 */ 1731 */
1730 if (!performed_allocation) 1732 if (!performed_allocation)
1731 vfs_dq_free_block(inode, *count); 1733 dquot_free_block(inode, *count);
1732 brelse(bitmap_bh); 1734 brelse(bitmap_bh);
1733 return 0; 1735 return 0;
1734} 1736}
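
ext3_new_blocks() shows the multi-block variant of the same conversion: num holds the requested count when the charge is taken and the allocated count by the time of the refund, so the overcharge is returned with dquot_free_block(inode, *count - num). Schematically (variable names as in the hunks above, the surrounding allocator logic elided):

    err = dquot_alloc_block(inode, num);    /* charge the full request */
    if (err) {
            *errp = err;
            return 0;
    }
    /* ... the allocator may satisfy fewer than num blocks ... */
    dquot_free_block(inode, *count - num);  /* refund the unused part */
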
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 388bbdfa0b4e..f55df0e61cbd 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -21,6 +21,7 @@
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/jbd.h> 23#include <linux/jbd.h>
24#include <linux/quotaops.h>
24#include <linux/ext3_fs.h> 25#include <linux/ext3_fs.h>
25#include <linux/ext3_jbd.h> 26#include <linux/ext3_jbd.h>
26#include "xattr.h" 27#include "xattr.h"
@@ -33,9 +34,9 @@
33 */ 34 */
34static int ext3_release_file (struct inode * inode, struct file * filp) 35static int ext3_release_file (struct inode * inode, struct file * filp)
35{ 36{
36 if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) { 37 if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
37 filemap_flush(inode->i_mapping); 38 filemap_flush(inode->i_mapping);
38 EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE; 39 ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
39 } 40 }
40 /* if we are the last writer on the inode, drop the block reservation */ 41 /* if we are the last writer on the inode, drop the block reservation */
41 if ((filp->f_mode & FMODE_WRITE) && 42 if ((filp->f_mode & FMODE_WRITE) &&
@@ -62,7 +63,7 @@ const struct file_operations ext3_file_operations = {
62 .compat_ioctl = ext3_compat_ioctl, 63 .compat_ioctl = ext3_compat_ioctl,
63#endif 64#endif
64 .mmap = generic_file_mmap, 65 .mmap = generic_file_mmap,
65 .open = generic_file_open, 66 .open = dquot_file_open,
66 .release = ext3_release_file, 67 .release = ext3_release_file,
67 .fsync = ext3_sync_file, 68 .fsync = ext3_sync_file,
68 .splice_read = generic_file_splice_read, 69 .splice_read = generic_file_splice_read,
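
Switching .open to dquot_file_open is what guarantees quota structures are attached before the first write through this file. The generic helper is expected to behave roughly like the sketch below, so read-only opens pay nothing extra:

    /* rough shape of the generic helper in fs/quota/dquot.c */
    int dquot_file_open(struct inode *inode, struct file *file)
    {
            int error = generic_file_open(inode, file);

            if (!error && (file->f_mode & FMODE_WRITE))
                    dquot_initialize(inode);
            return error;
    }
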
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index b39991285136..0d0e97ed3ff6 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
123 * Note: we must free any quota before locking the superblock, 123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well. 124 * as writing the quota to disk may need the lock as well.
125 */ 125 */
126 vfs_dq_init(inode); 126 dquot_initialize(inode);
127 ext3_xattr_delete_inode(handle, inode); 127 ext3_xattr_delete_inode(handle, inode);
128 vfs_dq_free_inode(inode); 128 dquot_free_inode(inode);
129 vfs_dq_drop(inode); 129 dquot_drop(inode);
130 130
131 is_directory = S_ISDIR(inode->i_mode); 131 is_directory = S_ISDIR(inode->i_mode);
132 132
@@ -582,16 +582,18 @@ got:
582 inode->i_generation = sbi->s_next_generation++; 582 inode->i_generation = sbi->s_next_generation++;
583 spin_unlock(&sbi->s_next_gen_lock); 583 spin_unlock(&sbi->s_next_gen_lock);
584 584
585 ei->i_state = EXT3_STATE_NEW; 585 ei->i_state_flags = 0;
586 ext3_set_inode_state(inode, EXT3_STATE_NEW);
587
586 ei->i_extra_isize = 588 ei->i_extra_isize =
587 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 589 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
588 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 590 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
589 591
590 ret = inode; 592 ret = inode;
591 if (vfs_dq_alloc_inode(inode)) { 593 dquot_initialize(inode);
592 err = -EDQUOT; 594 err = dquot_alloc_inode(inode);
595 if (err)
593 goto fail_drop; 596 goto fail_drop;
594 }
595 597
596 err = ext3_init_acl(handle, inode, dir); 598 err = ext3_init_acl(handle, inode, dir);
597 if (err) 599 if (err)
@@ -619,10 +621,10 @@ really_out:
619 return ret; 621 return ret;
620 622
621fail_free_drop: 623fail_free_drop:
622 vfs_dq_free_inode(inode); 624 dquot_free_inode(inode);
623 625
624fail_drop: 626fail_drop:
625 vfs_dq_drop(inode); 627 dquot_drop(inode);
626 inode->i_flags |= S_NOQUOTA; 628 inode->i_flags |= S_NOQUOTA;
627 inode->i_nlink = 0; 629 inode->i_nlink = 0;
628 unlock_new_inode(inode); 630 unlock_new_inode(inode);
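
The allocation path now mirrors the free path at the top of the file: dquot_initialize() attaches the dquots, dquot_alloc_inode() takes the charge and reports the errno itself, and the two failure labels unwind in order. Reduced to a skeleton (labels as in the hunk, the rest illustrative):

    dquot_initialize(inode);
    err = dquot_alloc_inode(inode);         /* returns 0 or e.g. -EDQUOT */
    if (err)
            goto fail_drop;

    /* ... finish inode setup; later errors jump to fail_free_drop ... */
    return ret;

    fail_free_drop:
            dquot_free_inode(inode);        /* undo the inode charge */
    fail_drop:
            dquot_drop(inode);              /* detach dquot references */
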
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 354ed3b47b30..ea33bdf0a300 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -196,6 +196,9 @@ void ext3_delete_inode (struct inode * inode)
196{ 196{
197 handle_t *handle; 197 handle_t *handle;
198 198
199 if (!is_bad_inode(inode))
200 dquot_initialize(inode);
201
199 truncate_inode_pages(&inode->i_data, 0); 202 truncate_inode_pages(&inode->i_data, 0);
200 203
201 if (is_bad_inode(inode)) 204 if (is_bad_inode(inode))
@@ -970,7 +973,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
970 if (max_blocks > DIO_MAX_BLOCKS) 973 if (max_blocks > DIO_MAX_BLOCKS)
971 max_blocks = DIO_MAX_BLOCKS; 974 max_blocks = DIO_MAX_BLOCKS;
972 handle = ext3_journal_start(inode, DIO_CREDITS + 975 handle = ext3_journal_start(inode, DIO_CREDITS +
973 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb)); 976 EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
974 if (IS_ERR(handle)) { 977 if (IS_ERR(handle)) {
975 ret = PTR_ERR(handle); 978 ret = PTR_ERR(handle);
976 goto out; 979 goto out;
@@ -1151,6 +1154,16 @@ static int do_journal_get_write_access(handle_t *handle,
1151 return ext3_journal_get_write_access(handle, bh); 1154 return ext3_journal_get_write_access(handle, bh);
1152} 1155}
1153 1156
1157/*
1158 * Truncate blocks that were not used by write. We have to truncate the
1159 * pagecache as well so that corresponding buffers get properly unmapped.
1160 */
1161static void ext3_truncate_failed_write(struct inode *inode)
1162{
1163 truncate_inode_pages(inode->i_mapping, inode->i_size);
1164 ext3_truncate(inode);
1165}
1166
1154static int ext3_write_begin(struct file *file, struct address_space *mapping, 1167static int ext3_write_begin(struct file *file, struct address_space *mapping,
1155 loff_t pos, unsigned len, unsigned flags, 1168 loff_t pos, unsigned len, unsigned flags,
1156 struct page **pagep, void **fsdata) 1169 struct page **pagep, void **fsdata)
@@ -1209,7 +1222,7 @@ write_begin_failed:
1209 unlock_page(page); 1222 unlock_page(page);
1210 page_cache_release(page); 1223 page_cache_release(page);
1211 if (pos + len > inode->i_size) 1224 if (pos + len > inode->i_size)
1212 ext3_truncate(inode); 1225 ext3_truncate_failed_write(inode);
1213 } 1226 }
1214 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1227 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1215 goto retry; 1228 goto retry;
@@ -1304,7 +1317,7 @@ static int ext3_ordered_write_end(struct file *file,
1304 page_cache_release(page); 1317 page_cache_release(page);
1305 1318
1306 if (pos + len > inode->i_size) 1319 if (pos + len > inode->i_size)
1307 ext3_truncate(inode); 1320 ext3_truncate_failed_write(inode);
1308 return ret ? ret : copied; 1321 return ret ? ret : copied;
1309} 1322}
1310 1323
@@ -1330,7 +1343,7 @@ static int ext3_writeback_write_end(struct file *file,
1330 page_cache_release(page); 1343 page_cache_release(page);
1331 1344
1332 if (pos + len > inode->i_size) 1345 if (pos + len > inode->i_size)
1333 ext3_truncate(inode); 1346 ext3_truncate_failed_write(inode);
1334 return ret ? ret : copied; 1347 return ret ? ret : copied;
1335} 1348}
1336 1349
@@ -1368,7 +1381,7 @@ static int ext3_journalled_write_end(struct file *file,
1368 */ 1381 */
1369 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1382 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1370 ext3_orphan_add(handle, inode); 1383 ext3_orphan_add(handle, inode);
1371 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1384 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1372 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1385 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1373 EXT3_I(inode)->i_disksize = inode->i_size; 1386 EXT3_I(inode)->i_disksize = inode->i_size;
1374 ret2 = ext3_mark_inode_dirty(handle, inode); 1387 ret2 = ext3_mark_inode_dirty(handle, inode);
@@ -1383,7 +1396,7 @@ static int ext3_journalled_write_end(struct file *file,
1383 page_cache_release(page); 1396 page_cache_release(page);
1384 1397
1385 if (pos + len > inode->i_size) 1398 if (pos + len > inode->i_size)
1386 ext3_truncate(inode); 1399 ext3_truncate_failed_write(inode);
1387 return ret ? ret : copied; 1400 return ret ? ret : copied;
1388} 1401}
1389 1402
@@ -1407,7 +1420,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1407 journal_t *journal; 1420 journal_t *journal;
1408 int err; 1421 int err;
1409 1422
1410 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { 1423 if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
1411 /* 1424 /*
1412 * This is a REALLY heavyweight approach, but the use of 1425 * This is a REALLY heavyweight approach, but the use of
1413 * bmap on dirty files is expected to be extremely rare: 1426 * bmap on dirty files is expected to be extremely rare:
@@ -1426,7 +1439,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1426 * everything they get. 1439 * everything they get.
1427 */ 1440 */
1428 1441
1429 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; 1442 ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
1430 journal = EXT3_JOURNAL(inode); 1443 journal = EXT3_JOURNAL(inode);
1431 journal_lock_updates(journal); 1444 journal_lock_updates(journal);
1432 err = journal_flush(journal); 1445 err = journal_flush(journal);
@@ -1518,6 +1531,7 @@ static int ext3_ordered_writepage(struct page *page,
1518 int err; 1531 int err;
1519 1532
1520 J_ASSERT(PageLocked(page)); 1533 J_ASSERT(PageLocked(page));
1534 WARN_ON_ONCE(IS_RDONLY(inode));
1521 1535
1522 /* 1536 /*
1523 * We give up here if we're reentered, because it might be for a 1537 * We give up here if we're reentered, because it might be for a
@@ -1590,6 +1604,9 @@ static int ext3_writeback_writepage(struct page *page,
1590 int ret = 0; 1604 int ret = 0;
1591 int err; 1605 int err;
1592 1606
1607 J_ASSERT(PageLocked(page));
1608 WARN_ON_ONCE(IS_RDONLY(inode));
1609
1593 if (ext3_journal_current_handle()) 1610 if (ext3_journal_current_handle())
1594 goto out_fail; 1611 goto out_fail;
1595 1612
@@ -1632,6 +1649,9 @@ static int ext3_journalled_writepage(struct page *page,
1632 int ret = 0; 1649 int ret = 0;
1633 int err; 1650 int err;
1634 1651
1652 J_ASSERT(PageLocked(page));
1653 WARN_ON_ONCE(IS_RDONLY(inode));
1654
1635 if (ext3_journal_current_handle()) 1655 if (ext3_journal_current_handle())
1636 goto no_write; 1656 goto no_write;
1637 1657
@@ -1660,7 +1680,7 @@ static int ext3_journalled_writepage(struct page *page,
1660 PAGE_CACHE_SIZE, NULL, write_end_fn); 1680 PAGE_CACHE_SIZE, NULL, write_end_fn);
1661 if (ret == 0) 1681 if (ret == 0)
1662 ret = err; 1682 ret = err;
1663 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1683 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1664 unlock_page(page); 1684 unlock_page(page);
1665 } else { 1685 } else {
1666 /* 1686 /*
@@ -1775,8 +1795,9 @@ retry:
1775 handle = ext3_journal_start(inode, 2); 1795 handle = ext3_journal_start(inode, 2);
1776 if (IS_ERR(handle)) { 1796 if (IS_ERR(handle)) {
1777 /* This is really bad luck. We've written the data 1797 /* This is really bad luck. We've written the data
1778 * but cannot extend i_size. Bail out and pretend 1798 * but cannot extend i_size. Truncate allocated blocks
1779 * the write failed... */ 1799 * and pretend the write failed... */
1800 ext3_truncate(inode);
1780 ret = PTR_ERR(handle); 1801 ret = PTR_ERR(handle);
1781 goto out; 1802 goto out;
1782 } 1803 }
@@ -2033,7 +2054,7 @@ static Indirect *ext3_find_shared(struct inode *inode, int depth,
2033 int k, err; 2054 int k, err;
2034 2055
2035 *top = 0; 2056 *top = 0;
2036 /* Make k index the deepest non-null offest + 1 */ 2057 /* Make k index the deepest non-null offset + 1 */
2037 for (k = depth; k > 1 && !offsets[k-1]; k--) 2058 for (k = depth; k > 1 && !offsets[k-1]; k--)
2038 ; 2059 ;
2039 partial = ext3_get_branch(inode, k, offsets, chain, &err); 2060 partial = ext3_get_branch(inode, k, offsets, chain, &err);
@@ -2392,7 +2413,7 @@ void ext3_truncate(struct inode *inode)
2392 goto out_notrans; 2413 goto out_notrans;
2393 2414
2394 if (inode->i_size == 0 && ext3_should_writeback_data(inode)) 2415 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2395 ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; 2416 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
2396 2417
2397 /* 2418 /*
2398 * We have to lock the EOF page here, because lock_page() nests 2419 * We have to lock the EOF page here, because lock_page() nests
@@ -2711,7 +2732,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2711{ 2732{
2712 /* We have all inode data except xattrs in memory here. */ 2733 /* We have all inode data except xattrs in memory here. */
2713 return __ext3_get_inode_loc(inode, iloc, 2734 return __ext3_get_inode_loc(inode, iloc,
2714 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); 2735 !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
2715} 2736}
2716 2737
2717void ext3_set_inode_flags(struct inode *inode) 2738void ext3_set_inode_flags(struct inode *inode)
@@ -2790,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2790 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2791 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2792 2813
2793 ei->i_state = 0; 2814 ei->i_state_flags = 0;
2794 ei->i_dir_start_lookup = 0; 2815 ei->i_dir_start_lookup = 0;
2795 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2796 /* We now have enough fields to check if the inode was active or not. 2817 /* We now have enough fields to check if the inode was active or not.
@@ -2883,7 +2904,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2883 EXT3_GOOD_OLD_INODE_SIZE + 2904 EXT3_GOOD_OLD_INODE_SIZE +
2884 ei->i_extra_isize; 2905 ei->i_extra_isize;
2885 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) 2906 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2886 ei->i_state |= EXT3_STATE_XATTR; 2907 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
2887 } 2908 }
2888 } else 2909 } else
2889 ei->i_extra_isize = 0; 2910 ei->i_extra_isize = 0;
@@ -2945,7 +2966,7 @@ again:
2945 2966
2946 /* For fields not tracked in the in-memory inode, 2967 /* For fields not tracked in the in-memory inode,
2947 * initialise them to zero for new inodes. */ 2968 * initialise them to zero for new inodes. */
2948 if (ei->i_state & EXT3_STATE_NEW) 2969 if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
2949 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); 2970 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2950 2971
2951 ext3_get_inode_flags(ei); 2972 ext3_get_inode_flags(ei);
@@ -3042,7 +3063,7 @@ again:
3042 rc = ext3_journal_dirty_metadata(handle, bh); 3063 rc = ext3_journal_dirty_metadata(handle, bh);
3043 if (!err) 3064 if (!err)
3044 err = rc; 3065 err = rc;
3045 ei->i_state &= ~EXT3_STATE_NEW; 3066 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
3046 3067
3047 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); 3068 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
3048out_brelse: 3069out_brelse:
@@ -3086,7 +3107,7 @@ out_brelse:
3086 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3107 * `stuff()' is running, and the new i_size will be lost. Plus the inode
3087 * will no longer be on the superblock's dirty inode list. 3108 * will no longer be on the superblock's dirty inode list.
3088 */ 3109 */
3089int ext3_write_inode(struct inode *inode, int wait) 3110int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3090{ 3111{
3091 if (current->flags & PF_MEMALLOC) 3112 if (current->flags & PF_MEMALLOC)
3092 return 0; 3113 return 0;
@@ -3097,7 +3118,7 @@ int ext3_write_inode(struct inode *inode, int wait)
3097 return -EIO; 3118 return -EIO;
3098 } 3119 }
3099 3120
3100 if (!wait) 3121 if (wbc->sync_mode != WB_SYNC_ALL)
3101 return 0; 3122 return 0;
3102 3123
3103 return ext3_force_commit(inode->i_sb); 3124 return ext3_force_commit(inode->i_sb);
@@ -3130,19 +3151,21 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3130 if (error) 3151 if (error)
3131 return error; 3152 return error;
3132 3153
3154 if (ia_valid & ATTR_SIZE)
3155 dquot_initialize(inode);
3133 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3134 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3135 handle_t *handle; 3158 handle_t *handle;
3136 3159
3137 /* (user+group)*(old+new) structure, inode write (sb, 3160 /* (user+group)*(old+new) structure, inode write (sb,
3138 * inode block, ? - but truncate inode update has it) */ 3161 * inode block, ? - but truncate inode update has it) */
3139 handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ 3162 handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
3140 EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 3163 EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
3141 if (IS_ERR(handle)) { 3164 if (IS_ERR(handle)) {
3142 error = PTR_ERR(handle); 3165 error = PTR_ERR(handle);
3143 goto err_out; 3166 goto err_out;
3144 } 3167 }
3145 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 3168 error = dquot_transfer(inode, attr);
3146 if (error) { 3169 if (error) {
3147 ext3_journal_stop(handle); 3170 ext3_journal_stop(handle);
3148 return error; 3171 return error;
@@ -3227,9 +3250,9 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
3227 ret = 2 * (bpp + indirects) + 2; 3250 ret = 2 * (bpp + indirects) + 2;
3228 3251
3229#ifdef CONFIG_QUOTA 3252#ifdef CONFIG_QUOTA
3230 /* We know that structure was already allocated during vfs_dq_init so 3253 /* We know that structure was already allocated during dquot_initialize so
3231 * we will be updating only the data blocks + inodes */ 3254 * we will be updating only the data blocks + inodes */
3232 ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); 3255 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
3233#endif 3256#endif
3234 3257
3235 return ret; 3258 return ret;
@@ -3318,7 +3341,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3318 * i_size has been changed by generic_commit_write() and we thus need 3341 * i_size has been changed by generic_commit_write() and we thus need
3319 * to include the updated inode in the current transaction. 3342 * to include the updated inode in the current transaction.
3320 * 3343 *
3321 * Also, vfs_dq_alloc_space() will always dirty the inode when blocks 3344 * Also, dquot_alloc_space() will always dirty the inode when blocks
3322 * are allocated to the file. 3345 * are allocated to the file.
3323 * 3346 *
3324 * If the inode is marked synchronous, we don't honour that here - doing 3347 * If the inode is marked synchronous, we don't honour that here - doing
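
All the i_state manipulation above moves from open-coded flag arithmetic to ext3_{test,set,clear}_inode_state(). These accessors are presumably thin wrappers over atomic bitops on the renamed i_state_flags word, along these lines, which also means EXT3_STATE_NEW and friends become bit numbers rather than masks:

    #include <linux/bitops.h>
    #include <linux/ext3_fs.h>

    static inline int ext3_test_inode_state(struct inode *inode, int bit)
    {
            return test_bit(bit, &EXT3_I(inode)->i_state_flags);
    }

    static inline void ext3_set_inode_state(struct inode *inode, int bit)
    {
            set_bit(bit, &EXT3_I(inode)->i_state_flags);
    }

    static inline void ext3_clear_inode_state(struct inode *inode, int bit)
    {
            clear_bit(bit, &EXT3_I(inode)->i_state_flags);
    }
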
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index aad6400c9b77..ee184084ca42 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1696,10 +1696,12 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
1696 struct inode * inode; 1696 struct inode * inode;
1697 int err, retries = 0; 1697 int err, retries = 0;
1698 1698
1699 dquot_initialize(dir);
1700
1699retry: 1701retry:
1700 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1702 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1701 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1703 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1702 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); 1704 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1703 if (IS_ERR(handle)) 1705 if (IS_ERR(handle))
1704 return PTR_ERR(handle); 1706 return PTR_ERR(handle);
1705 1707
@@ -1730,10 +1732,12 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
1730 if (!new_valid_dev(rdev)) 1732 if (!new_valid_dev(rdev))
1731 return -EINVAL; 1733 return -EINVAL;
1732 1734
1735 dquot_initialize(dir);
1736
1733retry: 1737retry:
1734 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1738 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1735 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1739 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1736 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); 1740 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1737 if (IS_ERR(handle)) 1741 if (IS_ERR(handle))
1738 return PTR_ERR(handle); 1742 return PTR_ERR(handle);
1739 1743
@@ -1766,10 +1770,12 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1766 if (dir->i_nlink >= EXT3_LINK_MAX) 1770 if (dir->i_nlink >= EXT3_LINK_MAX)
1767 return -EMLINK; 1771 return -EMLINK;
1768 1772
1773 dquot_initialize(dir);
1774
1769retry: 1775retry:
1770 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1776 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1771 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1777 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1772 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); 1778 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1773 if (IS_ERR(handle)) 1779 if (IS_ERR(handle))
1774 return PTR_ERR(handle); 1780 return PTR_ERR(handle);
1775 1781
@@ -1920,7 +1926,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
1920 struct ext3_iloc iloc; 1926 struct ext3_iloc iloc;
1921 int err = 0, rc; 1927 int err = 0, rc;
1922 1928
1923 lock_super(sb); 1929 mutex_lock(&EXT3_SB(sb)->s_orphan_lock);
1924 if (!list_empty(&EXT3_I(inode)->i_orphan)) 1930 if (!list_empty(&EXT3_I(inode)->i_orphan))
1925 goto out_unlock; 1931 goto out_unlock;
1926 1932
@@ -1929,9 +1935,13 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
1929 1935
1930 /* @@@ FIXME: Observation from aviro: 1936 /* @@@ FIXME: Observation from aviro:
1931 * I think I can trigger J_ASSERT in ext3_orphan_add(). We block 1937 * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
1932 * here (on lock_super()), so race with ext3_link() which might bump 1938 * here (on s_orphan_lock), so race with ext3_link() which might bump
1933 * ->i_nlink. For, say it, character device. Not a regular file, 1939 * ->i_nlink. For, say it, character device. Not a regular file,
1934 * not a directory, not a symlink and ->i_nlink > 0. 1940 * not a directory, not a symlink and ->i_nlink > 0.
1941 *
1942 * tytso, 4/25/2009: I'm not sure how that could happen;
1943 * shouldn't the fs core protect us from these sort of
1944 * unlink()/link() races?
1935 */ 1945 */
1936 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1946 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1937 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 1947 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -1968,7 +1978,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
1968 jbd_debug(4, "orphan inode %lu will point to %d\n", 1978 jbd_debug(4, "orphan inode %lu will point to %d\n",
1969 inode->i_ino, NEXT_ORPHAN(inode)); 1979 inode->i_ino, NEXT_ORPHAN(inode));
1970out_unlock: 1980out_unlock:
1971 unlock_super(sb); 1981 mutex_unlock(&EXT3_SB(sb)->s_orphan_lock);
1972 ext3_std_error(inode->i_sb, err); 1982 ext3_std_error(inode->i_sb, err);
1973 return err; 1983 return err;
1974} 1984}
@@ -1986,11 +1996,9 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode)
1986 struct ext3_iloc iloc; 1996 struct ext3_iloc iloc;
1987 int err = 0; 1997 int err = 0;
1988 1998
1989 lock_super(inode->i_sb); 1999 mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
1990 if (list_empty(&ei->i_orphan)) { 2000 if (list_empty(&ei->i_orphan))
1991 unlock_super(inode->i_sb); 2001 goto out;
1992 return 0;
1993 }
1994 2002
1995 ino_next = NEXT_ORPHAN(inode); 2003 ino_next = NEXT_ORPHAN(inode);
1996 prev = ei->i_orphan.prev; 2004 prev = ei->i_orphan.prev;
@@ -2040,7 +2048,7 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode)
2040out_err: 2048out_err:
2041 ext3_std_error(inode->i_sb, err); 2049 ext3_std_error(inode->i_sb, err);
2042out: 2050out:
2043 unlock_super(inode->i_sb); 2051 mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
2044 return err; 2052 return err;
2045 2053
2046out_brelse: 2054out_brelse:
@@ -2058,7 +2066,9 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
2058 2066
2059 /* Initialize quotas before so that eventual writes go in 2067 /* Initialize quotas before so that eventual writes go in
2060 * separate transaction */ 2068 * separate transaction */
2061 vfs_dq_init(dentry->d_inode); 2069 dquot_initialize(dir);
2070 dquot_initialize(dentry->d_inode);
2071
2062 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); 2072 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2063 if (IS_ERR(handle)) 2073 if (IS_ERR(handle))
2064 return PTR_ERR(handle); 2074 return PTR_ERR(handle);
@@ -2117,7 +2127,9 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2117 2127
2118 /* Initialize quotas before so that eventual writes go 2128 /* Initialize quotas before so that eventual writes go
2119 * in separate transaction */ 2129 * in separate transaction */
2120 vfs_dq_init(dentry->d_inode); 2130 dquot_initialize(dir);
2131 dquot_initialize(dentry->d_inode);
2132
2121 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); 2133 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2122 if (IS_ERR(handle)) 2134 if (IS_ERR(handle))
2123 return PTR_ERR(handle); 2135 return PTR_ERR(handle);
@@ -2172,10 +2184,12 @@ static int ext3_symlink (struct inode * dir,
2172 if (l > dir->i_sb->s_blocksize) 2184 if (l > dir->i_sb->s_blocksize)
2173 return -ENAMETOOLONG; 2185 return -ENAMETOOLONG;
2174 2186
2187 dquot_initialize(dir);
2188
2175retry: 2189retry:
2176 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2190 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2177 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2191 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2178 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); 2192 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2179 if (IS_ERR(handle)) 2193 if (IS_ERR(handle))
2180 return PTR_ERR(handle); 2194 return PTR_ERR(handle);
2181 2195
@@ -2226,6 +2240,9 @@ static int ext3_link (struct dentry * old_dentry,
2226 2240
2227 if (inode->i_nlink >= EXT3_LINK_MAX) 2241 if (inode->i_nlink >= EXT3_LINK_MAX)
2228 return -EMLINK; 2242 return -EMLINK;
2243
2244 dquot_initialize(dir);
2245
2229 /* 2246 /*
2230 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing 2247 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2231 * otherwise has the potential to corrupt the orphan inode list. 2248 * otherwise has the potential to corrupt the orphan inode list.
@@ -2276,12 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2276 struct ext3_dir_entry_2 * old_de, * new_de; 2293 struct ext3_dir_entry_2 * old_de, * new_de;
2277 int retval, flush_file = 0; 2294 int retval, flush_file = 0;
2278 2295
2296 dquot_initialize(old_dir);
2297 dquot_initialize(new_dir);
2298
2279 old_bh = new_bh = dir_bh = NULL; 2299 old_bh = new_bh = dir_bh = NULL;
2280 2300
2281 /* Initialize quotas before so that eventual writes go 2301 /* Initialize quotas before so that eventual writes go
2282 * in separate transaction */ 2302 * in separate transaction */
2283 if (new_dentry->d_inode) 2303 if (new_dentry->d_inode)
2284 vfs_dq_init(new_dentry->d_inode); 2304 dquot_initialize(new_dentry->d_inode);
2285 handle = ext3_journal_start(old_dir, 2 * 2305 handle = ext3_journal_start(old_dir, 2 *
2286 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + 2306 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2287 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); 2307 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
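
Each namei operation gains the same preamble: dquot_initialize() on every inode that may be charged, issued before ext3_journal_start(). Doing it there keeps any reads or writes of the quota files in their own transaction instead of inside the handle about to be started, which is what the "separate transaction" comments refer to. The pattern, schematically:

    /* quota-file I/O happens here, outside the handle below */
    dquot_initialize(dir);

    handle = ext3_journal_start(dir, credits);
    if (IS_ERR(handle))
            return PTR_ERR(handle);
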
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 8359e7b3dc89..54351ac7cef9 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -209,7 +209,7 @@ static int setup_new_group_blocks(struct super_block *sb,
209 if (IS_ERR(handle)) 209 if (IS_ERR(handle))
210 return PTR_ERR(handle); 210 return PTR_ERR(handle);
211 211
212 lock_super(sb); 212 mutex_lock(&sbi->s_resize_lock);
213 if (input->group != sbi->s_groups_count) { 213 if (input->group != sbi->s_groups_count) {
214 err = -EBUSY; 214 err = -EBUSY;
215 goto exit_journal; 215 goto exit_journal;
@@ -266,7 +266,7 @@ static int setup_new_group_blocks(struct super_block *sb,
266 goto exit_bh; 266 goto exit_bh;
267 267
268 if (IS_ERR(gdb = bclean(handle, sb, block))) { 268 if (IS_ERR(gdb = bclean(handle, sb, block))) {
269 err = PTR_ERR(bh); 269 err = PTR_ERR(gdb);
270 goto exit_bh; 270 goto exit_bh;
271 } 271 }
272 ext3_journal_dirty_metadata(handle, gdb); 272 ext3_journal_dirty_metadata(handle, gdb);
@@ -324,7 +324,7 @@ exit_bh:
324 brelse(bh); 324 brelse(bh);
325 325
326exit_journal: 326exit_journal:
327 unlock_super(sb); 327 mutex_unlock(&sbi->s_resize_lock);
328 if ((err2 = ext3_journal_stop(handle)) && !err) 328 if ((err2 = ext3_journal_stop(handle)) && !err)
329 err = err2; 329 err = err2;
330 330
@@ -662,11 +662,12 @@ exit_free:
662 * important part is that the new block and inode counts are in the backup 662 * important part is that the new block and inode counts are in the backup
663 * superblocks, and the location of the new group metadata in the GDT backups. 663 * superblocks, and the location of the new group metadata in the GDT backups.
664 * 664 *
665 * We do not need lock_super() for this, because these blocks are not 665 * We do not need to take the s_resize_lock for this, because these
666 * otherwise touched by the filesystem code when it is mounted. We don't 666 * blocks are not otherwise touched by the filesystem code when it is
667 * need to worry about last changing from sbi->s_groups_count, because the 667 * mounted. We don't need to worry about last changing from
668 * worst that can happen is that we do not copy the full number of backups 668 * sbi->s_groups_count, because the worst that can happen is that we
669 * at this time. The resize which changed s_groups_count will backup again. 669 * do not copy the full number of backups at this time. The resize
670 * which changed s_groups_count will backup again.
670 */ 671 */
671static void update_backups(struct super_block *sb, 672static void update_backups(struct super_block *sb,
672 int blk_off, char *data, int size) 673 int blk_off, char *data, int size)
@@ -825,7 +826,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
825 goto exit_put; 826 goto exit_put;
826 } 827 }
827 828
828 lock_super(sb); 829 mutex_lock(&sbi->s_resize_lock);
829 if (input->group != sbi->s_groups_count) { 830 if (input->group != sbi->s_groups_count) {
830 ext3_warning(sb, __func__, 831 ext3_warning(sb, __func__,
831 "multiple resizers run on filesystem!"); 832 "multiple resizers run on filesystem!");
@@ -856,7 +857,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
856 /* 857 /*
857 * OK, now we've set up the new group. Time to make it active. 858 * OK, now we've set up the new group. Time to make it active.
858 * 859 *
859 * Current kernels don't lock all allocations via lock_super(), 860 * We do not lock all allocations via s_resize_lock
860 * so we have to be safe wrt. concurrent accesses to the group 861 * so we have to be safe wrt. concurrent accesses to the group
861 * data. So we need to be careful to set all of the relevant 862 * data. So we need to be careful to set all of the relevant
862 * group descriptor data etc. *before* we enable the group. 863 * group descriptor data etc. *before* we enable the group.
@@ -900,12 +901,12 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
900 * 901 *
901 * The precise rules we use are: 902 * The precise rules we use are:
902 * 903 *
903 * * Writers of s_groups_count *must* hold lock_super 904 * * Writers of s_groups_count *must* hold s_resize_lock
904 * AND 905 * AND
905 * * Writers must perform a smp_wmb() after updating all dependent 906 * * Writers must perform a smp_wmb() after updating all dependent
906 * data and before modifying the groups count 907 * data and before modifying the groups count
907 * 908 *
908 * * Readers must hold lock_super() over the access 909 * * Readers must hold s_resize_lock over the access
909 * OR 910 * OR
910 * * Readers must perform an smp_rmb() after reading the groups count 911 * * Readers must perform an smp_rmb() after reading the groups count
911 * and before reading any dependent data. 912 * and before reading any dependent data.
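
The rules quoted above are a standard publication barrier pairing; both sides, sketched with the details of the group data elided:

    /* writer: runs under s_resize_lock */
    /* ... install group descriptor, bitmaps, free counts ... */
    smp_wmb();                      /* dependent data before the count */
    sbi->s_groups_count++;          /* the new group becomes visible */

    /* lock-free reader */
    ngroups = sbi->s_groups_count;
    smp_rmb();                      /* the count before dependent data */
    /* ... safe to read data for any group below ngroups ... */
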
@@ -936,7 +937,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
936 ext3_journal_dirty_metadata(handle, sbi->s_sbh); 937 ext3_journal_dirty_metadata(handle, sbi->s_sbh);
937 938
938exit_journal: 939exit_journal:
939 unlock_super(sb); 940 mutex_unlock(&sbi->s_resize_lock);
940 if ((err2 = ext3_journal_stop(handle)) && !err) 941 if ((err2 = ext3_journal_stop(handle)) && !err)
941 err = err2; 942 err = err2;
942 if (!err) { 943 if (!err) {
@@ -973,7 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
973 974
974 /* We don't need to worry about locking wrt other resizers just 975 /* We don't need to worry about locking wrt other resizers just
975 * yet: we're going to revalidate es->s_blocks_count after 976 * yet: we're going to revalidate es->s_blocks_count after
976 * taking lock_super() below. */ 977 * taking the s_resize_lock below. */
977 o_blocks_count = le32_to_cpu(es->s_blocks_count); 978 o_blocks_count = le32_to_cpu(es->s_blocks_count);
978 o_groups_count = EXT3_SB(sb)->s_groups_count; 979 o_groups_count = EXT3_SB(sb)->s_groups_count;
979 980
@@ -1045,11 +1046,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1045 goto exit_put; 1046 goto exit_put;
1046 } 1047 }
1047 1048
1048 lock_super(sb); 1049 mutex_lock(&EXT3_SB(sb)->s_resize_lock);
1049 if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { 1050 if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
1050 ext3_warning(sb, __func__, 1051 ext3_warning(sb, __func__,
1051 "multiple resizers run on filesystem!"); 1052 "multiple resizers run on filesystem!");
1052 unlock_super(sb); 1053 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1053 ext3_journal_stop(handle); 1054 ext3_journal_stop(handle);
1054 err = -EBUSY; 1055 err = -EBUSY;
1055 goto exit_put; 1056 goto exit_put;
@@ -1059,13 +1060,13 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1059 EXT3_SB(sb)->s_sbh))) { 1060 EXT3_SB(sb)->s_sbh))) {
1060 ext3_warning(sb, __func__, 1061 ext3_warning(sb, __func__,
1061 "error %d on journal write access", err); 1062 "error %d on journal write access", err);
1062 unlock_super(sb); 1063 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1063 ext3_journal_stop(handle); 1064 ext3_journal_stop(handle);
1064 goto exit_put; 1065 goto exit_put;
1065 } 1066 }
1066 es->s_blocks_count = cpu_to_le32(o_blocks_count + add); 1067 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1067 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 1068 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1068 unlock_super(sb); 1069 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1069 ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, 1070 ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
1070 o_blocks_count + add); 1071 o_blocks_count + add);
1071 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1072 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 427496c4767c..1bee604cc6cd 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -135,12 +135,24 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
135 if (is_handle_aborted(handle)) 135 if (is_handle_aborted(handle))
136 return; 136 return;
137 137
138 printk(KERN_ERR "%s: aborting transaction: %s in %s\n", 138 printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n",
139 caller, errstr, err_fn); 139 caller, errstr, err_fn);
140 140
141 journal_abort_handle(handle); 141 journal_abort_handle(handle);
142} 142}
143 143
144void ext3_msg(struct super_block *sb, const char *prefix,
145 const char *fmt, ...)
146{
147 va_list args;
148
149 va_start(args, fmt);
150 printk("%sEXT3-fs (%s): ", prefix, sb->s_id);
151 vprintk(fmt, args);
152 printk("\n");
153 va_end(args);
154}
155
144/* Deal with the reporting of failure conditions on a filesystem such as 156/* Deal with the reporting of failure conditions on a filesystem such as
145 * inconsistencies detected or read IO failures. 157 * inconsistencies detected or read IO failures.
146 * 158 *
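
ext3_msg() centralizes the "EXT3-fs (sdXN):" prefixing that the old printk() calls each repeated by hand; callers pass a printk level string that is concatenated in front of the banner. An illustrative call (the message text here is made up):

    ext3_msg(sb, KERN_WARNING,
             "warning: mounting fs with errors, running e2fsck is recommended");
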
@@ -152,7 +164,7 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
152 * write out the superblock safely. 164 * write out the superblock safely.
153 * 165 *
154 * We'll just use the journal_abort() error code to record an error in 166 * We'll just use the journal_abort() error code to record an error in
155 * the journal instead. On recovery, the journal will compain about 167 * the journal instead. On recovery, the journal will complain about
156 * that error until we've noted it down and cleared it. 168 * that error until we've noted it down and cleared it.
157 */ 169 */
158 170
@@ -169,17 +181,18 @@ static void ext3_handle_error(struct super_block *sb)
169 if (!test_opt (sb, ERRORS_CONT)) { 181 if (!test_opt (sb, ERRORS_CONT)) {
170 journal_t *journal = EXT3_SB(sb)->s_journal; 182 journal_t *journal = EXT3_SB(sb)->s_journal;
171 183
172 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; 184 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
173 if (journal) 185 if (journal)
174 journal_abort(journal, -EIO); 186 journal_abort(journal, -EIO);
175 } 187 }
176 if (test_opt (sb, ERRORS_RO)) { 188 if (test_opt (sb, ERRORS_RO)) {
177 printk (KERN_CRIT "Remounting filesystem read-only\n"); 189 ext3_msg(sb, KERN_CRIT,
190 "error: remounting filesystem read-only");
178 sb->s_flags |= MS_RDONLY; 191 sb->s_flags |= MS_RDONLY;
179 } 192 }
180 ext3_commit_super(sb, es, 1); 193 ext3_commit_super(sb, es, 1);
181 if (test_opt(sb, ERRORS_PANIC)) 194 if (test_opt(sb, ERRORS_PANIC))
182 panic("EXT3-fs (device %s): panic forced after error\n", 195 panic("EXT3-fs (%s): panic forced after error\n",
183 sb->s_id); 196 sb->s_id);
184} 197}
185 198
@@ -247,8 +260,7 @@ void __ext3_std_error (struct super_block * sb, const char * function,
247 return; 260 return;
248 261
249 errstr = ext3_decode_error(sb, errno, nbuf); 262 errstr = ext3_decode_error(sb, errno, nbuf);
250 printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n", 263 ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr);
251 sb->s_id, function, errstr);
252 264
253 ext3_handle_error(sb); 265 ext3_handle_error(sb);
254} 266}
@@ -268,24 +280,23 @@ void ext3_abort (struct super_block * sb, const char * function,
268{ 280{
269 va_list args; 281 va_list args;
270 282
271 printk (KERN_CRIT "ext3_abort called.\n");
272
273 va_start(args, fmt); 283 va_start(args, fmt);
274 printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); 284 printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function);
275 vprintk(fmt, args); 285 vprintk(fmt, args);
276 printk("\n"); 286 printk("\n");
277 va_end(args); 287 va_end(args);
278 288
279 if (test_opt(sb, ERRORS_PANIC)) 289 if (test_opt(sb, ERRORS_PANIC))
280 panic("EXT3-fs panic from previous error\n"); 290 panic("EXT3-fs: panic from previous error\n");
281 291
282 if (sb->s_flags & MS_RDONLY) 292 if (sb->s_flags & MS_RDONLY)
283 return; 293 return;
284 294
285 printk(KERN_CRIT "Remounting filesystem read-only\n"); 295 ext3_msg(sb, KERN_CRIT,
296 "error: remounting filesystem read-only");
286 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; 297 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
287 sb->s_flags |= MS_RDONLY; 298 sb->s_flags |= MS_RDONLY;
288 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; 299 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
289 if (EXT3_SB(sb)->s_journal) 300 if (EXT3_SB(sb)->s_journal)
290 journal_abort(EXT3_SB(sb)->s_journal, -EIO); 301 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
291} 302}
@@ -296,7 +307,7 @@ void ext3_warning (struct super_block * sb, const char * function,
296 va_list args; 307 va_list args;
297 308
298 va_start(args, fmt); 309 va_start(args, fmt);
299 printk(KERN_WARNING "EXT3-fs warning (device %s): %s: ", 310 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ",
300 sb->s_id, function); 311 sb->s_id, function);
301 vprintk(fmt, args); 312 vprintk(fmt, args);
302 printk("\n"); 313 printk("\n");
@@ -310,10 +321,10 @@ void ext3_update_dynamic_rev(struct super_block *sb)
310 if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) 321 if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
311 return; 322 return;
312 323
313 ext3_warning(sb, __func__, 324 ext3_msg(sb, KERN_WARNING,
314 "updating to rev %d because of new feature flag, " 325 "warning: updating to rev %d because of "
315 "running e2fsck is recommended", 326 "new feature flag, running e2fsck is recommended",
316 EXT3_DYNAMIC_REV); 327 EXT3_DYNAMIC_REV);
317 328
318 es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); 329 es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
319 es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); 330 es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
@@ -331,7 +342,7 @@ void ext3_update_dynamic_rev(struct super_block *sb)
331/* 342/*
332 * Open the external journal device 343 * Open the external journal device
333 */ 344 */
334static struct block_device *ext3_blkdev_get(dev_t dev) 345static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
335{ 346{
336 struct block_device *bdev; 347 struct block_device *bdev;
337 char b[BDEVNAME_SIZE]; 348 char b[BDEVNAME_SIZE];
@@ -342,8 +353,9 @@ static struct block_device *ext3_blkdev_get(dev_t dev)
342 return bdev; 353 return bdev;
343 354
344fail: 355fail:
345 printk(KERN_ERR "EXT3: failed to open journal device %s: %ld\n", 356 ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld",
346 __bdevname(dev, b), PTR_ERR(bdev)); 357 __bdevname(dev, b), PTR_ERR(bdev));
358
347 return NULL; 359 return NULL;
348} 360}
349 361
@@ -378,13 +390,13 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
378{ 390{
379 struct list_head *l; 391 struct list_head *l;
380 392
381 printk(KERN_ERR "sb orphan head is %d\n", 393 ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d",
382 le32_to_cpu(sbi->s_es->s_last_orphan)); 394 le32_to_cpu(sbi->s_es->s_last_orphan));
383 395
384 printk(KERN_ERR "sb_info orphan list:\n"); 396 ext3_msg(sb, KERN_ERR, "sb_info orphan list:");
385 list_for_each(l, &sbi->s_orphan) { 397 list_for_each(l, &sbi->s_orphan) {
386 struct inode *inode = orphan_list_entry(l); 398 struct inode *inode = orphan_list_entry(l);
387 printk(KERN_ERR " " 399 ext3_msg(sb, KERN_ERR, " "
388 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", 400 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
389 inode->i_sb->s_id, inode->i_ino, inode, 401 inode->i_sb->s_id, inode->i_ino, inode,
390 inode->i_mode, inode->i_nlink, 402 inode->i_mode, inode->i_nlink,
@@ -516,6 +528,8 @@ static void destroy_inodecache(void)
516static void ext3_clear_inode(struct inode *inode) 528static void ext3_clear_inode(struct inode *inode)
517{ 529{
518 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; 530 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
531
532 dquot_drop(inode);
519 ext3_discard_reservation(inode); 533 ext3_discard_reservation(inode);
520 EXT3_I(inode)->i_block_alloc_info = NULL; 534 EXT3_I(inode)->i_block_alloc_info = NULL;
521 if (unlikely(rsv)) 535 if (unlikely(rsv))
@@ -527,9 +541,22 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
527#if defined(CONFIG_QUOTA) 541#if defined(CONFIG_QUOTA)
528 struct ext3_sb_info *sbi = EXT3_SB(sb); 542 struct ext3_sb_info *sbi = EXT3_SB(sb);
529 543
530 if (sbi->s_jquota_fmt) 544 if (sbi->s_jquota_fmt) {
531 seq_printf(seq, ",jqfmt=%s", 545 char *fmtname = "";
532 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); 546
547 switch (sbi->s_jquota_fmt) {
548 case QFMT_VFS_OLD:
549 fmtname = "vfsold";
550 break;
551 case QFMT_VFS_V0:
552 fmtname = "vfsv0";
553 break;
554 case QFMT_VFS_V1:
555 fmtname = "vfsv1";
556 break;
557 }
558 seq_printf(seq, ",jqfmt=%s", fmtname);
559 }
533 560
534 if (sbi->s_qf_names[USRQUOTA]) 561 if (sbi->s_qf_names[USRQUOTA])
535 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); 562 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -537,10 +564,10 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
537 if (sbi->s_qf_names[GRPQUOTA]) 564 if (sbi->s_qf_names[GRPQUOTA])
538 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 565 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
539 566
540 if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) 567 if (test_opt(sb, USRQUOTA))
541 seq_puts(seq, ",usrquota"); 568 seq_puts(seq, ",usrquota");
542 569
543 if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) 570 if (test_opt(sb, GRPQUOTA))
544 seq_puts(seq, ",grpquota"); 571 seq_puts(seq, ",grpquota");
545#endif 572#endif
546} 573}
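
The switch introduced above replaces a two-way ternary so that the new vfsv1 journaled-quota format is reported in /proc/mounts alongside the older two. The same mapping, compressed into a lookup (the QFMT_* values here are illustrative stand-ins, not the kernel constants):

#include <stdio.h>

enum { QFMT_VFS_OLD = 1, QFMT_VFS_V0 = 2, QFMT_VFS_V1 = 4 };

static const char *jqfmt_name(int fmt)
{
	switch (fmt) {
	case QFMT_VFS_OLD: return "vfsold";
	case QFMT_VFS_V0:  return "vfsv0";
	case QFMT_VFS_V1:  return "vfsv1";
	default:           return "";
	}
}

int main(void)
{
	printf(",jqfmt=%s\n", jqfmt_name(QFMT_VFS_V1));	/* ,jqfmt=vfsv1 */
	return 0;
}
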
@@ -631,11 +658,13 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
631 if (test_opt(sb, NOBH)) 658 if (test_opt(sb, NOBH))
632 seq_puts(seq, ",nobh"); 659 seq_puts(seq, ",nobh");
633 660
634 seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & 661 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
635 EXT3_MOUNT_DATA_FLAGS));
636 if (test_opt(sb, DATA_ERR_ABORT)) 662 if (test_opt(sb, DATA_ERR_ABORT))
637 seq_puts(seq, ",data_err=abort"); 663 seq_puts(seq, ",data_err=abort");
638 664
665 if (test_opt(sb, NOLOAD))
666 seq_puts(seq, ",norecovery");
667
639 ext3_show_quota_options(seq, sb); 668 ext3_show_quota_options(seq, sb);
640 669
641 return 0; 670 return 0;
@@ -723,13 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
723 const char *data, size_t len, loff_t off); 752 const char *data, size_t len, loff_t off);
724 753
725static const struct dquot_operations ext3_quota_operations = { 754static const struct dquot_operations ext3_quota_operations = {
726 .initialize = dquot_initialize,
727 .drop = dquot_drop,
728 .alloc_space = dquot_alloc_space,
729 .alloc_inode = dquot_alloc_inode,
730 .free_space = dquot_free_space,
731 .free_inode = dquot_free_inode,
732 .transfer = dquot_transfer,
733 .write_dquot = ext3_write_dquot, 755 .write_dquot = ext3_write_dquot,
734 .acquire_dquot = ext3_acquire_dquot, 756 .acquire_dquot = ext3_acquire_dquot,
735 .release_dquot = ext3_release_dquot, 757 .release_dquot = ext3_release_dquot,
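
With the generic methods gone from ext3_quota_operations, the filesystem calls the generic helpers directly where they are needed — dquot_drop() in ext3_clear_inode() above, dquot_initialize() in the orphan-cleanup hunk further down — and the table keeps only the journaling-aware hooks. A sketch of that direct-call shape (all names are illustrative stubs):

#include <stdio.h>

static void dquot_initialize_stub(const char *who)
{
	printf("%s: generic quota init\n", who);
}

static void dquot_drop_stub(const char *who)
{
	printf("%s: generic quota drop\n", who);
}

/* The callers name the generic helpers directly instead of routing
 * through .initialize/.drop method pointers. */
static void clear_inode(void)
{
	dquot_drop_stub("clear_inode");
}

static void orphan_cleanup(void)
{
	dquot_initialize_stub("orphan_cleanup");
}

int main(void)
{
	clear_inode();
	orphan_cleanup();
	return 0;
}
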
@@ -787,9 +809,9 @@ enum {
787 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 809 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
788 Opt_data_err_abort, Opt_data_err_ignore, 810 Opt_data_err_abort, Opt_data_err_ignore,
789 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 811 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
790 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 812 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
791 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 813 Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
792 Opt_grpquota 814 Opt_usrquota, Opt_grpquota
793}; 815};
794 816
795static const match_table_t tokens = { 817static const match_table_t tokens = {
@@ -818,6 +840,7 @@ static const match_table_t tokens = {
818 {Opt_reservation, "reservation"}, 840 {Opt_reservation, "reservation"},
819 {Opt_noreservation, "noreservation"}, 841 {Opt_noreservation, "noreservation"},
820 {Opt_noload, "noload"}, 842 {Opt_noload, "noload"},
843 {Opt_noload, "norecovery"},
821 {Opt_nobh, "nobh"}, 844 {Opt_nobh, "nobh"},
822 {Opt_bh, "bh"}, 845 {Opt_bh, "bh"},
823 {Opt_commit, "commit=%u"}, 846 {Opt_commit, "commit=%u"},
@@ -836,6 +859,7 @@ static const match_table_t tokens = {
836 {Opt_grpjquota, "grpjquota=%s"}, 859 {Opt_grpjquota, "grpjquota=%s"},
837 {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, 860 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
838 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, 861 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
862 {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
839 {Opt_grpquota, "grpquota"}, 863 {Opt_grpquota, "grpquota"},
840 {Opt_noquota, "noquota"}, 864 {Opt_noquota, "noquota"},
841 {Opt_quota, "quota"}, 865 {Opt_quota, "quota"},
@@ -845,7 +869,7 @@ static const match_table_t tokens = {
845 {Opt_err, NULL}, 869 {Opt_err, NULL},
846}; 870};
847 871
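
Neither entry added to the table above needs new parsing code: "norecovery" is an alias mapped to the existing Opt_noload token, and jqfmt=vfsv1 reuses the jqfmt handling with one more case. A sketch of how several strings can share one token (an illustration, not the kernel's match_token()):

#include <stdio.h>
#include <string.h>

enum { Opt_err = 0, Opt_noload = 1 };

struct token { int id; const char *pattern; };

static const struct token tokens[] = {
	{ Opt_noload, "noload" },
	{ Opt_noload, "norecovery" },	/* alias: same token id */
	{ Opt_err, NULL },
};

static int match_option(const char *opt)
{
	const struct token *t;

	for (t = tokens; t->pattern; t++)
		if (!strcmp(opt, t->pattern))
			return t->id;
	return Opt_err;
}

int main(void)
{
	printf("%d %d\n", match_option("noload"), match_option("norecovery"));
	return 0;
}
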
848static ext3_fsblk_t get_sb_block(void **data) 872static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
849{ 873{
850 ext3_fsblk_t sb_block; 874 ext3_fsblk_t sb_block;
851 char *options = (char *) *data; 875 char *options = (char *) *data;
@@ -856,7 +880,7 @@ static ext3_fsblk_t get_sb_block(void **data)
856 /*todo: use simple_strtoll with >32bit ext3 */ 880 /*todo: use simple_strtoll with >32bit ext3 */
857 sb_block = simple_strtoul(options, &options, 0); 881 sb_block = simple_strtoul(options, &options, 0);
858 if (*options && *options != ',') { 882 if (*options && *options != ',') {
859 printk("EXT3-fs: Invalid sb specification: %s\n", 883 ext3_msg(sb, "error: invalid sb specification: %s",
860 (char *) *data); 884 (char *) *data);
861 return 1; 885 return 1;
862 } 886 }
@@ -866,6 +890,63 @@ static ext3_fsblk_t get_sb_block(void **data)
866 return sb_block; 890 return sb_block;
867} 891}
868 892
893#ifdef CONFIG_QUOTA
894static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
895{
896 struct ext3_sb_info *sbi = EXT3_SB(sb);
897 char *qname;
898
899 if (sb_any_quota_loaded(sb) &&
900 !sbi->s_qf_names[qtype]) {
901 ext3_msg(sb, KERN_ERR,
902 "Cannot change journaled "
903 "quota options when quota turned on");
904 return 0;
905 }
906 qname = match_strdup(args);
907 if (!qname) {
908 ext3_msg(sb, KERN_ERR,
909 "Not enough memory for storing quotafile name");
910 return 0;
911 }
912 if (sbi->s_qf_names[qtype] &&
913 strcmp(sbi->s_qf_names[qtype], qname)) {
914 ext3_msg(sb, KERN_ERR,
915 "%s quota file already specified", QTYPE2NAME(qtype));
916 kfree(qname);
917 return 0;
918 }
919 sbi->s_qf_names[qtype] = qname;
920 if (strchr(sbi->s_qf_names[qtype], '/')) {
921 ext3_msg(sb, KERN_ERR,
922 "quotafile must be on filesystem root");
923 kfree(sbi->s_qf_names[qtype]);
924 sbi->s_qf_names[qtype] = NULL;
925 return 0;
926 }
927 set_opt(sbi->s_mount_opt, QUOTA);
928 return 1;
929}
930
931 static int clear_qf_name(struct super_block *sb, int qtype)
932 {
933 struct ext3_sb_info *sbi = EXT3_SB(sb);
934
935 if (sb_any_quota_loaded(sb) &&
936 sbi->s_qf_names[qtype]) {
937 ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
938 " when quota turned on");
939 return 0;
940 }
941 /*
942 * The space will be released later when all options are confirmed
943 * to be correct
944 */
945 sbi->s_qf_names[qtype] = NULL;
946 return 1;
947}
948#endif
949
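
Both helpers use parse_options()'s return convention — 1 on success, 0 to reject the whole option string — so the call sites below can just propagate a zero to fail the mount. A sketch of that contract with hypothetical names:

#include <stdio.h>
#include <string.h>

/* Returns 1 if the option is accepted, 0 to abort option parsing. */
static int set_name(const char **slot, const char *value)
{
	if (*slot && strcmp(*slot, value) != 0) {
		fprintf(stderr, "error: quota file already specified\n");
		return 0;
	}
	*slot = value;
	return 1;
}

int main(void)
{
	const char *qf_name = NULL;

	/* Mirrors "if (!set_qf_name(...)) return 0;" in the parser. */
	if (!set_name(&qf_name, "aquota.user"))
		return 1;
	if (!set_name(&qf_name, "other.user"))
		return 1;	/* rejected: a different name was set */
	return 0;
}
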
869static int parse_options (char *options, struct super_block *sb, 950static int parse_options (char *options, struct super_block *sb,
870 unsigned int *inum, unsigned long *journal_devnum, 951 unsigned int *inum, unsigned long *journal_devnum,
871 ext3_fsblk_t *n_blocks_count, int is_remount) 952 ext3_fsblk_t *n_blocks_count, int is_remount)
@@ -876,8 +957,7 @@ static int parse_options (char *options, struct super_block *sb,
876 int data_opt = 0; 957 int data_opt = 0;
877 int option; 958 int option;
878#ifdef CONFIG_QUOTA 959#ifdef CONFIG_QUOTA
879 int qtype, qfmt; 960 int qfmt;
880 char *qname;
881#endif 961#endif
882 962
883 if (!options) 963 if (!options)
@@ -956,7 +1036,8 @@ static int parse_options (char *options, struct super_block *sb,
956#else 1036#else
957 case Opt_user_xattr: 1037 case Opt_user_xattr:
958 case Opt_nouser_xattr: 1038 case Opt_nouser_xattr:
959 printk("EXT3 (no)user_xattr options not supported\n"); 1039 ext3_msg(sb, KERN_INFO,
1040 "(no)user_xattr options not supported");
960 break; 1041 break;
961#endif 1042#endif
962#ifdef CONFIG_EXT3_FS_POSIX_ACL 1043#ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -969,7 +1050,8 @@ static int parse_options (char *options, struct super_block *sb,
969#else 1050#else
970 case Opt_acl: 1051 case Opt_acl:
971 case Opt_noacl: 1052 case Opt_noacl:
972 printk("EXT3 (no)acl options not supported\n"); 1053 ext3_msg(sb, KERN_INFO,
1054 "(no)acl options not supported");
973 break; 1055 break;
974#endif 1056#endif
975 case Opt_reservation: 1057 case Opt_reservation:
@@ -985,16 +1067,16 @@ static int parse_options (char *options, struct super_block *sb,
985 user to specify an existing inode to be the 1067 user to specify an existing inode to be the
986 journal file. */ 1068 journal file. */
987 if (is_remount) { 1069 if (is_remount) {
988 printk(KERN_ERR "EXT3-fs: cannot specify " 1070 ext3_msg(sb, KERN_ERR, "error: cannot specify "
989 "journal on remount\n"); 1071 "journal on remount");
990 return 0; 1072 return 0;
991 } 1073 }
992 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); 1074 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
993 break; 1075 break;
994 case Opt_journal_inum: 1076 case Opt_journal_inum:
995 if (is_remount) { 1077 if (is_remount) {
996 printk(KERN_ERR "EXT3-fs: cannot specify " 1078 ext3_msg(sb, KERN_ERR, "error: cannot specify "
997 "journal on remount\n"); 1079 "journal on remount");
998 return 0; 1080 return 0;
999 } 1081 }
1000 if (match_int(&args[0], &option)) 1082 if (match_int(&args[0], &option))
@@ -1003,8 +1085,8 @@ static int parse_options (char *options, struct super_block *sb,
1003 break; 1085 break;
1004 case Opt_journal_dev: 1086 case Opt_journal_dev:
1005 if (is_remount) { 1087 if (is_remount) {
1006 printk(KERN_ERR "EXT3-fs: cannot specify " 1088 ext3_msg(sb, KERN_ERR, "error: cannot specify "
1007 "journal on remount\n"); 1089 "journal on remount");
1008 return 0; 1090 return 0;
1009 } 1091 }
1010 if (match_int(&args[0], &option)) 1092 if (match_int(&args[0], &option))
@@ -1033,21 +1115,19 @@ static int parse_options (char *options, struct super_block *sb,
1033 data_opt = EXT3_MOUNT_WRITEBACK_DATA; 1115 data_opt = EXT3_MOUNT_WRITEBACK_DATA;
1034 datacheck: 1116 datacheck:
1035 if (is_remount) { 1117 if (is_remount) {
1036 if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) 1118 if (test_opt(sb, DATA_FLAGS) == data_opt)
1037 == data_opt)
1038 break; 1119 break;
1039 printk(KERN_ERR 1120 ext3_msg(sb, KERN_ERR,
1040 "EXT3-fs (device %s): Cannot change " 1121 "error: cannot change "
1041 "data mode on remount. The filesystem " 1122 "data mode on remount. The filesystem "
1042 "is mounted in data=%s mode and you " 1123 "is mounted in data=%s mode and you "
1043 "try to remount it in data=%s mode.\n", 1124 "try to remount it in data=%s mode.",
1044 sb->s_id, 1125 data_mode_string(test_opt(sb,
1045 data_mode_string(sbi->s_mount_opt & 1126 DATA_FLAGS)),
1046 EXT3_MOUNT_DATA_FLAGS),
1047 data_mode_string(data_opt)); 1127 data_mode_string(data_opt));
1048 return 0; 1128 return 0;
1049 } else { 1129 } else {
1050 sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; 1130 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
1051 sbi->s_mount_opt |= data_opt; 1131 sbi->s_mount_opt |= data_opt;
1052 } 1132 }
1053 break; 1133 break;
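
The remount check above relies on EXT3_MOUNT_DATA_FLAGS being a mask that covers all three journaling modes, so test_opt(sb, DATA_FLAGS) extracts the current mode in one expression and switching modes means clearing the whole mask first. A sketch of the macro family; the flag values mirror the layout the code depends on but should be read as illustrative:

#include <stdio.h>

#define MOUNT_JOURNAL_DATA	0x0400	/* journal data and metadata */
#define MOUNT_ORDERED_DATA	0x0800	/* flush data before commit */
#define MOUNT_WRITEBACK_DATA	0x0C00	/* no data ordering */
#define MOUNT_DATA_FLAGS	0x0C00	/* mask over the three modes */

#define set_opt(opts, opt)	((opts) |= MOUNT_##opt)
#define clear_opt(opts, opt)	((opts) &= ~MOUNT_##opt)
#define test_opt(opts, opt)	((opts) & MOUNT_##opt)

int main(void)
{
	unsigned long mount_opt = 0;

	set_opt(mount_opt, ORDERED_DATA);
	/* Change mode: clear the full mask, then set the new bits. */
	clear_opt(mount_opt, DATA_FLAGS);
	set_opt(mount_opt, JOURNAL_DATA);
	printf("mode %#lx\n", test_opt(mount_opt, DATA_FLAGS));
	return 0;
}
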
@@ -1059,74 +1139,35 @@ static int parse_options (char *options, struct super_block *sb,
1059 break; 1139 break;
1060#ifdef CONFIG_QUOTA 1140#ifdef CONFIG_QUOTA
1061 case Opt_usrjquota: 1141 case Opt_usrjquota:
1062 qtype = USRQUOTA; 1142 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1063 goto set_qf_name;
1064 case Opt_grpjquota:
1065 qtype = GRPQUOTA;
1066set_qf_name:
1067 if (sb_any_quota_loaded(sb) &&
1068 !sbi->s_qf_names[qtype]) {
1069 printk(KERN_ERR
1070 "EXT3-fs: Cannot change journaled "
1071 "quota options when quota turned on.\n");
1072 return 0; 1143 return 0;
1073 } 1144 break;
1074 qname = match_strdup(&args[0]); 1145 case Opt_grpjquota:
1075 if (!qname) { 1146 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1076 printk(KERN_ERR
1077 "EXT3-fs: not enough memory for "
1078 "storing quotafile name.\n");
1079 return 0;
1080 }
1081 if (sbi->s_qf_names[qtype] &&
1082 strcmp(sbi->s_qf_names[qtype], qname)) {
1083 printk(KERN_ERR
1084 "EXT3-fs: %s quota file already "
1085 "specified.\n", QTYPE2NAME(qtype));
1086 kfree(qname);
1087 return 0;
1088 }
1089 sbi->s_qf_names[qtype] = qname;
1090 if (strchr(sbi->s_qf_names[qtype], '/')) {
1091 printk(KERN_ERR
1092 "EXT3-fs: quotafile must be on "
1093 "filesystem root.\n");
1094 kfree(sbi->s_qf_names[qtype]);
1095 sbi->s_qf_names[qtype] = NULL;
1096 return 0; 1147 return 0;
1097 }
1098 set_opt(sbi->s_mount_opt, QUOTA);
1099 break; 1148 break;
1100 case Opt_offusrjquota: 1149 case Opt_offusrjquota:
1101 qtype = USRQUOTA; 1150 if (!clear_qf_name(sb, USRQUOTA))
1102 goto clear_qf_name; 1151 return 0;
1152 break;
1103 case Opt_offgrpjquota: 1153 case Opt_offgrpjquota:
1104 qtype = GRPQUOTA; 1154 if (!clear_qf_name(sb, GRPQUOTA))
1105clear_qf_name:
1106 if (sb_any_quota_loaded(sb) &&
1107 sbi->s_qf_names[qtype]) {
1108 printk(KERN_ERR "EXT3-fs: Cannot change "
1109 "journaled quota options when "
1110 "quota turned on.\n");
1111 return 0; 1155 return 0;
1112 }
1113 /*
1114 * The space will be released later when all options
1115 * are confirmed to be correct
1116 */
1117 sbi->s_qf_names[qtype] = NULL;
1118 break; 1156 break;
1119 case Opt_jqfmt_vfsold: 1157 case Opt_jqfmt_vfsold:
1120 qfmt = QFMT_VFS_OLD; 1158 qfmt = QFMT_VFS_OLD;
1121 goto set_qf_format; 1159 goto set_qf_format;
1122 case Opt_jqfmt_vfsv0: 1160 case Opt_jqfmt_vfsv0:
1123 qfmt = QFMT_VFS_V0; 1161 qfmt = QFMT_VFS_V0;
1162 goto set_qf_format;
1163 case Opt_jqfmt_vfsv1:
1164 qfmt = QFMT_VFS_V1;
1124set_qf_format: 1165set_qf_format:
1125 if (sb_any_quota_loaded(sb) && 1166 if (sb_any_quota_loaded(sb) &&
1126 sbi->s_jquota_fmt != qfmt) { 1167 sbi->s_jquota_fmt != qfmt) {
1127 printk(KERN_ERR "EXT3-fs: Cannot change " 1168 ext3_msg(sb, KERN_ERR, "error: cannot change "
1128 "journaled quota options when " 1169 "journaled quota options when "
1129 "quota turned on.\n"); 1170 "quota turned on.");
1130 return 0; 1171 return 0;
1131 } 1172 }
1132 sbi->s_jquota_fmt = qfmt; 1173 sbi->s_jquota_fmt = qfmt;
@@ -1142,8 +1183,8 @@ set_qf_format:
1142 break; 1183 break;
1143 case Opt_noquota: 1184 case Opt_noquota:
1144 if (sb_any_quota_loaded(sb)) { 1185 if (sb_any_quota_loaded(sb)) {
1145 printk(KERN_ERR "EXT3-fs: Cannot change quota " 1186 ext3_msg(sb, KERN_ERR, "error: cannot change "
1146 "options when quota turned on.\n"); 1187 "quota options when quota turned on.");
1147 return 0; 1188 return 0;
1148 } 1189 }
1149 clear_opt(sbi->s_mount_opt, QUOTA); 1190 clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1154,8 +1195,8 @@ set_qf_format:
1154 case Opt_quota: 1195 case Opt_quota:
1155 case Opt_usrquota: 1196 case Opt_usrquota:
1156 case Opt_grpquota: 1197 case Opt_grpquota:
1157 printk(KERN_ERR 1198 ext3_msg(sb, KERN_ERR,
1158 "EXT3-fs: quota options not supported.\n"); 1199 "error: quota options not supported.");
1159 break; 1200 break;
1160 case Opt_usrjquota: 1201 case Opt_usrjquota:
1161 case Opt_grpjquota: 1202 case Opt_grpjquota:
@@ -1163,9 +1204,10 @@ set_qf_format:
1163 case Opt_offgrpjquota: 1204 case Opt_offgrpjquota:
1164 case Opt_jqfmt_vfsold: 1205 case Opt_jqfmt_vfsold:
1165 case Opt_jqfmt_vfsv0: 1206 case Opt_jqfmt_vfsv0:
1166 printk(KERN_ERR 1207 case Opt_jqfmt_vfsv1:
1167 "EXT3-fs: journaled quota options not " 1208 ext3_msg(sb, KERN_ERR,
1168 "supported.\n"); 1209 "error: journaled quota options not "
1210 "supported.");
1169 break; 1211 break;
1170 case Opt_noquota: 1212 case Opt_noquota:
1171 break; 1213 break;
@@ -1185,8 +1227,9 @@ set_qf_format:
1185 break; 1227 break;
1186 case Opt_resize: 1228 case Opt_resize:
1187 if (!is_remount) { 1229 if (!is_remount) {
1188 printk("EXT3-fs: resize option only available " 1230 ext3_msg(sb, KERN_ERR,
1189 "for remount\n"); 1231 "error: resize option only available "
1232 "for remount");
1190 return 0; 1233 return 0;
1191 } 1234 }
1192 if (match_int(&args[0], &option) != 0) 1235 if (match_int(&args[0], &option) != 0)
@@ -1200,41 +1243,35 @@ set_qf_format:
1200 clear_opt(sbi->s_mount_opt, NOBH); 1243 clear_opt(sbi->s_mount_opt, NOBH);
1201 break; 1244 break;
1202 default: 1245 default:
1203 printk (KERN_ERR 1246 ext3_msg(sb, KERN_ERR,
1204 "EXT3-fs: Unrecognized mount option \"%s\" " 1247 "error: unrecognized mount option \"%s\" "
1205 "or missing value\n", p); 1248 "or missing value", p);
1206 return 0; 1249 return 0;
1207 } 1250 }
1208 } 1251 }
1209#ifdef CONFIG_QUOTA 1252#ifdef CONFIG_QUOTA
1210 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1253 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1211 if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) && 1254 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1212 sbi->s_qf_names[USRQUOTA])
1213 clear_opt(sbi->s_mount_opt, USRQUOTA); 1255 clear_opt(sbi->s_mount_opt, USRQUOTA);
1214 1256 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1215 if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
1216 sbi->s_qf_names[GRPQUOTA])
1217 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1257 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1218 1258
1219 if ((sbi->s_qf_names[USRQUOTA] && 1259 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1220 (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) || 1260 ext3_msg(sb, KERN_ERR, "error: old and new quota "
1221 (sbi->s_qf_names[GRPQUOTA] && 1261 "format mixing.");
1222 (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
1223 printk(KERN_ERR "EXT3-fs: old and new quota "
1224 "format mixing.\n");
1225 return 0; 1262 return 0;
1226 } 1263 }
1227 1264
1228 if (!sbi->s_jquota_fmt) { 1265 if (!sbi->s_jquota_fmt) {
1229 printk(KERN_ERR "EXT3-fs: journaled quota format " 1266 ext3_msg(sb, KERN_ERR, "error: journaled quota format "
1230 "not specified.\n"); 1267 "not specified.");
1231 return 0; 1268 return 0;
1232 } 1269 }
1233 } else { 1270 } else {
1234 if (sbi->s_jquota_fmt) { 1271 if (sbi->s_jquota_fmt) {
1235 printk(KERN_ERR "EXT3-fs: journaled quota format " 1272 ext3_msg(sb, KERN_ERR, "error: journaled quota format "
1236 "specified with no journaling " 1273 "specified with no journaling "
1237 "enabled.\n"); 1274 "enabled.");
1238 return 0; 1275 return 0;
1239 } 1276 }
1240 } 1277 }
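
The post-parse checks above enforce three rules: a mount may not mix old-style usrquota/grpquota flags with journaled quota files of the other type, journaled quota needs an explicit format, and a format without quota files is meaningless. The same decision logic in a standalone sketch (field names are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct opts {
	const char *usr_qf_name;	/* journaled quota file or NULL */
	const char *grp_qf_name;
	bool usrquota, grpquota;	/* old-style quota flags */
	int jquota_fmt;			/* 0 = no format chosen */
};

static bool quota_opts_valid(struct opts *o)
{
	if (o->usr_qf_name || o->grp_qf_name) {
		/* A flag for a type that also has a journaled quota file
		 * is redundant and simply dropped... */
		if (o->usr_qf_name)
			o->usrquota = false;
		if (o->grp_qf_name)
			o->grpquota = false;
		/* ...any flag still standing mixes old and new formats. */
		if (o->usrquota || o->grpquota)
			return false;
		if (!o->jquota_fmt)
			return false;	/* journaled quota needs a format */
	} else if (o->jquota_fmt) {
		return false;		/* format without quota files */
	}
	return true;
}

int main(void)
{
	struct opts o = { .usr_qf_name = "aquota.user", .grpquota = true };

	printf("valid: %d\n", quota_opts_valid(&o));	/* 0: mixed formats */
	return 0;
}
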
@@ -1249,31 +1286,33 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1249 int res = 0; 1286 int res = 0;
1250 1287
1251 if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { 1288 if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
1252 printk (KERN_ERR "EXT3-fs warning: revision level too high, " 1289 ext3_msg(sb, KERN_ERR,
1253 "forcing read-only mode\n"); 1290 "error: revision level too high, "
1291 "forcing read-only mode");
1254 res = MS_RDONLY; 1292 res = MS_RDONLY;
1255 } 1293 }
1256 if (read_only) 1294 if (read_only)
1257 return res; 1295 return res;
1258 if (!(sbi->s_mount_state & EXT3_VALID_FS)) 1296 if (!(sbi->s_mount_state & EXT3_VALID_FS))
1259 printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, " 1297 ext3_msg(sb, KERN_WARNING,
1260 "running e2fsck is recommended\n"); 1298 "warning: mounting unchecked fs, "
1299 "running e2fsck is recommended");
1261 else if ((sbi->s_mount_state & EXT3_ERROR_FS)) 1300 else if ((sbi->s_mount_state & EXT3_ERROR_FS))
1262 printk (KERN_WARNING 1301 ext3_msg(sb, KERN_WARNING,
1263 "EXT3-fs warning: mounting fs with errors, " 1302 "warning: mounting fs with errors, "
1264 "running e2fsck is recommended\n"); 1303 "running e2fsck is recommended");
1265 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1304 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
1266 le16_to_cpu(es->s_mnt_count) >= 1305 le16_to_cpu(es->s_mnt_count) >=
1267 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1306 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1268 printk (KERN_WARNING 1307 ext3_msg(sb, KERN_WARNING,
1269 "EXT3-fs warning: maximal mount count reached, " 1308 "warning: maximal mount count reached, "
1270 "running e2fsck is recommended\n"); 1309 "running e2fsck is recommended");
1271 else if (le32_to_cpu(es->s_checkinterval) && 1310 else if (le32_to_cpu(es->s_checkinterval) &&
1272 (le32_to_cpu(es->s_lastcheck) + 1311 (le32_to_cpu(es->s_lastcheck) +
1273 le32_to_cpu(es->s_checkinterval) <= get_seconds())) 1312 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1274 printk (KERN_WARNING 1313 ext3_msg(sb, KERN_WARNING,
1275 "EXT3-fs warning: checktime reached, " 1314 "warning: checktime reached, "
1276 "running e2fsck is recommended\n"); 1315 "running e2fsck is recommended");
1277#if 0 1316#if 0
1278 /* @@@ We _will_ want to clear the valid bit if we find 1317 /* @@@ We _will_ want to clear the valid bit if we find
1279 inconsistencies, to force a fsck at reboot. But for 1318 inconsistencies, to force a fsck at reboot. But for
@@ -1290,22 +1329,20 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1290 1329
1291 ext3_commit_super(sb, es, 1); 1330 ext3_commit_super(sb, es, 1);
1292 if (test_opt(sb, DEBUG)) 1331 if (test_opt(sb, DEBUG))
1293 printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, " 1332 ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, "
1294 "bpg=%lu, ipg=%lu, mo=%04lx]\n", 1333 "bpg=%lu, ipg=%lu, mo=%04lx]",
1295 sb->s_blocksize, 1334 sb->s_blocksize,
1296 sbi->s_groups_count, 1335 sbi->s_groups_count,
1297 EXT3_BLOCKS_PER_GROUP(sb), 1336 EXT3_BLOCKS_PER_GROUP(sb),
1298 EXT3_INODES_PER_GROUP(sb), 1337 EXT3_INODES_PER_GROUP(sb),
1299 sbi->s_mount_opt); 1338 sbi->s_mount_opt);
1300 1339
1301 printk(KERN_INFO "EXT3 FS on %s, ", sb->s_id);
1302 if (EXT3_SB(sb)->s_journal->j_inode == NULL) { 1340 if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
1303 char b[BDEVNAME_SIZE]; 1341 char b[BDEVNAME_SIZE];
1304 1342 ext3_msg(sb, KERN_INFO, "using external journal on %s",
1305 printk("external journal on %s\n",
1306 bdevname(EXT3_SB(sb)->s_journal->j_dev, b)); 1343 bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
1307 } else { 1344 } else {
1308 printk("internal journal\n"); 1345 ext3_msg(sb, KERN_INFO, "using internal journal");
1309 } 1346 }
1310 return res; 1347 return res;
1311} 1348}
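
The reworked mount banner above decides between the two messages by checking whether the journal is backed by an inode: an internal journal lives in a regular file inside the filesystem, while an external one sits on its own block device and has no inode. A sketch of that distinction (illustrative struct, not the jbd journal_t):

#include <stdio.h>

struct journal {
	void *j_inode;		/* NULL for a device-backed journal */
	const char *j_devname;
};

static void report(const struct journal *j)
{
	if (j->j_inode == NULL)
		printf("using external journal on %s\n", j->j_devname);
	else
		printf("using internal journal\n");
}

int main(void)
{
	struct journal internal = { &internal, NULL };
	struct journal external = { NULL, "sdb1" };

	report(&internal);
	report(&external);
	return 0;
}
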
@@ -1399,8 +1436,8 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1399 } 1436 }
1400 1437
1401 if (bdev_read_only(sb->s_bdev)) { 1438 if (bdev_read_only(sb->s_bdev)) {
1402 printk(KERN_ERR "EXT3-fs: write access " 1439 ext3_msg(sb, KERN_ERR, "error: write access "
1403 "unavailable, skipping orphan cleanup.\n"); 1440 "unavailable, skipping orphan cleanup.");
1404 return; 1441 return;
1405 } 1442 }
1406 1443
@@ -1414,8 +1451,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1414 } 1451 }
1415 1452
1416 if (s_flags & MS_RDONLY) { 1453 if (s_flags & MS_RDONLY) {
1417 printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n", 1454 ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
1418 sb->s_id);
1419 sb->s_flags &= ~MS_RDONLY; 1455 sb->s_flags &= ~MS_RDONLY;
1420 } 1456 }
1421#ifdef CONFIG_QUOTA 1457#ifdef CONFIG_QUOTA
@@ -1426,9 +1462,9 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1426 if (EXT3_SB(sb)->s_qf_names[i]) { 1462 if (EXT3_SB(sb)->s_qf_names[i]) {
1427 int ret = ext3_quota_on_mount(sb, i); 1463 int ret = ext3_quota_on_mount(sb, i);
1428 if (ret < 0) 1464 if (ret < 0)
1429 printk(KERN_ERR 1465 ext3_msg(sb, KERN_ERR,
1430 "EXT3-fs: Cannot turn on journaled " 1466 "error: cannot turn on journaled "
1431 "quota: error %d\n", ret); 1467 "quota: %d", ret);
1432 } 1468 }
1433 } 1469 }
1434#endif 1470#endif
@@ -1443,7 +1479,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1443 } 1479 }
1444 1480
1445 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); 1481 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1446 vfs_dq_init(inode); 1482 dquot_initialize(inode);
1447 if (inode->i_nlink) { 1483 if (inode->i_nlink) {
1448 printk(KERN_DEBUG 1484 printk(KERN_DEBUG
1449 "%s: truncating inode %lu to %Ld bytes\n", 1485 "%s: truncating inode %lu to %Ld bytes\n",
@@ -1466,11 +1502,11 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1466#define PLURAL(x) (x), ((x)==1) ? "" : "s" 1502#define PLURAL(x) (x), ((x)==1) ? "" : "s"
1467 1503
1468 if (nr_orphans) 1504 if (nr_orphans)
1469 printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", 1505 ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
1470 sb->s_id, PLURAL(nr_orphans)); 1506 PLURAL(nr_orphans));
1471 if (nr_truncates) 1507 if (nr_truncates)
1472 printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", 1508 ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
1473 sb->s_id, PLURAL(nr_truncates)); 1509 PLURAL(nr_truncates));
1474#ifdef CONFIG_QUOTA 1510#ifdef CONFIG_QUOTA
1475 /* Turn quotas off */ 1511 /* Turn quotas off */
1476 for (i = 0; i < MAXQUOTAS; i++) { 1512 for (i = 0; i < MAXQUOTAS; i++) {
@@ -1554,7 +1590,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1554 struct ext3_super_block *es = NULL; 1590 struct ext3_super_block *es = NULL;
1555 struct ext3_sb_info *sbi; 1591 struct ext3_sb_info *sbi;
1556 ext3_fsblk_t block; 1592 ext3_fsblk_t block;
1557 ext3_fsblk_t sb_block = get_sb_block(&data); 1593 ext3_fsblk_t sb_block = get_sb_block(&data, sb);
1558 ext3_fsblk_t logic_sb_block; 1594 ext3_fsblk_t logic_sb_block;
1559 unsigned long offset = 0; 1595 unsigned long offset = 0;
1560 unsigned int journal_inum = 0; 1596 unsigned int journal_inum = 0;
@@ -1590,7 +1626,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1590 1626
1591 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); 1627 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
1592 if (!blocksize) { 1628 if (!blocksize) {
1593 printk(KERN_ERR "EXT3-fs: unable to set blocksize\n"); 1629 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
1594 goto out_fail; 1630 goto out_fail;
1595 } 1631 }
1596 1632
@@ -1606,7 +1642,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1606 } 1642 }
1607 1643
1608 if (!(bh = sb_bread(sb, logic_sb_block))) { 1644 if (!(bh = sb_bread(sb, logic_sb_block))) {
1609 printk (KERN_ERR "EXT3-fs: unable to read superblock\n"); 1645 ext3_msg(sb, KERN_ERR, "error: unable to read superblock");
1610 goto out_fail; 1646 goto out_fail;
1611 } 1647 }
1612 /* 1648 /*
@@ -1636,11 +1672,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1636 set_opt(sbi->s_mount_opt, POSIX_ACL); 1672 set_opt(sbi->s_mount_opt, POSIX_ACL);
1637#endif 1673#endif
1638 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) 1674 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
1639 sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; 1675 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1640 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) 1676 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
1641 sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA; 1677 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1642 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) 1678 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
1643 sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA; 1679 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
1644 1680
1645 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) 1681 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
1646 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1682 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -1659,15 +1695,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1659 goto failed_mount; 1695 goto failed_mount;
1660 1696
1661 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 1697 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1662 ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1698 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
1663 1699
1664 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && 1700 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
1665 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || 1701 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
1666 EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || 1702 EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
1667 EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) 1703 EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
1668 printk(KERN_WARNING 1704 ext3_msg(sb, KERN_WARNING,
1669 "EXT3-fs warning: feature flags set on rev 0 fs, " 1705 "warning: feature flags set on rev 0 fs, "
1670 "running e2fsck is recommended\n"); 1706 "running e2fsck is recommended");
1671 /* 1707 /*
1672 * Check feature flags regardless of the revision level, since we 1708 * Check feature flags regardless of the revision level, since we
1673 * previously didn't change the revision level when setting the flags, 1709 * previously didn't change the revision level when setting the flags,
@@ -1675,25 +1711,25 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1675 */ 1711 */
1676 features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP); 1712 features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
1677 if (features) { 1713 if (features) {
1678 printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of " 1714 ext3_msg(sb, KERN_ERR,
1679 "unsupported optional features (%x).\n", 1715 "error: couldn't mount because of unsupported "
1680 sb->s_id, le32_to_cpu(features)); 1716 "optional features (%x)", le32_to_cpu(features));
1681 goto failed_mount; 1717 goto failed_mount;
1682 } 1718 }
1683 features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP); 1719 features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
1684 if (!(sb->s_flags & MS_RDONLY) && features) { 1720 if (!(sb->s_flags & MS_RDONLY) && features) {
1685 printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of " 1721 ext3_msg(sb, KERN_ERR,
1686 "unsupported optional features (%x).\n", 1722 "error: couldn't mount RDWR because of unsupported "
1687 sb->s_id, le32_to_cpu(features)); 1723 "optional features (%x)", le32_to_cpu(features));
1688 goto failed_mount; 1724 goto failed_mount;
1689 } 1725 }
1690 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 1726 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1691 1727
1692 if (blocksize < EXT3_MIN_BLOCK_SIZE || 1728 if (blocksize < EXT3_MIN_BLOCK_SIZE ||
1693 blocksize > EXT3_MAX_BLOCK_SIZE) { 1729 blocksize > EXT3_MAX_BLOCK_SIZE) {
1694 printk(KERN_ERR 1730 ext3_msg(sb, KERN_ERR,
1695 "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", 1731 "error: couldn't mount because of unsupported "
1696 blocksize, sb->s_id); 1732 "filesystem blocksize %d", blocksize);
1697 goto failed_mount; 1733 goto failed_mount;
1698 } 1734 }
1699 1735
@@ -1704,30 +1740,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1704 * than the hardware sectorsize for the machine. 1740 * than the hardware sectorsize for the machine.
1705 */ 1741 */
1706 if (blocksize < hblock) { 1742 if (blocksize < hblock) {
1707 printk(KERN_ERR "EXT3-fs: blocksize %d too small for " 1743 ext3_msg(sb, KERN_ERR,
1708 "device blocksize %d.\n", blocksize, hblock); 1744 "error: fsblocksize %d too small for "
1745 "hardware sectorsize %d", blocksize, hblock);
1709 goto failed_mount; 1746 goto failed_mount;
1710 } 1747 }
1711 1748
1712 brelse (bh); 1749 brelse (bh);
1713 if (!sb_set_blocksize(sb, blocksize)) { 1750 if (!sb_set_blocksize(sb, blocksize)) {
1714 printk(KERN_ERR "EXT3-fs: bad blocksize %d.\n", 1751 ext3_msg(sb, KERN_ERR,
1715 blocksize); 1752 "error: bad blocksize %d", blocksize);
1716 goto out_fail; 1753 goto out_fail;
1717 } 1754 }
1718 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; 1755 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
1719 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; 1756 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
1720 bh = sb_bread(sb, logic_sb_block); 1757 bh = sb_bread(sb, logic_sb_block);
1721 if (!bh) { 1758 if (!bh) {
1722 printk(KERN_ERR 1759 ext3_msg(sb, KERN_ERR,
1723 "EXT3-fs: Can't read superblock on 2nd try.\n"); 1760 "error: can't read superblock on 2nd try");
1724 goto failed_mount; 1761 goto failed_mount;
1725 } 1762 }
1726 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); 1763 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
1727 sbi->s_es = es; 1764 sbi->s_es = es;
1728 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { 1765 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
1729 printk (KERN_ERR 1766 ext3_msg(sb, KERN_ERR,
1730 "EXT3-fs: Magic mismatch, very weird !\n"); 1767 "error: magic mismatch");
1731 goto failed_mount; 1768 goto failed_mount;
1732 } 1769 }
1733 } 1770 }
@@ -1743,8 +1780,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1743 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || 1780 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
1744 (!is_power_of_2(sbi->s_inode_size)) || 1781 (!is_power_of_2(sbi->s_inode_size)) ||
1745 (sbi->s_inode_size > blocksize)) { 1782 (sbi->s_inode_size > blocksize)) {
1746 printk (KERN_ERR 1783 ext3_msg(sb, KERN_ERR,
1747 "EXT3-fs: unsupported inode size: %d\n", 1784 "error: unsupported inode size: %d",
1748 sbi->s_inode_size); 1785 sbi->s_inode_size);
1749 goto failed_mount; 1786 goto failed_mount;
1750 } 1787 }
@@ -1752,8 +1789,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1752 sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << 1789 sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
1753 le32_to_cpu(es->s_log_frag_size); 1790 le32_to_cpu(es->s_log_frag_size);
1754 if (blocksize != sbi->s_frag_size) { 1791 if (blocksize != sbi->s_frag_size) {
1755 printk(KERN_ERR 1792 ext3_msg(sb, KERN_ERR,
1756 "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n", 1793 "error: fragsize %lu != blocksize %u (unsupported)",
1757 sbi->s_frag_size, blocksize); 1794 sbi->s_frag_size, blocksize);
1758 goto failed_mount; 1795 goto failed_mount;
1759 } 1796 }
@@ -1789,31 +1826,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1789 } 1826 }
1790 1827
1791 if (sbi->s_blocks_per_group > blocksize * 8) { 1828 if (sbi->s_blocks_per_group > blocksize * 8) {
1792 printk (KERN_ERR 1829 ext3_msg(sb, KERN_ERR,
1793 "EXT3-fs: #blocks per group too big: %lu\n", 1830 "#blocks per group too big: %lu",
1794 sbi->s_blocks_per_group); 1831 sbi->s_blocks_per_group);
1795 goto failed_mount; 1832 goto failed_mount;
1796 } 1833 }
1797 if (sbi->s_frags_per_group > blocksize * 8) { 1834 if (sbi->s_frags_per_group > blocksize * 8) {
1798 printk (KERN_ERR 1835 ext3_msg(sb, KERN_ERR,
1799 "EXT3-fs: #fragments per group too big: %lu\n", 1836 "error: #fragments per group too big: %lu",
1800 sbi->s_frags_per_group); 1837 sbi->s_frags_per_group);
1801 goto failed_mount; 1838 goto failed_mount;
1802 } 1839 }
1803 if (sbi->s_inodes_per_group > blocksize * 8) { 1840 if (sbi->s_inodes_per_group > blocksize * 8) {
1804 printk (KERN_ERR 1841 ext3_msg(sb, KERN_ERR,
1805 "EXT3-fs: #inodes per group too big: %lu\n", 1842 "error: #inodes per group too big: %lu",
1806 sbi->s_inodes_per_group); 1843 sbi->s_inodes_per_group);
1807 goto failed_mount; 1844 goto failed_mount;
1808 } 1845 }
1809 1846
1810 if (le32_to_cpu(es->s_blocks_count) > 1847 if (le32_to_cpu(es->s_blocks_count) >
1811 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1848 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1812 printk(KERN_ERR "EXT3-fs: filesystem on %s:" 1849 ext3_msg(sb, KERN_ERR,
1813 " too large to mount safely\n", sb->s_id); 1850 "error: filesystem is too large to mount safely");
1814 if (sizeof(sector_t) < 8) 1851 if (sizeof(sector_t) < 8)
1815 printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not " 1852 ext3_msg(sb, KERN_ERR,
1816 "enabled\n"); 1853 "error: CONFIG_LBDAF not enabled");
1817 goto failed_mount; 1854 goto failed_mount;
1818 } 1855 }
1819 1856
@@ -1827,7 +1864,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1827 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), 1864 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1828 GFP_KERNEL); 1865 GFP_KERNEL);
1829 if (sbi->s_group_desc == NULL) { 1866 if (sbi->s_group_desc == NULL) {
1830 printk (KERN_ERR "EXT3-fs: not enough memory\n"); 1867 ext3_msg(sb, KERN_ERR,
1868 "error: not enough memory");
1831 goto failed_mount; 1869 goto failed_mount;
1832 } 1870 }
1833 1871
@@ -1837,14 +1875,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1837 block = descriptor_loc(sb, logic_sb_block, i); 1875 block = descriptor_loc(sb, logic_sb_block, i);
1838 sbi->s_group_desc[i] = sb_bread(sb, block); 1876 sbi->s_group_desc[i] = sb_bread(sb, block);
1839 if (!sbi->s_group_desc[i]) { 1877 if (!sbi->s_group_desc[i]) {
1840 printk (KERN_ERR "EXT3-fs: " 1878 ext3_msg(sb, KERN_ERR,
1841 "can't read group descriptor %d\n", i); 1879 "error: can't read group descriptor %d", i);
1842 db_count = i; 1880 db_count = i;
1843 goto failed_mount2; 1881 goto failed_mount2;
1844 } 1882 }
1845 } 1883 }
1846 if (!ext3_check_descriptors (sb)) { 1884 if (!ext3_check_descriptors (sb)) {
1847 printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n"); 1885 ext3_msg(sb, KERN_ERR,
1886 "error: group descriptors corrupted");
1848 goto failed_mount2; 1887 goto failed_mount2;
1849 } 1888 }
1850 sbi->s_gdb_count = db_count; 1889 sbi->s_gdb_count = db_count;
@@ -1862,7 +1901,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1862 ext3_count_dirs(sb)); 1901 ext3_count_dirs(sb));
1863 } 1902 }
1864 if (err) { 1903 if (err) {
1865 printk(KERN_ERR "EXT3-fs: insufficient memory\n"); 1904 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1866 goto failed_mount3; 1905 goto failed_mount3;
1867 } 1906 }
1868 1907
@@ -1890,6 +1929,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1890 sb->dq_op = &ext3_quota_operations; 1929 sb->dq_op = &ext3_quota_operations;
1891#endif 1930#endif
1892 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 1931 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1932 mutex_init(&sbi->s_orphan_lock);
1933 mutex_init(&sbi->s_resize_lock);
1893 1934
1894 sb->s_root = NULL; 1935 sb->s_root = NULL;
1895 1936
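
The two mutexes initialized here give the orphan list and online resize their own locks, which appears to be what lets later hunks in this patch drop the coarse lock_super()/unlock_super() pairs from ext3_mark_recovery_complete() and the remount path. A userspace sketch of the per-structure locking shape (the struct is an illustration, not ext3_sb_info):

#include <pthread.h>
#include <stdio.h>

struct sb_info {
	pthread_mutex_t orphan_lock;	/* protects the orphan list */
	pthread_mutex_t resize_lock;	/* serializes online resize */
	int orphan_count;
};

static void orphan_add(struct sb_info *sbi)
{
	pthread_mutex_lock(&sbi->orphan_lock);
	sbi->orphan_count++;		/* list manipulation goes here */
	pthread_mutex_unlock(&sbi->orphan_lock);
}

int main(void)
{
	struct sb_info sbi = {
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
		0,
	};

	orphan_add(&sbi);
	printf("orphans: %d\n", sbi.orphan_count);
	return 0;
}
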
@@ -1910,9 +1951,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1910 goto failed_mount3; 1951 goto failed_mount3;
1911 } else { 1952 } else {
1912 if (!silent) 1953 if (!silent)
1913 printk (KERN_ERR 1954 ext3_msg(sb, KERN_ERR,
1914 "ext3: No journal on filesystem on %s\n", 1955 "error: no journal found. "
1915 sb->s_id); 1956 "mounting ext3 over ext2?");
1916 goto failed_mount3; 1957 goto failed_mount3;
1917 } 1958 }
1918 1959
@@ -1934,8 +1975,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1934 case EXT3_MOUNT_WRITEBACK_DATA: 1975 case EXT3_MOUNT_WRITEBACK_DATA:
1935 if (!journal_check_available_features 1976 if (!journal_check_available_features
1936 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { 1977 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
1937 printk(KERN_ERR "EXT3-fs: Journal does not support " 1978 ext3_msg(sb, KERN_ERR,
1938 "requested data journaling mode\n"); 1979 "error: journal does not support "
1980 "requested data journaling mode");
1939 goto failed_mount4; 1981 goto failed_mount4;
1940 } 1982 }
1941 default: 1983 default:
@@ -1944,8 +1986,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1944 1986
1945 if (test_opt(sb, NOBH)) { 1987 if (test_opt(sb, NOBH)) {
1946 if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) { 1988 if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
1947 printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - " 1989 ext3_msg(sb, KERN_WARNING,
1948 "its supported only with writeback mode\n"); 1990 "warning: ignoring nobh option - "
1991 "it is supported only with writeback mode");
1949 clear_opt(sbi->s_mount_opt, NOBH); 1992 clear_opt(sbi->s_mount_opt, NOBH);
1950 } 1993 }
1951 } 1994 }
@@ -1956,39 +1999,32 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1956 1999
1957 root = ext3_iget(sb, EXT3_ROOT_INO); 2000 root = ext3_iget(sb, EXT3_ROOT_INO);
1958 if (IS_ERR(root)) { 2001 if (IS_ERR(root)) {
1959 printk(KERN_ERR "EXT3-fs: get root inode failed\n"); 2002 ext3_msg(sb, KERN_ERR, "error: get root inode failed");
1960 ret = PTR_ERR(root); 2003 ret = PTR_ERR(root);
1961 goto failed_mount4; 2004 goto failed_mount4;
1962 } 2005 }
1963 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 2006 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1964 iput(root); 2007 iput(root);
1965 printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n"); 2008 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
1966 goto failed_mount4; 2009 goto failed_mount4;
1967 } 2010 }
1968 sb->s_root = d_alloc_root(root); 2011 sb->s_root = d_alloc_root(root);
1969 if (!sb->s_root) { 2012 if (!sb->s_root) {
1970 printk(KERN_ERR "EXT3-fs: get root dentry failed\n"); 2013 ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
1971 iput(root); 2014 iput(root);
1972 ret = -ENOMEM; 2015 ret = -ENOMEM;
1973 goto failed_mount4; 2016 goto failed_mount4;
1974 } 2017 }
1975 2018
1976 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); 2019 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1977 /* 2020
1978 * akpm: core read_super() calls in here with the superblock locked.
1979 * That deadlocks, because orphan cleanup needs to lock the superblock
1980 * in numerous places. Here we just pop the lock - it's relatively
1981 * harmless, because we are now ready to accept write_super() requests,
1982 * and aviro says that's the only reason for hanging onto the
1983 * superblock lock.
1984 */
1985 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; 2021 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
1986 ext3_orphan_cleanup(sb, es); 2022 ext3_orphan_cleanup(sb, es);
1987 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; 2023 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
1988 if (needs_recovery) 2024 if (needs_recovery)
1989 printk (KERN_INFO "EXT3-fs: recovery complete.\n"); 2025 ext3_msg(sb, KERN_INFO, "recovery complete");
1990 ext3_mark_recovery_complete(sb, es); 2026 ext3_mark_recovery_complete(sb, es);
1991 printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n", 2027 ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
1992 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": 2028 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
1993 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": 2029 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
1994 "writeback"); 2030 "writeback");
@@ -1998,7 +2034,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1998 2034
1999cantfind_ext3: 2035cantfind_ext3:
2000 if (!silent) 2036 if (!silent)
2001 printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n", 2037 ext3_msg(sb, KERN_INFO,
2038 "error: can't find ext3 filesystem on dev %s.",
2002 sb->s_id); 2039 sb->s_id);
2003 goto failed_mount; 2040 goto failed_mount;
2004 2041
@@ -2066,27 +2103,27 @@ static journal_t *ext3_get_journal(struct super_block *sb,
2066 2103
2067 journal_inode = ext3_iget(sb, journal_inum); 2104 journal_inode = ext3_iget(sb, journal_inum);
2068 if (IS_ERR(journal_inode)) { 2105 if (IS_ERR(journal_inode)) {
2069 printk(KERN_ERR "EXT3-fs: no journal found.\n"); 2106 ext3_msg(sb, KERN_ERR, "error: no journal found");
2070 return NULL; 2107 return NULL;
2071 } 2108 }
2072 if (!journal_inode->i_nlink) { 2109 if (!journal_inode->i_nlink) {
2073 make_bad_inode(journal_inode); 2110 make_bad_inode(journal_inode);
2074 iput(journal_inode); 2111 iput(journal_inode);
2075 printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n"); 2112 ext3_msg(sb, KERN_ERR, "error: journal inode is deleted");
2076 return NULL; 2113 return NULL;
2077 } 2114 }
2078 2115
2079 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", 2116 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
2080 journal_inode, journal_inode->i_size); 2117 journal_inode, journal_inode->i_size);
2081 if (!S_ISREG(journal_inode->i_mode)) { 2118 if (!S_ISREG(journal_inode->i_mode)) {
2082 printk(KERN_ERR "EXT3-fs: invalid journal inode.\n"); 2119 ext3_msg(sb, KERN_ERR, "error: invalid journal inode");
2083 iput(journal_inode); 2120 iput(journal_inode);
2084 return NULL; 2121 return NULL;
2085 } 2122 }
2086 2123
2087 journal = journal_init_inode(journal_inode); 2124 journal = journal_init_inode(journal_inode);
2088 if (!journal) { 2125 if (!journal) {
2089 printk(KERN_ERR "EXT3-fs: Could not load journal inode\n"); 2126 ext3_msg(sb, KERN_ERR, "error: could not load journal inode");
2090 iput(journal_inode); 2127 iput(journal_inode);
2091 return NULL; 2128 return NULL;
2092 } 2129 }
@@ -2108,13 +2145,13 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2108 struct ext3_super_block * es; 2145 struct ext3_super_block * es;
2109 struct block_device *bdev; 2146 struct block_device *bdev;
2110 2147
2111 bdev = ext3_blkdev_get(j_dev); 2148 bdev = ext3_blkdev_get(j_dev, sb);
2112 if (bdev == NULL) 2149 if (bdev == NULL)
2113 return NULL; 2150 return NULL;
2114 2151
2115 if (bd_claim(bdev, sb)) { 2152 if (bd_claim(bdev, sb)) {
2116 printk(KERN_ERR 2153 ext3_msg(sb, KERN_ERR,
2117 "EXT3: failed to claim external journal device.\n"); 2154 "error: failed to claim external journal device");
2118 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 2155 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2119 return NULL; 2156 return NULL;
2120 } 2157 }
@@ -2122,8 +2159,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2122 blocksize = sb->s_blocksize; 2159 blocksize = sb->s_blocksize;
2123 hblock = bdev_logical_block_size(bdev); 2160 hblock = bdev_logical_block_size(bdev);
2124 if (blocksize < hblock) { 2161 if (blocksize < hblock) {
2125 printk(KERN_ERR 2162 ext3_msg(sb, KERN_ERR,
2126 "EXT3-fs: blocksize too small for journal device.\n"); 2163 "error: blocksize too small for journal device");
2127 goto out_bdev; 2164 goto out_bdev;
2128 } 2165 }
2129 2166
@@ -2131,8 +2168,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2131 offset = EXT3_MIN_BLOCK_SIZE % blocksize; 2168 offset = EXT3_MIN_BLOCK_SIZE % blocksize;
2132 set_blocksize(bdev, blocksize); 2169 set_blocksize(bdev, blocksize);
2133 if (!(bh = __bread(bdev, sb_block, blocksize))) { 2170 if (!(bh = __bread(bdev, sb_block, blocksize))) {
2134 printk(KERN_ERR "EXT3-fs: couldn't read superblock of " 2171 ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of "
2135 "external journal\n"); 2172 "external journal");
2136 goto out_bdev; 2173 goto out_bdev;
2137 } 2174 }
2138 2175
@@ -2140,14 +2177,14 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2140 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || 2177 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
2141 !(le32_to_cpu(es->s_feature_incompat) & 2178 !(le32_to_cpu(es->s_feature_incompat) &
2142 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { 2179 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2143 printk(KERN_ERR "EXT3-fs: external journal has " 2180 ext3_msg(sb, KERN_ERR, "error: external journal has "
2144 "bad superblock\n"); 2181 "bad superblock");
2145 brelse(bh); 2182 brelse(bh);
2146 goto out_bdev; 2183 goto out_bdev;
2147 } 2184 }
2148 2185
2149 if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { 2186 if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2150 printk(KERN_ERR "EXT3-fs: journal UUID does not match\n"); 2187 ext3_msg(sb, KERN_ERR, "error: journal UUID does not match");
2151 brelse(bh); 2188 brelse(bh);
2152 goto out_bdev; 2189 goto out_bdev;
2153 } 2190 }
@@ -2159,19 +2196,21 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2159 journal = journal_init_dev(bdev, sb->s_bdev, 2196 journal = journal_init_dev(bdev, sb->s_bdev,
2160 start, len, blocksize); 2197 start, len, blocksize);
2161 if (!journal) { 2198 if (!journal) {
2162 printk(KERN_ERR "EXT3-fs: failed to create device journal\n"); 2199 ext3_msg(sb, KERN_ERR,
2200 "error: failed to create device journal");
2163 goto out_bdev; 2201 goto out_bdev;
2164 } 2202 }
2165 journal->j_private = sb; 2203 journal->j_private = sb;
2166 ll_rw_block(READ, 1, &journal->j_sb_buffer); 2204 ll_rw_block(READ, 1, &journal->j_sb_buffer);
2167 wait_on_buffer(journal->j_sb_buffer); 2205 wait_on_buffer(journal->j_sb_buffer);
2168 if (!buffer_uptodate(journal->j_sb_buffer)) { 2206 if (!buffer_uptodate(journal->j_sb_buffer)) {
2169 printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); 2207 ext3_msg(sb, KERN_ERR, "I/O error on journal device");
2170 goto out_journal; 2208 goto out_journal;
2171 } 2209 }
2172 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { 2210 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
2173 printk(KERN_ERR "EXT3-fs: External journal has more than one " 2211 ext3_msg(sb, KERN_ERR,
2174 "user (unsupported) - %d\n", 2212 "error: external journal has more than one "
2213 "user (unsupported) - %d",
2175 be32_to_cpu(journal->j_superblock->s_nr_users)); 2214 be32_to_cpu(journal->j_superblock->s_nr_users));
2176 goto out_journal; 2215 goto out_journal;
2177 } 2216 }
@@ -2197,8 +2236,8 @@ static int ext3_load_journal(struct super_block *sb,
2197 2236
2198 if (journal_devnum && 2237 if (journal_devnum &&
2199 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2238 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2200 printk(KERN_INFO "EXT3-fs: external journal device major/minor " 2239 ext3_msg(sb, KERN_INFO, "external journal device major/minor "
2201 "numbers have changed\n"); 2240 "numbers have changed");
2202 journal_dev = new_decode_dev(journal_devnum); 2241 journal_dev = new_decode_dev(journal_devnum);
2203 } else 2242 } else
2204 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); 2243 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -2213,21 +2252,21 @@ static int ext3_load_journal(struct super_block *sb,
2213 2252
2214 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { 2253 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
2215 if (sb->s_flags & MS_RDONLY) { 2254 if (sb->s_flags & MS_RDONLY) {
2216 printk(KERN_INFO "EXT3-fs: INFO: recovery " 2255 ext3_msg(sb, KERN_INFO,
2217 "required on readonly filesystem.\n"); 2256 "recovery required on readonly filesystem");
2218 if (really_read_only) { 2257 if (really_read_only) {
2219 printk(KERN_ERR "EXT3-fs: write access " 2258 ext3_msg(sb, KERN_ERR, "error: write access "
2220 "unavailable, cannot proceed.\n"); 2259 "unavailable, cannot proceed");
2221 return -EROFS; 2260 return -EROFS;
2222 } 2261 }
2223 printk (KERN_INFO "EXT3-fs: write access will " 2262 ext3_msg(sb, KERN_INFO,
2224 "be enabled during recovery.\n"); 2263 "write access will be enabled during recovery");
2225 } 2264 }
2226 } 2265 }
2227 2266
2228 if (journal_inum && journal_dev) { 2267 if (journal_inum && journal_dev) {
2229 printk(KERN_ERR "EXT3-fs: filesystem has both journal " 2268 ext3_msg(sb, KERN_ERR, "error: filesystem has both journal "
2230 "and inode journals!\n"); 2269 "and inode journals");
2231 return -EINVAL; 2270 return -EINVAL;
2232 } 2271 }
2233 2272
@@ -2242,7 +2281,7 @@ static int ext3_load_journal(struct super_block *sb,
2242 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2281 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2243 err = journal_update_format(journal); 2282 err = journal_update_format(journal);
2244 if (err) { 2283 if (err) {
2245 printk(KERN_ERR "EXT3-fs: error updating journal.\n"); 2284 ext3_msg(sb, KERN_ERR, "error updating journal");
2246 journal_destroy(journal); 2285 journal_destroy(journal);
2247 return err; 2286 return err;
2248 } 2287 }
@@ -2254,7 +2293,7 @@ static int ext3_load_journal(struct super_block *sb,
2254 err = journal_load(journal); 2293 err = journal_load(journal);
2255 2294
2256 if (err) { 2295 if (err) {
2257 printk(KERN_ERR "EXT3-fs: error loading journal.\n"); 2296 ext3_msg(sb, KERN_ERR, "error loading journal");
2258 journal_destroy(journal); 2297 journal_destroy(journal);
2259 return err; 2298 return err;
2260 } 2299 }
@@ -2273,16 +2312,17 @@ static int ext3_load_journal(struct super_block *sb,
2273 return 0; 2312 return 0;
2274} 2313}
2275 2314
2276static int ext3_create_journal(struct super_block * sb, 2315static int ext3_create_journal(struct super_block *sb,
2277 struct ext3_super_block * es, 2316 struct ext3_super_block *es,
 				   unsigned int journal_inum)
 {
 	journal_t *journal;
 	int err;
 
 	if (sb->s_flags & MS_RDONLY) {
-		printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
-		       "create journal.\n");
+		ext3_msg(sb, KERN_ERR,
+			"error: readonly filesystem when trying to "
+			"create journal");
 		return -EROFS;
 	}
 
@@ -2290,12 +2330,12 @@ static int ext3_create_journal(struct super_block * sb,
 	if (!journal)
 		return -EINVAL;
 
-	printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n",
-	       journal_inum);
+	ext3_msg(sb, KERN_INFO, "creating new journal on inode %u",
+		journal_inum);
 
 	err = journal_create(journal);
 	if (err) {
-		printk(KERN_ERR "EXT3-fs: error creating journal.\n");
+		ext3_msg(sb, KERN_ERR, "error creating journal");
 		journal_destroy(journal);
 		return -EIO;
 	}
@@ -2359,13 +2399,11 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
 	if (journal_flush(journal) < 0)
 		goto out;
 
-	lock_super(sb);
 	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
 		ext3_commit_super(sb, es, 1);
 	}
-	unlock_super(sb);
 
 out:
 	journal_unlock_updates(journal);
@@ -2376,8 +2414,8 @@ out:
  * has recorded an error from a previous lifetime, move that error to the
  * main filesystem now.
  */
-static void ext3_clear_journal_err(struct super_block * sb,
-				   struct ext3_super_block * es)
+static void ext3_clear_journal_err(struct super_block *sb,
+				   struct ext3_super_block *es)
 {
 	journal_t *journal;
 	int j_errno;
@@ -2524,11 +2562,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 		goto restore_opts;
 	}
 
-	if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+	if (test_opt(sb, ABORT))
 		ext3_abort(sb, __func__, "Abort forced by user");
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
 	es = sbi->s_es;
 
@@ -2536,7 +2574,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
 	    n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
-		if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
+		if (test_opt(sb, ABORT)) {
 			err = -EROFS;
 			goto restore_opts;
 		}
@@ -2557,21 +2595,15 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			    (sbi->s_mount_state & EXT3_VALID_FS))
 				es->s_state = cpu_to_le16(sbi->s_mount_state);
 
-			/*
-			 * We have to unlock super so that we can wait for
-			 * transactions.
-			 */
-			unlock_super(sb);
 			ext3_mark_recovery_complete(sb, es);
-			lock_super(sb);
 		} else {
 			__le32 ret;
 			if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
					~EXT3_FEATURE_RO_COMPAT_SUPP))) {
-				printk(KERN_WARNING "EXT3-fs: %s: couldn't "
-				       "remount RDWR because of unsupported "
-				       "optional features (%x).\n",
-				       sb->s_id, le32_to_cpu(ret));
+				ext3_msg(sb, KERN_WARNING,
+					"warning: couldn't remount RDWR "
+					"because of unsupported optional "
+					"features (%x)", le32_to_cpu(ret));
 				err = -EROFS;
 				goto restore_opts;
 			}
@@ -2582,11 +2614,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			 * require a full umount/remount for now.
 			 */
 			if (es->s_last_orphan) {
-				printk(KERN_WARNING "EXT3-fs: %s: couldn't "
+				ext3_msg(sb, KERN_WARNING, "warning: couldn't "
 				       "remount RDWR because of unprocessed "
 				       "orphan inode list. Please "
-				       "umount/remount instead.\n",
-				       sb->s_id);
+				       "umount/remount instead.");
 				err = -EINVAL;
 				goto restore_opts;
 			}
@@ -2686,13 +2717,11 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
 	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
-	es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
 	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
-	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
 	buf->f_namelen = EXT3_NAME_LEN;
 	fsid = le64_to_cpup((void *)es->s_uuid) ^
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -2706,7 +2735,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
  * Process 1                         Process 2
  * ext3_create()                     quota_sync()
  *   journal_start()                   write_dquot()
- *   vfs_dq_init()                       down(dqio_mutex)
+ *   dquot_initialize()                  down(dqio_mutex)
  *     down(dqio_mutex)                    journal_start()
  *
  */
@@ -2837,9 +2866,9 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 	if (EXT3_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
 		if (path.dentry->d_parent != sb->s_root)
-			printk(KERN_WARNING
-				"EXT3-fs: Quota file not on filesystem root. "
-				"Journaled quota will not work.\n");
+			ext3_msg(sb, KERN_WARNING,
+				"warning: Quota file not on filesystem root. "
+				"Journaled quota will not work.");
 	}
 
 	/*
@@ -2914,65 +2943,65 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
 	sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
 	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
-	int tocopy;
 	int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
-	size_t towrite = len;
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
 	if (!handle) {
-		printk(KERN_WARNING "EXT3-fs: Quota write (off=%Lu, len=%Lu)"
-			" cancelled because transaction is not started.\n",
+		ext3_msg(sb, KERN_WARNING,
+			"warning: quota write (off=%llu, len=%llu)"
+			" cancelled because transaction is not started.",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+
+	/*
+	 * Since we account only one data block in transaction credits,
+	 * then it is impossible to cross a block boundary.
+	 */
+	if (sb->s_blocksize - offset < len) {
+		ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because not block aligned",
 			(unsigned long long)off, (unsigned long long)len);
 		return -EIO;
 	}
 	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
-	while (towrite > 0) {
-		tocopy = sb->s_blocksize - offset < towrite ?
-				sb->s_blocksize - offset : towrite;
-		bh = ext3_bread(handle, inode, blk, 1, &err);
-		if (!bh)
+	bh = ext3_bread(handle, inode, blk, 1, &err);
+	if (!bh)
+		goto out;
+	if (journal_quota) {
+		err = ext3_journal_get_write_access(handle, bh);
+		if (err) {
+			brelse(bh);
 			goto out;
-		if (journal_quota) {
-			err = ext3_journal_get_write_access(handle, bh);
-			if (err) {
-				brelse(bh);
-				goto out;
-			}
-		}
-		lock_buffer(bh);
-		memcpy(bh->b_data+offset, data, tocopy);
-		flush_dcache_page(bh->b_page);
-		unlock_buffer(bh);
-		if (journal_quota)
-			err = ext3_journal_dirty_metadata(handle, bh);
-		else {
-			/* Always do at least ordered writes for quotas */
-			err = ext3_journal_dirty_data(handle, bh);
-			mark_buffer_dirty(bh);
 		}
-		brelse(bh);
-		if (err)
-			goto out;
-		offset = 0;
-		towrite -= tocopy;
-		data += tocopy;
-		blk++;
 	}
+	lock_buffer(bh);
+	memcpy(bh->b_data+offset, data, len);
+	flush_dcache_page(bh->b_page);
+	unlock_buffer(bh);
+	if (journal_quota)
+		err = ext3_journal_dirty_metadata(handle, bh);
+	else {
+		/* Always do at least ordered writes for quotas */
+		err = ext3_journal_dirty_data(handle, bh);
+		mark_buffer_dirty(bh);
+	}
+	brelse(bh);
 out:
-	if (len == towrite) {
+	if (err) {
 		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
-	if (inode->i_size < off+len-towrite) {
-		i_size_write(inode, off+len-towrite);
+	if (inode->i_size < off + len) {
+		i_size_write(inode, off + len);
 		EXT3_I(inode)->i_disksize = inode->i_size;
 	}
 	inode->i_version++;
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	ext3_mark_inode_dirty(handle, inode);
 	mutex_unlock(&inode->i_mutex);
-	return len - towrite;
+	return len;
 }
 
 #endif
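The printk() to ext3_msg() conversions above all route messages through one helper so every line carries the "EXT3-fs (device):" prefix and a trailing newline, which is why the converted call sites drop both the explicit sb->s_id argument and the "\n". The helper's definition is outside these hunks; a minimal sketch of the usual shape of such a wrapper, assuming the conventional va_list/vprintk form:

	/* Sketch only; the real definition lives elsewhere in fs/ext3/super.c. */
	void ext3_msg(struct super_block *sb, const char *prefix,
		      const char *fmt, ...)
	{
		va_list args;

		va_start(args, fmt);
		printk("%sEXT3-fs (%s): ", prefix, sb->s_id);
		vprintk(fmt, args);
		printk("\n");
		va_end(args);
	}

Separately, the rewritten ext3_quota_write() can get away with a single ext3_bread()/memcpy() because the new alignment check rejects any write that would cross a block boundary, which is all the one data block accounted in the transaction credits can cover.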
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 545e37c4b91e..534a94c3a933 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -99,7 +99,7 @@ static struct buffer_head *ext3_xattr_cache_find(struct inode *,
 						 struct mb_cache_entry **);
 static void ext3_xattr_rehash(struct ext3_xattr_header *,
 			      struct ext3_xattr_entry *);
-static int ext3_xattr_list(struct inode *inode, char *buffer,
+static int ext3_xattr_list(struct dentry *dentry, char *buffer,
 			   size_t buffer_size);
 
 static struct mb_cache *ext3_xattr_cache;
@@ -147,7 +147,7 @@ ext3_xattr_handler(int name_index)
 ssize_t
 ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
-	return ext3_xattr_list(dentry->d_inode, buffer, size);
+	return ext3_xattr_list(dentry, buffer, size);
 }
 
 static int
@@ -274,7 +274,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 	void *end;
 	int error;
 
-	if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+	if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
 		return -ENODATA;
 	error = ext3_get_inode_loc(inode, &iloc);
 	if (error)
@@ -332,7 +332,7 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name,
 }
 
 static int
-ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
+ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
 			char *buffer, size_t buffer_size)
 {
 	size_t rest = buffer_size;
@@ -342,9 +342,10 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
 			ext3_xattr_handler(entry->e_name_index);
 
 		if (handler) {
-			size_t size = handler->list(inode, buffer, rest,
+			size_t size = handler->list(dentry, buffer, rest,
 						    entry->e_name,
-						    entry->e_name_len);
+						    entry->e_name_len,
+						    handler->flags);
 			if (buffer) {
 				if (size > rest)
 					return -ERANGE;
@@ -357,8 +358,9 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
 }
 
 static int
-ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
+	struct inode *inode = dentry->d_inode;
 	struct buffer_head *bh = NULL;
 	int error;
 
@@ -383,7 +385,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 		goto cleanup;
 	}
 	ext3_xattr_cache_insert(bh);
-	error = ext3_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
+	error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
 
 cleanup:
 	brelse(bh);
@@ -392,15 +394,16 @@ cleanup:
 }
 
 static int
-ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
+	struct inode *inode = dentry->d_inode;
 	struct ext3_xattr_ibody_header *header;
 	struct ext3_inode *raw_inode;
 	struct ext3_iloc iloc;
 	void *end;
 	int error;
 
-	if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+	if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
 		return 0;
 	error = ext3_get_inode_loc(inode, &iloc);
 	if (error)
@@ -411,7 +414,7 @@ ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
 	error = ext3_xattr_check_names(IFIRST(header), end);
 	if (error)
 		goto cleanup;
-	error = ext3_xattr_list_entries(inode, IFIRST(header),
+	error = ext3_xattr_list_entries(dentry, IFIRST(header),
 					buffer, buffer_size);
 
 cleanup:
@@ -430,12 +433,12 @@ cleanup:
 * used / required on success.
 */
 static int
-ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
 	int i_error, b_error;
 
-	down_read(&EXT3_I(inode)->xattr_sem);
-	i_error = ext3_xattr_ibody_list(inode, buffer, buffer_size);
+	down_read(&EXT3_I(dentry->d_inode)->xattr_sem);
+	i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size);
 	if (i_error < 0) {
 		b_error = 0;
 	} else {
@@ -443,11 +446,11 @@ ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
 			buffer += i_error;
 			buffer_size -= i_error;
 		}
-		b_error = ext3_xattr_block_list(inode, buffer, buffer_size);
+		b_error = ext3_xattr_block_list(dentry, buffer, buffer_size);
 		if (b_error < 0)
 			i_error = 0;
 	}
-	up_read(&EXT3_I(inode)->xattr_sem);
+	up_read(&EXT3_I(dentry->d_inode)->xattr_sem);
 	return i_error + b_error;
 }
 
@@ -497,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
 		error = ext3_journal_dirty_metadata(handle, bh);
 		if (IS_SYNC(inode))
 			handle->h_sync = 1;
-		vfs_dq_free_block(inode, 1);
+		dquot_free_block(inode, 1);
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
 		if (ce)
@@ -772,8 +775,8 @@ inserted:
 			else {
 				/* The old block is released after updating
 				   the inode. */
-				error = -EDQUOT;
-				if (vfs_dq_alloc_block(inode, 1))
+				error = dquot_alloc_block(inode, 1);
+				if (error)
 					goto cleanup;
 				error = ext3_journal_get_write_access(handle,
 								      new_bh);
@@ -847,7 +850,7 @@ cleanup:
 	return error;
 
 cleanup_dquot:
-	vfs_dq_free_block(inode, 1);
+	dquot_free_block(inode, 1);
 	goto cleanup;
 
 bad_block:
@@ -879,7 +882,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
 	is->s.base = is->s.first = IFIRST(header);
 	is->s.here = is->s.first;
 	is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
-	if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) {
+	if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
 		error = ext3_xattr_check_names(IFIRST(header), is->s.end);
 		if (error)
 			return error;
@@ -911,10 +914,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
 	header = IHDR(inode, ext3_raw_inode(&is->iloc));
 	if (!IS_LAST_ENTRY(s->first)) {
 		header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
-		EXT3_I(inode)->i_state |= EXT3_STATE_XATTR;
+		ext3_set_inode_state(inode, EXT3_STATE_XATTR);
 	} else {
 		header->h_magic = cpu_to_le32(0);
-		EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR;
+		ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
 	}
 	return 0;
 }
@@ -960,10 +963,14 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (error)
 		goto cleanup;
 
-	if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
+	error = ext3_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto cleanup;
+
+	if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
 		struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
 		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
-		EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
+		ext3_clear_inode_state(inode, EXT3_STATE_NEW);
 	}
 
 	error = ext3_xattr_ibody_find(inode, &i, &is);
@@ -985,9 +992,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (flags & XATTR_CREATE)
 			goto cleanup;
 	}
-	error = ext3_journal_get_write_access(handle, is.iloc.bh);
-	if (error)
-		goto cleanup;
 	if (!value) {
 		if (!is.s.not_found)
 			error = ext3_xattr_ibody_set(handle, inode, &i, &is);
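These xattr hunks track a tree-wide VFS change: every xattr_handler callback now receives the dentry, plus the handler's flags value as a trailing type argument, instead of a bare inode. The shape of the callback table the handlers below implement, sketched from the signatures visible in this patch (see include/linux/xattr.h for the authoritative definition):

	/* Sketched from the signatures in this patch, not copied from it. */
	struct xattr_handler {
		const char *prefix;
		int flags;	/* private to the handler, e.g. an ACL type */
		size_t (*list)(struct dentry *dentry, char *list,
			       size_t list_size, const char *name,
			       size_t name_len, int handler_flags);
		int (*get)(struct dentry *dentry, const char *name,
			   void *buffer, size_t size, int handler_flags);
		int (*set)(struct dentry *dentry, const char *name,
			   const void *value, size_t size, int flags,
			   int handler_flags);
	};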
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 37b81097bdf2..3af91f476dff 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -4,6 +4,7 @@
 */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/ext3_jbd.h>
@@ -12,8 +13,8 @@
 #include "xattr.h"
 
 static size_t
-ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
-			 const char *name, size_t name_len)
+ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
			 const char *name, size_t name_len, int type)
 {
 	const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -28,23 +29,23 @@ ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
 }
 
 static int
-ext3_xattr_security_get(struct inode *inode, const char *name,
-		void *buffer, size_t size)
+ext3_xattr_security_get(struct dentry *dentry, const char *name,
+		void *buffer, size_t size, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext3_xattr_get(inode, EXT3_XATTR_INDEX_SECURITY, name,
-			      buffer, size);
+	return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY,
+			      name, buffer, size);
 }
 
 static int
-ext3_xattr_security_set(struct inode *inode, const char *name,
-		const void *value, size_t size, int flags)
+ext3_xattr_security_set(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SECURITY, name,
-			      value, size, flags);
+	return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY,
+			      name, value, size, flags);
 }
 
 int
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index c7c41a410c4b..e5562845ed96 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -14,8 +14,8 @@
 #include "xattr.h"
 
 static size_t
-ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
-			const char *name, size_t name_len)
+ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
+			const char *name, size_t name_len, int type)
 {
 	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -32,22 +32,22 @@ ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
 }
 
 static int
-ext3_xattr_trusted_get(struct inode *inode, const char *name,
-		       void *buffer, size_t size)
+ext3_xattr_trusted_get(struct dentry *dentry, const char *name,
+		       void *buffer, size_t size, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name,
-			      buffer, size);
+	return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED,
+			      name, buffer, size);
 }
 
 static int
-ext3_xattr_trusted_set(struct inode *inode, const char *name,
-		       const void *value, size_t size, int flags)
+ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
+		       const void *value, size_t size, int flags, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name,
+	return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name,
 			      value, size, flags);
 }
 
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 430fe63b31b3..3bcfe9ee0a68 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -13,13 +13,13 @@
 #include "xattr.h"
 
 static size_t
-ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
-		     const char *name, size_t name_len)
+ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
+		     const char *name, size_t name_len, int type)
 {
 	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
-	if (!test_opt(inode->i_sb, XATTR_USER))
+	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return 0;
 
 	if (list && total_len <= list_size) {
@@ -31,26 +31,27 @@ ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
 }
 
 static int
-ext3_xattr_user_get(struct inode *inode, const char *name,
-		    void *buffer, size_t size)
+ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer,
+		    size_t size, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	if (!test_opt(inode->i_sb, XATTR_USER))
+	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-	return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size);
+	return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER,
+			      name, buffer, size);
 }
 
 static int
-ext3_xattr_user_set(struct inode *inode, const char *name,
-		    const void *value, size_t size, int flags)
+ext3_xattr_user_set(struct dentry *dentry, const char *name,
+		    const void *value, size_t size, int flags, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	if (!test_opt(inode->i_sb, XATTR_USER))
+	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-	return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name,
-			      value, size, flags);
+	return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER,
+			      name, value, size, flags);
 }
 
 struct xattr_handler ext3_xattr_user_handler = {
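For reference, a minimal namespace handler under the converted API might look like the following (a hypothetical "demo." namespace, sketch only; the prefix/total_len conventions are copied from the handlers above):

	#define DEMO_PREFIX "demo."
	#define DEMO_PREFIX_LEN (sizeof(DEMO_PREFIX) - 1)

	/* Hypothetical handler: advertises "demo.<name>" in listxattr(). */
	static size_t demo_xattr_list(struct dentry *dentry, char *list,
				      size_t list_size, const char *name,
				      size_t name_len, int type)
	{
		const size_t total_len = DEMO_PREFIX_LEN + name_len + 1;

		if (list && total_len <= list_size) {
			memcpy(list, DEMO_PREFIX, DEMO_PREFIX_LEN);
			memcpy(list + DEMO_PREFIX_LEN, name, name_len);
			list[total_len - 1] = '\0';
		}
		return total_len;
	}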
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9f2d45d75b1a..9ed1bb1f319f 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,6 +26,17 @@ config EXT4_FS
 
 	  If unsure, say N.
 
+config EXT4_USE_FOR_EXT23
+	bool "Use ext4 for ext2/ext3 file systems"
+	depends on EXT4_FS
+	depends on EXT3_FS=n || EXT2_FS=n
+	default y
+	help
+	  Allow the ext4 file system driver code to be used for ext2 or
+	  ext3 file system mounts.  This allows users to reduce their
+	  compiled kernel size by using one file system driver for
+	  ext2, ext3, and ext4 file systems.
+
 config EXT4_FS_XATTR
 	bool "Ext4 extended attributes"
 	depends on EXT4_FS
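The new EXT4_USE_FOR_EXT23 option is only selectable when at least one of the older drivers is compiled out, as the second depends line enforces. An illustrative .config fragment (an assumed configuration, not part of this patch) that satisfies it:

	CONFIG_EXT4_FS=y
	CONFIG_EXT4_USE_FOR_EXT23=y
	# CONFIG_EXT2_FS is not set
	# CONFIG_EXT3_FS is not set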
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 0df88b2a69b0..8a2a29d35a6f 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -364,12 +364,12 @@ out:
 * Extended attribute handlers
 */
 static size_t
-ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
-			   const char *name, size_t name_len)
+ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
+			   const char *name, size_t name_len, int type)
 {
 	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
 
-	if (!test_opt(inode->i_sb, POSIX_ACL))
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
 		return 0;
 	if (list && size <= list_len)
 		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -377,12 +377,12 @@ ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
 }
 
 static size_t
-ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
-			    const char *name, size_t name_len)
+ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
+			    const char *name, size_t name_len, int type)
 {
 	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
 
-	if (!test_opt(inode->i_sb, POSIX_ACL))
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
 		return 0;
 	if (list && size <= list_len)
 		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -390,15 +390,18 @@ ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
 }
 
 static int
-ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
+ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
+		   size_t size, int type)
 {
 	struct posix_acl *acl;
 	int error;
 
-	if (!test_opt(inode->i_sb, POSIX_ACL))
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
 
-	acl = ext4_get_acl(inode, type);
+	acl = ext4_get_acl(dentry->d_inode, type);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl == NULL)
@@ -410,31 +413,16 @@ ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
 }
 
 static int
-ext4_xattr_get_acl_access(struct inode *inode, const char *name,
-			  void *buffer, size_t size)
-{
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-
-static int
-ext4_xattr_get_acl_default(struct inode *inode, const char *name,
-			   void *buffer, size_t size)
-{
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-
-static int
-ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
-		   size_t size)
+ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
+		   size_t size, int flags, int type)
 {
+	struct inode *inode = dentry->d_inode;
 	handle_t *handle;
 	struct posix_acl *acl;
 	int error, retries = 0;
 
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
 	if (!is_owner_or_cap(inode))
@@ -466,34 +454,18 @@ release_and_out:
 	return error;
 }
 
-static int
-ext4_xattr_set_acl_access(struct inode *inode, const char *name,
-			  const void *value, size_t size, int flags)
-{
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-
-static int
-ext4_xattr_set_acl_default(struct inode *inode, const char *name,
-			   const void *value, size_t size, int flags)
-{
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
-
 struct xattr_handler ext4_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.flags	= ACL_TYPE_ACCESS,
 	.list	= ext4_xattr_list_acl_access,
-	.get	= ext4_xattr_get_acl_access,
-	.set	= ext4_xattr_set_acl_access,
+	.get	= ext4_xattr_get_acl,
+	.set	= ext4_xattr_set_acl,
 };
 
 struct xattr_handler ext4_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.flags	= ACL_TYPE_DEFAULT,
 	.list	= ext4_xattr_list_acl_default,
-	.get	= ext4_xattr_get_acl_default,
-	.set	= ext4_xattr_set_acl_default,
+	.get	= ext4_xattr_get_acl,
+	.set	= ext4_xattr_set_acl,
};
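Folding the four access/default wrappers into a single ext4_xattr_get_acl()/ext4_xattr_set_acl() pair works because the generic xattr code hands handler->flags back to the callback as its trailing type argument, so the ACL type rides along with the handler. A sketch of that caller-side dispatch, with demo_* names standing in for the generic VFS code (an assumption about the caller, not part of this patch):

	/* Sketch: one callback serves both handlers because each handler's
	 * .flags (ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT) is forwarded as the
	 * trailing 'type' argument. */
	static int demo_dispatch_get(struct xattr_handler *handler,
				     struct dentry *dentry, const char *suffix,
				     void *buffer, size_t size)
	{
		return handler->get(dentry, suffix, buffer, size,
				    handler->flags);
	}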
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1d0418980f8d..d2f37a5516c7 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 	/* If checksum is bad mark all blocks used to prevent allocation
 	 * essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-		ext4_error(sb, __func__,
-			  "Checksum bad for group %u", block_group);
+		ext4_error(sb, "Checksum bad for group %u",
+			   block_group);
 		ext4_free_blks_set(sb, gdp, 0);
 		ext4_free_inodes_set(sb, gdp, 0);
 		ext4_itable_unused_set(sb, gdp, 0);
@@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * to make sure we calculate the right free blocks
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
-			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
+			ext4_group_first_block_no(sb, ngroups - 1);
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -189,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 * when a file system is mounted (see ext4_fill_super).
 */
 
-
-#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
-
 /**
 * ext4_get_group_desc() -- load group descriptor from disk
 * @sb:			super block
@@ -210,10 +206,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (block_group >= ngroups) {
-		ext4_error(sb, "ext4_get_group_desc",
-			   "block_group >= groups_count - "
-			   "block_group = %u, groups_count = %u",
-			   block_group, ngroups);
+		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+			   " groups_count = %u", block_group, ngroups);
 
 		return NULL;
 	}
@@ -221,8 +215,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 	if (!sbi->s_group_desc[group_desc]) {
-		ext4_error(sb, "ext4_get_group_desc",
-			   "Group descriptor not loaded - "
+		ext4_error(sb, "Group descriptor not loaded - "
 			   "block_group = %u, group_desc = %u, desc = %u",
 			   block_group, group_desc, offset);
 		return NULL;
@@ -282,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
 	return 1;
 
 err_out:
-	ext4_error(sb, __func__,
-			"Invalid block bitmap - "
-			"block_group = %d, block = %llu",
+	ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
 			block_group, bitmap_blk);
 	return 0;
 }
@@ -311,8 +302,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_block_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, __func__,
-			    "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot read block bitmap - "
 			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -354,8 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		set_bitmap_uptodate(bh);
 		if (bh_submit_read(bh) < 0) {
 			put_bh(bh);
-			ext4_error(sb, __func__,
-				    "Cannot read block bitmap - "
+			ext4_error(sb, "Cannot read block bitmap - "
 				    "block_group = %u, block_bitmap = %llu",
 				    block_group, bitmap_blk);
 			return NULL;
@@ -419,8 +408,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
 		     sbi->s_itb_per_group)) {
-		ext4_error(sb, __func__,
-			   "Adding blocks in system zones - "
+		ext4_error(sb, "Adding blocks in system zones - "
 			   "Block = %llu, count = %lu",
 			   block, count);
 		goto error_return;
@@ -453,8 +441,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 		BUFFER_TRACE(bitmap_bh, "clear bit");
 		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
 					   bit + i, bitmap_bh->b_data)) {
-			ext4_error(sb, __func__,
-				   "bit already cleared for block %llu",
+			ext4_error(sb, "bit already cleared for block %llu",
 				   (ext4_fsblk_t)(block + i));
 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
 		} else {
@@ -499,44 +486,6 @@ error_return:
 }
 
 /**
- * ext4_free_blocks() -- Free given blocks and update quota
- * @handle:		handle for this transaction
- * @inode:		inode
- * @block:		start physical block to free
- * @count:		number of blocks to count
- * @metadata:		Are these metadata blocks
- */
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
-		      ext4_fsblk_t block, unsigned long count,
-		      int metadata)
-{
-	struct super_block *sb;
-	unsigned long dquot_freed_blocks;
-
-	/* this isn't the right place to decide whether block is metadata
-	 * inode.c/extents.c knows better, but for safety ... */
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		metadata = 1;
-
-	/* We need to make sure we don't reuse
-	 * block released untill the transaction commit.
-	 * writeback mode have weak data consistency so
-	 * don't force data as metadata when freeing block
-	 * for writeback mode.
-	 */
-	if (metadata == 0 && !ext4_should_writeback_data(inode))
-		metadata = 1;
-
-	sb = inode->i_sb;
-
-	ext4_mb_free_blocks(handle, inode, block, count,
-			    metadata, &dquot_freed_blocks);
-	if (dquot_freed_blocks)
-		vfs_dq_free_block(inode, dquot_freed_blocks);
-	return;
-}
-
-/**
 * ext4_has_free_blocks()
 * @sbi:	in-core super block structure.
 * @nblocks:	number of needed blocks
@@ -761,7 +710,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
 					ext4_group_t group)
 {
-	return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+	if (!ext4_bg_has_super(sb, group))
+		return 0;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+		return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+	else
+		return EXT4_SB(sb)->s_gdb_count;
 }
 
 /**
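Removing this wrapper is paired with the ext4.h change later in the patch, where ext4_free_blocks() is re-exported from the mballoc code with a flags word in place of the old int metadata argument. A hypothetical call site under the new signature (flag names from the EXT4_FREE_BLOCKS_* defines added in ext4.h, sketch only):

	/* Hypothetical call site: free 'count' metadata blocks and forget any
	 * buffers cached for them, instead of passing metadata = 1. */
	ext4_free_blocks(handle, inode, bh, block, count,
			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);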
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 50784ef07563..538c48655084 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -16,9 +16,9 @@
 #include <linux/module.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
-#include <linux/version.h>
 #include <linux/blkdev.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include "ext4.h"
 
 struct ext4_system_zone {
@@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb)
 		if (ext4_bg_has_super(sb, i) &&
 		    ((i < 5) || ((i % flex_size) == 0)))
 			add_system_zone(sbi, ext4_group_first_block_no(sb, i),
-					sbi->s_gdb_count + 1);
+					ext4_bg_num_gdb(sb, i) + 1);
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
 		if (ret)
@@ -206,14 +206,14 @@ void ext4_release_system_zone(struct super_block *sb)
 		entry = rb_entry(n, struct ext4_system_zone, node);
 		kmem_cache_free(ext4_system_zone_cachep, entry);
 		if (!parent)
-			EXT4_SB(sb)->system_blks.rb_node = NULL;
+			EXT4_SB(sb)->system_blks = RB_ROOT;
 		else if (parent->rb_left == n)
 			parent->rb_left = NULL;
 		else if (parent->rb_right == n)
 			parent->rb_right = NULL;
 		n = parent;
 	}
-	EXT4_SB(sb)->system_blks.rb_node = NULL;
+	EXT4_SB(sb)->system_blks = RB_ROOT;
 }
 
 /*
@@ -228,6 +228,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
 	struct rb_node *n = sbi->system_blks.rb_node;
 
 	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+	    (start_blk + count < start_blk) ||
 	    (start_blk + count > ext4_blocks_count(sbi->s_es)))
 		return 0;
 	while (n) {
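The new (start_blk + count < start_blk) test in ext4_data_block_valid() catches unsigned wraparound that the upper-bound comparison alone would miss. A worked illustration:

	/* With a corrupted extent near the top of the 64-bit range,
	 * start_blk + count wraps to a small value and would sneak past
	 * (start_blk + count > ext4_blocks_count(...)); the wraparound
	 * test rejects it. */
	ext4_fsblk_t start_blk = ~0ULL - 10;	/* bogus block number */
	unsigned int count = 100;		/* start_blk + count == 89 */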
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9dc93168e262..86cb6d86a048 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,10 +83,12 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 		error_msg = "inode out of bounds";
 
 	if (error_msg != NULL)
-		ext4_error(dir->i_sb, function,
-			"bad entry in directory #%lu: %s - "
-			"offset=%u, inode=%u, rec_len=%d, name_len=%d",
-			dir->i_ino, error_msg, offset,
+		__ext4_error(dir->i_sb, function,
+			"bad entry in directory #%lu: %s - block=%llu"
+			"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
+			dir->i_ino, error_msg,
+			(unsigned long long) bh->b_blocknr,
+			(unsigned) (offset%bh->b_size), offset,
 			le32_to_cpu(de->inode),
 			rlen, de->name_len);
 	return error_msg == NULL ? 1 : 0;
@@ -150,7 +152,7 @@ static int ext4_readdir(struct file *filp,
 		 */
 		if (!bh) {
 			if (!dir_has_error) {
-				ext4_error(sb, __func__, "directory #%lu "
+				ext4_error(sb, "directory #%lu "
					   "contains a hole at offset %Lu",
					   inode->i_ino,
					   (unsigned long long) filp->f_pos);
@@ -303,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 		kfree(old);
 	}
 	if (!parent)
-		root->rb_node = NULL;
+		*root = RB_ROOT;
 	else if (parent->rb_left == n)
 		parent->rb_left = NULL;
 	else if (parent->rb_right == n)
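The RB_ROOT hunks here and in block_validity.c replace field-by-field NULLing of struct rb_root with its initializer, so the reset stays correct even if the struct grows. The idiomatic teardown-and-reset pattern looks roughly like this (demo_node is a hypothetical stand-in type, sketch only; the code above uses a manual postorder walk instead):

	struct demo_node {
		struct rb_node node;
		/* payload ... */
	};

	/* Sketch: free every node, then reset the root in one assignment. */
	static void demo_free_tree(struct rb_root *root)
	{
		struct rb_node *n;

		while ((n = rb_first(root)) != NULL) {
			rb_erase(n, root);
			kfree(rb_entry(n, struct demo_node, node));
		}
		*root = RB_ROOT;
	}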
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8825515eeddd..bf938cf7c5f0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,6 +53,12 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+	ext4_error_inode(__func__, (inode), (fmt), ## a);
+
+#define EXT4_ERROR_FILE(file, fmt, a...)	\
+	ext4_error_file(__func__, (file), (fmt), ## a);
+
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
 
@@ -133,14 +139,14 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-#define	DIO_AIO_UNWRITTEN	0x1
+#define	EXT4_IO_UNWRITTEN	0x1
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished AIO list */
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
-	int			error;		/* I/O error code */
-	ext4_lblk_t		offset;		/* offset in the file */
-	size_t			size;		/* size of the extent */
+	struct page		*page;		/* page struct for buffer write */
+	loff_t			offset;		/* offset in the file */
+	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
 } ext4_io_end_t;
 
@@ -284,10 +290,12 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL		0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL		0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE	0x000B80FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE	0x004B80FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -313,17 +321,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 	return flags & EXT4_OTHER_FLMASK;
 }
 
-/*
- * Inode dynamic state flags
- */
-#define EXT4_STATE_JDATA		0x00000001 /* journaled data exists */
-#define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
-#define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
-#define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
-#define EXT4_STATE_EXT_MIGRATE		0x00000020 /* Inode is migrating */
-#define EXT4_STATE_DIO_UNWRITTEN	0x00000040 /* need convert on dio done*/
-
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
 	__u32 group;		/* Group number for this data */
@@ -361,19 +358,23 @@ struct ext4_new_group_data {
 	   so set the magic i_delalloc_reserve_flag after taking the
 	   inode allocation semaphore for */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
-	/* Call ext4_da_update_reserve_space() after successfully
-	   allocating the blocks */
-#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0008
 	/* caller is from the direct IO path, request to creation of an
 	   unitialized extents if not allocated, split the uninitialized
 	   extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_DIO			0x0010
-#define EXT4_GET_BLOCKS_CONVERT			0x0020
-#define EXT4_GET_BLOCKS_DIO_CREATE_EXT		(EXT4_GET_BLOCKS_DIO|\
+#define EXT4_GET_BLOCKS_PRE_IO			0x0008
+#define EXT4_GET_BLOCKS_CONVERT			0x0010
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_PRE_IO|\
+					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-	/* Convert extent to initialized after direct IO complete */
-#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
-					 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+
+/*
+ * Flags used by ext4_free_blocks
+ */
+#define EXT4_FREE_BLOCKS_METADATA	0x0001
+#define EXT4_FREE_BLOCKS_FORGET		0x0002
+#define EXT4_FREE_BLOCKS_VALIDATED	0x0004
 
 /*
 * ioctl commands
@@ -627,7 +628,7 @@ struct ext4_inode_info {
 	 * near to their parent directory's inode.
 	 */
 	ext4_group_t	i_block_group;
-	__u32	i_state;		/* Dynamic state flags for ext4 */
+	unsigned long	i_state_flags;		/* Dynamic state flags */
 
 	ext4_lblk_t		i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -693,16 +694,30 @@ struct ext4_inode_info {
 	unsigned int i_reserved_meta_blocks;
 	unsigned int i_allocated_meta_blocks;
 	unsigned short i_delalloc_reserved_flag;
+	sector_t i_da_metadata_calc_last_lblock;
+	int i_da_metadata_calc_len;
 
 	/* on-disk additional length */
 	__u16 i_extra_isize;
 
 	spinlock_t i_block_reservation_lock;
+#ifdef CONFIG_QUOTA
+	/* quota space reservation, managed internally by quota code */
+	qsize_t i_reserved_quota;
+#endif
 
-	/* completed async DIOs that might need unwritten extents handling */
-	struct list_head i_aio_dio_complete_list;
+	/* completed IOs that might need unwritten extents handling */
+	struct list_head i_completed_io_list;
+	spinlock_t i_completed_io_lock;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+
+	/*
+	 * Transactions that contain inode's metadata needed to complete
+	 * fsync and fdatasync, respectively.
+	 */
+	tid_t i_sync_tid;
+	tid_t i_datasync_tid;
 };
 
 /*
@@ -744,12 +759,14 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
 
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
@@ -997,7 +1014,7 @@ struct ext4_sb_info {
 	atomic_t s_lock_busy;
 
 	/* locality groups */
-	struct ext4_locality_group *s_locality_groups;
+	struct ext4_locality_group __percpu *s_locality_groups;
 
 	/* for write statistics */
 	unsigned long s_sectors_written_start;
@@ -1033,6 +1050,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		(ino >= EXT4_FIRST_INO(sb) &&
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+	EXT4_STATE_JDATA,		/* journaled data exists */
+	EXT4_STATE_NEW,			/* inode is newly created */
+	EXT4_STATE_XATTR,		/* has in-inode xattrs */
+	EXT4_STATE_NO_EXPAND,		/* No space for expansion */
+	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
+	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
+	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
+};
+
+static inline int ext4_test_inode_state(struct inode *inode, int bit)
+{
+	return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_set_inode_state(struct inode *inode, int bit)
+{
+	set_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_clear_inode_state(struct inode *inode, int bit)
+{
+	clear_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
 * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1109,6 +1154,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_FEATURE_INCOMPAT_64BIT		0x0080
 #define EXT4_FEATURE_INCOMPAT_MMP		0x0100
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
+#define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
+#define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1324,8 +1371,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 				ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1384,16 +1429,15 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
 extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
-extern void ext4_mb_free_blocks(handle_t *, struct inode *,
-		ext4_fsblk_t, unsigned long, int, unsigned long *);
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
+			     struct buffer_head *bh, ext4_fsblk_t block,
+			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
						ext4_group_t, int);
 /* inode.c */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-		struct buffer_head *bh, ext4_fsblk_t blocknr);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
						ext4_lblk_t, int, int *);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
@@ -1402,7 +1446,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create);
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern int ext4_write_inode(struct inode *, int);
+extern int ext4_write_inode(struct inode *, struct writeback_control *);
 extern int ext4_setattr(struct dentry *, struct iattr *);
 extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
				struct kstat *stat);
@@ -1424,8 +1468,10 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
-extern qsize_t ext4_get_reserved_space(struct inode *inode);
+extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1428extern int flush_aio_dio_completed_IO(struct inode *inode); 1472extern int flush_completed_IO(struct inode *inode);
1473extern void ext4_da_update_reserve_space(struct inode *inode,
1474 int used, int quota_claim);
1429/* ioctl.c */ 1475/* ioctl.c */
1430extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1476extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1431extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); 1477extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1449,13 +1495,20 @@ extern int ext4_group_extend(struct super_block *sb,
1449 ext4_fsblk_t n_blocks_count); 1495 ext4_fsblk_t n_blocks_count);
1450 1496
1451/* super.c */ 1497/* super.c */
1452extern void ext4_error(struct super_block *, const char *, const char *, ...) 1498extern void __ext4_error(struct super_block *, const char *, const char *, ...)
1499 __attribute__ ((format (printf, 3, 4)));
1500#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message)
1501extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
1502 __attribute__ ((format (printf, 3, 4)));
1503extern void ext4_error_file(const char *, struct file *, const char *, ...)
1453 __attribute__ ((format (printf, 3, 4))); 1504 __attribute__ ((format (printf, 3, 4)));
1454extern void __ext4_std_error(struct super_block *, const char *, int); 1505extern void __ext4_std_error(struct super_block *, const char *, int);
1455extern void ext4_abort(struct super_block *, const char *, const char *, ...) 1506extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1456 __attribute__ ((format (printf, 3, 4))); 1507 __attribute__ ((format (printf, 3, 4)));
1457extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1508extern void __ext4_warning(struct super_block *, const char *,
1509 const char *, ...)
1458 __attribute__ ((format (printf, 3, 4))); 1510 __attribute__ ((format (printf, 3, 4)));
1511#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message)
1459extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1512extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1460 __attribute__ ((format (printf, 3, 4))); 1513 __attribute__ ((format (printf, 3, 4)));
1461extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1514extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
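ext4_error() and ext4_warning() become macros that splice the caller's __func__ into variadic helpers, so call sites no longer pass the function name by hand. A compilable sketch of the same wrapping technique (my_error/__my_error are illustrative names, not from the patch):

    #include <stdarg.h>
    #include <stdio.h>

    static void __my_error(const char *func, const char *fmt, ...)
        __attribute__ ((format (printf, 2, 3)));

    static void __my_error(const char *func, const char *fmt, ...)
    {
        va_list args;

        va_start(args, fmt);
        fprintf(stderr, "error (%s): ", func);
        vfprintf(stderr, fmt, args);
        fputc('\n', stderr);
        va_end(args);
    }

    /* GNU named variadic macro; ", ## message" drops the comma
     * when the caller passes only a format string. */
    #define my_error(message...) __my_error(__func__, ## message)

    int main(void)
    {
        my_error("bad block %llu", 1234ULL);
        return 0;
    }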
@@ -1728,7 +1781,7 @@ extern void ext4_ext_release(struct super_block *);
1728extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1781extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1729 loff_t len); 1782 loff_t len);
1730extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1731 loff_t len); 1784 ssize_t len);
1732extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1733 sector_t block, unsigned int max_blocks, 1786 sector_t block, unsigned int max_blocks,
1734 struct buffer_head *bh, int flags); 1787 struct buffer_head *bh, int flags);
@@ -1740,6 +1793,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
1740 __u64 len, __u64 *moved_len); 1793 __u64 len, __u64 *moved_len);
1741 1794
1742 1795
1796/* ext4-private buffer-head state bits, allocated from the JBD-private range */
1797enum ext4_state_bits {
1798 BH_Uninit /* blocks are allocated but uninitialized on disk */
1799 = BH_JBDPrivateStart,
1800};
1801
1802BUFFER_FNS(Uninit, uninit)
1803TAS_BUFFER_FNS(Uninit, uninit)
1804
1743/* 1805/*
1744 * Add new method to test whether block and inode bitmaps are properly 1806
1745 * initialized. With uninit_bg reading the block from disk is not enough 1807 * initialized. With uninit_bg reading the block from disk is not enough
@@ -1757,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
1757 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); 1819 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
1758} 1820}
1759 1821
1822#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
1823
1760#endif /* __KERNEL__ */ 1824#endif /* __KERNEL__ */
1761 1825
1762#endif /* _EXT4_H */ 1826#endif /* _EXT4_H */
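The in_range() macro added above tests whether b falls inside [first, first + len). Two caveats worth noting from the definition: each argument may be evaluated more than once, so the arguments should be side-effect free, and with unsigned operands a len of 0 makes first + len - 1 wrap, so callers are expected to pass len >= 1. A quick standalone check:

    #include <assert.h>

    #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)

    int main(void)
    {
        /* block 105 lies inside the extent starting at 100 with length 8 */
        assert(in_range(105u, 100u, 8u));
        assert(in_range(100u, 100u, 8u));   /* first block is included */
        assert(!in_range(108u, 100u, 8u));  /* one past the end is not */
        return 0;
    }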
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 2ca686454e87..bdb6ce7e2eb4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); 225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226} 226}
227 227
228extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 228extern int ext4_ext_calc_metadata_amount(struct inode *inode,
229 sector_t lblocks);
229extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); 230extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
230extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 231extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
231extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); 232extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6a9409920dee..53d2764d71ca 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -4,6 +4,8 @@
4 4
5#include "ext4_jbd2.h" 5#include "ext4_jbd2.h"
6 6
7#include <trace/events/ext4.h>
8
7int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 9int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
8 struct buffer_head *bh) 10 struct buffer_head *bh)
9{ 11{
@@ -32,35 +34,69 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
32 return err; 34 return err;
33} 35}
34 36
35int __ext4_journal_forget(const char *where, handle_t *handle, 37/*
36 struct buffer_head *bh) 38 * The ext4 forget function must perform a revoke if we are freeing data
39 * which has been journaled. Metadata (eg. indirect blocks) must be
40 * revoked in all cases.
41 *
42 * "bh" may be NULL: a metadata block may have been freed from memory
43 * but there may still be a record of it in the journal, and that record
44 * still needs to be revoked.
45 *
46 * If the handle isn't valid we're not journaling, but we still need to
47 * call into ext4_journal_revoke() to put the buffer head.
48 */
49int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
50 struct inode *inode, struct buffer_head *bh,
51 ext4_fsblk_t blocknr)
37{ 52{
38 int err = 0; 53 int err;
39 54
40 if (ext4_handle_valid(handle)) { 55 might_sleep();
41 err = jbd2_journal_forget(handle, bh); 56
42 if (err) 57 trace_ext4_forget(inode, is_metadata, blocknr);
43 ext4_journal_abort_handle(where, __func__, bh, 58 BUFFER_TRACE(bh, "enter");
44 handle, err); 59
45 } 60 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
46 else 61 "data mode %x\n",
62 bh, is_metadata, inode->i_mode,
63 test_opt(inode->i_sb, DATA_FLAGS));
64
65 /* In the no journal case, we can just do a bforget and return */
66 if (!ext4_handle_valid(handle)) {
47 bforget(bh); 67 bforget(bh);
48 return err; 68 return 0;
49} 69 }
50 70
51int __ext4_journal_revoke(const char *where, handle_t *handle, 71 /* Never use the revoke function if we are doing full data
52 ext4_fsblk_t blocknr, struct buffer_head *bh) 72 * journaling: there is no need to, and a V1 superblock won't
53{ 73 * support it. Otherwise, only skip the revoke on un-journaled
54 int err = 0; 74 * data blocks. */
55 75
56 if (ext4_handle_valid(handle)) { 76 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
57 err = jbd2_journal_revoke(handle, blocknr, bh); 77 (!is_metadata && !ext4_should_journal_data(inode))) {
58 if (err) 78 if (bh) {
59 ext4_journal_abort_handle(where, __func__, bh, 79 BUFFER_TRACE(bh, "call jbd2_journal_forget");
60 handle, err); 80 err = jbd2_journal_forget(handle, bh);
81 if (err)
82 ext4_journal_abort_handle(where, __func__, bh,
83 handle, err);
84 return err;
85 }
86 return 0;
61 } 87 }
62 else 88
63 bforget(bh); 89 /*
90 * data!=journal && (is_metadata || should_journal_data(inode))
91 */
92 BUFFER_TRACE(bh, "call jbd2_journal_revoke");
93 err = jbd2_journal_revoke(handle, blocknr, bh);
94 if (err) {
95 ext4_journal_abort_handle(where, __func__, bh, handle, err);
96 ext4_abort(inode->i_sb, __func__,
97 "error %d when attempting revoke", err);
98 }
99 BUFFER_TRACE(bh, "exit");
64 return err; 100 return err;
65} 101}
66 102
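The consolidated __ext4_forget() replaces the separate forget/revoke wrappers with one decision: no valid handle means a plain bforget(), un-journaled data blocks only need jbd2_journal_forget(), and everything else (metadata, or journaled data) must be revoked, except that full data journaling never revokes. A distilled sketch of just that decision table (standalone; names are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    enum action { DO_BFORGET, DO_FORGET, DO_REVOKE };

    static enum action forget_action(bool handle_valid, bool fs_journals_all_data,
                                     bool is_metadata, bool inode_journals_data)
    {
        if (!handle_valid)                /* no journal: just drop the buffer */
            return DO_BFORGET;
        if (fs_journals_all_data ||       /* data=journal mode: never revoke */
            (!is_metadata && !inode_journals_data))
            return DO_FORGET;
        return DO_REVOKE;                 /* metadata, or journaled data */
    }

    int main(void)
    {
        printf("%d\n", forget_action(true, false, true, false));  /* 2: revoke */
        printf("%d\n", forget_action(true, false, false, false)); /* 1: forget */
        return 0;
    }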
@@ -89,14 +125,14 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
89 ext4_journal_abort_handle(where, __func__, bh, 125 ext4_journal_abort_handle(where, __func__, bh,
90 handle, err); 126 handle, err);
91 } else { 127 } else {
92 if (inode && bh) 128 if (inode)
93 mark_buffer_dirty_inode(bh, inode); 129 mark_buffer_dirty_inode(bh, inode);
94 else 130 else
95 mark_buffer_dirty(bh); 131 mark_buffer_dirty(bh);
96 if (inode && inode_needs_sync(inode)) { 132 if (inode && inode_needs_sync(inode)) {
97 sync_dirty_buffer(bh); 133 sync_dirty_buffer(bh);
98 if (buffer_req(bh) && !buffer_uptodate(bh)) { 134 if (buffer_req(bh) && !buffer_uptodate(bh)) {
99 ext4_error(inode->i_sb, __func__, 135 ext4_error(inode->i_sb,
100 "IO error syncing inode, " 136 "IO error syncing inode, "
101 "inode=%lu, block=%llu", 137 "inode=%lu, block=%llu",
102 inode->i_ino, 138 inode->i_ino,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a2865980342f..b79ad5126468 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -49,7 +49,7 @@
49 49
50#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ 50#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
51 EXT4_XATTR_TRANS_BLOCKS - 2 + \ 51 EXT4_XATTR_TRANS_BLOCKS - 2 + \
52 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 52 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
53 53
54/* 54/*
55 * Define the number of metadata blocks we need to account to modify data. 55 * Define the number of metadata blocks we need to account to modify data.
@@ -57,7 +57,7 @@
57 * This include super block, inode block, quota blocks and xattr blocks 57 * This include super block, inode block, quota blocks and xattr blocks
58 */ 58 */
59#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ 59#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
60 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 60 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
61 61
62/* Delete operations potentially hit one directory's namespace plus an 62/* Delete operations potentially hit one directory's namespace plus an
63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be 63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
@@ -92,6 +92,7 @@
92 * but inode, sb and group updates are done only once */ 92 * but inode, sb and group updates are done only once */
93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
94 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) 94 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
95
95#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ 96#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
96 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) 97 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
97#else 98#else
@@ -99,6 +100,9 @@
99#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 100#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
100#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 101#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
101#endif 102#endif
103#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
104#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
105#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
102 106
103int 107int
104ext4_mark_iloc_dirty(handle_t *handle, 108ext4_mark_iloc_dirty(handle_t *handle,
@@ -116,12 +120,8 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
116int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); 120int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
117 121
118/* 122/*
119 * Wrapper functions with which ext4 calls into JBD. The intent here is 123 * Wrapper functions with which ext4 calls into JBD.
120 * to allow these to be turned into appropriate stubs so ext4 can control
121 * ext2 filesystems, so ext2+ext4 systems only need one fs. This work hasn't
122 * been done yet.
123 */ 124 */
124
125void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, const char *err_fn,
126 struct buffer_head *bh, handle_t *handle, int err); 126 struct buffer_head *bh, handle_t *handle, int err);
127 127
@@ -131,13 +131,9 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
131int __ext4_journal_get_write_access(const char *where, handle_t *handle, 131int __ext4_journal_get_write_access(const char *where, handle_t *handle,
132 struct buffer_head *bh); 132 struct buffer_head *bh);
133 133
134/* When called with an invalid handle, this will still do a put on the BH */ 134int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
135int __ext4_journal_forget(const char *where, handle_t *handle, 135 struct inode *inode, struct buffer_head *bh,
136 struct buffer_head *bh); 136 ext4_fsblk_t blocknr);
137
138/* When called with an invalid handle, this will still do a put on the BH */
139int __ext4_journal_revoke(const char *where, handle_t *handle,
140 ext4_fsblk_t blocknr, struct buffer_head *bh);
141 137
142int __ext4_journal_get_create_access(const char *where, 138int __ext4_journal_get_create_access(const char *where,
143 handle_t *handle, struct buffer_head *bh); 139 handle_t *handle, struct buffer_head *bh);
@@ -149,12 +145,11 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
149 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 145 __ext4_journal_get_undo_access(__func__, (handle), (bh))
150#define ext4_journal_get_write_access(handle, bh) \ 146#define ext4_journal_get_write_access(handle, bh) \
151 __ext4_journal_get_write_access(__func__, (handle), (bh)) 147 __ext4_journal_get_write_access(__func__, (handle), (bh))
152#define ext4_journal_revoke(handle, blocknr, bh) \ 148#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
153 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) 149 __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\
150 (block_nr))
154#define ext4_journal_get_create_access(handle, bh) \ 151#define ext4_journal_get_create_access(handle, bh) \
155 __ext4_journal_get_create_access(__func__, (handle), (bh)) 152 __ext4_journal_get_create_access(__func__, (handle), (bh))
156#define ext4_journal_forget(handle, bh) \
157 __ext4_journal_forget(__func__, (handle), (bh))
158#define ext4_handle_dirty_metadata(handle, inode, bh) \ 153#define ext4_handle_dirty_metadata(handle, inode, bh) \
159 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) 154 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
160 155
@@ -254,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
254 return 0; 249 return 0;
255} 250}
256 251
252static inline void ext4_update_inode_fsync_trans(handle_t *handle,
253 struct inode *inode,
254 int datasync)
255{
256 struct ext4_inode_info *ei = EXT4_I(inode);
257
258 if (ext4_handle_valid(handle)) {
259 ei->i_sync_tid = handle->h_transaction->t_tid;
260 if (datasync)
261 ei->i_datasync_tid = handle->h_transaction->t_tid;
262 }
263}
264
257/* super.c */ 265/* super.c */
258int ext4_force_commit(struct super_block *sb); 266int ext4_force_commit(struct super_block *sb);
259 267
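ext4_update_inode_fsync_trans() records, per inode, the tid of the running transaction that last changed it, and separately the tid that matters for fdatasync. fsync can then skip forcing a journal commit when that transaction has already committed. A standalone sketch of the idea (the jbd2 side is paraphrased; wraparound-safe tid comparison is omitted):

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int tid_t;

    struct journal_state { tid_t committed_tid; };
    struct inode_state   { tid_t i_sync_tid, i_datasync_tid; };

    /* Called when a transaction modifies the inode. */
    static void update_fsync_trans(struct inode_state *ei, tid_t running_tid,
                                   bool matters_for_datasync)
    {
        ei->i_sync_tid = running_tid;
        if (matters_for_datasync)
            ei->i_datasync_tid = running_tid;
    }

    /* fsync/fdatasync only needs a commit if the recorded tid is newer. */
    static bool needs_commit(const struct journal_state *j,
                             const struct inode_state *ei, bool datasync)
    {
        tid_t want = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
        return want > j->committed_tid;
    }

    int main(void)
    {
        struct journal_state j = { .committed_tid = 7 };
        struct inode_state ei = { 0, 0 };

        update_fsync_trans(&ei, 8, true);
        printf("fsync commit needed: %d\n", needs_commit(&j, &ei, false)); /* 1 */
        return 0;
    }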
@@ -296,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
296 return 0; 304 return 0;
297} 305}
298 306
307/*
308 * This function controls whether or not we should try to go down the
309 * dioread_nolock code paths, which makes it safe to avoid taking
310 * i_mutex for direct I/O reads. This only works for extent-based
311 * files, and it doesn't work for nobh or if data journaling is
312 * enabled, since the dioread_nolock code uses b_private to pass
313 * information back to the I/O completion handler, and this conflicts
314 * with the jbd's use of b_private.
315 */
316static inline int ext4_should_dioread_nolock(struct inode *inode)
317{
318 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
319 return 0;
320 if (test_opt(inode->i_sb, NOBH))
321 return 0;
322 if (!S_ISREG(inode->i_mode))
323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
325 return 0;
326 if (ext4_should_journal_data(inode))
327 return 0;
328 return 1;
329}
330
299#endif /* _EXT4_JBD2_H */ 331#endif /* _EXT4_JBD2_H */
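ext4_should_dioread_nolock() above is a pure predicate over mount options and inode flags, written as a ladder of early returns so each disqualifying condition reads on its own line. The same shape, condensed into a standalone sketch (struct and field names are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    struct mount_opts { bool dioread_nolock, nobh, journal_data; };
    struct file_state { bool is_regular, extent_based; };

    static bool should_dioread_nolock(const struct mount_opts *o,
                                      const struct file_state *f)
    {
        if (!o->dioread_nolock)
            return false;           /* feature not requested */
        if (o->nobh)
            return false;           /* needs buffer heads */
        if (!f->is_regular)
            return false;           /* regular files only */
        if (!f->extent_based)
            return false;           /* extent-mapped files only */
        if (o->journal_data)
            return false;           /* b_private clashes with jbd */
        return true;
    }

    int main(void)
    {
        struct mount_opts o = { .dioread_nolock = true };
        struct file_state f = { .is_regular = true, .extent_based = true };

        printf("%d\n", should_dioread_nolock(&o, &f)); /* 1 */
        return 0;
    }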
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 715264b4bae4..94c8ee81f5e1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
195 if (S_ISREG(inode->i_mode)) 195 if (S_ISREG(inode->i_mode))
196 block_group++; 196 block_group++;
197 } 197 }
198 bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + 198 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
199 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
200 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 199 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
201 200
202 /* 201 /*
@@ -296,29 +295,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
296 * to allocate @blocks 295 * to allocate @blocks
297 * Worst case is one block per extent 296
298 */ 297 */
299int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) 298int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
300{ 299{
301 int lcap, icap, rcap, leafs, idxs, num; 300 struct ext4_inode_info *ei = EXT4_I(inode);
302 int newextents = blocks; 301 int idxs, num = 0;
303
304 rcap = ext4_ext_space_root_idx(inode, 0);
305 lcap = ext4_ext_space_block(inode, 0);
306 icap = ext4_ext_space_block_idx(inode, 0);
307 302
308 /* number of new leaf blocks needed */ 303 idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
309 num = leafs = (newextents + lcap - 1) / lcap; 304 / sizeof(struct ext4_extent_idx));
310 305
311 /* 306 /*
312 * Worst case, we need separate index block(s) 307 * If the new delayed allocation block is contiguous with the
313 * to link all new leaf blocks 308 * previous da block, it can share index blocks with the
309 * previous block, so we only need to allocate a new index
310 * block every idxs leaf blocks. At idxs**2 blocks, we need
311 * an additional index block, and at idxs**3 blocks, yet
312 * another index block.
314 */ 313 */
315 idxs = (leafs + icap - 1) / icap; 314 if (ei->i_da_metadata_calc_len &&
316 do { 315 ei->i_da_metadata_calc_last_lblock+1 == lblock) {
317 num += idxs; 316 if ((ei->i_da_metadata_calc_len % idxs) == 0)
318 idxs = (idxs + icap - 1) / icap; 317 num++;
319 } while (idxs > rcap); 318 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
319 num++;
320 if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
321 num++;
322 ei->i_da_metadata_calc_len = 0;
323 } else
324 ei->i_da_metadata_calc_len++;
325 ei->i_da_metadata_calc_last_lblock++;
326 return num;
327 }
320 328
321 return num; 329 /*
330 * In the worst case we need a new set of index blocks at
331 * every level of the inode's extent tree.
332 */
333 ei->i_da_metadata_calc_len = 1;
334 ei->i_da_metadata_calc_last_lblock = lblock;
335 return ext_depth(inode) + 1;
322} 336}
323 337
324static int 338static int
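To put numbers on the new calculation: with 4 KiB blocks, and the extent header and an index entry both being 12 bytes on disk (per the structs in ext4_extents.h), idxs works out to (4096 - 12) / 12 = 340. Within one contiguous run of delayed allocations, an extra metadata block is charged every 340 blocks, another every 340^2, and so on, while a run starting at a non-contiguous block is charged the full worst case of one block per tree level. Checking the arithmetic:

    #include <stdio.h>

    int main(void)
    {
        unsigned blocksize = 4096;          /* typical ext4 block size */
        unsigned hdr = 12, idx_entry = 12;  /* on-disk struct sizes */
        unsigned idxs = (blocksize - hdr) / idx_entry;

        printf("idxs   = %u\n", idxs);          /* 340 */
        printf("idxs^2 = %u\n", idxs * idxs);   /* 115600 */
        return 0;
    }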
@@ -425,7 +439,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
425 return 0; 439 return 0;
426 440
427corrupted: 441corrupted:
428 ext4_error(inode->i_sb, function, 442 __ext4_error(inode->i_sb, function,
429 "bad header/extent in inode #%lu: %s - magic %x, " 443 "bad header/extent in inode #%lu: %s - magic %x, "
430 "entries %u, max %u(%u), depth %u(%u)", 444 "entries %u, max %u(%u), depth %u(%u)",
431 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
@@ -688,7 +702,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
688 } 702 }
689 eh = ext_block_hdr(bh); 703 eh = ext_block_hdr(bh);
690 ppos++; 704 ppos++;
691 BUG_ON(ppos > depth); 705 if (unlikely(ppos > depth)) {
706 put_bh(bh);
707 EXT4_ERROR_INODE(inode,
708 "ppos %d > depth %d", ppos, depth);
709 goto err;
710 }
692 path[ppos].p_bh = bh; 711 path[ppos].p_bh = bh;
693 path[ppos].p_hdr = eh; 712 path[ppos].p_hdr = eh;
694 i--; 713 i--;
@@ -734,7 +753,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
734 if (err) 753 if (err)
735 return err; 754 return err;
736 755
737 BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); 756 if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
757 EXT4_ERROR_INODE(inode,
758 "logical %d == ei_block %d!",
759 logical, le32_to_cpu(curp->p_idx->ei_block));
760 return -EIO;
761 }
738 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; 762 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
739 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 763 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
740 /* insert after */ 764 /* insert after */
@@ -764,9 +788,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
764 ext4_idx_store_pblock(ix, ptr); 788 ext4_idx_store_pblock(ix, ptr);
765 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 789 le16_add_cpu(&curp->p_hdr->eh_entries, 1);
766 790
767 BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) 791 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
768 > le16_to_cpu(curp->p_hdr->eh_max)); 792 > le16_to_cpu(curp->p_hdr->eh_max))) {
769 BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); 793 EXT4_ERROR_INODE(inode,
794 "logical %d == ei_block %d!",
795 logical, le32_to_cpu(curp->p_idx->ei_block));
796 return -EIO;
797 }
798 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
799 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
800 return -EIO;
801 }
770 802
771 err = ext4_ext_dirty(handle, inode, curp); 803 err = ext4_ext_dirty(handle, inode, curp);
772 ext4_std_error(inode->i_sb, err); 804 ext4_std_error(inode->i_sb, err);
@@ -804,7 +836,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
804 836
805 /* if current leaf will be split, then we should use 837 /* if current leaf will be split, then we should use
806 * border from split point */ 838 * border from split point */
807 BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); 839 if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
840 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
841 return -EIO;
842 }
808 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { 843 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
809 border = path[depth].p_ext[1].ee_block; 844 border = path[depth].p_ext[1].ee_block;
810 ext_debug("leaf will be split." 845 ext_debug("leaf will be split."
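This hunk and the many that follow apply one mechanical transformation: a BUG_ON() that would crash the kernel on corrupted on-disk metadata becomes a logged error plus an -EIO return, with unlikely() keeping the check off the hot path. The shape of the rewrite as a compilable stand-in (report_error plays the role of EXT4_ERROR_INODE; the macros here are local stand-ins, not the kernel's):

    #include <stdio.h>

    #define unlikely(x) __builtin_expect(!!(x), 0)
    #define report_error(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
    #define EIO 5

    /* Before: BUG_ON(hdr == NULL) took the whole machine down.
     * After: the corruption is reported and the caller gets -EIO. */
    static int check_header(const void *hdr, int depth)
    {
        if (unlikely(hdr == NULL)) {
            report_error("path[%d].p_hdr == NULL", depth);
            return -EIO;
        }
        return 0;
    }

    int main(void)
    {
        printf("%d\n", check_header(NULL, 2)); /* -5 */
        return 0;
    }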
@@ -845,7 +880,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
845 880
846 /* initialize new leaf */ 881 /* initialize new leaf */
847 newblock = ablocks[--a]; 882 newblock = ablocks[--a];
848 BUG_ON(newblock == 0); 883 if (unlikely(newblock == 0)) {
884 EXT4_ERROR_INODE(inode, "newblock == 0!");
885 err = -EIO;
886 goto cleanup;
887 }
849 bh = sb_getblk(inode->i_sb, newblock); 888 bh = sb_getblk(inode->i_sb, newblock);
850 if (!bh) { 889 if (!bh) {
851 err = -EIO; 890 err = -EIO;
@@ -865,7 +904,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
865 ex = EXT_FIRST_EXTENT(neh); 904 ex = EXT_FIRST_EXTENT(neh);
866 905
867 /* move remainder of path[depth] to the new leaf */ 906 /* move remainder of path[depth] to the new leaf */
868 BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); 907 if (unlikely(path[depth].p_hdr->eh_entries !=
908 path[depth].p_hdr->eh_max)) {
909 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
910 path[depth].p_hdr->eh_entries,
911 path[depth].p_hdr->eh_max);
912 err = -EIO;
913 goto cleanup;
914 }
869 /* start copy from next extent */ 915 /* start copy from next extent */
870 /* TODO: we could do it by single memmove */ 916 /* TODO: we could do it by single memmove */
871 m = 0; 917 m = 0;
@@ -912,7 +958,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
912 958
913 /* create intermediate indexes */ 959 /* create intermediate indexes */
914 k = depth - at - 1; 960 k = depth - at - 1;
915 BUG_ON(k < 0); 961 if (unlikely(k < 0)) {
962 EXT4_ERROR_INODE(inode, "k %d < 0!", k);
963 err = -EIO;
964 goto cleanup;
965 }
916 if (k) 966 if (k)
917 ext_debug("create %d intermediate indices\n", k); 967 ext_debug("create %d intermediate indices\n", k);
918 /* insert new index into current index block */ 968 /* insert new index into current index block */
@@ -949,8 +999,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
949 999
950 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1000 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
951 EXT_MAX_INDEX(path[i].p_hdr)); 1001 EXT_MAX_INDEX(path[i].p_hdr));
952 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != 1002 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
953 EXT_LAST_INDEX(path[i].p_hdr)); 1003 EXT_LAST_INDEX(path[i].p_hdr))) {
1004 EXT4_ERROR_INODE(inode,
1005 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1006 le32_to_cpu(path[i].p_ext->ee_block));
1007 err = -EIO;
1008 goto cleanup;
1009 }
954 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 1010 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
955 ext_debug("%d: move %d:%llu in new index %llu\n", i, 1011 ext_debug("%d: move %d:%llu in new index %llu\n", i,
956 le32_to_cpu(path[i].p_idx->ei_block), 1012 le32_to_cpu(path[i].p_idx->ei_block),
@@ -1007,7 +1063,8 @@ cleanup:
1007 for (i = 0; i < depth; i++) { 1063 for (i = 0; i < depth; i++) {
1008 if (!ablocks[i]) 1064 if (!ablocks[i])
1009 continue; 1065 continue;
1010 ext4_free_blocks(handle, inode, ablocks[i], 1, 1); 1066 ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
1067 EXT4_FREE_BLOCKS_METADATA);
1011 } 1068 }
1012 } 1069 }
1013 kfree(ablocks); 1070 kfree(ablocks);
@@ -1187,7 +1244,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1187 struct ext4_extent *ex; 1244 struct ext4_extent *ex;
1188 int depth, ee_len; 1245 int depth, ee_len;
1189 1246
1190 BUG_ON(path == NULL); 1247 if (unlikely(path == NULL)) {
1248 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1249 return -EIO;
1250 }
1191 depth = path->p_depth; 1251 depth = path->p_depth;
1192 *phys = 0; 1252 *phys = 0;
1193 1253
@@ -1201,15 +1261,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1201 ex = path[depth].p_ext; 1261 ex = path[depth].p_ext;
1202 ee_len = ext4_ext_get_actual_len(ex); 1262 ee_len = ext4_ext_get_actual_len(ex);
1203 if (*logical < le32_to_cpu(ex->ee_block)) { 1263 if (*logical < le32_to_cpu(ex->ee_block)) {
1204 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); 1264 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1265 EXT4_ERROR_INODE(inode,
1266 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1267 *logical, le32_to_cpu(ex->ee_block));
1268 return -EIO;
1269 }
1205 while (--depth >= 0) { 1270 while (--depth >= 0) {
1206 ix = path[depth].p_idx; 1271 ix = path[depth].p_idx;
1207 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); 1272 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1273 EXT4_ERROR_INODE(inode,
1274 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1275 ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1276 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1277 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1278 depth);
1279 return -EIO;
1280 }
1208 } 1281 }
1209 return 0; 1282 return 0;
1210 } 1283 }
1211 1284
1212 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); 1285 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1286 EXT4_ERROR_INODE(inode,
1287 "logical %d < ee_block %d + ee_len %d!",
1288 *logical, le32_to_cpu(ex->ee_block), ee_len);
1289 return -EIO;
1290 }
1213 1291
1214 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1292 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1215 *phys = ext_pblock(ex) + ee_len - 1; 1293 *phys = ext_pblock(ex) + ee_len - 1;
@@ -1235,7 +1313,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1235 int depth; /* Note, NOT eh_depth; depth from top of tree */ 1313 int depth; /* Note, NOT eh_depth; depth from top of tree */
1236 int ee_len; 1314 int ee_len;
1237 1315
1238 BUG_ON(path == NULL); 1316 if (unlikely(path == NULL)) {
1317 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1318 return -EIO;
1319 }
1239 depth = path->p_depth; 1320 depth = path->p_depth;
1240 *phys = 0; 1321 *phys = 0;
1241 1322
@@ -1249,17 +1330,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1249 ex = path[depth].p_ext; 1330 ex = path[depth].p_ext;
1250 ee_len = ext4_ext_get_actual_len(ex); 1331 ee_len = ext4_ext_get_actual_len(ex);
1251 if (*logical < le32_to_cpu(ex->ee_block)) { 1332 if (*logical < le32_to_cpu(ex->ee_block)) {
1252 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); 1333 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1334 EXT4_ERROR_INODE(inode,
1335 "first_extent(path[%d].p_hdr) != ex",
1336 depth);
1337 return -EIO;
1338 }
1253 while (--depth >= 0) { 1339 while (--depth >= 0) {
1254 ix = path[depth].p_idx; 1340 ix = path[depth].p_idx;
1255 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); 1341 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1342 EXT4_ERROR_INODE(inode,
1343 "ix != EXT_FIRST_INDEX *logical %d!",
1344 *logical);
1345 return -EIO;
1346 }
1256 } 1347 }
1257 *logical = le32_to_cpu(ex->ee_block); 1348 *logical = le32_to_cpu(ex->ee_block);
1258 *phys = ext_pblock(ex); 1349 *phys = ext_pblock(ex);
1259 return 0; 1350 return 0;
1260 } 1351 }
1261 1352
1262 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); 1353 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1354 EXT4_ERROR_INODE(inode,
1355 "logical %d < ee_block %d + ee_len %d!",
1356 *logical, le32_to_cpu(ex->ee_block), ee_len);
1357 return -EIO;
1358 }
1263 1359
1264 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1360 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1265 /* next allocated block in this leaf */ 1361 /* next allocated block in this leaf */
@@ -1398,8 +1494,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1398 1494
1399 eh = path[depth].p_hdr; 1495 eh = path[depth].p_hdr;
1400 ex = path[depth].p_ext; 1496 ex = path[depth].p_ext;
1401 BUG_ON(ex == NULL); 1497
1402 BUG_ON(eh == NULL); 1498 if (unlikely(ex == NULL || eh == NULL)) {
1499 EXT4_ERROR_INODE(inode,
1500 "ex %p == NULL or eh %p == NULL", ex, eh);
1501 return -EIO;
1502 }
1403 1503
1404 if (depth == 0) { 1504 if (depth == 0) {
1405 /* there is no tree at all */ 1505 /* there is no tree at all */
@@ -1522,8 +1622,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
1522 merge_done = 1; 1622 merge_done = 1;
1523 WARN_ON(eh->eh_entries == 0); 1623 WARN_ON(eh->eh_entries == 0);
1524 if (!eh->eh_entries) 1624 if (!eh->eh_entries)
1525 ext4_error(inode->i_sb, "ext4_ext_try_to_merge", 1625 ext4_error(inode->i_sb,
1526 "inode#%lu, eh->eh_entries = 0!", inode->i_ino); 1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1527 } 1628 }
1528 1629
1529 return merge_done; 1630 return merge_done;
@@ -1596,13 +1697,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1596 ext4_lblk_t next; 1697 ext4_lblk_t next;
1597 unsigned uninitialized = 0; 1698 unsigned uninitialized = 0;
1598 1699
1599 BUG_ON(ext4_ext_get_actual_len(newext) == 0); 1700 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1701 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1702 return -EIO;
1703 }
1600 depth = ext_depth(inode); 1704 depth = ext_depth(inode);
1601 ex = path[depth].p_ext; 1705 ex = path[depth].p_ext;
1602 BUG_ON(path[depth].p_hdr == NULL); 1706 if (unlikely(path[depth].p_hdr == NULL)) {
1707 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1708 return -EIO;
1709 }
1603 1710
1604 /* try to insert block into found extent and return */ 1711 /* try to insert block into found extent and return */
1605 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) 1712 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1606 && ext4_can_extents_be_merged(inode, ex, newext)) { 1713 && ext4_can_extents_be_merged(inode, ex, newext)) {
1607 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1714 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1608 ext4_ext_is_uninitialized(newext), 1715 ext4_ext_is_uninitialized(newext),
@@ -1723,7 +1830,7 @@ has_space:
1723 1830
1724merge: 1831merge:
1725 /* try to merge extents to the right */ 1832 /* try to merge extents to the right */
1726 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) 1833 if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1727 ext4_ext_try_to_merge(inode, path, nearex); 1834 ext4_ext_try_to_merge(inode, path, nearex);
1728 1835
1729 /* try to merge extents to the left */ 1836 /* try to merge extents to the left */
@@ -1761,7 +1868,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1761 while (block < last && block != EXT_MAX_BLOCK) { 1868 while (block < last && block != EXT_MAX_BLOCK) {
1762 num = last - block; 1869 num = last - block;
1763 /* find extent for this block */ 1870 /* find extent for this block */
1871 down_read(&EXT4_I(inode)->i_data_sem);
1764 path = ext4_ext_find_extent(inode, block, path); 1872 path = ext4_ext_find_extent(inode, block, path);
1873 up_read(&EXT4_I(inode)->i_data_sem);
1765 if (IS_ERR(path)) { 1874 if (IS_ERR(path)) {
1766 err = PTR_ERR(path); 1875 err = PTR_ERR(path);
1767 path = NULL; 1876 path = NULL;
@@ -1769,7 +1878,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1769 } 1878 }
1770 1879
1771 depth = ext_depth(inode); 1880 depth = ext_depth(inode);
1772 BUG_ON(path[depth].p_hdr == NULL); 1881 if (unlikely(path[depth].p_hdr == NULL)) {
1882 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1883 err = -EIO;
1884 break;
1885 }
1773 ex = path[depth].p_ext; 1886 ex = path[depth].p_ext;
1774 next = ext4_ext_next_allocated_block(path); 1887 next = ext4_ext_next_allocated_block(path);
1775 1888
@@ -1820,7 +1933,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1820 cbex.ec_type = EXT4_EXT_CACHE_EXTENT; 1933 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1821 } 1934 }
1822 1935
1823 BUG_ON(cbex.ec_len == 0); 1936 if (unlikely(cbex.ec_len == 0)) {
1937 EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1938 err = -EIO;
1939 break;
1940 }
1824 err = func(inode, path, &cbex, ex, cbdata); 1941 err = func(inode, path, &cbex, ex, cbdata);
1825 ext4_ext_drop_refs(path); 1942 ext4_ext_drop_refs(path);
1826 1943
@@ -1934,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1934 2051
1935 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && 2052 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1936 cex->ec_type != EXT4_EXT_CACHE_EXTENT); 2053 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1937 if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { 2054 if (in_range(block, cex->ec_block, cex->ec_len)) {
1938 ex->ee_block = cpu_to_le32(cex->ec_block); 2055 ex->ee_block = cpu_to_le32(cex->ec_block);
1939 ext4_ext_store_pblock(ex, cex->ec_start); 2056 ext4_ext_store_pblock(ex, cex->ec_start);
1940 ex->ee_len = cpu_to_le16(cex->ec_len); 2057 ex->ee_len = cpu_to_le16(cex->ec_len);
@@ -1957,14 +2074,16 @@ errout:
1957static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2074static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1958 struct ext4_ext_path *path) 2075 struct ext4_ext_path *path)
1959{ 2076{
1960 struct buffer_head *bh;
1961 int err; 2077 int err;
1962 ext4_fsblk_t leaf; 2078 ext4_fsblk_t leaf;
1963 2079
1964 /* free index block */ 2080 /* free index block */
1965 path--; 2081 path--;
1966 leaf = idx_pblock(path->p_idx); 2082 leaf = idx_pblock(path->p_idx);
1967 BUG_ON(path->p_hdr->eh_entries == 0); 2083 if (unlikely(path->p_hdr->eh_entries == 0)) {
2084 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2085 return -EIO;
2086 }
1968 err = ext4_ext_get_access(handle, inode, path); 2087 err = ext4_ext_get_access(handle, inode, path);
1969 if (err) 2088 if (err)
1970 return err; 2089 return err;
@@ -1973,9 +2092,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1973 if (err) 2092 if (err)
1974 return err; 2093 return err;
1975 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2094 ext_debug("index is empty, remove it, free block %llu\n", leaf);
1976 bh = sb_find_get_block(inode->i_sb, leaf); 2095 ext4_free_blocks(handle, inode, 0, leaf, 1,
1977 ext4_forget(handle, 1, inode, bh, leaf); 2096 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1978 ext4_free_blocks(handle, inode, leaf, 1, 1);
1979 return err; 2097 return err;
1980} 2098}
1981 2099
@@ -2042,12 +2160,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2042 struct ext4_extent *ex, 2160 struct ext4_extent *ex,
2043 ext4_lblk_t from, ext4_lblk_t to) 2161 ext4_lblk_t from, ext4_lblk_t to)
2044{ 2162{
2045 struct buffer_head *bh;
2046 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2163 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2047 int i, metadata = 0; 2164 int flags = EXT4_FREE_BLOCKS_FORGET;
2048 2165
2049 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2166 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2050 metadata = 1; 2167 flags |= EXT4_FREE_BLOCKS_METADATA;
2051#ifdef EXTENTS_STATS 2168#ifdef EXTENTS_STATS
2052 { 2169 {
2053 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2170 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2072,11 +2189,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2072 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2189 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2073 start = ext_pblock(ex) + ee_len - num; 2190 start = ext_pblock(ex) + ee_len - num;
2074 ext_debug("free last %u blocks starting %llu\n", num, start); 2191 ext_debug("free last %u blocks starting %llu\n", num, start);
2075 for (i = 0; i < num; i++) { 2192 ext4_free_blocks(handle, inode, 0, start, num, flags);
2076 bh = sb_find_get_block(inode->i_sb, start + i);
2077 ext4_forget(handle, 0, inode, bh, start + i);
2078 }
2079 ext4_free_blocks(handle, inode, start, num, metadata);
2080 } else if (from == le32_to_cpu(ex->ee_block) 2193 } else if (from == le32_to_cpu(ex->ee_block)
2081 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2194 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2082 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2195 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2108,8 +2221,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2108 if (!path[depth].p_hdr) 2221 if (!path[depth].p_hdr)
2109 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2222 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2110 eh = path[depth].p_hdr; 2223 eh = path[depth].p_hdr;
2111 BUG_ON(eh == NULL); 2224 if (unlikely(path[depth].p_hdr == NULL)) {
2112 2225 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2226 return -EIO;
2227 }
2113 /* find where to start removing */ 2228 /* find where to start removing */
2114 ex = EXT_LAST_EXTENT(eh); 2229 ex = EXT_LAST_EXTENT(eh);
2115 2230
@@ -2167,7 +2282,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2167 correct_index = 1; 2282 correct_index = 1;
2168 credits += (ext_depth(inode)) + 1; 2283 credits += (ext_depth(inode)) + 1;
2169 } 2284 }
2170 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2285 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2171 2286
2172 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2287 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2173 if (err) 2288 if (err)
@@ -2972,7 +3087,7 @@ fix_extent_len:
2972 ext4_ext_dirty(handle, inode, path + depth); 3087 ext4_ext_dirty(handle, inode, path + depth);
2973 return err; 3088 return err;
2974} 3089}
2975static int ext4_convert_unwritten_extents_dio(handle_t *handle, 3090static int ext4_convert_unwritten_extents_endio(handle_t *handle,
2976 struct inode *inode, 3091 struct inode *inode,
2977 struct ext4_ext_path *path) 3092 struct ext4_ext_path *path)
2978{ 3093{
@@ -3027,6 +3142,14 @@ out:
3027 return err; 3142 return err;
3028} 3143}
3029 3144
3145static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3146 sector_t block, int count)
3147{
3148 int i;
3149 for (i = 0; i < count; i++)
3150 unmap_underlying_metadata(bdev, block + i);
3151}
3152
3030static int 3153static int
3031ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3032 ext4_lblk_t iblock, unsigned int max_blocks, 3155 ext4_lblk_t iblock, unsigned int max_blocks,
@@ -3044,8 +3167,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3044 flags, allocated); 3167 flags, allocated);
3045 ext4_ext_show_leaf(inode, path); 3168 ext4_ext_show_leaf(inode, path);
3046 3169
3047 /* DIO get_block() before submit the IO, split the extent */ 3170 /* get_block() before submit the IO, split the extent */
3048 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { 3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3049 ret = ext4_split_unwritten_extents(handle, 3172 ret = ext4_split_unwritten_extents(handle,
3050 inode, path, iblock, 3173 inode, path, iblock,
3051 max_blocks, flags); 3174 max_blocks, flags);
@@ -3055,15 +3178,19 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3055 * completed 3178 * completed
3056 */ 3179 */
3057 if (io) 3180 if (io)
3058 io->flag = DIO_AIO_UNWRITTEN; 3181 io->flag = EXT4_IO_UNWRITTEN;
3059 else 3182 else
3060 EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; 3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result);
3061 goto out; 3186 goto out;
3062 } 3187 }
3063 /* async DIO end_io complete, convert the filled extent to written */ 3188 /* IO end_io complete, convert the filled extent to written */
3064 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { 3189 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3065 ret = ext4_convert_unwritten_extents_dio(handle, inode, 3190 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3066 path); 3191 path);
3192 if (ret >= 0)
3193 ext4_update_inode_fsync_trans(handle, inode, 1);
3067 goto out2; 3194 goto out2;
3068 } 3195 }
3069 /* buffered IO case */ 3196 /* buffered IO case */
@@ -3091,6 +3218,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3091 ret = ext4_ext_convert_to_initialized(handle, inode, 3218 ret = ext4_ext_convert_to_initialized(handle, inode,
3092 path, iblock, 3219 path, iblock,
3093 max_blocks); 3220 max_blocks);
3221 if (ret >= 0)
3222 ext4_update_inode_fsync_trans(handle, inode, 1);
3094out: 3223out:
3095 if (ret <= 0) { 3224 if (ret <= 0) {
3096 err = ret; 3225 err = ret;
@@ -3098,6 +3227,30 @@ out:
3098 } else 3227 } else
3099 allocated = ret; 3228 allocated = ret;
3100 set_buffer_new(bh_result); 3229 set_buffer_new(bh_result);
3230 /*
3231 * If we allocated more blocks than requested,
3232 * we need to make sure we unmap the extra blocks
3233 * allocated. The block actually needed will get
3234 * unmapped later when we find the buffer_head marked
3235 * new.
3236 */
3237 if (allocated > max_blocks) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks,
3240 allocated - max_blocks);
3241 allocated = max_blocks;
3242 }
3243
3244 /*
3245 * If fallocate was done at an offset that was already
3246 * delayed-allocated, block and quota reservations would
3247 * have been made in the delayed write path.
3248 * But fallocate would have already updated the quota and block
3249 * counts for this offset, so cancel those reservations.
3250 */
3251 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3252 ext4_da_update_reserve_space(inode, allocated, 0);
3253
3101map_out: 3254map_out:
3102 set_buffer_mapped(bh_result); 3255 set_buffer_mapped(bh_result);
3103out1: 3256out1:
@@ -3138,7 +3291,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3138{ 3291{
3139 struct ext4_ext_path *path = NULL; 3292 struct ext4_ext_path *path = NULL;
3140 struct ext4_extent_header *eh; 3293 struct ext4_extent_header *eh;
3141 struct ext4_extent newex, *ex; 3294 struct ext4_extent newex, *ex, *last_ex;
3142 ext4_fsblk_t newblock; 3295 ext4_fsblk_t newblock;
3143 int err = 0, depth, ret, cache_type; 3296 int err = 0, depth, ret, cache_type;
3144 unsigned int allocated = 0; 3297 unsigned int allocated = 0;
@@ -3190,7 +3343,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3190 * this situation is possible, though, _during_ tree modification; 3343 * this situation is possible, though, _during_ tree modification;
3191 * this is why assert can't be put in ext4_ext_find_extent() 3344 * this is why assert can't be put in ext4_ext_find_extent()
3192 */ 3345 */
3193 BUG_ON(path[depth].p_ext == NULL && depth != 0); 3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block);
3350 err = -EIO;
3351 goto out2;
3352 }
3194 eh = path[depth].p_hdr; 3353 eh = path[depth].p_hdr;
3195 3354
3196 ex = path[depth].p_ext; 3355 ex = path[depth].p_ext;
@@ -3205,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3205 */ 3364 */
3206 ee_len = ext4_ext_get_actual_len(ex); 3365 ee_len = ext4_ext_get_actual_len(ex);
3207 /* if found extent covers block, simply return it */ 3366 /* if found extent covers block, simply return it */
3208 if (iblock >= ee_block && iblock < ee_block + ee_len) { 3367 if (in_range(iblock, ee_block, ee_len)) {
3209 newblock = iblock - ee_block + ee_start; 3368 newblock = iblock - ee_block + ee_start;
3210 /* number of remaining blocks in the extent */ 3369 /* number of remaining blocks in the extent */
3211 allocated = ee_len - (iblock - ee_block); 3370 allocated = ee_len - (iblock - ee_block);
@@ -3297,21 +3456,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3297 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 3456 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
3298 ext4_ext_mark_uninitialized(&newex); 3457 ext4_ext_mark_uninitialized(&newex);
3299 /* 3458 /*
3300 * io_end structure was created for every async 3459 * io_end structure was created for every IO write to an
3301 * direct IO write to the middle of the file. 3460 * uninitialized extent. To avoid unnecessary conversion,
3302 * To avoid unnecessary conversion for every aio dio rewrite 3461 * here we flag the IO that really needs the conversion.
3303 * to the mid of file, here we flag the IO that really
3304 * needs the conversion.
3305 * For the non-async direct IO case, flag the inode state 3462 * For the non-async direct IO case, flag the inode state
3306 * that we need to perform the conversion when IO is done. 3463 * that we need to perform the conversion when IO is done.
3307 */ 3464 */
3308 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { 3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3309 if (io) 3466 if (io)
3310 io->flag = DIO_AIO_UNWRITTEN; 3467 io->flag = EXT4_IO_UNWRITTEN;
3311 else 3468 else
3312 EXT4_I(inode)->i_state |= 3469 ext4_set_inode_state(inode,
3313 EXT4_STATE_DIO_UNWRITTEN;; 3470 EXT4_STATE_DIO_UNWRITTEN);
3314 } 3471 }
3472 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result);
3474 }
3475
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
3477 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d",
3480 le32_to_cpu(ex->ee_block));
3481 err = -EIO;
3482 goto out2;
3483 }
3484 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
3486 + ext4_ext_get_actual_len(last_ex))
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
3315 } 3488 }
3316 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3317 if (err) { 3490 if (err) {
@@ -3319,20 +3492,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3319 /* not a good idea to call discard here directly, 3492 /* not a good idea to call discard here directly,
3320 * but otherwise we'd need to call it every free() */ 3493 * but otherwise we'd need to call it every free() */
3321 ext4_discard_preallocations(inode); 3494 ext4_discard_preallocations(inode);
3322 ext4_free_blocks(handle, inode, ext_pblock(&newex), 3495 ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
3323 ext4_ext_get_actual_len(&newex), 0); 3496 ext4_ext_get_actual_len(&newex), 0);
3324 goto out2; 3497 goto out2;
3325 } 3498 }
3326 3499
3327 /* previous routine could use block we allocated */ 3500 /* previous routine could use block we allocated */
3328 newblock = ext_pblock(&newex); 3501 newblock = ext_pblock(&newex);
3329 allocated = ext4_ext_get_actual_len(&newex); 3502 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks)
3504 allocated = max_blocks;
3330 set_buffer_new(bh_result); 3505 set_buffer_new(bh_result);
3331 3506
3332 /* Cache only when it is _not_ an uninitialized extent */ 3507 /*
3333 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) 3508 * Update reserved blocks/metadata blocks after successful
3509 * block allocation which had been deferred till now.
3510 */
3511 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3512 ext4_da_update_reserve_space(inode, allocated, 1);
3513
3514 /*
3515 * Cache the extent and update transaction to commit on fdatasync only
3516 * when it is _not_ an uninitialized extent.
3517 */
3518 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3334 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3519 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
3335 EXT4_EXT_CACHE_EXTENT); 3520 EXT4_EXT_CACHE_EXTENT);
3521 ext4_update_inode_fsync_trans(handle, inode, 1);
3522 } else
3523 ext4_update_inode_fsync_trans(handle, inode, 0);
3336out: 3524out:
3337 if (allocated > max_blocks) 3525 if (allocated > max_blocks)
3338 allocated = max_blocks; 3526 allocated = max_blocks;
@@ -3431,6 +3619,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
3431 i_size_write(inode, new_size); 3619 i_size_write(inode, new_size);
3432 if (new_size > EXT4_I(inode)->i_disksize) 3620 if (new_size > EXT4_I(inode)->i_disksize)
3433 ext4_update_i_disksize(inode, new_size); 3621 ext4_update_i_disksize(inode, new_size);
3622 } else {
3623 /*
3624 * Mark that we allocate beyond EOF so the subsequent truncate
3625 * can proceed even if the new size is the same as i_size.
3626 */
3627 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
3434 } 3629 }
3435 3630
3436} 3631}
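The new else branch covers fallocate with FALLOC_FL_KEEP_SIZE: i_size stays put, but EXT4_EOFBLOCKS_FL records that allocated blocks now exist past EOF, so a later truncate to the unchanged size still has blocks to trim. The userspace view of that case (Linux-specific; error handling elided):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0600);
        struct stat st;

        /* preallocate 1 MiB past EOF without changing i_size */
        fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
        fstat(fd, &st);
        printf("size=%lld blocks=%lld\n",
               (long long)st.st_size, (long long)st.st_blocks);

        /* size is still 0, yet blocks were allocated; truncating to the
         * same size must free them - which is what the flag enables */
        ftruncate(fd, 0);
        close(fd);
        return 0;
    }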
@@ -3535,7 +3730,7 @@ retry:
3535 * Returns 0 on success. 3730 * Returns 0 on success.
3536 */ 3731 */
3537int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 3732int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3538 loff_t len) 3733 ssize_t len)
3539{ 3734{
3540 handle_t *handle; 3735 handle_t *handle;
3541 ext4_lblk_t block; 3736 ext4_lblk_t block;
@@ -3567,7 +3762,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3567 map_bh.b_state = 0; 3762 map_bh.b_state = 0;
3568 ret = ext4_get_blocks(handle, inode, block, 3763 ret = ext4_get_blocks(handle, inode, block,
3569 max_blocks, &map_bh, 3764 max_blocks, &map_bh,
3570 EXT4_GET_BLOCKS_DIO_CONVERT_EXT); 3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3571 if (ret <= 0) { 3766 if (ret <= 0) {
3572 WARN_ON(ret <= 0); 3767 WARN_ON(ret <= 0);
3573 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3768 printk(KERN_ERR "%s: ext4_ext_get_blocks "
@@ -3671,7 +3866,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
3671 int error = 0; 3866 int error = 0;
3672 3867
3673 /* in-inode? */ 3868 /* in-inode? */
3674 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { 3869 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
3675 struct ext4_iloc iloc; 3870 struct ext4_iloc iloc;
3676 int offset; /* offset of xattr in inode */ 3871 int offset; /* offset of xattr in inode */
3677 3872
@@ -3699,7 +3894,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3699 __u64 start, __u64 len) 3894 __u64 start, __u64 len)
3700{ 3895{
3701 ext4_lblk_t start_blk; 3896 ext4_lblk_t start_blk;
3702 ext4_lblk_t len_blks;
3703 int error = 0; 3897 int error = 0;
3704 3898
3705 /* fallback to generic here if not in extents fmt */ 3899 /* fallback to generic here if not in extents fmt */
@@ -3713,17 +3907,21 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3713 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 3907 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3714 error = ext4_xattr_fiemap(inode, fieinfo); 3908 error = ext4_xattr_fiemap(inode, fieinfo);
3715 } else { 3909 } else {
3910 ext4_lblk_t len_blks;
3911 __u64 last_blk;
3912
3716 start_blk = start >> inode->i_sb->s_blocksize_bits; 3913 start_blk = start >> inode->i_sb->s_blocksize_bits;
3717 len_blks = len >> inode->i_sb->s_blocksize_bits; 3914 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
3915 if (last_blk >= EXT_MAX_BLOCK)
3916 last_blk = EXT_MAX_BLOCK-1;
3917 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
3718 3918
3719 /* 3919 /*
3720 * Walk the extent tree gathering extent information. 3920 * Walk the extent tree gathering extent information.
3721 * ext4_ext_fiemap_cb will push extents back to user. 3921 * ext4_ext_fiemap_cb will push extents back to user.
3722 */ 3922 */
3723 down_read(&EXT4_I(inode)->i_data_sem);
3724 error = ext4_ext_walk_space(inode, start_blk, len_blks, 3923 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3725 ext4_ext_fiemap_cb, fieinfo); 3924 ext4_ext_fiemap_cb, fieinfo);
3726 up_read(&EXT4_I(inode)->i_data_sem);
3727 } 3925 }
3728 3926
3729 return error; 3927 return error;
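The rewritten range math is worth spelling out: FIEMAP passes a byte range, and the old len >> blkbits both rounded away partial blocks and could wrap the 32-bit ext4_lblk_t for the ~0ULL "whole file" request. The new code computes an inclusive last block and clamps it at EXT_MAX_BLOCK. A runnable model of just that arithmetic:

#include <stdint.h>
#include <stdio.h>

#define EXT_MAX_BLOCK 0xffffffffU /* largest logical block an extent tree can map */

/* Model of the fiemap range math above: turn a byte range into an
 * inclusive block range, clamped so len_blks fits the 32-bit
 * ext4_lblk_t even for the start=0, len=~0ULL whole-file request. */
static uint32_t fiemap_len_blks(uint64_t start, uint64_t len, int blkbits)
{
	uint32_t start_blk = (uint32_t)(start >> blkbits);
	uint64_t last_blk = (start + len - 1) >> blkbits;

	if (last_blk >= EXT_MAX_BLOCK)
		last_blk = EXT_MAX_BLOCK - 1;
	return (uint32_t)last_blk - start_blk + 1;
}

int main(void)
{
	/* whole-file request on a 4KiB-block fs: clamps instead of wrapping */
	printf("%u\n", (unsigned)fiemap_len_blks(0, ~0ULL, 12));
	/* bytes 4095..4096 straddle blocks 0..1: inclusive math says 2,
	 * whereas the old len >> blkbits would have said 0 */
	printf("%u\n", (unsigned)fiemap_len_blks(4095, 2, 12));
	return 0;
}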
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 9630583cef28..d0776e410f34 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
23#include <linux/jbd2.h> 23#include <linux/jbd2.h>
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/path.h> 25#include <linux/path.h>
26#include <linux/quotaops.h>
26#include "ext4.h" 27#include "ext4.h"
27#include "ext4_jbd2.h" 28#include "ext4_jbd2.h"
28#include "xattr.h" 29#include "xattr.h"
@@ -35,9 +36,9 @@
35 */ 36 */
36static int ext4_release_file(struct inode *inode, struct file *filp) 37static int ext4_release_file(struct inode *inode, struct file *filp)
37{ 38{
38 if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { 39 if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
39 ext4_alloc_da_blocks(inode); 40 ext4_alloc_da_blocks(inode);
40 EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; 41 ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
41 } 42 }
42 /* if we are the last writer on the inode, drop the block reservation */ 43 /* if we are the last writer on the inode, drop the block reservation */
43 if ((filp->f_mode & FMODE_WRITE) && 44 if ((filp->f_mode & FMODE_WRITE) &&
@@ -116,18 +117,16 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
116 * devices or filesystem images. 117 * devices or filesystem images.
117 */ 118 */
118 memset(buf, 0, sizeof(buf)); 119 memset(buf, 0, sizeof(buf));
119 path.mnt = mnt->mnt_parent; 120 path.mnt = mnt;
120 path.dentry = mnt->mnt_mountpoint; 121 path.dentry = mnt->mnt_root;
121 path_get(&path);
122 cp = d_path(&path, buf, sizeof(buf)); 122 cp = d_path(&path, buf, sizeof(buf));
123 path_put(&path);
124 if (!IS_ERR(cp)) { 123 if (!IS_ERR(cp)) {
125 memcpy(sbi->s_es->s_last_mounted, cp, 124 memcpy(sbi->s_es->s_last_mounted, cp,
126 sizeof(sbi->s_es->s_last_mounted)); 125 sizeof(sbi->s_es->s_last_mounted));
127 sb->s_dirt = 1; 126 sb->s_dirt = 1;
128 } 127 }
129 } 128 }
130 return generic_file_open(inode, filp); 129 return dquot_file_open(inode, filp);
131} 130}
132 131
133const struct file_operations ext4_file_operations = { 132const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2b1531266ee2..0d0c3239c1cd 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -51,25 +51,30 @@
51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
52{ 52{
53 struct inode *inode = dentry->d_inode; 53 struct inode *inode = dentry->d_inode;
54 struct ext4_inode_info *ei = EXT4_I(inode);
54 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
55 int err, ret = 0; 56 int ret;
57 tid_t commit_tid;
56 58
57 J_ASSERT(ext4_journal_current_handle() == NULL); 59 J_ASSERT(ext4_journal_current_handle() == NULL);
58 60
59 trace_ext4_sync_file(file, dentry, datasync); 61 trace_ext4_sync_file(file, dentry, datasync);
60 62
61 ret = flush_aio_dio_completed_IO(inode); 63 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0;
65
66 ret = flush_completed_IO(inode);
62 if (ret < 0) 67 if (ret < 0)
63 goto out; 68 return ret;
69
70 if (!journal)
71 return simple_fsync(file, dentry, datasync);
72
64 /* 73 /*
65 * data=writeback: 74 * data=writeback,ordered:
66 * The caller's filemap_fdatawrite()/wait will sync the data. 75 * The caller's filemap_fdatawrite()/wait will sync the data.
67 * sync_inode() will sync the metadata 76 * Metadata is in the journal, we wait for proper transaction to
68 * 77 * commit here.
69 * data=ordered:
70 * The caller's filemap_fdatawrite() will write the data and
71 * sync_inode() will write the inode if it is dirty. Then the caller's
72 * filemap_fdatawait() will wait on the pages.
73 * 78 *
74 * data=journal: 79 * data=journal:
75 * filemap_fdatawrite won't do anything (the buffers are clean). 80 * filemap_fdatawrite won't do anything (the buffers are clean).
@@ -79,32 +84,25 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
79 * (they were dirtied by commit). But that's OK - the blocks are 84 * (they were dirtied by commit). But that's OK - the blocks are
80 * safe in-journal, which is all fsync() needs to ensure. 85 * safe in-journal, which is all fsync() needs to ensure.
81 */ 86 */
82 if (ext4_should_journal_data(inode)) { 87 if (ext4_should_journal_data(inode))
83 ret = ext4_force_commit(inode->i_sb); 88 return ext4_force_commit(inode->i_sb);
84 goto out;
85 }
86 89
87 if (!journal)
88 ret = sync_mapping_buffers(inode->i_mapping);
89
90 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
91 goto out;
92
93 /*
94 * The VFS has written the file data. If the inode is unaltered
95 * then we need not start a commit.
96 */
97 if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
98 struct writeback_control wbc = {
99 .sync_mode = WB_SYNC_ALL,
100 .nr_to_write = 0, /* sys_fsync did this */
101 };
102 err = sync_inode(inode, &wbc);
103 if (ret == 0)
104 ret = err;
105 }
106out:
107 if (journal && (journal->j_flags & JBD2_BARRIER))
 90 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
 91 if (jbd2_log_start_commit(journal, commit_tid)) {
 92 /*
 93 * When the journal is on a different device than the
 94 * fs data disk, we need to issue the barrier in
 95 * writeback mode. (In ordered mode, the jbd2 layer
 96 * will take care of issuing the barrier. In
 97 * data=journal, all of the data blocks are written to
 98 * the journal device.)
 99 */
 100 if (ext4_should_writeback_data(inode) &&
 101 (journal->j_fs_dev != journal->j_dev) &&
 102 (journal->j_flags & JBD2_BARRIER))
 103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 104 jbd2_log_wait_commit(journal, commit_tid);
 105 } else if (journal->j_flags & JBD2_BARRIER)
108 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
109 return ret; 107 return ret;
110} 108}
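The rewritten ext4_sync_file() replaces "write the inode and hope" with transaction tracking: every change stamps the inode with the tid of its transaction (i_sync_tid, plus i_datasync_tid when the change matters for data integrity), so fsync reduces to starting and waiting on one jbd2 commit. A userspace model -- the journal helpers here are toys, not the jbd2 API:

#include <stdbool.h>

/* Toy journal: committed_tid advances as the commit thread finishes
 * transactions. */
struct toy_journal { long committed_tid; };
struct toy_inode { long i_sync_tid, i_datasync_tid; };

static bool toy_log_start_commit(struct toy_journal *j, long tid)
{
	return tid > j->committed_tid; /* true: tid not yet durable, commit kicked */
}

static void toy_log_wait_commit(struct toy_journal *j, long tid)
{
	while (j->committed_tid < tid)
		j->committed_tid++; /* stands in for blocking on the commit thread */
}

/* Model of the new ext4_sync_file() core: pick the tid stamped on the
 * inode and start/wait on exactly that commit, instead of writing the
 * inode through the VFS. */
static void toy_fsync(struct toy_journal *j, struct toy_inode *inode, bool datasync)
{
	long commit_tid = datasync ? inode->i_datasync_tid : inode->i_sync_tid;

	if (toy_log_start_commit(j, commit_tid))
		toy_log_wait_commit(j, commit_tid);
	/* else: that transaction already committed; at most a barrier is needed */
}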
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f3624ead4f6c..57f6eef6ccd6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
76 /* If checksum is bad mark all blocks and inodes use to prevent 76 /* If checksum is bad mark all blocks and inodes use to prevent
77 * allocation, essentially implementing a per-group read-only flag. */ 77 * allocation, essentially implementing a per-group read-only flag. */
78 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 78 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
79 ext4_error(sb, __func__, "Checksum bad for group %u", 79 ext4_error(sb, "Checksum bad for group %u", block_group);
80 block_group);
81 ext4_free_blks_set(sb, gdp, 0); 80 ext4_free_blks_set(sb, gdp, 0);
82 ext4_free_inodes_set(sb, gdp, 0); 81 ext4_free_inodes_set(sb, gdp, 0);
83 ext4_itable_unused_set(sb, gdp, 0); 82 ext4_itable_unused_set(sb, gdp, 0);
@@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
111 bitmap_blk = ext4_inode_bitmap(sb, desc); 110 bitmap_blk = ext4_inode_bitmap(sb, desc);
112 bh = sb_getblk(sb, bitmap_blk); 111 bh = sb_getblk(sb, bitmap_blk);
113 if (unlikely(!bh)) { 112 if (unlikely(!bh)) {
114 ext4_error(sb, __func__, 113 ext4_error(sb, "Cannot read inode bitmap - "
115 "Cannot read inode bitmap - "
116 "block_group = %u, inode_bitmap = %llu", 114 "block_group = %u, inode_bitmap = %llu",
117 block_group, bitmap_blk); 115 block_group, bitmap_blk);
118 return NULL; 116 return NULL;
@@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
153 set_bitmap_uptodate(bh); 151 set_bitmap_uptodate(bh);
154 if (bh_submit_read(bh) < 0) { 152 if (bh_submit_read(bh) < 0) {
155 put_bh(bh); 153 put_bh(bh);
156 ext4_error(sb, __func__, 154 ext4_error(sb, "Cannot read inode bitmap - "
157 "Cannot read inode bitmap - "
158 "block_group = %u, inode_bitmap = %llu", 155 "block_group = %u, inode_bitmap = %llu",
159 block_group, bitmap_blk); 156 block_group, bitmap_blk);
160 return NULL; 157 return NULL;
@@ -217,10 +214,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
217 * Note: we must free any quota before locking the superblock, 214 * Note: we must free any quota before locking the superblock,
218 * as writing the quota to disk may need the lock as well. 215 * as writing the quota to disk may need the lock as well.
219 */ 216 */
220 vfs_dq_init(inode); 217 dquot_initialize(inode);
221 ext4_xattr_delete_inode(handle, inode); 218 ext4_xattr_delete_inode(handle, inode);
222 vfs_dq_free_inode(inode); 219 dquot_free_inode(inode);
223 vfs_dq_drop(inode); 220 dquot_drop(inode);
224 221
225 is_directory = S_ISDIR(inode->i_mode); 222 is_directory = S_ISDIR(inode->i_mode);
226 223
@@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
229 226
230 es = EXT4_SB(sb)->s_es; 227 es = EXT4_SB(sb)->s_es;
231 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 228 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
232 ext4_error(sb, "ext4_free_inode", 229 ext4_error(sb, "reserved or nonexistent inode %lu", ino);
233 "reserved or nonexistent inode %lu", ino);
234 goto error_return; 230 goto error_return;
235 } 231 }
236 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 232 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
248 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
249 bit, bitmap_bh->b_data); 245 bit, bitmap_bh->b_data);
250 if (!cleared) 246 if (!cleared)
251 ext4_error(sb, "ext4_free_inode", 247 ext4_error(sb, "bit already cleared for inode %lu", ino);
252 "bit already cleared for inode %lu", ino);
253 else { 248 else {
254 gdp = ext4_get_group_desc(sb, block_group, &bh2); 249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
255 250
@@ -268,7 +263,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
268 ext4_group_t f; 263 ext4_group_t f;
269 264
270 f = ext4_flex_group(sbi, block_group); 265 f = ext4_flex_group(sbi, block_group);
271 atomic_dec(&sbi->s_flex_groups[f].free_inodes); 266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
272 } 267 }
273 268
274 } 269 }
@@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb,
736 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 731 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
737 ino > EXT4_INODES_PER_GROUP(sb)) { 732 ino > EXT4_INODES_PER_GROUP(sb)) {
738 ext4_unlock_group(sb, group); 733 ext4_unlock_group(sb, group);
739 ext4_error(sb, __func__, 734 ext4_error(sb, "reserved inode or inode > inodes count - "
740 "reserved inode or inode > inodes count - "
741 "block_group = %u, inode=%lu", group, 735 "block_group = %u, inode=%lu", group,
742 ino + group * EXT4_INODES_PER_GROUP(sb)); 736 ino + group * EXT4_INODES_PER_GROUP(sb));
743 return 1; 737 return 1;
@@ -779,7 +773,7 @@ static int ext4_claim_inode(struct super_block *sb,
779 if (sbi->s_log_groups_per_flex) { 773 if (sbi->s_log_groups_per_flex) {
780 ext4_group_t f = ext4_flex_group(sbi, group); 774 ext4_group_t f = ext4_flex_group(sbi, group);
781 775
782 atomic_inc(&sbi->s_flex_groups[f].free_inodes); 776 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
783 } 777 }
784 } 778 }
785 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 779 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
@@ -904,7 +898,7 @@ repeat_in_this_group:
904 BUFFER_TRACE(inode_bitmap_bh, 898 BUFFER_TRACE(inode_bitmap_bh,
905 "call ext4_handle_dirty_metadata"); 899 "call ext4_handle_dirty_metadata");
906 err = ext4_handle_dirty_metadata(handle, 900 err = ext4_handle_dirty_metadata(handle,
907 inode, 901 NULL,
908 inode_bitmap_bh); 902 inode_bitmap_bh);
909 if (err) 903 if (err)
910 goto fail; 904 goto fail;
@@ -1029,15 +1023,16 @@ got:
1029 inode->i_generation = sbi->s_next_generation++; 1023 inode->i_generation = sbi->s_next_generation++;
1030 spin_unlock(&sbi->s_next_gen_lock); 1024 spin_unlock(&sbi->s_next_gen_lock);
1031 1025
1032 ei->i_state = EXT4_STATE_NEW; 1026 ei->i_state_flags = 0;
1027 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1033 1028
1034 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1029 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
1035 1030
1036 ret = inode; 1031 ret = inode;
1037 if (vfs_dq_alloc_inode(inode)) { 1032 dquot_initialize(inode);
1038 err = -EDQUOT; 1033 err = dquot_alloc_inode(inode);
1034 if (err)
1039 goto fail_drop; 1035 goto fail_drop;
1040 }
1041 1036
1042 err = ext4_init_acl(handle, inode, dir); 1037 err = ext4_init_acl(handle, inode, dir);
1043 if (err) 1038 if (err)
@@ -1074,10 +1069,10 @@ really_out:
1074 return ret; 1069 return ret;
1075 1070
1076fail_free_drop: 1071fail_free_drop:
1077 vfs_dq_free_inode(inode); 1072 dquot_free_inode(inode);
1078 1073
1079fail_drop: 1074fail_drop:
1080 vfs_dq_drop(inode); 1075 dquot_drop(inode);
1081 inode->i_flags |= S_NOQUOTA; 1076 inode->i_flags |= S_NOQUOTA;
1082 inode->i_nlink = 0; 1077 inode->i_nlink = 0;
1083 unlock_new_inode(inode); 1078 unlock_new_inode(inode);
@@ -1098,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1098 1093
1099 /* Error cases - e2fsck has already cleaned up for us */ 1094 /* Error cases - e2fsck has already cleaned up for us */
1100 if (ino > max_ino) { 1095 if (ino > max_ino) {
1101 ext4_warning(sb, __func__, 1096 ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
1102 "bad orphan ino %lu! e2fsck was run?", ino);
1103 goto error; 1097 goto error;
1104 } 1098 }
1105 1099
@@ -1107,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1107 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); 1101 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
1108 bitmap_bh = ext4_read_inode_bitmap(sb, block_group); 1102 bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
1109 if (!bitmap_bh) { 1103 if (!bitmap_bh) {
1110 ext4_warning(sb, __func__, 1104 ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
1111 "inode bitmap error for orphan %lu", ino);
1112 goto error; 1105 goto error;
1113 } 1106 }
1114 1107
@@ -1140,8 +1133,7 @@ iget_failed:
1140 err = PTR_ERR(inode); 1133 err = PTR_ERR(inode);
1141 inode = NULL; 1134 inode = NULL;
1142bad_orphan: 1135bad_orphan:
1143 ext4_warning(sb, __func__, 1136 ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
1144 "bad orphan inode %lu! e2fsck was run?", ino);
1145 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", 1137 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
1146 bit, (unsigned long long)bitmap_bh->b_blocknr, 1138 bit, (unsigned long long)bitmap_bh->b_blocknr,
1147 ext4_test_bit(bit, bitmap_bh->b_data)); 1139 ext4_test_bit(bit, bitmap_bh->b_data));
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c8caa51addb..5381802d6052 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,8 @@
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h>
42#include <linux/slab.h>
41 43
42#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
43#include "xattr.h" 45#include "xattr.h"
@@ -71,58 +73,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
71} 73}
72 74
73/* 75/*
74 * The ext4 forget function must perform a revoke if we are freeing data
75 * which has been journaled. Metadata (eg. indirect blocks) must be
76 * revoked in all cases.
77 *
78 * "bh" may be NULL: a metadata block may have been freed from memory
79 * but there may still be a record of it in the journal, and that record
80 * still needs to be revoked.
81 *
82 * If the handle isn't valid we're not journaling, but we still need to
83 * call into ext4_journal_revoke() to put the buffer head.
84 */
85int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
86 struct buffer_head *bh, ext4_fsblk_t blocknr)
87{
88 int err;
89
90 might_sleep();
91
92 BUFFER_TRACE(bh, "enter");
93
94 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
95 "data mode %x\n",
96 bh, is_metadata, inode->i_mode,
97 test_opt(inode->i_sb, DATA_FLAGS));
98
99 /* Never use the revoke function if we are doing full data
100 * journaling: there is no need to, and a V1 superblock won't
101 * support it. Otherwise, only skip the revoke on un-journaled
102 * data blocks. */
103
104 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
105 (!is_metadata && !ext4_should_journal_data(inode))) {
106 if (bh) {
107 BUFFER_TRACE(bh, "call jbd2_journal_forget");
108 return ext4_journal_forget(handle, bh);
109 }
110 return 0;
111 }
112
113 /*
114 * data!=journal && (is_metadata || should_journal_data(inode))
115 */
116 BUFFER_TRACE(bh, "call ext4_journal_revoke");
117 err = ext4_journal_revoke(handle, blocknr, bh);
118 if (err)
119 ext4_abort(inode->i_sb, __func__,
120 "error %d when attempting revoke", err);
121 BUFFER_TRACE(bh, "exit");
122 return err;
123}
124
125/*
126 * Work out how many blocks we need to proceed with the next chunk of a 76 * Work out how many blocks we need to proceed with the next chunk of a
127 * truncate transaction. 77 * truncate transaction.
128 */ 78 */
@@ -222,6 +172,9 @@ void ext4_delete_inode(struct inode *inode)
222 handle_t *handle; 172 handle_t *handle;
223 int err; 173 int err;
224 174
175 if (!is_bad_inode(inode))
176 dquot_initialize(inode);
177
225 if (ext4_should_order_data(inode)) 178 if (ext4_should_order_data(inode))
226 ext4_begin_ordered_truncate(inode, 0); 179 ext4_begin_ordered_truncate(inode, 0);
227 truncate_inode_pages(&inode->i_data, 0); 180 truncate_inode_pages(&inode->i_data, 0);
@@ -246,7 +199,7 @@ void ext4_delete_inode(struct inode *inode)
246 inode->i_size = 0; 199 inode->i_size = 0;
247 err = ext4_mark_inode_dirty(handle, inode); 200 err = ext4_mark_inode_dirty(handle, inode);
248 if (err) { 201 if (err) {
249 ext4_warning(inode->i_sb, __func__, 202 ext4_warning(inode->i_sb,
250 "couldn't mark inode dirty (err %d)", err); 203 "couldn't mark inode dirty (err %d)", err);
251 goto stop_handle; 204 goto stop_handle;
252 } 205 }
@@ -264,7 +217,7 @@ void ext4_delete_inode(struct inode *inode)
264 if (err > 0) 217 if (err > 0)
265 err = ext4_journal_restart(handle, 3); 218 err = ext4_journal_restart(handle, 3);
266 if (err != 0) { 219 if (err != 0) {
267 ext4_warning(inode->i_sb, __func__, 220 ext4_warning(inode->i_sb,
268 "couldn't extend journal (err %d)", err); 221 "couldn't extend journal (err %d)", err);
269 stop_handle: 222 stop_handle:
270 ext4_journal_stop(handle); 223 ext4_journal_stop(handle);
@@ -375,8 +328,7 @@ static int ext4_block_to_path(struct inode *inode,
375 offsets[n++] = i_block & (ptrs - 1); 328 offsets[n++] = i_block & (ptrs - 1);
376 final = ptrs; 329 final = ptrs;
377 } else { 330 } else {
378 ext4_warning(inode->i_sb, "ext4_block_to_path", 331 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
379 "block %lu > max in inode %lu",
380 i_block + direct_blocks + 332 i_block + direct_blocks +
381 indirect_blocks + double_blocks, inode->i_ino); 333 indirect_blocks + double_blocks, inode->i_ino);
382 } 334 }
@@ -396,7 +348,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
396 if (blk && 348 if (blk &&
397 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
398 blk, 1))) { 350 blk, 1))) {
399 ext4_error(inode->i_sb, function, 351 __ext4_error(inode->i_sb, function,
400 "invalid block reference %u " 352 "invalid block reference %u "
401 "in inode #%lu", blk, inode->i_ino); 353 "in inode #%lu", blk, inode->i_ino);
402 return -EIO; 354 return -EIO;
@@ -659,7 +611,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
659 if (*err) 611 if (*err)
660 goto failed_out; 612 goto failed_out;
661 613
662 BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); 614 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
615 EXT4_ERROR_INODE(inode,
616 "current_block %llu + count %lu > %d!",
617 current_block, count,
618 EXT4_MAX_BLOCK_FILE_PHYS);
619 *err = -EIO;
620 goto failed_out;
621 }
663 622
664 target -= count; 623 target -= count;
665 /* allocate blocks for indirect blocks */ 624 /* allocate blocks for indirect blocks */
@@ -695,7 +654,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
695 ar.flags = EXT4_MB_HINT_DATA; 654 ar.flags = EXT4_MB_HINT_DATA;
696 655
697 current_block = ext4_mb_new_blocks(handle, &ar, err); 656 current_block = ext4_mb_new_blocks(handle, &ar, err);
698 BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); 657 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
658 EXT4_ERROR_INODE(inode,
659 "current_block %llu + ar.len %d > %d!",
660 current_block, ar.len,
661 EXT4_MAX_BLOCK_FILE_PHYS);
662 *err = -EIO;
663 goto failed_out;
664 }
699 665
700 if (*err && (target == blks)) { 666 if (*err && (target == blks)) {
701 /* 667 /*
@@ -721,7 +687,7 @@ allocated:
721 return ret; 687 return ret;
722failed_out: 688failed_out:
723 for (i = 0; i < index; i++) 689 for (i = 0; i < index; i++)
724 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 690 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
725 return ret; 691 return ret;
726} 692}
727 693
@@ -817,14 +783,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
817 return err; 783 return err;
818failed: 784failed:
819 /* Allocation failed, free what we already allocated */ 785 /* Allocation failed, free what we already allocated */
786 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
820 for (i = 1; i <= n ; i++) { 787 for (i = 1; i <= n ; i++) {
821 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 788 /*
822 ext4_journal_forget(handle, branch[i].bh); 789 * branch[i].bh is newly allocated, so there is no
790 * need to revoke the block, which is why we don't
791 * need to set EXT4_FREE_BLOCKS_METADATA.
792 */
793 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
794 EXT4_FREE_BLOCKS_FORGET);
823 } 795 }
824 for (i = 0; i < indirect_blks; i++) 796 for (i = n+1; i < indirect_blks; i++)
825 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 797 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
826 798
827 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 799 ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
828 800
829 return err; 801 return err;
830} 802}
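The unwind here deliberately avoids journal revokes: all of these blocks were allocated inside the current transaction, so they cannot appear in any earlier checkpoint, and EXT4_FREE_BLOCKS_FORGET is enough for the buffer-backed branch blocks. A sketch of the shape of the cleanup, with toy_free() standing in for ext4_free_blocks():

/* FORGET means "drop the attached buffer, no revoke record" -- safe
 * because these blocks are new in the current transaction. */
enum { TOY_FREE_BLOCKS_FORGET = 1 };

static void toy_free(unsigned long blk, unsigned long count, int flags)
{
	(void)blk; (void)count; (void)flags; /* return blocks to the allocator */
}

/* Model of the failure unwind in ext4_alloc_branch(): entries 1..n got
 * buffer heads wired in, n+1..indirect_blks-1 were allocated but never
 * attached, and new_blocks[indirect_blks] starts the data-block run. */
static void unwind_branch(unsigned long *new_blocks, int n, int indirect_blks,
			  unsigned long num)
{
	int i;

	toy_free(new_blocks[0], 1, 0);
	for (i = 1; i <= n; i++)
		toy_free(new_blocks[i], 1, TOY_FREE_BLOCKS_FORGET);
	for (i = n + 1; i < indirect_blks; i++)
		toy_free(new_blocks[i], 1, 0);
	toy_free(new_blocks[i], num, 0); /* i == indirect_blks here */
}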
@@ -903,12 +875,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
903 875
904err_out: 876err_out:
905 for (i = 1; i <= num; i++) { 877 for (i = 1; i <= num; i++) {
906 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 878 /*
907 ext4_journal_forget(handle, where[i].bh); 879 * branch[i].bh is newly allocated, so there is no
908 ext4_free_blocks(handle, inode, 880 * need to revoke the block, which is why we don't
909 le32_to_cpu(where[i-1].key), 1, 0); 881 * need to set EXT4_FREE_BLOCKS_METADATA.
882 */
883 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
884 EXT4_FREE_BLOCKS_FORGET);
910 } 885 }
911 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); 886 ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
887 blks, 0);
912 888
913 return err; 889 return err;
914} 890}
@@ -1021,10 +997,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
1021 if (!err) 997 if (!err)
1022 err = ext4_splice_branch(handle, inode, iblock, 998 err = ext4_splice_branch(handle, inode, iblock,
1023 partial, indirect_blks, count); 999 partial, indirect_blks, count);
1024 else 1000 if (err)
1025 goto cleanup; 1001 goto cleanup;
1026 1002
1027 set_buffer_new(bh_result); 1003 set_buffer_new(bh_result);
1004
1005 ext4_update_inode_fsync_trans(handle, inode, 1);
1028got_it: 1006got_it:
1029 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1007 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
1030 if (count > blocks_to_boundary) 1008 if (count > blocks_to_boundary)
@@ -1043,92 +1021,121 @@ out:
1043 return err; 1021 return err;
1044} 1022}
1045 1023
1046qsize_t ext4_get_reserved_space(struct inode *inode)
1047{
1048 unsigned long long total;
1049
1050 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1051 total = EXT4_I(inode)->i_reserved_data_blocks +
1052 EXT4_I(inode)->i_reserved_meta_blocks;
1053 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1054
1055 return total;
1056}
1024#ifdef CONFIG_QUOTA
1025qsize_t *ext4_get_reserved_space(struct inode *inode)
1026{
1027 return &EXT4_I(inode)->i_reserved_quota;
1028}
1029#endif
1030
1057/* 1031/*
1058 * Calculate the number of metadata blocks need to reserve 1032 * Calculate the number of metadata blocks need to reserve
1059 * to allocate @blocks for non extent file based file 1033 * to allocate a new block at @lblocks for non extent file based file
1060 */ 1034 */
1061static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) 1035static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1036 sector_t lblock)
1062{ 1037{
1063 int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1038 struct ext4_inode_info *ei = EXT4_I(inode);
1064 int ind_blks, dind_blks, tind_blks; 1039 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1065 1040 int blk_bits;
1066 /* number of new indirect blocks needed */
1067 ind_blks = (blocks + icap - 1) / icap;
1068 1041
1069 dind_blks = (ind_blks + icap - 1) / icap; 1042 if (lblock < EXT4_NDIR_BLOCKS)
1043 return 0;
1070 1044
1071 tind_blks = 1; 1045 lblock -= EXT4_NDIR_BLOCKS;
1072 1046
1073 return ind_blks + dind_blks + tind_blks; 1047 if (ei->i_da_metadata_calc_len &&
1048 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1049 ei->i_da_metadata_calc_len++;
1050 return 0;
1051 }
1052 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1053 ei->i_da_metadata_calc_len = 1;
1054 blk_bits = order_base_2(lblock);
1055 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1074} 1056}
1075 1057
1076/* 1058/*
1077 * Calculate the number of metadata blocks need to reserve 1059 * Calculate the number of metadata blocks need to reserve
1078 * to allocate given number of blocks 1060 * to allocate a block located at @lblock
1079 */ 1061 */
1080static int ext4_calc_metadata_amount(struct inode *inode, int blocks) 1062static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1081{ 1063{
1082 if (!blocks)
1083 return 0;
1084
1085 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1064 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1086 return ext4_ext_calc_metadata_amount(inode, blocks); 1065 return ext4_ext_calc_metadata_amount(inode, lblock);
1087 1066
1088 return ext4_indirect_calc_metadata_amount(inode, blocks); 1067 return ext4_indirect_calc_metadata_amount(inode, lblock);
1089} 1068}
1090 1069
1091static void ext4_da_update_reserve_space(struct inode *inode, int used)
1070/*
1071 * Called with i_data_sem down, which is important since we can call
1072 * ext4_discard_preallocations() from here.
1073 */
1074void ext4_da_update_reserve_space(struct inode *inode,
1075 int used, int quota_claim)
1092{ 1076{
1093 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1094 int total, mdb, mdb_free; 1078 struct ext4_inode_info *ei = EXT4_I(inode);
1095 1079 int mdb_free = 0, allocated_meta_blocks = 0;
1096 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1080
1097 /* recalculate the number of metablocks still need to be reserved */ 1081 spin_lock(&ei->i_block_reservation_lock);
1098 total = EXT4_I(inode)->i_reserved_data_blocks - used; 1082 trace_ext4_da_update_reserve_space(inode, used);
1099 mdb = ext4_calc_metadata_amount(inode, total); 1083 if (unlikely(used > ei->i_reserved_data_blocks)) {
1100 1084 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1101 /* figure out how many metablocks to release */ 1085 "with only %d reserved data blocks\n",
1102 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1086 __func__, inode->i_ino, used,
1103 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1087 ei->i_reserved_data_blocks);
1104 1088 WARN_ON(1);
1105 if (mdb_free) { 1089 used = ei->i_reserved_data_blocks;
1106 /* Account for allocated meta_blocks */ 1090 }
1107 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; 1091
1108 1092 /* Update per-inode reservations */
1109 /* update fs dirty blocks counter */ 1093 ei->i_reserved_data_blocks -= used;
1094 used += ei->i_allocated_meta_blocks;
1095 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1096 allocated_meta_blocks = ei->i_allocated_meta_blocks;
1097 ei->i_allocated_meta_blocks = 0;
1098 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1099
1100 if (ei->i_reserved_data_blocks == 0) {
1101 /*
1102 * We can release all of the reserved metadata blocks
1103 * only when we have written all of the delayed
1104 * allocation blocks.
1105 */
1106 mdb_free = ei->i_reserved_meta_blocks;
1107 ei->i_reserved_meta_blocks = 0;
1108 ei->i_da_metadata_calc_len = 0;
1110 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); 1109 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1111 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1112 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1113 } 1110 }
1114
1115 /* update per-inode reservations */
1116 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
1117 EXT4_I(inode)->i_reserved_data_blocks -= used;
1118 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1119 1112
1120 /* 1113 /* Update quota subsystem */
1121 * free those over-booking quota for metadata blocks 1114 if (quota_claim) {
1122 */ 1115 dquot_claim_block(inode, used);
1123 if (mdb_free) 1116 if (mdb_free)
1124 vfs_dq_release_reservation_block(inode, mdb_free); 1117 dquot_release_reservation_block(inode, mdb_free);
1118 } else {
1119 /*
1120 * We did fallocate with an offset that is already delayed
1121 * allocated. So on delayed allocated writeback we should
1122 * not update the quota for allocated blocks. But then
1123 * converting an fallocate region to initialized region would
1124 * have caused a metadata allocation. So claim quota for
1125 * that
1126 */
1127 if (allocated_meta_blocks)
1128 dquot_claim_block(inode, allocated_meta_blocks);
1129 dquot_release_reservation_block(inode, mdb_free + used);
1130 }
1125 1131
1126 /* 1132 /*
1127 * If we have done all the pending block allocations and if 1133 * If we have done all the pending block allocations and if
1128 * there aren't any writers on the inode, we can discard the 1134 * there aren't any writers on the inode, we can discard the
1129 * inode's preallocations. 1135 * inode's preallocations.
1130 */ 1136 */
1131 if (!total && (atomic_read(&inode->i_writecount) == 0)) 1137 if ((ei->i_reserved_data_blocks == 0) &&
1138 (atomic_read(&inode->i_writecount) == 0))
1132 ext4_discard_preallocations(inode); 1139 ext4_discard_preallocations(inode);
1133} 1140}
1134 1141
@@ -1136,7 +1143,7 @@ static int check_block_validity(struct inode *inode, const char *msg,
1136 sector_t logical, sector_t phys, int len) 1143 sector_t logical, sector_t phys, int len)
1137{ 1144{
1138 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1145 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1139 ext4_error(inode->i_sb, msg, 1146 __ext4_error(inode->i_sb, msg,
1140 "inode #%lu logical block %llu mapped to %llu " 1147 "inode #%lu logical block %llu mapped to %llu "
1141 "(size %d)", inode->i_ino, 1148 "(size %d)", inode->i_ino,
1142 (unsigned long long) logical, 1149 (unsigned long long) logical,
@@ -1318,20 +1325,22 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1318 * i_data's format changing. Force the migrate 1325 * i_data's format changing. Force the migrate
1319 * to fail by clearing migrate flags 1326 * to fail by clearing migrate flags
1320 */ 1327 */
1321 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 1328 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1322 } 1329 }
1323 }
1324 1330
1331 /*
1332 * Update reserved blocks/metadata blocks after successful
1333 * block allocation which had been deferred till now. We don't
1334 * support fallocate for non extent files. So we can update
1335 * reserve space here.
1336 */
1337 if ((retval > 0) &&
1338 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
1339 ext4_da_update_reserve_space(inode, retval, 1);
1340 }
1325 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1341 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1326 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1342 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1327 1343
1328 /*
1329 * Update reserved blocks/metadata blocks after successful
1330 * block allocation which had been deferred till now.
1331 */
1332 if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
1333 ext4_da_update_reserve_space(inode, retval);
1334
1335 up_write((&EXT4_I(inode)->i_data_sem)); 1344 up_write((&EXT4_I(inode)->i_data_sem));
1336 if (retval > 0 && buffer_mapped(bh)) { 1345 if (retval > 0 && buffer_mapped(bh)) {
1337 int ret = check_block_validity(inode, "file system " 1346 int ret = check_block_validity(inode, "file system "
@@ -1534,6 +1543,18 @@ static int do_journal_get_write_access(handle_t *handle,
1534 return ext4_journal_get_write_access(handle, bh); 1543 return ext4_journal_get_write_access(handle, bh);
1535} 1544}
1536 1545
1546/*
1547 * Truncate blocks that were not used by write. We have to truncate the
1548 * pagecache as well so that corresponding buffers get properly unmapped.
1549 */
1550static void ext4_truncate_failed_write(struct inode *inode)
1551{
1552 truncate_inode_pages(inode->i_mapping, inode->i_size);
1553 ext4_truncate(inode);
1554}
1555
1556static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1557 struct buffer_head *bh_result, int create);
1537static int ext4_write_begin(struct file *file, struct address_space *mapping, 1558static int ext4_write_begin(struct file *file, struct address_space *mapping,
1538 loff_t pos, unsigned len, unsigned flags, 1559 loff_t pos, unsigned len, unsigned flags,
1539 struct page **pagep, void **fsdata) 1560 struct page **pagep, void **fsdata)
@@ -1575,8 +1596,12 @@ retry:
1575 } 1596 }
1576 *pagep = page; 1597 *pagep = page;
1577 1598
1578 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1599 if (ext4_should_dioread_nolock(inode))
1579 ext4_get_block); 1600 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1601 fsdata, ext4_get_block_write);
1602 else
1603 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1604 fsdata, ext4_get_block);
1580 1605
1581 if (!ret && ext4_should_journal_data(inode)) { 1606 if (!ret && ext4_should_journal_data(inode)) {
1582 ret = walk_page_buffers(handle, page_buffers(page), 1607 ret = walk_page_buffers(handle, page_buffers(page),
@@ -1599,7 +1624,7 @@ retry:
1599 1624
1600 ext4_journal_stop(handle); 1625 ext4_journal_stop(handle);
1601 if (pos + len > inode->i_size) { 1626 if (pos + len > inode->i_size) {
1602 ext4_truncate(inode); 1627 ext4_truncate_failed_write(inode);
1603 /* 1628 /*
1604 * If truncate failed early the inode might 1629 * If truncate failed early the inode might
1605 * still be on the orphan list; we need to 1630 * still be on the orphan list; we need to
@@ -1709,7 +1734,7 @@ static int ext4_ordered_write_end(struct file *file,
1709 ret = ret2; 1734 ret = ret2;
1710 1735
1711 if (pos + len > inode->i_size) { 1736 if (pos + len > inode->i_size) {
1712 ext4_truncate(inode); 1737 ext4_truncate_failed_write(inode);
1713 /* 1738 /*
1714 * If truncate failed early the inode might still be 1739 * If truncate failed early the inode might still be
1715 * on the orphan list; we need to make sure the inode 1740 * on the orphan list; we need to make sure the inode
@@ -1751,7 +1776,7 @@ static int ext4_writeback_write_end(struct file *file,
1751 ret = ret2; 1776 ret = ret2;
1752 1777
1753 if (pos + len > inode->i_size) { 1778 if (pos + len > inode->i_size) {
1754 ext4_truncate(inode); 1779 ext4_truncate_failed_write(inode);
1755 /* 1780 /*
1756 * If truncate failed early the inode might still be 1781 * If truncate failed early the inode might still be
1757 * on the orphan list; we need to make sure the inode 1782 * on the orphan list; we need to make sure the inode
@@ -1793,7 +1818,7 @@ static int ext4_journalled_write_end(struct file *file,
1793 new_i_size = pos + copied; 1818 new_i_size = pos + copied;
1794 if (new_i_size > inode->i_size) 1819 if (new_i_size > inode->i_size)
1795 i_size_write(inode, pos+copied); 1820 i_size_write(inode, pos+copied);
1796 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1821 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1797 if (new_i_size > EXT4_I(inode)->i_disksize) { 1822 if (new_i_size > EXT4_I(inode)->i_disksize) {
1798 ext4_update_i_disksize(inode, new_i_size); 1823 ext4_update_i_disksize(inode, new_i_size);
1799 ret2 = ext4_mark_inode_dirty(handle, inode); 1824 ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1814,7 +1839,7 @@ static int ext4_journalled_write_end(struct file *file,
1814 if (!ret) 1839 if (!ret)
1815 ret = ret2; 1840 ret = ret2;
1816 if (pos + len > inode->i_size) { 1841 if (pos + len > inode->i_size) {
1817 ext4_truncate(inode); 1842 ext4_truncate_failed_write(inode);
1818 /* 1843 /*
1819 * If truncate failed early the inode might still be 1844 * If truncate failed early the inode might still be
1820 * on the orphan list; we need to make sure the inode 1845 * on the orphan list; we need to make sure the inode
@@ -1827,11 +1852,16 @@ static int ext4_journalled_write_end(struct file *file,
1827 return ret ? ret : copied; 1852 return ret ? ret : copied;
1828} 1853}
1829 1854
1830static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1855/*
1856 * Reserve a single block located at lblock
1857 */
1858static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1831{ 1859{
1832 int retries = 0; 1860 int retries = 0;
1833 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1861 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1834 unsigned long md_needed, mdblocks, total = 0; 1862 struct ext4_inode_info *ei = EXT4_I(inode);
1863 unsigned long md_needed, md_reserved;
1864 int ret;
1835 1865
1836 /* 1866 /*
1837 * recalculate the amount of metadata blocks to reserve 1867 * recalculate the amount of metadata blocks to reserve
@@ -1839,86 +1869,80 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1839 * worse case is one extent per block 1869 * worse case is one extent per block
1840 */ 1870 */
1841repeat: 1871repeat:
1842 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1872 spin_lock(&ei->i_block_reservation_lock);
1843 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1873 md_reserved = ei->i_reserved_meta_blocks;
1844 mdblocks = ext4_calc_metadata_amount(inode, total); 1874 md_needed = ext4_calc_metadata_amount(inode, lblock);
1845 BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); 1875 trace_ext4_da_reserve_space(inode, md_needed);
1846 1876 spin_unlock(&ei->i_block_reservation_lock);
1847 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1848 total = md_needed + nrblocks;
1849 1877
1850 /* 1878 /*
1851 * Make quota reservation here to prevent quota overflow 1879 * Make quota reservation here to prevent quota overflow
1852 * later. Real quota accounting is done at pages writeout 1880 * later. Real quota accounting is done at pages writeout
1853 * time. 1881 * time.
1854 */ 1882 */
1855 if (vfs_dq_reserve_block(inode, total)) { 1883 ret = dquot_reserve_block(inode, md_needed + 1);
1856 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1884 if (ret)
1857 return -EDQUOT; 1885 return ret;
1858 }
1859 1886
1860 if (ext4_claim_free_blocks(sbi, total)) { 1887 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1861 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1888 dquot_release_reservation_block(inode, md_needed + 1);
1862 vfs_dq_release_reservation_block(inode, total);
1863 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1889 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1864 yield(); 1890 yield();
1865 goto repeat; 1891 goto repeat;
1866 } 1892 }
1867 return -ENOSPC; 1893 return -ENOSPC;
1868 } 1894 }
1869 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1895 spin_lock(&ei->i_block_reservation_lock);
1870 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; 1896 ei->i_reserved_data_blocks++;
1897 ei->i_reserved_meta_blocks += md_needed;
1898 spin_unlock(&ei->i_block_reservation_lock);
1871 1899
1872 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1873 return 0; /* success */ 1900 return 0; /* success */
1874} 1901}
1875 1902
1876static void ext4_da_release_space(struct inode *inode, int to_free) 1903static void ext4_da_release_space(struct inode *inode, int to_free)
1877{ 1904{
1878 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1879 int total, mdb, mdb_free, release; 1906 struct ext4_inode_info *ei = EXT4_I(inode);
1880 1907
1881 if (!to_free) 1908 if (!to_free)
1882 return; /* Nothing to release, exit */ 1909 return; /* Nothing to release, exit */
1883 1910
1884 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1911 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1885 1912
1886 if (!EXT4_I(inode)->i_reserved_data_blocks) { 1913 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1887 /* 1914 /*
1888 * if there is no reserved blocks, but we try to free some 1915 * if there aren't enough reserved blocks, then the
1889 * then the counter is messed up somewhere. 1916 * counter is messed up somewhere. Since this
1890 * but since this function is called from invalidate 1917 * function is called from invalidate page, it's
1891 * page, it's harmless to return without any action 1918 * harmless to return without any action.
1892 */ 1919 */
1893 printk(KERN_INFO "ext4 delalloc try to release %d reserved " 1920 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1894 "blocks for inode %lu, but there is no reserved " 1921 "ino %lu, to_free %d with only %d reserved "
1895 "data blocks\n", to_free, inode->i_ino); 1922 "data blocks\n", inode->i_ino, to_free,
1896 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1923 ei->i_reserved_data_blocks);
1897 return; 1924 WARN_ON(1);
1925 to_free = ei->i_reserved_data_blocks;
1898 } 1926 }
1927 ei->i_reserved_data_blocks -= to_free;
1899 1928
1900 /* recalculate the number of metablocks still need to be reserved */ 1929 if (ei->i_reserved_data_blocks == 0) {
1901 total = EXT4_I(inode)->i_reserved_data_blocks - to_free; 1930 /*
1902 mdb = ext4_calc_metadata_amount(inode, total); 1931 * We can release all of the reserved metadata blocks
1903 1932 * only when we have written all of the delayed
1904 /* figure out how many metablocks to release */ 1933 * allocation blocks.
1905 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1934 */
1906 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1935 to_free += ei->i_reserved_meta_blocks;
1907 1936 ei->i_reserved_meta_blocks = 0;
1908 release = to_free + mdb_free; 1937 ei->i_da_metadata_calc_len = 0;
1909 1938 }
1910 /* update fs dirty blocks counter for truncate case */
1911 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
1912 1939
1913 /* update per-inode reservations */ 1940 /* update fs dirty blocks counter */
1914 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1941 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1915 EXT4_I(inode)->i_reserved_data_blocks -= to_free;
1916 1942
1917 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1918 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1919 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1943 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1920 1944
1921 vfs_dq_release_reservation_block(inode, release); 1945 dquot_release_reservation_block(inode, to_free);
1922} 1946}
1923 1947
1924static void ext4_da_page_release_reservation(struct page *page, 1948static void ext4_da_page_release_reservation(struct page *page,
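Reservation and release are now symmetric around a single invariant: metadata reservations are pooled per inode and only returned once the last reserved data block has been written or discarded, since until then the worst case still applies. A toy model of the counter discipline (quota and the percpu dirty-block counter are reduced to comments):

struct toy_resv { int data, meta, calc_len; };

/* Model of ext4_da_reserve_space(): each call reserves one data block
 * plus whatever new metadata the estimate returns. */
static int toy_reserve(struct toy_resv *r, int md_needed)
{
	/* kernel order: dquot_reserve_block(md_needed + 1), then the
	 * free-block claim, and only then the counters below */
	r->data += 1;
	r->meta += md_needed;
	return 0;
}

/* Model of ext4_da_release_space(), called from invalidatepage. */
static void toy_release(struct toy_resv *r, int to_free)
{
	if (to_free > r->data)
		to_free = r->data; /* counters skewed: clamp, as the WARN_ON path does */
	r->data -= to_free;

	if (r->data == 0) {
		/* last delalloc data block gone: the whole metadata pool
		 * can finally be returned as well */
		to_free += r->meta;
		r->meta = 0;
		r->calc_len = 0;
	}
	/* 'to_free' is what the kernel subtracts from the dirty-block
	 * counter and releases from quota */
}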
@@ -2095,6 +2119,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2095 } else if (buffer_mapped(bh)) 2119 } else if (buffer_mapped(bh))
2096 BUG_ON(bh->b_blocknr != pblock); 2120 BUG_ON(bh->b_blocknr != pblock);
2097 2121
2122 if (buffer_uninit(exbh))
2123 set_buffer_uninit(bh);
2098 cur_logical++; 2124 cur_logical++;
2099 pblock++; 2125 pblock++;
2100 } while ((bh = bh->b_this_page) != head); 2126 } while ((bh = bh->b_this_page) != head);
@@ -2137,17 +2163,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2137 break; 2163 break;
2138 for (i = 0; i < nr_pages; i++) { 2164 for (i = 0; i < nr_pages; i++) {
2139 struct page *page = pvec.pages[i]; 2165 struct page *page = pvec.pages[i];
2140 index = page->index; 2166 if (page->index > end)
2141 if (index > end)
2142 break; 2167 break;
2143 index++;
2144
2145 BUG_ON(!PageLocked(page)); 2168 BUG_ON(!PageLocked(page));
2146 BUG_ON(PageWriteback(page)); 2169 BUG_ON(PageWriteback(page));
2147 block_invalidatepage(page, 0); 2170 block_invalidatepage(page, 0);
2148 ClearPageUptodate(page); 2171 ClearPageUptodate(page);
2149 unlock_page(page); 2172 unlock_page(page);
2150 } 2173 }
2174 index = pvec.pages[nr_pages - 1]->index + 1;
2175 pagevec_release(&pvec);
2151 } 2176 }
2152 return; 2177 return;
2153} 2178}
@@ -2223,10 +2248,12 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2223 * variables are updated after the blocks have been allocated. 2248 * variables are updated after the blocks have been allocated.
2224 */ 2249 */
2225 new.b_state = 0; 2250 new.b_state = 0;
2226 get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | 2251 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2227 EXT4_GET_BLOCKS_DELALLOC_RESERVE); 2252 if (ext4_should_dioread_nolock(mpd->inode))
2253 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2228 if (mpd->b_state & (1 << BH_Delay)) 2254 if (mpd->b_state & (1 << BH_Delay))
2229 get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; 2255 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2256
2230 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2257 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
2231 &new, get_blocks_flags); 2258 &new, get_blocks_flags);
2232 if (blks < 0) { 2259 if (blks < 0) {
@@ -2524,7 +2551,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2524 * XXX: __block_prepare_write() unmaps passed block, 2551 * XXX: __block_prepare_write() unmaps passed block,
2525 * is it OK? 2552 * is it OK?
2526 */ 2553 */
2527 ret = ext4_da_reserve_space(inode, 1); 2554 ret = ext4_da_reserve_space(inode, iblock);
2528 if (ret) 2555 if (ret)
2529 /* not enough space to reserve */ 2556 /* not enough space to reserve */
2530 return ret; 2557 return ret;
@@ -2600,7 +2627,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
2600} 2627}
2601 2628
2602static int __ext4_journalled_writepage(struct page *page, 2629static int __ext4_journalled_writepage(struct page *page,
2603 struct writeback_control *wbc,
2604 unsigned int len) 2630 unsigned int len)
2605{ 2631{
2606 struct address_space *mapping = page->mapping; 2632 struct address_space *mapping = page->mapping;
@@ -2635,11 +2661,14 @@ static int __ext4_journalled_writepage(struct page *page,
2635 ret = err; 2661 ret = err;
2636 2662
2637 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 2663 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2638 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 2664 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2639out: 2665out:
2640 return ret; 2666 return ret;
2641} 2667}
2642 2668
2669static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2670static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2671
2643/* 2672/*
2644 * Note that we don't need to start a transaction unless we're journaling data 2673 * Note that we don't need to start a transaction unless we're journaling data
2645 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2674 * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2687,7 +2716,7 @@ static int ext4_writepage(struct page *page,
2687 int ret = 0; 2716 int ret = 0;
2688 loff_t size; 2717 loff_t size;
2689 unsigned int len; 2718 unsigned int len;
2690 struct buffer_head *page_bufs; 2719 struct buffer_head *page_bufs = NULL;
2691 struct inode *inode = page->mapping->host; 2720 struct inode *inode = page->mapping->host;
2692 2721
2693 trace_ext4_writepage(inode, page); 2722 trace_ext4_writepage(inode, page);
@@ -2758,12 +2787,16 @@ static int ext4_writepage(struct page *page,
2758 * doesn't seem much point in redirtying the page here. 2787 * doesn't seem much point in redirtying the page here.
2759 */ 2788 */
2760 ClearPageChecked(page); 2789 ClearPageChecked(page);
2761 return __ext4_journalled_writepage(page, wbc, len); 2790 return __ext4_journalled_writepage(page, len);
2762 } 2791 }
2763 2792
2764 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2793 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2765 ret = nobh_writepage(page, noalloc_get_block_write, wbc); 2794 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2766 else 2795 else if (page_bufs && buffer_uninit(page_bufs)) {
2796 ext4_set_bh_endio(page_bufs, inode);
2797 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2798 wbc, ext4_end_io_buffer_write);
2799 } else
2767 ret = block_write_full_page(page, noalloc_get_block_write, 2800 ret = block_write_full_page(page, noalloc_get_block_write,
2768 wbc); 2801 wbc);
2769 2802
@@ -2788,7 +2821,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2788 * number of contiguous block. So we will limit 2821 * number of contiguous block. So we will limit
2789 * number of contiguous block to a sane value 2822 * number of contiguous block to a sane value
2790 */ 2823 */
2791 if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2824 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
2792 (max_blocks > EXT4_MAX_TRANS_DATA)) 2825 (max_blocks > EXT4_MAX_TRANS_DATA))
2793 max_blocks = EXT4_MAX_TRANS_DATA; 2826 max_blocks = EXT4_MAX_TRANS_DATA;
2794 2827
@@ -2933,7 +2966,7 @@ retry:
2933 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 2966 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
2934 &mpd); 2967 &mpd);
2935 /* 2968 /*
2936 * If we have a contigous extent of pages and we 2969 * If we have a contiguous extent of pages and we
2937 * haven't done the I/O yet, map the blocks and submit 2970 * haven't done the I/O yet, map the blocks and submit
2938 * them for I/O. 2971 * them for I/O.
2939 */ 2972 */
@@ -2999,8 +3032,7 @@ retry:
2999out_writepages: 3032out_writepages:
3000 if (!no_nrwrite_index_update) 3033 if (!no_nrwrite_index_update)
3001 wbc->no_nrwrite_index_update = 0; 3034 wbc->no_nrwrite_index_update = 0;
3002 if (wbc->nr_to_write > nr_to_writebump) 3035 wbc->nr_to_write -= nr_to_writebump;
3003 wbc->nr_to_write -= nr_to_writebump;
3004 wbc->range_start = range_start; 3036 wbc->range_start = range_start;
3005 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3037 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
3006 return ret; 3038 return ret;
@@ -3025,11 +3057,18 @@ static int ext4_nonda_switch(struct super_block *sb)
3025 if (2 * free_blocks < 3 * dirty_blocks || 3057 if (2 * free_blocks < 3 * dirty_blocks ||
3026 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 3058 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
3027 /* 3059 /*
3028 * free block count is less that 150% of dirty blocks 3060 * free block count is less than 150% of dirty blocks
3029 * or free blocks is less that watermark 3061 * or free blocks is less than watermark
3030 */ 3062 */
3031 return 1; 3063 return 1;
3032 } 3064 }
3065 /*
3066 * Even if we don't switch but are nearing capacity,
3067 * start pushing delalloc when 1/2 of free blocks are dirty.
3068 */
3069 if (free_blocks < 2 * dirty_blocks)
3070 writeback_inodes_sb_if_idle(sb);
3071
3033 return 0; 3072 return 0;
3034} 3073}
3035 3074
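Both tests are integer forms of simple ratios: 2*free < 3*dirty is free < 1.5x dirty (fall back to non-delayed allocation), and the new check starts background writeback earlier, once free < 2x dirty. As a standalone predicate (the watermark value is illustrative; the kernel derives its own):

#include <stdbool.h>

#define TOY_WATERMARK 1024 /* illustrative; stands in for EXT4_FREEBLOCKS_WATERMARK */

/* Integer forms of the two thresholds in ext4_nonda_switch():
 * 2*free < 3*dirty <=> free < 1.5 * dirty (fall back to non-delalloc)
 * free < 2*dirty (nudge background writeback early) */
static bool toy_nonda_switch(long long free, long long dirty, bool *push_writeback)
{
	*push_writeback = false;
	if (2 * free < 3 * dirty || free < dirty + TOY_WATERMARK)
		return true;
	if (free < 2 * dirty)
		*push_writeback = true;
	return false;
}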
@@ -3037,7 +3076,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3037 loff_t pos, unsigned len, unsigned flags, 3076 loff_t pos, unsigned len, unsigned flags,
3038 struct page **pagep, void **fsdata) 3077 struct page **pagep, void **fsdata)
3039{ 3078{
3040 int ret, retries = 0; 3079 int ret, retries = 0, quota_retries = 0;
3041 struct page *page; 3080 struct page *page;
3042 pgoff_t index; 3081 pgoff_t index;
3043 unsigned from, to; 3082 unsigned from, to;
@@ -3091,11 +3130,27 @@ retry:
3091 * i_size_read because we hold i_mutex. 3130 * i_size_read because we hold i_mutex.
3092 */ 3131 */
3093 if (pos + len > inode->i_size) 3132 if (pos + len > inode->i_size)
3094 ext4_truncate(inode); 3133 ext4_truncate_failed_write(inode);
3095 } 3134 }
3096 3135
3097 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3136 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3098 goto retry; 3137 goto retry;
3138
3139 if ((ret == -EDQUOT) &&
3140 EXT4_I(inode)->i_reserved_meta_blocks &&
3141 (quota_retries++ < 3)) {
3142 /*
3143 * Since we often over-estimate the number of meta
3144 * data blocks required, we may sometimes get a
3145 * spurious out of quota error even though there would
3146 * be enough space once we write the data blocks and
3147 * find out how many meta data blocks were _really_
3148 * required. So try forcing the inode write to see if
3149 * that helps.
3150 */
3151 write_inode_now(inode, (quota_retries == 3));
3152 goto retry;
3153 }
3099out: 3154out:
3100 return ret; 3155 return ret;
3101} 3156}
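The retry exists because delalloc over-reserves metadata: an EDQUOT at write-begin may only reflect the estimate, and flushing the inode converts reservations into the real, smaller usage. The retry is bounded at three attempts, with a synchronous inode write on the last. The control flow, with hypothetical stand-ins for the real work:

#include <errno.h>
#include <stdbool.h>

/* Hypothetical stubs, here only to give the retry loop a shape that
 * compiles; they do not correspond to kernel functions. */
static int toy_try_write_begin(void) { return -EDQUOT; }
static bool toy_have_reserved_meta(void) { return true; }
static void toy_write_inode_now(bool sync) { (void)sync; }

/* Model of the bounded EDQUOT retry in ext4_da_write_begin(): up to
 * three retries, the last one after a synchronous inode write. */
static int write_begin_with_quota_retry(void)
{
	int quota_retries = 0;
	int ret;

	for (;;) {
		ret = toy_try_write_begin();
		if (ret != -EDQUOT || !toy_have_reserved_meta() ||
		    quota_retries >= 3)
			return ret;
		/* flushing the inode converts over-reserved metadata into
		 * the real, smaller usage, which may free up quota */
		toy_write_inode_now(++quota_retries == 3);
	}
}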
@@ -3284,7 +3339,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3284 filemap_write_and_wait(mapping); 3339 filemap_write_and_wait(mapping);
3285 } 3340 }
3286 3341
3287 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 3342 if (EXT4_JOURNAL(inode) &&
3343 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
3288 /* 3344 /*
3289 * This is a REALLY heavyweight approach, but the use of 3345 * This is a REALLY heavyweight approach, but the use of
3290 * bmap on dirty files is expected to be extremely rare: 3346 * bmap on dirty files is expected to be extremely rare:
@@ -3303,7 +3359,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3303 * everything they get. 3359 * everything they get.
3304 */ 3360 */
3305 3361
3306 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 3362 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
3307 journal = EXT4_JOURNAL(inode); 3363 journal = EXT4_JOURNAL(inode);
3308 jbd2_journal_lock_updates(journal); 3364 jbd2_journal_lock_updates(journal);
3309 err = jbd2_journal_flush(journal); 3365 err = jbd2_journal_flush(journal);
@@ -3328,11 +3384,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3328 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3384 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3329} 3385}
3330 3386
3387static void ext4_free_io_end(ext4_io_end_t *io)
3388{
3389 BUG_ON(!io);
3390 if (io->page)
3391 put_page(io->page);
3392 iput(io->inode);
3393 kfree(io);
3394}
3395
3396static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3397{
3398 struct buffer_head *head, *bh;
3399 unsigned int curr_off = 0;
3400
3401 if (!page_has_buffers(page))
3402 return;
3403 head = bh = page_buffers(page);
3404 do {
3405 if (offset <= curr_off && test_clear_buffer_uninit(bh)
3406 && bh->b_private) {
3407 ext4_free_io_end(bh->b_private);
3408 bh->b_private = NULL;
3409 bh->b_end_io = NULL;
3410 }
3411 curr_off = curr_off + bh->b_size;
3412 bh = bh->b_this_page;
3413 } while (bh != head);
3414}
3415
3331static void ext4_invalidatepage(struct page *page, unsigned long offset) 3416static void ext4_invalidatepage(struct page *page, unsigned long offset)
3332{ 3417{
3333 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3418 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3334 3419
3335 /* 3420 /*
3421 * free any io_end structure allocated for buffers to be discarded
3422 */
3423 if (ext4_should_dioread_nolock(page->mapping->host))
3424 ext4_invalidatepage_free_endio(page, offset);
3425 /*
3336 * If it's a full truncate we just forget about the pending dirtying 3426 * If it's a full truncate we just forget about the pending dirtying
3337 */ 3427 */
3338 if (offset == 0) 3428 if (offset == 0)
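The new ext4_invalidatepage_free_endio() above walks the page's circular list of buffer heads and releases any pending io_end attached to buffers at or past the invalidation offset. A simplified standalone model of that ring walk (plain C; the buffer-uninit state test is dropped and the structures are toy versions):

#include <stdio.h>
#include <stdlib.h>

/* Toy model of a page's circular list of buffer heads. */
struct buf {
    size_t size;
    void *private_state;   /* the io_end in the real code */
    struct buf *next;      /* b_this_page: circular */
};

/* Free per-buffer state for every buffer that starts at or after
 * 'offset' within the page, mirroring the do/while ring walk. */
static void drop_state_from(struct buf *head, size_t offset)
{
    struct buf *b = head;
    size_t curr_off = 0;

    do {
        if (offset <= curr_off && b->private_state) {
            free(b->private_state);
            b->private_state = NULL;
        }
        curr_off += b->size;
        b = b->next;
    } while (b != head);
}

int main(void)
{
    struct buf b2 = { 1024, malloc(8), NULL };
    struct buf b1 = { 1024, malloc(8), &b2 };
    b2.next = &b1;                    /* close the ring */
    drop_state_from(&b1, 1024);       /* frees only b2's state */
    printf("b1=%p b2=%p\n", b1.private_state, b2.private_state);
    free(b1.private_state);
    return 0;
}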
@@ -3403,7 +3493,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3403 } 3493 }
3404 3494
3405retry: 3495retry:
3406 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3496 if (rw == READ && ext4_should_dioread_nolock(inode))
3497 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
3498 inode->i_sb->s_bdev, iov,
3499 offset, nr_segs,
3500 ext4_get_block, NULL);
3501 else
3502 ret = blockdev_direct_IO(rw, iocb, inode,
3503 inode->i_sb->s_bdev, iov,
3407 offset, nr_segs, 3504 offset, nr_segs,
3408 ext4_get_block, NULL); 3505 ext4_get_block, NULL);
3409 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3506 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3419,6 +3516,9 @@ retry:
3419 * but cannot extend i_size. Bail out and pretend 3516 * but cannot extend i_size. Bail out and pretend
3420 * the write failed... */ 3517 * the write failed... */
3421 ret = PTR_ERR(handle); 3518 ret = PTR_ERR(handle);
3519 if (inode->i_nlink)
3520 ext4_orphan_del(NULL, inode);
3521
3422 goto out; 3522 goto out;
3423 } 3523 }
3424 if (inode->i_nlink) 3524 if (inode->i_nlink)
@@ -3446,75 +3546,63 @@ out:
3446 return ret; 3546 return ret;
3447} 3547}
3448 3548
3449static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, 3549static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3450 struct buffer_head *bh_result, int create) 3550 struct buffer_head *bh_result, int create)
3451{ 3551{
3452 handle_t *handle = NULL; 3552 handle_t *handle = ext4_journal_current_handle();
3453 int ret = 0; 3553 int ret = 0;
3454 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 3554 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3455 int dio_credits; 3555 int dio_credits;
3556 int started = 0;
3456 3557
3457 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", 3558 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3458 inode->i_ino, create); 3559 inode->i_ino, create);
3459 /* 3560 /*
3460 * DIO VFS code passes create = 0 flag for write to 3561 * ext4_get_block in prepare for a DIO write or buffer write.
3461 * the middle of file. It does this to avoid block 3562 * We allocate an uninitialized extent if blocks haven't been allocated.
3462 * allocation for holes, to prevent expose stale data 3563 * The extent will be converted to initialized after IO completes.
3463 * out when there is parallel buffered read (which does
3464 * not hold the i_mutex lock) while direct IO write has
3465 * not completed. DIO request on holes finally falls back
3466 * to buffered IO for this reason.
3467 *
3468 * For ext4 extent based file, since we support fallocate,
3469 * new allocated extent as uninitialized, for holes, we
3470 * could fallocate blocks for holes, thus parallel
3471 * buffered IO read will zero out the page when read on
3472 * a hole while parallel DIO write to the hole has not completed.
3473 *
3474 * when we come here, we know it's a direct IO write to
3475 * to the middle of file (<i_size)
3476 * so it's safe to override the create flag from VFS.
3477 */ 3564 */
3478 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; 3565 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3479 3566
3480 if (max_blocks > DIO_MAX_BLOCKS) 3567 if (!handle) {
3481 max_blocks = DIO_MAX_BLOCKS; 3568 if (max_blocks > DIO_MAX_BLOCKS)
3482 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 3569 max_blocks = DIO_MAX_BLOCKS;
3483 handle = ext4_journal_start(inode, dio_credits); 3570 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3484 if (IS_ERR(handle)) { 3571 handle = ext4_journal_start(inode, dio_credits);
3485 ret = PTR_ERR(handle); 3572 if (IS_ERR(handle)) {
3486 goto out; 3573 ret = PTR_ERR(handle);
3574 goto out;
3575 }
3576 started = 1;
3487 } 3577 }
3578
3488 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 3579 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3489 create); 3580 create);
3490 if (ret > 0) { 3581 if (ret > 0) {
3491 bh_result->b_size = (ret << inode->i_blkbits); 3582 bh_result->b_size = (ret << inode->i_blkbits);
3492 ret = 0; 3583 ret = 0;
3493 } 3584 }
3494 ext4_journal_stop(handle); 3585 if (started)
3586 ext4_journal_stop(handle);
3495out: 3587out:
3496 return ret; 3588 return ret;
3497} 3589}
3498 3590
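The reworked ext4_get_block_write() above now serves both DIO and buffered writers: if the caller already runs inside a transaction it joins it, otherwise it opens and later closes its own. A sketch of that join-or-start pattern (standalone C with a fake handle API, for illustration only):

#include <stdio.h>

/* Hypothetical journal handle API, for illustration only. */
struct handle { int open; };
static struct handle *current_handle;          /* per-task handle */

static struct handle *journal_start(int credits)
{
    static struct handle h;
    (void)credits;
    h.open = 1;
    return &h;
}
static void journal_stop(struct handle *h) { h->open = 0; }

/* Join a transaction the caller already opened, or open (and later
 * close) our own -- the 'started' flag remembers which case we hit. */
static int map_blocks(int credits)
{
    struct handle *h = current_handle;
    int started = 0;

    if (!h) {
        h = journal_start(credits);
        started = 1;
    }

    /* ... do the block mapping under 'h' ... */

    if (started)
        journal_stop(h);
    return 0;
}

int main(void)
{
    map_blocks(8);                 /* no current handle: start/stop our own */
    struct handle outer = { 1 };
    current_handle = &outer;
    map_blocks(8);                 /* joins the caller's handle */
    printf("outer still open: %d\n", outer.open);
    return 0;
}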
3499static void ext4_free_io_end(ext4_io_end_t *io) 3591static void dump_completed_IO(struct inode * inode)
3500{
3501 BUG_ON(!io);
3502 iput(io->inode);
3503 kfree(io);
3504}
3505static void dump_aio_dio_list(struct inode * inode)
3506{ 3592{
3507#ifdef EXT4_DEBUG 3593#ifdef EXT4_DEBUG
3508 struct list_head *cur, *before, *after; 3594 struct list_head *cur, *before, *after;
3509 ext4_io_end_t *io, *io0, *io1; 3595 ext4_io_end_t *io, *io0, *io1;
3596 unsigned long flags;
3510 3597
3511 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3598 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3512 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); 3599 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3513 return; 3600 return;
3514 } 3601 }
3515 3602
3516 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); 3603 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3517 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ 3604 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3605 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3518 cur = &io->list; 3606 cur = &io->list;
3519 before = cur->prev; 3607 before = cur->prev;
3520 io0 = container_of(before, ext4_io_end_t, list); 3608 io0 = container_of(before, ext4_io_end_t, list);
@@ -3524,32 +3612,31 @@ static void dump_aio_dio_list(struct inode * inode)
3524 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 3612 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3525 io, inode->i_ino, io0, io1); 3613 io, inode->i_ino, io0, io1);
3526 } 3614 }
3615 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3527#endif 3616#endif
3528} 3617}
3529 3618
3530/* 3619/*
3531 * check a range of space and convert unwritten extents to written. 3620 * check a range of space and convert unwritten extents to written.
3532 */ 3621 */
3533static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) 3622static int ext4_end_io_nolock(ext4_io_end_t *io)
3534{ 3623{
3535 struct inode *inode = io->inode; 3624 struct inode *inode = io->inode;
3536 loff_t offset = io->offset; 3625 loff_t offset = io->offset;
3537 size_t size = io->size; 3626 ssize_t size = io->size;
3538 int ret = 0; 3627 int ret = 0;
3539 3628
3540 ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," 3629 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3541 "list->prev 0x%p\n", 3630 "list->prev 0x%p\n",
3542 io, inode->i_ino, io->list.next, io->list.prev); 3631 io, inode->i_ino, io->list.next, io->list.prev);
3543 3632
3544 if (list_empty(&io->list)) 3633 if (list_empty(&io->list))
3545 return ret; 3634 return ret;
3546 3635
3547 if (io->flag != DIO_AIO_UNWRITTEN) 3636 if (io->flag != EXT4_IO_UNWRITTEN)
3548 return ret; 3637 return ret;
3549 3638
3550 if (offset + size <= i_size_read(inode)) 3639 ret = ext4_convert_unwritten_extents(inode, offset, size);
3551 ret = ext4_convert_unwritten_extents(inode, offset, size);
3552
3553 if (ret < 0) { 3640 if (ret < 0) {
3554 printk(KERN_EMERG "%s: failed to convert unwritten" 3641 printk(KERN_EMERG "%s: failed to convert unwritten"
3555 "extents to written extents, error is %d" 3642 "extents to written extents, error is %d"
@@ -3562,50 +3649,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3562 io->flag = 0; 3649 io->flag = 0;
3563 return ret; 3650 return ret;
3564} 3651}
3652
3565/* 3653/*
3566 * work on completed aio dio IO, to convert unwritten extents to extents 3654 * work on completed aio dio IO, to convert unwritten extents to extents
3567 */ 3655 */
3568static void ext4_end_aio_dio_work(struct work_struct *work) 3656static void ext4_end_io_work(struct work_struct *work)
3569{ 3657{
3570 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 3658 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3571 struct inode *inode = io->inode; 3659 struct inode *inode = io->inode;
3572 int ret = 0; 3660 struct ext4_inode_info *ei = EXT4_I(inode);
3661 unsigned long flags;
3662 int ret;
3573 3663
3574 mutex_lock(&inode->i_mutex); 3664 mutex_lock(&inode->i_mutex);
3575 ret = ext4_end_aio_dio_nolock(io); 3665 ret = ext4_end_io_nolock(io);
3576 if (ret >= 0) { 3666 if (ret < 0) {
3577 if (!list_empty(&io->list)) 3667 mutex_unlock(&inode->i_mutex);
3578 list_del_init(&io->list); 3668 return;
3579 ext4_free_io_end(io);
3580 } 3669 }
3670
3671 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3672 if (!list_empty(&io->list))
3673 list_del_init(&io->list);
3674 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3581 mutex_unlock(&inode->i_mutex); 3675 mutex_unlock(&inode->i_mutex);
3676 ext4_free_io_end(io);
3582} 3677}
3678
3583/* 3679/*
3584 * This function is called from ext4_sync_file(). 3680 * This function is called from ext4_sync_file().
3585 * 3681 *
3586 * When AIO DIO IO is completed, the work to convert unwritten 3682 * When IO is completed, the work to convert unwritten extents to
3587 * extents to written is queued on workqueue but may not get immediately 3683 * written is queued on workqueue but may not get immediately
3588 * scheduled. When fsync is called, we need to ensure the 3684 * scheduled. When fsync is called, we need to ensure the
3589 * conversion is complete before fsync returns. 3685 * conversion is complete before fsync returns.
3590 * The inode keeps track of a list of completed AIO from DIO path 3686 * The inode keeps track of a list of pending/completed IO that
3591 * that might needs to do the conversion. This function walks through 3687 * might need to do the conversion. This function walks through
3592 * the list and convert the related unwritten extents to written. 3688 * the list and converts the related unwritten extents for completed IO
3689 * to written.
3690 * The function returns 0 on success, or a negative error code on failure.
3593 */ 3691 */
3594int flush_aio_dio_completed_IO(struct inode *inode) 3692int flush_completed_IO(struct inode *inode)
3595{ 3693{
3596 ext4_io_end_t *io; 3694 ext4_io_end_t *io;
3695 struct ext4_inode_info *ei = EXT4_I(inode);
3696 unsigned long flags;
3597 int ret = 0; 3697 int ret = 0;
3598 int ret2 = 0; 3698 int ret2 = 0;
3599 3699
3600 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) 3700 if (list_empty(&ei->i_completed_io_list))
3601 return ret; 3701 return ret;
3602 3702
3603 dump_aio_dio_list(inode); 3703 dump_completed_IO(inode);
3604 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3704 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3605 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, 3705 while (!list_empty(&ei->i_completed_io_list)){
3706 io = list_entry(ei->i_completed_io_list.next,
3606 ext4_io_end_t, list); 3707 ext4_io_end_t, list);
3607 /* 3708 /*
3608 * Calling ext4_end_aio_dio_nolock() to convert completed 3709 * Calling ext4_end_io_nolock() to convert completed
3609 * IO to written. 3710 * IO to written.
3610 * 3711 *
3611 * When ext4_sync_file() is called, run_queue() may already 3712 * When ext4_sync_file() is called, run_queue() may already
@@ -3618,20 +3719,23 @@ int flush_aio_dio_completed_IO(struct inode *inode)
3618 * avoid double converting from both fsync and background work 3719 * avoid double converting from both fsync and background work
3619 * queue work. 3720 * queue work.
3620 */ 3721 */
3621 ret = ext4_end_aio_dio_nolock(io); 3722 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3723 ret = ext4_end_io_nolock(io);
3724 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3622 if (ret < 0) 3725 if (ret < 0)
3623 ret2 = ret; 3726 ret2 = ret;
3624 else 3727 else
3625 list_del_init(&io->list); 3728 list_del_init(&io->list);
3626 } 3729 }
3730 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3627 return (ret2 < 0) ? ret2 : 0; 3731 return (ret2 < 0) ? ret2 : 0;
3628} 3732}
3629 3733
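flush_completed_IO() above must drop i_completed_io_lock around ext4_end_io_nolock(), since the conversion can sleep, and retake it before unlinking the entry. A standalone model of that unlock-process-relock drain (plain C with a pthread mutex standing in for the spinlock; the i_mutex handling is omitted):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy singly-linked completion list guarded by a mutex. */
struct io_end { int id; struct io_end *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct io_end *completed;

static int convert(struct io_end *io)   /* may sleep: call unlocked */
{
    printf("converting io %d\n", io->id);
    return 0;
}

static void flush_completed(void)
{
    pthread_mutex_lock(&lock);
    while (completed) {
        struct io_end *io = completed;

        /* Drop the lock around the (sleeping) conversion, as the
         * hunk drops i_completed_io_lock around ext4_end_io_nolock(),
         * then retake it before touching the list again. */
        pthread_mutex_unlock(&lock);
        int ret = convert(io);
        pthread_mutex_lock(&lock);

        if (ret < 0)
            break;
        completed = io->next;       /* unlink only on success */
        free(io);
    }
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    for (int i = 0; i < 3; i++) {
        struct io_end *io = malloc(sizeof(*io));
        io->id = i; io->next = completed; completed = io;
    }
    flush_completed();
    return 0;
}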
3630static ext4_io_end_t *ext4_init_io_end (struct inode *inode) 3734static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3631{ 3735{
3632 ext4_io_end_t *io = NULL; 3736 ext4_io_end_t *io = NULL;
3633 3737
3634 io = kmalloc(sizeof(*io), GFP_NOFS); 3738 io = kmalloc(sizeof(*io), flags);
3635 3739
3636 if (io) { 3740 if (io) {
3637 igrab(inode); 3741 igrab(inode);
@@ -3639,8 +3743,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3639 io->flag = 0; 3743 io->flag = 0;
3640 io->offset = 0; 3744 io->offset = 0;
3641 io->size = 0; 3745 io->size = 0;
3642 io->error = 0; 3746 io->page = NULL;
3643 INIT_WORK(&io->work, ext4_end_aio_dio_work); 3747 INIT_WORK(&io->work, ext4_end_io_work);
3644 INIT_LIST_HEAD(&io->list); 3748 INIT_LIST_HEAD(&io->list);
3645 } 3749 }
3646 3750
@@ -3652,6 +3756,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3652{ 3756{
3653 ext4_io_end_t *io_end = iocb->private; 3757 ext4_io_end_t *io_end = iocb->private;
3654 struct workqueue_struct *wq; 3758 struct workqueue_struct *wq;
3759 unsigned long flags;
3760 struct ext4_inode_info *ei;
3655 3761
3656 /* if not async direct IO or dio with 0 bytes write, just return */ 3762 /* if not async direct IO or dio with 0 bytes write, just return */
3657 if (!io_end || !size) 3763 if (!io_end || !size)
@@ -3663,7 +3769,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3663 size); 3769 size);
3664 3770
3665 /* if not aio dio with unwritten extents, just free io and return */ 3771 /* if not aio dio with unwritten extents, just free io and return */
3666 if (io_end->flag != DIO_AIO_UNWRITTEN){ 3772 if (io_end->flag != EXT4_IO_UNWRITTEN){
3667 ext4_free_io_end(io_end); 3773 ext4_free_io_end(io_end);
3668 iocb->private = NULL; 3774 iocb->private = NULL;
3669 return; 3775 return;
@@ -3671,16 +3777,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3671 3777
3672 io_end->offset = offset; 3778 io_end->offset = offset;
3673 io_end->size = size; 3779 io_end->size = size;
3780 io_end->flag = EXT4_IO_UNWRITTEN;
3674 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3781 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3675 3782
3676 /* queue the work to convert unwritten extents to written */ 3783 /* queue the work to convert unwritten extents to written */
3677 queue_work(wq, &io_end->work); 3784 queue_work(wq, &io_end->work);
3678 3785
3679 /* Add the io_end to per-inode completed aio dio list*/ 3786 /* Add the io_end to per-inode completed aio dio list*/
3680 list_add_tail(&io_end->list, 3787 ei = EXT4_I(io_end->inode);
3681 &EXT4_I(io_end->inode)->i_aio_dio_complete_list); 3788 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3789 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3790 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3682 iocb->private = NULL; 3791 iocb->private = NULL;
3683} 3792}
3793
3794static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3795{
3796 ext4_io_end_t *io_end = bh->b_private;
3797 struct workqueue_struct *wq;
3798 struct inode *inode;
3799 unsigned long flags;
3800
3801 if (!test_clear_buffer_uninit(bh) || !io_end)
3802 goto out;
3803
3804 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3805 printk("sb umounted, discard end_io request for inode %lu\n",
3806 io_end->inode->i_ino);
3807 ext4_free_io_end(io_end);
3808 goto out;
3809 }
3810
3811 io_end->flag = EXT4_IO_UNWRITTEN;
3812 inode = io_end->inode;
3813
3814 /* Add the io_end to per-inode completed io list*/
3815 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3816 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3817 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3818
3819 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3820 /* queue the work to convert unwritten extents to written */
3821 queue_work(wq, &io_end->work);
3822out:
3823 bh->b_private = NULL;
3824 bh->b_end_io = NULL;
3825 clear_buffer_uninit(bh);
3826 end_buffer_async_write(bh, uptodate);
3827}
3828
3829static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3830{
3831 ext4_io_end_t *io_end;
3832 struct page *page = bh->b_page;
3833 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3834 size_t size = bh->b_size;
3835
3836retry:
3837 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3838 if (!io_end) {
3839 if (printk_ratelimit())
3840 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3841 schedule();
3842 goto retry;
3843 }
3844 io_end->offset = offset;
3845 io_end->size = size;
3846 /*
3847 * We need to hold a reference to the page to make sure it
3848 * doesn't get evicted before ext4_end_io_work() has a chance
3849 * to convert the extent from unwritten to written.
3850 */
3851 io_end->page = page;
3852 get_page(io_end->page);
3853
3854 bh->b_private = io_end;
3855 bh->b_end_io = ext4_end_io_buffer_write;
3856 return 0;
3857}
3858
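ext4_set_bh_endio() above allocates the io_end with GFP_ATOMIC (it can be called where sleeping in the allocator is not allowed), loops until the small allocation succeeds, and pins the page so it survives until the deferred work runs. A toy standalone version of that retry-and-pin shape (plain C; malloc and sched_yield stand in for kmalloc(GFP_ATOMIC) and schedule()):

#include <stdio.h>
#include <stdlib.h>
#include <sched.h>

/* Toy refcounted page, modelling get_page()/put_page(). */
struct page { int refcount; };
static void get_page(struct page *p) { p->refcount++; }
static void put_page(struct page *p) { p->refcount--; }

struct io_end { struct page *page; };

/* A small must-not-fail allocation is retried after yielding, and
 * the page is pinned so it outlives the pending completion work. */
static struct io_end *alloc_io_end_pinned(struct page *page)
{
    struct io_end *io;

    for (;;) {
        io = malloc(sizeof(*io));       /* kmalloc(GFP_ATOMIC) stand-in */
        if (io)
            break;
        fprintf(stderr, "allocation failed, retrying\n");
        sched_yield();                  /* schedule() stand-in */
    }

    io->page = page;
    get_page(page);                     /* pin until the work runs */
    return io;
}

int main(void)
{
    struct page pg = { 1 };
    struct io_end *io = alloc_io_end_pinned(&pg);
    printf("refcount while pending: %d\n", pg.refcount);   /* 2 */
    put_page(io->page);                 /* completion side drops the pin */
    free(io);
    return 0;
}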
3684/* 3859/*
3685 * For ext4 extent files, ext4 will do direct-io write to holes, 3860 * For ext4 extent files, ext4 will do direct-io write to holes,
3686 * preallocated extents, and those write extend the file, no need to 3861 * preallocated extents, and those write extend the file, no need to
@@ -3734,7 +3909,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3734 iocb->private = NULL; 3909 iocb->private = NULL;
3735 EXT4_I(inode)->cur_aio_dio = NULL; 3910 EXT4_I(inode)->cur_aio_dio = NULL;
3736 if (!is_sync_kiocb(iocb)) { 3911 if (!is_sync_kiocb(iocb)) {
3737 iocb->private = ext4_init_io_end(inode); 3912 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3738 if (!iocb->private) 3913 if (!iocb->private)
3739 return -ENOMEM; 3914 return -ENOMEM;
3740 /* 3915 /*
@@ -3750,7 +3925,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3750 ret = blockdev_direct_IO(rw, iocb, inode, 3925 ret = blockdev_direct_IO(rw, iocb, inode,
3751 inode->i_sb->s_bdev, iov, 3926 inode->i_sb->s_bdev, iov,
3752 offset, nr_segs, 3927 offset, nr_segs,
3753 ext4_get_block_dio_write, 3928 ext4_get_block_write,
3754 ext4_end_io_dio); 3929 ext4_end_io_dio);
3755 if (iocb->private) 3930 if (iocb->private)
3756 EXT4_I(inode)->cur_aio_dio = NULL; 3931 EXT4_I(inode)->cur_aio_dio = NULL;
@@ -3771,8 +3946,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3771 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3946 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772 ext4_free_io_end(iocb->private); 3947 ext4_free_io_end(iocb->private);
3773 iocb->private = NULL; 3948 iocb->private = NULL;
3774 } else if (ret > 0 && (EXT4_I(inode)->i_state & 3949 } else if (ret > 0 && ext4_test_inode_state(inode,
3775 EXT4_STATE_DIO_UNWRITTEN)) { 3950 EXT4_STATE_DIO_UNWRITTEN)) {
3776 int err; 3951 int err;
3777 /* 3952 /*
3778 * for non AIO case, since the IO is already 3953 * for non AIO case, since the IO is already
@@ -3782,7 +3957,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3782 offset, ret); 3957 offset, ret);
3783 if (err < 0) 3958 if (err < 0)
3784 ret = err; 3959 ret = err;
3785 EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; 3960 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3786 } 3961 }
3787 return ret; 3962 return ret;
3788 } 3963 }
@@ -4064,7 +4239,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
4064 int k, err; 4239 int k, err;
4065 4240
4066 *top = 0; 4241 *top = 0;
4067 /* Make k index the deepest non-null offest + 1 */ 4242 /* Make k index the deepest non-null offset + 1 */
4068 for (k = depth; k > 1 && !offsets[k-1]; k--) 4243 for (k = depth; k > 1 && !offsets[k-1]; k--)
4069 ; 4244 ;
4070 partial = ext4_get_branch(inode, k, offsets, chain, &err); 4245 partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@ -4113,13 +4288,27 @@ no_top:
4113 * We release `count' blocks on disk, but (last - first) may be greater 4288 * We release `count' blocks on disk, but (last - first) may be greater
4114 * than `count' because there can be holes in there. 4289 * than `count' because there can be holes in there.
4115 */ 4290 */
4116static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 4291static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4117 struct buffer_head *bh, 4292 struct buffer_head *bh,
4118 ext4_fsblk_t block_to_free, 4293 ext4_fsblk_t block_to_free,
4119 unsigned long count, __le32 *first, 4294 unsigned long count, __le32 *first,
4120 __le32 *last) 4295 __le32 *last)
4121{ 4296{
4122 __le32 *p; 4297 __le32 *p;
4298 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4299
4300 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4301 flags |= EXT4_FREE_BLOCKS_METADATA;
4302
4303 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4304 count)) {
4305 ext4_error(inode->i_sb, "inode #%lu: "
4306 "attempt to clear blocks %llu len %lu, invalid",
4307 inode->i_ino, (unsigned long long) block_to_free,
4308 count);
4309 return 1;
4310 }
4311
4123 if (try_to_extend_transaction(handle, inode)) { 4312 if (try_to_extend_transaction(handle, inode)) {
4124 if (bh) { 4313 if (bh) {
4125 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4314 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4134,27 +4323,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
4134 } 4323 }
4135 } 4324 }
4136 4325
4137 /* 4326 for (p = first; p < last; p++)
4138 * Any buffers which are on the journal will be in memory. We 4327 *p = 0;
4139 * find them on the hash table so jbd2_journal_revoke() will
4140 * run jbd2_journal_forget() on them. We've already detached
4141 * each block from the file, so bforget() in
4142 * jbd2_journal_forget() should be safe.
4143 *
4144 * AKPM: turn on bforget in jbd2_journal_forget()!!!
4145 */
4146 for (p = first; p < last; p++) {
4147 u32 nr = le32_to_cpu(*p);
4148 if (nr) {
4149 struct buffer_head *tbh;
4150
4151 *p = 0;
4152 tbh = sb_find_get_block(inode->i_sb, nr);
4153 ext4_forget(handle, 0, inode, tbh, nr);
4154 }
4155 }
4156 4328
4157 ext4_free_blocks(handle, inode, block_to_free, count, 0); 4329 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
4330 return 0;
4158} 4331}
4159 4332
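ext4_clear_blocks() now refuses to free a run that falls outside the data area, turning a corrupted indirect block into a reported error instead of freeing arbitrary blocks. A minimal standalone sketch of such a range check (plain C; the real ext4_data_block_valid() also excludes system-zone blocks, which is omitted here):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative filesystem geometry. */
#define FIRST_DATA_BLOCK 1ULL
#define TOTAL_BLOCKS     100000ULL

/* A block run may be freed only if it lies entirely inside the
 * data area of the filesystem. */
static bool block_range_valid(unsigned long long start, unsigned long count)
{
    return start >= FIRST_DATA_BLOCK &&
           start + count <= TOTAL_BLOCKS &&
           start + count > start;              /* overflow check */
}

static int clear_blocks(unsigned long long start, unsigned long count)
{
    if (!block_range_valid(start, count)) {
        fprintf(stderr, "attempt to clear blocks %llu len %lu, invalid\n",
                start, count);
        return 1;       /* caller stops the truncate walk */
    }
    /* ... zero the pointers and free the run ... */
    return 0;
}

int main(void)
{
    printf("%d\n", clear_blocks(10, 32));       /* 0: ok */
    printf("%d\n", clear_blocks(99990, 32));    /* 1: past end */
    return 0;
}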
4160/** 4333/**
@@ -4210,9 +4383,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4210 } else if (nr == block_to_free + count) { 4383 } else if (nr == block_to_free + count) {
4211 count++; 4384 count++;
4212 } else { 4385 } else {
4213 ext4_clear_blocks(handle, inode, this_bh, 4386 if (ext4_clear_blocks(handle, inode, this_bh,
4214 block_to_free, 4387 block_to_free, count,
4215 count, block_to_free_p, p); 4388 block_to_free_p, p))
4389 break;
4216 block_to_free = nr; 4390 block_to_free = nr;
4217 block_to_free_p = p; 4391 block_to_free_p = p;
4218 count = 1; 4392 count = 1;
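The caller above accumulates contiguous block numbers into one run and flushes it through ext4_clear_blocks() whenever the sequence breaks, so frees are batched rather than issued one block at a time. A standalone C sketch of that coalescing loop (error handling and the buffer-head bookkeeping are omitted):

#include <stdio.h>

/* Stand-in for the free operation on one contiguous run. */
static void free_run(unsigned long start, unsigned long count)
{
    printf("free %lu..%lu\n", start, start + count - 1);
}

/* Extend the current run while block numbers stay contiguous, flush
 * it when the sequence breaks, and skip holes (zero entries). */
static void free_coalesced(const unsigned long *blocks, int n)
{
    unsigned long start = 0, count = 0;

    for (int i = 0; i < n; i++) {
        unsigned long nr = blocks[i];
        if (!nr)
            continue;                   /* a hole */
        if (count == 0) {
            start = nr; count = 1;
        } else if (nr == start + count) {
            count++;                    /* extends the run */
        } else {
            free_run(start, count);     /* run broken: flush it */
            start = nr; count = 1;
        }
    }
    if (count)
        free_run(start, count);
}

int main(void)
{
    unsigned long blocks[] = { 100, 101, 102, 0, 200, 201, 50 };
    free_coalesced(blocks, 7);   /* frees 100..102, 200..201, 50..50 */
    return 0;
}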
@@ -4236,7 +4410,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4236 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4410 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4237 ext4_handle_dirty_metadata(handle, inode, this_bh); 4411 ext4_handle_dirty_metadata(handle, inode, this_bh);
4238 else 4412 else
4239 ext4_error(inode->i_sb, __func__, 4413 ext4_error(inode->i_sb,
4240 "circular indirect block detected, " 4414 "circular indirect block detected, "
4241 "inode=%lu, block=%llu", 4415 "inode=%lu, block=%llu",
4242 inode->i_ino, 4416 inode->i_ino,
@@ -4276,6 +4450,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4276 if (!nr) 4450 if (!nr)
4277 continue; /* A hole */ 4451 continue; /* A hole */
4278 4452
4453 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4454 nr, 1)) {
4455 ext4_error(inode->i_sb,
4456 "indirect mapped block in inode "
4457 "#%lu invalid (level %d, blk #%lu)",
4458 inode->i_ino, depth,
4459 (unsigned long) nr);
4460 break;
4461 }
4462
4279 /* Go read the buffer for the next level down */ 4463 /* Go read the buffer for the next level down */
4280 bh = sb_bread(inode->i_sb, nr); 4464 bh = sb_bread(inode->i_sb, nr);
4281 4465
@@ -4284,7 +4468,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4284 * (should be rare). 4468 * (should be rare).
4285 */ 4469 */
4286 if (!bh) { 4470 if (!bh) {
4287 ext4_error(inode->i_sb, "ext4_free_branches", 4471 ext4_error(inode->i_sb,
4288 "Read failure, inode=%lu, block=%llu", 4472 "Read failure, inode=%lu, block=%llu",
4289 inode->i_ino, nr); 4473 inode->i_ino, nr);
4290 continue; 4474 continue;
@@ -4342,7 +4526,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4342 blocks_for_truncate(inode)); 4526 blocks_for_truncate(inode));
4343 } 4527 }
4344 4528
4345 ext4_free_blocks(handle, inode, nr, 1, 1); 4529 ext4_free_blocks(handle, inode, 0, nr, 1,
4530 EXT4_FREE_BLOCKS_METADATA);
4346 4531
4347 if (parent_bh) { 4532 if (parent_bh) {
4348 /* 4533 /*
@@ -4427,8 +4612,10 @@ void ext4_truncate(struct inode *inode)
4427 if (!ext4_can_truncate(inode)) 4612 if (!ext4_can_truncate(inode))
4428 return; 4613 return;
4429 4614
4615 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
4616
4430 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4617 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4431 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 4618 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4432 4619
4433 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4620 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
4434 ext4_ext_truncate(inode); 4621 ext4_ext_truncate(inode);
@@ -4598,9 +4785,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4598 4785
4599 bh = sb_getblk(sb, block); 4786 bh = sb_getblk(sb, block);
4600 if (!bh) { 4787 if (!bh) {
4601 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 4788 ext4_error(sb, "unable to read inode block - "
4602 "inode block - inode=%lu, block=%llu", 4789 "inode=%lu, block=%llu", inode->i_ino, block);
4603 inode->i_ino, block);
4604 return -EIO; 4790 return -EIO;
4605 } 4791 }
4606 if (!buffer_uptodate(bh)) { 4792 if (!buffer_uptodate(bh)) {
@@ -4698,9 +4884,8 @@ make_io:
4698 submit_bh(READ_META, bh); 4884 submit_bh(READ_META, bh);
4699 wait_on_buffer(bh); 4885 wait_on_buffer(bh);
4700 if (!buffer_uptodate(bh)) { 4886 if (!buffer_uptodate(bh)) {
4701 ext4_error(sb, __func__, 4887 ext4_error(sb, "unable to read inode block - inode=%lu,"
4702 "unable to read inode block - inode=%lu, " 4888 " block=%llu", inode->i_ino, block);
4703 "block=%llu", inode->i_ino, block);
4704 brelse(bh); 4889 brelse(bh);
4705 return -EIO; 4890 return -EIO;
4706 } 4891 }
@@ -4714,7 +4899,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4714{ 4899{
4715 /* We have all inode data except xattrs in memory here. */ 4900 /* We have all inode data except xattrs in memory here. */
4716 return __ext4_get_inode_loc(inode, iloc, 4901 return __ext4_get_inode_loc(inode, iloc,
4717 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4902 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4718} 4903}
4719 4904
4720void ext4_set_inode_flags(struct inode *inode) 4905void ext4_set_inode_flags(struct inode *inode)
@@ -4781,8 +4966,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4781 struct ext4_iloc iloc; 4966 struct ext4_iloc iloc;
4782 struct ext4_inode *raw_inode; 4967 struct ext4_inode *raw_inode;
4783 struct ext4_inode_info *ei; 4968 struct ext4_inode_info *ei;
4784 struct buffer_head *bh;
4785 struct inode *inode; 4969 struct inode *inode;
4970 journal_t *journal = EXT4_SB(sb)->s_journal;
4786 long ret; 4971 long ret;
4787 int block; 4972 int block;
4788 4973
@@ -4793,11 +4978,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4793 return inode; 4978 return inode;
4794 4979
4795 ei = EXT4_I(inode); 4980 ei = EXT4_I(inode);
4981 iloc.bh = 0;
4796 4982
4797 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4983 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4798 if (ret < 0) 4984 if (ret < 0)
4799 goto bad_inode; 4985 goto bad_inode;
4800 bh = iloc.bh;
4801 raw_inode = ext4_raw_inode(&iloc); 4986 raw_inode = ext4_raw_inode(&iloc);
4802 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4987 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4803 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4988 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4808,7 +4993,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4808 } 4993 }
4809 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4994 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4810 4995
4811 ei->i_state = 0; 4996 ei->i_state_flags = 0;
4812 ei->i_dir_start_lookup = 0; 4997 ei->i_dir_start_lookup = 0;
4813 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4998 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4814 /* We now have enough fields to check if the inode was active or not. 4999 /* We now have enough fields to check if the inode was active or not.
@@ -4820,7 +5005,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4820 if (inode->i_mode == 0 || 5005 if (inode->i_mode == 0 ||
4821 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 5006 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4822 /* this inode is deleted */ 5007 /* this inode is deleted */
4823 brelse(bh);
4824 ret = -ESTALE; 5008 ret = -ESTALE;
4825 goto bad_inode; 5009 goto bad_inode;
4826 } 5010 }
@@ -4837,6 +5021,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4837 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 5021 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4838 inode->i_size = ext4_isize(raw_inode); 5022 inode->i_size = ext4_isize(raw_inode);
4839 ei->i_disksize = inode->i_size; 5023 ei->i_disksize = inode->i_size;
5024#ifdef CONFIG_QUOTA
5025 ei->i_reserved_quota = 0;
5026#endif
4840 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 5027 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4841 ei->i_block_group = iloc.block_group; 5028 ei->i_block_group = iloc.block_group;
4842 ei->i_last_alloc_group = ~0; 5029 ei->i_last_alloc_group = ~0;
@@ -4848,11 +5035,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4848 ei->i_data[block] = raw_inode->i_block[block]; 5035 ei->i_data[block] = raw_inode->i_block[block];
4849 INIT_LIST_HEAD(&ei->i_orphan); 5036 INIT_LIST_HEAD(&ei->i_orphan);
4850 5037
5038 /*
5039 * Set transaction id's of transactions that have to be committed
5040 * to finish f[data]sync. We set them to the currently running transaction
5041 * as we cannot be sure that the inode or some of its metadata isn't
5042 * part of the transaction - the inode could have been reclaimed and
5043 * now it is reread from disk.
5044 */
5045 if (journal) {
5046 transaction_t *transaction;
5047 tid_t tid;
5048
5049 spin_lock(&journal->j_state_lock);
5050 if (journal->j_running_transaction)
5051 transaction = journal->j_running_transaction;
5052 else
5053 transaction = journal->j_committing_transaction;
5054 if (transaction)
5055 tid = transaction->t_tid;
5056 else
5057 tid = journal->j_commit_sequence;
5058 spin_unlock(&journal->j_state_lock);
5059 ei->i_sync_tid = tid;
5060 ei->i_datasync_tid = tid;
5061 }
5062
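The block above snapshots which transaction a later fsync/fdatasync must wait on: the running transaction if there is one, else the committing one, else the last committed sequence, all read under j_state_lock so the fields are seen consistently. A standalone model of that snapshot (plain C; a pthread mutex stands in for the spinlock and the journal struct is a toy):

#include <pthread.h>
#include <stdio.h>

/* Toy journal state, modelling the j_state_lock'ed snapshot. */
struct journal {
    pthread_mutex_t state_lock;
    int have_running, have_committing;
    unsigned running_tid, committing_tid, commit_sequence;
};

/* Prefer the running transaction, then the committing one, else the
 * last committed sequence; take the lock so the reads are atomic
 * with respect to a transaction changing state. */
static unsigned pick_sync_tid(struct journal *j)
{
    unsigned tid;

    pthread_mutex_lock(&j->state_lock);
    if (j->have_running)
        tid = j->running_tid;
    else if (j->have_committing)
        tid = j->committing_tid;
    else
        tid = j->commit_sequence;
    pthread_mutex_unlock(&j->state_lock);
    return tid;
}

int main(void)
{
    struct journal j = { PTHREAD_MUTEX_INITIALIZER, 0, 1, 0, 42, 41 };
    printf("sync tid = %u\n", pick_sync_tid(&j));   /* 42: committing */
    return 0;
}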
4851 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 5063 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4852 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 5064 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4853 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 5065 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4854 EXT4_INODE_SIZE(inode->i_sb)) { 5066 EXT4_INODE_SIZE(inode->i_sb)) {
4855 brelse(bh);
4856 ret = -EIO; 5067 ret = -EIO;
4857 goto bad_inode; 5068 goto bad_inode;
4858 } 5069 }
@@ -4865,7 +5076,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4865 EXT4_GOOD_OLD_INODE_SIZE + 5076 EXT4_GOOD_OLD_INODE_SIZE +
4866 ei->i_extra_isize; 5077 ei->i_extra_isize;
4867 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 5078 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4868 ei->i_state |= EXT4_STATE_XATTR; 5079 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4869 } 5080 }
4870 } else 5081 } else
4871 ei->i_extra_isize = 0; 5082 ei->i_extra_isize = 0;
@@ -4884,12 +5095,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4884 5095
4885 ret = 0; 5096 ret = 0;
4886 if (ei->i_file_acl && 5097 if (ei->i_file_acl &&
4887 ((ei->i_file_acl < 5098 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4888 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + 5099 ext4_error(sb, "bad extended attribute block %llu inode #%lu",
4889 EXT4_SB(sb)->s_gdb_count)) ||
4890 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
4891 ext4_error(sb, __func__,
4892 "bad extended attribute block %llu in inode #%lu",
4893 ei->i_file_acl, inode->i_ino); 5100 ei->i_file_acl, inode->i_ino);
4894 ret = -EIO; 5101 ret = -EIO;
4895 goto bad_inode; 5102 goto bad_inode;
@@ -4905,10 +5112,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4905 /* Validate block references which are part of inode */ 5112 /* Validate block references which are part of inode */
4906 ret = ext4_check_inode_blockref(inode); 5113 ret = ext4_check_inode_blockref(inode);
4907 } 5114 }
4908 if (ret) { 5115 if (ret)
4909 brelse(bh);
4910 goto bad_inode; 5116 goto bad_inode;
4911 }
4912 5117
4913 if (S_ISREG(inode->i_mode)) { 5118 if (S_ISREG(inode->i_mode)) {
4914 inode->i_op = &ext4_file_inode_operations; 5119 inode->i_op = &ext4_file_inode_operations;
@@ -4936,10 +5141,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4936 init_special_inode(inode, inode->i_mode, 5141 init_special_inode(inode, inode->i_mode,
4937 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5142 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4938 } else { 5143 } else {
4939 brelse(bh);
4940 ret = -EIO; 5144 ret = -EIO;
4941 ext4_error(inode->i_sb, __func__, 5145 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
4942 "bogus i_mode (%o) for inode=%lu",
4943 inode->i_mode, inode->i_ino); 5146 inode->i_mode, inode->i_ino);
4944 goto bad_inode; 5147 goto bad_inode;
4945 } 5148 }
@@ -4949,6 +5152,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4949 return inode; 5152 return inode;
4950 5153
4951bad_inode: 5154bad_inode:
5155 brelse(iloc.bh);
4952 iget_failed(inode); 5156 iget_failed(inode);
4953 return ERR_PTR(ret); 5157 return ERR_PTR(ret);
4954} 5158}
@@ -5010,7 +5214,7 @@ static int ext4_do_update_inode(handle_t *handle,
5010 5214
5011 /* For fields not not tracking in the in-memory inode, 5215 /* For fields not not tracking in the in-memory inode,
5012 * initialise them to zero for new inodes. */ 5216 * initialise them to zero for new inodes. */
5013 if (ei->i_state & EXT4_STATE_NEW) 5217 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5014 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5218 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5015 5219
5016 ext4_get_inode_flags(ei); 5220 ext4_get_inode_flags(ei);
@@ -5074,7 +5278,7 @@ static int ext4_do_update_inode(handle_t *handle,
5074 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 5278 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5075 sb->s_dirt = 1; 5279 sb->s_dirt = 1;
5076 ext4_handle_sync(handle); 5280 ext4_handle_sync(handle);
5077 err = ext4_handle_dirty_metadata(handle, inode, 5281 err = ext4_handle_dirty_metadata(handle, NULL,
5078 EXT4_SB(sb)->s_sbh); 5282 EXT4_SB(sb)->s_sbh);
5079 } 5283 }
5080 } 5284 }
@@ -5103,11 +5307,12 @@ static int ext4_do_update_inode(handle_t *handle,
5103 } 5307 }
5104 5308
5105 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5309 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5106 rc = ext4_handle_dirty_metadata(handle, inode, bh); 5310 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5107 if (!err) 5311 if (!err)
5108 err = rc; 5312 err = rc;
5109 ei->i_state &= ~EXT4_STATE_NEW; 5313 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5110 5314
5315 ext4_update_inode_fsync_trans(handle, inode, 0);
5111out_brelse: 5316out_brelse:
5112 brelse(bh); 5317 brelse(bh);
5113 ext4_std_error(inode->i_sb, err); 5318 ext4_std_error(inode->i_sb, err);
@@ -5149,7 +5354,7 @@ out_brelse:
5149 * `stuff()' is running, and the new i_size will be lost. Plus the inode 5354 * `stuff()' is running, and the new i_size will be lost. Plus the inode
5150 * will no longer be on the superblock's dirty inode list. 5355 * will no longer be on the superblock's dirty inode list.
5151 */ 5356 */
5152int ext4_write_inode(struct inode *inode, int wait) 5357int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5153{ 5358{
5154 int err; 5359 int err;
5155 5360
@@ -5163,7 +5368,7 @@ int ext4_write_inode(struct inode *inode, int wait)
5163 return -EIO; 5368 return -EIO;
5164 } 5369 }
5165 5370
5166 if (!wait) 5371 if (wbc->sync_mode != WB_SYNC_ALL)
5167 return 0; 5372 return 0;
5168 5373
5169 err = ext4_force_commit(inode->i_sb); 5374 err = ext4_force_commit(inode->i_sb);
@@ -5173,13 +5378,11 @@ int ext4_write_inode(struct inode *inode, int wait)
5173 err = ext4_get_inode_loc(inode, &iloc); 5378 err = ext4_get_inode_loc(inode, &iloc);
5174 if (err) 5379 if (err)
5175 return err; 5380 return err;
5176 if (wait) 5381 if (wbc->sync_mode == WB_SYNC_ALL)
5177 sync_dirty_buffer(iloc.bh); 5382 sync_dirty_buffer(iloc.bh);
5178 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5383 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5179 ext4_error(inode->i_sb, __func__, 5384 ext4_error(inode->i_sb, "IO error syncing inode, "
5180 "IO error syncing inode, " 5385 "inode=%lu, block=%llu", inode->i_ino,
5181 "inode=%lu, block=%llu",
5182 inode->i_ino,
5183 (unsigned long long)iloc.bh->b_blocknr); 5386 (unsigned long long)iloc.bh->b_blocknr);
5184 err = -EIO; 5387 err = -EIO;
5185 } 5388 }
@@ -5221,19 +5424,21 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5221 if (error) 5424 if (error)
5222 return error; 5425 return error;
5223 5426
5427 if (ia_valid & ATTR_SIZE)
5428 dquot_initialize(inode);
5224 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5429 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5225 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5430 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
5226 handle_t *handle; 5431 handle_t *handle;
5227 5432
5228 /* (user+group)*(old+new) structure, inode write (sb, 5433 /* (user+group)*(old+new) structure, inode write (sb,
5229 * inode block, ? - but truncate inode update has it) */ 5434 * inode block, ? - but truncate inode update has it) */
5230 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 5435 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
5231 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 5436 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
5232 if (IS_ERR(handle)) { 5437 if (IS_ERR(handle)) {
5233 error = PTR_ERR(handle); 5438 error = PTR_ERR(handle);
5234 goto err_out; 5439 goto err_out;
5235 } 5440 }
5236 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 5441 error = dquot_transfer(inode, attr);
5237 if (error) { 5442 if (error) {
5238 ext4_journal_stop(handle); 5443 ext4_journal_stop(handle);
5239 return error; 5444 return error;
@@ -5260,7 +5465,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5260 } 5465 }
5261 5466
5262 if (S_ISREG(inode->i_mode) && 5467 if (S_ISREG(inode->i_mode) &&
5263 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 5468 attr->ia_valid & ATTR_SIZE &&
5469 (attr->ia_size < inode->i_size ||
5470 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
5264 handle_t *handle; 5471 handle_t *handle;
5265 5472
5266 handle = ext4_journal_start(inode, 3); 5473 handle = ext4_journal_start(inode, 3);
@@ -5291,6 +5498,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5291 goto err_out; 5498 goto err_out;
5292 } 5499 }
5293 } 5500 }
5501 /* ext4_truncate will clear the flag */
5502 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
5503 ext4_truncate(inode);
5294 } 5504 }
5295 5505
5296 rc = inode_setattr(inode, attr); 5506 rc = inode_setattr(inode, attr);
@@ -5376,7 +5586,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5376 * worse case, the indexs blocks spread over different block groups 5586 * worse case, the indexs blocks spread over different block groups
5377 * 5587 *
5378 * If datablocks are discontiguous, they are possible to spread over 5588 * If datablocks are discontiguous, they are possible to spread over
5379 * different block groups too. If they are contiugous, with flexbg, 5589 * different block groups too. If they are contiguous, with flexbg,
5380 * they could still across block group boundary. 5590 * they could still across block group boundary.
5381 * 5591 *
5382 * Also account for superblock, inode, quota and xattr blocks 5592 * Also account for superblock, inode, quota and xattr blocks
@@ -5452,7 +5662,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
5452 * Calculate the journal credits for a chunk of data modification. 5662 * Calculate the journal credits for a chunk of data modification.
5453 * 5663 *
5454 * This is called from DIO, fallocate or whoever calling 5664 * This is called from DIO, fallocate or whoever calling
5455 * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. 5665 * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
5456 * 5666 *
5457 * journal buffers for data blocks are not included here, as DIO 5667 * journal buffers for data blocks are not included here, as DIO
5458 * and fallocate do no need to journal data buffers. 5668 * and fallocate do no need to journal data buffers.
@@ -5529,8 +5739,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
5529 entry = IFIRST(header); 5739 entry = IFIRST(header);
5530 5740
5531 /* No extended attributes present */ 5741 /* No extended attributes present */
5532 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 5742 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5533 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5743 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5534 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5744 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5535 new_extra_isize); 5745 new_extra_isize);
5536 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5746 EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5574,7 +5784,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5574 err = ext4_reserve_inode_write(handle, inode, &iloc); 5784 err = ext4_reserve_inode_write(handle, inode, &iloc);
5575 if (ext4_handle_valid(handle) && 5785 if (ext4_handle_valid(handle) &&
5576 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5786 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5577 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 5787 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5578 /* 5788 /*
5579 * We need extra buffer credits since we may write into EA block 5789 * We need extra buffer credits since we may write into EA block
5580 * with this same handle. If journal_extend fails, then it will 5790 * with this same handle. If journal_extend fails, then it will
@@ -5588,10 +5798,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5588 sbi->s_want_extra_isize, 5798 sbi->s_want_extra_isize,
5589 iloc, handle); 5799 iloc, handle);
5590 if (ret) { 5800 if (ret) {
5591 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 5801 ext4_set_inode_state(inode,
5802 EXT4_STATE_NO_EXPAND);
5592 if (mnt_count != 5803 if (mnt_count !=
5593 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5804 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5594 ext4_warning(inode->i_sb, __func__, 5805 ext4_warning(inode->i_sb,
5595 "Unable to expand inode %lu. Delete" 5806 "Unable to expand inode %lu. Delete"
5596 " some EAs or run e2fsck.", 5807 " some EAs or run e2fsck.",
5597 inode->i_ino); 5808 inode->i_ino);
@@ -5613,7 +5824,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5613 * i_size has been changed by generic_commit_write() and we thus need 5824 * i_size has been changed by generic_commit_write() and we thus need
5614 * to include the updated inode in the current transaction. 5825 * to include the updated inode in the current transaction.
5615 * 5826 *
5616 * Also, vfs_dq_alloc_block() will always dirty the inode when blocks 5827 * Also, dquot_alloc_block() will always dirty the inode when blocks
5617 * are allocated to the file. 5828 * are allocated to the file.
5618 * 5829 *
5619 * If the inode is marked synchronous, we don't honour that here - doing 5830 * If the inode is marked synchronous, we don't honour that here - doing
@@ -5655,7 +5866,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5655 err = jbd2_journal_get_write_access(handle, iloc.bh); 5866 err = jbd2_journal_get_write_access(handle, iloc.bh);
5656 if (!err) 5867 if (!err)
5657 err = ext4_handle_dirty_metadata(handle, 5868 err = ext4_handle_dirty_metadata(handle,
5658 inode, 5869 NULL,
5659 iloc.bh); 5870 iloc.bh);
5660 brelse(iloc.bh); 5871 brelse(iloc.bh);
5661 } 5872 }
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c1cdf613e725..016d0249294f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
92 flags &= ~EXT4_EXTENTS_FL; 92 flags &= ~EXT4_EXTENTS_FL;
93 } 93 }
94 94
95 if (flags & EXT4_EOFBLOCKS_FL) {
96 /* we don't support adding EOFBLOCKS flag */
97 if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
98 err = -EOPNOTSUPP;
99 goto flags_out;
100 }
101 } else if (oldflags & EXT4_EOFBLOCKS_FL)
102 ext4_truncate(inode);
103
95 handle = ext4_journal_start(inode, 1); 104 handle = ext4_journal_start(inode, 1);
96 if (IS_ERR(handle)) { 105 if (IS_ERR(handle)) {
97 err = PTR_ERR(handle); 106 err = PTR_ERR(handle);
@@ -221,31 +230,39 @@ setversion_out:
221 struct file *donor_filp; 230 struct file *donor_filp;
222 int err; 231 int err;
223 232
233 if (!(filp->f_mode & FMODE_READ) ||
234 !(filp->f_mode & FMODE_WRITE))
235 return -EBADF;
236
224 if (copy_from_user(&me, 237 if (copy_from_user(&me,
225 (struct move_extent __user *)arg, sizeof(me))) 238 (struct move_extent __user *)arg, sizeof(me)))
226 return -EFAULT; 239 return -EFAULT;
240 me.moved_len = 0;
227 241
228 donor_filp = fget(me.donor_fd); 242 donor_filp = fget(me.donor_fd);
229 if (!donor_filp) 243 if (!donor_filp)
230 return -EBADF; 244 return -EBADF;
231 245
232 if (!capable(CAP_DAC_OVERRIDE)) { 246 if (!(donor_filp->f_mode & FMODE_WRITE)) {
233 if ((current->real_cred->fsuid != inode->i_uid) || 247 err = -EBADF;
234 !(inode->i_mode & S_IRUSR) || 248 goto mext_out;
235 !(donor_filp->f_dentry->d_inode->i_mode &
236 S_IRUSR)) {
237 fput(donor_filp);
238 return -EACCES;
239 }
240 } 249 }
241 250
251 err = mnt_want_write(filp->f_path.mnt);
252 if (err)
253 goto mext_out;
254
242 err = ext4_move_extents(filp, donor_filp, me.orig_start, 255 err = ext4_move_extents(filp, donor_filp, me.orig_start,
243 me.donor_start, me.len, &me.moved_len); 256 me.donor_start, me.len, &me.moved_len);
244 fput(donor_filp); 257 mnt_drop_write(filp->f_path.mnt);
245 258 if (me.moved_len > 0)
246 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) 259 file_remove_suid(donor_filp);
247 return -EFAULT;
248 260
261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me)))
263 err = -EFAULT;
264mext_out:
265 fput(donor_filp);
249 return err; 266 return err;
250 } 267 }
251 268
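The rewritten EXT4_IOC_MOVE_EXT checks above replace the old CAP_DAC_OVERRIDE/uid test with plain open-mode checks: the origin descriptor must be open for both read and write, and the donor must be writable. A standalone sketch of that check (plain C; the FMODE_* values here are illustrative):

#include <errno.h>
#include <stdio.h>

#define FMODE_READ  0x1
#define FMODE_WRITE 0x2

/* Both files must have been opened with enough access for the
 * extent swap; otherwise fail with EBADF, as the hunk does. */
static int check_move_extent_modes(unsigned orig_mode, unsigned donor_mode)
{
    if (!(orig_mode & FMODE_READ) || !(orig_mode & FMODE_WRITE))
        return -EBADF;
    if (!(donor_mode & FMODE_WRITE))
        return -EBADF;
    return 0;
}

int main(void)
{
    printf("%d\n", check_move_extent_modes(FMODE_READ | FMODE_WRITE,
                                           FMODE_WRITE));            /* 0 */
    printf("%d\n", check_move_extent_modes(FMODE_READ, FMODE_WRITE)); /* -EBADF */
    return 0;
}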
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bba12824defa..bde9d0b170c2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,6 +23,7 @@
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/slab.h>
26#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
27 28
28/* 29/*
@@ -69,7 +70,7 @@
69 * 70 *
70 * pa_lstart -> the logical start block for this prealloc space 71 * pa_lstart -> the logical start block for this prealloc space
71 * pa_pstart -> the physical start block for this prealloc space 72 * pa_pstart -> the physical start block for this prealloc space
72 * pa_len -> lenght for this prealloc space 73 * pa_len -> length for this prealloc space
73 * pa_free -> free space available in this prealloc space 74 * pa_free -> free space available in this prealloc space
74 * 75 *
75 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
@@ -142,7 +143,7 @@
142 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 143 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
143 * value of s_mb_order2_reqs can be tuned via 144 * value of s_mb_order2_reqs can be tuned via
144 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to 145 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
145 * stripe size (sbi->s_stripe), we try to search for contigous block in 146 * stripe size (sbi->s_stripe), we try to search for contiguous block in
146 * stripe size. This should result in better allocation on RAID setups. If 147 * stripe size. This should result in better allocation on RAID setups. If
147 * not, we search in the specific group using bitmap for best extents. The 148 * not, we search in the specific group using bitmap for best extents. The
148 * tunable min_to_scan and max_to_scan control the behaviour here. 149 * tunable min_to_scan and max_to_scan control the behaviour here.
@@ -441,10 +442,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
441 for (i = 0; i < count; i++) { 442 for (i = 0; i < count; i++) {
442 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 443 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
443 ext4_fsblk_t blocknr; 444 ext4_fsblk_t blocknr;
444 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 445
446 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
445 blocknr += first + i; 447 blocknr += first + i;
446 blocknr +=
447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
448 ext4_grp_locked_error(sb, e4b->bd_group, 448 ext4_grp_locked_error(sb, e4b->bd_group,
449 __func__, "double-free of inode" 449 __func__, "double-free of inode"
450 " %lu's block %llu(bit %u in group %u)", 450 " %lu's block %llu(bit %u in group %u)",
@@ -1255,10 +1255,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1255 1255
1256 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1256 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1257 ext4_fsblk_t blocknr; 1257 ext4_fsblk_t blocknr;
1258 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 1258
1259 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1259 blocknr += block; 1260 blocknr += block;
1260 blocknr +=
1261 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1262 ext4_grp_locked_error(sb, e4b->bd_group, 1261 ext4_grp_locked_error(sb, e4b->bd_group,
1263 __func__, "double-free of inode" 1262 __func__, "double-free of inode"
1264 " %lu's block %llu(bit %u in group %u)", 1263 " %lu's block %llu(bit %u in group %u)",
@@ -1631,7 +1630,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1631 int max; 1630 int max;
1632 int err; 1631 int err;
1633 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1632 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1634 struct ext4_super_block *es = sbi->s_es;
1635 struct ext4_free_extent ex; 1633 struct ext4_free_extent ex;
1636 1634
1637 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1635 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
@@ -1648,8 +1646,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1648 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1646 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1649 ext4_fsblk_t start; 1647 ext4_fsblk_t start;
1650 1648
1651 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + 1649 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1652 ex.fe_start + le32_to_cpu(es->s_first_data_block); 1650 ex.fe_start;
1653 /* use do_div to get remainder (would be 64-bit modulo) */ 1651 /* use do_div to get remainder (would be 64-bit modulo) */
1654 if (do_div(start, sbi->s_stripe) == 0) { 1652 if (do_div(start, sbi->s_stripe) == 0) {
1655 ac->ac_found++; 1653 ac->ac_found++;
@@ -1803,8 +1801,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1803 BUG_ON(sbi->s_stripe == 0); 1801 BUG_ON(sbi->s_stripe == 0);
1804 1802
1805 /* find first stripe-aligned block in group */ 1803 /* find first stripe-aligned block in group */
1806 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) 1804 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1807 + le32_to_cpu(sbi->s_es->s_first_data_block); 1805
1808 a = first_group_block + sbi->s_stripe - 1; 1806 a = first_group_block + sbi->s_stripe - 1;
1809 do_div(a, sbi->s_stripe); 1807 do_div(a, sbi->s_stripe);
1810 i = (a * sbi->s_stripe) - first_group_block; 1808 i = (a * sbi->s_stripe) - first_group_block;
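The stripe-alignment arithmetic above rounds the group's first block up to the next stripe multiple and converts it back into an in-group offset; do_div() is used because this is 64-bit division on 32-bit hosts. The same computation as a standalone C function (plain division stands in for do_div):

#include <stdio.h>

/* Round 'first_group_block' up to the next multiple of 'stripe' and
 * return that block's offset inside the group, as the sequence
 *   a = first_group_block + stripe - 1; do_div(a, stripe);
 *   i = a * stripe - first_group_block;
 * does in the hunk above. */
static unsigned long first_aligned_offset(unsigned long long first_group_block,
                                          unsigned long stripe)
{
    unsigned long long a = (first_group_block + stripe - 1) / stripe;
    return (unsigned long)(a * stripe - first_group_block);
}

int main(void)
{
    /* group starts at block 32769, stripe of 16: the next multiple
     * is 32784, i.e. offset 15 into the group */
    printf("%lu\n", first_aligned_offset(32769, 16));
    return 0;
}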
@@ -2256,7 +2254,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2256 2254
2257 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2255 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2258 init_rwsem(&meta_group_info[i]->alloc_sem); 2256 init_rwsem(&meta_group_info[i]->alloc_sem);
2259 meta_group_info[i]->bb_free_root.rb_node = NULL; 2257 meta_group_info[i]->bb_free_root = RB_ROOT;
2260 2258
2261#ifdef DOUBLE_CHECK 2259#ifdef DOUBLE_CHECK
2262 { 2260 {
@@ -2529,7 +2527,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2529 struct ext4_group_info *db; 2527 struct ext4_group_info *db;
2530 int err, count = 0, count2 = 0; 2528 int err, count = 0, count2 = 0;
2531 struct ext4_free_data *entry; 2529 struct ext4_free_data *entry;
2532 ext4_fsblk_t discard_block;
2533 struct list_head *l, *ltmp; 2530 struct list_head *l, *ltmp;
2534 2531
2535 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2532 list_for_each_safe(l, ltmp, &txn->t_private_list) {
@@ -2559,13 +2556,16 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2559 page_cache_release(e4b.bd_bitmap_page); 2556 page_cache_release(e4b.bd_bitmap_page);
2560 } 2557 }
2561 ext4_unlock_group(sb, entry->group); 2558 ext4_unlock_group(sb, entry->group);
2562 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2559 if (test_opt(sb, DISCARD)) {
2563 + entry->start_blk 2560 ext4_fsblk_t discard_block;
2564 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2561
2565 trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, 2562 discard_block = entry->start_blk +
2566 entry->count); 2563 ext4_group_first_block_no(sb, entry->group);
2567 sb_issue_discard(sb, discard_block, entry->count); 2564 trace_ext4_discard_blocks(sb,
2568 2565 (unsigned long long)discard_block,
2566 entry->count);
2567 sb_issue_discard(sb, discard_block, entry->count);
2568 }
2569 kmem_cache_free(ext4_free_ext_cachep, entry); 2569 kmem_cache_free(ext4_free_ext_cachep, entry);
2570 ext4_mb_release_desc(&e4b); 2570 ext4_mb_release_desc(&e4b);
2571 } 2571 }
@@ -2698,14 +2698,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2698 if (err) 2698 if (err)
2699 goto out_err; 2699 goto out_err;
2700 2700
2701 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) 2701 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2702 + ac->ac_b_ex.fe_start
2703 + le32_to_cpu(es->s_first_data_block);
2704 2702
2705 len = ac->ac_b_ex.fe_len; 2703 len = ac->ac_b_ex.fe_len;
2706 if (!ext4_data_block_valid(sbi, block, len)) { 2704 if (!ext4_data_block_valid(sbi, block, len)) {
2707 ext4_error(sb, __func__, 2705 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2708 "Allocating blocks %llu-%llu which overlap "
2709 "fs metadata\n", block, block+len); 2706 "fs metadata\n", block, block+len);
2710 /* File system mounted not to panic on error 2707 /* File system mounted not to panic on error
2711 * Fix the bitmap and repeat the block allocation 2708 * Fix the bitmap and repeat the block allocation
@@ -2750,12 +2747,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2750 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2747 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2751 /* release all the reserved blocks if non delalloc */ 2748 /* release all the reserved blocks if non delalloc */
2752 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); 2749 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
2753 else {
2754 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
2755 ac->ac_b_ex.fe_len);
2756 /* convert reserved quota blocks to real quota blocks */
2757 vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
2758 }
2759 2750
2760 if (sbi->s_log_groups_per_flex) { 2751 if (sbi->s_log_groups_per_flex) {
2761 ext4_group_t flex_group = ext4_flex_group(sbi, 2752 ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -3006,6 +2997,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3006} 2997}
3007 2998
3008/* 2999/*
3000 * Called on failure; free up any blocks from the inode PA for this
3001 * context. We don't need this for MB_GROUP_PA because we only change
3002 * pa_free in ext4_mb_release_context(), but on failure, we've already
3003 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3004 */
3005static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3006{
3007 struct ext4_prealloc_space *pa = ac->ac_pa;
3008 int len;
3009
3010 if (pa && pa->pa_type == MB_INODE_PA) {
3011 len = ac->ac_b_ex.fe_len;
3012 pa->pa_free += len;
3013 }
3014
3015}
3016
3017/*
3009 * use blocks preallocated to inode 3018 * use blocks preallocated to inode
3010 */ 3019 */
3011static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3020static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
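The helper above exists for ordering reasons: once the error path in ext4_mb_new_blocks() clears ac->ac_b_ex.fe_len, the record of how many blocks were carved out of the inode PA is gone, so the pa_free credit has to be restored first. A condensed sketch of the call site (the full hunk appears further down in this patch):

        } else if (*errp) {
                /* credit the blocks back while ac_b_ex.fe_len still
                 * records how many were taken from the inode PA */
                ext4_discard_allocated_blocks(ac);
                ac->ac_b_ex.fe_len = 0;
                ar->len = 0;
                ext4_mb_show_ac(ac);
        }

Group PAs need no such repair: as the comment notes, their pa_free is only touched in ext4_mb_release_context(), which already sees the zeroed length.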
@@ -3144,9 +3153,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3144 /* The max size of hash table is PREALLOC_TB_SIZE */ 3153 /* The max size of hash table is PREALLOC_TB_SIZE */
3145 order = PREALLOC_TB_SIZE - 1; 3154 order = PREALLOC_TB_SIZE - 1;
3146 3155
3147 goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + 3156 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3148 ac->ac_g_ex.fe_start +
3149 le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
3150 /* 3157 /*
3151 * search for the prealloc space that is having 3158 * search for the prealloc space that is having
3152 * minimal distance from the goal block. 3159 * minimal distance from the goal block.
@@ -3509,8 +3516,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3509 if (bit >= end) 3516 if (bit >= end)
3510 break; 3517 break;
3511 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3518 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3512 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3519 start = ext4_group_first_block_no(sb, group) + bit;
3513 le32_to_cpu(sbi->s_es->s_first_data_block);
3514 mb_debug(1, " free preallocated %u/%u in group %u\n", 3520 mb_debug(1, " free preallocated %u/%u in group %u\n",
3515 (unsigned) start, (unsigned) next - bit, 3521 (unsigned) start, (unsigned) next - bit,
3516 (unsigned) group); 3522 (unsigned) group);
@@ -3606,15 +3612,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3606 3612
3607 bitmap_bh = ext4_read_block_bitmap(sb, group); 3613 bitmap_bh = ext4_read_block_bitmap(sb, group);
3608 if (bitmap_bh == NULL) { 3614 if (bitmap_bh == NULL) {
3609 ext4_error(sb, __func__, "Error in reading block " 3615 ext4_error(sb, "Error reading block bitmap for %u", group);
3610 "bitmap for %u", group);
3611 return 0; 3616 return 0;
3612 } 3617 }
3613 3618
3614 err = ext4_mb_load_buddy(sb, group, &e4b); 3619 err = ext4_mb_load_buddy(sb, group, &e4b);
3615 if (err) { 3620 if (err) {
3616 ext4_error(sb, __func__, "Error in loading buddy " 3621 ext4_error(sb, "Error loading buddy information for %u", group);
3617 "information for %u", group);
3618 put_bh(bitmap_bh); 3622 put_bh(bitmap_bh);
3619 return 0; 3623 return 0;
3620 } 3624 }
@@ -3787,15 +3791,15 @@ repeat:
3787 3791
3788 err = ext4_mb_load_buddy(sb, group, &e4b); 3792 err = ext4_mb_load_buddy(sb, group, &e4b);
3789 if (err) { 3793 if (err) {
3790 ext4_error(sb, __func__, "Error in loading buddy " 3794 ext4_error(sb, "Error loading buddy information for %u",
3791 "information for %u", group); 3795 group);
3792 continue; 3796 continue;
3793 } 3797 }
3794 3798
3795 bitmap_bh = ext4_read_block_bitmap(sb, group); 3799 bitmap_bh = ext4_read_block_bitmap(sb, group);
3796 if (bitmap_bh == NULL) { 3800 if (bitmap_bh == NULL) {
3797 ext4_error(sb, __func__, "Error in reading block " 3801 ext4_error(sb, "Error reading block bitmap for %u",
3798 "bitmap for %u", group); 3802 group);
3799 ext4_mb_release_desc(&e4b); 3803 ext4_mb_release_desc(&e4b);
3800 continue; 3804 continue;
3801 } 3805 }
@@ -3921,7 +3925,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3921 3925
3922 /* don't use group allocation for large files */ 3926 /* don't use group allocation for large files */
3923 size = max(size, isize); 3927 size = max(size, isize);
3924 if (size >= sbi->s_mb_stream_request) { 3928 if (size > sbi->s_mb_stream_request) {
3925 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3929 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
3926 return; 3930 return;
3927 } 3931 }
@@ -3932,7 +3936,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3932 * per cpu locality group is to reduce the contention between block 3936 * per cpu locality group is to reduce the contention between block
3933 * request from multiple CPUs. 3937 * request from multiple CPUs.
3934 */ 3938 */
3935 ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id()); 3939 ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
3936 3940
3937 /* we're going to use group allocation */ 3941 /* we're going to use group allocation */
3938 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 3942 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
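The locality-group change above is a pure cleanup; under the percpu API of this kernel generation the two forms resolve to the same pointer. A minimal sketch of the equivalence (lg1/lg2 are illustrative locals, not from the patch):

        struct ext4_locality_group *lg1, *lg2;

        lg1 = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
        lg2 = __this_cpu_ptr(sbi->s_locality_groups);
        /* lg1 == lg2 unless the task migrates between the two lines */

The choice of CPU does not need to be exact here: the group only spreads allocator contention across CPUs, which is presumably why the preemption-tolerant raw_ variant was already in use.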
@@ -4060,8 +4064,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4060 4064
4061 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4065 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4062 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4066 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4063 ext4_error(sb, __func__, "Error in loading buddy " 4067 ext4_error(sb, "Error loading buddy information for %u",
4064 "information for %u", group); 4068 group);
4065 continue; 4069 continue;
4066 } 4070 }
4067 ext4_lock_group(sb, group); 4071 ext4_lock_group(sb, group);
@@ -4237,7 +4241,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4237 return 0; 4241 return 0;
4238 } 4242 }
4239 reserv_blks = ar->len; 4243 reserv_blks = ar->len;
4240 while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) { 4244 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
4241 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4245 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4242 ar->len--; 4246 ar->len--;
4243 } 4247 }
@@ -4290,6 +4294,7 @@ repeat:
4290 ac->ac_status = AC_STATUS_CONTINUE; 4294 ac->ac_status = AC_STATUS_CONTINUE;
4291 goto repeat; 4295 goto repeat;
4292 } else if (*errp) { 4296 } else if (*errp) {
4297 ext4_discard_allocated_blocks(ac);
4293 ac->ac_b_ex.fe_len = 0; 4298 ac->ac_b_ex.fe_len = 0;
4294 ar->len = 0; 4299 ar->len = 0;
4295 ext4_mb_show_ac(ac); 4300 ext4_mb_show_ac(ac);
@@ -4313,7 +4318,7 @@ out2:
4313 kmem_cache_free(ext4_ac_cachep, ac); 4318 kmem_cache_free(ext4_ac_cachep, ac);
4314out1: 4319out1:
4315 if (inquota && ar->len < inquota) 4320 if (inquota && ar->len < inquota)
4316 vfs_dq_free_block(ar->inode, inquota - ar->len); 4321 dquot_free_block(ar->inode, inquota - ar->len);
4317out3: 4322out3:
4318 if (!ar->len) { 4323 if (!ar->len) {
4319 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4324 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
@@ -4422,18 +4427,24 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4422 return 0; 4427 return 0;
4423} 4428}
4424 4429
4425/* 4430/**
4426 * Main entry point into mballoc to free blocks 4431 * ext4_free_blocks() -- Free given blocks and update quota
4432 * @handle: handle for this transaction
4433 * @inode: inode
4434 * @block: start physical block to free
 4435 * @count: number of blocks to free
 4436 * @flags: EXT4_FREE_BLOCKS_* flags describing how the blocks are freed
4427 */ 4437 */
4428void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, 4438void ext4_free_blocks(handle_t *handle, struct inode *inode,
4429 ext4_fsblk_t block, unsigned long count, 4439 struct buffer_head *bh, ext4_fsblk_t block,
4430 int metadata, unsigned long *freed) 4440 unsigned long count, int flags)
4431{ 4441{
4432 struct buffer_head *bitmap_bh = NULL; 4442 struct buffer_head *bitmap_bh = NULL;
4433 struct super_block *sb = inode->i_sb; 4443 struct super_block *sb = inode->i_sb;
4434 struct ext4_allocation_context *ac = NULL; 4444 struct ext4_allocation_context *ac = NULL;
4435 struct ext4_group_desc *gdp; 4445 struct ext4_group_desc *gdp;
4436 struct ext4_super_block *es; 4446 struct ext4_super_block *es;
4447 unsigned long freed = 0;
4437 unsigned int overflow; 4448 unsigned int overflow;
4438 ext4_grpblk_t bit; 4449 ext4_grpblk_t bit;
4439 struct buffer_head *gd_bh; 4450 struct buffer_head *gd_bh;
@@ -4443,21 +4454,49 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4443 int err = 0; 4454 int err = 0;
4444 int ret; 4455 int ret;
4445 4456
4446 *freed = 0; 4457 if (bh) {
4458 if (block)
4459 BUG_ON(block != bh->b_blocknr);
4460 else
4461 block = bh->b_blocknr;
4462 }
4447 4463
4448 sbi = EXT4_SB(sb); 4464 sbi = EXT4_SB(sb);
4449 es = EXT4_SB(sb)->s_es; 4465 es = EXT4_SB(sb)->s_es;
4450 if (block < le32_to_cpu(es->s_first_data_block) || 4466 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4451 block + count < block || 4467 !ext4_data_block_valid(sbi, block, count)) {
4452 block + count > ext4_blocks_count(es)) { 4468 ext4_error(sb, "Freeing blocks not in datazone - "
4453 ext4_error(sb, __func__, 4469 "block = %llu, count = %lu", block, count);
4454 "Freeing blocks not in datazone - "
4455 "block = %llu, count = %lu", block, count);
4456 goto error_return; 4470 goto error_return;
4457 } 4471 }
4458 4472
4459 ext4_debug("freeing block %llu\n", block); 4473 ext4_debug("freeing block %llu\n", block);
4460 trace_ext4_free_blocks(inode, block, count, metadata); 4474 trace_ext4_free_blocks(inode, block, count, flags);
4475
4476 if (flags & EXT4_FREE_BLOCKS_FORGET) {
4477 struct buffer_head *tbh = bh;
4478 int i;
4479
4480 BUG_ON(bh && (count > 1));
4481
4482 for (i = 0; i < count; i++) {
4483 if (!bh)
4484 tbh = sb_find_get_block(inode->i_sb,
4485 block + i);
4486 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4487 inode, tbh, block + i);
4488 }
4489 }
4490
4491 /*
4492 * We need to make sure we don't reuse the freed block until
4493 * after the transaction is committed, which we can do by
4494 * treating the block as metadata, below. We make an
4495 * exception if the inode is to be written in writeback mode
4496 * since writeback mode has weak data consistency guarantees.
4497 */
4498 if (!ext4_should_writeback_data(inode))
4499 flags |= EXT4_FREE_BLOCKS_METADATA;
4461 4500
4462 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4501 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4463 if (ac) { 4502 if (ac) {
@@ -4495,8 +4534,7 @@ do_more:
4495 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4534 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4496 EXT4_SB(sb)->s_itb_per_group)) { 4535 EXT4_SB(sb)->s_itb_per_group)) {
4497 4536
4498 ext4_error(sb, __func__, 4537 ext4_error(sb, "Freeing blocks in system zone - "
4499 "Freeing blocks in system zone - "
4500 "Block = %llu, count = %lu", block, count); 4538 "Block = %llu, count = %lu", block, count);
4501 /* err = 0. ext4_std_error should be a no op */ 4539 /* err = 0. ext4_std_error should be a no op */
4502 goto error_return; 4540 goto error_return;
@@ -4533,7 +4571,8 @@ do_more:
4533 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4571 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4534 if (err) 4572 if (err)
4535 goto error_return; 4573 goto error_return;
4536 if (metadata && ext4_handle_valid(handle)) { 4574
4575 if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
4537 struct ext4_free_data *new_entry; 4576 struct ext4_free_data *new_entry;
4538 /* 4577 /*
4539 * blocks being freed are metadata. these blocks shouldn't 4578 * blocks being freed are metadata. these blocks shouldn't
@@ -4572,7 +4611,7 @@ do_more:
4572 4611
4573 ext4_mb_release_desc(&e4b); 4612 ext4_mb_release_desc(&e4b);
4574 4613
4575 *freed += count; 4614 freed += count;
4576 4615
4577 /* We dirtied the bitmap block */ 4616 /* We dirtied the bitmap block */
4578 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4617 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -4592,6 +4631,8 @@ do_more:
4592 } 4631 }
4593 sb->s_dirt = 1; 4632 sb->s_dirt = 1;
4594error_return: 4633error_return:
4634 if (freed)
4635 dquot_free_block(inode, freed);
4595 brelse(bitmap_bh); 4636 brelse(bitmap_bh);
4596 ext4_std_error(sb, err); 4637 ext4_std_error(sb, err);
4597 if (ac) 4638 if (ac)
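Taken together, the hunks above fold three duties into the callee: quota release (the dquot_free_block() call at error_return), bforget handling (EXT4_FREE_BLOCKS_FORGET), and the reuse-after-commit decision (EXT4_FREE_BLOCKS_METADATA is forced unless the inode uses writeback mode). A before/after sketch for a caller freeing one metadata block; the old-style quota release is illustrative of the old division of labor, not a quote of a specific call site:

        /* before: caller gets a count back and settles quota itself */
        unsigned long freed;
        ext4_mb_free_blocks(handle, inode, blk, 1, 1 /* metadata */, &freed);
        vfs_dq_free_block(inode, freed);

        /* after: one call; behaviour is selected through flags */
        ext4_free_blocks(handle, inode, NULL /* no bh */, blk, 1,
                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);

The migrate.c hunks below show exactly this conversion, passing 0 in the buffer-head slot.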
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 0ca811061bc7..b619322c76f0 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -17,7 +17,6 @@
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/pagemap.h> 18#include <linux/pagemap.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/version.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/mutex.h> 21#include <linux/mutex.h>
23#include "ext4_jbd2.h" 22#include "ext4_jbd2.h"
@@ -221,16 +220,9 @@ struct ext4_buddy {
221#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
222#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
223 222
224#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
225
226static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 223static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
227 struct ext4_free_extent *fex) 224 struct ext4_free_extent *fex)
228{ 225{
229 ext4_fsblk_t block; 226 return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
230
231 block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
232 + fex->fe_start
233 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
234 return block;
235} 227}
236#endif 228#endif
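Every open-coded conversion removed by this patch reduces to the same identity, now kept in one place. As a comment-style sketch (ext4_group_first_block_no() is the helper from ext4.h):

        /*
         * ext4_group_first_block_no(sb, group)
         *     == (ext4_fsblk_t)group * EXT4_BLOCKS_PER_GROUP(sb)
         *        + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)
         *
         * so ext4_grp_offs_to_block() returns the same block as before,
         * with the 64-bit widening and the s_first_data_block offset
         * handled once instead of at every call site.
         */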
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a93d5b80f3e2..34dcfc52ef44 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -13,6 +13,7 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
17#include "ext4_extents.h" 18#include "ext4_extents.h"
18 19
@@ -238,7 +239,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
238 * So allocate a credit of 3. We may update 239 * So allocate a credit of 3. We may update
239 * quota (user and group). 240 * quota (user and group).
240 */ 241 */
241 needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 242 needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
242 243
243 if (ext4_journal_extend(handle, needed) != 0) 244 if (ext4_journal_extend(handle, needed) != 0)
244 retval = ext4_journal_restart(handle, needed); 245 retval = ext4_journal_restart(handle, needed);
@@ -262,13 +263,17 @@ static int free_dind_blocks(handle_t *handle,
262 for (i = 0; i < max_entries; i++) { 263 for (i = 0; i < max_entries; i++) {
263 if (tmp_idata[i]) { 264 if (tmp_idata[i]) {
264 extend_credit_for_blkdel(handle, inode); 265 extend_credit_for_blkdel(handle, inode);
265 ext4_free_blocks(handle, inode, 266 ext4_free_blocks(handle, inode, 0,
266 le32_to_cpu(tmp_idata[i]), 1, 1); 267 le32_to_cpu(tmp_idata[i]), 1,
268 EXT4_FREE_BLOCKS_METADATA |
269 EXT4_FREE_BLOCKS_FORGET);
267 } 270 }
268 } 271 }
269 put_bh(bh); 272 put_bh(bh);
270 extend_credit_for_blkdel(handle, inode); 273 extend_credit_for_blkdel(handle, inode);
271 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 274 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
275 EXT4_FREE_BLOCKS_METADATA |
276 EXT4_FREE_BLOCKS_FORGET);
272 return 0; 277 return 0;
273} 278}
274 279
@@ -297,7 +302,9 @@ static int free_tind_blocks(handle_t *handle,
297 } 302 }
298 put_bh(bh); 303 put_bh(bh);
299 extend_credit_for_blkdel(handle, inode); 304 extend_credit_for_blkdel(handle, inode);
300 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 305 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
306 EXT4_FREE_BLOCKS_METADATA |
307 EXT4_FREE_BLOCKS_FORGET);
301 return 0; 308 return 0;
302} 309}
303 310
@@ -308,8 +315,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
308 /* ei->i_data[EXT4_IND_BLOCK] */ 315 /* ei->i_data[EXT4_IND_BLOCK] */
309 if (i_data[0]) { 316 if (i_data[0]) {
310 extend_credit_for_blkdel(handle, inode); 317 extend_credit_for_blkdel(handle, inode);
311 ext4_free_blocks(handle, inode, 318 ext4_free_blocks(handle, inode, 0,
312 le32_to_cpu(i_data[0]), 1, 1); 319 le32_to_cpu(i_data[0]), 1,
320 EXT4_FREE_BLOCKS_METADATA |
321 EXT4_FREE_BLOCKS_FORGET);
313 } 322 }
314 323
315 /* ei->i_data[EXT4_DIND_BLOCK] */ 324 /* ei->i_data[EXT4_DIND_BLOCK] */
@@ -357,12 +366,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
357 * happened after we started the migrate. We need to 366 * happened after we started the migrate. We need to
358 * fail the migrate 367 * fail the migrate
359 */ 368 */
360 if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { 369 if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
361 retval = -EAGAIN; 370 retval = -EAGAIN;
362 up_write(&EXT4_I(inode)->i_data_sem); 371 up_write(&EXT4_I(inode)->i_data_sem);
363 goto err_out; 372 goto err_out;
364 } else 373 } else
365 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 374 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
366 /* 375 /*
367 * We have the extent map build with the tmp inode. 376 * We have the extent map build with the tmp inode.
368 * Now copy the i_data across 377 * Now copy the i_data across
@@ -419,7 +428,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
419 } 428 }
420 put_bh(bh); 429 put_bh(bh);
421 extend_credit_for_blkdel(handle, inode); 430 extend_credit_for_blkdel(handle, inode);
422 ext4_free_blocks(handle, inode, block, 1, 1); 431 ext4_free_blocks(handle, inode, 0, block, 1,
432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
423 return retval; 433 return retval;
424} 434}
425 435
@@ -477,7 +487,7 @@ int ext4_ext_migrate(struct inode *inode)
477 handle = ext4_journal_start(inode, 487 handle = ext4_journal_start(inode,
478 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 488 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
479 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 489 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
480 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) 490 EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
481 + 1); 491 + 1);
482 if (IS_ERR(handle)) { 492 if (IS_ERR(handle)) {
483 retval = PTR_ERR(handle); 493 retval = PTR_ERR(handle);
@@ -494,14 +504,10 @@ int ext4_ext_migrate(struct inode *inode)
494 } 504 }
495 i_size_write(tmp_inode, i_size_read(inode)); 505 i_size_write(tmp_inode, i_size_read(inode));
496 /* 506 /*
497 * We don't want the inode to be reclaimed 507 * Set the i_nlink to zero so it will be deleted later
498 * if we got interrupted in between. We have 508 * when we drop the inode reference.
499 * this tmp inode carrying reference to the
500 * data blocks of the original file. We set
501 * the i_nlink to zero at the last stage after
502 * switching the original file to extent format
503 */ 509 */
504 tmp_inode->i_nlink = 1; 510 tmp_inode->i_nlink = 0;
505 511
506 ext4_ext_tree_init(handle, tmp_inode); 512 ext4_ext_tree_init(handle, tmp_inode);
507 ext4_orphan_add(handle, tmp_inode); 513 ext4_orphan_add(handle, tmp_inode);
@@ -524,10 +530,20 @@ int ext4_ext_migrate(struct inode *inode)
524 * allocation. 530 * allocation.
525 */ 531 */
526 down_read((&EXT4_I(inode)->i_data_sem)); 532 down_read((&EXT4_I(inode)->i_data_sem));
527 EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; 533 ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
528 up_read((&EXT4_I(inode)->i_data_sem)); 534 up_read((&EXT4_I(inode)->i_data_sem));
529 535
530 handle = ext4_journal_start(inode, 1); 536 handle = ext4_journal_start(inode, 1);
537 if (IS_ERR(handle)) {
538 /*
539 * It is impossible to update on-disk structures without
540 * a handle, so just rollback in-core changes and live other
541 * work to orphan_list_cleanup()
542 */
543 ext4_orphan_del(NULL, tmp_inode);
544 retval = PTR_ERR(handle);
545 goto out;
546 }
531 547
532 ei = EXT4_I(inode); 548 ei = EXT4_I(inode);
533 i_data = ei->i_data; 549 i_data = ei->i_data;
@@ -609,15 +625,8 @@ err_out:
609 625
610 /* Reset the extent details */ 626 /* Reset the extent details */
611 ext4_ext_tree_init(handle, tmp_inode); 627 ext4_ext_tree_init(handle, tmp_inode);
612
613 /*
614 * Set the i_nlink to zero so that
615 * generic_drop_inode really deletes the
616 * inode
617 */
618 tmp_inode->i_nlink = 0;
619
620 ext4_journal_stop(handle); 628 ext4_journal_stop(handle);
629out:
621 unlock_new_inode(tmp_inode); 630 unlock_new_inode(tmp_inode);
622 iput(tmp_inode); 631 iput(tmp_inode);
623 632
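The migrate changes above replace a fragile late i_nlink update with the standard orphan pattern: mark the temporary inode deleted up front, keep it on the orphan list while it still owns the file's data blocks, and let the final iput() reclaim everything. A condensed lifecycle sketch stitched together from the hunks above:

        tmp_inode->i_nlink = 0;             /* delete on last reference */
        ext4_ext_tree_init(handle, tmp_inode);
        ext4_orphan_add(handle, tmp_inode); /* crash mid-migration: orphan
                                             * processing reclaims it    */
        ...
        unlock_new_inode(tmp_inode);
        iput(tmp_inode);                    /* last ref drops: inode and
                                             * its blocks are freed      */

If the second ext4_journal_start() fails, the new error path only undoes the in-core orphan state (ext4_orphan_del(NULL, tmp_inode)), since without a handle the on-disk side must be left to orphan_list_cleanup().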
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 25b6b1457360..d1fc662cc311 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/slab.h>
18#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
19#include "ext4_extents.h" 20#include "ext4_extents.h"
20#include "ext4.h" 21#include "ext4.h"
@@ -77,12 +78,14 @@ static int
77mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 78mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
78 struct ext4_extent **extent) 79 struct ext4_extent **extent)
79{ 80{
81 struct ext4_extent_header *eh;
80 int ppos, leaf_ppos = path->p_depth; 82 int ppos, leaf_ppos = path->p_depth;
81 83
82 ppos = leaf_ppos; 84 ppos = leaf_ppos;
83 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
84 /* leaf block */ 86 /* leaf block */
85 *extent = ++path[ppos].p_ext; 87 *extent = ++path[ppos].p_ext;
88 path[ppos].p_block = ext_pblock(path[ppos].p_ext);
86 return 0; 89 return 0;
87 } 90 }
88 91
@@ -119,9 +122,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
119 ext_block_hdr(path[cur_ppos+1].p_bh); 122 ext_block_hdr(path[cur_ppos+1].p_bh);
120 } 123 }
121 124
125 path[leaf_ppos].p_ext = *extent = NULL;
126
127 eh = path[leaf_ppos].p_hdr;
128 if (le16_to_cpu(eh->eh_entries) == 0)
129 /* empty leaf is found */
130 return -ENODATA;
131
122 /* leaf block */ 132 /* leaf block */
123 path[leaf_ppos].p_ext = *extent = 133 path[leaf_ppos].p_ext = *extent =
124 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
135 path[leaf_ppos].p_block =
136 ext_pblock(path[leaf_ppos].p_ext);
125 return 0; 137 return 0;
126 } 138 }
127 } 139 }
@@ -141,12 +153,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
141 int ret = 0; 153 int ret = 0;
142 154
143 if (inode1 == NULL) { 155 if (inode1 == NULL) {
144 ext4_error(inode2->i_sb, function, 156 __ext4_error(inode2->i_sb, function,
145 "Both inodes should not be NULL: " 157 "Both inodes should not be NULL: "
146 "inode1 NULL inode2 %lu", inode2->i_ino); 158 "inode1 NULL inode2 %lu", inode2->i_ino);
147 ret = -EIO; 159 ret = -EIO;
148 } else if (inode2 == NULL) { 160 } else if (inode2 == NULL) {
149 ext4_error(inode1->i_sb, function, 161 __ext4_error(inode1->i_sb, function,
150 "Both inodes should not be NULL: " 162 "Both inodes should not be NULL: "
151 "inode1 %lu inode2 NULL", inode1->i_ino); 163 "inode1 %lu inode2 NULL", inode1->i_ino);
152 ret = -EIO; 164 ret = -EIO;
@@ -155,40 +167,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
155} 167}
156 168
157/** 169/**
158 * mext_double_down_read - Acquire two inodes' read semaphore 170 * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
159 * 171 *
160 * @orig_inode: original inode structure 172 * @orig_inode: original inode structure
161 * @donor_inode: donor inode structure 173 * @donor_inode: donor inode structure
162 * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. 174 * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
175 * i_ino order.
163 */ 176 */
164static void 177static void
165mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) 178double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
166{
167 struct inode *first = orig_inode, *second = donor_inode;
168
169 /*
170 * Use the inode number to provide the stable locking order instead
171 * of its address, because the C language doesn't guarantee you can
172 * compare pointers that don't come from the same array.
173 */
174 if (donor_inode->i_ino < orig_inode->i_ino) {
175 first = donor_inode;
176 second = orig_inode;
177 }
178
179 down_read(&EXT4_I(first)->i_data_sem);
180 down_read(&EXT4_I(second)->i_data_sem);
181}
182
183/**
184 * mext_double_down_write - Acquire two inodes' write semaphore
185 *
186 * @orig_inode: original inode structure
187 * @donor_inode: donor inode structure
188 * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
189 */
190static void
191mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
192{ 179{
193 struct inode *first = orig_inode, *second = donor_inode; 180 struct inode *first = orig_inode, *second = donor_inode;
194 181
@@ -203,32 +190,18 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
203 } 190 }
204 191
205 down_write(&EXT4_I(first)->i_data_sem); 192 down_write(&EXT4_I(first)->i_data_sem);
206 down_write(&EXT4_I(second)->i_data_sem); 193 down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
207}
208
209/**
210 * mext_double_up_read - Release two inodes' read semaphore
211 *
212 * @orig_inode: original inode structure to be released its lock first
213 * @donor_inode: donor inode structure to be released its lock second
214 * Release read semaphore of two inodes (orig and donor).
215 */
216static void
217mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
218{
219 up_read(&EXT4_I(orig_inode)->i_data_sem);
220 up_read(&EXT4_I(donor_inode)->i_data_sem);
221} 194}
222 195
223/** 196/**
224 * mext_double_up_write - Release two inodes' write semaphore 197 * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
225 * 198 *
226 * @orig_inode: original inode structure to be released its lock first 199 * @orig_inode: original inode structure to be released its lock first
227 * @donor_inode: donor inode structure to be released its lock second 200 * @donor_inode: donor inode structure to be released its lock second
228 * Release write semaphore of two inodes (orig and donor). 201 * Release write lock of i_data_sem of two inodes (orig and donor).
229 */ 202 */
230static void 203static void
231mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) 204double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
232{ 205{
233 up_write(&EXT4_I(orig_inode)->i_data_sem); 206 up_write(&EXT4_I(orig_inode)->i_data_sem);
234 up_write(&EXT4_I(donor_inode)->i_data_sem); 207 up_write(&EXT4_I(donor_inode)->i_data_sem);
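Beyond widening read locking to write locking, the helper above encodes the usual two-lock discipline for same-class locks. A simplified restatement of the pattern it implements:

        static void double_down_write_data_sem(struct inode *orig,
                                               struct inode *donor)
        {
                struct inode *first = orig, *second = donor;

                /* i_ino gives a stable global order, so any two tasks
                 * locking the same pair always lock in the same order */
                if (donor->i_ino < orig->i_ino) {
                        first = donor;
                        second = orig;
                }
                down_write(&EXT4_I(first)->i_data_sem);
                /* same lock class twice: the nested annotation tells
                 * lockdep this ordering is intentional */
                down_write_nested(&EXT4_I(second)->i_data_sem,
                                  SINGLE_DEPTH_NESTING);
        }

The switch from down_write() to down_write_nested() matters because both i_data_sem instances share one lock class; taking the second with plain down_write() would trip lockdep's recursive-locking report.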
@@ -280,6 +253,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
280 } 253 }
281 254
282 o_start->ee_len = start_ext->ee_len; 255 o_start->ee_len = start_ext->ee_len;
256 eblock = le32_to_cpu(start_ext->ee_block);
283 new_flag = 1; 257 new_flag = 1;
284 258
285 } else if (start_ext->ee_len && new_ext->ee_len && 259 } else if (start_ext->ee_len && new_ext->ee_len &&
@@ -290,6 +264,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
290 * orig |------------------------------| 264 * orig |------------------------------|
291 */ 265 */
292 o_start->ee_len = start_ext->ee_len; 266 o_start->ee_len = start_ext->ee_len;
267 eblock = le32_to_cpu(start_ext->ee_block);
293 new_flag = 1; 268 new_flag = 1;
294 269
295 } else if (!start_ext->ee_len && new_ext->ee_len && 270 } else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -503,7 +478,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
503 struct ext4_extent *oext, *o_start, *o_end, *prev_ext; 478 struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
504 struct ext4_extent new_ext, start_ext, end_ext; 479 struct ext4_extent new_ext, start_ext, end_ext;
505 ext4_lblk_t new_ext_end; 480 ext4_lblk_t new_ext_end;
506 ext4_fsblk_t new_phys_end;
507 int oext_alen, new_ext_alen, end_ext_alen; 481 int oext_alen, new_ext_alen, end_ext_alen;
508 int depth = ext_depth(orig_inode); 482 int depth = ext_depth(orig_inode);
509 int ret; 483 int ret;
@@ -517,7 +491,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
517 new_ext.ee_len = dext->ee_len; 491 new_ext.ee_len = dext->ee_len;
518 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 492 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
519 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 493 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
520 new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
521 494
522 /* 495 /*
523 * Case: original extent is first 496 * Case: original extent is first
@@ -530,6 +503,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
530 le32_to_cpu(oext->ee_block) + oext_alen) { 503 le32_to_cpu(oext->ee_block) + oext_alen) {
531 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - 504 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
532 le32_to_cpu(oext->ee_block)); 505 le32_to_cpu(oext->ee_block));
506 start_ext.ee_block = oext->ee_block;
533 copy_extent_status(oext, &start_ext); 507 copy_extent_status(oext, &start_ext);
534 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { 508 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
535 prev_ext = oext - 1; 509 prev_ext = oext - 1;
@@ -543,6 +517,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
543 start_ext.ee_len = cpu_to_le16( 517 start_ext.ee_len = cpu_to_le16(
544 ext4_ext_get_actual_len(prev_ext) + 518 ext4_ext_get_actual_len(prev_ext) +
545 new_ext_alen); 519 new_ext_alen);
520 start_ext.ee_block = oext->ee_block;
546 copy_extent_status(prev_ext, &start_ext); 521 copy_extent_status(prev_ext, &start_ext);
547 new_ext.ee_len = 0; 522 new_ext.ee_len = 0;
548 } 523 }
@@ -554,7 +529,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
554 * new_ext |-------| 529 * new_ext |-------|
555 */ 530 */
556 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 531 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
557 ext4_error(orig_inode->i_sb, __func__, 532 ext4_error(orig_inode->i_sb,
558 "new_ext_end(%u) should be less than or equal to " 533 "new_ext_end(%u) should be less than or equal to "
559 "oext->ee_block(%u) + oext_alen(%d) - 1", 534 "oext->ee_block(%u) + oext_alen(%d) - 1",
560 new_ext_end, le32_to_cpu(oext->ee_block), 535 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -596,7 +571,7 @@ out:
596 * @tmp_oext: the extent that will belong to the donor inode 571 * @tmp_oext: the extent that will belong to the donor inode
597 * @orig_off: block offset of original inode 572 * @orig_off: block offset of original inode
598 * @donor_off: block offset of donor inode 573 * @donor_off: block offset of donor inode
599 * @max_count: the maximun length of extents 574 * @max_count: the maximum length of extents
600 * 575 *
601 * Return 0 on success, or a negative error value on failure. 576 * Return 0 on success, or a negative error value on failure.
602 */ 577 */
@@ -661,6 +636,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
661 * @donor_inode: donor inode 636 * @donor_inode: donor inode
662 * @from: block offset of orig_inode 637 * @from: block offset of orig_inode
663 * @count: block count to be replaced 638 * @count: block count to be replaced
639 * @err: pointer to save return value
664 * 640 *
665 * Replace original inode extents and donor inode extents page by page. 641 * Replace original inode extents and donor inode extents page by page.
666 * We implement this replacement in the following three steps: 642 * We implement this replacement in the following three steps:
@@ -671,33 +647,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
671 * 3. Change the block information of donor inode to point at the saved 647 * 3. Change the block information of donor inode to point at the saved
672 * original inode blocks in the dummy extents. 648 * original inode blocks in the dummy extents.
673 * 649 *
674 * Return 0 on success, or a negative error value on failure. 650 * Return replaced block count.
675 */ 651 */
676static int 652static int
677mext_replace_branches(handle_t *handle, struct inode *orig_inode, 653mext_replace_branches(handle_t *handle, struct inode *orig_inode,
678 struct inode *donor_inode, ext4_lblk_t from, 654 struct inode *donor_inode, ext4_lblk_t from,
679 ext4_lblk_t count) 655 ext4_lblk_t count, int *err)
680{ 656{
681 struct ext4_ext_path *orig_path = NULL; 657 struct ext4_ext_path *orig_path = NULL;
682 struct ext4_ext_path *donor_path = NULL; 658 struct ext4_ext_path *donor_path = NULL;
683 struct ext4_extent *oext, *dext; 659 struct ext4_extent *oext, *dext;
684 struct ext4_extent tmp_dext, tmp_oext; 660 struct ext4_extent tmp_dext, tmp_oext;
685 ext4_lblk_t orig_off = from, donor_off = from; 661 ext4_lblk_t orig_off = from, donor_off = from;
686 int err = 0;
687 int depth; 662 int depth;
688 int replaced_count = 0; 663 int replaced_count = 0;
689 int dext_alen; 664 int dext_alen;
690 665
691 mext_double_down_write(orig_inode, donor_inode); 666 /* Protect extent trees against block allocations via delalloc */
667 double_down_write_data_sem(orig_inode, donor_inode);
692 668
693 /* Get the original extent for the block "orig_off" */ 669 /* Get the original extent for the block "orig_off" */
694 err = get_ext_path(orig_inode, orig_off, &orig_path); 670 *err = get_ext_path(orig_inode, orig_off, &orig_path);
695 if (err) 671 if (*err)
696 goto out; 672 goto out;
697 673
698 /* Get the donor extent for the head */ 674 /* Get the donor extent for the head */
699 err = get_ext_path(donor_inode, donor_off, &donor_path); 675 *err = get_ext_path(donor_inode, donor_off, &donor_path);
700 if (err) 676 if (*err)
701 goto out; 677 goto out;
702 depth = ext_depth(orig_inode); 678 depth = ext_depth(orig_inode);
703 oext = orig_path[depth].p_ext; 679 oext = orig_path[depth].p_ext;
@@ -707,39 +683,39 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
707 dext = donor_path[depth].p_ext; 683 dext = donor_path[depth].p_ext;
708 tmp_dext = *dext; 684 tmp_dext = *dext;
709 685
710 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 686 *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
711 donor_off, count); 687 donor_off, count);
712 if (err) 688 if (*err)
713 goto out; 689 goto out;
714 690
715 /* Loop for the donor extents */ 691 /* Loop for the donor extents */
716 while (1) { 692 while (1) {
717 /* The extent for donor must be found. */ 693 /* The extent for donor must be found. */
718 if (!dext) { 694 if (!dext) {
719 ext4_error(donor_inode->i_sb, __func__, 695 ext4_error(donor_inode->i_sb,
720 "The extent for donor must be found"); 696 "The extent for donor must be found");
721 err = -EIO; 697 *err = -EIO;
722 goto out; 698 goto out;
723 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 699 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
724 ext4_error(donor_inode->i_sb, __func__, 700 ext4_error(donor_inode->i_sb,
725 "Donor offset(%u) and the first block of donor " 701 "Donor offset(%u) and the first block of donor "
726 "extent(%u) should be equal", 702 "extent(%u) should be equal",
727 donor_off, 703 donor_off,
728 le32_to_cpu(tmp_dext.ee_block)); 704 le32_to_cpu(tmp_dext.ee_block));
729 err = -EIO; 705 *err = -EIO;
730 goto out; 706 goto out;
731 } 707 }
732 708
733 /* Set donor extent to orig extent */ 709 /* Set donor extent to orig extent */
734 err = mext_leaf_block(handle, orig_inode, 710 *err = mext_leaf_block(handle, orig_inode,
735 orig_path, &tmp_dext, &orig_off); 711 orig_path, &tmp_dext, &orig_off);
736 if (err < 0) 712 if (*err)
737 goto out; 713 goto out;
738 714
739 /* Set orig extent to donor extent */ 715 /* Set orig extent to donor extent */
740 err = mext_leaf_block(handle, donor_inode, 716 *err = mext_leaf_block(handle, donor_inode,
741 donor_path, &tmp_oext, &donor_off); 717 donor_path, &tmp_oext, &donor_off);
742 if (err < 0) 718 if (*err)
743 goto out; 719 goto out;
744 720
745 dext_alen = ext4_ext_get_actual_len(&tmp_dext); 721 dext_alen = ext4_ext_get_actual_len(&tmp_dext);
@@ -753,35 +729,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
753 729
754 if (orig_path) 730 if (orig_path)
755 ext4_ext_drop_refs(orig_path); 731 ext4_ext_drop_refs(orig_path);
756 err = get_ext_path(orig_inode, orig_off, &orig_path); 732 *err = get_ext_path(orig_inode, orig_off, &orig_path);
757 if (err) 733 if (*err)
758 goto out; 734 goto out;
759 depth = ext_depth(orig_inode); 735 depth = ext_depth(orig_inode);
760 oext = orig_path[depth].p_ext; 736 oext = orig_path[depth].p_ext;
761 if (le32_to_cpu(oext->ee_block) +
762 ext4_ext_get_actual_len(oext) <= orig_off) {
763 err = 0;
764 goto out;
765 }
766 tmp_oext = *oext; 737 tmp_oext = *oext;
767 738
768 if (donor_path) 739 if (donor_path)
769 ext4_ext_drop_refs(donor_path); 740 ext4_ext_drop_refs(donor_path);
770 err = get_ext_path(donor_inode, donor_off, &donor_path); 741 *err = get_ext_path(donor_inode, donor_off, &donor_path);
771 if (err) 742 if (*err)
772 goto out; 743 goto out;
773 depth = ext_depth(donor_inode); 744 depth = ext_depth(donor_inode);
774 dext = donor_path[depth].p_ext; 745 dext = donor_path[depth].p_ext;
775 if (le32_to_cpu(dext->ee_block) +
776 ext4_ext_get_actual_len(dext) <= donor_off) {
777 err = 0;
778 goto out;
779 }
780 tmp_dext = *dext; 746 tmp_dext = *dext;
781 747
782 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 748 *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
783 donor_off, count - replaced_count); 749 donor_off, count - replaced_count);
784 if (err) 750 if (*err)
785 goto out; 751 goto out;
786 } 752 }
787 753
@@ -795,8 +761,12 @@ out:
795 kfree(donor_path); 761 kfree(donor_path);
796 } 762 }
797 763
798 mext_double_up_write(orig_inode, donor_inode); 764 ext4_ext_invalidate_cache(orig_inode);
799 return err; 765 ext4_ext_invalidate_cache(donor_inode);
766
767 double_up_write_data_sem(orig_inode, donor_inode);
768
769 return replaced_count;
800} 770}
801 771
802/** 772/**
@@ -808,16 +778,17 @@ out:
808 * @data_offset_in_page: block index where data swapping starts 778 * @data_offset_in_page: block index where data swapping starts
809 * @block_len_in_page: the number of blocks to be swapped 779 * @block_len_in_page: the number of blocks to be swapped
810 * @uninit: orig extent is uninitialized or not 780 * @uninit: orig extent is uninitialized or not
781 * @err: pointer to save return value
811 * 782 *
812 * Save the data in original inode blocks and replace original inode extents 783 * Save the data in original inode blocks and replace original inode extents
813 * with donor inode extents by calling mext_replace_branches(). 784 * with donor inode extents by calling mext_replace_branches().
814 * Finally, write out the saved data in new original inode blocks. Return 0 785 * Finally, write out the saved data in new original inode blocks. Return
815 * on success, or a negative error value on failure. 786 * replaced block count.
816 */ 787 */
817static int 788static int
818move_extent_per_page(struct file *o_filp, struct inode *donor_inode, 789move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
819 pgoff_t orig_page_offset, int data_offset_in_page, 790 pgoff_t orig_page_offset, int data_offset_in_page,
820 int block_len_in_page, int uninit) 791 int block_len_in_page, int uninit, int *err)
821{ 792{
822 struct inode *orig_inode = o_filp->f_dentry->d_inode; 793 struct inode *orig_inode = o_filp->f_dentry->d_inode;
823 struct address_space *mapping = orig_inode->i_mapping; 794 struct address_space *mapping = orig_inode->i_mapping;
@@ -829,9 +800,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
829 long long offs = orig_page_offset << PAGE_CACHE_SHIFT; 800 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
830 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 801 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
831 unsigned int w_flags = 0; 802 unsigned int w_flags = 0;
832 unsigned int tmp_data_len, data_len; 803 unsigned int tmp_data_size, data_size, replaced_size;
833 void *fsdata; 804 void *fsdata;
834 int ret, i, jblocks; 805 int i, jblocks;
806 int err2 = 0;
807 int replaced_count = 0;
835 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 808 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
836 809
837 /* 810 /*
@@ -841,8 +814,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
841 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; 814 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
842 handle = ext4_journal_start(orig_inode, jblocks); 815 handle = ext4_journal_start(orig_inode, jblocks);
843 if (IS_ERR(handle)) { 816 if (IS_ERR(handle)) {
844 ret = PTR_ERR(handle); 817 *err = PTR_ERR(handle);
845 return ret; 818 return 0;
846 } 819 }
847 820
848 if (segment_eq(get_fs(), KERNEL_DS)) 821 if (segment_eq(get_fs(), KERNEL_DS))
@@ -858,39 +831,36 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
858 * Just swap data blocks between orig and donor. 831 * Just swap data blocks between orig and donor.
859 */ 832 */
860 if (uninit) { 833 if (uninit) {
861 ret = mext_replace_branches(handle, orig_inode, 834 replaced_count = mext_replace_branches(handle, orig_inode,
862 donor_inode, orig_blk_offset, 835 donor_inode, orig_blk_offset,
863 block_len_in_page); 836 block_len_in_page, err);
864
865 /* Clear the inode cache not to refer to the old data */
866 ext4_ext_invalidate_cache(orig_inode);
867 ext4_ext_invalidate_cache(donor_inode);
868 goto out2; 837 goto out2;
869 } 838 }
870 839
871 offs = (long long)orig_blk_offset << orig_inode->i_blkbits; 840 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
872 841
873 /* Calculate data_len */ 842 /* Calculate data_size */
874 if ((orig_blk_offset + block_len_in_page - 1) == 843 if ((orig_blk_offset + block_len_in_page - 1) ==
875 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 844 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
876 /* Replace the last block */ 845 /* Replace the last block */
877 tmp_data_len = orig_inode->i_size & (blocksize - 1); 846 tmp_data_size = orig_inode->i_size & (blocksize - 1);
878 /* 847 /*
879 * If data_len equals zero, data_len is a multiple of the 848 * If data_size equals zero, data_size is a multiple of the
880 * blocksize, so set an appropriate value. 849 * blocksize, so set an appropriate value.
881 */ 850 */
882 if (tmp_data_len == 0) 851 if (tmp_data_size == 0)
883 tmp_data_len = blocksize; 852 tmp_data_size = blocksize;
884 853
885 data_len = tmp_data_len + 854 data_size = tmp_data_size +
886 ((block_len_in_page - 1) << orig_inode->i_blkbits); 855 ((block_len_in_page - 1) << orig_inode->i_blkbits);
887 } else { 856 } else
888 data_len = block_len_in_page << orig_inode->i_blkbits; 857 data_size = block_len_in_page << orig_inode->i_blkbits;
889 } 858
859 replaced_size = data_size;
890 860
891 ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, 861 *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
892 &page, &fsdata); 862 &page, &fsdata);
893 if (unlikely(ret < 0)) 863 if (unlikely(*err < 0))
894 goto out; 864 goto out;
895 865
896 if (!PageUptodate(page)) { 866 if (!PageUptodate(page)) {
@@ -911,14 +881,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
911 /* Release old bh and drop refs */ 881 /* Release old bh and drop refs */
912 try_to_release_page(page, 0); 882 try_to_release_page(page, 0);
913 883
914 ret = mext_replace_branches(handle, orig_inode, donor_inode, 884 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
915 orig_blk_offset, block_len_in_page); 885 orig_blk_offset, block_len_in_page,
916 if (ret < 0) 886 &err2);
917 goto out; 887 if (err2) {
918 888 if (replaced_count) {
919 /* Clear the inode cache not to refer to the old data */ 889 block_len_in_page = replaced_count;
920 ext4_ext_invalidate_cache(orig_inode); 890 replaced_size =
921 ext4_ext_invalidate_cache(donor_inode); 891 block_len_in_page << orig_inode->i_blkbits;
892 } else
893 goto out;
894 }
922 895
923 if (!page_has_buffers(page)) 896 if (!page_has_buffers(page))
924 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); 897 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
@@ -928,16 +901,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
928 bh = bh->b_this_page; 901 bh = bh->b_this_page;
929 902
930 for (i = 0; i < block_len_in_page; i++) { 903 for (i = 0; i < block_len_in_page; i++) {
931 ret = ext4_get_block(orig_inode, 904 *err = ext4_get_block(orig_inode,
932 (sector_t)(orig_blk_offset + i), bh, 0); 905 (sector_t)(orig_blk_offset + i), bh, 0);
933 if (ret < 0) 906 if (*err < 0)
934 goto out; 907 goto out;
935 908
936 if (bh->b_this_page != NULL) 909 if (bh->b_this_page != NULL)
937 bh = bh->b_this_page; 910 bh = bh->b_this_page;
938 } 911 }
939 912
940 ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, 913 *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
941 page, fsdata); 914 page, fsdata);
942 page = NULL; 915 page = NULL;
943 916
@@ -951,18 +924,20 @@ out:
951out2: 924out2:
952 ext4_journal_stop(handle); 925 ext4_journal_stop(handle);
953 926
954 return ret < 0 ? ret : 0; 927 if (err2)
928 *err = err2;
929
930 return replaced_count;
955} 931}
956 932
957/** 933/**
958 * mext_check_argumants - Check whether move extent can be done 934 * mext_check_arguments - Check whether move extent can be done
959 * 935 *
960 * @orig_inode: original inode 936 * @orig_inode: original inode
961 * @donor_inode: donor inode 937 * @donor_inode: donor inode
962 * @orig_start: logical start offset in block for orig 938 * @orig_start: logical start offset in block for orig
963 * @donor_start: logical start offset in block for donor 939 * @donor_start: logical start offset in block for donor
964 * @len: the number of blocks to be moved 940 * @len: the number of blocks to be moved
965 * @moved_len: moved block length
966 * 941 *
967 * Check the arguments of ext4_move_extents() to decide whether the files 942 * Check the arguments of ext4_move_extents() to decide whether the files
968 * can be exchanged with each other. 943 * can be exchanged with each other.
@@ -970,18 +945,17 @@ out2:
970 */ 945 */
971static int 946static int
972mext_check_arguments(struct inode *orig_inode, 947mext_check_arguments(struct inode *orig_inode,
973 struct inode *donor_inode, __u64 orig_start, 948 struct inode *donor_inode, __u64 orig_start,
974 __u64 donor_start, __u64 *len, __u64 moved_len) 949 __u64 donor_start, __u64 *len)
975{ 950{
976 ext4_lblk_t orig_blocks, donor_blocks; 951 ext4_lblk_t orig_blocks, donor_blocks;
977 unsigned int blkbits = orig_inode->i_blkbits; 952 unsigned int blkbits = orig_inode->i_blkbits;
978 unsigned int blocksize = 1 << blkbits; 953 unsigned int blocksize = 1 << blkbits;
979 954
980 /* Regular file check */ 955 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
981 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { 956 ext4_debug("ext4 move extent: suid or sgid is set"
982 ext4_debug("ext4 move extent: The argument files should be " 957 " to donor file [ino:orig %lu, donor %lu]\n",
983 "regular file [ino:orig %lu, donor %lu]\n", 958 orig_inode->i_ino, donor_inode->i_ino);
984 orig_inode->i_ino, donor_inode->i_ino);
985 return -EINVAL; 959 return -EINVAL;
986 } 960 }
987 961
@@ -1025,13 +999,6 @@ mext_check_arguments(struct inode *orig_inode,
1025 return -EINVAL; 999 return -EINVAL;
1026 } 1000 }
1027 1001
1028 if (moved_len) {
1029 ext4_debug("ext4 move extent: moved_len should be 0 "
1030 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
1031 donor_inode->i_ino);
1032 return -EINVAL;
1033 }
1034
1035 if ((orig_start > EXT_MAX_BLOCK) || 1002 if ((orig_start > EXT_MAX_BLOCK) ||
1036 (donor_start > EXT_MAX_BLOCK) || 1003 (donor_start > EXT_MAX_BLOCK) ||
1037 (*len > EXT_MAX_BLOCK) || 1004 (*len > EXT_MAX_BLOCK) ||
@@ -1088,7 +1055,7 @@ mext_check_arguments(struct inode *orig_inode,
1088 } 1055 }
1089 1056
1090 if (!*len) { 1057 if (!*len) {
1091 ext4_debug("ext4 move extent: len shoudld not be 0 " 1058 ext4_debug("ext4 move extent: len should not be 0 "
1092 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, 1059 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
1093 donor_inode->i_ino); 1060 donor_inode->i_ino);
1094 return -EINVAL; 1061 return -EINVAL;
@@ -1232,16 +1199,24 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1232 return -EINVAL; 1199 return -EINVAL;
1233 } 1200 }
1234 1201
1235 /* protect orig and donor against a truncate */ 1202 /* Regular file check */
1203 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
1204 ext4_debug("ext4 move extent: The argument files should be "
1205 "regular file [ino:orig %lu, donor %lu]\n",
1206 orig_inode->i_ino, donor_inode->i_ino);
1207 return -EINVAL;
1208 }
1209
1210 /* Protect orig and donor inodes against a truncate */
1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1211 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0) 1212 if (ret1 < 0)
1238 return ret1; 1213 return ret1;
1239 1214
1240 mext_double_down_read(orig_inode, donor_inode); 1215 /* Protect extent tree against block allocations via delalloc */
1216 double_down_write_data_sem(orig_inode, donor_inode);
1241 /* Check the filesystem environment whether move_extent can be done */ 1217 /* Check the filesystem environment whether move_extent can be done */
1242 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, 1218 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
1243 donor_start, &len, *moved_len); 1219 donor_start, &len);
1244 mext_double_up_read(orig_inode, donor_inode);
1245 if (ret1) 1220 if (ret1)
1246 goto out; 1221 goto out;
1247 1222
@@ -1355,36 +1330,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1355 seq_start = le32_to_cpu(ext_cur->ee_block); 1330 seq_start = le32_to_cpu(ext_cur->ee_block);
1356 rest_blocks = seq_blocks; 1331 rest_blocks = seq_blocks;
1357 1332
1358 /* Discard preallocations of two inodes */ 1333 /*
1359 down_write(&EXT4_I(orig_inode)->i_data_sem); 1334 * Up the semaphore to avoid the following problems:
1360 ext4_discard_preallocations(orig_inode); 1335 * a. transaction deadlock among ext4_journal_start,
1361 up_write(&EXT4_I(orig_inode)->i_data_sem); 1336 * ->write_begin via pagefault, and jbd2_journal_commit
1362 1337 * b. racing with ->readpage, ->write_begin, and ext4_get_block
1363 down_write(&EXT4_I(donor_inode)->i_data_sem); 1338 * in move_extent_per_page
1364 ext4_discard_preallocations(donor_inode); 1339 */
1365 up_write(&EXT4_I(donor_inode)->i_data_sem); 1340 double_up_write_data_sem(orig_inode, donor_inode);
1366 1341
1367 while (orig_page_offset <= seq_end_page) { 1342 while (orig_page_offset <= seq_end_page) {
1368 1343
1369 /* Swap original branches with new branches */ 1344 /* Swap original branches with new branches */
1370 ret1 = move_extent_per_page(o_filp, donor_inode, 1345 block_len_in_page = move_extent_per_page(
1346 o_filp, donor_inode,
1371 orig_page_offset, 1347 orig_page_offset,
1372 data_offset_in_page, 1348 data_offset_in_page,
1373 block_len_in_page, uninit); 1349 block_len_in_page, uninit,
1374 if (ret1 < 0) 1350 &ret1);
1375 goto out; 1351
1376 orig_page_offset++;
1377 /* Count how many blocks we have exchanged */ 1352 /* Count how many blocks we have exchanged */
1378 *moved_len += block_len_in_page; 1353 *moved_len += block_len_in_page;
1354 if (ret1 < 0)
1355 break;
1379 if (*moved_len > len) { 1356 if (*moved_len > len) {
1380 ext4_error(orig_inode->i_sb, __func__, 1357 ext4_error(orig_inode->i_sb,
1381 "We replaced blocks too much! " 1358 "We replaced blocks too much! "
1382 "sum of replaced: %llu requested: %llu", 1359 "sum of replaced: %llu requested: %llu",
1383 *moved_len, len); 1360 *moved_len, len);
1384 ret1 = -EIO; 1361 ret1 = -EIO;
1385 goto out; 1362 break;
1386 } 1363 }
1387 1364
1365 orig_page_offset++;
1388 data_offset_in_page = 0; 1366 data_offset_in_page = 0;
1389 rest_blocks -= block_len_in_page; 1367 rest_blocks -= block_len_in_page;
1390 if (rest_blocks > blocks_per_page) 1368 if (rest_blocks > blocks_per_page)
@@ -1393,6 +1371,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1393 block_len_in_page = rest_blocks; 1371 block_len_in_page = rest_blocks;
1394 } 1372 }
1395 1373
1374 double_down_write_data_sem(orig_inode, donor_inode);
1375 if (ret1 < 0)
1376 break;
1377
1396 /* Decrease buffer counter */ 1378 /* Decrease buffer counter */
1397 if (holecheck_path) 1379 if (holecheck_path)
1398 ext4_ext_drop_refs(holecheck_path); 1380 ext4_ext_drop_refs(holecheck_path);
@@ -1414,6 +1396,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1414 1396
1415 } 1397 }
1416out: 1398out:
1399 if (*moved_len) {
1400 ext4_discard_preallocations(orig_inode);
1401 ext4_discard_preallocations(donor_inode);
1402 }
1403
1417 if (orig_path) { 1404 if (orig_path) {
1418 ext4_ext_drop_refs(orig_path); 1405 ext4_ext_drop_refs(orig_path);
1419 kfree(orig_path); 1406 kfree(orig_path);
@@ -1422,7 +1409,7 @@ out:
1422 ext4_ext_drop_refs(holecheck_path); 1409 ext4_ext_drop_refs(holecheck_path);
1423 kfree(holecheck_path); 1410 kfree(holecheck_path);
1424 } 1411 }
1425 1412 double_up_write_data_sem(orig_inode, donor_inode);
1426 ret2 = mext_inode_double_unlock(orig_inode, donor_inode); 1413 ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
1427 1414
1428 if (ret1) 1415 if (ret1)
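After this patch move_extent.c follows one convention throughout: mext_replace_branches() and move_extent_per_page() return the number of blocks actually replaced and report failures through an int * out-parameter, so the main loop can account partial progress before deciding to stop. A condensed sketch of the loop body (err stands in for the patch's ret1):

        int err = 0;
        int replaced = move_extent_per_page(o_filp, donor_inode,
                                            orig_page_offset,
                                            data_offset_in_page,
                                            block_len_in_page,
                                            uninit, &err);
        *moved_len += replaced;   /* bank partial progress first ...  */
        if (err < 0)
                break;            /* ... then stop, keeping the count */

This is also why the preallocation discards moved under the out: label, gated on *moved_len: the preallocations are only stale once at least one block has actually changed hands.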
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d2c1b897fc7..0c070fabd108 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
383 if (root->info.hash_version != DX_HASH_TEA && 383 if (root->info.hash_version != DX_HASH_TEA &&
384 root->info.hash_version != DX_HASH_HALF_MD4 && 384 root->info.hash_version != DX_HASH_HALF_MD4 &&
385 root->info.hash_version != DX_HASH_LEGACY) { 385 root->info.hash_version != DX_HASH_LEGACY) {
386 ext4_warning(dir->i_sb, __func__, 386 ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
387 "Unrecognised inode hash code %d",
388 root->info.hash_version); 387 root->info.hash_version);
389 brelse(bh); 388 brelse(bh);
390 *err = ERR_BAD_DX_DIR; 389 *err = ERR_BAD_DX_DIR;
@@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
399 hash = hinfo->hash; 398 hash = hinfo->hash;
400 399
401 if (root->info.unused_flags & 1) { 400 if (root->info.unused_flags & 1) {
402 ext4_warning(dir->i_sb, __func__, 401 ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
403 "Unimplemented inode hash flags: %#06x",
404 root->info.unused_flags); 402 root->info.unused_flags);
405 brelse(bh); 403 brelse(bh);
406 *err = ERR_BAD_DX_DIR; 404 *err = ERR_BAD_DX_DIR;
@@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
408 } 406 }
409 407
410 if ((indirect = root->info.indirect_levels) > 1) { 408 if ((indirect = root->info.indirect_levels) > 1) {
411 ext4_warning(dir->i_sb, __func__, 409 ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
412 "Unimplemented inode hash depth: %#06x",
413 root->info.indirect_levels); 410 root->info.indirect_levels);
414 brelse(bh); 411 brelse(bh);
415 *err = ERR_BAD_DX_DIR; 412 *err = ERR_BAD_DX_DIR;
@@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
421 418
422 if (dx_get_limit(entries) != dx_root_limit(dir, 419 if (dx_get_limit(entries) != dx_root_limit(dir,
423 root->info.info_length)) { 420 root->info.info_length)) {
424 ext4_warning(dir->i_sb, __func__, 421 ext4_warning(dir->i_sb, "dx entry: limit != root limit");
425 "dx entry: limit != root limit");
426 brelse(bh); 422 brelse(bh);
427 *err = ERR_BAD_DX_DIR; 423 *err = ERR_BAD_DX_DIR;
428 goto fail; 424 goto fail;
@@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
433 { 429 {
434 count = dx_get_count(entries); 430 count = dx_get_count(entries);
435 if (!count || count > dx_get_limit(entries)) { 431 if (!count || count > dx_get_limit(entries)) {
436 ext4_warning(dir->i_sb, __func__, 432 ext4_warning(dir->i_sb,
437 "dx entry: no count or count > limit"); 433 "dx entry: no count or count > limit");
438 brelse(bh); 434 brelse(bh);
439 *err = ERR_BAD_DX_DIR; 435 *err = ERR_BAD_DX_DIR;
@@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
478 goto fail2; 474 goto fail2;
479 at = entries = ((struct dx_node *) bh->b_data)->entries; 475 at = entries = ((struct dx_node *) bh->b_data)->entries;
480 if (dx_get_limit(entries) != dx_node_limit (dir)) { 476 if (dx_get_limit(entries) != dx_node_limit (dir)) {
481 ext4_warning(dir->i_sb, __func__, 477 ext4_warning(dir->i_sb,
482 "dx entry: limit != node limit"); 478 "dx entry: limit != node limit");
483 brelse(bh); 479 brelse(bh);
484 *err = ERR_BAD_DX_DIR; 480 *err = ERR_BAD_DX_DIR;
@@ -494,7 +490,7 @@ fail2:
494 } 490 }
495fail: 491fail:
496 if (*err == ERR_BAD_DX_DIR) 492 if (*err == ERR_BAD_DX_DIR)
497 ext4_warning(dir->i_sb, __func__, 493 ext4_warning(dir->i_sb,
498 "Corrupt dir inode %ld, running e2fsck is " 494 "Corrupt dir inode %ld, running e2fsck is "
499 "recommended.", dir->i_ino); 495 "recommended.", dir->i_ino);
500 return NULL; 496 return NULL;
@@ -947,9 +943,8 @@ restart:
947 wait_on_buffer(bh); 943 wait_on_buffer(bh);
948 if (!buffer_uptodate(bh)) { 944 if (!buffer_uptodate(bh)) {
949 /* read error, skip block & hope for the best */ 945 /* read error, skip block & hope for the best */
950 ext4_error(sb, __func__, "reading directory #%lu " 946 ext4_error(sb, "reading directory #%lu offset %lu",
951 "offset %lu", dir->i_ino, 947 dir->i_ino, (unsigned long)block);
952 (unsigned long)block);
953 brelse(bh); 948 brelse(bh);
954 goto next; 949 goto next;
955 } 950 }
@@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1041 retval = ext4_htree_next_block(dir, hash, frame, 1036 retval = ext4_htree_next_block(dir, hash, frame,
1042 frames, NULL); 1037 frames, NULL);
1043 if (retval < 0) { 1038 if (retval < 0) {
1044 ext4_warning(sb, __func__, 1039 ext4_warning(sb,
1045 "error reading index page in directory #%lu", 1040 "error reading index page in directory #%lu",
1046 dir->i_ino); 1041 dir->i_ino);
1047 *err = retval; 1042 *err = retval;
@@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1071 __u32 ino = le32_to_cpu(de->inode); 1066 __u32 ino = le32_to_cpu(de->inode);
1072 brelse(bh); 1067 brelse(bh);
1073 if (!ext4_valid_inum(dir->i_sb, ino)) { 1068 if (!ext4_valid_inum(dir->i_sb, ino)) {
1074 ext4_error(dir->i_sb, "ext4_lookup", 1069 ext4_error(dir->i_sb, "bad inode number: %u", ino);
1075 "bad inode number: %u", ino);
1076 return ERR_PTR(-EIO); 1070 return ERR_PTR(-EIO);
1077 } 1071 }
1078 inode = ext4_iget(dir->i_sb, ino); 1072 inode = ext4_iget(dir->i_sb, ino);
1079 if (unlikely(IS_ERR(inode))) { 1073 if (unlikely(IS_ERR(inode))) {
1080 if (PTR_ERR(inode) == -ESTALE) { 1074 if (PTR_ERR(inode) == -ESTALE) {
1081 ext4_error(dir->i_sb, __func__, 1075 ext4_error(dir->i_sb,
1082 "deleted inode referenced: %u", 1076 "deleted inode referenced: %u",
1083 ino); 1077 ino);
1084 return ERR_PTR(-EIO); 1078 return ERR_PTR(-EIO);
@@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1110 brelse(bh); 1104 brelse(bh);
1111 1105
1112 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1113 ext4_error(child->d_inode->i_sb, "ext4_get_parent", 1107 ext4_error(child->d_inode->i_sb,
1114 "bad inode number: %u", ino); 1108 "bad inode number: %u", ino);
1115 return ERR_PTR(-EIO); 1109 return ERR_PTR(-EIO);
1116 } 1110 }
@@ -1292,9 +1286,6 @@ errout:
1292 * add_dirent_to_buf will attempt search the directory block for 1286 * add_dirent_to_buf will attempt search the directory block for
1293 * space. It will return -ENOSPC if no space is available, and -EIO 1287 * space. It will return -ENOSPC if no space is available, and -EIO
1294 * and -EEXIST if directory entry already exists. 1288 * and -EEXIST if directory entry already exists.
1295 *
1296 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1297 * all other cases bh is released.
1298 */ 1289 */
1299static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, 1290static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1300 struct inode *inode, struct ext4_dir_entry_2 *de, 1291 struct inode *inode, struct ext4_dir_entry_2 *de,
@@ -1315,14 +1306,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1315 top = bh->b_data + blocksize - reclen; 1306 top = bh->b_data + blocksize - reclen;
1316 while ((char *) de <= top) { 1307 while ((char *) de <= top) {
1317 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1308 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1318 bh, offset)) { 1309 bh, offset))
1319 brelse(bh);
1320 return -EIO; 1310 return -EIO;
1321 } 1311 if (ext4_match(namelen, name, de))
1322 if (ext4_match(namelen, name, de)) {
1323 brelse(bh);
1324 return -EEXIST; 1312 return -EEXIST;
1325 }
1326 nlen = EXT4_DIR_REC_LEN(de->name_len); 1313 nlen = EXT4_DIR_REC_LEN(de->name_len);
1327 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); 1314 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1328 if ((de->inode? rlen - nlen: rlen) >= reclen) 1315 if ((de->inode? rlen - nlen: rlen) >= reclen)
@@ -1337,7 +1324,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1337 err = ext4_journal_get_write_access(handle, bh); 1324 err = ext4_journal_get_write_access(handle, bh);
1338 if (err) { 1325 if (err) {
1339 ext4_std_error(dir->i_sb, err); 1326 ext4_std_error(dir->i_sb, err);
1340 brelse(bh);
1341 return err; 1327 return err;
1342 } 1328 }
1343 1329
@@ -1377,7 +1363,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1377 err = ext4_handle_dirty_metadata(handle, dir, bh); 1363 err = ext4_handle_dirty_metadata(handle, dir, bh);
1378 if (err) 1364 if (err)
1379 ext4_std_error(dir->i_sb, err); 1365 ext4_std_error(dir->i_sb, err);
1380 brelse(bh);
1381 return 0; 1366 return 0;
1382} 1367}
1383 1368
@@ -1419,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1419 de = (struct ext4_dir_entry_2 *)((char *)fde + 1404 de = (struct ext4_dir_entry_2 *)((char *)fde +
1420 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1405 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1421 if ((char *) de >= (((char *) root) + blocksize)) { 1406 if ((char *) de >= (((char *) root) + blocksize)) {
1422 ext4_error(dir->i_sb, __func__, 1407 ext4_error(dir->i_sb,
1423 "invalid rec_len for '..' in inode %lu", 1408 "invalid rec_len for '..' in inode %lu",
1424 dir->i_ino); 1409 dir->i_ino);
1425 brelse(bh); 1410 brelse(bh);
@@ -1471,7 +1456,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1471 if (!(de)) 1456 if (!(de))
1472 return retval; 1457 return retval;
1473 1458
1474 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1459 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1460 brelse(bh);
1461 return retval;
1475} 1462}
1476 1463
1477/* 1464/*
@@ -1514,8 +1501,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1514 if(!bh) 1501 if(!bh)
1515 return retval; 1502 return retval;
1516 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1503 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1517 if (retval != -ENOSPC) 1504 if (retval != -ENOSPC) {
1505 brelse(bh);
1518 return retval; 1506 return retval;
1507 }
1519 1508
1520 if (blocks == 1 && !dx_fallback && 1509 if (blocks == 1 && !dx_fallback &&
1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) 1510 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
@@ -1528,7 +1517,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1528 de = (struct ext4_dir_entry_2 *) bh->b_data; 1517 de = (struct ext4_dir_entry_2 *) bh->b_data;
1529 de->inode = 0; 1518 de->inode = 0;
1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1519 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1531 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1520 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1521 brelse(bh);
1522 return retval;
1532} 1523}
1533 1524
1534/* 1525/*
@@ -1561,10 +1552,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1561 goto journal_error; 1552 goto journal_error;
1562 1553
1563 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1554 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1564 if (err != -ENOSPC) { 1555 if (err != -ENOSPC)
1565 bh = NULL;
1566 goto cleanup; 1556 goto cleanup;
1567 }
1568 1557
1569 /* Block full, should compress but for now just split */ 1558 /* Block full, should compress but for now just split */
1570 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", 1559 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
@@ -1580,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1580 1569
1581 if (levels && (dx_get_count(frames->entries) == 1570 if (levels && (dx_get_count(frames->entries) ==
1582 dx_get_limit(frames->entries))) { 1571 dx_get_limit(frames->entries))) {
1583 ext4_warning(sb, __func__, 1572 ext4_warning(sb, "Directory index full!");
1584 "Directory index full!");
1585 err = -ENOSPC; 1573 err = -ENOSPC;
1586 goto cleanup; 1574 goto cleanup;
1587 } 1575 }
@@ -1657,7 +1645,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1657 if (!de) 1645 if (!de)
1658 goto cleanup; 1646 goto cleanup;
1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1647 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1660 bh = NULL;
1661 goto cleanup; 1648 goto cleanup;
1662 1649
1663journal_error: 1650journal_error:
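
The add_dirent_to_buf() hunks above change buffer-head ownership: the function no longer calls brelse() on any path, success or failure, so each caller pairs exactly one bread with exactly one brelse(), as the reworked call sites in make_indexed_dir(), ext4_add_entry() and ext4_dx_add_entry() show. A sketch of that borrow-only convention, with get_buf/put_buf/try_add as hypothetical stand-ins for sb_bread()/brelse()/add_dirent_to_buf():

/*
 * The helper borrows the buffer and never releases it, so the
 * caller can pair one get with one put regardless of the result.
 */
#include <stdlib.h>
#include <stdio.h>

struct buf { char data[64]; };

static struct buf *get_buf(void) { return malloc(sizeof(struct buf)); }
static void put_buf(struct buf *b) { free(b); }

/* Borrows b; reports success or failure, but never frees it. */
static int try_add(struct buf *b, int key)
{
        if (!b)
                return -12;             /* -ENOMEM */
        return (key % 2) ? -28 : 0;     /* -ENOSPC on odd keys */
}

int main(void)
{
        struct buf *b = get_buf();
        int ret = try_add(b, 7);

        put_buf(b);             /* exactly one release, on every path */
        printf("ret=%d\n", ret);
        return 0;
}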
@@ -1772,10 +1759,12 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
1772 struct inode *inode; 1759 struct inode *inode;
1773 int err, retries = 0; 1760 int err, retries = 0;
1774 1761
1762 dquot_initialize(dir);
1763
1775retry: 1764retry:
1776 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1765 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1777 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1766 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1778 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1767 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1779 if (IS_ERR(handle)) 1768 if (IS_ERR(handle))
1780 return PTR_ERR(handle); 1769 return PTR_ERR(handle);
1781 1770
@@ -1806,10 +1795,12 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
1806 if (!new_valid_dev(rdev)) 1795 if (!new_valid_dev(rdev))
1807 return -EINVAL; 1796 return -EINVAL;
1808 1797
1798 dquot_initialize(dir);
1799
1809retry: 1800retry:
1810 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1801 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1811 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1802 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1812 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1803 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1813 if (IS_ERR(handle)) 1804 if (IS_ERR(handle))
1814 return PTR_ERR(handle); 1805 return PTR_ERR(handle);
1815 1806
@@ -1843,10 +1834,12 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1843 if (EXT4_DIR_LINK_MAX(dir)) 1834 if (EXT4_DIR_LINK_MAX(dir))
1844 return -EMLINK; 1835 return -EMLINK;
1845 1836
1837 dquot_initialize(dir);
1838
1846retry: 1839retry:
1847 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1840 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1848 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1841 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1849 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1842 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1850 if (IS_ERR(handle)) 1843 if (IS_ERR(handle))
1851 return PTR_ERR(handle); 1844 return PTR_ERR(handle);
1852 1845
@@ -1922,11 +1915,11 @@ static int empty_dir(struct inode *inode)
1922 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1923 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1924 if (err) 1917 if (err)
1925 ext4_error(inode->i_sb, __func__, 1918 ext4_error(inode->i_sb,
1926 "error %d reading directory #%lu offset 0", 1919 "error %d reading directory #%lu offset 0",
1927 err, inode->i_ino); 1920 err, inode->i_ino);
1928 else 1921 else
1929 ext4_warning(inode->i_sb, __func__, 1922 ext4_warning(inode->i_sb,
1930 "bad directory (dir #%lu) - no data block", 1923 "bad directory (dir #%lu) - no data block",
1931 inode->i_ino); 1924 inode->i_ino);
1932 return 1; 1925 return 1;
@@ -1937,7 +1930,7 @@ static int empty_dir(struct inode *inode)
1937 !le32_to_cpu(de1->inode) || 1930 !le32_to_cpu(de1->inode) ||
1938 strcmp(".", de->name) || 1931 strcmp(".", de->name) ||
1939 strcmp("..", de1->name)) { 1932 strcmp("..", de1->name)) {
1940 ext4_warning(inode->i_sb, "empty_dir", 1933 ext4_warning(inode->i_sb,
1941 "bad directory (dir #%lu) - no `.' or `..'", 1934 "bad directory (dir #%lu) - no `.' or `..'",
1942 inode->i_ino); 1935 inode->i_ino);
1943 brelse(bh); 1936 brelse(bh);
@@ -1955,7 +1948,7 @@ static int empty_dir(struct inode *inode)
1955 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1948 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1956 if (!bh) { 1949 if (!bh) {
1957 if (err) 1950 if (err)
1958 ext4_error(sb, __func__, 1951 ext4_error(sb,
1959 "error %d reading directory" 1952 "error %d reading directory"
1960 " #%lu offset %u", 1953 " #%lu offset %u",
1961 err, inode->i_ino, offset); 1954 err, inode->i_ino, offset);
@@ -2026,11 +2019,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2026 err = ext4_reserve_inode_write(handle, inode, &iloc); 2019 err = ext4_reserve_inode_write(handle, inode, &iloc);
2027 if (err) 2020 if (err)
2028 goto out_unlock; 2021 goto out_unlock;
2022 /*
2023 * Due to previous errors inode may be already a part of on-disk
2024 * orphan list. If so skip on-disk list modification.
2025 */
2026 if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
2027 (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
2028 goto mem_insert;
2029 2029
2030 /* Insert this inode at the head of the on-disk orphan list... */ 2030 /* Insert this inode at the head of the on-disk orphan list... */
2031 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 2031 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
2032 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 2032 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
2033 err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); 2033 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
2034 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 2034 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
2035 if (!err) 2035 if (!err)
2036 err = rc; 2036 err = rc;
@@ -2043,6 +2043,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2043 * 2043 *
2044 * This is safe: on error we're going to ignore the orphan list 2044 * This is safe: on error we're going to ignore the orphan list
2045 * anyway on the next recovery. */ 2045 * anyway on the next recovery. */
2046mem_insert:
2046 if (!err) 2047 if (!err)
2047 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2048 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2048 2049
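
The guard added to ext4_orphan_add() above makes the on-disk insertion idempotent: when NEXT_ORPHAN() already holds a plausible inode number, the inode is taken to already sit on the on-disk orphan list from an earlier failed attempt, and only the in-memory list is fixed up via the new mem_insert: label. A simplified standalone sketch of that guard, with all names hypothetical and the validity test reduced to the same bounds check the hunk uses:

/*
 * Skip the on-disk link update when the node already carries a
 * plausible "next" pointer, but always fix up the in-memory state.
 */
#include <stdio.h>

#define MAX_INO 1000

static unsigned head = 7;       /* like s_last_orphan; inode 7 is
                                 * already an on-disk orphan */

struct node { unsigned ino, next_orphan; int on_mem_list; };

static void orphan_add(struct node *n)
{
        /* Already linked on disk by a previous, partially failed add? */
        if (n->next_orphan && n->next_orphan <= MAX_INO)
                goto mem_insert;

        n->next_orphan = head;          /* splice at the list head */
        head = n->ino;
mem_insert:
        n->on_mem_list = 1;             /* in-memory list always updated */
}

int main(void)
{
        struct node a = { .ino = 12 };

        orphan_add(&a);
        orphan_add(&a);         /* second call must not re-link */
        printf("head=%u next=%u mem=%d\n", head, a.next_orphan,
               a.on_mem_list);
        return 0;
}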
@@ -2102,7 +2103,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2102 if (err) 2103 if (err)
2103 goto out_brelse; 2104 goto out_brelse;
2104 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2105 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2105 err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); 2106 err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
2106 } else { 2107 } else {
2107 struct ext4_iloc iloc2; 2108 struct ext4_iloc iloc2;
2108 struct inode *i_prev = 2109 struct inode *i_prev =
@@ -2142,7 +2143,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2142 2143
2143 /* Initialize quotas before so that eventual writes go in 2144 /* Initialize quotas before so that eventual writes go in
2144 * separate transaction */ 2145 * separate transaction */
2145 vfs_dq_init(dentry->d_inode); 2146 dquot_initialize(dir);
2147 dquot_initialize(dentry->d_inode);
2148
2146 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); 2149 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2147 if (IS_ERR(handle)) 2150 if (IS_ERR(handle))
2148 return PTR_ERR(handle); 2151 return PTR_ERR(handle);
@@ -2169,7 +2172,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2169 if (retval) 2172 if (retval)
2170 goto end_rmdir; 2173 goto end_rmdir;
2171 if (!EXT4_DIR_LINK_EMPTY(inode)) 2174 if (!EXT4_DIR_LINK_EMPTY(inode))
2172 ext4_warning(inode->i_sb, "ext4_rmdir", 2175 ext4_warning(inode->i_sb,
2173 "empty directory has too many links (%d)", 2176 "empty directory has too many links (%d)",
2174 inode->i_nlink); 2177 inode->i_nlink);
2175 inode->i_version++; 2178 inode->i_version++;
@@ -2201,7 +2204,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2201 2204
2202 /* Initialize quotas before so that eventual writes go 2205 /* Initialize quotas before so that eventual writes go
2203 * in separate transaction */ 2206 * in separate transaction */
2204 vfs_dq_init(dentry->d_inode); 2207 dquot_initialize(dir);
2208 dquot_initialize(dentry->d_inode);
2209
2205 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); 2210 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2206 if (IS_ERR(handle)) 2211 if (IS_ERR(handle))
2207 return PTR_ERR(handle); 2212 return PTR_ERR(handle);
@@ -2221,7 +2226,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2221 goto end_unlink; 2226 goto end_unlink;
2222 2227
2223 if (!inode->i_nlink) { 2228 if (!inode->i_nlink) {
2224 ext4_warning(inode->i_sb, "ext4_unlink", 2229 ext4_warning(inode->i_sb,
2225 "Deleting nonexistent file (%lu), %d", 2230 "Deleting nonexistent file (%lu), %d",
2226 inode->i_ino, inode->i_nlink); 2231 inode->i_ino, inode->i_nlink);
2227 inode->i_nlink = 1; 2232 inode->i_nlink = 1;
@@ -2256,10 +2261,12 @@ static int ext4_symlink(struct inode *dir,
2256 if (l > dir->i_sb->s_blocksize) 2261 if (l > dir->i_sb->s_blocksize)
2257 return -ENAMETOOLONG; 2262 return -ENAMETOOLONG;
2258 2263
2264 dquot_initialize(dir);
2265
2259retry: 2266retry:
2260 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2267 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2261 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2268 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2262 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 2269 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2263 if (IS_ERR(handle)) 2270 if (IS_ERR(handle))
2264 return PTR_ERR(handle); 2271 return PTR_ERR(handle);
2265 2272
@@ -2314,6 +2321,8 @@ static int ext4_link(struct dentry *old_dentry,
2314 if (inode->i_nlink >= EXT4_LINK_MAX) 2321 if (inode->i_nlink >= EXT4_LINK_MAX)
2315 return -EMLINK; 2322 return -EMLINK;
2316 2323
2324 dquot_initialize(dir);
2325
2317 /* 2326 /*
2318 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing 2327 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2319 * otherwise has the potential to corrupt the orphan inode list. 2328 * otherwise has the potential to corrupt the orphan inode list.
@@ -2364,12 +2373,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2364 struct ext4_dir_entry_2 *old_de, *new_de; 2373 struct ext4_dir_entry_2 *old_de, *new_de;
2365 int retval, force_da_alloc = 0; 2374 int retval, force_da_alloc = 0;
2366 2375
2376 dquot_initialize(old_dir);
2377 dquot_initialize(new_dir);
2378
2367 old_bh = new_bh = dir_bh = NULL; 2379 old_bh = new_bh = dir_bh = NULL;
2368 2380
2369 /* Initialize quotas before so that eventual writes go 2381 /* Initialize quotas before so that eventual writes go
2370 * in separate transaction */ 2382 * in separate transaction */
2371 if (new_dentry->d_inode) 2383 if (new_dentry->d_inode)
2372 vfs_dq_init(new_dentry->d_inode); 2384 dquot_initialize(new_dentry->d_inode);
2373 handle = ext4_journal_start(old_dir, 2 * 2385 handle = ext4_journal_start(old_dir, 2 *
2374 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + 2386 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2375 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); 2387 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@ -2468,7 +2480,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2468 } 2480 }
2469 } 2481 }
2470 if (retval) { 2482 if (retval) {
2471 ext4_warning(old_dir->i_sb, "ext4_rename", 2483 ext4_warning(old_dir->i_sb,
2472 "Deleting old file (%lu), %d, error=%d", 2484 "Deleting old file (%lu), %d, error=%d",
2473 old_dir->i_ino, old_dir->i_nlink, retval); 2485 old_dir->i_ino, old_dir->i_nlink, retval);
2474 } 2486 }
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3cfc343c41b5..5692c48754a0 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb,
48 48
49 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 49 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
50 if (group != sbi->s_groups_count) 50 if (group != sbi->s_groups_count)
51 ext4_warning(sb, __func__, 51 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
52 "Cannot add at group %u (only %u groups)",
53 input->group, sbi->s_groups_count); 52 input->group, sbi->s_groups_count);
54 else if (offset != 0) 53 else if (offset != 0)
55 ext4_warning(sb, __func__, "Last group not full"); 54 ext4_warning(sb, "Last group not full");
56 else if (input->reserved_blocks > input->blocks_count / 5) 55 else if (input->reserved_blocks > input->blocks_count / 5)
57 ext4_warning(sb, __func__, "Reserved blocks too high (%u)", 56 ext4_warning(sb, "Reserved blocks too high (%u)",
58 input->reserved_blocks); 57 input->reserved_blocks);
59 else if (free_blocks_count < 0) 58 else if (free_blocks_count < 0)
60 ext4_warning(sb, __func__, "Bad blocks count %u", 59 ext4_warning(sb, "Bad blocks count %u",
61 input->blocks_count); 60 input->blocks_count);
62 else if (!(bh = sb_bread(sb, end - 1))) 61 else if (!(bh = sb_bread(sb, end - 1)))
63 ext4_warning(sb, __func__, 62 ext4_warning(sb, "Cannot read last block (%llu)",
64 "Cannot read last block (%llu)",
65 end - 1); 63 end - 1);
66 else if (outside(input->block_bitmap, start, end)) 64 else if (outside(input->block_bitmap, start, end))
67 ext4_warning(sb, __func__, 65 ext4_warning(sb, "Block bitmap not in group (block %llu)",
68 "Block bitmap not in group (block %llu)",
69 (unsigned long long)input->block_bitmap); 66 (unsigned long long)input->block_bitmap);
70 else if (outside(input->inode_bitmap, start, end)) 67 else if (outside(input->inode_bitmap, start, end))
71 ext4_warning(sb, __func__, 68 ext4_warning(sb, "Inode bitmap not in group (block %llu)",
72 "Inode bitmap not in group (block %llu)",
73 (unsigned long long)input->inode_bitmap); 69 (unsigned long long)input->inode_bitmap);
74 else if (outside(input->inode_table, start, end) || 70 else if (outside(input->inode_table, start, end) ||
75 outside(itend - 1, start, end)) 71 outside(itend - 1, start, end))
76 ext4_warning(sb, __func__, 72 ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
77 "Inode table not in group (blocks %llu-%llu)",
78 (unsigned long long)input->inode_table, itend - 1); 73 (unsigned long long)input->inode_table, itend - 1);
79 else if (input->inode_bitmap == input->block_bitmap) 74 else if (input->inode_bitmap == input->block_bitmap)
80 ext4_warning(sb, __func__, 75 ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
81 "Block bitmap same as inode bitmap (%llu)",
82 (unsigned long long)input->block_bitmap); 76 (unsigned long long)input->block_bitmap);
83 else if (inside(input->block_bitmap, input->inode_table, itend)) 77 else if (inside(input->block_bitmap, input->inode_table, itend))
84 ext4_warning(sb, __func__, 78 ext4_warning(sb, "Block bitmap (%llu) in inode table "
85 "Block bitmap (%llu) in inode table (%llu-%llu)", 79 "(%llu-%llu)",
86 (unsigned long long)input->block_bitmap, 80 (unsigned long long)input->block_bitmap,
87 (unsigned long long)input->inode_table, itend - 1); 81 (unsigned long long)input->inode_table, itend - 1);
88 else if (inside(input->inode_bitmap, input->inode_table, itend)) 82 else if (inside(input->inode_bitmap, input->inode_table, itend))
89 ext4_warning(sb, __func__, 83 ext4_warning(sb, "Inode bitmap (%llu) in inode table "
90 "Inode bitmap (%llu) in inode table (%llu-%llu)", 84 "(%llu-%llu)",
91 (unsigned long long)input->inode_bitmap, 85 (unsigned long long)input->inode_bitmap,
92 (unsigned long long)input->inode_table, itend - 1); 86 (unsigned long long)input->inode_table, itend - 1);
93 else if (inside(input->block_bitmap, start, metaend)) 87 else if (inside(input->block_bitmap, start, metaend))
94 ext4_warning(sb, __func__, 88 ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
95 "Block bitmap (%llu) in GDT table"
96 " (%llu-%llu)",
97 (unsigned long long)input->block_bitmap, 89 (unsigned long long)input->block_bitmap,
98 start, metaend - 1); 90 start, metaend - 1);
99 else if (inside(input->inode_bitmap, start, metaend)) 91 else if (inside(input->inode_bitmap, start, metaend))
100 ext4_warning(sb, __func__, 92 ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
101 "Inode bitmap (%llu) in GDT table"
102 " (%llu-%llu)",
103 (unsigned long long)input->inode_bitmap, 93 (unsigned long long)input->inode_bitmap,
104 start, metaend - 1); 94 start, metaend - 1);
105 else if (inside(input->inode_table, start, metaend) || 95 else if (inside(input->inode_table, start, metaend) ||
106 inside(itend - 1, start, metaend)) 96 inside(itend - 1, start, metaend))
107 ext4_warning(sb, __func__, 97 ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
108 "Inode table (%llu-%llu) overlaps" 98 "(%llu-%llu)",
109 "GDT table (%llu-%llu)",
110 (unsigned long long)input->inode_table, 99 (unsigned long long)input->inode_table,
111 itend - 1, start, metaend - 1); 100 itend - 1, start, metaend - 1);
112 else 101 else
@@ -247,7 +236,7 @@ static int setup_new_group_blocks(struct super_block *sb,
247 goto exit_bh; 236 goto exit_bh;
248 237
249 if (IS_ERR(gdb = bclean(handle, sb, block))) { 238 if (IS_ERR(gdb = bclean(handle, sb, block))) {
250 err = PTR_ERR(bh); 239 err = PTR_ERR(gdb);
251 goto exit_bh; 240 goto exit_bh;
252 } 241 }
253 ext4_handle_dirty_metadata(handle, NULL, gdb); 242 ext4_handle_dirty_metadata(handle, NULL, gdb);
@@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb,
364 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { 353 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
365 if (le32_to_cpu(*p++) != 354 if (le32_to_cpu(*p++) !=
366 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ 355 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
367 ext4_warning(sb, __func__, 356 ext4_warning(sb, "reserved GDT %llu"
368 "reserved GDT %llu"
369 " missing grp %d (%llu)", 357 " missing grp %d (%llu)",
370 blk, grp, 358 blk, grp,
371 grp * 359 grp *
@@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
420 */ 408 */
421 if (EXT4_SB(sb)->s_sbh->b_blocknr != 409 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
422 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { 410 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
423 ext4_warning(sb, __func__, 411 ext4_warning(sb, "won't resize using backup superblock at %llu",
424 "won't resize using backup superblock at %llu",
425 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); 412 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
426 return -EPERM; 413 return -EPERM;
427 } 414 }
@@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
444 431
445 data = (__le32 *)dind->b_data; 432 data = (__le32 *)dind->b_data;
446 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { 433 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
447 ext4_warning(sb, __func__, 434 ext4_warning(sb, "new group %u GDT block %llu not reserved",
448 "new group %u GDT block %llu not reserved",
449 input->group, gdblock); 435 input->group, gdblock);
450 err = -EINVAL; 436 err = -EINVAL;
451 goto exit_dind; 437 goto exit_dind;
@@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
468 GFP_NOFS); 454 GFP_NOFS);
469 if (!n_group_desc) { 455 if (!n_group_desc) {
470 err = -ENOMEM; 456 err = -ENOMEM;
471 ext4_warning(sb, __func__, 457 ext4_warning(sb,
472 "not enough memory for %lu groups", gdb_num + 1); 458 "not enough memory for %lu groups", gdb_num + 1);
473 goto exit_inode; 459 goto exit_inode;
474 } 460 }
@@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
567 /* Get each reserved primary GDT block and verify it holds backups */ 553 /* Get each reserved primary GDT block and verify it holds backups */
568 for (res = 0; res < reserved_gdb; res++, blk++) { 554 for (res = 0; res < reserved_gdb; res++, blk++) {
569 if (le32_to_cpu(*data) != blk) { 555 if (le32_to_cpu(*data) != blk) {
570 ext4_warning(sb, __func__, 556 ext4_warning(sb, "reserved block %llu"
571 "reserved block %llu"
572 " not at offset %ld", 557 " not at offset %ld",
573 blk, 558 blk,
574 (long)(data - (__le32 *)dind->b_data)); 559 (long)(data - (__le32 *)dind->b_data));
@@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb,
713 */ 698 */
714exit_err: 699exit_err:
715 if (err) { 700 if (err) {
716 ext4_warning(sb, __func__, 701 ext4_warning(sb, "can't update backup for group %u (err %d), "
717 "can't update backup for group %u (err %d), "
718 "forcing fsck on next reboot", group, err); 702 "forcing fsck on next reboot", group, err);
719 sbi->s_mount_state &= ~EXT4_VALID_FS; 703 sbi->s_mount_state &= ~EXT4_VALID_FS;
720 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 704 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
753 737
754 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 738 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
755 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { 739 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
756 ext4_warning(sb, __func__, 740 ext4_warning(sb, "Can't resize non-sparse filesystem further");
757 "Can't resize non-sparse filesystem further");
758 return -EPERM; 741 return -EPERM;
759 } 742 }
760 743
761 if (ext4_blocks_count(es) + input->blocks_count < 744 if (ext4_blocks_count(es) + input->blocks_count <
762 ext4_blocks_count(es)) { 745 ext4_blocks_count(es)) {
763 ext4_warning(sb, __func__, "blocks_count overflow"); 746 ext4_warning(sb, "blocks_count overflow");
764 return -EINVAL; 747 return -EINVAL;
765 } 748 }
766 749
767 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < 750 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
768 le32_to_cpu(es->s_inodes_count)) { 751 le32_to_cpu(es->s_inodes_count)) {
769 ext4_warning(sb, __func__, "inodes_count overflow"); 752 ext4_warning(sb, "inodes_count overflow");
770 return -EINVAL; 753 return -EINVAL;
771 } 754 }
772 755
@@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
774 if (!EXT4_HAS_COMPAT_FEATURE(sb, 757 if (!EXT4_HAS_COMPAT_FEATURE(sb,
775 EXT4_FEATURE_COMPAT_RESIZE_INODE) 758 EXT4_FEATURE_COMPAT_RESIZE_INODE)
776 || !le16_to_cpu(es->s_reserved_gdt_blocks)) { 759 || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
777 ext4_warning(sb, __func__, 760 ext4_warning(sb,
778 "No reserved GDT blocks, can't resize"); 761 "No reserved GDT blocks, can't resize");
779 return -EPERM; 762 return -EPERM;
780 } 763 }
781 inode = ext4_iget(sb, EXT4_RESIZE_INO); 764 inode = ext4_iget(sb, EXT4_RESIZE_INO);
782 if (IS_ERR(inode)) { 765 if (IS_ERR(inode)) {
783 ext4_warning(sb, __func__, 766 ext4_warning(sb, "Error opening resize inode");
784 "Error opening resize inode");
785 return PTR_ERR(inode); 767 return PTR_ERR(inode);
786 } 768 }
787 } 769 }
@@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
810 792
811 mutex_lock(&sbi->s_resize_lock); 793 mutex_lock(&sbi->s_resize_lock);
812 if (input->group != sbi->s_groups_count) { 794 if (input->group != sbi->s_groups_count) {
813 ext4_warning(sb, __func__, 795 ext4_warning(sb, "multiple resizers run on filesystem!");
814 "multiple resizers run on filesystem!");
815 err = -EBUSY; 796 err = -EBUSY;
816 goto exit_journal; 797 goto exit_journal;
817 } 798 }
@@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
997 " too large to resize to %llu blocks safely\n", 978 " too large to resize to %llu blocks safely\n",
998 sb->s_id, n_blocks_count); 979 sb->s_id, n_blocks_count);
999 if (sizeof(sector_t) < 8) 980 if (sizeof(sector_t) < 8)
1000 ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled"); 981 ext4_warning(sb, "CONFIG_LBDAF not enabled");
1001 return -EINVAL; 982 return -EINVAL;
1002 } 983 }
1003 984
1004 if (n_blocks_count < o_blocks_count) { 985 if (n_blocks_count < o_blocks_count) {
1005 ext4_warning(sb, __func__, 986 ext4_warning(sb, "can't shrink FS - resize aborted");
1006 "can't shrink FS - resize aborted");
1007 return -EBUSY; 987 return -EBUSY;
1008 } 988 }
1009 989
@@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1011 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); 991 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
1012 992
1013 if (last == 0) { 993 if (last == 0) {
1014 ext4_warning(sb, __func__, 994 ext4_warning(sb, "need to use ext2online to resize further");
1015 "need to use ext2online to resize further");
1016 return -EPERM; 995 return -EPERM;
1017 } 996 }
1018 997
1019 add = EXT4_BLOCKS_PER_GROUP(sb) - last; 998 add = EXT4_BLOCKS_PER_GROUP(sb) - last;
1020 999
1021 if (o_blocks_count + add < o_blocks_count) { 1000 if (o_blocks_count + add < o_blocks_count) {
1022 ext4_warning(sb, __func__, "blocks_count overflow"); 1001 ext4_warning(sb, "blocks_count overflow");
1023 return -EINVAL; 1002 return -EINVAL;
1024 } 1003 }
1025 1004
@@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1027 add = n_blocks_count - o_blocks_count; 1006 add = n_blocks_count - o_blocks_count;
1028 1007
1029 if (o_blocks_count + add < n_blocks_count) 1008 if (o_blocks_count + add < n_blocks_count)
1030 ext4_warning(sb, __func__, 1009 ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
1031 "will only finish group (%llu"
1032 " blocks, %u new)",
1033 o_blocks_count + add, add); 1010 o_blocks_count + add, add);
1034 1011
1035 /* See if the device is actually as big as what was requested */ 1012 /* See if the device is actually as big as what was requested */
1036 bh = sb_bread(sb, o_blocks_count + add - 1); 1013 bh = sb_bread(sb, o_blocks_count + add - 1);
1037 if (!bh) { 1014 if (!bh) {
1038 ext4_warning(sb, __func__, 1015 ext4_warning(sb, "can't read last block, resize aborted");
1039 "can't read last block, resize aborted");
1040 return -ENOSPC; 1016 return -ENOSPC;
1041 } 1017 }
1042 brelse(bh); 1018 brelse(bh);
@@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1047 handle = ext4_journal_start_sb(sb, 3); 1023 handle = ext4_journal_start_sb(sb, 3);
1048 if (IS_ERR(handle)) { 1024 if (IS_ERR(handle)) {
1049 err = PTR_ERR(handle); 1025 err = PTR_ERR(handle);
1050 ext4_warning(sb, __func__, "error %d on journal start", err); 1026 ext4_warning(sb, "error %d on journal start", err);
1051 goto exit_put; 1027 goto exit_put;
1052 } 1028 }
1053 1029
1054 mutex_lock(&EXT4_SB(sb)->s_resize_lock); 1030 mutex_lock(&EXT4_SB(sb)->s_resize_lock);
1055 if (o_blocks_count != ext4_blocks_count(es)) { 1031 if (o_blocks_count != ext4_blocks_count(es)) {
1056 ext4_warning(sb, __func__, 1032 ext4_warning(sb, "multiple resizers run on filesystem!");
1057 "multiple resizers run on filesystem!");
1058 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1033 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1059 ext4_journal_stop(handle); 1034 ext4_journal_stop(handle);
1060 err = -EBUSY; 1035 err = -EBUSY;
@@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1063 1038
1064 if ((err = ext4_journal_get_write_access(handle, 1039 if ((err = ext4_journal_get_write_access(handle,
1065 EXT4_SB(sb)->s_sbh))) { 1040 EXT4_SB(sb)->s_sbh))) {
1066 ext4_warning(sb, __func__, 1041 ext4_warning(sb, "error %d on journal write access", err);
1067 "error %d on journal write access", err);
1068 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1042 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1069 ext4_journal_stop(handle); 1043 ext4_journal_stop(handle);
1070 goto exit_put; 1044 goto exit_put;
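
All of the ext4_warning() call sites above lose their explicit __func__ argument, and the super.c diff below renames the implementations to __ext4_error()/__ext4_warning(). The piece that ties the two together, presumably a wrapper macro in ext4.h that re-injects the caller's function name, is not part of this diff; a plausible sketch of the pattern:

/*
 * A variadic macro injects __func__ so call sites stay short.
 * my_warning/__my_warning are hypothetical stand-ins for the
 * ext4_warning/__ext4_warning pair.
 */
#include <stdarg.h>
#include <stdio.h>

static void __my_warning(const char *function, const char *fmt, ...)
{
        va_list args;

        va_start(args, fmt);
        printf("warning: %s: ", function);      /* prefix with caller */
        vprintf(fmt, args);
        printf("\n");
        va_end(args);
}

#define my_warning(fmt, ...) \
        __my_warning(__func__, fmt, ##__VA_ARGS__)

int main(void)
{
        my_warning("blocks_count overflow (%d)", 42);
        return 0;
}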
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d4ca92aab514..e14d22c170d5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 68static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 69static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 70static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt);
71 73
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE,
77 .name = "ext3",
78 .get_sb = ext4_get_sb,
79 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV,
81};
82#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
83#else
84#define IS_EXT3_SB(sb) (0)
85#endif
72 86
73ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 87ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
74 struct ext4_group_desc *bg) 88 struct ext4_group_desc *bg)
@@ -302,7 +316,7 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
302 * write out the superblock safely. 316 * write out the superblock safely.
303 * 317 *
304 * We'll just use the jbd2_journal_abort() error code to record an error in 318 * We'll just use the jbd2_journal_abort() error code to record an error in
305 * the journal instead. On recovery, the journal will compain about 319 * the journal instead. On recovery, the journal will complain about
306 * that error until we've noted it down and cleared it. 320 * that error until we've noted it down and cleared it.
307 */ 321 */
308 322
@@ -333,7 +347,7 @@ static void ext4_handle_error(struct super_block *sb)
333 sb->s_id); 347 sb->s_id);
334} 348}
335 349
336void ext4_error(struct super_block *sb, const char *function, 350void __ext4_error(struct super_block *sb, const char *function,
337 const char *fmt, ...) 351 const char *fmt, ...)
338{ 352{
339 va_list args; 353 va_list args;
@@ -347,6 +361,42 @@ void ext4_error(struct super_block *sb, const char *function,
347 ext4_handle_error(sb); 361 ext4_handle_error(sb);
348} 362}
349 363
364void ext4_error_inode(const char *function, struct inode *inode,
365 const char *fmt, ...)
366{
367 va_list args;
368
369 va_start(args, fmt);
370 printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
371 inode->i_sb->s_id, function, inode->i_ino, current->comm);
372 vprintk(fmt, args);
373 printk("\n");
374 va_end(args);
375
376 ext4_handle_error(inode->i_sb);
377}
378
379void ext4_error_file(const char *function, struct file *file,
380 const char *fmt, ...)
381{
382 va_list args;
383 struct inode *inode = file->f_dentry->d_inode;
384 char pathname[80], *path;
385
386 va_start(args, fmt);
387 path = d_path(&(file->f_path), pathname, sizeof(pathname));
388 if (!path)
389 path = "(unknown)";
390 printk(KERN_CRIT
391 "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
392 inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
393 vprintk(fmt, args);
394 printk("\n");
395 va_end(args);
396
397 ext4_handle_error(inode->i_sb);
398}
399
350static const char *ext4_decode_error(struct super_block *sb, int errno, 400static const char *ext4_decode_error(struct super_block *sb, int errno,
351 char nbuf[16]) 401 char nbuf[16])
352{ 402{
@@ -450,7 +500,7 @@ void ext4_msg (struct super_block * sb, const char *prefix,
450 va_end(args); 500 va_end(args);
451} 501}
452 502
453void ext4_warning(struct super_block *sb, const char *function, 503void __ext4_warning(struct super_block *sb, const char *function,
454 const char *fmt, ...) 504 const char *fmt, ...)
455{ 505{
456 va_list args; 506 va_list args;
@@ -507,7 +557,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
507 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) 557 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
508 return; 558 return;
509 559
510 ext4_warning(sb, __func__, 560 ext4_warning(sb,
511 "updating to rev %d because of new feature flag, " 561 "updating to rev %d because of new feature flag, "
512 "running e2fsck is recommended", 562 "running e2fsck is recommended",
513 EXT4_DYNAMIC_REV); 563 EXT4_DYNAMIC_REV);
@@ -603,10 +653,6 @@ static void ext4_put_super(struct super_block *sb)
603 if (sb->s_dirt) 653 if (sb->s_dirt)
604 ext4_commit_super(sb, 1); 654 ext4_commit_super(sb, 1);
605 655
606 ext4_release_system_zone(sb);
607 ext4_mb_release(sb);
608 ext4_ext_release(sb);
609 ext4_xattr_put_super(sb);
610 if (sbi->s_journal) { 656 if (sbi->s_journal) {
611 err = jbd2_journal_destroy(sbi->s_journal); 657 err = jbd2_journal_destroy(sbi->s_journal);
612 sbi->s_journal = NULL; 658 sbi->s_journal = NULL;
@@ -614,6 +660,12 @@ static void ext4_put_super(struct super_block *sb)
614 ext4_abort(sb, __func__, 660 ext4_abort(sb, __func__,
615 "Couldn't clean up the journal"); 661 "Couldn't clean up the journal");
616 } 662 }
663
664 ext4_release_system_zone(sb);
665 ext4_mb_release(sb);
666 ext4_ext_release(sb);
667 ext4_xattr_put_super(sb);
668
617 if (!(sb->s_flags & MS_RDONLY)) { 669 if (!(sb->s_flags & MS_RDONLY)) {
618 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 670 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
619 es->s_state = cpu_to_le16(sbi->s_mount_state); 671 es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -700,10 +752,17 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
700 ei->i_reserved_data_blocks = 0; 752 ei->i_reserved_data_blocks = 0;
701 ei->i_reserved_meta_blocks = 0; 753 ei->i_reserved_meta_blocks = 0;
702 ei->i_allocated_meta_blocks = 0; 754 ei->i_allocated_meta_blocks = 0;
755 ei->i_da_metadata_calc_len = 0;
703 ei->i_delalloc_reserved_flag = 0; 756 ei->i_delalloc_reserved_flag = 0;
704 spin_lock_init(&(ei->i_block_reservation_lock)); 757 spin_lock_init(&(ei->i_block_reservation_lock));
705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); 758#ifdef CONFIG_QUOTA
759 ei->i_reserved_quota = 0;
760#endif
761 INIT_LIST_HEAD(&ei->i_completed_io_list);
762 spin_lock_init(&ei->i_completed_io_lock);
706 ei->cur_aio_dio = NULL; 763 ei->cur_aio_dio = NULL;
764 ei->i_sync_tid = 0;
765 ei->i_datasync_tid = 0;
707 766
708 return &ei->vfs_inode; 767 return &ei->vfs_inode;
709} 768}
@@ -753,6 +812,7 @@ static void destroy_inodecache(void)
753 812
754static void ext4_clear_inode(struct inode *inode) 813static void ext4_clear_inode(struct inode *inode)
755{ 814{
815 dquot_drop(inode);
756 ext4_discard_preallocations(inode); 816 ext4_discard_preallocations(inode);
757 if (EXT4_JOURNAL(inode)) 817 if (EXT4_JOURNAL(inode))
758 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 818 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
@@ -765,9 +825,22 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
765#if defined(CONFIG_QUOTA) 825#if defined(CONFIG_QUOTA)
766 struct ext4_sb_info *sbi = EXT4_SB(sb); 826 struct ext4_sb_info *sbi = EXT4_SB(sb);
767 827
768 if (sbi->s_jquota_fmt) 828 if (sbi->s_jquota_fmt) {
769 seq_printf(seq, ",jqfmt=%s", 829 char *fmtname = "";
770 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0"); 830
831 switch (sbi->s_jquota_fmt) {
832 case QFMT_VFS_OLD:
833 fmtname = "vfsold";
834 break;
835 case QFMT_VFS_V0:
836 fmtname = "vfsv0";
837 break;
838 case QFMT_VFS_V1:
839 fmtname = "vfsv1";
840 break;
841 }
842 seq_printf(seq, ",jqfmt=%s", fmtname);
843 }
771 844
772 if (sbi->s_qf_names[USRQUOTA]) 845 if (sbi->s_qf_names[USRQUOTA])
773 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); 846 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -775,10 +848,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
775 if (sbi->s_qf_names[GRPQUOTA]) 848 if (sbi->s_qf_names[GRPQUOTA])
776 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 849 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
777 850
778 if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) 851 if (test_opt(sb, USRQUOTA))
779 seq_puts(seq, ",usrquota"); 852 seq_puts(seq, ",usrquota");
780 853
781 if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) 854 if (test_opt(sb, GRPQUOTA))
782 seq_puts(seq, ",grpquota"); 855 seq_puts(seq, ",grpquota");
783#endif 856#endif
784} 857}
@@ -899,6 +972,15 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
899 if (test_opt(sb, NO_AUTO_DA_ALLOC)) 972 if (test_opt(sb, NO_AUTO_DA_ALLOC))
900 seq_puts(seq, ",noauto_da_alloc"); 973 seq_puts(seq, ",noauto_da_alloc");
901 974
975 if (test_opt(sb, DISCARD))
976 seq_puts(seq, ",discard");
977
978 if (test_opt(sb, NOLOAD))
979 seq_puts(seq, ",norecovery");
980
981 if (test_opt(sb, DIOREAD_NOLOCK))
982 seq_puts(seq, ",dioread_nolock");
983
902 ext4_show_quota_options(seq, sb); 984 ext4_show_quota_options(seq, sb);
903 985
904 return 0; 986 return 0;
@@ -985,17 +1067,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
985 const char *data, size_t len, loff_t off); 1067 const char *data, size_t len, loff_t off);
986 1068
987static const struct dquot_operations ext4_quota_operations = { 1069static const struct dquot_operations ext4_quota_operations = {
988 .initialize = dquot_initialize, 1070#ifdef CONFIG_QUOTA
989 .drop = dquot_drop,
990 .alloc_space = dquot_alloc_space,
991 .reserve_space = dquot_reserve_space,
992 .claim_space = dquot_claim_space,
993 .release_rsv = dquot_release_reserved_space,
994 .get_reserved_space = ext4_get_reserved_space, 1071 .get_reserved_space = ext4_get_reserved_space,
995 .alloc_inode = dquot_alloc_inode, 1072#endif
996 .free_space = dquot_free_space,
997 .free_inode = dquot_free_inode,
998 .transfer = dquot_transfer,
999 .write_dquot = ext4_write_dquot, 1073 .write_dquot = ext4_write_dquot,
1000 .acquire_dquot = ext4_acquire_dquot, 1074 .acquire_dquot = ext4_acquire_dquot,
1001 .release_dquot = ext4_release_dquot, 1075 .release_dquot = ext4_release_dquot,
@@ -1074,12 +1148,14 @@ enum {
1074 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1148 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1075 Opt_data_err_abort, Opt_data_err_ignore, 1149 Opt_data_err_abort, Opt_data_err_ignore,
1076 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1150 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1077 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1151 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1078 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, 1152 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
1079 Opt_usrquota, Opt_grpquota, Opt_i_version, 1153 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
1080 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1154 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1081 Opt_block_validity, Opt_noblock_validity, 1155 Opt_block_validity, Opt_noblock_validity,
1082 Opt_inode_readahead_blks, Opt_journal_ioprio 1156 Opt_inode_readahead_blks, Opt_journal_ioprio,
1157 Opt_dioread_nolock, Opt_dioread_lock,
1158 Opt_discard, Opt_nodiscard,
1083}; 1159};
1084 1160
1085static const match_table_t tokens = { 1161static const match_table_t tokens = {
@@ -1104,6 +1180,7 @@ static const match_table_t tokens = {
1104 {Opt_acl, "acl"}, 1180 {Opt_acl, "acl"},
1105 {Opt_noacl, "noacl"}, 1181 {Opt_noacl, "noacl"},
1106 {Opt_noload, "noload"}, 1182 {Opt_noload, "noload"},
1183 {Opt_noload, "norecovery"},
1107 {Opt_nobh, "nobh"}, 1184 {Opt_nobh, "nobh"},
1108 {Opt_bh, "bh"}, 1185 {Opt_bh, "bh"},
1109 {Opt_commit, "commit=%u"}, 1186 {Opt_commit, "commit=%u"},
@@ -1125,6 +1202,7 @@ static const match_table_t tokens = {
1125 {Opt_grpjquota, "grpjquota=%s"}, 1202 {Opt_grpjquota, "grpjquota=%s"},
1126 {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, 1203 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
1127 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, 1204 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
1205 {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
1128 {Opt_grpquota, "grpquota"}, 1206 {Opt_grpquota, "grpquota"},
1129 {Opt_noquota, "noquota"}, 1207 {Opt_noquota, "noquota"},
1130 {Opt_quota, "quota"}, 1208 {Opt_quota, "quota"},
@@ -1144,6 +1222,10 @@ static const match_table_t tokens = {
1144 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1222 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1145 {Opt_auto_da_alloc, "auto_da_alloc"}, 1223 {Opt_auto_da_alloc, "auto_da_alloc"},
1146 {Opt_noauto_da_alloc, "noauto_da_alloc"}, 1224 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1225 {Opt_dioread_nolock, "dioread_nolock"},
1226 {Opt_dioread_lock, "dioread_lock"},
1227 {Opt_discard, "discard"},
1228 {Opt_nodiscard, "nodiscard"},
1147 {Opt_err, NULL}, 1229 {Opt_err, NULL},
1148}; 1230};
1149 1231
@@ -1171,6 +1253,66 @@ static ext4_fsblk_t get_sb_block(void **data)
1171} 1253}
1172 1254
1173#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) 1255#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1256static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
1257 "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1258
1259#ifdef CONFIG_QUOTA
1260static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1261{
1262 struct ext4_sb_info *sbi = EXT4_SB(sb);
1263 char *qname;
1264
1265 if (sb_any_quota_loaded(sb) &&
1266 !sbi->s_qf_names[qtype]) {
1267 ext4_msg(sb, KERN_ERR,
1268 "Cannot change journaled "
1269 "quota options when quota turned on");
1270 return 0;
1271 }
1272 qname = match_strdup(args);
1273 if (!qname) {
1274 ext4_msg(sb, KERN_ERR,
1275 "Not enough memory for storing quotafile name");
1276 return 0;
1277 }
1278 if (sbi->s_qf_names[qtype] &&
1279 strcmp(sbi->s_qf_names[qtype], qname)) {
1280 ext4_msg(sb, KERN_ERR,
1281 "%s quota file already specified", QTYPE2NAME(qtype));
1282 kfree(qname);
1283 return 0;
1284 }
1285 sbi->s_qf_names[qtype] = qname;
1286 if (strchr(sbi->s_qf_names[qtype], '/')) {
1287 ext4_msg(sb, KERN_ERR,
1288 "quotafile must be on filesystem root");
1289 kfree(sbi->s_qf_names[qtype]);
1290 sbi->s_qf_names[qtype] = NULL;
1291 return 0;
1292 }
1293 set_opt(sbi->s_mount_opt, QUOTA);
1294 return 1;
1295}
1296
1297static int clear_qf_name(struct super_block *sb, int qtype)
1298{
1299
1300 struct ext4_sb_info *sbi = EXT4_SB(sb);
1301
1302 if (sb_any_quota_loaded(sb) &&
1303 sbi->s_qf_names[qtype]) {
1304 ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1305 " when quota turned on");
1306 return 0;
1307 }
1308 /*
1309 * The space will be released later when all options are confirmed
1310 * to be correct
1311 */
1312 sbi->s_qf_names[qtype] = NULL;
1313 return 1;
1314}
1315#endif
1174 1316
1175static int parse_options(char *options, struct super_block *sb, 1317static int parse_options(char *options, struct super_block *sb,
1176 unsigned long *journal_devnum, 1318 unsigned long *journal_devnum,
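
The parse_options() additions above replace the shared set_qf_name:/clear_qf_name: goto targets with helpers that return 1 on success and 0 on failure, so each case in the option switch (shown further below) collapses to a call plus an early return 0. A minimal sketch of that shape; set_name(), parse_one() and the option spellings here are hypothetical stand-ins:

/*
 * Shared goto targets become small predicates; the dispatcher
 * stays flat and every failure propagates as a 0 return.
 */
#include <stdio.h>
#include <string.h>

static const char *qf_name[2];          /* [0]=usr, [1]=grp */

static int set_name(int qtype, const char *arg)
{
        if (qf_name[qtype] && strcmp(qf_name[qtype], arg)) {
                fprintf(stderr, "quota file already specified\n");
                return 0;               /* error, like set_qf_name() */
        }
        qf_name[qtype] = arg;
        return 1;
}

static int parse_one(const char *opt)
{
        if (!strncmp(opt, "usrjquota=", 10))
                return set_name(0, opt + 10);
        if (!strncmp(opt, "grpjquota=", 10))
                return set_name(1, opt + 10);
        return 1;                       /* ignore everything else */
}

int main(void)
{
        if (!parse_one("usrjquota=aquota.user"))
                return 1;
        printf("usr quota file: %s\n", qf_name[0]);
        return 0;
}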
@@ -1183,8 +1325,7 @@ static int parse_options(char *options, struct super_block *sb,
1183 int data_opt = 0; 1325 int data_opt = 0;
1184 int option; 1326 int option;
1185#ifdef CONFIG_QUOTA 1327#ifdef CONFIG_QUOTA
1186 int qtype, qfmt; 1328 int qfmt;
1187 char *qname;
1188#endif 1329#endif
1189 1330
1190 if (!options) 1331 if (!options)
@@ -1195,19 +1336,31 @@ static int parse_options(char *options, struct super_block *sb,
1195 if (!*p) 1336 if (!*p)
1196 continue; 1337 continue;
1197 1338
1339 /*
1340 * Initialize args struct so we know whether arg was
1341 * found; some options take optional arguments.
1342 */
1343 args[0].to = args[0].from = 0;
1198 token = match_token(p, tokens, args); 1344 token = match_token(p, tokens, args);
1199 switch (token) { 1345 switch (token) {
1200 case Opt_bsd_df: 1346 case Opt_bsd_df:
1347 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1201 clear_opt(sbi->s_mount_opt, MINIX_DF); 1348 clear_opt(sbi->s_mount_opt, MINIX_DF);
1202 break; 1349 break;
1203 case Opt_minix_df: 1350 case Opt_minix_df:
1351 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1204 set_opt(sbi->s_mount_opt, MINIX_DF); 1352 set_opt(sbi->s_mount_opt, MINIX_DF);
1353
1205 break; 1354 break;
1206 case Opt_grpid: 1355 case Opt_grpid:
1356 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1207 set_opt(sbi->s_mount_opt, GRPID); 1357 set_opt(sbi->s_mount_opt, GRPID);
1358
1208 break; 1359 break;
1209 case Opt_nogrpid: 1360 case Opt_nogrpid:
1361 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1210 clear_opt(sbi->s_mount_opt, GRPID); 1362 clear_opt(sbi->s_mount_opt, GRPID);
1363
1211 break; 1364 break;
1212 case Opt_resuid: 1365 case Opt_resuid:
1213 if (match_int(&args[0], &option)) 1366 if (match_int(&args[0], &option))
@@ -1344,14 +1497,13 @@ static int parse_options(char *options, struct super_block *sb,
1344 data_opt = EXT4_MOUNT_WRITEBACK_DATA; 1497 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1345 datacheck: 1498 datacheck:
1346 if (is_remount) { 1499 if (is_remount) {
1347 if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) 1500 if (test_opt(sb, DATA_FLAGS) != data_opt) {
1348 != data_opt) {
1349 ext4_msg(sb, KERN_ERR, 1501 ext4_msg(sb, KERN_ERR,
1350 "Cannot change data mode on remount"); 1502 "Cannot change data mode on remount");
1351 return 0; 1503 return 0;
1352 } 1504 }
1353 } else { 1505 } else {
1354 sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS; 1506 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
1355 sbi->s_mount_opt |= data_opt; 1507 sbi->s_mount_opt |= data_opt;
1356 } 1508 }
1357 break; 1509 break;
@@ -1363,68 +1515,30 @@ static int parse_options(char *options, struct super_block *sb,
1363 break; 1515 break;
1364#ifdef CONFIG_QUOTA 1516#ifdef CONFIG_QUOTA
1365 case Opt_usrjquota: 1517 case Opt_usrjquota:
1366 qtype = USRQUOTA; 1518 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1367 goto set_qf_name;
1368 case Opt_grpjquota:
1369 qtype = GRPQUOTA;
1370set_qf_name:
1371 if (sb_any_quota_loaded(sb) &&
1372 !sbi->s_qf_names[qtype]) {
1373 ext4_msg(sb, KERN_ERR,
1374 "Cannot change journaled "
1375 "quota options when quota turned on");
1376 return 0;
1377 }
1378 qname = match_strdup(&args[0]);
1379 if (!qname) {
1380 ext4_msg(sb, KERN_ERR,
1381 "Not enough memory for "
1382 "storing quotafile name");
1383 return 0; 1519 return 0;
1384 } 1520 break;
1385 if (sbi->s_qf_names[qtype] && 1521 case Opt_grpjquota:
1386 strcmp(sbi->s_qf_names[qtype], qname)) { 1522 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1387 ext4_msg(sb, KERN_ERR,
1388 "%s quota file already "
1389 "specified", QTYPE2NAME(qtype));
1390 kfree(qname);
1391 return 0;
1392 }
1393 sbi->s_qf_names[qtype] = qname;
1394 if (strchr(sbi->s_qf_names[qtype], '/')) {
1395 ext4_msg(sb, KERN_ERR,
1396 "quotafile must be on "
1397 "filesystem root");
1398 kfree(sbi->s_qf_names[qtype]);
1399 sbi->s_qf_names[qtype] = NULL;
1400 return 0; 1523 return 0;
1401 }
1402 set_opt(sbi->s_mount_opt, QUOTA);
1403 break; 1524 break;
1404 case Opt_offusrjquota: 1525 case Opt_offusrjquota:
1405 qtype = USRQUOTA; 1526 if (!clear_qf_name(sb, USRQUOTA))
1406 goto clear_qf_name; 1527 return 0;
1528 break;
1407 case Opt_offgrpjquota: 1529 case Opt_offgrpjquota:
1408 qtype = GRPQUOTA; 1530 if (!clear_qf_name(sb, GRPQUOTA))
1409clear_qf_name:
1410 if (sb_any_quota_loaded(sb) &&
1411 sbi->s_qf_names[qtype]) {
1412 ext4_msg(sb, KERN_ERR, "Cannot change "
1413 "journaled quota options when "
1414 "quota turned on");
1415 return 0; 1531 return 0;
1416 }
1417 /*
1418 * The space will be released later when all options
1419 * are confirmed to be correct
1420 */
1421 sbi->s_qf_names[qtype] = NULL;
1422 break; 1532 break;
1533
1423 case Opt_jqfmt_vfsold: 1534 case Opt_jqfmt_vfsold:
1424 qfmt = QFMT_VFS_OLD; 1535 qfmt = QFMT_VFS_OLD;
1425 goto set_qf_format; 1536 goto set_qf_format;
1426 case Opt_jqfmt_vfsv0: 1537 case Opt_jqfmt_vfsv0:
1427 qfmt = QFMT_VFS_V0; 1538 qfmt = QFMT_VFS_V0;
1539 goto set_qf_format;
1540 case Opt_jqfmt_vfsv1:
1541 qfmt = QFMT_VFS_V1;
1428set_qf_format: 1542set_qf_format:
1429 if (sb_any_quota_loaded(sb) && 1543 if (sb_any_quota_loaded(sb) &&
1430 sbi->s_jquota_fmt != qfmt) { 1544 sbi->s_jquota_fmt != qfmt) {
@@ -1467,6 +1581,7 @@ set_qf_format:
1467 case Opt_offgrpjquota: 1581 case Opt_offgrpjquota:
1468 case Opt_jqfmt_vfsold: 1582 case Opt_jqfmt_vfsold:
1469 case Opt_jqfmt_vfsv0: 1583 case Opt_jqfmt_vfsv0:
1584 case Opt_jqfmt_vfsv1:
1470 ext4_msg(sb, KERN_ERR, 1585 ext4_msg(sb, KERN_ERR,
1471 "journaled quota options not supported"); 1586 "journaled quota options not supported");
1472 break; 1587 break;
@@ -1480,10 +1595,11 @@ set_qf_format:
1480 clear_opt(sbi->s_mount_opt, BARRIER); 1595 clear_opt(sbi->s_mount_opt, BARRIER);
1481 break; 1596 break;
1482 case Opt_barrier: 1597 case Opt_barrier:
1483 if (match_int(&args[0], &option)) { 1598 if (args[0].from) {
1484 set_opt(sbi->s_mount_opt, BARRIER); 1599 if (match_int(&args[0], &option))
1485 break; 1600 return 0;
1486 } 1601 } else
1602 option = 1; /* No argument, default to 1 */
1487 if (option) 1603 if (option)
1488 set_opt(sbi->s_mount_opt, BARRIER); 1604 set_opt(sbi->s_mount_opt, BARRIER);
1489 else 1605 else
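
With args[0].from zeroed before match_token() (see the earlier hunk), a bare "barrier" can now be told apart from "barrier=<n>", and a malformed value fails the mount instead of being silently treated as "barrier on". A small userspace model of that parsing rule (plain strtol in place of match_int; names are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Returns 1 and sets *barrier on success, 0 on a malformed argument. */
static int parse_barrier(const char *opt, int *barrier)
{
    const char *eq = strchr(opt, '=');
    long val = 1;                      /* no "=<n>": default to 1 */

    if (eq) {
        char *end;
        val = strtol(eq + 1, &end, 10);
        if (end == eq + 1 || *end)     /* empty value or trailing junk: reject */
            return 0;
    }
    *barrier = (val != 0);
    return 1;
}

int main(void)
{
    int b;

    printf("%d %d\n", parse_barrier("barrier", &b), b);    /* 1 1 */
    printf("%d %d\n", parse_barrier("barrier=0", &b), b);  /* 1 0 */
    printf("%d\n", parse_barrier("barrier=junk", &b));     /* 0 */
    return 0;
}
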
@@ -1556,15 +1672,28 @@ set_qf_format:
1556 set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1672

1557 break; 1673 break;
1558 case Opt_auto_da_alloc: 1674 case Opt_auto_da_alloc:
1559 if (match_int(&args[0], &option)) { 1675 if (args[0].from) {
1560 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1676 if (match_int(&args[0], &option))
1561 break; 1677 return 0;
1562 } 1678 } else
1679 option = 1; /* No argument, default to 1 */
1563 if (option) 1680 if (option)
1564 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1681 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1565 else 1682 else
1566 set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1683
1567 break; 1684 break;
1685 case Opt_discard:
1686 set_opt(sbi->s_mount_opt, DISCARD);
1687 break;
1688 case Opt_nodiscard:
1689 clear_opt(sbi->s_mount_opt, DISCARD);
1690 break;
1691 case Opt_dioread_nolock:
1692 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1693 break;
1694 case Opt_dioread_lock:
1695 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1696 break;
1568 default: 1697 default:
1569 ext4_msg(sb, KERN_ERR, 1698 ext4_msg(sb, KERN_ERR,
1570 "Unrecognized mount option \"%s\" " 1699 "Unrecognized mount option \"%s\" "
@@ -1574,18 +1703,13 @@ set_qf_format:
1574 } 1703 }
1575#ifdef CONFIG_QUOTA 1704#ifdef CONFIG_QUOTA
1576 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1705 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1577 if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) && 1706 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1578 sbi->s_qf_names[USRQUOTA])
1579 clear_opt(sbi->s_mount_opt, USRQUOTA); 1707 clear_opt(sbi->s_mount_opt, USRQUOTA);
1580 1708
1581 if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) && 1709 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1582 sbi->s_qf_names[GRPQUOTA])
1583 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1710 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1584 1711
1585 if ((sbi->s_qf_names[USRQUOTA] && 1712 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1586 (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
1587 (sbi->s_qf_names[GRPQUOTA] &&
1588 (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
1589 ext4_msg(sb, KERN_ERR, "old and new quota " 1713 ext4_msg(sb, KERN_ERR, "old and new quota "
1590 "format mixing"); 1714 "format mixing");
1591 return 0; 1715 return 0;
@@ -1673,14 +1797,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
1673 size_t size; 1797 size_t size;
1674 int i; 1798 int i;
1675 1799
1676 if (!sbi->s_es->s_log_groups_per_flex) { 1800 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1801 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1802
1803 if (groups_per_flex < 2) {
1677 sbi->s_log_groups_per_flex = 0; 1804 sbi->s_log_groups_per_flex = 0;
1678 return 1; 1805 return 1;
1679 } 1806 }
1680 1807
1681 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1682 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1683
1684 /* We allocate both existing and potentially added groups */ 1808 /* We allocate both existing and potentially added groups */
1685 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + 1809 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
1686 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << 1810 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
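
The reordering above makes ext4_fill_flex_info() read s_log_groups_per_flex before testing it, and rejects a degenerate flex group size of 1 as well as 0. A standalone model of the arithmetic with made-up superblock values:

#include <stdio.h>

int main(void)
{
    unsigned log_groups_per_flex = 4;        /* from the superblock */
    unsigned groups_per_flex = 1U << log_groups_per_flex;

    if (groups_per_flex < 2) {
        puts("flex_bg disabled");
        return 0;
    }

    unsigned groups_count = 1000, reserved_gdt = 32, desc_per_block_bits = 7;
    /* room for existing groups plus groups addable via reserved GDT blocks */
    unsigned flex_group_count =
        ((groups_count + groups_per_flex - 1) +
         ((reserved_gdt + 1) << desc_per_block_bits)) / groups_per_flex;

    printf("%u flex groups\n", flex_group_count);
    return 0;
}
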
@@ -1895,7 +2019,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1895 } 2019 }
1896 2020
1897 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2021 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1898 vfs_dq_init(inode); 2022 dquot_initialize(inode);
1899 if (inode->i_nlink) { 2023 if (inode->i_nlink) {
1900 ext4_msg(sb, KERN_DEBUG, 2024 ext4_msg(sb, KERN_DEBUG,
1901 "%s: truncating inode %lu to %lld bytes", 2025 "%s: truncating inode %lu to %lld bytes",
@@ -2099,11 +2223,8 @@ static int parse_strtoul(const char *buf,
2099{ 2223{
2100 char *endp; 2224 char *endp;
2101 2225
2102 while (*buf && isspace(*buf)) 2226 *value = simple_strtoul(skip_spaces(buf), &endp, 0);
2103 buf++; 2227 endp = skip_spaces(endp);
2104 *value = simple_strtoul(buf, &endp, 0);
2105 while (*endp && isspace(*endp))
2106 endp++;
2107 if (*endp || *value > max) 2228 if (*endp || *value > max)
2108 return -EINVAL; 2229 return -EINVAL;
2109 2230
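
parse_strtoul() now leans on skip_spaces() instead of two open-coded isspace() loops. A userspace equivalent (strtoul for simple_strtoul, plus a local skip_spaces since the kernel helper isn't available here):

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static const char *skip_spaces(const char *s)
{
    while (isspace((unsigned char)*s))
        s++;
    return s;
}

static int parse_ulong(const char *buf, unsigned long max, unsigned long *value)
{
    char *endp;

    *value = strtoul(skip_spaces(buf), &endp, 0);
    if (*skip_spaces(endp) || *value > max)  /* trailing junk or out of range */
        return -EINVAL;
    return 0;
}

int main(void)
{
    unsigned long v;

    printf("%d %lu\n", parse_ulong("  42  ", 100, &v), v);  /* 0 42 */
    printf("%d\n", parse_ulong("42x", 100, &v));            /* -EINVAL */
    return 0;
}
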
@@ -2134,9 +2255,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2134 struct super_block *sb = sbi->s_buddy_cache->i_sb; 2255 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2135 2256
2136 return snprintf(buf, PAGE_SIZE, "%llu\n", 2257 return snprintf(buf, PAGE_SIZE, "%llu\n",
2137 sbi->s_kbytes_written + 2258 (unsigned long long)(sbi->s_kbytes_written +
2138 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 2259 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2139 EXT4_SB(sb)->s_sectors_written_start) >> 1)); 2260 EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2140} 2261}
2141 2262
2142static ssize_t inode_readahead_blks_store(struct ext4_attr *a, 2263static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
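
The added (unsigned long long) cast in lifetime_write_kbytes_show() keeps the %llu conversion honest: on 32-bit builds the summed expression need not be 64 bits wide, and passing a narrower integer to %llu is undefined behaviour. A tiny illustration (types chosen to show the hazard, not the kernel's exact ones):

#include <stdio.h>

int main(void)
{
    /* On 32-bit targets both of these are 32-bit, so without the cast
     * the sum would be passed to %llu at the wrong width. */
    unsigned long kbytes_written = 123456;
    unsigned long sectors_delta = 98765;

    printf("%llu\n",
           (unsigned long long)(kbytes_written + (sectors_delta >> 1)));
    return 0;
}
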
@@ -2251,7 +2372,7 @@ static void ext4_sb_release(struct kobject *kobj)
2251} 2372}
2252 2373
2253 2374
2254static struct sysfs_ops ext4_attr_ops = { 2375static const struct sysfs_ops ext4_attr_ops = {
2255 .show = ext4_attr_show, 2376 .show = ext4_attr_show,
2256 .store = ext4_attr_store, 2377 .store = ext4_attr_store,
2257}; 2378};
@@ -2391,8 +2512,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2391 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 2512 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
2392 if (def_mount_opts & EXT4_DEFM_DEBUG) 2513 if (def_mount_opts & EXT4_DEFM_DEBUG)
2393 set_opt(sbi->s_mount_opt, DEBUG); 2514 set_opt(sbi->s_mount_opt, DEBUG);
2394 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) 2515 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
2516 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
2517 "2.6.38");
2395 set_opt(sbi->s_mount_opt, GRPID); 2518 set_opt(sbi->s_mount_opt, GRPID);
2519 }
2396 if (def_mount_opts & EXT4_DEFM_UID16) 2520 if (def_mount_opts & EXT4_DEFM_UID16)
2397 set_opt(sbi->s_mount_opt, NO_UID32); 2521 set_opt(sbi->s_mount_opt, NO_UID32);
2398#ifdef CONFIG_EXT4_FS_XATTR 2522#ifdef CONFIG_EXT4_FS_XATTR
@@ -2404,11 +2528,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2404 set_opt(sbi->s_mount_opt, POSIX_ACL); 2528 set_opt(sbi->s_mount_opt, POSIX_ACL);
2405#endif 2529#endif
2406 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 2530 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
2407 sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; 2531 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
2408 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 2532 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
2409 sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA; 2533 set_opt(sbi->s_mount_opt, ORDERED_DATA);
2410 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 2534 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
2411 sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA; 2535 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2412 2536
2413 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 2537 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
2414 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 2538 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -2429,14 +2553,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2429 * enable delayed allocation by default 2553 * enable delayed allocation by default
2430 * Use -o nodelalloc to turn it off 2554 * Use -o nodelalloc to turn it off
2431 */ 2555 */
2432 set_opt(sbi->s_mount_opt, DELALLOC); 2556 if (!IS_EXT3_SB(sb))
2557 set_opt(sbi->s_mount_opt, DELALLOC);
2433 2558
2434 if (!parse_options((char *) data, sb, &journal_devnum, 2559 if (!parse_options((char *) data, sb, &journal_devnum,
2435 &journal_ioprio, NULL, 0)) 2560 &journal_ioprio, NULL, 0))
2436 goto failed_mount; 2561 goto failed_mount;
2437 2562
2438 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2563 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2439 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 2564 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
2440 2565
2441 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && 2566 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
2442 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || 2567 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2721,31 +2846,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2721 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { 2846 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
2722 if (ext4_load_journal(sb, es, journal_devnum)) 2847 if (ext4_load_journal(sb, es, journal_devnum))
2723 goto failed_mount3; 2848 goto failed_mount3;
2724 if (!(sb->s_flags & MS_RDONLY) &&
2725 EXT4_SB(sb)->s_journal->j_failed_commit) {
2726 ext4_msg(sb, KERN_CRIT, "error: "
2727 "ext4_fill_super: Journal transaction "
2728 "%u is corrupt",
2729 EXT4_SB(sb)->s_journal->j_failed_commit);
2730 if (test_opt(sb, ERRORS_RO)) {
2731 ext4_msg(sb, KERN_CRIT,
2732 "Mounting filesystem read-only");
2733 sb->s_flags |= MS_RDONLY;
2734 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2735 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2736 }
2737 if (test_opt(sb, ERRORS_PANIC)) {
2738 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2739 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2740 ext4_commit_super(sb, 1);
2741 goto failed_mount4;
2742 }
2743 }
2744 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && 2849 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
2745 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2850 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2746 ext4_msg(sb, KERN_ERR, "required journal recovery " 2851 ext4_msg(sb, KERN_ERR, "required journal recovery "
2747 "suppressed and not mounted read-only"); 2852 "suppressed and not mounted read-only");
2748 goto failed_mount4; 2853 goto failed_mount_wq;
2749 } else { 2854 } else {
2750 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 2855 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
2751 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 2856 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2758,7 +2863,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2758 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2863 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2759 JBD2_FEATURE_INCOMPAT_64BIT)) { 2864 JBD2_FEATURE_INCOMPAT_64BIT)) {
2760 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); 2865 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
2761 goto failed_mount4; 2866 goto failed_mount_wq;
2762 } 2867 }
2763 2868
2764 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 2869 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2797,7 +2902,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2797 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 2902 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
2798 ext4_msg(sb, KERN_ERR, "Journal does not support " 2903 ext4_msg(sb, KERN_ERR, "Journal does not support "
2799 "requested data journaling mode"); 2904 "requested data journaling mode");
2800 goto failed_mount4; 2905 goto failed_mount_wq;
2801 } 2906 }
2802 default: 2907 default:
2803 break; 2908 break;
@@ -2805,13 +2910,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2805 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 2910 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2806 2911
2807no_journal: 2912no_journal:
2808
2809 if (test_opt(sb, NOBH)) { 2913 if (test_opt(sb, NOBH)) {
2810 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2914 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2811 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 2915 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
2812 "its supported only with writeback mode"); 2916 "its supported only with writeback mode");
2813 clear_opt(sbi->s_mount_opt, NOBH); 2917 clear_opt(sbi->s_mount_opt, NOBH);
2814 } 2918 }
2919 if (test_opt(sb, DIOREAD_NOLOCK)) {
2920 ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
2921 "not supported with nobh mode");
2922 goto failed_mount_wq;
2923 }
2815 } 2924 }
2816 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 2925 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2817 if (!EXT4_SB(sb)->dio_unwritten_wq) { 2926 if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2876,6 +2985,18 @@ no_journal:
2876 "requested data journaling mode"); 2985 "requested data journaling mode");
2877 clear_opt(sbi->s_mount_opt, DELALLOC); 2986 clear_opt(sbi->s_mount_opt, DELALLOC);
2878 } 2987 }
2988 if (test_opt(sb, DIOREAD_NOLOCK)) {
2989 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2990 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2991 "option - requested data journaling mode");
2992 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2993 }
2994 if (sb->s_blocksize < PAGE_SIZE) {
2995 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2996 "option - block size is too small");
2997 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2998 }
2999 }
2879 3000
2880 err = ext4_setup_system_zone(sb); 3001 err = ext4_setup_system_zone(sb);
2881 if (err) { 3002 if (err) {
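
Together the two new checks in the hunk above drop dioread_nolock when it cannot work: with journaled data, or when the block size is below the page size. A compact model of the validation (constants are illustrative; the comments paraphrase the warning messages above):

#include <stdio.h>

enum { DATA_WRITEBACK, DATA_ORDERED, DATA_JOURNAL };

/* Returns 1 when dioread_nolock may stay enabled. */
static int dioread_nolock_ok(int data_mode, unsigned blocksize, unsigned page_size)
{
    if (data_mode == DATA_JOURNAL)
        return 0;               /* "requested data journaling mode" */
    if (blocksize < page_size)
        return 0;               /* "block size is too small" */
    return 1;
}

int main(void)
{
    printf("%d\n", dioread_nolock_ok(DATA_ORDERED, 4096, 4096)); /* 1 */
    printf("%d\n", dioread_nolock_ok(DATA_ORDERED, 1024, 4096)); /* 0 */
    printf("%d\n", dioread_nolock_ok(DATA_JOURNAL, 4096, 4096)); /* 0 */
    return 0;
}
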
@@ -3339,10 +3460,9 @@ static void ext4_clear_journal_err(struct super_block *sb,
3339 char nbuf[16]; 3460 char nbuf[16];
3340 3461
3341 errstr = ext4_decode_error(sb, j_errno, nbuf); 3462 errstr = ext4_decode_error(sb, j_errno, nbuf);
3342 ext4_warning(sb, __func__, "Filesystem error recorded " 3463 ext4_warning(sb, "Filesystem error recorded "
3343 "from previous mount: %s", errstr); 3464 "from previous mount: %s", errstr);
3344 ext4_warning(sb, __func__, "Marking fs in need of " 3465 ext4_warning(sb, "Marking fs in need of filesystem check.");
3345 "filesystem check.");
3346 3466
3347 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 3467 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
3348 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 3468 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -3493,7 +3613,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3493 ext4_abort(sb, __func__, "Abort forced by user"); 3613 ext4_abort(sb, __func__, "Abort forced by user");
3494 3614
3495 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3615 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3496 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 3616 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3497 3617
3498 es = sbi->s_es; 3618 es = sbi->s_es;
3499 3619
@@ -3668,13 +3788,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3668 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 3788 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
3669 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 3789 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
3670 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 3790 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
3671 ext4_free_blocks_count_set(es, buf->f_bfree);
3672 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 3791 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
3673 if (buf->f_bfree < ext4_r_blocks_count(es)) 3792 if (buf->f_bfree < ext4_r_blocks_count(es))
3674 buf->f_bavail = 0; 3793 buf->f_bavail = 0;
3675 buf->f_files = le32_to_cpu(es->s_inodes_count); 3794 buf->f_files = le32_to_cpu(es->s_inodes_count);
3676 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); 3795 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
3677 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
3678 buf->f_namelen = EXT4_NAME_LEN; 3796 buf->f_namelen = EXT4_NAME_LEN;
3679 fsid = le64_to_cpup((void *)es->s_uuid) ^ 3797 fsid = le64_to_cpup((void *)es->s_uuid) ^
3680 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 3798 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -3689,7 +3807,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3689 * Process 1 Process 2 3807 * Process 1 Process 2
3690 * ext4_create() quota_sync() 3808 * ext4_create() quota_sync()
3691 * jbd2_journal_start() write_dquot() 3809 * jbd2_journal_start() write_dquot()
3692 * vfs_dq_init() down(dqio_mutex) 3810 * dquot_initialize() down(dqio_mutex)
3693 * down(dqio_mutex) jbd2_journal_start() 3811 * down(dqio_mutex) jbd2_journal_start()
3694 * 3812 *
3695 */ 3813 */
@@ -3898,9 +4016,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3898 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 4016 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
3899 int err = 0; 4017 int err = 0;
3900 int offset = off & (sb->s_blocksize - 1); 4018 int offset = off & (sb->s_blocksize - 1);
3901 int tocopy;
3902 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; 4019 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
3903 size_t towrite = len;
3904 struct buffer_head *bh; 4020 struct buffer_head *bh;
3905 handle_t *handle = journal_current_handle(); 4021 handle_t *handle = journal_current_handle();
3906 4022
@@ -3910,52 +4026,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3910 (unsigned long long)off, (unsigned long long)len); 4026 (unsigned long long)off, (unsigned long long)len);
3911 return -EIO; 4027 return -EIO;
3912 } 4028 }
4029 /*
4030 * Since we account for only one data block in the transaction
4031 * credits, it is impossible to cross a block boundary.
4032 */
4033 if (sb->s_blocksize - offset < len) {
4034 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
4035 " cancelled because not block aligned",
4036 (unsigned long long)off, (unsigned long long)len);
4037 return -EIO;
4038 }
4039
3913 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 4040 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
3914 while (towrite > 0) { 4041 bh = ext4_bread(handle, inode, blk, 1, &err);
3915 tocopy = sb->s_blocksize - offset < towrite ? 4042 if (!bh)
3916 sb->s_blocksize - offset : towrite; 4043 goto out;
3917 bh = ext4_bread(handle, inode, blk, 1, &err); 4044 if (journal_quota) {
3918 if (!bh) 4045 err = ext4_journal_get_write_access(handle, bh);
4046 if (err) {
4047 brelse(bh);
3919 goto out; 4048 goto out;
3920 if (journal_quota) {
3921 err = ext4_journal_get_write_access(handle, bh);
3922 if (err) {
3923 brelse(bh);
3924 goto out;
3925 }
3926 }
3927 lock_buffer(bh);
3928 memcpy(bh->b_data+offset, data, tocopy);
3929 flush_dcache_page(bh->b_page);
3930 unlock_buffer(bh);
3931 if (journal_quota)
3932 err = ext4_handle_dirty_metadata(handle, NULL, bh);
3933 else {
3934 /* Always do at least ordered writes for quotas */
3935 err = ext4_jbd2_file_inode(handle, inode);
3936 mark_buffer_dirty(bh);
3937 } 4049 }
3938 brelse(bh);
3939 if (err)
3940 goto out;
3941 offset = 0;
3942 towrite -= tocopy;
3943 data += tocopy;
3944 blk++;
3945 } 4050 }
4051 lock_buffer(bh);
4052 memcpy(bh->b_data+offset, data, len);
4053 flush_dcache_page(bh->b_page);
4054 unlock_buffer(bh);
4055 if (journal_quota)
4056 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4057 else {
4058 /* Always do at least ordered writes for quotas */
4059 err = ext4_jbd2_file_inode(handle, inode);
4060 mark_buffer_dirty(bh);
4061 }
4062 brelse(bh);
3946out: 4063out:
3947 if (len == towrite) { 4064 if (err) {
3948 mutex_unlock(&inode->i_mutex); 4065 mutex_unlock(&inode->i_mutex);
3949 return err; 4066 return err;
3950 } 4067 }
3951 if (inode->i_size < off+len-towrite) { 4068 if (inode->i_size < off + len) {
3952 i_size_write(inode, off+len-towrite); 4069 i_size_write(inode, off + len);
3953 EXT4_I(inode)->i_disksize = inode->i_size; 4070 EXT4_I(inode)->i_disksize = inode->i_size;
3954 } 4071 }
3955 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 4072 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3956 ext4_mark_inode_dirty(handle, inode); 4073 ext4_mark_inode_dirty(handle, inode);
3957 mutex_unlock(&inode->i_mutex); 4074 mutex_unlock(&inode->i_mutex);
3958 return len - towrite; 4075 return len;
3959} 4076}
3960 4077
3961#endif 4078#endif
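
The rewritten ext4_quota_write() above reserves credits for a single data block, so it now refuses any write that would span a block boundary and drops the old tocopy/towrite loop in favour of one copy. A userspace model of that control flow (a byte array stands in for the buffer_head; blocksize is assumed to be a power of two, as in ext4):

#include <stdio.h>
#include <string.h>

static long quota_write(unsigned char *block, size_t blocksize,
                        size_t off, const void *data, size_t len)
{
    size_t offset = off & (blocksize - 1);

    if (blocksize - offset < len)
        return -1;               /* would cross a block boundary: -EIO */
    memcpy(block + offset, data, len);
    return (long)len;
}

int main(void)
{
    unsigned char blk[4096];

    printf("%ld\n", quota_write(blk, sizeof blk, 100, "x", 1));   /* 1 */
    printf("%ld\n", quota_write(blk, sizeof blk, 4095, "xy", 2)); /* -1 */
    return 0;
}
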
@@ -3966,6 +4083,52 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
3966 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 4083
3967} 4084}
3968 4085
4086#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4087static struct file_system_type ext2_fs_type = {
4088 .owner = THIS_MODULE,
4089 .name = "ext2",
4090 .get_sb = ext4_get_sb,
4091 .kill_sb = kill_block_super,
4092 .fs_flags = FS_REQUIRES_DEV,
4093};
4094
4095static inline void register_as_ext2(void)
4096{
4097 int err = register_filesystem(&ext2_fs_type);
4098 if (err)
4099 printk(KERN_WARNING
4100 "EXT4-fs: Unable to register as ext2 (%d)\n", err);
4101}
4102
4103static inline void unregister_as_ext2(void)
4104{
4105 unregister_filesystem(&ext2_fs_type);
4106}
4107MODULE_ALIAS("ext2");
4108#else
4109static inline void register_as_ext2(void) { }
4110static inline void unregister_as_ext2(void) { }
4111#endif
4112
4113#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4114static inline void register_as_ext3(void)
4115{
4116 int err = register_filesystem(&ext3_fs_type);
4117 if (err)
4118 printk(KERN_WARNING
4119 "EXT4-fs: Unable to register as ext3 (%d)\n", err);
4120}
4121
4122static inline void unregister_as_ext3(void)
4123{
4124 unregister_filesystem(&ext3_fs_type);
4125}
4126MODULE_ALIAS("ext3");
4127#else
4128static inline void register_as_ext3(void) { }
4129static inline void unregister_as_ext3(void) { }
4130#endif
4131
3969static struct file_system_type ext4_fs_type = { 4132static struct file_system_type ext4_fs_type = {
3970 .owner = THIS_MODULE, 4133 .owner = THIS_MODULE,
3971 .name = "ext4", 4134 .name = "ext4",
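
register_as_ext2()/register_as_ext3() compile away to empty inline stubs unless CONFIG_EXT4_USE_FOR_EXT23 is set and the native ext2/ext3 drivers are absent, which lets one module serve three filesystem names. A minimal model of the pattern (the macro stands in for the real Kconfig test):

#include <stdio.h>

#define USE_FOR_EXT23 1    /* stands in for the CONFIG_EXT4_USE_FOR_EXT23 test */

#if USE_FOR_EXT23
static void register_as_ext2(void)   { puts("registered ext2 alias"); }
static void unregister_as_ext2(void) { puts("unregistered ext2 alias"); }
#else
static void register_as_ext2(void)   { }
static void unregister_as_ext2(void) { }
#endif

int main(void)
{
    register_as_ext2();     /* module init */
    unregister_as_ext2();   /* error unwind and module exit must mirror it */
    return 0;
}
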
@@ -3995,11 +4158,15 @@ static int __init init_ext4_fs(void)
3995 err = init_inodecache(); 4158 err = init_inodecache();
3996 if (err) 4159 if (err)
3997 goto out1; 4160 goto out1;
4161 register_as_ext2();
4162 register_as_ext3();
3998 err = register_filesystem(&ext4_fs_type); 4163 err = register_filesystem(&ext4_fs_type);
3999 if (err) 4164 if (err)
4000 goto out; 4165 goto out;
4001 return 0; 4166 return 0;
4002out: 4167out:
4168 unregister_as_ext2();
4169 unregister_as_ext3();
4003 destroy_inodecache(); 4170 destroy_inodecache();
4004out1: 4171out1:
4005 exit_ext4_xattr(); 4172 exit_ext4_xattr();
@@ -4015,6 +4182,8 @@ out4:
4015 4182
4016static void __exit exit_ext4_fs(void) 4183static void __exit exit_ext4_fs(void)
4017{ 4184{
4185 unregister_as_ext2();
4186 unregister_as_ext3();
4018 unregister_filesystem(&ext4_fs_type); 4187 unregister_filesystem(&ext4_fs_type);
4019 destroy_inodecache(); 4188 destroy_inodecache();
4020 exit_ext4_xattr(); 4189 exit_ext4_xattr();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fed5b01d7a8d..b4c5aa8489d8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -92,7 +92,7 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *,
92 struct mb_cache_entry **); 92 struct mb_cache_entry **);
93static void ext4_xattr_rehash(struct ext4_xattr_header *, 93static void ext4_xattr_rehash(struct ext4_xattr_header *,
94 struct ext4_xattr_entry *); 94 struct ext4_xattr_entry *);
95static int ext4_xattr_list(struct inode *inode, char *buffer, 95static int ext4_xattr_list(struct dentry *dentry, char *buffer,
96 size_t buffer_size); 96 size_t buffer_size);
97 97
98static struct mb_cache *ext4_xattr_cache; 98static struct mb_cache *ext4_xattr_cache;
@@ -140,7 +140,7 @@ ext4_xattr_handler(int name_index)
140ssize_t 140ssize_t
141ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) 141ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
142{ 142{
143 return ext4_xattr_list(dentry->d_inode, buffer, size); 143 return ext4_xattr_list(dentry, buffer, size);
144} 144}
145 145
146static int 146static int
@@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
227 ea_bdebug(bh, "b_count=%d, refcount=%d", 227 ea_bdebug(bh, "b_count=%d, refcount=%d",
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: ext4_error(inode->i_sb, __func__, 230bad_block:
231 ext4_error(inode->i_sb,
231 "inode %lu: bad block %llu", inode->i_ino, 232 "inode %lu: bad block %llu", inode->i_ino,
232 EXT4_I(inode)->i_file_acl); 233 EXT4_I(inode)->i_file_acl);
233 error = -EIO; 234 error = -EIO;
@@ -267,7 +268,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
267 void *end; 268 void *end;
268 int error; 269 int error;
269 270
270 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) 271 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
271 return -ENODATA; 272 return -ENODATA;
272 error = ext4_get_inode_loc(inode, &iloc); 273 error = ext4_get_inode_loc(inode, &iloc);
273 if (error) 274 if (error)
@@ -325,7 +326,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
325} 326}
326 327
327static int 328static int
328ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, 329ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
329 char *buffer, size_t buffer_size) 330 char *buffer, size_t buffer_size)
330{ 331{
331 size_t rest = buffer_size; 332 size_t rest = buffer_size;
@@ -335,9 +336,10 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
335 ext4_xattr_handler(entry->e_name_index); 336 ext4_xattr_handler(entry->e_name_index);
336 337
337 if (handler) { 338 if (handler) {
338 size_t size = handler->list(inode, buffer, rest, 339 size_t size = handler->list(dentry, buffer, rest,
339 entry->e_name, 340 entry->e_name,
340 entry->e_name_len); 341 entry->e_name_len,
342 handler->flags);
341 if (buffer) { 343 if (buffer) {
342 if (size > rest) 344 if (size > rest)
343 return -ERANGE; 345 return -ERANGE;
@@ -350,8 +352,9 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
350} 352}
351 353
352static int 354static int
353ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) 355ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
354{ 356{
357 struct inode *inode = dentry->d_inode;
355 struct buffer_head *bh = NULL; 358 struct buffer_head *bh = NULL;
356 int error; 359 int error;
357 360
@@ -369,14 +372,14 @@ ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
369 ea_bdebug(bh, "b_count=%d, refcount=%d", 372 ea_bdebug(bh, "b_count=%d, refcount=%d",
370 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
371 if (ext4_xattr_check_block(bh)) { 374 if (ext4_xattr_check_block(bh)) {
372 ext4_error(inode->i_sb, __func__, 375 ext4_error(inode->i_sb,
373 "inode %lu: bad block %llu", inode->i_ino, 376 "inode %lu: bad block %llu", inode->i_ino,
374 EXT4_I(inode)->i_file_acl); 377 EXT4_I(inode)->i_file_acl);
375 error = -EIO; 378 error = -EIO;
376 goto cleanup; 379 goto cleanup;
377 } 380 }
378 ext4_xattr_cache_insert(bh); 381 ext4_xattr_cache_insert(bh);
379 error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); 382 error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
380 383
381cleanup: 384cleanup:
382 brelse(bh); 385 brelse(bh);
@@ -385,15 +388,16 @@ cleanup:
385} 388}
386 389
387static int 390static int
388ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) 391ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
389{ 392{
393 struct inode *inode = dentry->d_inode;
390 struct ext4_xattr_ibody_header *header; 394 struct ext4_xattr_ibody_header *header;
391 struct ext4_inode *raw_inode; 395 struct ext4_inode *raw_inode;
392 struct ext4_iloc iloc; 396 struct ext4_iloc iloc;
393 void *end; 397 void *end;
394 int error; 398 int error;
395 399
396 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) 400 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
397 return 0; 401 return 0;
398 error = ext4_get_inode_loc(inode, &iloc); 402 error = ext4_get_inode_loc(inode, &iloc);
399 if (error) 403 if (error)
@@ -404,7 +408,7 @@ ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
404 error = ext4_xattr_check_names(IFIRST(header), end); 408 error = ext4_xattr_check_names(IFIRST(header), end);
405 if (error) 409 if (error)
406 goto cleanup; 410 goto cleanup;
407 error = ext4_xattr_list_entries(inode, IFIRST(header), 411 error = ext4_xattr_list_entries(dentry, IFIRST(header),
408 buffer, buffer_size); 412 buffer, buffer_size);
409 413
410cleanup: 414cleanup:
@@ -423,12 +427,12 @@ cleanup:
423 * used / required on success. 427 * used / required on success.
424 */ 428 */
425static int 429static int
426ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) 430ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
427{ 431{
428 int i_error, b_error; 432 int i_error, b_error;
429 433
430 down_read(&EXT4_I(inode)->xattr_sem); 434 down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
431 i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size); 435 i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
432 if (i_error < 0) { 436 if (i_error < 0) {
433 b_error = 0; 437 b_error = 0;
434 } else { 438 } else {
@@ -436,11 +440,11 @@ ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
436 buffer += i_error; 440 buffer += i_error;
437 buffer_size -= i_error; 441 buffer_size -= i_error;
438 } 442 }
439 b_error = ext4_xattr_block_list(inode, buffer, buffer_size); 443 b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
440 if (b_error < 0) 444 if (b_error < 0)
441 i_error = 0; 445 i_error = 0;
442 } 446 }
443 up_read(&EXT4_I(inode)->xattr_sem); 447 up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
444 return i_error + b_error; 448 return i_error + b_error;
445} 449}
446 450
@@ -482,15 +486,16 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
482 ea_bdebug(bh, "refcount now=0; freeing"); 486 ea_bdebug(bh, "refcount now=0; freeing");
483 if (ce) 487 if (ce)
484 mb_cache_entry_free(ce); 488 mb_cache_entry_free(ce);
485 ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
486 get_bh(bh); 489 get_bh(bh);
487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 490 ext4_free_blocks(handle, inode, bh, 0, 1,
491 EXT4_FREE_BLOCKS_METADATA |
492 EXT4_FREE_BLOCKS_FORGET);
488 } else { 493 } else {
489 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 494 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
490 error = ext4_handle_dirty_metadata(handle, inode, bh); 495 error = ext4_handle_dirty_metadata(handle, inode, bh);
491 if (IS_SYNC(inode)) 496 if (IS_SYNC(inode))
492 ext4_handle_sync(handle); 497 ext4_handle_sync(handle);
493 vfs_dq_free_block(inode, 1); 498 dquot_free_block(inode, 1);
494 ea_bdebug(bh, "refcount now=%d; releasing", 499 ea_bdebug(bh, "refcount now=%d; releasing",
495 le32_to_cpu(BHDR(bh)->h_refcount)); 500 le32_to_cpu(BHDR(bh)->h_refcount));
496 if (ce) 501 if (ce)
@@ -661,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
661 atomic_read(&(bs->bh->b_count)), 666 atomic_read(&(bs->bh->b_count)),
662 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 667 le32_to_cpu(BHDR(bs->bh)->h_refcount));
663 if (ext4_xattr_check_block(bs->bh)) { 668 if (ext4_xattr_check_block(bs->bh)) {
664 ext4_error(sb, __func__, 669 ext4_error(sb, "inode %lu: bad block %llu",
665 "inode %lu: bad block %llu", inode->i_ino, 670 inode->i_ino, EXT4_I(inode)->i_file_acl);
666 EXT4_I(inode)->i_file_acl);
667 error = -EIO; 671 error = -EIO;
668 goto cleanup; 672 goto cleanup;
669 } 673 }
@@ -783,8 +787,8 @@ inserted:
783 else { 787 else {
784 /* The old block is released after updating 788 /* The old block is released after updating
785 the inode. */ 789 the inode. */
786 error = -EDQUOT; 790 error = dquot_alloc_block(inode, 1);
787 if (vfs_dq_alloc_block(inode, 1)) 791 if (error)
788 goto cleanup; 792 goto cleanup;
789 error = ext4_journal_get_write_access(handle, 793 error = ext4_journal_get_write_access(handle,
790 new_bh); 794 new_bh);
@@ -832,7 +836,8 @@ inserted:
832 new_bh = sb_getblk(sb, block); 836 new_bh = sb_getblk(sb, block);
833 if (!new_bh) { 837 if (!new_bh) {
834getblk_failed: 838getblk_failed:
835 ext4_free_blocks(handle, inode, block, 1, 1); 839 ext4_free_blocks(handle, inode, 0, block, 1,
840 EXT4_FREE_BLOCKS_METADATA);
836 error = -EIO; 841 error = -EIO;
837 goto cleanup; 842 goto cleanup;
838 } 843 }
@@ -871,13 +876,12 @@ cleanup:
871 return error; 876 return error;
872 877
873cleanup_dquot: 878cleanup_dquot:
874 vfs_dq_free_block(inode, 1); 879 dquot_free_block(inode, 1);
875 goto cleanup; 880 goto cleanup;
876 881
877bad_block: 882bad_block:
878 ext4_error(inode->i_sb, __func__, 883 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
879 "inode %lu: bad block %llu", inode->i_ino, 884 inode->i_ino, EXT4_I(inode)->i_file_acl);
880 EXT4_I(inode)->i_file_acl);
881 goto cleanup; 885 goto cleanup;
882 886
883#undef header 887#undef header
@@ -903,7 +907,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
903 is->s.base = is->s.first = IFIRST(header); 907 is->s.base = is->s.first = IFIRST(header);
904 is->s.here = is->s.first; 908 is->s.here = is->s.first;
905 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 909 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
906 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { 910 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
907 error = ext4_xattr_check_names(IFIRST(header), is->s.end); 911 error = ext4_xattr_check_names(IFIRST(header), is->s.end);
908 if (error) 912 if (error)
909 return error; 913 return error;
@@ -935,10 +939,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
935 header = IHDR(inode, ext4_raw_inode(&is->iloc)); 939 header = IHDR(inode, ext4_raw_inode(&is->iloc));
936 if (!IS_LAST_ENTRY(s->first)) { 940 if (!IS_LAST_ENTRY(s->first)) {
937 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); 941 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
938 EXT4_I(inode)->i_state |= EXT4_STATE_XATTR; 942 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
939 } else { 943 } else {
940 header->h_magic = cpu_to_le32(0); 944 header->h_magic = cpu_to_le32(0);
941 EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR; 945 ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
942 } 946 }
943 return 0; 947 return 0;
944} 948}
@@ -981,17 +985,21 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
981 if (strlen(name) > 255) 985 if (strlen(name) > 255)
982 return -ERANGE; 986 return -ERANGE;
983 down_write(&EXT4_I(inode)->xattr_sem); 987 down_write(&EXT4_I(inode)->xattr_sem);
984 no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; 988 no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
985 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 989 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
986 990
987 error = ext4_get_inode_loc(inode, &is.iloc); 991 error = ext4_get_inode_loc(inode, &is.iloc);
988 if (error) 992 if (error)
989 goto cleanup; 993 goto cleanup;
990 994
991 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { 995 error = ext4_journal_get_write_access(handle, is.iloc.bh);
996 if (error)
997 goto cleanup;
998
999 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
992 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); 1000 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
993 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 1001 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
994 EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW; 1002 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
995 } 1003 }
996 1004
997 error = ext4_xattr_ibody_find(inode, &i, &is); 1005 error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1013,9 +1021,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1013 if (flags & XATTR_CREATE) 1021 if (flags & XATTR_CREATE)
1014 goto cleanup; 1022 goto cleanup;
1015 } 1023 }
1016 error = ext4_journal_get_write_access(handle, is.iloc.bh);
1017 if (error)
1018 goto cleanup;
1019 if (!value) { 1024 if (!value) {
1020 if (!is.s.not_found) 1025 if (!is.s.not_found)
1021 error = ext4_xattr_ibody_set(handle, inode, &i, &is); 1026 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
@@ -1046,7 +1051,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1046 ext4_xattr_update_super_block(handle, inode->i_sb); 1051 ext4_xattr_update_super_block(handle, inode->i_sb);
1047 inode->i_ctime = ext4_current_time(inode); 1052 inode->i_ctime = ext4_current_time(inode);
1048 if (!value) 1053 if (!value)
1049 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; 1054 ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
1050 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); 1055 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
1051 /* 1056 /*
1052 * The bh is consumed by ext4_mark_iloc_dirty, even with 1057 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1061,7 +1066,7 @@ cleanup:
1061 brelse(is.iloc.bh); 1066 brelse(is.iloc.bh);
1062 brelse(bs.bh); 1067 brelse(bs.bh);
1063 if (no_expand == 0) 1068 if (no_expand == 0)
1064 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; 1069 ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
1065 up_write(&EXT4_I(inode)->xattr_sem); 1070 up_write(&EXT4_I(inode)->xattr_sem);
1066 return error; 1071 return error;
1067} 1072}
@@ -1189,9 +1194,8 @@ retry:
1189 if (!bh) 1194 if (!bh)
1190 goto cleanup; 1195 goto cleanup;
1191 if (ext4_xattr_check_block(bh)) { 1196 if (ext4_xattr_check_block(bh)) {
1192 ext4_error(inode->i_sb, __func__, 1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
1193 "inode %lu: bad block %llu", inode->i_ino, 1198 inode->i_ino, EXT4_I(inode)->i_file_acl);
1194 EXT4_I(inode)->i_file_acl);
1195 error = -EIO; 1199 error = -EIO;
1196 goto cleanup; 1200 goto cleanup;
1197 } 1201 }
@@ -1296,6 +1300,8 @@ retry:
1296 1300
1297 /* Remove the chosen entry from the inode */ 1301 /* Remove the chosen entry from the inode */
1298 error = ext4_xattr_ibody_set(handle, inode, &i, is); 1302 error = ext4_xattr_ibody_set(handle, inode, &i, is);
1303 if (error)
1304 goto cleanup;
1299 1305
1300 entry = IFIRST(header); 1306 entry = IFIRST(header);
1301 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) 1307 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
@@ -1326,6 +1332,8 @@ retry:
1326 goto cleanup; 1332 goto cleanup;
1327 kfree(b_entry_name); 1333 kfree(b_entry_name);
1328 kfree(buffer); 1334 kfree(buffer);
1335 b_entry_name = NULL;
1336 buffer = NULL;
1329 brelse(is->iloc.bh); 1337 brelse(is->iloc.bh);
1330 kfree(is); 1338 kfree(is);
1331 kfree(bs); 1339 kfree(bs);
@@ -1364,16 +1372,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1364 goto cleanup; 1372 goto cleanup;
1365 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1366 if (!bh) { 1374 if (!bh) {
1367 ext4_error(inode->i_sb, __func__, 1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error",
1368 "inode %lu: block %llu read error", inode->i_ino, 1376 inode->i_ino, EXT4_I(inode)->i_file_acl);
1369 EXT4_I(inode)->i_file_acl);
1370 goto cleanup; 1377 goto cleanup;
1371 } 1378 }
1372 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1373 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1374 ext4_error(inode->i_sb, __func__, 1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
1375 "inode %lu: bad block %llu", inode->i_ino, 1382 inode->i_ino, EXT4_I(inode)->i_file_acl);
1376 EXT4_I(inode)->i_file_acl);
1377 goto cleanup; 1383 goto cleanup;
1378 } 1384 }
1379 ext4_xattr_release_block(handle, inode, bh); 1385 ext4_xattr_release_block(handle, inode, bh);
@@ -1498,7 +1504,7 @@ again:
1498 } 1504 }
1499 bh = sb_bread(inode->i_sb, ce->e_block); 1505 bh = sb_bread(inode->i_sb, ce->e_block);
1500 if (!bh) { 1506 if (!bh) {
1501 ext4_error(inode->i_sb, __func__, 1507 ext4_error(inode->i_sb,
1502 "inode %lu: block %lu read error", 1508 "inode %lu: block %lu read error",
1503 inode->i_ino, (unsigned long) ce->e_block); 1509 inode->i_ino, (unsigned long) ce->e_block);
1504 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index ca5f89fc6cae..8b145e98df07 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -7,13 +7,14 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/security.h> 9#include <linux/security.h>
10#include <linux/slab.h>
10#include "ext4_jbd2.h" 11#include "ext4_jbd2.h"
11#include "ext4.h" 12#include "ext4.h"
12#include "xattr.h" 13#include "xattr.h"
13 14
14static size_t 15static size_t
15ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size, 16ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
16 const char *name, size_t name_len) 17 const char *name, size_t name_len, int type)
17{ 18{
18 const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; 19 const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
19 const size_t total_len = prefix_len + name_len + 1; 20 const size_t total_len = prefix_len + name_len + 1;
@@ -28,23 +29,23 @@ ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
28} 29}
29 30
30static int 31static int
31ext4_xattr_security_get(struct inode *inode, const char *name, 32ext4_xattr_security_get(struct dentry *dentry, const char *name,
32 void *buffer, size_t size) 33 void *buffer, size_t size, int type)
33{ 34{
34 if (strcmp(name, "") == 0) 35 if (strcmp(name, "") == 0)
35 return -EINVAL; 36 return -EINVAL;
36 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name, 37 return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
37 buffer, size); 38 name, buffer, size);
38} 39}
39 40
40static int 41static int
41ext4_xattr_security_set(struct inode *inode, const char *name, 42ext4_xattr_security_set(struct dentry *dentry, const char *name,
42 const void *value, size_t size, int flags) 43 const void *value, size_t size, int flags, int type)
43{ 44{
44 if (strcmp(name, "") == 0) 45 if (strcmp(name, "") == 0)
45 return -EINVAL; 46 return -EINVAL;
46 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, 47 return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
47 value, size, flags); 48 name, value, size, flags);
48} 49}
49 50
50int 51int
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index ac1a52cf2a37..15b50edc6587 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -14,8 +14,8 @@
14#include "xattr.h" 14#include "xattr.h"
15 15
16static size_t 16static size_t
17ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, 17ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
18 const char *name, size_t name_len) 18 const char *name, size_t name_len, int type)
19{ 19{
20 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; 20 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
21 const size_t total_len = prefix_len + name_len + 1; 21 const size_t total_len = prefix_len + name_len + 1;
@@ -32,23 +32,23 @@ ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
32} 32}
33 33
34static int 34static int
35ext4_xattr_trusted_get(struct inode *inode, const char *name, 35ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
36 void *buffer, size_t size) 36 size_t size, int type)
37{ 37{
38 if (strcmp(name, "") == 0) 38 if (strcmp(name, "") == 0)
39 return -EINVAL; 39 return -EINVAL;
40 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name, 40 return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
41 buffer, size); 41 name, buffer, size);
42} 42}
43 43
44static int 44static int
45ext4_xattr_trusted_set(struct inode *inode, const char *name, 45ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
46 const void *value, size_t size, int flags) 46 const void *value, size_t size, int flags, int type)
47{ 47{
48 if (strcmp(name, "") == 0) 48 if (strcmp(name, "") == 0)
49 return -EINVAL; 49 return -EINVAL;
50 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name, 50 return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
51 value, size, flags); 51 name, value, size, flags);
52} 52}
53 53
54struct xattr_handler ext4_xattr_trusted_handler = { 54struct xattr_handler ext4_xattr_trusted_handler = {
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index d91aa61b42aa..c4ce05746ce1 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -13,13 +13,13 @@
13#include "xattr.h" 13#include "xattr.h"
14 14
15static size_t 15static size_t
16ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, 16ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
17 const char *name, size_t name_len) 17 const char *name, size_t name_len, int type)
18{ 18{
19 const size_t prefix_len = XATTR_USER_PREFIX_LEN; 19 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
20 const size_t total_len = prefix_len + name_len + 1; 20 const size_t total_len = prefix_len + name_len + 1;
21 21
22 if (!test_opt(inode->i_sb, XATTR_USER)) 22 if (!test_opt(dentry->d_sb, XATTR_USER))
23 return 0; 23 return 0;
24 24
25 if (list && total_len <= list_size) { 25 if (list && total_len <= list_size) {
@@ -31,26 +31,27 @@ ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
31} 31}
32 32
33static int 33static int
34ext4_xattr_user_get(struct inode *inode, const char *name, 34ext4_xattr_user_get(struct dentry *dentry, const char *name,
35 void *buffer, size_t size) 35 void *buffer, size_t size, int type)
36{ 36{
37 if (strcmp(name, "") == 0) 37 if (strcmp(name, "") == 0)
38 return -EINVAL; 38 return -EINVAL;
39 if (!test_opt(inode->i_sb, XATTR_USER)) 39 if (!test_opt(dentry->d_sb, XATTR_USER))
40 return -EOPNOTSUPP; 40 return -EOPNOTSUPP;
41 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size); 41 return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER,
42 name, buffer, size);
42} 43}
43 44
44static int 45static int
45ext4_xattr_user_set(struct inode *inode, const char *name, 46ext4_xattr_user_set(struct dentry *dentry, const char *name,
46 const void *value, size_t size, int flags) 47 const void *value, size_t size, int flags, int type)
47{ 48{
48 if (strcmp(name, "") == 0) 49 if (strcmp(name, "") == 0)
49 return -EINVAL; 50 return -EINVAL;
50 if (!test_opt(inode->i_sb, XATTR_USER)) 51 if (!test_opt(dentry->d_sb, XATTR_USER))
51 return -EOPNOTSUPP; 52 return -EOPNOTSUPP;
52 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, 53 return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER,
53 value, size, flags); 54 name, value, size, flags);
54} 55}
55 56
56struct xattr_handler ext4_xattr_user_handler = { 57struct xattr_handler ext4_xattr_user_handler = {
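
All of the xattr hunks above follow one interface change: the handler callbacks now receive the dentry plus a handler-specific type/flags argument instead of a bare inode. A simplified userspace model of the new shape (toy struct definitions, not the kernel's):

#include <stdio.h>

struct dentry { const char *d_name; };       /* toy stand-in */

struct xattr_handler {
    const char *prefix;
    int flags;                               /* passed back as "type" */
    int (*get)(struct dentry *dentry, const char *name,
               void *buffer, size_t size, int type);
};

static int demo_get(struct dentry *dentry, const char *name,
                    void *buffer, size_t size, int type)
{
    (void)type;
    return snprintf(buffer, size, "%s: %s", dentry->d_name, name);
}

static const struct xattr_handler demo_handler = {
    .prefix = "user.",
    .get    = demo_get,
};

int main(void)
{
    struct dentry d = { "somefile" };
    char buf[64];

    demo_handler.get(&d, "comment", buf, sizeof buf, demo_handler.flags);
    puts(buf);
    return 0;
}
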
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 923990e4f16e..113f0a1e565d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/slab.h>
12#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
13#include "fat.h" 14#include "fat.h"
14 15
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 7db0979c6b72..e6efdfa0f6db 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -44,7 +44,8 @@ struct fat_mount_options {
44 nocase:1, /* Does this need case conversion? 0=need case conversion*/ 44 nocase:1, /* Does this need case conversion? 0=need case conversion*/
45 usefree:1, /* Use free_clusters for FAT32 */ 45 usefree:1, /* Use free_clusters for FAT32 */
46 tz_utc:1, /* Filesystem timestamps are in UTC */ 46 tz_utc:1, /* Filesystem timestamps are in UTC */
47 rodir:1; /* allow ATTR_RO for directory */ 47 rodir:1, /* allow ATTR_RO for directory */
48 discard:1; /* Issue discard requests on deletions */
48}; 49};
49 50
50#define FAT_HASH_BITS 8 51#define FAT_HASH_BITS 8
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index a81037721a6f..81184d3b75a3 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -566,16 +566,21 @@ int fat_free_clusters(struct inode *inode, int cluster)
566 goto error; 566 goto error;
567 } 567 }
568 568
569 /* 569 if (sbi->options.discard) {
570 * Issue discard for the sectors we no longer care about, 570 /*
571 * batching contiguous clusters into one request 571 * Issue discard for the sectors we no longer
572 */ 572 * care about, batching contiguous clusters
573 if (cluster != fatent.entry + 1) { 573 * into one request
574 int nr_clus = fatent.entry - first_cl + 1; 574 */
575 575 if (cluster != fatent.entry + 1) {
576 sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl), 576 int nr_clus = fatent.entry - first_cl + 1;
577 nr_clus * sbi->sec_per_clus); 577
578 first_cl = cluster; 578 sb_issue_discard(sb,
579 fat_clus_to_blknr(sbi, first_cl),
580 nr_clus * sbi->sec_per_clus);
581
582 first_cl = cluster;
583 }
579 } 584 }
580 585
581 ops->ent_put(&fatent, FAT_ENT_FREE); 586 ops->ent_put(&fatent, FAT_ENT_FREE);
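
The fatent.c hunk wraps the existing discard batching in an options.discard check, so runs of contiguous freed clusters are still merged into one request but nothing is issued unless the mount asked for it. A standalone model of the batching loop (cluster numbers are made up; issue_discard() stands in for sb_issue_discard()):

#include <stdio.h>

static void issue_discard(int first, int nr)
{
    printf("discard clusters %d..%d\n", first, first + nr - 1);
}

int main(void)
{
    int discard_enabled = 1;                     /* the new mount option */
    int clusters[] = { 10, 11, 12, 20, 21, -1 }; /* -1 terminates the chain */
    int first = clusters[0], prev = clusters[0];

    for (int i = 1; clusters[i] != -1; i++) {
        if (discard_enabled && clusters[i] != prev + 1) {
            issue_discard(first, prev - first + 1);
            first = clusters[i];
        }
        prev = clusters[i];
    }
    if (discard_enabled)                         /* flush the final run */
        issue_discard(first, prev - first + 1);
    return 0;
}
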
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 76b7961ab663..0ce143bd7d56 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -558,7 +558,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
558 buf->f_bavail = sbi->free_clusters; 558 buf->f_bavail = sbi->free_clusters;
559 buf->f_fsid.val[0] = (u32)id; 559 buf->f_fsid.val[0] = (u32)id;
560 buf->f_fsid.val[1] = (u32)(id >> 32); 560 buf->f_fsid.val[1] = (u32)(id >> 32);
561 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 561 buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12;
562 562
563 return 0; 563 return 0;
564} 564}
@@ -577,7 +577,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
577 return i_pos; 577 return i_pos;
578} 578}
579 579
580static int fat_write_inode(struct inode *inode, int wait) 580static int __fat_write_inode(struct inode *inode, int wait)
581{ 581{
582 struct super_block *sb = inode->i_sb; 582 struct super_block *sb = inode->i_sb;
583 struct msdos_sb_info *sbi = MSDOS_SB(sb); 583 struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -634,9 +634,14 @@ retry:
634 return err; 634 return err;
635} 635}
636 636
637static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
638{
639 return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
640}
641
637int fat_sync_inode(struct inode *inode) 642int fat_sync_inode(struct inode *inode)
638{ 643{
639 return fat_write_inode(inode, 1); 644 return __fat_write_inode(inode, 1);
640} 645}
641 646
642EXPORT_SYMBOL_GPL(fat_sync_inode); 647EXPORT_SYMBOL_GPL(fat_sync_inode);
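
->write_inode() now takes a struct writeback_control, so the sync decision moves from an int argument to wbc->sync_mode, while fat_sync_inode() keeps its synchronous behaviour by calling the renamed __fat_write_inode() directly. A reduced model (simplified types, no actual I/O):

#include <stdio.h>

enum wb_sync_modes { WB_SYNC_NONE, WB_SYNC_ALL };
struct writeback_control { enum wb_sync_modes sync_mode; };

static int __fat_write_inode(int wait)
{
    printf("writing inode, wait=%d\n", wait);
    return 0;
}

static int fat_write_inode(struct writeback_control *wbc)
{
    return __fat_write_inode(wbc->sync_mode == WB_SYNC_ALL);
}

static int fat_sync_inode(void)
{
    return __fat_write_inode(1);    /* always synchronous */
}

int main(void)
{
    struct writeback_control wbc = { WB_SYNC_NONE };

    fat_write_inode(&wbc);   /* wait=0 */
    fat_sync_inode();        /* wait=1 */
    return 0;
}
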
@@ -858,6 +863,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
858 seq_puts(m, ",errors=panic"); 863 seq_puts(m, ",errors=panic");
859 else 864 else
860 seq_puts(m, ",errors=remount-ro"); 865 seq_puts(m, ",errors=remount-ro");
866 if (opts->discard)
867 seq_puts(m, ",discard");
861 868
862 return 0; 869 return 0;
863} 870}
@@ -871,7 +878,7 @@ enum {
871 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, 878 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
872 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, 879 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
873 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, 880 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
874 Opt_err_panic, Opt_err_ro, Opt_err, 881 Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err,
875}; 882};
876 883
877static const match_table_t fat_tokens = { 884static const match_table_t fat_tokens = {
@@ -899,6 +906,7 @@ static const match_table_t fat_tokens = {
899 {Opt_err_cont, "errors=continue"}, 906 {Opt_err_cont, "errors=continue"},
900 {Opt_err_panic, "errors=panic"}, 907 {Opt_err_panic, "errors=panic"},
901 {Opt_err_ro, "errors=remount-ro"}, 908 {Opt_err_ro, "errors=remount-ro"},
909 {Opt_discard, "discard"},
902 {Opt_obsolate, "conv=binary"}, 910 {Opt_obsolate, "conv=binary"},
903 {Opt_obsolate, "conv=text"}, 911 {Opt_obsolate, "conv=text"},
904 {Opt_obsolate, "conv=auto"}, 912 {Opt_obsolate, "conv=auto"},
@@ -1136,6 +1144,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
1136 case Opt_rodir: 1144 case Opt_rodir:
1137 opts->rodir = 1; 1145 opts->rodir = 1;
1138 break; 1146 break;
1147 case Opt_discard:
1148 opts->discard = 1;
1149 break;
1139 1150
1140 /* obsolete mount options */ 1151 /* obsolete mount options */
1141 case Opt_obsolate: 1152 case Opt_obsolate:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 0f55f5cb732f..d3da05f26465 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -9,6 +9,7 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/time.h>
12#include "fat.h" 13#include "fat.h"
13 14
14/* 15/*
@@ -157,10 +158,6 @@ extern struct timezone sys_tz;
157#define SECS_PER_MIN 60 158#define SECS_PER_MIN 60
158#define SECS_PER_HOUR (60 * 60) 159#define SECS_PER_HOUR (60 * 60)
159#define SECS_PER_DAY (SECS_PER_HOUR * 24) 160#define SECS_PER_DAY (SECS_PER_HOUR * 24)
160#define UNIX_SECS_1980 315532800L
161#if BITS_PER_LONG == 64
162#define UNIX_SECS_2108 4354819200L
163#endif
164/* days between 1.1.70 and 1.1.80 (2 leap days) */ 161/* days between 1.1.70 and 1.1.80 (2 leap days) */
165#define DAYS_DELTA (365 * 10 + 2) 162#define DAYS_DELTA (365 * 10 + 2)
166/* 120 (2100 - 1980) isn't leap year */ 163/* 120 (2100 - 1980) isn't leap year */
@@ -213,58 +210,35 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
213void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, 210void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
214 __le16 *time, __le16 *date, u8 *time_cs) 211 __le16 *time, __le16 *date, u8 *time_cs)
215{ 212{
216 time_t second = ts->tv_sec; 213 struct tm tm;
217 time_t day, leap_day, month, year; 214 time_to_tm(ts->tv_sec, sbi->options.tz_utc ? 0 :
215 -sys_tz.tz_minuteswest * 60, &tm);
218 216
219 if (!sbi->options.tz_utc) 217 /* FAT can only support years from 1980 to 2107 */
220 second -= sys_tz.tz_minuteswest * SECS_PER_MIN; 218 if (tm.tm_year < 1980 - 1900) {
221
222 /* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
223 if (second < UNIX_SECS_1980) {
224 *time = 0; 219 *time = 0;
225 *date = cpu_to_le16((0 << 9) | (1 << 5) | 1); 220 *date = cpu_to_le16((0 << 9) | (1 << 5) | 1);
226 if (time_cs) 221 if (time_cs)
227 *time_cs = 0; 222 *time_cs = 0;
228 return; 223 return;
229 } 224 }
230#if BITS_PER_LONG == 64 225 if (tm.tm_year > 2107 - 1900) {
231 if (second >= UNIX_SECS_2108) {
232 *time = cpu_to_le16((23 << 11) | (59 << 5) | 29); 226 *time = cpu_to_le16((23 << 11) | (59 << 5) | 29);
233 *date = cpu_to_le16((127 << 9) | (12 << 5) | 31); 227 *date = cpu_to_le16((127 << 9) | (12 << 5) | 31);
234 if (time_cs) 228 if (time_cs)
235 *time_cs = 199; 229 *time_cs = 199;
236 return; 230 return;
237 } 231 }
238#endif
239 232
240 day = second / SECS_PER_DAY - DAYS_DELTA; 233 /* from 1900 -> from 1980 */
241 year = day / 365; 234 tm.tm_year -= 80;
242 leap_day = (year + 3) / 4; 235 /* 0~11 -> 1~12 */
243 if (year > YEAR_2100) /* 2100 isn't leap year */ 236 tm.tm_mon++;
244 leap_day--; 237 /* 0~59 -> 0~29(2sec counts) */
245 if (year * 365 + leap_day > day) 238 tm.tm_sec >>= 1;
246 year--;
247 leap_day = (year + 3) / 4;
248 if (year > YEAR_2100) /* 2100 isn't leap year */
249 leap_day--;
250 day -= year * 365 + leap_day;
251
252 if (IS_LEAP_YEAR(year) && day == days_in_year[3]) {
253 month = 2;
254 } else {
255 if (IS_LEAP_YEAR(year) && day > days_in_year[3])
256 day--;
257 for (month = 1; month < 12; month++) {
258 if (days_in_year[month + 1] > day)
259 break;
260 }
261 }
262 day -= days_in_year[month];
263 239
264 *time = cpu_to_le16(((second / SECS_PER_HOUR) % 24) << 11 240 *time = cpu_to_le16(tm.tm_hour << 11 | tm.tm_min << 5 | tm.tm_sec);
265 | ((second / SECS_PER_MIN) % 60) << 5 241 *date = cpu_to_le16(tm.tm_year << 9 | tm.tm_mon << 5 | tm.tm_mday);
266 | (second % SECS_PER_MIN) >> 1);
267 *date = cpu_to_le16((year << 9) | (month << 5) | (day + 1));
268 if (time_cs) 242 if (time_cs)
269 *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000; 243 *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000;
270} 244}
@@ -285,4 +259,3 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
285 } 259 }
286 return err; 260 return err;
287} 261}
288
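The rewrite above drops the hand-rolled epoch arithmetic in favor of the generic time_to_tm() helper, which breaks a Unix timestamp into calendar fields (applying the local-time offset unless the filesystem is mounted with tz=UTC). Since struct tm counts years from 1900 and months from 0, only the small adjustments shown in the hunk are needed before packing the FAT fields. A worked example of the resulting date word:

	/* 2010-03-30: year field = 2010 - 1980 = 30, month = 3, day = 30. */
	*date = cpu_to_le16(30 << 9 | 3 << 5 | 30);   /* == 0x3C7E */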
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f565f24019b5..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
309{ 309{
310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options; 310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
311 wchar_t *ip, *ext_start, *end, *name_start; 311 wchar_t *ip, *ext_start, *end, *name_start;
312 unsigned char base[9], ext[4], buf[8], *p; 312 unsigned char base[9], ext[4], buf[5], *p;
313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE]; 313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
314 int chl, chi; 314 int chl, chi;
315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen; 315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
467 return 0; 467 return 0;
468 } 468 }
469 469
470 i = jiffies & 0xffff; 470 i = jiffies;
471 sz = (jiffies >> 16) & 0x7; 471 sz = (jiffies >> 16) & 0x7;
472 if (baselen > 2) { 472 if (baselen > 2) {
473 baselen = numtail2_baselen; 473 baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
476 name_res[baselen + 4] = '~'; 476 name_res[baselen + 4] = '~';
477 name_res[baselen + 5] = '1' + sz; 477 name_res[baselen + 5] = '1' + sz;
478 while (1) { 478 while (1) {
479 sprintf(buf, "%04X", i); 479 snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
480 memcpy(&name_res[baselen], buf, 4); 480 memcpy(&name_res[baselen], buf, 4);
481 if (vfat_find_form(dir, name_res) < 0) 481 if (vfat_find_form(dir, name_res) < 0)
482 break; 482 break;
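Why buf shrinks from 8 to 5 bytes and the mask moves into the format call: the retry loop a few lines below decrements i, so i (an int) can go negative, at which point "%04X" expands to eight hex digits and the old sprintf() wrote nine bytes (digits plus NUL) into an eight-byte buffer. Masking to 16 bits at print time caps the field at four digits, and snprintf() enforces the bound regardless. A minimal illustration with hypothetical values:

	int i = 5;
	char buf[8];

	i -= 11;                    /* retry step underflows: i == -6 */
	sprintf(buf, "%04X", i);    /* "FFFFFFFA" plus NUL = 9 bytes,
	                               one past the end of buf */
	snprintf(buf, sizeof(buf), "%04X", i & 0xffff);  /* "FFFA", safe */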
@@ -502,14 +502,14 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); 502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
503 if (*outlen < 0) 503 if (*outlen < 0)
504 return *outlen; 504 return *outlen;
505 else if (*outlen > 255) 505 else if (*outlen > FAT_LFN_LEN)
506 return -ENAMETOOLONG; 506 return -ENAMETOOLONG;
507 507
508 op = &outname[*outlen * sizeof(wchar_t)]; 508 op = &outname[*outlen * sizeof(wchar_t)];
509 } else { 509 } else {
510 if (nls) { 510 if (nls) {
511 for (i = 0, ip = name, op = outname, *outlen = 0; 511 for (i = 0, ip = name, op = outname, *outlen = 0;
512 i < len && *outlen <= 255; 512 i < len && *outlen <= FAT_LFN_LEN;
513 *outlen += 1) 513 *outlen += 1)
514 { 514 {
515 if (escape && (*ip == ':')) { 515 if (escape && (*ip == ':')) {
@@ -549,7 +549,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
549 return -ENAMETOOLONG; 549 return -ENAMETOOLONG;
550 } else { 550 } else {
551 for (i = 0, ip = name, op = outname, *outlen = 0; 551 for (i = 0, ip = name, op = outname, *outlen = 0;
552 i < len && *outlen <= 255; 552 i < len && *outlen <= FAT_LFN_LEN;
553 i++, *outlen += 1) 553 i++, *outlen += 1)
554 { 554 {
555 *op++ = *ip++; 555 *op++ = *ip++;
@@ -701,6 +701,15 @@ static int vfat_find(struct inode *dir, struct qstr *qname,
701 return fat_search_long(dir, qname->name, len, sinfo); 701 return fat_search_long(dir, qname->name, len, sinfo);
702} 702}
703 703
704/*
705 * (nfsd's) anonymous disconnected dentry?
706 * NOTE: !IS_ROOT() is not anonymous (i.e. d_splice_alias() did the job).
707 */
708static int vfat_d_anon_disconn(struct dentry *dentry)
709{
710 return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
711}
712
704static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, 713static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
705 struct nameidata *nd) 714 struct nameidata *nd)
706{ 715{
@@ -729,11 +738,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
729 } 738 }
730 739
731 alias = d_find_alias(inode); 740 alias = d_find_alias(inode);
732 if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { 741 if (alias && !vfat_d_anon_disconn(alias)) {
733 /* 742 /*
734 * This inode has a non-DCACHE_DISCONNECTED dentry. This 743 * This inode has a non anonymous-DCACHE_DISCONNECTED
735 * means the user did ->lookup() by another name 744 * dentry. This means the user did ->lookup() by
736 * (longname vs its 8.3 alias) in the past. 745 * another name (longname vs its 8.3 alias) in the past.
737 * 746 *
738 * Switch to new one for reason of locality if possible. 747 * Switch to new one for reason of locality if possible.
739 */ 748 */
@@ -743,7 +752,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
743 iput(inode); 752 iput(inode);
744 unlock_super(sb); 753 unlock_super(sb);
745 return alias; 754 return alias;
746 } 755 } else
756 dput(alias);
757
747out: 758out:
748 unlock_super(sb); 759 unlock_super(sb);
749 dentry->d_op = sb->s_root->d_op; 760 dentry->d_op = sb->s_root->d_op;
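Two fixes ride together in the vfat_lookup() hunk: the reuse test is narrowed so only genuinely anonymous (nfsd-style) disconnected aliases are rejected, and a dentry reference leak is closed. d_find_alias() returns its result with a reference held, so the branch that declines to reuse the alias must drop it. The shape of the pattern (a sketch, not the committed code):

	struct dentry *alias = d_find_alias(inode);  /* takes a reference */

	if (alias && !vfat_d_anon_disconn(alias)) {
		/* ... reuse path ... */
		return alias;        /* reference handed to the caller */
	}
	dput(alias);                 /* otherwise drop it (NULL-safe) */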
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2cf93ec40a67..452d02f9075e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -344,7 +344,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
344 switch (cmd) { 344 switch (cmd) {
345 case F_DUPFD: 345 case F_DUPFD:
346 case F_DUPFD_CLOEXEC: 346 case F_DUPFD_CLOEXEC:
347 if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) 347 if (arg >= rlimit(RLIMIT_NOFILE))
348 break; 348 break;
349 err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); 349 err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0);
350 if (err >= 0) { 350 if (err >= 0) {
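rlimit() is the then-new accessor that replaces the open-coded dereference; a hedged sketch of its shape (the real helper lives in <linux/sched.h> and reads the calling task's soft limit):

	static inline unsigned long rlimit(unsigned int limit)
	{
		/* Soft (current) limit of the calling task. */
		return current->signal->rlim[limit].rlim_cur;
	}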
@@ -618,60 +618,90 @@ static DEFINE_RWLOCK(fasync_lock);
618static struct kmem_cache *fasync_cache __read_mostly; 618static struct kmem_cache *fasync_cache __read_mostly;
619 619
620/* 620/*
621 * fasync_helper() is used by almost all character device drivers 621 * Remove a fasync entry. If successfully removed, return
622 * to set up the fasync queue. It returns negative on error, 0 if it did 622 * positive and clear the FASYNC flag. If no entry exists,
623 * no changes and positive if it added/deleted the entry. 623 * do nothing and return 0.
624 *
625 * NOTE! It is very important that the FASYNC flag always
626 * match the state "is the filp on a fasync list".
627 *
628 * We always take the 'filp->f_lock' first, since fasync_lock
629 * needs to be irq-safe.
624 */ 630 */
625int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) 631static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
626{ 632{
627 struct fasync_struct *fa, **fp; 633 struct fasync_struct *fa, **fp;
628 struct fasync_struct *new = NULL;
629 int result = 0; 634 int result = 0;
630 635
631 if (on) { 636 spin_lock(&filp->f_lock);
632 new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); 637 write_lock_irq(&fasync_lock);
633 if (!new) 638 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
634 return -ENOMEM; 639 if (fa->fa_file != filp)
640 continue;
641 *fp = fa->fa_next;
642 kmem_cache_free(fasync_cache, fa);
643 filp->f_flags &= ~FASYNC;
644 result = 1;
645 break;
635 } 646 }
647 write_unlock_irq(&fasync_lock);
648 spin_unlock(&filp->f_lock);
649 return result;
650}
651
652/*
653 * Add a fasync entry. Return negative on error, positive if
654 * added, and zero if it did nothing but change an existing one.
655 *
656 * NOTE! It is very important that the FASYNC flag always
657 * match the state "is the filp on a fasync list".
658 */
659static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
660{
661 struct fasync_struct *new, *fa, **fp;
662 int result = 0;
663
664 new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
665 if (!new)
666 return -ENOMEM;
636 667
637 /*
638 * We need to take f_lock first since it's not an IRQ-safe
639 * lock.
640 */
641 spin_lock(&filp->f_lock); 668 spin_lock(&filp->f_lock);
642 write_lock_irq(&fasync_lock); 669 write_lock_irq(&fasync_lock);
643 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 670 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
644 if (fa->fa_file == filp) { 671 if (fa->fa_file != filp)
645 if(on) { 672 continue;
646 fa->fa_fd = fd; 673 fa->fa_fd = fd;
647 kmem_cache_free(fasync_cache, new); 674 kmem_cache_free(fasync_cache, new);
648 } else { 675 goto out;
649 *fp = fa->fa_next;
650 kmem_cache_free(fasync_cache, fa);
651 result = 1;
652 }
653 goto out;
654 }
655 } 676 }
656 677
657 if (on) { 678 new->magic = FASYNC_MAGIC;
658 new->magic = FASYNC_MAGIC; 679 new->fa_file = filp;
659 new->fa_file = filp; 680 new->fa_fd = fd;
660 new->fa_fd = fd; 681 new->fa_next = *fapp;
661 new->fa_next = *fapp; 682 *fapp = new;
662 *fapp = new; 683 result = 1;
663 result = 1; 684 filp->f_flags |= FASYNC;
664 } 685
665out: 686out:
666 if (on)
667 filp->f_flags |= FASYNC;
668 else
669 filp->f_flags &= ~FASYNC;
670 write_unlock_irq(&fasync_lock); 687 write_unlock_irq(&fasync_lock);
671 spin_unlock(&filp->f_lock); 688 spin_unlock(&filp->f_lock);
672 return result; 689 return result;
673} 690}
674 691
692/*
693 * fasync_helper() is used by almost all character device drivers
694 * to set up the fasync queue, and for regular files by the file
695 * lease code. It returns negative on error, 0 if it made no changes
696 * and positive if it added/deleted the entry.
697 */
698int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
699{
700 if (!on)
701 return fasync_remove_entry(filp, fapp);
702 return fasync_add_entry(fd, filp, fapp);
703}
704
675EXPORT_SYMBOL(fasync_helper); 705EXPORT_SYMBOL(fasync_helper);
676 706
677void __kill_fasync(struct fasync_struct *fa, int sig, int band) 707void __kill_fasync(struct fasync_struct *fa, int sig, int band)
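The external contract of fasync_helper() is unchanged by the split into add/remove halves, so driver code keeps the canonical pattern; mydev and async_queue below are illustrative names:

	static int mydev_fasync(int fd, struct file *filp, int on)
	{
		struct mydev *dev = filp->private_data;

		/* on != 0 adds this filp to the queue, on == 0 removes it. */
		return fasync_helper(fd, filp, on, &dev->async_queue);
	}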
diff --git a/fs/fifo.c b/fs/fifo.c
index f8f97b8b6d44..5d6606ffc2d2 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/sched.h> 14#include <linux/sched.h>
16#include <linux/pipe_fs_i.h> 15#include <linux/pipe_fs_i.h>
diff --git a/fs/file.c b/fs/file.c
index 87e129030ab1..34bb7f71d994 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -257,7 +257,7 @@ int expand_files(struct files_struct *files, int nr)
257 * N.B. For clone tasks sharing a files structure, this test 257 * N.B. For clone tasks sharing a files structure, this test
258 * will limit the total number of files that can be opened. 258 * will limit the total number of files that can be opened.
259 */ 259 */
260 if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) 260 if (nr >= rlimit(RLIMIT_NOFILE))
261 return -EMFILE; 261 return -EMFILE;
262 262
263 /* Do we need to expand? */ 263 /* Do we need to expand? */
@@ -478,7 +478,7 @@ repeat:
478 error = fd; 478 error = fd;
479#if 1 479#if 1
480 /* Sanity check */ 480 /* Sanity check */
481 if (rcu_dereference(fdt->fd[fd]) != NULL) { 481 if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
482 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); 482 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
483 rcu_assign_pointer(fdt->fd[fd], NULL); 483 rcu_assign_pointer(fdt->fd[fd], NULL);
484 } 484 }
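Context for the _raw switch: with CONFIG_PROVE_RCU, plain rcu_dereference() asserts that an RCU read-side critical section is held, but alloc_fd() inspects the fdtable under files->file_lock instead. rcu_dereference_raw() is the annotation for "the caller supplies its own protection". The locking shape, reduced to a sketch:

	spin_lock(&files->file_lock);
	/* Slot protected by file_lock, not rcu_read_lock(), hence _raw. */
	if (rcu_dereference_raw(fdt->fd[fd]) != NULL)
		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
	spin_unlock(&files->file_lock);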
diff --git a/fs/file_table.c b/fs/file_table.c
index 8eb44042e009..32d12b78bac8 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -13,7 +13,6 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/security.h> 15#include <linux/security.h>
16#include <linux/ima.h>
17#include <linux/eventpoll.h> 16#include <linux/eventpoll.h>
18#include <linux/rcupdate.h> 17#include <linux/rcupdate.h>
19#include <linux/mount.h> 18#include <linux/mount.h>
@@ -22,9 +21,12 @@
22#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
23#include <linux/sysctl.h> 22#include <linux/sysctl.h>
24#include <linux/percpu_counter.h> 23#include <linux/percpu_counter.h>
24#include <linux/ima.h>
25 25
26#include <asm/atomic.h> 26#include <asm/atomic.h>
27 27
28#include "internal.h"
29
28/* sysctl tunables... */ 30/* sysctl tunables... */
29struct files_stat_struct files_stat = { 31struct files_stat_struct files_stat = {
30 .max_files = NR_FILE 32 .max_files = NR_FILE
@@ -148,8 +150,6 @@ fail:
148 return NULL; 150 return NULL;
149} 151}
150 152
151EXPORT_SYMBOL(get_empty_filp);
152
153/** 153/**
154 * alloc_file - allocate and initialize a 'struct file' 154 * alloc_file - allocate and initialize a 'struct file'
155 * @mnt: the vfsmount on which the file will reside 155 * @mnt: the vfsmount on which the file will reside
@@ -165,8 +165,8 @@ EXPORT_SYMBOL(get_empty_filp);
165 * If all the callers of init_file() are eliminated, its 165 * If all the callers of init_file() are eliminated, its
166 * code should be moved into this function. 166 * code should be moved into this function.
167 */ 167 */
168struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, 168struct file *alloc_file(struct path *path, fmode_t mode,
169 fmode_t mode, const struct file_operations *fop) 169 const struct file_operations *fop)
170{ 170{
171 struct file *file; 171 struct file *file;
172 172
@@ -174,35 +174,8 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
174 if (!file) 174 if (!file)
175 return NULL; 175 return NULL;
176 176
177 init_file(file, mnt, dentry, mode, fop); 177 file->f_path = *path;
178 return file; 178 file->f_mapping = path->dentry->d_inode->i_mapping;
179}
180EXPORT_SYMBOL(alloc_file);
181
182/**
183 * init_file - initialize a 'struct file'
184 * @file: the already allocated 'struct file' to initialized
185 * @mnt: the vfsmount on which the file resides
186 * @dentry: the dentry representing this file
187 * @mode: the mode the file is opened with
188 * @fop: the 'struct file_operations' for this file
189 *
190 * Use this instead of setting the members directly. Doing so
191 * avoids making mistakes like forgetting the mntget() or
192 * forgetting to take a write on the mnt.
193 *
194 * Note: This is a crappy interface. It is here to make
195 * merging with the existing users of get_empty_filp()
196 * who have complex failure logic easier. All users
197 * of this should be moving to alloc_file().
198 */
199int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
200 fmode_t mode, const struct file_operations *fop)
201{
202 int error = 0;
203 file->f_path.dentry = dentry;
204 file->f_path.mnt = mntget(mnt);
205 file->f_mapping = dentry->d_inode->i_mapping;
206 file->f_mode = mode; 179 file->f_mode = mode;
207 file->f_op = fop; 180 file->f_op = fop;
208 181
@@ -212,14 +185,14 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
212 * visible. We do this for consistency, and so 185 * visible. We do this for consistency, and so
213 * that we can do debugging checks at __fput() 186 * that we can do debugging checks at __fput()
214 */ 187 */
215 if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { 188 if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) {
216 file_take_write(file); 189 file_take_write(file);
217 error = mnt_clone_write(mnt); 190 WARN_ON(mnt_clone_write(path->mnt));
218 WARN_ON(error);
219 } 191 }
220 return error; 192 ima_counts_get(file);
193 return file;
221} 194}
222EXPORT_SYMBOL(init_file); 195EXPORT_SYMBOL(alloc_file);
223 196
224void fput(struct file *file) 197void fput(struct file *file)
225{ 198{
@@ -420,7 +393,9 @@ retry:
420 continue; 393 continue;
421 if (!(f->f_mode & FMODE_WRITE)) 394 if (!(f->f_mode & FMODE_WRITE))
422 continue; 395 continue;
396 spin_lock(&f->f_lock);
423 f->f_mode &= ~FMODE_WRITE; 397 f->f_mode &= ~FMODE_WRITE;
398 spin_unlock(&f->f_lock);
424 if (file_check_writeable(f) != 0) 399 if (file_check_writeable(f) != 0)
425 continue; 400 continue;
426 file_release_write(f); 401 file_release_write(f);
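With init_file() and the get_empty_filp() export gone, callers construct a struct path, take their own mnt and dentry references (which alloc_file() consumes via f_path = *path), and call the single entry point. A hedged sketch of a migrated caller; myfs_fops is a placeholder:

	struct path path;
	struct file *file;

	path.mnt = mntget(mnt);      /* both references are consumed */
	path.dentry = dget(dentry);  /* by alloc_file() on success   */

	file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &myfs_fops);
	if (!file)
		path_put(&path);     /* on failure they are still ours */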
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e181db..68ba492d8eef 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -10,10 +10,10 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/kmod.h> 13#include <linux/kmod.h>
15#include <linux/init.h> 14#include <linux/init.h>
16#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18 18
19/* 19/*
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index ed8f0b0dd880..1429f3ae1e86 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -33,7 +33,6 @@
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/buffer_head.h> 34#include <linux/buffer_head.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/pagemap.h> 36#include <linux/pagemap.h>
38 37
39#include "vxfs_extern.h" 38#include "vxfs_extern.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9d5360c4c2af..781a322ccb45 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/slab.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -242,6 +243,7 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
242/** 243/**
243 * bdi_start_writeback - start writeback 244 * bdi_start_writeback - start writeback
244 * @bdi: the backing device to write from 245 * @bdi: the backing device to write from
246 * @sb: write inodes from this super_block
245 * @nr_pages: the number of pages to write 247 * @nr_pages: the number of pages to write
246 * 248 *
247 * Description: 249 * Description:
@@ -380,10 +382,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
380 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 382 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
381} 383}
382 384
383static int write_inode(struct inode *inode, int sync) 385static int write_inode(struct inode *inode, struct writeback_control *wbc)
384{ 386{
385 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 387 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
386 return inode->i_sb->s_op->write_inode(inode, sync); 388 return inode->i_sb->s_op->write_inode(inode, wbc);
387 return 0; 389 return 0;
388} 390}
389 391
@@ -420,7 +422,6 @@ static int
420writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 422writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
421{ 423{
422 struct address_space *mapping = inode->i_mapping; 424 struct address_space *mapping = inode->i_mapping;
423 int wait = wbc->sync_mode == WB_SYNC_ALL;
424 unsigned dirty; 425 unsigned dirty;
425 int ret; 426 int ret;
426 427
@@ -438,7 +439,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
438 * We'll have another go at writing back this inode when we 439 * We'll have another go at writing back this inode when we
439 * completed a full scan of b_io. 440 * completed a full scan of b_io.
440 */ 441 */
441 if (!wait) { 442 if (wbc->sync_mode != WB_SYNC_ALL) {
442 requeue_io(inode); 443 requeue_io(inode);
443 return 0; 444 return 0;
444 } 445 }
@@ -460,15 +461,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
460 461
461 ret = do_writepages(mapping, wbc); 462 ret = do_writepages(mapping, wbc);
462 463
463 /* Don't write the inode if only I_DIRTY_PAGES was set */ 464 /*
464 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 465 * Make sure to wait on the data before writing out the metadata.
465 int err = write_inode(inode, wait); 466 * This is important for filesystems that modify metadata on data
467 * I/O completion.
468 */
469 if (wbc->sync_mode == WB_SYNC_ALL) {
470 int err = filemap_fdatawait(mapping);
466 if (ret == 0) 471 if (ret == 0)
467 ret = err; 472 ret = err;
468 } 473 }
469 474
470 if (wait) { 475 /* Don't write the inode if only I_DIRTY_PAGES was set */
471 int err = filemap_fdatawait(mapping); 476 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
477 int err = write_inode(inode, wbc);
472 if (ret == 0) 478 if (ret == 0)
473 ret = err; 479 ret = err;
474 } 480 }
@@ -614,7 +620,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
614 struct writeback_control *wbc) 620 struct writeback_control *wbc)
615{ 621{
616 struct super_block *sb = wbc->sb, *pin_sb = NULL; 622 struct super_block *sb = wbc->sb, *pin_sb = NULL;
617 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
618 const unsigned long start = jiffies; /* livelock avoidance */ 623 const unsigned long start = jiffies; /* livelock avoidance */
619 624
620 spin_lock(&inode_lock); 625 spin_lock(&inode_lock);
@@ -635,36 +640,11 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
635 continue; 640 continue;
636 } 641 }
637 642
638 if (!bdi_cap_writeback_dirty(wb->bdi)) {
639 redirty_tail(inode);
640 if (is_blkdev_sb) {
641 /*
642 * Dirty memory-backed blockdev: the ramdisk
643 * driver does this. Skip just this inode
644 */
645 continue;
646 }
647 /*
648 * Dirty memory-backed inode against a filesystem other
649 * than the kernel-internal bdev filesystem. Skip the
650 * entire superblock.
651 */
652 break;
653 }
654
655 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 643 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
656 requeue_io(inode); 644 requeue_io(inode);
657 continue; 645 continue;
658 } 646 }
659 647
660 if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
661 wbc->encountered_congestion = 1;
662 if (!is_blkdev_sb)
663 break; /* Skip a congested fs */
664 requeue_io(inode);
665 continue; /* Skip a congested blockdev */
666 }
667
668 /* 648 /*
669 * Was this inode dirtied after sync_sb_inodes was called? 649 * Was this inode dirtied after sync_sb_inodes was called?
670 * This keeps sync from extra jobs and livelock. 650 * This keeps sync from extra jobs and livelock.
@@ -756,6 +736,7 @@ static long wb_writeback(struct bdi_writeback *wb,
756 .sync_mode = args->sync_mode, 736 .sync_mode = args->sync_mode,
757 .older_than_this = NULL, 737 .older_than_this = NULL,
758 .for_kupdate = args->for_kupdate, 738 .for_kupdate = args->for_kupdate,
739 .for_background = args->for_background,
759 .range_cyclic = args->range_cyclic, 740 .range_cyclic = args->range_cyclic,
760 }; 741 };
761 unsigned long oldest_jif; 742 unsigned long oldest_jif;
@@ -787,7 +768,6 @@ static long wb_writeback(struct bdi_writeback *wb,
787 break; 768 break;
788 769
789 wbc.more_io = 0; 770 wbc.more_io = 0;
790 wbc.encountered_congestion = 0;
791 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 771 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
792 wbc.pages_skipped = 0; 772 wbc.pages_skipped = 0;
793 writeback_inodes_wb(wb, &wbc); 773 writeback_inodes_wb(wb, &wbc);
@@ -1213,6 +1193,23 @@ void writeback_inodes_sb(struct super_block *sb)
1213EXPORT_SYMBOL(writeback_inodes_sb); 1193EXPORT_SYMBOL(writeback_inodes_sb);
1214 1194
1215/** 1195/**
1196 * writeback_inodes_sb_if_idle - start writeback if none underway
1197 * @sb: the superblock
1198 *
1199 * Invoke writeback_inodes_sb if no writeback is currently underway.
1200 * Returns 1 if writeback was started, 0 if not.
1201 */
1202int writeback_inodes_sb_if_idle(struct super_block *sb)
1203{
1204 if (!writeback_in_progress(sb->s_bdi)) {
1205 writeback_inodes_sb(sb);
1206 return 1;
1207 } else
1208 return 0;
1209}
1210EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1211
1212/**
1216 * sync_inodes_sb - sync sb inode pages 1213 * sync_inodes_sb - sync sb inode pages
1217 * @sb: the superblock 1214 * @sb: the superblock
1218 * 1215 *
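The intended caller of writeback_inodes_sb_if_idle() is a filesystem that wants to kick background writeback before giving up on an allocation, without queueing redundant work when a flush is already running. An illustrative sketch of that pattern; the myfs_* names are placeholders:

	static int myfs_reserve_space(struct inode *inode, unsigned int bytes)
	{
		int err = myfs_try_reserve(inode, bytes);

		/* Nudge writeback once, then retry; real callers poll or
		 * wait for the flush to make progress. */
		if (err == -ENOSPC && writeback_inodes_sb_if_idle(inode->i_sb))
			err = myfs_try_reserve(inode, bytes);
		return err;
	}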
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 864dac20a242..cc94bb9563f2 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
1 1
2config FSCACHE 2config FSCACHE
3 tristate "General filesystem local caching manager" 3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK 4 select SLOW_WORK
6 help 5 help
7 This option enables a generic filesystem caching manager that can be 6 This option enables a generic filesystem caching manager that can be
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index e590242fa41a..1e1f286dd70e 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -12,6 +12,7 @@
12#define FSCACHE_DEBUG_LEVEL COOKIE 12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/slab.h>
15#include <linux/key.h> 16#include <linux/key.h>
16#include <keys/user-type.h> 17#include <keys/user-type.h>
17#include "internal.h" 18#include "internal.h"
@@ -91,7 +92,7 @@ EXPORT_SYMBOL(fscache_object_destroy);
91 */ 92 */
92static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) 93static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
93{ 94{
94 struct fscache_object *pobj, *obj, *minobj = NULL; 95 struct fscache_object *pobj, *obj = NULL, *minobj = NULL;
95 struct rb_node *p; 96 struct rb_node *p;
96 unsigned long pos; 97 unsigned long pos;
97 98
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
53static void fscache_object_slow_work_put_ref(struct slow_work *); 53static void fscache_object_slow_work_put_ref(struct slow_work *);
54static int fscache_object_slow_work_get_ref(struct slow_work *); 54static int fscache_object_slow_work_get_ref(struct slow_work *);
55static void fscache_object_slow_work_execute(struct slow_work *); 55static void fscache_object_slow_work_execute(struct slow_work *);
56#ifdef CONFIG_SLOW_WORK_PROC 56#ifdef CONFIG_SLOW_WORK_DEBUG
57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); 57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
58#endif 58#endif
59static void fscache_initialise_object(struct fscache_object *); 59static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
69 .get_ref = fscache_object_slow_work_get_ref, 69 .get_ref = fscache_object_slow_work_get_ref,
70 .put_ref = fscache_object_slow_work_put_ref, 70 .put_ref = fscache_object_slow_work_put_ref,
71 .execute = fscache_object_slow_work_execute, 71 .execute = fscache_object_slow_work_execute,
72#ifdef CONFIG_SLOW_WORK_PROC 72#ifdef CONFIG_SLOW_WORK_DEBUG
73 .desc = fscache_object_slow_work_desc, 73 .desc = fscache_object_slow_work_desc,
74#endif 74#endif
75}; 75};
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
364/* 364/*
365 * describe an object for slow-work debugging 365 * describe an object for slow-work debugging
366 */ 366 */
367#ifdef CONFIG_SLOW_WORK_PROC 367#ifdef CONFIG_SLOW_WORK_DEBUG
368static void fscache_object_slow_work_desc(struct slow_work *work, 368static void fscache_object_slow_work_desc(struct slow_work *work,
369 struct seq_file *m) 369 struct seq_file *m)
370{ 370{
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..f17cecafae44 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -14,6 +14,7 @@
14#define FSCACHE_DEBUG_LEVEL OPERATION 14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19atomic_t fscache_op_debug_id; 20atomic_t fscache_op_debug_id;
@@ -500,7 +501,7 @@ static void fscache_op_execute(struct slow_work *work)
500/* 501/*
501 * describe an operation for slow-work debugging 502 * describe an operation for slow-work debugging
502 */ 503 */
503#ifdef CONFIG_SLOW_WORK_PROC 504#ifdef CONFIG_SLOW_WORK_DEBUG
504static void fscache_op_desc(struct slow_work *work, struct seq_file *m) 505static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
505{ 506{
506 struct fscache_operation *op = 507 struct fscache_operation *op =
@@ -517,7 +518,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
517 .get_ref = fscache_op_get_ref, 518 .get_ref = fscache_op_get_ref,
518 .put_ref = fscache_op_put_ref, 519 .put_ref = fscache_op_put_ref,
519 .execute = fscache_op_execute, 520 .execute = fscache_op_execute,
520#ifdef CONFIG_SLOW_WORK_PROC 521#ifdef CONFIG_SLOW_WORK_DEBUG
521 .desc = fscache_op_desc, 522 .desc = fscache_op_desc,
522#endif 523#endif
523}; 524};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..47aefd376e54 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -14,6 +14,7 @@
14#include <linux/fscache-cache.h> 14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19/* 20/*
@@ -881,6 +882,7 @@ submit_failed:
881 goto nobufs; 882 goto nobufs;
882 883
883nobufs_unlock_obj: 884nobufs_unlock_obj:
885 spin_unlock(&cookie->stores_lock);
884 spin_unlock(&object->lock); 886 spin_unlock(&object->lock);
885nobufs: 887nobufs:
886 spin_unlock(&cookie->lock); 888 spin_unlock(&cookie->lock);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de792dcf3274..e1f8171278bd 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -44,6 +44,7 @@
44#include <linux/magic.h> 44#include <linux/magic.h>
45#include <linux/miscdevice.h> 45#include <linux/miscdevice.h>
46#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/slab.h>
47#include <linux/spinlock.h> 48#include <linux/spinlock.h>
48#include <linux/stat.h> 49#include <linux/stat.h>
49 50
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51d9e33d634f..eb7e9423691f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -865,13 +865,10 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
865 865
866 down_read(&fc->killsb); 866 down_read(&fc->killsb);
867 err = -ENOENT; 867 err = -ENOENT;
868 if (!fc->sb) 868 if (fc->sb) {
869 goto err_unlock; 869 err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
870 870 outarg.off, outarg.len);
871 err = fuse_reverse_inval_inode(fc->sb, outarg.ino, 871 }
872 outarg.off, outarg.len);
873
874err_unlock:
875 up_read(&fc->killsb); 872 up_read(&fc->killsb);
876 return err; 873 return err;
877 874
@@ -884,10 +881,15 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
884 struct fuse_copy_state *cs) 881 struct fuse_copy_state *cs)
885{ 882{
886 struct fuse_notify_inval_entry_out outarg; 883 struct fuse_notify_inval_entry_out outarg;
887 int err = -EINVAL; 884 int err = -ENOMEM;
888 char buf[FUSE_NAME_MAX+1]; 885 char *buf;
889 struct qstr name; 886 struct qstr name;
890 887
888 buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
889 if (!buf)
890 goto err;
891
892 err = -EINVAL;
891 if (size < sizeof(outarg)) 893 if (size < sizeof(outarg))
892 goto err; 894 goto err;
893 895
@@ -910,16 +912,14 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
910 912
911 down_read(&fc->killsb); 913 down_read(&fc->killsb);
912 err = -ENOENT; 914 err = -ENOENT;
913 if (!fc->sb) 915 if (fc->sb)
914 goto err_unlock; 916 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
915
916 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
917
918err_unlock:
919 up_read(&fc->killsb); 917 up_read(&fc->killsb);
918 kfree(buf);
920 return err; 919 return err;
921 920
922err: 921err:
922 kfree(buf);
923 fuse_copy_finish(cs); 923 fuse_copy_finish(cs);
924 return err; 924 return err;
925} 925}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c18913a777ae..a9f5e137f1d3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -828,6 +828,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
828 if (!page) 828 if (!page)
829 break; 829 break;
830 830
831 if (mapping_writably_mapped(mapping))
832 flush_dcache_page(page);
833
831 pagefault_disable(); 834 pagefault_disable();
832 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); 835 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
833 pagefault_enable(); 836 pagefault_enable();
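The new flush_dcache_page() call mirrors what generic_perform_write() already does for the page-cache write path: if the page is writably mapped into userspace, the kernel view must be flushed before writing through the kernel mapping, or an aliasing (e.g. VIVT) D-cache can operate on stale lines. The pattern in context:

	/* Keep the user and kernel views of the page coherent before
	 * the kernel-mapping write; a no-op on non-aliasing caches. */
	if (mapping_writably_mapped(mapping))
		flush_dcache_page(page);

	pagefault_disable();
	tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
	pagefault_enable();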
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1a822ce2b24b..ec14d19ce501 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -850,7 +850,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
850 req->in.args[0].size = sizeof(*arg); 850 req->in.args[0].size = sizeof(*arg);
851 req->in.args[0].value = arg; 851 req->in.args[0].value = arg;
852 req->out.numargs = 1; 852 req->out.numargs = 1;
853 /* Variable length arguement used for backward compatibility 853 /* Variable length argument used for backward compatibility
854 with interface version < 7.5. Rest of init_out is zeroed 854 with interface version < 7.5. Rest of init_out is zeroed
855 by do_get_request(), so a short reply is not a problem */ 855 by do_get_request(), so a short reply is not a problem */
856 req->out.argvar = 1; 856 req->out.argvar = 1;
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index e0b53aa7bbec..fe5df5457656 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -1,62 +1,59 @@
1/* 1/*
2 * fs/generic_acl.c
3 *
4 * (C) 2005 Andreas Gruenbacher <agruen@suse.de> 2 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
5 * 3 *
6 * This file is released under the GPL. 4 * This file is released under the GPL.
5 *
6 * Generic ACL support for in-memory filesystems.
7 */ 7 */
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/gfp.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/generic_acl.h> 12#include <linux/generic_acl.h>
13#include <linux/posix_acl.h>
14#include <linux/posix_acl_xattr.h>
12 15
13/** 16
14 * generic_acl_list - Generic xattr_handler->list() operation 17static size_t
15 * @ops: Filesystem specific getacl and setacl callbacks 18generic_acl_list(struct dentry *dentry, char *list, size_t list_size,
16 */ 19 const char *name, size_t name_len, int type)
17size_t
18generic_acl_list(struct inode *inode, struct generic_acl_operations *ops,
19 int type, char *list, size_t list_size)
20{ 20{
21 struct posix_acl *acl; 21 struct posix_acl *acl;
22 const char *name; 22 const char *xname;
23 size_t size; 23 size_t size;
24 24
25 acl = ops->getacl(inode, type); 25 acl = get_cached_acl(dentry->d_inode, type);
26 if (!acl) 26 if (!acl)
27 return 0; 27 return 0;
28 posix_acl_release(acl); 28 posix_acl_release(acl);
29 29
30 switch(type) { 30 switch (type) {
31 case ACL_TYPE_ACCESS: 31 case ACL_TYPE_ACCESS:
32 name = POSIX_ACL_XATTR_ACCESS; 32 xname = POSIX_ACL_XATTR_ACCESS;
33 break; 33 break;
34 34 case ACL_TYPE_DEFAULT:
35 case ACL_TYPE_DEFAULT: 35 xname = POSIX_ACL_XATTR_DEFAULT;
36 name = POSIX_ACL_XATTR_DEFAULT; 36 break;
37 break; 37 default:
38 38 return 0;
39 default:
40 return 0;
41 } 39 }
42 size = strlen(name) + 1; 40 size = strlen(xname) + 1;
43 if (list && size <= list_size) 41 if (list && size <= list_size)
44 memcpy(list, name, size); 42 memcpy(list, xname, size);
45 return size; 43 return size;
46} 44}
47 45
48/** 46static int
49 * generic_acl_get - Generic xattr_handler->get() operation 47generic_acl_get(struct dentry *dentry, const char *name, void *buffer,
50 * @ops: Filesystem specific getacl and setacl callbacks 48 size_t size, int type)
51 */
52int
53generic_acl_get(struct inode *inode, struct generic_acl_operations *ops,
54 int type, void *buffer, size_t size)
55{ 49{
56 struct posix_acl *acl; 50 struct posix_acl *acl;
57 int error; 51 int error;
58 52
59 acl = ops->getacl(inode, type); 53 if (strcmp(name, "") != 0)
54 return -EINVAL;
55
56 acl = get_cached_acl(dentry->d_inode, type);
60 if (!acl) 57 if (!acl)
61 return -ENODATA; 58 return -ENODATA;
62 error = posix_acl_to_xattr(acl, buffer, size); 59 error = posix_acl_to_xattr(acl, buffer, size);
@@ -65,17 +62,16 @@ generic_acl_get(struct inode *inode, struct generic_acl_operations *ops,
65 return error; 62 return error;
66} 63}
67 64
68/** 65static int
69 * generic_acl_set - Generic xattr_handler->set() operation 66generic_acl_set(struct dentry *dentry, const char *name, const void *value,
70 * @ops: Filesystem specific getacl and setacl callbacks 67 size_t size, int flags, int type)
71 */
72int
73generic_acl_set(struct inode *inode, struct generic_acl_operations *ops,
74 int type, const void *value, size_t size)
75{ 68{
69 struct inode *inode = dentry->d_inode;
76 struct posix_acl *acl = NULL; 70 struct posix_acl *acl = NULL;
77 int error; 71 int error;
78 72
73 if (strcmp(name, "") != 0)
74 return -EINVAL;
79 if (S_ISLNK(inode->i_mode)) 75 if (S_ISLNK(inode->i_mode))
80 return -EOPNOTSUPP; 76 return -EOPNOTSUPP;
81 if (!is_owner_or_cap(inode)) 77 if (!is_owner_or_cap(inode))
@@ -91,28 +87,27 @@ generic_acl_set(struct inode *inode, struct generic_acl_operations *ops,
91 error = posix_acl_valid(acl); 87 error = posix_acl_valid(acl);
92 if (error) 88 if (error)
93 goto failed; 89 goto failed;
94 switch(type) { 90 switch (type) {
95 case ACL_TYPE_ACCESS: 91 case ACL_TYPE_ACCESS:
96 mode = inode->i_mode; 92 mode = inode->i_mode;
97 error = posix_acl_equiv_mode(acl, &mode); 93 error = posix_acl_equiv_mode(acl, &mode);
98 if (error < 0) 94 if (error < 0)
99 goto failed; 95 goto failed;
100 inode->i_mode = mode; 96 inode->i_mode = mode;
101 if (error == 0) { 97 if (error == 0) {
102 posix_acl_release(acl); 98 posix_acl_release(acl);
103 acl = NULL; 99 acl = NULL;
104 } 100 }
105 break; 101 break;
106 102 case ACL_TYPE_DEFAULT:
107 case ACL_TYPE_DEFAULT: 103 if (!S_ISDIR(inode->i_mode)) {
108 if (!S_ISDIR(inode->i_mode)) { 104 error = -EINVAL;
109 error = -EINVAL; 105 goto failed;
110 goto failed; 106 }
111 } 107 break;
112 break;
113 } 108 }
114 } 109 }
115 ops->setacl(inode, type, acl); 110 set_cached_acl(inode, type, acl);
116 error = 0; 111 error = 0;
117failed: 112failed:
118 posix_acl_release(acl); 113 posix_acl_release(acl);
@@ -121,14 +116,12 @@ failed:
121 116
122/** 117/**
123 * generic_acl_init - Take care of acl inheritance at @inode create time 118 * generic_acl_init - Take care of acl inheritance at @inode create time
124 * @ops: Filesystem specific getacl and setacl callbacks
125 * 119 *
126 * Files created inside a directory with a default ACL inherit the 120 * Files created inside a directory with a default ACL inherit the
127 * directory's default ACL. 121 * directory's default ACL.
128 */ 122 */
129int 123int
130generic_acl_init(struct inode *inode, struct inode *dir, 124generic_acl_init(struct inode *inode, struct inode *dir)
131 struct generic_acl_operations *ops)
132{ 125{
133 struct posix_acl *acl = NULL; 126 struct posix_acl *acl = NULL;
134 mode_t mode = inode->i_mode; 127 mode_t mode = inode->i_mode;
@@ -136,7 +129,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
136 129
137 inode->i_mode = mode & ~current_umask(); 130 inode->i_mode = mode & ~current_umask();
138 if (!S_ISLNK(inode->i_mode)) 131 if (!S_ISLNK(inode->i_mode))
139 acl = ops->getacl(dir, ACL_TYPE_DEFAULT); 132 acl = get_cached_acl(dir, ACL_TYPE_DEFAULT);
140 if (acl) { 133 if (acl) {
141 struct posix_acl *clone; 134 struct posix_acl *clone;
142 135
@@ -145,7 +138,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
145 error = -ENOMEM; 138 error = -ENOMEM;
146 if (!clone) 139 if (!clone)
147 goto cleanup; 140 goto cleanup;
148 ops->setacl(inode, ACL_TYPE_DEFAULT, clone); 141 set_cached_acl(inode, ACL_TYPE_DEFAULT, clone);
149 posix_acl_release(clone); 142 posix_acl_release(clone);
150 } 143 }
151 clone = posix_acl_clone(acl, GFP_KERNEL); 144 clone = posix_acl_clone(acl, GFP_KERNEL);
@@ -156,7 +149,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
156 if (error >= 0) { 149 if (error >= 0) {
157 inode->i_mode = mode; 150 inode->i_mode = mode;
158 if (error > 0) 151 if (error > 0)
159 ops->setacl(inode, ACL_TYPE_ACCESS, clone); 152 set_cached_acl(inode, ACL_TYPE_ACCESS, clone);
160 } 153 }
161 posix_acl_release(clone); 154 posix_acl_release(clone);
162 } 155 }
@@ -169,20 +162,19 @@ cleanup:
169 162
170/** 163/**
171 * generic_acl_chmod - change the access acl of @inode upon chmod() 164 * generic_acl_chmod - change the access acl of @inode upon chmod()
172 * @ops: Filesystem specific getacl and setacl callbacks
173 * 165 *
174 * A chmod also changes the permissions of the owner, group/mask, and 166 * A chmod also changes the permissions of the owner, group/mask, and
175 * other ACL entries. 167 * other ACL entries.
176 */ 168 */
177int 169int
178generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops) 170generic_acl_chmod(struct inode *inode)
179{ 171{
180 struct posix_acl *acl, *clone; 172 struct posix_acl *acl, *clone;
181 int error = 0; 173 int error = 0;
182 174
183 if (S_ISLNK(inode->i_mode)) 175 if (S_ISLNK(inode->i_mode))
184 return -EOPNOTSUPP; 176 return -EOPNOTSUPP;
185 acl = ops->getacl(inode, ACL_TYPE_ACCESS); 177 acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
186 if (acl) { 178 if (acl) {
187 clone = posix_acl_clone(acl, GFP_KERNEL); 179 clone = posix_acl_clone(acl, GFP_KERNEL);
188 posix_acl_release(acl); 180 posix_acl_release(acl);
@@ -190,8 +182,37 @@ generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops)
190 return -ENOMEM; 182 return -ENOMEM;
191 error = posix_acl_chmod_masq(clone, inode->i_mode); 183 error = posix_acl_chmod_masq(clone, inode->i_mode);
192 if (!error) 184 if (!error)
193 ops->setacl(inode, ACL_TYPE_ACCESS, clone); 185 set_cached_acl(inode, ACL_TYPE_ACCESS, clone);
194 posix_acl_release(clone); 186 posix_acl_release(clone);
195 } 187 }
196 return error; 188 return error;
197} 189}
190
191int
192generic_check_acl(struct inode *inode, int mask)
193{
194 struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
195
196 if (acl) {
197 int error = posix_acl_permission(inode, acl, mask);
198 posix_acl_release(acl);
199 return error;
200 }
201 return -EAGAIN;
202}
203
204struct xattr_handler generic_acl_access_handler = {
205 .prefix = POSIX_ACL_XATTR_ACCESS,
206 .flags = ACL_TYPE_ACCESS,
207 .list = generic_acl_list,
208 .get = generic_acl_get,
209 .set = generic_acl_set,
210};
211
212struct xattr_handler generic_acl_default_handler = {
213 .prefix = POSIX_ACL_XATTR_DEFAULT,
214 .flags = ACL_TYPE_DEFAULT,
215 .list = generic_acl_list,
216 .get = generic_acl_get,
217 .set = generic_acl_set,
218};
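These two handler objects are what an in-memory filesystem now plugs into its superblock instead of calling the removed generic_acl_* wrappers directly. A hedged sketch of the wiring, mirroring what shmem does later in this series (the myfs names are illustrative):

	static struct xattr_handler *myfs_xattr_handlers[] = {
		&generic_acl_access_handler,   /* system.posix_acl_access  */
		&generic_acl_default_handler,  /* system.posix_acl_default */
		NULL
	};

	/* at fill_super time: */
	sb->s_xattr = myfs_xattr_handlers;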
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 5971359d2090..a47b43107112 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,6 +8,7 @@ config GFS2_FS
8 select FS_POSIX_ACL 8 select FS_POSIX_ACL
9 select CRC32 9 select CRC32
10 select SLOW_WORK 10 select SLOW_WORK
11 select QUOTACTL
11 help 12 help
12 A cluster filesystem. 13 A cluster filesystem.
13 14
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3fc4e3ac7d84..87ee309d4c24 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -12,6 +12,7 @@
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
15#include <linux/posix_acl.h> 16#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h> 17#include <linux/posix_acl_xattr.h>
17#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
@@ -26,108 +27,44 @@
26#include "trans.h" 27#include "trans.h"
27#include "util.h" 28#include "util.h"
28 29
29#define ACL_ACCESS 1 30static const char *gfs2_acl_name(int type)
30#define ACL_DEFAULT 0
31
32int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
33 struct gfs2_ea_request *er, int *remove, mode_t *mode)
34{ 31{
35 struct posix_acl *acl; 32 switch (type) {
36 int error; 33 case ACL_TYPE_ACCESS:
37 34 return GFS2_POSIX_ACL_ACCESS;
38 error = gfs2_acl_validate_remove(ip, access); 35 case ACL_TYPE_DEFAULT:
39 if (error) 36 return GFS2_POSIX_ACL_DEFAULT;
40 return error;
41
42 if (!er->er_data)
43 return -EINVAL;
44
45 acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
46 if (IS_ERR(acl))
47 return PTR_ERR(acl);
48 if (!acl) {
49 *remove = 1;
50 return 0;
51 }
52
53 error = posix_acl_valid(acl);
54 if (error)
55 goto out;
56
57 if (access) {
58 error = posix_acl_equiv_mode(acl, mode);
59 if (!error)
60 *remove = 1;
61 else if (error > 0)
62 error = 0;
63 } 37 }
64 38 return NULL;
65out:
66 posix_acl_release(acl);
67 return error;
68}
69
70int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
71{
72 if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
73 return -EOPNOTSUPP;
74 if (!is_owner_or_cap(&ip->i_inode))
75 return -EPERM;
76 if (S_ISLNK(ip->i_inode.i_mode))
77 return -EOPNOTSUPP;
78 if (!access && !S_ISDIR(ip->i_inode.i_mode))
79 return -EACCES;
80
81 return 0;
82} 39}
83 40
84static int acl_get(struct gfs2_inode *ip, const char *name, 41static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
85 struct posix_acl **acl, struct gfs2_ea_location *el,
86 char **datap, unsigned int *lenp)
87{ 42{
43 struct posix_acl *acl;
44 const char *name;
88 char *data; 45 char *data;
89 unsigned int len; 46 int len;
90 int error;
91
92 el->el_bh = NULL;
93 47
94 if (!ip->i_eattr) 48 if (!ip->i_eattr)
95 return 0; 49 return NULL;
96
97 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
98 if (error)
99 return error;
100 if (!el->el_ea)
101 return 0;
102 if (!GFS2_EA_DATA_LEN(el->el_ea))
103 goto out;
104 50
105 len = GFS2_EA_DATA_LEN(el->el_ea); 51 acl = get_cached_acl(&ip->i_inode, type);
106 data = kmalloc(len, GFP_NOFS); 52 if (acl != ACL_NOT_CACHED)
107 error = -ENOMEM; 53 return acl;
108 if (!data)
109 goto out;
110 54
111 error = gfs2_ea_get_copy(ip, el, data, len); 55 name = gfs2_acl_name(type);
112 if (error < 0) 56 if (name == NULL)
113 goto out_kfree; 57 return ERR_PTR(-EINVAL);
114 error = 0;
115 58
116 if (acl) { 59 len = gfs2_xattr_acl_get(ip, name, &data);
117 *acl = posix_acl_from_xattr(data, len); 60 if (len < 0)
118 if (IS_ERR(*acl)) 61 return ERR_PTR(len);
119 error = PTR_ERR(*acl); 62 if (len == 0)
120 } 63 return NULL;
121 64
122out_kfree: 65 acl = posix_acl_from_xattr(data, len);
123 if (error || !datap) { 66 kfree(data);
124 kfree(data); 67 return acl;
125 } else {
126 *datap = data;
127 *lenp = len;
128 }
129out:
130 return error;
131} 68}
132 69
133/** 70/**
@@ -140,14 +77,12 @@ out:
140 77
141int gfs2_check_acl(struct inode *inode, int mask) 78int gfs2_check_acl(struct inode *inode, int mask)
142{ 79{
143 struct gfs2_ea_location el; 80 struct posix_acl *acl;
144 struct posix_acl *acl = NULL;
145 int error; 81 int error;
146 82
147 error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL); 83 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
148 brelse(el.el_bh); 84 if (IS_ERR(acl))
149 if (error) 85 return PTR_ERR(acl);
150 return error;
151 86
152 if (acl) { 87 if (acl) {
153 error = posix_acl_permission(inode, acl, mask); 88 error = posix_acl_permission(inode, acl, mask);
@@ -158,57 +93,75 @@ int gfs2_check_acl(struct inode *inode, int mask)
158 return -EAGAIN; 93 return -EAGAIN;
159} 94}
160 95
161static int munge_mode(struct gfs2_inode *ip, mode_t mode) 96static int gfs2_set_mode(struct inode *inode, mode_t mode)
162{ 97{
163 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 98 int error = 0;
164 struct buffer_head *dibh;
165 int error;
166 99
167 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 100 if (mode != inode->i_mode) {
168 if (error) 101 struct iattr iattr;
169 return error;
170 102
171 error = gfs2_meta_inode_buffer(ip, &dibh); 103 iattr.ia_valid = ATTR_MODE;
172 if (!error) { 104 iattr.ia_mode = mode;
173 gfs2_assert_withdraw(sdp, 105
174 (ip->i_inode.i_mode & S_IFMT) == (mode & S_IFMT)); 106 error = gfs2_setattr_simple(GFS2_I(inode), &iattr);
175 ip->i_inode.i_mode = mode;
176 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
177 gfs2_dinode_out(ip, dibh->b_data);
178 brelse(dibh);
179 } 107 }
180 108
181 gfs2_trans_end(sdp); 109 return error;
110}
111
112static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
113{
114 int error;
115 int len;
116 char *data;
117 const char *name = gfs2_acl_name(type);
182 118
183 return 0; 119 BUG_ON(name == NULL);
120 len = posix_acl_to_xattr(acl, NULL, 0);
121 if (len == 0)
122 return 0;
123 data = kmalloc(len, GFP_NOFS);
124 if (data == NULL)
125 return -ENOMEM;
126 error = posix_acl_to_xattr(acl, data, len);
127 if (error < 0)
128 goto out;
129 error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
130 if (!error)
131 set_cached_acl(inode, type, acl);
132out:
133 kfree(data);
134 return error;
184} 135}
185 136
186int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) 137int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
187{ 138{
188 struct gfs2_ea_location el;
189 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 139 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
190 struct posix_acl *acl = NULL, *clone; 140 struct posix_acl *acl, *clone;
191 mode_t mode = ip->i_inode.i_mode; 141 mode_t mode = inode->i_mode;
192 char *data = NULL; 142 int error = 0;
193 unsigned int len;
194 int error;
195 143
196 if (!sdp->sd_args.ar_posix_acl) 144 if (!sdp->sd_args.ar_posix_acl)
197 return 0; 145 return 0;
198 if (S_ISLNK(ip->i_inode.i_mode)) 146 if (S_ISLNK(inode->i_mode))
199 return 0; 147 return 0;
200 148
201 error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len); 149 acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT);
202 brelse(el.el_bh); 150 if (IS_ERR(acl))
203 if (error) 151 return PTR_ERR(acl);
204 return error;
205 if (!acl) { 152 if (!acl) {
206 mode &= ~current_umask(); 153 mode &= ~current_umask();
207 if (mode != ip->i_inode.i_mode) 154 if (mode != inode->i_mode)
208 error = munge_mode(ip, mode); 155 error = gfs2_set_mode(inode, mode);
209 return error; 156 return error;
210 } 157 }
211 158
159 if (S_ISDIR(inode->i_mode)) {
160 error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl);
161 if (error)
162 goto out;
163 }
164
212 clone = posix_acl_clone(acl, GFP_NOFS); 165 clone = posix_acl_clone(acl, GFP_NOFS);
213 error = -ENOMEM; 166 error = -ENOMEM;
214 if (!clone) 167 if (!clone)
@@ -216,43 +169,32 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
216 posix_acl_release(acl); 169 posix_acl_release(acl);
217 acl = clone; 170 acl = clone;
218 171
219 if (S_ISDIR(ip->i_inode.i_mode)) {
220 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
221 GFS2_POSIX_ACL_DEFAULT, data, len, 0);
222 if (error)
223 goto out;
224 }
225
226 error = posix_acl_create_masq(acl, &mode); 172 error = posix_acl_create_masq(acl, &mode);
227 if (error < 0) 173 if (error < 0)
228 goto out; 174 goto out;
229 if (error == 0) 175 if (error == 0)
230 goto munge; 176 goto munge;
231 177
232 posix_acl_to_xattr(acl, data, len); 178 error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl);
233 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
234 GFS2_POSIX_ACL_ACCESS, data, len, 0);
235 if (error) 179 if (error)
236 goto out; 180 goto out;
237munge: 181munge:
238 error = munge_mode(ip, mode); 182 error = gfs2_set_mode(inode, mode);
239out: 183out:
240 posix_acl_release(acl); 184 posix_acl_release(acl);
241 kfree(data);
242 return error; 185 return error;
243} 186}
244 187
245int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) 188int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
246{ 189{
247 struct posix_acl *acl = NULL, *clone; 190 struct posix_acl *acl, *clone;
248 struct gfs2_ea_location el;
249 char *data; 191 char *data;
250 unsigned int len; 192 unsigned int len;
251 int error; 193 int error;
252 194
253 error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len); 195 acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS);
254 if (error) 196 if (IS_ERR(acl))
255 goto out_brelse; 197 return PTR_ERR(acl);
256 if (!acl) 198 if (!acl)
257 return gfs2_setattr_simple(ip, attr); 199 return gfs2_setattr_simple(ip, attr);
258 200
@@ -265,15 +207,138 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
265 207
266 error = posix_acl_chmod_masq(acl, attr->ia_mode); 208 error = posix_acl_chmod_masq(acl, attr->ia_mode);
267 if (!error) { 209 if (!error) {
210 len = posix_acl_to_xattr(acl, NULL, 0);
211 data = kmalloc(len, GFP_NOFS);
212 error = -ENOMEM;
213 if (data == NULL)
214 goto out;
268 posix_acl_to_xattr(acl, data, len); 215 posix_acl_to_xattr(acl, data, len);
269 error = gfs2_ea_acl_chmod(ip, &el, attr, data); 216 error = gfs2_xattr_acl_chmod(ip, attr, data);
217 kfree(data);
218 set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl);
270 } 219 }
271 220
272out: 221out:
273 posix_acl_release(acl); 222 posix_acl_release(acl);
274 kfree(data);
275out_brelse:
276 brelse(el.el_bh);
277 return error; 223 return error;
278} 224}
279 225
226static int gfs2_acl_type(const char *name)
227{
228 if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0)
229 return ACL_TYPE_ACCESS;
230 if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0)
231 return ACL_TYPE_DEFAULT;
232 return -EINVAL;
233}
234
235static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
236 void *buffer, size_t size, int xtype)
237{
238 struct inode *inode = dentry->d_inode;
239 struct posix_acl *acl;
240 int type;
241 int error;
242
243 type = gfs2_acl_type(name);
244 if (type < 0)
245 return type;
246
247 acl = gfs2_acl_get(GFS2_I(inode), type);
248 if (IS_ERR(acl))
249 return PTR_ERR(acl);
250 if (acl == NULL)
251 return -ENODATA;
252
253 error = posix_acl_to_xattr(acl, buffer, size);
254 posix_acl_release(acl);
255
256 return error;
257}
258
259static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
260 const void *value, size_t size, int flags,
261 int xtype)
262{
263 struct inode *inode = dentry->d_inode;
264 struct gfs2_sbd *sdp = GFS2_SB(inode);
265 struct posix_acl *acl = NULL;
266 int error = 0, type;
267
268 if (!sdp->sd_args.ar_posix_acl)
269 return -EOPNOTSUPP;
270
271 type = gfs2_acl_type(name);
272 if (type < 0)
273 return type;
274 if (flags & XATTR_CREATE)
275 return -EINVAL;
276 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
277 return value ? -EACCES : 0;
278 if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
279 return -EPERM;
280 if (S_ISLNK(inode->i_mode))
281 return -EOPNOTSUPP;
282
283 if (!value)
284 goto set_acl;
285
286 acl = posix_acl_from_xattr(value, size);
287 if (!acl) {
288 /*
289 * acl_set_file(3) may request that we set default ACLs with
290 * zero length -- defend (gracefully) against that here.
291 */
292 goto out;
293 }
294 if (IS_ERR(acl)) {
295 error = PTR_ERR(acl);
296 goto out;
297 }
298
299 error = posix_acl_valid(acl);
300 if (error)
301 goto out_release;
302
303 error = -EINVAL;
304 if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
305 goto out_release;
306
307 if (type == ACL_TYPE_ACCESS) {
308 mode_t mode = inode->i_mode;
309 error = posix_acl_equiv_mode(acl, &mode);
310
311 if (error <= 0) {
312 posix_acl_release(acl);
313 acl = NULL;
314
315 if (error < 0)
316 return error;
317 }
318
319 error = gfs2_set_mode(inode, mode);
320 if (error)
321 goto out_release;
322 }
323
324set_acl:
325 error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS);
326 if (!error) {
327 if (acl)
328 set_cached_acl(inode, type, acl);
329 else
330 forget_cached_acl(inode, type);
331 }
332out_release:
333 posix_acl_release(acl);
334out:
335 return error;
336}
337
338struct xattr_handler gfs2_xattr_system_handler = {
339 .prefix = XATTR_SYSTEM_PREFIX,
340 .flags = GFS2_EATYPE_SYS,
341 .get = gfs2_xattr_system_get,
342 .set = gfs2_xattr_system_set,
343};
344
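
The rewritten gfs2_acl_chmod() above uses the usual measure-then-fill convention: posix_acl_to_xattr(acl, NULL, 0) reports the encoded length, the buffer is kmalloc'd, and a second call fills it. A minimal userspace sketch of the same idiom, with snprintf standing in for the encoder (all names below are illustrative, not GFS2 code):

#include <stdio.h>
#include <stdlib.h>

static int encode(const char *src, void *buf, size_t size)
{
        int needed = snprintf(buf, size, "acl:%s", src);
        return needed + 1;              /* bytes required, incl. NUL */
}

int main(void)
{
        const char *acl = "u::rwx,g::r-x,o::r--";
        int len = encode(acl, NULL, 0); /* first pass: size only */
        char *data = malloc(len);
        if (!data)
                return 1;               /* -ENOMEM in the kernel path */
        encode(acl, data, len);         /* second pass: fill */
        printf("%d bytes: %s\n", len, data);
        free(data);
        return 0;
}
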
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 6751930bfb64..9306a2e6620c 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,26 +13,12 @@
13#include "incore.h" 13#include "incore.h"
14 14
15#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" 15#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
16#define GFS2_POSIX_ACL_ACCESS_LEN 16
17#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" 16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
18#define GFS2_POSIX_ACL_DEFAULT_LEN 17 17#define GFS2_ACL_MAX_ENTRIES 25
19 18
20#define GFS2_ACL_IS_ACCESS(name, len) \ 19extern int gfs2_check_acl(struct inode *inode, int mask);
21 ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \ 20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
22 !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len))) 21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
23 22extern struct xattr_handler gfs2_xattr_system_handler;
24#define GFS2_ACL_IS_DEFAULT(name, len) \
25 ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
26 !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
27
28struct gfs2_ea_request;
29
30int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
31 struct gfs2_ea_request *er,
32 int *remove, mode_t *mode);
33int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
34int gfs2_check_acl(struct inode *inode, int mask);
35int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
36int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
37 23
38#endif /* __ACL_DOT_H__ */ 24#endif /* __ACL_DOT_H__ */
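
acl.h now exports a single gfs2_xattr_system_handler instead of the name-matching macros; the VFS matches handlers by prefix and hands the remaining name to .get/.set, and gfs2_acl_type() then distinguishes the two ACL names by strcmp. A rough model of that dispatch in plain C, assuming a much simplified handler table (the kernel's struct xattr_handler carries more context):

#include <stdio.h>
#include <string.h>

struct handler {
        const char *prefix;
        int (*get)(const char *name);
};

static int system_get(const char *name)
{
        if (strcmp(name, "posix_acl_access") == 0)
                return 1;       /* ACL_TYPE_ACCESS */
        if (strcmp(name, "posix_acl_default") == 0)
                return 2;       /* ACL_TYPE_DEFAULT */
        return -22;             /* -EINVAL, as in gfs2_acl_type() */
}

static const struct handler handlers[] = {
        { .prefix = "system.", .get = system_get },
};

static int dispatch(const char *full)
{
        size_t i, n;
        for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
                n = strlen(handlers[i].prefix);
                if (strncmp(full, handlers[i].prefix, n) == 0)
                        return handlers[i].get(full + n);
        }
        return -95;             /* -EOPNOTSUPP */
}

int main(void)
{
        printf("%d\n", dispatch("system.posix_acl_access"));  /* 1 */
        printf("%d\n", dispatch("system.posix_acl_default")); /* 2 */
        return 0;
}
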
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 694b5d48f036..0c1d0b82dcf1 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -269,7 +269,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
269 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 269 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
270 unsigned offset = i_size & (PAGE_CACHE_SIZE-1); 270 unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
271 unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); 271 unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
272 struct backing_dev_info *bdi = mapping->backing_dev_info;
273 int i; 272 int i;
274 int ret; 273 int ret;
275 274
@@ -313,11 +312,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
313 312
314 if (ret || (--(wbc->nr_to_write) <= 0)) 313 if (ret || (--(wbc->nr_to_write) <= 0))
315 ret = 1; 314 ret = 1;
316 if (wbc->nonblocking && bdi_write_congested(bdi)) {
317 wbc->encountered_congestion = 1;
318 ret = 1;
319 }
320
321 } 315 }
322 gfs2_trans_end(sdp); 316 gfs2_trans_end(sdp);
323 return ret; 317 return ret;
@@ -338,7 +332,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
338static int gfs2_write_cache_jdata(struct address_space *mapping, 332static int gfs2_write_cache_jdata(struct address_space *mapping,
339 struct writeback_control *wbc) 333 struct writeback_control *wbc)
340{ 334{
341 struct backing_dev_info *bdi = mapping->backing_dev_info;
342 int ret = 0; 335 int ret = 0;
343 int done = 0; 336 int done = 0;
344 struct pagevec pvec; 337 struct pagevec pvec;
@@ -348,11 +341,6 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
348 int scanned = 0; 341 int scanned = 0;
349 int range_whole = 0; 342 int range_whole = 0;
350 343
351 if (wbc->nonblocking && bdi_write_congested(bdi)) {
352 wbc->encountered_congestion = 1;
353 return 0;
354 }
355
356 pagevec_init(&pvec, 0); 344 pagevec_init(&pvec, 0);
357 if (wbc->range_cyclic) { 345 if (wbc->range_cyclic) {
358 index = mapping->writeback_index; /* Start from prev offset */ 346 index = mapping->writeback_index; /* Start from prev offset */
@@ -819,8 +807,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
819 mark_inode_dirty(inode); 807 mark_inode_dirty(inode);
820 } 808 }
821 809
822 if (inode == sdp->sd_rindex) 810 if (inode == sdp->sd_rindex) {
823 adjust_fs_space(inode); 811 adjust_fs_space(inode);
812 ip->i_gh.gh_flags |= GL_NOCACHE;
813 }
824 814
825 brelse(dibh); 815 brelse(dibh);
826 gfs2_trans_end(sdp); 816 gfs2_trans_end(sdp);
@@ -889,8 +879,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
889 mark_inode_dirty(inode); 879 mark_inode_dirty(inode);
890 } 880 }
891 881
892 if (inode == sdp->sd_rindex) 882 if (inode == sdp->sd_rindex) {
893 adjust_fs_space(inode); 883 adjust_fs_space(inode);
884 ip->i_gh.gh_flags |= GL_NOCACHE;
885 }
894 886
895 brelse(dibh); 887 brelse(dibh);
896 gfs2_trans_end(sdp); 888 gfs2_trans_end(sdp);
@@ -1069,8 +1061,8 @@ out:
1069 1061
1070int gfs2_releasepage(struct page *page, gfp_t gfp_mask) 1062int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1071{ 1063{
1072 struct inode *aspace = page->mapping->host; 1064 struct address_space *mapping = page->mapping;
1073 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; 1065 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
1074 struct buffer_head *bh, *head; 1066 struct buffer_head *bh, *head;
1075 struct gfs2_bufdata *bd; 1067 struct gfs2_bufdata *bd;
1076 1068
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d47379e794b..5e411d5f4697 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -541,7 +540,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
541 *ptr++ = cpu_to_be64(bn++); 540 *ptr++ = cpu_to_be64(bn++);
542 break; 541 break;
543 } 542 }
544 } while (state != ALLOC_DATA); 543 } while ((state != ALLOC_DATA) || !dblock);
545 544
546 ip->i_height = height; 545 ip->i_height = height;
547 gfs2_add_inode_blocks(&ip->i_inode, alloced); 546 gfs2_add_inode_blocks(&ip->i_inode, alloced);
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 91beddadd388..bb7907bde3d8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 297d7e5cebad..25fddc100f18 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -525,38 +525,6 @@ consist_inode:
525 return ERR_PTR(-EIO); 525 return ERR_PTR(-EIO);
526} 526}
527 527
528
529/**
530 * dirent_first - Return the first dirent
531 * @dip: the directory
532 * @bh: The buffer
533 * @dent: Pointer to list of dirents
534 *
535 * return first dirent whether bh points to leaf or stuffed dinode
536 *
537 * Returns: IS_LEAF, IS_DINODE, or -errno
538 */
539
540static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
541 struct gfs2_dirent **dent)
542{
543 struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
544
545 if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
546 if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
547 return -EIO;
548 *dent = (struct gfs2_dirent *)(bh->b_data +
549 sizeof(struct gfs2_leaf));
550 return IS_LEAF;
551 } else {
552 if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
553 return -EIO;
554 *dent = (struct gfs2_dirent *)(bh->b_data +
555 sizeof(struct gfs2_dinode));
556 return IS_DINODE;
557 }
558}
559
560static int dirent_check_reclen(struct gfs2_inode *dip, 528static int dirent_check_reclen(struct gfs2_inode *dip,
561 const struct gfs2_dirent *d, const void *end_p) 529 const struct gfs2_dirent *d, const void *end_p)
562{ 530{
@@ -1006,7 +974,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
1006 divider = (start + half_len) << (32 - dip->i_depth); 974 divider = (start + half_len) << (32 - dip->i_depth);
1007 975
1008 /* Copy the entries */ 976 /* Copy the entries */
1009 dirent_first(dip, obh, &dent); 977 dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf));
1010 978
1011 do { 979 do {
1012 next = dent; 980 next = dent;
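
With dirent_first() removed, dir_split_leaf() computes the first entry directly: in a leaf block it always sits immediately after the fixed gfs2_leaf header, and later entries are reached by adding each record's rec_len. A small standalone sketch of that layout and walk (toy structs, not the on-disk GFS2 format):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct leaf_hdr { unsigned int magic; };        /* fixed-size header */
struct dirent   { unsigned short rec_len; char name[14]; };

int main(void)
{
        unsigned char *block = calloc(1, 64);   /* one leaf block */
        struct dirent *d, *first;

        /* first dirent starts right past the header, as in the patch */
        first = d = (struct dirent *)(block + sizeof(struct leaf_hdr));
        d->rec_len = sizeof(*d);
        strcpy(d->name, "alpha");
        d = (struct dirent *)((char *)d + d->rec_len);  /* rec_len walk */
        d->rec_len = sizeof(*d);
        strcpy(d->name, "beta");

        for (d = first; ; d = (struct dirent *)((char *)d + d->rec_len)) {
                printf("entry: %s\n", d->name);
                if (strcmp(d->name, "beta") == 0)
                        break;
        }
        free(block);
        return 0;
}
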
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d15876e9aa26..c22c21174833 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4eb308aa3234..e6dd2aec6f82 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -569,6 +569,40 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
569 return ret; 569 return ret;
570} 570}
571 571
572/**
573 * gfs2_file_aio_write - Perform a write to a file
574 * @iocb: The io context
575 * @iov: The data to write
576 * @nr_segs: Number of @iov segments
577 * @pos: The file position
578 *
579 * We have to do a lock/unlock here to refresh the inode size for
 580 * O_APPEND writes, otherwise we can end up writing at the wrong
581 * offset. There is still a race, but provided the app is using its
582 * own file locking, this will make O_APPEND work as expected.
583 *
584 */
585
586static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
587 unsigned long nr_segs, loff_t pos)
588{
589 struct file *file = iocb->ki_filp;
590
591 if (file->f_flags & O_APPEND) {
592 struct dentry *dentry = file->f_dentry;
593 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
594 struct gfs2_holder gh;
595 int ret;
596
597 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
598 if (ret)
599 return ret;
600 gfs2_glock_dq_uninit(&gh);
601 }
602
603 return generic_file_aio_write(iocb, iov, nr_segs, pos);
604}
605
572#ifdef CONFIG_GFS2_FS_LOCKING_DLM 606#ifdef CONFIG_GFS2_FS_LOCKING_DLM
573 607
574/** 608/**
@@ -606,7 +640,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
606 640
607 if (!(fl->fl_flags & FL_POSIX)) 641 if (!(fl->fl_flags & FL_POSIX))
608 return -ENOLCK; 642 return -ENOLCK;
609 if (__mandatory_lock(&ip->i_inode)) 643 if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
610 return -ENOLCK; 644 return -ENOLCK;
611 645
612 if (cmd == F_CANCELLK) { 646 if (cmd == F_CANCELLK) {
@@ -711,7 +745,7 @@ const struct file_operations gfs2_file_fops = {
711 .read = do_sync_read, 745 .read = do_sync_read,
712 .aio_read = generic_file_aio_read, 746 .aio_read = generic_file_aio_read,
713 .write = do_sync_write, 747 .write = do_sync_write,
714 .aio_write = generic_file_aio_write, 748 .aio_write = gfs2_file_aio_write,
715 .unlocked_ioctl = gfs2_ioctl, 749 .unlocked_ioctl = gfs2_ioctl,
716 .mmap = gfs2_mmap, 750 .mmap = gfs2_mmap,
717 .open = gfs2_open, 751 .open = gfs2_open,
@@ -741,7 +775,7 @@ const struct file_operations gfs2_file_fops_nolock = {
741 .read = do_sync_read, 775 .read = do_sync_read,
742 .aio_read = generic_file_aio_read, 776 .aio_read = generic_file_aio_read,
743 .write = do_sync_write, 777 .write = do_sync_write,
744 .aio_write = generic_file_aio_write, 778 .aio_write = gfs2_file_aio_write,
745 .unlocked_ioctl = gfs2_ioctl, 779 .unlocked_ioctl = gfs2_ioctl,
746 .mmap = gfs2_mmap, 780 .mmap = gfs2_mmap,
747 .open = gfs2_open, 781 .open = gfs2_open,
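
The new gfs2_file_aio_write() takes and immediately drops a shared glock so an O_APPEND writer revalidates its cached file size before entering the generic write path. A userspace sketch of that refresh pattern, using a pthread rwlock as a stand-in for the glock (names illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t glock = PTHREAD_RWLOCK_INITIALIZER;
static long authoritative_size = 4096;  /* size as other nodes see it */
static long cached_size;                /* this node's possibly stale copy */

static void refresh_size(void)
{
        pthread_rwlock_rdlock(&glock);  /* gfs2_glock_nq_init, LM_ST_SHARED */
        cached_size = authoritative_size;
        pthread_rwlock_unlock(&glock);  /* gfs2_glock_dq_uninit */
}

static void append_write(long nbytes)
{
        refresh_size();                 /* only done when O_APPEND is set */
        printf("appending %ld bytes at offset %ld\n", nbytes, cached_size);
}

int main(void)
{
        append_write(512);
        return 0;
}
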
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8b674b1f3a55..454d4b4eb36b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -19,7 +19,6 @@
19#include <linux/list.h> 19#include <linux/list.h>
20#include <linux/wait.h> 20#include <linux/wait.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/rwsem.h>
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
24#include <linux/seq_file.h> 23#include <linux/seq_file.h>
25#include <linux/debugfs.h> 24#include <linux/debugfs.h>
@@ -60,7 +59,6 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
60#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) 59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
61static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
62 61
63static DECLARE_RWSEM(gfs2_umount_flush_sem);
64static struct dentry *gfs2_root; 62static struct dentry *gfs2_root;
65static struct workqueue_struct *glock_workqueue; 63static struct workqueue_struct *glock_workqueue;
66struct workqueue_struct *gfs2_delete_workqueue; 64struct workqueue_struct *gfs2_delete_workqueue;
@@ -154,12 +152,14 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
154static void glock_free(struct gfs2_glock *gl) 152static void glock_free(struct gfs2_glock *gl)
155{ 153{
156 struct gfs2_sbd *sdp = gl->gl_sbd; 154 struct gfs2_sbd *sdp = gl->gl_sbd;
157 struct inode *aspace = gl->gl_aspace; 155 struct address_space *mapping = gfs2_glock2aspace(gl);
156 struct kmem_cache *cachep = gfs2_glock_cachep;
158 157
159 if (aspace) 158 GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
160 gfs2_aspace_put(aspace);
161 trace_gfs2_glock_put(gl); 159 trace_gfs2_glock_put(gl);
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); 160 if (mapping)
161 cachep = gfs2_glock_aspace_cachep;
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
163} 163}
164 164
165/** 165/**
@@ -241,15 +241,14 @@ int gfs2_glock_put(struct gfs2_glock *gl)
241 int rv = 0; 241 int rv = 0;
242 242
243 write_lock(gl_lock_addr(gl->gl_hash)); 243 write_lock(gl_lock_addr(gl->gl_hash));
244 if (atomic_dec_and_test(&gl->gl_ref)) { 244 if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
245 hlist_del(&gl->gl_list); 245 hlist_del(&gl->gl_list);
246 write_unlock(gl_lock_addr(gl->gl_hash));
247 spin_lock(&lru_lock);
248 if (!list_empty(&gl->gl_lru)) { 246 if (!list_empty(&gl->gl_lru)) {
249 list_del_init(&gl->gl_lru); 247 list_del_init(&gl->gl_lru);
250 atomic_dec(&lru_count); 248 atomic_dec(&lru_count);
251 } 249 }
252 spin_unlock(&lru_lock); 250 spin_unlock(&lru_lock);
251 write_unlock(gl_lock_addr(gl->gl_hash));
253 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 252 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
254 glock_free(gl); 253 glock_free(gl);
255 rv = 1; 254 rv = 1;
@@ -513,7 +512,6 @@ retry:
513 GLOCK_BUG_ON(gl, 1); 512 GLOCK_BUG_ON(gl, 1);
514 } 513 }
515 spin_unlock(&gl->gl_spin); 514 spin_unlock(&gl->gl_spin);
516 gfs2_glock_put(gl);
517 return; 515 return;
518 } 516 }
519 517
@@ -524,8 +522,6 @@ retry:
524 if (glops->go_xmote_bh) { 522 if (glops->go_xmote_bh) {
525 spin_unlock(&gl->gl_spin); 523 spin_unlock(&gl->gl_spin);
526 rv = glops->go_xmote_bh(gl, gh); 524 rv = glops->go_xmote_bh(gl, gh);
527 if (rv == -EAGAIN)
528 return;
529 spin_lock(&gl->gl_spin); 525 spin_lock(&gl->gl_spin);
530 if (rv) { 526 if (rv) {
531 do_error(gl, rv); 527 do_error(gl, rv);
@@ -540,7 +536,6 @@ out:
540 clear_bit(GLF_LOCK, &gl->gl_flags); 536 clear_bit(GLF_LOCK, &gl->gl_flags);
541out_locked: 537out_locked:
542 spin_unlock(&gl->gl_spin); 538 spin_unlock(&gl->gl_spin);
543 gfs2_glock_put(gl);
544} 539}
545 540
546static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, 541static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
@@ -600,7 +595,6 @@ __acquires(&gl->gl_spin)
600 595
601 if (!(ret & LM_OUT_ASYNC)) { 596 if (!(ret & LM_OUT_ASYNC)) {
602 finish_xmote(gl, ret); 597 finish_xmote(gl, ret);
603 gfs2_glock_hold(gl);
604 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 598 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
605 gfs2_glock_put(gl); 599 gfs2_glock_put(gl);
606 } else { 600 } else {
@@ -672,12 +666,17 @@ out:
672 return; 666 return;
673 667
674out_sched: 668out_sched:
669 clear_bit(GLF_LOCK, &gl->gl_flags);
670 smp_mb__after_clear_bit();
675 gfs2_glock_hold(gl); 671 gfs2_glock_hold(gl);
676 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 672 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
677 gfs2_glock_put_nolock(gl); 673 gfs2_glock_put_nolock(gl);
674 return;
675
678out_unlock: 676out_unlock:
679 clear_bit(GLF_LOCK, &gl->gl_flags); 677 clear_bit(GLF_LOCK, &gl->gl_flags);
680 goto out; 678 smp_mb__after_clear_bit();
679 return;
681} 680}
682 681
683static void delete_work_func(struct work_struct *work) 682static void delete_work_func(struct work_struct *work)
@@ -707,10 +706,12 @@ static void glock_work_func(struct work_struct *work)
707{ 706{
708 unsigned long delay = 0; 707 unsigned long delay = 0;
709 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); 708 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
709 int drop_ref = 0;
710 710
711 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) 711 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
712 finish_xmote(gl, gl->gl_reply); 712 finish_xmote(gl, gl->gl_reply);
713 down_read(&gfs2_umount_flush_sem); 713 drop_ref = 1;
714 }
714 spin_lock(&gl->gl_spin); 715 spin_lock(&gl->gl_spin);
715 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && 716 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
716 gl->gl_state != LM_ST_UNLOCKED && 717 gl->gl_state != LM_ST_UNLOCKED &&
@@ -723,10 +724,11 @@ static void glock_work_func(struct work_struct *work)
723 } 724 }
724 run_queue(gl, 0); 725 run_queue(gl, 0);
725 spin_unlock(&gl->gl_spin); 726 spin_unlock(&gl->gl_spin);
726 up_read(&gfs2_umount_flush_sem);
727 if (!delay || 727 if (!delay ||
728 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 728 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
729 gfs2_glock_put(gl); 729 gfs2_glock_put(gl);
730 if (drop_ref)
731 gfs2_glock_put(gl);
730} 732}
731 733
732/** 734/**
@@ -746,10 +748,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
746 const struct gfs2_glock_operations *glops, int create, 748 const struct gfs2_glock_operations *glops, int create,
747 struct gfs2_glock **glp) 749 struct gfs2_glock **glp)
748{ 750{
751 struct super_block *s = sdp->sd_vfs;
749 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; 752 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
750 struct gfs2_glock *gl, *tmp; 753 struct gfs2_glock *gl, *tmp;
751 unsigned int hash = gl_hash(sdp, &name); 754 unsigned int hash = gl_hash(sdp, &name);
752 int error; 755 struct address_space *mapping;
753 756
754 read_lock(gl_lock_addr(hash)); 757 read_lock(gl_lock_addr(hash));
755 gl = search_bucket(hash, sdp, &name); 758 gl = search_bucket(hash, sdp, &name);
@@ -761,10 +764,14 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
761 if (!create) 764 if (!create)
762 return -ENOENT; 765 return -ENOENT;
763 766
764 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); 767 if (glops->go_flags & GLOF_ASPACE)
768 gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL);
769 else
770 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
765 if (!gl) 771 if (!gl)
766 return -ENOMEM; 772 return -ENOMEM;
767 773
774 atomic_inc(&sdp->sd_glock_disposal);
768 gl->gl_flags = 0; 775 gl->gl_flags = 0;
769 gl->gl_name = name; 776 gl->gl_name = name;
770 atomic_set(&gl->gl_ref, 1); 777 atomic_set(&gl->gl_ref, 1);
@@ -779,18 +786,18 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
779 gl->gl_tchange = jiffies; 786 gl->gl_tchange = jiffies;
780 gl->gl_object = NULL; 787 gl->gl_object = NULL;
781 gl->gl_sbd = sdp; 788 gl->gl_sbd = sdp;
782 gl->gl_aspace = NULL;
783 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); 789 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
784 INIT_WORK(&gl->gl_delete, delete_work_func); 790 INIT_WORK(&gl->gl_delete, delete_work_func);
785 791
786 /* If this glock protects actual on-disk data or metadata blocks, 792 mapping = gfs2_glock2aspace(gl);
787 create a VFS inode to manage the pages/buffers holding them. */ 793 if (mapping) {
788 if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) { 794 mapping->a_ops = &gfs2_meta_aops;
789 gl->gl_aspace = gfs2_aspace_get(sdp); 795 mapping->host = s->s_bdev->bd_inode;
790 if (!gl->gl_aspace) { 796 mapping->flags = 0;
791 error = -ENOMEM; 797 mapping_set_gfp_mask(mapping, GFP_NOFS);
792 goto fail; 798 mapping->assoc_mapping = NULL;
793 } 799 mapping->backing_dev_info = s->s_bdi;
800 mapping->writeback_index = 0;
794 } 801 }
795 802
796 write_lock(gl_lock_addr(hash)); 803 write_lock(gl_lock_addr(hash));
@@ -807,10 +814,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
807 *glp = gl; 814 *glp = gl;
808 815
809 return 0; 816 return 0;
810
811fail:
812 kmem_cache_free(gfs2_glock_cachep, gl);
813 return error;
814} 817}
815 818
816/** 819/**
@@ -1361,10 +1364,6 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1361 list_del_init(&gl->gl_lru); 1364 list_del_init(&gl->gl_lru);
1362 atomic_dec(&lru_count); 1365 atomic_dec(&lru_count);
1363 1366
1364 /* Check if glock is about to be freed */
1365 if (atomic_read(&gl->gl_ref) == 0)
1366 continue;
1367
1368 /* Test for being demotable */ 1367 /* Test for being demotable */
1369 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 1368 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
1370 gfs2_glock_hold(gl); 1369 gfs2_glock_hold(gl);
@@ -1375,10 +1374,11 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1375 handle_callback(gl, LM_ST_UNLOCKED, 0); 1374 handle_callback(gl, LM_ST_UNLOCKED, 0);
1376 nr--; 1375 nr--;
1377 } 1376 }
1377 clear_bit(GLF_LOCK, &gl->gl_flags);
1378 smp_mb__after_clear_bit();
1378 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1379 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1379 gfs2_glock_put_nolock(gl); 1380 gfs2_glock_put_nolock(gl);
1380 spin_unlock(&gl->gl_spin); 1381 spin_unlock(&gl->gl_spin);
1381 clear_bit(GLF_LOCK, &gl->gl_flags);
1382 spin_lock(&lru_lock); 1382 spin_lock(&lru_lock);
1383 continue; 1383 continue;
1384 } 1384 }
@@ -1508,35 +1508,13 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1508 1508
1509void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1509void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1510{ 1510{
1511 unsigned long t;
1512 unsigned int x; 1511 unsigned int x;
1513 int cont;
1514 1512
1515 t = jiffies; 1513 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1516 1514 examine_bucket(clear_glock, sdp, x);
1517 for (;;) { 1515 flush_workqueue(glock_workqueue);
1518 cont = 0; 1516 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
1519 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { 1517 gfs2_dump_lockstate(sdp);
1520 if (examine_bucket(clear_glock, sdp, x))
1521 cont = 1;
1522 }
1523
1524 if (!cont)
1525 break;
1526
1527 if (time_after_eq(jiffies,
1528 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
1529 fs_warn(sdp, "Unmount seems to be stalled. "
1530 "Dumping lock state...\n");
1531 gfs2_dump_lockstate(sdp);
1532 t = jiffies;
1533 }
1534
1535 down_write(&gfs2_umount_flush_sem);
1536 invalidate_inodes(sdp->sd_vfs);
1537 up_write(&gfs2_umount_flush_sem);
1538 msleep(10);
1539 }
1540} 1518}
1541 1519
1542void gfs2_glock_finish_truncate(struct gfs2_inode *ip) 1520void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
@@ -1680,7 +1658,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1680 dtime *= 1000000/HZ; /* demote time in uSec */ 1658 dtime *= 1000000/HZ; /* demote time in uSec */
1681 if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) 1659 if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
1682 dtime = 0; 1660 dtime = 0;
1683 gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n", 1661 gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n",
1684 state2str(gl->gl_state), 1662 state2str(gl->gl_state),
1685 gl->gl_name.ln_type, 1663 gl->gl_name.ln_type,
1686 (unsigned long long)gl->gl_name.ln_number, 1664 (unsigned long long)gl->gl_name.ln_number,
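
gfs2_glock_put() above folds the reference drop and the lru_lock acquisition into one atomic_dec_and_lock() call, closing the window in which a zero-reference glock was still visible on the LRU (which is also why the shrinker's "about to be freed" check could go). A sketch of the dec-and-lock contract with C11 atomics and a pthread mutex, simplified to a single object:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int gl_ref = 1;
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 1 with *lock held iff this call dropped the last reference. */
static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
        int old = atomic_load(cnt);
        while (old > 1)                 /* fast path: not the last ref */
                if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                        return 0;
        pthread_mutex_lock(lock);       /* slow path: may reach zero */
        if (atomic_fetch_sub(cnt, 1) == 1)
                return 1;               /* last ref; lock stays held */
        pthread_mutex_unlock(lock);
        return 0;
}

int main(void)
{
        if (dec_and_lock(&gl_ref, &lru_lock)) {
                /* safe: nobody can find the object via the LRU anymore */
                printf("last reference dropped under lru_lock\n");
                pthread_mutex_unlock(&lru_lock);
        }
        return 0;
}
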
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c609894ec0d0..2bda1911b156 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -123,7 +123,7 @@ struct lm_lockops {
123 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); 123 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
124 void (*lm_unmount) (struct gfs2_sbd *sdp); 124 void (*lm_unmount) (struct gfs2_sbd *sdp);
125 void (*lm_withdraw) (struct gfs2_sbd *sdp); 125 void (*lm_withdraw) (struct gfs2_sbd *sdp);
126 void (*lm_put_lock) (struct kmem_cache *cachep, void *gl); 126 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
127 unsigned int (*lm_lock) (struct gfs2_glock *gl, 127 unsigned int (*lm_lock) (struct gfs2_glock *gl,
128 unsigned int req_state, unsigned int flags); 128 unsigned int req_state, unsigned int flags);
129 void (*lm_cancel) (struct gfs2_glock *gl); 129 void (*lm_cancel) (struct gfs2_glock *gl);
@@ -180,13 +180,11 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
180 return gl->gl_state == LM_ST_SHARED; 180 return gl->gl_state == LM_ST_SHARED;
181} 181}
182 182
183static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) 183static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
184{ 184{
185 int ret; 185 if (gl->gl_ops->go_flags & GLOF_ASPACE)
186 spin_lock(&gl->gl_spin); 186 return (struct address_space *)(gl + 1);
187 ret = test_bit(GLF_DEMOTE, &gl->gl_flags); 187 return NULL;
188 spin_unlock(&gl->gl_spin);
189 return ret;
190} 188}
191 189
192int gfs2_glock_get(struct gfs2_sbd *sdp, 190int gfs2_glock_get(struct gfs2_sbd *sdp,
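
gfs2_glock2aspace() relies on the address_space being co-allocated directly behind the glock in the new gfs2_glock_aspace_cachep objects, so (gl + 1) finds the mapping and stepping one glock back recovers the container (as gfs2_mapping2sbd() does). A minimal model of that layout trick; the slab guarantees alignment in the kernel, and the toy structs below share alignment:

#include <stdio.h>
#include <stdlib.h>

struct glock  { long state; };
struct aspace { long nrpages; };

static struct aspace *glock2aspace(struct glock *gl)
{
        return (struct aspace *)(gl + 1);       /* object right behind gl */
}

static struct glock *aspace2glock(struct aspace *m)
{
        return (struct glock *)m - 1;           /* step back over the glock */
}

int main(void)
{
        /* one allocation holds both objects, like the new slab cache */
        struct glock *gl = malloc(sizeof(struct glock) +
                                  sizeof(struct aspace));
        if (!gl)
                return 1;
        struct aspace *m = glock2aspace(gl);
        m->nrpages = 0;
        printf("round trip ok: %d\n", aspace2glock(m) == gl);
        free(gl);
        return 0;
}
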
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 6985eef06c39..49f97d3bb690 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -7,12 +7,12 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 13#include <linux/gfs2_ondisk.h>
15#include <linux/bio.h> 14#include <linux/bio.h>
15#include <linux/posix_acl.h>
16 16
17#include "gfs2.h" 17#include "gfs2.h"
18#include "incore.h" 18#include "incore.h"
@@ -86,7 +86,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
86 86
87static void rgrp_go_sync(struct gfs2_glock *gl) 87static void rgrp_go_sync(struct gfs2_glock *gl)
88{ 88{
89 struct address_space *metamapping = gl->gl_aspace->i_mapping; 89 struct address_space *metamapping = gfs2_glock2aspace(gl);
90 int error; 90 int error;
91 91
92 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 92 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -112,7 +112,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
112 112
113static void rgrp_go_inval(struct gfs2_glock *gl, int flags) 113static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
114{ 114{
115 struct address_space *mapping = gl->gl_aspace->i_mapping; 115 struct address_space *mapping = gfs2_glock2aspace(gl);
116 116
117 BUG_ON(!(flags & DIO_METADATA)); 117 BUG_ON(!(flags & DIO_METADATA));
118 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 118 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
@@ -133,7 +133,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
133static void inode_go_sync(struct gfs2_glock *gl) 133static void inode_go_sync(struct gfs2_glock *gl)
134{ 134{
135 struct gfs2_inode *ip = gl->gl_object; 135 struct gfs2_inode *ip = gl->gl_object;
136 struct address_space *metamapping = gl->gl_aspace->i_mapping; 136 struct address_space *metamapping = gfs2_glock2aspace(gl);
137 int error; 137 int error;
138 138
139 if (ip && !S_ISREG(ip->i_inode.i_mode)) 139 if (ip && !S_ISREG(ip->i_inode.i_mode))
@@ -182,10 +182,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
182 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 182 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
183 183
184 if (flags & DIO_METADATA) { 184 if (flags & DIO_METADATA) {
185 struct address_space *mapping = gl->gl_aspace->i_mapping; 185 struct address_space *mapping = gfs2_glock2aspace(gl);
186 truncate_inode_pages(mapping, 0); 186 truncate_inode_pages(mapping, 0);
187 if (ip) 187 if (ip) {
188 set_bit(GIF_INVALID, &ip->i_flags); 188 set_bit(GIF_INVALID, &ip->i_flags);
189 forget_all_cached_acls(&ip->i_inode);
190 }
189 } 191 }
190 192
191 if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) 193 if (ip == GFS2_I(gl->gl_sbd->sd_rindex))
@@ -279,7 +281,8 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
279 281
280static int rgrp_go_demote_ok(const struct gfs2_glock *gl) 282static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
281{ 283{
282 return !gl->gl_aspace->i_mapping->nrpages; 284 const struct address_space *mapping = (const struct address_space *)(gl + 1);
285 return !mapping->nrpages;
283} 286}
284 287
285/** 288/**
@@ -384,8 +387,7 @@ static void iopen_go_callback(struct gfs2_glock *gl)
384 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; 387 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
385 388
386 if (gl->gl_demote_state == LM_ST_UNLOCKED && 389 if (gl->gl_demote_state == LM_ST_UNLOCKED &&
387 gl->gl_state == LM_ST_SHARED && 390 gl->gl_state == LM_ST_SHARED && ip) {
388 ip && test_bit(GIF_USER, &ip->i_flags)) {
389 gfs2_glock_hold(gl); 391 gfs2_glock_hold(gl);
390 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) 392 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
391 gfs2_glock_put_nolock(gl); 393 gfs2_glock_put_nolock(gl);
@@ -404,6 +406,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
404 .go_dump = inode_go_dump, 406 .go_dump = inode_go_dump,
405 .go_type = LM_TYPE_INODE, 407 .go_type = LM_TYPE_INODE,
406 .go_min_hold_time = HZ / 5, 408 .go_min_hold_time = HZ / 5,
409 .go_flags = GLOF_ASPACE,
407}; 410};
408 411
409const struct gfs2_glock_operations gfs2_rgrp_glops = { 412const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -415,6 +418,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
415 .go_dump = gfs2_rgrp_dump, 418 .go_dump = gfs2_rgrp_dump,
416 .go_type = LM_TYPE_RGRP, 419 .go_type = LM_TYPE_RGRP,
417 .go_min_hold_time = HZ / 5, 420 .go_min_hold_time = HZ / 5,
421 .go_flags = GLOF_ASPACE,
418}; 422};
419 423
420const struct gfs2_glock_operations gfs2_trans_glops = { 424const struct gfs2_glock_operations gfs2_trans_glops = {
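
inode_go_inval() now calls forget_all_cached_acls() whenever metadata is invalidated, so a node never trusts an ACL cached under a glock it has since lost. The VFS ACL cache is, in essence, a per-inode pointer with a not-cached sentinel; a deliberately simplified single-slot model of the set/forget pair used in the acl.c changes above:

#include <stdio.h>

#define ACL_NOT_CACHED ((void *)-1)     /* sentinel: cache holds nothing */

struct inode { void *i_acl; };

static void set_cached_acl(struct inode *i, void *acl)
{
        i->i_acl = acl;
}

static void forget_cached_acl(struct inode *i)
{
        i->i_acl = ACL_NOT_CACHED;
}

int main(void)
{
        struct inode ino = { ACL_NOT_CACHED };
        int acl_obj = 42;               /* stands in for a posix_acl */

        set_cached_acl(&ino, &acl_obj);
        printf("cached after set: %d\n", ino.i_acl != ACL_NOT_CACHED);
        forget_cached_acl(&ino);        /* what inode_go_inval triggers */
        printf("cached after invalidate: %d\n",
               ino.i_acl != ACL_NOT_CACHED);
        return 0;
}
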
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 6edb423f90b3..3aac46f6853e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -162,6 +162,8 @@ struct gfs2_glock_operations {
162 void (*go_callback) (struct gfs2_glock *gl); 162 void (*go_callback) (struct gfs2_glock *gl);
163 const int go_type; 163 const int go_type;
164 const unsigned long go_min_hold_time; 164 const unsigned long go_min_hold_time;
165 const unsigned long go_flags;
166#define GLOF_ASPACE 1
165}; 167};
166 168
167enum { 169enum {
@@ -225,7 +227,6 @@ struct gfs2_glock {
225 227
226 struct gfs2_sbd *gl_sbd; 228 struct gfs2_sbd *gl_sbd;
227 229
228 struct inode *gl_aspace;
229 struct list_head gl_ail_list; 230 struct list_head gl_ail_list;
230 atomic_t gl_ail_count; 231 atomic_t gl_ail_count;
231 struct delayed_work gl_work; 232 struct delayed_work gl_work;
@@ -258,7 +259,6 @@ enum {
258 GIF_INVALID = 0, 259 GIF_INVALID = 0,
259 GIF_QD_LOCKED = 1, 260 GIF_QD_LOCKED = 1,
260 GIF_SW_PAGED = 3, 261 GIF_SW_PAGED = 3,
261 GIF_USER = 4, /* user inode, not metadata addr space */
262}; 262};
263 263
264 264
@@ -429,7 +429,11 @@ struct gfs2_args {
429 unsigned int ar_meta:1; /* mount metafs */ 429 unsigned int ar_meta:1; /* mount metafs */
430 unsigned int ar_discard:1; /* discard requests */ 430 unsigned int ar_discard:1; /* discard requests */
431 unsigned int ar_errors:2; /* errors=withdraw | panic */ 431 unsigned int ar_errors:2; /* errors=withdraw | panic */
432 unsigned int ar_nobarrier:1; /* do not send barriers */
432 int ar_commit; /* Commit interval */ 433 int ar_commit; /* Commit interval */
434 int ar_statfs_quantum; /* The fast statfs interval */
435 int ar_quota_quantum; /* The quota interval */
436 int ar_statfs_percent; /* The % change to force sync */
433}; 437};
434 438
435struct gfs2_tune { 439struct gfs2_tune {
@@ -447,7 +451,6 @@ struct gfs2_tune {
447 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 451 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
448 unsigned int gt_new_files_jdata; 452 unsigned int gt_new_files_jdata;
449 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 453 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
450 unsigned int gt_stall_secs; /* Detects trouble! */
451 unsigned int gt_complain_secs; 454 unsigned int gt_complain_secs;
452 unsigned int gt_statfs_quantum; 455 unsigned int gt_statfs_quantum;
453 unsigned int gt_statfs_slow; 456 unsigned int gt_statfs_slow;
@@ -540,6 +543,8 @@ struct gfs2_sbd {
540 struct gfs2_holder sd_live_gh; 543 struct gfs2_holder sd_live_gh;
541 struct gfs2_glock *sd_rename_gl; 544 struct gfs2_glock *sd_rename_gl;
542 struct gfs2_glock *sd_trans_gl; 545 struct gfs2_glock *sd_trans_gl;
546 wait_queue_head_t sd_glock_wait;
547 atomic_t sd_glock_disposal;
543 548
544 /* Inode Stuff */ 549 /* Inode Stuff */
545 550
@@ -558,6 +563,7 @@ struct gfs2_sbd {
558 spinlock_t sd_statfs_spin; 563 spinlock_t sd_statfs_spin;
559 struct gfs2_statfs_change_host sd_statfs_master; 564 struct gfs2_statfs_change_host sd_statfs_master;
560 struct gfs2_statfs_change_host sd_statfs_local; 565 struct gfs2_statfs_change_host sd_statfs_local;
566 int sd_statfs_force_sync;
561 567
562 /* Resource group stuff */ 568 /* Resource group stuff */
563 569
@@ -610,7 +616,7 @@ struct gfs2_sbd {
610 unsigned int sd_log_blks_reserved; 616 unsigned int sd_log_blks_reserved;
611 unsigned int sd_log_commited_buf; 617 unsigned int sd_log_commited_buf;
612 unsigned int sd_log_commited_databuf; 618 unsigned int sd_log_commited_databuf;
613 unsigned int sd_log_commited_revoke; 619 int sd_log_commited_revoke;
614 620
615 unsigned int sd_log_num_buf; 621 unsigned int sd_log_num_buf;
616 unsigned int sd_log_num_revoke; 622 unsigned int sd_log_num_revoke;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index fb15d3b1f409..b1bf2694fb2b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -45,7 +45,7 @@ static int iget_test(struct inode *inode, void *opaque)
45 struct gfs2_inode *ip = GFS2_I(inode); 45 struct gfs2_inode *ip = GFS2_I(inode);
46 u64 *no_addr = opaque; 46 u64 *no_addr = opaque;
47 47
48 if (ip->i_no_addr == *no_addr && test_bit(GIF_USER, &ip->i_flags)) 48 if (ip->i_no_addr == *no_addr)
49 return 1; 49 return 1;
50 50
51 return 0; 51 return 0;
@@ -58,7 +58,6 @@ static int iget_set(struct inode *inode, void *opaque)
58 58
59 inode->i_ino = (unsigned long)*no_addr; 59 inode->i_ino = (unsigned long)*no_addr;
60 ip->i_no_addr = *no_addr; 60 ip->i_no_addr = *no_addr;
61 set_bit(GIF_USER, &ip->i_flags);
62 return 0; 61 return 0;
63} 62}
64 63
@@ -84,7 +83,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
84 struct gfs2_inode *ip = GFS2_I(inode); 83 struct gfs2_inode *ip = GFS2_I(inode);
85 struct gfs2_skip_data *data = opaque; 84 struct gfs2_skip_data *data = opaque;
86 85
87 if (ip->i_no_addr == data->no_addr && test_bit(GIF_USER, &ip->i_flags)){ 86 if (ip->i_no_addr == data->no_addr) {
88 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ 87 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
89 data->skipped = 1; 88 data->skipped = 1;
90 return 0; 89 return 0;
@@ -103,7 +102,6 @@ static int iget_skip_set(struct inode *inode, void *opaque)
103 return 1; 102 return 1;
104 inode->i_ino = (unsigned long)(data->no_addr); 103 inode->i_ino = (unsigned long)(data->no_addr);
105 ip->i_no_addr = data->no_addr; 104 ip->i_no_addr = data->no_addr;
106 set_bit(GIF_USER, &ip->i_flags);
107 return 0; 105 return 0;
108} 106}
109 107
@@ -125,7 +123,7 @@ static struct inode *gfs2_iget_skip(struct super_block *sb,
125 * directory entry when gfs2_inode_lookup() is invoked. Part of the code 123 * directory entry when gfs2_inode_lookup() is invoked. Part of the code
126 * segment inside gfs2_inode_lookup code needs to get moved around. 124 * segment inside gfs2_inode_lookup code needs to get moved around.
127 * 125 *
128 * Clean up I_LOCK and I_NEW as well. 126 * Clears I_NEW as well.
129 **/ 127 **/
130 128
131void gfs2_set_iop(struct inode *inode) 129void gfs2_set_iop(struct inode *inode)
@@ -801,7 +799,8 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
801 return err; 799 return err;
802 } 800 }
803 801
804 err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0); 802 err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0,
803 GFS2_EATYPE_SECURITY);
805 kfree(value); 804 kfree(value);
806 kfree(name); 805 kfree(name);
807 806
@@ -871,7 +870,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
871 if (error) 870 if (error)
872 goto fail_gunlock2; 871 goto fail_gunlock2;
873 872
874 error = gfs2_acl_create(dip, GFS2_I(inode)); 873 error = gfs2_acl_create(dip, inode);
875 if (error) 874 if (error)
876 goto fail_gunlock2; 875 goto fail_gunlock2;
877 876
@@ -947,9 +946,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
947 946
948 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 947 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
949 str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); 948 str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
950 str->di_header.__pad0 = 0;
951 str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); 949 str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
952 str->di_header.__pad1 = 0;
953 str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); 950 str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
954 str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); 951 str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
955 str->di_mode = cpu_to_be32(ip->i_inode.i_mode); 952 str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 46df988323bc..0e0470ed34c2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/dlm.h> 11#include <linux/dlm.h>
12#include <linux/slab.h>
12#include <linux/types.h> 13#include <linux/types.h>
13#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
14 15
@@ -21,6 +22,7 @@ static void gdlm_ast(void *arg)
21{ 22{
22 struct gfs2_glock *gl = arg; 23 struct gfs2_glock *gl = arg;
23 unsigned ret = gl->gl_state; 24 unsigned ret = gl->gl_state;
25 struct gfs2_sbd *sdp = gl->gl_sbd;
24 26
25 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); 27 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
26 28
@@ -29,7 +31,12 @@ static void gdlm_ast(void *arg)
29 31
30 switch (gl->gl_lksb.sb_status) { 32 switch (gl->gl_lksb.sb_status) {
31 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 33 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
32 kmem_cache_free(gfs2_glock_cachep, gl); 34 if (gl->gl_ops->go_flags & GLOF_ASPACE)
35 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
36 else
37 kmem_cache_free(gfs2_glock_cachep, gl);
38 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
39 wake_up(&sdp->sd_glock_wait);
33 return; 40 return;
34 case -DLM_ECANCEL: /* Cancel while getting lock */ 41 case -DLM_ECANCEL: /* Cancel while getting lock */
35 ret |= LM_OUT_CANCELED; 42 ret |= LM_OUT_CANCELED;
@@ -164,14 +171,16 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
164 return LM_OUT_ASYNC; 171 return LM_OUT_ASYNC;
165} 172}
166 173
167static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr) 174static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
168{ 175{
169 struct gfs2_glock *gl = ptr; 176 struct gfs2_sbd *sdp = gl->gl_sbd;
170 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 177 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
171 int error; 178 int error;
172 179
173 if (gl->gl_lksb.sb_lkid == 0) { 180 if (gl->gl_lksb.sb_lkid == 0) {
174 kmem_cache_free(cachep, gl); 181 kmem_cache_free(cachep, gl);
182 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
183 wake_up(&sdp->sd_glock_wait);
175 return; 184 return;
176 } 185 }
177 186
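
Every glock allocation now bumps sd_glock_disposal, and each free path (gdlm_ast, gdlm_put_lock, and the nolock variant added later in ops_fstype.c) decrements it and wakes sd_glock_wait, letting unmount wait for the last asynchronous DLM completion. A pthread model of that completion count, assuming three outstanding glocks:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int disposal = 3;         /* sd_glock_disposal: glocks left */
static pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq = PTHREAD_COND_INITIALIZER;

static void *free_glock(void *arg)
{
        (void)arg;
        /* ...kmem_cache_free() of the glock would happen here... */
        if (atomic_fetch_sub(&disposal, 1) == 1) {      /* dec_and_test */
                pthread_mutex_lock(&mu);
                pthread_cond_signal(&wq);               /* wake_up() */
                pthread_mutex_unlock(&mu);
        }
        return NULL;
}

int main(void)
{
        pthread_t t[3];
        int i;

        for (i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, free_glock, NULL);

        pthread_mutex_lock(&mu);                        /* wait_event() */
        while (atomic_load(&disposal) != 0)
                pthread_cond_wait(&wq, &mu);
        pthread_mutex_unlock(&mu);
        printf("all glocks disposed; unmount may proceed\n");

        for (i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        return 0;
}
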
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 13c6237c5f67..e5bf4b59d46e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -417,7 +417,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
417 databufhdrs_needed = (sdp->sd_log_commited_databuf + 417 databufhdrs_needed = (sdp->sd_log_commited_databuf +
418 (dbuf_limit - 1)) / dbuf_limit; 418 (dbuf_limit - 1)) / dbuf_limit;
419 419
420 if (sdp->sd_log_commited_revoke) 420 if (sdp->sd_log_commited_revoke > 0)
421 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, 421 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
422 sizeof(u64)); 422 sizeof(u64));
423 423
@@ -596,7 +596,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
596 memset(lh, 0, sizeof(struct gfs2_log_header)); 596 memset(lh, 0, sizeof(struct gfs2_log_header));
597 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 597 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
598 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); 598 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
599 lh->lh_header.__pad0 = cpu_to_be64(0);
599 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); 600 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
601 lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
600 lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); 602 lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
601 lh->lh_flags = cpu_to_be32(flags); 603 lh->lh_flags = cpu_to_be32(flags);
602 lh->lh_tail = cpu_to_be32(tail); 604 lh->lh_tail = cpu_to_be32(tail);
@@ -788,7 +790,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
788 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || 790 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
789 (((int)sdp->sd_log_commited_databuf) >= 0)); 791 (((int)sdp->sd_log_commited_databuf) >= 0));
790 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 792 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
791 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
792 reserved = calc_reserved(sdp); 793 reserved = calc_reserved(sdp);
793 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 794 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
794 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 795 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
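
sd_log_commited_revoke becomes a plain int and the reservation check becomes "> 0" because the running total (revokes added minus revokes removed per transaction) can dip below zero transiently; with an unsigned counter that underflow wraps, looks like pending revokes, and made the removed ">= 0" assertion treat a legitimate transient negative as corruption. A two-line demonstration:

#include <stdio.h>

int main(void)
{
        unsigned int old_style = 0;     /* old: unsigned int */
        int new_style = 0;              /* new: int */

        /* one revoke added, two removed in this transaction */
        old_style += 1 - 2;
        new_style += 1 - 2;

        if (old_style)                  /* wraps: looks like pending work */
                printf("unsigned: %u pending?!\n", old_style);
        if (new_style > 0)              /* the new check: correctly skipped */
                printf("signed: reserving\n");
        else
                printf("signed: %d, nothing to reserve\n", new_style);
        return 0;
}
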
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 9969ff062c5b..adc260fbea90 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -132,6 +132,7 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
132static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) 132static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
133{ 133{
134 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); 134 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
135 struct gfs2_meta_header *mh;
135 struct gfs2_trans *tr; 136 struct gfs2_trans *tr;
136 137
137 lock_buffer(bd->bd_bh); 138 lock_buffer(bd->bd_bh);
@@ -148,6 +149,9 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
148 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 149 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
149 gfs2_meta_check(sdp, bd->bd_bh); 150 gfs2_meta_check(sdp, bd->bd_bh);
150 gfs2_pin(sdp, bd->bd_bh); 151 gfs2_pin(sdp, bd->bd_bh);
152 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
153 mh->__pad0 = cpu_to_be64(0);
154 mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
151 sdp->sd_log_num_buf++; 155 sdp->sd_log_num_buf++;
152 list_add(&le->le_list, &sdp->sd_log_le_buf); 156 list_add(&le->le_list, &sdp->sd_log_le_buf);
153 tr->tr_num_buf_new++; 157 tr->tr_num_buf_new++;
@@ -524,9 +528,9 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
524 gfs2_pin(sdp, bd->bd_bh); 528 gfs2_pin(sdp, bd->bd_bh);
525 tr->tr_num_databuf_new++; 529 tr->tr_num_databuf_new++;
526 sdp->sd_log_num_databuf++; 530 sdp->sd_log_num_databuf++;
527 list_add(&le->le_list, &sdp->sd_log_le_databuf); 531 list_add_tail(&le->le_list, &sdp->sd_log_le_databuf);
528 } else { 532 } else {
529 list_add(&le->le_list, &sdp->sd_log_le_ordered); 533 list_add_tail(&le->le_list, &sdp->sd_log_le_ordered);
530 } 534 }
531out: 535out:
532 gfs2_log_unlock(sdp); 536 gfs2_log_unlock(sdp);
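
databuf_lo_add() switches from list_add() to list_add_tail(), so iteration over the ordered and journaled data lists now sees buffers in submission order rather than reversed. The head-versus-tail insertion difference in miniature, with a toy singly linked list:

#include <stdio.h>

struct node { int v; struct node *next; };

int main(void)
{
        struct node a = {1, NULL}, b = {2, NULL}, c = {3, NULL};
        struct node *head, *n;

        /* list_add(): insert at head; iteration reverses submission */
        head = &a; b.next = head; head = &b; c.next = head; head = &c;
        for (n = head; n; n = n->next)
                printf("lifo: %d\n", n->v);     /* 3 2 1 */

        /* list_add_tail(): append; iteration preserves submission */
        struct node x = {1, NULL}, y = {2, NULL}, z = {3, NULL};
        x.next = &y; y.next = &z;
        for (n = &x; n; n = n->next)
                printf("fifo: %d\n", n->v);     /* 1 2 3 */
        return 0;
}
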
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 5b31f7741a8f..a88fadc704bb 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -52,6 +52,22 @@ static void gfs2_init_glock_once(void *foo)
52 atomic_set(&gl->gl_ail_count, 0); 52 atomic_set(&gl->gl_ail_count, 0);
53} 53}
54 54
55static void gfs2_init_gl_aspace_once(void *foo)
56{
57 struct gfs2_glock *gl = foo;
58 struct address_space *mapping = (struct address_space *)(gl + 1);
59
60 gfs2_init_glock_once(gl);
61 memset(mapping, 0, sizeof(*mapping));
62 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
63 spin_lock_init(&mapping->tree_lock);
64 spin_lock_init(&mapping->i_mmap_lock);
65 INIT_LIST_HEAD(&mapping->private_list);
66 spin_lock_init(&mapping->private_lock);
67 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
68 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
69}
70
55/** 71/**
56 * init_gfs2_fs - Register GFS2 as a filesystem 72 * init_gfs2_fs - Register GFS2 as a filesystem
57 * 73 *
@@ -78,6 +94,14 @@ static int __init init_gfs2_fs(void)
78 if (!gfs2_glock_cachep) 94 if (!gfs2_glock_cachep)
79 goto fail; 95 goto fail;
80 96
97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)",
98 sizeof(struct gfs2_glock) +
99 sizeof(struct address_space),
100 0, 0, gfs2_init_gl_aspace_once);
101
102 if (!gfs2_glock_aspace_cachep)
103 goto fail;
104
81 gfs2_inode_cachep = kmem_cache_create("gfs2_inode", 105 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
82 sizeof(struct gfs2_inode), 106 sizeof(struct gfs2_inode),
83 0, SLAB_RECLAIM_ACCOUNT| 107 0, SLAB_RECLAIM_ACCOUNT|
@@ -144,6 +168,9 @@ fail:
144 if (gfs2_inode_cachep) 168 if (gfs2_inode_cachep)
145 kmem_cache_destroy(gfs2_inode_cachep); 169 kmem_cache_destroy(gfs2_inode_cachep);
146 170
171 if (gfs2_glock_aspace_cachep)
172 kmem_cache_destroy(gfs2_glock_aspace_cachep);
173
147 if (gfs2_glock_cachep) 174 if (gfs2_glock_cachep)
148 kmem_cache_destroy(gfs2_glock_cachep); 175 kmem_cache_destroy(gfs2_glock_cachep);
149 176
@@ -169,6 +196,7 @@ static void __exit exit_gfs2_fs(void)
169 kmem_cache_destroy(gfs2_rgrpd_cachep); 196 kmem_cache_destroy(gfs2_rgrpd_cachep);
170 kmem_cache_destroy(gfs2_bufdata_cachep); 197 kmem_cache_destroy(gfs2_bufdata_cachep);
171 kmem_cache_destroy(gfs2_inode_cachep); 198 kmem_cache_destroy(gfs2_inode_cachep);
199 kmem_cache_destroy(gfs2_glock_aspace_cachep);
172 kmem_cache_destroy(gfs2_glock_cachep); 200 kmem_cache_destroy(gfs2_glock_cachep);
173 201
174 gfs2_sys_uninit(); 202 gfs2_sys_uninit();
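
gfs2_init_gl_aspace_once() is a slab constructor: it runs once when a cache object is first created, not on every kmem_cache_alloc(), which is why it can unconditionally initialize the embedded address_space's locks and lists. A toy object pool with the same construct-once behaviour (illustrative only):

#include <stdio.h>

struct obj { int ctor_runs; int in_use; };

#define POOL 4
static struct obj pool[POOL];
static int populated;

static void ctor(struct obj *o)         /* the slab constructor */
{
        o->ctor_runs++;                 /* kernel: init locks, lists, tree */
}

static struct obj *cache_alloc(void)
{
        int i;

        if (!populated) {               /* first touch: construct all */
                for (i = 0; i < POOL; i++)
                        ctor(&pool[i]);
                populated = 1;
        }
        for (i = 0; i < POOL; i++)
                if (!pool[i].in_use) {
                        pool[i].in_use = 1;
                        return &pool[i];
                }
        return NULL;
}

static void cache_free(struct obj *o)
{
        o->in_use = 0;                  /* must be left re-initialized */
}

int main(void)
{
        struct obj *a = cache_alloc();
        cache_free(a);
        a = cache_alloc();              /* same object, ctor not rerun */
        printf("ctor ran %d time(s)\n", a->ctor_runs);  /* prints 1 */
        return 0;
}
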
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index cb8d7a93d5ec..0bb12c80937a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -93,49 +93,13 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
93 return err; 93 return err;
94} 94}
95 95
96static const struct address_space_operations aspace_aops = { 96const struct address_space_operations gfs2_meta_aops = {
97 .writepage = gfs2_aspace_writepage, 97 .writepage = gfs2_aspace_writepage,
98 .releasepage = gfs2_releasepage, 98 .releasepage = gfs2_releasepage,
99 .sync_page = block_sync_page, 99 .sync_page = block_sync_page,
100}; 100};
101 101
102/** 102/**
103 * gfs2_aspace_get - Create and initialize a struct inode structure
104 * @sdp: the filesystem the aspace is in
105 *
106 * Right now a struct inode is just a struct inode. Maybe Linux
107 * will supply a more lightweight address space construct (that works)
108 * in the future.
109 *
110 * Make sure pages/buffers in this aspace aren't in high memory.
111 *
112 * Returns: the aspace
113 */
114
115struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
116{
117 struct inode *aspace;
118 struct gfs2_inode *ip;
119
120 aspace = new_inode(sdp->sd_vfs);
121 if (aspace) {
122 mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
123 aspace->i_mapping->a_ops = &aspace_aops;
124 aspace->i_size = ~0ULL;
125 ip = GFS2_I(aspace);
126 clear_bit(GIF_USER, &ip->i_flags);
127 insert_inode_hash(aspace);
128 }
129 return aspace;
130}
131
132void gfs2_aspace_put(struct inode *aspace)
133{
134 remove_inode_hash(aspace);
135 iput(aspace);
136}
137
138/**
139 * gfs2_meta_sync - Sync all buffers associated with a glock 103 * gfs2_meta_sync - Sync all buffers associated with a glock
140 * @gl: The glock 104 * @gl: The glock
141 * 105 *
@@ -143,7 +107,7 @@ void gfs2_aspace_put(struct inode *aspace)
143 107
144void gfs2_meta_sync(struct gfs2_glock *gl) 108void gfs2_meta_sync(struct gfs2_glock *gl)
145{ 109{
146 struct address_space *mapping = gl->gl_aspace->i_mapping; 110 struct address_space *mapping = gfs2_glock2aspace(gl);
147 int error; 111 int error;
148 112
149 filemap_fdatawrite(mapping); 113 filemap_fdatawrite(mapping);
@@ -164,7 +128,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
164 128
165struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) 129struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
166{ 130{
167 struct address_space *mapping = gl->gl_aspace->i_mapping; 131 struct address_space *mapping = gfs2_glock2aspace(gl);
168 struct gfs2_sbd *sdp = gl->gl_sbd; 132 struct gfs2_sbd *sdp = gl->gl_sbd;
169 struct page *page; 133 struct page *page;
170 struct buffer_head *bh; 134 struct buffer_head *bh;
@@ -344,8 +308,10 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
344 308
345void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) 309void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
346{ 310{
347 struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host); 311 struct address_space *mapping = bh->b_page->mapping;
312 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
348 struct gfs2_bufdata *bd = bh->b_private; 313 struct gfs2_bufdata *bd = bh->b_private;
314
349 if (test_clear_buffer_pinned(bh)) { 315 if (test_clear_buffer_pinned(bh)) {
350 list_del_init(&bd->bd_le.le_list); 316 list_del_init(&bd->bd_le.le_list);
351 if (meta) { 317 if (meta) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index de270c2f9b63..6a1d9ba16411 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -37,8 +37,16 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
37 0, from_head - to_head); 37 0, from_head - to_head);
38} 38}
39 39
40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); 40extern const struct address_space_operations gfs2_meta_aops;
41void gfs2_aspace_put(struct inode *aspace); 41
42static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
43{
44 struct inode *inode = mapping->host;
45 if (mapping->a_ops == &gfs2_meta_aops)
46 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
47 else
48 return inode->i_sb->s_fs_info;
49}
42 50
43void gfs2_meta_sync(struct gfs2_glock *gl); 51void gfs2_meta_sync(struct gfs2_glock *gl);
44 52
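
gfs2_mapping2sbd() uses the a_ops pointer as a type tag: a mapping whose ops are gfs2_meta_aops is known to be embedded in a glock, so the container arithmetic is safe, while any other mapping belongs to an ordinary inode. The same vtable-identity trick in miniature:

#include <stdio.h>

struct a_ops { int unused; };
static const struct a_ops meta_aops = { 0 };    /* identity matters, not contents */
static const struct a_ops file_aops = { 0 };

struct mapping { const struct a_ops *a_ops; };
struct glock   { long id; struct mapping map; };

static int is_glock_mapping(const struct mapping *m)
{
        return m->a_ops == &meta_aops;  /* pointer identity as type tag */
}

int main(void)
{
        struct glock gl = { 7, { &meta_aops } };
        struct mapping plain = { &file_aops };

        printf("%d %d\n", is_glock_mapping(&gl.map),
               is_glock_mapping(&plain));       /* 1 0 */
        return 0;
}
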
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 52fb6c048981..c1309ed1c496 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -18,6 +18,7 @@
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/slow-work.h> 20#include <linux/slow-work.h>
21#include <linux/quotaops.h>
21 22
22#include "gfs2.h" 23#include "gfs2.h"
23#include "incore.h" 24#include "incore.h"
@@ -62,13 +63,9 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
62 gt->gt_quota_warn_period = 10; 63 gt->gt_quota_warn_period = 10;
63 gt->gt_quota_scale_num = 1; 64 gt->gt_quota_scale_num = 1;
64 gt->gt_quota_scale_den = 1; 65 gt->gt_quota_scale_den = 1;
65 gt->gt_quota_quantum = 60;
66 gt->gt_new_files_jdata = 0; 66 gt->gt_new_files_jdata = 0;
67 gt->gt_max_readahead = 1 << 18; 67 gt->gt_max_readahead = 1 << 18;
68 gt->gt_stall_secs = 600;
69 gt->gt_complain_secs = 10; 68 gt->gt_complain_secs = 10;
70 gt->gt_statfs_quantum = 30;
71 gt->gt_statfs_slow = 0;
72} 69}
73 70
74static struct gfs2_sbd *init_sbd(struct super_block *sb) 71static struct gfs2_sbd *init_sbd(struct super_block *sb)
@@ -84,6 +81,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
84 81
85 gfs2_tune_init(&sdp->sd_tune); 82 gfs2_tune_init(&sdp->sd_tune);
86 83
84 init_waitqueue_head(&sdp->sd_glock_wait);
85 atomic_set(&sdp->sd_glock_disposal, 0);
87 spin_lock_init(&sdp->sd_statfs_spin); 86 spin_lock_init(&sdp->sd_statfs_spin);
88 87
89 spin_lock_init(&sdp->sd_rindex_spin); 88 spin_lock_init(&sdp->sd_rindex_spin);
@@ -725,7 +724,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
725 goto fail; 724 goto fail;
726 } 725 }
727 726
728 error = -EINVAL; 727 error = -EUSERS;
729 if (!gfs2_jindex_size(sdp)) { 728 if (!gfs2_jindex_size(sdp)) {
730 fs_err(sdp, "no journals!\n"); 729 fs_err(sdp, "no journals!\n");
731 goto fail_jindex; 730 goto fail_jindex;
@@ -985,16 +984,24 @@ static const match_table_t nolock_tokens = {
985 { Opt_err, NULL }, 984 { Opt_err, NULL },
986}; 985};
987 986
987static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
988{
989 struct gfs2_sbd *sdp = gl->gl_sbd;
990 kmem_cache_free(cachep, gl);
991 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
992 wake_up(&sdp->sd_glock_wait);
993}
994
988static const struct lm_lockops nolock_ops = { 995static const struct lm_lockops nolock_ops = {
989 .lm_proto_name = "lock_nolock", 996 .lm_proto_name = "lock_nolock",
990 .lm_put_lock = kmem_cache_free, 997 .lm_put_lock = nolock_put_lock,
991 .lm_tokens = &nolock_tokens, 998 .lm_tokens = &nolock_tokens,
992}; 999};
993 1000
994/** 1001/**
995 * gfs2_lm_mount - mount a locking protocol 1002 * gfs2_lm_mount - mount a locking protocol
996 * @sdp: the filesystem 1003 * @sdp: the filesystem
997 * @args: mount arguements 1004 * @args: mount arguments
998 * @silent: if 1, don't complain if the FS isn't a GFS2 fs 1005 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
999 * 1006 *
1000 * Returns: errno 1007 * Returns: errno
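init_sbd() now seeds sd_glock_wait and sd_glock_disposal, and nolock_put_lock() pairs with them: each freed glock decrements the counter, and the final decrement wakes anyone waiting for all glocks to be disposed of. A hedged userspace analogue of that dec-and-test-then-wake pattern follows; pthreads stand in for the kernel's atomic_dec_and_test() plus wait queue.

/*
 * Userspace analogue of sd_glock_disposal/sd_glock_wait: the thread
 * dropping the last object wakes whoever waits for the count to hit 0.
 */
#include <pthread.h>
#include <stdatomic.h>

static atomic_int disposal;		/* like sd_glock_disposal */
static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;

static void put_object(void)
{
	/* fetch_sub returns the old value: 1 means we just hit zero */
	if (atomic_fetch_sub(&disposal, 1) == 1) {
		pthread_mutex_lock(&lk);
		pthread_cond_broadcast(&waitq);
		pthread_mutex_unlock(&lk);
	}
}

static void wait_for_disposal(void)
{
	pthread_mutex_lock(&lk);
	while (atomic_load(&disposal) != 0)
		pthread_cond_wait(&waitq, &lk);
	pthread_mutex_unlock(&lk);
}

int main(void)
{
	atomic_store(&disposal, 2);	/* two live objects */
	put_object();
	put_object();
	wait_for_disposal();		/* returns at once: count is 0 */
	return 0;
}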
@@ -1114,7 +1121,7 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp)
1114 * Returns: errno 1121 * Returns: errno
1115 */ 1122 */
1116 1123
1117static int fill_super(struct super_block *sb, void *data, int silent) 1124static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent)
1118{ 1125{
1119 struct gfs2_sbd *sdp; 1126 struct gfs2_sbd *sdp;
1120 struct gfs2_holder mount_gh; 1127 struct gfs2_holder mount_gh;
@@ -1125,17 +1132,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1125 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); 1132 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
1126 return -ENOMEM; 1133 return -ENOMEM;
1127 } 1134 }
1128 1135 sdp->sd_args = *args;
1129 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
1130 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
1131 sdp->sd_args.ar_commit = 60;
1132 sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
1133
1134 error = gfs2_mount_args(sdp, &sdp->sd_args, data);
1135 if (error) {
1136 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
1137 goto fail;
1138 }
1139 1136
1140 if (sdp->sd_args.ar_spectator) { 1137 if (sdp->sd_args.ar_spectator) {
1141 sb->s_flags |= MS_RDONLY; 1138 sb->s_flags |= MS_RDONLY;
@@ -1143,11 +1140,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1143 } 1140 }
1144 if (sdp->sd_args.ar_posix_acl) 1141 if (sdp->sd_args.ar_posix_acl)
1145 sb->s_flags |= MS_POSIXACL; 1142 sb->s_flags |= MS_POSIXACL;
1143 if (sdp->sd_args.ar_nobarrier)
1144 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1146 1145
1147 sb->s_magic = GFS2_MAGIC; 1146 sb->s_magic = GFS2_MAGIC;
1148 sb->s_op = &gfs2_super_ops; 1147 sb->s_op = &gfs2_super_ops;
1149 sb->s_export_op = &gfs2_export_ops; 1148 sb->s_export_op = &gfs2_export_ops;
1150 sb->s_xattr = gfs2_xattr_handlers; 1149 sb->s_xattr = gfs2_xattr_handlers;
1150 sb->s_qcop = &gfs2_quotactl_ops;
1151 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
1151 sb->s_time_gran = 1; 1152 sb->s_time_gran = 1;
1152 sb->s_maxbytes = MAX_LFS_FILESIZE; 1153 sb->s_maxbytes = MAX_LFS_FILESIZE;
1153 1154
@@ -1160,6 +1161,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1160 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; 1161 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
1161 1162
1162 sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; 1163 sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
1164 sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
1165 if (sdp->sd_args.ar_statfs_quantum) {
1166 sdp->sd_tune.gt_statfs_slow = 0;
1167 sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum;
1168 }
1169 else {
1170 sdp->sd_tune.gt_statfs_slow = 1;
1171 sdp->sd_tune.gt_statfs_quantum = 30;
1172 }
1163 1173
1164 error = init_names(sdp, silent); 1174 error = init_names(sdp, silent);
1165 if (error) 1175 if (error)
@@ -1230,10 +1240,9 @@ fail_sb:
1230fail_locking: 1240fail_locking:
1231 init_locking(sdp, &mount_gh, UNDO); 1241 init_locking(sdp, &mount_gh, UNDO);
1232fail_lm: 1242fail_lm:
1243 invalidate_inodes(sb);
1233 gfs2_gl_hash_clear(sdp); 1244 gfs2_gl_hash_clear(sdp);
1234 gfs2_lm_unmount(sdp); 1245 gfs2_lm_unmount(sdp);
1235 while (invalidate_inodes(sb))
1236 yield();
1237fail_sys: 1246fail_sys:
1238 gfs2_sys_fs_del(sdp); 1247 gfs2_sys_fs_del(sdp);
1239fail: 1248fail:
@@ -1243,18 +1252,127 @@ fail:
1243 return error; 1252 return error;
1244} 1253}
1245 1254
1246static int gfs2_get_sb(struct file_system_type *fs_type, int flags, 1255static int set_gfs2_super(struct super_block *s, void *data)
1247 const char *dev_name, void *data, struct vfsmount *mnt)
1248{ 1256{
1249 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); 1257 s->s_bdev = data;
1258 s->s_dev = s->s_bdev->bd_dev;
1259
1260 /*
1261 * We set the bdi here to the queue backing, file systems can
1262 * overwrite this in ->fill_super()
1263 */
1264 s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
1265 return 0;
1250} 1266}
1251 1267
1252static int test_meta_super(struct super_block *s, void *ptr) 1268static int test_gfs2_super(struct super_block *s, void *ptr)
1253{ 1269{
1254 struct block_device *bdev = ptr; 1270 struct block_device *bdev = ptr;
1255 return (bdev == s->s_bdev); 1271 return (bdev == s->s_bdev);
1256} 1272}
1257 1273
1274/**
1275 * gfs2_get_sb - Get the GFS2 superblock
1276 * @fs_type: The GFS2 filesystem type
1277 * @flags: Mount flags
1278 * @dev_name: The name of the device
1279 * @data: The mount arguments
1280 * @mnt: The vfsmnt for this mount
1281 *
 1282 * Q. Why not use get_sb_bdev()?
1283 * A. We need to select one of two root directories to mount, independent
1284 * of whether this is the initial, or subsequent, mount of this sb
1285 *
1286 * Returns: 0 or -ve on error
1287 */
1288
1289static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1290 const char *dev_name, void *data, struct vfsmount *mnt)
1291{
1292 struct block_device *bdev;
1293 struct super_block *s;
1294 fmode_t mode = FMODE_READ;
1295 int error;
1296 struct gfs2_args args;
1297 struct gfs2_sbd *sdp;
1298
1299 if (!(flags & MS_RDONLY))
1300 mode |= FMODE_WRITE;
1301
1302 bdev = open_bdev_exclusive(dev_name, mode, fs_type);
1303 if (IS_ERR(bdev))
1304 return PTR_ERR(bdev);
1305
1306 /*
1307 * once the super is inserted into the list by sget, s_umount
1308 * will protect the lockfs code from trying to start a snapshot
1309 * while we are mounting
1310 */
1311 mutex_lock(&bdev->bd_fsfreeze_mutex);
1312 if (bdev->bd_fsfreeze_count > 0) {
1313 mutex_unlock(&bdev->bd_fsfreeze_mutex);
1314 error = -EBUSY;
1315 goto error_bdev;
1316 }
1317 s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev);
1318 mutex_unlock(&bdev->bd_fsfreeze_mutex);
1319 error = PTR_ERR(s);
1320 if (IS_ERR(s))
1321 goto error_bdev;
1322
1323 memset(&args, 0, sizeof(args));
1324 args.ar_quota = GFS2_QUOTA_DEFAULT;
1325 args.ar_data = GFS2_DATA_DEFAULT;
1326 args.ar_commit = 60;
1327 args.ar_statfs_quantum = 30;
1328 args.ar_quota_quantum = 60;
1329 args.ar_errors = GFS2_ERRORS_DEFAULT;
1330
1331 error = gfs2_mount_args(&args, data);
1332 if (error) {
1333 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
1334 if (s->s_root)
1335 goto error_super;
1336 deactivate_locked_super(s);
1337 return error;
1338 }
1339
1340 if (s->s_root) {
1341 error = -EBUSY;
1342 if ((flags ^ s->s_flags) & MS_RDONLY)
1343 goto error_super;
1344 close_bdev_exclusive(bdev, mode);
1345 } else {
1346 char b[BDEVNAME_SIZE];
1347
1348 s->s_flags = flags;
1349 s->s_mode = mode;
1350 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
1351 sb_set_blocksize(s, block_size(bdev));
1352 error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
1353 if (error) {
1354 deactivate_locked_super(s);
1355 return error;
1356 }
1357 s->s_flags |= MS_ACTIVE;
1358 bdev->bd_super = s;
1359 }
1360
1361 sdp = s->s_fs_info;
1362 mnt->mnt_sb = s;
1363 if (args.ar_meta)
1364 mnt->mnt_root = dget(sdp->sd_master_dir);
1365 else
1366 mnt->mnt_root = dget(sdp->sd_root_dir);
1367 return 0;
1368
1369error_super:
1370 deactivate_locked_super(s);
1371error_bdev:
1372 close_bdev_exclusive(bdev, mode);
1373 return error;
1374}
1375
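gfs2_get_sb() open-codes what get_sb_bdev() would otherwise do so it can pick between sd_master_dir and sd_root_dir afterwards. The heart of it is sget(), a find-or-create lookup driven by a test() callback (does this existing super match the bdev?) and a set() callback (initialise a fresh one). A simplified sketch of that shape follows; the names are illustrative and the sb_lock serialisation of the real sget() is omitted.

/*
 * Simplified shape of sget(): find an existing super via test(), or
 * allocate one and initialise it via set().
 */
#include <stdlib.h>

struct super {
	void *key;			/* like s_bdev */
	struct super *next;
};

static struct super *supers;

static struct super *sget_like(int (*test)(struct super *, void *),
			       int (*set)(struct super *, void *),
			       void *key)
{
	struct super *s;

	for (s = supers; s; s = s->next)
		if (test(s, key))
			return s;	/* existing mount: reuse it */

	s = calloc(1, sizeof(*s));
	if (!s)
		return NULL;
	if (set(s, key)) {		/* may refuse, like set_meta_super */
		free(s);
		return NULL;
	}
	s->next = supers;
	supers = s;
	return s;			/* caller runs fill_super on it */
}

static int test_bdev(struct super *s, void *key) { return s->key == key; }
static int set_bdev(struct super *s, void *key) { s->key = key; return 0; }

int main(void)
{
	int bdev;			/* any unique address as key */
	struct super *a = sget_like(test_bdev, set_bdev, &bdev);
	struct super *b = sget_like(test_bdev, set_bdev, &bdev);

	return !(a && a == b);		/* second call finds the first */
}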
1258static int set_meta_super(struct super_block *s, void *ptr) 1376static int set_meta_super(struct super_block *s, void *ptr)
1259{ 1377{
1260 return -EINVAL; 1378 return -EINVAL;
@@ -1274,13 +1392,17 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1274 dev_name, error); 1392 dev_name, error);
1275 return error; 1393 return error;
1276 } 1394 }
1277 s = sget(&gfs2_fs_type, test_meta_super, set_meta_super, 1395 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
1278 path.dentry->d_inode->i_sb->s_bdev); 1396 path.dentry->d_inode->i_sb->s_bdev);
1279 path_put(&path); 1397 path_put(&path);
1280 if (IS_ERR(s)) { 1398 if (IS_ERR(s)) {
1281 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1399 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
1282 return PTR_ERR(s); 1400 return PTR_ERR(s);
1283 } 1401 }
1402 if ((flags ^ s->s_flags) & MS_RDONLY) {
1403 deactivate_locked_super(s);
1404 return -EBUSY;
1405 }
1284 sdp = s->s_fs_info; 1406 sdp = s->s_fs_info;
1285 mnt->mnt_sb = s; 1407 mnt->mnt_sb = s;
1286 mnt->mnt_root = dget(sdp->sd_master_dir); 1408 mnt->mnt_root = dget(sdp->sd_master_dir);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 247436c10deb..4e64352d49de 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -748,7 +748,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
748 struct gfs2_rgrpd *nrgd; 748 struct gfs2_rgrpd *nrgd;
749 unsigned int num_gh; 749 unsigned int num_gh;
750 int dir_rename = 0; 750 int dir_rename = 0;
751 int alloc_required; 751 int alloc_required = 0;
752 unsigned int x; 752 unsigned int x;
753 int error; 753 int error;
754 754
@@ -867,7 +867,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
867 goto out_gunlock; 867 goto out_gunlock;
868 } 868 }
869 869
870 alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); 870 if (nip == NULL)
871 alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
872 error = alloc_required;
871 if (error < 0) 873 if (error < 0)
872 goto out_gunlock; 874 goto out_gunlock;
873 error = 0; 875 error = 0;
@@ -974,121 +976,62 @@ out:
974} 976}
975 977
976/** 978/**
977 * gfs2_readlinki - return the contents of a symlink 979 * gfs2_follow_link - Follow a symbolic link
978 * @ip: the symlink's inode 980 * @dentry: The dentry of the link
979 * @buf: a pointer to the buffer to be filled 981 * @nd: Data that we pass to vfs_follow_link()
980 * @len: a pointer to the length of @buf
981 * 982 *
982 * If @buf is too small, a piece of memory is kmalloc()ed and needs 983 * This can handle symlinks of any size.
983 * to be freed by the caller.
984 * 984 *
985 * Returns: errno 985 * Returns: 0 on success or error code
986 */ 986 */
987 987
988static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) 988static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
989{ 989{
990 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
990 struct gfs2_holder i_gh; 991 struct gfs2_holder i_gh;
991 struct buffer_head *dibh; 992 struct buffer_head *dibh;
992 unsigned int x; 993 unsigned int x;
994 char *buf;
993 int error; 995 int error;
994 996
995 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); 997 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
996 error = gfs2_glock_nq(&i_gh); 998 error = gfs2_glock_nq(&i_gh);
997 if (error) { 999 if (error) {
998 gfs2_holder_uninit(&i_gh); 1000 gfs2_holder_uninit(&i_gh);
999 return error; 1001 nd_set_link(nd, ERR_PTR(error));
1002 return NULL;
1000 } 1003 }
1001 1004
1002 if (!ip->i_disksize) { 1005 if (!ip->i_disksize) {
1003 gfs2_consist_inode(ip); 1006 gfs2_consist_inode(ip);
1004 error = -EIO; 1007 buf = ERR_PTR(-EIO);
1005 goto out; 1008 goto out;
1006 } 1009 }
1007 1010
1008 error = gfs2_meta_inode_buffer(ip, &dibh); 1011 error = gfs2_meta_inode_buffer(ip, &dibh);
1009 if (error) 1012 if (error) {
1013 buf = ERR_PTR(error);
1010 goto out; 1014 goto out;
1011
1012 x = ip->i_disksize + 1;
1013 if (x > *len) {
1014 *buf = kmalloc(x, GFP_NOFS);
1015 if (!*buf) {
1016 error = -ENOMEM;
1017 goto out_brelse;
1018 }
1019 } 1015 }
1020 1016
1021 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); 1017 x = ip->i_disksize + 1;
1022 *len = x; 1018 buf = kmalloc(x, GFP_NOFS);
1023 1019 if (!buf)
1024out_brelse: 1020 buf = ERR_PTR(-ENOMEM);
1021 else
1022 memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1025 brelse(dibh); 1023 brelse(dibh);
1026out: 1024out:
1027 gfs2_glock_dq_uninit(&i_gh); 1025 gfs2_glock_dq_uninit(&i_gh);
1028 return error; 1026 nd_set_link(nd, buf);
1029} 1027 return NULL;
1030
1031/**
1032 * gfs2_readlink - Read the value of a symlink
1033 * @dentry: the symlink
1034 * @buf: the buffer to read the symlink data into
1035 * @size: the size of the buffer
1036 *
1037 * Returns: errno
1038 */
1039
1040static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
1041 int user_size)
1042{
1043 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
1044 char array[GFS2_FAST_NAME_SIZE], *buf = array;
1045 unsigned int len = GFS2_FAST_NAME_SIZE;
1046 int error;
1047
1048 error = gfs2_readlinki(ip, &buf, &len);
1049 if (error)
1050 return error;
1051
1052 if (user_size > len - 1)
1053 user_size = len - 1;
1054
1055 if (copy_to_user(user_buf, buf, user_size))
1056 error = -EFAULT;
1057 else
1058 error = user_size;
1059
1060 if (buf != array)
1061 kfree(buf);
1062
1063 return error;
1064} 1028}
1065 1029
1066/** 1030static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1067 * gfs2_follow_link - Follow a symbolic link
1068 * @dentry: The dentry of the link
1069 * @nd: Data that we pass to vfs_follow_link()
1070 *
1071 * This can handle symlinks of any size. It is optimised for symlinks
1072 * under GFS2_FAST_NAME_SIZE.
1073 *
1074 * Returns: 0 on success or error code
1075 */
1076
1077static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1078{ 1031{
1079 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 1032 char *s = nd_get_link(nd);
1080 char array[GFS2_FAST_NAME_SIZE], *buf = array; 1033 if (!IS_ERR(s))
1081 unsigned int len = GFS2_FAST_NAME_SIZE; 1034 kfree(s);
1082 int error;
1083
1084 error = gfs2_readlinki(ip, &buf, &len);
1085 if (!error) {
1086 error = vfs_follow_link(nd, buf);
1087 if (buf != array)
1088 kfree(buf);
1089 }
1090
1091 return ERR_PTR(error);
1092} 1035}
1093 1036
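The rewritten gfs2_follow_link() allocates the link body, parks either the buffer or an ERR_PTR-encoded errno in the nameidata via nd_set_link(), and gfs2_put_link() frees it once the VFS has consumed it. The ERR_PTR/IS_ERR encoding packs a small negative errno into the top range of pointer values; the userspace sketch below reproduces the idiom (the casts are implementation-defined in portable C, so this is illustration only).

/* The ERR_PTR/IS_ERR pointer-or-errno encoding, open-coded. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(intptr_t err) { return (void *)err; }
static intptr_t PTR_ERR(const void *p) { return (intptr_t)p; }
static int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

static char *read_link(int fail)
{
	char *buf;

	if (fail)
		return ERR_PTR(-EIO);	/* errno travels in the pointer */
	buf = malloc(7);
	if (!buf)
		return ERR_PTR(-ENOMEM);
	strcpy(buf, "target");
	return buf;
}

int main(void)
{
	char *s = read_link(0);

	if (IS_ERR(s))			/* the nd_get_link() consumer side */
		return (int)-PTR_ERR(s);
	printf("%s\n", s);
	free(s);			/* the gfs2_put_link() side */
	return 0;
}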
1094/** 1037/**
@@ -1423,8 +1366,9 @@ const struct inode_operations gfs2_dir_iops = {
1423}; 1366};
1424 1367
1425const struct inode_operations gfs2_symlink_iops = { 1368const struct inode_operations gfs2_symlink_iops = {
1426 .readlink = gfs2_readlink, 1369 .readlink = generic_readlink,
1427 .follow_link = gfs2_follow_link, 1370 .follow_link = gfs2_follow_link,
1371 .put_link = gfs2_put_link,
1428 .permission = gfs2_permission, 1372 .permission = gfs2_permission,
1429 .setattr = gfs2_setattr, 1373 .setattr = gfs2_setattr,
1430 .getattr = gfs2_getattr, 1374 .getattr = gfs2_getattr,
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 2e9b9326bfc9..6dbcbad6ab17 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -15,7 +15,7 @@
15 * fuzziness in the current usage value of IDs that are being used on different 15 * fuzziness in the current usage value of IDs that are being used on different
16 * nodes in the cluster simultaneously. So, it is possible for a user on 16 * nodes in the cluster simultaneously. So, it is possible for a user on
 17 * multiple nodes to overrun their quota, but that overrun is controllable. 17 * multiple nodes to overrun their quota, but that overrun is controllable.
18 * Since quota tags are part of transactions, there is no need to a quota check 18 * Since quota tags are part of transactions, there is no need for a quota check
19 * program to be run on node crashes or anything like that. 19 * program to be run on node crashes or anything like that.
20 * 20 *
 21 * There are a couple of knobs that let the administrator manage the quota 21 * There are a couple of knobs that let the administrator manage the quota
@@ -47,6 +47,8 @@
47#include <linux/gfs2_ondisk.h> 47#include <linux/gfs2_ondisk.h>
48#include <linux/kthread.h> 48#include <linux/kthread.h>
49#include <linux/freezer.h> 49#include <linux/freezer.h>
50#include <linux/quota.h>
51#include <linux/dqblk_xfs.h>
50 52
51#include "gfs2.h" 53#include "gfs2.h"
52#include "incore.h" 54#include "incore.h"
@@ -65,13 +67,6 @@
65#define QUOTA_USER 1 67#define QUOTA_USER 1
66#define QUOTA_GROUP 0 68#define QUOTA_GROUP 0
67 69
68struct gfs2_quota_host {
69 u64 qu_limit;
70 u64 qu_warn;
71 s64 qu_value;
72 u32 qu_ll_next;
73};
74
75struct gfs2_quota_change_host { 70struct gfs2_quota_change_host {
76 u64 qc_change; 71 u64 qc_change;
77 u32 qc_flags; /* GFS2_QCF_... */ 72 u32 qc_flags; /* GFS2_QCF_... */
@@ -164,7 +159,7 @@ fail:
164 return error; 159 return error;
165} 160}
166 161
167static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, 162static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
168 struct gfs2_quota_data **qdp) 163 struct gfs2_quota_data **qdp)
169{ 164{
170 struct gfs2_quota_data *qd = NULL, *new_qd = NULL; 165 struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
@@ -202,7 +197,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
202 197
203 spin_unlock(&qd_lru_lock); 198 spin_unlock(&qd_lru_lock);
204 199
205 if (qd || !create) { 200 if (qd) {
206 if (new_qd) { 201 if (new_qd) {
207 gfs2_glock_put(new_qd->qd_gl); 202 gfs2_glock_put(new_qd->qd_gl);
208 kmem_cache_free(gfs2_quotad_cachep, new_qd); 203 kmem_cache_free(gfs2_quotad_cachep, new_qd);
@@ -461,12 +456,12 @@ static void qd_unlock(struct gfs2_quota_data *qd)
461 qd_put(qd); 456 qd_put(qd);
462} 457}
463 458
464static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create, 459static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id,
465 struct gfs2_quota_data **qdp) 460 struct gfs2_quota_data **qdp)
466{ 461{
467 int error; 462 int error;
468 463
469 error = qd_get(sdp, user, id, create, qdp); 464 error = qd_get(sdp, user, id, qdp);
470 if (error) 465 if (error)
471 return error; 466 return error;
472 467
@@ -508,20 +503,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
508 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) 503 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
509 return 0; 504 return 0;
510 505
511 error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, CREATE, qd); 506 error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd);
512 if (error) 507 if (error)
513 goto out; 508 goto out;
514 al->al_qd_num++; 509 al->al_qd_num++;
515 qd++; 510 qd++;
516 511
517 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, CREATE, qd); 512 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd);
518 if (error) 513 if (error)
519 goto out; 514 goto out;
520 al->al_qd_num++; 515 al->al_qd_num++;
521 qd++; 516 qd++;
522 517
523 if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { 518 if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
524 error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd); 519 error = qdsb_get(sdp, QUOTA_USER, uid, qd);
525 if (error) 520 if (error)
526 goto out; 521 goto out;
527 al->al_qd_num++; 522 al->al_qd_num++;
@@ -529,7 +524,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
529 } 524 }
530 525
531 if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) { 526 if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) {
532 error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd); 527 error = qdsb_get(sdp, QUOTA_GROUP, gid, qd);
533 if (error) 528 if (error)
534 goto out; 529 goto out;
535 al->al_qd_num++; 530 al->al_qd_num++;
@@ -617,48 +612,36 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
617 mutex_unlock(&sdp->sd_quota_mutex); 612 mutex_unlock(&sdp->sd_quota_mutex);
618} 613}
619 614
620static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
621{
622 const struct gfs2_quota *str = buf;
623
624 qu->qu_limit = be64_to_cpu(str->qu_limit);
625 qu->qu_warn = be64_to_cpu(str->qu_warn);
626 qu->qu_value = be64_to_cpu(str->qu_value);
627 qu->qu_ll_next = be32_to_cpu(str->qu_ll_next);
628}
629
630static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
631{
632 struct gfs2_quota *str = buf;
633
634 str->qu_limit = cpu_to_be64(qu->qu_limit);
635 str->qu_warn = cpu_to_be64(qu->qu_warn);
636 str->qu_value = cpu_to_be64(qu->qu_value);
637 str->qu_ll_next = cpu_to_be32(qu->qu_ll_next);
638 memset(&str->qu_reserved, 0, sizeof(str->qu_reserved));
639}
640
641/** 615/**
642 * gfs2_adjust_quota 616 * gfs2_adjust_quota - adjust record of current block usage
617 * @ip: The quota inode
618 * @loc: Offset of the entry in the quota file
619 * @change: The amount of usage change to record
620 * @qd: The quota data
621 * @fdq: The updated limits to record
643 * 622 *
644 * This function was mostly borrowed from gfs2_block_truncate_page which was 623 * This function was mostly borrowed from gfs2_block_truncate_page which was
645 * in turn mostly borrowed from ext3 624 * in turn mostly borrowed from ext3
625 *
626 * Returns: 0 or -ve on error
646 */ 627 */
628
647static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, 629static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
648 s64 change, struct gfs2_quota_data *qd) 630 s64 change, struct gfs2_quota_data *qd,
631 struct fs_disk_quota *fdq)
649{ 632{
650 struct inode *inode = &ip->i_inode; 633 struct inode *inode = &ip->i_inode;
651 struct address_space *mapping = inode->i_mapping; 634 struct address_space *mapping = inode->i_mapping;
652 unsigned long index = loc >> PAGE_CACHE_SHIFT; 635 unsigned long index = loc >> PAGE_CACHE_SHIFT;
653 unsigned offset = loc & (PAGE_CACHE_SIZE - 1); 636 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
654 unsigned blocksize, iblock, pos; 637 unsigned blocksize, iblock, pos;
655 struct buffer_head *bh; 638 struct buffer_head *bh, *dibh;
656 struct page *page; 639 struct page *page;
657 void *kaddr; 640 void *kaddr;
658 char *ptr; 641 struct gfs2_quota *qp;
659 struct gfs2_quota_host qp;
660 s64 value; 642 s64 value;
661 int err = -EIO; 643 int err = -EIO;
644 u64 size;
662 645
663 if (gfs2_is_stuffed(ip)) 646 if (gfs2_is_stuffed(ip))
664 gfs2_unstuff_dinode(ip, NULL); 647 gfs2_unstuff_dinode(ip, NULL);
@@ -700,18 +683,38 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
700 gfs2_trans_add_bh(ip->i_gl, bh, 0); 683 gfs2_trans_add_bh(ip->i_gl, bh, 0);
701 684
702 kaddr = kmap_atomic(page, KM_USER0); 685 kaddr = kmap_atomic(page, KM_USER0);
703 ptr = kaddr + offset; 686 qp = kaddr + offset;
704 gfs2_quota_in(&qp, ptr); 687 value = (s64)be64_to_cpu(qp->qu_value) + change;
705 qp.qu_value += change; 688 qp->qu_value = cpu_to_be64(value);
706 value = qp.qu_value; 689 qd->qd_qb.qb_value = qp->qu_value;
707 gfs2_quota_out(&qp, ptr); 690 if (fdq) {
691 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
692 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
693 qd->qd_qb.qb_warn = qp->qu_warn;
694 }
695 if (fdq->d_fieldmask & FS_DQ_BHARD) {
696 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
697 qd->qd_qb.qb_limit = qp->qu_limit;
698 }
699 }
708 flush_dcache_page(page); 700 flush_dcache_page(page);
709 kunmap_atomic(kaddr, KM_USER0); 701 kunmap_atomic(kaddr, KM_USER0);
710 err = 0; 702
711 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC); 703 err = gfs2_meta_inode_buffer(ip, &dibh);
712 qd->qd_qb.qb_value = cpu_to_be64(value); 704 if (err)
713 ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_magic = cpu_to_be32(GFS2_MAGIC); 705 goto unlock;
714 ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_value = cpu_to_be64(value); 706
707 size = loc + sizeof(struct gfs2_quota);
708 if (size > inode->i_size) {
709 ip->i_disksize = size;
710 i_size_write(inode, size);
711 }
712 inode->i_mtime = inode->i_atime = CURRENT_TIME;
713 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
714 gfs2_dinode_out(ip, dibh->b_data);
715 brelse(dibh);
716 mark_inode_dirty(inode);
717
715unlock: 718unlock:
716 unlock_page(page); 719 unlock_page(page);
717 page_cache_release(page); 720 page_cache_release(page);
@@ -739,9 +742,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
739 return -ENOMEM; 742 return -ENOMEM;
740 743
741 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); 744 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
745 mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA);
742 for (qx = 0; qx < num_qd; qx++) { 746 for (qx = 0; qx < num_qd; qx++) {
743 error = gfs2_glock_nq_init(qda[qx]->qd_gl, 747 error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
744 LM_ST_EXCLUSIVE,
745 GL_NOCACHE, &ghs[qx]); 748 GL_NOCACHE, &ghs[qx]);
746 if (error) 749 if (error)
747 goto out; 750 goto out;
@@ -795,9 +798,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
795 for (x = 0; x < num_qd; x++) { 798 for (x = 0; x < num_qd; x++) {
796 qd = qda[x]; 799 qd = qda[x];
797 offset = qd2offset(qd); 800 offset = qd2offset(qd);
798 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, 801 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL);
799 (struct gfs2_quota_data *)
800 qd);
801 if (error) 802 if (error)
802 goto out_end_trans; 803 goto out_end_trans;
803 804
@@ -817,21 +818,44 @@ out_gunlock:
817out: 818out:
818 while (qx--) 819 while (qx--)
819 gfs2_glock_dq_uninit(&ghs[qx]); 820 gfs2_glock_dq_uninit(&ghs[qx]);
821 mutex_unlock(&ip->i_inode.i_mutex);
820 kfree(ghs); 822 kfree(ghs);
821 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); 823 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
822 return error; 824 return error;
823} 825}
824 826
827static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
828{
829 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
830 struct gfs2_quota q;
831 struct gfs2_quota_lvb *qlvb;
832 loff_t pos;
833 int error;
834
835 memset(&q, 0, sizeof(struct gfs2_quota));
836 pos = qd2offset(qd);
837 error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q));
838 if (error < 0)
839 return error;
840
841 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
842 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
843 qlvb->__pad = 0;
844 qlvb->qb_limit = q.qu_limit;
845 qlvb->qb_warn = q.qu_warn;
846 qlvb->qb_value = q.qu_value;
847 qd->qd_qb = *qlvb;
848
849 return 0;
850}
851
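Note that update_qd() copies qu_limit/qu_warn/qu_value into the lock value block without byte-swapping: both the on-disk record and the LVB hold big-endian values, and readers such as gfs2_quota_check() convert with be64_to_cpu() only at the point of use. A small sketch of that keep-it-big-endian convention; the conversion helper is open-coded here rather than the kernel's.

/* Raw copy mirrors update_qd(); convert only where a CPU value is needed. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t be64_bytes_to_cpu(const unsigned char b[8])
{
	uint64_t r = 0;
	int i;

	for (i = 0; i < 8; i++)
		r = (r << 8) | b[i];
	return r;
}

int main(void)
{
	unsigned char disk_qu_limit[8] = { 0, 0, 0, 0, 0, 0, 0, 100 };
	unsigned char lvb_qb_limit[8];

	memcpy(lvb_qb_limit, disk_qu_limit, 8);	/* no swap, both __be64 */

	printf("limit = %llu blocks\n",		/* convert at use only */
	       (unsigned long long)be64_bytes_to_cpu(lvb_qb_limit));
	return 0;
}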
825static int do_glock(struct gfs2_quota_data *qd, int force_refresh, 852static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
826 struct gfs2_holder *q_gh) 853 struct gfs2_holder *q_gh)
827{ 854{
828 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 855 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
829 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); 856 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
830 struct gfs2_holder i_gh; 857 struct gfs2_holder i_gh;
831 struct gfs2_quota_host q;
832 char buf[sizeof(struct gfs2_quota)];
833 int error; 858 int error;
834 struct gfs2_quota_lvb *qlvb;
835 859
836restart: 860restart:
837 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); 861 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
@@ -841,11 +865,9 @@ restart:
841 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 865 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
842 866
843 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { 867 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
844 loff_t pos;
845 gfs2_glock_dq_uninit(q_gh); 868 gfs2_glock_dq_uninit(q_gh);
846 error = gfs2_glock_nq_init(qd->qd_gl, 869 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE,
847 LM_ST_EXCLUSIVE, GL_NOCACHE, 870 GL_NOCACHE, q_gh);
848 q_gh);
849 if (error) 871 if (error)
850 return error; 872 return error;
851 873
@@ -853,29 +875,14 @@ restart:
853 if (error) 875 if (error)
854 goto fail; 876 goto fail;
855 877
856 memset(buf, 0, sizeof(struct gfs2_quota)); 878 error = update_qd(sdp, qd);
857 pos = qd2offset(qd); 879 if (error)
858 error = gfs2_internal_read(ip, NULL, buf, &pos,
859 sizeof(struct gfs2_quota));
860 if (error < 0)
861 goto fail_gunlock; 880 goto fail_gunlock;
862 881
863 gfs2_glock_dq_uninit(&i_gh); 882 gfs2_glock_dq_uninit(&i_gh);
864 883 gfs2_glock_dq_uninit(q_gh);
865 gfs2_quota_in(&q, buf); 884 force_refresh = 0;
866 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 885 goto restart;
867 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
868 qlvb->__pad = 0;
869 qlvb->qb_limit = cpu_to_be64(q.qu_limit);
870 qlvb->qb_warn = cpu_to_be64(q.qu_warn);
871 qlvb->qb_value = cpu_to_be64(q.qu_value);
872 qd->qd_qb = *qlvb;
873
874 if (gfs2_glock_is_blocking(qd->qd_gl)) {
875 gfs2_glock_dq_uninit(q_gh);
876 force_refresh = 0;
877 goto restart;
878 }
879 } 886 }
880 887
881 return 0; 888 return 0;
@@ -995,7 +1002,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
995{ 1002{
996 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 1003 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
997 1004
998 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n", 1005 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
999 sdp->sd_fsname, type, 1006 sdp->sd_fsname, type,
1000 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group", 1007 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
1001 qd->qd_id); 1008 qd->qd_id);
@@ -1032,6 +1039,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
1032 1039
1033 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { 1040 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
1034 print_message(qd, "exceeded"); 1041 print_message(qd, "exceeded");
1042 quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
1043 USRQUOTA : GRPQUOTA, qd->qd_id,
1044 sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
1045
1035 error = -EDQUOT; 1046 error = -EDQUOT;
1036 break; 1047 break;
1037 } else if (be64_to_cpu(qd->qd_qb.qb_warn) && 1048 } else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
@@ -1039,6 +1050,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
1039 time_after_eq(jiffies, qd->qd_last_warn + 1050 time_after_eq(jiffies, qd->qd_last_warn +
1040 gfs2_tune_get(sdp, 1051 gfs2_tune_get(sdp,
1041 gt_quota_warn_period) * HZ)) { 1052 gt_quota_warn_period) * HZ)) {
1053 quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
1054 USRQUOTA : GRPQUOTA, qd->qd_id,
1055 sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
1042 error = print_message(qd, "warning"); 1056 error = print_message(qd, "warning");
1043 qd->qd_last_warn = jiffies; 1057 qd->qd_last_warn = jiffies;
1044 } 1058 }
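The warning hooks slot into a two-tier policy: crossing qb_limit fails the allocation with -EDQUOT, while crossing qb_warn only emits a rate-limited warning, now mirrored to userspace via quota_send_warning(). A compact sketch of the policy, with made-up numbers:

/* Hard limit refuses the change; soft limit warns only. */
#include <stdio.h>

#define EDQUOT_ERR 122			/* -EDQUOT on Linux */

static int check_quota(long long value, long long hard, long long soft)
{
	if (hard && value > hard)
		return -EDQUOT_ERR;	/* over hard limit: fail */
	if (soft && value > soft)
		printf("soft limit exceeded: warn only\n");
	return 0;
}

int main(void)
{
	printf("%d\n", check_quota(150, 200, 100));	/* warns, returns 0 */
	printf("%d\n", check_quota(250, 200, 100));	/* returns -122 */
	return 0;
}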
@@ -1069,8 +1083,9 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1069 } 1083 }
1070} 1084}
1071 1085
1072int gfs2_quota_sync(struct gfs2_sbd *sdp) 1086int gfs2_quota_sync(struct super_block *sb, int type, int wait)
1073{ 1087{
1088 struct gfs2_sbd *sdp = sb->s_fs_info;
1074 struct gfs2_quota_data **qda; 1089 struct gfs2_quota_data **qda;
1075 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync); 1090 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
1076 unsigned int num_qd; 1091 unsigned int num_qd;
@@ -1112,13 +1127,18 @@ int gfs2_quota_sync(struct gfs2_sbd *sdp)
1112 return error; 1127 return error;
1113} 1128}
1114 1129
1130static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
1131{
1132 return gfs2_quota_sync(sb, type, 0);
1133}
1134
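gfs2_quota_sync_timeo() exists purely as a signature adapter: the quotactl entry point gained (sb, type, wait), while quotad's periodic callback slot still expects (sb, type). The generic shape, with illustrative names:

/* Wrap a three-argument entry point to fit a two-argument callback. */
#include <stdio.h>

static int quota_sync(void *sb, int type, int wait)
{
	printf("sync: type=%d wait=%d\n", type, wait);
	return 0;
}

static int quota_sync_timeo(void *sb, int type)
{
	return quota_sync(sb, type, 0);	/* timer-driven: don't wait */
}

static void check_timeo(int (*fxn)(void *, int), void *sb)
{
	fxn(sb, 0);
}

int main(void)
{
	check_timeo(quota_sync_timeo, NULL);
	return 0;
}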
1115int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) 1135int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
1116{ 1136{
1117 struct gfs2_quota_data *qd; 1137 struct gfs2_quota_data *qd;
1118 struct gfs2_holder q_gh; 1138 struct gfs2_holder q_gh;
1119 int error; 1139 int error;
1120 1140
1121 error = qd_get(sdp, user, id, CREATE, &qd); 1141 error = qd_get(sdp, user, id, &qd);
1122 if (error) 1142 if (error)
1123 return error; 1143 return error;
1124 1144
@@ -1127,7 +1147,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
1127 gfs2_glock_dq_uninit(&q_gh); 1147 gfs2_glock_dq_uninit(&q_gh);
1128 1148
1129 qd_put(qd); 1149 qd_put(qd);
1130
1131 return error; 1150 return error;
1132} 1151}
1133 1152
@@ -1298,12 +1317,12 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
1298} 1317}
1299 1318
1300static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, 1319static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
1301 int (*fxn)(struct gfs2_sbd *sdp), 1320 int (*fxn)(struct super_block *sb, int type),
1302 unsigned long t, unsigned long *timeo, 1321 unsigned long t, unsigned long *timeo,
1303 unsigned int *new_timeo) 1322 unsigned int *new_timeo)
1304{ 1323{
1305 if (t >= *timeo) { 1324 if (t >= *timeo) {
1306 int error = fxn(sdp); 1325 int error = fxn(sdp->sd_vfs, 0);
1307 quotad_error(sdp, msg, error); 1326 quotad_error(sdp, msg, error);
1308 *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ; 1327 *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
1309 } else { 1328 } else {
@@ -1330,6 +1349,14 @@ static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
1330 } 1349 }
1331} 1350}
1332 1351
1352void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) {
1353 if (!sdp->sd_statfs_force_sync) {
1354 sdp->sd_statfs_force_sync = 1;
1355 wake_up(&sdp->sd_quota_wait);
1356 }
1357}
1358
1359
1333/** 1360/**
1334 * gfs2_quotad - Write cached quota changes into the quota file 1361 * gfs2_quotad - Write cached quota changes into the quota file
1335 * @sdp: Pointer to GFS2 superblock 1362 * @sdp: Pointer to GFS2 superblock
@@ -1349,11 +1376,18 @@ int gfs2_quotad(void *data)
1349 while (!kthread_should_stop()) { 1376 while (!kthread_should_stop()) {
1350 1377
1351 /* Update the master statfs file */ 1378 /* Update the master statfs file */
1352 quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t, 1379 if (sdp->sd_statfs_force_sync) {
1353 &statfs_timeo, &tune->gt_statfs_quantum); 1380 int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
1381 quotad_error(sdp, "statfs", error);
1382 statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
1383 }
1384 else
1385 quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
1386 &statfs_timeo,
1387 &tune->gt_statfs_quantum);
1354 1388
1355 /* Update quota file */ 1389 /* Update quota file */
1356 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, 1390 quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
1357 &quotad_timeo, &tune->gt_quota_quantum); 1391 &quotad_timeo, &tune->gt_quota_quantum);
1358 1392
1359 /* Check for & recover partially truncated inodes */ 1393 /* Check for & recover partially truncated inodes */
@@ -1367,7 +1401,7 @@ int gfs2_quotad(void *data)
1367 spin_lock(&sdp->sd_trunc_lock); 1401 spin_lock(&sdp->sd_trunc_lock);
1368 empty = list_empty(&sdp->sd_trunc_list); 1402 empty = list_empty(&sdp->sd_trunc_list);
1369 spin_unlock(&sdp->sd_trunc_lock); 1403 spin_unlock(&sdp->sd_trunc_lock);
1370 if (empty) 1404 if (empty && !sdp->sd_statfs_force_sync)
1371 t -= schedule_timeout(t); 1405 t -= schedule_timeout(t);
1372 else 1406 else
1373 t = 0; 1407 t = 0;
@@ -1377,3 +1411,181 @@ int gfs2_quotad(void *data)
1377 return 0; 1411 return 0;
1378} 1412}
1379 1413
1414static int gfs2_quota_get_xstate(struct super_block *sb,
1415 struct fs_quota_stat *fqs)
1416{
1417 struct gfs2_sbd *sdp = sb->s_fs_info;
1418
1419 memset(fqs, 0, sizeof(struct fs_quota_stat));
1420 fqs->qs_version = FS_QSTAT_VERSION;
1421 if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON)
1422 fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
1423 else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
1424 fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
1425 if (sdp->sd_quota_inode) {
1426 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
1427 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
1428 }
1429 fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
 1430 fqs->qs_gquota = fqs->qs_uquota; /* it's the same inode in both cases */
1431 fqs->qs_incoredqs = atomic_read(&qd_lru_count);
1432 return 0;
1433}
1434
1435static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
1436 struct fs_disk_quota *fdq)
1437{
1438 struct gfs2_sbd *sdp = sb->s_fs_info;
1439 struct gfs2_quota_lvb *qlvb;
1440 struct gfs2_quota_data *qd;
1441 struct gfs2_holder q_gh;
1442 int error;
1443
1444 memset(fdq, 0, sizeof(struct fs_disk_quota));
1445
1446 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
1447 return -ESRCH; /* Crazy XFS error code */
1448
1449 if (type == USRQUOTA)
1450 type = QUOTA_USER;
1451 else if (type == GRPQUOTA)
1452 type = QUOTA_GROUP;
1453 else
1454 return -EINVAL;
1455
1456 error = qd_get(sdp, type, id, &qd);
1457 if (error)
1458 return error;
1459 error = do_glock(qd, FORCE, &q_gh);
1460 if (error)
1461 goto out;
1462
1463 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
1464 fdq->d_version = FS_DQUOT_VERSION;
1465 fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA;
1466 fdq->d_id = id;
1467 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
1468 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
1469 fdq->d_bcount = be64_to_cpu(qlvb->qb_value);
1470
1471 gfs2_glock_dq_uninit(&q_gh);
1472out:
1473 qd_put(qd);
1474 return error;
1475}
1476
1477/* GFS2 only supports a subset of the XFS fields */
1478#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
1479
1480static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
1481 struct fs_disk_quota *fdq)
1482{
1483 struct gfs2_sbd *sdp = sb->s_fs_info;
1484 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
1485 struct gfs2_quota_data *qd;
1486 struct gfs2_holder q_gh, i_gh;
1487 unsigned int data_blocks, ind_blocks;
1488 unsigned int blocks = 0;
1489 int alloc_required;
1490 struct gfs2_alloc *al;
1491 loff_t offset;
1492 int error;
1493
1494 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
1495 return -ESRCH; /* Crazy XFS error code */
1496
1497 switch(type) {
1498 case USRQUOTA:
1499 type = QUOTA_USER;
1500 if (fdq->d_flags != XFS_USER_QUOTA)
1501 return -EINVAL;
1502 break;
1503 case GRPQUOTA:
1504 type = QUOTA_GROUP;
1505 if (fdq->d_flags != XFS_GROUP_QUOTA)
1506 return -EINVAL;
1507 break;
1508 default:
1509 return -EINVAL;
1510 }
1511
1512 if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
1513 return -EINVAL;
1514 if (fdq->d_id != id)
1515 return -EINVAL;
1516
1517 error = qd_get(sdp, type, id, &qd);
1518 if (error)
1519 return error;
1520
1521 mutex_lock(&ip->i_inode.i_mutex);
1522 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
1523 if (error)
1524 goto out_put;
1525 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1526 if (error)
1527 goto out_q;
1528
1529 /* Check for existing entry, if none then alloc new blocks */
1530 error = update_qd(sdp, qd);
1531 if (error)
1532 goto out_i;
1533
1534 /* If nothing has changed, this is a no-op */
1535 if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
1536 (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn)))
1537 fdq->d_fieldmask ^= FS_DQ_BSOFT;
1538 if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
1539 (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit)))
1540 fdq->d_fieldmask ^= FS_DQ_BHARD;
1541 if (fdq->d_fieldmask == 0)
1542 goto out_i;
1543
1544 offset = qd2offset(qd);
1545 error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
1546 &alloc_required);
1547 if (error)
1548 goto out_i;
1549 if (alloc_required) {
1550 al = gfs2_alloc_get(ip);
1551 if (al == NULL)
1552 goto out_i;
1553 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
1554 &data_blocks, &ind_blocks);
1555 blocks = al->al_requested = 1 + data_blocks + ind_blocks;
1556 error = gfs2_inplace_reserve(ip);
1557 if (error)
1558 goto out_alloc;
1559 }
1560
1561 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
1562 if (error)
1563 goto out_release;
1564
1565 /* Apply changes */
1566 error = gfs2_adjust_quota(ip, offset, 0, qd, fdq);
1567
1568 gfs2_trans_end(sdp);
1569out_release:
1570 if (alloc_required) {
1571 gfs2_inplace_release(ip);
1572out_alloc:
1573 gfs2_alloc_put(ip);
1574 }
1575out_i:
1576 gfs2_glock_dq_uninit(&i_gh);
1577out_q:
1578 gfs2_glock_dq_uninit(&q_gh);
1579out_put:
1580 mutex_unlock(&ip->i_inode.i_mutex);
1581 qd_put(qd);
1582 return error;
1583}
1584
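gfs2_xquota_set() trims the request down to what actually changes: any field in d_fieldmask whose requested value already matches the on-disk copy is XOR-ed out, and an empty mask short-circuits before any allocation or transaction. A sketch of that trimming follows; the mask bit values follow the XFS quota ABI but are repeated here only for illustration.

/* Drop mask bits whose value already matches; bail if nothing is left. */
#include <stdio.h>

#define FS_DQ_BSOFT 0x2
#define FS_DQ_BHARD 0x8

int main(void)
{
	unsigned int mask = FS_DQ_BSOFT | FS_DQ_BHARD;
	unsigned long long cur_soft = 100, cur_hard = 200;
	unsigned long long new_soft = 100, new_hard = 300;

	if ((mask & FS_DQ_BSOFT) && new_soft == cur_soft)
		mask ^= FS_DQ_BSOFT;	/* unchanged: clear the bit */
	if ((mask & FS_DQ_BHARD) && new_hard == cur_hard)
		mask ^= FS_DQ_BHARD;

	puts(mask ? "write needed" : "no-op");	/* prints: write needed */
	return 0;
}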
1585const struct quotactl_ops gfs2_quotactl_ops = {
1586 .quota_sync = gfs2_quota_sync,
1587 .get_xstate = gfs2_quota_get_xstate,
1588 .get_xquota = gfs2_xquota_get,
1589 .set_xquota = gfs2_xquota_set,
1590};
1591
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 0fa5fa63d0e8..195f60c8bd14 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,13 +25,15 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, 25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
26 u32 uid, u32 gid); 26 u32 uid, u32 gid);
27 27
28extern int gfs2_quota_sync(struct gfs2_sbd *sdp); 28extern int gfs2_quota_sync(struct super_block *sb, int type, int wait);
29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); 29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
30 30
31extern int gfs2_quota_init(struct gfs2_sbd *sdp); 31extern int gfs2_quota_init(struct gfs2_sbd *sdp);
32extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); 32extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
33extern int gfs2_quotad(void *data); 33extern int gfs2_quotad(void *data);
34 34
35extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
36
35static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) 37static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
36{ 38{
37 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 39 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
@@ -50,5 +52,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
50} 52}
51 53
52extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); 54extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
55extern const struct quotactl_ops gfs2_quotactl_ops;
53 56
54#endif /* __QUOTA_DOT_H__ */ 57#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 09fa31965576..4b9bece3d437 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -410,7 +410,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
410 memset(lh, 0, sizeof(struct gfs2_log_header)); 410 memset(lh, 0, sizeof(struct gfs2_log_header));
411 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 411 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
412 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); 412 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
413 lh->lh_header.__pad0 = cpu_to_be64(0);
413 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); 414 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
415 lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
414 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1); 416 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
415 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT); 417 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
416 lh->lh_blkno = cpu_to_be32(lblock); 418 lh->lh_blkno = cpu_to_be32(lblock);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8f1cfb02a6cb..503b842f3ba2 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -591,11 +591,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
591 u64 rgrp_count = ip->i_disksize; 591 u64 rgrp_count = ip->i_disksize;
592 int error; 592 int error;
593 593
594 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { 594 do_div(rgrp_count, sizeof(struct gfs2_rindex));
595 gfs2_consist_inode(ip);
596 return -EIO;
597 }
598
599 clear_rgrpdi(sdp); 595 clear_rgrpdi(sdp);
600 596
601 file_ra_state_init(&ra_state, inode->i_mapping); 597 file_ra_state_init(&ra_state, inode->i_mapping);
@@ -915,7 +911,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
915struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) 911struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
916{ 912{
917 BUG_ON(ip->i_alloc != NULL); 913 BUG_ON(ip->i_alloc != NULL);
918 ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL); 914 ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS);
919 return ip->i_alloc; 915 return ip->i_alloc;
920} 916}
921 917
@@ -1710,11 +1706,16 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
1710{ 1706{
1711 struct gfs2_rgrpd *rgd; 1707 struct gfs2_rgrpd *rgd;
1712 struct gfs2_holder ri_gh, rgd_gh; 1708 struct gfs2_holder ri_gh, rgd_gh;
1709 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1710 int ri_locked = 0;
1713 int error; 1711 int error;
1714 1712
1715 error = gfs2_rindex_hold(sdp, &ri_gh); 1713 if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
1716 if (error) 1714 error = gfs2_rindex_hold(sdp, &ri_gh);
1717 goto fail; 1715 if (error)
1716 goto fail;
1717 ri_locked = 1;
1718 }
1718 1719
1719 error = -EINVAL; 1720 error = -EINVAL;
1720 rgd = gfs2_blk2rgrpd(sdp, no_addr); 1721 rgd = gfs2_blk2rgrpd(sdp, no_addr);
@@ -1730,7 +1731,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
1730 1731
1731 gfs2_glock_dq_uninit(&rgd_gh); 1732 gfs2_glock_dq_uninit(&rgd_gh);
1732fail_rindex: 1733fail_rindex:
1733 gfs2_glock_dq_uninit(&ri_gh); 1734 if (ri_locked)
1735 gfs2_glock_dq_uninit(&ri_gh);
1734fail: 1736fail:
1735 return error; 1737 return error;
1736} 1738}
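The rgrp.c change makes gfs2_check_blk_type() safe to call with the rindex glock already held: it acquires (and therefore releases) the lock only when gfs2_glock_is_locked_by_me() says the caller does not hold it, tracked via ri_locked. A userspace sketch of the take-only-if-not-mine control flow; the owner bookkeeping below is deliberately naive and not a substitute for the glock machinery.

/* Take the lock only if this thread does not already hold it. */
#include <pthread.h>

static pthread_mutex_t ri_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t ri_owner;
static int ri_held;

static int locked_by_me(void)
{
	return ri_held && pthread_equal(ri_owner, pthread_self());
}

static int check_blk_type(void)
{
	int ri_locked = 0;

	if (!locked_by_me()) {		/* caller may already hold it */
		pthread_mutex_lock(&ri_lock);
		ri_owner = pthread_self();
		ri_held = 1;
		ri_locked = 1;
	}

	/* ... look up and verify the resource group here ... */

	if (ri_locked) {		/* drop only what we took */
		ri_held = 0;
		pthread_mutex_unlock(&ri_lock);
	}
	return 0;
}

int main(void)
{
	return check_blk_type();
}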
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4106ddaaa98..f07119d89557 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -10,6 +10,8 @@
10#ifndef __RGRP_DOT_H__ 10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__ 11#define __RGRP_DOT_H__
12 12
13#include <linux/slab.h>
14
13struct gfs2_rgrpd; 15struct gfs2_rgrpd;
14struct gfs2_sbd; 16struct gfs2_sbd;
15struct gfs2_holder; 17struct gfs2_holder;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0ec3ec672de1..50aac606b990 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -21,6 +21,8 @@
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/crc32.h> 22#include <linux/crc32.h>
23#include <linux/time.h> 23#include <linux/time.h>
24#include <linux/wait.h>
25#include <linux/writeback.h>
24 26
25#include "gfs2.h" 27#include "gfs2.h"
26#include "incore.h" 28#include "incore.h"
@@ -70,6 +72,11 @@ enum {
70 Opt_commit, 72 Opt_commit,
71 Opt_err_withdraw, 73 Opt_err_withdraw,
72 Opt_err_panic, 74 Opt_err_panic,
75 Opt_statfs_quantum,
76 Opt_statfs_percent,
77 Opt_quota_quantum,
78 Opt_barrier,
79 Opt_nobarrier,
73 Opt_error, 80 Opt_error,
74}; 81};
75 82
@@ -101,18 +108,23 @@ static const match_table_t tokens = {
101 {Opt_commit, "commit=%d"}, 108 {Opt_commit, "commit=%d"},
102 {Opt_err_withdraw, "errors=withdraw"}, 109 {Opt_err_withdraw, "errors=withdraw"},
103 {Opt_err_panic, "errors=panic"}, 110 {Opt_err_panic, "errors=panic"},
111 {Opt_statfs_quantum, "statfs_quantum=%d"},
112 {Opt_statfs_percent, "statfs_percent=%d"},
113 {Opt_quota_quantum, "quota_quantum=%d"},
114 {Opt_barrier, "barrier"},
115 {Opt_nobarrier, "nobarrier"},
104 {Opt_error, NULL} 116 {Opt_error, NULL}
105}; 117};
106 118
107/** 119/**
108 * gfs2_mount_args - Parse mount options 120 * gfs2_mount_args - Parse mount options
109 * @sdp: 121 * @args: The structure into which the parsed options will be written
110 * @data: 122 * @options: The options to parse
111 * 123 *
112 * Return: errno 124 * Return: errno
113 */ 125 */
114 126
115int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) 127int gfs2_mount_args(struct gfs2_args *args, char *options)
116{ 128{
117 char *o; 129 char *o;
118 int token; 130 int token;
@@ -157,7 +169,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
157 break; 169 break;
158 case Opt_debug: 170 case Opt_debug:
159 if (args->ar_errors == GFS2_ERRORS_PANIC) { 171 if (args->ar_errors == GFS2_ERRORS_PANIC) {
160 fs_info(sdp, "-o debug and -o errors=panic " 172 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
161 "are mutually exclusive.\n"); 173 "are mutually exclusive.\n");
162 return -EINVAL; 174 return -EINVAL;
163 } 175 }
@@ -210,7 +222,29 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
210 case Opt_commit: 222 case Opt_commit:
211 rv = match_int(&tmp[0], &args->ar_commit); 223 rv = match_int(&tmp[0], &args->ar_commit);
212 if (rv || args->ar_commit <= 0) { 224 if (rv || args->ar_commit <= 0) {
213 fs_info(sdp, "commit mount option requires a positive numeric argument\n"); 225 printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n");
226 return rv ? rv : -EINVAL;
227 }
228 break;
229 case Opt_statfs_quantum:
230 rv = match_int(&tmp[0], &args->ar_statfs_quantum);
231 if (rv || args->ar_statfs_quantum < 0) {
232 printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n");
233 return rv ? rv : -EINVAL;
234 }
235 break;
236 case Opt_quota_quantum:
237 rv = match_int(&tmp[0], &args->ar_quota_quantum);
238 if (rv || args->ar_quota_quantum <= 0) {
239 printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n");
240 return rv ? rv : -EINVAL;
241 }
242 break;
243 case Opt_statfs_percent:
244 rv = match_int(&tmp[0], &args->ar_statfs_percent);
245 if (rv || args->ar_statfs_percent < 0 ||
246 args->ar_statfs_percent > 100) {
247 printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n");
214 return rv ? rv : -EINVAL; 248 return rv ? rv : -EINVAL;
215 } 249 }
216 break; 250 break;
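The new mount options follow the existing pattern: match the token, parse the integer, range-check it, and reject with -EINVAL otherwise (statfs_quantum accepts zero, which selects the slow statfs path; quota_quantum must be strictly positive). A standalone sketch of one such option parser, using strtol in place of the kernel's match_int():

/* Parse "statfs_quantum=<n>" with the same range check as above. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_statfs_quantum(const char *opt, int *out)
{
	const char *pfx = "statfs_quantum=";
	char *end;
	long v;

	if (strncmp(opt, pfx, strlen(pfx)) != 0)
		return 1;			/* not this option */
	v = strtol(opt + strlen(pfx), &end, 10);
	if (*end != '\0' || v < 0)
		return -EINVAL;			/* must be non-negative */
	*out = (int)v;
	return 0;
}

int main(void)
{
	int q;

	if (parse_statfs_quantum("statfs_quantum=30", &q) == 0)
		printf("statfs_quantum = %d\n", q);
	return 0;
}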
@@ -219,15 +253,21 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
219 break; 253 break;
220 case Opt_err_panic: 254 case Opt_err_panic:
221 if (args->ar_debug) { 255 if (args->ar_debug) {
222 fs_info(sdp, "-o debug and -o errors=panic " 256 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
223 "are mutually exclusive.\n"); 257 "are mutually exclusive.\n");
224 return -EINVAL; 258 return -EINVAL;
225 } 259 }
226 args->ar_errors = GFS2_ERRORS_PANIC; 260 args->ar_errors = GFS2_ERRORS_PANIC;
227 break; 261 break;
262 case Opt_barrier:
263 args->ar_nobarrier = 0;
264 break;
265 case Opt_nobarrier:
266 args->ar_nobarrier = 1;
267 break;
228 case Opt_error: 268 case Opt_error:
229 default: 269 default:
230 fs_info(sdp, "invalid mount option: %s\n", o); 270 printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o);
231 return -EINVAL; 271 return -EINVAL;
232 } 272 }
233 } 273 }
@@ -442,7 +482,10 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
442{ 482{
443 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); 483 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
444 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; 484 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
485 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
445 struct buffer_head *l_bh; 486 struct buffer_head *l_bh;
487 s64 x, y;
488 int need_sync = 0;
446 int error; 489 int error;
447 490
448 error = gfs2_meta_inode_buffer(l_ip, &l_bh); 491 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
@@ -456,9 +499,17 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
456 l_sc->sc_free += free; 499 l_sc->sc_free += free;
457 l_sc->sc_dinodes += dinodes; 500 l_sc->sc_dinodes += dinodes;
458 gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode)); 501 gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
502 if (sdp->sd_args.ar_statfs_percent) {
503 x = 100 * l_sc->sc_free;
504 y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent;
505 if (x >= y || x <= -y)
506 need_sync = 1;
507 }
459 spin_unlock(&sdp->sd_statfs_spin); 508 spin_unlock(&sdp->sd_statfs_spin);
460 509
461 brelse(l_bh); 510 brelse(l_bh);
511 if (need_sync)
512 gfs2_wake_up_statfs(sdp);
462} 513}
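The statfs_percent trigger above compares the local delta against a percentage of the master count without any division: with x = 100 * l_sc->sc_free and y = m_sc->sc_free * percent, the test |x| >= y is equivalent to |local| >= (percent/100) * master. A worked example with made-up numbers:

/* percent = 5, master free = 1,000,000 blocks, local delta = -60,000. */
#include <stdio.h>

int main(void)
{
	long long master_free = 1000000;	/* m_sc->sc_free */
	long long local_free = -60000;		/* l_sc->sc_free */
	int percent = 5;			/* ar_statfs_percent */

	long long x = 100 * local_free;		/* -6,000,000 */
	long long y = master_free * percent;	/*  5,000,000 */
	int need_sync = (x >= y || x <= -y);	/* |x| >= y */

	printf("need_sync = %d\n", need_sync);	/* 1: 6% is over 5% */
	return 0;
}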
463 514
464void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, 515void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
@@ -484,8 +535,9 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
484 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); 535 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
485} 536}
486 537
487int gfs2_statfs_sync(struct gfs2_sbd *sdp) 538int gfs2_statfs_sync(struct super_block *sb, int type)
488{ 539{
540 struct gfs2_sbd *sdp = sb->s_fs_info;
489 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); 541 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
490 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); 542 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
491 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; 543 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
@@ -521,6 +573,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
521 goto out_bh2; 573 goto out_bh2;
522 574
523 update_statfs(sdp, m_bh, l_bh); 575 update_statfs(sdp, m_bh, l_bh);
576 sdp->sd_statfs_force_sync = 0;
524 577
525 gfs2_trans_end(sdp); 578 gfs2_trans_end(sdp);
526 579
@@ -659,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
659 * Returns: errno 712 * Returns: errno
660 */ 713 */
661 714
662static int gfs2_write_inode(struct inode *inode, int sync) 715static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
663{ 716{
664 struct gfs2_inode *ip = GFS2_I(inode); 717 struct gfs2_inode *ip = GFS2_I(inode);
665 struct gfs2_sbd *sdp = GFS2_SB(inode); 718 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -670,8 +723,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
670 int ret = 0; 723 int ret = 0;
671 724
672 /* Check this is a "normal" inode, etc */ 725 /* Check this is a "normal" inode, etc */
673 if (!test_bit(GIF_USER, &ip->i_flags) || 726 if (current->flags & PF_MEMALLOC)
674 (current->flags & PF_MEMALLOC))
675 return 0; 727 return 0;
676 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 728 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
677 if (ret) 729 if (ret)
@@ -694,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
694do_unlock: 746do_unlock:
695 gfs2_glock_dq_uninit(&gh); 747 gfs2_glock_dq_uninit(&gh);
696do_flush: 748do_flush:
697 if (sync != 0) 749 if (wbc->sync_mode == WB_SYNC_ALL)
698 gfs2_log_flush(GFS2_SB(inode), ip->i_gl); 750 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
699 return ret; 751 return ret;
700} 752}
@@ -712,8 +764,8 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
712 int error; 764 int error;
713 765
714 flush_workqueue(gfs2_delete_workqueue); 766 flush_workqueue(gfs2_delete_workqueue);
715 gfs2_quota_sync(sdp); 767 gfs2_quota_sync(sdp->sd_vfs, 0, 1);
716 gfs2_statfs_sync(sdp); 768 gfs2_statfs_sync(sdp->sd_vfs, 0);
717 769
718 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, 770 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
719 &t_gh); 771 &t_gh);
@@ -808,6 +860,7 @@ restart:
808 gfs2_clear_rgrpd(sdp); 860 gfs2_clear_rgrpd(sdp);
809 gfs2_jindex_free(sdp); 861 gfs2_jindex_free(sdp);
810 /* Take apart glock structures and buffer lists */ 862 /* Take apart glock structures and buffer lists */
863 invalidate_inodes(sdp->sd_vfs);
811 gfs2_gl_hash_clear(sdp); 864 gfs2_gl_hash_clear(sdp);
812 /* Unmount the locking protocol */ 865 /* Unmount the locking protocol */
813 gfs2_lm_unmount(sdp); 866 gfs2_lm_unmount(sdp);
@@ -1061,8 +1114,13 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1061 1114
1062 spin_lock(&gt->gt_spin); 1115 spin_lock(&gt->gt_spin);
1063 args.ar_commit = gt->gt_log_flush_secs; 1116 args.ar_commit = gt->gt_log_flush_secs;
1117 args.ar_quota_quantum = gt->gt_quota_quantum;
1118 if (gt->gt_statfs_slow)
1119 args.ar_statfs_quantum = 0;
1120 else
1121 args.ar_statfs_quantum = gt->gt_statfs_quantum;
1064 spin_unlock(&gt->gt_spin); 1122 spin_unlock(&gt->gt_spin);
1065 error = gfs2_mount_args(sdp, &args, data); 1123 error = gfs2_mount_args(&args, data);
1066 if (error) 1124 if (error)
1067 return error; 1125 return error;
1068 1126
@@ -1097,8 +1155,21 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1097 sb->s_flags |= MS_POSIXACL; 1155 sb->s_flags |= MS_POSIXACL;
1098 else 1156 else
1099 sb->s_flags &= ~MS_POSIXACL; 1157 sb->s_flags &= ~MS_POSIXACL;
1158 if (sdp->sd_args.ar_nobarrier)
1159 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1160 else
1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1100 spin_lock(&gt->gt_spin); 1162 spin_lock(&gt->gt_spin);
1101 gt->gt_log_flush_secs = args.ar_commit; 1163 gt->gt_log_flush_secs = args.ar_commit;
1164 gt->gt_quota_quantum = args.ar_quota_quantum;
1165 if (args.ar_statfs_quantum) {
1166 gt->gt_statfs_slow = 0;
1167 gt->gt_statfs_quantum = args.ar_statfs_quantum;
1168 }
1169 else {
1170 gt->gt_statfs_slow = 1;
1171 gt->gt_statfs_quantum = 30;
1172 }
1102 spin_unlock(&gt->gt_spin); 1173 spin_unlock(&gt->gt_spin);
1103 1174
1104 gfs2_online_uevent(sdp); 1175 gfs2_online_uevent(sdp);
@@ -1124,7 +1195,7 @@ static void gfs2_drop_inode(struct inode *inode)
1124{ 1195{
1125 struct gfs2_inode *ip = GFS2_I(inode); 1196 struct gfs2_inode *ip = GFS2_I(inode);
1126 1197
1127 if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { 1198 if (inode->i_nlink) {
1128 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; 1199 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1129 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) 1200 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1130 clear_nlink(inode); 1201 clear_nlink(inode);
@@ -1142,18 +1213,12 @@ static void gfs2_clear_inode(struct inode *inode)
1142{ 1213{
1143 struct gfs2_inode *ip = GFS2_I(inode); 1214 struct gfs2_inode *ip = GFS2_I(inode);
1144 1215
1145 /* This tells us its a "real" inode and not one which only 1216 ip->i_gl->gl_object = NULL;
1146 * serves to contain an address space (see rgrp.c, meta_io.c) 1217 gfs2_glock_put(ip->i_gl);
1147 * which therefore doesn't have its own glocks. 1218 ip->i_gl = NULL;
1148 */ 1219 if (ip->i_iopen_gh.gh_gl) {
1149 if (test_bit(GIF_USER, &ip->i_flags)) { 1220 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1150 ip->i_gl->gl_object = NULL; 1221 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1151 gfs2_glock_put(ip->i_gl);
1152 ip->i_gl = NULL;
1153 if (ip->i_iopen_gh.gh_gl) {
1154 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1155 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1156 }
1157 } 1222 }
1158} 1223}
1159 1224
@@ -1179,7 +1244,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1179{ 1244{
1180 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; 1245 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
1181 struct gfs2_args *args = &sdp->sd_args; 1246 struct gfs2_args *args = &sdp->sd_args;
1182 int lfsecs; 1247 int val;
1183 1248
1184 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) 1249 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
1185 seq_printf(s, ",meta"); 1250 seq_printf(s, ",meta");
@@ -1240,9 +1305,17 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1240 } 1305 }
1241 if (args->ar_discard) 1306 if (args->ar_discard)
1242 seq_printf(s, ",discard"); 1307 seq_printf(s, ",discard");
1243 lfsecs = sdp->sd_tune.gt_log_flush_secs; 1308 val = sdp->sd_tune.gt_log_flush_secs;
1244 if (lfsecs != 60) 1309 if (val != 60)
1245 seq_printf(s, ",commit=%d", lfsecs); 1310 seq_printf(s, ",commit=%d", val);
1311 val = sdp->sd_tune.gt_statfs_quantum;
1312 if (val != 30)
1313 seq_printf(s, ",statfs_quantum=%d", val);
1314 val = sdp->sd_tune.gt_quota_quantum;
1315 if (val != 60)
1316 seq_printf(s, ",quota_quantum=%d", val);
1317 if (args->ar_statfs_percent)
1318 seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent);
1246 if (args->ar_errors != GFS2_ERRORS_DEFAULT) { 1319 if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
1247 const char *state; 1320 const char *state;
1248 1321
@@ -1259,6 +1332,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1259 } 1332 }
1260 seq_printf(s, ",errors=%s", state); 1333 seq_printf(s, ",errors=%s", state);
1261 } 1334 }
1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1336 seq_printf(s, ",nobarrier");
1337
1262 return 0; 1338 return 0;
1263} 1339}
1264 1340
@@ -1277,9 +1353,6 @@ static void gfs2_delete_inode(struct inode *inode)
1277 struct gfs2_holder gh; 1353 struct gfs2_holder gh;
1278 int error; 1354 int error;
1279 1355
1280 if (!test_bit(GIF_USER, &ip->i_flags))
1281 goto out;
1282
1283 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 1356 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1284 if (unlikely(error)) { 1357 if (unlikely(error)) {
1285 gfs2_glock_dq_uninit(&ip->i_iopen_gh); 1358 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
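The hunks above track the VFS change that hands ->write_inode() a struct writeback_control instead of a bare sync flag: gfs2_write_inode() now skips only reclaim-driven writeback (the GIF_USER tests are dropped throughout), and the journal is flushed just for data-integrity writeback. A minimal sketch of the resulting shape, with the gfs2 specifics elided and the function name invented:

    #include <linux/fs.h>
    #include <linux/sched.h>
    #include <linux/writeback.h>

    static int example_write_inode(struct inode *inode,
                                   struct writeback_control *wbc)
    {
            /* Writeback entered from memory reclaim is skipped, as in the hunk. */
            if (current->flags & PF_MEMALLOC)
                    return 0;

            /* ... take the glock exclusively, write the dinode back, unlock ... */

            /* Only a data-integrity sync forces the journal flush. */
            if (wbc->sync_mode == WB_SYNC_ALL) {
                    /* gfs2_log_flush(GFS2_SB(inode), ip->i_gl) in the patch */
            }
            return 0;
    }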
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 235db3682885..3df60f2d84e3 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -27,7 +27,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
27 27
28extern void gfs2_jindex_free(struct gfs2_sbd *sdp); 28extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
29 29
30extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); 30extern int gfs2_mount_args(struct gfs2_args *args, char *data);
31 31
32extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); 32extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
33extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); 33extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
@@ -44,7 +44,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
44 const void *buf); 44 const void *buf);
45extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, 45extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
46 struct buffer_head *l_bh); 46 struct buffer_head *l_bh);
47extern int gfs2_statfs_sync(struct gfs2_sbd *sdp); 47extern int gfs2_statfs_sync(struct super_block *sb, int type);
48 48
49extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); 49extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
50extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); 50extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
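gfs2_quota_sync() and gfs2_statfs_sync() now take the super_block plus extra arguments instead of the gfs2_sbd, presumably so they can sit in generic VFS callback tables. Reading the call sites, the extra arguments look like a quota type and a wait flag; that is an inference from the callers, not something the patch states:

    /* before */
    gfs2_quota_sync(sdp);
    gfs2_statfs_sync(sdp);

    /* after -- (super_block, type[, wait]) by my reading of the call sites */
    gfs2_quota_sync(sdp->sd_vfs, 0, 1);
    gfs2_statfs_sync(sdp->sd_vfs, 0);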
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 446329728d52..54fd98425991 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -8,7 +8,6 @@
8 */ 8 */
9 9
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h> 11#include <linux/spinlock.h>
13#include <linux/completion.h> 12#include <linux/completion.h>
14#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
@@ -49,7 +48,7 @@ static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
49 return a->store ? a->store(sdp, buf, len) : len; 48 return a->store ? a->store(sdp, buf, len) : len;
50} 49}
51 50
52static struct sysfs_ops gfs2_attr_ops = { 51static const struct sysfs_ops gfs2_attr_ops = {
53 .show = gfs2_attr_show, 52 .show = gfs2_attr_show,
54 .store = gfs2_attr_store, 53 .store = gfs2_attr_store,
55}; 54};
@@ -85,11 +84,7 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
85 buf[0] = '\0'; 84 buf[0] = '\0';
86 if (!gfs2_uuid_valid(uuid)) 85 if (!gfs2_uuid_valid(uuid))
87 return 0; 86 return 0;
88 return snprintf(buf, PAGE_SIZE, "%02X%02X%02X%02X-%02X%02X-" 87 return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid);
89 "%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
90 uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5],
91 uuid[6], uuid[7], uuid[8], uuid[9], uuid[10], uuid[11],
92 uuid[12], uuid[13], uuid[14], uuid[15]);
93} 88}
94 89
95static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) 90static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
@@ -158,7 +153,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
158 if (simple_strtol(buf, NULL, 0) != 1) 153 if (simple_strtol(buf, NULL, 0) != 1)
159 return -EINVAL; 154 return -EINVAL;
160 155
161 gfs2_statfs_sync(sdp); 156 gfs2_statfs_sync(sdp->sd_vfs, 0);
162 return len; 157 return len;
163} 158}
164 159
@@ -171,13 +166,14 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
171 if (simple_strtol(buf, NULL, 0) != 1) 166 if (simple_strtol(buf, NULL, 0) != 1)
172 return -EINVAL; 167 return -EINVAL;
173 168
174 gfs2_quota_sync(sdp); 169 gfs2_quota_sync(sdp->sd_vfs, 0, 1);
175 return len; 170 return len;
176} 171}
177 172
178static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, 173static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
179 size_t len) 174 size_t len)
180{ 175{
176 int error;
181 u32 id; 177 u32 id;
182 178
183 if (!capable(CAP_SYS_ADMIN)) 179 if (!capable(CAP_SYS_ADMIN))
@@ -185,13 +181,14 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
185 181
186 id = simple_strtoul(buf, NULL, 0); 182 id = simple_strtoul(buf, NULL, 0);
187 183
188 gfs2_quota_refresh(sdp, 1, id); 184 error = gfs2_quota_refresh(sdp, 1, id);
189 return len; 185 return error ? error : len;
190} 186}
191 187
192static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, 188static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
193 size_t len) 189 size_t len)
194{ 190{
191 int error;
195 u32 id; 192 u32 id;
196 193
197 if (!capable(CAP_SYS_ADMIN)) 194 if (!capable(CAP_SYS_ADMIN))
@@ -199,8 +196,8 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
199 196
200 id = simple_strtoul(buf, NULL, 0); 197 id = simple_strtoul(buf, NULL, 0);
201 198
202 gfs2_quota_refresh(sdp, 0, id); 199 error = gfs2_quota_refresh(sdp, 0, id);
203 return len; 200 return error ? error : len;
204} 201}
205 202
206static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len) 203static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
@@ -480,7 +477,6 @@ TUNE_ATTR(complain_secs, 0);
480TUNE_ATTR(statfs_slow, 0); 477TUNE_ATTR(statfs_slow, 0);
481TUNE_ATTR(new_files_jdata, 0); 478TUNE_ATTR(new_files_jdata, 0);
482TUNE_ATTR(quota_simul_sync, 1); 479TUNE_ATTR(quota_simul_sync, 1);
483TUNE_ATTR(stall_secs, 1);
484TUNE_ATTR(statfs_quantum, 1); 480TUNE_ATTR(statfs_quantum, 1);
485TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 481TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
486 482
@@ -493,7 +489,6 @@ static struct attribute *tune_attrs[] = {
493 &tune_attr_complain_secs.attr, 489 &tune_attr_complain_secs.attr,
494 &tune_attr_statfs_slow.attr, 490 &tune_attr_statfs_slow.attr,
495 &tune_attr_quota_simul_sync.attr, 491 &tune_attr_quota_simul_sync.attr,
496 &tune_attr_stall_secs.attr,
497 &tune_attr_statfs_quantum.attr, 492 &tune_attr_statfs_quantum.attr,
498 &tune_attr_quota_scale.attr, 493 &tune_attr_quota_scale.attr,
499 &tune_attr_new_files_jdata.attr, 494 &tune_attr_new_files_jdata.attr,
@@ -573,18 +568,12 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
573 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 568 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
574 if (!sdp->sd_args.ar_spectator) 569 if (!sdp->sd_args.ar_spectator)
575 add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); 570 add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
576 if (gfs2_uuid_valid(uuid)) { 571 if (gfs2_uuid_valid(uuid))
577 add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" 572 add_uevent_var(env, "UUID=%pUB", uuid);
578 "%02X%02X-%02X%02X%02X%02X%02X%02X",
579 uuid[0], uuid[1], uuid[2], uuid[3], uuid[4],
580 uuid[5], uuid[6], uuid[7], uuid[8], uuid[9],
581 uuid[10], uuid[11], uuid[12], uuid[13],
582 uuid[14], uuid[15]);
583 }
584 return 0; 573 return 0;
585} 574}
586 575
587static struct kset_uevent_ops gfs2_uevent_ops = { 576static const struct kset_uevent_ops gfs2_uevent_ops = {
588 .uevent = gfs2_uevent, 577 .uevent = gfs2_uevent,
589}; 578};
590 579
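Both UUID hunks in this file replace a sixteen-argument snprintf() with %pUB, the kernel vsnprintf extension that formats a 16-byte UUID in upper case with the usual 8-4-4-4-12 grouping. Minimal usage, function name invented:

    #include <linux/kernel.h>
    #include <linux/types.h>

    static void example_log_uuid(const u8 uuid[16])
    {
            pr_info("UUID=%pUB\n", uuid);   /* pointer to 16 UUID bytes */
    }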
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f6a7efa34eb9..53511291fe36 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -21,6 +20,7 @@
21#include "util.h" 20#include "util.h"
22 21
23struct kmem_cache *gfs2_glock_cachep __read_mostly; 22struct kmem_cache *gfs2_glock_cachep __read_mostly;
23struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly;
24struct kmem_cache *gfs2_inode_cachep __read_mostly; 24struct kmem_cache *gfs2_inode_cachep __read_mostly;
25struct kmem_cache *gfs2_bufdata_cachep __read_mostly; 25struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
26struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; 26struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 33e96b0ce9ab..b432e04600de 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -145,6 +145,7 @@ gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__);
145 145
146 146
147extern struct kmem_cache *gfs2_glock_cachep; 147extern struct kmem_cache *gfs2_glock_cachep;
148extern struct kmem_cache *gfs2_glock_aspace_cachep;
148extern struct kmem_cache *gfs2_inode_cachep; 149extern struct kmem_cache *gfs2_inode_cachep;
149extern struct kmem_cache *gfs2_bufdata_cachep; 150extern struct kmem_cache *gfs2_bufdata_cachep;
150extern struct kmem_cache *gfs2_rgrpd_cachep; 151extern struct kmem_cache *gfs2_rgrpd_cachep;
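The new gfs2_glock_aspace_cachep pairs with the GIF_USER removals earlier in this patch: glocks that carry a metadata address space are apparently allocated from their own slab with the mapping embedded, rather than borrowing a private inode. A hypothetical layout, not taken from the patch:

    #include <linux/fs.h>

    struct example_glock { int stub; };     /* stand-in for struct gfs2_glock */

    struct example_glock_aspace {
            struct example_glock    glock;          /* ordinary glock fields */
            struct address_space    mapping;        /* metadata pages attach here */
    };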
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 8a0f8ef6ee27..c2ebdf2c01d4 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -186,8 +186,8 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
186 return 0; 186 return 0;
187} 187}
188 188
189int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, 189static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
190 struct gfs2_ea_location *el) 190 struct gfs2_ea_location *el)
191{ 191{
192 struct ea_find ef; 192 struct ea_find ef;
193 int error; 193 int error;
@@ -516,8 +516,8 @@ out:
516 return error; 516 return error;
517} 517}
518 518
519int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, 519static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
520 char *data, size_t size) 520 char *data, size_t size)
521{ 521{
522 int ret; 522 int ret;
523 size_t len = GFS2_EA_DATA_LEN(el->el_ea); 523 size_t len = GFS2_EA_DATA_LEN(el->el_ea);
@@ -534,21 +534,50 @@ int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
534 return len; 534 return len;
535} 535}
536 536
537int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata)
538{
539 struct gfs2_ea_location el;
540 int error;
541 int len;
542 char *data;
543
544 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el);
545 if (error)
546 return error;
547 if (!el.el_ea)
548 goto out;
549 if (!GFS2_EA_DATA_LEN(el.el_ea))
550 goto out;
551
552 len = GFS2_EA_DATA_LEN(el.el_ea);
553 data = kmalloc(len, GFP_NOFS);
554 error = -ENOMEM;
555 if (data == NULL)
556 goto out;
557
558 error = gfs2_ea_get_copy(ip, &el, data, len);
559 if (error == 0)
560 error = len;
561 *ppdata = data;
562out:
563 brelse(el.el_bh);
564 return error;
565}
566
537/** 567/**
538 * gfs2_xattr_get - Get a GFS2 extended attribute 568 * gfs2_xattr_get - Get a GFS2 extended attribute
539 * @inode: The inode 569 * @inode: The inode
540 * @type: The type of extended attribute
541 * @name: The name of the extended attribute 570 * @name: The name of the extended attribute
542 * @buffer: The buffer to write the result into 571 * @buffer: The buffer to write the result into
543 * @size: The size of the buffer 572 * @size: The size of the buffer
573 * @type: The type of extended attribute
544 * 574 *
545 * Returns: actual size of data on success, -errno on error 575 * Returns: actual size of data on success, -errno on error
546 */ 576 */
547 577static int gfs2_xattr_get(struct dentry *dentry, const char *name,
548int gfs2_xattr_get(struct inode *inode, int type, const char *name, 578 void *buffer, size_t size, int type)
549 void *buffer, size_t size)
550{ 579{
551 struct gfs2_inode *ip = GFS2_I(inode); 580 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
552 struct gfs2_ea_location el; 581 struct gfs2_ea_location el;
553 int error; 582 int error;
554 583
@@ -1089,7 +1118,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1089 1118
1090/** 1119/**
1091 * gfs2_xattr_remove - Remove a GFS2 extended attribute 1120 * gfs2_xattr_remove - Remove a GFS2 extended attribute
1092 * @inode: The inode 1121 * @ip: The inode
1093 * @type: The type of the extended attribute 1122 * @type: The type of the extended attribute
1094 * @name: The name of the extended attribute 1123 * @name: The name of the extended attribute
1095 * 1124 *
@@ -1100,9 +1129,8 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1100 * Returns: 0, or errno on failure 1129 * Returns: 0, or errno on failure
1101 */ 1130 */
1102 1131
1103static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) 1132static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name)
1104{ 1133{
1105 struct gfs2_inode *ip = GFS2_I(inode);
1106 struct gfs2_ea_location el; 1134 struct gfs2_ea_location el;
1107 int error; 1135 int error;
1108 1136
@@ -1126,24 +1154,24 @@ static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
1126} 1154}
1127 1155
1128/** 1156/**
1129 * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute 1157 * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
1130 * @inode: The inode 1158 * @ip: The inode
1131 * @type: The type of the extended attribute
1132 * @name: The name of the extended attribute 1159 * @name: The name of the extended attribute
1133 * @value: The value of the extended attribute (NULL for remove) 1160 * @value: The value of the extended attribute (NULL for remove)
1134 * @size: The size of the @value argument 1161 * @size: The size of the @value argument
1135 * @flags: Create or Replace 1162 * @flags: Create or Replace
1163 * @type: The type of the extended attribute
1136 * 1164 *
1137 * See gfs2_xattr_remove() for details of the removal of xattrs. 1165 * See gfs2_xattr_remove() for details of the removal of xattrs.
1138 * 1166 *
1139 * Returns: 0 or errno on failure 1167 * Returns: 0 or errno on failure
1140 */ 1168 */
1141 1169
1142int gfs2_xattr_set(struct inode *inode, int type, const char *name, 1170int __gfs2_xattr_set(struct inode *inode, const char *name,
1143 const void *value, size_t size, int flags) 1171 const void *value, size_t size, int flags, int type)
1144{ 1172{
1145 struct gfs2_sbd *sdp = GFS2_SB(inode);
1146 struct gfs2_inode *ip = GFS2_I(inode); 1173 struct gfs2_inode *ip = GFS2_I(inode);
1174 struct gfs2_sbd *sdp = GFS2_SB(inode);
1147 struct gfs2_ea_location el; 1175 struct gfs2_ea_location el;
1148 unsigned int namel = strlen(name); 1176 unsigned int namel = strlen(name);
1149 int error; 1177 int error;
@@ -1154,7 +1182,7 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name,
1154 return -ERANGE; 1182 return -ERANGE;
1155 1183
1156 if (value == NULL) 1184 if (value == NULL)
1157 return gfs2_xattr_remove(inode, type, name); 1185 return gfs2_xattr_remove(ip, type, name);
1158 1186
1159 if (ea_check_size(sdp, namel, size)) 1187 if (ea_check_size(sdp, namel, size))
1160 return -ERANGE; 1188 return -ERANGE;
@@ -1194,6 +1222,13 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name,
1194 return error; 1222 return error;
1195} 1223}
1196 1224
1225static int gfs2_xattr_set(struct dentry *dentry, const char *name,
1226 const void *value, size_t size, int flags, int type)
1227{
1228 return __gfs2_xattr_set(dentry->d_inode, name, value,
1229 size, flags, type);
1230}
1231
1197static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, 1232static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1198 struct gfs2_ea_header *ea, char *data) 1233 struct gfs2_ea_header *ea, char *data)
1199{ 1234{
@@ -1259,23 +1294,29 @@ fail:
1259 return error; 1294 return error;
1260} 1295}
1261 1296
1262int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, 1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1263 struct iattr *attr, char *data)
1264{ 1298{
1299 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1300 struct gfs2_ea_location el;
1265 struct buffer_head *dibh; 1301 struct buffer_head *dibh;
1266 int error; 1302 int error;
1267 1303
1268 if (GFS2_EA_IS_STUFFED(el->el_ea)) { 1304 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
1269 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); 1305 if (error)
1270 if (error) 1306 return error;
1271 return error;
1272 1307
1273 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); 1308 if (GFS2_EA_IS_STUFFED(el.el_ea)) {
1274 memcpy(GFS2_EA2DATA(el->el_ea), data, 1309 error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);
1275 GFS2_EA_DATA_LEN(el->el_ea)); 1310 if (error == 0) {
1276 } else 1311 gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1);
1277 error = ea_acl_chmod_unstuffed(ip, el->el_ea, data); 1312 memcpy(GFS2_EA2DATA(el.el_ea), data,
1313 GFS2_EA_DATA_LEN(el.el_ea));
1314 }
1315 } else {
1316 error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
1317 }
1278 1318
1319 brelse(el.el_bh);
1279 if (error) 1320 if (error)
1280 return error; 1321 return error;
1281 1322
@@ -1288,8 +1329,7 @@ int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
1288 brelse(dibh); 1329 brelse(dibh);
1289 } 1330 }
1290 1331
1291 gfs2_trans_end(GFS2_SB(&ip->i_inode)); 1332 gfs2_trans_end(sdp);
1292
1293 return error; 1333 return error;
1294} 1334}
1295 1335
@@ -1495,58 +1535,18 @@ out_alloc:
1495 return error; 1535 return error;
1496} 1536}
1497 1537
1498static int gfs2_xattr_user_get(struct inode *inode, const char *name,
1499 void *buffer, size_t size)
1500{
1501 return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
1502}
1503
1504static int gfs2_xattr_user_set(struct inode *inode, const char *name,
1505 const void *value, size_t size, int flags)
1506{
1507 return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
1508}
1509
1510static int gfs2_xattr_system_get(struct inode *inode, const char *name,
1511 void *buffer, size_t size)
1512{
1513 return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
1514}
1515
1516static int gfs2_xattr_system_set(struct inode *inode, const char *name,
1517 const void *value, size_t size, int flags)
1518{
1519 return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
1520}
1521
1522static int gfs2_xattr_security_get(struct inode *inode, const char *name,
1523 void *buffer, size_t size)
1524{
1525 return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
1526}
1527
1528static int gfs2_xattr_security_set(struct inode *inode, const char *name,
1529 const void *value, size_t size, int flags)
1530{
1531 return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
1532}
1533
1534static struct xattr_handler gfs2_xattr_user_handler = { 1538static struct xattr_handler gfs2_xattr_user_handler = {
1535 .prefix = XATTR_USER_PREFIX, 1539 .prefix = XATTR_USER_PREFIX,
1536 .get = gfs2_xattr_user_get, 1540 .flags = GFS2_EATYPE_USR,
1537 .set = gfs2_xattr_user_set, 1541 .get = gfs2_xattr_get,
1542 .set = gfs2_xattr_set,
1538}; 1543};
1539 1544
1540static struct xattr_handler gfs2_xattr_security_handler = { 1545static struct xattr_handler gfs2_xattr_security_handler = {
1541 .prefix = XATTR_SECURITY_PREFIX, 1546 .prefix = XATTR_SECURITY_PREFIX,
1542 .get = gfs2_xattr_security_get, 1547 .flags = GFS2_EATYPE_SECURITY,
1543 .set = gfs2_xattr_security_set, 1548 .get = gfs2_xattr_get,
1544}; 1549 .set = gfs2_xattr_set,
1545
1546static struct xattr_handler gfs2_xattr_system_handler = {
1547 .prefix = XATTR_SYSTEM_PREFIX,
1548 .get = gfs2_xattr_system_get,
1549 .set = gfs2_xattr_system_set,
1550}; 1550};
1551 1551
1552struct xattr_handler *gfs2_xattr_handlers[] = { 1552struct xattr_handler *gfs2_xattr_handlers[] = {
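The newly added gfs2_xattr_acl_get() allocates the result buffer itself and, by my reading of the code above, returns the data length on success, 0 when the attribute is absent or empty, and -errno on failure, leaving the caller to kfree() the buffer. A usage sketch under that assumption:

    char *data = NULL;
    int len;

    len = gfs2_xattr_acl_get(ip, GFS2_POSIX_ACL_ACCESS, &data);
    if (len < 0)
            return len;             /* -errno */
    if (len > 0) {
            /* ... consume len bytes of ACL data ... */
            kfree(data);
    }
    /* len == 0: no attribute; data was never allocated */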
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index cbdfd7743733..d392f8358f2f 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -53,20 +53,15 @@ struct gfs2_ea_location {
53 struct gfs2_ea_header *el_prev; 53 struct gfs2_ea_header *el_prev;
54}; 54};
55 55
56extern int gfs2_xattr_get(struct inode *inode, int type, const char *name, 56extern int __gfs2_xattr_set(struct inode *inode, const char *name,
57 void *buffer, size_t size); 57 const void *value, size_t size,
58extern int gfs2_xattr_set(struct inode *inode, int type, const char *name, 58 int flags, int type);
59 const void *value, size_t size, int flags);
60extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); 59extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
61extern int gfs2_ea_dealloc(struct gfs2_inode *ip); 60extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
62 61
63/* Exported to acl.c */ 62/* Exported to acl.c */
64 63
65extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, 64extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
66 struct gfs2_ea_location *el); 65extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
67extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
68 char *data, size_t size);
69extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
70 struct iattr *attr, char *data);
71 66
72#endif /* __EATTR_DOT_H__ */ 67#endif /* __EATTR_DOT_H__ */
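The handler table now stores the EA type in .flags and points every prefix at the shared gfs2_xattr_get()/gfs2_xattr_set() pair; the VFS passes the matched handler's .flags back as the trailing type argument, which is what keeps the shared functions generic. A stripped-down sketch of the pattern, names invented:

    #include <linux/errno.h>
    #include <linux/fs.h>
    #include <linux/xattr.h>

    static int example_get(struct dentry *dentry, const char *name,
                           void *buffer, size_t size, int type)
    {
            /* "type" arrives as the matched handler's .flags value */
            return -EOPNOTSUPP;
    }

    static struct xattr_handler example_user_handler = {
            .prefix = XATTR_USER_PREFIX,
            .flags  = 1,            /* e.g. GFS2_EATYPE_USR, echoed back as "type" */
            .get    = example_get,
    };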
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 0d200068d0af..cdb41a1f6a64 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 052f214ea6f0..38a0a9917d7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/log2.h> 13#include <linux/log2.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 6d98f116ca03..424b0337f524 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -289,6 +289,10 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
289 err = hfs_brec_find(&src_fd); 289 err = hfs_brec_find(&src_fd);
290 if (err) 290 if (err)
291 goto out; 291 goto out;
292 if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) {
293 err = -EIO;
294 goto out;
295 }
292 296
293 hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, 297 hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset,
294 src_fd.entrylength); 298 src_fd.entrylength);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 7c69b98a2e45..2b3b8611b41b 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -79,6 +79,11 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
79 filp->f_pos++; 79 filp->f_pos++;
80 /* fall through */ 80 /* fall through */
81 case 1: 81 case 1:
82 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
83 err = -EIO;
84 goto out;
85 }
86
82 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); 87 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
83 if (entry.type != HFS_CDR_THD) { 88 if (entry.type != HFS_CDR_THD) {
84 printk(KERN_ERR "hfs: bad catalog folder thread\n"); 89 printk(KERN_ERR "hfs: bad catalog folder thread\n");
@@ -109,6 +114,12 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
109 err = -EIO; 114 err = -EIO;
110 goto out; 115 goto out;
111 } 116 }
117
118 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
119 err = -EIO;
120 goto out;
121 }
122
112 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); 123 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
113 type = entry.type; 124 type = entry.type;
114 len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); 125 len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName);
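The catalog.c hunk and both dir.c hunks (and hfs_fill_super further down) add the same guard: validate fd.entrylength before hfs_bnode_read() copies the record into an on-stack buffer, so a corrupted catalog b-tree cannot overrun it. Restated as a standalone helper with a hypothetical name:

    #include <linux/errno.h>

    /* entrylength is a signed length read from the b-tree; bufsize is the
     * size of the on-stack record it is about to be copied into. */
    static int example_check_entrylength(int entrylength, size_t bufsize)
    {
            if (entrylength < 0 || (size_t)entrylength > bufsize)
                    return -EIO;    /* treat it as on-disk corruption */
            return 0;
    }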
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 052387e11671..fe35e3b626c4 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops;
188 188
189extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); 189extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
190extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); 190extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
191extern int hfs_write_inode(struct inode *, int); 191extern int hfs_write_inode(struct inode *, struct writeback_control *);
192extern int hfs_inode_setattr(struct dentry *, struct iattr *); 192extern int hfs_inode_setattr(struct dentry *, struct iattr *);
193extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, 193extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
194 __be32 log_size, __be32 phys_size, u32 clump_size); 194 __be32 log_size, __be32 phys_size, u32 clump_size);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a1cbff2b4d99..14f5cb1b9fdc 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
381 HFS_SB(inode->i_sb)->alloc_blksz); 381 HFS_SB(inode->i_sb)->alloc_blksz);
382} 382}
383 383
384int hfs_write_inode(struct inode *inode, int unused) 384int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
385{ 385{
386 struct inode *main_inode = inode; 386 struct inode *main_inode = inode;
387 struct hfs_find_data fd; 387 struct hfs_find_data fd;
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8bbe03c3f6d5..86428f5ac991 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -11,6 +11,7 @@
11#include <linux/cdrom.h> 11#include <linux/cdrom.h>
12#include <linux/genhd.h> 12#include <linux/genhd.h>
13#include <linux/nls.h> 13#include <linux/nls.h>
14#include <linux/slab.h>
14 15
15#include "hfs_fs.h" 16#include "hfs_fs.h"
16#include "btree.h" 17#include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index f7fcbe49da72..0a81eb7111f3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
19#include <linux/nls.h> 19#include <linux/nls.h>
20#include <linux/parser.h> 20#include <linux/parser.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/slab.h>
22#include <linux/smp_lock.h> 23#include <linux/smp_lock.h>
23#include <linux/vfs.h> 24#include <linux/vfs.h>
24 25
@@ -409,8 +410,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
409 /* try to get the root inode */ 410 /* try to get the root inode */
410 hfs_find_init(HFS_SB(sb)->cat_tree, &fd); 411 hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
411 res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); 412 res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd);
412 if (!res) 413 if (!res) {
414 if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
415 res = -EIO;
416 goto bail;
417 }
413 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); 418 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength);
419 }
414 if (res) { 420 if (res) {
415 hfs_find_exit(&fd); 421 hfs_find_exit(&fd);
416 goto bail_no_root; 422 goto bail_no_root;
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 3fcbb0e1f6fc..572628b4b07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -15,6 +15,7 @@
15#include <linux/nls.h> 15#include <linux/nls.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/slab.h>
18#include "hfsplus_fs.h" 19#include "hfsplus_fs.h"
19 20
20enum { 21enum {
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 43022f3d5148..74b473a8ef92 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -87,7 +87,8 @@ bad_inode:
87 return ERR_PTR(err); 87 return ERR_PTR(err);
88} 88}
89 89
90static int hfsplus_write_inode(struct inode *inode, int unused) 90static int hfsplus_write_inode(struct inode *inode,
91 struct writeback_control *wbc)
91{ 92{
92 struct hfsplus_vh *vhdr; 93 struct hfsplus_vh *vhdr;
93 int ret = 0; 94 int ret = 0;
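hfs and hfsplus pick up the same ->write_inode() prototype change applied to gfs2 above; the writeback_control pointer replaces an int that hfs literally named "unused". Assuming the post-change struct super_operations, the wiring itself does not change, only the signature:

    #include <linux/fs.h>
    #include <linux/writeback.h>

    static int example_write_inode(struct inode *inode,
                                   struct writeback_control *wbc);

    static const struct super_operations example_sops = {
            .write_inode    = example_write_inode,
    };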
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 032604e5ef2c..3a029d8f4cf1 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -11,6 +11,7 @@
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/slab.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
15#include <linux/mount.h> 16#include <linux/mount.h>
16#include "hostfs.h" 17#include "hostfs.h"
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 1aa88c4e0964..6a2f04bf3df0 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -353,7 +353,7 @@ int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos,
353} 353}
354 354
355int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos, 355int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos,
356 unsigned len, char *buf) 356 unsigned len, const char *buf)
357{ 357{
358 struct buffer_head *bh; 358 struct buffer_head *bh;
359 char *data; 359 char *data;
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b6fca543544c..eac5f96323e3 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -6,6 +6,7 @@
6 * general buffer i/o 6 * general buffer i/o
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include "hpfs_fn.h" 10#include "hpfs_fn.h"
10 11
11void hpfs_lock_creation(struct super_block *s) 12void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 940d6d150bee..67d9d36b3d5f 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -20,8 +20,8 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
20 20
21 if (l == 1) if (qstr->name[0]=='.') goto x; 21 if (l == 1) if (qstr->name[0]=='.') goto x;
22 if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x; 22 if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x;
23 hpfs_adjust_length((char *)qstr->name, &l); 23 hpfs_adjust_length(qstr->name, &l);
24 /*if (hpfs_chk_name((char *)qstr->name,&l))*/ 24 /*if (hpfs_chk_name(qstr->name,&l))*/
25 /*return -ENAMETOOLONG;*/ 25 /*return -ENAMETOOLONG;*/
26 /*return -ENOENT;*/ 26 /*return -ENOENT;*/
27 x: 27 x:
@@ -38,14 +38,16 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst
38{ 38{
39 unsigned al=a->len; 39 unsigned al=a->len;
40 unsigned bl=b->len; 40 unsigned bl=b->len;
41 hpfs_adjust_length((char *)a->name, &al); 41 hpfs_adjust_length(a->name, &al);
42 /*hpfs_adjust_length((char *)b->name, &bl);*/ 42 /*hpfs_adjust_length(b->name, &bl);*/
43 /* 'a' is the qstr of an already existing dentry, so the name 43 /* 'a' is the qstr of an already existing dentry, so the name
44 * must be valid. 'b' must be validated first. 44 * must be valid. 'b' must be validated first.
45 */ 45 */
46 46
47 if (hpfs_chk_name((char *)b->name, &bl)) return 1; 47 if (hpfs_chk_name(b->name, &bl))
48 if (hpfs_compare_names(dentry->d_sb, (char *)a->name, al, (char *)b->name, bl, 0)) return 1; 48 return 1;
49 if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
50 return 1;
49 return 0; 51 return 0;
50} 52}
51 53
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 8865c94f55f6..2338130cceba 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12static int hpfs_dir_release(struct inode *inode, struct file *filp) 13static int hpfs_dir_release(struct inode *inode, struct file *filp)
@@ -59,7 +60,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
59 struct hpfs_dirent *de; 60 struct hpfs_dirent *de;
60 int lc; 61 int lc;
61 long old_pos; 62 long old_pos;
62 char *tempname; 63 unsigned char *tempname;
63 int c1, c2 = 0; 64 int c1, c2 = 0;
64 int ret = 0; 65 int ret = 0;
65 66
@@ -158,11 +159,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
158 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); 159 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
159 if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) { 160 if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) {
160 filp->f_pos = old_pos; 161 filp->f_pos = old_pos;
161 if (tempname != (char *)de->name) kfree(tempname); 162 if (tempname != de->name) kfree(tempname);
162 hpfs_brelse4(&qbh); 163 hpfs_brelse4(&qbh);
163 goto out; 164 goto out;
164 } 165 }
165 if (tempname != (char *)de->name) kfree(tempname); 166 if (tempname != de->name) kfree(tempname);
166 hpfs_brelse4(&qbh); 167 hpfs_brelse4(&qbh);
167 } 168 }
168out: 169out:
@@ -187,7 +188,7 @@ out:
187 188
188struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 189struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
189{ 190{
190 const char *name = dentry->d_name.name; 191 const unsigned char *name = dentry->d_name.name;
191 unsigned len = dentry->d_name.len; 192 unsigned len = dentry->d_name.len;
192 struct quad_buffer_head qbh; 193 struct quad_buffer_head qbh;
193 struct hpfs_dirent *de; 194 struct hpfs_dirent *de;
@@ -197,7 +198,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
197 struct hpfs_inode_info *hpfs_result; 198 struct hpfs_inode_info *hpfs_result;
198 199
199 lock_kernel(); 200 lock_kernel();
200 if ((err = hpfs_chk_name((char *)name, &len))) { 201 if ((err = hpfs_chk_name(name, &len))) {
201 if (err == -ENAMETOOLONG) { 202 if (err == -ENAMETOOLONG) {
202 unlock_kernel(); 203 unlock_kernel();
203 return ERR_PTR(-ENAMETOOLONG); 204 return ERR_PTR(-ENAMETOOLONG);
@@ -209,7 +210,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
209 * '.' and '..' will never be passed here. 210 * '.' and '..' will never be passed here.
210 */ 211 */
211 212
212 de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *) name, len, NULL, &qbh); 213 de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh);
213 214
214 /* 215 /*
215 * This is not really a bailout, just means file not found. 216 * This is not really a bailout, just means file not found.
@@ -250,7 +251,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
250 hpfs_result = hpfs_i(result); 251 hpfs_result = hpfs_i(result);
251 if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; 252 if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino;
252 253
253 hpfs_decide_conv(result, (char *)name, len); 254 hpfs_decide_conv(result, name, len);
254 255
255 if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) { 256 if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) {
256 hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); 257 hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures");
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index fe83c2b7d2d8..9b2ffadfc8c4 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -158,7 +158,8 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
158 158
159/* Add an entry to dnode and don't care if it grows over 2048 bytes */ 159/* Add an entry to dnode and don't care if it grows over 2048 bytes */
160 160
161struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, unsigned char *name, 161struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
162 const unsigned char *name,
162 unsigned namelen, secno down_ptr) 163 unsigned namelen, secno down_ptr)
163{ 164{
164 struct hpfs_dirent *de; 165 struct hpfs_dirent *de;
@@ -223,7 +224,7 @@ static void fix_up_ptrs(struct super_block *s, struct dnode *d)
223/* Add an entry to dnode and do dnode splitting if required */ 224/* Add an entry to dnode and do dnode splitting if required */
224 225
225static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, 226static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
226 unsigned char *name, unsigned namelen, 227 const unsigned char *name, unsigned namelen,
227 struct hpfs_dirent *new_de, dnode_secno down_ptr) 228 struct hpfs_dirent *new_de, dnode_secno down_ptr)
228{ 229{
229 struct quad_buffer_head qbh, qbh1, qbh2; 230 struct quad_buffer_head qbh, qbh1, qbh2;
@@ -231,7 +232,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
231 dnode_secno adno, rdno; 232 dnode_secno adno, rdno;
232 struct hpfs_dirent *de; 233 struct hpfs_dirent *de;
233 struct hpfs_dirent nde; 234 struct hpfs_dirent nde;
234 char *nname; 235 unsigned char *nname;
235 int h; 236 int h;
236 int pos; 237 int pos;
237 struct buffer_head *bh; 238 struct buffer_head *bh;
@@ -305,7 +306,9 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
305 pos++; 306 pos++;
306 } 307 }
307 copy_de(new_de = &nde, de); 308 copy_de(new_de = &nde, de);
308 memcpy(name = nname, de->name, namelen = de->namelen); 309 memcpy(nname, de->name, de->namelen);
310 name = nname;
311 namelen = de->namelen;
309 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4); 312 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4);
310 down_ptr = adno; 313 down_ptr = adno;
311 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); 314 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0);
@@ -368,7 +371,8 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
368 * I hope, now it's finally bug-free. 371 * I hope, now it's finally bug-free.
369 */ 372 */
370 373
371int hpfs_add_dirent(struct inode *i, unsigned char *name, unsigned namelen, 374int hpfs_add_dirent(struct inode *i,
375 const unsigned char *name, unsigned namelen,
372 struct hpfs_dirent *new_de, int cdepth) 376 struct hpfs_dirent *new_de, int cdepth)
373{ 377{
374 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 378 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
@@ -897,7 +901,8 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp,
897 901
898/* Find a dirent in tree */ 902/* Find a dirent in tree */
899 903
900struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, char *name, unsigned len, 904struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno,
905 const unsigned char *name, unsigned len,
901 dnode_secno *dd, struct quad_buffer_head *qbh) 906 dnode_secno *dd, struct quad_buffer_head *qbh)
902{ 907{
903 struct dnode *dnode; 908 struct dnode *dnode;
@@ -988,8 +993,8 @@ void hpfs_remove_dtree(struct super_block *s, dnode_secno dno)
988struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, 993struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
989 struct fnode *f, struct quad_buffer_head *qbh) 994 struct fnode *f, struct quad_buffer_head *qbh)
990{ 995{
991 char *name1; 996 unsigned char *name1;
992 char *name2; 997 unsigned char *name2;
993 int name1len, name2len; 998 int name1len, name2len;
994 struct dnode *d; 999 struct dnode *d;
995 dnode_secno dno, downd; 1000 dnode_secno dno, downd;
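One dnode.c hunk is more than a cast fix: with name now const-qualified, the old memcpy(name = nname, ...) would pass a const-qualified pointer as memcpy()'s destination, so the assignment is split out of the call. The same dance in isolation:

    #include <linux/string.h>

    static void example_split(unsigned char *nname,
                              const unsigned char *src, unsigned int n)
    {
            const unsigned char *name;

            memcpy(nname, src, n);  /* copy through the writable pointer */
            name = nname;           /* then let the const view alias it */
            (void)name;
    }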
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index 547a8384571f..45e53d972b42 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -62,8 +62,8 @@ static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size)
62 return ret; 62 return ret;
63} 63}
64 64
65static void set_indirect_ea(struct super_block *s, int ano, secno a, char *data, 65static void set_indirect_ea(struct super_block *s, int ano, secno a,
66 int size) 66 const char *data, int size)
67{ 67{
68 hpfs_ea_write(s, a, ano, 0, size, data); 68 hpfs_ea_write(s, a, ano, 0, size, data);
69} 69}
@@ -186,7 +186,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
186 * This driver can't change sizes of eas ('cause I just don't need it). 186 * This driver can't change sizes of eas ('cause I just don't need it).
187 */ 187 */
188 188
189void hpfs_set_ea(struct inode *inode, struct fnode *fnode, char *key, char *data, int size) 189void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
190 const char *data, int size)
190{ 191{
191 fnode_secno fno = inode->i_ino; 192 fnode_secno fno = inode->i_ino;
192 struct super_block *s = inode->i_sb; 193 struct super_block *s = inode->i_sb;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 701ca54c0867..97bf738cd5d6 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -215,7 +215,7 @@ secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_heade
215secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned); 215secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned);
216void hpfs_remove_btree(struct super_block *, struct bplus_header *); 216void hpfs_remove_btree(struct super_block *, struct bplus_header *);
217int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *); 217int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *);
218int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, char *); 218int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *);
219void hpfs_ea_remove(struct super_block *, secno, int, unsigned); 219void hpfs_ea_remove(struct super_block *, secno, int, unsigned);
220void hpfs_truncate_btree(struct super_block *, secno, int, unsigned); 220void hpfs_truncate_btree(struct super_block *, secno, int, unsigned);
221void hpfs_remove_fnode(struct super_block *, fnode_secno fno); 221void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
@@ -244,13 +244,17 @@ extern const struct file_operations hpfs_dir_ops;
244 244
245void hpfs_add_pos(struct inode *, loff_t *); 245void hpfs_add_pos(struct inode *, loff_t *);
246void hpfs_del_pos(struct inode *, loff_t *); 246void hpfs_del_pos(struct inode *, loff_t *);
247struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, unsigned char *, unsigned, secno); 247struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *,
248int hpfs_add_dirent(struct inode *, unsigned char *, unsigned, struct hpfs_dirent *, int); 248 const unsigned char *, unsigned, secno);
249int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned,
250 struct hpfs_dirent *, int);
249int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int); 251int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int);
250void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *); 252void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *);
251dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno); 253dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno);
252struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *); 254struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *);
253struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, char *, unsigned, dnode_secno *, struct quad_buffer_head *); 255struct hpfs_dirent *map_dirent(struct inode *, dnode_secno,
256 const unsigned char *, unsigned, dnode_secno *,
257 struct quad_buffer_head *);
254void hpfs_remove_dtree(struct super_block *, dnode_secno); 258void hpfs_remove_dtree(struct super_block *, dnode_secno);
255struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *); 259struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *);
256 260
@@ -259,7 +263,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct f
259void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned); 263void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned);
260int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int); 264int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int);
261char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *); 265char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *);
262void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int); 266void hpfs_set_ea(struct inode *, struct fnode *, const char *,
267 const char *, int);
263 268
264/* file.c */ 269/* file.c */
265 270
@@ -282,7 +287,7 @@ void hpfs_delete_inode(struct inode *);
282 287
283unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); 288unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
284unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); 289unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
285char *hpfs_load_code_page(struct super_block *, secno); 290unsigned char *hpfs_load_code_page(struct super_block *, secno);
286secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp); 291secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
287struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); 292struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
288struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); 293struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
@@ -292,12 +297,13 @@ dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino);
292/* name.c */ 297/* name.c */
293 298
294unsigned char hpfs_upcase(unsigned char *, unsigned char); 299unsigned char hpfs_upcase(unsigned char *, unsigned char);
295int hpfs_chk_name(unsigned char *, unsigned *); 300int hpfs_chk_name(const unsigned char *, unsigned *);
296char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int); 301unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int);
297int hpfs_compare_names(struct super_block *, unsigned char *, unsigned, unsigned char *, unsigned, int); 302int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned,
298int hpfs_is_name_long(unsigned char *, unsigned); 303 const unsigned char *, unsigned, int);
299void hpfs_adjust_length(unsigned char *, unsigned *); 304int hpfs_is_name_long(const unsigned char *, unsigned);
300void hpfs_decide_conv(struct inode *, unsigned char *, unsigned); 305void hpfs_adjust_length(const unsigned char *, unsigned *);
306void hpfs_decide_conv(struct inode *, const unsigned char *, unsigned);
301 307
302/* namei.c */ 308/* namei.c */
303 309
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index fe703ae46bc7..1042a9bc97f3 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12void hpfs_init_inode(struct inode *i) 13void hpfs_init_inode(struct inode *i)
@@ -46,7 +47,7 @@ void hpfs_read_inode(struct inode *i)
46 struct fnode *fnode; 47 struct fnode *fnode;
47 struct super_block *sb = i->i_sb; 48 struct super_block *sb = i->i_sb;
48 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 49 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
49 unsigned char *ea; 50 void *ea;
50 int ea_size; 51 int ea_size;
51 52
52 if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) { 53 if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) {
@@ -112,7 +113,7 @@ void hpfs_read_inode(struct inode *i)
112 } 113 }
113 } 114 }
114 if (fnode->dirflag) { 115 if (fnode->dirflag) {
115 unsigned n_dnodes, n_subdirs; 116 int n_dnodes, n_subdirs;
116 i->i_mode |= S_IFDIR; 117 i->i_mode |= S_IFDIR;
117 i->i_op = &hpfs_dir_iops; 118 i->i_op = &hpfs_dir_iops;
118 i->i_fop = &hpfs_dir_ops; 119 i->i_fop = &hpfs_dir_ops;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index c4724589b2eb..840d033ecee8 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -35,7 +35,7 @@ unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
35 * lowercasing table 35 * lowercasing table
36 */ 36 */
37 37
38char *hpfs_load_code_page(struct super_block *s, secno cps) 38unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
39{ 39{
40 struct buffer_head *bh; 40 struct buffer_head *bh;
41 secno cpds; 41 secno cpds;
@@ -71,7 +71,7 @@ char *hpfs_load_code_page(struct super_block *s, secno cps)
71 brelse(bh); 71 brelse(bh);
72 return NULL; 72 return NULL;
73 } 73 }
74 ptr = (char *)cpd + cpd->offs[cpi] + 6; 74 ptr = (unsigned char *)cpd + cpd->offs[cpi] + 6;
75 if (!(cp_table = kmalloc(256, GFP_KERNEL))) { 75 if (!(cp_table = kmalloc(256, GFP_KERNEL))) {
76 printk("HPFS: out of memory for code page table\n"); 76 printk("HPFS: out of memory for code page table\n");
77 brelse(bh); 77 brelse(bh);
@@ -217,7 +217,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno,
217 if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD))) 217 if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD)))
218 if (hpfs_sb(s)->sb_chk) { 218 if (hpfs_sb(s)->sb_chk) {
219 unsigned p, pp = 0; 219 unsigned p, pp = 0;
220 unsigned char *d = (char *)dnode; 220 unsigned char *d = (unsigned char *)dnode;
221 int b = 0; 221 int b = 0;
222 if (dnode->magic != DNODE_MAGIC) { 222 if (dnode->magic != DNODE_MAGIC) {
223 hpfs_error(s, "bad magic on dnode %08x", secno); 223 hpfs_error(s, "bad magic on dnode %08x", secno);
diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c
index 1f4a964384eb..f24736d7a439 100644
--- a/fs/hpfs/name.c
+++ b/fs/hpfs/name.c
@@ -8,16 +8,16 @@
8 8
9#include "hpfs_fn.h" 9#include "hpfs_fn.h"
10 10
11static char *text_postfix[]={ 11static const char *text_postfix[]={
12".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF", 12".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF",
13".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS", 13".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS",
14".RC", ".TEX", ".TXT", ".Y", ""}; 14".RC", ".TEX", ".TXT", ".Y", ""};
15 15
16static char *text_prefix[]={ 16static const char *text_prefix[]={
17"AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ", 17"AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ",
18"MAKEFILE", "READ.ME", "README", "TERMCAP", ""}; 18"MAKEFILE", "READ.ME", "README", "TERMCAP", ""};
19 19
20void hpfs_decide_conv(struct inode *inode, unsigned char *name, unsigned len) 20void hpfs_decide_conv(struct inode *inode, const unsigned char *name, unsigned len)
21{ 21{
22 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); 22 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
23 int i; 23 int i;
@@ -71,7 +71,7 @@ static inline unsigned char locase(unsigned char *dir, unsigned char a)
71 return dir[a]; 71 return dir[a];
72} 72}
73 73
74int hpfs_chk_name(unsigned char *name, unsigned *len) 74int hpfs_chk_name(const unsigned char *name, unsigned *len)
75{ 75{
76 int i; 76 int i;
77 if (*len > 254) return -ENAMETOOLONG; 77 if (*len > 254) return -ENAMETOOLONG;
@@ -83,10 +83,10 @@ int hpfs_chk_name(unsigned char *name, unsigned *len)
83 return 0; 83 return 0;
84} 84}
85 85
86char *hpfs_translate_name(struct super_block *s, unsigned char *from, 86unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from,
87 unsigned len, int lc, int lng) 87 unsigned len, int lc, int lng)
88{ 88{
89 char *to; 89 unsigned char *to;
90 int i; 90 int i;
91 if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { 91 if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) {
92 printk("HPFS: Long name flag mismatch - name "); 92 printk("HPFS: Long name flag mismatch - name ");
@@ -103,8 +103,9 @@ char *hpfs_translate_name(struct super_block *s, unsigned char *from,
103 return to; 103 return to;
104} 104}
105 105
106int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1, 106int hpfs_compare_names(struct super_block *s,
107 unsigned char *n2, unsigned l2, int last) 107 const unsigned char *n1, unsigned l1,
108 const unsigned char *n2, unsigned l2, int last)
108{ 109{
109 unsigned l = l1 < l2 ? l1 : l2; 110 unsigned l = l1 < l2 ? l1 : l2;
110 unsigned i; 111 unsigned i;
@@ -120,7 +121,7 @@ int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1,
120 return 0; 121 return 0;
121} 122}
122 123
123int hpfs_is_name_long(unsigned char *name, unsigned len) 124int hpfs_is_name_long(const unsigned char *name, unsigned len)
124{ 125{
125 int i,j; 126 int i,j;
126 for (i = 0; i < len && name[i] != '.'; i++) 127 for (i = 0; i < len && name[i] != '.'; i++)
@@ -134,7 +135,7 @@ int hpfs_is_name_long(unsigned char *name, unsigned len)
 
 /* OS/2 clears dots and spaces at the end of file name, so we have to */
 
-void hpfs_adjust_length(unsigned char *name, unsigned *len)
+void hpfs_adjust_length(const unsigned char *name, unsigned *len)
 {
 	if (!*len) return;
 	if (*len == 1 && name[0] == '.') return;
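Editorial aside: everything in this fs/hpfs/name.c hunk is const-correctness. Once the helpers accept const unsigned char *, callers can hand them dentry->d_name.name (itself const) directly, which is exactly why the (char *) casts disappear in fs/hpfs/namei.c below. A minimal userspace sketch of the idea, with an invented chk_name() standing in for hpfs_chk_name() (not kernel code):

#include <stdio.h>

/* hypothetical helper mirroring the const-qualified signature above */
static int chk_name(const unsigned char *name, unsigned len)
{
	return len > 0 && len <= 254 && name[0] != '.';
}

int main(void)
{
	const unsigned char *name = (const unsigned char *)"README";

	/* with a non-const parameter this call would need an ugly cast */
	printf("valid: %d\n", chk_name(name, 6));
	return 0;
}

The payoff is purely at compile time: the cast-free calls let the compiler verify that read-only name buffers are never modified.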
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 82b9c4ba9ed0..11c2b4080f65 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -11,7 +11,7 @@
 
 static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh0;
 	struct buffer_head *bh;
@@ -24,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	int r;
 	struct hpfs_dirent dee;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	err = -ENOSPC;
 	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -62,7 +62,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	result->i_mode &= ~0222;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail3;
 	if (r == -1) {
@@ -121,7 +121,7 @@ bail:
 
 static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct inode *result = NULL;
 	struct buffer_head *bh;
@@ -130,7 +130,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	int r;
 	struct hpfs_dirent dee;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len)))
+	if ((err = hpfs_chk_name(name, &len)))
 		return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	err = -ENOSPC;
@@ -155,7 +155,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	result->i_op = &hpfs_file_iops;
 	result->i_fop = &hpfs_file_ops;
 	result->i_nlink = 1;
-	hpfs_decide_conv(result, (char *)name, len);
+	hpfs_decide_conv(result, name, len);
 	hpfs_i(result)->i_parent_dir = dir->i_ino;
 	result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date);
 	result->i_ctime.tv_nsec = 0;
@@ -170,7 +170,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	hpfs_i(result)->mmu_private = 0;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -211,7 +211,7 @@ bail:
 
 static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct buffer_head *bh;
 	struct fnode *fnode;
@@ -220,7 +220,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
 	struct hpfs_dirent dee;
 	struct inode *result = NULL;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -256,7 +256,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
 	init_special_inode(result, mode, rdev);
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -289,7 +289,7 @@ bail:
 
 static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct buffer_head *bh;
 	struct fnode *fnode;
@@ -298,7 +298,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	struct hpfs_dirent dee;
 	struct inode *result;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
 		unlock_kernel();
@@ -335,7 +335,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	result->i_data.a_ops = &hpfs_symlink_aops;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -345,7 +345,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	fnode->len = len;
 	memcpy(fnode->name, name, len > 15 ? 15 : len);
 	fnode->up = dir->i_ino;
-	hpfs_set_ea(result, fnode, "SYMLINK", (char *)symlink, strlen(symlink));
+	hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink));
 	mark_buffer_dirty(bh);
 	brelse(bh);
 
@@ -369,7 +369,7 @@ bail:
 
 static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
@@ -381,12 +381,12 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
 	int err;
 
 	lock_kernel();
-	hpfs_adjust_length((char *)name, &len);
+	hpfs_adjust_length(name, &len);
 again:
 	mutex_lock(&hpfs_i(inode)->i_parent_mutex);
 	mutex_lock(&hpfs_i(dir)->i_mutex);
 	err = -ENOENT;
-	de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh);
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
 	if (!de)
 		goto out;
 
@@ -413,22 +413,25 @@ again:
 
 	mutex_unlock(&hpfs_i(dir)->i_mutex);
 	mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
-	d_drop(dentry);
-	spin_lock(&dentry->d_lock);
-	if (atomic_read(&dentry->d_count) > 1 ||
-	    generic_permission(inode, MAY_WRITE, NULL) ||
+	dentry_unhash(dentry);
+	if (!d_unhashed(dentry)) {
+		dput(dentry);
+		unlock_kernel();
+		return -ENOSPC;
+	}
+	if (generic_permission(inode, MAY_WRITE, NULL) ||
 	    !S_ISREG(inode->i_mode) ||
 	    get_write_access(inode)) {
-		spin_unlock(&dentry->d_lock);
 		d_rehash(dentry);
+		dput(dentry);
 	} else {
 		struct iattr newattrs;
-		spin_unlock(&dentry->d_lock);
 		/*printk("HPFS: truncating file before delete.\n");*/
 		newattrs.ia_size = 0;
 		newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
 		err = notify_change(dentry, &newattrs);
 		put_write_access(inode);
+		dput(dentry);
 		if (!err)
 			goto again;
 	}
@@ -451,7 +454,7 @@ out:
 
 static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
@@ -462,12 +465,12 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err;
 	int r;
 
-	hpfs_adjust_length((char *)name, &len);
+	hpfs_adjust_length(name, &len);
 	lock_kernel();
 	mutex_lock(&hpfs_i(inode)->i_parent_mutex);
 	mutex_lock(&hpfs_i(dir)->i_mutex);
 	err = -ENOENT;
-	de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh);
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
 	if (!de)
 		goto out;
 
@@ -546,10 +549,10 @@ const struct address_space_operations hpfs_symlink_aops = {
 static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		       struct inode *new_dir, struct dentry *new_dentry)
 {
-	char *old_name = (char *)old_dentry->d_name.name;
-	int old_len = old_dentry->d_name.len;
-	char *new_name = (char *)new_dentry->d_name.name;
-	int new_len = new_dentry->d_name.len;
+	const unsigned char *old_name = old_dentry->d_name.name;
+	unsigned old_len = old_dentry->d_name.len;
+	const unsigned char *new_name = new_dentry->d_name.name;
+	unsigned new_len = new_dentry->d_name.len;
 	struct inode *i = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct quad_buffer_head qbh, qbh1;
@@ -560,9 +563,9 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh;
 	struct fnode *fnode;
 	int err;
-	if ((err = hpfs_chk_name((char *)new_name, &new_len))) return err;
+	if ((err = hpfs_chk_name(new_name, &new_len))) return err;
 	err = 0;
-	hpfs_adjust_length((char *)old_name, &old_len);
+	hpfs_adjust_length(old_name, &old_len);
 
 	lock_kernel();
 	/* order doesn't matter, due to VFS exclusion */
@@ -579,7 +582,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto end1;
 	}
 
-	if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) {
+	if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
 		hpfs_error(i->i_sb, "lookup succeeded but map dirent failed");
 		err = -ENOENT;
 		goto end1;
@@ -590,7 +593,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new_inode) {
 		int r;
 		if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) {
-			if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, (char *)new_name, new_len, NULL, &qbh1))) {
+			if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) {
 				clear_nlink(new_inode);
 				copy_de(nde, &de);
 				memcpy(nde->name, new_name, new_len);
@@ -618,7 +621,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	}
 
 	if (new_dir == old_dir)
-		if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) {
+		if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
 			hpfs_unlock_creation(i->i_sb);
 			hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2");
 			err = -ENOENT;
@@ -648,7 +651,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		brelse(bh);
 	}
 	hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv;
-	hpfs_decide_conv(i, (char *)new_name, new_len);
+	hpfs_decide_conv(i, new_name, new_len);
 end1:
 	if (old_dir != new_dir)
 		mutex_unlock(&hpfs_i(new_dir)->i_mutex);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f2feaa06bf26..aa53842c599c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -14,6 +14,8 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -115,15 +117,13 @@ static void hpfs_put_super(struct super_block *s)
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
 {
 	struct quad_buffer_head qbh;
-	unsigned *bits;
-	unsigned i, count;
-	if (!(bits = hpfs_map_4sectors(s, secno, &qbh, 4))) return 0;
-	count = 0;
-	for (i = 0; i < 2048 / sizeof(unsigned); i++) {
-		unsigned b;
-		if (!bits[i]) continue;
-		for (b = bits[i]; b; b>>=1) count += b & 1;
-	}
+	unsigned long *bits;
+	unsigned count;
+
+	bits = hpfs_map_4sectors(s, secno, &qbh, 4);
+	if (!bits)
+		return 0;
+	count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
 	hpfs_brelse4(&qbh);
 	return count;
 }
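The hpfs_count_one_bitmap() rewrite above swaps an open-coded shift-and-count loop for bitmap_weight(), which counts the set bits in the first nbits of an unsigned long array. A rough userspace equivalent, counting whole words only (the kernel helper also masks a trailing partial word) and assuming the GCC/Clang __builtin_popcountl builtin:

#include <stdio.h>

static unsigned weight(const unsigned long *bits, unsigned nwords)
{
	unsigned i, count = 0;

	for (i = 0; i < nwords; i++)
		count += __builtin_popcountl(bits[i]);	/* set bits per word */
	return count;
}

int main(void)
{
	unsigned long map[4] = { 0xffUL, 0x1UL, 0, ~0UL };

	/* prints 9 set bits plus one full word's worth (73 on LP64) */
	printf("%u bits set\n", weight(map, 4));
	return 0;
}

Besides being shorter, the library routine typically compiles down to hardware popcount instructions, which the bit-by-bit loop it replaces could not.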
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index a5089a6dd67a..2e4dfa8593da 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -646,22 +646,27 @@ static const struct super_operations hppfs_sbops = {
 static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
 			  int buflen)
 {
-	struct dentry *proc_dentry;
-
-	proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+	struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
 	return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer,
 						    buflen);
 }
 
 static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-	struct dentry *proc_dentry;
-
-	proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+	struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
 
 	return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd);
 }
 
+static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			   void *cookie)
+{
+	struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+
+	if (proc_dentry->d_inode->i_op->put_link)
+		proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie);
+}
+
 static const struct inode_operations hppfs_dir_iops = {
 	.lookup		= hppfs_lookup,
 };
@@ -669,6 +674,7 @@ static const struct inode_operations hppfs_dir_iops = {
 static const struct inode_operations hppfs_link_iops = {
 	.readlink	= hppfs_readlink,
 	.follow_link	= hppfs_follow_link,
+	.put_link	= hppfs_put_link,
 };
 
674static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) 680static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
@@ -712,7 +718,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	struct vfsmount *proc_mnt;
 	int err = -ENOENT;
 
-	proc_mnt = do_kern_mount("proc", 0, "proc", NULL);
+	proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt);
 	if (IS_ERR(proc_mnt))
 		goto out;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 87a1258953b8..a0bbd3d1b41a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -30,7 +30,6 @@
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/magic.h>
 
 #include <asm/uaccess.h>
@@ -922,7 +921,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
 	int error = -ENOMEM;
 	struct file *file;
 	struct inode *inode;
-	struct dentry *dentry, *root;
+	struct path path;
+	struct dentry *root;
 	struct qstr quick_string;
 
 	*user = NULL;
@@ -944,10 +944,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
 	quick_string.name = name;
 	quick_string.len = strlen(quick_string.name);
 	quick_string.hash = 0;
-	dentry = d_alloc(root, &quick_string);
-	if (!dentry)
+	path.dentry = d_alloc(root, &quick_string);
+	if (!path.dentry)
 		goto out_shm_unlock;
 
+	path.mnt = mntget(hugetlbfs_vfsmount);
 	error = -ENOSPC;
 	inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(),
 				current_fsgid(), S_IFREG | S_IRWXUGO, 0);
@@ -960,24 +961,22 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
 			acctflag))
 		goto out_inode;
 
-	d_instantiate(dentry, inode);
+	d_instantiate(path.dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;
 
 	error = -ENFILE;
-	file = alloc_file(hugetlbfs_vfsmount, dentry,
-			FMODE_WRITE | FMODE_READ,
+	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
 			&hugetlbfs_file_operations);
 	if (!file)
 		goto out_dentry; /* inode is already attached */
-	ima_counts_get(file);
 
 	return file;
 
 out_inode:
 	iput(inode);
 out_dentry:
-	dput(dentry);
+	path_put(&path);
 out_shm_unlock:
 	if (*user) {
 		user_shm_unlock(size, *user);
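The hugetlbfs change above moves from passing a bare dentry plus a global vfsmount to passing a struct path, which bundles the vfsmount and dentry references so a single path_put() on the error path drops both. A toy userspace analogue of that pairing; all types and helpers here are invented for illustration, not kernel API:

#include <stdio.h>

struct mnt  { int refs; };
struct dent { int refs; };
struct path { struct mnt *mnt; struct dent *dentry; };

static void path_put(struct path *p)
{
	p->mnt->refs--;		/* stands in for mntput() */
	p->dentry->refs--;	/* stands in for dput() */
}

int main(void)
{
	struct mnt  m = { .refs = 1 };
	struct dent d = { .refs = 1 };
	struct path p = { .mnt = &m, .dentry = &d };

	path_put(&p);	/* one call releases both halves */
	printf("mnt=%d dentry=%d\n", m.refs, d.refs);
	return 0;
}

Keeping the two refcounted halves in one aggregate removes a whole class of "dropped the dentry but leaked the mount" bugs on error paths like out_dentry above.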
diff --git a/fs/inode.c b/fs/inode.c
index 4d8e3be55976..407bf392e20a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -8,7 +8,6 @@
 #include <linux/mm.h>
 #include <linux/dcache.h>
 #include <linux/init.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/module.h>
@@ -18,7 +17,6 @@
 #include <linux/hash.h>
 #include <linux/swap.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
@@ -114,7 +112,7 @@ static void wake_up_inode(struct inode *inode)
 	 * Prevent speculative execution through spin_unlock(&inode_lock);
 	 */
 	smp_mb();
-	wake_up_bit(&inode->i_state, __I_LOCK);
+	wake_up_bit(&inode->i_state, __I_NEW);
 }
 
120/** 118/**
@@ -157,11 +155,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 
 	if (security_inode_alloc(inode))
 		goto out;
-
-	/* allocate and initialize an i_integrity */
-	if (ima_inode_alloc(inode))
-		goto out_free_security;
-
 	spin_lock_init(&inode->i_lock);
 	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
 
@@ -201,9 +194,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 #endif
 
 	return 0;
-
-out_free_security:
-	security_inode_free(inode);
 out:
 	return -ENOMEM;
 }
@@ -235,7 +225,6 @@ static struct inode *alloc_inode(struct super_block *sb)
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
-	ima_inode_free(inode);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
 #ifdef CONFIG_FS_POSIX_ACL
@@ -324,7 +313,6 @@ void clear_inode(struct inode *inode)
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
 	inode_sync_wait(inode);
-	vfs_dq_drop(inode);
 	if (inode->i_sb->s_op->clear_inode)
 		inode->i_sb->s_op->clear_inode(inode);
 	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -700,17 +688,17 @@ void unlock_new_inode(struct inode *inode)
 	}
 #endif
 	/*
-	 * This is special! We do not need the spinlock when clearing I_LOCK,
+	 * This is special! We do not need the spinlock when clearing I_NEW,
 	 * because we're guaranteed that nobody else tries to do anything about
 	 * the state of the inode when it is locked, as we just created it (so
-	 * there can be no old holders that haven't tested I_LOCK).
+	 * there can be no old holders that haven't tested I_NEW).
 	 * However we must emit the memory barrier so that other CPUs reliably
-	 * see the clearing of I_LOCK after the other inode initialisation has
+	 * see the clearing of I_NEW after the other inode initialisation has
 	 * completed.
 	 */
 	smp_mb();
-	WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
-	inode->i_state &= ~(I_LOCK|I_NEW);
+	WARN_ON(!(inode->i_state & I_NEW));
+	inode->i_state &= ~I_NEW;
 	wake_up_inode(inode);
 }
 EXPORT_SYMBOL(unlock_new_inode);
@@ -741,7 +729,7 @@ static struct inode *get_new_inode(struct super_block *sb,
 		goto set_failed;
 
 	__inode_add_to_lists(sb, head, inode);
-	inode->i_state = I_LOCK|I_NEW;
+	inode->i_state = I_NEW;
 	spin_unlock(&inode_lock);
 
 	/* Return the locked inode with I_NEW set, the
@@ -788,7 +776,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 	if (!old) {
 		inode->i_ino = ino;
 		__inode_add_to_lists(sb, head, inode);
-		inode->i_state = I_LOCK|I_NEW;
+		inode->i_state = I_NEW;
 		spin_unlock(&inode_lock);
 
 		/* Return the locked inode with I_NEW set, the
@@ -1093,7 +1081,7 @@ int insert_inode_locked(struct inode *inode)
 	ino_t ino = inode->i_ino;
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
 
-	inode->i_state |= I_LOCK|I_NEW;
+	inode->i_state |= I_NEW;
 	while (1) {
 		struct hlist_node *node;
 		struct inode *old = NULL;
@@ -1130,7 +1118,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 	struct super_block *sb = inode->i_sb;
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 
-	inode->i_state |= I_LOCK|I_NEW;
+	inode->i_state |= I_NEW;
 
 	while (1) {
 		struct hlist_node *node;
@@ -1221,8 +1209,6 @@ void generic_delete_inode(struct inode *inode)
 
 	if (op->delete_inode) {
 		void (*delete)(struct inode *) = op->delete_inode;
-		if (!is_bad_inode(inode))
-			vfs_dq_init(inode);
 		/* Filesystems implementing their own
 		 * s_op->delete_inode are required to call
 		 * truncate_inode_pages and clear_inode()
@@ -1520,7 +1506,7 @@ EXPORT_SYMBOL(inode_wait);
  * until the deletion _might_ have completed. Callers are responsible
  * to recheck inode state.
  *
- * It doesn't matter if I_LOCK is not set initially, a call to
+ * It doesn't matter if I_NEW is not set initially, a call to
  * wake_up_inode() after removing from the hash list will DTRT.
  *
  * This is called with inode_lock held.
@@ -1528,8 +1514,8 @@ EXPORT_SYMBOL(inode_wait);
 static void __wait_on_freeing_inode(struct inode *inode)
 {
 	wait_queue_head_t *wq;
-	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK);
-	wq = bit_waitqueue(&inode->i_state, __I_LOCK);
+	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+	wq = bit_waitqueue(&inode->i_state, __I_NEW);
 	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 	spin_unlock(&inode_lock);
 	schedule();
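The fs/inode.c hunks above fold the I_LOCK bit into I_NEW; the pattern they preserve is the classic publish protocol spelled out in the unlock_new_inode() comment: finish initialising the object, then clear the flag behind a memory barrier so any waiter that observes the flag cleared also observes the initialised fields. A minimal userspace sketch of that idea using C11 atomics (build with -pthread); the names and single flag bit are illustrative, not kernel API:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define OBJ_NEW 0x1u

struct obj {
	int payload;
	atomic_uint state;
};

static struct obj o = { .state = OBJ_NEW };

static void *waiter(void *arg)
{
	/* spin until NEW clears; acquire pairs with the release below */
	while (atomic_load_explicit(&o.state, memory_order_acquire) & OBJ_NEW)
		;
	printf("payload = %d\n", o.payload);	/* guaranteed to print 42 */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	o.payload = 42;					/* initialise first... */
	atomic_fetch_and_explicit(&o.state, ~OBJ_NEW,
				  memory_order_release);	/* ...then publish */
	pthread_join(t, NULL);
	return 0;
}

The kernel version additionally parks waiters on a per-bit waitqueue instead of spinning, which is why wake_up_bit()/bit_waitqueue() had to be retargeted from __I_LOCK to __I_NEW in the same patch.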
diff --git a/fs/internal.h b/fs/internal.h
index 515175b8b72e..8a03a5447bdf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
 
+extern spinlock_t vfsmount_lock;
+
 /*
  * fs_struct.c
  */
@@ -79,8 +81,16 @@ extern void chroot_fs_refs(struct path *, struct path *);
  * file_table.c
  */
 extern void mark_files_ro(struct super_block *);
+extern struct file *get_empty_filp(void);
 
 /*
  * super.c
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
+
+/*
+ * open.c
+ */
+struct nameidata;
+extern struct file *nameidata_to_filp(struct nameidata *);
+extern void release_open_intent(struct nameidata *);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28d7d21..748cfb92dcc6 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -19,6 +19,7 @@
  * See also Documentation/block/ioprio.txt
  *
  */
+#include <linux/gfp.h>
 #include <linux/kernel.h>
 #include <linux/ioprio.h>
 #include <linux/blkdev.h>
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index defb932eee9a..0b3fa7974fa8 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -36,286 +36,323 @@ static void *zisofs_zlib_workspace;
 static DEFINE_MUTEX(zisofs_zlib_lock);
 
 /*
- * When decompressing, we typically obtain more than one page
- * per reference. We inject the additional pages into the page
- * cache as a form of readahead.
+ * Read data of @inode from @block_start to @block_end and uncompress
+ * to one zisofs block. Store the data in the @pages array with @pcount
+ * entries. Start storing at offset @poffset of the first page.
  */
-static int zisofs_readpage(struct file *file, struct page *page)
+static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
+				      loff_t block_end, int pcount,
+				      struct page **pages, unsigned poffset,
+				      int *errp)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
-	unsigned int maxpage, xpage, fpage, blockindex;
-	unsigned long offset;
-	unsigned long blockptr, blockendptr, cstart, cend, csize;
-	struct buffer_head *bh, *ptrbh[2];
-	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
-	unsigned int bufshift = ISOFS_BUFFER_BITS(inode);
-	unsigned long bufmask = bufsize - 1;
-	int err = -EIO;
-	int i;
-	unsigned int header_size = ISOFS_I(inode)->i_format_parm[0];
 	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
-	/* unsigned long zisofs_block_size = 1UL << zisofs_block_shift; */
-	unsigned int zisofs_block_page_shift = zisofs_block_shift-PAGE_CACHE_SHIFT;
-	unsigned long zisofs_block_pages = 1UL << zisofs_block_page_shift;
-	unsigned long zisofs_block_page_mask = zisofs_block_pages-1;
-	struct page *pages[zisofs_block_pages];
-	unsigned long index = page->index;
-	int indexblocks;
-
-	/* We have already been given one page, this is the one
-	   we must do. */
-	xpage = index & zisofs_block_page_mask;
-	pages[xpage] = page;
-
-	/* The remaining pages need to be allocated and inserted */
-	offset = index & ~zisofs_block_page_mask;
-	blockindex = offset >> zisofs_block_page_shift;
-	maxpage = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
-	/*
-	 * If this page is wholly outside i_size we just return zero;
-	 * do_generic_file_read() will handle this for us
-	 */
-	if (page->index >= maxpage) {
-		SetPageUptodate(page);
-		unlock_page(page);
+	unsigned int bufsize = ISOFS_BUFFER_SIZE(inode);
+	unsigned int bufshift = ISOFS_BUFFER_BITS(inode);
+	unsigned int bufmask = bufsize - 1;
+	int i, block_size = block_end - block_start;
+	z_stream stream = { .total_out = 0,
+			    .avail_in = 0,
+			    .avail_out = 0, };
+	int zerr;
+	int needblocks = (block_size + (block_start & bufmask) + bufmask)
+				>> bufshift;
+	int haveblocks;
+	blkcnt_t blocknum;
+	struct buffer_head *bhs[needblocks + 1];
+	int curbh, curpage;
+
+	if (block_size > deflateBound(1UL << zisofs_block_shift)) {
+		*errp = -EIO;
 		return 0;
 	}
-
-	maxpage = min(zisofs_block_pages, maxpage-offset);
-
-	for ( i = 0 ; i < maxpage ; i++, offset++ ) {
-		if ( i != xpage ) {
-			pages[i] = grab_cache_page_nowait(mapping, offset);
-		}
-		page = pages[i];
-		if ( page ) {
-			ClearPageError(page);
-			kmap(page);
+	/* Empty block? */
+	if (block_size == 0) {
+		for ( i = 0 ; i < pcount ; i++ ) {
+			if (!pages[i])
+				continue;
+			memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE);
+			flush_dcache_page(pages[i]);
+			SetPageUptodate(pages[i]);
 		}
+		return ((loff_t)pcount) << PAGE_CACHE_SHIFT;
 	}
 
-	/* This is the last page filled, plus one; used in case of abort. */
-	fpage = 0;
+	/* Because zlib is not thread-safe, do all the I/O at the top. */
+	blocknum = block_start >> bufshift;
+	memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *));
+	haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
+	ll_rw_block(READ, haveblocks, bhs);
 
-	/* Find the pointer to this specific chunk */
-	/* Note: we're not using isonum_731() here because the data is known aligned */
-	/* Note: header_size is in 32-bit words (4 bytes) */
-	blockptr = (header_size + blockindex) << 2;
-	blockendptr = blockptr + 4;
+	curbh = 0;
+	curpage = 0;
+	/*
+	 * First block is special since it may be fractional. We also wait for
+	 * it before grabbing the zlib mutex; odds are that the subsequent
+	 * blocks are going to come in in short order so we don't hold the zlib
+	 * mutex longer than necessary.
+	 */
 
-	indexblocks = ((blockptr^blockendptr) >> bufshift) ? 2 : 1;
-	ptrbh[0] = ptrbh[1] = NULL;
+	if (!bhs[0])
+		goto b_eio;
 
-	if ( isofs_get_blocks(inode, blockptr >> bufshift, ptrbh, indexblocks) != indexblocks ) {
-		if ( ptrbh[0] ) brelse(ptrbh[0]);
-		printk(KERN_DEBUG "zisofs: Null buffer on reading block table, inode = %lu, block = %lu\n",
-		       inode->i_ino, blockptr >> bufshift);
-		goto eio;
-	}
-	ll_rw_block(READ, indexblocks, ptrbh);
-
-	bh = ptrbh[0];
-	if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-		printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n",
-		       inode->i_ino, blockptr >> bufshift);
-		if ( ptrbh[1] )
-			brelse(ptrbh[1]);
-		goto eio;
-	}
-	cstart = le32_to_cpu(*(__le32 *)(bh->b_data + (blockptr & bufmask)));
-
-	if ( indexblocks == 2 ) {
-		/* We just crossed a block boundary. Switch to the next block */
-		brelse(bh);
-		bh = ptrbh[1];
-		if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-			printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n",
-			       inode->i_ino, blockendptr >> bufshift);
-			goto eio;
-		}
+	wait_on_buffer(bhs[0]);
+	if (!buffer_uptodate(bhs[0])) {
+		*errp = -EIO;
+		goto b_eio;
 	}
-	cend = le32_to_cpu(*(__le32 *)(bh->b_data + (blockendptr & bufmask)));
-	brelse(bh);
 
-	if (cstart > cend)
-		goto eio;
+	stream.workspace = zisofs_zlib_workspace;
+	mutex_lock(&zisofs_zlib_lock);
 
-	csize = cend-cstart;
-
-	if (csize > deflateBound(1UL << zisofs_block_shift))
-		goto eio;
-
-	/* Now page[] contains an array of pages, any of which can be NULL,
-	   and the locks on which we hold.  We should now read the data and
-	   release the pages.  If the pages are NULL the decompressed data
-	   for that particular page should be discarded. */
-
-	if ( csize == 0 ) {
-		/* This data block is empty. */
-
-		for ( fpage = 0 ; fpage < maxpage ; fpage++ ) {
-			if ( (page = pages[fpage]) != NULL ) {
-				memset(page_address(page), 0, PAGE_CACHE_SIZE);
-
-				flush_dcache_page(page);
-				SetPageUptodate(page);
-				kunmap(page);
-				unlock_page(page);
-				if ( fpage == xpage )
-					err = 0; /* The critical page */
-				else
-					page_cache_release(page);
+	zerr = zlib_inflateInit(&stream);
+	if (zerr != Z_OK) {
+		if (zerr == Z_MEM_ERROR)
+			*errp = -ENOMEM;
+		else
+			*errp = -EIO;
+		printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n",
+		       zerr);
+		goto z_eio;
+	}
+
+	while (curpage < pcount && curbh < haveblocks &&
+	       zerr != Z_STREAM_END) {
+		if (!stream.avail_out) {
+			if (pages[curpage]) {
+				stream.next_out = page_address(pages[curpage])
+						+ poffset;
+				stream.avail_out = PAGE_CACHE_SIZE - poffset;
+				poffset = 0;
+			} else {
+				stream.next_out = (void *)&zisofs_sink_page;
+				stream.avail_out = PAGE_CACHE_SIZE;
 			}
 		}
-	} else {
-		/* This data block is compressed. */
-		z_stream stream;
-		int bail = 0, left_out = -1;
-		int zerr;
-		int needblocks = (csize + (cstart & bufmask) + bufmask) >> bufshift;
-		int haveblocks;
-		struct buffer_head *bhs[needblocks+1];
-		struct buffer_head **bhptr;
-
-		/* Because zlib is not thread-safe, do all the I/O at the top. */
-
-		blockptr = cstart >> bufshift;
-		memset(bhs, 0, (needblocks+1)*sizeof(struct buffer_head *));
-		haveblocks = isofs_get_blocks(inode, blockptr, bhs, needblocks);
-		ll_rw_block(READ, haveblocks, bhs);
-
-		bhptr = &bhs[0];
-		bh = *bhptr++;
-
-		/* First block is special since it may be fractional.
-		   We also wait for it before grabbing the zlib
-		   mutex; odds are that the subsequent blocks are
-		   going to come in in short order so we don't hold
-		   the zlib mutex longer than necessary. */
-
-		if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-			printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n",
-			       fpage, xpage, csize);
-			goto b_eio;
-		}
-		stream.next_in = bh->b_data + (cstart & bufmask);
-		stream.avail_in = min(bufsize-(cstart & bufmask), csize);
-		csize -= stream.avail_in;
-
-		stream.workspace = zisofs_zlib_workspace;
-		mutex_lock(&zisofs_zlib_lock);
-
-		zerr = zlib_inflateInit(&stream);
-		if ( zerr != Z_OK ) {
-			if ( err && zerr == Z_MEM_ERROR )
-				err = -ENOMEM;
-			printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n",
-			       zerr);
-			goto z_eio;
+		if (!stream.avail_in) {
+			wait_on_buffer(bhs[curbh]);
+			if (!buffer_uptodate(bhs[curbh])) {
+				*errp = -EIO;
+				break;
+			}
+			stream.next_in = bhs[curbh]->b_data +
+						(block_start & bufmask);
+			stream.avail_in = min_t(unsigned, bufsize -
+						(block_start & bufmask),
+						block_size);
+			block_size -= stream.avail_in;
+			block_start = 0;
 		}
 
-		while ( !bail && fpage < maxpage ) {
-			page = pages[fpage];
-			if ( page )
-				stream.next_out = page_address(page);
-			else
-				stream.next_out = (void *)&zisofs_sink_page;
-			stream.avail_out = PAGE_CACHE_SIZE;
-
-			while ( stream.avail_out ) {
-				int ao, ai;
-				if ( stream.avail_in == 0 && left_out ) {
-					if ( !csize ) {
-						printk(KERN_WARNING "zisofs: ZF read beyond end of input\n");
-						bail = 1;
-						break;
-					} else {
-						bh = *bhptr++;
-						if ( !bh ||
-						     (wait_on_buffer(bh), !buffer_uptodate(bh)) ) {
-							/* Reached an EIO */
-							printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n",
-							       fpage, xpage, csize);
-
-							bail = 1;
-							break;
-						}
-						stream.next_in = bh->b_data;
-						stream.avail_in = min(csize,bufsize);
-						csize -= stream.avail_in;
-					}
-				}
-				ao = stream.avail_out;  ai = stream.avail_in;
-				zerr = zlib_inflate(&stream, Z_SYNC_FLUSH);
-				left_out = stream.avail_out;
-				if ( zerr == Z_BUF_ERROR && stream.avail_in == 0 )
-					continue;
-				if ( zerr != Z_OK ) {
-					/* EOF, error, or trying to read beyond end of input */
-					if ( err && zerr == Z_MEM_ERROR )
-						err = -ENOMEM;
-					if ( zerr != Z_STREAM_END )
-						printk(KERN_DEBUG "zisofs: zisofs_inflate returned %d, inode = %lu, index = %lu, fpage = %d, xpage = %d, avail_in = %d, avail_out = %d, ai = %d, ao = %d\n",
-						       zerr, inode->i_ino, index,
-						       fpage, xpage,
-						       stream.avail_in, stream.avail_out,
-						       ai, ao);
-					bail = 1;
-					break;
+		while (stream.avail_out && stream.avail_in) {
+			zerr = zlib_inflate(&stream, Z_SYNC_FLUSH);
+			if (zerr == Z_BUF_ERROR && stream.avail_in == 0)
+				break;
+			if (zerr == Z_STREAM_END)
+				break;
+			if (zerr != Z_OK) {
+				/* EOF, error, or trying to read beyond end of input */
+				if (zerr == Z_MEM_ERROR)
+					*errp = -ENOMEM;
+				else {
+					printk(KERN_DEBUG
+					       "zisofs: zisofs_inflate returned"
+					       " %d, inode = %lu,"
+					       " page idx = %d, bh idx = %d,"
+					       " avail_in = %d,"
+					       " avail_out = %d\n",
+					       zerr, inode->i_ino, curpage,
+					       curbh, stream.avail_in,
+					       stream.avail_out);
+					*errp = -EIO;
 				}
+				goto inflate_out;
 			}
+		}
 
-			if ( stream.avail_out && zerr == Z_STREAM_END ) {
-				/* Fractional page written before EOF.  This may
-				   be the last page in the file. */
-				memset(stream.next_out, 0, stream.avail_out);
-				stream.avail_out = 0;
+		if (!stream.avail_out) {
+			/* This page completed */
+			if (pages[curpage]) {
+				flush_dcache_page(pages[curpage]);
+				SetPageUptodate(pages[curpage]);
 			}
+			curpage++;
+		}
+		if (!stream.avail_in)
+			curbh++;
+	}
+inflate_out:
+	zlib_inflateEnd(&stream);
 
-			if ( !stream.avail_out ) {
-				/* This page completed */
-				if ( page ) {
-					flush_dcache_page(page);
-					SetPageUptodate(page);
-					kunmap(page);
-					unlock_page(page);
-					if ( fpage == xpage )
-						err = 0; /* The critical page */
-					else
-						page_cache_release(page);
-				}
-				fpage++;
-			}
+z_eio:
+	mutex_unlock(&zisofs_zlib_lock);
+
+b_eio:
+	for (i = 0; i < haveblocks; i++)
+		brelse(bhs[i]);
+	return stream.total_out;
+}
+
+/*
+ * Uncompress data so that pages[full_page] is fully uptodate and possibly
+ * fills in other pages if we have data for them.
+ */
+static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
+			     struct page **pages)
+{
+	loff_t start_off, end_off;
+	loff_t block_start, block_end;
+	unsigned int header_size = ISOFS_I(inode)->i_format_parm[0];
+	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
+	unsigned int blockptr;
+	loff_t poffset = 0;
+	blkcnt_t cstart_block, cend_block;
+	struct buffer_head *bh;
+	unsigned int blkbits = ISOFS_BUFFER_BITS(inode);
+	unsigned int blksize = 1 << blkbits;
+	int err;
+	loff_t ret;
+
+	BUG_ON(!pages[full_page]);
+
+	/*
+	 * We want to read at least 'full_page' page. Because we have to
+	 * uncompress the whole compression block anyway, fill the surrounding
+	 * pages with the data we have anyway...
+	 */
+	start_off = page_offset(pages[full_page]);
+	end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size);
+
+	cstart_block = start_off >> zisofs_block_shift;
+	cend_block = (end_off + (1 << zisofs_block_shift) - 1)
+			>> zisofs_block_shift;
+
+	WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) !=
+		((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK));
+
+	/* Find the pointer to this specific chunk */
+	/* Note: we're not using isonum_731() here because the data is known aligned */
+	/* Note: header_size is in 32-bit words (4 bytes) */
+	blockptr = (header_size + cstart_block) << 2;
+	bh = isofs_bread(inode, blockptr >> blkbits);
+	if (!bh)
+		return -EIO;
+	block_start = le32_to_cpu(*(__le32 *)
+				(bh->b_data + (blockptr & (blksize - 1))));
+
+	while (cstart_block < cend_block && pcount > 0) {
+		/* Load end of the compressed block in the file */
+		blockptr += 4;
+		/* Traversed to next block? */
+		if (!(blockptr & (blksize - 1))) {
+			brelse(bh);
+
+			bh = isofs_bread(inode, blockptr >> blkbits);
+			if (!bh)
+				return -EIO;
+		}
+		block_end = le32_to_cpu(*(__le32 *)
+				(bh->b_data + (blockptr & (blksize - 1))));
+		if (block_start > block_end) {
+			brelse(bh);
+			return -EIO;
+		}
+		err = 0;
+		ret = zisofs_uncompress_block(inode, block_start, block_end,
+					      pcount, pages, poffset, &err);
+		poffset += ret;
+		pages += poffset >> PAGE_CACHE_SHIFT;
+		pcount -= poffset >> PAGE_CACHE_SHIFT;
+		full_page -= poffset >> PAGE_CACHE_SHIFT;
+		poffset &= ~PAGE_CACHE_MASK;
+
+		if (err) {
+			brelse(bh);
+			/*
+			 * Did we finish reading the page we really wanted
+			 * to read?
+			 */
+			if (full_page < 0)
+				return 0;
+			return err;
 		}
-		zlib_inflateEnd(&stream);
 
-	z_eio:
-		mutex_unlock(&zisofs_zlib_lock);
+		block_start = block_end;
+		cstart_block++;
+	}
+
+	if (poffset && *pages) {
+		memset(page_address(*pages) + poffset, 0,
+		       PAGE_CACHE_SIZE - poffset);
+		flush_dcache_page(*pages);
+		SetPageUptodate(*pages);
+	}
+	return 0;
+}
 
-	b_eio:
-		for ( i = 0 ; i < haveblocks ; i++ ) {
-			if ( bhs[i] )
-				brelse(bhs[i]);
+/*
+ * When decompressing, we typically obtain more than one page
+ * per reference. We inject the additional pages into the page
+ * cache as a form of readahead.
+ */
+static int zisofs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	int err;
+	int i, pcount, full_page;
+	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
+	unsigned int zisofs_pages_per_cblock =
+		PAGE_CACHE_SHIFT <= zisofs_block_shift ?
+		(1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0;
+	struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)];
+	pgoff_t index = page->index, end_index;
+
+	end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	/*
+	 * If this page is wholly outside i_size we just return zero;
+	 * do_generic_file_read() will handle this for us
+	 */
+	if (index >= end_index) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		return 0;
+	}
+
+	if (PAGE_CACHE_SHIFT <= zisofs_block_shift) {
+		/* We have already been given one page, this is the one
+		   we must do. */
+		full_page = index & (zisofs_pages_per_cblock - 1);
+		pcount = min_t(int, zisofs_pages_per_cblock,
+			end_index - (index & ~(zisofs_pages_per_cblock - 1)));
+		index -= full_page;
+	} else {
+		full_page = 0;
+		pcount = 1;
+	}
+	pages[full_page] = page;
+
+	for (i = 0; i < pcount; i++, index++) {
+		if (i != full_page)
+			pages[i] = grab_cache_page_nowait(mapping, index);
+		if (pages[i]) {
+			ClearPageError(pages[i]);
+			kmap(pages[i]);
 		}
 	}
 
-eio:
+	err = zisofs_fill_pages(inode, full_page, pcount, pages);
 
 	/* Release any residual pages, do not SetPageUptodate */
-	while ( fpage < maxpage ) {
-		page = pages[fpage];
-		if ( page ) {
-			flush_dcache_page(page);
-			if ( fpage == xpage )
-				SetPageError(page);
-			kunmap(page);
-			unlock_page(page);
-			if ( fpage != xpage )
-				page_cache_release(page);
+	for (i = 0; i < pcount; i++) {
+		if (pages[i]) {
+			flush_dcache_page(pages[i]);
+			if (i == full_page && err)
+				SetPageError(pages[i]);
+			kunmap(pages[i]);
+			unlock_page(pages[i]);
+			if (i != full_page)
+				page_cache_release(pages[i]);
 		}
-		fpage++;
 	}
 
 	/* At this point, err contains 0 or -EIO depending on the "critical" page */
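The heart of the zisofs rewrite above is the refill loop in zisofs_uncompress_block(): keep calling the decompressor, topping up avail_in from the next input block and avail_out from the next output page, until the stream ends. The same loop shape can be demonstrated against userspace zlib (build with -lz); the 64-byte chunk sizes below are arbitrary stand-ins for buffer_heads and page-cache pages:

#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
	unsigned char raw[256], comp[512], out[256];
	z_stream zs;
	uLongf clen = sizeof(comp);
	size_t in_off = 0, out_off = 0;
	int zerr;

	memset(raw, 'x', sizeof(raw));
	if (compress(comp, &clen, raw, sizeof(raw)) != Z_OK)
		return 1;

	memset(&zs, 0, sizeof(zs));
	if (inflateInit(&zs) != Z_OK)
		return 1;
	do {
		if (zs.avail_in == 0 && in_off < clen) {	/* next "block" */
			zs.next_in = comp + in_off;
			zs.avail_in = (clen - in_off < 64) ? clen - in_off : 64;
			in_off += zs.avail_in;
		}
		if (zs.avail_out == 0 && out_off < sizeof(out)) { /* next "page" */
			zs.next_out = out + out_off;
			zs.avail_out = 64;
			out_off += 64;
		}
		zerr = inflate(&zs, Z_SYNC_FLUSH);
	} while (zerr == Z_OK);
	inflateEnd(&zs);

	printf("%s: %lu bytes out\n",
	       zerr == Z_STREAM_END ? "ok" : "error", zs.total_out);
	return zerr == Z_STREAM_END ? 0 : 1;
}

Because the stream state carries over between calls, input and output chunk boundaries need not line up, which is exactly what lets the kernel version support compression blocks larger than PAGE_CACHE_SIZE.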
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 8ba5441063be..b9ab69b3a482 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -11,6 +11,7 @@
  * isofs directory handling functions
  */
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index e81a30593ba9..ed752cb38474 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -9,7 +9,7 @@
  *
  * The following files are helpful:
  *
- *	Documentation/filesystems/Exporting
+ *	Documentation/filesystems/nfs/Exporting
  *	fs/exportfs/expfs.c.
  */
 
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index eaa831311c9c..ab438beb867c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 /*
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index c2fb2dd0131f..96a685c550fd 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -518,8 +518,7 @@ repeat:
 			if (algo == SIG('p', 'z')) {
 				int block_shift =
 					isonum_711(&rr->u.ZF.parms[1]);
-				if (block_shift < PAGE_CACHE_SHIFT
-				    || block_shift > 17) {
+				if (block_shift > 17) {
 					printk(KERN_WARNING "isofs: "
 						"Can't handle ZF block "
 						"size of 2^%d\n",
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 4bd882548c45..ecb44c94ba8d 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -17,7 +17,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
@@ -862,12 +861,12 @@ restart_loop:
 		/* A buffer which has been freed while still being
 		 * journaled by a previous transaction may end up still
 		 * being dirty here, but we want to avoid writing back
-		 * that buffer in the future now that the last use has
-		 * been committed.  That's not only a performance gain,
-		 * it also stops aliasing problems if the buffer is left
-		 * behind for writeback and gets reallocated for another
+		 * that buffer in the future after the "add to orphan"
+		 * operation has been committed.  That's not only a performance
+		 * gain, it also stops aliasing problems if the buffer is
+		 * left behind for writeback and gets reallocated for another
 		 * use in a different page. */
-		if (buffer_freed(bh)) {
+		if (buffer_freed(bh) && !jh->b_next_transaction) {
 			clear_buffer_freed(bh);
 			clear_buffer_jbddirty(bh);
 		}
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 4160afad6d00..bd224eec9b07 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1913,7 +1913,7 @@ static void __init jbd_create_debugfs_entry(void)
 {
 	jbd_debugfs_dir = debugfs_create_dir("jbd", NULL);
 	if (jbd_debugfs_dir)
-		jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO,
+		jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR,
 					      jbd_debugfs_dir,
 					      &journal_enable_debug);
 }
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index cb1a49ae605e..54c9bc9e1b17 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #endif
 
 /*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 006f9ad838a2..5ae71e75a491 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1398,7 +1398,7 @@ int journal_stop(handle_t *handle)
  * the case where our storage is so fast that it is more optimal to go
  * ahead and force a flush and wait for the transaction to be committed
  * than it is to wait for an arbitrary amount of time for new writers to
- * join the transaction. We acheive this by measuring how long it takes
+ * join the transaction. We achieve this by measuring how long it takes
  * to commit a transaction, and compare it with how long this
  * transaction has been running, and if run time < commit time then we
  * sleep for the delta and commit. This greatly helps super fast disks
@@ -1864,6 +1864,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 	if (!jh)
 		goto zap_buffer_no_jh;
 
+	/*
+	 * We cannot remove the buffer from checkpoint lists until the
+	 * transaction adding inode to orphan list (let's call it T)
+	 * is committed.  Otherwise if the transaction changing the
+	 * buffer would be cleaned from the journal before T is
+	 * committed, a crash will cause that the correct contents of
+	 * the buffer will be lost.  On the other hand we have to
+	 * clear the buffer dirty bit at latest at the moment when the
+	 * transaction marking the buffer as freed in the filesystem
+	 * structures is committed because from that moment on the
+	 * buffer can be reallocated and used by a different page.
+	 * Since the block hasn't been freed yet but the inode has
+	 * already been added to orphan list, it is safe for us to add
+	 * the buffer to BJ_Forget list of the newest transaction.
+	 */
 	transaction = jh->b_transaction;
 	if (transaction == NULL) {
 		/* First case: not on any transaction.  If it
@@ -1929,16 +1944,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1929 goto zap_buffer; 1944 goto zap_buffer;
1930 } 1945 }
1931 /* 1946 /*
1932 * If it is committing, we simply cannot touch it. We 1947 * The buffer is committing, we simply cannot touch
1933 * can remove it's next_transaction pointer from the 1948 * it. So we just set j_next_transaction to the
1934 * running transaction if that is set, but nothing 1949 * running transaction (if there is one) and mark
1935 * else. */ 1950 * buffer as freed so that commit code knows it should
1951 * clear dirty bits when it is done with the buffer.
1952 */
1936 set_buffer_freed(bh); 1953 set_buffer_freed(bh);
1937 if (jh->b_next_transaction) { 1954 if (journal->j_running_transaction && buffer_jbddirty(bh))
1938 J_ASSERT(jh->b_next_transaction == 1955 jh->b_next_transaction = journal->j_running_transaction;
1939 journal->j_running_transaction);
1940 jh->b_next_transaction = NULL;
1941 }
1942 journal_put_journal_head(jh); 1956 journal_put_journal_head(jh);
1943 spin_unlock(&journal->j_list_lock); 1957 spin_unlock(&journal->j_list_lock);
1944 jbd_unlock_bh_state(bh); 1958 jbd_unlock_bh_state(bh);
@@ -2120,7 +2134,7 @@ void journal_file_buffer(struct journal_head *jh,
2120 */ 2134 */
2121void __journal_refile_buffer(struct journal_head *jh) 2135void __journal_refile_buffer(struct journal_head *jh)
2122{ 2136{
2123 int was_dirty; 2137 int was_dirty, jlist;
2124 struct buffer_head *bh = jh2bh(jh); 2138 struct buffer_head *bh = jh2bh(jh);
2125 2139
2126 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 2140 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -2142,8 +2156,13 @@ void __journal_refile_buffer(struct journal_head *jh)
2142 __journal_temp_unlink_buffer(jh); 2156 __journal_temp_unlink_buffer(jh);
2143 jh->b_transaction = jh->b_next_transaction; 2157 jh->b_transaction = jh->b_next_transaction;
2144 jh->b_next_transaction = NULL; 2158 jh->b_next_transaction = NULL;
2145 __journal_file_buffer(jh, jh->b_transaction, 2159 if (buffer_freed(bh))
2146 jh->b_modified ? BJ_Metadata : BJ_Reserved); 2160 jlist = BJ_Forget;
2161 else if (jh->b_modified)
2162 jlist = BJ_Metadata;
2163 else
2164 jlist = BJ_Reserved;
2165 __journal_file_buffer(jh, jh->b_transaction, jlist);
2147 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); 2166 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2148 2167
2149 if (was_dirty) 2168 if (was_dirty)
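
Taken together, the transaction.c changes rework how jbd handles a buffer that is truncated away while a previous transaction is still committing: journal_unmap_buffer() now marks it freed and points b_next_transaction at the running transaction, and __journal_refile_buffer() later files it on BJ_Forget so the next commit discards it instead of journaling it again. The new filing decision, restated as a sketch:

    /* Where a buffer goes when it migrates to b_next_transaction: */
    int jlist;

    if (buffer_freed(bh))           /* block freed while still journaled */
            jlist = BJ_Forget;      /* next commit drops it, no rewrite  */
    else if (jh->b_modified)        /* dirtied under the current handle  */
            jlist = BJ_Metadata;    /* next commit writes it to the log  */
    else
            jlist = BJ_Reserved;    /* credits reserved, nothing logged  */
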
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index ca0f5eb62b20..30beb11ef928 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -22,6 +22,7 @@
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/errno.h> 23#include <linux/errno.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/blkdev.h>
25#include <trace/events/jbd2.h> 26#include <trace/events/jbd2.h>
26 27
27/* 28/*
@@ -506,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
506 if (blocknr < journal->j_tail) 507 if (blocknr < journal->j_tail)
507 freed = freed + journal->j_last - journal->j_first; 508 freed = freed + journal->j_last - journal->j_first;
508 509
510 trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
509 jbd_debug(1, 511 jbd_debug(1,
510 "Cleaning journal tail from %d to %d (offset %lu), " 512 "Cleaning journal tail from %d to %d (offset %lu), "
511 "freeing %lu\n", 513 "freeing %lu\n",
@@ -515,6 +517,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
515 journal->j_tail_sequence = first_tid; 517 journal->j_tail_sequence = first_tid;
516 journal->j_tail = blocknr; 518 journal->j_tail = blocknr;
517 spin_unlock(&journal->j_state_lock); 519 spin_unlock(&journal->j_state_lock);
520
521 /*
522 * If there is an external journal, we need to make sure that
523 * any data blocks that were recently written out --- perhaps
524 * by jbd2_log_do_checkpoint() --- are flushed out before we
525 * drop the transactions from the external journal. It's
526 * unlikely this will be necessary, especially with a
527 * appropriately sized journal, but we need this to guarantee
528 * correctness. Fortunately jbd2_cleanup_journal_tail()
529 * doesn't get called all that often.
530 */
531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL);
518 if (!(journal->j_flags & JBD2_ABORT)) 534 if (!(journal->j_flags & JBD2_ABORT))
519 jbd2_journal_update_superblock(journal, 1); 535 jbd2_journal_update_superblock(journal, 1);
520 return 0; 536 return 0;
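
The new flush closes a window specific to external journals: jbd2_log_do_checkpoint() writes data blocks to the filesystem device (j_fs_dev), but the journal tail, and hence the decision to let old transactions be overwritten, lives on the journal device (j_dev). If the tail advances while those blocks still sit in the fs device's volatile write cache, a crash loses both the data and its redo copy. The guard in isolation (note blkdev_issue_flush() took a second error_sector argument in this era):

    /* Only needed when the journal is on a separate device and
     * barriers are in use; a no-op for the common internal journal. */
    if ((journal->j_fs_dev != journal->j_dev) &&
        (journal->j_flags & JBD2_BARRIER))
            blkdev_issue_flush(journal->j_fs_dev, NULL);
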
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d4cfd6d2779e..671da7fb7ffd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal,
259 ret = err; 259 ret = err;
260 spin_lock(&journal->j_list_lock); 260 spin_lock(&journal->j_list_lock);
261 J_ASSERT(jinode->i_transaction == commit_transaction); 261 J_ASSERT(jinode->i_transaction == commit_transaction);
262 commit_transaction->t_flushed_data_blocks = 1;
262 jinode->i_flags &= ~JI_COMMIT_RUNNING; 263 jinode->i_flags &= ~JI_COMMIT_RUNNING;
263 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 264 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
264 } 265 }
@@ -286,7 +287,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
286 if (err) { 287 if (err) {
287 /* 288 /*
288 * Because AS_EIO is cleared by 289 * Because AS_EIO is cleared by
289 * wait_on_page_writeback_range(), set it again so 290 * filemap_fdatawait_range(), set it again so
290 * that user process can get -EIO from fsync(). 291 * that user process can get -EIO from fsync().
291 */ 292 */
292 set_bit(AS_EIO, 293 set_bit(AS_EIO,
@@ -636,6 +637,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
636 JBUFFER_TRACE(jh, "ph3: write metadata"); 637 JBUFFER_TRACE(jh, "ph3: write metadata");
637 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 638 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
638 jh, &new_jh, blocknr); 639 jh, &new_jh, blocknr);
640 if (flags < 0) {
641 jbd2_journal_abort(journal, flags);
642 continue;
643 }
639 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 644 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
640 wbuf[bufs++] = jh2bh(new_jh); 645 wbuf[bufs++] = jh2bh(new_jh);
641 646
@@ -704,8 +709,17 @@ start_journal_io:
704 } 709 }
705 } 710 }
706 711
707 /* Done it all: now write the commit record asynchronously. */ 712 /*
713 * If the journal is not located on the file system device,
714 * then we must flush the file system device before we issue
715 * the commit record
716 */
717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL);
708 721
722 /* Done it all: now write the commit record asynchronously. */
709 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 723 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
710 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 724 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
711 err = journal_submit_commit_record(journal, commit_transaction, 725 err = journal_submit_commit_record(journal, commit_transaction,
@@ -716,13 +730,6 @@ start_journal_io:
716 blkdev_issue_flush(journal->j_dev, NULL); 730 blkdev_issue_flush(journal->j_dev, NULL);
717 } 731 }
718 732
719 /*
720 * This is the right place to wait for data buffers both for ASYNC
721 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
722 * the commit block went to disk (which happens above). If commit is
723 * SYNC, we need to wait for data buffers before we start writing
724 * commit block, which happens below in such setting.
725 */
726 err = journal_finish_inode_data_buffers(journal, commit_transaction); 733 err = journal_finish_inode_data_buffers(journal, commit_transaction);
727 if (err) { 734 if (err) {
728 printk(KERN_WARNING 735 printk(KERN_WARNING
@@ -876,8 +883,7 @@ restart_loop:
876 spin_unlock(&journal->j_list_lock); 883 spin_unlock(&journal->j_list_lock);
877 bh = jh2bh(jh); 884 bh = jh2bh(jh);
878 jbd_lock_bh_state(bh); 885 jbd_lock_bh_state(bh);
879 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || 886 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
880 jh->b_transaction == journal->j_running_transaction);
881 887
882 /* 888 /*
883 * If there is undo-protected committed data against 889 * If there is undo-protected committed data against
@@ -923,12 +929,12 @@ restart_loop:
923 /* A buffer which has been freed while still being 929 /* A buffer which has been freed while still being
924 * journaled by a previous transaction may end up still 930 * journaled by a previous transaction may end up still
925 * being dirty here, but we want to avoid writing back 931 * being dirty here, but we want to avoid writing back
926 * that buffer in the future now that the last use has 932 * that buffer in the future after the "add to orphan"
927 * been committed. That's not only a performance gain, 933 * operation been committed, That's not only a performance
928 * it also stops aliasing problems if the buffer is left 934 * gain, it also stops aliasing problems if the buffer is
929 * behind for writeback and gets reallocated for another 935 * left behind for writeback and gets reallocated for another
930 * use in a different page. */ 936 * use in a different page. */
931 if (buffer_freed(bh)) { 937 if (buffer_freed(bh) && !jh->b_next_transaction) {
932 clear_buffer_freed(bh); 938 clear_buffer_freed(bh);
933 clear_buffer_jbddirty(bh); 939 clear_buffer_jbddirty(bh);
934 } 940 }
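
The commit.c side pairs with the checkpoint change above: journal_submit_data_buffers() now records in t_flushed_data_blocks that ordered data went to the fs device, and before the commit record is issued that device is flushed if the journal is external. The resulting write ordering, sketched as comments:

    /*
     * Commit ordering with an external journal (a sketch, not code):
     *
     *   1. data blocks   -> j_fs_dev   (sets t_flushed_data_blocks)
     *   2. flush j_fs_dev               (hunk added above)
     *   3. commit record -> j_dev
     *   4. flush j_dev                  (JBD2_BARRIER path)
     *
     * Skipping step 2 lets the commit record become durable while the
     * ordered data it vouches for is still in a volatile write cache.
     */
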
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fed85388ee86..c03d4dce4d76 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
39#include <linux/seq_file.h> 39#include <linux/seq_file.h>
40#include <linux/math64.h> 40#include <linux/math64.h>
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/log2.h>
43#include <linux/vmalloc.h>
42 44
43#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
44#include <trace/events/jbd2.h> 46#include <trace/events/jbd2.h>
@@ -78,6 +80,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
78EXPORT_SYMBOL(jbd2_journal_ack_err); 80EXPORT_SYMBOL(jbd2_journal_ack_err);
79EXPORT_SYMBOL(jbd2_journal_clear_err); 81EXPORT_SYMBOL(jbd2_journal_clear_err);
80EXPORT_SYMBOL(jbd2_log_wait_commit); 82EXPORT_SYMBOL(jbd2_log_wait_commit);
83EXPORT_SYMBOL(jbd2_log_start_commit);
81EXPORT_SYMBOL(jbd2_journal_start_commit); 84EXPORT_SYMBOL(jbd2_journal_start_commit);
82EXPORT_SYMBOL(jbd2_journal_force_commit_nested); 85EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
83EXPORT_SYMBOL(jbd2_journal_wipe); 86EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -92,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
92 95
93static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 96static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
94static void __journal_abort_soft (journal_t *journal, int errno); 97static void __journal_abort_soft (journal_t *journal, int errno);
98static int jbd2_journal_create_slab(size_t slab_size);
95 99
96/* 100/*
97 * Helper function used to manage commit timeouts 101 * Helper function used to manage commit timeouts
@@ -358,6 +362,10 @@ repeat:
358 362
359 jbd_unlock_bh_state(bh_in); 363 jbd_unlock_bh_state(bh_in);
360 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 364 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
365 if (!tmp) {
366 jbd2_journal_put_journal_head(new_jh);
367 return -ENOMEM;
368 }
361 jbd_lock_bh_state(bh_in); 369 jbd_lock_bh_state(bh_in);
362 if (jh_in->b_frozen_data) { 370 if (jh_in->b_frozen_data) {
363 jbd2_free(tmp, bh_in->b_size); 371 jbd2_free(tmp, bh_in->b_size);
@@ -809,7 +817,7 @@ static journal_t * journal_init_common (void)
809 journal_t *journal; 817 journal_t *journal;
810 int err; 818 int err;
811 819
812 journal = kzalloc(sizeof(*journal), GFP_KERNEL|__GFP_NOFAIL); 820 journal = kzalloc(sizeof(*journal), GFP_KERNEL);
813 if (!journal) 821 if (!journal)
814 goto fail; 822 goto fail;
815 823
@@ -1243,11 +1251,25 @@ int jbd2_journal_load(journal_t *journal)
1243 } 1251 }
1244 } 1252 }
1245 1253
1254 /*
1255 * Create a slab for this blocksize
1256 */
1257 err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
1258 if (err)
1259 return err;
1260
1246 /* Let the recovery code check whether it needs to recover any 1261 /* Let the recovery code check whether it needs to recover any
1247 * data from the journal. */ 1262 * data from the journal. */
1248 if (jbd2_journal_recover(journal)) 1263 if (jbd2_journal_recover(journal))
1249 goto recovery_error; 1264 goto recovery_error;
1250 1265
1266 if (journal->j_failed_commit) {
1267 printk(KERN_ERR "JBD2: journal transaction %u on %s "
1268 "is corrupt.\n", journal->j_failed_commit,
1269 journal->j_devname);
1270 return -EIO;
1271 }
1272
1251 /* OK, we've finished with the dynamic journal bits: 1273 /* OK, we've finished with the dynamic journal bits:
1252 * reinitialise the dynamic contents of the superblock in memory 1274 * reinitialise the dynamic contents of the superblock in memory
1253 * and reset them on disk. */ 1275 * and reset them on disk. */
@@ -1795,6 +1817,127 @@ size_t journal_tag_bytes(journal_t *journal)
1795} 1817}
1796 1818
1797/* 1819/*
1820 * JBD memory management
1821 *
1822 * These functions are used to allocate block-sized chunks of memory
1823 * used for making copies of buffer_head data. Very often it will be
1824 * page-sized chunks of data, but sometimes it will be in
1825 * sub-page-size chunks. (For example, 16k pages on Power systems
1826 * with a 4k block file system.) For blocks smaller than a page, we
1827 * use a SLAB allocator. There are slab caches for each block size,
1828 * which are allocated at mount time, if necessary, and we only free
1829 * (all of) the slab caches when/if the jbd2 module is unloaded. For
1830 * this reason we don't need to a mutex to protect access to
1831 * jbd2_slab[] allocating or releasing memory; only in
1832 * jbd2_journal_create_slab().
1833 */
1834#define JBD2_MAX_SLABS 8
1835static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
1836static DECLARE_MUTEX(jbd2_slab_create_sem);
1837
1838static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
1839 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
1840 "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
1841};
1842
1843
1844static void jbd2_journal_destroy_slabs(void)
1845{
1846 int i;
1847
1848 for (i = 0; i < JBD2_MAX_SLABS; i++) {
1849 if (jbd2_slab[i])
1850 kmem_cache_destroy(jbd2_slab[i]);
1851 jbd2_slab[i] = NULL;
1852 }
1853}
1854
1855static int jbd2_journal_create_slab(size_t size)
1856{
1857 int i = order_base_2(size) - 10;
1858 size_t slab_size;
1859
1860 if (size == PAGE_SIZE)
1861 return 0;
1862
1863 if (i >= JBD2_MAX_SLABS)
1864 return -EINVAL;
1865
1866 if (unlikely(i < 0))
1867 i = 0;
1868 down(&jbd2_slab_create_sem);
1869 if (jbd2_slab[i]) {
1870 up(&jbd2_slab_create_sem);
1871 return 0; /* Already created */
1872 }
1873
1874 slab_size = 1 << (i+10);
1875 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
1876 slab_size, 0, NULL);
1877 up(&jbd2_slab_create_sem);
1878 if (!jbd2_slab[i]) {
1879 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
1880 return -ENOMEM;
1881 }
1882 return 0;
1883}
1884
1885static struct kmem_cache *get_slab(size_t size)
1886{
1887 int i = order_base_2(size) - 10;
1888
1889 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0))
1891 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0);
1893 return jbd2_slab[i];
1894}
1895
1896void *jbd2_alloc(size_t size, gfp_t flags)
1897{
1898 void *ptr;
1899
1900 BUG_ON(size & (size-1)); /* Must be a power of 2 */
1901
1902 flags |= __GFP_REPEAT;
1903 if (size == PAGE_SIZE)
1904 ptr = (void *)__get_free_pages(flags, 0);
1905 else if (size > PAGE_SIZE) {
1906 int order = get_order(size);
1907
1908 if (order < 3)
1909 ptr = (void *)__get_free_pages(flags, order);
1910 else
1911 ptr = vmalloc(size);
1912 } else
1913 ptr = kmem_cache_alloc(get_slab(size), flags);
1914
1915 /* Check alignment; SLUB has gotten this wrong in the past,
1916 * and this can lead to user data corruption! */
1917 BUG_ON(((unsigned long) ptr) & (size-1));
1918
1919 return ptr;
1920}
1921
1922void jbd2_free(void *ptr, size_t size)
1923{
1924 if (size == PAGE_SIZE) {
1925 free_pages((unsigned long)ptr, 0);
1926 return;
1927 }
1928 if (size > PAGE_SIZE) {
1929 int order = get_order(size);
1930
1931 if (order < 3)
1932 free_pages((unsigned long)ptr, order);
1933 else
1934 vfree(ptr);
1935 return;
1936 }
1937 kmem_cache_free(get_slab(size), ptr);
1938};
1939
1940/*
1798 * Journal_head storage management 1941 * Journal_head storage management
1799 */ 1942 */
1800static struct kmem_cache *jbd2_journal_head_cache; 1943static struct kmem_cache *jbd2_journal_head_cache;
@@ -2103,7 +2246,8 @@ static void __init jbd2_create_debugfs_entry(void)
2103{ 2246{
2104 jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); 2247 jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL);
2105 if (jbd2_debugfs_dir) 2248 if (jbd2_debugfs_dir)
2106 jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO, 2249 jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME,
2250 S_IRUGO | S_IWUSR,
2107 jbd2_debugfs_dir, 2251 jbd2_debugfs_dir,
2108 &jbd2_journal_enable_debug); 2252 &jbd2_journal_enable_debug);
2109} 2253}
@@ -2191,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void)
2191 jbd2_journal_destroy_revoke_caches(); 2335 jbd2_journal_destroy_revoke_caches();
2192 jbd2_journal_destroy_jbd2_journal_head_cache(); 2336 jbd2_journal_destroy_jbd2_journal_head_cache();
2193 jbd2_journal_destroy_handle_cache(); 2337 jbd2_journal_destroy_handle_cache();
2338 jbd2_journal_destroy_slabs();
2194} 2339}
2195 2340
2196static int __init journal_init(void) 2341static int __init journal_init(void)
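
The bulk of the journal.c change introduces per-blocksize slab caches behind jbd2_alloc()/jbd2_free(), per the new comment block's motivation (sub-page block sizes such as 4k blocks on 16k-page Power, where the allocation must still be block-aligned, as the BUG_ON in jbd2_alloc() enforces). Despite its name, jbd2_slab_create_sem is a semaphore (DECLARE_MUTEX declares a count-1 semaphore in this era) and only serializes cache creation. The size-to-cache index math in isolation:

    #include <linux/log2.h>

    /* Map a power-of-two block size onto jbd2_slab[]:
     * 1 KiB -> 0, 2 KiB -> 1, 4 KiB -> 2, ... 128 KiB -> 7. */
    static int slab_index(size_t size)
    {
            int i = order_base_2(size) - 10;  /* log2(size) - log2(1024) */

            if (i < 0)          /* blocks under 1 KiB share the 1k cache */
                    i = 0;
            return i;           /* callers reject i >= JBD2_MAX_SLABS (8) */
    }

PAGE_SIZE requests bypass the slabs entirely via __get_free_pages(), and anything larger goes to the page allocator or vmalloc depending on order, as jbd2_alloc() above shows.
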
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 73063285b13f..049281b7cb89 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#include <linux/crc32.h> 23#include <linux/crc32.h>
25#endif 24#endif
26 25
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0512700542f..bfc70f57900f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1727 if (!jh) 1727 if (!jh)
1728 goto zap_buffer_no_jh; 1728 goto zap_buffer_no_jh;
1729 1729
1730 /*
1731 * We cannot remove the buffer from checkpoint lists until the
1732 * transaction adding inode to orphan list (let's call it T)
1733 * is committed. Otherwise if the transaction changing the
1734 * buffer would be cleaned from the journal before T is
1735 * committed, a crash will cause that the correct contents of
1736 * the buffer will be lost. On the other hand we have to
1737 * clear the buffer dirty bit at latest at the moment when the
1738 * transaction marking the buffer as freed in the filesystem
1739 * structures is committed because from that moment on the
1740 * buffer can be reallocated and used by a different page.
1741 * Since the block hasn't been freed yet but the inode has
1742 * already been added to orphan list, it is safe for us to add
1743 * the buffer to BJ_Forget list of the newest transaction.
1744 */
1730 transaction = jh->b_transaction; 1745 transaction = jh->b_transaction;
1731 if (transaction == NULL) { 1746 if (transaction == NULL) {
1732 /* First case: not on any transaction. If it 1747 /* First case: not on any transaction. If it
@@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1783 } else if (transaction == journal->j_committing_transaction) { 1798 } else if (transaction == journal->j_committing_transaction) {
1784 JBUFFER_TRACE(jh, "on committing transaction"); 1799 JBUFFER_TRACE(jh, "on committing transaction");
1785 /* 1800 /*
1786 * If it is committing, we simply cannot touch it. We 1801 * The buffer is committing, we simply cannot touch
1787 * can remove it's next_transaction pointer from the 1802 * it. So we just set j_next_transaction to the
1788 * running transaction if that is set, but nothing 1803 * running transaction (if there is one) and mark
1789 * else. */ 1804 * buffer as freed so that commit code knows it should
1805 * clear dirty bits when it is done with the buffer.
1806 */
1790 set_buffer_freed(bh); 1807 set_buffer_freed(bh);
1791 if (jh->b_next_transaction) { 1808 if (journal->j_running_transaction && buffer_jbddirty(bh))
1792 J_ASSERT(jh->b_next_transaction == 1809 jh->b_next_transaction = journal->j_running_transaction;
1793 journal->j_running_transaction);
1794 jh->b_next_transaction = NULL;
1795 }
1796 jbd2_journal_put_journal_head(jh); 1810 jbd2_journal_put_journal_head(jh);
1797 spin_unlock(&journal->j_list_lock); 1811 spin_unlock(&journal->j_list_lock);
1798 jbd_unlock_bh_state(bh); 1812 jbd_unlock_bh_state(bh);
@@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
1969 */ 1983 */
1970void __jbd2_journal_refile_buffer(struct journal_head *jh) 1984void __jbd2_journal_refile_buffer(struct journal_head *jh)
1971{ 1985{
1972 int was_dirty; 1986 int was_dirty, jlist;
1973 struct buffer_head *bh = jh2bh(jh); 1987 struct buffer_head *bh = jh2bh(jh);
1974 1988
1975 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 1989 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
1991 __jbd2_journal_temp_unlink_buffer(jh); 2005 __jbd2_journal_temp_unlink_buffer(jh);
1992 jh->b_transaction = jh->b_next_transaction; 2006 jh->b_transaction = jh->b_next_transaction;
1993 jh->b_next_transaction = NULL; 2007 jh->b_next_transaction = NULL;
1994 __jbd2_journal_file_buffer(jh, jh->b_transaction, 2008 if (buffer_freed(bh))
1995 jh->b_modified ? BJ_Metadata : BJ_Reserved); 2009 jlist = BJ_Forget;
2010 else if (jh->b_modified)
2011 jlist = BJ_Metadata;
2012 else
2013 jlist = BJ_Reserved;
2014 __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
1996 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); 2015 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
1997 2016
1998 if (was_dirty) 2017 if (was_dirty)
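
This mirrors the jbd changes earlier in the series, with the same three-way refiling split. The committing-transaction case of journal_unmap_buffer(), isolated as a sketch:

    /* Block freed while its last change is mid-commit: we cannot touch
     * the committing transaction, so mark the buffer freed and, if it
     * is still journal-dirty, hand it to the running transaction; the
     * commit code will clear the dirty bits when done with it. */
    set_buffer_freed(bh);
    if (journal->j_running_transaction && buffer_jbddirty(bh))
            jh->b_next_transaction = journal->j_running_transaction;

Together with the BJ_Forget filing in __jbd2_journal_refile_buffer() and the tightened assertion in commit.c, this keeps the buffer's journaled copy alive until the transaction that frees the block (and put the inode on the orphan list) commits.
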
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7edb62e97419..7cdc3196476a 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -350,8 +350,8 @@ int jffs2_acl_chmod(struct inode *inode)
350 return rc; 350 return rc;
351} 351}
352 352
353static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t list_size, 353static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list,
354 const char *name, size_t name_len) 354 size_t list_size, const char *name, size_t name_len, int type)
355{ 355{
356 const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS); 356 const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS);
357 357
@@ -360,8 +360,8 @@ static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t
360 return retlen; 360 return retlen;
361} 361}
362 362
363static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_t list_size, 363static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list,
364 const char *name, size_t name_len) 364 size_t list_size, const char *name, size_t name_len, int type)
365{ 365{
366 const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT); 366 const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT);
367 367
@@ -370,12 +370,16 @@ static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_
370 return retlen; 370 return retlen;
371} 371}
372 372
373static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_t size) 373static int jffs2_acl_getxattr(struct dentry *dentry, const char *name,
374 void *buffer, size_t size, int type)
374{ 375{
375 struct posix_acl *acl; 376 struct posix_acl *acl;
376 int rc; 377 int rc;
377 378
378 acl = jffs2_get_acl(inode, type); 379 if (name[0] != '\0')
380 return -EINVAL;
381
382 acl = jffs2_get_acl(dentry->d_inode, type);
379 if (IS_ERR(acl)) 383 if (IS_ERR(acl))
380 return PTR_ERR(acl); 384 return PTR_ERR(acl);
381 if (!acl) 385 if (!acl)
@@ -386,26 +390,15 @@ static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_
386 return rc; 390 return rc;
387} 391}
388 392
389static int jffs2_acl_access_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) 393static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
390{ 394 const void *value, size_t size, int flags, int type)
391 if (name[0] != '\0')
392 return -EINVAL;
393 return jffs2_acl_getxattr(inode, ACL_TYPE_ACCESS, buffer, size);
394}
395
396static int jffs2_acl_default_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
397{
398 if (name[0] != '\0')
399 return -EINVAL;
400 return jffs2_acl_getxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
401}
402
403static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, size_t size)
404{ 395{
405 struct posix_acl *acl; 396 struct posix_acl *acl;
406 int rc; 397 int rc;
407 398
408 if (!is_owner_or_cap(inode)) 399 if (name[0] != '\0')
400 return -EINVAL;
401 if (!is_owner_or_cap(dentry->d_inode))
409 return -EPERM; 402 return -EPERM;
410 403
411 if (value) { 404 if (value) {
@@ -420,38 +413,24 @@ static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value,
420 } else { 413 } else {
421 acl = NULL; 414 acl = NULL;
422 } 415 }
423 rc = jffs2_set_acl(inode, type, acl); 416 rc = jffs2_set_acl(dentry->d_inode, type, acl);
424 out: 417 out:
425 posix_acl_release(acl); 418 posix_acl_release(acl);
426 return rc; 419 return rc;
427} 420}
428 421
429static int jffs2_acl_access_setxattr(struct inode *inode, const char *name,
430 const void *buffer, size_t size, int flags)
431{
432 if (name[0] != '\0')
433 return -EINVAL;
434 return jffs2_acl_setxattr(inode, ACL_TYPE_ACCESS, buffer, size);
435}
436
437static int jffs2_acl_default_setxattr(struct inode *inode, const char *name,
438 const void *buffer, size_t size, int flags)
439{
440 if (name[0] != '\0')
441 return -EINVAL;
442 return jffs2_acl_setxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
443}
444
445struct xattr_handler jffs2_acl_access_xattr_handler = { 422struct xattr_handler jffs2_acl_access_xattr_handler = {
446 .prefix = POSIX_ACL_XATTR_ACCESS, 423 .prefix = POSIX_ACL_XATTR_ACCESS,
424 .flags = ACL_TYPE_DEFAULT,
447 .list = jffs2_acl_access_listxattr, 425 .list = jffs2_acl_access_listxattr,
448 .get = jffs2_acl_access_getxattr, 426 .get = jffs2_acl_getxattr,
449 .set = jffs2_acl_access_setxattr, 427 .set = jffs2_acl_setxattr,
450}; 428};
451 429
452struct xattr_handler jffs2_acl_default_xattr_handler = { 430struct xattr_handler jffs2_acl_default_xattr_handler = {
453 .prefix = POSIX_ACL_XATTR_DEFAULT, 431 .prefix = POSIX_ACL_XATTR_DEFAULT,
432 .flags = ACL_TYPE_DEFAULT,
454 .list = jffs2_acl_default_listxattr, 433 .list = jffs2_acl_default_listxattr,
455 .get = jffs2_acl_default_getxattr, 434 .get = jffs2_acl_getxattr,
456 .set = jffs2_acl_default_setxattr, 435 .set = jffs2_acl_setxattr,
457}; 436};
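
The jffs2 ACL conversion shows the new xattr handler contract used throughout this series: handlers receive a dentry plus the handler's .flags value as a trailing type argument, which lets one get/set body replace the per-type access/default wrappers. A minimal get handler under the new prototype (names are illustrative, not jffs2's):

    static int example_acl_getxattr(struct dentry *dentry, const char *name,
                                    void *buffer, size_t size, int type)
    {
            struct posix_acl *acl;
            int rc;

            if (name[0] != '\0')        /* only the bare prefix matches */
                    return -EINVAL;

            acl = example_get_acl(dentry->d_inode, type);  /* hypothetical */
            if (IS_ERR(acl))
                    return PTR_ERR(acl);
            if (!acl)
                    return -ENODATA;
            rc = posix_acl_to_xattr(acl, buffer, size);
            posix_acl_release(acl);
            return rc;
    }

One oddity worth flagging: both handlers above are registered with .flags = ACL_TYPE_DEFAULT, including the POSIX_ACL_XATTR_ACCESS one, which a reader would expect to carry ACL_TYPE_ACCESS; this looks like a copy-paste slip in the conversion.
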
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f25e70c1b51c..f0294410868d 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -177,7 +177,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
177 spin_unlock(&jffs2_compressor_list_lock); 177 spin_unlock(&jffs2_compressor_list_lock);
178 break; 178 break;
179 default: 179 default:
180 printk(KERN_ERR "JFFS2: unknow compression mode.\n"); 180 printk(KERN_ERR "JFFS2: unknown compression mode.\n");
181 } 181 }
182 out: 182 out:
183 if (ret == JFFS2_COMPR_NONE) { 183 if (ret == JFFS2_COMPR_NONE) {
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 90cb60d09787..cd02acafde8a 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -11,7 +11,6 @@
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/lzo.h> 16#include <linux/lzo.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index cfd301a5edfc..b46661a42758 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,7 +14,6 @@
14#endif 14#endif
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
18#include <linux/zlib.h> 17#include <linux/zlib.h>
19#include <linux/zutil.h> 18#include <linux/zutil.h>
20#include "nodelist.h" 19#include "nodelist.h"
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 5544d31c066b..ec3538413926 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -15,6 +15,7 @@
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/jffs2.h> 16#include <linux/jffs2.h>
17#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
18#include <linux/slab.h>
18#include "nodelist.h" 19#include "nodelist.h"
19#include "debug.h" 20#include "debug.h"
20 21
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index b7b74e299142..e7291c161a19 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/time.h> 14#include <linux/time.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 090c556ffed2..3b6f2fa12cff 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -700,7 +700,8 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
700 struct jffs2_raw_inode ri; 700 struct jffs2_raw_inode ri;
701 struct jffs2_node_frag *last_frag; 701 struct jffs2_node_frag *last_frag;
702 union jffs2_device_node dev; 702 union jffs2_device_node dev;
703 char *mdata = NULL, mdatalen = 0; 703 char *mdata = NULL;
704 int mdatalen = 0;
704 uint32_t alloclen, ilen; 705 uint32_t alloclen, ilen;
705 int ret; 706 int ret;
706 707
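
A subtle C fix: in a combined declaration the * binds to the declarator, not the type, so the old line made mdatalen a plain char rather than a pointer or an int, silently truncating metadata lengths (a symlink target length, for instance) to eight bits. The pitfall in two lines:

    char *mdata = NULL, mdatalen = 0;  /* mdata is char *, mdatalen is char! */

    char *mdata = NULL;                /* the fix: separate declarations */
    int  mdatalen = 0;
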
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 87c6f555e1a0..af02bd138469 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -15,7 +15,6 @@
15#include <linux/mtd/mtd.h> 15#include <linux/mtd/mtd.h>
16#include <linux/rbtree.h> 16#include <linux/rbtree.h>
17#include <linux/crc32.h> 17#include <linux/crc32.h>
18#include <linux/slab.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include "nodelist.h" 19#include "nodelist.h"
21 20
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 21a052915aa9..191359dde4e1 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/mtd/mtd.h> 13#include <linux/mtd/mtd.h>
15#include <linux/compiler.h> 14#include <linux/compiler.h>
16#include <linux/sched.h> /* For cond_resched() */ 15#include <linux/sched.h> /* For cond_resched() */
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1a80301004b8..d32ee9412cb9 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
567 else BUG(); 567 else BUG();
568 } 568 }
569 } 569 }
570 list->rb_node = NULL; 570 *list = RB_ROOT;
571} 571}
572 572
573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) 573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
@@ -931,7 +931,7 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
931 * Helper function for jffs2_get_inode_nodes(). 931 * Helper function for jffs2_get_inode_nodes().
932 * The function detects whether more data should be read and reads it if yes. 932 * The function detects whether more data should be read and reads it if yes.
933 * 933 *
934 * Returns: 0 on succes; 934 * Returns: 0 on success;
935 * negative error code on failure. 935 * negative error code on failure.
936 */ 936 */
937static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 937static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
@@ -1284,7 +1284,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
1284 f->target = NULL; 1284 f->target = NULL;
1285 mutex_unlock(&f->sem); 1285 mutex_unlock(&f->sem);
1286 jffs2_do_clear_inode(c, f); 1286 jffs2_do_clear_inode(c, f);
1287 return -ret; 1287 return ret;
1288 } 1288 }
1289 1289
1290 f->target[je32_to_cpu(latest_node->csize)] = '\0'; 1290 f->target[je32_to_cpu(latest_node->csize)] = '\0';
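
Three small fixes here: the empty rb-tree is now reset with the canonical RB_ROOT initializer instead of poking ->rb_node directly (which keeps working should struct rb_root ever grow fields), a "succes" typo, and an errno sign bug, since ret at that point already holds a negative errno, so "return -ret" handed callers a positive value they would not treat as an error. The rb_root idiom:

    #include <linux/rbtree.h>

    struct rb_root tree = RB_ROOT;  /* empty tree, defined as { NULL, } */

    /* resetting an existing root: */
    tree = RB_ROOT;                 /* preferred over tree.rb_node = NULL */
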
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 02c39c64ecb3..eaccee058583 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -44,26 +44,28 @@ int jffs2_init_security(struct inode *inode, struct inode *dir)
44} 44}
45 45
46/* ---- XATTR Handler for "security.*" ----------------- */ 46/* ---- XATTR Handler for "security.*" ----------------- */
47static int jffs2_security_getxattr(struct inode *inode, const char *name, 47static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
48 void *buffer, size_t size) 48 void *buffer, size_t size, int type)
49{ 49{
50 if (!strcmp(name, "")) 50 if (!strcmp(name, ""))
51 return -EINVAL; 51 return -EINVAL;
52 52
53 return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size); 53 return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY,
54 name, buffer, size);
54} 55}
55 56
56static int jffs2_security_setxattr(struct inode *inode, const char *name, const void *buffer, 57static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
57 size_t size, int flags) 58 const void *buffer, size_t size, int flags, int type)
58{ 59{
59 if (!strcmp(name, "")) 60 if (!strcmp(name, ""))
60 return -EINVAL; 61 return -EINVAL;
61 62
62 return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); 63 return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY,
64 name, buffer, size, flags);
63} 65}
64 66
65static size_t jffs2_security_listxattr(struct inode *inode, char *list, size_t list_size, 67static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
66 const char *name, size_t name_len) 68 size_t list_size, const char *name, size_t name_len, int type)
67{ 69{
68 size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1; 70 size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
69 71
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 6caf1e1ee26d..800171dca53b 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -23,7 +23,7 @@
23 23
24int jffs2_sum_init(struct jffs2_sb_info *c) 24int jffs2_sum_init(struct jffs2_sb_info *c)
25{ 25{
26 uint32_t sum_size = max_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE); 26 uint32_t sum_size = min_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE);
27 27
28 c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); 28 c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
29 29
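
A one-word fix with real effect: sum_size feeds the allocation of the per-mount summary collection buffer, and max_t made MAX_SUMMARY_SIZE a floor instead of a cap, so flashes with large erase blocks allocated a full block's worth. Assuming the header's MAX_SUMMARY_SIZE of 32 KiB:

    #include <linux/kernel.h>

    uint32_t sector_size = 131072;  /* a 128 KiB erase block */
    uint32_t old_size = max_t(uint32_t, sector_size, MAX_SUMMARY_SIZE);
                                    /* 131072: whole erase block, too big */
    uint32_t new_size = min_t(uint32_t, sector_size, MAX_SUMMARY_SIZE);
                                    /* 32768: capped as intended */
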
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 4ec11e8bda8c..b955626071c2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
16#include "nodelist.h" 15#include "nodelist.h"
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ca29440e9435..c819eb0e982d 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/crc32.h> 14#include <linux/crc32.h>
15#include <linux/slab.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
17#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
18#include "nodelist.h" 17#include "nodelist.h"
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 082e844ab2db..9e75c62c85d6 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -31,7 +31,7 @@
31 * is used to release xattr name/value pair and detach from c->xattrindex. 31 * is used to release xattr name/value pair and detach from c->xattrindex.
32 * reclaim_xattr_datum(c) 32 * reclaim_xattr_datum(c)
33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when 33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
34 * memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold 34 * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold
35 * is hard coded as 32KiB. 35 * is hard coded as 32KiB.
36 * do_verify_xattr_datum(c, xd) 36 * do_verify_xattr_datum(c, xd)
37 * is used to load the xdatum informations without name/value pair from the medium. 37 * is used to load the xdatum informations without name/value pair from the medium.
@@ -990,9 +990,11 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
990 if (!xhandle) 990 if (!xhandle)
991 continue; 991 continue;
992 if (buffer) { 992 if (buffer) {
993 rc = xhandle->list(inode, buffer+len, size-len, xd->xname, xd->name_len); 993 rc = xhandle->list(dentry, buffer+len, size-len,
994 xd->xname, xd->name_len, xd->flags);
994 } else { 995 } else {
995 rc = xhandle->list(inode, NULL, 0, xd->xname, xd->name_len); 996 rc = xhandle->list(dentry, NULL, 0, xd->xname,
997 xd->name_len, xd->flags);
996 } 998 }
997 if (rc < 0) 999 if (rc < 0)
998 goto out; 1000 goto out;
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 8ec5765ef348..3e5a5e356e05 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -16,24 +16,26 @@
16#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
17#include "nodelist.h" 17#include "nodelist.h"
18 18
19static int jffs2_trusted_getxattr(struct inode *inode, const char *name, 19static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
20 void *buffer, size_t size) 20 void *buffer, size_t size, int type)
21{ 21{
22 if (!strcmp(name, "")) 22 if (!strcmp(name, ""))
23 return -EINVAL; 23 return -EINVAL;
24 return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size); 24 return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED,
25 name, buffer, size);
25} 26}
26 27
27static int jffs2_trusted_setxattr(struct inode *inode, const char *name, const void *buffer, 28static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
28 size_t size, int flags) 29 const void *buffer, size_t size, int flags, int type)
29{ 30{
30 if (!strcmp(name, "")) 31 if (!strcmp(name, ""))
31 return -EINVAL; 32 return -EINVAL;
32 return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); 33 return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED,
34 name, buffer, size, flags);
33} 35}
34 36
35static size_t jffs2_trusted_listxattr(struct inode *inode, char *list, size_t list_size, 37static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
36 const char *name, size_t name_len) 38 size_t list_size, const char *name, size_t name_len, int type)
37{ 39{
38 size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1; 40 size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
39 41
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8bbeab90ada1..8544af67dffe 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -16,24 +16,26 @@
16#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
17#include "nodelist.h" 17#include "nodelist.h"
18 18
19static int jffs2_user_getxattr(struct inode *inode, const char *name, 19static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
20 void *buffer, size_t size) 20 void *buffer, size_t size, int type)
21{ 21{
22 if (!strcmp(name, "")) 22 if (!strcmp(name, ""))
23 return -EINVAL; 23 return -EINVAL;
24 return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size); 24 return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
25 name, buffer, size);
25} 26}
26 27
27static int jffs2_user_setxattr(struct inode *inode, const char *name, const void *buffer, 28static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
28 size_t size, int flags) 29 const void *buffer, size_t size, int flags, int type)
29{ 30{
30 if (!strcmp(name, "")) 31 if (!strcmp(name, ""))
31 return -EINVAL; 32 return -EINVAL;
32 return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags); 33 return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
34 name, buffer, size, flags);
33} 35}
34 36
35static size_t jffs2_user_listxattr(struct inode *inode, char *list, size_t list_size, 37static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
36 const char *name, size_t name_len) 38 size_t list_size, const char *name, size_t name_len, int type)
37{ 39{
38 size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1; 40 size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
39 41
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d66477c34306..1057a4998e4e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -19,8 +19,8 @@
19 */ 19 */
20 20
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/slab.h>
22#include <linux/fs.h> 23#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include <linux/posix_acl_xattr.h> 24#include <linux/posix_acl_xattr.h>
25#include "jfs_incore.h" 25#include "jfs_incore.h"
26#include "jfs_txnmgr.h" 26#include "jfs_txnmgr.h"
@@ -174,7 +174,7 @@ cleanup:
174 return rc; 174 return rc;
175} 175}
176 176
177static int jfs_acl_chmod(struct inode *inode) 177int jfs_acl_chmod(struct inode *inode)
178{ 178{
179 struct posix_acl *acl, *clone; 179 struct posix_acl *acl, *clone;
180 int rc; 180 int rc;
@@ -205,26 +205,3 @@ static int jfs_acl_chmod(struct inode *inode)
205 posix_acl_release(clone); 205 posix_acl_release(clone);
206 return rc; 206 return rc;
207} 207}
208
209int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
210{
211 struct inode *inode = dentry->d_inode;
212 int rc;
213
214 rc = inode_change_ok(inode, iattr);
215 if (rc)
216 return rc;
217
218 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
219 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
220 if (vfs_dq_transfer(inode, iattr))
221 return -EDQUOT;
222 }
223
224 rc = inode_setattr(inode, iattr);
225
226 if (!rc && (iattr->ia_valid & ATTR_MODE))
227 rc = jfs_acl_chmod(inode);
228
229 return rc;
230}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 2b70fa78e4a7..14ba982b3f24 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/quotaops.h>
21#include "jfs_incore.h" 22#include "jfs_incore.h"
22#include "jfs_inode.h" 23#include "jfs_inode.h"
23#include "jfs_dmap.h" 24#include "jfs_dmap.h"
@@ -47,7 +48,7 @@ static int jfs_open(struct inode *inode, struct file *file)
47{ 48{
48 int rc; 49 int rc;
49 50
50 if ((rc = generic_file_open(inode, file))) 51 if ((rc = dquot_file_open(inode, file)))
51 return rc; 52 return rc;
52 53
53 /* 54 /*
@@ -88,14 +89,40 @@ static int jfs_release(struct inode *inode, struct file *file)
88 return 0; 89 return 0;
89} 90}
90 91
92int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
93{
94 struct inode *inode = dentry->d_inode;
95 int rc;
96
97 rc = inode_change_ok(inode, iattr);
98 if (rc)
99 return rc;
100
101 if (iattr->ia_valid & ATTR_SIZE)
102 dquot_initialize(inode);
103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
105 rc = dquot_transfer(inode, iattr);
106 if (rc)
107 return rc;
108 }
109
110 rc = inode_setattr(inode, iattr);
111
112 if (!rc && (iattr->ia_valid & ATTR_MODE))
113 rc = jfs_acl_chmod(inode);
114
115 return rc;
116}
117
91const struct inode_operations jfs_file_inode_operations = { 118const struct inode_operations jfs_file_inode_operations = {
92 .truncate = jfs_truncate, 119 .truncate = jfs_truncate,
93 .setxattr = jfs_setxattr, 120 .setxattr = jfs_setxattr,
94 .getxattr = jfs_getxattr, 121 .getxattr = jfs_getxattr,
95 .listxattr = jfs_listxattr, 122 .listxattr = jfs_listxattr,
96 .removexattr = jfs_removexattr, 123 .removexattr = jfs_removexattr,
97#ifdef CONFIG_JFS_POSIX_ACL
98 .setattr = jfs_setattr, 124 .setattr = jfs_setattr,
125#ifdef CONFIG_JFS_POSIX_ACL
99 .check_acl = jfs_check_acl, 126 .check_acl = jfs_check_acl,
100#endif 127#endif
101}; 128};
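
jfs_setattr moves out from behind CONFIG_JFS_POSIX_ACL (every build needs it now for quota) and adopts the reworked quota API used across this series: the old vfs_dq_* helpers returned a bare truth value, forcing callers to invent -EDQUOT, while the dquot_* replacements return the real errno. The caller-side conversion pattern:

    /* before: boolean API, error code guessed by the caller */
    if (vfs_dq_transfer(inode, iattr))
            return -EDQUOT;

    /* after: the quota layer reports the actual failure (-EDQUOT, -EIO, ...) */
    rc = dquot_transfer(inode, iattr);
    if (rc)
            return rc;

The same mechanical rewrite recurs below in jfs_dtree.c, jfs_extent.c and jfs_inode.c for dquot_alloc_block() and dquot_alloc_inode(). The dquot_file_open() in jfs_open() wraps generic_file_open() and, if memory serves, initializes quota for writable opens.
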
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b2ae190a77ba..9dd126276c9f 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/quotaops.h> 24#include <linux/quotaops.h>
25#include <linux/writeback.h>
25#include "jfs_incore.h" 26#include "jfs_incore.h"
26#include "jfs_inode.h" 27#include "jfs_inode.h"
27#include "jfs_filsys.h" 28#include "jfs_filsys.h"
@@ -120,8 +121,10 @@ int jfs_commit_inode(struct inode *inode, int wait)
120 return rc; 121 return rc;
121} 122}
122 123
123int jfs_write_inode(struct inode *inode, int wait) 124int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
124{ 125{
126 int wait = wbc->sync_mode == WB_SYNC_ALL;
127
125 if (test_cflag(COMMIT_Nolink, inode)) 128 if (test_cflag(COMMIT_Nolink, inode))
126 return 0; 129 return 0;
127 /* 130 /*
@@ -146,6 +149,9 @@ void jfs_delete_inode(struct inode *inode)
146{ 149{
147 jfs_info("In jfs_delete_inode, inode = 0x%p", inode); 150 jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
148 151
152 if (!is_bad_inode(inode))
153 dquot_initialize(inode);
154
149 if (!is_bad_inode(inode) && 155 if (!is_bad_inode(inode) &&
150 (JFS_IP(inode)->fileset == FILESYSTEM_I)) { 156 (JFS_IP(inode)->fileset == FILESYSTEM_I)) {
151 truncate_inode_pages(&inode->i_data, 0); 157 truncate_inode_pages(&inode->i_data, 0);
@@ -158,9 +164,9 @@ void jfs_delete_inode(struct inode *inode)
158 /* 164 /*
159 * Free the inode from the quota allocation. 165 * Free the inode from the quota allocation.
160 */ 166 */
161 vfs_dq_init(inode); 167 dquot_initialize(inode);
162 vfs_dq_free_inode(inode); 168 dquot_free_inode(inode);
163 vfs_dq_drop(inode); 169 dquot_drop(inode);
164 } 170 }
165 171
166 clear_inode(inode); 172 clear_inode(inode);
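
jfs_write_inode is converted to the new ->write_inode signature, which passes the writeback_control instead of a precomputed wait flag, so filesystems derive synchronicity from wbc->sync_mode themselves. The new contract, sketched with an illustrative helper:

    #include <linux/fs.h>
    #include <linux/writeback.h>

    static int example_write_inode(struct inode *inode,
                                   struct writeback_control *wbc)
    {
            /* WB_SYNC_ALL covers sync(2)/fsync-driven writeback;
             * WB_SYNC_NONE is opportunistic background writeback. */
            int wait = wbc->sync_mode == WB_SYNC_ALL;

            return example_commit_inode(inode, wait);  /* hypothetical */
    }

The delete path also gains dquot_initialize() calls so quota state is set up before dquot_free_inode() runs.
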
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index b07bd417ef85..54e07559878d 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -22,7 +22,7 @@
22 22
23int jfs_check_acl(struct inode *, int); 23int jfs_check_acl(struct inode *, int);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_setattr(struct dentry *, struct iattr *); 25int jfs_acl_chmod(struct inode *inode);
26 26
27#else 27#else
28 28
@@ -32,5 +32,10 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode,
32 return 0; 32 return 0;
33} 33}
34 34
35static inline int jfs_acl_chmod(struct inode *inode)
36{
37 return 0;
38}
39
35#endif 40#endif
36#endif /* _H_JFS_ACL */ 41#endif /* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 2bc7d8aa5740..6c4dfcbf3f55 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include "jfs_incore.h" 21#include "jfs_incore.h"
21#include "jfs_superblock.h" 22#include "jfs_superblock.h"
22#include "jfs_dmap.h" 23#include "jfs_dmap.h"
@@ -755,7 +756,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
755 * allocation group. 756 * allocation group.
756 */ 757 */
757 if ((blkno & (bmp->db_agsize - 1)) == 0) 758 if ((blkno & (bmp->db_agsize - 1)) == 0)
758 /* check if the AG is currenly being written to. 759 /* check if the AG is currently being written to.
759 * if so, call dbNextAG() to find a non-busy 760 * if so, call dbNextAG() to find a non-busy
760 * AG with sufficient free space. 761 * AG with sufficient free space.
761 */ 762 */
@@ -3337,7 +3338,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3337 for (i = 0, n = 0; i < agno; n++) { 3338 for (i = 0, n = 0; i < agno; n++) {
3338 bmp->db_agfree[n] = 0; /* init collection point */ 3339 bmp->db_agfree[n] = 0; /* init collection point */
3339 3340
3340 /* coalesce cotiguous k AGs; */ 3341 /* coalesce contiguous k AGs; */
3341 for (j = 0; j < k && i < agno; j++, i++) { 3342 for (j = 0; j < k && i < agno; j++, i++) {
3342 /* merge AGi to AGn */ 3343 /* merge AGi to AGn */
3343 bmp->db_agfree[n] += bmp->db_agfree[i]; 3344 bmp->db_agfree[n] += bmp->db_agfree[i];
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 925871e9887b..9197a1b0d02d 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -102,6 +102,7 @@
102 102
103#include <linux/fs.h> 103#include <linux/fs.h>
104#include <linux/quotaops.h> 104#include <linux/quotaops.h>
105#include <linux/slab.h>
105#include "jfs_incore.h" 106#include "jfs_incore.h"
106#include "jfs_superblock.h" 107#include "jfs_superblock.h"
107#include "jfs_filsys.h" 108#include "jfs_filsys.h"
@@ -381,10 +382,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
381 * It's time to move the inline table to an external 382 * It's time to move the inline table to an external
382 * page and begin to build the xtree 383 * page and begin to build the xtree
383 */ 384 */
384 if (vfs_dq_alloc_block(ip, sbi->nbperpage)) 385 if (dquot_alloc_block(ip, sbi->nbperpage))
385 goto clean_up; 386 goto clean_up;
386 if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { 387 if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
387 vfs_dq_free_block(ip, sbi->nbperpage); 388 dquot_free_block(ip, sbi->nbperpage);
388 goto clean_up; 389 goto clean_up;
389 } 390 }
390 391
@@ -408,7 +409,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
408 memcpy(&jfs_ip->i_dirtable, temp_table, 409 memcpy(&jfs_ip->i_dirtable, temp_table,
409 sizeof (temp_table)); 410 sizeof (temp_table));
410 dbFree(ip, xaddr, sbi->nbperpage); 411 dbFree(ip, xaddr, sbi->nbperpage);
411 vfs_dq_free_block(ip, sbi->nbperpage); 412 dquot_free_block(ip, sbi->nbperpage);
412 goto clean_up; 413 goto clean_up;
413 } 414 }
414 ip->i_size = PSIZE; 415 ip->i_size = PSIZE;
@@ -1027,10 +1028,9 @@ static int dtSplitUp(tid_t tid,
1027 n = xlen; 1028 n = xlen;
1028 1029
1029 /* Allocate blocks to quota. */ 1030 /* Allocate blocks to quota. */
1030 if (vfs_dq_alloc_block(ip, n)) { 1031 rc = dquot_alloc_block(ip, n);
1031 rc = -EDQUOT; 1032 if (rc)
1032 goto extendOut; 1033 goto extendOut;
1033 }
1034 quota_allocation += n; 1034 quota_allocation += n;
1035 1035
1036 if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, 1036 if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
@@ -1308,7 +1308,7 @@ static int dtSplitUp(tid_t tid,
1308 1308
1309 /* Rollback quota allocation */ 1309 /* Rollback quota allocation */
1310 if (rc && quota_allocation) 1310 if (rc && quota_allocation)
1311 vfs_dq_free_block(ip, quota_allocation); 1311 dquot_free_block(ip, quota_allocation);
1312 1312
1313 dtSplitUp_Exit: 1313 dtSplitUp_Exit:
1314 1314
@@ -1369,9 +1369,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1369 return -EIO; 1369 return -EIO;
1370 1370
1371 /* Allocate blocks to quota. */ 1371 /* Allocate blocks to quota. */
1372 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1372 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1373 if (rc) {
1373 release_metapage(rmp); 1374 release_metapage(rmp);
1374 return -EDQUOT; 1375 return rc;
1375 } 1376 }
1376 1377
1377 jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); 1378 jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
@@ -1892,6 +1893,7 @@ static int dtSplitRoot(tid_t tid,
1892 struct dt_lock *dtlck; 1893 struct dt_lock *dtlck;
1893 struct tlock *tlck; 1894 struct tlock *tlck;
1894 struct lv *lv; 1895 struct lv *lv;
1896 int rc;
1895 1897
1896 /* get split root page */ 1898 /* get split root page */
1897 smp = split->mp; 1899 smp = split->mp;
@@ -1916,9 +1918,10 @@ static int dtSplitRoot(tid_t tid,
1916 rp = rmp->data; 1918 rp = rmp->data;
1917 1919
1918 /* Allocate blocks to quota. */ 1920 /* Allocate blocks to quota. */
1919 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1921 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1922 if (rc) {
1920 release_metapage(rmp); 1923 release_metapage(rmp);
1921 return -EDQUOT; 1924 return rc;
1922 } 1925 }
1923 1926
1924 BT_MARK_DIRTY(rmp, ip); 1927 BT_MARK_DIRTY(rmp, ip);
@@ -2287,7 +2290,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2287 xlen = lengthPXD(&fp->header.self); 2290 xlen = lengthPXD(&fp->header.self);
2288 2291
2289 /* Free quota allocation. */ 2292 /* Free quota allocation. */
2290 vfs_dq_free_block(ip, xlen); 2293 dquot_free_block(ip, xlen);
2291 2294
2292 /* free/invalidate its buffer page */ 2295 /* free/invalidate its buffer page */
2293 discard_metapage(fmp); 2296 discard_metapage(fmp);
@@ -2363,7 +2366,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2363 xlen = lengthPXD(&p->header.self); 2366 xlen = lengthPXD(&p->header.self);
2364 2367
2365 /* Free quota allocation */ 2368 /* Free quota allocation */
2366 vfs_dq_free_block(ip, xlen); 2369 dquot_free_block(ip, xlen);
2367 2370
2368 /* free/invalidate its buffer page */ 2371 /* free/invalidate its buffer page */
2369 discard_metapage(mp); 2372 discard_metapage(mp);
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 41d6045dbeb0..5d3bbd10f8db 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
141 } 141 }
142 142
143 /* Allocate blocks to quota. */ 143 /* Allocate blocks to quota. */
144 if (vfs_dq_alloc_block(ip, nxlen)) { 144 rc = dquot_alloc_block(ip, nxlen);
145 if (rc) {
145 dbFree(ip, nxaddr, (s64) nxlen); 146 dbFree(ip, nxaddr, (s64) nxlen);
146 mutex_unlock(&JFS_IP(ip)->commit_mutex); 147 mutex_unlock(&JFS_IP(ip)->commit_mutex);
147 return -EDQUOT; 148 return rc;
148 } 149 }
149 150
150 /* determine the value of the extent flag */ 151 /* determine the value of the extent flag */
@@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
164 */ 165 */
165 if (rc) { 166 if (rc) {
166 dbFree(ip, nxaddr, nxlen); 167 dbFree(ip, nxaddr, nxlen);
167 vfs_dq_free_block(ip, nxlen); 168 dquot_free_block(ip, nxlen);
168 mutex_unlock(&JFS_IP(ip)->commit_mutex); 169 mutex_unlock(&JFS_IP(ip)->commit_mutex);
169 return (rc); 170 return (rc);
170 } 171 }
@@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
256 goto exit; 257 goto exit;
257 258
258 /* Allocat blocks to quota. */ 259 /* Allocat blocks to quota. */
259 if (vfs_dq_alloc_block(ip, nxlen)) { 260 rc = dquot_alloc_block(ip, nxlen);
261 if (rc) {
260 dbFree(ip, nxaddr, (s64) nxlen); 262 dbFree(ip, nxaddr, (s64) nxlen);
261 mutex_unlock(&JFS_IP(ip)->commit_mutex); 263 mutex_unlock(&JFS_IP(ip)->commit_mutex);
262 return -EDQUOT; 264 return rc;
263 } 265 }
264 266
265 delta = nxlen - xlen; 267 delta = nxlen - xlen;
@@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
297 /* extend the extent */ 299 /* extend the extent */
298 if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { 300 if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
299 dbFree(ip, xaddr + xlen, delta); 301 dbFree(ip, xaddr + xlen, delta);
300 vfs_dq_free_block(ip, nxlen); 302 dquot_free_block(ip, nxlen);
301 goto exit; 303 goto exit;
302 } 304 }
303 } else { 305 } else {
@@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
308 */ 310 */
309 if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { 311 if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
310 dbFree(ip, nxaddr, nxlen); 312 dbFree(ip, nxaddr, nxlen);
311 vfs_dq_free_block(ip, nxlen); 313 dquot_free_block(ip, nxlen);
312 goto exit; 314 goto exit;
313 } 315 }
314 } 316 }
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0fc30407f039..f8332dc8eeb2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -45,6 +45,7 @@
45#include <linux/buffer_head.h> 45#include <linux/buffer_head.h>
46#include <linux/pagemap.h> 46#include <linux/pagemap.h>
47#include <linux/quotaops.h> 47#include <linux/quotaops.h>
48#include <linux/slab.h>
48 49
49#include "jfs_incore.h" 50#include "jfs_incore.h"
50#include "jfs_inode.h" 51#include "jfs_inode.h"
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index dc0e02159ac9..829921b67765 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -116,10 +116,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
116 /* 116 /*
117 * Allocate inode to quota. 117 * Allocate inode to quota.
118 */ 118 */
119 if (vfs_dq_alloc_inode(inode)) { 119 dquot_initialize(inode);
120 rc = -EDQUOT; 120 rc = dquot_alloc_inode(inode);
121 if (rc)
121 goto fail_drop; 122 goto fail_drop;
122 }
123 123
124 inode->i_mode = mode; 124 inode->i_mode = mode;
125 /* inherit flags from parent */ 125 /* inherit flags from parent */
@@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
162 return inode; 162 return inode;
163 163
164fail_drop: 164fail_drop:
165 vfs_dq_drop(inode); 165 dquot_drop(inode);
166 inode->i_flags |= S_NOQUOTA; 166 inode->i_flags |= S_NOQUOTA;
167fail_unlock: 167fail_unlock:
168 inode->i_nlink = 0; 168 inode->i_nlink = 0;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 1eff7db34d63..79e2c79661df 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -26,7 +26,7 @@ extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
28extern int jfs_commit_inode(struct inode *, int); 28extern int jfs_commit_inode(struct inode *, int);
29extern int jfs_write_inode(struct inode*, int); 29extern int jfs_write_inode(struct inode *, struct writeback_control *);
30extern void jfs_delete_inode(struct inode *); 30extern void jfs_delete_inode(struct inode *);
31extern void jfs_dirty_inode(struct inode *); 31extern void jfs_dirty_inode(struct inode *);
32extern void jfs_truncate(struct inode *); 32extern void jfs_truncate(struct inode *);
@@ -40,6 +40,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
40 int fh_len, int fh_type); 40 int fh_len, int fh_type);
41extern void jfs_set_inode_flags(struct inode *); 41extern void jfs_set_inode_flags(struct inode *);
42extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 42extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
43extern int jfs_setattr(struct dentry *, struct iattr *);
43 44
44extern const struct address_space_operations jfs_aops; 45extern const struct address_space_operations jfs_aops;
45extern const struct inode_operations jfs_dir_inode_operations; 46extern const struct inode_operations jfs_dir_inode_operations;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 335c4de6552d..c51af2a14516 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -70,6 +70,7 @@
70#include <linux/delay.h> 70#include <linux/delay.h>
71#include <linux/mutex.h> 71#include <linux/mutex.h>
72#include <linux/seq_file.h> 72#include <linux/seq_file.h>
73#include <linux/slab.h>
73#include "jfs_incore.h" 74#include "jfs_incore.h"
74#include "jfs_filsys.h" 75#include "jfs_filsys.h"
75#include "jfs_metapage.h" 76#include "jfs_metapage.h"
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 07b6c5dfb4b6..48b44bd8267b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/bio.h> 23#include <linux/bio.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
26#include <linux/mempool.h> 27#include <linux/mempool.h>
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index f26e4d03ada5..d945ea76b445 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1292,7 +1292,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1292 */ 1292 */
1293 /* 1293 /*
1294 * I believe this code is no longer needed. Splitting I_LOCK 1294 * I believe this code is no longer needed. Splitting I_LOCK
1295 * into two bits, I_LOCK and I_SYNC should prevent this 1295 * into two bits, I_NEW and I_SYNC should prevent this
1296 * deadlock as well. But since I don't have a JFS testload 1296 * deadlock as well. But since I don't have a JFS testload
1297 * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. 1297 * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done.
1298 * Joern 1298 * Joern
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 3fbb3a225590..8f0f02cb6ca6 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -19,6 +19,7 @@
19#ifndef _H_JFS_UNICODE 19#ifndef _H_JFS_UNICODE
20#define _H_JFS_UNICODE 20#define _H_JFS_UNICODE
21 21
22#include <linux/slab.h>
22#include <asm/byteorder.h> 23#include <asm/byteorder.h>
23#include "jfs_types.h" 24#include "jfs_types.h"
24 25
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index d654a6458648..6c50871e6220 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -585,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */
585 hint = addressXAD(xad) + lengthXAD(xad) - 1; 585 hint = addressXAD(xad) + lengthXAD(xad) - 1;
586 } else 586 } else
587 hint = 0; 587 hint = 0;
588 if ((rc = vfs_dq_alloc_block(ip, xlen))) 588 if ((rc = dquot_alloc_block(ip, xlen)))
589 goto out; 589 goto out;
590 if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { 590 if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
591 vfs_dq_free_block(ip, xlen); 591 dquot_free_block(ip, xlen);
592 goto out; 592 goto out;
593 } 593 }
594 } 594 }
@@ -617,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */
617 /* undo data extent allocation */ 617 /* undo data extent allocation */
618 if (*xaddrp == 0) { 618 if (*xaddrp == 0) {
619 dbFree(ip, xaddr, (s64) xlen); 619 dbFree(ip, xaddr, (s64) xlen);
620 vfs_dq_free_block(ip, xlen); 620 dquot_free_block(ip, xlen);
621 } 621 }
622 return rc; 622 return rc;
623 } 623 }
@@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip,
985 rbn = addressPXD(pxd); 985 rbn = addressPXD(pxd);
986 986
987 /* Allocate blocks to quota. */ 987 /* Allocate blocks to quota. */
988 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 988 rc = dquot_alloc_block(ip, lengthPXD(pxd));
989 rc = -EDQUOT; 989 if (rc)
990 goto clean_up; 990 goto clean_up;
991 }
992 991
993 quota_allocation += lengthPXD(pxd); 992 quota_allocation += lengthPXD(pxd);
994 993
@@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1195 1194
1196 /* Rollback quota allocation. */ 1195 /* Rollback quota allocation. */
1197 if (quota_allocation) 1196 if (quota_allocation)
1198 vfs_dq_free_block(ip, quota_allocation); 1197 dquot_free_block(ip, quota_allocation);
1199 1198
1200 return (rc); 1199 return (rc);
1201} 1200}
@@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid,
1235 struct pxdlist *pxdlist; 1234 struct pxdlist *pxdlist;
1236 struct tlock *tlck; 1235 struct tlock *tlck;
1237 struct xtlock *xtlck; 1236 struct xtlock *xtlck;
1237 int rc;
1238 1238
1239 sp = &JFS_IP(ip)->i_xtroot; 1239 sp = &JFS_IP(ip)->i_xtroot;
1240 1240
@@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid,
1252 return -EIO; 1252 return -EIO;
1253 1253
1254 /* Allocate blocks to quota. */ 1254 /* Allocate blocks to quota. */
1255 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1255 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1256 if (rc) {
1256 release_metapage(rmp); 1257 release_metapage(rmp);
1257 return -EDQUOT; 1258 return rc;
1258 } 1259 }
1259 1260
1260 jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); 1261 jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
@@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3680 ip->i_size = newsize; 3681 ip->i_size = newsize;
3681 3682
3682 /* update quota allocation to reflect freed blocks */ 3683 /* update quota allocation to reflect freed blocks */
3683 vfs_dq_free_block(ip, nfreed); 3684 dquot_free_block(ip, nfreed);
3684 3685
3685 /* 3686 /*
3686 * free tlock of invalidated pages 3687 * free tlock of invalidated pages
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index c79a4270f083..4a3e9f39c21d 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -85,6 +85,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
85 85
86 jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); 86 jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name);
87 87
88 dquot_initialize(dip);
89
88 /* 90 /*
89 * search parent directory for entry/freespace 91 * search parent directory for entry/freespace
90 * (dtSearch() returns parent directory page pinned) 92 * (dtSearch() returns parent directory page pinned)
@@ -215,6 +217,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
215 217
216 jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); 218 jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name);
217 219
220 dquot_initialize(dip);
221
218 /* link count overflow on parent directory ? */ 222 /* link count overflow on parent directory ? */
219 if (dip->i_nlink == JFS_LINK_MAX) { 223 if (dip->i_nlink == JFS_LINK_MAX) {
220 rc = -EMLINK; 224 rc = -EMLINK;
@@ -356,7 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
356 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); 360 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
357 361
358 /* Init inode for quota operations. */ 362 /* Init inode for quota operations. */
359 vfs_dq_init(ip); 363 dquot_initialize(dip);
364 dquot_initialize(ip);
360 365
361 /* directory must be empty to be removed */ 366 /* directory must be empty to be removed */
362 if (!dtEmpty(ip)) { 367 if (!dtEmpty(ip)) {
@@ -483,7 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
483 jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); 488 jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
484 489
485 /* Init inode for quota operations. */ 490 /* Init inode for quota operations. */
486 vfs_dq_init(ip); 491 dquot_initialize(dip);
492 dquot_initialize(ip);
487 493
488 if ((rc = get_UCSname(&dname, dentry))) 494 if ((rc = get_UCSname(&dname, dentry)))
489 goto out; 495 goto out;
@@ -805,6 +811,8 @@ static int jfs_link(struct dentry *old_dentry,
805 if (ip->i_nlink == 0) 811 if (ip->i_nlink == 0)
806 return -ENOENT; 812 return -ENOENT;
807 813
814 dquot_initialize(dir);
815
808 tid = txBegin(ip->i_sb, 0); 816 tid = txBegin(ip->i_sb, 0);
809 817
810 mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); 818 mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
@@ -896,6 +904,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
896 904
897 jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); 905 jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
898 906
907 dquot_initialize(dip);
908
899 ssize = strlen(name) + 1; 909 ssize = strlen(name) + 1;
900 910
901 /* 911 /*
@@ -1087,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1087 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, 1097 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
1088 new_dentry->d_name.name); 1098 new_dentry->d_name.name);
1089 1099
1100 dquot_initialize(old_dir);
1101 dquot_initialize(new_dir);
1102
1090 old_ip = old_dentry->d_inode; 1103 old_ip = old_dentry->d_inode;
1091 new_ip = new_dentry->d_inode; 1104 new_ip = new_dentry->d_inode;
1092 1105
@@ -1136,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1136 } else if (new_ip) { 1149 } else if (new_ip) {
1137 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); 1150 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
1138 /* Init inode for quota operations. */ 1151 /* Init inode for quota operations. */
1139 vfs_dq_init(new_ip); 1152 dquot_initialize(new_ip);
1140 } 1153 }
1141 1154
1142 /* 1155 /*
@@ -1360,6 +1373,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1360 1373
1361 jfs_info("jfs_mknod: %s", dentry->d_name.name); 1374 jfs_info("jfs_mknod: %s", dentry->d_name.name);
1362 1375
1376 dquot_initialize(dir);
1377
1363 if ((rc = get_UCSname(&dname, dentry))) 1378 if ((rc = get_UCSname(&dname, dentry)))
1364 goto out; 1379 goto out;
1365 1380
@@ -1541,8 +1556,8 @@ const struct inode_operations jfs_dir_inode_operations = {
1541 .getxattr = jfs_getxattr, 1556 .getxattr = jfs_getxattr,
1542 .listxattr = jfs_listxattr, 1557 .listxattr = jfs_listxattr,
1543 .removexattr = jfs_removexattr, 1558 .removexattr = jfs_removexattr,
1544#ifdef CONFIG_JFS_POSIX_ACL
1545 .setattr = jfs_setattr, 1559 .setattr = jfs_setattr,
1560#ifdef CONFIG_JFS_POSIX_ACL
1546 .check_acl = jfs_check_acl, 1561 .check_acl = jfs_check_acl,
1547#endif 1562#endif
1548}; 1563};
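The namei.c hunks above replace the old vfs_dq_init() call on the victim inode with dquot_initialize() calls on every inode the operation will charge, parent directories included. A hedged sketch of the resulting shape for an unlink-style operation:

	/* Sketch only (not JFS code): the parent loses a dirent and the
	 * victim's link count drops, so both need their dquots set up
	 * before the transaction begins. */
	static int example_unlink(struct inode *dir, struct dentry *dentry)
	{
		struct inode *inode = dentry->d_inode;

		dquot_initialize(dir);
		dquot_initialize(inode);

		/* ... filesystem-specific unlink work ... */
		return 0;
	}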
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2234c73fc577..157382fa6256 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -30,6 +30,7 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 31#include <linux/exportfs.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/slab.h>
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
@@ -131,6 +132,11 @@ static void jfs_destroy_inode(struct inode *inode)
131 kmem_cache_free(jfs_inode_cachep, ji); 132 kmem_cache_free(jfs_inode_cachep, ji);
132} 133}
133 134
135static void jfs_clear_inode(struct inode *inode)
136{
137 dquot_drop(inode);
138}
139
134static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) 140static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
135{ 141{
136 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); 142 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
@@ -524,7 +530,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
524 * Page cache is indexed by long. 530 * Page cache is indexed by long.
525 * I would use MAX_LFS_FILESIZE, but it's only half as big 531 * I would use MAX_LFS_FILESIZE, but it's only half as big
526 */ 532 */
527 sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, sb->s_maxbytes); 533 sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes);
528#endif 534#endif
529 sb->s_time_gran = 1; 535 sb->s_time_gran = 1;
530 return 0; 536 return 0;
@@ -745,6 +751,7 @@ static const struct super_operations jfs_super_operations = {
745 .dirty_inode = jfs_dirty_inode, 751 .dirty_inode = jfs_dirty_inode,
746 .write_inode = jfs_write_inode, 752 .write_inode = jfs_write_inode,
747 .delete_inode = jfs_delete_inode, 753 .delete_inode = jfs_delete_inode,
754 .clear_inode = jfs_clear_inode,
748 .put_super = jfs_put_super, 755 .put_super = jfs_put_super,
749 .sync_fs = jfs_sync_fs, 756 .sync_fs = jfs_sync_fs,
750 .freeze_fs = jfs_freeze, 757 .freeze_fs = jfs_freeze,
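With quota handling moved into the filesystem, the super.c hunk adds a ->clear_inode hook so dquot references are released when an inode is evicted from the inode cache. The wiring, reduced to a sketch with hypothetical names:

	static void example_clear_inode(struct inode *inode)
	{
		dquot_drop(inode);	/* release the inode's dquot references */
	}

	static const struct super_operations example_super_ops = {
		/* ... */
		.clear_inode	= example_clear_inode,
	};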
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fad364548bc9..fa96bbb26343 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include <linux/quotaops.h> 25#include <linux/quotaops.h>
25#include <linux/security.h> 26#include <linux/security.h>
26#include "jfs_incore.h" 27#include "jfs_incore.h"
@@ -260,14 +261,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
260 nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; 261 nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
261 262
262 /* Allocate new blocks to quota. */ 263 /* Allocate new blocks to quota. */
263 if (vfs_dq_alloc_block(ip, nblocks)) { 264 rc = dquot_alloc_block(ip, nblocks);
264 return -EDQUOT; 265 if (rc)
265 } 266 return rc;
266 267
267 rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); 268 rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
268 if (rc) { 269 if (rc) {
269 /*Rollback quota allocation. */ 270 /*Rollback quota allocation. */
270 vfs_dq_free_block(ip, nblocks); 271 dquot_free_block(ip, nblocks);
271 return rc; 272 return rc;
272 } 273 }
273 274
@@ -332,7 +333,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
332 333
333 failed: 334 failed:
334 /* Rollback quota allocation. */ 335 /* Rollback quota allocation. */
335 vfs_dq_free_block(ip, nblocks); 336 dquot_free_block(ip, nblocks);
336 337
337 dbFree(ip, blkno, nblocks); 338 dbFree(ip, blkno, nblocks);
338 return rc; 339 return rc;
@@ -538,7 +539,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
538 539
539 if (blocks_needed > current_blocks) { 540 if (blocks_needed > current_blocks) {
540 /* Allocate new blocks to quota. */ 541 /* Allocate new blocks to quota. */
541 if (vfs_dq_alloc_block(inode, blocks_needed)) 542 rc = dquot_alloc_block(inode, blocks_needed);
543 if (rc)
542 return -EDQUOT; 544 return -EDQUOT;
543 545
544 quota_allocation = blocks_needed; 546 quota_allocation = blocks_needed;
@@ -602,7 +604,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
602 clean_up: 604 clean_up:
603 /* Rollback quota allocation */ 605 /* Rollback quota allocation */
604 if (quota_allocation) 606 if (quota_allocation)
605 vfs_dq_free_block(inode, quota_allocation); 607 dquot_free_block(inode, quota_allocation);
606 608
607 return (rc); 609 return (rc);
608} 610}
@@ -677,7 +679,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
677 679
678 /* If old blocks exist, they must be removed from quota allocation. */ 680 /* If old blocks exist, they must be removed from quota allocation. */
679 if (old_blocks) 681 if (old_blocks)
680 vfs_dq_free_block(inode, old_blocks); 682 dquot_free_block(inode, old_blocks);
681 683
682 inode->i_ctime = CURRENT_TIME; 684 inode->i_ctime = CURRENT_TIME;
683 685
diff --git a/fs/libfs.c b/fs/libfs.c
index 219576c52d80..ea9a6cc9b35c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/slab.h>
8#include <linux/mount.h> 9#include <linux/mount.h>
9#include <linux/vfs.h> 10#include <linux/vfs.h>
10#include <linux/mutex.h> 11#include <linux/mutex.h>
@@ -338,28 +339,14 @@ int simple_readpage(struct file *file, struct page *page)
338 return 0; 339 return 0;
339} 340}
340 341
341int simple_prepare_write(struct file *file, struct page *page,
342 unsigned from, unsigned to)
343{
344 if (!PageUptodate(page)) {
345 if (to - from != PAGE_CACHE_SIZE)
346 zero_user_segments(page,
347 0, from,
348 to, PAGE_CACHE_SIZE);
349 }
350 return 0;
351}
352
353int simple_write_begin(struct file *file, struct address_space *mapping, 342int simple_write_begin(struct file *file, struct address_space *mapping,
354 loff_t pos, unsigned len, unsigned flags, 343 loff_t pos, unsigned len, unsigned flags,
355 struct page **pagep, void **fsdata) 344 struct page **pagep, void **fsdata)
356{ 345{
357 struct page *page; 346 struct page *page;
358 pgoff_t index; 347 pgoff_t index;
359 unsigned from;
360 348
361 index = pos >> PAGE_CACHE_SHIFT; 349 index = pos >> PAGE_CACHE_SHIFT;
362 from = pos & (PAGE_CACHE_SIZE - 1);
363 350
364 page = grab_cache_page_write_begin(mapping, index, flags); 351 page = grab_cache_page_write_begin(mapping, index, flags);
365 if (!page) 352 if (!page)
@@ -367,43 +354,59 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
367 354
368 *pagep = page; 355 *pagep = page;
369 356
370 return simple_prepare_write(file, page, from, from+len); 357 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
371} 358 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
372 359
373static int simple_commit_write(struct file *file, struct page *page, 360 zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
374 unsigned from, unsigned to) 361 }
375{
376 struct inode *inode = page->mapping->host;
377 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
378
379 if (!PageUptodate(page))
380 SetPageUptodate(page);
381 /*
382 * No need to use i_size_read() here, the i_size
383 * cannot change under us because we hold the i_mutex.
384 */
385 if (pos > inode->i_size)
386 i_size_write(inode, pos);
387 set_page_dirty(page);
388 return 0; 362 return 0;
389} 363}
390 364
365/**
366 * simple_write_end - .write_end helper for non-block-device FSes
367 * @available: See .write_end of address_space_operations
368 * @file: "
369 * @mapping: "
370 * @pos: "
371 * @len: "
372 * @copied: "
373 * @page: "
374 * @fsdata: "
375 *
376 * simple_write_end does the minimum needed for updating a page after writing is
377 * done. It has the same API signature as the .write_end of
378 * address_space_operations vector. So it can just be set onto .write_end for
379 * FSes that don't need any other processing. i_mutex is assumed to be held.
380 * Block based filesystems should use generic_write_end().
381 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
382 * is not called, so a filesystem that actually does store data in .write_inode
383 * should extend on what's done here with a call to mark_inode_dirty() in the
384 * case that i_size has changed.
385 */
391int simple_write_end(struct file *file, struct address_space *mapping, 386int simple_write_end(struct file *file, struct address_space *mapping,
392 loff_t pos, unsigned len, unsigned copied, 387 loff_t pos, unsigned len, unsigned copied,
393 struct page *page, void *fsdata) 388 struct page *page, void *fsdata)
394{ 389{
395 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 390 struct inode *inode = page->mapping->host;
391 loff_t last_pos = pos + copied;
396 392
397 /* zero the stale part of the page if we did a short copy */ 393 /* zero the stale part of the page if we did a short copy */
398 if (copied < len) { 394 if (copied < len) {
399 void *kaddr = kmap_atomic(page, KM_USER0); 395 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
400 memset(kaddr + from + copied, 0, len - copied); 396
401 flush_dcache_page(page); 397 zero_user(page, from + copied, len - copied);
402 kunmap_atomic(kaddr, KM_USER0);
403 } 398 }
404 399
405 simple_commit_write(file, page, from, from+copied); 400 if (!PageUptodate(page))
401 SetPageUptodate(page);
402 /*
403 * No need to use i_size_read() here, the i_size
404 * cannot change under us because we hold the i_mutex.
405 */
406 if (last_pos > inode->i_size)
407 i_size_write(inode, last_pos);
406 408
409 set_page_dirty(page);
407 unlock_page(page); 410 unlock_page(page);
408 page_cache_release(page); 411 page_cache_release(page);
409 412
@@ -848,13 +851,11 @@ EXPORT_SYMBOL(simple_write_end);
848EXPORT_SYMBOL(simple_dir_inode_operations); 851EXPORT_SYMBOL(simple_dir_inode_operations);
849EXPORT_SYMBOL(simple_dir_operations); 852EXPORT_SYMBOL(simple_dir_operations);
850EXPORT_SYMBOL(simple_empty); 853EXPORT_SYMBOL(simple_empty);
851EXPORT_SYMBOL(d_alloc_name);
852EXPORT_SYMBOL(simple_fill_super); 854EXPORT_SYMBOL(simple_fill_super);
853EXPORT_SYMBOL(simple_getattr); 855EXPORT_SYMBOL(simple_getattr);
854EXPORT_SYMBOL(simple_link); 856EXPORT_SYMBOL(simple_link);
855EXPORT_SYMBOL(simple_lookup); 857EXPORT_SYMBOL(simple_lookup);
856EXPORT_SYMBOL(simple_pin_fs); 858EXPORT_SYMBOL(simple_pin_fs);
857EXPORT_UNUSED_SYMBOL(simple_prepare_write);
858EXPORT_SYMBOL(simple_readpage); 859EXPORT_SYMBOL(simple_readpage);
859EXPORT_SYMBOL(simple_release_fs); 860EXPORT_SYMBOL(simple_release_fs);
860EXPORT_SYMBOL(simple_rename); 861EXPORT_SYMBOL(simple_rename);
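After this cleanup, a small non-block-backed filesystem needs only the two surviving helpers for its write path; block-based filesystems should pair write_begin with generic_write_end() instead, per the kernel-doc above. A minimal sketch of the wiring (hypothetical aops name, ramfs-style):

	static const struct address_space_operations example_aops = {
		.readpage	= simple_readpage,
		.write_begin	= simple_write_begin,
		.write_end	= simple_write_end,
	};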
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index fc9032dc8862..64fd427c993c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/slab.h>
11#include <linux/time.h> 12#include <linux/time.h>
12#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
13#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249fef11f..7932c399fab4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/slab.h>
11#include <linux/types.h> 12#include <linux/types.h>
12#include <linux/errno.h> 13#include <linux/errno.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 4600c2037b8b..bb464d12104c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -479,8 +479,8 @@ again: mutex_lock(&nlm_host_mutex);
479 } 479 }
480 } 480 }
481 } 481 }
482
483 mutex_unlock(&nlm_host_mutex); 482 mutex_unlock(&nlm_host_mutex);
483 nsm_release(nsm);
484} 484}
485 485
486/* 486/*
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index f956651d0f65..e3015464fbab 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/ktime.h> 12#include <linux/ktime.h>
13#include <linux/slab.h>
13 14
14#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/xprtsock.h> 16#include <linux/sunrpc/xprtsock.h>
@@ -349,9 +350,9 @@ retry:
349 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle 350 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
350 * @info: pointer to NLMPROC_SM_NOTIFY arguments 351 * @info: pointer to NLMPROC_SM_NOTIFY arguments
351 * 352 *
352 * Returns a matching nsm_handle if found in the nsm cache; the returned 353 * Returns a matching nsm_handle if found in the nsm cache. The returned
353 * nsm_handle's reference count is bumped and sm_monitored is cleared. 354 * nsm_handle's reference count is bumped. Otherwise returns NULL if some
354 * Otherwise returns NULL if some error occurred. 355 * error occurred.
355 */ 356 */
356struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) 357struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
357{ 358{
@@ -370,12 +371,6 @@ struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
370 atomic_inc(&cached->sm_count); 371 atomic_inc(&cached->sm_count);
371 spin_unlock(&nsm_lock); 372 spin_unlock(&nsm_lock);
372 373
373 /*
374 * During subsequent lock activity, force a fresh
375 * notification to be set up for this host.
376 */
377 cached->sm_monitored = 0;
378
379 dprintk("lockd: host %s (%s) rebooted, cnt %d\n", 374 dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
380 cached->sm_name, cached->sm_addrbuf, 375 cached->sm_name, cached->sm_addrbuf,
381 atomic_read(&cached->sm_count)); 376 atomic_read(&cached->sm_count));
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 1a54ae14a192..f1bacf1a0391 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -21,7 +21,6 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
27#include <linux/mutex.h> 26#include <linux/mutex.h>
@@ -243,11 +242,9 @@ static int make_socks(struct svc_serv *serv)
243 if (err < 0) 242 if (err < 0)
244 goto out_err; 243 goto out_err;
245 244
246#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
247 err = create_lockd_family(serv, PF_INET6); 245 err = create_lockd_family(serv, PF_INET6);
248 if (err < 0 && err != -EAFNOSUPPORT) 246 if (err < 0 && err != -EAFNOSUPPORT)
249 goto out_err; 247 goto out_err;
250#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
251 248
252 warned = 0; 249 warned = 0;
253 return 0; 250 return 0;
@@ -371,82 +368,74 @@ EXPORT_SYMBOL_GPL(lockd_down);
371 368
372static ctl_table nlm_sysctls[] = { 369static ctl_table nlm_sysctls[] = {
373 { 370 {
374 .ctl_name = CTL_UNNUMBERED,
375 .procname = "nlm_grace_period", 371 .procname = "nlm_grace_period",
376 .data = &nlm_grace_period, 372 .data = &nlm_grace_period,
377 .maxlen = sizeof(unsigned long), 373 .maxlen = sizeof(unsigned long),
378 .mode = 0644, 374 .mode = 0644,
379 .proc_handler = &proc_doulongvec_minmax, 375 .proc_handler = proc_doulongvec_minmax,
380 .extra1 = (unsigned long *) &nlm_grace_period_min, 376 .extra1 = (unsigned long *) &nlm_grace_period_min,
381 .extra2 = (unsigned long *) &nlm_grace_period_max, 377 .extra2 = (unsigned long *) &nlm_grace_period_max,
382 }, 378 },
383 { 379 {
384 .ctl_name = CTL_UNNUMBERED,
385 .procname = "nlm_timeout", 380 .procname = "nlm_timeout",
386 .data = &nlm_timeout, 381 .data = &nlm_timeout,
387 .maxlen = sizeof(unsigned long), 382 .maxlen = sizeof(unsigned long),
388 .mode = 0644, 383 .mode = 0644,
389 .proc_handler = &proc_doulongvec_minmax, 384 .proc_handler = proc_doulongvec_minmax,
390 .extra1 = (unsigned long *) &nlm_timeout_min, 385 .extra1 = (unsigned long *) &nlm_timeout_min,
391 .extra2 = (unsigned long *) &nlm_timeout_max, 386 .extra2 = (unsigned long *) &nlm_timeout_max,
392 }, 387 },
393 { 388 {
394 .ctl_name = CTL_UNNUMBERED,
395 .procname = "nlm_udpport", 389 .procname = "nlm_udpport",
396 .data = &nlm_udpport, 390 .data = &nlm_udpport,
397 .maxlen = sizeof(int), 391 .maxlen = sizeof(int),
398 .mode = 0644, 392 .mode = 0644,
399 .proc_handler = &proc_dointvec_minmax, 393 .proc_handler = proc_dointvec_minmax,
400 .extra1 = (int *) &nlm_port_min, 394 .extra1 = (int *) &nlm_port_min,
401 .extra2 = (int *) &nlm_port_max, 395 .extra2 = (int *) &nlm_port_max,
402 }, 396 },
403 { 397 {
404 .ctl_name = CTL_UNNUMBERED,
405 .procname = "nlm_tcpport", 398 .procname = "nlm_tcpport",
406 .data = &nlm_tcpport, 399 .data = &nlm_tcpport,
407 .maxlen = sizeof(int), 400 .maxlen = sizeof(int),
408 .mode = 0644, 401 .mode = 0644,
409 .proc_handler = &proc_dointvec_minmax, 402 .proc_handler = proc_dointvec_minmax,
410 .extra1 = (int *) &nlm_port_min, 403 .extra1 = (int *) &nlm_port_min,
411 .extra2 = (int *) &nlm_port_max, 404 .extra2 = (int *) &nlm_port_max,
412 }, 405 },
413 { 406 {
414 .ctl_name = CTL_UNNUMBERED,
415 .procname = "nsm_use_hostnames", 407 .procname = "nsm_use_hostnames",
416 .data = &nsm_use_hostnames, 408 .data = &nsm_use_hostnames,
417 .maxlen = sizeof(int), 409 .maxlen = sizeof(int),
418 .mode = 0644, 410 .mode = 0644,
419 .proc_handler = &proc_dointvec, 411 .proc_handler = proc_dointvec,
420 }, 412 },
421 { 413 {
422 .ctl_name = CTL_UNNUMBERED,
423 .procname = "nsm_local_state", 414 .procname = "nsm_local_state",
424 .data = &nsm_local_state, 415 .data = &nsm_local_state,
425 .maxlen = sizeof(int), 416 .maxlen = sizeof(int),
426 .mode = 0644, 417 .mode = 0644,
427 .proc_handler = &proc_dointvec, 418 .proc_handler = proc_dointvec,
428 }, 419 },
429 { .ctl_name = 0 } 420 { }
430}; 421};
431 422
432static ctl_table nlm_sysctl_dir[] = { 423static ctl_table nlm_sysctl_dir[] = {
433 { 424 {
434 .ctl_name = CTL_UNNUMBERED,
435 .procname = "nfs", 425 .procname = "nfs",
436 .mode = 0555, 426 .mode = 0555,
437 .child = nlm_sysctls, 427 .child = nlm_sysctls,
438 }, 428 },
439 { .ctl_name = 0 } 429 { }
440}; 430};
441 431
442static ctl_table nlm_sysctl_root[] = { 432static ctl_table nlm_sysctl_root[] = {
443 { 433 {
444 .ctl_name = CTL_FS,
445 .procname = "fs", 434 .procname = "fs",
446 .mode = 0555, 435 .mode = 0555,
447 .child = nlm_sysctl_dir, 436 .child = nlm_sysctl_dir,
448 }, 437 },
449 { .ctl_name = 0 } 438 { }
450}; 439};
451 440
452#endif /* CONFIG_SYSCTL */ 441#endif /* CONFIG_SYSCTL */
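The sysctl hunk follows the tree-wide removal of binary sysctl numbers: every .ctl_name/CTL_UNNUMBERED field goes away, .proc_handler takes the function itself rather than its address, and the table terminator shrinks to an empty entry. A sketch of a post-conversion table, with hypothetical names:

	static int example_knob;

	static ctl_table example_sysctls[] = {
		{
			.procname	= "example_knob",
			.data		= &example_knob,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{ }	/* empty terminator; no .ctl_name sentinel */
	};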
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index bd173a6ca3b1..031c6569a134 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,12 +9,7 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/in.h>
15#include <linux/sunrpc/svc.h>
16#include <linux/sunrpc/clnt.h>
17#include <linux/nfsd/nfsd.h>
18#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
19#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
20 15
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d1001790fa9a..84055d31bfc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/slab.h>
24#include <linux/errno.h> 25#include <linux/errno.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index e1d28ddd2169..0f2ab741ae7c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,12 +9,7 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/in.h>
15#include <linux/sunrpc/svc.h>
16#include <linux/sunrpc/clnt.h>
17#include <linux/nfsd/nfsd.h>
18#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
19#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
20 15
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da7ca63..d0ef94cfb3da 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/in.h> 12#include <linux/in.h>
13#include <linux/slab.h>
13#include <linux/mutex.h> 14#include <linux/mutex.h>
14#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
diff --git a/fs/locks.c b/fs/locks.c
index a8794f233bc9..ab24d49fc048 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1182,8 +1182,9 @@ int __break_lease(struct inode *inode, unsigned int mode)
1182 struct file_lock *fl; 1182 struct file_lock *fl;
1183 unsigned long break_time; 1183 unsigned long break_time;
1184 int i_have_this_lease = 0; 1184 int i_have_this_lease = 0;
1185 int want_write = (mode & O_ACCMODE) != O_RDONLY;
1185 1186
1186 new_fl = lease_alloc(NULL, mode & FMODE_WRITE ? F_WRLCK : F_RDLCK); 1187 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
1187 1188
1188 lock_kernel(); 1189 lock_kernel();
1189 1190
@@ -1197,7 +1198,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
1197 if (fl->fl_owner == current->files) 1198 if (fl->fl_owner == current->files)
1198 i_have_this_lease = 1; 1199 i_have_this_lease = 1;
1199 1200
1200 if (mode & FMODE_WRITE) { 1201 if (want_write) {
1201 /* If we want write access, we have to revoke any lease. */ 1202 /* If we want write access, we have to revoke any lease. */
1202 future = F_UNLCK | F_INPROGRESS; 1203 future = F_UNLCK | F_INPROGRESS;
1203 } else if (flock->fl_type & F_INPROGRESS) { 1204 } else if (flock->fl_type & F_INPROGRESS) {
@@ -1454,7 +1455,7 @@ EXPORT_SYMBOL(generic_setlease);
1454 * leases held by processes on this node. 1455 * leases held by processes on this node.
1455 * 1456 *
1456 * There is also no break_lease method; filesystems that 1457 * There is also no break_lease method; filesystems that
1457 * handle their own leases shoud break leases themselves from the 1458 * handle their own leases should break leases themselves from the
1458 * filesystem's open, create, and (on truncate) setattr methods. 1459 * filesystem's open, create, and (on truncate) setattr methods.
1459 * 1460 *
1460 * Warning: the only current setlease methods exist only to disable 1461 * Warning: the only current setlease methods exist only to disable
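The locks.c fix turns on which namespace `mode` belongs to: __break_lease() receives O_* open flags, where O_WRONLY is 1 and O_RDWR is 2, while FMODE_WRITE also happens to be 2, so the old `mode & FMODE_WRITE` test caught O_RDWR but silently missed O_WRONLY opens. Deriving the intent from the access mode covers both, as the sketch below shows:

	/* O_RDONLY == 0, O_WRONLY == 1, O_RDWR == 2: any access mode other
	 * than O_RDONLY means the opener wants write access. */
	int want_write = (mode & O_ACCMODE) != O_RDONLY;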
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
new file mode 100644
index 000000000000..daf9a9b32dd3
--- /dev/null
+++ b/fs/logfs/Kconfig
@@ -0,0 +1,17 @@
1config LOGFS
2 tristate "LogFS file system (EXPERIMENTAL)"
3 depends on (MTD || BLOCK) && EXPERIMENTAL
4 select ZLIB_INFLATE
5 select ZLIB_DEFLATE
6 select CRC32
7 select BTREE
8 help
9 Flash filesystem aimed to scale efficiently to large devices.
10 In comparison to JFFS2 it offers significantly faster mount
11 times and potentially less RAM usage, although the latter has
12 not been measured yet.
13
14 In its current state it is still very experimental and should
15 not be used for other than testing purposes.
16
17 If unsure, say N.
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile
new file mode 100644
index 000000000000..4820027787ee
--- /dev/null
+++ b/fs/logfs/Makefile
@@ -0,0 +1,13 @@
1obj-$(CONFIG_LOGFS) += logfs.o
2
3logfs-y += compr.o
4logfs-y += dir.o
5logfs-y += file.o
6logfs-y += gc.o
7logfs-y += inode.o
8logfs-y += journal.o
9logfs-y += readwrite.o
10logfs-y += segment.o
11logfs-y += super.o
12logfs-$(CONFIG_BLOCK) += dev_bdev.o
13logfs-$(CONFIG_MTD) += dev_mtd.o
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
new file mode 100644
index 000000000000..44bbfd249abc
--- /dev/null
+++ b/fs/logfs/compr.c
@@ -0,0 +1,95 @@
1/*
2 * fs/logfs/compr.c - compression routines
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/vmalloc.h>
10#include <linux/zlib.h>
11
12#define COMPR_LEVEL 3
13
14static DEFINE_MUTEX(compr_mutex);
15static struct z_stream_s stream;
16
17int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
18{
19 int err, ret;
20
21 ret = -EIO;
22 mutex_lock(&compr_mutex);
23 err = zlib_deflateInit(&stream, COMPR_LEVEL);
24 if (err != Z_OK)
25 goto error;
26
27 stream.next_in = in;
28 stream.avail_in = inlen;
29 stream.total_in = 0;
30 stream.next_out = out;
31 stream.avail_out = outlen;
32 stream.total_out = 0;
33
34 err = zlib_deflate(&stream, Z_FINISH);
35 if (err != Z_STREAM_END)
36 goto error;
37
38 err = zlib_deflateEnd(&stream);
39 if (err != Z_OK)
40 goto error;
41
42 if (stream.total_out >= stream.total_in)
43 goto error;
44
45 ret = stream.total_out;
46error:
47 mutex_unlock(&compr_mutex);
48 return ret;
49}
50
51int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
52{
53 int err, ret;
54
55 ret = -EIO;
56 mutex_lock(&compr_mutex);
57 err = zlib_inflateInit(&stream);
58 if (err != Z_OK)
59 goto error;
60
61 stream.next_in = in;
62 stream.avail_in = inlen;
63 stream.total_in = 0;
64 stream.next_out = out;
65 stream.avail_out = outlen;
66 stream.total_out = 0;
67
68 err = zlib_inflate(&stream, Z_FINISH);
69 if (err != Z_STREAM_END)
70 goto error;
71
72 err = zlib_inflateEnd(&stream);
73 if (err != Z_OK)
74 goto error;
75
76 ret = 0;
77error:
78 mutex_unlock(&compr_mutex);
79 return ret;
80}
81
82int __init logfs_compr_init(void)
83{
84 size_t size = max(zlib_deflate_workspacesize(),
85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size);
87 if (!stream.workspace)
88 return -ENOMEM;
89 return 0;
90}
91
92void logfs_compr_exit(void)
93{
94 vfree(stream.workspace);
95}
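Note that logfs_compress() above deliberately reports failure (-EIO) when the output would not be smaller than the input, which gives callers a natural fallback to storing the block raw. A hypothetical usage sketch, with write_raw() and write_compressed() standing in for the real store paths:

	static int store_block(void *in, void *out)	/* two PAGE_SIZE buffers */
	{
		int clen = logfs_compress(in, out, PAGE_SIZE, PAGE_SIZE);

		if (clen < 0)
			return write_raw(in, PAGE_SIZE);	/* hypothetical */
		/* clen bytes of compressed data; the inverse is
		 * logfs_uncompress(out, in, clen, PAGE_SIZE) */
		return write_compressed(out, clen);		/* hypothetical */
	}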
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
new file mode 100644
index 000000000000..243c00071f76
--- /dev/null
+++ b/fs/logfs/dev_bdev.c
@@ -0,0 +1,333 @@
1/*
2 * fs/logfs/dev_bdev.c - Device access methods for block devices
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/bio.h>
10#include <linux/blkdev.h>
11#include <linux/buffer_head.h>
12#include <linux/gfp.h>
13
14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
15
16static void request_complete(struct bio *bio, int err)
17{
18 complete((struct completion *)bio->bi_private);
19}
20
21static int sync_request(struct page *page, struct block_device *bdev, int rw)
22{
23 struct bio bio;
24 struct bio_vec bio_vec;
25 struct completion complete;
26
27 bio_init(&bio);
28 bio.bi_io_vec = &bio_vec;
29 bio_vec.bv_page = page;
30 bio_vec.bv_len = PAGE_SIZE;
31 bio_vec.bv_offset = 0;
32 bio.bi_vcnt = 1;
33 bio.bi_idx = 0;
34 bio.bi_size = PAGE_SIZE;
35 bio.bi_bdev = bdev;
36 bio.bi_sector = page->index * (PAGE_SIZE >> 9);
37 init_completion(&complete);
38 bio.bi_private = &complete;
39 bio.bi_end_io = request_complete;
40
41 submit_bio(rw, &bio);
42 generic_unplug_device(bdev_get_queue(bdev));
43 wait_for_completion(&complete);
44 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
45}
46
47static int bdev_readpage(void *_sb, struct page *page)
48{
49 struct super_block *sb = _sb;
50 struct block_device *bdev = logfs_super(sb)->s_bdev;
51 int err;
52
53 err = sync_request(page, bdev, READ);
54 if (err) {
55 ClearPageUptodate(page);
56 SetPageError(page);
57 } else {
58 SetPageUptodate(page);
59 ClearPageError(page);
60 }
61 unlock_page(page);
62 return err;
63}
64
65static DECLARE_WAIT_QUEUE_HEAD(wq);
66
67static void writeseg_end_io(struct bio *bio, int err)
68{
69 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
70 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
71 struct super_block *sb = bio->bi_private;
72 struct logfs_super *super = logfs_super(sb);
73 struct page *page;
74
75 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
76 BUG_ON(err);
77 BUG_ON(bio->bi_vcnt == 0);
78 do {
79 page = bvec->bv_page;
80 if (--bvec >= bio->bi_io_vec)
81 prefetchw(&bvec->bv_page->flags);
82
83 end_page_writeback(page);
84 page_cache_release(page);
85 } while (bvec >= bio->bi_io_vec);
86 bio_put(bio);
87 if (atomic_dec_and_test(&super->s_pending_writes))
88 wake_up(&wq);
89}
90
91static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
92 size_t nr_pages)
93{
94 struct logfs_super *super = logfs_super(sb);
95 struct address_space *mapping = super->s_mapping_inode->i_mapping;
96 struct bio *bio;
97 struct page *page;
98 struct request_queue *q = bdev_get_queue(sb->s_bdev);
99 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
100 int i;
101
102 if (max_pages > BIO_MAX_PAGES)
103 max_pages = BIO_MAX_PAGES;
104 bio = bio_alloc(GFP_NOFS, max_pages);
105 BUG_ON(!bio);
106
107 for (i = 0; i < nr_pages; i++) {
108 if (i >= max_pages) {
109 /* Block layer cannot split bios :( */
110 bio->bi_vcnt = i;
111 bio->bi_idx = 0;
112 bio->bi_size = i * PAGE_SIZE;
113 bio->bi_bdev = super->s_bdev;
114 bio->bi_sector = ofs >> 9;
115 bio->bi_private = sb;
116 bio->bi_end_io = writeseg_end_io;
117 atomic_inc(&super->s_pending_writes);
118 submit_bio(WRITE, bio);
119
120 ofs += i * PAGE_SIZE;
121 index += i;
122 nr_pages -= i;
123 i = 0;
124
125 bio = bio_alloc(GFP_NOFS, max_pages);
126 BUG_ON(!bio);
127 }
128 page = find_lock_page(mapping, index + i);
129 BUG_ON(!page);
130 bio->bi_io_vec[i].bv_page = page;
131 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
132 bio->bi_io_vec[i].bv_offset = 0;
133
134 BUG_ON(PageWriteback(page));
135 set_page_writeback(page);
136 unlock_page(page);
137 }
138 bio->bi_vcnt = nr_pages;
139 bio->bi_idx = 0;
140 bio->bi_size = nr_pages * PAGE_SIZE;
141 bio->bi_bdev = super->s_bdev;
142 bio->bi_sector = ofs >> 9;
143 bio->bi_private = sb;
144 bio->bi_end_io = writeseg_end_io;
145 atomic_inc(&super->s_pending_writes);
146 submit_bio(WRITE, bio);
147 return 0;
148}
149
150static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
151{
152 struct logfs_super *super = logfs_super(sb);
153 int head;
154
155 BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
156
157 if (len == 0) {
158 /* This can happen when the object fits perfectly into a
159 * segment, the segment gets written per sync and subsequently
160 * closed.
161 */
162 return;
163 }
164 head = ofs & (PAGE_SIZE - 1);
165 if (head) {
166 ofs -= head;
167 len += head;
168 }
169 len = PAGE_ALIGN(len);
170 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
171 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
172}
173
174
175static void erase_end_io(struct bio *bio, int err)
176{
177 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
178 struct super_block *sb = bio->bi_private;
179 struct logfs_super *super = logfs_super(sb);
180
181 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
182 BUG_ON(err);
183 BUG_ON(bio->bi_vcnt == 0);
184 bio_put(bio);
185 if (atomic_dec_and_test(&super->s_pending_writes))
186 wake_up(&wq);
187}
188
189static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
190 size_t nr_pages)
191{
192 struct logfs_super *super = logfs_super(sb);
193 struct bio *bio;
194 struct request_queue *q = bdev_get_queue(sb->s_bdev);
195 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
196 int i;
197
198 if (max_pages > BIO_MAX_PAGES)
199 max_pages = BIO_MAX_PAGES;
200 bio = bio_alloc(GFP_NOFS, max_pages);
201 BUG_ON(!bio);
202
203 for (i = 0; i < nr_pages; i++) {
204 if (i >= max_pages) {
205 /* Block layer cannot split bios :( */
206 bio->bi_vcnt = i;
207 bio->bi_idx = 0;
208 bio->bi_size = i * PAGE_SIZE;
209 bio->bi_bdev = super->s_bdev;
210 bio->bi_sector = ofs >> 9;
211 bio->bi_private = sb;
212 bio->bi_end_io = erase_end_io;
213 atomic_inc(&super->s_pending_writes);
214 submit_bio(WRITE, bio);
215
216 ofs += i * PAGE_SIZE;
217 index += i;
218 nr_pages -= i;
219 i = 0;
220
221 bio = bio_alloc(GFP_NOFS, max_pages);
222 BUG_ON(!bio);
223 }
224 bio->bi_io_vec[i].bv_page = super->s_erase_page;
225 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
226 bio->bi_io_vec[i].bv_offset = 0;
227 }
228 bio->bi_vcnt = nr_pages;
229 bio->bi_idx = 0;
230 bio->bi_size = nr_pages * PAGE_SIZE;
231 bio->bi_bdev = super->s_bdev;
232 bio->bi_sector = ofs >> 9;
233 bio->bi_private = sb;
234 bio->bi_end_io = erase_end_io;
235 atomic_inc(&super->s_pending_writes);
236 submit_bio(WRITE, bio);
237 return 0;
238}
239
240static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
241 int ensure_write)
242{
243 struct logfs_super *super = logfs_super(sb);
244
245 BUG_ON(to & (PAGE_SIZE - 1));
246 BUG_ON(len & (PAGE_SIZE - 1));
247
248 if (super->s_flags & LOGFS_SB_FLAG_RO)
249 return -EROFS;
250
251 if (ensure_write) {
252 /*
253 * Object store doesn't care whether erases happen or not.
254 * But for the journal they are required. Otherwise a scan
255 * can find an old commit entry and assume it is the current
256 * one, travelling back in time.
257 */
258 do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
259 }
260
261 return 0;
262}
263
264static void bdev_sync(struct super_block *sb)
265{
266 struct logfs_super *super = logfs_super(sb);
267
268 wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
269}
270
271static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
272{
273 struct logfs_super *super = logfs_super(sb);
274 struct address_space *mapping = super->s_mapping_inode->i_mapping;
275 filler_t *filler = bdev_readpage;
276
277 *ofs = 0;
278 return read_cache_page(mapping, 0, filler, sb);
279}
280
281static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
282{
283 struct logfs_super *super = logfs_super(sb);
284 struct address_space *mapping = super->s_mapping_inode->i_mapping;
285 filler_t *filler = bdev_readpage;
286 u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
287 pgoff_t index = pos >> PAGE_SHIFT;
288
289 *ofs = pos;
290 return read_cache_page(mapping, index, filler, sb);
291}
292
293static int bdev_write_sb(struct super_block *sb, struct page *page)
294{
295 struct block_device *bdev = logfs_super(sb)->s_bdev;
296
297 /* Nothing special to do for block devices. */
298 return sync_request(page, bdev, WRITE);
299}
300
301static void bdev_put_device(struct super_block *sb)
302{
303 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
304}
305
306static const struct logfs_device_ops bd_devops = {
307 .find_first_sb = bdev_find_first_sb,
308 .find_last_sb = bdev_find_last_sb,
309 .write_sb = bdev_write_sb,
310 .readpage = bdev_readpage,
311 .writeseg = bdev_writeseg,
312 .erase = bdev_erase,
313 .sync = bdev_sync,
314 .put_device = bdev_put_device,
315};
316
317int logfs_get_sb_bdev(struct file_system_type *type, int flags,
318 const char *devname, struct vfsmount *mnt)
319{
320 struct block_device *bdev;
321
322 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
323 if (IS_ERR(bdev))
324 return PTR_ERR(bdev);
325
326 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
327 int mtdnr = MINOR(bdev->bd_dev);
328 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
329 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
330 }
331
332 return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
333}
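A note on the arithmetic used throughout dev_bdev.c: bio positions are counted in 512-byte sectors, so byte offsets and page indices convert with a shift by 9. Assuming 4 KiB pages, PAGE_SIZE >> 9 == 8 sectors per page:

	sector_t first = page->index * (PAGE_SIZE >> 9);  /* page 3 -> sector 24 */
	sector_t here  = ofs >> 9;                        /* byte 0x3000 -> sector 24 */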
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
new file mode 100644
index 000000000000..cafb6ef2e05b
--- /dev/null
+++ b/fs/logfs/dev_mtd.c
@@ -0,0 +1,254 @@
1/*
2 * fs/logfs/dev_mtd.c - Device access methods for MTD
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/completion.h>
10#include <linux/mount.h>
11#include <linux/sched.h>
12
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14
15static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
16{
17 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
18 size_t retlen;
19 int ret;
20
21 ret = mtd->read(mtd, ofs, len, &retlen, buf);
22 BUG_ON(ret == -EINVAL);
23 if (ret)
24 return ret;
25
26 /* Not sure if we should loop instead. */
27 if (retlen != len)
28 return -EIO;
29
30 return 0;
31}
32
33static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
34{
35 struct logfs_super *super = logfs_super(sb);
36 struct mtd_info *mtd = super->s_mtd;
37 size_t retlen;
38 loff_t page_start, page_end;
39 int ret;
40
41 if (super->s_flags & LOGFS_SB_FLAG_RO)
42 return -EROFS;
43
44 BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
45 BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
46 BUG_ON(len > PAGE_CACHE_SIZE);
47 page_start = ofs & PAGE_CACHE_MASK;
48 page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
49 ret = mtd->write(mtd, ofs, len, &retlen, buf);
50 if (ret || (retlen != len))
51 return -EIO;
52
53 return 0;
54}
55
56/*
57 * For as long as I can remember (since about 2001) mtd->erase has been an
58 * asynchronous interface lacking the first driver to actually use the
59 * asynchronous properties. So just to prevent the first implementor of such
60 * a thing from breaking logfs in 2350, we do the usual pointless dance to
61 * declare a completion variable and wait for completion before returning
62 * from mtd_erase(). What an exercise in futility!
63 */
64static void logfs_erase_callback(struct erase_info *ei)
65{
66 complete((struct completion *)ei->priv);
67}
68
69static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
70{
71 struct logfs_super *super = logfs_super(sb);
72 struct address_space *mapping = super->s_mapping_inode->i_mapping;
73 struct page *page;
74 pgoff_t index = ofs >> PAGE_SHIFT;
75
76 for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
77 page = find_get_page(mapping, index);
78 if (!page)
79 continue;
80 memset(page_address(page), 0xFF, PAGE_SIZE);
81 page_cache_release(page);
82 }
83 return 0;
84}
85
86static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
87 int ensure_write)
88{
89 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
90 struct erase_info ei;
91 DECLARE_COMPLETION_ONSTACK(complete);
92 int ret;
93
94 BUG_ON(len % mtd->erasesize);
95 if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
96 return -EROFS;
97
98 memset(&ei, 0, sizeof(ei));
99 ei.mtd = mtd;
100 ei.addr = ofs;
101 ei.len = len;
102 ei.callback = logfs_erase_callback;
103 ei.priv = (long)&complete;
104 ret = mtd->erase(mtd, &ei);
105 if (ret)
106 return -EIO;
107
108 wait_for_completion(&complete);
109 if (ei.state != MTD_ERASE_DONE)
110 return -EIO;
111 return mtd_erase_mapping(sb, ofs, len);
112}
113
114static void mtd_sync(struct super_block *sb)
115{
116 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
117
118 if (mtd->sync)
119 mtd->sync(mtd);
120}
121
122static int mtd_readpage(void *_sb, struct page *page)
123{
124 struct super_block *sb = _sb;
125 int err;
126
127 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
128 page_address(page));
129 if (err == -EUCLEAN) {
130 err = 0;
131 /* FIXME: force GC this segment */
132 }
133 if (err) {
134 ClearPageUptodate(page);
135 SetPageError(page);
136 } else {
137 SetPageUptodate(page);
138 ClearPageError(page);
139 }
140 unlock_page(page);
141 return err;
142}
143
144static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
145{
146 struct logfs_super *super = logfs_super(sb);
147 struct address_space *mapping = super->s_mapping_inode->i_mapping;
148 filler_t *filler = mtd_readpage;
149 struct mtd_info *mtd = super->s_mtd;
150
151 if (!mtd->block_isbad)
152 return NULL;
153
154 *ofs = 0;
155 while (mtd->block_isbad(mtd, *ofs)) {
156 *ofs += mtd->erasesize;
157 if (*ofs >= mtd->size)
158 return NULL;
159 }
160 BUG_ON(*ofs & ~PAGE_MASK);
161 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
162}
163
164static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
165{
166 struct logfs_super *super = logfs_super(sb);
167 struct address_space *mapping = super->s_mapping_inode->i_mapping;
168 filler_t *filler = mtd_readpage;
169 struct mtd_info *mtd = super->s_mtd;
170
171 if (!mtd->block_isbad)
172 return NULL;
173
174 *ofs = mtd->size - mtd->erasesize;
175 while (mtd->block_isbad(mtd, *ofs)) {
176 *ofs -= mtd->erasesize;
177 if (*ofs <= 0)
178 return NULL;
179 }
180 *ofs = *ofs + mtd->erasesize - 0x1000;
181 BUG_ON(*ofs & ~PAGE_MASK);
182 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
183}
184
185static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
186 size_t nr_pages)
187{
188 struct logfs_super *super = logfs_super(sb);
189 struct address_space *mapping = super->s_mapping_inode->i_mapping;
190 struct page *page;
191 int i, err;
192
193 for (i = 0; i < nr_pages; i++) {
194 page = find_lock_page(mapping, index + i);
195 BUG_ON(!page);
196
197 err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
198 page_address(page));
199 unlock_page(page);
200 page_cache_release(page);
201 if (err)
202 return err;
203 }
204 return 0;
205}
206
207static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
208{
209 struct logfs_super *super = logfs_super(sb);
210 int head;
211
212 if (super->s_flags & LOGFS_SB_FLAG_RO)
213 return;
214
215 if (len == 0) {
216 /* This can happen when the object fits perfectly into a
217 * segment, the segment gets written per sync and subsequently
218 * closed.
219 */
220 return;
221 }
222 head = ofs & (PAGE_SIZE - 1);
223 if (head) {
224 ofs -= head;
225 len += head;
226 }
227 len = PAGE_ALIGN(len);
228 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
229}
230
231static void mtd_put_device(struct super_block *sb)
232{
233 put_mtd_device(logfs_super(sb)->s_mtd);
234}
235
236static const struct logfs_device_ops mtd_devops = {
237 .find_first_sb = mtd_find_first_sb,
238 .find_last_sb = mtd_find_last_sb,
239 .readpage = mtd_readpage,
240 .writeseg = mtd_writeseg,
241 .erase = mtd_erase,
242 .sync = mtd_sync,
243 .put_device = mtd_put_device,
244};
245
246int logfs_get_sb_mtd(struct file_system_type *type, int flags,
247 int mtdnr, struct vfsmount *mnt)
248{
249 struct mtd_info *mtd;
250 const struct logfs_device_ops *devops = &mtd_devops;
251
252 mtd = get_mtd_device(NULL, mtdnr);
253 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
254}
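The completion dance mtd_erase() performs generalizes to any asynchronous callback interface: park an on-stack completion in the request's private field and wait on it. A generic sketch, where struct some_async_op and start_async_op() are hypothetical:

	struct some_async_op {				/* hypothetical request type */
		void (*callback)(struct some_async_op *);
		unsigned long priv;
	};

	static void op_done(struct some_async_op *op)
	{
		complete((struct completion *)op->priv);
	}

	static int do_op_sync(struct some_async_op *op)
	{
		DECLARE_COMPLETION_ONSTACK(done);

		op->callback = op_done;
		op->priv = (unsigned long)&done;
		if (start_async_op(op))			/* hypothetical submit */
			return -EIO;
		wait_for_completion(&done);		/* block until callback fires */
		return 0;
	}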
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
new file mode 100644
index 000000000000..2396a85c0f55
--- /dev/null
+++ b/fs/logfs/dir.c
@@ -0,0 +1,827 @@
1/*
2 * fs/logfs/dir.c - directory-related code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10
11/*
12 * Atomic dir operations
13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are
14 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling.
17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do
19 * the work: __logfs_create. This function works in two atomic steps:
20 * 1. allocate inode (remember in journal)
21 * 2. allocate dentry (clear journal)
22 *
23 * We can only get interrupted between the two steps; until step 2
24 * completes, the inode we just created is remembered in the anchor.
25 * On next mount, if we were interrupted, we delete the inode. From a
26 * user's point of view the operation never happened.
27 *
28 * Unlink and rmdir also share the same function: unlink. Again, this
29 * function works in two atomic steps
30 * 1. remove dentry (remember inode in journal)
31 * 2. unlink inode (clear journal)
32 *
33 * And again, on the next mount, if we were interrupted, we delete the inode.
35 * From a user's point of view the operation succeeded.
35 *
36 * Rename is the real pain to deal with, harder than all the other methods
37 * combined. Depending on the circumstances we can run into three cases.
38 * A "target rename" where the target dentry already existed, a "local
39 * rename" where both parent directories are identical or a "cross-directory
40 * rename" in the remaining case.
41 *
42 * Local rename is atomic, as the old dentry is simply rewritten with a new
43 * name.
44 *
45 * Cross-directory rename works in two steps, similar to __logfs_create and
46 * logfs_unlink:
47 * 1. Write new dentry (remember old dentry in journal)
48 * 2. Remove old dentry (clear journal)
49 *
50 * Here we remember a dentry instead of an inode. On next mount, if we were
51 * interrupted, we delete the dentry. From a user's point of view, the
52 * operation succeeded.
53 *
54 * Target rename works in three atomic steps:
55 * 1. Attach old inode to new dentry (remember old dentry and new inode)
56 * 2. Remove old dentry (still remember the new inode)
57 * 3. Remove victim inode
58 *
59 * Here we remember both an inode and a dentry. If we get interrupted
60 * between steps 1 and 2, we delete both the dentry and the inode. If
61 * we get interrupted between steps 2 and 3, we delete just the inode.
62 * In either case, the remaining objects are deleted on next mount. From
63 * a user's point of view, the operation succeeded.
64 */
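The crash-recovery rules spelled out above condense into a small decision table. Here is a self-contained sketch of that table, not part of the patch; the real recovery path is logfs_replay_journal() later in this file, which keys off the s_victim_ino and s_rename_dir/s_rename_pos fields saved in the journal rather than an explicit state enum, so all names below are illustrative:

#include <stdio.h>

enum crash_point {
	AFTER_CREATE_1,		/* inode written, dentry not yet */
	AFTER_UNLINK_1,		/* dentry gone, inode still present */
	AFTER_CROSS_RENAME_1,	/* new dentry written, old one remains */
	AFTER_TARGET_RENAME_1,	/* old dentry and victim inode remain */
	AFTER_TARGET_RENAME_2,	/* only the victim inode remains */
};

static const char *replay_action(enum crash_point cp)
{
	switch (cp) {
	case AFTER_CREATE_1:
		return "delete inode (create never happened)";
	case AFTER_UNLINK_1:
		return "delete inode (unlink succeeded)";
	case AFTER_CROSS_RENAME_1:
		return "delete old dentry (rename succeeded)";
	case AFTER_TARGET_RENAME_1:
		return "delete old dentry, then victim inode";
	case AFTER_TARGET_RENAME_2:
		return "delete victim inode";
	}
	return "nothing to do";
}

int main(void)
{
	printf("%s\n", replay_action(AFTER_CREATE_1));
	return 0;
}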
65
66static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
67 loff_t pos)
68{
69 return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
70}
71
72static int write_inode(struct inode *inode)
73{
74 return __logfs_write_inode(inode, WF_LOCK);
75}
76
77static s64 dir_seek_data(struct inode *inode, s64 pos)
78{
79 s64 new_pos = logfs_seek_data(inode, pos);
80
81 return max(pos, new_pos - 1);
82}
83
84static int beyond_eof(struct inode *inode, loff_t bix)
85{
86 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
87 return pos >= i_size_read(inode);
88}
89
90/*
91 * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11,
92 * so short names (len <= 9) don't even occupy the complete 32bit name
93 * space. A prime >256 ensures short names quickly spread the 32bit
94 * name space. Add about 26 for the estimated amount of information
95 * of each character and pick a prime nearby, preferably a bit-sparse
96 * one.
97 */
98static u32 hash_32(const char *s, int len, u32 seed)
99{
100 u32 hash = seed;
101 int i;
102
103 for (i = 0; i < len; i++)
104 hash = hash * 293 + s[i];
105 return hash;
106}
107
108/*
109 * We have to satisfy several conflicting requirements here. Small
110 * directories should stay fairly compact and not require too many
111 * indirect blocks. The number of possible locations for a given hash
112 * should be small to make lookup() fast. And we should try hard not
113 * to overflow the 32bit name space or nfs and 32bit host systems will
114 * be unhappy.
115 *
116 * So we use the following scheme. First we reduce the hash to 0..15
117 * and try a direct block. If that is occupied we reduce the hash to
118 * 16..255 and try an indirect block. Same for 2x and 3x indirect
119 * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
120 * but use buckets containing sixteen entries instead of a single one.
121 *
122 * Using 16 entries should allow for a reasonable amount of hash
123 * collisions, so the 32bit name space can be packed fairly tight
124 * before overflowing. Oh and currently we don't overflow but return
125 * an error.
126 *
127 * How likely are collisions? Doing the appropriate math is beyond me
128 * and the Bronstein textbook. But running a test program to brute
129 * force collisions for a couple of days showed that on average the
130 * first collision occurs after 598M entries, with 290M being the
131 * smallest result. Obviously 21 entries could already cause a
132 * collision if all entries are carefully chosen.
133 */
134static pgoff_t hash_index(u32 hash, int round)
135{
136 u32 i0_blocks = I0_BLOCKS;
137 u32 i1_blocks = I1_BLOCKS;
138 u32 i2_blocks = I2_BLOCKS;
139 u32 i3_blocks = I3_BLOCKS;
140
141 switch (round) {
142 case 0:
143 return hash % i0_blocks;
144 case 1:
145 return i0_blocks + hash % (i1_blocks - i0_blocks);
146 case 2:
147 return i1_blocks + hash % (i2_blocks - i1_blocks);
148 case 3:
149 return i2_blocks + hash % (i3_blocks - i2_blocks);
150 case 4 ... 19:
151 return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16))
152 + round - 4;
153 }
154 BUG();
155}
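To make the probe order above concrete, here is a self-contained userspace sketch of the lookup sequence, not part of the patch. The I* block counts are illustrative stand-ins; the real I0_BLOCKS..I3_BLOCKS come from logfs.h and differ:

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins for logfs.h's I*_BLOCKS (real values differ). */
#define I0 16u
#define I1 256u
#define I2 4096u
#define I3 65536u

static uint32_t ex_hash_32(const char *s, int len, uint32_t seed)
{
	uint32_t hash = seed;
	int i;

	for (i = 0; i < len; i++)
		hash = hash * 293 + s[i];
	return hash;
}

/* Same shape as hash_index(): one candidate block per probe round. */
static uint64_t ex_hash_index(uint32_t hash, int round)
{
	switch (round) {
	case 0:
		return hash % I0;
	case 1:
		return I0 + hash % (I1 - I0);
	case 2:
		return I1 + hash % (I2 - I1);
	case 3:
		return I2 + hash % (I3 - I2);
	default:	/* rounds 4..19 walk one 16-entry bucket */
		return I3 + 16 * (hash % (((1u << 31) - I3) / 16))
			+ round - 4;
	}
}

int main(void)
{
	uint32_t hash = ex_hash_32("example", 7, 0);
	int round;

	for (round = 0; round < 20; round++)
		printf("round %2d -> block %llu\n", round,
		       (unsigned long long)ex_hash_index(hash, round));
	return 0;
}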
156
157static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
158{
159 struct qstr *name = &dentry->d_name;
160 struct page *page;
161 struct logfs_disk_dentry *dd;
162 u32 hash = hash_32(name->name, name->len, 0);
163 pgoff_t index;
164 int round;
165
166 if (name->len > LOGFS_MAX_NAMELEN)
167 return ERR_PTR(-ENAMETOOLONG);
168
169 for (round = 0; round < 20; round++) {
170 index = hash_index(hash, round);
171
172 if (beyond_eof(dir, index))
173 return NULL;
174 if (!logfs_exist_block(dir, index))
175 continue;
176 page = read_cache_page(dir->i_mapping, index,
177 (filler_t *)logfs_readpage, NULL);
178 if (IS_ERR(page))
179 return page;
180 dd = kmap_atomic(page, KM_USER0);
181 BUG_ON(dd->namelen == 0);
182
183 if (name->len != be16_to_cpu(dd->namelen) ||
184 memcmp(name->name, dd->name, name->len)) {
185 kunmap_atomic(dd, KM_USER0);
186 page_cache_release(page);
187 continue;
188 }
189
190 kunmap_atomic(dd, KM_USER0);
191 return page;
192 }
193 return NULL;
194}
195
196static int logfs_remove_inode(struct inode *inode)
197{
198 int ret;
199
200 inode->i_nlink--;
201 ret = write_inode(inode);
202 LOGFS_BUG_ON(ret, inode->i_sb);
203 return ret;
204}
205
206static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
207{
208 if (logfs_inode(inode)->li_block)
209 logfs_inode(inode)->li_block->ta = NULL;
210 kfree(ta);
211}
212
213static int logfs_unlink(struct inode *dir, struct dentry *dentry)
214{
215 struct logfs_super *super = logfs_super(dir->i_sb);
216 struct inode *inode = dentry->d_inode;
217 struct logfs_transaction *ta;
218 struct page *page;
219 pgoff_t index;
220 int ret;
221
222 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
223 if (!ta)
224 return -ENOMEM;
225
226 ta->state = UNLINK_1;
227 ta->ino = inode->i_ino;
228
229 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
230
231 page = logfs_get_dd_page(dir, dentry);
232 if (!page) {
233 kfree(ta);
234 return -ENOENT;
235 }
236 if (IS_ERR(page)) {
237 kfree(ta);
238 return PTR_ERR(page);
239 }
240 index = page->index;
241 page_cache_release(page);
242
243 mutex_lock(&super->s_dirop_mutex);
244 logfs_add_transaction(dir, ta);
245
246 ret = logfs_delete(dir, index, NULL);
247 if (!ret)
248 ret = write_inode(dir);
249
250 if (ret) {
251 abort_transaction(dir, ta);
252 printk(KERN_ERR"LOGFS: unable to delete inode\n");
253 goto out;
254 }
255
256 ta->state = UNLINK_2;
257 logfs_add_transaction(inode, ta);
258 ret = logfs_remove_inode(inode);
259out:
260 mutex_unlock(&super->s_dirop_mutex);
261 return ret;
262}
263
264static inline int logfs_empty_dir(struct inode *dir)
265{
266 u64 data;
267
268 data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
269 return data >= i_size_read(dir);
270}
271
272static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{
274 struct inode *inode = dentry->d_inode;
275
276 if (!logfs_empty_dir(inode))
277 return -ENOTEMPTY;
278
279 return logfs_unlink(dir, dentry);
280}
281
282/* FIXME: readdir currently has its own dir_walk code. I don't see a good
283 * way to combine the two copies */
284#define IMPLICIT_NODES 2
285static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
286{
287 struct inode *dir = file->f_dentry->d_inode;
288 loff_t pos = file->f_pos - IMPLICIT_NODES;
289 struct page *page;
290 struct logfs_disk_dentry *dd;
291 int full;
292
293 BUG_ON(pos < 0);
294 for (;; pos++) {
295 if (beyond_eof(dir, pos))
296 break;
297 if (!logfs_exist_block(dir, pos)) {
298 /* deleted dentry */
299 pos = dir_seek_data(dir, pos);
300 continue;
301 }
302 page = read_cache_page(dir->i_mapping, pos,
303 (filler_t *)logfs_readpage, NULL);
304 if (IS_ERR(page))
305 return PTR_ERR(page);
306 dd = kmap(page);
307 BUG_ON(dd->namelen == 0);
308
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
310 pos, be64_to_cpu(dd->ino), dd->type);
311 kunmap(page);
312 page_cache_release(page);
313 if (full)
314 break;
315 }
316
317 file->f_pos = pos + IMPLICIT_NODES;
318 return 0;
319}
320
321static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
322{
323 struct inode *inode = file->f_dentry->d_inode;
324 ino_t pino = parent_ino(file->f_dentry);
325 int err;
326
327 if (file->f_pos < 0)
328 return -EINVAL;
329
330 if (file->f_pos == 0) {
331 if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
332 return 0;
333 file->f_pos++;
334 }
335 if (file->f_pos == 1) {
336 if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
337 return 0;
338 file->f_pos++;
339 }
340
341 err = __logfs_readdir(file, buf, filldir);
342 return err;
343}
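The position bookkeeping above is subtle: f_pos 0 and 1 are synthesized entries, and everything else is shifted by IMPLICIT_NODES. A minimal sketch of the mapping, not part of the patch; fpos_to_block() is hypothetical, written only to show the convention:

/* Sketch of the f_pos convention used by logfs_readdir() above:
 *
 *   f_pos == 0  ->  "."  (synthesized)
 *   f_pos == 1  ->  ".." (synthesized)
 *   f_pos >= 2  ->  dentry block number f_pos - IMPLICIT_NODES
 */
static inline long long fpos_to_block(long long f_pos)
{
	return f_pos - 2;	/* IMPLICIT_NODES; < 0 means synthetic entry */
}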
344
345static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
346{
347 dd->namelen = cpu_to_be16(name->len);
348 memcpy(dd->name, name->name, name->len);
349}
350
351static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
352 struct nameidata *nd)
353{
354 struct page *page;
355 struct logfs_disk_dentry *dd;
356 pgoff_t index;
357 u64 ino = 0;
358 struct inode *inode;
359
360 page = logfs_get_dd_page(dir, dentry);
361 if (IS_ERR(page))
362 return ERR_CAST(page);
363 if (!page) {
364 d_add(dentry, NULL);
365 return NULL;
366 }
367 index = page->index;
368 dd = kmap_atomic(page, KM_USER0);
369 ino = be64_to_cpu(dd->ino);
370 kunmap_atomic(dd, KM_USER0);
371 page_cache_release(page);
372
373 inode = logfs_iget(dir->i_sb, ino);
374 if (IS_ERR(inode)) {
375 printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)\n",
376 ino, dir->i_ino, index);
377 return ERR_CAST(inode);
378 }
379 return d_splice_alias(inode, dentry);
380}
381
382static void grow_dir(struct inode *dir, loff_t index)
383{
384 index = (index + 1) << dir->i_sb->s_blocksize_bits;
385 if (i_size_read(dir) < index)
386 i_size_write(dir, index);
387}
388
389static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
390 struct inode *inode)
391{
392 struct page *page;
393 struct logfs_disk_dentry *dd;
394 u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
395 pgoff_t index;
396 int round, err;
397
398 for (round = 0; round < 20; round++) {
399 index = hash_index(hash, round);
400
401 if (logfs_exist_block(dir, index))
402 continue;
403 page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
404 if (!page)
405 return -ENOMEM;
406
407 dd = kmap_atomic(page, KM_USER0);
408 memset(dd, 0, sizeof(*dd));
409 dd->ino = cpu_to_be64(inode->i_ino);
410 dd->type = logfs_type(inode);
411 logfs_set_name(dd, &dentry->d_name);
412 kunmap_atomic(dd, KM_USER0);
413
414 err = logfs_write_buf(dir, page, WF_LOCK);
415 unlock_page(page);
416 page_cache_release(page);
417 if (!err)
418 grow_dir(dir, index);
419 return err;
420 }
421 /* FIXME: Is there a better return value? In most cases neither
422 * the filesystem nor the directory are full. But we have had
423 * too many collisions for this particular hash and no fallback.
424 */
425 return -ENOSPC;
426}
427
428static int __logfs_create(struct inode *dir, struct dentry *dentry,
429 struct inode *inode, const char *dest, long destlen)
430{
431 struct logfs_super *super = logfs_super(dir->i_sb);
432 struct logfs_inode *li = logfs_inode(inode);
433 struct logfs_transaction *ta;
434 int ret;
435
436 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
437 if (!ta)
438 return -ENOMEM;
439
440 ta->state = CREATE_1;
441 ta->ino = inode->i_ino;
442 mutex_lock(&super->s_dirop_mutex);
443 logfs_add_transaction(inode, ta);
444
445 if (dest) {
446 /* symlink */
447 ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
448 if (!ret)
449 ret = write_inode(inode);
450 } else {
451 /* creat/mkdir/mknod */
452 ret = write_inode(inode);
453 }
454 if (ret) {
455 abort_transaction(inode, ta);
456 li->li_flags |= LOGFS_IF_STILLBORN;
457 /* FIXME: truncate symlink */
458 inode->i_nlink--;
459 iput(inode);
460 goto out;
461 }
462
463 ta->state = CREATE_2;
464 logfs_add_transaction(dir, ta);
465 ret = logfs_write_dir(dir, dentry, inode);
466 /* sync directory */
467 if (!ret)
468 ret = write_inode(dir);
469
470 if (ret) {
471 logfs_del_transaction(dir, ta);
472 ta->state = CREATE_2;
473 logfs_add_transaction(inode, ta);
474 logfs_remove_inode(inode);
475 iput(inode);
476 goto out;
477 }
478 d_instantiate(dentry, inode);
479out:
480 mutex_unlock(&super->s_dirop_mutex);
481 return ret;
482}
483
484static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
485{
486 struct inode *inode;
487
488 /*
489 * FIXME: why do we have to fill in S_IFDIR, while the mode is
490 * correct for mknod, creat, etc.? Smells like the vfs *should*
491 * do it for us but for some reason fails to do so.
492 */
493 inode = logfs_new_inode(dir, S_IFDIR | mode);
494 if (IS_ERR(inode))
495 return PTR_ERR(inode);
496
497 inode->i_op = &logfs_dir_iops;
498 inode->i_fop = &logfs_dir_fops;
499
500 return __logfs_create(dir, dentry, inode, NULL, 0);
501}
502
503static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
504 struct nameidata *nd)
505{
506 struct inode *inode;
507
508 inode = logfs_new_inode(dir, mode);
509 if (IS_ERR(inode))
510 return PTR_ERR(inode);
511
512 inode->i_op = &logfs_reg_iops;
513 inode->i_fop = &logfs_reg_fops;
514 inode->i_mapping->a_ops = &logfs_reg_aops;
515
516 return __logfs_create(dir, dentry, inode, NULL, 0);
517}
518
519static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
520 dev_t rdev)
521{
522 struct inode *inode;
523
524 if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
525 return -ENAMETOOLONG;
526
527 inode = logfs_new_inode(dir, mode);
528 if (IS_ERR(inode))
529 return PTR_ERR(inode);
530
531 init_special_inode(inode, mode, rdev);
532
533 return __logfs_create(dir, dentry, inode, NULL, 0);
534}
535
536static int logfs_symlink(struct inode *dir, struct dentry *dentry,
537 const char *target)
538{
539 struct inode *inode;
540 size_t destlen = strlen(target) + 1;
541
542 if (destlen > dir->i_sb->s_blocksize)
543 return -ENAMETOOLONG;
544
545 inode = logfs_new_inode(dir, S_IFLNK | 0777);
546 if (IS_ERR(inode))
547 return PTR_ERR(inode);
548
549 inode->i_op = &logfs_symlink_iops;
550 inode->i_mapping->a_ops = &logfs_reg_aops;
551
552 return __logfs_create(dir, dentry, inode, target, destlen);
553}
554
555static int logfs_permission(struct inode *inode, int mask)
556{
557 return generic_permission(inode, mask, NULL);
558}
559
560static int logfs_link(struct dentry *old_dentry, struct inode *dir,
561 struct dentry *dentry)
562{
563 struct inode *inode = old_dentry->d_inode;
564
565 if (inode->i_nlink >= LOGFS_LINK_MAX)
566 return -EMLINK;
567
568 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
569 atomic_inc(&inode->i_count);
570 inode->i_nlink++;
571 mark_inode_dirty_sync(inode);
572
573 return __logfs_create(dir, dentry, inode, NULL, 0);
574}
575
576static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
577 struct logfs_disk_dentry *dd, loff_t *pos)
578{
579 struct page *page;
580 void *map;
581
582 page = logfs_get_dd_page(dir, dentry);
583 if (IS_ERR(page))
584 return PTR_ERR(page);
585 *pos = page->index;
586 map = kmap_atomic(page, KM_USER0);
587 memcpy(dd, map, sizeof(*dd));
588 kunmap_atomic(map, KM_USER0);
589 page_cache_release(page);
590 return 0;
591}
592
593static int logfs_delete_dd(struct inode *dir, loff_t pos)
594{
595 /*
596 * Getting called with pos somewhere beyond eof is either a goofup
597 * within this file or means someone maliciously edited the
598 * (crc-protected) journal.
599 */
600 BUG_ON(beyond_eof(dir, pos));
601 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
602 log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
603 return logfs_delete(dir, pos, NULL);
604}
605
606/*
607 * Cross-directory rename, target does not exist. Just a little nasty.
608 * Create a new dentry in the target dir, then remove the old dentry,
609 * all the while taking care to remember our operation in the journal.
610 */
611static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
612 struct inode *new_dir, struct dentry *new_dentry)
613{
614 struct logfs_super *super = logfs_super(old_dir->i_sb);
615 struct logfs_disk_dentry dd;
616 struct logfs_transaction *ta;
617 loff_t pos;
618 int err;
619
620 /* 1. locate source dd */
621 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
622 if (err)
623 return err;
624
625 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
626 if (!ta)
627 return -ENOMEM;
628
629 ta->state = CROSS_RENAME_1;
630 ta->dir = old_dir->i_ino;
631 ta->pos = pos;
632
633 /* 2. write target dd */
634 mutex_lock(&super->s_dirop_mutex);
635 logfs_add_transaction(new_dir, ta);
636 err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode);
637 if (!err)
638 err = write_inode(new_dir);
639
640 if (err) {
641 super->s_rename_dir = 0;
642 super->s_rename_pos = 0;
643 abort_transaction(new_dir, ta);
644 goto out;
645 }
646
647 /* 3. remove source dd */
648 ta->state = CROSS_RENAME_2;
649 logfs_add_transaction(old_dir, ta);
650 err = logfs_delete_dd(old_dir, pos);
651 if (!err)
652 err = write_inode(old_dir);
653 LOGFS_BUG_ON(err, old_dir->i_sb);
654out:
655 mutex_unlock(&super->s_dirop_mutex);
656 return err;
657}
658
659static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
660 struct logfs_disk_dentry *dd, struct inode *inode)
661{
662 loff_t pos;
663 int err;
664
665 err = logfs_get_dd(dir, dentry, dd, &pos);
666 if (err)
667 return err;
668 dd->ino = cpu_to_be64(inode->i_ino);
669 dd->type = logfs_type(inode);
670
671 err = write_dir(dir, dd, pos);
672 if (err)
673 return err;
674 log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
675 dd->name, be64_to_cpu(dd->ino));
676 return write_inode(dir);
677}
678
679/* Target dentry exists - the worst case. We need to attach the source
680 * inode to the target dentry, then remove the orphaned target inode and
681 * source dentry.
682 */
683static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
684 struct inode *new_dir, struct dentry *new_dentry)
685{
686 struct logfs_super *super = logfs_super(old_dir->i_sb);
687 struct inode *old_inode = old_dentry->d_inode;
688 struct inode *new_inode = new_dentry->d_inode;
689 int isdir = S_ISDIR(old_inode->i_mode);
690 struct logfs_disk_dentry dd;
691 struct logfs_transaction *ta;
692 loff_t pos;
693 int err;
694
695 BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
696 if (isdir) {
697 if (!logfs_empty_dir(new_inode))
698 return -ENOTEMPTY;
699 }
700
701 /* 1. locate source dd */
702 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
703 if (err)
704 return err;
705
706 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
707 if (!ta)
708 return -ENOMEM;
709
710 ta->state = TARGET_RENAME_1;
711 ta->dir = old_dir->i_ino;
712 ta->pos = pos;
713 ta->ino = new_inode->i_ino;
714
715 /* 2. attach source inode to target dd */
716 mutex_lock(&super->s_dirop_mutex);
717 logfs_add_transaction(new_dir, ta);
718 err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
719 if (err) {
720 super->s_rename_dir = 0;
721 super->s_rename_pos = 0;
722 super->s_victim_ino = 0;
723 abort_transaction(new_dir, ta);
724 goto out;
725 }
726
727 /* 3. remove source dd */
728 ta->state = TARGET_RENAME_2;
729 logfs_add_transaction(old_dir, ta);
730 err = logfs_delete_dd(old_dir, pos);
731 if (!err)
732 err = write_inode(old_dir);
733 LOGFS_BUG_ON(err, old_dir->i_sb);
734
735 /* 4. remove target inode */
736 ta->state = TARGET_RENAME_3;
737 logfs_add_transaction(new_inode, ta);
738 err = logfs_remove_inode(new_inode);
739
740out:
741 mutex_unlock(&super->s_dirop_mutex);
742 return err;
743}
744
745static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
746 struct inode *new_dir, struct dentry *new_dentry)
747{
748 if (new_dentry->d_inode)
749 return logfs_rename_target(old_dir, old_dentry,
750 new_dir, new_dentry);
751 return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
752}
753
754/* No locking done here, as this is called before .get_sb() returns. */
755int logfs_replay_journal(struct super_block *sb)
756{
757 struct logfs_super *super = logfs_super(sb);
758 struct inode *inode;
759 u64 ino, pos;
760 int err;
761
762 if (super->s_victim_ino) {
763 /* delete victim inode */
764 ino = super->s_victim_ino;
765 printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
766 inode = logfs_iget(sb, ino);
767 if (IS_ERR(inode))
768 goto fail;
769
770 LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
771 super->s_victim_ino = 0;
772 err = logfs_remove_inode(inode);
773 iput(inode);
774 if (err) {
775 super->s_victim_ino = ino;
776 goto fail;
777 }
778 }
779 if (super->s_rename_dir) {
780 /* delete old dd from rename */
781 ino = super->s_rename_dir;
782 pos = super->s_rename_pos;
783 printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
784 ino, pos);
785 inode = logfs_iget(sb, ino);
786 if (IS_ERR(inode))
787 goto fail;
788
789 super->s_rename_dir = 0;
790 super->s_rename_pos = 0;
791 err = logfs_delete_dd(inode, pos);
792 iput(inode);
793 if (err) {
794 super->s_rename_dir = ino;
795 super->s_rename_pos = pos;
796 goto fail;
797 }
798 }
799 return 0;
800fail:
801 LOGFS_BUG(sb);
802 return -EIO;
803}
804
805const struct inode_operations logfs_symlink_iops = {
806 .readlink = generic_readlink,
807 .follow_link = page_follow_link_light,
808};
809
810const struct inode_operations logfs_dir_iops = {
811 .create = logfs_create,
812 .link = logfs_link,
813 .lookup = logfs_lookup,
814 .mkdir = logfs_mkdir,
815 .mknod = logfs_mknod,
816 .rename = logfs_rename,
817 .rmdir = logfs_rmdir,
818 .permission = logfs_permission,
819 .symlink = logfs_symlink,
820 .unlink = logfs_unlink,
821};
822const struct file_operations logfs_dir_fops = {
823 .fsync = logfs_fsync,
824 .ioctl = logfs_ioctl,
825 .readdir = logfs_readdir,
826 .read = generic_read_dir,
827};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
new file mode 100644
index 000000000000..370f367a933e
--- /dev/null
+++ b/fs/logfs/file.c
@@ -0,0 +1,263 @@
1/*
2 * fs/logfs/file.c - prepare_write, commit_write and friends
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/writeback.h>
11
12static int logfs_write_begin(struct file *file, struct address_space *mapping,
13 loff_t pos, unsigned len, unsigned flags,
14 struct page **pagep, void **fsdata)
15{
16 struct inode *inode = mapping->host;
17 struct page *page;
18 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
19
20 page = grab_cache_page_write_begin(mapping, index, flags);
21 if (!page)
22 return -ENOMEM;
23 *pagep = page;
24
25 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
26 return 0;
27 if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
28 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
29 unsigned end = start + len;
30
31 /* Reading beyond i_size is simple: memset to zero */
32 zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
33 return 0;
34 }
35 return logfs_readpage_nolock(page);
36}
37
38static int logfs_write_end(struct file *file, struct address_space *mapping,
39 loff_t pos, unsigned len, unsigned copied, struct page *page,
40 void *fsdata)
41{
42 struct inode *inode = mapping->host;
43 pgoff_t index = page->index;
44 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
45 unsigned end = start + copied;
46 int ret = 0;
47
48 BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
49 BUG_ON(page->index > I3_BLOCKS);
50
51 if (copied < len) {
52 /*
53 * Short write of a non-initialized page. Just tell userspace
54 * to retry the entire page.
55 */
56 if (!PageUptodate(page)) {
57 copied = 0;
58 goto out;
59 }
60 }
61 if (copied == 0)
62 goto out; /* FIXME: do we need to update inode? */
63
64 if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
65 i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
66 mark_inode_dirty_sync(inode);
67 }
68
69 SetPageUptodate(page);
70 if (!PageDirty(page)) {
71 if (!get_page_reserve(inode, page))
72 __set_page_dirty_nobuffers(page);
73 else
74 ret = logfs_write_buf(inode, page, WF_LOCK);
75 }
76out:
77 unlock_page(page);
78 page_cache_release(page);
79 return ret ? ret : copied;
80}
81
82int logfs_readpage(struct file *file, struct page *page)
83{
84 int ret;
85
86 ret = logfs_readpage_nolock(page);
87 unlock_page(page);
88 return ret;
89}
90
91/* Clear the page's dirty flag in the radix tree. */
92/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
93 * the dirty bit from the radix tree for filesystems that don't have to wait
94 * for page writeback to finish (i.e. any compressing filesystem).
95 */
96static void clear_radix_tree_dirty(struct page *page)
97{
98 BUG_ON(PagePrivate(page) || page->private);
99 set_page_writeback(page);
100 end_page_writeback(page);
101}
102
103static int __logfs_writepage(struct page *page)
104{
105 struct inode *inode = page->mapping->host;
106 int err;
107
108 err = logfs_write_buf(inode, page, WF_LOCK);
109 if (err)
110 set_page_dirty(page);
111 else
112 clear_radix_tree_dirty(page);
113 unlock_page(page);
114 return err;
115}
116
117static int logfs_writepage(struct page *page, struct writeback_control *wbc)
118{
119 struct inode *inode = page->mapping->host;
120 loff_t i_size = i_size_read(inode);
121 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
122 unsigned offset;
123 u64 bix;
124 level_t level;
125
126 log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
127 page);
128
129 logfs_unpack_index(page->index, &bix, &level);
130
131 /* Indirect blocks are never truncated */
132 if (level != 0)
133 return __logfs_writepage(page);
134
135 /*
136 * TODO: everything below is a near-verbatim copy of nobh_writepage().
137 * The relevant bits should be factored out after logfs is merged.
138 */
139
140 /* Is the page fully inside i_size? */
141 if (bix < end_index)
142 return __logfs_writepage(page);
143
144 /* Is the page fully outside i_size? (truncate in progress) */
145 offset = i_size & (PAGE_CACHE_SIZE-1);
146 if (bix > end_index || offset == 0) {
147 unlock_page(page);
148 return 0; /* don't care */
149 }
150
151 /*
152 * The page straddles i_size. It must be zeroed out on each and every
153 * writepage invocation because it may be mmapped. "A file is mapped
154 * in multiples of the page size. For a file that is not a multiple of
155 * the page size, the remaining memory is zeroed when mapped, and
156 * writes to that region are not written out to the file."
157 */
158 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
159 return __logfs_writepage(page);
160}
161
162static void logfs_invalidatepage(struct page *page, unsigned long offset)
163{
164 move_page_to_btree(page);
165 BUG_ON(PagePrivate(page) || page->private);
166}
167
168static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
169{
170 return 0; /* None of these are easy to release */
171}
172
173
174int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
175 unsigned long arg)
176{
177 struct logfs_inode *li = logfs_inode(inode);
178 unsigned int oldflags, flags;
179 int err;
180
181 switch (cmd) {
182 case FS_IOC_GETFLAGS:
183 flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
184 return put_user(flags, (int __user *)arg);
185 case FS_IOC_SETFLAGS:
186 if (IS_RDONLY(inode))
187 return -EROFS;
188
189 if (!is_owner_or_cap(inode))
190 return -EACCES;
191
192 err = get_user(flags, (int __user *)arg);
193 if (err)
194 return err;
195
196 mutex_lock(&inode->i_mutex);
197 oldflags = li->li_flags;
198 flags &= LOGFS_FL_USER_MODIFIABLE;
199 flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
200 li->li_flags = flags;
201 mutex_unlock(&inode->i_mutex);
202
203 inode->i_ctime = CURRENT_TIME;
204 mark_inode_dirty_sync(inode);
205 return 0;
206
207 default:
208 return -ENOTTY;
209 }
210}
211
212int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
213{
214 struct super_block *sb = dentry->d_inode->i_sb;
215 struct logfs_super *super = logfs_super(sb);
216
217 /* FIXME: write anchor */
218 super->s_devops->sync(sb);
219 return 0;
220}
221
222static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
223{
224 struct inode *inode = dentry->d_inode;
225 int err = 0;
226
227 if (attr->ia_valid & ATTR_SIZE)
228 err = logfs_truncate(inode, attr->ia_size);
229 attr->ia_valid &= ~ATTR_SIZE;
230
231 if (!err)
232 err = inode_change_ok(inode, attr);
233 if (!err)
234 err = inode_setattr(inode, attr);
235 return err;
236}
237
238const struct inode_operations logfs_reg_iops = {
239 .setattr = logfs_setattr,
240};
241
242const struct file_operations logfs_reg_fops = {
243 .aio_read = generic_file_aio_read,
244 .aio_write = generic_file_aio_write,
245 .fsync = logfs_fsync,
246 .ioctl = logfs_ioctl,
247 .llseek = generic_file_llseek,
248 .mmap = generic_file_readonly_mmap,
249 .open = generic_file_open,
250 .read = do_sync_read,
251 .write = do_sync_write,
252};
253
254const struct address_space_operations logfs_reg_aops = {
255 .invalidatepage = logfs_invalidatepage,
256 .readpage = logfs_readpage,
257 .releasepage = logfs_releasepage,
258 .set_page_dirty = __set_page_dirty_nobuffers,
259 .writepage = logfs_writepage,
260 .writepages = generic_writepages,
261 .write_begin = logfs_write_begin,
262 .write_end = logfs_write_end,
263};
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 000000000000..84e36f52fe95
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,731 @@
1/*
2 * fs/logfs/gc.c - garbage collection code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/slab.h>
11
12/*
13 * Wear leveling needs to kick in when the difference between low erase
14 * counts and high erase counts gets too big. A good value for "too big"
15 * may be somewhat below 10% of maximum erase count for the device.
16 * Why not 397, to pick a nice round number with no specific meaning? :)
17 *
18 * WL_RATELIMIT is the minimum time between two wear level events. A huge
19 * number of segments may fulfil the requirements for wear leveling at the
20 * same time. If that happens we don't want to cause a latency from hell,
21 * but just gently pick one segment every so often and minimize overhead.
22 */
23#define WL_DELTA 397
24#define WL_RATELIMIT 100
25#define MAX_OBJ_ALIASES 2600
26#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */
27#define LIST_SIZE 64 /* base size of candidate lists */
28#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */
29#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
30
31static int no_free_segments(struct super_block *sb)
32{
33 struct logfs_super *super = logfs_super(sb);
34
35 return super->s_free_list.count;
36}
37
38/* journal has distance -1, top-most ifile layer distance 0 */
39static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
40{
41 struct logfs_super *super = logfs_super(sb);
42 u8 gc_level = (__force u8)__gc_level;
43
44 switch (gc_level) {
45 case 0: /* fall through */
46 case 1: /* fall through */
47 case 2: /* fall through */
48 case 3:
49 /* file data or indirect blocks */
50 return super->s_ifile_levels + super->s_iblock_levels - gc_level;
51 case 6: /* fall through */
52 case 7: /* fall through */
53 case 8: /* fall through */
54 case 9:
55 /* inode file data or indirect blocks */
56 return super->s_ifile_levels - (gc_level - 6);
57 default:
58 printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
59 gc_level);
60 WARN_ON(1);
61 return super->s_ifile_levels + super->s_iblock_levels;
62 }
63}
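A worked example of the distance mapping, using hypothetical level counts s_ifile_levels = 3 and s_iblock_levels = 4: file data (gc_level 0) gets distance 3 + 4 - 0 = 7, its top indirect level (gc_level 3) gets 3 + 4 - 3 = 4, inode-file data (gc_level 6) gets 3 - (6 - 6) = 3, and the top inode-file level (gc_level 9) gets 3 - (9 - 6) = 0, matching the "top-most ifile layer distance 0" rule in the comment above.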
64
65static int segment_is_reserved(struct super_block *sb, u32 segno)
66{
67 struct logfs_super *super = logfs_super(sb);
68 struct logfs_area *area;
69 void *reserved;
70 int i;
71
72 /* Some segments are reserved. Just pretend they were all valid */
73 reserved = btree_lookup32(&super->s_reserved_segments, segno);
74 if (reserved)
75 return 1;
76
77 /* Currently open segments */
78 for_each_area(i) {
79 area = super->s_area[i];
80 if (area->a_is_open && area->a_segno == segno)
81 return 1;
82 }
83
84 return 0;
85}
86
87static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
88{
89 BUG();
90}
91
92/*
93 * Returns the bytes consumed by valid objects in this segment. Object headers
94 * are counted, the segment header is not.
95 */
96static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
97 gc_level_t *gc_level)
98{
99 struct logfs_segment_entry se;
100 u32 ec_level;
101
102 logfs_get_segment_entry(sb, segno, &se);
103 if (se.ec_level == cpu_to_be32(BADSEG) ||
104 se.valid == cpu_to_be32(RESERVED))
105 return RESERVED;
106
107 ec_level = be32_to_cpu(se.ec_level);
108 *ec = ec_level >> 4;
109 *gc_level = GC_LEVEL(ec_level & 0xf);
110 return be32_to_cpu(se.valid);
111}
112
113static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
114 u64 bix, gc_level_t gc_level)
115{
116 struct inode *inode;
117 int err, cookie;
118
119 inode = logfs_safe_iget(sb, ino, &cookie);
120 err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
121 BUG_ON(err);
122 logfs_safe_iput(inode, cookie);
123}
124
125static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
126{
127 struct logfs_super *super = logfs_super(sb);
128 struct logfs_segment_header sh;
129 struct logfs_object_header oh;
130 u64 ofs, ino, bix;
131 u32 seg_ofs, logical_segno, cleaned = 0;
132 int err, len, valid;
133 gc_level_t gc_level;
134
135 LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
136
137 btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
138 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
139 BUG_ON(err);
140 gc_level = GC_LEVEL(sh.level);
141 logical_segno = be32_to_cpu(sh.segno);
142 if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
143 logfs_mark_segment_bad(sb, segno);
144 cleaned = -1;
145 goto out;
146 }
147
148 for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
149 seg_ofs + sizeof(oh) < super->s_segsize; ) {
150 ofs = dev_ofs(sb, logical_segno, seg_ofs);
151 err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
152 &oh);
153 BUG_ON(err);
154
155 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
156 break;
157
158 if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
159 logfs_mark_segment_bad(sb, segno);
160 cleaned = super->s_segsize - 1;
161 goto out;
162 }
163
164 ino = be64_to_cpu(oh.ino);
165 bix = be64_to_cpu(oh.bix);
166 len = sizeof(oh) + be16_to_cpu(oh.len);
167 valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
168 if (valid == 1) {
169 logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
170 cleaned += len;
171 } else if (valid == 2) {
172 /* Will be invalid upon journal commit */
173 cleaned += len;
174 }
175 seg_ofs += len;
176 }
177out:
178 btree_remove32(&super->s_reserved_segments, segno);
179 return cleaned;
180}
181
182static struct gc_candidate *add_list(struct gc_candidate *cand,
183 struct candidate_list *list)
184{
185 struct rb_node **p = &list->rb_tree.rb_node;
186 struct rb_node *parent = NULL;
187 struct gc_candidate *cur;
188 int comp;
189
190 cand->list = list;
191 while (*p) {
192 parent = *p;
193 cur = rb_entry(parent, struct gc_candidate, rb_node);
194
195 if (list->sort_by_ec)
196 comp = cand->erase_count < cur->erase_count;
197 else
198 comp = cand->valid < cur->valid;
199
200 if (comp)
201 p = &parent->rb_left;
202 else
203 p = &parent->rb_right;
204 }
205 rb_link_node(&cand->rb_node, parent, p);
206 rb_insert_color(&cand->rb_node, &list->rb_tree);
207
208 if (list->count <= list->maxcount) {
209 list->count++;
210 return NULL;
211 }
212 cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
213 rb_erase(&cand->rb_node, &list->rb_tree);
214 cand->list = NULL;
215 return cand;
216}
217
218static void remove_from_list(struct gc_candidate *cand)
219{
220 struct candidate_list *list = cand->list;
221
222 rb_erase(&cand->rb_node, &list->rb_tree);
223 list->count--;
224}
225
226static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
227{
228 struct logfs_super *super = logfs_super(sb);
229
230 btree_remove32(&super->s_cand_tree, cand->segno);
231 kfree(cand);
232}
233
234u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
235{
236 struct gc_candidate *cand;
237 u32 segno;
238
239 BUG_ON(list->count == 0);
240
241 cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
242 remove_from_list(cand);
243 segno = cand->segno;
244 if (ec)
245 *ec = cand->erase_count;
246 free_candidate(sb, cand);
247 return segno;
248}
249
250/*
251 * We have several lists to manage segments with. The reserve_list is used to
252 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
253 * list.
254 * The free_list contains free segments for normal usage. It usually gets the
255 * second pick after the reserve_list. But when the free_list is running short
256 * it is more important to keep the free_list full than to keep a reserve.
257 *
258 * Segments that are not free are put onto a per-level low_list. If we have
259 * to run garbage collection, we pick a candidate from there. All segments on
260 * those lists should have at least some free space so GC will make progress.
261 *
262 * And last we have the ec_list, which is used to pick segments for wear
263 * leveling.
264 *
265 * If all appropriate lists are full, we simply free the candidate and forget
266 * about that segment for a while. We have better candidates for each purpose.
267 */
268static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
269{
270 struct logfs_super *super = logfs_super(sb);
271 u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
272
273 if (cand->valid == 0) {
274 /* 100% free segments */
275 log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
276 cand->segno, cand->erase_count,
277 dev_ofs(sb, cand->segno, 0));
278 cand = add_list(cand, &super->s_reserve_list);
279 if (cand) {
280 log_gc_noisy("add free segment %x (ec %x) at %llx\n",
281 cand->segno, cand->erase_count,
282 dev_ofs(sb, cand->segno, 0));
283 cand = add_list(cand, &super->s_free_list);
284 }
285 } else {
286 /* good candidates for Garbage Collection */
287 if (cand->valid < full)
288 cand = add_list(cand, &super->s_low_list[cand->dist]);
289 /* good candidates for wear leveling,
290 * segments that were recently written get ignored */
291 if (cand)
292 cand = add_list(cand, &super->s_ec_list);
293 }
294 if (cand)
295 free_candidate(sb, cand);
296}
297
298static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
299 u8 dist)
300{
301 struct logfs_super *super = logfs_super(sb);
302 struct gc_candidate *cand;
303
304 cand = kmalloc(sizeof(*cand), GFP_NOFS);
305 if (!cand)
306 return -ENOMEM;
307
308 cand->segno = segno;
309 cand->valid = valid;
310 cand->erase_count = ec;
311 cand->dist = dist;
312
313 btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
314 __add_candidate(sb, cand);
315 return 0;
316}
317
318static void remove_segment_from_lists(struct super_block *sb, u32 segno)
319{
320 struct logfs_super *super = logfs_super(sb);
321 struct gc_candidate *cand;
322
323 cand = btree_lookup32(&super->s_cand_tree, segno);
324 if (cand) {
325 remove_from_list(cand);
326 free_candidate(sb, cand);
327 }
328}
329
330static void scan_segment(struct super_block *sb, u32 segno)
331{
332 u32 valid, ec = 0;
333 gc_level_t gc_level = 0;
334 u8 dist;
335
336 if (segment_is_reserved(sb, segno))
337 return;
338
339 remove_segment_from_lists(sb, segno);
340 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
341 if (valid == RESERVED)
342 return;
343
344 dist = root_distance(sb, gc_level);
345 add_candidate(sb, segno, valid, ec, dist);
346}
347
348static struct gc_candidate *first_in_list(struct candidate_list *list)
349{
350 if (list->count == 0)
351 return NULL;
352 return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
353}
354
355/*
356 * Find the best segment for garbage collection. Main criterion is
357 * the segment requiring the least effort to clean. Secondary
358 * criterion is to GC on the lowest level available.
359 *
360 * So we search the least effort segment on the lowest level first,
361 * then move up and pick another segment iff it requires significantly
362 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
363 */
364static struct gc_candidate *get_candidate(struct super_block *sb)
365{
366 struct logfs_super *super = logfs_super(sb);
367 int i, max_dist;
368 struct gc_candidate *cand = NULL, *this;
369
370 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
371
372 for (i = max_dist; i >= 0; i--) {
373 this = first_in_list(&super->s_low_list[i]);
374 if (!this)
375 continue;
376 if (!cand)
377 cand = this;
378 if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
379 cand = this;
380 }
381 return cand;
382}
383
384static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
385{
386 struct logfs_super *super = logfs_super(sb);
387 gc_level_t gc_level;
388 u32 cleaned, valid, segno, ec;
389 u8 dist;
390
391 if (!cand) {
392 log_gc("GC attempted, but no candidate found\n");
393 return 0;
394 }
395
396 segno = cand->segno;
397 dist = cand->dist;
398 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
399 free_candidate(sb, cand);
400 log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
401 segno, (u64)segno << super->s_segshift,
402 dist, no_free_segments(sb), valid,
403 super->s_free_bytes);
404 cleaned = logfs_gc_segment(sb, segno, dist);
405 log_gc("GC segment #%02x complete - now %x valid\n", segno,
406 valid - cleaned);
407 BUG_ON(cleaned != valid);
408 return 1;
409}
410
411static int logfs_gc_once(struct super_block *sb)
412{
413 struct gc_candidate *cand;
414
415 cand = get_candidate(sb);
416 if (cand)
417 remove_from_list(cand);
418 return __logfs_gc_once(sb, cand);
419}
420
421/* returns 1 if a wrap occurs, 0 otherwise */
422static int logfs_scan_some(struct super_block *sb)
423{
424 struct logfs_super *super = logfs_super(sb);
425 u32 segno;
426 int i, ret = 0;
427
428 segno = super->s_sweeper;
429 for (i = SCAN_RATIO; i > 0; i--) {
430 segno++;
431 if (segno >= super->s_no_segs) {
432 segno = 0;
433 ret = 1;
434 /* Break out of the loop. We want to read a single
435 * block from the segment size on next invocation if
436 * SCAN_RATIO is set to match block size
437 */
438 break;
439 }
440
441 scan_segment(sb, segno);
442 }
443 super->s_sweeper = segno;
444 return ret;
445}
446
447/*
448 * In principle, this function should loop forever, looking for GC candidates
449 * and moving data. LogFS is designed in such a way that this loop is
450 * guaranteed to terminate.
451 *
452 * Limiting the loop to some iterations serves purely to catch cases when
453 * these guarantees have failed. An actual endless loop is an obvious bug
454 * and should be reported as such.
455 */
456static void __logfs_gc_pass(struct super_block *sb, int target)
457{
458 struct logfs_super *super = logfs_super(sb);
459 struct logfs_block *block;
460 int round, progress, last_progress = 0;
461
462 if (no_free_segments(sb) >= target &&
463 super->s_no_object_aliases < MAX_OBJ_ALIASES)
464 return;
465
466 log_gc("__logfs_gc_pass(%x)\n", target);
467 for (round = 0; round < SCAN_ROUNDS; ) {
468 if (no_free_segments(sb) >= target)
469 goto write_alias;
470
471 /* Sync in-memory state with on-medium state in case they
472 * diverged */
473 logfs_write_anchor(sb);
474 round += logfs_scan_some(sb);
475 if (no_free_segments(sb) >= target)
476 goto write_alias;
477 progress = logfs_gc_once(sb);
478 if (progress)
479 last_progress = round;
480 else if (round - last_progress > 2)
481 break;
482 continue;
483
484 /*
485 * The goto logic is nasty, I just don't know a better way to
486 * code it. GC is supposed to ensure two things:
487 * 1. Enough free segments are available.
488 * 2. The number of aliases is bounded.
489 * When 1. is achieved, we take a look at 2. and write back
490 * some alias-containing blocks, if necessary. However, after
491 * each such write we need to go back to 1., as writes can
492 * consume free segments.
493 */
494write_alias:
495 if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
496 return;
497 if (list_empty(&super->s_object_alias)) {
498 /* All aliases are still in btree */
499 return;
500 }
501 log_gc("Write back one alias\n");
502 block = list_entry(super->s_object_alias.next,
503 struct logfs_block, alias_list);
504 block->ops->write_block(block);
505 /*
506 * To round off the nasty goto logic, we reset round here. It
507 * is a safety-net for GC not making any progress and limited
508 * to something reasonably small. If we incremented it for every
509 * single alias, the loop could terminate rather quickly.
510 */
511 round = 0;
512 }
513 LOGFS_BUG(sb);
514}
515
516static int wl_ratelimit(struct super_block *sb, u64 *next_event)
517{
518 struct logfs_super *super = logfs_super(sb);
519
520 if (*next_event < super->s_gec) {
521 *next_event = super->s_gec + WL_RATELIMIT;
522 return 0;
523 }
524 return 1;
525}
526
527static void logfs_wl_pass(struct super_block *sb)
528{
529 struct logfs_super *super = logfs_super(sb);
530 struct gc_candidate *wl_cand, *free_cand;
531
532 if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
533 return;
534
535 wl_cand = first_in_list(&super->s_ec_list);
536 if (!wl_cand)
537 return;
538 free_cand = first_in_list(&super->s_free_list);
539 if (!free_cand)
540 return;
541
542 if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
543 remove_from_list(wl_cand);
544 __logfs_gc_once(sb, wl_cand);
545 }
546}
547
548/*
549 * The journal needs wear leveling as well. But moving the journal is an
550 * expensive operation so we try to avoid it as much as possible. And if we
551 * have to do it, we move the whole journal, not individual segments.
552 *
553 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
554 * calculations. First we check whether moving the journal would be a
555 * significant improvement. That means that a) the current journal segments
556 * have more wear than the future journal segments and b) the current journal
557 * segments have more wear than normal ostore segments.
558 * Rationale for b) is that we don't have to move the journal if it is aging
559 * less than the ostore, even if the reserve segments age even less (they are
560 * excluded from wear leveling, after all).
561 * Next we check that the superblocks have less wear than the journal. Since
562 * moving the journal requires writing the superblocks, we have to protect the
563 * superblocks even more than the journal.
564 *
565 * Also we double the acceptable wear difference, compared to ostore wear
566 * leveling. Journal data is read and rewritten rapidly, comparatively. So
567 * soft errors have much less time to accumulate and we allow the journal to
568 * be a bit worse than the ostore.
569 */
570static void logfs_journal_wl_pass(struct super_block *sb)
571{
572 struct logfs_super *super = logfs_super(sb);
573 struct gc_candidate *cand;
574 u32 min_journal_ec = -1, max_reserve_ec = 0;
575 int i;
576
577 if (wl_ratelimit(sb, &super->s_wl_gec_journal))
578 return;
579
580 if (super->s_reserve_list.count < super->s_no_journal_segs) {
581 /* Reserve is not full enough to move complete journal */
582 return;
583 }
584
585 journal_for_each(i)
586 if (super->s_journal_seg[i])
587 min_journal_ec = min(min_journal_ec,
588 super->s_journal_ec[i]);
589 cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
590 struct gc_candidate, rb_node);
591 max_reserve_ec = cand->erase_count;
592 for (i = 0; i < 2; i++) {
593 struct logfs_segment_entry se;
594 u32 segno = seg_no(sb, super->s_sb_ofs[i]);
595 u32 ec;
596
597 logfs_get_segment_entry(sb, segno, &se);
598 ec = be32_to_cpu(se.ec_level) >> 4;
599 max_reserve_ec = max(max_reserve_ec, ec);
600 }
601
602 if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
603 do_logfs_journal_wl_pass(sb);
604 }
605}
606
607void logfs_gc_pass(struct super_block *sb)
608{
609 struct logfs_super *super = logfs_super(sb);
610
611 //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
612 /* Write journal before free space is getting saturated with dirty
613 * objects.
614 */
615 if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
616 + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
617 logfs_write_anchor(sb);
618 __logfs_gc_pass(sb, super->s_total_levels);
619 logfs_wl_pass(sb);
620 logfs_journal_wl_pass(sb);
621}
622
623static int check_area(struct super_block *sb, int i)
624{
625 struct logfs_super *super = logfs_super(sb);
626 struct logfs_area *area = super->s_area[i];
627 struct logfs_object_header oh;
628 u32 segno = area->a_segno;
629 u32 ofs = area->a_used_bytes;
630 __be32 crc;
631 int err;
632
633 if (!area->a_is_open)
634 return 0;
635
636 for (ofs = area->a_used_bytes;
637 ofs <= super->s_segsize - sizeof(oh);
638 ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
639 err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
640 if (err)
641 return err;
642
643 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
644 break;
645
646 crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
647 if (crc != oh.crc) {
648 printk(KERN_INFO "interrupted header at %llx\n",
649 dev_ofs(sb, segno, ofs));
650 return 0;
651 }
652 }
653 if (ofs != area->a_used_bytes) {
654 printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
655 ofs - area->a_used_bytes,
656 dev_ofs(sb, segno, area->a_used_bytes));
657 area->a_used_bytes = ofs;
658 }
659 return 0;
660}
661
662int logfs_check_areas(struct super_block *sb)
663{
664 int i, err;
665
666 for_each_area(i) {
667 err = check_area(sb, i);
668 if (err)
669 return err;
670 }
671 return 0;
672}
673
674static void logfs_init_candlist(struct candidate_list *list, int maxcount,
675 int sort_by_ec)
676{
677 list->count = 0;
678 list->maxcount = maxcount;
679 list->sort_by_ec = sort_by_ec;
680 list->rb_tree = RB_ROOT;
681}
682
683int logfs_init_gc(struct super_block *sb)
684{
685 struct logfs_super *super = logfs_super(sb);
686 int i;
687
688 btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
689 logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
690 logfs_init_candlist(&super->s_reserve_list,
691 super->s_bad_seg_reserve, 1);
692 for_each_area(i)
693 logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
694 logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
695 return 0;
696}
697
698static void logfs_cleanup_list(struct super_block *sb,
699 struct candidate_list *list)
700{
701 struct gc_candidate *cand;
702
703 while (list->count) {
704 cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
705 rb_node);
706 remove_from_list(cand);
707 free_candidate(sb, cand);
708 }
709 BUG_ON(list->rb_tree.rb_node);
710}
711
712void logfs_cleanup_gc(struct super_block *sb)
713{
714 struct logfs_super *super = logfs_super(sb);
715 int i;
716
717 if (!super->s_free_list.count)
718 return;
719
720 /*
721 * FIXME: The btree may still contain a single empty node. So we
722 * call the grim visitor to clean up that mess. Btree code should
723 * do it for us, really.
724 */
725 btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
726 logfs_cleanup_list(sb, &super->s_free_list);
727 logfs_cleanup_list(sb, &super->s_reserve_list);
728 for_each_area(i)
729 logfs_cleanup_list(sb, &super->s_low_list[i]);
730 logfs_cleanup_list(sb, &super->s_ec_list);
731}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
new file mode 100644
index 000000000000..14ed27274da2
--- /dev/null
+++ b/fs/logfs/inode.c
@@ -0,0 +1,418 @@
1/*
2 * fs/logfs/inode.c - inode handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10#include <linux/writeback.h>
11#include <linux/backing-dev.h>
12
13/*
14 * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
15 * on the medium. It therefore also lacks a method to store the previous
16 * generation number for deleted inodes. Instead a single generation number
17 * is stored which will be used for new inodes. Being just a 32bit counter,
18 * this can obviously wrap relatively quickly. So we only reuse inodes if we
19 * know that a fair number of inodes can be created before we have to increment
20 * the generation again - effectively adding some bits to the counter.
21 * But being too aggressive here means we keep a very large and very sparse
22 * inode file, wasting space on indirect blocks.
23 * So what is a good value? Beats me. 64k seems moderately bad on both
24 * fronts, so let's use that for now...
25 *
26 * NFS sucks, as everyone already knows.
27 */
28#define INOS_PER_WRAP (0x10000)
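A rough worked number for the trade-off above, under this reading of the scheme: INOS_PER_WRAP = 0x10000 = 2^16, so handing out 64k fresh inode numbers per generation increment effectively widens the 32-bit generation counter by about 16 bits — on the order of 2^48 create/delete cycles before an (ino, generation) pair can repeat.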
29
30/*
31 * Logfs' requirement to read inodes for garbage collection makes life a bit
32 * harder. GC may have to read inodes that are in I_FREEING state, when they
33 * are being written out - and waiting for GC to make progress, naturally.
34 *
35 * So we cannot just call iget() or some variant of it, but first have to check
36 * whether the inode in question might be in I_FREEING state. Therefore we
37 * maintain our own per-sb list of "almost deleted" inodes and check against
38 * that list first. Normally this should be at most 1-2 entries long.
39 *
40 * Also, inodes have logfs-specific reference counting on top of what the vfs
41 * does. When .destroy_inode is called, normally the reference count will drop
42 * to zero and the inode gets deleted. But if GC accessed the inode, its
43 * refcount will remain nonzero and final deletion will have to wait.
44 *
45 * As a result we have two sets of functions to get/put inodes:
46 * logfs_safe_iget/logfs_safe_iput - safe to call from GC context
47 * logfs_iget/iput - normal version
48 */
49static struct kmem_cache *logfs_inode_cache;
50
51static DEFINE_SPINLOCK(logfs_inode_lock);
52
53static void logfs_inode_setops(struct inode *inode)
54{
55 switch (inode->i_mode & S_IFMT) {
56 case S_IFDIR:
57 inode->i_op = &logfs_dir_iops;
58 inode->i_fop = &logfs_dir_fops;
59 inode->i_mapping->a_ops = &logfs_reg_aops;
60 break;
61 case S_IFREG:
62 inode->i_op = &logfs_reg_iops;
63 inode->i_fop = &logfs_reg_fops;
64 inode->i_mapping->a_ops = &logfs_reg_aops;
65 break;
66 case S_IFLNK:
67 inode->i_op = &logfs_symlink_iops;
68 inode->i_mapping->a_ops = &logfs_reg_aops;
69 break;
70 case S_IFSOCK: /* fall through */
71 case S_IFBLK: /* fall through */
72 case S_IFCHR: /* fall through */
73 case S_IFIFO:
74 init_special_inode(inode, inode->i_mode, inode->i_rdev);
75 break;
76 default:
77 BUG();
78 }
79}
80
81static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
82{
83 struct inode *inode = iget_locked(sb, ino);
84 int err;
85
86 if (!inode)
87 return ERR_PTR(-ENOMEM);
88 if (!(inode->i_state & I_NEW))
89 return inode;
90
91 err = logfs_read_inode(inode);
92 if (err || inode->i_nlink == 0) {
93 /* inode->i_nlink == 0 can be true when called from
94 * block validator */
95 /* set i_nlink to 0 to prevent caching */
96 inode->i_nlink = 0;
97 logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
98 iget_failed(inode);
99 if (!err)
100 err = -ENOENT;
101 return ERR_PTR(err);
102 }
103
104 logfs_inode_setops(inode);
105 unlock_new_inode(inode);
106 return inode;
107}
108
109struct inode *logfs_iget(struct super_block *sb, ino_t ino)
110{
111 BUG_ON(ino == LOGFS_INO_MASTER);
112 BUG_ON(ino == LOGFS_INO_SEGFILE);
113 return __logfs_iget(sb, ino);
114}
115
116/*
117 * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
118 * this allows logfs_iput to do the right thing later
119 */
120struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
121{
122 struct logfs_super *super = logfs_super(sb);
123 struct logfs_inode *li;
124
125 if (ino == LOGFS_INO_MASTER)
126 return super->s_master_inode;
127 if (ino == LOGFS_INO_SEGFILE)
128 return super->s_segfile_inode;
129
130 spin_lock(&logfs_inode_lock);
131 list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
132 if (li->vfs_inode.i_ino == ino) {
133 li->li_refcount++;
134 spin_unlock(&logfs_inode_lock);
135 *is_cached = 1;
136 return &li->vfs_inode;
137 }
138 spin_unlock(&logfs_inode_lock);
139
140 *is_cached = 0;
141 return __logfs_iget(sb, ino);
142}
143
144static void __logfs_destroy_inode(struct inode *inode)
145{
146 struct logfs_inode *li = logfs_inode(inode);
147
148 BUG_ON(li->li_block);
149 list_del(&li->li_freeing_list);
150 kmem_cache_free(logfs_inode_cache, li);
151}
152
153static void logfs_destroy_inode(struct inode *inode)
154{
155 struct logfs_inode *li = logfs_inode(inode);
156
157 BUG_ON(list_empty(&li->li_freeing_list));
158 spin_lock(&logfs_inode_lock);
159 li->li_refcount--;
160 if (li->li_refcount == 0)
161 __logfs_destroy_inode(inode);
162 spin_unlock(&logfs_inode_lock);
163}
164
165void logfs_safe_iput(struct inode *inode, int is_cached)
166{
167 if (inode->i_ino == LOGFS_INO_MASTER)
168 return;
169 if (inode->i_ino == LOGFS_INO_SEGFILE)
170 return;
171
172 if (is_cached) {
173 logfs_destroy_inode(inode);
174 return;
175 }
176
177 iput(inode);
178}
179
180static void logfs_init_inode(struct super_block *sb, struct inode *inode)
181{
182 struct logfs_inode *li = logfs_inode(inode);
183 int i;
184
185 li->li_flags = 0;
186 li->li_height = 0;
187 li->li_used_bytes = 0;
188 li->li_block = NULL;
189 inode->i_uid = 0;
190 inode->i_gid = 0;
191 inode->i_size = 0;
192 inode->i_blocks = 0;
193 inode->i_ctime = CURRENT_TIME;
194 inode->i_mtime = CURRENT_TIME;
195 inode->i_nlink = 1;
196 INIT_LIST_HEAD(&li->li_freeing_list);
197
198 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
199 li->li_data[i] = 0;
200
201 return;
202}
203
204static struct inode *logfs_alloc_inode(struct super_block *sb)
205{
206 struct logfs_inode *li;
207
208 li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
209 if (!li)
210 return NULL;
211 logfs_init_inode(sb, &li->vfs_inode);
212 return &li->vfs_inode;
213}
214
215/*
216 * In logfs inodes are written to an inode file. The inode file, like any
217 * other file, is managed with an inode. The inode file's inode, aka master
218 * inode, requires special handling in several respects. First, it cannot be
219 * written to the inode file, so it is stored in the journal instead.
220 *
221 * Secondly, this inode cannot be written back and destroyed before all other
222 * inodes have been written. The ordering is important. Linux' VFS is happily
223 * unaware of the ordering constraint and would ordinarily destroy the master
224 * inode at umount time while other inodes are still in use and dirty. Not
225 * good.
226 *
227 * So logfs makes sure the master inode is not written until all other inodes
228 * have been destroyed. Sadly, this method has another side-effect. The VFS
229 * will notice one remaining inode and print a frightening warning message.
230 * Worse, it is impossible to judge whether such a warning was caused by the
231 * master inode or whether other inodes have leaked as well.
232 *
233 * Our attempt at solving this is with logfs_new_meta_inode() below. Its
234 * purpose is to create a new inode that will not trigger the warning if such
235 * an inode is still in use. An ugly hack, no doubt. Suggestions for
236 * improvement are welcome.
237 */
238struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
239{
240 struct inode *inode;
241
242 inode = logfs_alloc_inode(sb);
243 if (!inode)
244 return ERR_PTR(-ENOMEM);
245
246 inode->i_mode = S_IFREG;
247 inode->i_ino = ino;
248 inode->i_sb = sb;
249
250 /* This is a blatant copy of alloc_inode code. We'd need alloc_inode
251 * to be nonstatic, alas. */
252 {
253 struct address_space * const mapping = &inode->i_data;
254
255 mapping->a_ops = &logfs_reg_aops;
256 mapping->host = inode;
257 mapping->flags = 0;
258 mapping_set_gfp_mask(mapping, GFP_NOFS);
259 mapping->assoc_mapping = NULL;
260 mapping->backing_dev_info = &default_backing_dev_info;
261 inode->i_mapping = mapping;
262 inode->i_nlink = 1;
263 }
264
265 return inode;
266}
267
268struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
269{
270 struct inode *inode;
271 int err;
272
273 inode = logfs_new_meta_inode(sb, ino);
274 if (IS_ERR(inode))
275 return inode;
276
277 err = logfs_read_inode(inode);
278 if (err) {
279 destroy_meta_inode(inode);
280 return ERR_PTR(err);
281 }
282 logfs_inode_setops(inode);
283 return inode;
284}
285
286static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
287{
288 int ret;
289 long flags = WF_LOCK;
290
291 /* Can only happen if creat() failed. Safe to skip. */
292 if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
293 return 0;
294
295 ret = __logfs_write_inode(inode, flags);
296 LOGFS_BUG_ON(ret, inode->i_sb);
297 return ret;
298}
299
300void destroy_meta_inode(struct inode *inode)
301{
302 if (inode) {
303 if (inode->i_data.nrpages)
304 truncate_inode_pages(&inode->i_data, 0);
305 logfs_clear_inode(inode);
306 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
307 }
308}
309
310/* called with inode_lock held */
311static void logfs_drop_inode(struct inode *inode)
312{
313 struct logfs_super *super = logfs_super(inode->i_sb);
314 struct logfs_inode *li = logfs_inode(inode);
315
316 spin_lock(&logfs_inode_lock);
317 list_move(&li->li_freeing_list, &super->s_freeing_list);
318 spin_unlock(&logfs_inode_lock);
319 generic_drop_inode(inode);
320}
321
322static void logfs_set_ino_generation(struct super_block *sb,
323 struct inode *inode)
324{
325 struct logfs_super *super = logfs_super(sb);
326 u64 ino;
327
328 mutex_lock(&super->s_journal_mutex);
329 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
330 super->s_last_ino = ino;
331 super->s_inos_till_wrap--;
332 if (super->s_inos_till_wrap < 0) {
333 super->s_last_ino = LOGFS_RESERVED_INOS;
334 super->s_generation++;
335 super->s_inos_till_wrap = INOS_PER_WRAP;
336 }
337 inode->i_ino = ino;
338 inode->i_generation = super->s_generation;
339 mutex_unlock(&super->s_journal_mutex);
340}
341
342struct inode *logfs_new_inode(struct inode *dir, int mode)
343{
344 struct super_block *sb = dir->i_sb;
345 struct inode *inode;
346
347 inode = new_inode(sb);
348 if (!inode)
349 return ERR_PTR(-ENOMEM);
350
351 logfs_init_inode(sb, inode);
352
353 /* inherit parent flags */
354 logfs_inode(inode)->li_flags |=
355 logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
356
357 inode->i_mode = mode;
358 logfs_set_ino_generation(sb, inode);
359
360 inode->i_uid = current_fsuid();
361 inode->i_gid = current_fsgid();
362 if (dir->i_mode & S_ISGID) {
363 inode->i_gid = dir->i_gid;
364 if (S_ISDIR(mode))
365 inode->i_mode |= S_ISGID;
366 }
367
368 logfs_inode_setops(inode);
369 insert_inode_hash(inode);
370
371 return inode;
372}
373
374static void logfs_init_once(void *_li)
375{
376 struct logfs_inode *li = _li;
377 int i;
378
379 li->li_flags = 0;
380 li->li_used_bytes = 0;
381 li->li_refcount = 1;
382 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
383 li->li_data[i] = 0;
384 inode_init_once(&li->vfs_inode);
385}
386
387static int logfs_sync_fs(struct super_block *sb, int wait)
388{
389 /* FIXME: write anchor */
390 logfs_super(sb)->s_devops->sync(sb);
391 return 0;
392}
393
394const struct super_operations logfs_super_operations = {
395 .alloc_inode = logfs_alloc_inode,
396 .clear_inode = logfs_clear_inode,
397 .delete_inode = logfs_delete_inode,
398 .destroy_inode = logfs_destroy_inode,
399 .drop_inode = logfs_drop_inode,
400 .write_inode = logfs_write_inode,
401 .statfs = logfs_statfs,
402 .sync_fs = logfs_sync_fs,
403};
404
405int logfs_init_inode_cache(void)
406{
407 logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
408 sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
409 logfs_init_once);
410 if (!logfs_inode_cache)
411 return -ENOMEM;
412 return 0;
413}
414
415void logfs_destroy_inode_cache(void)
416{
417 kmem_cache_destroy(logfs_inode_cache);
418}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
new file mode 100644
index 000000000000..33bd260b8309
--- /dev/null
+++ b/fs/logfs/journal.c
@@ -0,0 +1,891 @@
1/*
2 * fs/logfs/journal.c - journal handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10
11static void logfs_calc_free(struct super_block *sb)
12{
13 struct logfs_super *super = logfs_super(sb);
14 u64 reserve, no_segs = super->s_no_segs;
15 s64 free;
16 int i;
17
18 /* superblock segments */
19 no_segs -= 2;
20 super->s_no_journal_segs = 0;
21 /* journal */
22 journal_for_each(i)
23 if (super->s_journal_seg[i]) {
24 no_segs--;
25 super->s_no_journal_segs++;
26 }
27
28 /* open segments plus one extra per level for GC */
29 no_segs -= 2 * super->s_total_levels;
30
31 free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
32 free -= super->s_used_bytes;
33 /* just a bit extra */
34 free -= super->s_total_levels * 4096;
35
36 /* Bad blocks are 'paid' for with speed reserve - the filesystem
37 * simply gets slower as bad blocks accumulate. Until the bad blocks
38 * exceed the speed reserve - then the filesystem gets smaller.
39 */
40 reserve = super->s_bad_segments + super->s_bad_seg_reserve;
41 reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
42 reserve = max(reserve, super->s_speed_reserve);
43 free -= reserve;
44 if (free < 0)
45 free = 0;
46
47 super->s_free_bytes = free;
48}
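
/*
 * Worked example with made-up numbers: 1024 segments, 4 journal segments
 * and 6 total levels leave 1024 - 2 - 4 - 2*6 = 1006 segments of payload;
 * used bytes, the per-level slack and the larger of the bad-segment and
 * speed reserves are then subtracted from that.
 */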
49
50static void reserve_sb_and_journal(struct super_block *sb)
51{
52 struct logfs_super *super = logfs_super(sb);
53 struct btree_head32 *head = &super->s_reserved_segments;
54 int i, err;
55
56 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
57 GFP_KERNEL);
58 BUG_ON(err);
59
60 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
61 GFP_KERNEL);
62 BUG_ON(err);
63
64 journal_for_each(i) {
65 if (!super->s_journal_seg[i])
66 continue;
67 err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
68 GFP_KERNEL);
69 BUG_ON(err);
70 }
71}
72
73static void read_dynsb(struct super_block *sb,
74 struct logfs_je_dynsb *dynsb)
75{
76 struct logfs_super *super = logfs_super(sb);
77
78 super->s_gec = be64_to_cpu(dynsb->ds_gec);
79 super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper);
80 super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino);
81 super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir);
82 super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos);
83 super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes);
84 super->s_generation = be32_to_cpu(dynsb->ds_generation);
85}
86
87static void read_anchor(struct super_block *sb,
88 struct logfs_je_anchor *da)
89{
90 struct logfs_super *super = logfs_super(sb);
91 struct inode *inode = super->s_master_inode;
92 struct logfs_inode *li = logfs_inode(inode);
93 int i;
94
95 super->s_last_ino = be64_to_cpu(da->da_last_ino);
96 li->li_flags = 0;
97 li->li_height = da->da_height;
98 i_size_write(inode, be64_to_cpu(da->da_size));
99 li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
100
101 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
102 li->li_data[i] = be64_to_cpu(da->da_data[i]);
103}
104
105static void read_erasecount(struct super_block *sb,
106 struct logfs_je_journal_ec *ec)
107{
108 struct logfs_super *super = logfs_super(sb);
109 int i;
110
111 journal_for_each(i)
112 super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
113}
114
115static int read_area(struct super_block *sb, struct logfs_je_area *a)
116{
117 struct logfs_super *super = logfs_super(sb);
118 	struct logfs_area *area;
119 u64 ofs;
120 u32 writemask = ~(super->s_writesize - 1);
121
122 if (a->gc_level >= LOGFS_NO_AREAS)
123 return -EIO;
124 if (a->vim != VIM_DEFAULT)
125 return -EIO; /* TODO: close area and continue */
126 	area = super->s_area[a->gc_level];
127 area->a_used_bytes = be32_to_cpu(a->used_bytes);
128 area->a_written_bytes = area->a_used_bytes & writemask;
129 area->a_segno = be32_to_cpu(a->segno);
130 if (area->a_segno)
131 area->a_is_open = 1;
132
133 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
134 if (super->s_writesize > 1)
135 logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
136 else
137 logfs_buf_recover(area, ofs, NULL, 0);
138 return 0;
139}
140
141static void *unpack(void *from, void *to)
142{
143 struct logfs_journal_header *jh = from;
144 void *data = from + sizeof(struct logfs_journal_header);
145 int err;
146 size_t inlen, outlen;
147
148 inlen = be16_to_cpu(jh->h_len);
149 outlen = be16_to_cpu(jh->h_datalen);
150
151 if (jh->h_compr == COMPR_NONE)
152 memcpy(to, data, inlen);
153 else {
154 err = logfs_uncompress(data, to, inlen, outlen);
155 BUG_ON(err);
156 }
157 return to;
158}
159
160static int __read_je_header(struct super_block *sb, u64 ofs,
161 struct logfs_journal_header *jh)
162{
163 struct logfs_super *super = logfs_super(sb);
164 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
165 + MAX_JOURNAL_HEADER;
166 u16 type, len, datalen;
167 int err;
168
169 /* read header only */
170 err = wbuf_read(sb, ofs, sizeof(*jh), jh);
171 if (err)
172 return err;
173 type = be16_to_cpu(jh->h_type);
174 len = be16_to_cpu(jh->h_len);
175 datalen = be16_to_cpu(jh->h_datalen);
176 if (len > sb->s_blocksize)
177 return -EIO;
178 if ((type < JE_FIRST) || (type > JE_LAST))
179 return -EIO;
180 if (datalen > bufsize)
181 return -EIO;
182 return 0;
183}
184
185static int __read_je_payload(struct super_block *sb, u64 ofs,
186 struct logfs_journal_header *jh)
187{
188 u16 len;
189 int err;
190
191 len = be16_to_cpu(jh->h_len);
192 err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
193 if (err)
194 return err;
195 if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
196 /* Old code was confused. It forgot about the header length
197 * and stopped calculating the crc 16 bytes before the end
198 * of data - ick!
199 * FIXME: Remove this hack once the old code is fixed.
200 */
201 if (jh->h_crc == logfs_crc32(jh, len, 4))
202 WARN_ON_ONCE(1);
203 else
204 return -EIO;
205 }
206 return 0;
207}
208
209/*
210 * jh needs to be large enough to hold the complete entry, not just the header
211 */
212static int __read_je(struct super_block *sb, u64 ofs,
213 struct logfs_journal_header *jh)
214{
215 int err;
216
217 err = __read_je_header(sb, ofs, jh);
218 if (err)
219 return err;
220 return __read_je_payload(sb, ofs, jh);
221}
222
223static int read_je(struct super_block *sb, u64 ofs)
224{
225 struct logfs_super *super = logfs_super(sb);
226 struct logfs_journal_header *jh = super->s_compressed_je;
227 void *scratch = super->s_je;
228 u16 type, datalen;
229 int err;
230
231 err = __read_je(sb, ofs, jh);
232 if (err)
233 return err;
234 type = be16_to_cpu(jh->h_type);
235 datalen = be16_to_cpu(jh->h_datalen);
236
237 switch (type) {
238 case JE_DYNSB:
239 read_dynsb(sb, unpack(jh, scratch));
240 break;
241 case JE_ANCHOR:
242 read_anchor(sb, unpack(jh, scratch));
243 break;
244 case JE_ERASECOUNT:
245 read_erasecount(sb, unpack(jh, scratch));
246 break;
247 case JE_AREA:
248 		err = read_area(sb, unpack(jh, scratch));
249 break;
250 case JE_OBJ_ALIAS:
251 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
252 datalen);
253 break;
254 default:
255 WARN_ON_ONCE(1);
256 return -EIO;
257 }
258 return err;
259}
260
261static int logfs_read_segment(struct super_block *sb, u32 segno)
262{
263 struct logfs_super *super = logfs_super(sb);
264 struct logfs_journal_header *jh = super->s_compressed_je;
265 u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
266 u32 h_ofs, last_ofs = 0;
267 u16 len, datalen, last_len = 0;
268 int i, err;
269
270 /* search for most recent commit */
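	/* Journal entries start at 16-byte boundaries and sizeof(*jh) is 16,
	 * so stepping h_ofs by sizeof(*jh) visits every possible header
	 * position; recognized commits are then stepped over wholesale. */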
271 for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
272 ofs = seg_ofs + h_ofs;
273 err = __read_je_header(sb, ofs, jh);
274 if (err)
275 continue;
276 if (jh->h_type != cpu_to_be16(JE_COMMIT))
277 continue;
278 err = __read_je_payload(sb, ofs, jh);
279 if (err)
280 continue;
281 len = be16_to_cpu(jh->h_len);
282 datalen = be16_to_cpu(jh->h_datalen);
283 if ((datalen > sizeof(super->s_je_array)) ||
284 (datalen % sizeof(__be64)))
285 continue;
286 last_ofs = h_ofs;
287 last_len = datalen;
288 h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
289 }
290 /* read commit */
291 if (last_ofs == 0)
292 return -ENOENT;
293 ofs = seg_ofs + last_ofs;
294 log_journal("Read commit from %llx\n", ofs);
295 err = __read_je(sb, ofs, jh);
296 BUG_ON(err); /* We should have caught it in the scan loop already */
297 if (err)
298 return err;
299 /* uncompress */
300 unpack(jh, super->s_je_array);
301 super->s_no_je = last_len / sizeof(__be64);
302 /* iterate over array */
303 for (i = 0; i < super->s_no_je; i++) {
304 err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
305 if (err)
306 return err;
307 }
308 super->s_journal_area->a_segno = segno;
309 return 0;
310}
311
312static u64 read_gec(struct super_block *sb, u32 segno)
313{
314 struct logfs_segment_header sh;
315 __be32 crc;
316 int err;
317
318 if (!segno)
319 return 0;
320 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
321 if (err)
322 return 0;
323 crc = logfs_crc32(&sh, sizeof(sh), 4);
324 if (crc != sh.crc) {
325 WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
326 /* Most likely it was just erased */
327 return 0;
328 }
329 return be64_to_cpu(sh.gec);
330}
331
332static int logfs_read_journal(struct super_block *sb)
333{
334 struct logfs_super *super = logfs_super(sb);
335 u64 gec[LOGFS_JOURNAL_SEGS], max;
336 u32 segno;
337 int i, max_i;
338
339 max = 0;
340 max_i = -1;
341 journal_for_each(i) {
342 segno = super->s_journal_seg[i];
343 gec[i] = read_gec(sb, super->s_journal_seg[i]);
344 if (gec[i] > max) {
345 max = gec[i];
346 max_i = i;
347 }
348 }
349 if (max_i == -1)
350 return -EIO;
351 /* FIXME: Try older segments in case of error */
352 return logfs_read_segment(sb, super->s_journal_seg[max_i]);
353}
354
355/*
356 * First search the current segment (outer loop), then pick the next segment
357 * in the array, skipping any zero entries (inner loop).
358 */
359static void journal_get_free_segment(struct logfs_area *area)
360{
361 struct logfs_super *super = logfs_super(area->a_sb);
362 int i;
363
364 journal_for_each(i) {
365 if (area->a_segno != super->s_journal_seg[i])
366 continue;
367
368 do {
369 i++;
370 if (i == LOGFS_JOURNAL_SEGS)
371 i = 0;
372 } while (!super->s_journal_seg[i]);
373
374 area->a_segno = super->s_journal_seg[i];
375 area->a_erase_count = ++(super->s_journal_ec[i]);
376 log_journal("Journal now at %x (ec %x)\n", area->a_segno,
377 area->a_erase_count);
378 return;
379 }
380 BUG();
381}
382
383static void journal_get_erase_count(struct logfs_area *area)
384{
385 /* erase count is stored globally and incremented in
386 * journal_get_free_segment() - nothing to do here */
387}
388
389static int journal_erase_segment(struct logfs_area *area)
390{
391 struct super_block *sb = area->a_sb;
392 struct logfs_segment_header sh;
393 u64 ofs;
394 int err;
395
396 err = logfs_erase_segment(sb, area->a_segno, 1);
397 if (err)
398 return err;
399
400 sh.pad = 0;
401 sh.type = SEG_JOURNAL;
402 sh.level = 0;
403 sh.segno = cpu_to_be32(area->a_segno);
404 sh.ec = cpu_to_be32(area->a_erase_count);
405 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
406 sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
407
408 	/* Enabling this causes a bug in segment.c, so leave it disabled for now. */
409 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
410
411 ofs = dev_ofs(sb, area->a_segno, 0);
412 area->a_used_bytes = ALIGN(sizeof(sh), 16);
413 logfs_buf_write(area, ofs, &sh, sizeof(sh));
414 return 0;
415}
416
417static size_t __logfs_write_header(struct logfs_super *super,
418 struct logfs_journal_header *jh, size_t len, size_t datalen,
419 u16 type, u8 compr)
420{
421 jh->h_len = cpu_to_be16(len);
422 jh->h_type = cpu_to_be16(type);
423 jh->h_datalen = cpu_to_be16(datalen);
424 jh->h_compr = compr;
425 jh->h_pad[0] = 'H';
426 jh->h_pad[1] = 'E';
427 jh->h_pad[2] = 'A';
428 jh->h_pad[3] = 'D';
429 jh->h_pad[4] = 'R';
430 jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4);
431 return ALIGN(len, 16) + sizeof(*jh);
432}
433
434static size_t logfs_write_header(struct logfs_super *super,
435 struct logfs_journal_header *jh, size_t datalen, u16 type)
436{
437 size_t len = datalen;
438
439 return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
440}
441
442static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
443{
444 return LOGFS_JOURNAL_SEGS * sizeof(__be32);
445}
446
447static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
448 u16 *type, size_t *len)
449{
450 struct logfs_super *super = logfs_super(sb);
451 struct logfs_je_journal_ec *ec = _ec;
452 int i;
453
454 journal_for_each(i)
455 ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
456 *type = JE_ERASECOUNT;
457 *len = logfs_journal_erasecount_size(super);
458 return ec;
459}
460
461static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
462 size_t ignore2)
463{
464 struct logfs_shadow *shadow = _shadow;
465 struct super_block *sb = (void *)_sb;
466 struct logfs_super *super = logfs_super(sb);
467
468 /* consume new space */
469 super->s_free_bytes -= shadow->new_len;
470 super->s_used_bytes += shadow->new_len;
471 super->s_dirty_used_bytes -= shadow->new_len;
472
473 /* free up old space */
474 super->s_free_bytes += shadow->old_len;
475 super->s_used_bytes -= shadow->old_len;
476 super->s_dirty_free_bytes -= shadow->old_len;
477
478 logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
479 logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);
480
481 log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
482 shadow->ino, shadow->bix, shadow->gc_level,
483 shadow->old_ofs, shadow->new_ofs,
484 shadow->old_len, shadow->new_len);
485 mempool_free(shadow, super->s_shadow_pool);
486}
487
488static void account_shadows(struct super_block *sb)
489{
490 struct logfs_super *super = logfs_super(sb);
491 struct inode *inode = super->s_master_inode;
492 struct logfs_inode *li = logfs_inode(inode);
493 struct shadow_tree *tree = &super->s_shadow_tree;
494
495 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
496 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
497
498 if (li->li_block) {
499 /*
500 * We never actually use the structure, when attached to the
501 * master inode. But it is easier to always free it here than
502 * to have checks in several places elsewhere when allocating
503 * it.
504 */
505 li->li_block->ops->free_block(sb, li->li_block);
506 }
507 BUG_ON((s64)li->li_used_bytes < 0);
508}
509
510static void *__logfs_write_anchor(struct super_block *sb, void *_da,
511 u16 *type, size_t *len)
512{
513 struct logfs_super *super = logfs_super(sb);
514 struct logfs_je_anchor *da = _da;
515 struct inode *inode = super->s_master_inode;
516 struct logfs_inode *li = logfs_inode(inode);
517 int i;
518
519 da->da_height = li->li_height;
520 da->da_last_ino = cpu_to_be64(super->s_last_ino);
521 da->da_size = cpu_to_be64(i_size_read(inode));
522 da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
523 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
524 da->da_data[i] = cpu_to_be64(li->li_data[i]);
525 *type = JE_ANCHOR;
526 *len = sizeof(*da);
527 return da;
528}
529
530static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
531 u16 *type, size_t *len)
532{
533 struct logfs_super *super = logfs_super(sb);
534 struct logfs_je_dynsb *dynsb = _dynsb;
535
536 dynsb->ds_gec = cpu_to_be64(super->s_gec);
537 dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper);
538 dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino);
539 dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir);
540 dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos);
541 dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes);
542 dynsb->ds_generation = cpu_to_be32(super->s_generation);
543 *type = JE_DYNSB;
544 *len = sizeof(*dynsb);
545 return dynsb;
546}
547
548static void write_wbuf(struct super_block *sb, struct logfs_area *area,
549 void *wbuf)
550{
551 struct logfs_super *super = logfs_super(sb);
552 struct address_space *mapping = super->s_mapping_inode->i_mapping;
553 u64 ofs;
554 pgoff_t index;
555 int page_ofs;
556 struct page *page;
557
558 ofs = dev_ofs(sb, area->a_segno,
559 area->a_used_bytes & ~(super->s_writesize - 1));
560 index = ofs >> PAGE_SHIFT;
561 page_ofs = ofs & (PAGE_SIZE - 1);
562
563 page = find_lock_page(mapping, index);
564 BUG_ON(!page);
565 memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
566 unlock_page(page);
567}
568
569static void *logfs_write_area(struct super_block *sb, void *_a,
570 u16 *type, size_t *len)
571{
572 struct logfs_super *super = logfs_super(sb);
573 struct logfs_area *area = super->s_area[super->s_sum_index];
574 struct logfs_je_area *a = _a;
575
576 a->vim = VIM_DEFAULT;
577 a->gc_level = super->s_sum_index;
578 a->used_bytes = cpu_to_be32(area->a_used_bytes);
579 a->segno = cpu_to_be32(area->a_segno);
580 if (super->s_writesize > 1)
581 write_wbuf(sb, area, a + 1);
582
583 *type = JE_AREA;
584 *len = sizeof(*a) + super->s_writesize;
585 return a;
586}
587
588static void *logfs_write_commit(struct super_block *sb, void *h,
589 u16 *type, size_t *len)
590{
591 struct logfs_super *super = logfs_super(sb);
592
593 *type = JE_COMMIT;
594 *len = super->s_no_je * sizeof(__be64);
595 return super->s_je_array;
596}
597
598static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
599 size_t len)
600{
601 struct logfs_super *super = logfs_super(sb);
602 void *header = super->s_compressed_je;
603 void *data = header + sizeof(struct logfs_journal_header);
604 ssize_t compr_len, pad_len;
605 u8 compr = COMPR_ZLIB;
606
607 if (len == 0)
608 return logfs_write_header(super, header, 0, type);
609
610 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
611 if (compr_len < 0 || type == JE_ANCHOR) {
612 BUG_ON(len > sb->s_blocksize);
613 memcpy(data, buf, len);
614 compr_len = len;
615 compr = COMPR_NONE;
616 }
617
618 pad_len = ALIGN(compr_len, 16);
619 memset(data + compr_len, 0, pad_len - compr_len);
620
621 return __logfs_write_header(super, header, compr_len, len, type, compr);
622}
623
624static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
625 int must_pad)
626{
627 u32 writesize = logfs_super(area->a_sb)->s_writesize;
628 s32 ofs;
629 int ret;
630
631 ret = logfs_open_area(area, *bytes);
632 if (ret)
633 return -EAGAIN;
634
635 ofs = area->a_used_bytes;
636 area->a_used_bytes += *bytes;
637
638 if (must_pad) {
639 area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
640 *bytes = area->a_used_bytes - ofs;
641 }
642
643 return dev_ofs(area->a_sb, area->a_segno, ofs);
644}
645
646static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
647 size_t buf_len)
648{
649 struct logfs_super *super = logfs_super(sb);
650 struct logfs_area *area = super->s_journal_area;
651 struct logfs_journal_header *jh = super->s_compressed_je;
652 size_t len;
653 int must_pad = 0;
654 s64 ofs;
655
656 len = __logfs_write_je(sb, buf, type, buf_len);
657 if (jh->h_type == cpu_to_be16(JE_COMMIT))
658 must_pad = 1;
659
660 ofs = logfs_get_free_bytes(area, &len, must_pad);
661 if (ofs < 0)
662 return ofs;
663 logfs_buf_write(area, ofs, super->s_compressed_je, len);
664 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
665 return 0;
666}
667
668static int logfs_write_je(struct super_block *sb,
669 void* (*write)(struct super_block *sb, void *scratch,
670 u16 *type, size_t *len))
671{
672 void *buf;
673 size_t len;
674 u16 type;
675
676 buf = write(sb, logfs_super(sb)->s_je, &type, &len);
677 return logfs_write_je_buf(sb, buf, type, len);
678}
679
680int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
681 level_t level, int child_no, __be64 val)
682{
683 struct logfs_super *super = logfs_super(sb);
684 struct logfs_obj_alias *oa = super->s_je;
685 int err = 0, fill = super->s_je_fill;
686
687 log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
688 fill, ino, bix, level, child_no, be64_to_cpu(val));
689 oa[fill].ino = cpu_to_be64(ino);
690 oa[fill].bix = cpu_to_be64(bix);
691 oa[fill].val = val;
692 oa[fill].level = (__force u8)level;
693 oa[fill].child_no = cpu_to_be16(child_no);
694 fill++;
695 if (fill >= sb->s_blocksize / sizeof(*oa)) {
696 err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
697 fill = 0;
698 }
699
700 super->s_je_fill = fill;
701 return err;
702}
703
704static int logfs_write_obj_aliases(struct super_block *sb)
705{
706 struct logfs_super *super = logfs_super(sb);
707 int err;
708
709 log_journal("logfs_write_obj_aliases: %d aliases to write\n",
710 super->s_no_object_aliases);
711 super->s_je_fill = 0;
712 err = logfs_write_obj_aliases_pagecache(sb);
713 if (err)
714 return err;
715
716 if (super->s_je_fill)
717 err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
718 super->s_je_fill
719 * sizeof(struct logfs_obj_alias));
720 return err;
721}
722
723/*
724 * Write all journal entries. The goto logic ensures that all journal entries
725 * are written whenever a new segment is used. It is ugly and potentially a
726 * bit wasteful, but robustness is more important. With this we can *always*
727 * erase all journal segments except the one containing the most recent commit.
728 */
729void logfs_write_anchor(struct super_block *sb)
730{
731 struct logfs_super *super = logfs_super(sb);
732 struct logfs_area *area = super->s_journal_area;
733 int i, err;
734
735 if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY))
736 return;
737 super->s_flags &= ~LOGFS_SB_FLAG_DIRTY;
738
739 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
740 mutex_lock(&super->s_journal_mutex);
741
742 /* Do this first or suffer corruption */
743 logfs_sync_segments(sb);
744 account_shadows(sb);
745
746again:
747 super->s_no_je = 0;
748 for_each_area(i) {
749 if (!super->s_area[i]->a_is_open)
750 continue;
751 super->s_sum_index = i;
752 err = logfs_write_je(sb, logfs_write_area);
753 if (err)
754 goto again;
755 }
756 err = logfs_write_obj_aliases(sb);
757 if (err)
758 goto again;
759 err = logfs_write_je(sb, logfs_write_erasecount);
760 if (err)
761 goto again;
762 err = logfs_write_je(sb, __logfs_write_anchor);
763 if (err)
764 goto again;
765 err = logfs_write_je(sb, logfs_write_dynsb);
766 if (err)
767 goto again;
768 /*
769 * Order is imperative. First we sync all writes, including the
770 * non-committed journal writes. Then we write the final commit and
771 * sync the current journal segment.
772 * There is a theoretical bug here. Syncing the journal segment will
773 * write a number of journal entries and the final commit. All these
774 * are written in a single operation. If the device layer writes the
775 * data back-to-front, the commit will precede the other journal
776 * entries, leaving a race window.
777 * Two fixes are possible. Preferred is to fix the device layer to
778 * ensure writes happen front-to-back. Alternatively we can insert
779 * another logfs_sync_area() super->s_devops->sync() combo before
780 * writing the commit.
781 */
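	/*
	 * A sketch of the alternative fix mentioned above (deliberately not
	 * enabled here):
	 *
	 *	logfs_sync_area(area);
	 *	super->s_devops->sync(sb);
	 *	err = logfs_write_je(sb, logfs_write_commit);
	 */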
782 /*
783 * On another subject, super->s_devops->sync is usually not necessary.
784 * Unless called from sys_sync or friends, a barrier would suffice.
785 */
786 super->s_devops->sync(sb);
787 err = logfs_write_je(sb, logfs_write_commit);
788 if (err)
789 goto again;
790 log_journal("Write commit to %llx\n",
791 be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
792 logfs_sync_area(area);
793 BUG_ON(area->a_used_bytes != area->a_written_bytes);
794 super->s_devops->sync(sb);
795
796 mutex_unlock(&super->s_journal_mutex);
797 return;
798}
799
800void do_logfs_journal_wl_pass(struct super_block *sb)
801{
802 struct logfs_super *super = logfs_super(sb);
803 struct logfs_area *area = super->s_journal_area;
804 struct btree_head32 *head = &super->s_reserved_segments;
805 u32 segno, ec;
806 int i, err;
807
808 log_journal("Journal requires wear-leveling.\n");
809 /* Drop old segments */
810 journal_for_each(i)
811 if (super->s_journal_seg[i]) {
812 btree_remove32(head, super->s_journal_seg[i]);
813 logfs_set_segment_unreserved(sb,
814 super->s_journal_seg[i],
815 super->s_journal_ec[i]);
816 super->s_journal_seg[i] = 0;
817 super->s_journal_ec[i] = 0;
818 }
819 /* Get new segments */
820 for (i = 0; i < super->s_no_journal_segs; i++) {
821 segno = get_best_cand(sb, &super->s_reserve_list, &ec);
822 super->s_journal_seg[i] = segno;
823 super->s_journal_ec[i] = ec;
824 logfs_set_segment_reserved(sb, segno);
825 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
826 BUG_ON(err); /* mempool should prevent this */
827 err = logfs_erase_segment(sb, segno, 1);
828 BUG_ON(err); /* FIXME: remount-ro would be nicer */
829 }
830 /* Manually move journal_area */
831 freeseg(sb, area->a_segno);
832 area->a_segno = super->s_journal_seg[0];
833 area->a_is_open = 0;
834 area->a_used_bytes = 0;
835 /* Write journal */
836 logfs_write_anchor(sb);
837 /* Write superblocks */
838 err = logfs_write_sb(sb);
839 BUG_ON(err);
840}
841
842static const struct logfs_area_ops journal_area_ops = {
843 .get_free_segment = journal_get_free_segment,
844 .get_erase_count = journal_get_erase_count,
845 .erase_segment = journal_erase_segment,
846};
847
848int logfs_init_journal(struct super_block *sb)
849{
850 struct logfs_super *super = logfs_super(sb);
851 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
852 + MAX_JOURNAL_HEADER;
853 int ret = -ENOMEM;
854
855 mutex_init(&super->s_journal_mutex);
856 btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
857
858 super->s_je = kzalloc(bufsize, GFP_KERNEL);
859 if (!super->s_je)
860 return ret;
861
862 super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
863 if (!super->s_compressed_je)
864 return ret;
865
866 super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
867 if (IS_ERR(super->s_master_inode))
868 return PTR_ERR(super->s_master_inode);
869
870 ret = logfs_read_journal(sb);
871 if (ret)
872 return -EIO;
873
874 reserve_sb_and_journal(sb);
875 logfs_calc_free(sb);
876
877 super->s_journal_area->a_ops = &journal_area_ops;
878 return 0;
879}
880
881void logfs_cleanup_journal(struct super_block *sb)
882{
883 struct logfs_super *super = logfs_super(sb);
884
885 btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
886 destroy_meta_inode(super->s_master_inode);
887 super->s_master_inode = NULL;
888
889 kfree(super->s_compressed_je);
890 kfree(super->s_je);
891}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
new file mode 100644
index 000000000000..b84b0eec6024
--- /dev/null
+++ b/fs/logfs/logfs.h
@@ -0,0 +1,725 @@
1/*
2 * fs/logfs/logfs.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Private header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_H
11#define FS_LOGFS_LOGFS_H
12
13#undef __CHECK_ENDIAN__
14#define __CHECK_ENDIAN__
15
16#include <linux/btree.h>
17#include <linux/crc32.h>
18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/mempool.h>
21#include <linux/pagemap.h>
22#include <linux/mtd/mtd.h>
23#include "logfs_abi.h"
24
25#define LOGFS_DEBUG_SUPER (0x0001)
26#define LOGFS_DEBUG_SEGMENT (0x0002)
27#define LOGFS_DEBUG_JOURNAL (0x0004)
28#define LOGFS_DEBUG_DIR (0x0008)
29#define LOGFS_DEBUG_FILE (0x0010)
30#define LOGFS_DEBUG_INODE (0x0020)
31#define LOGFS_DEBUG_READWRITE (0x0040)
32#define LOGFS_DEBUG_GC (0x0080)
33#define LOGFS_DEBUG_GC_NOISY (0x0100)
34#define LOGFS_DEBUG_ALIASES (0x0200)
35#define LOGFS_DEBUG_BLOCKMOVE (0x0400)
36#define LOGFS_DEBUG_ALL (0xffffffff)
37
39/*
40 * To enable specific log messages, simply define LOGFS_DEBUG to match any
41 * or all of the above.
42 */
43#ifndef LOGFS_DEBUG
44#define LOGFS_DEBUG (0)
45#endif
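
/*
 * Example (hypothetical build tweak): to see only journal and GC messages,
 * define
 *
 *	#define LOGFS_DEBUG	(LOGFS_DEBUG_JOURNAL | LOGFS_DEBUG_GC)
 *
 * before the #ifndef guard above.
 */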
46
47#define log_cond(cond, fmt, arg...) do { \
48 if (cond) \
49 printk(KERN_DEBUG fmt, ##arg); \
50} while (0)
51
52#define log_super(fmt, arg...) \
53 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
54#define log_segment(fmt, arg...) \
55 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
56#define log_journal(fmt, arg...) \
57 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
58#define log_dir(fmt, arg...) \
59 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
60#define log_file(fmt, arg...) \
61 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
62#define log_inode(fmt, arg...) \
63 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
64#define log_readwrite(fmt, arg...) \
65 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
66#define log_gc(fmt, arg...) \
67 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
68#define log_gc_noisy(fmt, arg...) \
69 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
70#define log_aliases(fmt, arg...) \
71 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
72#define log_blockmove(fmt, arg...) \
73 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
74
75#define PG_pre_locked PG_owner_priv_1
76#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags)
77#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags)
78#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
79
80/* FIXME: This should really be somewhere in the 64bit area. */
81#define LOGFS_LINK_MAX (1<<30)
82
83/* Read-only filesystem */
84#define LOGFS_SB_FLAG_RO 0x0001
85#define LOGFS_SB_FLAG_DIRTY 0x0002
86#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
87#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
88
89/* Write Control Flags */
90#define WF_LOCK 0x01 /* take write lock */
91#define WF_WRITE 0x02 /* write block */
92#define WF_DELETE 0x04 /* delete old block */
93
94typedef u8 __bitwise level_t;
95typedef u8 __bitwise gc_level_t;
96
97#define LEVEL(level) ((__force level_t)(level))
98#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
99
100#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
101 (__force level_t)((__force u8)(level) - 1) )
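
/*
 * The (void)((level) == LEVEL(1)) part of SUBLEVEL() is a compile-time
 * type check only: comparing against a level_t lets sparse flag callers
 * that pass the wrong bitwise type, and the result is discarded.
 */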
102
103/**
104 * struct logfs_area - area management information
105 *
106 * @a_sb: the superblock this area belongs to
107 * @a_is_open: 1 if the area is currently open, else 0
108 * @a_segno: segment number of area
109 * @a_written_bytes: number of bytes already written back
110 * @a_used_bytes: number of used bytes
111 * @a_ops: area operations (either journal or ostore)
112 * @a_erase_count: erase count
113 * @a_level: GC level
114 */
115struct logfs_area { /* a segment open for writing */
116 struct super_block *a_sb;
117 int a_is_open;
118 u32 a_segno;
119 u32 a_written_bytes;
120 u32 a_used_bytes;
121 const struct logfs_area_ops *a_ops;
122 u32 a_erase_count;
123 gc_level_t a_level;
124};
125
126/**
127 * struct logfs_area_ops - area operations
128 *
129 * @get_free_segment: fill area->a_segno with a free segment's number
130 * @get_erase_count: fill area->a_erase_count (needs area->a_segno)
131 * @erase_segment: erase and setup segment
132 */
133struct logfs_area_ops {
134 void (*get_free_segment)(struct logfs_area *area);
135 void (*get_erase_count)(struct logfs_area *area);
136 int (*erase_segment)(struct logfs_area *area);
137};
138
139/**
140 * struct logfs_device_ops - device access operations
141 *
142 * @readpage: read one page (mm page)
143 * @writeseg: write one segment; may be a partial segment
144 * @erase: erase one segment
145 * @sync: wait until outstanding writes have reached the device
146 * @put_device: release the underlying device
147 */
148struct logfs_device_ops {
149 struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
150 struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
151 int (*write_sb)(struct super_block *sb, struct page *page);
152 int (*readpage)(void *_sb, struct page *page);
153 void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
154 int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
155 int ensure_write);
156 void (*sync)(struct super_block *sb);
157 void (*put_device)(struct super_block *sb);
158};
159
160/**
161 * struct candidate_list - list of similar candidates
162 */
163struct candidate_list {
164 struct rb_root rb_tree;
165 int count;
166 int maxcount;
167 int sort_by_ec;
168};
169
170/**
171 * struct gc_candidate - "candidate" segment to be garbage collected next
172 *
173 * @list: list (either free or low)
174 * @segno: segment number
175 * @valid: number of valid bytes
176 * @erase_count: erase count of segment
177 * @dist: distance from tree root
178 *
179 * Candidates can be on two lists. The free list contains electees rather
180 * than candidates - segments that no longer contain any valid data. The
181 * low list contains candidates to be picked for GC. It should be kept
182 * short. It is not required to always pick a perfect candidate. In the
183 * worst case GC will have to move more data than absolutely necessary.
184 */
185struct gc_candidate {
186 struct rb_node rb_node;
187 struct candidate_list *list;
188 u32 segno;
189 u32 valid;
190 u32 erase_count;
191 u8 dist;
192};
193
194/**
195 * struct logfs_journal_entry - temporary structure used during journal scan
196 *
197 * @used: non-zero once the slot holds a scanned entry
198 * @version: normalized version
199 * @len: length; @datalen: data length
200 * @offset: offset
201 */
202struct logfs_journal_entry {
203 int used;
204 s16 version;
205 u16 len;
206 u16 datalen;
207 u64 offset;
208};
209
210enum transaction_state {
211 CREATE_1 = 1,
212 CREATE_2,
213 UNLINK_1,
214 UNLINK_2,
215 CROSS_RENAME_1,
216 CROSS_RENAME_2,
217 TARGET_RENAME_1,
218 TARGET_RENAME_2,
219 TARGET_RENAME_3
220};
221
222/**
223 * struct logfs_transaction - essential fields to support atomic dirops
224 *
225 * @ino: target inode
226 * @dir: inode of directory containing dentry
227 * @pos: pos of dentry in directory
228 */
229struct logfs_transaction {
230 enum transaction_state state;
231 u64 ino;
232 u64 dir;
233 u64 pos;
234};
235
236/**
237 * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
238 * @old_ofs: offset of old block on medium
239 * @new_ofs: offset of new block on medium
240 * @ino: inode number
241 * @bix: block index
242 * @old_len: size of old block, including header
243 * @new_len: size of new block, including header
244 * @gc_level: GC level of the block
245 */
246struct logfs_shadow {
247 u64 old_ofs;
248 u64 new_ofs;
249 u64 ino;
250 u64 bix;
251 int old_len;
252 int new_len;
253 gc_level_t gc_level;
254};
255
256/**
257 * struct shadow_tree
258 * @new: shadows where old_ofs==0, indexed by new_ofs
259 * @old: shadows where old_ofs!=0, indexed by old_ofs
260 */
261struct shadow_tree {
262 struct btree_head64 new;
263 struct btree_head64 old;
264};
265
266struct object_alias_item {
267 struct list_head list;
268 __be64 val;
269 int child_no;
270};
271
272/**
273 * struct logfs_block - contains any block state
274 * @type: indirect block or inode
275 * @full: number of fully populated children
276 * @partial: number of partially populated children
277 *
278 * Most blocks are directly represented by page cache pages. But when a block
279 * becomes dirty, is part of a transaction, contains aliases or is otherwise
280 * special, a struct logfs_block is allocated to track the additional state.
281 * Inodes are very similar to indirect blocks, so they can also get one of
282 * these structures added when appropriate.
283 */
284#define BLOCK_INDIRECT 1 /* Indirect block */
285#define BLOCK_INODE 2 /* Inode */
286struct logfs_block_ops;
287struct logfs_block {
288 struct list_head alias_list;
289 struct list_head item_list;
290 struct super_block *sb;
291 u64 ino;
292 u64 bix;
293 level_t level;
294 struct page *page;
295 struct inode *inode;
296 struct logfs_transaction *ta;
297 unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
298 struct logfs_block_ops *ops;
299 int full;
300 int partial;
301 int reserved_bytes;
302};
303
304typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
305 level_t level, int child_no, __be64 val);
306struct logfs_block_ops {
307 void (*write_block)(struct logfs_block *block);
308 gc_level_t (*block_level)(struct logfs_block *block);
309 	void (*free_block)(struct super_block *sb, struct logfs_block *block);
310 int (*write_alias)(struct super_block *sb,
311 struct logfs_block *block,
312 write_alias_t *write_one_alias);
313};
314
315struct logfs_super {
316 struct mtd_info *s_mtd; /* underlying device */
317 struct block_device *s_bdev; /* underlying device */
318 const struct logfs_device_ops *s_devops;/* device access */
319 struct inode *s_master_inode; /* inode file */
320 struct inode *s_segfile_inode; /* segment file */
321 struct inode *s_mapping_inode; /* device mapping */
322 	atomic_t s_pending_writes; /* outstanding bios */
323 long s_flags;
324 mempool_t *s_btree_pool; /* for btree nodes */
325 mempool_t *s_alias_pool; /* aliases in segment.c */
326 u64 s_feature_incompat;
327 u64 s_feature_ro_compat;
328 u64 s_feature_compat;
329 u64 s_feature_flags;
330 u64 s_sb_ofs[2];
331 struct page *s_erase_page; /* for dev_bdev.c */
332 /* alias.c fields */
333 struct btree_head32 s_segment_alias; /* remapped segments */
334 int s_no_object_aliases;
335 struct list_head s_object_alias; /* remapped objects */
336 struct btree_head128 s_object_alias_tree; /* remapped objects */
337 struct mutex s_object_alias_mutex;
338 /* dir.c fields */
339 struct mutex s_dirop_mutex; /* for creat/unlink/rename */
340 u64 s_victim_ino; /* used for atomic dir-ops */
341 u64 s_rename_dir; /* source directory ino */
342 u64 s_rename_pos; /* position of source dd */
343 /* gc.c fields */
344 long s_segsize; /* size of a segment */
345 int s_segshift; /* log2 of segment size */
346 	long s_segmask; /* (1 << s_segshift) - 1 */
347 long s_no_segs; /* segments on device */
348 long s_no_journal_segs; /* segments used for journal */
349 long s_no_blocks; /* blocks per segment */
350 long s_writesize; /* minimum write size */
351 int s_writeshift; /* log2 of write size */
352 u64 s_size; /* filesystem size */
353 struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */
354 u64 s_gec; /* global erase count */
355 u64 s_wl_gec_ostore; /* time of last wl event */
356 u64 s_wl_gec_journal; /* time of last wl event */
357 u64 s_sweeper; /* current sweeper pos */
358 u8 s_ifile_levels; /* max level of ifile */
359 u8 s_iblock_levels; /* max level of regular files */
360 u8 s_data_levels; /* # of segments to leaf block*/
361 u8 s_total_levels; /* sum of above three */
362 struct btree_head32 s_cand_tree; /* all candidates */
363 struct candidate_list s_free_list; /* 100% free segments */
364 struct candidate_list s_reserve_list; /* Bad segment reserve */
365 struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
366 struct candidate_list s_ec_list; /* wear level candidates */
367 struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
368 /* inode.c fields */
369 u64 s_last_ino; /* highest ino used */
370 long s_inos_till_wrap;
371 u32 s_generation; /* i_generation for new files */
372 struct list_head s_freeing_list; /* inodes being freed */
373 /* journal.c fields */
374 struct mutex s_journal_mutex;
375 void *s_je; /* journal entry to compress */
376 void *s_compressed_je; /* block to write to journal */
377 u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
378 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
379 u64 s_last_version;
380 struct logfs_area *s_journal_area; /* open journal segment */
381 __be64 s_je_array[64];
382 int s_no_je;
383
384 int s_sum_index; /* for the 12 summaries */
385 struct shadow_tree s_shadow_tree;
386 int s_je_fill; /* index of current je */
387 /* readwrite.c fields */
388 struct mutex s_write_mutex;
389 int s_lock_count;
390 mempool_t *s_block_pool; /* struct logfs_block pool */
391 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
392 /*
393 * Space accounting:
394 * - s_used_bytes specifies space used to store valid data objects.
395 * - s_dirty_used_bytes is space used to store non-committed data
396 * objects. Those objects have already been written themselves,
397 * but they don't become valid until all indirect blocks up to the
398 * journal have been written as well.
399 * - s_dirty_free_bytes is space used to store the old copy of a
400 * replaced object, as long as the replacement is non-committed.
401 * In other words, it is the amount of space freed when all dirty
402 * blocks are written back.
403 * - s_free_bytes is the amount of free space available for any
404 * purpose.
405 * - s_root_reserve is the amount of free space available only to
406 * the root user. Non-privileged users can no longer write once
407 * this watermark has been reached.
408 * - s_speed_reserve is space which remains unused to speed up
409 * garbage collection performance.
410 * - s_dirty_pages is the space reserved for currently dirty pages.
411 * It is a pessimistic estimate, so some/most will get freed on
412 * page writeback.
413 *
414 * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
415 */
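	/*
	 * Example: while a replaced 4k object is still uncommitted, its old
	 * copy is counted in s_dirty_free_bytes and its new copy in
	 * s_dirty_used_bytes; the journal commit then moves both into the
	 * plain used/free accounting.
	 */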
416 u64 s_free_bytes;
417 u64 s_used_bytes;
418 u64 s_dirty_free_bytes;
419 u64 s_dirty_used_bytes;
420 u64 s_root_reserve;
421 u64 s_speed_reserve;
422 u64 s_dirty_pages;
423 /* Bad block handling:
424 * - s_bad_seg_reserve is a number of segments usually kept
425 * free. When encountering bad blocks, the affected segment's data
426 * is _temporarily_ moved to a reserved segment.
427 * - s_bad_segments is the number of known bad segments.
428 */
429 u32 s_bad_seg_reserve;
430 u32 s_bad_segments;
431};
432
433/**
434 * struct logfs_inode - in-memory inode
435 *
436 * @vfs_inode: struct inode
437 * @li_data: data pointers
438 * @li_used_bytes: number of used bytes
439 * @li_freeing_list: used to track inodes currently being freed
440 * @li_flags: inode flags
441 * @li_refcount: number of internal (GC-induced) references
442 */
443struct logfs_inode {
444 struct inode vfs_inode;
445 u64 li_data[LOGFS_EMBEDDED_FIELDS];
446 u64 li_used_bytes;
447 struct list_head li_freeing_list;
448 struct logfs_block *li_block;
449 u32 li_flags;
450 u8 li_height;
451 int li_refcount;
452};
453
454#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
455#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
456#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
457
458/* compr.c */
459int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
460int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
461int __init logfs_compr_init(void);
462void logfs_compr_exit(void);
463
464/* dev_bdev.c */
465#ifdef CONFIG_BLOCK
466int logfs_get_sb_bdev(struct file_system_type *type, int flags,
467 const char *devname, struct vfsmount *mnt);
468#else
469static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
470 const char *devname, struct vfsmount *mnt)
471{
472 return -ENODEV;
473}
474#endif
475
476/* dev_mtd.c */
477#ifdef CONFIG_MTD
478int logfs_get_sb_mtd(struct file_system_type *type, int flags,
479 int mtdnr, struct vfsmount *mnt);
480#else
481static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
482 int mtdnr, struct vfsmount *mnt)
483{
484 return -ENODEV;
485}
486#endif
487
488/* dir.c */
489extern const struct inode_operations logfs_symlink_iops;
490extern const struct inode_operations logfs_dir_iops;
491extern const struct file_operations logfs_dir_fops;
492int logfs_replay_journal(struct super_block *sb);
493
494/* file.c */
495extern const struct inode_operations logfs_reg_iops;
496extern const struct file_operations logfs_reg_fops;
497extern const struct address_space_operations logfs_reg_aops;
498int logfs_readpage(struct file *file, struct page *page);
499int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
500 unsigned long arg);
501int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
502
503/* gc.c */
504u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
505void logfs_gc_pass(struct super_block *sb);
506int logfs_check_areas(struct super_block *sb);
507int logfs_init_gc(struct super_block *sb);
508void logfs_cleanup_gc(struct super_block *sb);
509
510/* inode.c */
511extern const struct super_operations logfs_super_operations;
512struct inode *logfs_iget(struct super_block *sb, ino_t ino);
513struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
514void logfs_safe_iput(struct inode *inode, int cookie);
515struct inode *logfs_new_inode(struct inode *dir, int mode);
516struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
517struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
518int logfs_init_inode_cache(void);
519void logfs_destroy_inode_cache(void);
520void destroy_meta_inode(struct inode *inode);
521void logfs_set_blocks(struct inode *inode, u64 no);
522/* these logically belong into inode.c but actually reside in readwrite.c */
523int logfs_read_inode(struct inode *inode);
524int __logfs_write_inode(struct inode *inode, long flags);
525void logfs_delete_inode(struct inode *inode);
526void logfs_clear_inode(struct inode *inode);
527
528/* journal.c */
529void logfs_write_anchor(struct super_block *sb);
530int logfs_init_journal(struct super_block *sb);
531void logfs_cleanup_journal(struct super_block *sb);
532int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
533 level_t level, int child_no, __be64 val);
534void do_logfs_journal_wl_pass(struct super_block *sb);
535
536/* readwrite.c */
537pgoff_t logfs_pack_index(u64 bix, level_t level);
538void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
539int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
540 loff_t bix, long flags, struct shadow_tree *shadow_tree);
541int logfs_readpage_nolock(struct page *page);
542int logfs_write_buf(struct inode *inode, struct page *page, long flags);
543int logfs_delete(struct inode *inode, pgoff_t index,
544 struct shadow_tree *shadow_tree);
545int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
546 gc_level_t gc_level, long flags);
547int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
548 gc_level_t gc_level);
549int logfs_truncate(struct inode *inode, u64 size);
550u64 logfs_seek_hole(struct inode *inode, u64 bix);
551u64 logfs_seek_data(struct inode *inode, u64 bix);
552int logfs_open_segfile(struct super_block *sb);
553int logfs_init_rw(struct super_block *sb);
554void logfs_cleanup_rw(struct super_block *sb);
555void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
556void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
557void logfs_write_block(struct logfs_block *block, long flags);
558int logfs_write_obj_aliases_pagecache(struct super_block *sb);
559void logfs_get_segment_entry(struct super_block *sb, u32 segno,
560 struct logfs_segment_entry *se);
561void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
562void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
563 gc_level_t gc_level);
564void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
565void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
566struct logfs_block *__alloc_block(struct super_block *sb,
567 u64 ino, u64 bix, level_t level);
568void __free_block(struct super_block *sb, struct logfs_block *block);
569void btree_write_block(struct logfs_block *block);
570void initialize_block_counters(struct page *page, struct logfs_block *block,
571 __be64 *array, int page_is_empty);
572int logfs_exist_block(struct inode *inode, u64 bix);
573int get_page_reserve(struct inode *inode, struct page *page);
574extern struct logfs_block_ops indirect_block_ops;
575
576/* segment.c */
577int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
578int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
579int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
580 level_t level);
581int logfs_segment_write(struct inode *inode, struct page *page,
582 struct logfs_shadow *shadow);
583int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
584int logfs_load_object_aliases(struct super_block *sb,
585 struct logfs_obj_alias *oa, int count);
586void move_page_to_btree(struct page *page);
587int logfs_init_mapping(struct super_block *sb);
588void logfs_sync_area(struct logfs_area *area);
589void logfs_sync_segments(struct super_block *sb);
590void freeseg(struct super_block *sb, u32 segno);
591
592/* area handling */
593int logfs_init_areas(struct super_block *sb);
594void logfs_cleanup_areas(struct super_block *sb);
595int logfs_open_area(struct logfs_area *area, size_t bytes);
596void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
597 int use_filler);
598
599static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
600 void *buf, size_t len)
601{
602 __logfs_buf_write(area, ofs, buf, len, 0);
603}
604
605static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
606 void *buf, size_t len)
607{
608 __logfs_buf_write(area, ofs, buf, len, 1);
609}
610
611/* super.c */
612struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
613void emergency_read_end(struct page *page);
614void logfs_crash_dump(struct super_block *sb);
615void *memchr_inv(const void *s, int c, size_t n);
616int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
617int logfs_get_sb_device(struct file_system_type *type, int flags,
618 struct mtd_info *mtd, struct block_device *bdev,
619 const struct logfs_device_ops *devops, struct vfsmount *mnt);
620int logfs_check_ds(struct logfs_disk_super *ds);
621int logfs_write_sb(struct super_block *sb);
622
623static inline struct logfs_super *logfs_super(struct super_block *sb)
624{
625 return sb->s_fs_info;
626}
627
628static inline struct logfs_inode *logfs_inode(struct inode *inode)
629{
630 return container_of(inode, struct logfs_inode, vfs_inode);
631}
632
633static inline void logfs_set_ro(struct super_block *sb)
634{
635 logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
636}
637
638#define LOGFS_BUG(sb) do { \
639 struct super_block *__sb = sb; \
640 logfs_crash_dump(__sb); \
641 logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \
642 BUG(); \
643} while (0)
644
645#define LOGFS_BUG_ON(condition, sb) \
646 do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
647
648static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
649{
650 return cpu_to_be32(crc32(~0, data+skip, len-skip));
651}
652
653static inline u8 logfs_type(struct inode *inode)
654{
655 return (inode->i_mode >> 12) & 15;
656}
657
658static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
659{
660 return pos >> sb->s_blocksize_bits;
661}
662
663static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
664{
665 return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
666}
667
668static inline u32 seg_no(struct super_block *sb, u64 ofs)
669{
670 return ofs >> logfs_super(sb)->s_segshift;
671}
672
673static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
674{
675 return ofs & logfs_super(sb)->s_segmask;
676}
677
678static inline u64 seg_align(struct super_block *sb, u64 ofs)
679{
680 return ofs & ~logfs_super(sb)->s_segmask;
681}
682
683static inline struct logfs_block *logfs_block(struct page *page)
684{
685 return (void *)page->private;
686}
687
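/*
 * Editorial note, not part of the original patch: gc_level encodes both
 * the tree level and whether a block belongs to the ifile (levels 6-11)
 * or a regular file (levels 0-5, see logfs_abi.h). shrink_level() folds
 * a gc_level back to a plain tree level; expand_level() is the reverse.
 */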
688static inline level_t shrink_level(gc_level_t __level)
689{
690 u8 level = (__force u8)__level;
691
692 if (level >= LOGFS_MAX_LEVELS)
693 level -= LOGFS_MAX_LEVELS;
694 return (__force level_t)level;
695}
696
697static inline gc_level_t expand_level(u64 ino, level_t __level)
698{
699 u8 level = (__force u8)__level;
700
701 if (ino == LOGFS_INO_MASTER) {
702 /* ifile has separate areas */
703 level += LOGFS_MAX_LEVELS;
704 }
705 return (__force gc_level_t)level;
706}
707
708static inline int logfs_block_shift(struct super_block *sb, level_t level)
709{
710 level = shrink_level((__force gc_level_t)level);
711 return (__force int)level * (sb->s_blocksize_bits - 3);
712}
713
714static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
715{
716 return ~0ull << logfs_block_shift(sb, level);
717}
718
719static inline struct logfs_area *get_area(struct super_block *sb,
720 gc_level_t gc_level)
721{
722 return logfs_super(sb)->s_area[(__force u8)gc_level];
723}
724
725#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
new file mode 100644
index 000000000000..f674725663fe
--- /dev/null
+++ b/fs/logfs/logfs_abi.h
@@ -0,0 +1,629 @@
1/*
2 * fs/logfs/logfs_abi.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Public header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_ABI_H
11#define FS_LOGFS_LOGFS_ABI_H
12
13/* For out-of-kernel compiles */
14#ifndef BUILD_BUG_ON
15#define BUILD_BUG_ON(condition) /**/
16#endif
17
18#define SIZE_CHECK(type, size) \
19static inline void check_##type(void) \
20{ \
21 BUILD_BUG_ON(sizeof(struct type) != (size)); \
22}
23
24/*
25 * Throughout the logfs code, we're constantly dealing with blocks at
26 * various positions or offsets. To remove confusion, we strictly
27 * distinguish between a "position" - the logical position within a
28 * file and an "offset" - the physical location within the device.
29 *
30 * Any usage of the term offset for a logical location or position for
31 * a physical one is a bug and should get fixed.
32 */
33
34/*
35 * Block are allocated in one of several segments depending on their
36 * level. The following levels are used:
37 * 0 - regular data block
38 * 1 - i1 indirect blocks
39 * 2 - i2 indirect blocks
40 * 3 - i3 indirect blocks
41 * 4 - i4 indirect blocks
42 * 5 - i5 indirect blocks
43 * 6 - ifile data blocks
44 * 7 - ifile i1 indirect blocks
45 * 8 - ifile i2 indirect blocks
46 * 9 - ifile i3 indirect blocks
47 * 10 - ifile i4 indirect blocks
48 * 11 - ifile i5 indirect blocks
49 * Potential levels to be used in the future:
50 * 12 - gc recycled blocks, long-lived data
51 * 13 - replacement blocks, short-lived data
52 *
53 * Levels 1-11 are necessary for robust gc operations and help separate
54 * short-lived metadata from longer-lived file data. In the future,
55 * file data should get separated into several segments based on simple
56 * heuristics. Old data recycled during gc operation is expected to be
57 * long-lived. New data is of uncertain life expectancy. New data
58 * used to replace older blocks in existing files is expected to be
59 * short-lived.
60 */
61
62
63/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
64#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull
65#define LOGFS_MAGIC_U32 0xc97e8168u
66
67/*
68 * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
69 * Sooner or later that should become configurable and the macros replaced
70 * by something superblock-dependent. Pointers in indirect blocks are and
71 * will remain 64bit.
72 *
73 * LOGFS_BLOCKSIZE - self-explaining
74 * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
75 * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
76 */
77#define LOGFS_BLOCKSIZE (4096ull)
78#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
79#define LOGFS_BLOCK_BITS (9)
80
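/*
 * Illustrative check in the style of SIZE_CHECK above (an editorial
 * addition, not part of the original patch): with 4KiB blocks and 64bit
 * pointers an indirect block holds 4096/8 == 512 pointers, and
 * 512 == 1 << 9, matching LOGFS_BLOCK_BITS.
 */
static inline void check_block_factor(void)
{
	BUILD_BUG_ON(LOGFS_BLOCK_FACTOR != 512);
	BUILD_BUG_ON(LOGFS_BLOCK_FACTOR != 1ull << LOGFS_BLOCK_BITS);
}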
81/*
82 * Number of blocks at various levels of indirection. There are 16 direct
83 * block pointers plus a single indirect pointer.
84 */
85#define I0_BLOCKS (16)
86#define I1_BLOCKS LOGFS_BLOCK_FACTOR
87#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
88#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
89#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
90#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
91
92#define INDIRECT_INDEX I0_BLOCKS
93#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
94
95/*
96 * Sizes at which files require another level of indirection. Files smaller
97 * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
98 * similar to ext2 fast symlinks.
99 *
100 * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
101 * direct pointers, else through the 1x indirect pointer and so forth.
102 */
103#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
104#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
105#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
106#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
107#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
108#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
109#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
110
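/*
 * Worked out with the fixed 4KiB blocksize above, each level multiplies
 * the reachable range by 512:
 *	LOGFS_I0_SIZE =     16 * 4KiB =  64KiB
 *	LOGFS_I1_SIZE =    512 * 4KiB =   2MiB
 *	LOGFS_I2_SIZE =  512^2 * 4KiB =   1GiB
 *	LOGFS_I3_SIZE =  512^3 * 4KiB = 512GiB
 *	LOGFS_I4_SIZE =  512^4 * 4KiB = 256TiB
 *	LOGFS_I5_SIZE =  512^5 * 4KiB = 128PiB
 */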
111/*
112 * Each indirect block pointer must have this flag set, if all block pointers
113 * behind it are set, i.e. there is no hole hidden in the shadow of this
114 * indirect block pointer.
115 */
116#define LOGFS_FULLY_POPULATED (1ULL << 63)
117#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
118
119/*
120 * LogFS needs to separate data into levels. Each level is defined as the
121 * maximal possible distance from the master inode (inode of the inode file).
122 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
124 * This effort is necessary to guarantee garbage collection to always make
125 * progress.
126 *
127 * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
128 * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
129 * the maximal number of levels for one file.
130 * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
131 * effectively stacked on top of each other.
132 */
133#define LOGFS_MAX_INDIRECT (5)
134#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
135#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
136
137/* Maximum size of filenames */
138#define LOGFS_MAX_NAMELEN (255)
139
140/* Number of segments in the primary journal. */
141#define LOGFS_JOURNAL_SEGS (16)
142
143/* Maximum number of free/erased/etc. segments in journal entries */
144#define MAX_CACHED_SEGS (64)
145
146
147/*
148 * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
149 * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
150 * its header,
151 * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
152 * its segment header and the padded space at the end when no further objects
153 * fit.
154 */
155#define LOGFS_OBJECT_HEADERSIZE (0x1c)
156#define LOGFS_SEGMENT_HEADERSIZE (0x18)
157#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
158#define LOGFS_SEGMENT_RESERVE \
159 (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
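/*
 * With the current constants: 0x18 + (0x1c + 4096) - 1 == 4147 bytes
 * reserved per segment.
 */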
160
161/*
162 * Segment types:
163 * SEG_SUPER - superblock segments
164 * SEG_JOURNAL - journal segments
165 * SEG_OSTORE - object store segments
166 */
167enum {
168 SEG_SUPER = 0x01,
169 SEG_JOURNAL = 0x02,
170 SEG_OSTORE = 0x03,
171};
172
173/**
174 * struct logfs_segment_header - per-segment header in the ostore
175 *
176 * @crc: crc32 of header (there is no data)
177 * @pad: unused, must be 0
178 * @type: segment type, see above
179 * @level: GC level for all objects in this segment
180 * @segno: segment number
181 * @ec: erase count for this segment
182 * @gec: global erase count at time of writing
183 */
184struct logfs_segment_header {
185 __be32 crc;
186 __be16 pad;
187 __u8 type;
188 __u8 level;
189 __be32 segno;
190 __be32 ec;
191 __be64 gec;
192};
193
194SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
195
196#define LOGFS_FEATURES_INCOMPAT (0ull)
197#define LOGFS_FEATURES_RO_COMPAT (0ull)
198#define LOGFS_FEATURES_COMPAT (0ull)
199
200/**
201 * struct logfs_disk_super - on-medium superblock
202 *
203 * @ds_magic: magic number, must equal LOGFS_MAGIC
204 * @ds_crc: crc32 of structure starting with the next field
205 * @ds_ifile_levels: maximum number of levels for ifile
206 * @ds_iblock_levels: maximum number of levels for regular files
207 * @ds_data_levels: number of separate levels for data
208 * @ds_segment_shift: log2 of segment size
209 * @ds_block_shift: log2 of block size
210 * @ds_write_shift: log2 of write size
211 * @pad0: reserved, must be 0
212 * @ds_filesystem_size: size of the filesystem in bytes
213 * @ds_segment_size: size of a segment in bytes
214 * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
215 * @ds_feature_incompat: incompatible filesystem features
216 * @ds_feature_ro_compat: read-only compatible filesystem features
217 * @ds_feature_compat: compatible filesystem features
218 * @ds_feature_flags: filesystem flags
219 * @ds_root_reserve: bytes reserved for the superuser
220 * @ds_speed_reserve: bytes reserved to speed up GC
221 * @ds_journal_seg: segments used by primary journal
222 * @ds_super_ofs: device offsets of the two superblock copies
223 *
224 * Contains only read-only fields. Read-write fields like the amount of used
225 * space is tracked in the dynamic superblock, which is stored in the journal.
226 */
227struct logfs_disk_super {
228 struct logfs_segment_header ds_sh;
229 __be64 ds_magic;
230
231 __be32 ds_crc;
232 __u8 ds_ifile_levels;
233 __u8 ds_iblock_levels;
234 __u8 ds_data_levels;
235 __u8 ds_segment_shift;
236 __u8 ds_block_shift;
237 __u8 ds_write_shift;
238 __u8 pad0[6];
239
240 __be64 ds_filesystem_size;
241 __be32 ds_segment_size;
242 __be32 ds_bad_seg_reserve;
243
244 __be64 ds_feature_incompat;
245 __be64 ds_feature_ro_compat;
246
247 __be64 ds_feature_compat;
248 __be64 ds_feature_flags;
249
250 __be64 ds_root_reserve;
251 __be64 ds_speed_reserve;
252
253 __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS];
254
255 __be64 ds_super_ofs[2];
256 __be64 pad3[8];
257};
258
259SIZE_CHECK(logfs_disk_super, 256);
260
261/*
262 * Object types:
263 * OBJ_BLOCK - Data or indirect block
264 * OBJ_INODE - Inode
265 * OBJ_DENTRY - Dentry
266 */
267enum {
268 OBJ_BLOCK = 0x04,
269 OBJ_INODE = 0x05,
270 OBJ_DENTRY = 0x06,
271};
272
273/**
274 * struct logfs_object_header - per-object header in the ostore
275 *
276 * @crc: crc32 of header, excluding data_crc
277 * @len: length of data
278 * @type: object type, see above
279 * @compr: compression type
280 * @ino: inode number
281 * @bix: block index
282 * @data_crc: crc32 of payload
283 */
284struct logfs_object_header {
285 __be32 crc;
286 __be16 len;
287 __u8 type;
288 __u8 compr;
289 __be64 ino;
290 __be64 bix;
291 __be32 data_crc;
292} __attribute__((packed));
293
294SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
295
296/*
297 * Reserved inode numbers:
298 * LOGFS_INO_MASTER - master inode (for inode file)
299 * LOGFS_INO_ROOT - root directory
300 * LOGFS_INO_SEGFILE - per-segment used bytes and erase count
301 */
302enum {
303 LOGFS_INO_MAPPING = 0x00,
304 LOGFS_INO_MASTER = 0x01,
305 LOGFS_INO_ROOT = 0x02,
306 LOGFS_INO_SEGFILE = 0x03,
307 LOGFS_RESERVED_INOS = 0x10,
308};
309
310/*
311 * Inode flags. High bits should never be written to the medium. They are
312 * reserved for in-memory usage.
313 * Low bits should either remain in sync with the corresponding FS_*_FL or
314 * reuse slots that obviously don't make sense for logfs.
315 *
316 * LOGFS_IF_DIRTY Inode must be written back
317 * LOGFS_IF_ZOMBIE Inode has been deleted
318 * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode
319 */
320#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
321#define LOGFS_IF_DIRTY 0x20000000
322#define LOGFS_IF_ZOMBIE 0x40000000
323#define LOGFS_IF_STILLBORN 0x80000000
324
325/* Flags available to chattr */
326#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
327#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
328/* Flags inherited from parent directory on file/directory creation */
329#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
330
331/**
332 * struct logfs_disk_inode - on-medium inode
333 *
334 * @di_mode: file mode
335 * @di_pad: reserved, must be 0
336 * @di_flags: inode flags, see above
337 * @di_uid: user id
338 * @di_gid: group id
339 * @di_ctime: change time
340 * @di_mtime: modify time
341 * @di_refcount: reference count (aka nlink or link count)
342 * @di_generation: inode generation, for nfs
343 * @di_used_bytes: number of bytes used
344 * @di_size: file size
345 * @di_data: data pointers
346 */
347struct logfs_disk_inode {
348 __be16 di_mode;
349 __u8 di_height;
350 __u8 di_pad;
351 __be32 di_flags;
352 __be32 di_uid;
353 __be32 di_gid;
354
355 __be64 di_ctime;
356 __be64 di_mtime;
357
358 __be64 di_atime;
359 __be32 di_refcount;
360 __be32 di_generation;
361
362 __be64 di_used_bytes;
363 __be64 di_size;
364
365 __be64 di_data[LOGFS_EMBEDDED_FIELDS];
366};
367
368SIZE_CHECK(logfs_disk_inode, 200);
369
370#define INODE_POINTER_OFS \
371 (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
372#define INODE_USED_OFS \
373 (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
374#define INODE_SIZE_OFS \
375 (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
376#define INODE_HEIGHT_OFS (0)
377
378/**
379 * struct logfs_disk_dentry - on-medium dentry structure
380 *
381 * @ino: inode number
382 * @namelen: length of file name
383 * @type: file type, identical to bits 12..15 of mode
384 * @name: file name
385 */
386/* FIXME: add 6 bytes of padding to remove the __packed */
387struct logfs_disk_dentry {
388 __be64 ino;
389 __be16 namelen;
390 __u8 type;
391 __u8 name[LOGFS_MAX_NAMELEN];
392} __attribute__((packed));
393
394SIZE_CHECK(logfs_disk_dentry, 266);
395
396#define RESERVED 0xffffffff
397#define BADSEG 0xffffffff
398/**
399 * struct logfs_segment_entry - segment file entry
400 *
401 * @ec_level: erase count and level
402 * @valid: number of valid bytes
403 *
404 * Segment file contains one entry for every segment. ec_level contains the
405 * erasecount in the upper 28 bits and the level in the lower 4 bits. An
406 * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
407 * of valid bytes or RESERVED (-1 again) if the segment is used for either the
408 * superblock or the journal, or when the segment is bad.
409 */
410struct logfs_segment_entry {
411 __be32 ec_level;
412 __be32 valid;
413};
414
415SIZE_CHECK(logfs_segment_entry, 8);
416
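/*
 * Hypothetical helpers (an editorial sketch, not part of the original
 * patch) making the ec_level packing described above explicit. They
 * operate on cpu-order values; the on-medium field is big-endian.
 */
static inline u32 pack_ec_level(u32 ec, u8 level)
{
	return (ec << 4) | (level & 0xf);
}

static inline u32 unpack_ec(u32 ec_level)
{
	return ec_level >> 4;
}

static inline u8 unpack_level(u32 ec_level)
{
	return ec_level & 0xf;
}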
417/**
418 * struct logfs_journal_header - header for journal entries (JEs)
419 *
420 * @h_crc: crc32 of journal entry
421 * @h_len: length of compressed journal entry,
422 * not including header
423 * @h_datalen: length of uncompressed data
424 * @h_type: JE type
425 * @h_compr: compression type
426 * @h_pad: reserved
427 */
428struct logfs_journal_header {
429 __be32 h_crc;
430 __be16 h_len;
431 __be16 h_datalen;
432 __be16 h_type;
433 __u8 h_compr;
434 __u8 h_pad[5];
435};
436
437SIZE_CHECK(logfs_journal_header, 16);
438
439/*
440 * Life expectancy of data.
441 * VIM_DEFAULT - default vim
442 * VIM_SEGFILE - for segment file only - very short-lived
443 * VIM_GC - GC'd data - likely long-lived (not yet defined below)
444 */
445enum logfs_vim {
446 VIM_DEFAULT = 0,
447 VIM_SEGFILE = 1,
448};
449
450/**
451 * struct logfs_je_area - wbuf header
452 *
453 * @segno: segment number of area
454 * @used_bytes: number of bytes already used
455 * @gc_level: GC level
456 * @vim: life expectancy of data
457 *
458 * "Areas" are segments currently being used for writing. There is at least
459 * one area per GC level. Several may be used to separate long-lived from
460 * short-lived data. If an area with unknown vim is encountered, it can
461 * simply be closed.
462 * The write buffer immediately follows this header.
463 */
464struct logfs_je_area {
465 __be32 segno;
466 __be32 used_bytes;
467 __u8 gc_level;
468 __u8 vim;
469} __attribute__((packed));
470
471SIZE_CHECK(logfs_je_area, 10);
472
473#define MAX_JOURNAL_HEADER \
474 (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
475
476/**
477 * struct logfs_je_dynsb - dynamic superblock
478 *
479 * @ds_gec: global erase count
480 * @ds_sweeper: current position of GC "sweeper"
481 * @ds_rename_dir: source directory ino (see dir.c documentation)
482 * @ds_rename_pos: position of source dd (see dir.c documentation)
483 * @ds_victim_ino: victims of incomplete dir operation (see dir.c)
484 * @ds_victim_parent: parent inode of victim (see dir.c)
485 * @ds_used_bytes: number of used bytes
486 */
487struct logfs_je_dynsb {
488 __be64 ds_gec;
489 __be64 ds_sweeper;
490
491 __be64 ds_rename_dir;
492 __be64 ds_rename_pos;
493
494 __be64 ds_victim_ino;
495 __be64 ds_victim_parent; /* XXX */
496
497 __be64 ds_used_bytes;
498 __be32 ds_generation;
499 __be32 pad;
500};
501
502SIZE_CHECK(logfs_je_dynsb, 64);
503
504/**
505 * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
506 *
507 * @da_size: size of inode file
508 * @da_last_ino: last created inode
509 * @da_used_bytes: number of bytes used
510 * @da_data: data pointers
511 */
512struct logfs_je_anchor {
513 __be64 da_size;
514 __be64 da_last_ino;
515
516 __be64 da_used_bytes;
517 u8 da_height;
518 u8 pad[7];
519
520 __be64 da_data[LOGFS_EMBEDDED_FIELDS];
521};
522
523SIZE_CHECK(logfs_je_anchor, 168);
524
525/**
526 * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
527 *
528 * @so_segment: segments used for 2nd journal
529 *
530 * Length of the array is given by h_len field in the header.
531 */
532struct logfs_je_spillout {
533 __be64 so_segment[0];
534};
535
536SIZE_CHECK(logfs_je_spillout, 0);
537
538/**
539 * struct logfs_je_journal_ec - erase counts for all journal segments
540 *
541 * @ec: erase count
542 *
543 * Length of the array is given by h_len field in the header.
544 */
545struct logfs_je_journal_ec {
546 __be32 ec[0];
547};
548
549SIZE_CHECK(logfs_je_journal_ec, 0);
550
551/**
552 * struct logfs_je_free_segments - list of free segments with erase count
553 */
554struct logfs_je_free_segments {
555 __be32 segno;
556 __be32 ec;
557};
558
559SIZE_CHECK(logfs_je_free_segments, 8);
560
561/**
562 * struct logfs_seg_alias - list of segment aliases
563 */
564struct logfs_seg_alias {
565 __be32 old_segno;
566 __be32 new_segno;
567};
568
569SIZE_CHECK(logfs_seg_alias, 8);
570
571/**
572 * struct logfs_obj_alias - list of object aliases
573 */
574struct logfs_obj_alias {
575 __be64 ino;
576 __be64 bix;
577 __be64 val;
578 u8 level;
579 u8 pad[5];
580 __be16 child_no;
581};
582
583SIZE_CHECK(logfs_obj_alias, 32);
584
585/**
586 * Compression types.
587 *
588 * COMPR_NONE - uncompressed
589 * COMPR_ZLIB - compressed with zlib
590 */
591enum {
592 COMPR_NONE = 0,
593 COMPR_ZLIB = 1,
594};
595
596/*
597 * Journal entries come in groups of 16. First group contains unique
598 * entries, next groups contain one entry per level
599 *
600 * JE_FIRST - smallest possible journal entry number
601 *
602 * JEG_BASE - base group, containing unique entries
603 * JE_COMMIT - commit entry, validates all previous entries
604 * JE_DYNSB - dynamic superblock, anything that ought to be in the
605 * superblock but cannot because it is read-write data
606 * JE_ANCHOR - anchor aka master inode aka inode file's inode
607 * JE_ERASECOUNT - erase counts for all journal segments
608 * JE_SPILLOUT - unused
609 * JE_OBJ_ALIAS - object aliases
610 * JE_AREA - area description
611 *
612 * JE_LAST - largest possible journal entry number
613 */
614enum {
615 JE_FIRST = 0x01,
616
617 JEG_BASE = 0x00,
618 JE_COMMIT = 0x02,
619 JE_DYNSB = 0x03,
620 JE_ANCHOR = 0x04,
621 JE_ERASECOUNT = 0x05,
622 JE_SPILLOUT = 0x06,
623 JE_OBJ_ALIAS = 0x0d,
624 JE_AREA = 0x0e,
625
626 JE_LAST = 0x0e,
627};
628
629#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
new file mode 100644
index 000000000000..bff40253dfb2
--- /dev/null
+++ b/fs/logfs/readwrite.c
@@ -0,0 +1,2258 @@
1/*
2 * fs/logfs/readwrite.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 *
 9 * Actually contains eight sets of very similar functions:
10 * read read blocks from a file
11 * seek_hole find next hole
12 * seek_data find next data block
13 * valid check whether a block still belongs to a file
14 * write write blocks to a file
15 * delete delete a block (for directories and ifile)
16 * rewrite move existing blocks of a file to a new location (gc helper)
17 * truncate truncate a file
18 */
19#include "logfs.h"
20#include <linux/sched.h>
21#include <linux/slab.h>
22
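/*
 * Editorial note, not part of the original patch: unpacking a page
 * index leaves the low bits of the block index zeroed. Indices below
 * I0_BLOCKS (or I1_BLOCKS etc. for higher levels) are served by a lower
 * level, so adjust_bix() clamps to the first index the given level is
 * actually responsible for.
 */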
23static u64 adjust_bix(u64 bix, level_t level)
24{
25 switch (level) {
26 case 0:
27 return bix;
28 case LEVEL(1):
29 return max_t(u64, bix, I0_BLOCKS);
30 case LEVEL(2):
31 return max_t(u64, bix, I1_BLOCKS);
32 case LEVEL(3):
33 return max_t(u64, bix, I2_BLOCKS);
34 case LEVEL(4):
35 return max_t(u64, bix, I3_BLOCKS);
36 case LEVEL(5):
37 return max_t(u64, bix, I4_BLOCKS);
38 default:
39 WARN_ON(1);
40 return bix;
41 }
42}
43
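/* Editorial comment: maxbix(h) == 1 << (9*h), the number of block
 * indices reachable through an indirect tree of height h, e.g. 512 for
 * height 1 and 512^2 for height 2.
 */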
44static inline u64 maxbix(u8 height)
45{
46 return 1ULL << (LOGFS_BLOCK_BITS * height);
47}
48
49/**
50 * The inode address space is cut in two halves. Lower half belongs to data
51 * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
52 * set, the actual block index (bix) and level can be derived from the page
53 * index.
54 *
55 * The low bits of the block index are zeroed after packing and
56 * unpacking. Since the lowest 9 bits per level of indirection are
57 * ignored at that level anyway, this is harmless.
58 */
59#define ARCH_SHIFT (BITS_PER_LONG - 32)
60#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT)
61#define LEVEL_SHIFT (28 + ARCH_SHIFT)
62static inline pgoff_t first_indirect_block(void)
63{
64 return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
65}
66
67pgoff_t logfs_pack_index(u64 bix, level_t level)
68{
69 pgoff_t index;
70
71 BUG_ON(bix >= INDIRECT_BIT);
72 if (level == 0)
73 return bix;
74
75 index = INDIRECT_BIT;
76 index |= (__force long)level << LEVEL_SHIFT;
77 index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
78 return index;
79}
80
81void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
82{
83 u8 __level;
84
85 if (!(index & INDIRECT_BIT)) {
86 *bix = index;
87 *level = 0;
88 return;
89 }
90
91 __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
92 *level = LEVEL(__level);
93 *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
94 *bix = adjust_bix(*bix, *level);
95 return;
96}
97#undef ARCH_SHIFT
98#undef INDIRECT_BIT
99#undef LEVEL_SHIFT
100
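/*
 * Worked example (editorial; 64bit arch, so ARCH_SHIFT is 32): packing
 * bix 0x40200 at level 2 drops the low 18 bits (2 * LOGFS_BLOCK_BITS)
 * and tags the result with the indirect bit and the level. Unpacking
 * restores level 2 and bix 0x40000; the dropped low bits are harmless
 * because a level-2 block ignores them anyway.
 */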
101/*
102 * Time is stored as nanoseconds since the epoch.
103 */
104static struct timespec be64_to_timespec(__be64 betime)
105{
106 return ns_to_timespec(be64_to_cpu(betime));
107}
108
109static __be64 timespec_to_be64(struct timespec tsp)
110{
111 return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
112}
113
114static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode *inode)
115{
116 struct logfs_inode *li = logfs_inode(inode);
117 int i;
118
119 inode->i_mode = be16_to_cpu(di->di_mode);
120 li->li_height = di->di_height;
121 li->li_flags = be32_to_cpu(di->di_flags);
122 inode->i_uid = be32_to_cpu(di->di_uid);
123 inode->i_gid = be32_to_cpu(di->di_gid);
124 inode->i_size = be64_to_cpu(di->di_size);
125 logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
126 inode->i_atime = be64_to_timespec(di->di_atime);
127 inode->i_ctime = be64_to_timespec(di->di_ctime);
128 inode->i_mtime = be64_to_timespec(di->di_mtime);
129 inode->i_nlink = be32_to_cpu(di->di_refcount);
130 inode->i_generation = be32_to_cpu(di->di_generation);
131
132 switch (inode->i_mode & S_IFMT) {
133 case S_IFSOCK: /* fall through */
134 case S_IFBLK: /* fall through */
135 case S_IFCHR: /* fall through */
136 case S_IFIFO:
137 inode->i_rdev = be64_to_cpu(di->di_data[0]);
138 break;
139 case S_IFDIR: /* fall through */
140 case S_IFREG: /* fall through */
141 case S_IFLNK:
142 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
143 li->li_data[i] = be64_to_cpu(di->di_data[i]);
144 break;
145 default:
146 BUG();
147 }
148}
149
150static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode *di)
151{
152 struct logfs_inode *li = logfs_inode(inode);
153 int i;
154
155 di->di_mode = cpu_to_be16(inode->i_mode);
156 di->di_height = li->li_height;
157 di->di_pad = 0;
158 di->di_flags = cpu_to_be32(li->li_flags);
159 di->di_uid = cpu_to_be32(inode->i_uid);
160 di->di_gid = cpu_to_be32(inode->i_gid);
161 di->di_size = cpu_to_be64(i_size_read(inode));
162 di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
163 di->di_atime = timespec_to_be64(inode->i_atime);
164 di->di_ctime = timespec_to_be64(inode->i_ctime);
165 di->di_mtime = timespec_to_be64(inode->i_mtime);
166 di->di_refcount = cpu_to_be32(inode->i_nlink);
167 di->di_generation = cpu_to_be32(inode->i_generation);
168
169 switch (inode->i_mode & S_IFMT) {
170 case S_IFSOCK: /* fall through */
171 case S_IFBLK: /* fall through */
172 case S_IFCHR: /* fall through */
173 case S_IFIFO:
174 di->di_data[0] = cpu_to_be64(inode->i_rdev);
175 break;
176 case S_IFDIR: /* fall through */
177 case S_IFREG: /* fall through */
178 case S_IFLNK:
179 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
180 di->di_data[i] = cpu_to_be64(li->li_data[i]);
181 break;
182 default:
183 BUG();
184 }
185}
186
187static void __logfs_set_blocks(struct inode *inode)
188{
189 struct super_block *sb = inode->i_sb;
190 struct logfs_inode *li = logfs_inode(inode);
191
192 inode->i_blocks = ULONG_MAX;
193 if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
194 inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
195}
196
197void logfs_set_blocks(struct inode *inode, u64 bytes)
198{
199 struct logfs_inode *li = logfs_inode(inode);
200
201 li->li_used_bytes = bytes;
202 __logfs_set_blocks(inode);
203}
204
205static void prelock_page(struct super_block *sb, struct page *page, int lock)
206{
207 struct logfs_super *super = logfs_super(sb);
208
209 BUG_ON(!PageLocked(page));
210 if (lock) {
211 BUG_ON(PagePreLocked(page));
212 SetPagePreLocked(page);
213 } else {
214 /* We are in GC path. */
215 if (PagePreLocked(page))
216 super->s_lock_count++;
217 else
218 SetPagePreLocked(page);
219 }
220}
221
222static void preunlock_page(struct super_block *sb, struct page *page, int lock)
223{
224 struct logfs_super *super = logfs_super(sb);
225
226 BUG_ON(!PageLocked(page));
227 if (lock)
228 ClearPagePreLocked(page);
229 else {
230 /* We are in GC path. */
231 BUG_ON(!PagePreLocked(page));
232 if (super->s_lock_count)
233 super->s_lock_count--;
234 else
235 ClearPagePreLocked(page);
236 }
237}
238
239/*
240 * Logfs is prone to an AB-BA deadlock where one task tries to acquire
241 * s_write_mutex with a locked page and GC tries to get that page while holding
242 * s_write_mutex.
1243 * To solve this issue logfs will ignore the page lock iff the holder of the
1244 * page lock is waiting for s_write_mutex. We annotate this fact by setting
1245 * PG_pre_locked in addition to PG_locked.
246 */
247static void logfs_get_wblocks(struct super_block *sb, struct page *page,
248 int lock)
249{
250 struct logfs_super *super = logfs_super(sb);
251
252 if (page)
253 prelock_page(sb, page, lock);
254
255 if (lock) {
256 mutex_lock(&super->s_write_mutex);
257 logfs_gc_pass(sb);
258 /* FIXME: We also have to check for shadowed space
259 * and mempool fill grade */
260 }
261}
262
263static void logfs_put_wblocks(struct super_block *sb, struct page *page,
264 int lock)
265{
266 struct logfs_super *super = logfs_super(sb);
267
268 if (page)
269 preunlock_page(sb, page, lock);
270 /* Order matters - we must clear PG_pre_locked before releasing
271 * s_write_mutex or we could race against another task. */
272 if (lock)
273 mutex_unlock(&super->s_write_mutex);
274}
275
276static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
277 level_t level)
278{
279 return find_or_create_page(inode->i_mapping,
280 logfs_pack_index(bix, level), GFP_NOFS);
281}
282
283static void logfs_put_read_page(struct page *page)
284{
285 unlock_page(page);
286 page_cache_release(page);
287}
288
289static void logfs_lock_write_page(struct page *page)
290{
291 int loop = 0;
292
293 while (unlikely(!trylock_page(page))) {
294 if (loop++ > 0x1000) {
295 /* Has been observed once so far... */
296 printk(KERN_ERR "stack at %p\n", &loop);
297 BUG();
298 }
299 if (PagePreLocked(page)) {
300 /* Holder of page lock is waiting for us, it
301 * is safe to use this page. */
302 break;
303 }
304 /* Some other process has this page locked and has
305 * nothing to do with us. Wait for it to finish.
306 */
307 schedule();
308 }
309 BUG_ON(!PageLocked(page));
310}
311
312static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
313 level_t level)
314{
315 struct address_space *mapping = inode->i_mapping;
316 pgoff_t index = logfs_pack_index(bix, level);
317 struct page *page;
318 int err;
319
320repeat:
321 page = find_get_page(mapping, index);
322 if (!page) {
323 page = __page_cache_alloc(GFP_NOFS);
324 if (!page)
325 return NULL;
326 err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
327 if (unlikely(err)) {
328 page_cache_release(page);
329 if (err == -EEXIST)
330 goto repeat;
331 return NULL;
332 }
333 } else logfs_lock_write_page(page);
334 BUG_ON(!PageLocked(page));
335 return page;
336}
337
338static void logfs_unlock_write_page(struct page *page)
339{
340 if (!PagePreLocked(page))
341 unlock_page(page);
342}
343
344static void logfs_put_write_page(struct page *page)
345{
346 logfs_unlock_write_page(page);
347 page_cache_release(page);
348}
349
350static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
351 int rw)
352{
353 if (rw == READ)
354 return logfs_get_read_page(inode, bix, level);
355 else
356 return logfs_get_write_page(inode, bix, level);
357}
358
359static void logfs_put_page(struct page *page, int rw)
360{
361 if (rw == READ)
362 logfs_put_read_page(page);
363 else
364 logfs_put_write_page(page);
365}
366
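/*
 * Editorial comment: __get_bits(val, skip, no) skips 'skip' groups of
 * 'no' bits and extracts the following 'no' bits. Callers use
 * get_bits(bix, SUBLEVEL(level)) to find the slot of the child pointer
 * inside a 'level' indirect block: (bix >> (9 * (level-1))) & 511.
 */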
367static unsigned long __get_bits(u64 val, int skip, int no)
368{
369 u64 ret = val;
370
371 ret >>= skip * no;
372 ret <<= 64 - no;
373 ret >>= 64 - no;
374 return ret;
375}
376
377static unsigned long get_bits(u64 val, level_t skip)
378{
379 return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
380}
381
382static inline void init_shadow_tree(struct super_block *sb,
383 struct shadow_tree *tree)
384{
385 struct logfs_super *super = logfs_super(sb);
386
387 btree_init_mempool64(&tree->new, super->s_btree_pool);
388 btree_init_mempool64(&tree->old, super->s_btree_pool);
389}
390
391static void indirect_write_block(struct logfs_block *block)
392{
393 struct page *page;
394 struct inode *inode;
395 int ret;
396
397 page = block->page;
398 inode = page->mapping->host;
399 logfs_lock_write_page(page);
400 ret = logfs_write_buf(inode, page, 0);
401 logfs_unlock_write_page(page);
402 /*
403 * This needs some rework. Unless you want your filesystem to run
404 * completely synchronously (you don't), the filesystem will always
405 * report writes as 'successful' before the actual work has been
406 * done. The actual work gets done here and this is where any errors
407 * will show up. And there isn't much we can do about it, really.
408 *
409 * Some attempts to fix the errors (move from bad blocks, retry io,...)
410 * have already been done, so anything left should be either a broken
411 * device or a bug somewhere in logfs itself. Being relatively new,
412 * the odds currently favor a bug, so for now the line below isn't
413 * entirely tasteless.
414 */
415 BUG_ON(ret);
416}
417
418static void inode_write_block(struct logfs_block *block)
419{
420 struct inode *inode;
421 int ret;
422
423 inode = block->inode;
424 if (inode->i_ino == LOGFS_INO_MASTER)
425 logfs_write_anchor(inode->i_sb);
426 else {
427 ret = __logfs_write_inode(inode, 0);
428 /* see indirect_write_block comment */
429 BUG_ON(ret);
430 }
431}
432
433static gc_level_t inode_block_level(struct logfs_block *block)
434{
435 BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
436 return GC_LEVEL(LOGFS_MAX_LEVELS);
437}
438
439static gc_level_t indirect_block_level(struct logfs_block *block)
440{
441 struct page *page;
442 struct inode *inode;
443 u64 bix;
444 level_t level;
445
446 page = block->page;
447 inode = page->mapping->host;
448 logfs_unpack_index(page->index, &bix, &level);
449 return expand_level(inode->i_ino, level);
450}
451
452/*
453 * This silences a false, yet annoying gcc warning. I hate it when my editor
454 * jumps into bitops.h each time I recompile this file.
455 * TODO: Complain to gcc folks about this and upgrade compiler.
456 */
457static unsigned long fnb(const unsigned long *addr,
458 unsigned long size, unsigned long offset)
459{
460 return find_next_bit(addr, size, offset);
461}
462
463static __be64 inode_val0(struct inode *inode)
464{
465 struct logfs_inode *li = logfs_inode(inode);
466 u64 val;
467
468 /*
469 * Explicit shifting generates good code, but must match the format
470 * of the structure. Add some paranoia just in case.
471 */
472 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
473 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
474 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
475
476 val = (u64)inode->i_mode << 48 |
477 (u64)li->li_height << 40 |
478 (u64)li->li_flags;
479 return cpu_to_be64(val);
480}
481
482static int inode_write_alias(struct super_block *sb,
483 struct logfs_block *block, write_alias_t *write_one_alias)
484{
485 struct inode *inode = block->inode;
486 struct logfs_inode *li = logfs_inode(inode);
487 unsigned long pos;
488 u64 ino, bix;
489 __be64 val;
490 level_t level;
491 int err;
492
493 for (pos = 0; ; pos++) {
494 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
495 if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
496 return 0;
497
498 switch (pos) {
499 case INODE_HEIGHT_OFS:
500 val = inode_val0(inode);
501 break;
502 case INODE_USED_OFS:
503 val = cpu_to_be64(li->li_used_bytes);
504 break;
505 case INODE_SIZE_OFS:
506 val = cpu_to_be64(i_size_read(inode));
507 break;
508 case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
509 val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
510 break;
511 default:
512 BUG();
513 }
514
515 ino = LOGFS_INO_MASTER;
516 bix = inode->i_ino;
517 level = LEVEL(0);
518 err = write_one_alias(sb, ino, bix, level, pos, val);
519 if (err)
520 return err;
521 }
522}
523
524static int indirect_write_alias(struct super_block *sb,
525 struct logfs_block *block, write_alias_t *write_one_alias)
526{
527 unsigned long pos;
528 struct page *page = block->page;
529 u64 ino, bix;
530 __be64 *child, val;
531 level_t level;
532 int err;
533
534 for (pos = 0; ; pos++) {
535 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
536 if (pos >= LOGFS_BLOCK_FACTOR)
537 return 0;
538
539 ino = page->mapping->host->i_ino;
540 logfs_unpack_index(page->index, &bix, &level);
541 child = kmap_atomic(page, KM_USER0);
542 val = child[pos];
543 kunmap_atomic(child, KM_USER0);
544 err = write_one_alias(sb, ino, bix, level, pos, val);
545 if (err)
546 return err;
547 }
548}
549
550int logfs_write_obj_aliases_pagecache(struct super_block *sb)
551{
552 struct logfs_super *super = logfs_super(sb);
553 struct logfs_block *block;
554 int err;
555
556 list_for_each_entry(block, &super->s_object_alias, alias_list) {
557 err = block->ops->write_alias(sb, block, write_alias_journal);
558 if (err)
559 return err;
560 }
561 return 0;
562}
563
564void __free_block(struct super_block *sb, struct logfs_block *block)
565{
566 BUG_ON(!list_empty(&block->item_list));
567 list_del(&block->alias_list);
568 mempool_free(block, logfs_super(sb)->s_block_pool);
569}
570
571static void inode_free_block(struct super_block *sb, struct logfs_block *block)
572{
573 struct inode *inode = block->inode;
574
575 logfs_inode(inode)->li_block = NULL;
576 __free_block(sb, block);
577}
578
579static void indirect_free_block(struct super_block *sb,
580 struct logfs_block *block)
581{
582 ClearPagePrivate(block->page);
583 block->page->private = 0;
584 __free_block(sb, block);
585}
586
587
588static struct logfs_block_ops inode_block_ops = {
589 .write_block = inode_write_block,
590 .block_level = inode_block_level,
591 .free_block = inode_free_block,
592 .write_alias = inode_write_alias,
593};
594
595struct logfs_block_ops indirect_block_ops = {
596 .write_block = indirect_write_block,
597 .block_level = indirect_block_level,
598 .free_block = indirect_free_block,
599 .write_alias = indirect_write_alias,
600};
601
602struct logfs_block *__alloc_block(struct super_block *sb,
603 u64 ino, u64 bix, level_t level)
604{
605 struct logfs_super *super = logfs_super(sb);
606 struct logfs_block *block;
607
608 block = mempool_alloc(super->s_block_pool, GFP_NOFS);
609 memset(block, 0, sizeof(*block));
610 INIT_LIST_HEAD(&block->alias_list);
611 INIT_LIST_HEAD(&block->item_list);
612 block->sb = sb;
613 block->ino = ino;
614 block->bix = bix;
615 block->level = level;
616 return block;
617}
618
619static void alloc_inode_block(struct inode *inode)
620{
621 struct logfs_inode *li = logfs_inode(inode);
622 struct logfs_block *block;
623
624 if (li->li_block)
625 return;
626
627 block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
628 block->inode = inode;
629 li->li_block = block;
630 block->ops = &inode_block_ops;
631}
632
633void initialize_block_counters(struct page *page, struct logfs_block *block,
634 __be64 *array, int page_is_empty)
635{
636 u64 ptr;
637 int i, start;
638
639 block->partial = 0;
640 block->full = 0;
641 start = 0;
642 if (page->index < first_indirect_block()) {
643 /* Counters are pointless on level 0 */
644 return;
645 }
646 if (page->index == first_indirect_block()) {
647 /* Skip unused pointers */
648 start = I0_BLOCKS;
649 block->full = I0_BLOCKS;
650 }
651 if (!page_is_empty) {
652 for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
653 ptr = be64_to_cpu(array[i]);
654 if (ptr)
655 block->partial++;
656 if (ptr & LOGFS_FULLY_POPULATED)
657 block->full++;
658 }
659 }
660}
661
662static void alloc_data_block(struct inode *inode, struct page *page)
663{
664 struct logfs_block *block;
665 u64 bix;
666 level_t level;
667
668 if (PagePrivate(page))
669 return;
670
671 logfs_unpack_index(page->index, &bix, &level);
672 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
673 block->page = page;
674 SetPagePrivate(page);
675 page->private = (unsigned long)block;
676 block->ops = &indirect_block_ops;
677}
678
679static void alloc_indirect_block(struct inode *inode, struct page *page,
680 int page_is_empty)
681{
682 struct logfs_block *block;
683 __be64 *array;
684
685 if (PagePrivate(page))
686 return;
687
688 alloc_data_block(inode, page);
689
690 block = logfs_block(page);
691 array = kmap_atomic(page, KM_USER0);
692 initialize_block_counters(page, block, array, page_is_empty);
693 kunmap_atomic(array, KM_USER0);
694}
695
696static void block_set_pointer(struct page *page, int index, u64 ptr)
697{
698 struct logfs_block *block = logfs_block(page);
699 __be64 *array;
700 u64 oldptr;
701
702 BUG_ON(!block);
703 array = kmap_atomic(page, KM_USER0);
704 oldptr = be64_to_cpu(array[index]);
705 array[index] = cpu_to_be64(ptr);
706 kunmap_atomic(array, KM_USER0);
707 SetPageUptodate(page);
708
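	/* Editorial comment: incrementally track how many slots are
	 * non-zero (partial) and how many subtrees are fully populated
	 * (full) as this pointer changes. */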
709 block->full += !!(ptr & LOGFS_FULLY_POPULATED)
710 - !!(oldptr & LOGFS_FULLY_POPULATED);
711 block->partial += !!ptr - !!oldptr;
712}
713
714static u64 block_get_pointer(struct page *page, int index)
715{
716 __be64 *block;
717 u64 ptr;
718
719 block = kmap_atomic(page, KM_USER0);
720 ptr = be64_to_cpu(block[index]);
721 kunmap_atomic(block, KM_USER0);
722 return ptr;
723}
724
725static int logfs_read_empty(struct page *page)
726{
727 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
728 return 0;
729}
730
731static int logfs_read_direct(struct inode *inode, struct page *page)
732{
733 struct logfs_inode *li = logfs_inode(inode);
734 pgoff_t index = page->index;
735 u64 block;
736
737 block = li->li_data[index];
738 if (!block)
739 return logfs_read_empty(page);
740
741 return logfs_segment_read(inode, page, block, index, 0);
742}
743
744static int logfs_read_loop(struct inode *inode, struct page *page,
745 int rw_context)
746{
747 struct logfs_inode *li = logfs_inode(inode);
748 u64 bix, bofs = li->li_data[INDIRECT_INDEX];
749 level_t level, target_level;
750 int ret;
751 struct page *ipage;
752
753 logfs_unpack_index(page->index, &bix, &target_level);
754 if (!bofs)
755 return logfs_read_empty(page);
756
757 if (bix >= maxbix(li->li_height))
758 return logfs_read_empty(page);
759
760 for (level = LEVEL(li->li_height);
761 (__force u8)level > (__force u8)target_level;
762 level = SUBLEVEL(level)) {
763 ipage = logfs_get_page(inode, bix, level, rw_context);
764 if (!ipage)
765 return -ENOMEM;
766
767 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
768 if (ret) {
769 logfs_put_read_page(ipage);
770 return ret;
771 }
772
773 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
774 logfs_put_page(ipage, rw_context);
775 if (!bofs)
776 return logfs_read_empty(page);
777 }
778
779 return logfs_segment_read(inode, page, bofs, bix, 0);
780}
781
782static int logfs_read_block(struct inode *inode, struct page *page,
783 int rw_context)
784{
785 pgoff_t index = page->index;
786
787 if (index < I0_BLOCKS)
788 return logfs_read_direct(inode, page);
789 return logfs_read_loop(inode, page, rw_context);
790}
791
792static int logfs_exist_loop(struct inode *inode, u64 bix)
793{
794 struct logfs_inode *li = logfs_inode(inode);
795 u64 bofs = li->li_data[INDIRECT_INDEX];
796 level_t level;
797 int ret;
798 struct page *ipage;
799
800 if (!bofs)
801 return 0;
802 if (bix >= maxbix(li->li_height))
803 return 0;
804
805 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
806 ipage = logfs_get_read_page(inode, bix, level);
807 if (!ipage)
808 return -ENOMEM;
809
810 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
811 if (ret) {
812 logfs_put_read_page(ipage);
813 return ret;
814 }
815
816 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
817 logfs_put_read_page(ipage);
818 if (!bofs)
819 return 0;
820 }
821
822 return 1;
823}
824
825int logfs_exist_block(struct inode *inode, u64 bix)
826{
827 struct logfs_inode *li = logfs_inode(inode);
828
829 if (bix < I0_BLOCKS)
830 return !!li->li_data[bix];
831 return logfs_exist_loop(inode, bix);
832}
833
834static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
835{
836 struct logfs_inode *li = logfs_inode(inode);
837
838 for (; bix < I0_BLOCKS; bix++)
839 if (data ^ (li->li_data[bix] == 0))
840 return bix;
841 return I0_BLOCKS;
842}
843
844static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
845{
846 struct logfs_inode *li = logfs_inode(inode);
847 __be64 *rblock;
848 u64 increment, bofs = li->li_data[INDIRECT_INDEX];
849 level_t level;
850 int ret, slot;
851 struct page *page;
852
853 BUG_ON(!bofs);
854
855 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
856 increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
857 page = logfs_get_read_page(inode, bix, level);
858 if (!page)
859 return bix;
860
861 ret = logfs_segment_read(inode, page, bofs, bix, level);
862 if (ret) {
863 logfs_put_read_page(page);
864 return bix;
865 }
866
867 slot = get_bits(bix, SUBLEVEL(level));
868 rblock = kmap_atomic(page, KM_USER0);
869 while (slot < LOGFS_BLOCK_FACTOR) {
870 if (data && (rblock[slot] != 0))
871 break;
872 if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
873 break;
874 slot++;
875 bix += increment;
876 bix &= ~(increment - 1);
877 }
878 if (slot >= LOGFS_BLOCK_FACTOR) {
879 kunmap_atomic(rblock, KM_USER0);
880 logfs_put_read_page(page);
881 return bix;
882 }
883 bofs = be64_to_cpu(rblock[slot]);
884 kunmap_atomic(rblock, KM_USER0);
885 logfs_put_read_page(page);
886 if (!bofs) {
887 BUG_ON(data);
888 return bix;
889 }
890 }
891 return bix;
892}
893
894/**
895 * logfs_seek_hole - find next hole starting at a given block index
896 * @inode: inode to search in
897 * @bix: block index to start searching
898 *
899 * Returns next hole. If the file doesn't contain any further holes, the
900 * block address next to eof is returned instead.
901 */
902u64 logfs_seek_hole(struct inode *inode, u64 bix)
903{
904 struct logfs_inode *li = logfs_inode(inode);
905
906 if (bix < I0_BLOCKS) {
907 bix = seek_holedata_direct(inode, bix, 0);
908 if (bix < I0_BLOCKS)
909 return bix;
910 }
911
912 if (!li->li_data[INDIRECT_INDEX])
913 return bix;
914 else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
915 bix = maxbix(li->li_height);
916 else {
917 bix = seek_holedata_loop(inode, bix, 0);
918 if (bix < maxbix(li->li_height))
919 return bix;
920 /* Should not happen anymore. But if some port writes semi-
921 * corrupt images (as this one used to) we might run into it.
922 */
923 WARN_ON_ONCE(bix == maxbix(li->li_height));
924 }
925
926 return bix;
927}
928
929static u64 __logfs_seek_data(struct inode *inode, u64 bix)
930{
931 struct logfs_inode *li = logfs_inode(inode);
932
933 if (bix < I0_BLOCKS) {
934 bix = seek_holedata_direct(inode, bix, 1);
935 if (bix < I0_BLOCKS)
936 return bix;
937 }
938
939 if (bix < maxbix(li->li_height)) {
940 if (!li->li_data[INDIRECT_INDEX])
941 bix = maxbix(li->li_height);
942 else
943 return seek_holedata_loop(inode, bix, 1);
944 }
945
946 return bix;
947}
948
949/**
950 * logfs_seek_data - find next data block after a given block index
951 * @inode: inode to search in
952 * @bix: block index to start searching
953 *
954 * Returns next data block. If the file doesn't contain any further data
955 * blocks, the last block in the file is returned instead.
956 */
957u64 logfs_seek_data(struct inode *inode, u64 bix)
958{
959 struct super_block *sb = inode->i_sb;
960 u64 ret, end;
961
962 ret = __logfs_seek_data(inode, bix);
963 end = i_size_read(inode) >> sb->s_blocksize_bits;
964 if (ret >= end)
965 ret = max(bix, end);
966 return ret;
967}
968
969static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
970{
971 return pure_ofs(li->li_data[bix]) == ofs;
972}
973
974static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
975 u64 ofs, u64 bofs)
976{
977 struct logfs_inode *li = logfs_inode(inode);
978 level_t level;
979 int ret;
980 struct page *page;
981
982 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
983 page = logfs_get_write_page(inode, bix, level);
984 BUG_ON(!page);
985
986 ret = logfs_segment_read(inode, page, bofs, bix, level);
987 if (ret) {
988 logfs_put_write_page(page);
989 return 0;
990 }
991
992 bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
993 logfs_put_write_page(page);
994 if (!bofs)
995 return 0;
996
997 if (pure_ofs(bofs) == ofs)
998 return 1;
999 }
1000 return 0;
1001}
1002
1003static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
1004{
1005 struct logfs_inode *li = logfs_inode(inode);
1006 u64 bofs = li->li_data[INDIRECT_INDEX];
1007
1008 if (!bofs)
1009 return 0;
1010
1011 if (bix >= maxbix(li->li_height))
1012 return 0;
1013
1014 if (pure_ofs(bofs) == ofs)
1015 return 1;
1016
1017 return __logfs_is_valid_loop(inode, bix, ofs, bofs);
1018}
1019
1020static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
1021{
1022 struct logfs_inode *li = logfs_inode(inode);
1023
1024 if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
1025 return 0;
1026
1027 if (bix < I0_BLOCKS)
1028 return logfs_is_valid_direct(li, bix, ofs);
1029 return logfs_is_valid_loop(inode, bix, ofs);
1030}
1031
1032/**
1033 * logfs_is_valid_block - check whether this block is still valid
1034 *
1035 * @sb: superblock
1036 * @ofs: block physical offset
1037 * @ino: block inode number
1038 * @bix: block index
1039 * @gc_level: block GC level
1040 *
1041 * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
1042 * become invalid once the journal is written.
1043 */
1044int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
1045 gc_level_t gc_level)
1046{
1047 struct logfs_super *super = logfs_super(sb);
1048 struct inode *inode;
1049 int ret, cookie;
1050
1051 /* Umount closes a segment with free blocks remaining. Those
1052 * blocks are by definition invalid. */
1053 if (ino == -1)
1054 return 0;
1055
1056 LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
1057
1058 inode = logfs_safe_iget(sb, ino, &cookie);
1059 if (IS_ERR(inode))
1060 goto invalid;
1061
1062 ret = __logfs_is_valid_block(inode, bix, ofs);
1063 logfs_safe_iput(inode, cookie);
1064 if (ret)
1065 return ret;
1066
1067invalid:
1068 /* Block is nominally invalid, but may still sit in the shadow tree,
1069 * waiting for a journal commit.
1070 */
1071 if (btree_lookup64(&super->s_shadow_tree.old, ofs))
1072 return 2;
1073 return 0;
1074}
1075
1076int logfs_readpage_nolock(struct page *page)
1077{
1078 struct inode *inode = page->mapping->host;
1079 int ret = -EIO;
1080
1081 ret = logfs_read_block(inode, page, READ);
1082
1083 if (ret) {
1084 ClearPageUptodate(page);
1085 SetPageError(page);
1086 } else {
1087 SetPageUptodate(page);
1088 ClearPageError(page);
1089 }
1090 flush_dcache_page(page);
1091
1092 return ret;
1093}
1094
1095static int logfs_reserve_bytes(struct inode *inode, int bytes)
1096{
1097 struct logfs_super *super = logfs_super(inode->i_sb);
1098 u64 available = super->s_free_bytes + super->s_dirty_free_bytes
1099 - super->s_dirty_used_bytes - super->s_dirty_pages;
1100
1101 if (!bytes)
1102 return 0;
1103
1104 if (available < bytes)
1105 return -ENOSPC;
1106
1107 if (available < bytes + super->s_root_reserve &&
1108 !capable(CAP_SYS_RESOURCE))
1109 return -ENOSPC;
1110
1111 return 0;
1112}
1113
1114int get_page_reserve(struct inode *inode, struct page *page)
1115{
1116 struct logfs_super *super = logfs_super(inode->i_sb);
1117 int ret;
1118
1119 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1120 return 0;
1121
1122 logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
1123 ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
1124 if (!ret) {
1125 alloc_data_block(inode, page);
1126 logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
1127 super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
1128 }
1129 logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
1130 return ret;
1131}
1132
1133/*
1134 * We are protected by write lock. Push victims up to superblock level
1135 * and release transaction when appropriate.
1136 */
1137/* FIXME: This is currently called from the wrong spots. */
1138static void logfs_handle_transaction(struct inode *inode,
1139 struct logfs_transaction *ta)
1140{
1141 struct logfs_super *super = logfs_super(inode->i_sb);
1142
1143 if (!ta)
1144 return;
1145 logfs_inode(inode)->li_block->ta = NULL;
1146
1147 if (inode->i_ino != LOGFS_INO_MASTER) {
1148 BUG(); /* FIXME: Yes, this needs more thought */
1149 /* just remember the transaction until inode is written */
1150 //BUG_ON(logfs_inode(inode)->li_transaction);
1151 //logfs_inode(inode)->li_transaction = ta;
1152 return;
1153 }
1154
1155 switch (ta->state) {
1156 case CREATE_1: /* fall through */
1157 case UNLINK_1:
1158 BUG_ON(super->s_victim_ino);
1159 super->s_victim_ino = ta->ino;
1160 break;
1161 case CREATE_2: /* fall through */
1162 case UNLINK_2:
1163 BUG_ON(super->s_victim_ino != ta->ino);
1164 super->s_victim_ino = 0;
1165 /* transaction ends here - free it */
1166 kfree(ta);
1167 break;
1168 case CROSS_RENAME_1:
1169 BUG_ON(super->s_rename_dir);
1170 BUG_ON(super->s_rename_pos);
1171 super->s_rename_dir = ta->dir;
1172 super->s_rename_pos = ta->pos;
1173 break;
1174 case CROSS_RENAME_2:
1175 BUG_ON(super->s_rename_dir != ta->dir);
1176 BUG_ON(super->s_rename_pos != ta->pos);
1177 super->s_rename_dir = 0;
1178 super->s_rename_pos = 0;
1179 kfree(ta);
1180 break;
1181 case TARGET_RENAME_1:
1182 BUG_ON(super->s_rename_dir);
1183 BUG_ON(super->s_rename_pos);
1184 BUG_ON(super->s_victim_ino);
1185 super->s_rename_dir = ta->dir;
1186 super->s_rename_pos = ta->pos;
1187 super->s_victim_ino = ta->ino;
1188 break;
1189 case TARGET_RENAME_2:
1190 BUG_ON(super->s_rename_dir != ta->dir);
1191 BUG_ON(super->s_rename_pos != ta->pos);
1192 BUG_ON(super->s_victim_ino != ta->ino);
1193 super->s_rename_dir = 0;
1194 super->s_rename_pos = 0;
1195 break;
1196 case TARGET_RENAME_3:
1197 BUG_ON(super->s_rename_dir);
1198 BUG_ON(super->s_rename_pos);
1199 BUG_ON(super->s_victim_ino != ta->ino);
1200 super->s_victim_ino = 0;
1201 kfree(ta);
1202 break;
1203 default:
1204 BUG();
1205 }
1206}
1207
1208/*
1209 * Not strictly a reservation, but rather a check that we still have enough
1210 * space to satisfy the write.
1211 */
1212static int logfs_reserve_blocks(struct inode *inode, int blocks)
1213{
1214 return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
1215}
1216
1217struct write_control {
1218 u64 ofs;
1219 long flags;
1220};
1221
1222static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
1223 level_t level, u64 old_ofs)
1224{
1225 struct logfs_super *super = logfs_super(inode->i_sb);
1226 struct logfs_shadow *shadow;
1227
1228 shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
1229 memset(shadow, 0, sizeof(*shadow));
1230 shadow->ino = inode->i_ino;
1231 shadow->bix = bix;
1232 shadow->gc_level = expand_level(inode->i_ino, level);
1233 shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
1234 return shadow;
1235}
1236
1237static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1238{
1239 struct logfs_super *super = logfs_super(inode->i_sb);
1240
1241 mempool_free(shadow, super->s_shadow_pool);
1242}
1243
1244/**
1245 * fill_shadow_tree - Propagate shadow tree changes due to a write
1246 * @inode: Inode owning the page
1247 * @page: Struct page that was written
1248 * @shadow: Shadow for the current write
1249 *
1250 * Writes in logfs can result in two semi-valid objects. The old object
1251 * is still valid as long as it can be reached by following pointers on
1252 * the medium. Only when writes propagate all the way up to the journal
1253 * has the new object safely replaced the old one.
1254 *
1255 * To handle this problem, a struct logfs_shadow is used to represent
1256 * every single write. It is attached to the indirect block, which is
1257 * marked dirty. When the indirect block is written, its shadows are
1258 * handed up to the next indirect block (or inode). Ultimately they
1259 * will reach the master inode and be freed upon journal commit.
1260 *
1261 * This function handles a single step in the propagation. It adds the
1262 * shadow for the current write to the tree, along with any shadows in
1263 * the page's tree, in case it was an indirect block. If a page is
1264 * written, the inode parameter is left NULL, if an inode is written,
1265 * the page parameter is left NULL.
1266 */
1267static void fill_shadow_tree(struct inode *inode, struct page *page,
1268 struct logfs_shadow *shadow)
1269{
1270 struct logfs_super *super = logfs_super(inode->i_sb);
1271 struct logfs_block *block = logfs_block(page);
1272 struct shadow_tree *tree = &super->s_shadow_tree;
1273
1274 if (PagePrivate(page)) {
1275 if (block->alias_map)
1276 super->s_no_object_aliases -= bitmap_weight(
1277 block->alias_map, LOGFS_BLOCK_FACTOR);
1278 logfs_handle_transaction(inode, block->ta);
1279 block->ops->free_block(inode->i_sb, block);
1280 }
1281 if (shadow) {
1282 if (shadow->old_ofs)
1283 btree_insert64(&tree->old, shadow->old_ofs, shadow,
1284 GFP_NOFS);
1285 else
1286 btree_insert64(&tree->new, shadow->new_ofs, shadow,
1287 GFP_NOFS);
1288
1289 super->s_dirty_used_bytes += shadow->new_len;
1290 super->s_dirty_free_bytes += shadow->old_len;
1291 }
1292}
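/*
 * Editor's sketch, not part of the original file: the bookkeeping that
 * fill_shadow_tree() performs, in isolation. A shadow that replaces an
 * existing object is filed in tree->old under its old offset, a shadow
 * for a freshly allocated block in tree->new; the two dirty counters
 * tell the journal how many bytes a commit will move. The dump helper
 * below is hypothetical and mirrors the log_segment() format used
 * elsewhere in this series.
 */
#if 0
static void sketch_dump_shadow(struct logfs_shadow *shadow)
{
	printk(KERN_DEBUG "shadow(%llx, %llx) %llx(%x)->%llx(%x)\n",
			shadow->ino, shadow->bix,
			shadow->old_ofs, shadow->old_len,
			shadow->new_ofs, shadow->new_len);
}
#endif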
1293
1294static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
1295 long child_no)
1296{
1297 struct logfs_super *super = logfs_super(sb);
1298
1299 if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
1300 /* Aliases in the master inode are pointless. */
1301 return;
1302 }
1303
1304 if (!test_bit(child_no, block->alias_map)) {
1305 set_bit(child_no, block->alias_map);
1306 super->s_no_object_aliases++;
1307 }
1308 list_move_tail(&block->alias_list, &super->s_object_alias);
1309}
1310
1311/*
1312 * Object aliases can and often do change the size and occupied space of a
1313 * file. So not only do we have to change the pointers, we also have to
1314 * change inode->i_size and li->li_used_bytes. This is done by setting
1315 * another two object aliases for the inode itself.
1316 */
1317static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
1318{
1319 struct logfs_inode *li = logfs_inode(inode);
1320
1321 if (shadow->new_len == shadow->old_len)
1322 return;
1323
1324 alloc_inode_block(inode);
1325 li->li_used_bytes += shadow->new_len - shadow->old_len;
1326 __logfs_set_blocks(inode);
1327 logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
1328 logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
1329}
1330
1331static int logfs_write_i0(struct inode *inode, struct page *page,
1332 struct write_control *wc)
1333{
1334 struct logfs_shadow *shadow;
1335 u64 bix;
1336 level_t level;
1337 int full, err = 0;
1338
1339 logfs_unpack_index(page->index, &bix, &level);
1340 if (wc->ofs == 0)
1341 if (logfs_reserve_blocks(inode, 1))
1342 return -ENOSPC;
1343
1344 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1345 if (wc->flags & WF_WRITE)
1346 err = logfs_segment_write(inode, page, shadow);
1347 if (wc->flags & WF_DELETE)
1348 logfs_segment_delete(inode, shadow);
1349 if (err) {
1350 free_shadow(inode, shadow);
1351 return err;
1352 }
1353
1354 set_iused(inode, shadow);
1355 full = 1;
1356 if (level != 0) {
1357 alloc_indirect_block(inode, page, 0);
1358 full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
1359 }
1360 fill_shadow_tree(inode, page, shadow);
1361 wc->ofs = shadow->new_ofs;
1362 if (wc->ofs && full)
1363 wc->ofs |= LOGFS_FULLY_POPULATED;
1364 return 0;
1365}
1366
1367static int logfs_write_direct(struct inode *inode, struct page *page,
1368 long flags)
1369{
1370 struct logfs_inode *li = logfs_inode(inode);
1371 struct write_control wc = {
1372 .ofs = li->li_data[page->index],
1373 .flags = flags,
1374 };
1375 int err;
1376
1377 alloc_inode_block(inode);
1378
1379 err = logfs_write_i0(inode, page, &wc);
1380 if (err)
1381 return err;
1382
1383 li->li_data[page->index] = wc.ofs;
1384 logfs_set_alias(inode->i_sb, li->li_block,
1385 page->index + INODE_POINTER_OFS);
1386 return 0;
1387}
1388
1389static int ptr_change(u64 ofs, struct page *page)
1390{
1391 struct logfs_block *block = logfs_block(page);
1392 int empty0, empty1, full0, full1;
1393
1394 empty0 = ofs == 0;
1395 empty1 = block->partial == 0;
1396 if (empty0 != empty1)
1397 return 1;
1398
1399 /* The !! is necessary to shrink result to int */
1400 full0 = !!(ofs & LOGFS_FULLY_POPULATED);
1401 full1 = block->full == LOGFS_BLOCK_FACTOR;
1402 if (full0 != full1)
1403 return 1;
1404 return 0;
1405}
1406
1407static int __logfs_write_rec(struct inode *inode, struct page *page,
1408 struct write_control *this_wc,
1409 pgoff_t bix, level_t target_level, level_t level)
1410{
1411 int ret, page_empty = 0;
1412 int child_no = get_bits(bix, SUBLEVEL(level));
1413 struct page *ipage;
1414 struct write_control child_wc = {
1415 .flags = this_wc->flags,
1416 };
1417
1418 ipage = logfs_get_write_page(inode, bix, level);
1419 if (!ipage)
1420 return -ENOMEM;
1421
1422 if (this_wc->ofs) {
1423 ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1424 if (ret)
1425 goto out;
1426 } else if (!PageUptodate(ipage)) {
1427 page_empty = 1;
1428 logfs_read_empty(ipage);
1429 }
1430
1431 child_wc.ofs = block_get_pointer(ipage, child_no);
1432
1433 if ((__force u8)level - 1 > (__force u8)target_level)
1434 ret = __logfs_write_rec(inode, page, &child_wc, bix,
1435 target_level, SUBLEVEL(level));
1436 else
1437 ret = logfs_write_i0(inode, page, &child_wc);
1438
1439 if (ret)
1440 goto out;
1441
1442 alloc_indirect_block(inode, ipage, page_empty);
1443 block_set_pointer(ipage, child_no, child_wc.ofs);
1444 /* FIXME: first condition seems superfluous */
1445 if (child_wc.ofs || logfs_block(ipage)->partial)
1446 this_wc->flags |= WF_WRITE;
1447 /* the condition on this_wc->ofs ensures that we won't consume extra
1448 * space for indirect blocks in the future, which we cannot reserve */
1449 if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
1450 ret = logfs_write_i0(inode, ipage, this_wc);
1451 else
1452 logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
1453out:
1454 logfs_put_write_page(ipage);
1455 return ret;
1456}
1457
1458static int logfs_write_rec(struct inode *inode, struct page *page,
1459 pgoff_t bix, level_t target_level, long flags)
1460{
1461 struct logfs_inode *li = logfs_inode(inode);
1462 struct write_control wc = {
1463 .ofs = li->li_data[INDIRECT_INDEX],
1464 .flags = flags,
1465 };
1466 int ret;
1467
1468 alloc_inode_block(inode);
1469
1470 if (li->li_height > (__force u8)target_level)
1471 ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
1472 LEVEL(li->li_height));
1473 else
1474 ret = logfs_write_i0(inode, page, &wc);
1475 if (ret)
1476 return ret;
1477
1478 if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
1479 li->li_data[INDIRECT_INDEX] = wc.ofs;
1480 logfs_set_alias(inode->i_sb, li->li_block,
1481 INDIRECT_INDEX + INODE_POINTER_OFS);
1482 }
1483 return ret;
1484}
1485
1486void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
1487{
1488 alloc_inode_block(inode);
1489 logfs_inode(inode)->li_block->ta = ta;
1490}
1491
1492void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
1493{
1494 struct logfs_block *block = logfs_inode(inode)->li_block;
1495
1496 if (block && block->ta)
1497 block->ta = NULL;
1498}
1499
1500static int grow_inode(struct inode *inode, u64 bix, level_t level)
1501{
1502 struct logfs_inode *li = logfs_inode(inode);
1503 u8 height = (__force u8)level;
1504 struct page *page;
1505 struct write_control wc = {
1506 .flags = WF_WRITE,
1507 };
1508 int err;
1509
1510 BUG_ON(height > 5 || li->li_height > 5);
1511 while (height > li->li_height || bix >= maxbix(li->li_height)) {
1512 page = logfs_get_write_page(inode, I0_BLOCKS + 1,
1513 LEVEL(li->li_height + 1));
1514 if (!page)
1515 return -ENOMEM;
1516 logfs_read_empty(page);
1517 alloc_indirect_block(inode, page, 1);
1518 block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
1519 err = logfs_write_i0(inode, page, &wc);
1520 logfs_put_write_page(page);
1521 if (err)
1522 return err;
1523 li->li_data[INDIRECT_INDEX] = wc.ofs;
1524 wc.ofs = 0;
1525 li->li_height++;
1526 logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
1527 }
1528 return 0;
1529}
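/*
 * Editor's sketch, not part of the original file: why one extra level
 * per loop iteration is enough. Assuming 4KiB blocks and 8-byte
 * pointers (512-way fan-out, LOGFS_BLOCK_FACTOR), every indirection
 * level multiplies the reachable file range by 512. Plain userspace
 * C99; the constants are assumptions stated here, not taken from
 * logfs.h.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long long reach = 1;	/* blocks reachable at height 0 */
	int h;

	for (h = 0; h <= 3; h++) {
		printf("height %d: %llu blocks, %llu MiB at 4KiB\n",
		       h, reach, reach * 4096 >> 20);
		reach *= 512;		/* one more indirection level */
	}
	return 0;
}
#endif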
1530
1531static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
1532{
1533 struct logfs_super *super = logfs_super(inode->i_sb);
1534 pgoff_t index = page->index;
1535 u64 bix;
1536 level_t level;
1537 int err;
1538
1539 flags |= WF_WRITE | WF_DELETE;
1540 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1541
1542 logfs_unpack_index(index, &bix, &level);
1543 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1544 super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
1545
1546 if (index < I0_BLOCKS)
1547 return logfs_write_direct(inode, page, flags);
1548
1549 bix = adjust_bix(bix, level);
1550 err = grow_inode(inode, bix, level);
1551 if (err)
1552 return err;
1553 return logfs_write_rec(inode, page, bix, level, flags);
1554}
1555
1556int logfs_write_buf(struct inode *inode, struct page *page, long flags)
1557{
1558 struct super_block *sb = inode->i_sb;
1559 int ret;
1560
1561 logfs_get_wblocks(sb, page, flags & WF_LOCK);
1562 ret = __logfs_write_buf(inode, page, flags);
1563 logfs_put_wblocks(sb, page, flags & WF_LOCK);
1564 return ret;
1565}
1566
1567static int __logfs_delete(struct inode *inode, struct page *page)
1568{
1569 long flags = WF_DELETE;
1570
1571 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1572
1573 if (page->index < I0_BLOCKS)
1574 return logfs_write_direct(inode, page, flags);
1575 return logfs_write_rec(inode, page, page->index, 0, flags);
1576}
1577
1578int logfs_delete(struct inode *inode, pgoff_t index,
1579 struct shadow_tree *shadow_tree)
1580{
1581 struct super_block *sb = inode->i_sb;
1582 struct page *page;
1583 int ret;
1584
1585 page = logfs_get_read_page(inode, index, 0);
1586 if (!page)
1587 return -ENOMEM;
1588
1589 logfs_get_wblocks(sb, page, 1);
1590 ret = __logfs_delete(inode, page);
1591 logfs_put_wblocks(sb, page, 1);
1592
1593 logfs_put_read_page(page);
1594
1595 return ret;
1596}
1597
1598int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1599 gc_level_t gc_level, long flags)
1600{
1601 level_t level = shrink_level(gc_level);
1602 struct page *page;
1603 int err;
1604
1605 page = logfs_get_write_page(inode, bix, level);
1606 if (!page)
1607 return -ENOMEM;
1608
1609 err = logfs_segment_read(inode, page, ofs, bix, level);
1610 if (!err) {
1611 if (level != 0)
1612 alloc_indirect_block(inode, page, 0);
1613 err = logfs_write_buf(inode, page, flags);
1614 if (!err && shrink_level(gc_level) == 0) {
1615 /* Rewrite cannot mark the inode dirty but has to
1616 * write it immediately.
1617 * Q: Can't we just create an alias for the inode
1618 * instead? And if not, why not?
1619 */
1620 if (inode->i_ino == LOGFS_INO_MASTER)
1621 logfs_write_anchor(inode->i_sb);
1622 else {
1623 err = __logfs_write_inode(inode, flags);
1624 }
1625 }
1626 }
1627 logfs_put_write_page(page);
1628 return err;
1629}
1630
1631static int truncate_data_block(struct inode *inode, struct page *page,
1632 u64 ofs, struct logfs_shadow *shadow, u64 size)
1633{
1634 loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
1635 u64 bix;
1636 level_t level;
1637 int err;
1638
1639 /* Does truncation happen within this page? */
1640 if (size <= pageofs || size - pageofs >= PAGE_SIZE)
1641 return 0;
1642
1643 logfs_unpack_index(page->index, &bix, &level);
1644 BUG_ON(level != 0);
1645
1646 err = logfs_segment_read(inode, page, ofs, bix, level);
1647 if (err)
1648 return err;
1649
1650 zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
1651 return logfs_segment_write(inode, page, shadow);
1652}
1653
1654static int logfs_truncate_i0(struct inode *inode, struct page *page,
1655 struct write_control *wc, u64 size)
1656{
1657 struct logfs_shadow *shadow;
1658 u64 bix;
1659 level_t level;
1660 int err = 0;
1661
1662 logfs_unpack_index(page->index, &bix, &level);
1663 BUG_ON(level != 0);
1664 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1665
1666 err = truncate_data_block(inode, page, wc->ofs, shadow, size);
1667 if (err) {
1668 free_shadow(inode, shadow);
1669 return err;
1670 }
1671
1672 logfs_segment_delete(inode, shadow);
1673 set_iused(inode, shadow);
1674 fill_shadow_tree(inode, page, shadow);
1675 wc->ofs = shadow->new_ofs;
1676 return 0;
1677}
1678
1679static int logfs_truncate_direct(struct inode *inode, u64 size)
1680{
1681 struct logfs_inode *li = logfs_inode(inode);
1682 struct write_control wc;
1683 struct page *page;
1684 int e;
1685 int err;
1686
1687 alloc_inode_block(inode);
1688
1689 for (e = I0_BLOCKS - 1; e >= 0; e--) {
1690 if (size > (e+1) * LOGFS_BLOCKSIZE)
1691 break;
1692
1693 wc.ofs = li->li_data[e];
1694 if (!wc.ofs)
1695 continue;
1696
1697 page = logfs_get_write_page(inode, e, 0);
1698 if (!page)
1699 return -ENOMEM;
1700 err = logfs_segment_read(inode, page, wc.ofs, e, 0);
1701 if (err) {
1702 logfs_put_write_page(page);
1703 return err;
1704 }
1705 err = logfs_truncate_i0(inode, page, &wc, size);
1706 logfs_put_write_page(page);
1707 if (err)
1708 return err;
1709
1710 li->li_data[e] = wc.ofs;
1711 }
1712 return 0;
1713}
1714
1715/* FIXME: these need to become per-sb once we support different blocksizes */
1716static u64 __logfs_step[] = {
1717 1,
1718 I1_BLOCKS,
1719 I2_BLOCKS,
1720 I3_BLOCKS,
1721};
1722
1723static u64 __logfs_start_index[] = {
1724 I0_BLOCKS,
1725 I1_BLOCKS,
1726 I2_BLOCKS,
1727 I3_BLOCKS
1728};
1729
1730static inline u64 logfs_step(level_t level)
1731{
1732 return __logfs_step[(__force u8)level];
1733}
1734
1735static inline u64 logfs_factor(u8 level)
1736{
1737 return __logfs_step[level] * LOGFS_BLOCKSIZE;
1738}
1739
1740static inline u64 logfs_start_index(level_t level)
1741{
1742 return __logfs_start_index[(__force u8)level];
1743}
1744
1745static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
1746{
1747 logfs_unpack_index(index, bix, level);
1748 if (*bix <= logfs_start_index(SUBLEVEL(*level)))
1749 *bix = 0;
1750}
1751
1752static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
1753 struct write_control *this_wc, u64 size)
1754{
1755 int truncate_happened = 0;
1756 int e, err = 0;
1757 u64 bix, child_bix, next_bix;
1758 level_t level;
1759 struct page *page;
1760 struct write_control child_wc = { /* FIXME: flags */ };
1761
1762 logfs_unpack_raw_index(ipage->index, &bix, &level);
1763 err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1764 if (err)
1765 return err;
1766
1767 for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
1768 child_bix = bix + e * logfs_step(SUBLEVEL(level));
1769 next_bix = child_bix + logfs_step(SUBLEVEL(level));
1770 if (size > next_bix * LOGFS_BLOCKSIZE)
1771 break;
1772
1773 child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
1774 if (!child_wc.ofs)
1775 continue;
1776
1777 page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
1778 if (!page)
1779 return -ENOMEM;
1780
1781 if ((__force u8)level > 1)
1782 err = __logfs_truncate_rec(inode, page, &child_wc, size);
1783 else
1784 err = logfs_truncate_i0(inode, page, &child_wc, size);
1785 logfs_put_write_page(page);
1786 if (err)
1787 return err;
1788
1789 truncate_happened = 1;
1790 alloc_indirect_block(inode, ipage, 0);
1791 block_set_pointer(ipage, e, child_wc.ofs);
1792 }
1793
1794 if (!truncate_happened) {
1795 printk(KERN_WARNING "ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
1796 return 0;
1797 }
1798
1799 this_wc->flags = WF_DELETE;
1800 if (logfs_block(ipage)->partial)
1801 this_wc->flags |= WF_WRITE;
1802
1803 return logfs_write_i0(inode, ipage, this_wc);
1804}
1805
1806static int logfs_truncate_rec(struct inode *inode, u64 size)
1807{
1808 struct logfs_inode *li = logfs_inode(inode);
1809 struct write_control wc = {
1810 .ofs = li->li_data[INDIRECT_INDEX],
1811 };
1812 struct page *page;
1813 int err;
1814
1815 alloc_inode_block(inode);
1816
1817 if (!wc.ofs)
1818 return 0;
1819
1820 page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
1821 if (!page)
1822 return -ENOMEM;
1823
1824 err = __logfs_truncate_rec(inode, page, &wc, size);
1825 logfs_put_write_page(page);
1826 if (err)
1827 return err;
1828
1829 if (li->li_data[INDIRECT_INDEX] != wc.ofs)
1830 li->li_data[INDIRECT_INDEX] = wc.ofs;
1831 return 0;
1832}
1833
1834static int __logfs_truncate(struct inode *inode, u64 size)
1835{
1836 int ret;
1837
1838 if (size >= logfs_factor(logfs_inode(inode)->li_height))
1839 return 0;
1840
1841 ret = logfs_truncate_rec(inode, size);
1842 if (ret)
1843 return ret;
1844
1845 return logfs_truncate_direct(inode, size);
1846}
1847
1848int logfs_truncate(struct inode *inode, u64 size)
1849{
1850 struct super_block *sb = inode->i_sb;
1851 int err;
1852
1853 logfs_get_wblocks(sb, NULL, 1);
1854 err = __logfs_truncate(inode, size);
1855 if (!err)
1856 err = __logfs_write_inode(inode, 0);
1857 logfs_put_wblocks(sb, NULL, 1);
1858
1859 if (!err)
1860 err = vmtruncate(inode, size);
1861
1862 /* I don't trust error recovery yet. */
1863 WARN_ON(err);
1864 return err;
1865}
1866
1867static void move_page_to_inode(struct inode *inode, struct page *page)
1868{
1869 struct logfs_inode *li = logfs_inode(inode);
1870 struct logfs_block *block = logfs_block(page);
1871
1872 if (!block)
1873 return;
1874
1875 log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
1876 block->ino, block->bix, block->level);
1877 BUG_ON(li->li_block);
1878 block->ops = &inode_block_ops;
1879 block->inode = inode;
1880 li->li_block = block;
1881
1882 block->page = NULL;
1883 page->private = 0;
1884 ClearPagePrivate(page);
1885}
1886
1887static void move_inode_to_page(struct page *page, struct inode *inode)
1888{
1889 struct logfs_inode *li = logfs_inode(inode);
1890 struct logfs_block *block = li->li_block;
1891
1892 if (!block)
1893 return;
1894
1895 log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
1896 block->ino, block->bix, block->level);
1897 BUG_ON(PagePrivate(page));
1898 block->ops = &indirect_block_ops;
1899 block->page = page;
1900 page->private = (unsigned long)block;
1901 SetPagePrivate(page);
1902
1903 block->inode = NULL;
1904 li->li_block = NULL;
1905}
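/*
 * Editor's note, not part of the original file: move_page_to_inode()
 * and move_inode_to_page() form a round trip. While an inode is cached,
 * its pending aliases ride on li->li_block; once the inode is
 * serialized into its ifile page, the same struct logfs_block moves to
 * page->private. The hypothetical assertion below states the invariant
 * the two BUG_ONs above enforce pairwise: a block never has two homes.
 */
#if 0
static void sketch_assert_one_home(struct inode *inode, struct page *page)
{
	BUG_ON(logfs_inode(inode)->li_block && PagePrivate(page));
}
#endif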
1906
1907int logfs_read_inode(struct inode *inode)
1908{
1909 struct super_block *sb = inode->i_sb;
1910 struct logfs_super *super = logfs_super(sb);
1911 struct inode *master_inode = super->s_master_inode;
1912 struct page *page;
1913 struct logfs_disk_inode *di;
1914 u64 ino = inode->i_ino;
1915
1916 if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
1917 return -ENODATA;
1918 if (!logfs_exist_block(master_inode, ino))
1919 return -ENODATA;
1920
1921 page = read_cache_page(master_inode->i_mapping, ino,
1922 (filler_t *)logfs_readpage, NULL);
1923 if (IS_ERR(page))
1924 return PTR_ERR(page);
1925
1926 di = kmap_atomic(page, KM_USER0);
1927 logfs_disk_to_inode(di, inode);
1928 kunmap_atomic(di, KM_USER0);
1929 move_page_to_inode(inode, page);
1930 page_cache_release(page);
1931 return 0;
1932}
1933
1934/* Caller must logfs_put_write_page(page); */
1935static struct page *inode_to_page(struct inode *inode)
1936{
1937 struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
1938 struct logfs_disk_inode *di;
1939 struct page *page;
1940
1941 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1942
1943 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
1944 if (!page)
1945 return NULL;
1946
1947 di = kmap_atomic(page, KM_USER0);
1948 logfs_inode_to_disk(inode, di);
1949 kunmap_atomic(di, KM_USER0);
1950 move_inode_to_page(page, inode);
1951 return page;
1952}
1953
1954/* Cheaper version of write_inode. All changes are concealed in
1955 * aliases, which are moved back. No write to the medium happens.
1956 */
1957void logfs_clear_inode(struct inode *inode)
1958{
1959 struct super_block *sb = inode->i_sb;
1960 struct logfs_inode *li = logfs_inode(inode);
1961 struct logfs_block *block = li->li_block;
1962 struct page *page;
1963
1964 /* Only deleted files may be dirty at this point */
1965 BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
1966 if (!block)
1967 return;
1968 if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
1969 block->ops->free_block(inode->i_sb, block);
1970 return;
1971 }
1972
1973 BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
1974 page = inode_to_page(inode);
1975 BUG_ON(!page); /* FIXME: Use emergency page */
1976 logfs_put_write_page(page);
1977}
1978
1979static int do_write_inode(struct inode *inode)
1980{
1981 struct super_block *sb = inode->i_sb;
1982 struct inode *master_inode = logfs_super(sb)->s_master_inode;
1983 loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
1984 struct page *page;
1985 int err;
1986
1987 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1988 /* FIXME: lock inode */
1989
1990 if (i_size_read(master_inode) < size)
1991 i_size_write(master_inode, size);
1992
1993 /* TODO: Tell vfs this inode is clean now */
1994
1995 page = inode_to_page(inode);
1996 if (!page)
1997 return -ENOMEM;
1998
1999 /* FIXME: transaction is part of logfs_block now. Is that enough? */
2000 err = logfs_write_buf(master_inode, page, 0);
2001 logfs_put_write_page(page);
2002 return err;
2003}
2004
2005static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
2006 int write,
2007 void (*change_se)(struct logfs_segment_entry *, long),
2008 long arg)
2009{
2010 struct logfs_super *super = logfs_super(sb);
2011 struct inode *inode;
2012 struct page *page;
2013 struct logfs_segment_entry *se;
2014 pgoff_t page_no;
2015 int child_no;
2016
2017 page_no = segno >> (sb->s_blocksize_bits - 3);
2018 child_no = segno & ((sb->s_blocksize >> 3) - 1);
2019
2020 inode = super->s_segfile_inode;
2021 page = logfs_get_write_page(inode, page_no, 0);
2022 BUG_ON(!page); /* FIXME: We need some reserve page for this case */
2023 if (!PageUptodate(page))
2024 logfs_read_block(inode, page, WRITE);
2025
2026 if (write)
2027 alloc_indirect_block(inode, page, 0);
2028 se = kmap_atomic(page, KM_USER0);
2029 change_se(se + child_no, arg);
2030 if (write) {
2031 logfs_set_alias(sb, logfs_block(page), child_no);
2032 BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
2033 }
2034 kunmap_atomic(se, KM_USER0);
2035
2036 logfs_put_write_page(page);
2037}
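/*
 * Editor's sketch, not part of the original file: the index math at the
 * top of logfs_mod_segment_entry(). A segment entry is 8 bytes
 * (ec_level plus valid), so a 4KiB block of the segment file holds 512
 * entries. Standalone C; the 4KiB block size is an assumption.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned blocksize_bits = 12;		/* assumed 4KiB blocks */
	unsigned segno = 1234;
	unsigned page_no = segno >> (blocksize_bits - 3);
	unsigned child_no = segno & (((1u << blocksize_bits) >> 3) - 1);

	/* segment 1234 lives in page 2 of the segfile, entry slot 210 */
	printf("segno %u -> page %u, entry %u\n", segno, page_no, child_no);
	return 0;
}
#endif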
2038
2039static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
2040{
2041 struct logfs_segment_entry *target = (void *)_target;
2042
2043 *target = *se;
2044}
2045
2046void logfs_get_segment_entry(struct super_block *sb, u32 segno,
2047 struct logfs_segment_entry *se)
2048{
2049 logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
2050}
2051
2052static void __set_segment_used(struct logfs_segment_entry *se, long increment)
2053{
2054 u32 valid;
2055
2056 valid = be32_to_cpu(se->valid);
2057 valid += increment;
2058 se->valid = cpu_to_be32(valid);
2059}
2060
2061void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
2062{
2063 struct logfs_super *super = logfs_super(sb);
2064 u32 segno = ofs >> super->s_segshift;
2065
2066 if (!increment)
2067 return;
2068
2069 logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
2070}
2071
2072static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
2073{
2074 se->ec_level = cpu_to_be32(ec_level);
2075}
2076
2077void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
2078 gc_level_t gc_level)
2079{
2080 u32 ec_level = ec << 4 | (__force u8)gc_level;
2081
2082 logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
2083}
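/*
 * Editor's sketch, not part of the original file: a round trip of the
 * ec_level packing above, erase count in the high 28 bits and gc level
 * in the low 4. ostore_get_erase_count() in segment.c unpacks it with
 * the matching shift.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t ec = 5, gc_level = 3;
	uint32_t ec_level = ec << 4 | gc_level;

	assert(ec_level == 0x53);
	assert(ec_level >> 4 == ec);
	assert((ec_level & 0xf) == gc_level);
	return 0;
}
#endif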
2084
2085static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
2086{
2087 se->valid = cpu_to_be32(RESERVED);
2088}
2089
2090void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
2091{
2092 logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
2093}
2094
2095static void __set_segment_unreserved(struct logfs_segment_entry *se,
2096 long ec_level)
2097{
2098 se->valid = 0;
2099 se->ec_level = cpu_to_be32(ec_level);
2100}
2101
2102void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
2103{
2104 u32 ec_level = ec << 4;
2105
2106 logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
2107 ec_level);
2108}
2109
2110int __logfs_write_inode(struct inode *inode, long flags)
2111{
2112 struct super_block *sb = inode->i_sb;
2113 int ret;
2114
2115 logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
2116 ret = do_write_inode(inode);
2117 logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
2118 return ret;
2119}
2120
2121static int do_delete_inode(struct inode *inode)
2122{
2123 struct super_block *sb = inode->i_sb;
2124 struct inode *master_inode = logfs_super(sb)->s_master_inode;
2125 struct page *page;
2126 int ret;
2127
2128 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
2129 if (!page)
2130 return -ENOMEM;
2131
2132 move_inode_to_page(page, inode);
2133
2134 logfs_get_wblocks(sb, page, 1);
2135 ret = __logfs_delete(master_inode, page);
2136 logfs_put_wblocks(sb, page, 1);
2137
2138 logfs_put_write_page(page);
2139 return ret;
2140}
2141
2142/*
2143 * ZOMBIE inodes have already been deleted and should remain dead; they
2144 * are only revisited for validity checking. No need to kill them again here.
2145 */
2146void logfs_delete_inode(struct inode *inode)
2147{
2148 struct logfs_inode *li = logfs_inode(inode);
2149
2150 if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
2151 li->li_flags |= LOGFS_IF_ZOMBIE;
2152 if (i_size_read(inode) > 0)
2153 logfs_truncate(inode, 0);
2154 do_delete_inode(inode);
2155 }
2156 truncate_inode_pages(&inode->i_data, 0);
2157 clear_inode(inode);
2158}
2159
2160void btree_write_block(struct logfs_block *block)
2161{
2162 struct inode *inode;
2163 struct page *page;
2164 int err, cookie;
2165
2166 inode = logfs_safe_iget(block->sb, block->ino, &cookie);
2167 page = logfs_get_write_page(inode, block->bix, block->level);
2168
2169 err = logfs_readpage_nolock(page);
2170 BUG_ON(err);
2171 BUG_ON(!PagePrivate(page));
2172 BUG_ON(logfs_block(page) != block);
2173 err = __logfs_write_buf(inode, page, 0);
2174 BUG_ON(err);
2175 BUG_ON(PagePrivate(page) || page->private);
2176
2177 logfs_put_write_page(page);
2178 logfs_safe_iput(inode, cookie);
2179}
2180
2181/**
2182 * logfs_inode_write - write inode or dentry objects
2183 *
2184 * @inode: parent inode (ifile or directory)
2185 * @buf: object to write (inode or dentry)
2186 * @count: object size
2187 * @bix: object number (file position in blocks/objects)
2188 * @flags: write flags; WF_LOCK requests taking the write lock
2190 * @shadow_tree: shadow below this inode
2191 *
2192 * FIXME: All callers of this put a 200-300 byte variable on the stack,
2193 * only to call here and do a memcpy from that stack variable. A good
2194 * example of wasted performance and stack space.
2195 */
2196int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
2197 loff_t bix, long flags, struct shadow_tree *shadow_tree)
2198{
2199 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
2200 int err;
2201 struct page *page;
2202 void *pagebuf;
2203
2204 BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
2205 BUG_ON(count > LOGFS_BLOCKSIZE);
2206 page = logfs_get_write_page(inode, bix, 0);
2207 if (!page)
2208 return -ENOMEM;
2209
2210 pagebuf = kmap_atomic(page, KM_USER0);
2211 memcpy(pagebuf, buf, count);
2212 flush_dcache_page(page);
2213 kunmap_atomic(pagebuf, KM_USER0);
2214
2215 if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
2216 i_size_write(inode, pos + LOGFS_BLOCKSIZE);
2217
2218 err = logfs_write_buf(inode, page, flags);
2219 logfs_put_write_page(page);
2220 return err;
2221}
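/*
 * Editor's sketch, not part of the original file: the position math at
 * the top of logfs_inode_write(). With assumed 4KiB blocks, object
 * number 3 starts at byte 0x3000, which trivially satisfies the
 * alignment BUG_ON.
 */
#if 0
#include <assert.h>

int main(void)
{
	unsigned long long bix = 3, blocksize_bits = 12;
	unsigned long long pos = bix << blocksize_bits;

	assert(pos == 0x3000);
	assert((pos & ((1ULL << blocksize_bits) - 1)) == 0);
	return 0;
}
#endif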
2222
2223int logfs_open_segfile(struct super_block *sb)
2224{
2225 struct logfs_super *super = logfs_super(sb);
2226 struct inode *inode;
2227
2228 inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
2229 if (IS_ERR(inode))
2230 return PTR_ERR(inode);
2231 super->s_segfile_inode = inode;
2232 return 0;
2233}
2234
2235int logfs_init_rw(struct super_block *sb)
2236{
2237 struct logfs_super *super = logfs_super(sb);
2238 int min_fill = 3 * super->s_no_blocks;
2239
2240 INIT_LIST_HEAD(&super->s_object_alias);
2241 mutex_init(&super->s_write_mutex);
2242 super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
2243 sizeof(struct logfs_block));
2244 super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
2245 sizeof(struct logfs_shadow));
2246 return 0;
2247}
2248
2249void logfs_cleanup_rw(struct super_block *sb)
2250{
2251 struct logfs_super *super = logfs_super(sb);
2252
2253 destroy_meta_inode(super->s_segfile_inode);
2254 if (super->s_block_pool)
2255 mempool_destroy(super->s_block_pool);
2256 if (super->s_shadow_pool)
2257 mempool_destroy(super->s_shadow_pool);
2258}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
new file mode 100644
index 000000000000..801a3a141625
--- /dev/null
+++ b/fs/logfs/segment.c
@@ -0,0 +1,936 @@
1/*
2 * fs/logfs/segment.c - Handling the Object Store
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Object store or ostore makes up the complete device with the exception of
9 * the superblock and journal areas. Apart from its own metadata it stores
10 * three kinds of objects: inodes, dentries and blocks, both data and indirect.
11 */
12#include "logfs.h"
13#include <linux/slab.h>
14
15static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
16{
17 struct logfs_super *super = logfs_super(sb);
18 struct btree_head32 *head = &super->s_reserved_segments;
19 int err;
20
21 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
22 if (err)
23 return err;
24 logfs_super(sb)->s_bad_segments++;
25 /* FIXME: write to journal */
26 return 0;
27}
28
29int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
30{
31 struct logfs_super *super = logfs_super(sb);
32
33 super->s_gec++;
34
35 return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
36 super->s_segsize, ensure_erase);
37}
38
39static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
40{
41 s32 ofs;
42
43 logfs_open_area(area, bytes);
44
45 ofs = area->a_used_bytes;
46 area->a_used_bytes += bytes;
47 BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
48
49 return dev_ofs(area->a_sb, area->a_segno, ofs);
50}
51
52static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
53 int use_filler)
54{
55 struct logfs_super *super = logfs_super(sb);
56 struct address_space *mapping = super->s_mapping_inode->i_mapping;
57 filler_t *filler = super->s_devops->readpage;
58 struct page *page;
59
60 BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
61 if (use_filler)
62 page = read_cache_page(mapping, index, filler, sb);
63 else {
64 page = find_or_create_page(mapping, index, GFP_NOFS);
65 unlock_page(page);
66 }
67 return page;
68}
69
70void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
71 int use_filler)
72{
73 pgoff_t index = ofs >> PAGE_SHIFT;
74 struct page *page;
75 long offset = ofs & (PAGE_SIZE-1);
76 long copylen;
77
78 /* Only logfs_wbuf_recover may use len==0 */
79 BUG_ON(!len && !use_filler);
80 do {
81 copylen = min((ulong)len, PAGE_SIZE - offset);
82
83 page = get_mapping_page(area->a_sb, index, use_filler);
84 BUG_ON(!page); /* FIXME: reserve a pool */
85 SetPageUptodate(page);
86 memcpy(page_address(page) + offset, buf, copylen);
87 SetPagePrivate(page);
88 page_cache_release(page);
89
90 buf += copylen;
91 len -= copylen;
92 offset = 0;
93 index++;
94 } while (len);
95}
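/*
 * Editor's sketch, not part of the original file: the loop above for a
 * write that straddles a page boundary. At ofs 4090 with len 100 and
 * 4KiB pages, 6 bytes land at the end of the first page and 94 at the
 * start of the next. Standalone C mirroring the copylen arithmetic.
 */
#if 0
#include <stdio.h>

#define PG 4096UL

int main(void)
{
	unsigned long ofs = 4090, len = 100;
	unsigned long index = ofs / PG, offset = ofs % PG, copylen;

	while (len) {
		copylen = len < PG - offset ? len : PG - offset;
		printf("page %lu: copy %lu bytes at offset %lu\n",
		       index, copylen, offset);
		len -= copylen;
		offset = 0;
		index++;
	}
	return 0;
}
#endif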
96
97static void pad_partial_page(struct logfs_area *area)
98{
99 struct super_block *sb = area->a_sb;
100 struct page *page;
101 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
102 pgoff_t index = ofs >> PAGE_SHIFT;
103 long offset = ofs & (PAGE_SIZE-1);
104 u32 len = PAGE_SIZE - offset;
105
106 if (len % PAGE_SIZE) {
107 page = get_mapping_page(sb, index, 0);
108 BUG_ON(!page); /* FIXME: reserve a pool */
109 memset(page_address(page) + offset, 0xff, len);
110 SetPagePrivate(page);
111 page_cache_release(page);
112 }
113}
114
115static void pad_full_pages(struct logfs_area *area)
116{
117 struct super_block *sb = area->a_sb;
118 struct logfs_super *super = logfs_super(sb);
119 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
120 u32 len = super->s_segsize - area->a_used_bytes;
121 pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
122 pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
123 struct page *page;
124
125 while (no_indizes) {
126 page = get_mapping_page(sb, index, 0);
127 BUG_ON(!page); /* FIXME: reserve a pool */
128 SetPageUptodate(page);
129 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
130 SetPagePrivate(page);
131 page_cache_release(page);
132 index++;
133 no_indizes--;
134 }
135}
136
137/*
138 * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
139 * Also make sure we allocate (and memset) all pages for final writeout.
140 */
141static void pad_wbuf(struct logfs_area *area, int final)
142{
143 pad_partial_page(area);
144 if (final)
145 pad_full_pages(area);
146}
147
148/*
149 * We have to be careful with the alias tree. Since lookup is done by bix,
150 * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
151 * indirect blocks. So always use it through accessor functions.
152 */
153static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
154 level_t level)
155{
156 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
157 pgoff_t index = logfs_pack_index(bix, level);
158
159 return btree_lookup128(head, ino, index);
160}
161
162static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
163 level_t level, void *val)
164{
165 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
166 pgoff_t index = logfs_pack_index(bix, level);
167
168 return btree_insert128(head, ino, index, val, GFP_NOFS);
169}
170
171static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
172 write_alias_t *write_one_alias)
173{
174 struct object_alias_item *item;
175 int err;
176
177 list_for_each_entry(item, &block->item_list, list) {
178 err = write_alias_journal(sb, block->ino, block->bix,
179 block->level, item->child_no, item->val);
180 if (err)
181 return err;
182 }
183 return 0;
184}
185
186static gc_level_t btree_block_level(struct logfs_block *block)
187{
188 return expand_level(block->ino, block->level);
189}
190
191static struct logfs_block_ops btree_block_ops = {
192 .write_block = btree_write_block,
193 .block_level = btree_block_level,
194 .free_block = __free_block,
195 .write_alias = btree_write_alias,
196};
197
198int logfs_load_object_aliases(struct super_block *sb,
199 struct logfs_obj_alias *oa, int count)
200{
201 struct logfs_super *super = logfs_super(sb);
202 struct logfs_block *block;
203 struct object_alias_item *item;
204 u64 ino, bix;
205 level_t level;
206 int i, err;
207
208 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
209 count /= sizeof(*oa);
210 for (i = 0; i < count; i++) {
211 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
212 if (!item)
213 return -ENOMEM;
214 memset(item, 0, sizeof(*item));
215
216 super->s_no_object_aliases++;
217 item->val = oa[i].val;
218 item->child_no = be16_to_cpu(oa[i].child_no);
219
220 ino = be64_to_cpu(oa[i].ino);
221 bix = be64_to_cpu(oa[i].bix);
222 level = LEVEL(oa[i].level);
223
224 log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
225 ino, bix, level, item->child_no,
226 be64_to_cpu(item->val));
227 block = alias_tree_lookup(sb, ino, bix, level);
228 if (!block) {
229 block = __alloc_block(sb, ino, bix, level);
230 block->ops = &btree_block_ops;
231 err = alias_tree_insert(sb, ino, bix, level, block);
232 BUG_ON(err); /* mempool empty */
233 }
234 if (test_and_set_bit(item->child_no, block->alias_map)) {
235 printk(KERN_ERR"LogFS: Alias collision detected\n");
236 return -EIO;
237 }
238 list_move_tail(&block->alias_list, &super->s_object_alias);
239 list_add(&item->list, &block->item_list);
240 }
241 return 0;
242}
243
244static void kill_alias(void *_block, unsigned long ignore0,
245 u64 ignore1, u64 ignore2, size_t ignore3)
246{
247 struct logfs_block *block = _block;
248 struct super_block *sb = block->sb;
249 struct logfs_super *super = logfs_super(sb);
250 struct object_alias_item *item;
251
252 while (!list_empty(&block->item_list)) {
253 item = list_entry(block->item_list.next, typeof(*item), list);
254 list_del(&item->list);
255 mempool_free(item, super->s_alias_pool);
256 }
257 block->ops->free_block(sb, block);
258}
259
260static int obj_type(struct inode *inode, level_t level)
261{
262 if (level == 0) {
263 if (S_ISDIR(inode->i_mode))
264 return OBJ_DENTRY;
265 if (inode->i_ino == LOGFS_INO_MASTER)
266 return OBJ_INODE;
267 }
268 return OBJ_BLOCK;
269}
270
271static int obj_len(struct super_block *sb, int obj_type)
272{
273 switch (obj_type) {
274 case OBJ_DENTRY:
275 return sizeof(struct logfs_disk_dentry);
276 case OBJ_INODE:
277 return sizeof(struct logfs_disk_inode);
278 case OBJ_BLOCK:
279 return sb->s_blocksize;
280 default:
281 BUG();
282 }
283}
284
285static int __logfs_segment_write(struct inode *inode, void *buf,
286 struct logfs_shadow *shadow, int type, int len, int compr)
287{
288 struct logfs_area *area;
289 struct super_block *sb = inode->i_sb;
290 s64 ofs;
291 struct logfs_object_header h;
292 int acc_len;
293
294 if (shadow->gc_level == 0)
295 acc_len = len;
296 else
297 acc_len = obj_len(sb, type);
298
299 area = get_area(sb, shadow->gc_level);
300 ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
301 LOGFS_BUG_ON(ofs <= 0, sb);
302 /*
303 * Order is important. logfs_get_free_bytes(), by modifying the
304 * segment file, may modify the content of the very page we're about
305 * to write now. Which is fine, as long as the calculated crc and
306 * written data still match. So do the modifications _before_
307 * calculating the crc.
308 */
309
310 h.len = cpu_to_be16(len);
311 h.type = type;
312 h.compr = compr;
313 h.ino = cpu_to_be64(inode->i_ino);
314 h.bix = cpu_to_be64(shadow->bix);
315 h.crc = logfs_crc32(&h, sizeof(h) - 4, 4);
316 h.data_crc = logfs_crc32(buf, len, 0);
317
318 logfs_buf_write(area, ofs, &h, sizeof(h));
319 logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);
320
321 shadow->new_ofs = ofs;
322 shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;
323
324 return 0;
325}
326
327static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
328 struct logfs_shadow *shadow, int type, int len)
329{
330 struct super_block *sb = inode->i_sb;
331 void *compressor_buf = logfs_super(sb)->s_compressed_je;
332 ssize_t compr_len;
333 int ret;
334
335 mutex_lock(&logfs_super(sb)->s_journal_mutex);
336 compr_len = logfs_compress(buf, compressor_buf, len, len);
337
338 if (compr_len >= 0) {
339 ret = __logfs_segment_write(inode, compressor_buf, shadow,
340 type, compr_len, COMPR_ZLIB);
341 } else {
342 ret = __logfs_segment_write(inode, buf, shadow, type, len,
343 COMPR_NONE);
344 }
345 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
346 return ret;
347}
348
349/**
350 * logfs_segment_write - write data block to object store
351 * @inode: inode containing data
 * @page: page holding the data block
 * @shadow: shadow entry describing this write
352 *
353 * Returns an errno or zero.
354 */
355int logfs_segment_write(struct inode *inode, struct page *page,
356 struct logfs_shadow *shadow)
357{
358 struct super_block *sb = inode->i_sb;
359 struct logfs_super *super = logfs_super(sb);
360 int do_compress, type, len;
361 int ret;
362 void *buf;
363
364 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
365 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
366 do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
367 if (shadow->gc_level != 0) {
368 /* temporarily disable compression for indirect blocks */
369 do_compress = 0;
370 }
371
372 type = obj_type(inode, shrink_level(shadow->gc_level));
373 len = obj_len(sb, type);
374 buf = kmap(page);
375 if (do_compress)
376 ret = logfs_segment_write_compress(inode, buf, shadow, type,
377 len);
378 else
379 ret = __logfs_segment_write(inode, buf, shadow, type, len,
380 COMPR_NONE);
381 kunmap(page);
382
383 log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
384 shadow->ino, shadow->bix, shadow->gc_level,
385 shadow->old_ofs, shadow->new_ofs,
386 shadow->old_len, shadow->new_len);
387 /* This BUG_ON once caught a locking bug and is worth keeping. */
388 BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
389 return ret;
390}
391
392int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
393{
394 pgoff_t index = ofs >> PAGE_SHIFT;
395 struct page *page;
396 long offset = ofs & (PAGE_SIZE-1);
397 long copylen;
398
399 while (len) {
400 copylen = min((ulong)len, PAGE_SIZE - offset);
401
402 page = get_mapping_page(sb, index, 1);
403 if (IS_ERR(page))
404 return PTR_ERR(page);
405 memcpy(buf, page_address(page) + offset, copylen);
406 page_cache_release(page);
407
408 buf += copylen;
409 len -= copylen;
410 offset = 0;
411 index++;
412 }
413 return 0;
414}
415
416/*
417 * The "position" of indirect blocks is ambiguous. It can be the position
418 * of any data block somewhere behind this indirect block. So we need to
419 * normalize the positions through logfs_block_mask() before comparing.
420 */
421static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
422{
423 return (pos1 & logfs_block_mask(sb, level)) !=
424 (pos2 & logfs_block_mask(sb, level));
425}
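/*
 * Editor's sketch, not part of the original file: the normalization the
 * comment above describes. Assuming a 512-way fan-out, so the level-l
 * mask is ~(512^l - 1), data block positions 14, 15 and 16 all collapse
 * to position 0 of the same level-1 indirect block and therefore
 * compare equal. The mask formula is an assumption, not copied from
 * logfs_block_mask().
 */
#if 0
#include <stdio.h>

static unsigned long long sketch_block_mask(int level)
{
	return ~((1ULL << (9 * level)) - 1);	/* assumed 512-way fan-out */
}

int main(void)
{
	unsigned long long pos;

	for (pos = 14; pos <= 16; pos++)
		printf("pos %llu -> %llu at level 1\n",
		       pos, pos & sketch_block_mask(1));
	return 0;
}
#endif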
426
427#if 0
428static int read_seg_header(struct super_block *sb, u64 ofs,
429 struct logfs_segment_header *sh)
430{
431 __be32 crc;
432 int err;
433
434 err = wbuf_read(sb, ofs, sizeof(*sh), sh);
435 if (err)
436 return err;
437 crc = logfs_crc32(sh, sizeof(*sh), 4);
438 if (crc != sh->crc) {
439 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
440 "got %x\n", ofs, be32_to_cpu(sh->crc),
441 be32_to_cpu(crc));
442 return -EIO;
443 }
444 return 0;
445}
446#endif
447
448static int read_obj_header(struct super_block *sb, u64 ofs,
449 struct logfs_object_header *oh)
450{
451 __be32 crc;
452 int err;
453
454 err = wbuf_read(sb, ofs, sizeof(*oh), oh);
455 if (err)
456 return err;
457 crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
458 if (crc != oh->crc) {
459 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
460 "got %x\n", ofs, be32_to_cpu(oh->crc),
461 be32_to_cpu(crc));
462 return -EIO;
463 }
464 return 0;
465}
466
467static void move_btree_to_page(struct inode *inode, struct page *page,
468 __be64 *data)
469{
470 struct super_block *sb = inode->i_sb;
471 struct logfs_super *super = logfs_super(sb);
472 struct btree_head128 *head = &super->s_object_alias_tree;
473 struct logfs_block *block;
474 struct object_alias_item *item, *next;
475
476 if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
477 return;
478
479 block = btree_remove128(head, inode->i_ino, page->index);
480 if (!block)
481 return;
482
483 log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
484 block->ino, block->bix, block->level);
485 list_for_each_entry_safe(item, next, &block->item_list, list) {
486 data[item->child_no] = item->val;
487 list_del(&item->list);
488 mempool_free(item, super->s_alias_pool);
489 }
490 block->page = page;
491 SetPagePrivate(page);
492 page->private = (unsigned long)block;
493 block->ops = &indirect_block_ops;
494 initialize_block_counters(page, block, data, 0);
495}
496
497/*
498 * This silences a false, yet annoying gcc warning. I hate it when my editor
499 * jumps into bitops.h each time I recompile this file.
500 * TODO: Complain to gcc folks about this and upgrade compiler.
501 */
502static unsigned long fnb(const unsigned long *addr,
503 unsigned long size, unsigned long offset)
504{
505 return find_next_bit(addr, size, offset);
506}
507
508void move_page_to_btree(struct page *page)
509{
510 struct logfs_block *block = logfs_block(page);
511 struct super_block *sb = block->sb;
512 struct logfs_super *super = logfs_super(sb);
513 struct object_alias_item *item;
514 unsigned long pos;
515 __be64 *child;
516 int err;
517
518 if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
519 block->ops->free_block(sb, block);
520 return;
521 }
522 log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
523 block->ino, block->bix, block->level);
524 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
525
526 for (pos = 0; ; pos++) {
527 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
528 if (pos >= LOGFS_BLOCK_FACTOR)
529 break;
530
531 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
532 BUG_ON(!item); /* mempool empty */
533 memset(item, 0, sizeof(*item));
534
535 child = kmap_atomic(page, KM_USER0);
536 item->val = child[pos];
537 kunmap_atomic(child, KM_USER0);
538 item->child_no = pos;
539 list_add(&item->list, &block->item_list);
540 }
541 block->page = NULL;
542 ClearPagePrivate(page);
543 page->private = 0;
544 block->ops = &btree_block_ops;
545 err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
546 block);
547 BUG_ON(err); /* mempool empty */
548 ClearPageUptodate(page);
549}
550
551static int __logfs_segment_read(struct inode *inode, void *buf,
552 u64 ofs, u64 bix, level_t level)
553{
554 struct super_block *sb = inode->i_sb;
555 void *compressor_buf = logfs_super(sb)->s_compressed_je;
556 struct logfs_object_header oh;
557 __be32 crc;
558 u16 len;
559 int err, block_len;
560
561 block_len = obj_len(sb, obj_type(inode, level));
562 err = read_obj_header(sb, ofs, &oh);
563 if (err)
564 goto out_err;
565
566 err = -EIO;
567 if (be64_to_cpu(oh.ino) != inode->i_ino
568 || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
569 printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
570 "expected (%lx, %llx), got (%llx, %llx)\n",
571 ofs, inode->i_ino, bix,
572 be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
573 goto out_err;
574 }
575
576 len = be16_to_cpu(oh.len);
577
578 switch (oh.compr) {
579 case COMPR_NONE:
580 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
581 if (err)
582 goto out_err;
583 crc = logfs_crc32(buf, len, 0);
584 if (crc != oh.data_crc) {
585 printk(KERN_ERR"LOGFS: uncompressed data crc error at "
586 "%llx: expected %x, got %x\n", ofs,
587 be32_to_cpu(oh.data_crc),
588 be32_to_cpu(crc));
589 goto out_err;
590 }
591 break;
592 case COMPR_ZLIB:
593 mutex_lock(&logfs_super(sb)->s_journal_mutex);
594 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
595 compressor_buf);
596 if (err) {
597 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
598 goto out_err;
599 }
600 crc = logfs_crc32(compressor_buf, len, 0);
601 if (crc != oh.data_crc) {
602 printk(KERN_ERR"LOGFS: compressed data crc error at "
603 "%llx: expected %x, got %x\n", ofs,
604 be32_to_cpu(oh.data_crc),
605 be32_to_cpu(crc));
606 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
607 goto out_err;
608 }
609 err = logfs_uncompress(compressor_buf, buf, len, block_len);
610 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
611 if (err) {
612 printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
613 goto out_err;
614 }
615 break;
616 default:
617 LOGFS_BUG(sb);
618 err = -EIO;
619 goto out_err;
620 }
621 return 0;
622
623out_err:
624 logfs_set_ro(sb);
625 printk(KERN_ERR"LOGFS: device is read-only now\n");
626 LOGFS_BUG(sb);
627 return err;
628}
629
630/**
631 * logfs_segment_read - read data block from object store
632 * @inode: inode containing data
633 * @page: page to read the object into
634 * @ofs: physical data offset
635 * @bix: block index
636 * @level: block level
637 *
638 * Returns 0 on success or a negative errno.
639 */
640int logfs_segment_read(struct inode *inode, struct page *page,
641 u64 ofs, u64 bix, level_t level)
642{
643 int err;
644 void *buf;
645
646 if (PageUptodate(page))
647 return 0;
648
649 ofs &= ~LOGFS_FULLY_POPULATED;
650
651 buf = kmap(page);
652 err = __logfs_segment_read(inode, buf, ofs, bix, level);
653 if (!err) {
654 move_btree_to_page(inode, page, buf);
655 SetPageUptodate(page);
656 }
657 kunmap(page);
658 log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
659 inode->i_ino, bix, level, ofs, err);
660 return err;
661}
662
663int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
664{
665 struct super_block *sb = inode->i_sb;
666 struct logfs_super *super = logfs_super(sb);
667 struct logfs_object_header h;
668 u16 len;
669 int err;
670
671 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
672 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
673 BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
674 if (!shadow->old_ofs)
675 return 0;
676
677 log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
678 shadow->ino, shadow->bix, shadow->gc_level,
679 shadow->old_ofs, shadow->new_ofs,
680 shadow->old_len, shadow->new_len);
681 err = read_obj_header(sb, shadow->old_ofs, &h);
682 LOGFS_BUG_ON(err, sb);
683 LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
684 LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
685 shrink_level(shadow->gc_level)), sb);
686
687 if (shadow->gc_level == 0)
688 len = be16_to_cpu(h.len);
689 else
690 len = obj_len(sb, h.type);
691 shadow->old_len = len + sizeof(h);
692 return 0;
693}
694
695void freeseg(struct super_block *sb, u32 segno)
696{
697 struct logfs_super *super = logfs_super(sb);
698 struct address_space *mapping = super->s_mapping_inode->i_mapping;
699 struct page *page;
700 u64 ofs, start, end;
701
702 start = dev_ofs(sb, segno, 0);
703 end = dev_ofs(sb, segno + 1, 0);
704 for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
705 page = find_get_page(mapping, ofs >> PAGE_SHIFT);
706 if (!page)
707 continue;
708 ClearPagePrivate(page);
709 page_cache_release(page);
710 }
711}
712
713int logfs_open_area(struct logfs_area *area, size_t bytes)
714{
715 struct super_block *sb = area->a_sb;
716 struct logfs_super *super = logfs_super(sb);
717 int err, closed = 0;
718
719 if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
720 return 0;
721
722 if (area->a_is_open) {
723 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
724 u32 len = super->s_segsize - area->a_written_bytes;
725
726 log_gc("logfs_close_area(%x)\n", area->a_segno);
727 pad_wbuf(area, 1);
728 super->s_devops->writeseg(area->a_sb, ofs, len);
729 freeseg(sb, area->a_segno);
730 closed = 1;
731 }
732
733 area->a_used_bytes = 0;
734 area->a_written_bytes = 0;
735again:
736 area->a_ops->get_free_segment(area);
737 area->a_ops->get_erase_count(area);
738
739 log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
740 err = area->a_ops->erase_segment(area);
741 if (err) {
742 printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
743 area->a_segno);
744 logfs_mark_segment_bad(sb, area->a_segno);
745 goto again;
746 }
747 area->a_is_open = 1;
748 return closed;
749}
750
751void logfs_sync_area(struct logfs_area *area)
752{
753 struct super_block *sb = area->a_sb;
754 struct logfs_super *super = logfs_super(sb);
755 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
756 u32 len = (area->a_used_bytes - area->a_written_bytes);
757
758 if (super->s_writesize)
759 len &= ~(super->s_writesize - 1);
760 if (len == 0)
761 return;
762 pad_wbuf(area, 0);
763 super->s_devops->writeseg(sb, ofs, len);
764 area->a_written_bytes += len;
765}
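/*
 * Editor's sketch, not part of the original file: the write-size
 * alignment above. On flash with an assumed 256-byte write unit, 1000
 * pending bytes are flushed as 768; the remaining 232 stay buffered
 * until the next sync or until pad_wbuf() pads them out at segment
 * close.
 */
#if 0
#include <assert.h>

int main(void)
{
	unsigned writesize = 256;	/* assumed device write unit */
	unsigned pending = 1000;
	unsigned len = pending & ~(writesize - 1);

	assert(len == 768);
	assert(pending - len == 232);
	return 0;
}
#endif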
766
767void logfs_sync_segments(struct super_block *sb)
768{
769 struct logfs_super *super = logfs_super(sb);
770 int i;
771
772 for_each_area(i)
773 logfs_sync_area(super->s_area[i]);
774}
775
776/*
777 * Pick a free segment to be used for this area. Effectively takes a
778 * candidate from the free list (not really a candidate anymore).
779 */
780static void ostore_get_free_segment(struct logfs_area *area)
781{
782 struct super_block *sb = area->a_sb;
783 struct logfs_super *super = logfs_super(sb);
784
785 if (super->s_free_list.count == 0) {
786 printk(KERN_ERR"LOGFS: ran out of free segments\n");
787 LOGFS_BUG(sb);
788 }
789
790 area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
791}
792
793static void ostore_get_erase_count(struct logfs_area *area)
794{
795 struct logfs_segment_entry se;
796 u32 ec_level;
797
798 logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
799 BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
800 se.valid == cpu_to_be32(RESERVED));
801
802 ec_level = be32_to_cpu(se.ec_level);
803 area->a_erase_count = (ec_level >> 4) + 1;
804}
805
806static int ostore_erase_segment(struct logfs_area *area)
807{
808 struct super_block *sb = area->a_sb;
809 struct logfs_segment_header sh;
810 u64 ofs;
811 int err;
812
813 err = logfs_erase_segment(sb, area->a_segno, 0);
814 if (err)
815 return err;
816
817 sh.pad = 0;
818 sh.type = SEG_OSTORE;
819 sh.level = (__force u8)area->a_level;
820 sh.segno = cpu_to_be32(area->a_segno);
821 sh.ec = cpu_to_be32(area->a_erase_count);
822 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
823 sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
824
825 logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
826 area->a_level);
827
828 ofs = dev_ofs(sb, area->a_segno, 0);
829 area->a_used_bytes = sizeof(sh);
830 logfs_buf_write(area, ofs, &sh, sizeof(sh));
831 return 0;
832}
833
834static const struct logfs_area_ops ostore_area_ops = {
835 .get_free_segment = ostore_get_free_segment,
836 .get_erase_count = ostore_get_erase_count,
837 .erase_segment = ostore_erase_segment,
838};
839
840static void free_area(struct logfs_area *area)
841{
842 if (area)
843 freeseg(area->a_sb, area->a_segno);
844 kfree(area);
845}
846
847static struct logfs_area *alloc_area(struct super_block *sb)
848{
849 struct logfs_area *area;
850
851 area = kzalloc(sizeof(*area), GFP_KERNEL);
852 if (!area)
853 return NULL;
854
855 area->a_sb = sb;
856 return area;
857}
858
859static void map_invalidatepage(struct page *page, unsigned long l)
860{
861 BUG();
862}
863
864static int map_releasepage(struct page *page, gfp_t g)
865{
866 /* Don't release these pages */
867 return 0;
868}
869
870static const struct address_space_operations mapping_aops = {
871 .invalidatepage = map_invalidatepage,
872 .releasepage = map_releasepage,
873 .set_page_dirty = __set_page_dirty_nobuffers,
874};
875
876int logfs_init_mapping(struct super_block *sb)
877{
878 struct logfs_super *super = logfs_super(sb);
879 struct address_space *mapping;
880 struct inode *inode;
881
882 inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
883 if (IS_ERR(inode))
884 return PTR_ERR(inode);
885 super->s_mapping_inode = inode;
886 mapping = inode->i_mapping;
887 mapping->a_ops = &mapping_aops;
888 /* Would it be possible to use __GFP_HIGHMEM as well? */
889 mapping_set_gfp_mask(mapping, GFP_NOFS);
890 return 0;
891}
892
893int logfs_init_areas(struct super_block *sb)
894{
895 struct logfs_super *super = logfs_super(sb);
896 int i = -1;
897
898 super->s_alias_pool = mempool_create_kmalloc_pool(600,
899 sizeof(struct object_alias_item));
900 if (!super->s_alias_pool)
901 return -ENOMEM;
902
903 super->s_journal_area = alloc_area(sb);
904 if (!super->s_journal_area)
905 goto err;
906
907 for_each_area(i) {
908 super->s_area[i] = alloc_area(sb);
909 if (!super->s_area[i])
910 goto err;
911 super->s_area[i]->a_level = GC_LEVEL(i);
912 super->s_area[i]->a_ops = &ostore_area_ops;
913 }
914 btree_init_mempool128(&super->s_object_alias_tree,
915 super->s_btree_pool);
916 return 0;
917
918err:
919 for (i--; i >= 0; i--)
920 free_area(super->s_area[i]);
921 free_area(super->s_journal_area);
922 mempool_destroy(super->s_alias_pool);
923 return -ENOMEM;
924}
925
926void logfs_cleanup_areas(struct super_block *sb)
927{
928 struct logfs_super *super = logfs_super(sb);
929 int i;
930
931 btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
932 for_each_area(i)
933 free_area(super->s_area[i]);
934 free_area(super->s_journal_area);
935 destroy_meta_inode(super->s_mapping_inode);
936}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
new file mode 100644
index 000000000000..b60bfac3263c
--- /dev/null
+++ b/fs/logfs/super.c
@@ -0,0 +1,650 @@
1/*
2 * fs/logfs/super.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Generally contains mount/umount code and also serves as a dump area for
 9 * any functions that don't fit elsewhere and don't justify a file of their
10 * own.
11 */
12#include "logfs.h"
13#include <linux/bio.h>
14#include <linux/slab.h>
15#include <linux/mtd/mtd.h>
16#include <linux/statfs.h>
17#include <linux/buffer_head.h>
18
19static DEFINE_MUTEX(emergency_mutex);
20static struct page *emergency_page;
21
22struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
23{
24 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
25 struct page *page;
26 int err;
27
28 page = read_cache_page(mapping, index, filler, NULL);
 29	if (!IS_ERR(page))
30 return page;
31
32 /* No more pages available, switch to emergency page */
33 printk(KERN_INFO"Logfs: Using emergency page\n");
34 mutex_lock(&emergency_mutex);
35 err = filler(NULL, emergency_page);
36 if (err) {
37 mutex_unlock(&emergency_mutex);
38 printk(KERN_EMERG"Logfs: Error reading emergency page\n");
39 return ERR_PTR(err);
40 }
41 return emergency_page;
42}
43
44void emergency_read_end(struct page *page)
45{
46 if (page == emergency_page)
47 mutex_unlock(&emergency_mutex);
48 else
49 page_cache_release(page);
50}
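/*
 * A minimal userspace sketch of the fallback pattern above (illustrative
 * names, not logfs code): one buffer preallocated at startup plus a mutex
 * guarantees forward progress when the allocator fails, and the release
 * path must tell the emergency buffer apart from a normal one, exactly as
 * emergency_read_end() does.
 * Build: cc -pthread fallback_demo.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t emergency_mutex = PTHREAD_MUTEX_INITIALIZER;
static char emergency_buf[4096];	/* allocated once, up front */

static char *buf_get(void)
{
	char *buf = malloc(4096);

	if (buf)
		return buf;
	/* No memory left: serialize all users through the emergency buffer */
	pthread_mutex_lock(&emergency_mutex);
	return emergency_buf;
}

static void buf_put(char *buf)
{
	if (buf == emergency_buf)
		pthread_mutex_unlock(&emergency_mutex);
	else
		free(buf);
}

int main(void)
{
	char *p = buf_get();

	snprintf(p, 4096, "hello");
	printf("%s via %s buffer\n", p,
	       p == emergency_buf ? "emergency" : "heap");
	buf_put(p);
	return 0;
}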
51
52static void dump_segfile(struct super_block *sb)
53{
54 struct logfs_super *super = logfs_super(sb);
55 struct logfs_segment_entry se;
56 u32 segno;
57
58 for (segno = 0; segno < super->s_no_segs; segno++) {
59 logfs_get_segment_entry(sb, segno, &se);
60 printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
61 be32_to_cpu(se.valid));
62 if (++segno < super->s_no_segs) {
63 logfs_get_segment_entry(sb, segno, &se);
64 printk(" %6x %8x", be32_to_cpu(se.ec_level),
65 be32_to_cpu(se.valid));
66 }
67 if (++segno < super->s_no_segs) {
68 logfs_get_segment_entry(sb, segno, &se);
69 printk(" %6x %8x", be32_to_cpu(se.ec_level),
70 be32_to_cpu(se.valid));
71 }
72 if (++segno < super->s_no_segs) {
73 logfs_get_segment_entry(sb, segno, &se);
74 printk(" %6x %8x", be32_to_cpu(se.ec_level),
75 be32_to_cpu(se.valid));
76 }
77 printk("\n");
78 }
79}
80
81/*
82 * logfs_crash_dump - dump debug information to device
83 *
84 * The LogFS superblock only occupies part of a segment. This function will
85 * write as much debug information as it can gather into the spare space.
86 */
87void logfs_crash_dump(struct super_block *sb)
88{
89 dump_segfile(sb);
90}
91
92/*
93 * TODO: move to lib/string.c
94 */
95/**
 96 * memchr_inv - Find an unmatched character in an area of memory.
97 * @s: The memory area
98 * @c: The byte to search for
99 * @n: The size of the area.
100 *
101 * returns the address of the first character other than @c, or %NULL
102 * if the whole buffer contains just @c.
103 */
104void *memchr_inv(const void *s, int c, size_t n)
105{
106 const unsigned char *p = s;
107 while (n-- != 0)
108 if ((unsigned char)c != *p++)
109 return (void *)(p - 1);
110
111 return NULL;
112}
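/*
 * A userspace sketch of how memchr_inv() is typically used (the 0xFF scan
 * is an illustrative example, plausibly why the helper exists here): flash
 * erases to all-0xFF, so scanning for a byte that is *not* 0xFF tells you
 * whether a segment is really clean, and where the first dirty byte sits.
 * Build: cc memchr_inv_demo.c
 */
#include <stdio.h>
#include <string.h>

static void *memchr_inv(const void *s, int c, size_t n)
{
	const unsigned char *p = s;

	while (n-- != 0)
		if ((unsigned char)c != *p++)
			return (void *)(p - 1);
	return NULL;
}

int main(void)
{
	unsigned char seg[64];
	void *hit;

	memset(seg, 0xFF, sizeof(seg));
	printf("clean segment: %p\n", memchr_inv(seg, 0xFF, sizeof(seg)));

	seg[17] = 0x00;
	hit = memchr_inv(seg, 0xFF, sizeof(seg));
	printf("first dirty byte at offset %td\n",
	       (unsigned char *)hit - seg);	/* prints 17 */
	return 0;
}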
113
114/*
115 * FIXME: There should be a reserve for root, similar to ext2.
116 */
117int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
118{
119 struct super_block *sb = dentry->d_sb;
120 struct logfs_super *super = logfs_super(sb);
121
122 stats->f_type = LOGFS_MAGIC_U32;
123 stats->f_bsize = sb->s_blocksize;
124 stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3;
125 stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits;
126 stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits;
127 stats->f_files = 0;
128 stats->f_ffree = 0;
129 stats->f_namelen = LOGFS_MAX_NAMELEN;
130 return 0;
131}
132
133static int logfs_sb_set(struct super_block *sb, void *_super)
134{
135 struct logfs_super *super = _super;
136
137 sb->s_fs_info = super;
138 sb->s_mtd = super->s_mtd;
139 sb->s_bdev = super->s_bdev;
140 return 0;
141}
142
143static int logfs_sb_test(struct super_block *sb, void *_super)
144{
145 struct logfs_super *super = _super;
146 struct mtd_info *mtd = super->s_mtd;
147
148 if (mtd && sb->s_mtd == mtd)
149 return 1;
150 if (super->s_bdev && sb->s_bdev == super->s_bdev)
151 return 1;
152 return 0;
153}
154
155static void set_segment_header(struct logfs_segment_header *sh, u8 type,
156 u8 level, u32 segno, u32 ec)
157{
158 sh->pad = 0;
159 sh->type = type;
160 sh->level = level;
161 sh->segno = cpu_to_be32(segno);
162 sh->ec = cpu_to_be32(ec);
163 sh->gec = cpu_to_be64(segno);
164 sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
165}
166
167static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
168 u32 segno, u32 ec)
169{
170 struct logfs_super *super = logfs_super(sb);
171 struct logfs_segment_header *sh = &ds->ds_sh;
172 int i;
173
174 memset(ds, 0, sizeof(*ds));
175 set_segment_header(sh, SEG_SUPER, 0, segno, ec);
176
177 ds->ds_ifile_levels = super->s_ifile_levels;
178 ds->ds_iblock_levels = super->s_iblock_levels;
179 ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */
180 ds->ds_segment_shift = super->s_segshift;
181 ds->ds_block_shift = sb->s_blocksize_bits;
182 ds->ds_write_shift = super->s_writeshift;
183 ds->ds_filesystem_size = cpu_to_be64(super->s_size);
184 ds->ds_segment_size = cpu_to_be32(super->s_segsize);
185 ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve);
186 ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat);
 187	ds->ds_feature_ro_compat = cpu_to_be64(super->s_feature_ro_compat);
188 ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat);
189 ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags);
190 ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve);
191 ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve);
192 journal_for_each(i)
193 ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
194 ds->ds_magic = cpu_to_be64(LOGFS_MAGIC);
195 ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
196 LOGFS_SEGMENT_HEADERSIZE + 12);
197}
198
199static int write_one_sb(struct super_block *sb,
200 struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
201{
202 struct logfs_super *super = logfs_super(sb);
203 struct logfs_disk_super *ds;
204 struct logfs_segment_entry se;
205 struct page *page;
206 u64 ofs;
207 u32 ec, segno;
208 int err;
209
210 page = find_sb(sb, &ofs);
211 if (!page)
212 return -EIO;
213 ds = page_address(page);
214 segno = seg_no(sb, ofs);
215 logfs_get_segment_entry(sb, segno, &se);
216 ec = be32_to_cpu(se.ec_level) >> 4;
217 ec++;
218 logfs_set_segment_erased(sb, segno, ec, 0);
219 logfs_write_ds(sb, ds, segno, ec);
220 err = super->s_devops->write_sb(sb, page);
221 page_cache_release(page);
222 return err;
223}
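/*
 * The "be32_to_cpu(se.ec_level) >> 4" above implies that a segment entry
 * packs its erase count in the upper 28 bits and the GC level in the low
 * 4. A hedged sketch of that packing; the helper names are hypothetical,
 * not logfs API.
 * Build: cc ec_level_demo.c
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t pack_ec_level(uint32_t ec, uint8_t level)
{
	return (ec << 4) | (level & 0xf);
}

static uint32_t unpack_ec(uint32_t v)	 { return v >> 4; }
static uint8_t	unpack_level(uint32_t v) { return v & 0xf; }

int main(void)
{
	uint32_t v = pack_ec_level(1000, 3);

	assert(unpack_ec(v) == 1000 && unpack_level(v) == 3);
	/* write_one_sb() bumps the erase count and resets the level to 0: */
	v = pack_ec_level(unpack_ec(v) + 1, 0);
	printf("ec=%u level=%u\n", unpack_ec(v), unpack_level(v));
	return 0;
}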
224
225int logfs_write_sb(struct super_block *sb)
226{
227 struct logfs_super *super = logfs_super(sb);
228 int err;
229
230 /* First superblock */
231 err = write_one_sb(sb, super->s_devops->find_first_sb);
232 if (err)
233 return err;
234
235 /* Last superblock */
236 err = write_one_sb(sb, super->s_devops->find_last_sb);
237 if (err)
238 return err;
239 return 0;
240}
241
242static int ds_cmp(const void *ds0, const void *ds1)
243{
244 size_t len = sizeof(struct logfs_disk_super);
245
246 /* We know the segment headers differ, so ignore them */
247 len -= LOGFS_SEGMENT_HEADERSIZE;
248 ds0 += LOGFS_SEGMENT_HEADERSIZE;
249 ds1 += LOGFS_SEGMENT_HEADERSIZE;
250 return memcmp(ds0, ds1, len);
251}
252
253static int logfs_recover_sb(struct super_block *sb)
254{
255 struct logfs_super *super = logfs_super(sb);
256 struct logfs_disk_super _ds0, *ds0 = &_ds0;
257 struct logfs_disk_super _ds1, *ds1 = &_ds1;
258 int err, valid0, valid1;
259
260 /* read first superblock */
261 err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
262 if (err)
263 return err;
264 /* read last superblock */
265 err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
266 if (err)
267 return err;
268 valid0 = logfs_check_ds(ds0) == 0;
269 valid1 = logfs_check_ds(ds1) == 0;
270
271 if (!valid0 && valid1) {
272 printk(KERN_INFO"First superblock is invalid - fixing.\n");
273 return write_one_sb(sb, super->s_devops->find_first_sb);
274 }
275 if (valid0 && !valid1) {
276 printk(KERN_INFO"Last superblock is invalid - fixing.\n");
277 return write_one_sb(sb, super->s_devops->find_last_sb);
278 }
279 if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
280 printk(KERN_INFO"Superblocks don't match - fixing.\n");
281 return logfs_write_sb(sb);
282 }
283 /* If neither is valid now, something's wrong. Didn't we properly
284 * check them before?!? */
285 BUG_ON(!valid0 && !valid1);
286 return 0;
287}
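/*
 * The repair policy of logfs_recover_sb(), condensed into one pure
 * function (a hedged summary sketch, same decisions as the code above):
 * Build: cc recover_demo.c
 */
#include <stdio.h>

static const char *recover_action(int valid0, int valid1, int differ)
{
	if (!valid0 && valid1)
		return "rewrite first copy";
	if (valid0 && !valid1)
		return "rewrite last copy";
	if (valid0 && valid1 && differ)
		return "rewrite both copies";
	if (valid0 && valid1)
		return "nothing to do";
	return "BUG: both invalid";	/* earlier mount checks rule this out */
}

int main(void)
{
	printf("%s\n", recover_action(0, 1, 0));
	printf("%s\n", recover_action(1, 1, 1));
	printf("%s\n", recover_action(1, 1, 0));
	return 0;
}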
288
289static int logfs_make_writeable(struct super_block *sb)
290{
291 int err;
292
293 err = logfs_open_segfile(sb);
294 if (err)
295 return err;
296
297 /* Repair any broken superblock copies */
298 err = logfs_recover_sb(sb);
299 if (err)
300 return err;
301
302 /* Check areas for trailing unaccounted data */
303 err = logfs_check_areas(sb);
304 if (err)
305 return err;
306
307 /* Do one GC pass before any data gets dirtied */
308 logfs_gc_pass(sb);
309
310 /* after all initializations are done, replay the journal
311 * for rw-mounts, if necessary */
312 err = logfs_replay_journal(sb);
313 if (err)
314 return err;
315
316 return 0;
317}
318
319static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
320{
321 struct logfs_super *super = logfs_super(sb);
322 struct inode *rootdir;
323 int err;
324
325 /* root dir */
326 rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
327 if (IS_ERR(rootdir))
328 goto fail;
329
330 sb->s_root = d_alloc_root(rootdir);
331 if (!sb->s_root)
332 goto fail2;
333
334 super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
335 if (!super->s_erase_page)
336 goto fail2;
337 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
338
339 /* FIXME: check for read-only mounts */
340 err = logfs_make_writeable(sb);
341 if (err)
342 goto fail3;
343
344 log_super("LogFS: Finished mounting\n");
345 simple_set_mnt(mnt, sb);
346 return 0;
347
348fail3:
349 __free_page(super->s_erase_page);
350fail2:
351 iput(rootdir);
352fail:
353 iput(logfs_super(sb)->s_master_inode);
354 return -EIO;
355}
356
357int logfs_check_ds(struct logfs_disk_super *ds)
358{
359 struct logfs_segment_header *sh = &ds->ds_sh;
360
361 if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
362 return -EINVAL;
363 if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
364 return -EINVAL;
365 if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
366 LOGFS_SEGMENT_HEADERSIZE + 12))
367 return -EINVAL;
368 return 0;
369}
370
371static struct page *find_super_block(struct super_block *sb)
372{
373 struct logfs_super *super = logfs_super(sb);
374 struct page *first, *last;
375
376 first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
377 if (!first || IS_ERR(first))
378 return NULL;
379 last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
 380	if (!last || IS_ERR(last)) {
381 page_cache_release(first);
382 return NULL;
383 }
384
385 if (!logfs_check_ds(page_address(first))) {
386 page_cache_release(last);
387 return first;
388 }
389
390 /* First one didn't work, try the second superblock */
391 if (!logfs_check_ds(page_address(last))) {
392 page_cache_release(first);
393 return last;
394 }
395
396 /* Neither worked, sorry folks */
397 page_cache_release(first);
398 page_cache_release(last);
399 return NULL;
400}
401
402static int __logfs_read_sb(struct super_block *sb)
403{
404 struct logfs_super *super = logfs_super(sb);
405 struct page *page;
406 struct logfs_disk_super *ds;
407 int i;
408
409 page = find_super_block(sb);
410 if (!page)
411 return -EIO;
412
413 ds = page_address(page);
414 super->s_size = be64_to_cpu(ds->ds_filesystem_size);
415 super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
416 super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
417 super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
418 super->s_segsize = 1 << ds->ds_segment_shift;
419 super->s_segmask = (1 << ds->ds_segment_shift) - 1;
420 super->s_segshift = ds->ds_segment_shift;
421 sb->s_blocksize = 1 << ds->ds_block_shift;
422 sb->s_blocksize_bits = ds->ds_block_shift;
423 super->s_writesize = 1 << ds->ds_write_shift;
424 super->s_writeshift = ds->ds_write_shift;
425 super->s_no_segs = super->s_size >> super->s_segshift;
426 super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
427 super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
428 super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
429 super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
430 super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);
431
432 journal_for_each(i)
433 super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);
434
435 super->s_ifile_levels = ds->ds_ifile_levels;
436 super->s_iblock_levels = ds->ds_iblock_levels;
437 super->s_data_levels = ds->ds_data_levels;
438 super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
439 + super->s_data_levels;
440 page_cache_release(page);
441 return 0;
442}
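/*
 * Worked example of the shift-derived geometry read above, for a
 * hypothetical 64 MiB device with 128 KiB segments (ds_segment_shift =
 * 17) and 4 KiB blocks (ds_block_shift = 12):
 * Build: cc geometry_demo.c
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t size = 64ULL << 20;		/* ds_filesystem_size */
	int segshift = 17;
	int blockshift = 12;

	uint32_t segsize = 1u << segshift;	/* 131072 */
	uint32_t segmask = segsize - 1;		/* 0x1ffff */
	uint64_t no_segs = size >> segshift;	/* 512 segments */
	uint32_t no_blocks = segsize >> blockshift; /* 32 blocks/segment */

	printf("segsize=%u segmask=%#x no_segs=%llu blocks/seg=%u\n",
	       segsize, segmask, (unsigned long long)no_segs, no_blocks);
	return 0;
}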
443
444static int logfs_read_sb(struct super_block *sb, int read_only)
445{
446 struct logfs_super *super = logfs_super(sb);
447 int ret;
448
449 super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
450 if (!super->s_btree_pool)
451 return -ENOMEM;
452
453 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
454 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
455
456 ret = logfs_init_mapping(sb);
457 if (ret)
458 return ret;
459
460 ret = __logfs_read_sb(sb);
461 if (ret)
462 return ret;
463
464 if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT)
465 return -EIO;
466 if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) &&
467 !read_only)
468 return -EIO;
469
470 mutex_init(&super->s_dirop_mutex);
471 mutex_init(&super->s_object_alias_mutex);
472 INIT_LIST_HEAD(&super->s_freeing_list);
473
474 ret = logfs_init_rw(sb);
475 if (ret)
476 return ret;
477
478 ret = logfs_init_areas(sb);
479 if (ret)
480 return ret;
481
482 ret = logfs_init_gc(sb);
483 if (ret)
484 return ret;
485
486 ret = logfs_init_journal(sb);
487 if (ret)
488 return ret;
489
490 return 0;
491}
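/*
 * The two feature checks above follow the ext2 convention: unknown
 * incompat bits make the filesystem unmountable, unknown ro_compat bits
 * only forbid a read/write mount, and unknown compat bits are ignored.
 * A sketch with made-up mask values (not the real LOGFS_FEATURES_*):
 * Build: cc features_demo.c
 */
#include <stdint.h>
#include <stdio.h>

#define KNOWN_INCOMPAT	0x0001ULL
#define KNOWN_RO_COMPAT	0x0003ULL

static int check_features(uint64_t incompat, uint64_t ro_compat, int ro)
{
	if (incompat & ~KNOWN_INCOMPAT)
		return -1;			/* refuse any mount */
	if ((ro_compat & ~KNOWN_RO_COMPAT) && !ro)
		return -1;			/* refuse only rw mounts */
	return 0;
}

int main(void)
{
	printf("rw with unknown ro bit: %d\n", check_features(0x1, 0x4, 0));
	printf("ro with unknown ro bit: %d\n", check_features(0x1, 0x4, 1));
	return 0;
}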
492
493static void logfs_kill_sb(struct super_block *sb)
494{
495 struct logfs_super *super = logfs_super(sb);
496
497 log_super("LogFS: Start unmounting\n");
498 /* Alias entries slow down mount, so evict as many as possible */
499 sync_filesystem(sb);
500 logfs_write_anchor(sb);
501
502 /*
503 * From this point on alias entries are simply dropped - and any
504 * writes to the object store are considered bugs.
505 */
506 super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
507 log_super("LogFS: Now in shutdown\n");
508 generic_shutdown_super(sb);
509
510 BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
511
512 logfs_cleanup_gc(sb);
513 logfs_cleanup_journal(sb);
514 logfs_cleanup_areas(sb);
515 logfs_cleanup_rw(sb);
516 if (super->s_erase_page)
517 __free_page(super->s_erase_page);
518 super->s_devops->put_device(sb);
519 mempool_destroy(super->s_btree_pool);
520 mempool_destroy(super->s_alias_pool);
521 kfree(super);
522 log_super("LogFS: Finished unmounting\n");
523}
524
525int logfs_get_sb_device(struct file_system_type *type, int flags,
526 struct mtd_info *mtd, struct block_device *bdev,
527 const struct logfs_device_ops *devops, struct vfsmount *mnt)
528{
529 struct logfs_super *super;
530 struct super_block *sb;
531 int err = -ENOMEM;
532 static int mount_count;
533
534 log_super("LogFS: Start mount %x\n", mount_count++);
535 super = kzalloc(sizeof(*super), GFP_KERNEL);
536 if (!super)
537 goto err0;
538
539 super->s_mtd = mtd;
540 super->s_bdev = bdev;
541 err = -EINVAL;
542 sb = sget(type, logfs_sb_test, logfs_sb_set, super);
543 if (IS_ERR(sb))
544 goto err0;
545
546 if (sb->s_root) {
547 /* Device is already in use */
548 err = 0;
549 simple_set_mnt(mnt, sb);
550 goto err0;
551 }
552
553 super->s_devops = devops;
554
555 /*
556 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
557 * only covers 16TB and the upper 8TB are used for indirect blocks.
 558	 * On 64bit systems we could bump up the limit, but that would make
559 * the filesystem incompatible with 32bit systems.
560 */
561 sb->s_maxbytes = (1ull << 43) - 1;
562 sb->s_op = &logfs_super_operations;
563 sb->s_flags = flags | MS_NOATIME;
564
565 err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY);
566 if (err)
567 goto err1;
568
569 sb->s_flags |= MS_ACTIVE;
570 err = logfs_get_sb_final(sb, mnt);
571 if (err)
572 goto err1;
573 return 0;
574
575err1:
576 deactivate_locked_super(sb);
577 return err;
578err0:
579 kfree(super);
 580	/* devops->put_device(sb); */
581 return err;
582}
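/*
 * The s_maxbytes comment in logfs_get_sb_device() above, spelled out:
 * 2^43 bytes is 8 TiB, and a 32-bit pgoff_t with 4 KiB pages spans
 * 2^32 * 2^12 = 2^44 bytes = 16 TiB of page cache, so an 8 TiB file
 * leaves the upper half of the index space for indirect blocks.
 * Build: cc maxbytes_demo.c
 */
#include <stdio.h>

int main(void)
{
	unsigned long long maxbytes = (1ULL << 43) - 1;
	unsigned long long cache_span = 1ULL << (32 + 12);

	printf("s_maxbytes   = %llu (%llu TiB - 1 byte)\n",
	       maxbytes, (maxbytes + 1) >> 40);
	printf("32-bit cache = %llu TiB\n", cache_span >> 40);
	return 0;
}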
583
584static int logfs_get_sb(struct file_system_type *type, int flags,
585 const char *devname, void *data, struct vfsmount *mnt)
586{
587 ulong mtdnr;
588
589 if (!devname)
590 return logfs_get_sb_bdev(type, flags, devname, mnt);
591 if (strncmp(devname, "mtd", 3))
592 return logfs_get_sb_bdev(type, flags, devname, mnt);
593
594 {
595 char *garbage;
596 mtdnr = simple_strtoul(devname+3, &garbage, 0);
597 if (*garbage)
598 return -EINVAL;
599 }
600
601 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
602}
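/*
 * A userspace sketch of the device-name dispatch above (illustrative
 * names): "mtd" followed purely by digits selects an MTD number via
 * strtoul, any trailing garbage is rejected just like the *garbage check,
 * and everything else falls through to the block-device path.
 * Build: cc devname_demo.c
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Returns 1 for an MTD name (storing the number), 0 for a block device
 * path, -1 for a malformed MTD name. */
static int parse_devname(const char *devname, unsigned long *mtdnr)
{
	char *end;

	if (!devname || strncmp(devname, "mtd", 3))
		return 0;
	*mtdnr = strtoul(devname + 3, &end, 0);
	if (*end)
		return -1;
	return 1;
}

int main(void)
{
	unsigned long nr = 0;

	printf("/dev/sdb1 -> %d\n", parse_devname("/dev/sdb1", &nr));
	printf("mtd2      -> %d (nr=%lu)\n", parse_devname("mtd2", &nr), nr);
	printf("mtd2x     -> %d\n", parse_devname("mtd2x", &nr));
	return 0;
}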
603
604static struct file_system_type logfs_fs_type = {
605 .owner = THIS_MODULE,
606 .name = "logfs",
607 .get_sb = logfs_get_sb,
608 .kill_sb = logfs_kill_sb,
609 .fs_flags = FS_REQUIRES_DEV,
610
611};
612
613static int __init logfs_init(void)
614{
615 int ret;
616
617 emergency_page = alloc_pages(GFP_KERNEL, 0);
618 if (!emergency_page)
619 return -ENOMEM;
620
621 ret = logfs_compr_init();
622 if (ret)
623 goto out1;
624
625 ret = logfs_init_inode_cache();
626 if (ret)
627 goto out2;
628
629 return register_filesystem(&logfs_fs_type);
630out2:
631 logfs_compr_exit();
632out1:
633 __free_pages(emergency_page, 0);
634 return ret;
635}
636
637static void __exit logfs_exit(void)
638{
639 unregister_filesystem(&logfs_fs_type);
640 logfs_destroy_inode_cache();
641 logfs_compr_exit();
642 __free_pages(emergency_page, 0);
643}
644
645module_init(logfs_init);
646module_exit(logfs_exit);
647
648MODULE_LICENSE("GPL v2");
649MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
650MODULE_DESCRIPTION("scalable flash filesystem");
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 74ea82d72164..756f8c93780c 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -17,8 +17,10 @@
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/highuid.h> 18#include <linux/highuid.h>
19#include <linux/vfs.h> 19#include <linux/vfs.h>
20#include <linux/writeback.h>
20 21
21static int minix_write_inode(struct inode * inode, int wait); 22static int minix_write_inode(struct inode *inode,
23 struct writeback_control *wbc);
22static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); 24static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
23static int minix_remount (struct super_block * sb, int * flags, char * data); 25static int minix_remount (struct super_block * sb, int * flags, char * data);
24 26
@@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
552 return bh; 554 return bh;
553} 555}
554 556
555static int minix_write_inode(struct inode *inode, int wait) 557static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
556{ 558{
557 int err = 0; 559 int err = 0;
558 struct buffer_head *bh; 560 struct buffer_head *bh;
@@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait)
563 bh = V2_minix_update_inode(inode); 565 bh = V2_minix_update_inode(inode);
564 if (!bh) 566 if (!bh)
565 return -EIO; 567 return -EIO;
566 if (wait && buffer_dirty(bh)) { 568 if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) {
567 sync_dirty_buffer(bh); 569 sync_dirty_buffer(bh);
568 if (buffer_req(bh) && !buffer_uptodate(bh)) { 570 if (buffer_req(bh) && !buffer_uptodate(bh)) {
569 printk("IO error syncing minix inode [%s:%08lx]\n", 571 printk("IO error syncing minix inode [%s:%08lx]\n",
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 82d6554b02fe..282e15ad8cd8 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,4 +1,5 @@
1#include <linux/buffer_head.h> 1#include <linux/buffer_head.h>
2#include <linux/slab.h>
2#include "minix.h" 3#include "minix.h"
3 4
4enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */ 5enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */
diff --git a/fs/mpage.c b/fs/mpage.c
index 42381bd6543b..fd56ca2ea556 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -16,6 +16,7 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/kdev_t.h> 18#include <linux/kdev_t.h>
19#include <linux/gfp.h>
19#include <linux/bio.h> 20#include <linux/bio.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
@@ -561,7 +562,7 @@ page_is_mapped:
561 if (page->index >= end_index) { 562 if (page->index >= end_index) {
562 /* 563 /*
563 * The page straddles i_size. It must be zeroed out on each 564 * The page straddles i_size. It must be zeroed out on each
564 * and every writepage invokation because it may be mmapped. 565 * and every writepage invocation because it may be mmapped.
565 * "A file is mapped in multiples of the page size. For a file 566 * "A file is mapped in multiples of the page size. For a file
566 * that is not a multiple of the page size, the remaining memory 567 * that is not a multiple of the page size, the remaining memory
567 * is zeroed when mapped, and writes to that region are not 568 * is zeroed when mapped, and writes to that region are not
diff --git a/fs/namei.c b/fs/namei.c
index d11f404667e9..a7dce91a7e42 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -19,7 +19,6 @@
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/namei.h> 21#include <linux/namei.h>
22#include <linux/quotaops.h>
23#include <linux/pagemap.h> 22#include <linux/pagemap.h>
24#include <linux/fsnotify.h> 23#include <linux/fsnotify.h>
25#include <linux/personality.h> 24#include <linux/personality.h>
@@ -35,7 +34,7 @@
35#include <linux/fs_struct.h> 34#include <linux/fs_struct.h>
36#include <asm/uaccess.h> 35#include <asm/uaccess.h>
37 36
38#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) 37#include "internal.h"
39 38
40/* [Feb-1997 T. Schoebel-Theuer] 39/* [Feb-1997 T. Schoebel-Theuer]
41 * Fundamental changes in the pathname lookup mechanisms (namei) 40 * Fundamental changes in the pathname lookup mechanisms (namei)
@@ -108,8 +107,6 @@
108 * any extra contention... 107 * any extra contention...
109 */ 108 */
110 109
111static int __link_path_walk(const char *name, struct nameidata *nd);
112
113/* In order to reduce some races, while at the same time doing additional 110/* In order to reduce some races, while at the same time doing additional
114 * checking and hopefully speeding things up, we copy filenames to the 111 * checking and hopefully speeding things up, we copy filenames to the
115 * kernel data space before using them.. 112 * kernel data space before using them..
@@ -234,6 +231,7 @@ int generic_permission(struct inode *inode, int mask,
234 /* 231 /*
235 * Searching includes executable on directories, else just read. 232 * Searching includes executable on directories, else just read.
236 */ 233 */
234 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
237 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) 235 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
238 if (capable(CAP_DAC_READ_SEARCH)) 236 if (capable(CAP_DAC_READ_SEARCH))
239 return 0; 237 return 0;
@@ -414,36 +412,55 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
414} 412}
415 413
416/* 414/*
417 * Internal lookup() using the new generic dcache. 415 * force_reval_path - force revalidation of a dentry
418 * SMP-safe 416 *
417 * In some situations the path walking code will trust dentries without
418 * revalidating them. This causes problems for filesystems that depend on
419 * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
420 * (which indicates that it's possible for the dentry to go stale), force
421 * a d_revalidate call before proceeding.
422 *
423 * Returns 0 if the revalidation was successful. If the revalidation fails,
 424 * either return the error returned by d_revalidate or -ESTALE if
 425 * it returned 0. If d_revalidate returns 0, we attempt to
426 * invalidate the dentry. It's up to the caller to handle putting references
427 * to the path if necessary.
419 */ 428 */
420static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd) 429static int
430force_reval_path(struct path *path, struct nameidata *nd)
421{ 431{
422 struct dentry * dentry = __d_lookup(parent, name); 432 int status;
433 struct dentry *dentry = path->dentry;
423 434
424 /* lockess __d_lookup may fail due to concurrent d_move() 435 /*
425 * in some unrelated directory, so try with d_lookup 436 * only check on filesystems where it's possible for the dentry to
437 * become stale. It's assumed that if this flag is set then the
438 * d_revalidate op will also be defined.
426 */ 439 */
427 if (!dentry) 440 if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))
428 dentry = d_lookup(parent, name); 441 return 0;
429 442
430 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) 443 status = dentry->d_op->d_revalidate(dentry, nd);
431 dentry = do_revalidate(dentry, nd); 444 if (status > 0)
445 return 0;
432 446
433 return dentry; 447 if (!status) {
448 d_invalidate(dentry);
449 status = -ESTALE;
450 }
451 return status;
434} 452}
435 453
436/* 454/*
437 * Short-cut version of permission(), for calling by 455 * Short-cut version of permission(), for calling on directories
438 * path_walk(), when dcache lock is held. Combines parts 456 * during pathname resolution. Combines parts of permission()
439 * of permission() and generic_permission(), and tests ONLY for 457 * and generic_permission(), and tests ONLY for MAY_EXEC permission.
440 * MAY_EXEC permission.
441 * 458 *
442 * If appropriate, check DAC only. If not appropriate, or 459 * If appropriate, check DAC only. If not appropriate, or
443 * short-cut DAC fails, then call permission() to do more 460 * short-cut DAC fails, then call ->permission() to do more
444 * complete permission check. 461 * complete permission check.
445 */ 462 */
446static int exec_permission_lite(struct inode *inode) 463static int exec_permission(struct inode *inode)
447{ 464{
448 int ret; 465 int ret;
449 466
@@ -465,99 +482,6 @@ ok:
465 return security_inode_permission(inode, MAY_EXEC); 482 return security_inode_permission(inode, MAY_EXEC);
466} 483}
467 484
468/*
469 * This is called when everything else fails, and we actually have
470 * to go to the low-level filesystem to find out what we should do..
471 *
472 * We get the directory semaphore, and after getting that we also
473 * make sure that nobody added the entry to the dcache in the meantime..
474 * SMP-safe
475 */
476static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
477{
478 struct dentry * result;
479 struct inode *dir = parent->d_inode;
480
481 mutex_lock(&dir->i_mutex);
482 /*
483 * First re-do the cached lookup just in case it was created
484 * while we waited for the directory semaphore..
485 *
486 * FIXME! This could use version numbering or similar to
487 * avoid unnecessary cache lookups.
488 *
489 * The "dcache_lock" is purely to protect the RCU list walker
490 * from concurrent renames at this point (we mustn't get false
491 * negatives from the RCU list walk here, unlike the optimistic
492 * fast walk).
493 *
494 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
495 */
496 result = d_lookup(parent, name);
497 if (!result) {
498 struct dentry *dentry;
499
500 /* Don't create child dentry for a dead directory. */
501 result = ERR_PTR(-ENOENT);
502 if (IS_DEADDIR(dir))
503 goto out_unlock;
504
505 dentry = d_alloc(parent, name);
506 result = ERR_PTR(-ENOMEM);
507 if (dentry) {
508 result = dir->i_op->lookup(dir, dentry, nd);
509 if (result)
510 dput(dentry);
511 else
512 result = dentry;
513 }
514out_unlock:
515 mutex_unlock(&dir->i_mutex);
516 return result;
517 }
518
519 /*
520 * Uhhuh! Nasty case: the cache was re-populated while
521 * we waited on the semaphore. Need to revalidate.
522 */
523 mutex_unlock(&dir->i_mutex);
524 if (result->d_op && result->d_op->d_revalidate) {
525 result = do_revalidate(result, nd);
526 if (!result)
527 result = ERR_PTR(-ENOENT);
528 }
529 return result;
530}
531
532/*
533 * Wrapper to retry pathname resolution whenever the underlying
534 * file system returns an ESTALE.
535 *
536 * Retry the whole path once, forcing real lookup requests
537 * instead of relying on the dcache.
538 */
539static __always_inline int link_path_walk(const char *name, struct nameidata *nd)
540{
541 struct path save = nd->path;
542 int result;
543
544 /* make sure the stuff we saved doesn't go away */
545 path_get(&save);
546
547 result = __link_path_walk(name, nd);
548 if (result == -ESTALE) {
549 /* nd->path had been dropped */
550 nd->path = save;
551 path_get(&nd->path);
552 nd->flags |= LOOKUP_REVAL;
553 result = __link_path_walk(name, nd);
554 }
555
556 path_put(&save);
557
558 return result;
559}
560
561static __always_inline void set_root(struct nameidata *nd) 485static __always_inline void set_root(struct nameidata *nd)
562{ 486{
563 if (!nd->root.mnt) { 487 if (!nd->root.mnt) {
@@ -569,10 +493,10 @@ static __always_inline void set_root(struct nameidata *nd)
569 } 493 }
570} 494}
571 495
496static int link_path_walk(const char *, struct nameidata *);
497
572static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 498static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
573{ 499{
574 int res = 0;
575 char *name;
576 if (IS_ERR(link)) 500 if (IS_ERR(link))
577 goto fail; 501 goto fail;
578 502
@@ -583,22 +507,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
583 path_get(&nd->root); 507 path_get(&nd->root);
584 } 508 }
585 509
586 res = link_path_walk(link, nd); 510 return link_path_walk(link, nd);
587 if (nd->depth || res || nd->last_type!=LAST_NORM)
588 return res;
589 /*
590 * If it is an iterative symlinks resolution in open_namei() we
591 * have to copy the last component. And all that crap because of
592 * bloody create() on broken symlinks. Furrfu...
593 */
594 name = __getname();
595 if (unlikely(!name)) {
596 path_put(&nd->path);
597 return -ENOMEM;
598 }
599 strcpy(name, nd->last.name);
600 nd->last.name = name;
601 return 0;
602fail: 511fail:
603 path_put(&nd->path); 512 path_put(&nd->path);
604 return PTR_ERR(link); 513 return PTR_ERR(link);
@@ -620,10 +529,10 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
620 nd->path.dentry = path->dentry; 529 nd->path.dentry = path->dentry;
621} 530}
622 531
623static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd) 532static __always_inline int
533__do_follow_link(struct path *path, struct nameidata *nd, void **p)
624{ 534{
625 int error; 535 int error;
626 void *cookie;
627 struct dentry *dentry = path->dentry; 536 struct dentry *dentry = path->dentry;
628 537
629 touch_atime(path->mnt, dentry); 538 touch_atime(path->mnt, dentry);
@@ -634,18 +543,20 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
634 dget(dentry); 543 dget(dentry);
635 } 544 }
636 mntget(path->mnt); 545 mntget(path->mnt);
637 cookie = dentry->d_inode->i_op->follow_link(dentry, nd); 546 nd->last_type = LAST_BIND;
638 error = PTR_ERR(cookie); 547 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
639 if (!IS_ERR(cookie)) { 548 error = PTR_ERR(*p);
549 if (!IS_ERR(*p)) {
640 char *s = nd_get_link(nd); 550 char *s = nd_get_link(nd);
641 error = 0; 551 error = 0;
642 if (s) 552 if (s)
643 error = __vfs_follow_link(nd, s); 553 error = __vfs_follow_link(nd, s);
644 if (dentry->d_inode->i_op->put_link) 554 else if (nd->last_type == LAST_BIND) {
645 dentry->d_inode->i_op->put_link(dentry, nd, cookie); 555 error = force_reval_path(&nd->path, nd);
556 if (error)
557 path_put(&nd->path);
558 }
646 } 559 }
647 path_put(path);
648
649 return error; 560 return error;
650} 561}
651 562
@@ -658,6 +569,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
658 */ 569 */
659static inline int do_follow_link(struct path *path, struct nameidata *nd) 570static inline int do_follow_link(struct path *path, struct nameidata *nd)
660{ 571{
572 void *cookie;
661 int err = -ELOOP; 573 int err = -ELOOP;
662 if (current->link_count >= MAX_NESTED_LINKS) 574 if (current->link_count >= MAX_NESTED_LINKS)
663 goto loop; 575 goto loop;
@@ -671,7 +583,10 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
671 current->link_count++; 583 current->link_count++;
672 current->total_link_count++; 584 current->total_link_count++;
673 nd->depth++; 585 nd->depth++;
674 err = __do_follow_link(path, nd); 586 err = __do_follow_link(path, nd, &cookie);
587 if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
588 path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
589 path_put(path);
675 current->link_count--; 590 current->link_count--;
676 nd->depth--; 591 nd->depth--;
677 return err; 592 return err;
@@ -757,33 +672,20 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
757 set_root(nd); 672 set_root(nd);
758 673
759 while(1) { 674 while(1) {
760 struct vfsmount *parent;
761 struct dentry *old = nd->path.dentry; 675 struct dentry *old = nd->path.dentry;
762 676
763 if (nd->path.dentry == nd->root.dentry && 677 if (nd->path.dentry == nd->root.dentry &&
764 nd->path.mnt == nd->root.mnt) { 678 nd->path.mnt == nd->root.mnt) {
765 break; 679 break;
766 } 680 }
767 spin_lock(&dcache_lock);
768 if (nd->path.dentry != nd->path.mnt->mnt_root) { 681 if (nd->path.dentry != nd->path.mnt->mnt_root) {
769 nd->path.dentry = dget(nd->path.dentry->d_parent); 682 /* rare case of legitimate dget_parent()... */
770 spin_unlock(&dcache_lock); 683 nd->path.dentry = dget_parent(nd->path.dentry);
771 dput(old); 684 dput(old);
772 break; 685 break;
773 } 686 }
774 spin_unlock(&dcache_lock); 687 if (!follow_up(&nd->path))
775 spin_lock(&vfsmount_lock);
776 parent = nd->path.mnt->mnt_parent;
777 if (parent == nd->path.mnt) {
778 spin_unlock(&vfsmount_lock);
779 break; 688 break;
780 }
781 mntget(parent);
782 nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint);
783 spin_unlock(&vfsmount_lock);
784 dput(old);
785 mntput(nd->path.mnt);
786 nd->path.mnt = parent;
787 } 689 }
788 follow_mount(&nd->path); 690 follow_mount(&nd->path);
789} 691}
@@ -797,8 +699,19 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
797 struct path *path) 699 struct path *path)
798{ 700{
799 struct vfsmount *mnt = nd->path.mnt; 701 struct vfsmount *mnt = nd->path.mnt;
800 struct dentry *dentry = __d_lookup(nd->path.dentry, name); 702 struct dentry *dentry, *parent;
703 struct inode *dir;
704 /*
705 * See if the low-level filesystem might want
706 * to use its own hash..
707 */
708 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
709 int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);
710 if (err < 0)
711 return err;
712 }
801 713
714 dentry = __d_lookup(nd->path.dentry, name);
802 if (!dentry) 715 if (!dentry)
803 goto need_lookup; 716 goto need_lookup;
804 if (dentry->d_op && dentry->d_op->d_revalidate) 717 if (dentry->d_op && dentry->d_op->d_revalidate)
@@ -810,7 +723,59 @@ done:
810 return 0; 723 return 0;
811 724
812need_lookup: 725need_lookup:
813 dentry = real_lookup(nd->path.dentry, name, nd); 726 parent = nd->path.dentry;
727 dir = parent->d_inode;
728
729 mutex_lock(&dir->i_mutex);
730 /*
731 * First re-do the cached lookup just in case it was created
732 * while we waited for the directory semaphore..
733 *
734 * FIXME! This could use version numbering or similar to
735 * avoid unnecessary cache lookups.
736 *
737 * The "dcache_lock" is purely to protect the RCU list walker
738 * from concurrent renames at this point (we mustn't get false
739 * negatives from the RCU list walk here, unlike the optimistic
740 * fast walk).
741 *
742 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
743 */
744 dentry = d_lookup(parent, name);
745 if (!dentry) {
746 struct dentry *new;
747
748 /* Don't create child dentry for a dead directory. */
749 dentry = ERR_PTR(-ENOENT);
750 if (IS_DEADDIR(dir))
751 goto out_unlock;
752
753 new = d_alloc(parent, name);
754 dentry = ERR_PTR(-ENOMEM);
755 if (new) {
756 dentry = dir->i_op->lookup(dir, new, nd);
757 if (dentry)
758 dput(new);
759 else
760 dentry = new;
761 }
762out_unlock:
763 mutex_unlock(&dir->i_mutex);
764 if (IS_ERR(dentry))
765 goto fail;
766 goto done;
767 }
768
769 /*
770 * Uhhuh! Nasty case: the cache was re-populated while
771 * we waited on the semaphore. Need to revalidate.
772 */
773 mutex_unlock(&dir->i_mutex);
774 if (dentry->d_op && dentry->d_op->d_revalidate) {
775 dentry = do_revalidate(dentry, nd);
776 if (!dentry)
777 dentry = ERR_PTR(-ENOENT);
778 }
814 if (IS_ERR(dentry)) 779 if (IS_ERR(dentry))
815 goto fail; 780 goto fail;
816 goto done; 781 goto done;
@@ -828,6 +793,17 @@ fail:
828} 793}
829 794
830/* 795/*
796 * This is a temporary kludge to deal with "automount" symlinks; proper
797 * solution is to trigger them on follow_mount(), so that do_lookup()
798 * would DTRT. To be killed before 2.6.34-final.
799 */
800static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
801{
802 return inode && unlikely(inode->i_op->follow_link) &&
803 ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
804}
805
806/*
831 * Name resolution. 807 * Name resolution.
832 * This is the basic name resolution function, turning a pathname into 808 * This is the basic name resolution function, turning a pathname into
833 * the final dentry. We expect 'base' to be positive and a directory. 809 * the final dentry. We expect 'base' to be positive and a directory.
@@ -835,7 +811,7 @@ fail:
835 * Returns 0 and nd will have valid dentry and mnt on success. 811 * Returns 0 and nd will have valid dentry and mnt on success.
836 * Returns error and drops reference to input namei data on failure. 812 * Returns error and drops reference to input namei data on failure.
837 */ 813 */
838static int __link_path_walk(const char *name, struct nameidata *nd) 814static int link_path_walk(const char *name, struct nameidata *nd)
839{ 815{
840 struct path next; 816 struct path next;
841 struct inode *inode; 817 struct inode *inode;
@@ -858,7 +834,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
858 unsigned int c; 834 unsigned int c;
859 835
860 nd->flags |= LOOKUP_CONTINUE; 836 nd->flags |= LOOKUP_CONTINUE;
861 err = exec_permission_lite(inode); 837 err = exec_permission(inode);
862 if (err) 838 if (err)
863 break; 839 break;
864 840
@@ -898,16 +874,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
898 case 1: 874 case 1:
899 continue; 875 continue;
900 } 876 }
901 /*
902 * See if the low-level filesystem might want
903 * to use its own hash..
904 */
905 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
906 err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
907 &this);
908 if (err < 0)
909 break;
910 }
911 /* This does the actual lookups.. */ 877 /* This does the actual lookups.. */
912 err = do_lookup(nd, &this, &next); 878 err = do_lookup(nd, &this, &next);
913 if (err) 879 if (err)
@@ -953,18 +919,11 @@ last_component:
953 case 1: 919 case 1:
954 goto return_reval; 920 goto return_reval;
955 } 921 }
956 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
957 err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
958 &this);
959 if (err < 0)
960 break;
961 }
962 err = do_lookup(nd, &this, &next); 922 err = do_lookup(nd, &this, &next);
963 if (err) 923 if (err)
964 break; 924 break;
965 inode = next.dentry->d_inode; 925 inode = next.dentry->d_inode;
966 if ((lookup_flags & LOOKUP_FOLLOW) 926 if (follow_on_final(inode, lookup_flags)) {
967 && inode && inode->i_op->follow_link) {
968 err = do_follow_link(&next, nd); 927 err = do_follow_link(&next, nd);
969 if (err) 928 if (err)
970 goto return_err; 929 goto return_err;
@@ -1017,8 +976,27 @@ return_err:
1017 976
1018static int path_walk(const char *name, struct nameidata *nd) 977static int path_walk(const char *name, struct nameidata *nd)
1019{ 978{
979 struct path save = nd->path;
980 int result;
981
1020 current->total_link_count = 0; 982 current->total_link_count = 0;
1021 return link_path_walk(name, nd); 983
984 /* make sure the stuff we saved doesn't go away */
985 path_get(&save);
986
987 result = link_path_walk(name, nd);
988 if (result == -ESTALE) {
989 /* nd->path had been dropped */
990 current->total_link_count = 0;
991 nd->path = save;
992 path_get(&nd->path);
993 nd->flags |= LOOKUP_REVAL;
994 result = link_path_walk(name, nd);
995 }
996
997 path_put(&save);
998
999 return result;
1022} 1000}
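/*
 * The retry shape of path_walk() above, reduced to a hedged userspace
 * sketch (hypothetical types; the real code saves and restores a
 * refcounted struct path): snapshot the start, walk once trusting the
 * dcache, and on -ESTALE restore the snapshot and retry with
 * revalidation forced.
 * Build: cc retry_demo.c
 */
#include <errno.h>
#include <stdio.h>

struct walk_state {
	const char *start;
	int force_reval;	/* LOOKUP_REVAL analogue */
};

static int try_walk(struct walk_state *st, const char *name)
{
	(void)name;
	/* Pretend the cached attempt went stale unless revalidating. */
	return st->force_reval ? 0 : -ESTALE;
}

static int walk_with_retry(struct walk_state *st, const char *name)
{
	struct walk_state save = *st;	/* path_get(&save) analogue */
	int err = try_walk(st, name);

	if (err == -ESTALE) {
		*st = save;		/* nd->path had been dropped */
		st->force_reval = 1;
		err = try_walk(st, name);
	}
	return err;			/* path_put(&save) analogue */
}

int main(void)
{
	struct walk_state st = { "/", 0 };

	printf("walk -> %d\n", walk_with_retry(&st, "a/b/c"));
	return 0;
}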
1023 1001
1024static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1002static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
@@ -1141,36 +1119,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1141 return retval; 1119 return retval;
1142} 1120}
1143 1121
1144/**
1145 * path_lookup_open - lookup a file path with open intent
1146 * @dfd: the directory to use as base, or AT_FDCWD
1147 * @name: pointer to file name
1148 * @lookup_flags: lookup intent flags
1149 * @nd: pointer to nameidata
1150 * @open_flags: open intent flags
1151 */
1152static int path_lookup_open(int dfd, const char *name,
1153 unsigned int lookup_flags, struct nameidata *nd, int open_flags)
1154{
1155 struct file *filp = get_empty_filp();
1156 int err;
1157
1158 if (filp == NULL)
1159 return -ENFILE;
1160 nd->intent.open.file = filp;
1161 nd->intent.open.flags = open_flags;
1162 nd->intent.open.create_mode = 0;
1163 err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
1164 if (IS_ERR(nd->intent.open.file)) {
1165 if (err == 0) {
1166 err = PTR_ERR(nd->intent.open.file);
1167 path_put(&nd->path);
1168 }
1169 } else if (err != 0)
1170 release_open_intent(nd);
1171 return err;
1172}
1173
1174static struct dentry *__lookup_hash(struct qstr *name, 1122static struct dentry *__lookup_hash(struct qstr *name,
1175 struct dentry *base, struct nameidata *nd) 1123 struct dentry *base, struct nameidata *nd)
1176{ 1124{
@@ -1191,7 +1139,17 @@ static struct dentry *__lookup_hash(struct qstr *name,
1191 goto out; 1139 goto out;
1192 } 1140 }
1193 1141
1194 dentry = cached_lookup(base, name, nd); 1142 dentry = __d_lookup(base, name);
1143
 1144	/* lockless __d_lookup may fail due to concurrent d_move()
1145 * in some unrelated directory, so try with d_lookup
1146 */
1147 if (!dentry)
1148 dentry = d_lookup(base, name);
1149
1150 if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
1151 dentry = do_revalidate(dentry, nd);
1152
1195 if (!dentry) { 1153 if (!dentry) {
1196 struct dentry *new; 1154 struct dentry *new;
1197 1155
@@ -1223,7 +1181,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
1223{ 1181{
1224 int err; 1182 int err;
1225 1183
1226 err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); 1184 err = exec_permission(nd->path.dentry->d_inode);
1227 if (err) 1185 if (err)
1228 return ERR_PTR(err); 1186 return ERR_PTR(err);
1229 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1187 return __lookup_hash(&nd->last, nd->path.dentry, nd);
@@ -1273,29 +1231,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1273 if (err) 1231 if (err)
1274 return ERR_PTR(err); 1232 return ERR_PTR(err);
1275 1233
1276 err = inode_permission(base->d_inode, MAY_EXEC); 1234 err = exec_permission(base->d_inode);
1277 if (err)
1278 return ERR_PTR(err);
1279 return __lookup_hash(&this, base, NULL);
1280}
1281
1282/**
1283 * lookup_one_noperm - bad hack for sysfs
1284 * @name: pathname component to lookup
1285 * @base: base directory to lookup from
1286 *
1287 * This is a variant of lookup_one_len that doesn't perform any permission
1288 * checks. It's a horrible hack to work around the braindead sysfs
1289 * architecture and should not be used anywhere else.
1290 *
1291 * DON'T USE THIS FUNCTION EVER, thanks.
1292 */
1293struct dentry *lookup_one_noperm(const char *name, struct dentry *base)
1294{
1295 int err;
1296 struct qstr this;
1297
1298 err = __lookup_one_len(name, &this, base, strlen(name));
1299 if (err) 1235 if (err)
1300 return ERR_PTR(err); 1236 return ERR_PTR(err);
1301 return __lookup_hash(&this, base, NULL); 1237 return __lookup_hash(&this, base, NULL);
@@ -1381,7 +1317,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1381 return -ENOENT; 1317 return -ENOENT;
1382 1318
1383 BUG_ON(victim->d_parent->d_inode != dir); 1319 BUG_ON(victim->d_parent->d_inode != dir);
1384 audit_inode_child(victim->d_name.name, victim, dir); 1320 audit_inode_child(victim, dir);
1385 1321
1386 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 1322 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
1387 if (error) 1323 if (error)
@@ -1422,22 +1358,6 @@ static inline int may_create(struct inode *dir, struct dentry *child)
1422 return inode_permission(dir, MAY_WRITE | MAY_EXEC); 1358 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
1423} 1359}
1424 1360
1425/*
1426 * O_DIRECTORY translates into forcing a directory lookup.
1427 */
1428static inline int lookup_flags(unsigned int f)
1429{
1430 unsigned long retval = LOOKUP_FOLLOW;
1431
1432 if (f & O_NOFOLLOW)
1433 retval &= ~LOOKUP_FOLLOW;
1434
1435 if (f & O_DIRECTORY)
1436 retval |= LOOKUP_DIRECTORY;
1437
1438 return retval;
1439}
1440
1441/* 1361/*
1442 * p1 and p2 should be directories on the same fs. 1362 * p1 and p2 should be directories on the same fs.
1443 */ 1363 */
@@ -1495,7 +1415,6 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1495 error = security_inode_create(dir, dentry, mode); 1415 error = security_inode_create(dir, dentry, mode);
1496 if (error) 1416 if (error)
1497 return error; 1417 return error;
1498 vfs_dq_init(dir);
1499 error = dir->i_op->create(dir, dentry, mode, nd); 1418 error = dir->i_op->create(dir, dentry, mode, nd);
1500 if (!error) 1419 if (!error)
1501 fsnotify_create(dir, dentry); 1420 fsnotify_create(dir, dentry);
@@ -1533,69 +1452,45 @@ int may_open(struct path *path, int acc_mode, int flag)
1533 if (error) 1452 if (error)
1534 return error; 1453 return error;
1535 1454
1536 error = ima_path_check(path, acc_mode ?
1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
1538 ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
1539 IMA_COUNT_UPDATE);
1540
1541 if (error)
1542 return error;
1543 /* 1455 /*
1544 * An append-only file must be opened in append mode for writing. 1456 * An append-only file must be opened in append mode for writing.
1545 */ 1457 */
1546 if (IS_APPEND(inode)) { 1458 if (IS_APPEND(inode)) {
1547 error = -EPERM; 1459 if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
1548 if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) 1460 return -EPERM;
1549 goto err_out;
1550 if (flag & O_TRUNC) 1461 if (flag & O_TRUNC)
1551 goto err_out; 1462 return -EPERM;
1552 } 1463 }
1553 1464
1554 /* O_NOATIME can only be set by the owner or superuser */ 1465 /* O_NOATIME can only be set by the owner or superuser */
1555 if (flag & O_NOATIME) 1466 if (flag & O_NOATIME && !is_owner_or_cap(inode))
1556 if (!is_owner_or_cap(inode)) { 1467 return -EPERM;
1557 error = -EPERM;
1558 goto err_out;
1559 }
1560 1468
1561 /* 1469 /*
1562 * Ensure there are no outstanding leases on the file. 1470 * Ensure there are no outstanding leases on the file.
1563 */ 1471 */
1564 error = break_lease(inode, flag); 1472 return break_lease(inode, flag);
1565 if (error) 1473}
1566 goto err_out;
1567
1568 if (flag & O_TRUNC) {
1569 error = get_write_access(inode);
1570 if (error)
1571 goto err_out;
1572
1573 /*
1574 * Refuse to truncate files with mandatory locks held on them.
1575 */
1576 error = locks_verify_locked(inode);
1577 if (!error)
1578 error = security_path_truncate(path, 0,
1579 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1580 if (!error) {
1581 vfs_dq_init(inode);
1582
1583 error = do_truncate(dentry, 0,
1584 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1585 NULL);
1586 }
1587 put_write_access(inode);
1588 if (error)
1589 goto err_out;
1590 } else
1591 if (flag & FMODE_WRITE)
1592 vfs_dq_init(inode);
1593 1474
1594 return 0; 1475static int handle_truncate(struct path *path)
1595err_out: 1476{
1596 ima_counts_put(path, acc_mode ? 1477 struct inode *inode = path->dentry->d_inode;
1597 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : 1478 int error = get_write_access(inode);
1598 ACC_MODE(flag) & (MAY_READ | MAY_WRITE)); 1479 if (error)
1480 return error;
1481 /*
1482 * Refuse to truncate files with mandatory locks held on them.
1483 */
1484 error = locks_verify_locked(inode);
1485 if (!error)
1486 error = security_path_truncate(path, 0,
1487 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1488 if (!error) {
1489 error = do_truncate(path->dentry, 0,
1490 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1491 NULL);
1492 }
1493 put_write_access(inode);
1599 return error; 1494 return error;
1600} 1495}
1601 1496
@@ -1605,7 +1500,7 @@ err_out:
1605 * what get passed to sys_open(). 1500 * what get passed to sys_open().
1606 */ 1501 */
1607static int __open_namei_create(struct nameidata *nd, struct path *path, 1502static int __open_namei_create(struct nameidata *nd, struct path *path,
1608 int flag, int mode) 1503 int open_flag, int mode)
1609{ 1504{
1610 int error; 1505 int error;
1611 struct dentry *dir = nd->path.dentry; 1506 struct dentry *dir = nd->path.dentry;
@@ -1623,7 +1518,7 @@ out_unlock:
1623 if (error) 1518 if (error)
1624 return error; 1519 return error;
1625 /* Don't check for write permission, don't truncate */ 1520 /* Don't check for write permission, don't truncate */
1626 return may_open(&nd->path, 0, flag & ~O_TRUNC); 1521 return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
1627} 1522}
1628 1523
1629/* 1524/*
@@ -1650,7 +1545,7 @@ static inline int open_to_namei_flags(int flag)
1650 return flag; 1545 return flag;
1651} 1546}
1652 1547
1653static int open_will_write_to_fs(int flag, struct inode *inode) 1548static int open_will_truncate(int flag, struct inode *inode)
1654{ 1549{
1655 /* 1550 /*
1656 * We'll never write to the fs underlying 1551 * We'll never write to the fs underlying
@@ -1661,100 +1556,133 @@ static int open_will_write_to_fs(int flag, struct inode *inode)
1661 return (flag & O_TRUNC); 1556 return (flag & O_TRUNC);
1662} 1557}
1663 1558
1664/* 1559static struct file *finish_open(struct nameidata *nd,
1665 * Note that the low bits of the passed in "open_flag" 1560 int open_flag, int acc_mode)
1666 * are not the same as in the local variable "flag". See
1667 * open_to_namei_flags() for more details.
1668 */
1669struct file *do_filp_open(int dfd, const char *pathname,
1670 int open_flag, int mode, int acc_mode)
1671{ 1561{
1672 struct file *filp; 1562 struct file *filp;
1673 struct nameidata nd; 1563 int will_truncate;
1674 int error; 1564 int error;
1675 struct path path;
1676 struct dentry *dir;
1677 int count = 0;
1678 int will_write;
1679 int flag = open_to_namei_flags(open_flag);
1680 1565
1681 if (!acc_mode) 1566 will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
1682 acc_mode = MAY_OPEN | ACC_MODE(flag); 1567 if (will_truncate) {
1568 error = mnt_want_write(nd->path.mnt);
1569 if (error)
1570 goto exit;
1571 }
1572 error = may_open(&nd->path, acc_mode, open_flag);
1573 if (error) {
1574 if (will_truncate)
1575 mnt_drop_write(nd->path.mnt);
1576 goto exit;
1577 }
1578 filp = nameidata_to_filp(nd);
1579 if (!IS_ERR(filp)) {
1580 error = ima_file_check(filp, acc_mode);
1581 if (error) {
1582 fput(filp);
1583 filp = ERR_PTR(error);
1584 }
1585 }
1586 if (!IS_ERR(filp)) {
1587 if (will_truncate) {
1588 error = handle_truncate(&nd->path);
1589 if (error) {
1590 fput(filp);
1591 filp = ERR_PTR(error);
1592 }
1593 }
1594 }
1595 /*
1596 * It is now safe to drop the mnt write
1597 * because the filp has had a write taken
1598 * on its behalf.
1599 */
1600 if (will_truncate)
1601 mnt_drop_write(nd->path.mnt);
1602 return filp;
1683 1603
1684 /* O_TRUNC implies we need access checks for write permissions */ 1604exit:
1685 if (flag & O_TRUNC) 1605 if (!IS_ERR(nd->intent.open.file))
1686 acc_mode |= MAY_WRITE; 1606 release_open_intent(nd);
1607 path_put(&nd->path);
1608 return ERR_PTR(error);
1609}
1687 1610
1688 /* Allow the LSM permission hook to distinguish append 1611static struct file *do_last(struct nameidata *nd, struct path *path,
1689 access from general write access. */ 1612 int open_flag, int acc_mode,
1690 if (flag & O_APPEND) 1613 int mode, const char *pathname)
1691 acc_mode |= MAY_APPEND; 1614{
1615 struct dentry *dir = nd->path.dentry;
1616 struct file *filp;
1617 int error = -EISDIR;
1692 1618
1693 /* 1619 switch (nd->last_type) {
1694 * The simplest case - just a plain lookup. 1620 case LAST_DOTDOT:
1695 */ 1621 follow_dotdot(nd);
1696 if (!(flag & O_CREAT)) { 1622 dir = nd->path.dentry;
1697 error = path_lookup_open(dfd, pathname, lookup_flags(flag), 1623 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1698 &nd, flag); 1624 if (!dir->d_op->d_revalidate(dir, nd)) {
1699 if (error) 1625 error = -ESTALE;
1700 return ERR_PTR(error); 1626 goto exit;
1627 }
1628 }
1629 /* fallthrough */
1630 case LAST_DOT:
1631 case LAST_ROOT:
1632 if (open_flag & O_CREAT)
1633 goto exit;
1634 /* fallthrough */
1635 case LAST_BIND:
1636 audit_inode(pathname, dir);
1701 goto ok; 1637 goto ok;
1702 } 1638 }
1703 1639
1704 /* 1640 /* trailing slashes? */
1705 * Create - we need to know the parent. 1641 if (nd->last.name[nd->last.len]) {
1706 */ 1642 if (open_flag & O_CREAT)
1707 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); 1643 goto exit;
1708 if (error) 1644 nd->flags |= LOOKUP_DIRECTORY;
1709 return ERR_PTR(error);
1710 error = path_walk(pathname, &nd);
1711 if (error) {
1712 if (nd.root.mnt)
1713 path_put(&nd.root);
1714 return ERR_PTR(error);
1715 } 1645 }
1716 if (unlikely(!audit_dummy_context()))
1717 audit_inode(pathname, nd.path.dentry);
1718 1646
1719 /* 1647 /* just plain open? */
1720 * We have the parent and last component. First of all, check 1648 if (!(open_flag & O_CREAT)) {
1721 * that we are not asked to creat(2) an obvious directory - that 1649 error = do_lookup(nd, &nd->last, path);
1722 * will not do. 1650 if (error)
1723 */ 1651 goto exit;
1724 error = -EISDIR; 1652 error = -ENOENT;
1725 if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) 1653 if (!path->dentry->d_inode)
1726 goto exit_parent; 1654 goto exit_dput;
1655 if (path->dentry->d_inode->i_op->follow_link)
1656 return NULL;
1657 error = -ENOTDIR;
1658 if (nd->flags & LOOKUP_DIRECTORY) {
1659 if (!path->dentry->d_inode->i_op->lookup)
1660 goto exit_dput;
1661 }
1662 path_to_nameidata(path, nd);
1663 audit_inode(pathname, nd->path.dentry);
1664 goto ok;
1665 }
1727 1666
1728 error = -ENFILE; 1667 /* OK, it's O_CREAT */
1729 filp = get_empty_filp();
1730 if (filp == NULL)
1731 goto exit_parent;
1732 nd.intent.open.file = filp;
1733 nd.intent.open.flags = flag;
1734 nd.intent.open.create_mode = mode;
1735 dir = nd.path.dentry;
1736 nd.flags &= ~LOOKUP_PARENT;
1737 nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
1738 if (flag & O_EXCL)
1739 nd.flags |= LOOKUP_EXCL;
1740 mutex_lock(&dir->d_inode->i_mutex); 1668 mutex_lock(&dir->d_inode->i_mutex);
1741 path.dentry = lookup_hash(&nd);
1742 path.mnt = nd.path.mnt;
1743 1669
1744do_last: 1670 path->dentry = lookup_hash(nd);
1745 error = PTR_ERR(path.dentry); 1671 path->mnt = nd->path.mnt;
1746 if (IS_ERR(path.dentry)) { 1672
1673 error = PTR_ERR(path->dentry);
1674 if (IS_ERR(path->dentry)) {
1747 mutex_unlock(&dir->d_inode->i_mutex); 1675 mutex_unlock(&dir->d_inode->i_mutex);
1748 goto exit; 1676 goto exit;
1749 } 1677 }
1750 1678
1751 if (IS_ERR(nd.intent.open.file)) { 1679 if (IS_ERR(nd->intent.open.file)) {
1752 error = PTR_ERR(nd.intent.open.file); 1680 error = PTR_ERR(nd->intent.open.file);
1753 goto exit_mutex_unlock; 1681 goto exit_mutex_unlock;
1754 } 1682 }
1755 1683
1756 /* Negative dentry, just create the file */ 1684 /* Negative dentry, just create the file */
1757 if (!path.dentry->d_inode) { 1685 if (!path->dentry->d_inode) {
1758 /* 1686 /*
1759 * This write is needed to ensure that a 1687 * This write is needed to ensure that a
1760 * ro->rw transition does not occur between 1688 * ro->rw transition does not occur between
@@ -1762,22 +1690,23 @@ do_last:
 		 * a permanent write count is taken through
 		 * the 'struct file' in nameidata_to_filp().
 		 */
-		error = mnt_want_write(nd.path.mnt);
+		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(&nd, &path, flag, mode);
+		error = __open_namei_create(nd, path, open_flag, mode);
 		if (error) {
-			mnt_drop_write(nd.path.mnt);
+			mnt_drop_write(nd->path.mnt);
 			goto exit;
 		}
-		filp = nameidata_to_filp(&nd, open_flag);
-		if (IS_ERR(filp))
-			ima_counts_put(&nd.path,
-					acc_mode & (MAY_READ | MAY_WRITE |
-						    MAY_EXEC));
-		mnt_drop_write(nd.path.mnt);
-		if (nd.root.mnt)
-			path_put(&nd.root);
+		filp = nameidata_to_filp(nd);
+		mnt_drop_write(nd->path.mnt);
+		if (!IS_ERR(filp)) {
+			error = ima_file_check(filp, acc_mode);
+			if (error) {
+				fput(filp);
+				filp = ERR_PTR(error);
+			}
+		}
 		return filp;
 	}
 
@@ -1785,129 +1714,182 @@ do_last:
 	 * It already exists.
 	 */
 	mutex_unlock(&dir->d_inode->i_mutex);
-	audit_inode(pathname, path.dentry);
+	audit_inode(pathname, path->dentry);
 
 	error = -EEXIST;
-	if (flag & O_EXCL)
+	if (open_flag & O_EXCL)
 		goto exit_dput;
 
-	if (__follow_mount(&path)) {
+	if (__follow_mount(path)) {
 		error = -ELOOP;
-		if (flag & O_NOFOLLOW)
+		if (open_flag & O_NOFOLLOW)
 			goto exit_dput;
 	}
 
 	error = -ENOENT;
-	if (!path.dentry->d_inode)
+	if (!path->dentry->d_inode)
 		goto exit_dput;
-	if (path.dentry->d_inode->i_op->follow_link)
-		goto do_link;
 
-	path_to_nameidata(&path, &nd);
+	if (path->dentry->d_inode->i_op->follow_link)
+		return NULL;
+
+	path_to_nameidata(path, nd);
 	error = -EISDIR;
-	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
+	if (S_ISDIR(path->dentry->d_inode->i_mode))
 		goto exit;
 ok:
-	/*
-	 * Consider:
-	 * 1. may_open() truncates a file
-	 * 2. a rw->ro mount transition occurs
-	 * 3. nameidata_to_filp() fails due to
-	 *    the ro mount.
-	 * That would be inconsistent, and should
-	 * be avoided. Taking this mnt write here
-	 * ensures that (2) can not occur.
-	 */
-	will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
-	if (will_write) {
-		error = mnt_want_write(nd.path.mnt);
-		if (error)
-			goto exit;
-	}
-	error = may_open(&nd.path, acc_mode, flag);
-	if (error) {
-		if (will_write)
-			mnt_drop_write(nd.path.mnt);
-		goto exit;
-	}
-	filp = nameidata_to_filp(&nd, open_flag);
-	if (IS_ERR(filp))
-		ima_counts_put(&nd.path,
-			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
-	/*
-	 * It is now safe to drop the mnt write
-	 * because the filp has had a write taken
-	 * on its behalf.
-	 */
-	if (will_write)
-		mnt_drop_write(nd.path.mnt);
-	if (nd.root.mnt)
-		path_put(&nd.root);
+	filp = finish_open(nd, open_flag, acc_mode);
 	return filp;
 
 exit_mutex_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
 exit_dput:
-	path_put_conditional(&path, &nd);
+	path_put_conditional(path, nd);
 exit:
-	if (!IS_ERR(nd.intent.open.file))
-		release_open_intent(&nd);
-exit_parent:
-	if (nd.root.mnt)
-		path_put(&nd.root);
-	path_put(&nd.path);
+	if (!IS_ERR(nd->intent.open.file))
+		release_open_intent(nd);
+	path_put(&nd->path);
 	return ERR_PTR(error);
+}
+
+/*
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
+ */
+struct file *do_filp_open(int dfd, const char *pathname,
+		int open_flag, int mode, int acc_mode)
+{
+	struct file *filp;
+	struct nameidata nd;
+	int error;
+	struct path path;
+	int count = 0;
+	int flag = open_to_namei_flags(open_flag);
+	int force_reval = 0;
+
+	if (!(open_flag & O_CREAT))
+		mode = 0;
 
-do_link:
-	error = -ELOOP;
-	if (flag & O_NOFOLLOW)
-		goto exit_dput;
 	/*
-	 * This is subtle. Instead of calling do_follow_link() we do the
-	 * thing by hands. The reason is that this way we have zero link_count
-	 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
-	 * After that we have the parent and last component, i.e.
-	 * we are in the same situation as after the first path_walk().
-	 * Well, almost - if the last component is normal we get its copy
-	 * stored in nd->last.name and we will have to putname() it when we
-	 * are done. Procfs-like symlinks just set LAST_BIND.
+	 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
+	 * check for O_DSYNC if the need any syncing at all we enforce it's
+	 * always set instead of having to deal with possibly weird behaviour
+	 * for malicious applications setting only __O_SYNC.
 	 */
-	nd.flags |= LOOKUP_PARENT;
-	error = security_inode_follow_link(path.dentry, &nd);
+	if (open_flag & __O_SYNC)
+		open_flag |= O_DSYNC;
+
+	if (!acc_mode)
+		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
+
+	/* O_TRUNC implies we need access checks for write permissions */
+	if (open_flag & O_TRUNC)
+		acc_mode |= MAY_WRITE;
+
+	/* Allow the LSM permission hook to distinguish append
+	   access from general write access. */
+	if (open_flag & O_APPEND)
+		acc_mode |= MAY_APPEND;
+
+	/* find the parent */
+reval:
+	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
 	if (error)
-		goto exit_dput;
-	error = __do_follow_link(&path, &nd);
-	if (error) {
-		/* Does someone understand code flow here? Or it is only
-		 * me so stupid? Anathema to whoever designed this non-sense
-		 * with "intent.open".
-		 */
-		release_open_intent(&nd);
-		if (nd.root.mnt)
-			path_put(&nd.root);
 		return ERR_PTR(error);
+	if (force_reval)
+		nd.flags |= LOOKUP_REVAL;
+
+	current->total_link_count = 0;
+	error = link_path_walk(pathname, &nd);
+	if (error) {
+		filp = ERR_PTR(error);
+		goto out;
 	}
+	if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
+		audit_inode(pathname, nd.path.dentry);
+
+	/*
+	 * We have the parent and last component.
+	 */
+
+	error = -ENFILE;
+	filp = get_empty_filp();
+	if (filp == NULL)
+		goto exit_parent;
+	nd.intent.open.file = filp;
+	filp->f_flags = open_flag;
+	nd.intent.open.flags = flag;
+	nd.intent.open.create_mode = mode;
 	nd.flags &= ~LOOKUP_PARENT;
-	if (nd.last_type == LAST_BIND)
-		goto ok;
-	error = -EISDIR;
-	if (nd.last_type != LAST_NORM)
-		goto exit;
-	if (nd.last.name[nd.last.len]) {
-		__putname(nd.last.name);
-		goto exit;
+	nd.flags |= LOOKUP_OPEN;
+	if (open_flag & O_CREAT) {
+		nd.flags |= LOOKUP_CREATE;
+		if (open_flag & O_EXCL)
+			nd.flags |= LOOKUP_EXCL;
 	}
-	error = -ELOOP;
-	if (count++==32) {
-		__putname(nd.last.name);
-		goto exit;
+	if (open_flag & O_DIRECTORY)
+		nd.flags |= LOOKUP_DIRECTORY;
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+	while (unlikely(!filp)) { /* trailing symlink */
+		struct path holder;
+		struct inode *inode = path.dentry->d_inode;
+		void *cookie;
+		error = -ELOOP;
+		/* S_ISDIR part is a temporary automount kludge */
+		if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode))
+			goto exit_dput;
+		if (count++ == 32)
+			goto exit_dput;
+		/*
+		 * This is subtle. Instead of calling do_follow_link() we do
+		 * the thing by hands. The reason is that this way we have zero
+		 * link_count and path_walk() (called from ->follow_link)
+		 * honoring LOOKUP_PARENT. After that we have the parent and
+		 * last component, i.e. we are in the same situation as after
+		 * the first path_walk(). Well, almost - if the last component
+		 * is normal we get its copy stored in nd->last.name and we will
+		 * have to putname() it when we are done. Procfs-like symlinks
+		 * just set LAST_BIND.
+		 */
+		nd.flags |= LOOKUP_PARENT;
+		error = security_inode_follow_link(path.dentry, &nd);
+		if (error)
+			goto exit_dput;
+		error = __do_follow_link(&path, &nd, &cookie);
+		if (unlikely(error)) {
+			/* nd.path had been dropped */
+			if (!IS_ERR(cookie) && inode->i_op->put_link)
+				inode->i_op->put_link(path.dentry, &nd, cookie);
+			path_put(&path);
+			release_open_intent(&nd);
+			filp = ERR_PTR(error);
+			goto out;
+		}
+		holder = path;
+		nd.flags &= ~LOOKUP_PARENT;
+		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+		if (inode->i_op->put_link)
+			inode->i_op->put_link(holder.dentry, &nd, cookie);
+		path_put(&holder);
 	}
-	dir = nd.path.dentry;
-	mutex_lock(&dir->d_inode->i_mutex);
-	path.dentry = lookup_hash(&nd);
-	path.mnt = nd.path.mnt;
-	__putname(nd.last.name);
-	goto do_last;
+out:
+	if (nd.root.mnt)
+		path_put(&nd.root);
+	if (filp == ERR_PTR(-ESTALE) && !force_reval) {
+		force_reval = 1;
+		goto reval;
+	}
+	return filp;
+
+exit_dput:
+	path_put_conditional(&path, &nd);
+	if (!IS_ERR(nd.intent.open.file))
+		release_open_intent(&nd);
+exit_parent:
+	path_put(&nd.path);
+	filp = ERR_PTR(error);
+	goto out;
 }
 
 /**
@@ -2001,7 +1983,6 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
2001 if (error) 1983 if (error)
2002 return error; 1984 return error;
2003 1985
2004 vfs_dq_init(dir);
2005 error = dir->i_op->mknod(dir, dentry, mode, dev); 1986 error = dir->i_op->mknod(dir, dentry, mode, dev);
2006 if (!error) 1987 if (!error)
2007 fsnotify_create(dir, dentry); 1988 fsnotify_create(dir, dentry);
@@ -2100,7 +2081,6 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2100 if (error) 2081 if (error)
2101 return error; 2082 return error;
2102 2083
2103 vfs_dq_init(dir);
2104 error = dir->i_op->mkdir(dir, dentry, mode); 2084 error = dir->i_op->mkdir(dir, dentry, mode);
2105 if (!error) 2085 if (!error)
2106 fsnotify_mkdir(dir, dentry); 2086 fsnotify_mkdir(dir, dentry);
@@ -2186,8 +2166,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2186 if (!dir->i_op->rmdir) 2166 if (!dir->i_op->rmdir)
2187 return -EPERM; 2167 return -EPERM;
2188 2168
2189 vfs_dq_init(dir);
2190
2191 mutex_lock(&dentry->d_inode->i_mutex); 2169 mutex_lock(&dentry->d_inode->i_mutex);
2192 dentry_unhash(dentry); 2170 dentry_unhash(dentry);
2193 if (d_mountpoint(dentry)) 2171 if (d_mountpoint(dentry))
@@ -2273,15 +2251,16 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->unlink)
 		return -EPERM;
 
-	vfs_dq_init(dir);
-
 	mutex_lock(&dentry->d_inode->i_mutex);
 	if (d_mountpoint(dentry))
 		error = -EBUSY;
 	else {
 		error = security_inode_unlink(dir, dentry);
-		if (!error)
+		if (!error) {
 			error = dir->i_op->unlink(dir, dentry);
+			if (!error)
+				dentry->d_inode->i_flags |= S_DEAD;
+		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
@@ -2384,7 +2363,6 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2384 if (error) 2363 if (error)
2385 return error; 2364 return error;
2386 2365
2387 vfs_dq_init(dir);
2388 error = dir->i_op->symlink(dir, dentry, oldname); 2366 error = dir->i_op->symlink(dir, dentry, oldname);
2389 if (!error) 2367 if (!error)
2390 fsnotify_create(dir, dentry); 2368 fsnotify_create(dir, dentry);
@@ -2468,7 +2446,6 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
2468 return error; 2446 return error;
2469 2447
2470 mutex_lock(&inode->i_mutex); 2448 mutex_lock(&inode->i_mutex);
2471 vfs_dq_init(dir);
2472 error = dir->i_op->link(old_dentry, dir, new_dentry); 2449 error = dir->i_op->link(old_dentry, dir, new_dentry);
2473 mutex_unlock(&inode->i_mutex); 2450 mutex_unlock(&inode->i_mutex);
2474 if (!error) 2451 if (!error)
@@ -2569,7 +2546,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
2569 * e) conversion from fhandle to dentry may come in the wrong moment - when 2546 * e) conversion from fhandle to dentry may come in the wrong moment - when
2570 * we are removing the target. Solution: we will have to grab ->i_mutex 2547 * we are removing the target. Solution: we will have to grab ->i_mutex
2571 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 2548 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
2572 * ->i_mutex on parents, which works but leads to some truely excessive 2549 * ->i_mutex on parents, which works but leads to some truly excessive
2573 * locking]. 2550 * locking].
2574 */ 2551 */
2575static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, 2552static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
@@ -2634,6 +2611,8 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2634 else 2611 else
2635 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2612 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2636 if (!error) { 2613 if (!error) {
2614 if (target)
2615 target->i_flags |= S_DEAD;
2637 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 2616 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2638 d_move(old_dentry, new_dentry); 2617 d_move(old_dentry, new_dentry);
2639 } 2618 }
@@ -2667,20 +2646,15 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (!old_dir->i_op->rename)
 		return -EPERM;
 
-	vfs_dq_init(old_dir);
-	vfs_dq_init(new_dir);
-
 	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
 
 	if (is_dir)
 		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
 	else
 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
-	if (!error) {
-		const char *new_name = old_dentry->d_name.name;
-		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
+	if (!error)
+		fsnotify_move(old_dir, new_dir, old_name, is_dir,
 			      new_dentry->d_inode, old_dentry);
-	}
 	fsnotify_oldname_free(old_name);
 
 	return error;
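
The reworked open path above computes the full access mode up front: O_SYNC is widened to include O_DSYNC, and acc_mode is derived from the open flags before any lookup starts. The following stand-alone user-space sketch mirrors that flag-to-access-mode mapping; the MAY_* constants and the four-entry ACC_MODE() table are redefined locally to match the kernel's intent for this era, so treat the exact values as illustrative rather than authoritative.

#include <fcntl.h>
#include <stdio.h>

#define MAY_WRITE  0x02
#define MAY_READ   0x04
#define MAY_APPEND 0x08
#define MAY_OPEN   0x20

/* O_RDONLY=0, O_WRONLY=1, O_RDWR=2 (and 3, historically "no access"):
 * index the low two bits into a table of read/write needs. */
#define ACC_MODE(x) ("\004\002\006\006"[(x) & O_ACCMODE])

static int open_acc_mode(int open_flag)
{
        int acc_mode = MAY_OPEN | ACC_MODE(open_flag);

        if (open_flag & O_TRUNC)   /* truncation writes to the file */
                acc_mode |= MAY_WRITE;
        if (open_flag & O_APPEND)  /* let an LSM tell append from write */
                acc_mode |= MAY_APPEND;
        return acc_mode;
}

int main(void)
{
        printf("O_WRONLY|O_APPEND -> %#x\n", open_acc_mode(O_WRONLY | O_APPEND));
        printf("O_RDONLY|O_TRUNC  -> %#x\n", open_acc_mode(O_RDONLY | O_TRUNC));
        return 0;
}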
diff --git a/fs/namespace.c b/fs/namespace.c
index bdc3cb4fd222..8174c8ab5c70 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -573,7 +573,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
573 mnt->mnt_master = old; 573 mnt->mnt_master = old;
574 CLEAR_MNT_SHARED(mnt); 574 CLEAR_MNT_SHARED(mnt);
575 } else if (!(flag & CL_PRIVATE)) { 575 } else if (!(flag & CL_PRIVATE)) {
576 if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old)) 576 if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
577 list_add(&mnt->mnt_share, &old->mnt_share); 577 list_add(&mnt->mnt_share, &old->mnt_share);
578 if (IS_MNT_SLAVE(old)) 578 if (IS_MNT_SLAVE(old))
579 list_add(&mnt->mnt_slave, &old->mnt_slave); 579 list_add(&mnt->mnt_slave, &old->mnt_slave);
@@ -737,6 +737,21 @@ static void m_stop(struct seq_file *m, void *v)
737 up_read(&namespace_sem); 737 up_read(&namespace_sem);
738} 738}
739 739
740int mnt_had_events(struct proc_mounts *p)
741{
742 struct mnt_namespace *ns = p->ns;
743 int res = 0;
744
745 spin_lock(&vfsmount_lock);
746 if (p->event != ns->event) {
747 p->event = ns->event;
748 res = 1;
749 }
750 spin_unlock(&vfsmount_lock);
751
752 return res;
753}
754
740struct proc_fs_info { 755struct proc_fs_info {
741 int flag; 756 int flag;
742 const char *str; 757 const char *str;
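
The new mnt_had_events() above is the poll plumbing for mount tables: the namespace keeps a bare generation counter that is bumped on every mount event, and each /proc/mounts reader remembers the last value it saw; any mismatch means "something changed since you last looked". Below is a stand-alone sketch of the same generation-counter idiom, with hypothetical reader/source types and a pthread mutex standing in for vfsmount_lock.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct event_source { pthread_mutex_t lock; unsigned event; };
struct reader       { unsigned event; };

static bool had_events(struct event_source *src, struct reader *r)
{
        bool res = false;

        pthread_mutex_lock(&src->lock);
        if (r->event != src->event) {
                r->event = src->event;  /* catch the reader up */
                res = true;
        }
        pthread_mutex_unlock(&src->lock);
        return res;
}

int main(void)
{
        struct event_source src = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct reader r = { 0 };

        src.event++;    /* a mount or umount happened */
        printf("%d %d\n", had_events(&src, &r), had_events(&src, &r)); /* 1 0 */
        return 0;
}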
@@ -965,10 +980,12 @@ EXPORT_SYMBOL(may_umount_tree);
965int may_umount(struct vfsmount *mnt) 980int may_umount(struct vfsmount *mnt)
966{ 981{
967 int ret = 1; 982 int ret = 1;
983 down_read(&namespace_sem);
968 spin_lock(&vfsmount_lock); 984 spin_lock(&vfsmount_lock);
969 if (propagate_mount_busy(mnt, 2)) 985 if (propagate_mount_busy(mnt, 2))
970 ret = 0; 986 ret = 0;
971 spin_unlock(&vfsmount_lock); 987 spin_unlock(&vfsmount_lock);
988 up_read(&namespace_sem);
972 return ret; 989 return ret;
973} 990}
974 991
@@ -1119,8 +1136,15 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 {
 	struct path path;
 	int retval;
+	int lookup_flags = 0;
+
+	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
+		return -EINVAL;
 
-	retval = user_path(name, &path);
+	if (!(flags & UMOUNT_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+
+	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
 	if (retval)
 		goto out;
 	retval = -EINVAL;
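
From user space this hunk is visible through umount2(2): unknown flag bits now fail with EINVAL, and the new UMOUNT_NOFOLLOW flag asks the kernel not to follow a trailing symlink when resolving the mount point, which hardens setuid umount helpers against symlink swaps. A minimal caller, assuming a kernel and libc new enough to know the flag:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mount.h>

#ifndef UMOUNT_NOFOLLOW
#define UMOUNT_NOFOLLOW 0x00000008      /* matches the kernel's definition */
#endif

int main(int argc, char **argv)
{
        if (argc != 2) {
                fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
                return 1;
        }
        if (umount2(argv[1], UMOUNT_NOFOLLOW) < 0) {
                fprintf(stderr, "umount2(%s): %s\n", argv[1], strerror(errno));
                return 1;
        }
        return 0;
}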
@@ -1244,6 +1268,21 @@ void drop_collected_mounts(struct vfsmount *mnt)
1244 release_mounts(&umount_list); 1268 release_mounts(&umount_list);
1245} 1269}
1246 1270
1271int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1272 struct vfsmount *root)
1273{
1274 struct vfsmount *mnt;
1275 int res = f(root, arg);
1276 if (res)
1277 return res;
1278 list_for_each_entry(mnt, &root->mnt_list, mnt_list) {
1279 res = f(mnt, arg);
1280 if (res)
1281 return res;
1282 }
1283 return 0;
1284}
1285
1247static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) 1286static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
1248{ 1287{
1249 struct vfsmount *p; 1288 struct vfsmount *p;
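
The iterate_mounts() helper added above visits the root mount and then every mount on its list, stopping as soon as the callback returns nonzero and propagating that value to the caller. The same visitor contract over a plain linked list, for illustration only (the node type and callback here are hypothetical):

#include <stdio.h>

struct node { int val; struct node *next; };

static int iterate(struct node *head, int (*f)(struct node *, void *), void *arg)
{
        for (struct node *n = head; n; n = n->next) {
                int res = f(n, arg);
                if (res)        /* nonzero short-circuits, like iterate_mounts() */
                        return res;
        }
        return 0;
}

static int find_val(struct node *n, void *arg)
{
        return n->val == *(int *)arg;   /* 1 == found, stop walking */
}

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        int want = 2;
        printf("found: %d\n", iterate(&a, find_val, &want));
        return 0;
}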
@@ -1352,12 +1391,12 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1352 if (err) 1391 if (err)
1353 goto out_cleanup_ids; 1392 goto out_cleanup_ids;
1354 1393
1394 spin_lock(&vfsmount_lock);
1395
1355 if (IS_MNT_SHARED(dest_mnt)) { 1396 if (IS_MNT_SHARED(dest_mnt)) {
1356 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1397 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
1357 set_mnt_shared(p); 1398 set_mnt_shared(p);
1358 } 1399 }
1359
1360 spin_lock(&vfsmount_lock);
1361 if (parent_path) { 1400 if (parent_path) {
1362 detach_mnt(source_mnt, parent_path); 1401 detach_mnt(source_mnt, parent_path);
1363 attach_mnt(source_mnt, path); 1402 attach_mnt(source_mnt, path);
@@ -1534,8 +1573,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1534 err = change_mount_flags(path->mnt, flags); 1573 err = change_mount_flags(path->mnt, flags);
1535 else 1574 else
1536 err = do_remount_sb(sb, flags, data, 0); 1575 err = do_remount_sb(sb, flags, data, 0);
1537 if (!err) 1576 if (!err) {
1577 spin_lock(&vfsmount_lock);
1578 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
1538 path->mnt->mnt_flags = mnt_flags; 1579 path->mnt->mnt_flags = mnt_flags;
1580 spin_unlock(&vfsmount_lock);
1581 }
1539 up_write(&sb->s_umount); 1582 up_write(&sb->s_umount);
1540 if (!err) { 1583 if (!err) {
1541 security_sb_post_remount(path->mnt, flags, data); 1584 security_sb_post_remount(path->mnt, flags, data);
@@ -1665,6 +1708,8 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1665{ 1708{
1666 int err; 1709 int err;
1667 1710
1711 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
1712
1668 down_write(&namespace_sem); 1713 down_write(&namespace_sem);
1669 /* Something was mounted here while we slept */ 1714 /* Something was mounted here while we slept */
1670 while (d_mountpoint(path->dentry) && 1715 while (d_mountpoint(path->dentry) &&
@@ -1921,6 +1966,16 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1921 if (data_page) 1966 if (data_page)
1922 ((char *)data_page)[PAGE_SIZE - 1] = 0; 1967 ((char *)data_page)[PAGE_SIZE - 1] = 0;
1923 1968
1969 /* ... and get the mountpoint */
1970 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
1971 if (retval)
1972 return retval;
1973
1974 retval = security_sb_mount(dev_name, &path,
1975 type_page, flags, data_page);
1976 if (retval)
1977 goto dput_out;
1978
1924 /* Default to relatime unless overriden */ 1979 /* Default to relatime unless overriden */
1925 if (!(flags & MS_NOATIME)) 1980 if (!(flags & MS_NOATIME))
1926 mnt_flags |= MNT_RELATIME; 1981 mnt_flags |= MNT_RELATIME;
@@ -1945,16 +2000,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1945 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2000 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
1946 MS_STRICTATIME); 2001 MS_STRICTATIME);
1947 2002
1948 /* ... and get the mountpoint */
1949 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
1950 if (retval)
1951 return retval;
1952
1953 retval = security_sb_mount(dev_name, &path,
1954 type_page, flags, data_page);
1955 if (retval)
1956 goto dput_out;
1957
1958 if (flags & MS_REMOUNT) 2003 if (flags & MS_REMOUNT)
1959 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, 2004 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
1960 data_page); 2005 data_page);
@@ -2306,17 +2351,13 @@ void __init mnt_init(void)
 
 void put_mnt_ns(struct mnt_namespace *ns)
 {
-	struct vfsmount *root;
 	LIST_HEAD(umount_list);
 
-	if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock))
+	if (!atomic_dec_and_test(&ns->count))
 		return;
-	root = ns->root;
-	ns->root = NULL;
-	spin_unlock(&vfsmount_lock);
 	down_write(&namespace_sem);
 	spin_lock(&vfsmount_lock);
-	umount_tree(root, 0, &umount_list);
+	umount_tree(ns->root, 0, &umount_list);
 	spin_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
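
The put_mnt_ns() change swaps atomic_dec_and_lock() for a plain atomic_dec_and_test(): only the thread that drops the last reference proceeds to teardown, so the common put no longer touches vfsmount_lock at all. A C11 sketch of that last-reference pattern, with a trivial struct standing in for the namespace:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ns { atomic_int count; /* ... owned resources ... */ };

static void put_ns(struct ns *ns)
{
        /* fetch_sub returns the old value; 1 means we dropped the last ref */
        if (atomic_fetch_sub(&ns->count, 1) != 1)
                return;
        /* last reference: only now take whatever locks teardown needs */
        printf("tearing down %p\n", (void *)ns);
        free(ns);
}

int main(void)
{
        struct ns *ns = malloc(sizeof(*ns));
        atomic_init(&ns->count, 2);
        put_ns(ns);     /* still referenced elsewhere: cheap, lock-free */
        put_ns(ns);     /* last put performs the teardown */
        return 0;
}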
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b8b5b30d53f0..7edfcd4d5e52 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -15,7 +15,6 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/mm.h> 19#include <linux/mm.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6a7d901f1936..1daabb90e0a5 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -15,7 +15,6 @@
15#include <linux/fcntl.h> 15#include <linux/fcntl.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/sched.h> 19#include <linux/sched.h>
21#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 0d58caf4a6e1..60a5e2864ea8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -15,6 +15,7 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/slab.h>
18#include <linux/highuid.h> 19#include <linux/highuid.h>
19#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
20#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
@@ -835,7 +836,7 @@ static int ncp_ioctl_need_write(unsigned int cmd)
835 case NCP_IOC_SETROOT: 836 case NCP_IOC_SETROOT:
836 return 0; 837 return 0;
837 default: 838 default:
838 /* unkown IOCTL command, assume write */ 839 /* unknown IOCTL command, assume write */
839 return 1; 840 return 1;
840 } 841 }
841} 842}
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458decdb8a..56f5b3a0e1ee 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -9,12 +9,12 @@
9#include <linux/stat.h> 9#include <linux/stat.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/shm.h> 14#include <linux/shm.h>
14#include <linux/errno.h> 15#include <linux/errno.h>
15#include <linux/mman.h> 16#include <linux/mman.h>
16#include <linux/string.h> 17#include <linux/string.h>
17#include <linux/slab.h>
18#include <linux/fcntl.h> 18#include <linux/fcntl.h>
19#include <linux/ncp_fs.h> 19#include <linux/ncp_fs.h>
20 20
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e37df8d5fe70..c7ff6c700a6e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/netdevice.h> 22#include <linux/netdevice.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/slab.h>
24#include <net/scm.h> 25#include <net/scm.h>
25#include <net/sock.h> 26#include <net/sock.h>
26#include <linux/ipx.h> 27#include <linux/ipx.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e3d26c1bd105..c634fd17b337 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/ncp_fs.h> 28#include <linux/ncp_fs.h>
29#include <linux/time.h> 29#include <linux/time.h>
30#include <linux/slab.h>
30#include <linux/mm.h> 31#include <linux/mm.h>
31#include <linux/stat.h> 32#include <linux/stat.h>
32#include "ncplib_kernel.h" 33#include "ncplib_kernel.h"
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 2a77bc25d5af..a43d07e7b924 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -90,13 +90,12 @@ config ROOT_NFS
90 If you want your system to mount its root file system via NFS, 90 If you want your system to mount its root file system via NFS,
91 choose Y here. This is common practice for managing systems 91 choose Y here. This is common practice for managing systems
92 without local permanent storage. For details, read 92 without local permanent storage. For details, read
93 <file:Documentation/filesystems/nfsroot.txt>. 93 <file:Documentation/filesystems/nfs/nfsroot.txt>.
94 94
95 Most people say N here. 95 Most people say N here.
96 96
97config NFS_FSCACHE 97config NFS_FSCACHE
98 bool "Provide NFS client caching support (EXPERIMENTAL)" 98 bool "Provide NFS client caching support"
99 depends on EXPERIMENTAL
100 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y 99 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
101 help 100 help
102 Say Y here if you want NFS data to be cached locally on disc through 101 Say Y here if you want NFS data to be cached locally on disc through
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b4ffd0146ea6..84690319e625 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -10,6 +10,7 @@
10#include <linux/moduleparam.h> 10#include <linux/moduleparam.h>
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/slab.h>
13#include <linux/sunrpc/cache.h> 14#include <linux/sunrpc/cache.h>
14#include <linux/sunrpc/rpc_pipe_fs.h> 15#include <linux/sunrpc/rpc_pipe_fs.h>
15 16
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 293fa0528a6e..36dfdae95123 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,11 +78,6 @@ nfs4_callback_svc(void *vrqstp)
78 78
79 set_freezable(); 79 set_freezable();
80 80
81 /*
82 * FIXME: do we really need to run this under the BKL? If so, please
83 * add a comment about what it's intended to protect.
84 */
85 lock_kernel();
86 while (!kthread_should_stop()) { 81 while (!kthread_should_stop()) {
87 /* 82 /*
88 * Listen for a request on the socket 83 * Listen for a request on the socket
@@ -104,7 +99,6 @@ nfs4_callback_svc(void *vrqstp)
104 preverr = err; 99 preverr = err;
105 svc_process(rqstp); 100 svc_process(rqstp);
106 } 101 }
107 unlock_kernel();
108 return 0; 102 return 0;
109} 103}
110 104
@@ -124,7 +118,6 @@ nfs4_callback_up(struct svc_serv *serv)
124 dprintk("NFS: Callback listener port = %u (af %u)\n", 118 dprintk("NFS: Callback listener port = %u (af %u)\n",
125 nfs_callback_tcpport, PF_INET); 119 nfs_callback_tcpport, PF_INET);
126 120
127#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
128 ret = svc_create_xprt(serv, "tcp", PF_INET6, 121 ret = svc_create_xprt(serv, "tcp", PF_INET6,
129 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 122 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
130 if (ret > 0) { 123 if (ret > 0) {
@@ -135,7 +128,6 @@ nfs4_callback_up(struct svc_serv *serv)
135 ret = 0; 128 ret = 0;
136 else 129 else
137 goto out_err; 130 goto out_err;
138#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
139 131
140 return svc_prepare_thread(serv, &serv->sv_pools[0]); 132 return svc_prepare_thread(serv, &serv->sv_pools[0]);
141 133
@@ -160,11 +152,6 @@ nfs41_callback_svc(void *vrqstp)
160 152
161 set_freezable(); 153 set_freezable();
162 154
163 /*
164 * FIXME: do we really need to run this under the BKL? If so, please
165 * add a comment about what it's intended to protect.
166 */
167 lock_kernel();
168 while (!kthread_should_stop()) { 155 while (!kthread_should_stop()) {
169 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); 156 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
170 spin_lock_bh(&serv->sv_cb_lock); 157 spin_lock_bh(&serv->sv_cb_lock);
@@ -183,7 +170,6 @@ nfs41_callback_svc(void *vrqstp)
183 } 170 }
184 finish_wait(&serv->sv_cb_waitq, &wq); 171 finish_wait(&serv->sv_cb_waitq, &wq);
185 } 172 }
186 unlock_kernel();
187 return 0; 173 return 0;
188} 174}
189 175
@@ -397,6 +383,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
397 */ 383 */
398static struct svc_version *nfs4_callback_version[] = { 384static struct svc_version *nfs4_callback_version[] = {
399 [1] = &nfs4_callback_version1, 385 [1] = &nfs4_callback_version1,
386 [4] = &nfs4_callback_version4,
400}; 387};
401 388
402static struct svc_stat nfs4_callback_stats; 389static struct svc_stat nfs4_callback_stats;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 07baa8254ca1..85a7cfd1b8dd 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -106,6 +106,27 @@ struct cb_sequenceres {
106extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, 106extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
107 struct cb_sequenceres *res); 107 struct cb_sequenceres *res);
108 108
109extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
110 const nfs4_stateid *stateid);
111
112#define RCA4_TYPE_MASK_RDATA_DLG 0
113#define RCA4_TYPE_MASK_WDATA_DLG 1
114
115struct cb_recallanyargs {
116 struct sockaddr *craa_addr;
117 uint32_t craa_objs_to_keep;
118 uint32_t craa_type_mask;
119};
120
121extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy);
122
123struct cb_recallslotargs {
124 struct sockaddr *crsa_addr;
125 uint32_t crsa_target_max_slots;
126};
127extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,
128 void *dummy);
129
109#endif /* CONFIG_NFS_V4_1 */ 130#endif /* CONFIG_NFS_V4_1 */
110 131
111extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 132extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
@@ -114,8 +135,9 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
114#ifdef CONFIG_NFS_V4 135#ifdef CONFIG_NFS_V4
115extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); 136extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
116extern void nfs_callback_down(int minorversion); 137extern void nfs_callback_down(int minorversion);
138extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
139 const nfs4_stateid *stateid);
117#endif /* CONFIG_NFS_V4 */ 140#endif /* CONFIG_NFS_V4 */
118
119/* 141/*
120 * nfs41: Callbacks are expected to not cause substantial latency, 142 * nfs41: Callbacks are expected to not cause substantial latency,
121 * so we limit their concurrency to 1 by setting up the maximum number 143 * so we limit their concurrency to 1 by setting up the maximum number
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index b7da1f54da68..a08770a7e857 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include <linux/nfs4.h> 8#include <linux/nfs4.h>
9#include <linux/nfs_fs.h> 9#include <linux/nfs_fs.h>
10#include <linux/slab.h>
10#include "nfs4_fs.h" 11#include "nfs4_fs.h"
11#include "callback.h" 12#include "callback.h"
12#include "delegation.h" 13#include "delegation.h"
@@ -61,6 +62,16 @@ out:
61 return res->status; 62 return res->status;
62} 63}
63 64
65static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *)
66{
67#if defined(CONFIG_NFS_V4_1)
68 if (clp->cl_minorversion > 0)
69 return nfs41_validate_delegation_stateid;
70#endif
71 return nfs4_validate_delegation_stateid;
72}
73
74
64__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 75__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
65{ 76{
66 struct nfs_client *clp; 77 struct nfs_client *clp;
@@ -81,7 +92,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
81 inode = nfs_delegation_find_inode(clp, &args->fh); 92 inode = nfs_delegation_find_inode(clp, &args->fh);
82 if (inode != NULL) { 93 if (inode != NULL) {
83 /* Set up a helper thread to actually return the delegation */ 94 /* Set up a helper thread to actually return the delegation */
84 switch(nfs_async_inode_return_delegation(inode, &args->stateid)) { 95 switch (nfs_async_inode_return_delegation(inode, &args->stateid,
96 nfs_validate_delegation_stateid(clp))) {
85 case 0: 97 case 0:
86 res = 0; 98 res = 0;
87 break; 99 break;
@@ -102,51 +114,79 @@ out:
102 return res; 114 return res;
103} 115}
104 116
117int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
118{
119 if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
120 sizeof(delegation->stateid.data)) != 0)
121 return 0;
122 return 1;
123}
124
105#if defined(CONFIG_NFS_V4_1) 125#if defined(CONFIG_NFS_V4_1)
106 126
127int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
128{
129 if (delegation == NULL)
130 return 0;
131
132 /* seqid is 4-bytes long */
133 if (((u32 *) &stateid->data)[0] != 0)
134 return 0;
135 if (memcmp(&delegation->stateid.data[4], &stateid->data[4],
136 sizeof(stateid->data)-4))
137 return 0;
138
139 return 1;
140}
141
107/* 142/*
108 * Validate the sequenceID sent by the server. 143 * Validate the sequenceID sent by the server.
109 * Return success if the sequenceID is one more than what we last saw on 144 * Return success if the sequenceID is one more than what we last saw on
110 * this slot, accounting for wraparound. Increments the slot's sequence. 145 * this slot, accounting for wraparound. Increments the slot's sequence.
111 * 146 *
112 * We don't yet implement a duplicate request cache, so at this time 147 * We don't yet implement a duplicate request cache, instead we set the
113 * we will log replays, and process them as if we had not seen them before, 148 * back channel ca_maxresponsesize_cached to zero. This is OK for now
114 * but we don't bump the sequence in the slot. Not too worried about it,
115 * since we only currently implement idempotent callbacks anyway. 149 * since we only currently implement idempotent callbacks anyway.
116 * 150 *
117 * We have a single slot backchannel at this time, so we don't bother 151 * We have a single slot backchannel at this time, so we don't bother
118 * checking the used_slots bit array on the table. The lower layer guarantees 152 * checking the used_slots bit array on the table. The lower layer guarantees
119 * a single outstanding callback request at a time. 153 * a single outstanding callback request at a time.
120 */ 154 */
121static int 155static __be32
122validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid) 156validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
123{ 157{
124 struct nfs4_slot *slot; 158 struct nfs4_slot *slot;
125 159
126 dprintk("%s enter. slotid %d seqid %d\n", 160 dprintk("%s enter. slotid %d seqid %d\n",
127 __func__, slotid, seqid); 161 __func__, args->csa_slotid, args->csa_sequenceid);
128 162
129 if (slotid > NFS41_BC_MAX_CALLBACKS) 163 if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS)
130 return htonl(NFS4ERR_BADSLOT); 164 return htonl(NFS4ERR_BADSLOT);
131 165
132 slot = tbl->slots + slotid; 166 slot = tbl->slots + args->csa_slotid;
133 dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); 167 dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr);
134 168
135 /* Normal */ 169 /* Normal */
136 if (likely(seqid == slot->seq_nr + 1)) { 170 if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
137 slot->seq_nr++; 171 slot->seq_nr++;
138 return htonl(NFS4_OK); 172 return htonl(NFS4_OK);
139 } 173 }
140 174
141 /* Replay */ 175 /* Replay */
142 if (seqid == slot->seq_nr) { 176 if (args->csa_sequenceid == slot->seq_nr) {
143 dprintk("%s seqid %d is a replay - no DRC available\n", 177 dprintk("%s seqid %d is a replay\n",
144 __func__, seqid); 178 __func__, args->csa_sequenceid);
145 return htonl(NFS4_OK); 179 /* Signal process_op to set this error on next op */
180 if (args->csa_cachethis == 0)
181 return htonl(NFS4ERR_RETRY_UNCACHED_REP);
182
183 /* The ca_maxresponsesize_cached is 0 with no DRC */
184 else if (args->csa_cachethis == 1)
185 return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
146 } 186 }
147 187
148 /* Wraparound */ 188 /* Wraparound */
149 if (seqid == 1 && (slot->seq_nr + 1) == 0) { 189 if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
150 slot->seq_nr = 1; 190 slot->seq_nr = 1;
151 return htonl(NFS4_OK); 191 return htonl(NFS4_OK);
152 } 192 }
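
The rewritten validate_seqid() above enforces three cases: seq_nr+1 is the normal next request; an equal sequence number is a replay, answered differently depending on whether the client asked for the reply to be cached (there is no duplicate request cache, so a cached-reply request cannot be honored); and 1 following a slot at UINT32_MAX is legal wraparound. A stand-alone rendering of those rules; the error codes are local stand-ins for the NFS4ERR_* values:

#include <stdint.h>
#include <stdio.h>

enum {
        SEQ_OK = 0,
        SEQ_RETRY_UNCACHED,     /* replay, reply was not cached */
        SEQ_TOO_BIG_TO_CACHE,   /* replay, caching was requested but DRC is 0 */
        SEQ_MISORDERED
};

static int validate_seq(uint32_t *slot_seq, uint32_t seq, int cachethis)
{
        if (seq == (uint32_t)(*slot_seq + 1)) { /* normal: advance the slot */
                (*slot_seq)++;
                return SEQ_OK;
        }
        if (seq == *slot_seq)                   /* replay of the last request */
                return cachethis ? SEQ_TOO_BIG_TO_CACHE : SEQ_RETRY_UNCACHED;
        if (seq == 1 && (uint32_t)(*slot_seq + 1) == 0) {
                *slot_seq = 1;                  /* wraparound past UINT32_MAX */
                return SEQ_OK;
        }
        return SEQ_MISORDERED;
}

int main(void)
{
        uint32_t slot = UINT32_MAX;
        printf("%d\n", validate_seq(&slot, 1, 0));  /* wraparound -> 0 */
        printf("%d\n", validate_seq(&slot, 1, 0));  /* replay     -> 1 */
        return 0;
}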
@@ -191,27 +231,87 @@ validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid)
191 return NULL; 231 return NULL;
192} 232}
193 233
194/* FIXME: referring calls should be processed */ 234/*
195unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, 235 * For each referring call triple, check the session's slot table for
236 * a match. If the slot is in use and the sequence numbers match, the
237 * client is still waiting for a response to the original request.
238 */
239static bool referring_call_exists(struct nfs_client *clp,
240 uint32_t nrclists,
241 struct referring_call_list *rclists)
242{
243 bool status = 0;
244 int i, j;
245 struct nfs4_session *session;
246 struct nfs4_slot_table *tbl;
247 struct referring_call_list *rclist;
248 struct referring_call *ref;
249
250 /*
251 * XXX When client trunking is implemented, this becomes
252 * a session lookup from within the loop
253 */
254 session = clp->cl_session;
255 tbl = &session->fc_slot_table;
256
257 for (i = 0; i < nrclists; i++) {
258 rclist = &rclists[i];
259 if (memcmp(session->sess_id.data,
260 rclist->rcl_sessionid.data,
261 NFS4_MAX_SESSIONID_LEN) != 0)
262 continue;
263
264 for (j = 0; j < rclist->rcl_nrefcalls; j++) {
265 ref = &rclist->rcl_refcalls[j];
266
267 dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u "
268 "slotid %u\n", __func__,
269 ((u32 *)&rclist->rcl_sessionid.data)[0],
270 ((u32 *)&rclist->rcl_sessionid.data)[1],
271 ((u32 *)&rclist->rcl_sessionid.data)[2],
272 ((u32 *)&rclist->rcl_sessionid.data)[3],
273 ref->rc_sequenceid, ref->rc_slotid);
274
275 spin_lock(&tbl->slot_tbl_lock);
276 status = (test_bit(ref->rc_slotid, tbl->used_slots) &&
277 tbl->slots[ref->rc_slotid].seq_nr ==
278 ref->rc_sequenceid);
279 spin_unlock(&tbl->slot_tbl_lock);
280 if (status)
281 goto out;
282 }
283 }
284
285out:
286 return status;
287}
288
289__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
196 struct cb_sequenceres *res) 290 struct cb_sequenceres *res)
197{ 291{
198 struct nfs_client *clp; 292 struct nfs_client *clp;
199 int i, status; 293 int i;
200 294 __be32 status;
201 for (i = 0; i < args->csa_nrclists; i++)
202 kfree(args->csa_rclists[i].rcl_refcalls);
203 kfree(args->csa_rclists);
204 295
205 status = htonl(NFS4ERR_BADSESSION); 296 status = htonl(NFS4ERR_BADSESSION);
206 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); 297 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
207 if (clp == NULL) 298 if (clp == NULL)
208 goto out; 299 goto out;
209 300
210 status = validate_seqid(&clp->cl_session->bc_slot_table, 301 status = validate_seqid(&clp->cl_session->bc_slot_table, args);
211 args->csa_slotid, args->csa_sequenceid);
212 if (status) 302 if (status)
213 goto out_putclient; 303 goto out_putclient;
214 304
305 /*
306 * Check for pending referring calls. If a match is found, a
307 * related callback was received before the response to the original
308 * call.
309 */
310 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
311 status = htonl(NFS4ERR_DELAY);
312 goto out_putclient;
313 }
314
215 memcpy(&res->csr_sessionid, &args->csa_sessionid, 315 memcpy(&res->csr_sessionid, &args->csa_sessionid,
216 sizeof(res->csr_sessionid)); 316 sizeof(res->csr_sessionid));
217 res->csr_sequenceid = args->csa_sequenceid; 317 res->csr_sequenceid = args->csa_sequenceid;
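
referring_call_exists() above answers one question: does any (sessionid, slotid, sequenceid) triple supplied by the server match a request this client still has in flight? If so, the callback arrived before the reply to the original call and must be delayed. Below is a reduced sketch of that nested scan; the types are hypothetical, and the real code also consults the slot table's used_slots bitmap under slot_tbl_lock.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define SESSIONID_LEN 16

struct ref_call { uint32_t slotid, seq; };
struct ref_list {
        unsigned char sessionid[SESSIONID_LEN];
        int nrefcalls;
        const struct ref_call *refcalls;
};

static bool referring_call_pending(const unsigned char *my_session,
                                   const uint32_t *slot_seq, uint32_t nslots,
                                   const struct ref_list *lists, int nlists)
{
        for (int i = 0; i < nlists; i++) {
                if (memcmp(my_session, lists[i].sessionid, SESSIONID_LEN))
                        continue;       /* another session: not our calls */
                for (int j = 0; j < lists[i].nrefcalls; j++) {
                        const struct ref_call *rc = &lists[i].refcalls[j];
                        if (rc->slotid < nslots &&
                            slot_seq[rc->slotid] == rc->seq)
                                return true;    /* original call still open */
                }
        }
        return false;
}

int main(void)
{
        unsigned char sid[SESSIONID_LEN] = { 1 };
        uint32_t slots[2] = { 7, 9 };
        struct ref_call rc = { .slotid = 1, .seq = 9 };
        struct ref_list rl = { { 1 }, 1, &rc };
        printf("%d\n", referring_call_pending(sid, slots, 2, &rl, 1));
        return 0;
}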
@@ -222,9 +322,81 @@ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
222out_putclient: 322out_putclient:
223 nfs_put_client(clp); 323 nfs_put_client(clp);
224out: 324out:
325 for (i = 0; i < args->csa_nrclists; i++)
326 kfree(args->csa_rclists[i].rcl_refcalls);
327 kfree(args->csa_rclists);
328
329 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP))
330 res->csr_status = 0;
331 else
332 res->csr_status = status;
333 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
334 ntohl(status), ntohl(res->csr_status));
335 return status;
336}
337
338__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
339{
340 struct nfs_client *clp;
341 __be32 status;
342 fmode_t flags = 0;
343
344 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
345 clp = nfs_find_client(args->craa_addr, 4);
346 if (clp == NULL)
347 goto out;
348
349 dprintk("NFS: RECALL_ANY callback request from %s\n",
350 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
351
352 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
353 &args->craa_type_mask))
354 flags = FMODE_READ;
355 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
356 &args->craa_type_mask))
357 flags |= FMODE_WRITE;
358
359 if (flags)
360 nfs_expire_all_delegation_types(clp, flags);
361 status = htonl(NFS4_OK);
362out:
225 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 363 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
226 res->csr_status = status; 364 return status;
227 return res->csr_status;
228} 365}
229 366
367/* Reduce the fore channel's max_slots to the target value */
368__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy)
369{
370 struct nfs_client *clp;
371 struct nfs4_slot_table *fc_tbl;
372 __be32 status;
373
374 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
375 clp = nfs_find_client(args->crsa_addr, 4);
376 if (clp == NULL)
377 goto out;
378
379 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
380 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
381 args->crsa_target_max_slots);
382
383 fc_tbl = &clp->cl_session->fc_slot_table;
384
385 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
386 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
387 args->crsa_target_max_slots < 1)
388 goto out_putclient;
389
390 status = htonl(NFS4_OK);
391 if (args->crsa_target_max_slots == fc_tbl->max_slots)
392 goto out_putclient;
393
394 fc_tbl->target_max_slots = args->crsa_target_max_slots;
395 nfs41_handle_recall_slot(clp);
396out_putclient:
397 nfs_put_client(clp); /* balance nfs_find_client */
398out:
399 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
400 return status;
401}
230#endif /* CONFIG_NFS_V4_1 */ 402#endif /* CONFIG_NFS_V4_1 */
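
CB_RECALL_ANY hands the client a bitmask of delegation classes to shed, and nfs4_callback_recallany() above translates the RCA4_TYPE_MASK_* bit positions into fmode flags before expiring delegations. The same translation in stand-alone C; the FMODE_* values are chosen to match the kernel's fmode_t bits of this era but are only illustrative here.

#include <stdio.h>

#define RCA4_TYPE_MASK_RDATA_DLG 0
#define RCA4_TYPE_MASK_WDATA_DLG 1
#define FMODE_READ  0x1
#define FMODE_WRITE 0x2

static unsigned recall_any_flags(unsigned type_mask)
{
        unsigned flags = 0;

        if (type_mask & (1u << RCA4_TYPE_MASK_RDATA_DLG))
                flags |= FMODE_READ;
        if (type_mask & (1u << RCA4_TYPE_MASK_WDATA_DLG))
                flags |= FMODE_WRITE;
        return flags;   /* nonzero -> expire delegations of these types */
}

int main(void)
{
        printf("%#x\n", recall_any_flags(0x3));   /* both classes -> 0x3 */
        return 0;
}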
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 76b0aa0f73bf..05af212f0edf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,7 @@
9#include <linux/sunrpc/svc.h> 9#include <linux/sunrpc/svc.h>
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/slab.h>
12#include "nfs4_fs.h" 13#include "nfs4_fs.h"
13#include "callback.h" 14#include "callback.h"
14 15
@@ -23,10 +24,15 @@
23#if defined(CONFIG_NFS_V4_1) 24#if defined(CONFIG_NFS_V4_1)
24#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 25#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
25 4 + 1 + 3) 26 4 + 1 + 3)
27#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
28#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
26#endif /* CONFIG_NFS_V4_1 */ 29#endif /* CONFIG_NFS_V4_1 */
27 30
28#define NFSDBG_FACILITY NFSDBG_CALLBACK 31#define NFSDBG_FACILITY NFSDBG_CALLBACK
29 32
33/* Internal error code */
34#define NFS4ERR_RESOURCE_HDR 11050
35
30typedef __be32 (*callback_process_op_t)(void *, void *); 36typedef __be32 (*callback_process_op_t)(void *, void *);
31typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); 37typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
32typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); 38typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
@@ -172,7 +178,7 @@ static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
172 __be32 *p; 178 __be32 *p;
173 p = read_buf(xdr, 4); 179 p = read_buf(xdr, 4);
174 if (unlikely(p == NULL)) 180 if (unlikely(p == NULL))
175 return htonl(NFS4ERR_RESOURCE); 181 return htonl(NFS4ERR_RESOURCE_HDR);
176 *op = ntohl(*p); 182 *op = ntohl(*p);
177 return 0; 183 return 0;
178} 184}
@@ -214,10 +220,10 @@ out:
214 220
215#if defined(CONFIG_NFS_V4_1) 221#if defined(CONFIG_NFS_V4_1)
216 222
217static unsigned decode_sessionid(struct xdr_stream *xdr, 223static __be32 decode_sessionid(struct xdr_stream *xdr,
218 struct nfs4_sessionid *sid) 224 struct nfs4_sessionid *sid)
219{ 225{
220 uint32_t *p; 226 __be32 *p;
221 int len = NFS4_MAX_SESSIONID_LEN; 227 int len = NFS4_MAX_SESSIONID_LEN;
222 228
223 p = read_buf(xdr, len); 229 p = read_buf(xdr, len);
@@ -228,12 +234,12 @@ static unsigned decode_sessionid(struct xdr_stream *xdr,
228 return 0; 234 return 0;
229} 235}
230 236
231static unsigned decode_rc_list(struct xdr_stream *xdr, 237static __be32 decode_rc_list(struct xdr_stream *xdr,
232 struct referring_call_list *rc_list) 238 struct referring_call_list *rc_list)
233{ 239{
234 uint32_t *p; 240 __be32 *p;
235 int i; 241 int i;
236 unsigned status; 242 __be32 status;
237 243
238 status = decode_sessionid(xdr, &rc_list->rcl_sessionid); 244 status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
239 if (status) 245 if (status)
@@ -266,13 +272,13 @@ out:
266 return status; 272 return status;
267} 273}
268 274
269static unsigned decode_cb_sequence_args(struct svc_rqst *rqstp, 275static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
270 struct xdr_stream *xdr, 276 struct xdr_stream *xdr,
271 struct cb_sequenceargs *args) 277 struct cb_sequenceargs *args)
272{ 278{
273 uint32_t *p; 279 __be32 *p;
274 int i; 280 int i;
275 unsigned status; 281 __be32 status;
276 282
277 status = decode_sessionid(xdr, &args->csa_sessionid); 283 status = decode_sessionid(xdr, &args->csa_sessionid);
278 if (status) 284 if (status)
@@ -326,6 +332,39 @@ out_free:
326 goto out; 332 goto out;
327} 333}
328 334
335static __be32 decode_recallany_args(struct svc_rqst *rqstp,
336 struct xdr_stream *xdr,
337 struct cb_recallanyargs *args)
338{
339 __be32 *p;
340
341 args->craa_addr = svc_addr(rqstp);
342 p = read_buf(xdr, 4);
343 if (unlikely(p == NULL))
344 return htonl(NFS4ERR_BADXDR);
345 args->craa_objs_to_keep = ntohl(*p++);
346 p = read_buf(xdr, 4);
347 if (unlikely(p == NULL))
348 return htonl(NFS4ERR_BADXDR);
349 args->craa_type_mask = ntohl(*p);
350
351 return 0;
352}
353
354static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
355 struct xdr_stream *xdr,
356 struct cb_recallslotargs *args)
357{
358 __be32 *p;
359
360 args->crsa_addr = svc_addr(rqstp);
361 p = read_buf(xdr, 4);
362 if (unlikely(p == NULL))
363 return htonl(NFS4ERR_BADXDR);
364 args->crsa_target_max_slots = ntohl(*p++);
365 return 0;
366}
367
329#endif /* CONFIG_NFS_V4_1 */ 368#endif /* CONFIG_NFS_V4_1 */
330 369
331static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 370static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
@@ -445,7 +484,7 @@ static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
445 484
446 p = xdr_reserve_space(xdr, 8); 485 p = xdr_reserve_space(xdr, 8);
447 if (unlikely(p == NULL)) 486 if (unlikely(p == NULL))
448 return htonl(NFS4ERR_RESOURCE); 487 return htonl(NFS4ERR_RESOURCE_HDR);
449 *p++ = htonl(op); 488 *p++ = htonl(op);
450 *p = res; 489 *p = res;
451 return 0; 490 return 0;
@@ -479,10 +518,10 @@ out:
479 518
480#if defined(CONFIG_NFS_V4_1) 519#if defined(CONFIG_NFS_V4_1)
481 520
482static unsigned encode_sessionid(struct xdr_stream *xdr, 521static __be32 encode_sessionid(struct xdr_stream *xdr,
483 const struct nfs4_sessionid *sid) 522 const struct nfs4_sessionid *sid)
484{ 523{
485 uint32_t *p; 524 __be32 *p;
486 int len = NFS4_MAX_SESSIONID_LEN; 525 int len = NFS4_MAX_SESSIONID_LEN;
487 526
488 p = xdr_reserve_space(xdr, len); 527 p = xdr_reserve_space(xdr, len);
@@ -493,11 +532,11 @@ static unsigned encode_sessionid(struct xdr_stream *xdr,
493 return 0; 532 return 0;
494} 533}
495 534
496static unsigned encode_cb_sequence_res(struct svc_rqst *rqstp, 535static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
497 struct xdr_stream *xdr, 536 struct xdr_stream *xdr,
498 const struct cb_sequenceres *res) 537 const struct cb_sequenceres *res)
499{ 538{
500 uint32_t *p; 539 __be32 *p;
501 unsigned status = res->csr_status; 540 unsigned status = res->csr_status;
502 541
503 if (unlikely(status != 0)) 542 if (unlikely(status != 0))
@@ -533,6 +572,8 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
533 case OP_CB_GETATTR: 572 case OP_CB_GETATTR:
534 case OP_CB_RECALL: 573 case OP_CB_RECALL:
535 case OP_CB_SEQUENCE: 574 case OP_CB_SEQUENCE:
575 case OP_CB_RECALL_ANY:
576 case OP_CB_RECALL_SLOT:
536 *op = &callback_ops[op_nr]; 577 *op = &callback_ops[op_nr];
537 break; 578 break;
538 579
@@ -540,9 +581,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
540 case OP_CB_NOTIFY_DEVICEID: 581 case OP_CB_NOTIFY_DEVICEID:
541 case OP_CB_NOTIFY: 582 case OP_CB_NOTIFY:
542 case OP_CB_PUSH_DELEG: 583 case OP_CB_PUSH_DELEG:
543 case OP_CB_RECALL_ANY:
544 case OP_CB_RECALLABLE_OBJ_AVAIL: 584 case OP_CB_RECALLABLE_OBJ_AVAIL:
545 case OP_CB_RECALL_SLOT:
546 case OP_CB_WANTS_CANCELLED: 585 case OP_CB_WANTS_CANCELLED:
547 case OP_CB_NOTIFY_LOCK: 586 case OP_CB_NOTIFY_LOCK:
548 return htonl(NFS4ERR_NOTSUPP); 587 return htonl(NFS4ERR_NOTSUPP);
@@ -582,20 +621,18 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
582static __be32 process_op(uint32_t minorversion, int nop, 621static __be32 process_op(uint32_t minorversion, int nop,
583 struct svc_rqst *rqstp, 622 struct svc_rqst *rqstp,
584 struct xdr_stream *xdr_in, void *argp, 623 struct xdr_stream *xdr_in, void *argp,
585 struct xdr_stream *xdr_out, void *resp) 624 struct xdr_stream *xdr_out, void *resp, int* drc_status)
586{ 625{
587 struct callback_op *op = &callback_ops[0]; 626 struct callback_op *op = &callback_ops[0];
588 unsigned int op_nr = OP_CB_ILLEGAL; 627 unsigned int op_nr;
589 __be32 status; 628 __be32 status;
590 long maxlen; 629 long maxlen;
591 __be32 res; 630 __be32 res;
592 631
593 dprintk("%s: start\n", __func__); 632 dprintk("%s: start\n", __func__);
594 status = decode_op_hdr(xdr_in, &op_nr); 633 status = decode_op_hdr(xdr_in, &op_nr);
595 if (unlikely(status)) { 634 if (unlikely(status))
596 status = htonl(NFS4ERR_OP_ILLEGAL); 635 return status;
597 goto out;
598 }
599 636
600 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", 637 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
601 __func__, minorversion, nop, op_nr); 638 __func__, minorversion, nop, op_nr);
@@ -604,19 +641,32 @@ static __be32 process_op(uint32_t minorversion, int nop,
604 preprocess_nfs4_op(op_nr, &op); 641 preprocess_nfs4_op(op_nr, &op);
605 if (status == htonl(NFS4ERR_OP_ILLEGAL)) 642 if (status == htonl(NFS4ERR_OP_ILLEGAL))
606 op_nr = OP_CB_ILLEGAL; 643 op_nr = OP_CB_ILLEGAL;
607out: 644 if (status)
645 goto encode_hdr;
646
647 if (*drc_status) {
648 status = *drc_status;
649 goto encode_hdr;
650 }
651
608 maxlen = xdr_out->end - xdr_out->p; 652 maxlen = xdr_out->end - xdr_out->p;
609 if (maxlen > 0 && maxlen < PAGE_SIZE) { 653 if (maxlen > 0 && maxlen < PAGE_SIZE) {
610 if (likely(status == 0 && op->decode_args != NULL)) 654 status = op->decode_args(rqstp, xdr_in, argp);
611 status = op->decode_args(rqstp, xdr_in, argp); 655 if (likely(status == 0))
612 if (likely(status == 0 && op->process_op != NULL))
613 status = op->process_op(argp, resp); 656 status = op->process_op(argp, resp);
614 } else 657 } else
615 status = htonl(NFS4ERR_RESOURCE); 658 status = htonl(NFS4ERR_RESOURCE);
616 659
660 /* Only set by OP_CB_SEQUENCE processing */
661 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
662 *drc_status = status;
663 status = 0;
664 }
665
666encode_hdr:
617 res = encode_op_hdr(xdr_out, op_nr, status); 667 res = encode_op_hdr(xdr_out, op_nr, status);
618 if (status == 0) 668 if (unlikely(res))
619 status = res; 669 return res;
620 if (op->encode_res != NULL && status == 0) 670 if (op->encode_res != NULL && status == 0)
621 status = op->encode_res(rqstp, xdr_out, resp); 671 status = op->encode_res(rqstp, xdr_out, resp);
622 dprintk("%s: done, status = %d\n", __func__, ntohl(status)); 672 dprintk("%s: done, status = %d\n", __func__, ntohl(status));
@@ -632,7 +682,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
632 struct cb_compound_hdr_res hdr_res = { NULL }; 682 struct cb_compound_hdr_res hdr_res = { NULL };
633 struct xdr_stream xdr_in, xdr_out; 683 struct xdr_stream xdr_in, xdr_out;
634 __be32 *p; 684 __be32 *p;
635 __be32 status; 685 __be32 status, drc_status = 0;
636 unsigned int nops = 0; 686 unsigned int nops = 0;
637 687
638 dprintk("%s: start\n", __func__); 688 dprintk("%s: start\n", __func__);
@@ -652,11 +702,18 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
652 return rpc_system_err; 702 return rpc_system_err;
653 703
654 while (status == 0 && nops != hdr_arg.nops) { 704 while (status == 0 && nops != hdr_arg.nops) {
655 status = process_op(hdr_arg.minorversion, nops, 705 status = process_op(hdr_arg.minorversion, nops, rqstp,
656 rqstp, &xdr_in, argp, &xdr_out, resp); 706 &xdr_in, argp, &xdr_out, resp, &drc_status);
657 nops++; 707 nops++;
658 } 708 }
659 709
710 /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return
711 * resource error in cb_compound status without returning op */
712 if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) {
713 status = htonl(NFS4ERR_RESOURCE);
714 nops--;
715 }
716
660 *hdr_res.status = status; 717 *hdr_res.status = status;
661 *hdr_res.nops = htonl(nops); 718 *hdr_res.nops = htonl(nops);
662 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 719 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
@@ -688,6 +745,16 @@ static struct callback_op callback_ops[] = {
688 .encode_res = (callback_encode_res_t)encode_cb_sequence_res, 745 .encode_res = (callback_encode_res_t)encode_cb_sequence_res,
689 .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ, 746 .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ,
690 }, 747 },
748 [OP_CB_RECALL_ANY] = {
749 .process_op = (callback_process_op_t)nfs4_callback_recallany,
750 .decode_args = (callback_decode_arg_t)decode_recallany_args,
751 .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ,
752 },
753 [OP_CB_RECALL_SLOT] = {
754 .process_op = (callback_process_op_t)nfs4_callback_recallslot,
755 .decode_args = (callback_decode_arg_t)decode_recallslot_args,
756 .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
757 },
691#endif /* CONFIG_NFS_V4_1 */ 758#endif /* CONFIG_NFS_V4_1 */
692}; 759};
693 760
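
The OP_CB_RECALL_ANY and OP_CB_RECALL_SLOT entries extend an opcode-indexed dispatch table: each slot bundles the decode, process, and encode handlers plus a maximum reply size, and sparse designated initializers keep unimplemented opcodes NULL. A compilable sketch of the same pattern, with all names invented for illustration:

    #include <stdio.h>

    typedef int (*decode_fn)(const char *in);
    typedef int (*process_fn)(void);

    struct op_entry {
        decode_fn  decode_args;
        process_fn process_op;
        unsigned   res_maxsize;
    };

    static int decode_recallany(const char *in) { printf("decode %s\n", in); return 0; }
    static int do_recallany(void)               { printf("recall any\n");   return 0; }

    enum { OP_RECALL_ANY = 4, OP_MAX };

    /* Designated initializers keep the table sparse and self-documenting,
     * the same style the patch uses for the two new callback ops. */
    static const struct op_entry ops[OP_MAX] = {
        [OP_RECALL_ANY] = {
            .decode_args = decode_recallany,
            .process_op  = do_recallany,
            .res_maxsize = 16,
        },
    };

    int main(void)
    {
        const struct op_entry *op = &ops[OP_RECALL_ANY];
        if (op->decode_args && op->decode_args("args") == 0 && op->process_op)
            op->process_op();
        return 0;
    }
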
@@ -716,5 +783,13 @@ struct svc_version nfs4_callback_version1 = {
716 .vs_proc = nfs4_callback_procedures1, 783 .vs_proc = nfs4_callback_procedures1,
717 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 784 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
718 .vs_dispatch = NULL, 785 .vs_dispatch = NULL,
786 .vs_hidden = 1,
719}; 787};
720 788
789struct svc_version nfs4_callback_version4 = {
790 .vs_vers = 4,
791 .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
792 .vs_proc = nfs4_callback_procedures1,
793 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
794 .vs_dispatch = NULL,
795};
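
The new nfs4_callback_version4 reuses the version-1 procedure table under a different wire version number, and version 1 additionally gains vs_hidden (which, as I understand it, keeps the service from being advertised via rpcbind/portmap). A pared-down stand-in for the sunrpc struct shows the shape of the registration:

    #include <stdio.h>
    #include <stddef.h>

    struct svc_version {
        unsigned vs_vers;     /* RPC version number on the wire */
        unsigned vs_nproc;    /* size of the procedure table */
        void    *vs_proc;     /* procedure table, shared between versions here */
        int      vs_hidden;   /* don't advertise via rpcbind/portmap */
    };

    static int procedures[3]; /* stand-in for nfs4_callback_procedures1[] */

    static struct svc_version callback_v1 = {
        .vs_vers = 1, .vs_nproc = 3, .vs_proc = procedures, .vs_hidden = 1,
    };
    static struct svc_version callback_v4 = {
        .vs_vers = 4, .vs_nproc = 3, .vs_proc = procedures,
    };

    int main(void)
    {
        /* Both versions dispatch into the same table; only the wire
         * version number differs, mirroring the diff above. */
        struct svc_version *vers[] = { &callback_v1, &callback_v4 };
        for (size_t i = 0; i < 2; i++)
            printf("version %u, hidden=%d\n", vers[i]->vs_vers, vers[i]->vs_hidden);
        return 0;
    }
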
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 99ea196f071f..2a3d352c0bff 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -35,6 +35,7 @@
35#include <linux/vfs.h> 35#include <linux/vfs.h>
36#include <linux/inet.h> 36#include <linux/inet.h>
37#include <linux/in6.h> 37#include <linux/in6.h>
38#include <linux/slab.h>
38#include <net/ipv6.h> 39#include <net/ipv6.h>
39#include <linux/nfs_xdr.h> 40#include <linux/nfs_xdr.h>
40#include <linux/sunrpc/bc_xprt.h> 41#include <linux/sunrpc/bc_xprt.h>
@@ -164,30 +165,7 @@ error_0:
164 return ERR_PTR(err); 165 return ERR_PTR(err);
165} 166}
166 167
167static void nfs4_shutdown_client(struct nfs_client *clp)
168{
169#ifdef CONFIG_NFS_V4
170 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
171 nfs4_kill_renewd(clp);
172 BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners));
173 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
174 nfs_idmap_delete(clp);
175
176 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
177#endif
178}
179
180/*
181 * Destroy the NFS4 callback service
182 */
183static void nfs4_destroy_callback(struct nfs_client *clp)
184{
185#ifdef CONFIG_NFS_V4 168#ifdef CONFIG_NFS_V4
186 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
187 nfs_callback_down(clp->cl_minorversion);
188#endif /* CONFIG_NFS_V4 */
189}
190
191/* 169/*
192 * Clears/puts all minor version specific parts from an nfs_client struct 170 * Clears/puts all minor version specific parts from an nfs_client struct
193 * reverting it to minorversion 0. 171 * reverting it to minorversion 0.
@@ -202,9 +180,33 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
202 180
203 clp->cl_call_sync = _nfs4_call_sync; 181 clp->cl_call_sync = _nfs4_call_sync;
204#endif /* CONFIG_NFS_V4_1 */ 182#endif /* CONFIG_NFS_V4_1 */
183}
184
185/*
186 * Destroy the NFS4 callback service
187 */
188static void nfs4_destroy_callback(struct nfs_client *clp)
189{
190 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
191 nfs_callback_down(clp->cl_minorversion);
192}
205 193
194static void nfs4_shutdown_client(struct nfs_client *clp)
195{
196 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
197 nfs4_kill_renewd(clp);
198 nfs4_clear_client_minor_version(clp);
206 nfs4_destroy_callback(clp); 199 nfs4_destroy_callback(clp);
200 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
201 nfs_idmap_delete(clp);
202
203 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
204}
205#else
206static void nfs4_shutdown_client(struct nfs_client *clp)
207{
207} 208}
209#endif /* CONFIG_NFS_V4 */
208 210
209/* 211/*
210 * Destroy a shared client record 212 * Destroy a shared client record
@@ -213,7 +215,6 @@ static void nfs_free_client(struct nfs_client *clp)
213{ 215{
214 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); 216 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
215 217
216 nfs4_clear_client_minor_version(clp);
217 nfs4_shutdown_client(clp); 218 nfs4_shutdown_client(clp);
218 219
219 nfs_fscache_release_client_cookie(clp); 220 nfs_fscache_release_client_cookie(clp);
@@ -1260,10 +1261,20 @@ error:
1260static void nfs4_session_set_rwsize(struct nfs_server *server) 1261static void nfs4_session_set_rwsize(struct nfs_server *server)
1261{ 1262{
1262#ifdef CONFIG_NFS_V4_1 1263#ifdef CONFIG_NFS_V4_1
1264 struct nfs4_session *sess;
1265 u32 server_resp_sz;
1266 u32 server_rqst_sz;
1267
1263 if (!nfs4_has_session(server->nfs_client)) 1268 if (!nfs4_has_session(server->nfs_client))
1264 return; 1269 return;
1265 server->rsize = server->nfs_client->cl_session->fc_attrs.max_resp_sz; 1270 sess = server->nfs_client->cl_session;
1266 server->wsize = server->nfs_client->cl_session->fc_attrs.max_rqst_sz; 1271 server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead;
1272 server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead;
1273
1274 if (server->rsize > server_resp_sz)
1275 server->rsize = server_resp_sz;
1276 if (server->wsize > server_rqst_sz)
1277 server->wsize = server_rqst_sz;
1267#endif /* CONFIG_NFS_V4_1 */ 1278#endif /* CONFIG_NFS_V4_1 */
1268} 1279}
1269 1280
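
The old code overwrote rsize/wsize with the raw session maxima; the replacement subtracts per-op XDR overhead (the nfs41_maxread_overhead/nfs41_maxwrite_overhead constants exported later in this series) and then only clamps the mount's values downward, never inflating them. A sketch of that logic with invented overhead values:

    #include <stdio.h>
    #include <stdint.h>

    #define MAXREAD_OVERHEAD  512u  /* stand-in for nfs41_maxread_overhead */
    #define MAXWRITE_OVERHEAD 768u  /* stand-in for nfs41_maxwrite_overhead */

    static void set_rwsize(uint32_t *rsize, uint32_t *wsize,
                           uint32_t max_resp_sz, uint32_t max_rqst_sz)
    {
        uint32_t resp = max_resp_sz - MAXREAD_OVERHEAD;
        uint32_t rqst = max_rqst_sz - MAXWRITE_OVERHEAD;

        /* Never grow beyond what the mount asked for; only clamp down. */
        if (*rsize > resp)
            *rsize = resp;
        if (*wsize > rqst)
            *wsize = rqst;
    }

    int main(void)
    {
        uint32_t rsize = 1048576, wsize = 32768;
        set_rwsize(&rsize, &wsize, 65536, 65536);
        printf("rsize=%u wsize=%u\n", rsize, wsize); /* rsize clamped, wsize kept */
        return 0;
    }
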
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 6dd48a4405b4..15671245c6ee 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h>
13#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
14#include <linux/spinlock.h> 15#include <linux/spinlock.h>
15 16
@@ -92,7 +93,7 @@ out:
92 return status; 93 return status;
93} 94}
94 95
95static void nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) 96static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid)
96{ 97{
97 struct nfs_inode *nfsi = NFS_I(inode); 98 struct nfs_inode *nfsi = NFS_I(inode);
98 struct nfs_open_context *ctx; 99 struct nfs_open_context *ctx;
@@ -116,10 +117,11 @@ again:
116 err = nfs_delegation_claim_locks(ctx, state); 117 err = nfs_delegation_claim_locks(ctx, state);
117 put_nfs_open_context(ctx); 118 put_nfs_open_context(ctx);
118 if (err != 0) 119 if (err != 0)
119 return; 120 return err;
120 goto again; 121 goto again;
121 } 122 }
122 spin_unlock(&inode->i_lock); 123 spin_unlock(&inode->i_lock);
124 return 0;
123} 125}
124 126
125/* 127/*
@@ -261,30 +263,34 @@ static void nfs_msync_inode(struct inode *inode)
261/* 263/*
262 * Basic procedure for returning a delegation to the server 264 * Basic procedure for returning a delegation to the server
263 */ 265 */
264static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation) 266static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
265{ 267{
266 struct nfs_inode *nfsi = NFS_I(inode); 268 struct nfs_inode *nfsi = NFS_I(inode);
269 int err;
267 270
268 nfs_msync_inode(inode);
269 /* 271 /*
270 * Guard against new delegated open/lock/unlock calls and against 272 * Guard against new delegated open/lock/unlock calls and against
271 * state recovery 273 * state recovery
272 */ 274 */
273 down_write(&nfsi->rwsem); 275 down_write(&nfsi->rwsem);
274 nfs_delegation_claim_opens(inode, &delegation->stateid); 276 err = nfs_delegation_claim_opens(inode, &delegation->stateid);
275 up_write(&nfsi->rwsem); 277 up_write(&nfsi->rwsem);
276 nfs_msync_inode(inode); 278 if (err)
279 goto out;
277 280
278 return nfs_do_return_delegation(inode, delegation, 1); 281 err = nfs_do_return_delegation(inode, delegation, issync);
282out:
283 return err;
279} 284}
280 285
281/* 286/*
282 * Return all delegations that have been marked for return 287 * Return all delegations that have been marked for return
283 */ 288 */
284void nfs_client_return_marked_delegations(struct nfs_client *clp) 289int nfs_client_return_marked_delegations(struct nfs_client *clp)
285{ 290{
286 struct nfs_delegation *delegation; 291 struct nfs_delegation *delegation;
287 struct inode *inode; 292 struct inode *inode;
293 int err = 0;
288 294
289restart: 295restart:
290 rcu_read_lock(); 296 rcu_read_lock();
@@ -298,12 +304,18 @@ restart:
298 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 304 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
299 spin_unlock(&clp->cl_lock); 305 spin_unlock(&clp->cl_lock);
300 rcu_read_unlock(); 306 rcu_read_unlock();
301 if (delegation != NULL) 307 if (delegation != NULL) {
302 __nfs_inode_return_delegation(inode, delegation); 308 filemap_flush(inode->i_mapping);
309 err = __nfs_inode_return_delegation(inode, delegation, 0);
310 }
303 iput(inode); 311 iput(inode);
304 goto restart; 312 if (!err)
313 goto restart;
314 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
315 return err;
305 } 316 }
306 rcu_read_unlock(); 317 rcu_read_unlock();
318 return 0;
307} 319}
308 320
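
nfs_client_return_marked_delegations() now propagates failure: each successful return restarts the scan (the list may have changed while locks were dropped), while a failure sets NFS4CLNT_DELEGRETURN so the state manager retries later. A simplified sketch of the restart-or-bail loop, with the list and locking machinery mocked away:

    #include <stdio.h>

    #define NENTRIES 4

    static int marked[NENTRIES] = { 1, 0, 1, 1 };
    static int fail_at = 2;   /* pretend returning entry 2 hits an error */

    static int return_one(int i) { return i == fail_at ? -1 : 0; }

    static int return_marked(void)
    {
    restart:
        for (int i = 0; i < NENTRIES; i++) {
            if (!marked[i])
                continue;
            marked[i] = 0;                /* "detach" before dropping the lock */
            if (return_one(i) == 0)
                goto restart;             /* list may have changed; rescan */
            printf("error on %d: flag DELEGRETURN for the state manager\n", i);
            return -1;
        }
        return 0;
    }

    int main(void) { return return_marked() ? 1 : 0; }
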
309/* 321/*
@@ -338,8 +350,10 @@ int nfs_inode_return_delegation(struct inode *inode)
338 spin_lock(&clp->cl_lock); 350 spin_lock(&clp->cl_lock);
339 delegation = nfs_detach_delegation_locked(nfsi, NULL); 351 delegation = nfs_detach_delegation_locked(nfsi, NULL);
340 spin_unlock(&clp->cl_lock); 352 spin_unlock(&clp->cl_lock);
341 if (delegation != NULL) 353 if (delegation != NULL) {
342 err = __nfs_inode_return_delegation(inode, delegation); 354 nfs_msync_inode(inode);
355 err = __nfs_inode_return_delegation(inode, delegation, 1);
356 }
343 } 357 }
344 return err; 358 return err;
345} 359}
@@ -368,33 +382,47 @@ void nfs_super_return_all_delegations(struct super_block *sb)
368 spin_unlock(&delegation->lock); 382 spin_unlock(&delegation->lock);
369 } 383 }
370 rcu_read_unlock(); 384 rcu_read_unlock();
371 nfs_client_return_marked_delegations(clp); 385 if (nfs_client_return_marked_delegations(clp) != 0)
386 nfs4_schedule_state_manager(clp);
372} 387}
373 388
374static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) 389static
390void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags)
375{ 391{
376 struct nfs_delegation *delegation; 392 struct nfs_delegation *delegation;
377 393
378 rcu_read_lock(); 394 rcu_read_lock();
379 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 395 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
380 set_bit(NFS_DELEGATION_RETURN, &delegation->flags); 396 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
381 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); 397 continue;
398 if (delegation->type & flags)
399 nfs_mark_return_delegation(clp, delegation);
382 } 400 }
383 rcu_read_unlock(); 401 rcu_read_unlock();
384} 402}
385 403
404static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
405{
406 nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
407}
408
386static void nfs_delegation_run_state_manager(struct nfs_client *clp) 409static void nfs_delegation_run_state_manager(struct nfs_client *clp)
387{ 410{
388 if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) 411 if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
389 nfs4_schedule_state_manager(clp); 412 nfs4_schedule_state_manager(clp);
390} 413}
391 414
392void nfs_expire_all_delegations(struct nfs_client *clp) 415void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
393{ 416{
394 nfs_client_mark_return_all_delegations(clp); 417 nfs_client_mark_return_all_delegation_types(clp, flags);
395 nfs_delegation_run_state_manager(clp); 418 nfs_delegation_run_state_manager(clp);
396} 419}
397 420
421void nfs_expire_all_delegations(struct nfs_client *clp)
422{
423 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
424}
425
398/* 426/*
399 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 427 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error.
400 */ 428 */
@@ -413,8 +441,7 @@ static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *c
413 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 441 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
414 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) 442 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
415 continue; 443 continue;
416 set_bit(NFS_DELEGATION_RETURN, &delegation->flags); 444 nfs_mark_return_delegation(clp, delegation);
417 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
418 } 445 }
419 rcu_read_unlock(); 446 rcu_read_unlock();
420} 447}
@@ -428,18 +455,21 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
428/* 455/*
429 * Asynchronous delegation recall! 456 * Asynchronous delegation recall!
430 */ 457 */
431int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) 458int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
459 int (*validate_stateid)(struct nfs_delegation *delegation,
460 const nfs4_stateid *stateid))
432{ 461{
433 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 462 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
434 struct nfs_delegation *delegation; 463 struct nfs_delegation *delegation;
435 464
436 rcu_read_lock(); 465 rcu_read_lock();
437 delegation = rcu_dereference(NFS_I(inode)->delegation); 466 delegation = rcu_dereference(NFS_I(inode)->delegation);
438 if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, 467
439 sizeof(delegation->stateid.data)) != 0) { 468 if (!validate_stateid(delegation, stateid)) {
440 rcu_read_unlock(); 469 rcu_read_unlock();
441 return -ENOENT; 470 return -ENOENT;
442 } 471 }
472
443 nfs_mark_return_delegation(clp, delegation); 473 nfs_mark_return_delegation(clp, delegation);
444 rcu_read_unlock(); 474 rcu_read_unlock();
445 nfs_delegation_run_state_manager(clp); 475 nfs_delegation_run_state_manager(clp);
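
The open-coded memcmp of stateids moves behind a caller-supplied validate_stateid() hook, presumably so minorversion 0 and 1 can apply different rules. A sketch of the shape, with mocked types and a v0-style bitwise-equality validator:

    #include <stdio.h>
    #include <string.h>

    struct stateid { unsigned char data[16]; };
    struct delegation { struct stateid stateid; };

    /* What a minorversion-0 validator might look like: exact match. */
    static int validate_v0(const struct delegation *d, const struct stateid *s)
    {
        return d && memcmp(d->stateid.data, s->data, sizeof(s->data)) == 0;
    }

    static int async_return(const struct delegation *d, const struct stateid *s,
                            int (*validate)(const struct delegation *,
                                            const struct stateid *))
    {
        if (!validate(d, s))
            return -2;  /* -ENOENT in the kernel */
        printf("delegation marked for return\n");
        return 0;
    }

    int main(void)
    {
        struct delegation d = { .stateid = { .data = "abc" } };
        struct stateid ok = d.stateid, bad = { .data = "xyz" };
        printf("%d %d\n", async_return(&d, &ok, validate_v0),
                          async_return(&d, &bad, validate_v0));
        return 0;
    }
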
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 09f383795174..69e7b8140122 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,15 +34,18 @@ enum {
34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
36int nfs_inode_return_delegation(struct inode *inode); 36int nfs_inode_return_delegation(struct inode *inode);
37int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); 37int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
38 int (*validate_stateid)(struct nfs_delegation *delegation,
39 const nfs4_stateid *stateid));
38void nfs_inode_return_delegation_noreclaim(struct inode *inode); 40void nfs_inode_return_delegation_noreclaim(struct inode *inode);
39 41
40struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); 42struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
41void nfs_super_return_all_delegations(struct super_block *sb); 43void nfs_super_return_all_delegations(struct super_block *sb);
42void nfs_expire_all_delegations(struct nfs_client *clp); 44void nfs_expire_all_delegations(struct nfs_client *clp);
45void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
43void nfs_expire_unreferenced_delegations(struct nfs_client *clp); 46void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
44void nfs_handle_cb_pathdown(struct nfs_client *clp); 47void nfs_handle_cb_pathdown(struct nfs_client *clp);
45void nfs_client_return_marked_delegations(struct nfs_client *clp); 48int nfs_client_return_marked_delegations(struct nfs_client *clp);
46 49
47void nfs_delegation_mark_reclaim(struct nfs_client *clp); 50void nfs_delegation_mark_reclaim(struct nfs_client *clp);
48void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 51void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
@@ -68,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
68} 71}
69#endif 72#endif
70 73
74static inline int nfs_have_delegated_attributes(struct inode *inode)
75{
76 return nfs_have_delegation(inode, FMODE_READ) &&
77 !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
78}
79
71#endif 80#endif
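
The new inline says cached attributes are authoritative only while a read delegation is held and no forced revalidation is pending; the dir.c and inode.c hunks below switch their checks to it. A small sketch of the predicate with mocked flags and an invented bit value:

    #include <stdio.h>

    #define FMODE_READ           0x1
    #define INO_REVAL_FORCED     0x8  /* illustrative bit, not the kernel's */

    struct mock_inode { int deleg_type; unsigned cache_validity; };

    static int have_delegation(struct mock_inode *i, int mode)
    {
        return (i->deleg_type & mode) == mode;
    }

    static int have_delegated_attributes(struct mock_inode *i)
    {
        return have_delegation(i, FMODE_READ) &&
               !(i->cache_validity & INO_REVAL_FORCED);
    }

    int main(void)
    {
        struct mock_inode a = { FMODE_READ, 0 };
        struct mock_inode b = { FMODE_READ, INO_REVAL_FORCED };
        /* b must revalidate with the server despite holding a delegation. */
        printf("%d %d\n", have_delegated_attributes(&a),
                          have_delegated_attributes(&b));
        return 0;
    }
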
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7cb298525eef..c6f2750648f4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -560,7 +560,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
560 desc->entry = &my_entry; 560 desc->entry = &my_entry;
561 561
562 nfs_block_sillyrename(dentry); 562 nfs_block_sillyrename(dentry);
563 res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping); 563 res = nfs_revalidate_mapping(inode, filp->f_mapping);
564 if (res < 0) 564 if (res < 0)
565 goto out; 565 goto out;
566 566
@@ -1579,55 +1579,47 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1579 struct dentry *dentry = NULL, *rehash = NULL; 1579 struct dentry *dentry = NULL, *rehash = NULL;
1580 int error = -EBUSY; 1580 int error = -EBUSY;
1581 1581
1582 /*
1583 * To prevent any new references to the target during the rename,
1584 * we unhash the dentry and free the inode in advance.
1585 */
1586 if (!d_unhashed(new_dentry)) {
1587 d_drop(new_dentry);
1588 rehash = new_dentry;
1589 }
1590
1591 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1582 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1592 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1583 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1593 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1584 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1594 atomic_read(&new_dentry->d_count)); 1585 atomic_read(&new_dentry->d_count));
1595 1586
1596 /* 1587 /*
1597 * First check whether the target is busy ... we can't 1588 * For non-directories, check whether the target is busy and if so,
1598 * safely do _any_ rename if the target is in use. 1589 * make a copy of the dentry and then do a silly-rename. If the
1599 * 1590 * silly-rename succeeds, the copied dentry is hashed and becomes
1600 * For files, make a copy of the dentry and then do a 1591 * the new target.
1601 * silly-rename. If the silly-rename succeeds, the
1602 * copied dentry is hashed and becomes the new target.
1603 */ 1592 */
1604 if (!new_inode) 1593 if (new_inode && !S_ISDIR(new_inode->i_mode)) {
1605 goto go_ahead; 1594 /*
1606 if (S_ISDIR(new_inode->i_mode)) { 1595 * To prevent any new references to the target during the
1607 error = -EISDIR; 1596 * rename, we unhash the dentry in advance.
1608 if (!S_ISDIR(old_inode->i_mode)) 1597 */
1609 goto out; 1598 if (!d_unhashed(new_dentry)) {
1610 } else if (atomic_read(&new_dentry->d_count) > 2) { 1599 d_drop(new_dentry);
1611 int err; 1600 rehash = new_dentry;
1612 /* copy the target dentry's name */ 1601 }
1613 dentry = d_alloc(new_dentry->d_parent, 1602
1614 &new_dentry->d_name); 1603 if (atomic_read(&new_dentry->d_count) > 2) {
1615 if (!dentry) 1604 int err;
1616 goto out; 1605
1606 /* copy the target dentry's name */
1607 dentry = d_alloc(new_dentry->d_parent,
1608 &new_dentry->d_name);
1609 if (!dentry)
1610 goto out;
1617 1611
1618 /* silly-rename the existing target ... */ 1612 /* silly-rename the existing target ... */
1619 err = nfs_sillyrename(new_dir, new_dentry); 1613 err = nfs_sillyrename(new_dir, new_dentry);
1620 if (!err) { 1614 if (err)
1621 new_dentry = rehash = dentry; 1615 goto out;
1616
1617 new_dentry = dentry;
1618 rehash = NULL;
1622 new_inode = NULL; 1619 new_inode = NULL;
1623 /* instantiate the replacement target */ 1620 }
1624 d_instantiate(new_dentry, NULL);
1625 } else if (atomic_read(&new_dentry->d_count) > 1)
1626 /* dentry still busy? */
1627 goto out;
1628 } 1621 }
1629 1622
1630go_ahead:
1631 /* 1623 /*
1632 * ... prune child dentries and writebacks if needed. 1624 * ... prune child dentries and writebacks if needed.
1633 */ 1625 */
@@ -1797,7 +1789,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
1797 cache = nfs_access_search_rbtree(inode, cred); 1789 cache = nfs_access_search_rbtree(inode, cred);
1798 if (cache == NULL) 1790 if (cache == NULL)
1799 goto out; 1791 goto out;
1800 if (!nfs_have_delegation(inode, FMODE_READ) && 1792 if (!nfs_have_delegated_attributes(inode) &&
1801 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 1793 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
1802 goto out_stale; 1794 goto out_stale;
1803 res->jiffies = cache->jiffies; 1795 res->jiffies = cache->jiffies;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e1d415e97849..ad4cd31d6050 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -44,6 +44,7 @@
44#include <linux/file.h> 44#include <linux/file.h>
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h>
47 48
48#include <linux/nfs_fs.h> 49#include <linux/nfs_fs.h>
49#include <linux/nfs_page.h> 50#include <linux/nfs_page.h>
@@ -342,6 +343,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
342 data->res.fattr = &data->fattr; 343 data->res.fattr = &data->fattr;
343 data->res.eof = 0; 344 data->res.eof = 0;
344 data->res.count = bytes; 345 data->res.count = bytes;
346 nfs_fattr_init(&data->fattr);
345 msg.rpc_argp = &data->args; 347 msg.rpc_argp = &data->args;
346 msg.rpc_resp = &data->res; 348 msg.rpc_resp = &data->res;
347 349
@@ -575,6 +577,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
575 data->res.count = 0; 577 data->res.count = 0;
576 data->res.fattr = &data->fattr; 578 data->res.fattr = &data->fattr;
577 data->res.verf = &data->verf; 579 data->res.verf = &data->verf;
580 nfs_fattr_init(&data->fattr);
578 581
579 NFS_PROTO(data->inode)->commit_setup(data, &msg); 582 NFS_PROTO(data->inode)->commit_setup(data, &msg);
580 583
@@ -766,6 +769,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
766 data->res.fattr = &data->fattr; 769 data->res.fattr = &data->fattr;
767 data->res.count = bytes; 770 data->res.count = bytes;
768 data->res.verf = &data->verf; 771 data->res.verf = &data->verf;
772 nfs_fattr_init(&data->fattr);
769 773
770 task_setup_data.task = &data->task; 774 task_setup_data.task = &data->task;
771 task_setup_data.callback_data = data; 775 task_setup_data.callback_data = data;
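
All three direct-I/O paths gain an nfs_fattr_init() call before issuing the RPC, which (as I read it) clears the attribute result so the completion handler only applies attributes the reply actually carried, never stale values left over from a reused buffer. A sketch of the invalid-by-default pattern with simplified types:

    #include <stdio.h>

    struct fattr { unsigned valid; long size; };

    static void fattr_init(struct fattr *f) { f->valid = 0; }

    static void apply_reply(struct fattr *f)
    {
        if (f->valid)      /* only trust attributes this reply filled in */
            printf("update inode size to %ld\n", f->size);
        else
            printf("no attributes in reply; leave inode alone\n");
    }

    int main(void)
    {
        struct fattr f = { .valid = 1, .size = 4096 }; /* leftover from reuse */
        fattr_init(&f);    /* the added per-request initialization */
        apply_reply(&f);
        return 0;
    }
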
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index f4d54ba97cc6..76fd235d0024 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -9,6 +9,7 @@
9#include <linux/hash.h> 9#include <linux/hash.h>
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/kmod.h> 11#include <linux/kmod.h>
12#include <linux/slab.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/socket.h> 14#include <linux/socket.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
@@ -36,6 +37,19 @@ struct nfs_dns_ent {
36}; 37};
37 38
38 39
40static void nfs_dns_ent_update(struct cache_head *cnew,
41 struct cache_head *ckey)
42{
43 struct nfs_dns_ent *new;
44 struct nfs_dns_ent *key;
45
46 new = container_of(cnew, struct nfs_dns_ent, h);
47 key = container_of(ckey, struct nfs_dns_ent, h);
48
49 memcpy(&new->addr, &key->addr, key->addrlen);
50 new->addrlen = key->addrlen;
51}
52
39static void nfs_dns_ent_init(struct cache_head *cnew, 53static void nfs_dns_ent_init(struct cache_head *cnew,
40 struct cache_head *ckey) 54 struct cache_head *ckey)
41{ 55{
@@ -49,8 +63,7 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
49 new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); 63 new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
50 if (new->hostname) { 64 if (new->hostname) {
51 new->namelen = key->namelen; 65 new->namelen = key->namelen;
52 memcpy(&new->addr, &key->addr, key->addrlen); 66 nfs_dns_ent_update(cnew, ckey);
53 new->addrlen = key->addrlen;
54 } else { 67 } else {
55 new->namelen = 0; 68 new->namelen = 0;
56 new->addrlen = 0; 69 new->addrlen = 0;
@@ -146,7 +159,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
146 return 0; 159 return 0;
147} 160}
148 161
149struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, 162static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
150 struct nfs_dns_ent *key) 163 struct nfs_dns_ent *key)
151{ 164{
152 struct cache_head *ch; 165 struct cache_head *ch;
@@ -159,7 +172,7 @@ struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
159 return container_of(ch, struct nfs_dns_ent, h); 172 return container_of(ch, struct nfs_dns_ent, h);
160} 173}
161 174
162struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, 175static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
163 struct nfs_dns_ent *new, 176 struct nfs_dns_ent *new,
164 struct nfs_dns_ent *key) 177 struct nfs_dns_ent *key)
165{ 178{
@@ -234,7 +247,7 @@ static struct cache_detail nfs_dns_resolve = {
234 .cache_show = nfs_dns_show, 247 .cache_show = nfs_dns_show,
235 .match = nfs_dns_match, 248 .match = nfs_dns_match,
236 .init = nfs_dns_ent_init, 249 .init = nfs_dns_ent_init,
237 .update = nfs_dns_ent_init, 250 .update = nfs_dns_ent_update,
238 .alloc = nfs_dns_ent_alloc, 251 .alloc = nfs_dns_ent_alloc,
239}; 252};
240 253
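
The DNS cache previously wired the same function into both .init and .update. Splitting out nfs_dns_ent_update matters because update runs on an entry whose hostname is already owned; only the address payload should be refreshed, while init copies the key exactly once. A user-space sketch of the split, with mocked types:

    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>

    struct dns_ent { char *hostname; char addr[16]; };

    static void ent_update(struct dns_ent *dst, const struct dns_ent *src)
    {
        memcpy(dst->addr, src->addr, sizeof(dst->addr)); /* payload only */
    }

    static void ent_init(struct dns_ent *dst, const struct dns_ent *src)
    {
        dst->hostname = strdup(src->hostname); /* key copied exactly once */
        if (dst->hostname)
            ent_update(dst, src);
    }

    int main(void)
    {
        struct dns_ent key = { "fileserver", "10.0.0.1" }, cached = { 0 };
        ent_init(&cached, &key);
        strcpy(key.addr, "10.0.0.2");
        ent_update(&cached, &key); /* refresh the address, keep the name */
        printf("%s -> %s\n", cached.hostname, cached.addr);
        free(cached.hostname);
        return 0;
    }
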
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f5fdd39e037a..8d965bddb87e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -24,9 +24,9 @@
24#include <linux/nfs_fs.h> 24#include <linux/nfs_fs.h>
25#include <linux/nfs_mount.h> 25#include <linux/nfs_mount.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
29#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/gfp.h>
30 30
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32#include <asm/system.h> 32#include <asm/system.h>
@@ -123,11 +123,11 @@ nfs_file_open(struct inode *inode, struct file *filp)
123 filp->f_path.dentry->d_parent->d_name.name, 123 filp->f_path.dentry->d_parent->d_name.name,
124 filp->f_path.dentry->d_name.name); 124 filp->f_path.dentry->d_name.name);
125 125
126 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
126 res = nfs_check_flags(filp->f_flags); 127 res = nfs_check_flags(filp->f_flags);
127 if (res) 128 if (res)
128 return res; 129 return res;
129 130
130 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
131 res = nfs_open(inode, filp); 131 res = nfs_open(inode, filp);
132 return res; 132 return res;
133} 133}
@@ -237,9 +237,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
237 dentry->d_parent->d_name.name, 237 dentry->d_parent->d_name.name,
238 dentry->d_name.name); 238 dentry->d_name.name);
239 239
240 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
240 if ((file->f_mode & FMODE_WRITE) == 0) 241 if ((file->f_mode & FMODE_WRITE) == 0)
241 return 0; 242 return 0;
242 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
243 243
244 /* Flush writes to the server and return any errors */ 244 /* Flush writes to the server and return any errors */
245 return nfs_do_fsync(ctx, inode); 245 return nfs_do_fsync(ctx, inode);
@@ -262,9 +262,11 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
262 (unsigned long) count, (unsigned long) pos); 262 (unsigned long) count, (unsigned long) pos);
263 263
264 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 264 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
265 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count); 265 if (!result) {
266 if (!result)
267 result = generic_file_aio_read(iocb, iov, nr_segs, pos); 266 result = generic_file_aio_read(iocb, iov, nr_segs, pos);
267 if (result > 0)
268 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
269 }
268 return result; 270 return result;
269} 271}
270 272
@@ -282,8 +284,11 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
282 (unsigned long) count, (unsigned long long) *ppos); 284 (unsigned long) count, (unsigned long long) *ppos);
283 285
284 res = nfs_revalidate_mapping(inode, filp->f_mapping); 286 res = nfs_revalidate_mapping(inode, filp->f_mapping);
285 if (!res) 287 if (!res) {
286 res = generic_file_splice_read(filp, ppos, pipe, count, flags); 288 res = generic_file_splice_read(filp, ppos, pipe, count, flags);
289 if (res > 0)
290 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
291 }
287 return res; 292 return res;
288} 293}
289 294
@@ -486,6 +491,9 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
486{ 491{
487 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
488 493
494 /* Only do I/O if gfp is a superset of GFP_KERNEL */
495 if ((gfp & GFP_KERNEL) == GFP_KERNEL)
496 nfs_wb_page(page->mapping->host, page);
489 /* If PagePrivate() is set, then the page is not freeable */ 497 /* If PagePrivate() is set, then the page is not freeable */
490 if (PagePrivate(page)) 498 if (PagePrivate(page))
491 return 0; 499 return 0;
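
The comment in the hunk above is precise: releasepage can be called from contexts that must not sleep, so writeback is attempted only when the caller's allocation mask contains every bit of GFP_KERNEL, a superset test rather than a mere overlap test. A tiny demo with illustrative bit values:

    #include <stdio.h>

    #define GFP_WAIT   0x1
    #define GFP_IO     0x2
    #define GFP_FS     0x4
    #define GFP_KERNEL (GFP_WAIT | GFP_IO | GFP_FS)

    static int may_do_io(unsigned gfp)
    {
        return (gfp & GFP_KERNEL) == GFP_KERNEL; /* superset, not overlap */
    }

    int main(void)
    {
        printf("%d\n", may_do_io(GFP_KERNEL));        /* 1: full mask      */
        printf("%d\n", may_do_io(GFP_WAIT | GFP_IO)); /* 0: FS bit missing */
        printf("%d\n", may_do_io(GFP_KERNEL | 0x10)); /* 1: extra bits ok  */
        return 0;
    }
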
@@ -581,7 +589,7 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
581{ 589{
582 struct nfs_open_context *ctx; 590 struct nfs_open_context *ctx;
583 591
584 if (IS_SYNC(inode) || (filp->f_flags & O_SYNC)) 592 if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
585 return 1; 593 return 1;
586 ctx = nfs_file_open_context(filp); 594 ctx = nfs_file_open_context(filp);
587 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) 595 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
@@ -594,6 +602,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
594{ 602{
595 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 603 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
596 struct inode * inode = dentry->d_inode; 604 struct inode * inode = dentry->d_inode;
605 unsigned long written = 0;
597 ssize_t result; 606 ssize_t result;
598 size_t count = iov_length(iov, nr_segs); 607 size_t count = iov_length(iov, nr_segs);
599 608
@@ -620,14 +629,18 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
620 if (!count) 629 if (!count)
621 goto out; 630 goto out;
622 631
623 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
624 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 632 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
625 /* Return error values for O_SYNC and IS_SYNC() */ 633 if (result > 0)
634 written = result;
635
636 /* Return error values for O_DSYNC and IS_SYNC() */
626 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { 637 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
627 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); 638 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
628 if (err < 0) 639 if (err < 0)
629 result = err; 640 result = err;
630 } 641 }
642 if (result > 0)
643 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
631out: 644out:
632 return result; 645 return result;
633 646
@@ -642,6 +655,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
642{ 655{
643 struct dentry *dentry = filp->f_path.dentry; 656 struct dentry *dentry = filp->f_path.dentry;
644 struct inode *inode = dentry->d_inode; 657 struct inode *inode = dentry->d_inode;
658 unsigned long written = 0;
645 ssize_t ret; 659 ssize_t ret;
646 660
647 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", 661 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
@@ -652,14 +666,17 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
652 * The combination of splice and an O_APPEND destination is disallowed. 666 * The combination of splice and an O_APPEND destination is disallowed.
653 */ 667 */
654 668
655 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
656
657 ret = generic_file_splice_write(pipe, filp, ppos, count, flags); 669 ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
670 if (ret > 0)
671 written = ret;
672
658 if (ret >= 0 && nfs_need_sync_write(filp, inode)) { 673 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
659 int err = nfs_do_fsync(nfs_file_open_context(filp), inode); 674 int err = nfs_do_fsync(nfs_file_open_context(filp), inode);
660 if (err < 0) 675 if (err < 0)
661 ret = err; 676 ret = err;
662 } 677 }
678 if (ret > 0)
679 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
663 return ret; 680 return ret;
664} 681}
665 682
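
Across the read, splice-read, write, and splice-write paths above, the NFSIOS byte counters move from counting the requested size before the transfer to counting the bytes the generic layer actually moved, and only on success. A sketch of the write-side ordering, with a deliberately short write; names are stand-ins:

    #include <stdio.h>

    static long total_written;

    static long do_write(long want) { return want > 100 ? 100 : want; }
    static int  do_fsync(void)      { return 0; }

    static long file_write(long count)
    {
        long result = do_write(count);
        unsigned long written = result > 0 ? (unsigned long)result : 0;

        if (result >= 0 && do_fsync() < 0)
            result = -1;              /* a failed sync fails the whole write */
        if (result > 0)
            total_written += written; /* account only what actually went out */
        return result;
    }

    int main(void)
    {
        file_write(250);
        printf("counted %ld of 250 requested\n", total_written); /* 100 */
        return 0;
    }
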
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index fa588006588d..a6b16ed93229 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -17,6 +17,7 @@
17#include <linux/nfs_fs_sb.h> 17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h> 18#include <linux/in6.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/slab.h>
20 21
21#include "internal.h" 22#include "internal.h"
22#include "iostat.h" 23#include "iostat.h"
@@ -354,12 +355,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
354 */ 355 */
355int nfs_fscache_release_page(struct page *page, gfp_t gfp) 356int nfs_fscache_release_page(struct page *page, gfp_t gfp)
356{ 357{
357 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
358 struct fscache_cookie *cookie = nfsi->fscache;
359
360 BUG_ON(!cookie);
361
362 if (PageFsCache(page)) { 358 if (PageFsCache(page)) {
359 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
360 struct fscache_cookie *cookie = nfsi->fscache;
361
362 BUG_ON(!cookie);
363 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", 363 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
364 cookie, page, nfsi); 364 cookie, page, nfsi);
365 365
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa091865ad0..737128f777f3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h>
39 40
40#include <asm/system.h> 41#include <asm/system.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -97,22 +98,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
97 return ino; 98 return ino;
98} 99}
99 100
100int nfs_write_inode(struct inode *inode, int sync)
101{
102 int ret;
103
104 if (sync) {
105 ret = filemap_fdatawait(inode->i_mapping);
106 if (ret == 0)
107 ret = nfs_commit_inode(inode, FLUSH_SYNC);
108 } else
109 ret = nfs_commit_inode(inode, 0);
110 if (ret >= 0)
111 return 0;
112 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
113 return ret;
114}
115
116void nfs_clear_inode(struct inode *inode) 101void nfs_clear_inode(struct inode *inode)
117{ 102{
118 /* 103 /*
@@ -130,16 +115,12 @@ void nfs_clear_inode(struct inode *inode)
130 */ 115 */
131int nfs_sync_mapping(struct address_space *mapping) 116int nfs_sync_mapping(struct address_space *mapping)
132{ 117{
133 int ret; 118 int ret = 0;
134 119
135 if (mapping->nrpages == 0) 120 if (mapping->nrpages != 0) {
136 return 0; 121 unmap_mapping_range(mapping, 0, 0, 0);
137 unmap_mapping_range(mapping, 0, 0, 0); 122 ret = nfs_wb_all(mapping->host);
138 ret = filemap_write_and_wait(mapping); 123 }
139 if (ret != 0)
140 goto out;
141 ret = nfs_wb_all(mapping->host);
142out:
143 return ret; 124 return ret;
144} 125}
145 126
@@ -511,17 +492,11 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
511 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; 492 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
512 int err; 493 int err;
513 494
514 /* 495 /* Flush out writes to the server in order to update c/mtime. */
515 * Flush out writes to the server in order to update c/mtime.
516 *
517 * Hold the i_mutex to suspend application writes temporarily;
518 * this prevents long-running writing applications from blocking
519 * nfs_wb_nocommit.
520 */
521 if (S_ISREG(inode->i_mode)) { 496 if (S_ISREG(inode->i_mode)) {
522 mutex_lock(&inode->i_mutex); 497 err = filemap_write_and_wait(inode->i_mapping);
523 nfs_wb_nocommit(inode); 498 if (err)
524 mutex_unlock(&inode->i_mutex); 499 goto out;
525 } 500 }
526 501
527 /* 502 /*
@@ -545,6 +520,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
545 generic_fillattr(inode, stat); 520 generic_fillattr(inode, stat);
546 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); 521 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
547 } 522 }
523out:
548 return err; 524 return err;
549} 525}
550 526
@@ -574,14 +550,14 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
574 nfs_revalidate_inode(server, inode); 550 nfs_revalidate_inode(server, inode);
575} 551}
576 552
577static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) 553static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred)
578{ 554{
579 struct nfs_open_context *ctx; 555 struct nfs_open_context *ctx;
580 556
581 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 557 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
582 if (ctx != NULL) { 558 if (ctx != NULL) {
583 ctx->path.dentry = dget(dentry); 559 ctx->path = *path;
584 ctx->path.mnt = mntget(mnt); 560 path_get(&ctx->path);
585 ctx->cred = get_rpccred(cred); 561 ctx->cred = get_rpccred(cred);
586 ctx->state = NULL; 562 ctx->state = NULL;
587 ctx->lockowner = current->files; 563 ctx->lockowner = current->files;
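
alloc_nfs_open_context() now takes a struct path and pins both the dentry and the vfsmount with a single path_get(), replacing the separate dget()/mntget() pair. A mock refcounting sketch of the idea:

    #include <stdio.h>

    struct dentry   { int count; };
    struct vfsmount { int count; };
    struct path     { struct vfsmount *mnt; struct dentry *dentry; };

    static void path_get(struct path *p) { p->mnt->count++; p->dentry->count++; }
    static void path_put(struct path *p) { p->mnt->count--; p->dentry->count--; }

    struct open_ctx { struct path path; };

    static void ctx_init(struct open_ctx *ctx, struct path *p)
    {
        ctx->path = *p;       /* copy the pair ...                   */
        path_get(&ctx->path); /* ... and pin both objects at once    */
    }

    int main(void)
    {
        struct dentry d = { 1 };
        struct vfsmount m = { 1 };
        struct path p = { &m, &d };
        struct open_ctx ctx;

        ctx_init(&ctx, &p);
        printf("dentry=%d mnt=%d\n", d.count, m.count); /* both 2 */
        path_put(&ctx.path);
        return 0;
    }
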
@@ -620,11 +596,6 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
620 __put_nfs_open_context(ctx, 0); 596 __put_nfs_open_context(ctx, 0);
621} 597}
622 598
623static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
624{
625 __put_nfs_open_context(ctx, 1);
626}
627
628/* 599/*
629 * Ensure that mmap has a recent RPC credential for use when writing out 600 * Ensure that mmap has a recent RPC credential for use when writing out
630 * shared pages 601 * shared pages
@@ -671,7 +642,7 @@ static void nfs_file_clear_open_context(struct file *filp)
671 spin_lock(&inode->i_lock); 642 spin_lock(&inode->i_lock);
672 list_move_tail(&ctx->list, &NFS_I(inode)->open_files); 643 list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
673 spin_unlock(&inode->i_lock); 644 spin_unlock(&inode->i_lock);
674 put_nfs_open_context_sync(ctx); 645 __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
675 } 646 }
676} 647}
677 648
@@ -686,7 +657,7 @@ int nfs_open(struct inode *inode, struct file *filp)
686 cred = rpc_lookup_cred(); 657 cred = rpc_lookup_cred();
687 if (IS_ERR(cred)) 658 if (IS_ERR(cred))
688 return PTR_ERR(cred); 659 return PTR_ERR(cred);
689 ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred); 660 ctx = alloc_nfs_open_context(&filp->f_path, cred);
690 put_rpccred(cred); 661 put_rpccred(cred);
691 if (ctx == NULL) 662 if (ctx == NULL)
692 return -ENOMEM; 663 return -ENOMEM;
@@ -759,7 +730,7 @@ int nfs_attribute_timeout(struct inode *inode)
759{ 730{
760 struct nfs_inode *nfsi = NFS_I(inode); 731 struct nfs_inode *nfsi = NFS_I(inode);
761 732
762 if (nfs_have_delegation(inode, FMODE_READ)) 733 if (nfs_have_delegated_attributes(inode))
763 return 0; 734 return 0;
764 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 735 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
765} 736}
@@ -779,7 +750,7 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
779 return __nfs_revalidate_inode(server, inode); 750 return __nfs_revalidate_inode(server, inode);
780} 751}
781 752
782static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) 753static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
783{ 754{
784 struct nfs_inode *nfsi = NFS_I(inode); 755 struct nfs_inode *nfsi = NFS_I(inode);
785 756
@@ -800,49 +771,10 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
800 return 0; 771 return 0;
801} 772}
802 773
803static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
804{
805 int ret = 0;
806
807 mutex_lock(&inode->i_mutex);
808 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) {
809 ret = nfs_sync_mapping(mapping);
810 if (ret == 0)
811 ret = nfs_invalidate_mapping_nolock(inode, mapping);
812 }
813 mutex_unlock(&inode->i_mutex);
814 return ret;
815}
816
817/**
818 * nfs_revalidate_mapping_nolock - Revalidate the pagecache
819 * @inode - pointer to host inode
820 * @mapping - pointer to mapping
821 */
822int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping)
823{
824 struct nfs_inode *nfsi = NFS_I(inode);
825 int ret = 0;
826
827 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
828 || nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
829 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
830 if (ret < 0)
831 goto out;
832 }
833 if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
834 ret = nfs_invalidate_mapping_nolock(inode, mapping);
835out:
836 return ret;
837}
838
839/** 774/**
840 * nfs_revalidate_mapping - Revalidate the pagecache 775 * nfs_revalidate_mapping - Revalidate the pagecache
841 * @inode - pointer to host inode 776 * @inode - pointer to host inode
842 * @mapping - pointer to mapping 777 * @mapping - pointer to mapping
843 *
844 * This version of the function will take the inode->i_mutex and attempt to
845 * flush out all dirty data if it needs to invalidate the page cache.
846 */ 778 */
847int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) 779int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
848{ 780{
@@ -1261,8 +1193,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1261 1193
1262 if (fattr->valid & NFS_ATTR_FATTR_MODE) { 1194 if (fattr->valid & NFS_ATTR_FATTR_MODE) {
1263 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { 1195 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
1196 umode_t newmode = inode->i_mode & S_IFMT;
1197 newmode |= fattr->mode & S_IALLUGO;
1198 inode->i_mode = newmode;
1264 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1199 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1265 inode->i_mode = fattr->mode;
1266 } 1200 }
1267 } else if (server->caps & NFS_CAP_MODE) 1201 } else if (server->caps & NFS_CAP_MODE)
1268 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR 1202 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
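
The mode-update hunk above stops copying the server's fattr->mode wholesale into i_mode: the local file-type bits (S_IFMT) are preserved and only the permission bits (S_IALLUGO) are taken from the reply. A compact demonstration of the masked merge:

    #include <stdio.h>

    #define S_IFMT    0170000
    #define S_IFREG   0100000
    #define S_IALLUGO 0007777

    static unsigned merge_mode(unsigned i_mode, unsigned server_mode)
    {
        unsigned newmode = i_mode & S_IFMT;   /* trust the local type ... */
        newmode |= server_mode & S_IALLUGO;   /* ... and the remote perms */
        return newmode;
    }

    int main(void)
    {
        /* A reply with garbage type bits can no longer corrupt i_mode. */
        printf("%o\n", merge_mode(S_IFREG | 0644, 0020755)); /* 100755 */
        return 0;
    }
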
@@ -1418,6 +1352,7 @@ static void init_once(void *foo)
1418 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1352 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1419 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1353 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1420 nfsi->npages = 0; 1354 nfsi->npages = 0;
1355 nfsi->ncommit = 0;
1421 atomic_set(&nfsi->silly_count, 1); 1356 atomic_set(&nfsi->silly_count, 1);
1422 INIT_HLIST_HEAD(&nfsi->silly_list); 1357 INIT_HLIST_HEAD(&nfsi->silly_list);
1423 init_waitqueue_head(&nfsi->waitqueue); 1358 init_waitqueue_head(&nfsi->waitqueue);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e21b1bb9972f..11f82f03c5de 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -30,6 +30,15 @@ static inline int nfs4_has_session(const struct nfs_client *clp)
30 return 0; 30 return 0;
31} 31}
32 32
33static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
34{
35#ifdef CONFIG_NFS_V4_1
36 if (nfs4_has_session(clp))
37 return (clp->cl_session->flags & SESSION4_PERSIST);
38#endif /* CONFIG_NFS_V4_1 */
39 return 0;
40}
41
33struct nfs_clone_mount { 42struct nfs_clone_mount {
34 const struct super_block *sb; 43 const struct super_block *sb;
35 const struct dentry *dentry; 44 const struct dentry *dentry;
@@ -156,6 +165,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
156 165
157/* callback_xdr.c */ 166/* callback_xdr.c */
158extern struct svc_version nfs4_callback_version1; 167extern struct svc_version nfs4_callback_version1;
168extern struct svc_version nfs4_callback_version4;
159 169
160/* pagelist.c */ 170/* pagelist.c */
161extern int __init nfs_init_nfspagecache(void); 171extern int __init nfs_init_nfspagecache(void);
@@ -177,24 +187,14 @@ extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
177extern struct rpc_procinfo nfs3_procedures[]; 187extern struct rpc_procinfo nfs3_procedures[];
178extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); 188extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
179 189
180/* nfs4proc.c */
181static inline void nfs4_restart_rpc(struct rpc_task *task,
182 const struct nfs_client *clp)
183{
184#ifdef CONFIG_NFS_V4_1
185 if (nfs4_has_session(clp) &&
186 test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) {
187 rpc_restart_call_prepare(task);
188 return;
189 }
190#endif /* CONFIG_NFS_V4_1 */
191 rpc_restart_call(task);
192}
193
194/* nfs4xdr.c */ 190/* nfs4xdr.c */
195#ifdef CONFIG_NFS_V4 191#ifdef CONFIG_NFS_V4
196extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 192extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
197#endif 193#endif
194#ifdef CONFIG_NFS_V4_1
195extern const u32 nfs41_maxread_overhead;
196extern const u32 nfs41_maxwrite_overhead;
197#endif
198 198
199/* nfs4proc.c */ 199/* nfs4proc.c */
200#ifdef CONFIG_NFS_V4 200#ifdef CONFIG_NFS_V4
@@ -211,7 +211,7 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
211extern struct workqueue_struct *nfsiod_workqueue; 211extern struct workqueue_struct *nfsiod_workqueue;
212extern struct inode *nfs_alloc_inode(struct super_block *sb); 212extern struct inode *nfs_alloc_inode(struct super_block *sb);
213extern void nfs_destroy_inode(struct inode *); 213extern void nfs_destroy_inode(struct inode *);
214extern int nfs_write_inode(struct inode *,int); 214extern int nfs_write_inode(struct inode *, struct writeback_control *);
215extern void nfs_clear_inode(struct inode *); 215extern void nfs_clear_inode(struct inode *);
216#ifdef CONFIG_NFS_V4 216#ifdef CONFIG_NFS_V4
217extern void nfs4_clear_inode(struct inode *); 217extern void nfs4_clear_inode(struct inode *);
@@ -273,20 +273,6 @@ extern int _nfs4_call_sync_session(struct nfs_server *server,
273 struct nfs4_sequence_res *res, 273 struct nfs4_sequence_res *res,
274 int cache_reply); 274 int cache_reply);
275 275
276#ifdef CONFIG_NFS_V4_1
277extern void nfs41_sequence_free_slot(const struct nfs_client *,
278 struct nfs4_sequence_res *res);
279#endif /* CONFIG_NFS_V4_1 */
280
281static inline void nfs4_sequence_free_slot(const struct nfs_client *clp,
282 struct nfs4_sequence_res *res)
283{
284#ifdef CONFIG_NFS_V4_1
285 if (nfs4_has_session(clp))
286 nfs41_sequence_free_slot(clp, res);
287#endif /* CONFIG_NFS_V4_1 */
288}
289
290/* 276/*
291 * Determine the device name as a string 277 * Determine the device name as a string
292 */ 278 */
@@ -380,3 +366,15 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
380 return ((unsigned long)len + (unsigned long)base + 366 return ((unsigned long)len + (unsigned long)base +
381 PAGE_SIZE - 1) >> PAGE_SHIFT; 367 PAGE_SIZE - 1) >> PAGE_SHIFT;
382} 368}
369
370/*
371 * Helper for restarting RPC calls in the possible presence of NFSv4.1
372 * sessions.
373 */
374static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
375{
376 if (nfs4_has_session(clp))
377 rpc_restart_call_prepare(task);
378 else
379 rpc_restart_call(task);
380}
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index ceda50aad73c..1d8d5c813b01 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -25,13 +25,7 @@ struct nfs_iostats {
25static inline void nfs_inc_server_stats(const struct nfs_server *server, 25static inline void nfs_inc_server_stats(const struct nfs_server *server,
26 enum nfs_stat_eventcounters stat) 26 enum nfs_stat_eventcounters stat)
27{ 27{
28 struct nfs_iostats *iostats; 28 this_cpu_inc(server->io_stats->events[stat]);
29 int cpu;
30
31 cpu = get_cpu();
32 iostats = per_cpu_ptr(server->io_stats, cpu);
33 iostats->events[stat]++;
34 put_cpu();
35} 29}
36 30
37static inline void nfs_inc_stats(const struct inode *inode, 31static inline void nfs_inc_stats(const struct inode *inode,
@@ -44,13 +38,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server,
44 enum nfs_stat_bytecounters stat, 38 enum nfs_stat_bytecounters stat,
45 unsigned long addend) 39 unsigned long addend)
46{ 40{
47 struct nfs_iostats *iostats; 41 this_cpu_add(server->io_stats->bytes[stat], addend);
48 int cpu;
49
50 cpu = get_cpu();
51 iostats = per_cpu_ptr(server->io_stats, cpu);
52 iostats->bytes[stat] += addend;
53 put_cpu();
54} 42}
55 43
56static inline void nfs_add_stats(const struct inode *inode, 44static inline void nfs_add_stats(const struct inode *inode,
@@ -65,22 +53,16 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
65 enum nfs_stat_fscachecounters stat, 53 enum nfs_stat_fscachecounters stat,
66 unsigned long addend) 54 unsigned long addend)
67{ 55{
68 struct nfs_iostats *iostats; 56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
69 int cpu;
70
71 cpu = get_cpu();
72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
73 iostats->fscache[stat] += addend;
74 put_cpu();
75} 57}
76#endif 58#endif
77 59
78static inline struct nfs_iostats *nfs_alloc_iostats(void) 60static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
79{ 61{
80 return alloc_percpu(struct nfs_iostats); 62 return alloc_percpu(struct nfs_iostats);
81} 63}
82 64
83static inline void nfs_free_iostats(struct nfs_iostats *stats) 65static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
84{ 66{
85 if (stats != NULL) 67 if (stats != NULL)
86 free_percpu(stats); 68 free_percpu(stats);
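
The iostat.h hunks collapse the explicit get_cpu()/per_cpu_ptr()/put_cpu() sequence into this_cpu_add()/this_cpu_inc(), which on many architectures compile to a single preemption-safe per-CPU instruction. A loose single-"CPU" user-space analogue of the before/after shapes; the macro below only fakes the semantics:

    #include <stdio.h>

    struct iostats { unsigned long bytes[2]; };
    static struct iostats percpu[4]; /* one slot per "CPU" */

    static int  get_cpu(void) { return 0; } /* would also disable preemption */
    static void put_cpu(void) { }           /* would re-enable preemption */

    /* Old shape: pin the CPU, index the array, bump, unpin. */
    static void add_old(int stat, unsigned long n)
    {
        int cpu = get_cpu();
        percpu[cpu].bytes[stat] += n;
        put_cpu();
    }

    /* New shape: one macro hides all of the above. */
    #define this_cpu_add(field, n) (percpu[0].field += (n))

    int main(void)
    {
        add_old(0, 512);
        this_cpu_add(bytes[0], 512);
        printf("%lu\n", percpu[0].bytes[0]); /* 1024 */
        return 0;
    }
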
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 0adefc40cc89..59047f8d7d72 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -120,7 +120,7 @@ static struct {
120 { .status = MNT3ERR_INVAL, .errno = -EINVAL, }, 120 { .status = MNT3ERR_INVAL, .errno = -EINVAL, },
121 { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, }, 121 { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, },
122 { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, }, 122 { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, },
123 { .status = MNT3ERR_SERVERFAULT, .errno = -ESERVERFAULT, }, 123 { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, },
124}; 124};
125 125
126struct mountres { 126struct mountres {
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 40c766782891..7888cf36022d 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/dcache.h> 10#include <linux/dcache.h>
11#include <linux/gfp.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5e078b222b4e..81cf14257916 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -12,7 +12,6 @@
12#include <linux/param.h> 12#include <linux/param.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/errno.h> 15#include <linux/errno.h>
17#include <linux/string.h> 16#include <linux/string.h>
18#include <linux/in.h> 17#include <linux/in.h>
@@ -699,7 +698,7 @@ static struct {
699 { NFSERR_BAD_COOKIE, -EBADCOOKIE }, 698 { NFSERR_BAD_COOKIE, -EBADCOOKIE },
700 { NFSERR_NOTSUPP, -ENOTSUPP }, 699 { NFSERR_NOTSUPP, -ENOTSUPP },
701 { NFSERR_TOOSMALL, -ETOOSMALL }, 700 { NFSERR_TOOSMALL, -ETOOSMALL },
702 { NFSERR_SERVERFAULT, -ESERVERFAULT }, 701 { NFSERR_SERVERFAULT, -EREMOTEIO },
703 { NFSERR_BADTYPE, -EBADTYPE }, 702 { NFSERR_BADTYPE, -EBADTYPE },
704 { NFSERR_JUKEBOX, -EJUKEBOX }, 703 { NFSERR_JUKEBOX, -EJUKEBOX },
705 { -1, -EIO } 704 { -1, -EIO }
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bac60515a4b3..d150ae0c5ecd 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,4 +1,5 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/gfp.h>
2#include <linux/nfs.h> 3#include <linux/nfs.h>
3#include <linux/nfs3.h> 4#include <linux/nfs3.h>
4#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 3f8881d1a050..e701002694e5 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -10,6 +10,7 @@
10#include <linux/errno.h> 10#include <linux/errno.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/sunrpc/clnt.h> 12#include <linux/sunrpc/clnt.h>
13#include <linux/slab.h>
13#include <linux/nfs.h> 14#include <linux/nfs.h>
14#include <linux/nfs3.h> 15#include <linux/nfs3.h>
15#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
@@ -22,14 +23,14 @@
22 23
23#define NFSDBG_FACILITY NFSDBG_PROC 24#define NFSDBG_FACILITY NFSDBG_PROC
24 25
25/* A wrapper to handle the EJUKEBOX error message */ 26/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */
26static int 27static int
27nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) 28nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
28{ 29{
29 int res; 30 int res;
30 do { 31 do {
31 res = rpc_call_sync(clnt, msg, flags); 32 res = rpc_call_sync(clnt, msg, flags);
32 if (res != -EJUKEBOX) 33 if (res != -EJUKEBOX && res != -EKEYEXPIRED)
33 break; 34 break;
34 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); 35 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
35 res = -ERESTARTSYS; 36 res = -ERESTARTSYS;
@@ -42,9 +43,10 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
42static int 43static int
43nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) 44nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
44{ 45{
45 if (task->tk_status != -EJUKEBOX) 46 if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED)
46 return 0; 47 return 0;
47 nfs_inc_stats(inode, NFSIOS_DELAY); 48 if (task->tk_status == -EJUKEBOX)
49 nfs_inc_stats(inode, NFSIOS_DELAY);
48 task->tk_status = 0; 50 task->tk_status = 0;
49 rpc_restart_call(task); 51 rpc_restart_call(task);
50 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); 52 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
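
EKEYEXPIRED (likely an expired credential, e.g. a Kerberos ticket) now gets the same retry-with-delay treatment as EJUKEBOX in both the sync wrapper and the async handler, but only the jukebox case still bumps the NFSIOS_DELAY counter. A sketch of the widened retry condition with invented error values:

    #include <stdio.h>

    #define EJUKEBOX    101
    #define EKEYEXPIRED 102

    static int calls;
    static int rpc_call(void) { return ++calls < 3 ? -EKEYEXPIRED : 0; }

    static int rpc_wrapper(void)
    {
        int res, delays = 0;
        do {
            res = rpc_call();
            if (res != -EJUKEBOX && res != -EKEYEXPIRED)
                break;
            if (res == -EJUKEBOX)
                delays++;   /* NFSIOS_DELAY only for the jukebox case */
            /* schedule_timeout_killable(...) would sleep here */
        } while (1);
        printf("res=%d jukebox-delays=%d after %d calls\n", res, delays, calls);
        return res;
    }

    int main(void) { return rpc_wrapper(); }
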
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 5fe5492fbd29..56a86f6ac8b5 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -9,7 +9,6 @@
9#include <linux/param.h> 9#include <linux/param.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
14#include <linux/string.h> 13#include <linux/string.h>
15#include <linux/in.h> 14#include <linux/in.h>
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 6ea07a3c75d4..a187200a7aac 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,7 +44,9 @@ enum nfs4_client_state {
44 NFS4CLNT_RECLAIM_REBOOT, 44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_SESSION_SETUP, 47 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_SESSION_DRAINING,
49 NFS4CLNT_RECALL_SLOT,
48}; 50};
49 51
50/* 52/*
@@ -107,6 +109,10 @@ enum {
107 NFS_OWNER_RECLAIM_NOGRACE 109 NFS_OWNER_RECLAIM_NOGRACE
108}; 110};
109 111
112#define NFS_LOCK_NEW 0
113#define NFS_LOCK_RECLAIM 1
114#define NFS_LOCK_EXPIRED 2
115
110/* 116/*
111 * struct nfs4_state maintains the client-side state for a given 117 * struct nfs4_state maintains the client-side state for a given
112 * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). 118 * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
@@ -141,6 +147,7 @@ enum {
141 NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ 147 NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */
142 NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ 148 NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */
143 NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ 149 NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */
150 NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */
144}; 151};
145 152
146struct nfs4_state { 153struct nfs4_state {
@@ -180,6 +187,7 @@ struct nfs4_state_recovery_ops {
 	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
 	int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
 	struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
+	int (*reclaim_complete)(struct nfs_client *);
 };
 
 struct nfs4_state_maintenance_ops {
@@ -200,9 +208,11 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
 /* nfs4proc.c */
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
 extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
+extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
@@ -218,9 +228,11 @@ extern int nfs4_setup_sequence(struct nfs_client *clp,
 		int cache_reply, struct rpc_task *task);
 extern void nfs4_destroy_session(struct nfs4_session *session);
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
-extern int nfs4_proc_create_session(struct nfs_client *, int reset);
+extern int nfs4_proc_create_session(struct nfs_client *);
 extern int nfs4_proc_destroy_session(struct nfs4_session *);
 extern int nfs4_init_session(struct nfs_server *server);
+extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
+		struct nfs_fsinfo *fsinfo);
 #else /* CONFIG_NFS_v4_1 */
 static inline int nfs4_setup_sequence(struct nfs_client *clp,
 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
@@ -267,6 +279,9 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
 extern void nfs4_schedule_state_recovery(struct nfs_client *);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
+extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
+extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
+extern void nfs41_handle_recall_slot(struct nfs_client *clp);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
@@ -275,6 +290,7 @@ extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter);
 extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
 extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
+extern void nfs_release_seqid(struct nfs_seqid *seqid);
 extern void nfs_free_seqid(struct nfs_seqid *seqid);
 
 extern const nfs4_stateid zero_stateid;
@@ -287,6 +303,7 @@ struct nfs4_mount_data;
 
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
+extern struct svc_version nfs4_callback_version4;
 
 #else
 
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index fa3408f20112..f071d12c613b 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/vfs.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 741a562177fc..d79a7b37e56c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,7 @@
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs.h>
 #include <linux/nfs4.h>
@@ -64,6 +65,7 @@
 
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
 static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
@@ -248,19 +250,15 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 		if (state == NULL)
 			break;
 		nfs4_state_mark_reclaim_nograce(clp, state);
-	case -NFS4ERR_STALE_CLIENTID:
+		goto do_state_recovery;
 	case -NFS4ERR_STALE_STATEID:
-	case -NFS4ERR_EXPIRED:
-		nfs4_schedule_state_recovery(clp);
-		ret = nfs4_wait_clnt_recover(clp);
-		if (ret == 0)
-			exception->retry = 1;
-#if !defined(CONFIG_NFS_V4_1)
-		break;
-#else /* !defined(CONFIG_NFS_V4_1) */
-		if (!nfs4_has_session(server->nfs_client))
+		if (state == NULL)
 			break;
-		/* FALLTHROUGH */
+		nfs4_state_mark_reclaim_reboot(clp, state);
+	case -NFS4ERR_STALE_CLIENTID:
+	case -NFS4ERR_EXPIRED:
+		goto do_state_recovery;
+#if defined(CONFIG_NFS_V4_1)
 	case -NFS4ERR_BADSESSION:
 	case -NFS4ERR_BADSLOT:
 	case -NFS4ERR_BAD_HIGH_SLOT:
@@ -270,13 +268,21 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 	case -NFS4ERR_SEQ_MISORDERED:
 		dprintk("%s ERROR: %d Reset session\n", __func__,
 			errorcode);
-		set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+		nfs4_schedule_state_recovery(clp);
 		exception->retry = 1;
-		/* FALLTHROUGH */
-#endif /* !defined(CONFIG_NFS_V4_1) */
+		break;
+#endif /* defined(CONFIG_NFS_V4_1) */
 	case -NFS4ERR_FILE_OPEN:
+		if (exception->timeout > HZ) {
+			/* We have retried a decent amount, time to
+			 * fail
+			 */
+			ret = -EBUSY;
+			break;
+		}
 	case -NFS4ERR_GRACE:
 	case -NFS4ERR_DELAY:
+	case -EKEYEXPIRED:
 		ret = nfs4_delay(server->client, &exception->timeout);
 		if (ret != 0)
 			break;
@@ -285,6 +291,12 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 	}
 	/* We failed to handle the error */
 	return nfs4_map_errors(ret);
+do_state_recovery:
+	nfs4_schedule_state_recovery(clp);
+	ret = nfs4_wait_clnt_recover(clp);
+	if (ret == 0)
+		exception->retry = 1;
+	return ret;
 }
 
 
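The rework above replaces per-case copies of the schedule-recovery-and-wait sequence with one shared do_state_recovery tail reached by goto. A standalone sketch of that control-flow shape, assuming hypothetical helpers handle_error(), recover_state() and wait_for_recovery() in place of the kernel's nfs4_* functions:

#include <stdio.h>

enum { ERR_BAD_STATEID = 1, ERR_STALE_CLIENTID, ERR_EXPIRED, ERR_OTHER };

static void recover_state(void)     { puts("recovering state"); }
static int  wait_for_recovery(void) { return 0; }  /* 0 = recovered */

static int handle_error(int err, int *retry)
{
	int ret = err;

	switch (err) {
	case ERR_BAD_STATEID:
	case ERR_STALE_CLIENTID:
	case ERR_EXPIRED:
		goto do_state_recovery;  /* one shared tail, no duplication */
	default:
		break;                   /* unhandled: caller sees the error */
	}
	return ret;
do_state_recovery:
	recover_state();
	ret = wait_for_recovery();
	if (ret == 0)
		*retry = 1;              /* like exception->retry = 1 */
	return ret;
}

int main(void)
{
	int retry = 0;
	int ret = handle_error(ERR_EXPIRED, &retry);

	printf("ret=%d retry=%d\n", ret, retry);
	return 0;
}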
@@ -311,48 +323,67 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
  * so we need to scan down from highest_used_slotid to 0 looking for the now
  * highest slotid in use.
  * If none found, highest_used_slotid is set to -1.
+ *
+ * Must be called while holding tbl->slot_tbl_lock
  */
 static void
 nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
 {
 	int slotid = free_slotid;
 
-	spin_lock(&tbl->slot_tbl_lock);
 	/* clear used bit in bitmap */
 	__clear_bit(slotid, tbl->used_slots);
 
 	/* update highest_used_slotid when it is freed */
 	if (slotid == tbl->highest_used_slotid) {
 		slotid = find_last_bit(tbl->used_slots, tbl->max_slots);
-		if (slotid >= 0 && slotid < tbl->max_slots)
+		if (slotid < tbl->max_slots)
 			tbl->highest_used_slotid = slotid;
 		else
 			tbl->highest_used_slotid = -1;
 	}
-	rpc_wake_up_next(&tbl->slot_tbl_waitq);
-	spin_unlock(&tbl->slot_tbl_lock);
 	dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__,
 		free_slotid, tbl->highest_used_slotid);
 }
 
-void nfs41_sequence_free_slot(const struct nfs_client *clp,
-			      struct nfs4_sequence_res *res)
+/*
+ * Signal state manager thread if session is drained
+ */
+static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
 {
-	struct nfs4_slot_table *tbl;
+	struct rpc_task *task;
 
-	if (!nfs4_has_session(clp)) {
-		dprintk("%s: No session\n", __func__);
+	if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) {
+		task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
+		if (task)
+			rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
 		return;
 	}
+
+	if (ses->fc_slot_table.highest_used_slotid != -1)
+		return;
+
+	dprintk("%s COMPLETE: Session Drained\n", __func__);
+	complete(&ses->complete);
+}
+
+static void nfs41_sequence_free_slot(const struct nfs_client *clp,
+			struct nfs4_sequence_res *res)
+{
+	struct nfs4_slot_table *tbl;
+
 	tbl = &clp->cl_session->fc_slot_table;
 	if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
-		dprintk("%s: No slot\n", __func__);
 		/* just wake up the next guy waiting since
 		 * we may have not consumed a slot after all */
-		rpc_wake_up_next(&tbl->slot_tbl_waitq);
+		dprintk("%s: No slot\n", __func__);
 		return;
 	}
+
+	spin_lock(&tbl->slot_tbl_lock);
 	nfs4_free_slot(tbl, res->sr_slotid);
+	nfs41_check_drain_session_complete(clp->cl_session);
+	spin_unlock(&tbl->slot_tbl_lock);
 	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
 
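With the locking hoisted to the caller, nfs4_free_slot() is pure bookkeeping: clear the slot's bit and, if it was the highest slot in use, scan down for the new highest; nfs41_check_drain_session_complete() then completes the drain once highest_used_slotid reaches -1. A self-contained model of that bookkeeping, where a single 64-bit word stands in for tbl->used_slots and the downward scan plays the role of find_last_bit():

#include <stdint.h>
#include <stdio.h>

struct slot_table {
	uint64_t used;        /* bit n set => slot n busy */
	int highest_used;     /* -1 when the table is drained */
};

static void free_slot(struct slot_table *tbl, int slotid)
{
	tbl->used &= ~(UINT64_C(1) << slotid);
	if (slotid == tbl->highest_used) {
		int i;

		for (i = slotid; i >= 0; i--)   /* like find_last_bit() */
			if (tbl->used & (UINT64_C(1) << i))
				break;
		tbl->highest_used = i;          /* -1 if none left */
	}
}

int main(void)
{
	struct slot_table tbl = { .used = 0x0b, .highest_used = 3 }; /* 0,1,3 */

	free_slot(&tbl, 3);
	printf("highest_used=%d\n", tbl.highest_used);  /* 1 */
	free_slot(&tbl, 1);
	free_slot(&tbl, 0);
	printf("highest_used=%d\n", tbl.highest_used);  /* -1: drained */
	return 0;
}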
@@ -377,10 +408,10 @@ static void nfs41_sequence_done(struct nfs_client *clp,
 	if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
 		goto out;
 
-	tbl = &clp->cl_session->fc_slot_table;
-	slot = tbl->slots + res->sr_slotid;
-
+	/* Check the SEQUENCE operation status */
 	if (res->sr_status == 0) {
+		tbl = &clp->cl_session->fc_slot_table;
+		slot = tbl->slots + res->sr_slotid;
 		/* Update the slot's sequence and clientid lease timer */
 		++slot->seq_nr;
 		timestamp = res->sr_renewal_time;
@@ -388,7 +419,9 @@ static void nfs41_sequence_done(struct nfs_client *clp,
 		if (time_before(clp->cl_last_renewal, timestamp))
 			clp->cl_last_renewal = timestamp;
 		spin_unlock(&clp->cl_lock);
-		return;
+		/* Check sequence flags */
+		if (atomic_read(&clp->cl_count) > 1)
+			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
 	}
 out:
 	/* The session may be reset by one of the error handlers. */
@@ -407,7 +440,7 @@ out:
  * Note: must be called with under the slot_tbl_lock.
  */
 static u8
-nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task)
+nfs4_find_slot(struct nfs4_slot_table *tbl)
 {
 	int slotid;
 	u8 ret_id = NFS4_MAX_SLOT_TABLE;
@@ -429,24 +462,6 @@ out:
 	return ret_id;
 }
 
-static int nfs4_recover_session(struct nfs4_session *session)
-{
-	struct nfs_client *clp = session->clp;
-	unsigned int loop;
-	int ret;
-
-	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
-		ret = nfs4_wait_clnt_recover(clp);
-		if (ret != 0)
-			break;
-		if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
-			break;
-		nfs4_schedule_state_manager(clp);
-		ret = -EIO;
-	}
-	return ret;
-}
-
 static int nfs41_setup_sequence(struct nfs4_session *session,
 				struct nfs4_sequence_args *args,
 				struct nfs4_sequence_res *res,
@@ -455,7 +470,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 {
 	struct nfs4_slot *slot;
 	struct nfs4_slot_table *tbl;
-	int status = 0;
 	u8 slotid;
 
 	dprintk("--> %s\n", __func__);
@@ -468,24 +482,27 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 	tbl = &session->fc_slot_table;
 
 	spin_lock(&tbl->slot_tbl_lock);
-	if (test_bit(NFS4CLNT_SESSION_SETUP, &session->clp->cl_state)) {
-		if (tbl->highest_used_slotid != -1) {
-			rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
-			spin_unlock(&tbl->slot_tbl_lock);
-			dprintk("<-- %s: Session reset: draining\n", __func__);
-			return -EAGAIN;
-		}
+	if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) &&
+	    !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
+		/*
+		 * The state manager will wait until the slot table is empty.
+		 * Schedule the reset thread
+		 */
+		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+		spin_unlock(&tbl->slot_tbl_lock);
+		dprintk("%s Schedule Session Reset\n", __func__);
+		return -EAGAIN;
+	}
 
-	/* The slot table is empty; start the reset thread */
-	dprintk("%s Session Reset\n", __func__);
+	if (!rpc_queue_empty(&tbl->slot_tbl_waitq) &&
+	    !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
+		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
 		spin_unlock(&tbl->slot_tbl_lock);
-		status = nfs4_recover_session(session);
-		if (status)
-			return status;
-		spin_lock(&tbl->slot_tbl_lock);
+		dprintk("%s enforce FIFO order\n", __func__);
+		return -EAGAIN;
 	}
 
-	slotid = nfs4_find_slot(tbl, task);
+	slotid = nfs4_find_slot(tbl);
 	if (slotid == NFS4_MAX_SLOT_TABLE) {
 		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
 		spin_unlock(&tbl->slot_tbl_lock);
@@ -494,6 +511,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 	}
 	spin_unlock(&tbl->slot_tbl_lock);
 
+	rpc_task_set_priority(task, RPC_PRIORITY_NORMAL);
 	slot = tbl->slots + slotid;
 	args->sa_session = session;
 	args->sa_slotid = slotid;
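nfs41_setup_sequence() now has two gates in front of slot allocation: non-privileged requests are parked while the session drains, and also whenever earlier waiters are already queued, so slots are handed out in FIFO order; only privileged (recovery) tasks bypass both. A toy predicate capturing that decision, with plain booleans standing in for the cl_state bit, the wait queue and the RPC task priority:

#include <stdbool.h>
#include <stdio.h>

/* Decide whether a request may try to grab a slot right now. */
static bool may_take_slot(bool session_draining, bool waiters_queued,
			  bool privileged)
{
	if (session_draining && !privileged)
		return false;   /* drain gate: let the table empty out */
	if (waiters_queued && !privileged)
		return false;   /* FIFO gate: don't jump the queue */
	return true;
}

int main(void)
{
	printf("%d\n", may_take_slot(true,  false, false)); /* 0: draining */
	printf("%d\n", may_take_slot(true,  false, true));  /* 1: recovery op */
	printf("%d\n", may_take_slot(false, true,  false)); /* 0: FIFO order */
	printf("%d\n", may_take_slot(false, false, false)); /* 1 */
	return 0;
}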
@@ -527,7 +545,7 @@ int nfs4_setup_sequence(struct nfs_client *clp,
 		goto out;
 	ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
 				   task);
-	if (ret != -EAGAIN) {
+	if (ret && ret != -EAGAIN) {
 		/* terminate rpc task */
 		task->tk_status = ret;
 		task->tk_action = NULL;
@@ -556,12 +574,17 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
 	rpc_call_start(task);
 }
 
+static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata)
+{
+	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+	nfs41_call_sync_prepare(task, calldata);
+}
+
 static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs41_call_sync_data *data = calldata;
 
 	nfs41_sequence_done(data->clp, data->seq_res, task->tk_status);
-	nfs41_sequence_free_slot(data->clp, data->seq_res);
 }
 
 struct rpc_call_ops nfs41_call_sync_ops = {
@@ -569,12 +592,18 @@ struct rpc_call_ops nfs41_call_sync_ops = {
 	.rpc_call_done = nfs41_call_sync_done,
 };
 
+struct rpc_call_ops nfs41_call_priv_sync_ops = {
+	.rpc_call_prepare = nfs41_call_priv_sync_prepare,
+	.rpc_call_done = nfs41_call_sync_done,
+};
+
 static int nfs4_call_sync_sequence(struct nfs_client *clp,
 				   struct rpc_clnt *clnt,
 				   struct rpc_message *msg,
 				   struct nfs4_sequence_args *args,
 				   struct nfs4_sequence_res *res,
-				   int cache_reply)
+				   int cache_reply,
+				   int privileged)
 {
 	int ret;
 	struct rpc_task *task;
@@ -592,6 +621,8 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp,
 	};
 
 	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+	if (privileged)
+		task_setup.callback_ops = &nfs41_call_priv_sync_ops;
 	task = rpc_run_task(&task_setup);
 	if (IS_ERR(task))
 		ret = PTR_ERR(task);
@@ -609,7 +640,7 @@ int _nfs4_call_sync_session(struct nfs_server *server,
 			   int cache_reply)
 {
 	return nfs4_call_sync_sequence(server->nfs_client, server->client,
-				       msg, args, res, cache_reply);
+				       msg, args, res, cache_reply, 0);
 }
 
 #endif /* CONFIG_NFS_V4_1 */
@@ -637,15 +668,6 @@ static void nfs4_sequence_done(const struct nfs_server *server,
 #endif /* CONFIG_NFS_V4_1 */
 }
 
-/* no restart, therefore free slot here */
-static void nfs4_sequence_done_free_slot(const struct nfs_server *server,
-					 struct nfs4_sequence_res *res,
-					 int rpc_status)
-{
-	nfs4_sequence_done(server, res, rpc_status);
-	nfs4_sequence_free_slot(server->nfs_client, res);
-}
-
 static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
@@ -705,8 +727,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
 	if (p->o_arg.seqid == NULL)
 		goto err_free;
-	p->path.mnt = mntget(path->mnt);
-	p->path.dentry = dget(path->dentry);
+	path_get(path);
+	p->path = *path;
 	p->dir = parent;
 	p->owner = sp;
 	atomic_inc(&sp->so_count);
@@ -720,9 +742,15 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->o_arg.bitmask = server->attr_bitmask;
 	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
 	if (flags & O_EXCL) {
-		u32 *s = (u32 *) p->o_arg.u.verifier.data;
-		s[0] = jiffies;
-		s[1] = current->pid;
+		if (nfs4_has_persistent_session(server->nfs_client)) {
+			/* GUARDED */
+			p->o_arg.u.attrs = &p->attrs;
+			memcpy(&p->attrs, attrs, sizeof(p->attrs));
+		} else { /* EXCLUSIVE4_1 */
+			u32 *s = (u32 *) p->o_arg.u.verifier.data;
+			s[0] = jiffies;
+			s[1] = current->pid;
+		}
 	} else if (flags & O_CREAT) {
 		p->o_arg.u.attrs = &p->attrs;
 		memcpy(&p->attrs, attrs, sizeof(p->attrs));
@@ -776,13 +804,16 @@ static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode
 		goto out;
 	switch (mode & (FMODE_READ|FMODE_WRITE)) {
 	case FMODE_READ:
-		ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0;
+		ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0
+			&& state->n_rdonly != 0;
 		break;
 	case FMODE_WRITE:
-		ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0;
+		ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0
+			&& state->n_wronly != 0;
 		break;
 	case FMODE_READ|FMODE_WRITE:
-		ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
+		ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0
+			&& state->n_rdwr != 0;
 	}
 out:
 	return ret;
@@ -1047,7 +1078,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
 	memset(&opendata->o_res, 0, sizeof(opendata->o_res));
 	memset(&opendata->c_res, 0, sizeof(opendata->c_res));
 	nfs4_init_opendata_res(opendata);
-	ret = _nfs4_proc_open(opendata);
+	ret = _nfs4_recover_proc_open(opendata);
 	if (ret != 0)
 		return ret;
 	newstate = nfs4_opendata_to_nfs4_state(opendata);
@@ -1135,7 +1166,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 	int err;
 	do {
 		err = _nfs4_do_open_reclaim(ctx, state);
-		if (err != -NFS4ERR_DELAY)
+		if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
 			break;
 		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
@@ -1183,6 +1214,14 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			case -ENOENT:
 			case -ESTALE:
 				goto out;
+			case -NFS4ERR_BADSESSION:
+			case -NFS4ERR_BADSLOT:
+			case -NFS4ERR_BAD_HIGH_SLOT:
+			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+			case -NFS4ERR_DEADSESSION:
+				nfs4_schedule_state_recovery(
+					server->nfs_client);
+				goto out;
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_EXPIRED:
@@ -1330,14 +1369,20 @@ out_no_action:
 
 }
 
+static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata)
+{
+	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+	nfs4_open_prepare(task, calldata);
+}
+
 static void nfs4_open_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_opendata *data = calldata;
 
 	data->rpc_status = task->tk_status;
 
-	nfs4_sequence_done_free_slot(data->o_arg.server, &data->o_res.seq_res,
+	nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res,
 			task->tk_status);
 
 	if (RPC_ASSASSINATED(task))
 		return;
@@ -1388,10 +1433,13 @@ static const struct rpc_call_ops nfs4_open_ops = {
 	.rpc_release = nfs4_open_release,
 };
 
-/*
- * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
- */
-static int _nfs4_proc_open(struct nfs4_opendata *data)
+static const struct rpc_call_ops nfs4_recover_open_ops = {
+	.rpc_call_prepare = nfs4_recover_open_prepare,
+	.rpc_call_done = nfs4_open_done,
+	.rpc_release = nfs4_open_release,
+};
+
+static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
 {
 	struct inode *dir = data->dir->d_inode;
 	struct nfs_server *server = NFS_SERVER(dir);
@@ -1418,21 +1466,57 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	data->rpc_done = 0;
 	data->rpc_status = 0;
 	data->cancelled = 0;
+	if (isrecover)
+		task_setup_data.callback_ops = &nfs4_recover_open_ops;
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 	status = nfs4_wait_for_completion_rpc_task(task);
 	if (status != 0) {
 		data->cancelled = 1;
 		smp_wmb();
 	} else
 		status = data->rpc_status;
 	rpc_put_task(task);
+
+	return status;
+}
+
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
+{
+	struct inode *dir = data->dir->d_inode;
+	struct nfs_openres *o_res = &data->o_res;
+	int status;
+
+	status = nfs4_run_open_task(data, 1);
 	if (status != 0 || !data->rpc_done)
 		return status;
 
-	if (o_res->fh.size == 0)
-		_nfs4_proc_lookup(dir, o_arg->name, &o_res->fh, o_res->f_attr);
+	nfs_refresh_inode(dir, o_res->dir_attr);
+
+	if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
+		status = _nfs4_proc_open_confirm(data);
+		if (status != 0)
+			return status;
+	}
+
+	return status;
+}
+
+/*
+ * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
+ */
+static int _nfs4_proc_open(struct nfs4_opendata *data)
+{
+	struct inode *dir = data->dir->d_inode;
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_openargs *o_arg = &data->o_arg;
+	struct nfs_openres *o_res = &data->o_res;
+	int status;
+
+	status = nfs4_run_open_task(data, 0);
+	if (status != 0 || !data->rpc_done)
+		return status;
 
 	if (o_arg->open_flags & O_CREAT) {
 		update_changeattr(dir, &o_res->cinfo);
@@ -1488,7 +1572,7 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s
 	return ret;
 }
 
-static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
+static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	struct nfs4_exception exception = { };
@@ -1496,10 +1580,17 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4
 
 	do {
 		err = _nfs4_open_expired(ctx, state);
-		if (err != -NFS4ERR_DELAY)
-			break;
-		nfs4_handle_exception(server, err, &exception);
+		switch (err) {
+		default:
+			goto out;
+		case -NFS4ERR_GRACE:
+		case -NFS4ERR_DELAY:
+		case -EKEYEXPIRED:
+			nfs4_handle_exception(server, err, &exception);
+			err = 0;
+		}
 	} while (exception.retry);
+out:
 	return err;
 }
 
@@ -1573,6 +1664,8 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
 	status = PTR_ERR(state);
 	if (IS_ERR(state))
 		goto err_opendata_put;
+	if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0)
+		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
 	*res = state;
@@ -1712,6 +1805,18 @@ static void nfs4_free_closedata(void *data)
 	kfree(calldata);
 }
 
+static void nfs4_close_clear_stateid_flags(struct nfs4_state *state,
+		fmode_t fmode)
+{
+	spin_lock(&state->owner->so_lock);
+	if (!(fmode & FMODE_READ))
+		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+	if (!(fmode & FMODE_WRITE))
+		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+	clear_bit(NFS_O_RDWR_STATE, &state->flags);
+	spin_unlock(&state->owner->so_lock);
+}
+
 static void nfs4_close_done(struct rpc_task *task, void *data)
 {
 	struct nfs4_closedata *calldata = data;
@@ -1728,6 +1833,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 		case 0:
 			nfs_set_open_stateid(state, &calldata->res.stateid, 0);
 			renew_lease(server, calldata->timestamp);
+			nfs4_close_clear_stateid_flags(state,
+					calldata->arg.fmode);
 			break;
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_OLD_STATEID:
@@ -1736,12 +1843,10 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 			if (calldata->arg.fmode == 0)
 				break;
 		default:
-			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
-				nfs4_restart_rpc(task, server->nfs_client);
-				return;
-			}
+			if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+				rpc_restart_call_prepare(task);
 	}
-	nfs4_sequence_free_slot(server->nfs_client, &calldata->res.seq_res);
+	nfs_release_seqid(calldata->arg.seqid);
 	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
 }
 
@@ -1749,38 +1854,39 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs4_closedata *calldata = data;
 	struct nfs4_state *state = calldata->state;
-	int clear_rd, clear_wr, clear_rdwr;
+	int call_close = 0;
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		return;
 
-	clear_rd = clear_wr = clear_rdwr = 0;
+	task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+	calldata->arg.fmode = FMODE_READ|FMODE_WRITE;
 	spin_lock(&state->owner->so_lock);
 	/* Calculate the change in open mode */
 	if (state->n_rdwr == 0) {
 		if (state->n_rdonly == 0) {
-			clear_rd |= test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags);
-			clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+			calldata->arg.fmode &= ~FMODE_READ;
 		}
 		if (state->n_wronly == 0) {
-			clear_wr |= test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags);
-			clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags);
+			call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+			calldata->arg.fmode &= ~FMODE_WRITE;
 		}
 	}
 	spin_unlock(&state->owner->so_lock);
-	if (!clear_rd && !clear_wr && !clear_rdwr) {
+
+	if (!call_close) {
 		/* Note: exit _without_ calling nfs4_close_done */
 		task->tk_action = NULL;
 		return;
 	}
+
+	if (calldata->arg.fmode == 0)
+		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+
 	nfs_fattr_init(calldata->res.fattr);
-	if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
-		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.fmode = FMODE_READ;
-	} else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
-		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.fmode = FMODE_WRITE;
-	}
 	calldata->timestamp = jiffies;
 	if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client,
 				&calldata->arg.seq_args, &calldata->res.seq_res,
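The rewritten nfs4_close_prepare() starts from FMODE_READ|FMODE_WRITE and subtracts whatever no longer has open references; if any referenced mode still has its state bit set the task proceeds, and an fmode of zero turns the OPEN_DOWNGRADE into a full CLOSE. A compact model of that decision, with pick_op() as an illustrative stand-in rather than kernel code:

#include <stdio.h>

#define FMODE_READ  0x1
#define FMODE_WRITE 0x2

/* Returns the RPC op to send and fills *fmode with the remaining mode. */
static const char *pick_op(int n_rdonly, int n_wronly, int n_rdwr, int *fmode)
{
	*fmode = FMODE_READ | FMODE_WRITE;
	if (n_rdwr == 0) {
		if (n_rdonly == 0)
			*fmode &= ~FMODE_READ;   /* no readers left */
		if (n_wronly == 0)
			*fmode &= ~FMODE_WRITE;  /* no writers left */
	}
	return *fmode == 0 ? "CLOSE" : "OPEN_DOWNGRADE";
}

int main(void)
{
	int fmode;
	const char *op;

	op = pick_op(0, 0, 0, &fmode);          /* nothing referenced */
	printf("%s (fmode=%d)\n", op, fmode);   /* CLOSE (fmode=0) */

	op = pick_op(2, 0, 0, &fmode);          /* only readers remain */
	printf("%s (fmode=%d)\n", op, fmode);   /* OPEN_DOWNGRADE (fmode=1) */
	return 0;
}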
@@ -1832,8 +1938,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->state = state;
 	calldata->arg.fh = NFS_FH(state->inode);
 	calldata->arg.stateid = &state->open_stateid;
-	if (nfs4_has_session(server->nfs_client))
-		memset(calldata->arg.stateid->data, 0, 4);    /* clear seqid */
 	/* Serialization for the sequence id */
 	calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
 	if (calldata->arg.seqid == NULL)
@@ -1844,8 +1948,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
 	calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
-	calldata->path.mnt = mntget(path->mnt);
-	calldata->path.dentry = dget(path->dentry);
+	path_get(path);
+	calldata->path = *path;
 
 	msg.rpc_argp = &calldata->arg,
 	msg.rpc_resp = &calldata->res,
@@ -1981,7 +2085,7 @@ out_drop:
 	return 0;
 }
 
-void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 {
 	if (ctx->state == NULL)
 		return;
@@ -2532,7 +2636,6 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	nfs4_sequence_done(res->server, &res->seq_res, task->tk_status);
 	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
 		return 0;
-	nfs4_sequence_free_slot(res->server->nfs_client, &res->seq_res);
 	update_changeattr(dir, &res->cinfo);
 	nfs_post_op_update_inode(dir, &res->dir_attr);
 	return 1;
@@ -2971,11 +3074,10 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 
 	dprintk("--> %s\n", __func__);
 
-	/* nfs4_sequence_free_slot called in the read rpc_call_done */
 	nfs4_sequence_done(server, &data->res.seq_res, task->tk_status);
 
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
-		nfs4_restart_rpc(task, server->nfs_client);
+		nfs_restart_rpc(task, server->nfs_client);
 		return -EAGAIN;
 	}
 
@@ -2995,12 +3097,11 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 
-	/* slot is freed in nfs_writeback_done */
 	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
 			   task->tk_status);
 
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
-		nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		return -EAGAIN;
 	}
 	if (task->tk_status >= 0) {
@@ -3028,11 +3129,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
 			   task->tk_status);
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
-		nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		return -EAGAIN;
 	}
-	nfs4_sequence_free_slot(NFS_SERVER(inode)->nfs_client,
-				&data->res.seq_res);
 	nfs_refresh_inode(inode, data->res.fattr);
 	return 0;
 }
@@ -3050,10 +3149,19 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
  * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
  * standalone procedure for queueing an asynchronous RENEW.
  */
+static void nfs4_renew_release(void *data)
+{
+	struct nfs_client *clp = data;
+
+	if (atomic_read(&clp->cl_count) > 1)
+		nfs4_schedule_state_renewal(clp);
+	nfs_put_client(clp);
+}
+
 static void nfs4_renew_done(struct rpc_task *task, void *data)
 {
-	struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp;
-	unsigned long timestamp = (unsigned long)data;
+	struct nfs_client *clp = data;
+	unsigned long timestamp = task->tk_start;
 
 	if (task->tk_status < 0) {
 		/* Unless we're shutting down, schedule state recovery! */
@@ -3069,6 +3177,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
 
 static const struct rpc_call_ops nfs4_renew_ops = {
 	.rpc_call_done = nfs4_renew_done,
+	.rpc_release = nfs4_renew_release,
 };
 
 int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3079,8 +3188,10 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
 		.rpc_cred = cred,
 	};
 
+	if (!atomic_inc_not_zero(&clp->cl_count))
+		return -EIO;
 	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
-			      &nfs4_renew_ops, (void *)jiffies);
+			      &nfs4_renew_ops, clp);
 }
 
 int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
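The RENEW changes close a lifetime window: the async call now takes a client reference only if cl_count is still non-zero, and the new rpc_release callback drops it when the RPC finishes. The same idiom expressed in portable C11 atomics; renew_async_start() and renew_release() are hypothetical stand-ins for atomic_inc_not_zero() plus nfs_put_client(), not the kernel API:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int cl_count = 1;   /* one reference held by the owner */

static int renew_async_start(void)
{
	int old = atomic_load(&cl_count);

	do {                              /* atomic_inc_not_zero() shape */
		if (old == 0)
			return -1;        /* client already being torn down */
	} while (!atomic_compare_exchange_weak(&cl_count, &old, old + 1));
	return 0;
}

static void renew_release(void)
{
	if (atomic_fetch_sub(&cl_count, 1) == 1)
		puts("last reference dropped: free client");
}

int main(void)
{
	if (renew_async_start() == 0) {
		puts("RENEW in flight with client pinned");
		renew_release();          /* runs when the RPC completes */
	}
	return 0;
}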
@@ -3331,15 +3442,14 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		if (state == NULL)
 			break;
 		nfs4_state_mark_reclaim_nograce(clp, state);
-	case -NFS4ERR_STALE_CLIENTID:
+		goto do_state_recovery;
 	case -NFS4ERR_STALE_STATEID:
+		if (state == NULL)
+			break;
+		nfs4_state_mark_reclaim_reboot(clp, state);
+	case -NFS4ERR_STALE_CLIENTID:
 	case -NFS4ERR_EXPIRED:
-		rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
-		nfs4_schedule_state_recovery(clp);
-		if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
-			rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
-		task->tk_status = 0;
-		return -EAGAIN;
+		goto do_state_recovery;
 #if defined(CONFIG_NFS_V4_1)
 	case -NFS4ERR_BADSESSION:
 	case -NFS4ERR_BADSLOT:
@@ -3350,7 +3460,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	case -NFS4ERR_SEQ_MISORDERED:
 		dprintk("%s ERROR %d, Reset session\n", __func__,
 			task->tk_status);
-		set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
+		nfs4_schedule_state_recovery(clp);
 		task->tk_status = 0;
 		return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
@@ -3358,6 +3468,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		if (server)
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
 	case -NFS4ERR_GRACE:
+	case -EKEYEXPIRED:
 		rpc_delay(task, NFS4_POLL_RETRY_MAX);
 		task->tk_status = 0;
 		return -EAGAIN;
@@ -3367,6 +3478,13 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	}
 	task->tk_status = nfs4_map_errors(task->tk_status);
 	return 0;
+do_state_recovery:
+	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+	nfs4_schedule_state_recovery(clp);
+	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+	task->tk_status = 0;
+	return -EAGAIN;
 }
 
 static int
@@ -3463,6 +3581,7 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
 			case -NFS4ERR_RESOURCE:
 				/* The IBM lawyers misread another document! */
 			case -NFS4ERR_DELAY:
+			case -EKEYEXPIRED:
 				err = nfs4_delay(clp->cl_rpcclient, &timeout);
 		}
 	} while (err == 0);
@@ -3483,12 +3602,23 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegreturndata *data = calldata;
 
-	nfs4_sequence_done_free_slot(data->res.server, &data->res.seq_res,
+	nfs4_sequence_done(data->res.server, &data->res.seq_res,
 			task->tk_status);
 
-	data->rpc_status = task->tk_status;
-	if (data->rpc_status == 0)
+	switch (task->tk_status) {
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_EXPIRED:
+	case 0:
 		renew_lease(data->res.server, data->timestamp);
+		break;
+	default:
+		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
+				-EAGAIN) {
+			nfs_restart_rpc(task, data->res.server->nfs_client);
+			return;
+		}
+	}
+	data->rpc_status = task->tk_status;
 }
 
 static void nfs4_delegreturn_release(void *calldata)
@@ -3741,11 +3871,9 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 		break;
 	default:
 		if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
-			nfs4_restart_rpc(task,
+			nfs_restart_rpc(task,
 				calldata->server->nfs_client);
 	}
-	nfs4_sequence_free_slot(calldata->server->nfs_client,
-				&calldata->res.seq_res);
 }
 
 static void nfs4_locku_prepare(struct rpc_task *task, void *data)
@@ -3921,14 +4049,20 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
 }
 
+static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata)
+{
+	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+	nfs4_lock_prepare(task, calldata);
+}
+
 static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_lockdata *data = calldata;
 
 	dprintk("%s: begin!\n", __func__);
 
-	nfs4_sequence_done_free_slot(data->server, &data->res.seq_res,
+	nfs4_sequence_done(data->server, &data->res.seq_res,
 			task->tk_status);
 
 	data->rpc_status = task->tk_status;
 	if (RPC_ASSASSINATED(task))
@@ -3976,7 +4110,35 @@ static const struct rpc_call_ops nfs4_lock_ops = {
 	.rpc_release = nfs4_lock_release,
 };
 
-static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim)
+static const struct rpc_call_ops nfs4_recover_lock_ops = {
+	.rpc_call_prepare = nfs4_recover_lock_prepare,
+	.rpc_call_done = nfs4_lock_done,
+	.rpc_release = nfs4_lock_release,
+};
+
+static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state *state = lsp->ls_state;
+
+	switch (error) {
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_EXPIRED:
+		if (new_lock_owner != 0 ||
+		    (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
+			nfs4_state_mark_reclaim_nograce(clp, state);
+		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+		break;
+	case -NFS4ERR_STALE_STATEID:
+		if (new_lock_owner != 0 ||
+		    (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
+			nfs4_state_mark_reclaim_reboot(clp, state);
+		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+	};
+}
+
+static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type)
 {
 	struct nfs4_lockdata *data;
 	struct rpc_task *task;
@@ -4000,8 +4162,11 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		return -ENOMEM;
 	if (IS_SETLKW(cmd))
 		data->arg.block = 1;
-	if (reclaim != 0)
-		data->arg.reclaim = 1;
+	if (recovery_type > NFS_LOCK_NEW) {
+		if (recovery_type == NFS_LOCK_RECLAIM)
+			data->arg.reclaim = NFS_LOCK_RECLAIM;
+		task_setup_data.callback_ops = &nfs4_recover_lock_ops;
+	}
 	msg.rpc_argp = &data->arg,
 	msg.rpc_resp = &data->res,
 	task_setup_data.callback_data = data;
@@ -4011,6 +4176,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 	ret = nfs4_wait_for_completion_rpc_task(task);
 	if (ret == 0) {
 		ret = data->rpc_status;
+		if (ret)
+			nfs4_handle_setlk_error(data->server, data->lsp,
+					data->arg.new_lock_owner, ret);
 	} else
 		data->cancelled = 1;
 	rpc_put_task(task);
@@ -4028,8 +4196,8 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
 		/* Cache the lock if possible... */
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
 			return 0;
-		err = _nfs4_do_setlk(state, F_SETLK, request, 1);
-		if (err != -NFS4ERR_DELAY)
+		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
+		if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
 			break;
 		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
@@ -4048,11 +4216,18 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
 	do {
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
 			return 0;
-		err = _nfs4_do_setlk(state, F_SETLK, request, 0);
-		if (err != -NFS4ERR_DELAY)
-			break;
-		nfs4_handle_exception(server, err, &exception);
+		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
+		switch (err) {
+		default:
+			goto out;
+		case -NFS4ERR_GRACE:
+		case -NFS4ERR_DELAY:
+		case -EKEYEXPIRED:
+			nfs4_handle_exception(server, err, &exception);
+			err = 0;
+		}
 	} while (exception.retry);
+out:
 	return err;
 }
 
@@ -4060,8 +4235,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 {
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	unsigned char fl_flags = request->fl_flags;
-	int status;
+	int status = -ENOLCK;
 
+	if ((fl_flags & FL_POSIX) &&
+	    !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+		goto out;
 	/* Is this a delegated open? */
 	status = nfs4_set_lock_state(state, request);
 	if (status != 0)
@@ -4078,7 +4256,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 		status = do_vfs_lock(request->fl_file, request);
 		goto out_unlock;
 	}
-	status = _nfs4_do_setlk(state, cmd, request, 0);
+	status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
 	if (status != 0)
 		goto out_unlock;
 	/* Note: we always want to sleep here! */
@@ -4161,7 +4339,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4161 if (err != 0) 4339 if (err != 0)
4162 goto out; 4340 goto out;
4163 do { 4341 do {
4164 err = _nfs4_do_setlk(state, F_SETLK, fl, 0); 4342 err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
4165 switch (err) { 4343 switch (err) {
4166 default: 4344 default:
4167 printk(KERN_ERR "%s: unhandled error %d.\n", 4345 printk(KERN_ERR "%s: unhandled error %d.\n",
@@ -4172,6 +4350,11 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4172 case -NFS4ERR_EXPIRED: 4350 case -NFS4ERR_EXPIRED:
4173 case -NFS4ERR_STALE_CLIENTID: 4351 case -NFS4ERR_STALE_CLIENTID:
4174 case -NFS4ERR_STALE_STATEID: 4352 case -NFS4ERR_STALE_STATEID:
4353 case -NFS4ERR_BADSESSION:
4354 case -NFS4ERR_BADSLOT:
4355 case -NFS4ERR_BAD_HIGH_SLOT:
4356 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
4357 case -NFS4ERR_DEADSESSION:
4175 nfs4_schedule_state_recovery(server->nfs_client); 4358 nfs4_schedule_state_recovery(server->nfs_client);
4176 goto out; 4359 goto out;
4177 case -ERESTARTSYS: 4360 case -ERESTARTSYS:
@@ -4191,6 +4374,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4191 err = 0; 4374 err = 0;
4192 goto out; 4375 goto out;
4193 case -NFS4ERR_DELAY: 4376 case -NFS4ERR_DELAY:
4377 case -EKEYEXPIRED:
4194 break; 4378 break;
4195 } 4379 }
4196 err = nfs4_handle_exception(server, err, &exception); 4380 err = nfs4_handle_exception(server, err, &exception);
@@ -4296,7 +4480,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
4296 * NFS4ERR_BADSESSION in the sequence operation, and will therefore 4480 * NFS4ERR_BADSESSION in the sequence operation, and will therefore
4297 * be in some phase of session reset. 4481 * be in some phase of session reset.
4298 */ 4482 */
4299static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) 4483int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4300{ 4484{
4301 nfs4_verifier verifier; 4485 nfs4_verifier verifier;
4302 struct nfs41_exchange_id_args args = { 4486 struct nfs41_exchange_id_args args = {
@@ -4318,6 +4502,9 @@ static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4318 dprintk("--> %s\n", __func__); 4502 dprintk("--> %s\n", __func__);
4319 BUG_ON(clp == NULL); 4503 BUG_ON(clp == NULL);
4320 4504
4505 /* Remove server-only flags */
4506 args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R;
4507
4321 p = (u32 *)verifier.data; 4508 p = (u32 *)verifier.data;
4322 *p++ = htonl((u32)clp->cl_boot_time.tv_sec); 4509 *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
4323 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 4510 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
@@ -4333,7 +4520,7 @@ static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4333 4520
4334 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); 4521 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4335 4522
4336 if (status != NFS4ERR_CLID_INUSE) 4523 if (status != -NFS4ERR_CLID_INUSE)
4337 break; 4524 break;
4338 4525
4339 if (signalled()) 4526 if (signalled())
@@ -4361,11 +4548,12 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task,
4361 (struct nfs4_get_lease_time_data *)calldata; 4548 (struct nfs4_get_lease_time_data *)calldata;
4362 4549
4363 dprintk("--> %s\n", __func__); 4550 dprintk("--> %s\n", __func__);
4551 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
4364 /* just setup sequence, do not trigger session recovery 4552 /* just setup sequence, do not trigger session recovery
4365 since we're invoked within one */ 4553 since we're invoked within one */
4366 ret = nfs41_setup_sequence(data->clp->cl_session, 4554 ret = nfs41_setup_sequence(data->clp->cl_session,
4367 &data->args->la_seq_args, 4555 &data->args->la_seq_args,
4368 &data->res->lr_seq_res, 0, task); 4556 &data->res->lr_seq_res, 0, task);
4369 4557
4370 BUG_ON(ret == -EAGAIN); 4558 BUG_ON(ret == -EAGAIN);
4371 rpc_call_start(task); 4559 rpc_call_start(task);
@@ -4386,13 +4574,13 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4386 switch (task->tk_status) { 4574 switch (task->tk_status) {
4387 case -NFS4ERR_DELAY: 4575 case -NFS4ERR_DELAY:
4388 case -NFS4ERR_GRACE: 4576 case -NFS4ERR_GRACE:
4577 case -EKEYEXPIRED:
4389 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); 4578 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
4390 rpc_delay(task, NFS4_POLL_RETRY_MIN); 4579 rpc_delay(task, NFS4_POLL_RETRY_MIN);
4391 task->tk_status = 0; 4580 task->tk_status = 0;
4392 nfs4_restart_rpc(task, data->clp); 4581 nfs_restart_rpc(task, data->clp);
4393 return; 4582 return;
4394 } 4583 }
4395 nfs41_sequence_free_slot(data->clp, &data->res->lr_seq_res);
4396 dprintk("<-- %s\n", __func__); 4584 dprintk("<-- %s\n", __func__);
4397} 4585}
4398 4586
@@ -4444,28 +4632,33 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
4444/* 4632/*
4445 * Reset a slot table 4633 * Reset a slot table
4446 */ 4634 */
4447static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots, 4635static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
4448 int old_max_slots, int ivalue) 4636 int ivalue)
4449{ 4637{
4638 struct nfs4_slot *new = NULL;
4450 int i; 4639 int i;
4451 int ret = 0; 4640 int ret = 0;
4452 4641
4453 dprintk("--> %s: max_reqs=%u, tbl %p\n", __func__, max_slots, tbl); 4642 dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
4643 max_reqs, tbl->max_slots);
4454 4644
4455 /* 4645 /* Does the newly negotiated max_reqs match the existing slot table? */
4456 * Until we have dynamic slot table adjustment, insist 4646 if (max_reqs != tbl->max_slots) {
4457 * upon the same slot table size 4647 ret = -ENOMEM;
4458 */ 4648 new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
4459 if (max_slots != old_max_slots) { 4649 GFP_KERNEL);
4460 dprintk("%s reset slot table does't match old\n", 4650 if (!new)
4461 __func__); 4651 goto out;
4462 ret = -EINVAL; /*XXX NFS4ERR_REQ_TOO_BIG ? */ 4652 ret = 0;
4463 goto out; 4653 kfree(tbl->slots);
4464 } 4654 }
4465 spin_lock(&tbl->slot_tbl_lock); 4655 spin_lock(&tbl->slot_tbl_lock);
4466 for (i = 0; i < max_slots; ++i) 4656 if (new) {
4657 tbl->slots = new;
4658 tbl->max_slots = max_reqs;
4659 }
4660 for (i = 0; i < tbl->max_slots; ++i)
4467 tbl->slots[i].seq_nr = ivalue; 4661 tbl->slots[i].seq_nr = ivalue;
4468 tbl->highest_used_slotid = -1;
4469 spin_unlock(&tbl->slot_tbl_lock); 4662 spin_unlock(&tbl->slot_tbl_lock);
4470 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, 4663 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
4471 tbl, tbl->slots, tbl->max_slots); 4664 tbl, tbl->slots, tbl->max_slots);
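
nfs4_reset_slot_table() above now resizes instead of insisting on an identical table size: the replacement array is allocated before the lock is taken, published under the lock, and every seq_nr is reinitialised. A simplified sketch of that allocate-then-swap pattern, with a pthread mutex standing in for the slot-table spinlock (the kernel can free the old array even before locking, because the session is drained at this point):

#include <pthread.h>
#include <stdlib.h>

struct slot { unsigned seq_nr; };

struct slot_table {
        pthread_mutex_t lock;           /* stands in for slot_tbl_lock */
        struct slot *slots;
        unsigned max_slots;
};

static int reset_slot_table(struct slot_table *tbl, unsigned max_reqs,
                            unsigned ivalue)
{
        struct slot *new = NULL;
        unsigned i;

        /* allocate outside the lock, as the kernel code does */
        if (max_reqs != tbl->max_slots) {
                new = malloc(max_reqs * sizeof(*new));
                if (new == NULL)
                        return -1;      /* -ENOMEM in the kernel */
        }

        pthread_mutex_lock(&tbl->lock);
        if (new != NULL) {
                free(tbl->slots);       /* old table no longer reachable */
                tbl->slots = new;
                tbl->max_slots = max_reqs;
        }
        for (i = 0; i < tbl->max_slots; i++)
                tbl->slots[i].seq_nr = ivalue;
        pthread_mutex_unlock(&tbl->lock);
        return 0;
}

int main(void)
{
        struct slot_table tbl = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .slots = NULL,
                .max_slots = 0,
        };

        return reset_slot_table(&tbl, 16, 1);   /* grow from 0 to 16 slots */
}
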
@@ -4482,16 +4675,12 @@ static int nfs4_reset_slot_tables(struct nfs4_session *session)
4482 int status; 4675 int status;
4483 4676
4484 status = nfs4_reset_slot_table(&session->fc_slot_table, 4677 status = nfs4_reset_slot_table(&session->fc_slot_table,
4485 session->fc_attrs.max_reqs, 4678 session->fc_attrs.max_reqs, 1);
4486 session->fc_slot_table.max_slots,
4487 1);
4488 if (status) 4679 if (status)
4489 return status; 4680 return status;
4490 4681
4491 status = nfs4_reset_slot_table(&session->bc_slot_table, 4682 status = nfs4_reset_slot_table(&session->bc_slot_table,
4492 session->bc_attrs.max_reqs, 4683 session->bc_attrs.max_reqs, 0);
4493 session->bc_slot_table.max_slots,
4494 0);
4495 return status; 4684 return status;
4496} 4685}
4497 4686
@@ -4515,7 +4704,6 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session)
4515static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, 4704static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4516 int max_slots, int ivalue) 4705 int max_slots, int ivalue)
4517{ 4706{
4518 int i;
4519 struct nfs4_slot *slot; 4707 struct nfs4_slot *slot;
4520 int ret = -ENOMEM; 4708 int ret = -ENOMEM;
4521 4709
@@ -4526,18 +4714,9 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4526 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); 4714 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL);
4527 if (!slot) 4715 if (!slot)
4528 goto out; 4716 goto out;
4529 for (i = 0; i < max_slots; ++i)
4530 slot[i].seq_nr = ivalue;
4531 ret = 0; 4717 ret = 0;
4532 4718
4533 spin_lock(&tbl->slot_tbl_lock); 4719 spin_lock(&tbl->slot_tbl_lock);
4534 if (tbl->slots != NULL) {
4535 spin_unlock(&tbl->slot_tbl_lock);
4536 dprintk("%s: slot table already initialized. tbl=%p slots=%p\n",
4537 __func__, tbl, tbl->slots);
4538 WARN_ON(1);
4539 goto out_free;
4540 }
4541 tbl->max_slots = max_slots; 4720 tbl->max_slots = max_slots;
4542 tbl->slots = slot; 4721 tbl->slots = slot;
4543 tbl->highest_used_slotid = -1; /* no slot is currently used */ 4722 tbl->highest_used_slotid = -1; /* no slot is currently used */
@@ -4547,10 +4726,6 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4547out: 4726out:
4548 dprintk("<-- %s: return %d\n", __func__, ret); 4727 dprintk("<-- %s: return %d\n", __func__, ret);
4549 return ret; 4728 return ret;
4550
4551out_free:
4552 kfree(slot);
4553 goto out;
4554} 4729}
4555 4730
4556/* 4731/*
@@ -4558,17 +4733,24 @@ out_free:
4558 */ 4733 */
4559static int nfs4_init_slot_tables(struct nfs4_session *session) 4734static int nfs4_init_slot_tables(struct nfs4_session *session)
4560{ 4735{
4561 int status; 4736 struct nfs4_slot_table *tbl;
4737 int status = 0;
4562 4738
4563 status = nfs4_init_slot_table(&session->fc_slot_table, 4739 tbl = &session->fc_slot_table;
4564 session->fc_attrs.max_reqs, 1); 4740 if (tbl->slots == NULL) {
4565 if (status) 4741 status = nfs4_init_slot_table(tbl,
4566 return status; 4742 session->fc_attrs.max_reqs, 1);
4743 if (status)
4744 return status;
4745 }
4567 4746
4568 status = nfs4_init_slot_table(&session->bc_slot_table, 4747 tbl = &session->bc_slot_table;
4569 session->bc_attrs.max_reqs, 0); 4748 if (tbl->slots == NULL) {
4570 if (status) 4749 status = nfs4_init_slot_table(tbl,
4571 nfs4_destroy_slot_tables(session); 4750 session->bc_attrs.max_reqs, 0);
4751 if (status)
4752 nfs4_destroy_slot_tables(session);
4753 }
4572 4754
4573 return status; 4755 return status;
4574} 4756}
@@ -4582,7 +4764,6 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4582 if (!session) 4764 if (!session)
4583 return NULL; 4765 return NULL;
4584 4766
4585 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state);
4586 /* 4767 /*
4587 * The create session reply races with the server back 4768 * The create session reply races with the server back
4588 * channel probe. Mark the client NFS_CS_SESSION_INITING 4769 * channel probe. Mark the client NFS_CS_SESSION_INITING
@@ -4590,12 +4771,15 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4590 * nfs_client struct 4771 * nfs_client struct
4591 */ 4772 */
4592 clp->cl_cons_state = NFS_CS_SESSION_INITING; 4773 clp->cl_cons_state = NFS_CS_SESSION_INITING;
4774 init_completion(&session->complete);
4593 4775
4594 tbl = &session->fc_slot_table; 4776 tbl = &session->fc_slot_table;
4777 tbl->highest_used_slotid = -1;
4595 spin_lock_init(&tbl->slot_tbl_lock); 4778 spin_lock_init(&tbl->slot_tbl_lock);
4596 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); 4779 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
4597 4780
4598 tbl = &session->bc_slot_table; 4781 tbl = &session->bc_slot_table;
4782 tbl->highest_used_slotid = -1;
4599 spin_lock_init(&tbl->slot_tbl_lock); 4783 spin_lock_init(&tbl->slot_tbl_lock);
4600 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); 4784 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
4601 4785
@@ -4637,16 +4821,14 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
4637 args->fc_attrs.headerpadsz = 0; 4821 args->fc_attrs.headerpadsz = 0;
4638 args->fc_attrs.max_rqst_sz = mxrqst_sz; 4822 args->fc_attrs.max_rqst_sz = mxrqst_sz;
4639 args->fc_attrs.max_resp_sz = mxresp_sz; 4823 args->fc_attrs.max_resp_sz = mxresp_sz;
4640 args->fc_attrs.max_resp_sz_cached = mxresp_sz;
4641 args->fc_attrs.max_ops = NFS4_MAX_OPS; 4824 args->fc_attrs.max_ops = NFS4_MAX_OPS;
4642 args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; 4825 args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
4643 4826
4644 dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " 4827 dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
4645 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", 4828 "max_ops=%u max_reqs=%u\n",
4646 __func__, 4829 __func__,
4647 args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz, 4830 args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
4648 args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops, 4831 args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
4649 args->fc_attrs.max_reqs);
4650 4832
4651 /* Back channel attributes */ 4833 /* Back channel attributes */
4652 args->bc_attrs.headerpadsz = 0; 4834 args->bc_attrs.headerpadsz = 0;
@@ -4747,11 +4929,10 @@ static int _nfs4_proc_create_session(struct nfs_client *clp)
4747 * It is the responsibility of the caller to verify the session is 4929 * It is the responsibility of the caller to verify the session is
4748 * expired before calling this routine. 4930 * expired before calling this routine.
4749 */ 4931 */
4750int nfs4_proc_create_session(struct nfs_client *clp, int reset) 4932int nfs4_proc_create_session(struct nfs_client *clp)
4751{ 4933{
4752 int status; 4934 int status;
4753 unsigned *ptr; 4935 unsigned *ptr;
4754 struct nfs_fsinfo fsinfo;
4755 struct nfs4_session *session = clp->cl_session; 4936 struct nfs4_session *session = clp->cl_session;
4756 4937
4757 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); 4938 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
@@ -4760,35 +4941,19 @@ int nfs4_proc_create_session(struct nfs_client *clp, int reset)
4760 if (status) 4941 if (status)
4761 goto out; 4942 goto out;
4762 4943
4763 /* Init or reset the fore channel */ 4944 /* Init and reset the fore channel */
4764 if (reset) 4945 status = nfs4_init_slot_tables(session);
4765 status = nfs4_reset_slot_tables(session); 4946 dprintk("slot table initialization returned %d\n", status);
4766 else 4947 if (status)
4767 status = nfs4_init_slot_tables(session); 4948 goto out;
4768 dprintk("fore channel slot table initialization returned %d\n", status); 4949 status = nfs4_reset_slot_tables(session);
4950 dprintk("slot table reset returned %d\n", status);
4769 if (status) 4951 if (status)
4770 goto out; 4952 goto out;
4771 4953
4772 ptr = (unsigned *)&session->sess_id.data[0]; 4954 ptr = (unsigned *)&session->sess_id.data[0];
4773 dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, 4955 dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
4774 clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); 4956 clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
4775
4776 if (reset)
4777 /* Lease time is aleady set */
4778 goto out;
4779
4780 /* Get the lease time */
4781 status = nfs4_proc_get_lease_time(clp, &fsinfo);
4782 if (status == 0) {
4783 /* Update lease time and schedule renewal */
4784 spin_lock(&clp->cl_lock);
4785 clp->cl_lease_time = fsinfo.lease_time * HZ;
4786 clp->cl_last_renewal = jiffies;
4787 clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
4788 spin_unlock(&clp->cl_lock);
4789
4790 nfs4_schedule_state_renewal(clp);
4791 }
4792out: 4957out:
4793 dprintk("<-- %s\n", __func__); 4958 dprintk("<-- %s\n", __func__);
4794 return status; 4959 return status;
@@ -4827,13 +4992,24 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
4827int nfs4_init_session(struct nfs_server *server) 4992int nfs4_init_session(struct nfs_server *server)
4828{ 4993{
4829 struct nfs_client *clp = server->nfs_client; 4994 struct nfs_client *clp = server->nfs_client;
4995 struct nfs4_session *session;
4996 unsigned int rsize, wsize;
4830 int ret; 4997 int ret;
4831 4998
4832 if (!nfs4_has_session(clp)) 4999 if (!nfs4_has_session(clp))
4833 return 0; 5000 return 0;
4834 5001
4835 clp->cl_session->fc_attrs.max_rqst_sz = server->wsize; 5002 rsize = server->rsize;
4836 clp->cl_session->fc_attrs.max_resp_sz = server->rsize; 5003 if (rsize == 0)
5004 rsize = NFS_MAX_FILE_IO_SIZE;
5005 wsize = server->wsize;
5006 if (wsize == 0)
5007 wsize = NFS_MAX_FILE_IO_SIZE;
5008
5009 session = clp->cl_session;
5010 session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
5011 session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
5012
4837 ret = nfs4_recover_expired_lease(server); 5013 ret = nfs4_recover_expired_lease(server);
4838 if (!ret) 5014 if (!ret)
4839 ret = nfs4_check_client_ready(clp); 5015 ret = nfs4_check_client_ready(clp);
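
nfs4_init_session() above stops using the raw rsize/wsize as channel limits; the negotiated sizes must also cover RPC and compound header overhead, which is what nfs41_maxread_overhead and nfs41_maxwrite_overhead (defined later in this patch, in nfs4xdr.c) add on top. A sketch of the arithmetic with illustrative, non-authoritative numbers:

#include <stdio.h>

#define NFS_MAX_FILE_IO_SIZE (1u << 20) /* assumed cap, for illustration */

int main(void)
{
        unsigned rsize = 0;             /* e.g. from mount options; 0 = unset */
        unsigned wsize = 32768;
        unsigned maxread_overhead  = 388;  /* invented stand-in, in bytes */
        unsigned maxwrite_overhead = 440;  /* invented stand-in, in bytes */

        if (rsize == 0)
                rsize = NFS_MAX_FILE_IO_SIZE;
        if (wsize == 0)
                wsize = NFS_MAX_FILE_IO_SIZE;

        printf("fc max_rqst_sz = %u\n", wsize + maxwrite_overhead);
        printf("fc max_resp_sz = %u\n", rsize + maxread_overhead);
        return 0;
}
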
@@ -4858,10 +5034,19 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
4858 args.sa_cache_this = 0; 5034 args.sa_cache_this = 0;
4859 5035
4860 return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, 5036 return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
4861 &res, 0); 5037 &res, args.sa_cache_this, 1);
5038}
5039
5040static void nfs41_sequence_release(void *data)
5041{
5042 struct nfs_client *clp = (struct nfs_client *)data;
5043
5044 if (atomic_read(&clp->cl_count) > 1)
5045 nfs4_schedule_state_renewal(clp);
5046 nfs_put_client(clp);
4862} 5047}
4863 5048
4864void nfs41_sequence_call_done(struct rpc_task *task, void *data) 5049static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
4865{ 5050{
4866 struct nfs_client *clp = (struct nfs_client *)data; 5051 struct nfs_client *clp = (struct nfs_client *)data;
4867 5052
@@ -4869,16 +5054,17 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
4869 5054
4870 if (task->tk_status < 0) { 5055 if (task->tk_status < 0) {
4871 dprintk("%s ERROR %d\n", __func__, task->tk_status); 5056 dprintk("%s ERROR %d\n", __func__, task->tk_status);
5057 if (atomic_read(&clp->cl_count) == 1)
5058 goto out;
4872 5059
4873 if (_nfs4_async_handle_error(task, NULL, clp, NULL) 5060 if (_nfs4_async_handle_error(task, NULL, clp, NULL)
4874 == -EAGAIN) { 5061 == -EAGAIN) {
4875 nfs4_restart_rpc(task, clp); 5062 nfs_restart_rpc(task, clp);
4876 return; 5063 return;
4877 } 5064 }
4878 } 5065 }
4879 nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp);
4880 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); 5066 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
4881 5067out:
4882 kfree(task->tk_msg.rpc_argp); 5068 kfree(task->tk_msg.rpc_argp);
4883 kfree(task->tk_msg.rpc_resp); 5069 kfree(task->tk_msg.rpc_resp);
4884 5070
@@ -4903,6 +5089,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
4903static const struct rpc_call_ops nfs41_sequence_ops = { 5089static const struct rpc_call_ops nfs41_sequence_ops = {
4904 .rpc_call_done = nfs41_sequence_call_done, 5090 .rpc_call_done = nfs41_sequence_call_done,
4905 .rpc_call_prepare = nfs41_sequence_prepare, 5091 .rpc_call_prepare = nfs41_sequence_prepare,
5092 .rpc_release = nfs41_sequence_release,
4906}; 5093};
4907 5094
4908static int nfs41_proc_async_sequence(struct nfs_client *clp, 5095static int nfs41_proc_async_sequence(struct nfs_client *clp,
@@ -4915,12 +5102,14 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
4915 .rpc_cred = cred, 5102 .rpc_cred = cred,
4916 }; 5103 };
4917 5104
5105 if (!atomic_inc_not_zero(&clp->cl_count))
5106 return -EIO;
4918 args = kzalloc(sizeof(*args), GFP_KERNEL); 5107 args = kzalloc(sizeof(*args), GFP_KERNEL);
4919 if (!args)
4920 return -ENOMEM;
4921 res = kzalloc(sizeof(*res), GFP_KERNEL); 5108 res = kzalloc(sizeof(*res), GFP_KERNEL);
4922 if (!res) { 5109 if (!args || !res) {
4923 kfree(args); 5110 kfree(args);
5111 kfree(res);
5112 nfs_put_client(clp);
4924 return -ENOMEM; 5113 return -ENOMEM;
4925 } 5114 }
4926 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 5115 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
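
The allocation hunk above pairs atomic_inc_not_zero() on cl_count with an nfs_put_client() in every failure path and in the new rpc_release callback, so an async RENEW can never outlive its nfs_client. A userspace sketch of that guard, using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

struct client { atomic_int count; };    /* stands in for nfs_client.cl_count */

/* equivalent of atomic_inc_not_zero(): take a reference only if alive */
static int get_client_if_live(struct client *clp)
{
        int old = atomic_load(&clp->count);

        while (old != 0)
                if (atomic_compare_exchange_weak(&clp->count, &old, old + 1))
                        return 1;
        return 0;
}

/* stands in for nfs_put_client() */
static void put_client(struct client *clp)
{
        if (atomic_fetch_sub(&clp->count, 1) == 1)
                printf("last reference gone, client freed\n");
}

int main(void)
{
        struct client clp = { .count = 1 };

        if (!get_client_if_live(&clp))
                return 1;               /* the kernel path returns -EIO */
        /* ... async SEQUENCE (RENEW) would run here ... */
        put_client(&clp);               /* done in .rpc_release */
        put_client(&clp);               /* drop the original reference */
        return 0;
}
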
@@ -4931,6 +5120,110 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
4931 &nfs41_sequence_ops, (void *)clp); 5120 &nfs41_sequence_ops, (void *)clp);
4932} 5121}
4933 5122
5123struct nfs4_reclaim_complete_data {
5124 struct nfs_client *clp;
5125 struct nfs41_reclaim_complete_args arg;
5126 struct nfs41_reclaim_complete_res res;
5127};
5128
5129static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
5130{
5131 struct nfs4_reclaim_complete_data *calldata = data;
5132
5133 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
5134 if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args,
5135 &calldata->res.seq_res, 0, task))
5136 return;
5137
5138 rpc_call_start(task);
5139}
5140
5141static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
5142{
5143 struct nfs4_reclaim_complete_data *calldata = data;
5144 struct nfs_client *clp = calldata->clp;
5145 struct nfs4_sequence_res *res = &calldata->res.seq_res;
5146
5147 dprintk("--> %s\n", __func__);
5148 nfs41_sequence_done(clp, res, task->tk_status);
5149 switch (task->tk_status) {
5150 case 0:
5151 case -NFS4ERR_COMPLETE_ALREADY:
5152 break;
5153 case -NFS4ERR_BADSESSION:
5154 case -NFS4ERR_DEADSESSION:
5155 /*
5156 * Handle the session error, but do not retry the operation, as
5157 * we have no way of telling whether the clientid had to be
5158 * reset before we got our reply. If reset, a new wave of
5159 * reclaim operations will follow, containing their own reclaim
5160 * complete. We don't want our retry to get in the way of
5161 * recovery by incorrectly indicating to the server that we're
5162 * done reclaiming state since the process had to be restarted.
5163 */
5164 _nfs4_async_handle_error(task, NULL, clp, NULL);
5165 break;
5166 default:
5167 if (_nfs4_async_handle_error(
5168 task, NULL, clp, NULL) == -EAGAIN) {
5169 rpc_restart_call_prepare(task);
5170 return;
5171 }
5172 }
5173
5174 dprintk("<-- %s\n", __func__);
5175}
5176
5177static void nfs4_free_reclaim_complete_data(void *data)
5178{
5179 struct nfs4_reclaim_complete_data *calldata = data;
5180
5181 kfree(calldata);
5182}
5183
5184static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
5185 .rpc_call_prepare = nfs4_reclaim_complete_prepare,
5186 .rpc_call_done = nfs4_reclaim_complete_done,
5187 .rpc_release = nfs4_free_reclaim_complete_data,
5188};
5189
5190/*
5191 * Issue a global reclaim complete.
5192 */
5193static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5194{
5195 struct nfs4_reclaim_complete_data *calldata;
5196 struct rpc_task *task;
5197 struct rpc_message msg = {
5198 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
5199 };
5200 struct rpc_task_setup task_setup_data = {
5201 .rpc_client = clp->cl_rpcclient,
5202 .rpc_message = &msg,
5203 .callback_ops = &nfs4_reclaim_complete_call_ops,
5204 .flags = RPC_TASK_ASYNC,
5205 };
5206 int status = -ENOMEM;
5207
5208 dprintk("--> %s\n", __func__);
5209 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
5210 if (calldata == NULL)
5211 goto out;
5212 calldata->clp = clp;
5213 calldata->arg.one_fs = 0;
5214 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
5215
5216 msg.rpc_argp = &calldata->arg;
5217 msg.rpc_resp = &calldata->res;
5218 task_setup_data.callback_data = calldata;
5219 task = rpc_run_task(&task_setup_data);
5220 if (IS_ERR(task))
5221 status = PTR_ERR(task);
5222 rpc_put_task(task);
5223out:
5224 dprintk("<-- %s status=%d\n", __func__, status);
5225 return status;
5226}
4934#endif /* CONFIG_NFS_V4_1 */ 5227#endif /* CONFIG_NFS_V4_1 */
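
nfs41_proc_reclaim_complete() above hands ownership of its calldata to the async task: the rpc_release callback frees it, so the submitting function only cleans up when the task could never be started. A sketch of that ownership handoff, with a joined pthread standing in for rpc_run_task()/rpc_put_task():

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct calldata { int one_fs; };        /* simplified stand-in struct */

static void release(void *data)         /* .rpc_release counterpart */
{
        free(data);
}

static void *task(void *data)           /* the async RPC task body */
{
        struct calldata *c = data;

        printf("sending RECLAIM_COMPLETE, one_fs=%d\n", c->one_fs);
        release(c);                     /* the task owns and frees calldata */
        return NULL;
}

int main(void)
{
        struct calldata *c = calloc(1, sizeof(*c));
        pthread_t t;

        if (c == NULL)
                return 1;               /* -ENOMEM */
        c->one_fs = 0;                  /* global reclaim, all filesystems */

        if (pthread_create(&t, NULL, task, c) != 0) {
                free(c);                /* freed here only if never started */
                return 1;
        }
        pthread_join(t, NULL);
        return 0;
}
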
4935 5228
4936struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 5229struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -4948,8 +5241,9 @@ struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
4948 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, 5241 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
4949 .recover_open = nfs4_open_reclaim, 5242 .recover_open = nfs4_open_reclaim,
4950 .recover_lock = nfs4_lock_reclaim, 5243 .recover_lock = nfs4_lock_reclaim,
4951 .establish_clid = nfs4_proc_exchange_id, 5244 .establish_clid = nfs41_init_clientid,
4952 .get_clid_cred = nfs4_get_exchange_id_cred, 5245 .get_clid_cred = nfs4_get_exchange_id_cred,
5246 .reclaim_complete = nfs41_proc_reclaim_complete,
4953}; 5247};
4954#endif /* CONFIG_NFS_V4_1 */ 5248#endif /* CONFIG_NFS_V4_1 */
4955 5249
@@ -4968,7 +5262,7 @@ struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
4968 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, 5262 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
4969 .recover_open = nfs4_open_expired, 5263 .recover_open = nfs4_open_expired,
4970 .recover_lock = nfs4_lock_expired, 5264 .recover_lock = nfs4_lock_expired,
4971 .establish_clid = nfs4_proc_exchange_id, 5265 .establish_clid = nfs41_init_clientid,
4972 .get_clid_cred = nfs4_get_exchange_id_cred, 5266 .get_clid_cred = nfs4_get_exchange_id_cred,
4973}; 5267};
4974#endif /* CONFIG_NFS_V4_1 */ 5268#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 0156c01c212c..d87f10327b72 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -36,11 +36,6 @@
36 * as an rpc_task, not a real kernel thread, so it always runs in rpciod's 36 * as an rpc_task, not a real kernel thread, so it always runs in rpciod's
37 * context. There is one renewd per nfs_server. 37 * context. There is one renewd per nfs_server.
38 * 38 *
39 * TODO: If the send queue gets backlogged (e.g., if the server goes down),
40 * we will keep filling the queue with periodic RENEW requests. We need a
41 * mechanism for ensuring that if renewd successfully sends off a request,
42 * then it only wakes up when the request is finished. Maybe use the
43 * child task framework of the RPC layer?
44 */ 39 */
45 40
46#include <linux/mm.h> 41#include <linux/mm.h>
@@ -63,7 +58,7 @@ nfs4_renew_state(struct work_struct *work)
63 struct nfs_client *clp = 58 struct nfs_client *clp =
64 container_of(work, struct nfs_client, cl_renewd.work); 59 container_of(work, struct nfs_client, cl_renewd.work);
65 struct rpc_cred *cred; 60 struct rpc_cred *cred;
66 long lease, timeout; 61 long lease;
67 unsigned long last, now; 62 unsigned long last, now;
68 63
69 ops = nfs4_state_renewal_ops[clp->cl_minorversion]; 64 ops = nfs4_state_renewal_ops[clp->cl_minorversion];
@@ -75,7 +70,6 @@ nfs4_renew_state(struct work_struct *work)
75 lease = clp->cl_lease_time; 70 lease = clp->cl_lease_time;
76 last = clp->cl_last_renewal; 71 last = clp->cl_last_renewal;
77 now = jiffies; 72 now = jiffies;
78 timeout = (2 * lease) / 3 + (long)last - (long)now;
79 /* Are we close to a lease timeout? */ 73 /* Are we close to a lease timeout? */
80 if (time_after(now, last + lease/3)) { 74 if (time_after(now, last + lease/3)) {
81 cred = ops->get_state_renewal_cred_locked(clp); 75 cred = ops->get_state_renewal_cred_locked(clp);
@@ -90,19 +84,15 @@ nfs4_renew_state(struct work_struct *work)
90 /* Queue an asynchronous RENEW. */ 84 /* Queue an asynchronous RENEW. */
91 ops->sched_state_renewal(clp, cred); 85 ops->sched_state_renewal(clp, cred);
92 put_rpccred(cred); 86 put_rpccred(cred);
87 goto out_exp;
93 } 88 }
94 timeout = (2 * lease) / 3; 89 } else {
95 spin_lock(&clp->cl_lock);
96 } else
97 dprintk("%s: failed to call renewd. Reason: lease not expired \n", 90 dprintk("%s: failed to call renewd. Reason: lease not expired \n",
98 __func__); 91 __func__);
99 if (timeout < 5 * HZ) /* safeguard */ 92 spin_unlock(&clp->cl_lock);
100 timeout = 5 * HZ; 93 }
101 dprintk("%s: requeueing work. Lease period = %ld\n", 94 nfs4_schedule_state_renewal(clp);
102 __func__, (timeout + HZ - 1) / HZ); 95out_exp:
103 cancel_delayed_work(&clp->cl_renewd);
104 schedule_delayed_work(&clp->cl_renewd, timeout);
105 spin_unlock(&clp->cl_lock);
106 nfs_expire_unreferenced_delegations(clp); 96 nfs_expire_unreferenced_delegations(clp);
107out: 97out:
108 dprintk("%s: done\n", __func__); 98 dprintk("%s: done\n", __func__);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2ef4fecf3984..6c5ed51f105e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -116,6 +116,79 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
116 116
117#if defined(CONFIG_NFS_V4_1) 117#if defined(CONFIG_NFS_V4_1)
118 118
119static int nfs41_setup_state_renewal(struct nfs_client *clp)
120{
121 int status;
122 struct nfs_fsinfo fsinfo;
123
124 status = nfs4_proc_get_lease_time(clp, &fsinfo);
125 if (status == 0) {
126 /* Update lease time and schedule renewal */
127 spin_lock(&clp->cl_lock);
128 clp->cl_lease_time = fsinfo.lease_time * HZ;
129 clp->cl_last_renewal = jiffies;
130 spin_unlock(&clp->cl_lock);
131
132 nfs4_schedule_state_renewal(clp);
133 }
134
135 return status;
136}
137
138static void nfs4_end_drain_session(struct nfs_client *clp)
139{
140 struct nfs4_session *ses = clp->cl_session;
141 int max_slots;
142
143 if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) {
144 spin_lock(&ses->fc_slot_table.slot_tbl_lock);
145 max_slots = ses->fc_slot_table.max_slots;
146 while (max_slots--) {
147 struct rpc_task *task;
148
149 task = rpc_wake_up_next(&ses->fc_slot_table.
150 slot_tbl_waitq);
151 if (!task)
152 break;
153 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
154 }
155 spin_unlock(&ses->fc_slot_table.slot_tbl_lock);
156 }
157}
158
159static int nfs4_begin_drain_session(struct nfs_client *clp)
160{
161 struct nfs4_session *ses = clp->cl_session;
162 struct nfs4_slot_table *tbl = &ses->fc_slot_table;
163
164 spin_lock(&tbl->slot_tbl_lock);
165 set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state);
166 if (tbl->highest_used_slotid != -1) {
167 INIT_COMPLETION(ses->complete);
168 spin_unlock(&tbl->slot_tbl_lock);
169 return wait_for_completion_interruptible(&ses->complete);
170 }
171 spin_unlock(&tbl->slot_tbl_lock);
172 return 0;
173}
174
175int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
176{
177 int status;
178
179 nfs4_begin_drain_session(clp);
180 status = nfs4_proc_exchange_id(clp, cred);
181 if (status != 0)
182 goto out;
183 status = nfs4_proc_create_session(clp);
184 if (status != 0)
185 goto out;
186 nfs41_setup_state_renewal(clp);
187 nfs_mark_client_ready(clp, NFS_CS_READY);
188out:
189 return status;
190}
191
119struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) 192struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
120{ 193{
121 struct rpc_cred *cred; 194 struct rpc_cred *cred;
@@ -693,16 +766,21 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
693 return new; 766 return new;
694} 767}
695 768
696void nfs_free_seqid(struct nfs_seqid *seqid) 769void nfs_release_seqid(struct nfs_seqid *seqid)
697{ 770{
698 if (!list_empty(&seqid->list)) { 771 if (!list_empty(&seqid->list)) {
699 struct rpc_sequence *sequence = seqid->sequence->sequence; 772 struct rpc_sequence *sequence = seqid->sequence->sequence;
700 773
701 spin_lock(&sequence->lock); 774 spin_lock(&sequence->lock);
702 list_del(&seqid->list); 775 list_del_init(&seqid->list);
703 spin_unlock(&sequence->lock); 776 spin_unlock(&sequence->lock);
704 rpc_wake_up(&sequence->wait); 777 rpc_wake_up(&sequence->wait);
705 } 778 }
779}
780
781void nfs_free_seqid(struct nfs_seqid *seqid)
782{
783 nfs_release_seqid(seqid);
706 kfree(seqid); 784 kfree(seqid);
707} 785}
708 786
@@ -823,7 +901,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
823 nfs4_schedule_state_manager(clp); 901 nfs4_schedule_state_manager(clp);
824} 902}
825 903
826static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) 904int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
827{ 905{
828 906
829 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 907 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -877,6 +955,10 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
877 case -NFS4ERR_EXPIRED: 955 case -NFS4ERR_EXPIRED:
878 case -NFS4ERR_NO_GRACE: 956 case -NFS4ERR_NO_GRACE:
879 case -NFS4ERR_STALE_CLIENTID: 957 case -NFS4ERR_STALE_CLIENTID:
958 case -NFS4ERR_BADSESSION:
959 case -NFS4ERR_BADSLOT:
960 case -NFS4ERR_BAD_HIGH_SLOT:
961 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
880 goto out; 962 goto out;
881 default: 963 default:
882 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 964 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
@@ -959,6 +1041,10 @@ restart:
959 case -NFS4ERR_NO_GRACE: 1041 case -NFS4ERR_NO_GRACE:
960 nfs4_state_mark_reclaim_nograce(sp->so_client, state); 1042 nfs4_state_mark_reclaim_nograce(sp->so_client, state);
961 case -NFS4ERR_STALE_CLIENTID: 1043 case -NFS4ERR_STALE_CLIENTID:
1044 case -NFS4ERR_BADSESSION:
1045 case -NFS4ERR_BADSLOT:
1046 case -NFS4ERR_BAD_HIGH_SLOT:
1047 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
962 goto out_err; 1048 goto out_err;
963 } 1049 }
964 nfs4_put_open_state(state); 1050 nfs4_put_open_state(state);
@@ -1011,6 +1097,14 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
1011 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); 1097 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
1012} 1098}
1013 1099
1100static void nfs4_reclaim_complete(struct nfs_client *clp,
1101 const struct nfs4_state_recovery_ops *ops)
1102{
1103 /* Notify the server we're done reclaiming our state */
1104 if (ops->reclaim_complete)
1105 (void)ops->reclaim_complete(clp);
1106}
1107
1014static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) 1108static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1015{ 1109{
1016 struct nfs4_state_owner *sp; 1110 struct nfs4_state_owner *sp;
@@ -1020,6 +1114,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1020 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1114 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1021 return; 1115 return;
1022 1116
1117 nfs4_reclaim_complete(clp,
1118 nfs4_reboot_recovery_ops[clp->cl_minorversion]);
1119
1023 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1120 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
1024 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1121 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
1025 spin_lock(&sp->so_lock); 1122 spin_lock(&sp->so_lock);
@@ -1046,25 +1143,25 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
1046 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); 1143 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
1047} 1144}
1048 1145
1049static void nfs4_state_end_reclaim_nograce(struct nfs_client *clp) 1146static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1050{
1051 clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
1052}
1053
1054static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1055{ 1147{
1056 switch (error) { 1148 switch (error) {
1057 case -NFS4ERR_CB_PATH_DOWN: 1149 case -NFS4ERR_CB_PATH_DOWN:
1058 nfs_handle_cb_pathdown(clp); 1150 nfs_handle_cb_pathdown(clp);
1059 break; 1151 return 0;
1152 case -NFS4ERR_NO_GRACE:
1153 nfs4_state_end_reclaim_reboot(clp);
1154 return 0;
1060 case -NFS4ERR_STALE_CLIENTID: 1155 case -NFS4ERR_STALE_CLIENTID:
1061 case -NFS4ERR_LEASE_MOVED: 1156 case -NFS4ERR_LEASE_MOVED:
1062 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1157 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1158 nfs4_state_end_reclaim_reboot(clp);
1063 nfs4_state_start_reclaim_reboot(clp); 1159 nfs4_state_start_reclaim_reboot(clp);
1064 break; 1160 break;
1065 case -NFS4ERR_EXPIRED: 1161 case -NFS4ERR_EXPIRED:
1066 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1162 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1067 nfs4_state_start_reclaim_nograce(clp); 1163 nfs4_state_start_reclaim_nograce(clp);
1164 break;
1068 case -NFS4ERR_BADSESSION: 1165 case -NFS4ERR_BADSESSION:
1069 case -NFS4ERR_BADSLOT: 1166 case -NFS4ERR_BADSLOT:
1070 case -NFS4ERR_BAD_HIGH_SLOT: 1167 case -NFS4ERR_BAD_HIGH_SLOT:
@@ -1072,8 +1169,11 @@ static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1072 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1169 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1073 case -NFS4ERR_SEQ_FALSE_RETRY: 1170 case -NFS4ERR_SEQ_FALSE_RETRY:
1074 case -NFS4ERR_SEQ_MISORDERED: 1171 case -NFS4ERR_SEQ_MISORDERED:
1075 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); 1172 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
1173 /* Zero session reset errors */
1174 return 0;
1076 } 1175 }
1176 return error;
1077} 1177}
1078 1178
1079static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) 1179static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
@@ -1093,8 +1193,7 @@ restart:
1093 if (status < 0) { 1193 if (status < 0) {
1094 set_bit(ops->owner_flag_bit, &sp->so_flags); 1194 set_bit(ops->owner_flag_bit, &sp->so_flags);
1095 nfs4_put_state_owner(sp); 1195 nfs4_put_state_owner(sp);
1096 nfs4_recovery_handle_error(clp, status); 1196 return nfs4_recovery_handle_error(clp, status);
1097 return status;
1098 } 1197 }
1099 nfs4_put_state_owner(sp); 1198 nfs4_put_state_owner(sp);
1100 goto restart; 1199 goto restart;
@@ -1124,8 +1223,7 @@ static int nfs4_check_lease(struct nfs_client *clp)
1124 status = ops->renew_lease(clp, cred); 1223 status = ops->renew_lease(clp, cred);
1125 put_rpccred(cred); 1224 put_rpccred(cred);
1126out: 1225out:
1127 nfs4_recovery_handle_error(clp, status); 1226 return nfs4_recovery_handle_error(clp, status);
1128 return status;
1129} 1227}
1130 1228
1131static int nfs4_reclaim_lease(struct nfs_client *clp) 1229static int nfs4_reclaim_lease(struct nfs_client *clp)
@@ -1151,55 +1249,127 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
1151} 1249}
1152 1250
1153#ifdef CONFIG_NFS_V4_1 1251#ifdef CONFIG_NFS_V4_1
1154static void nfs4_session_recovery_handle_error(struct nfs_client *clp, int err) 1252void nfs41_handle_recall_slot(struct nfs_client *clp)
1155{ 1253{
1156 switch (err) { 1254 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1157 case -NFS4ERR_STALE_CLIENTID: 1255 nfs4_schedule_state_recovery(clp);
1158 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1256}
1159 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); 1257
1258static void nfs4_reset_all_state(struct nfs_client *clp)
1259{
1260 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1261 clp->cl_boot_time = CURRENT_TIME;
1262 nfs4_state_start_reclaim_nograce(clp);
1263 nfs4_schedule_state_recovery(clp);
1160 } 1264 }
1161} 1265}
1162 1266
1267static void nfs41_handle_server_reboot(struct nfs_client *clp)
1268{
1269 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1270 nfs4_state_start_reclaim_reboot(clp);
1271 nfs4_schedule_state_recovery(clp);
1272 }
1273}
1274
1275static void nfs41_handle_state_revoked(struct nfs_client *clp)
1276{
1277 /* Temporary */
1278 nfs4_reset_all_state(clp);
1279}
1280
1281static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
1282{
1283 /* This will need to handle layouts too */
1284 nfs_expire_all_delegations(clp);
1285}
1286
1287static void nfs41_handle_cb_path_down(struct nfs_client *clp)
1288{
1289 nfs_expire_all_delegations(clp);
1290 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
1291 nfs4_schedule_state_recovery(clp);
1292}
1293
1294void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
1295{
1296 if (!flags)
1297 return;
1298 else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
1299 nfs41_handle_server_reboot(clp);
1300 else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
1301 SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
1302 SEQ4_STATUS_ADMIN_STATE_REVOKED |
1303 SEQ4_STATUS_LEASE_MOVED))
1304 nfs41_handle_state_revoked(clp);
1305 else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
1306 nfs41_handle_recallable_state_revoked(clp);
1307 else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
1308 SEQ4_STATUS_BACKCHANNEL_FAULT |
1309 SEQ4_STATUS_CB_PATH_DOWN_SESSION))
1310 nfs41_handle_cb_path_down(clp);
1311}
1312
1163static int nfs4_reset_session(struct nfs_client *clp) 1313static int nfs4_reset_session(struct nfs_client *clp)
1164{ 1314{
1165 int status; 1315 int status;
1166 1316
1317 nfs4_begin_drain_session(clp);
1167 status = nfs4_proc_destroy_session(clp->cl_session); 1318 status = nfs4_proc_destroy_session(clp->cl_session);
1168 if (status && status != -NFS4ERR_BADSESSION && 1319 if (status && status != -NFS4ERR_BADSESSION &&
1169 status != -NFS4ERR_DEADSESSION) { 1320 status != -NFS4ERR_DEADSESSION) {
1170 nfs4_session_recovery_handle_error(clp, status); 1321 status = nfs4_recovery_handle_error(clp, status);
1171 goto out; 1322 goto out;
1172 } 1323 }
1173 1324
1174 memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); 1325 memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
1175 status = nfs4_proc_create_session(clp, 1); 1326 status = nfs4_proc_create_session(clp);
1176 if (status) 1327 if (status) {
1177 nfs4_session_recovery_handle_error(clp, status); 1328 status = nfs4_recovery_handle_error(clp, status);
1178 /* fall through*/ 1329 goto out;
1330 }
1331 /* create_session negotiated new slot table */
1332 clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1333
1334 /* Let the state manager reestablish state */
1335 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
1336 nfs41_setup_state_renewal(clp);
1179out: 1337out:
1180 /* Wake up the next rpc task even on error */
1181 rpc_wake_up_next(&clp->cl_session->fc_slot_table.slot_tbl_waitq);
1182 return status; 1338 return status;
1183} 1339}
1184 1340
1185static int nfs4_initialize_session(struct nfs_client *clp) 1341static int nfs4_recall_slot(struct nfs_client *clp)
1186{ 1342{
1187 int status; 1343 struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table;
1344 struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs;
1345 struct nfs4_slot *new, *old;
1346 int i;
1347
1348 nfs4_begin_drain_session(clp);
1349 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
1350 GFP_KERNEL);
1351 if (!new)
1352 return -ENOMEM;
1188 1353
1189 status = nfs4_proc_create_session(clp, 0); 1354 spin_lock(&fc_tbl->slot_tbl_lock);
1190 if (!status) { 1355 for (i = 0; i < fc_tbl->target_max_slots; i++)
1191 nfs_mark_client_ready(clp, NFS_CS_READY); 1356 new[i].seq_nr = fc_tbl->slots[i].seq_nr;
1192 } else if (status == -NFS4ERR_STALE_CLIENTID) { 1357 old = fc_tbl->slots;
1193 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1358 fc_tbl->slots = new;
1194 set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); 1359 fc_tbl->max_slots = fc_tbl->target_max_slots;
1195 } else { 1360 fc_tbl->target_max_slots = 0;
1196 nfs_mark_client_ready(clp, status); 1361 fc_attrs->max_reqs = fc_tbl->max_slots;
1197 } 1362 spin_unlock(&fc_tbl->slot_tbl_lock);
1198 return status; 1363
1364 kfree(old);
1365 nfs4_end_drain_session(clp);
1366 return 0;
1199} 1367}
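
nfs4_recall_slot() above shrinks the fore-channel table to target_max_slots by copying the surviving per-slot sequence numbers into a smaller array, swapping pointers under the slot-table lock, and only then freeing the old array. A sketch of that copy-and-swap, with the locking reduced to comments:

#include <stdlib.h>

struct slot { unsigned seq_nr; };

struct slot_table {
        struct slot *slots;
        unsigned max_slots;
        unsigned target_max_slots;
};

static int recall_slots(struct slot_table *tbl)
{
        struct slot *new, *old;
        unsigned i;

        new = malloc(tbl->target_max_slots * sizeof(*new));
        if (new == NULL)
                return -1;              /* -ENOMEM in the kernel */

        /* slot_tbl_lock would be taken here */
        for (i = 0; i < tbl->target_max_slots; i++)
                new[i].seq_nr = tbl->slots[i].seq_nr;
        old = tbl->slots;
        tbl->slots = new;
        tbl->max_slots = tbl->target_max_slots;
        tbl->target_max_slots = 0;
        /* slot_tbl_lock would be dropped here */

        free(old);                      /* old array freed outside the lock */
        return 0;
}

int main(void)
{
        struct slot_table tbl = { .max_slots = 8, .target_max_slots = 4 };
        unsigned i;

        tbl.slots = calloc(tbl.max_slots, sizeof(*tbl.slots));
        if (tbl.slots == NULL)
                return 1;
        for (i = 0; i < tbl.max_slots; i++)
                tbl.slots[i].seq_nr = i + 1;

        recall_slots(&tbl);             /* shrink 8 -> 4, seq_nrs preserved */
        free(tbl.slots);
        return 0;
}
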
1368
1200#else /* CONFIG_NFS_V4_1 */ 1369#else /* CONFIG_NFS_V4_1 */
1201static int nfs4_reset_session(struct nfs_client *clp) { return 0; } 1370static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
1202static int nfs4_initialize_session(struct nfs_client *clp) { return 0; } 1371static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
1372static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
1203#endif /* CONFIG_NFS_V4_1 */ 1373#endif /* CONFIG_NFS_V4_1 */
1204 1374
1205/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors 1375/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
@@ -1212,6 +1382,7 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
1212 case -NFS4ERR_DELAY: 1382 case -NFS4ERR_DELAY:
1213 case -NFS4ERR_CLID_INUSE: 1383 case -NFS4ERR_CLID_INUSE:
1214 case -EAGAIN: 1384 case -EAGAIN:
1385 case -EKEYEXPIRED:
1215 break; 1386 break;
1216 1387
1217 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1388 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
@@ -1234,7 +1405,8 @@ static void nfs4_state_manager(struct nfs_client *clp)
1234 status = nfs4_reclaim_lease(clp); 1405 status = nfs4_reclaim_lease(clp);
1235 if (status) { 1406 if (status) {
1236 nfs4_set_lease_expired(clp, status); 1407 nfs4_set_lease_expired(clp, status);
1237 if (status == -EAGAIN) 1408 if (test_bit(NFS4CLNT_LEASE_EXPIRED,
1409 &clp->cl_state))
1238 continue; 1410 continue;
1239 if (clp->cl_cons_state == 1411 if (clp->cl_cons_state ==
1240 NFS_CS_SESSION_INITING) 1412 NFS_CS_SESSION_INITING)
@@ -1242,61 +1414,67 @@ static void nfs4_state_manager(struct nfs_client *clp)
1242 goto out_error; 1414 goto out_error;
1243 } 1415 }
1244 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1416 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1417 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
1245 } 1418 }
1246 1419
1247 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { 1420 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
1248 status = nfs4_check_lease(clp); 1421 status = nfs4_check_lease(clp);
1249 if (status != 0) 1422 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
1250 continue; 1423 continue;
1424 if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN)
1425 goto out_error;
1251 } 1426 }
1427
1252 /* Initialize or reset the session */ 1428 /* Initialize or reset the session */
1253 if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state) 1429 if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)
1254 && nfs4_has_session(clp)) { 1430 && nfs4_has_session(clp)) {
1255 if (clp->cl_cons_state == NFS_CS_SESSION_INITING) 1431 status = nfs4_reset_session(clp);
1256 status = nfs4_initialize_session(clp); 1432 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
1257 else 1433 continue;
1258 status = nfs4_reset_session(clp); 1434 if (status < 0)
1259 if (status) {
1260 if (status == -NFS4ERR_STALE_CLIENTID)
1261 continue;
1262 goto out_error; 1435 goto out_error;
1263 }
1264 } 1436 }
1437
1265 /* First recover reboot state... */ 1438 /* First recover reboot state... */
1266 if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { 1439 if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
1267 status = nfs4_do_reclaim(clp, 1440 status = nfs4_do_reclaim(clp,
1268 nfs4_reboot_recovery_ops[clp->cl_minorversion]); 1441 nfs4_reboot_recovery_ops[clp->cl_minorversion]);
1269 if (status == -NFS4ERR_STALE_CLIENTID) 1442 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
1270 continue; 1443 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
1271 if (test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
1272 continue; 1444 continue;
1273 nfs4_state_end_reclaim_reboot(clp); 1445 nfs4_state_end_reclaim_reboot(clp);
1274 continue; 1446 if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
1447 continue;
1448 if (status < 0)
1449 goto out_error;
1275 } 1450 }
1276 1451
1277 /* Now recover expired state... */ 1452 /* Now recover expired state... */
1278 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { 1453 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
1279 status = nfs4_do_reclaim(clp, 1454 status = nfs4_do_reclaim(clp,
1280 nfs4_nograce_recovery_ops[clp->cl_minorversion]); 1455 nfs4_nograce_recovery_ops[clp->cl_minorversion]);
1281 if (status < 0) { 1456 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
1282 set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); 1457 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
1283 if (status == -NFS4ERR_STALE_CLIENTID) 1458 test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1284 continue; 1459 continue;
1285 if (status == -NFS4ERR_EXPIRED) 1460 if (status < 0)
1286 continue;
1287 if (test_bit(NFS4CLNT_SESSION_SETUP,
1288 &clp->cl_state))
1289 continue;
1290 goto out_error; 1461 goto out_error;
1291 } else
1292 nfs4_state_end_reclaim_nograce(clp);
1293 continue;
1294 } 1462 }
1295 1463
1464 nfs4_end_drain_session(clp);
1296 if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { 1465 if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
1297 nfs_client_return_marked_delegations(clp); 1466 nfs_client_return_marked_delegations(clp);
1298 continue; 1467 continue;
1299 } 1468 }
1469 /* Recall session slots */
1470 if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
1471 && nfs4_has_session(clp)) {
1472 status = nfs4_recall_slot(clp);
1473 if (status < 0)
1474 goto out_error;
1475 continue;
1476 }
1477
1300 1478
1301 nfs4_clear_state_manager_bit(clp); 1479 nfs4_clear_state_manager_bit(clp);
1302 /* Did we race with an attempt to give us more work? */ 1480 /* Did we race with an attempt to give us more work? */
@@ -1309,8 +1487,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
1309out_error: 1487out_error:
1310 printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" 1488 printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s"
1311 " with error %d\n", clp->cl_hostname, -status); 1489 " with error %d\n", clp->cl_hostname, -status);
1312 if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1490 nfs4_end_drain_session(clp);
1313 nfs4_state_end_reclaim_reboot(clp);
1314 nfs4_clear_state_manager_bit(clp); 1491 nfs4_clear_state_manager_bit(clp);
1315} 1492}
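
The reworked state manager above is a flag-driven loop: each pass handles at most one class of work, in strict order (lease, then session, then reboot reclaim, then nograce reclaim, then delegations and slot recall), and any step that raises an earlier flag simply lets the loop restart. A toy model of that ordering, with invented flag names rather than the real NFS4CLNT_* bits:

#include <stdio.h>

enum {                                  /* invented flag names */
        LEASE_EXPIRED  = 1 << 0,
        SESSION_RESET  = 1 << 1,
        RECLAIM_REBOOT = 1 << 2,
        RECALL_SLOT    = 1 << 3,
};

static int test_and_clear(unsigned *state, unsigned bit)
{
        int was_set = (*state & bit) != 0;

        *state &= ~bit;
        return was_set;
}

int main(void)
{
        unsigned state = SESSION_RESET | RECLAIM_REBOOT;

        while (state != 0) {
                if (test_and_clear(&state, LEASE_EXPIRED)) {
                        puts("reclaim lease");
                        state |= RECLAIM_REBOOT;   /* reboot reclaim follows */
                        continue;                  /* restart from the top */
                }
                if (test_and_clear(&state, SESSION_RESET)) {
                        puts("reset session");
                        continue;
                }
                if (test_and_clear(&state, RECLAIM_REBOOT)) {
                        puts("reclaim reboot state");
                        continue;
                }
                if (test_and_clear(&state, RECALL_SLOT)) {
                        puts("shrink slot table");
                        continue;
                }
        }
        return 0;
}
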
1316 1493
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 20b4e30e6c82..38f3b582e7c2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -38,7 +38,6 @@
38#include <linux/param.h> 38#include <linux/param.h>
39#include <linux/time.h> 39#include <linux/time.h>
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/slab.h>
42#include <linux/errno.h> 41#include <linux/errno.h>
43#include <linux/string.h> 42#include <linux/string.h>
44#include <linux/in.h> 43#include <linux/in.h>
@@ -46,11 +45,13 @@
46#include <linux/proc_fs.h> 45#include <linux/proc_fs.h>
47#include <linux/kdev_t.h> 46#include <linux/kdev_t.h>
48#include <linux/sunrpc/clnt.h> 47#include <linux/sunrpc/clnt.h>
48#include <linux/sunrpc/msg_prot.h>
49#include <linux/nfs.h> 49#include <linux/nfs.h>
50#include <linux/nfs4.h> 50#include <linux/nfs4.h>
51#include <linux/nfs_fs.h> 51#include <linux/nfs_fs.h>
52#include <linux/nfs_idmap.h> 52#include <linux/nfs_idmap.h>
53#include "nfs4_fs.h" 53#include "nfs4_fs.h"
54#include "internal.h"
54 55
55#define NFSDBG_FACILITY NFSDBG_XDR 56#define NFSDBG_FACILITY NFSDBG_XDR
56 57
@@ -134,7 +135,7 @@ static int nfs4_stat_to_errno(int);
134#define decode_lookup_maxsz (op_decode_hdr_maxsz) 135#define decode_lookup_maxsz (op_decode_hdr_maxsz)
135#define encode_share_access_maxsz \ 136#define encode_share_access_maxsz \
136 (2) 137 (2)
137#define encode_createmode_maxsz (1 + encode_attrs_maxsz) 138#define encode_createmode_maxsz (1 + encode_attrs_maxsz + encode_verifier_maxsz)
138#define encode_opentype_maxsz (1 + encode_createmode_maxsz) 139#define encode_opentype_maxsz (1 + encode_createmode_maxsz)
139#define encode_claim_null_maxsz (1 + nfs4_name_maxsz) 140#define encode_claim_null_maxsz (1 + nfs4_name_maxsz)
140#define encode_open_maxsz (op_encode_hdr_maxsz + \ 141#define encode_open_maxsz (op_encode_hdr_maxsz + \
@@ -299,6 +300,8 @@ static int nfs4_stat_to_errno(int);
299 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4) 300 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)
300#define decode_sequence_maxsz (op_decode_hdr_maxsz + \ 301#define decode_sequence_maxsz (op_decode_hdr_maxsz + \
301 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 302 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
303#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
304#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
302#else /* CONFIG_NFS_V4_1 */ 305#else /* CONFIG_NFS_V4_1 */
303#define encode_sequence_maxsz 0 306#define encode_sequence_maxsz 0
304#define decode_sequence_maxsz 0 307#define decode_sequence_maxsz 0
@@ -676,6 +679,25 @@ static int nfs4_stat_to_errno(int);
676 decode_sequence_maxsz + \ 679 decode_sequence_maxsz + \
677 decode_putrootfh_maxsz + \ 680 decode_putrootfh_maxsz + \
678 decode_fsinfo_maxsz) 681 decode_fsinfo_maxsz)
682#define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \
683 encode_sequence_maxsz + \
684 encode_reclaim_complete_maxsz)
685#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
686 decode_sequence_maxsz + \
687 decode_reclaim_complete_maxsz)
688
689const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
690 compound_encode_hdr_maxsz +
691 encode_sequence_maxsz +
692 encode_putfh_maxsz +
693 encode_getattr_maxsz) *
694 XDR_UNIT);
695
696const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
697 compound_decode_hdr_maxsz +
698 decode_sequence_maxsz +
699 decode_putfh_maxsz) *
700 XDR_UNIT);
679#endif /* CONFIG_NFS_V4_1 */ 701#endif /* CONFIG_NFS_V4_1 */
680 702
681static const umode_t nfs_type2fmt[] = { 703static const umode_t nfs_type2fmt[] = {
@@ -1140,6 +1162,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1140static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1162static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
1141{ 1163{
1142 __be32 *p; 1164 __be32 *p;
1165 struct nfs_client *clp;
1143 1166
1144 p = reserve_space(xdr, 4); 1167 p = reserve_space(xdr, 4);
1145 switch(arg->open_flags & O_EXCL) { 1168 switch(arg->open_flags & O_EXCL) {
@@ -1148,8 +1171,23 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
1148 encode_attrs(xdr, arg->u.attrs, arg->server); 1171 encode_attrs(xdr, arg->u.attrs, arg->server);
1149 break; 1172 break;
1150 default: 1173 default:
1151 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); 1174 clp = arg->server->nfs_client;
1152 encode_nfs4_verifier(xdr, &arg->u.verifier); 1175 if (clp->cl_minorversion > 0) {
1176 if (nfs4_has_persistent_session(clp)) {
1177 *p = cpu_to_be32(NFS4_CREATE_GUARDED);
1178 encode_attrs(xdr, arg->u.attrs, arg->server);
1179 } else {
1180 struct iattr dummy;
1181
1182 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
1183 encode_nfs4_verifier(xdr, &arg->u.verifier);
1184 dummy.ia_valid = 0;
1185 encode_attrs(xdr, &dummy, arg->server);
1186 }
1187 } else {
1188 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
1189 encode_nfs4_verifier(xdr, &arg->u.verifier);
1190 }
1153 } 1191 }
1154} 1192}
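
encode_createmode() now keys off the minor version: v4.0 keeps EXCLUSIVE4, while v4.1 uses GUARDED4 when the session is persistent (the reply cache survives, so exclusive-create verifiers are unnecessary) and otherwise EXCLUSIVE4_1, which carries a verifier plus a possibly empty attribute list. A sketch of just the selection logic, using the RFC 5661 createmode4 values:

#include <stdio.h>

/* createmode4 values from RFC 5661 */
enum createmode {
        UNCHECKED4   = 0,
        GUARDED4     = 1,
        EXCLUSIVE4   = 2,
        EXCLUSIVE4_1 = 3,
};

static enum createmode choose_createmode(unsigned minorversion,
                                         int persistent_session)
{
        if (minorversion == 0)
                return EXCLUSIVE4;      /* verifier only, no attributes */
        if (persistent_session)
                return GUARDED4;        /* reply cache survives reboots */
        return EXCLUSIVE4_1;            /* verifier plus attribute list */
}

int main(void)
{
        printf("v4.0:             %d\n", choose_createmode(0, 0));
        printf("v4.1, persistent: %d\n", choose_createmode(1, 1));
        printf("v4.1, volatile:   %d\n", choose_createmode(1, 0));
        return 0;
}
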
1155 1193
@@ -1539,6 +1577,14 @@ static void encode_create_session(struct xdr_stream *xdr,
1539 char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; 1577 char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
1540 uint32_t len; 1578 uint32_t len;
1541 struct nfs_client *clp = args->client; 1579 struct nfs_client *clp = args->client;
1580 u32 max_resp_sz_cached;
1581
1582 /*
1583 * Assumes OPEN is the biggest non-idempotent compound.
1584 * 2 is the verifier.
1585 */
1586 max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
1587 RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
1542 1588
1543 len = scnprintf(machine_name, sizeof(machine_name), "%s", 1589 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1544 clp->cl_ipaddr); 1590 clp->cl_ipaddr);
@@ -1553,7 +1599,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1553 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ 1599 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
1554 *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ 1600 *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
1555 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ 1601 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
1556 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ 1602 *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */
1557 *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ 1603 *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
1558 *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ 1604 *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
1559 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ 1605 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
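
The max_resp_sz_cached computed above bounds what the server must keep in its reply cache: only non-idempotent compounds need caching, OPEN is assumed to be the largest of those, and the 2 accounts for the verifier. A sketch of the formula with stand-in sizes (the real kernel constants differ):

#include <stdio.h>

#define XDR_UNIT          4     /* bytes per XDR word */
#define RPC_REPHDRSIZE    7     /* invented stand-in */
#define RPC_MAX_AUTH_SIZE 100   /* invented stand-in */
#define NFS4_DEC_OPEN_SZ  300   /* invented stand-in for NFS4_dec_open_sz */

int main(void)
{
        unsigned max_resp_sz_cached =
                (NFS4_DEC_OPEN_SZ + RPC_REPHDRSIZE +
                 RPC_MAX_AUTH_SIZE + 2 /* verifier */) * XDR_UNIT;

        printf("max_resp_sz_cached = %u bytes\n", max_resp_sz_cached);
        return 0;
}
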
@@ -1592,6 +1638,19 @@ static void encode_destroy_session(struct xdr_stream *xdr,
1592 hdr->nops++; 1638 hdr->nops++;
1593 hdr->replen += decode_destroy_session_maxsz; 1639 hdr->replen += decode_destroy_session_maxsz;
1594} 1640}
1641
1642static void encode_reclaim_complete(struct xdr_stream *xdr,
1643 struct nfs41_reclaim_complete_args *args,
1644 struct compound_hdr *hdr)
1645{
1646 __be32 *p;
1647
1648 p = reserve_space(xdr, 8);
1649 *p++ = cpu_to_be32(OP_RECLAIM_COMPLETE);
1650 *p++ = cpu_to_be32(args->one_fs);
1651 hdr->nops++;
1652 hdr->replen += decode_reclaim_complete_maxsz;
1653}
1595#endif /* CONFIG_NFS_V4_1 */ 1654#endif /* CONFIG_NFS_V4_1 */
1596 1655
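[Editor's note: encode_reclaim_complete() above follows the usual XDR pattern: reserve 8 bytes, then emit two big-endian words, the opcode and the rca_one_fs flag. A self-contained user-space sketch of just that wire layout; OP_RECLAIM_COMPLETE is 58 per RFC 5661, and plain buffer handling replaces the kernel's xdr_stream.]

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define OP_RECLAIM_COMPLETE 58	/* RFC 5661 operation number */

/* Emit the operation as two big-endian 32-bit words, mirroring
 * reserve_space(xdr, 8) followed by two cpu_to_be32() stores. */
static size_t encode_reclaim_complete(uint8_t *buf, uint32_t one_fs)
{
	uint32_t words[2] = { htonl(OP_RECLAIM_COMPLETE), htonl(one_fs) };

	memcpy(buf, words, sizeof(words));
	return sizeof(words);
}

int main(void)
{
	uint8_t buf[8];
	size_t i, n = encode_reclaim_complete(buf, 0);

	for (i = 0; i < n; i++)
		printf("%02x", buf[i]);
	printf("\n");	/* prints 0000003a00000000 */
	return 0;
}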
1597static void encode_sequence(struct xdr_stream *xdr, 1656static void encode_sequence(struct xdr_stream *xdr,
@@ -2096,7 +2155,7 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
2096 encode_compound_hdr(&xdr, req, &hdr); 2155 encode_compound_hdr(&xdr, req, &hdr);
2097 encode_sequence(&xdr, &args->seq_args, &hdr); 2156 encode_sequence(&xdr, &args->seq_args, &hdr);
2098 encode_putfh(&xdr, args->fh, &hdr); 2157 encode_putfh(&xdr, args->fh, &hdr);
2099 replen = hdr.replen + nfs4_fattr_bitmap_maxsz + 1; 2158 replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
2100 encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr); 2159 encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
2101 2160
2102 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, 2161 xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
@@ -2420,6 +2479,26 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
2420 encode_nops(&hdr); 2479 encode_nops(&hdr);
2421 return 0; 2480 return 0;
2422} 2481}
2482
2483/*
2484 * a RECLAIM_COMPLETE request
2485 */
2486static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
2487 struct nfs41_reclaim_complete_args *args)
2488{
2489 struct xdr_stream xdr;
2490 struct compound_hdr hdr = {
2491 .minorversion = nfs4_xdr_minorversion(&args->seq_args)
2492 };
2493
2494 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2495 encode_compound_hdr(&xdr, req, &hdr);
2496 encode_sequence(&xdr, &args->seq_args, &hdr);
2497 encode_reclaim_complete(&xdr, args, &hdr);
2498 encode_nops(&hdr);
2499 return 0;
2500}
2501
2423#endif /* CONFIG_NFS_V4_1 */ 2502#endif /* CONFIG_NFS_V4_1 */
2424 2503
2425static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 2504static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -4528,6 +4607,11 @@ static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
4528{ 4607{
4529 return decode_op_hdr(xdr, OP_DESTROY_SESSION); 4608 return decode_op_hdr(xdr, OP_DESTROY_SESSION);
4530} 4609}
4610
4611static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy)
4612{
4613 return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE);
4614}
4531#endif /* CONFIG_NFS_V4_1 */ 4615#endif /* CONFIG_NFS_V4_1 */
4532 4616
4533static int decode_sequence(struct xdr_stream *xdr, 4617static int decode_sequence(struct xdr_stream *xdr,
@@ -4554,7 +4638,7 @@ static int decode_sequence(struct xdr_stream *xdr,
4554 * If the server returns different values for sessionID, slotID or 4638 * If the server returns different values for sessionID, slotID or
4555 * sequence number, the server is looney tunes. 4639 * sequence number, the server is looney tunes.
4556 */ 4640 */
4557 status = -ESERVERFAULT; 4641 status = -EREMOTEIO;
4558 4642
4559 if (memcmp(id.data, res->sr_session->sess_id.data, 4643 if (memcmp(id.data, res->sr_session->sess_id.data,
4560 NFS4_MAX_SESSIONID_LEN)) { 4644 NFS4_MAX_SESSIONID_LEN)) {
@@ -4583,8 +4667,8 @@ static int decode_sequence(struct xdr_stream *xdr,
4583 dummy = be32_to_cpup(p++); 4667 dummy = be32_to_cpup(p++);
4584 /* target highest slot id - currently not processed */ 4668 /* target highest slot id - currently not processed */
4585 dummy = be32_to_cpup(p++); 4669 dummy = be32_to_cpup(p++);
4586 /* result flags - currently not processed */ 4670 /* result flags */
4587 dummy = be32_to_cpup(p); 4671 res->sr_status_flags = be32_to_cpup(p);
4588 status = 0; 4672 status = 0;
4589out_err: 4673out_err:
4590 res->sr_status = status; 4674 res->sr_status = status;
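[Editor's note: the checks above treat any mismatch between the reply's session id, slot id or sequence number and the client's slot table as a server bug, now surfaced as -EREMOTEIO instead of -ESERVERFAULT. A user-space sketch of the session-id comparison; 16 is NFS4_MAX_SESSIONID_LEN.]

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define SESSIONID_LEN 16	/* NFS4_MAX_SESSIONID_LEN */

static int check_sequence_reply(const unsigned char *got,
				const unsigned char *want)
{
	if (memcmp(got, want, SESSIONID_LEN))
		return -EREMOTEIO;	/* "the server is looney tunes" */
	return 0;
}

int main(void)
{
	unsigned char a[SESSIONID_LEN] = { 1 }, b[SESSIONID_LEN] = { 2 };

	printf("match=%d mismatch=%d\n",
	       check_sequence_reply(a, a), check_sequence_reply(a, b));
	return 0;
}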
@@ -5309,7 +5393,7 @@ out:
5309} 5393}
5310 5394
5311/* 5395/*
5312 * FSINFO request 5396 * Decode FSINFO response
5313 */ 5397 */
5314static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, 5398static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
5315 struct nfs4_fsinfo_res *res) 5399 struct nfs4_fsinfo_res *res)
@@ -5330,7 +5414,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
5330} 5414}
5331 5415
5332/* 5416/*
5333 * PATHCONF request 5417 * Decode PATHCONF response
5334 */ 5418 */
5335static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, 5419static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
5336 struct nfs4_pathconf_res *res) 5420 struct nfs4_pathconf_res *res)
@@ -5351,7 +5435,7 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
5351} 5435}
5352 5436
5353/* 5437/*
5354 * STATFS request 5438 * Decode STATFS response
5355 */ 5439 */
5356static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, 5440static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
5357 struct nfs4_statfs_res *res) 5441 struct nfs4_statfs_res *res)
@@ -5372,7 +5456,7 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
5372} 5456}
5373 5457
5374/* 5458/*
5375 * GETATTR_BITMAP request 5459 * Decode GETATTR_BITMAP response
5376 */ 5460 */
5377static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) 5461static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
5378{ 5462{
@@ -5411,7 +5495,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
5411} 5495}
5412 5496
5413/* 5497/*
5414 * a SETCLIENTID request 5498 * Decode SETCLIENTID response
5415 */ 5499 */
5416static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, 5500static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5417 struct nfs_client *clp) 5501 struct nfs_client *clp)
@@ -5428,7 +5512,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5428} 5512}
5429 5513
5430/* 5514/*
5431 * a SETCLIENTID_CONFIRM request 5515 * Decode SETCLIENTID_CONFIRM response
5432 */ 5516 */
5433static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) 5517static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
5434{ 5518{
@@ -5448,7 +5532,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
5448} 5532}
5449 5533
5450/* 5534/*
5451 * DELEGRETURN request 5535 * Decode DELEGRETURN response
5452 */ 5536 */
5453static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) 5537static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
5454{ 5538{
@@ -5467,6 +5551,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
5467 if (status != 0) 5551 if (status != 0)
5468 goto out; 5552 goto out;
5469 status = decode_delegreturn(&xdr); 5553 status = decode_delegreturn(&xdr);
5554 if (status != 0)
5555 goto out;
5470 decode_getfattr(&xdr, res->fattr, res->server, 5556 decode_getfattr(&xdr, res->fattr, res->server,
5471 !RPC_IS_ASYNC(rqstp->rq_task)); 5557 !RPC_IS_ASYNC(rqstp->rq_task));
5472out: 5558out:
@@ -5474,7 +5560,7 @@ out:
5474} 5560}
5475 5561
5476/* 5562/*
5477 * FS_LOCATIONS request 5563 * Decode FS_LOCATIONS response
5478 */ 5564 */
5479static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, 5565static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
5480 struct nfs4_fs_locations_res *res) 5566 struct nfs4_fs_locations_res *res)
@@ -5504,7 +5590,7 @@ out:
5504 5590
5505#if defined(CONFIG_NFS_V4_1) 5591#if defined(CONFIG_NFS_V4_1)
5506/* 5592/*
5507 * EXCHANGE_ID request 5593 * Decode EXCHANGE_ID response
5508 */ 5594 */
5509static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, 5595static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
5510 void *res) 5596 void *res)
@@ -5521,7 +5607,7 @@ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
5521} 5607}
5522 5608
5523/* 5609/*
5524 * a CREATE_SESSION request 5610 * Decode CREATE_SESSION response
5525 */ 5611 */
5526static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, 5612static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
5527 struct nfs41_create_session_res *res) 5613 struct nfs41_create_session_res *res)
@@ -5538,7 +5624,7 @@ static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
5538} 5624}
5539 5625
5540/* 5626/*
5541 * a DESTROY_SESSION request 5627 * Decode DESTROY_SESSION response
5542 */ 5628 */
5543static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, 5629static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
5544 void *dummy) 5630 void *dummy)
@@ -5555,7 +5641,7 @@ static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
5555} 5641}
5556 5642
5557/* 5643/*
5558 * a SEQUENCE request 5644 * Decode SEQUENCE response
5559 */ 5645 */
5560static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, 5646static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
5561 struct nfs4_sequence_res *res) 5647 struct nfs4_sequence_res *res)
@@ -5572,7 +5658,7 @@ static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
5572} 5658}
5573 5659
5574/* 5660/*
5575 * a GET_LEASE_TIME request 5661 * Decode GET_LEASE_TIME response
5576 */ 5662 */
5577static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, 5663static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
5578 struct nfs4_get_lease_time_res *res) 5664 struct nfs4_get_lease_time_res *res)
@@ -5591,6 +5677,25 @@ static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
5591 status = decode_fsinfo(&xdr, res->lr_fsinfo); 5677 status = decode_fsinfo(&xdr, res->lr_fsinfo);
5592 return status; 5678 return status;
5593} 5679}
5680
5681/*
5682 * Decode RECLAIM_COMPLETE response
5683 */
5684static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
5685 struct nfs41_reclaim_complete_res *res)
5686{
5687 struct xdr_stream xdr;
5688 struct compound_hdr hdr;
5689 int status;
5690
5691 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
5692 status = decode_compound_hdr(&xdr, &hdr);
5693 if (!status)
5694 status = decode_sequence(&xdr, &res->seq_res, rqstp);
5695 if (!status)
5696 status = decode_reclaim_complete(&xdr, (void *)NULL);
5697 return status;
5698}
5594#endif /* CONFIG_NFS_V4_1 */ 5699#endif /* CONFIG_NFS_V4_1 */
5595 5700
5596__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 5701__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
@@ -5678,7 +5783,7 @@ static struct {
5678 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, 5783 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
5679 { NFS4ERR_NOTSUPP, -ENOTSUPP }, 5784 { NFS4ERR_NOTSUPP, -ENOTSUPP },
5680 { NFS4ERR_TOOSMALL, -ETOOSMALL }, 5785 { NFS4ERR_TOOSMALL, -ETOOSMALL },
5681 { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, 5786 { NFS4ERR_SERVERFAULT, -EREMOTEIO },
5682 { NFS4ERR_BADTYPE, -EBADTYPE }, 5787 { NFS4ERR_BADTYPE, -EBADTYPE },
5683 { NFS4ERR_LOCKED, -EAGAIN }, 5788 { NFS4ERR_LOCKED, -EAGAIN },
5684 { NFS4ERR_SYMLINK, -ELOOP }, 5789 { NFS4ERR_SYMLINK, -ELOOP },
@@ -5705,7 +5810,7 @@ nfs4_stat_to_errno(int stat)
5705 } 5810 }
5706 if (stat <= 10000 || stat > 10100) { 5811 if (stat <= 10000 || stat > 10100) {
5707 /* The server is looney tunes. */ 5812 /* The server is looney tunes. */
5708 return -ESERVERFAULT; 5813 return -EREMOTEIO;
5709 } 5814 }
5710 /* If we cannot translate the error, the recovery routines should 5815 /* If we cannot translate the error, the recovery routines should
5711 * handle it. 5816 * handle it.
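[Editor's note: abridged sketch of the fallback logic above. In the kernel the translation table is consulted first, so small status values never reach this point; only the range check and the two fallbacks are reproduced here.]

#include <stdio.h>
#include <errno.h>

static int nfs4_stat_to_errno_fallback(int stat)
{
	if (stat <= 10000 || stat > 10100)
		return -EREMOTEIO;	/* out-of-range: broken server */
	return -stat;			/* in-range but untranslated: pass
					 * through for the recovery code */
}

int main(void)
{
	printf("%d %d\n", nfs4_stat_to_errno_fallback(99999),
	       nfs4_stat_to_errno_fallback(10050));
	return 0;
}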
@@ -5767,6 +5872,7 @@ struct rpc_procinfo nfs4_procedures[] = {
5767 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), 5872 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
5768 PROC(SEQUENCE, enc_sequence, dec_sequence), 5873 PROC(SEQUENCE, enc_sequence, dec_sequence),
5769 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), 5874 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
5875 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
5770#endif /* CONFIG_NFS_V4_1 */ 5876#endif /* CONFIG_NFS_V4_1 */
5771}; 5877};
5772 5878
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e2975939126a..29d9d36cd5f4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -112,12 +112,10 @@ void nfs_unlock_request(struct nfs_page *req)
112 */ 112 */
113int nfs_set_page_tag_locked(struct nfs_page *req) 113int nfs_set_page_tag_locked(struct nfs_page *req)
114{ 114{
115 struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
116
117 if (!nfs_lock_request_dontget(req)) 115 if (!nfs_lock_request_dontget(req))
118 return 0; 116 return 0;
119 if (req->wb_page != NULL) 117 if (req->wb_page != NULL)
120 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 118 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
121 return 1; 119 return 1;
122} 120}
123 121
@@ -126,10 +124,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
126 */ 124 */
127void nfs_clear_page_tag_locked(struct nfs_page *req) 125void nfs_clear_page_tag_locked(struct nfs_page *req)
128{ 126{
129 struct inode *inode = req->wb_context->path.dentry->d_inode;
130 struct nfs_inode *nfsi = NFS_I(inode);
131
132 if (req->wb_page != NULL) { 127 if (req->wb_page != NULL) {
128 struct inode *inode = req->wb_context->path.dentry->d_inode;
129 struct nfs_inode *nfsi = NFS_I(inode);
130
133 spin_lock(&inode->i_lock); 131 spin_lock(&inode->i_lock);
134 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 132 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
135 nfs_unlock_request(req); 133 nfs_unlock_request(req);
@@ -142,16 +140,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
142 * nfs_clear_request - Free up all resources allocated to the request 140 * nfs_clear_request - Free up all resources allocated to the request
143 * @req: 141 * @req:
144 * 142 *
145 * Release page resources associated with a write request after it 143 * Release page and open context resources associated with a read/write
146 * has completed. 144 * request after it has completed.
147 */ 145 */
148void nfs_clear_request(struct nfs_page *req) 146void nfs_clear_request(struct nfs_page *req)
149{ 147{
150 struct page *page = req->wb_page; 148 struct page *page = req->wb_page;
149 struct nfs_open_context *ctx = req->wb_context;
150
151 if (page != NULL) { 151 if (page != NULL) {
152 page_cache_release(page); 152 page_cache_release(page);
153 req->wb_page = NULL; 153 req->wb_page = NULL;
154 } 154 }
155 if (ctx != NULL) {
156 put_nfs_open_context(ctx);
157 req->wb_context = NULL;
158 }
155} 159}
156 160
157 161
@@ -165,9 +169,8 @@ static void nfs_free_request(struct kref *kref)
165{ 169{
166 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); 170 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
167 171
168 /* Release struct file or cached credential */ 172 /* Release struct file and open context */
169 nfs_clear_request(req); 173 nfs_clear_request(req);
170 put_nfs_open_context(req->wb_context);
171 nfs_page_free(req); 174 nfs_page_free(req);
172} 175}
173 176
@@ -176,6 +179,12 @@ void nfs_release_request(struct nfs_page *req)
176 kref_put(&req->wb_kref, nfs_free_request); 179 kref_put(&req->wb_kref, nfs_free_request);
177} 180}
178 181
182static int nfs_wait_bit_uninterruptible(void *word)
183{
184 io_schedule();
185 return 0;
186}
187
179/** 188/**
180 * nfs_wait_on_request - Wait for a request to complete. 189 * nfs_wait_on_request - Wait for a request to complete.
181 * @req: request to wait upon. 190 * @req: request to wait upon.
@@ -186,14 +195,9 @@ void nfs_release_request(struct nfs_page *req)
186int 195int
187nfs_wait_on_request(struct nfs_page *req) 196nfs_wait_on_request(struct nfs_page *req)
188{ 197{
189 int ret = 0; 198 return wait_on_bit(&req->wb_flags, PG_BUSY,
190 199 nfs_wait_bit_uninterruptible,
191 if (!test_bit(PG_BUSY, &req->wb_flags)) 200 TASK_UNINTERRUPTIBLE);
192 goto out;
193 ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY,
194 nfs_wait_bit_killable, TASK_KILLABLE);
195out:
196 return ret;
197} 201}
198 202
199/** 203/**
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ef583854d8d0..0288be80444f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/param.h> 31#include <linux/param.h>
32#include <linux/slab.h>
33#include <linux/time.h> 32#include <linux/time.h>
34#include <linux/mm.h> 33#include <linux/mm.h>
35#include <linux/errno.h> 34#include <linux/errno.h>
@@ -47,6 +46,39 @@
47#define NFSDBG_FACILITY NFSDBG_PROC 46#define NFSDBG_FACILITY NFSDBG_PROC
48 47
49/* 48/*
 49 * wrapper to handle the -EKEYEXPIRED error code. This should generally
50 * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't
51 * support the NFSERR_JUKEBOX error code, but we handle this situation in the
52 * same way that we handle that error with NFSv3.
53 */
54static int
55nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
56{
57 int res;
58 do {
59 res = rpc_call_sync(clnt, msg, flags);
60 if (res != -EKEYEXPIRED)
61 break;
62 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
63 res = -ERESTARTSYS;
64 } while (!fatal_signal_pending(current));
65 return res;
66}
67
68#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags)
69
70static int
71nfs_async_handle_expired_key(struct rpc_task *task)
72{
73 if (task->tk_status != -EKEYEXPIRED)
74 return 0;
75 task->tk_status = 0;
76 rpc_restart_call(task);
77 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
78 return 1;
79}
80
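[Editor's note: the synchronous wrapper above retries with a delay while the RPC keeps failing with -EKEYEXPIRED (e.g. an expired krb5 TGT), giving up only on a fatal signal. A user-space sketch of the same loop; fake_call() and the 5-second interval are stand-ins for rpc_call_sync() and NFS_JUKEBOX_RETRY_TIME, and the fatal-signal check is omitted.]

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

#define RETRY_SECONDS 5		/* stand-in for NFS_JUKEBOX_RETRY_TIME */

/* Pretend the key becomes valid again on the third attempt. */
static int fake_call(void)
{
	static int calls;
	return ++calls < 3 ? -EKEYEXPIRED : 0;
}

static int call_with_key_retry(int (*call)(void))
{
	int res;

	for (;;) {
		res = call();
		if (res != -EKEYEXPIRED)
			break;
		sleep(RETRY_SECONDS);	/* kernel: schedule_timeout_killable() */
	}
	return res;
}

int main(void)
{
	printf("result=%d\n", call_with_key_retry(fake_call));
	return 0;
}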
81/*
50 * Bare-bones access to getattr: this is for nfs_read_super. 82 * Bare-bones access to getattr: this is for nfs_read_super.
51 */ 83 */
52static int 84static int
@@ -307,6 +339,8 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
307 339
308static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) 340static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
309{ 341{
342 if (nfs_async_handle_expired_key(task))
343 return 0;
310 nfs_mark_for_revalidate(dir); 344 nfs_mark_for_revalidate(dir);
311 return 1; 345 return 1;
312} 346}
@@ -560,6 +594,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
560 594
561static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) 595static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
562{ 596{
597 if (nfs_async_handle_expired_key(task))
598 return -EAGAIN;
599
563 nfs_invalidate_atime(data->inode); 600 nfs_invalidate_atime(data->inode);
564 if (task->tk_status >= 0) { 601 if (task->tk_status >= 0) {
565 nfs_refresh_inode(data->inode, data->res.fattr); 602 nfs_refresh_inode(data->inode, data->res.fattr);
@@ -579,6 +616,9 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
579 616
580static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) 617static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
581{ 618{
619 if (nfs_async_handle_expired_key(task))
620 return -EAGAIN;
621
582 if (task->tk_status >= 0) 622 if (task->tk_status >= 0)
583 nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); 623 nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
584 return 0; 624 return 0;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 12c9e66d3f1d..db9b360ae19d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -356,25 +356,19 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
356 struct nfs_readres *resp = &data->res; 356 struct nfs_readres *resp = &data->res;
357 357
358 if (resp->eof || resp->count == argp->count) 358 if (resp->eof || resp->count == argp->count)
359 goto out; 359 return;
360 360
361 /* This is a short read! */ 361 /* This is a short read! */
362 nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); 362 nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
363 /* Has the server at least made some progress? */ 363 /* Has the server at least made some progress? */
364 if (resp->count == 0) 364 if (resp->count == 0)
365 goto out; 365 return;
366 366
367 /* Yes, so retry the read at the end of the data */ 367 /* Yes, so retry the read at the end of the data */
368 argp->offset += resp->count; 368 argp->offset += resp->count;
369 argp->pgbase += resp->count; 369 argp->pgbase += resp->count;
370 argp->count -= resp->count; 370 argp->count -= resp->count;
371 nfs4_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); 371 nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
372 return;
373out:
374 nfs4_sequence_free_slot(NFS_SERVER(data->inode)->nfs_client,
375 &data->res.seq_res);
376 return;
377
378} 372}
379 373
380/* 374/*
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 90be551b80c1..e01637240eeb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -48,6 +48,7 @@
48#include <linux/vfs.h> 48#include <linux/vfs.h>
49#include <linux/inet.h> 49#include <linux/inet.h>
50#include <linux/in6.h> 50#include <linux/in6.h>
51#include <linux/slab.h>
51#include <net/ipv6.h> 52#include <net/ipv6.h>
52#include <linux/netdevice.h> 53#include <linux/netdevice.h>
53#include <linux/nfs_xdr.h> 54#include <linux/nfs_xdr.h>
@@ -175,14 +176,16 @@ static const match_table_t nfs_mount_option_tokens = {
175}; 176};
176 177
177enum { 178enum {
178 Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma, 179 Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma,
179 180
180 Opt_xprt_err 181 Opt_xprt_err
181}; 182};
182 183
183static const match_table_t nfs_xprt_protocol_tokens = { 184static const match_table_t nfs_xprt_protocol_tokens = {
184 { Opt_xprt_udp, "udp" }, 185 { Opt_xprt_udp, "udp" },
186 { Opt_xprt_udp6, "udp6" },
185 { Opt_xprt_tcp, "tcp" }, 187 { Opt_xprt_tcp, "tcp" },
188 { Opt_xprt_tcp6, "tcp6" },
186 { Opt_xprt_rdma, "rdma" }, 189 { Opt_xprt_rdma, "rdma" },
187 190
188 { Opt_xprt_err, NULL } 191 { Opt_xprt_err, NULL }
@@ -241,6 +244,7 @@ static int nfs_show_stats(struct seq_file *, struct vfsmount *);
241static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); 244static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
242static int nfs_xdev_get_sb(struct file_system_type *fs_type, 245static int nfs_xdev_get_sb(struct file_system_type *fs_type,
243 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 246 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
247static void nfs_put_super(struct super_block *);
244static void nfs_kill_super(struct super_block *); 248static void nfs_kill_super(struct super_block *);
245static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); 249static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
246 250
@@ -264,6 +268,7 @@ static const struct super_operations nfs_sops = {
264 .alloc_inode = nfs_alloc_inode, 268 .alloc_inode = nfs_alloc_inode,
265 .destroy_inode = nfs_destroy_inode, 269 .destroy_inode = nfs_destroy_inode,
266 .write_inode = nfs_write_inode, 270 .write_inode = nfs_write_inode,
271 .put_super = nfs_put_super,
267 .statfs = nfs_statfs, 272 .statfs = nfs_statfs,
268 .clear_inode = nfs_clear_inode, 273 .clear_inode = nfs_clear_inode,
269 .umount_begin = nfs_umount_begin, 274 .umount_begin = nfs_umount_begin,
@@ -333,6 +338,7 @@ static const struct super_operations nfs4_sops = {
333 .alloc_inode = nfs_alloc_inode, 338 .alloc_inode = nfs_alloc_inode,
334 .destroy_inode = nfs_destroy_inode, 339 .destroy_inode = nfs_destroy_inode,
335 .write_inode = nfs_write_inode, 340 .write_inode = nfs_write_inode,
341 .put_super = nfs_put_super,
336 .statfs = nfs_statfs, 342 .statfs = nfs_statfs,
337 .clear_inode = nfs4_clear_inode, 343 .clear_inode = nfs4_clear_inode,
338 .umount_begin = nfs_umount_begin, 344 .umount_begin = nfs_umount_begin,
@@ -492,6 +498,45 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
492 return sec_flavours[i].str; 498 return sec_flavours[i].str;
493} 499}
494 500
501static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss,
502 int showdefaults)
503{
504 struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address;
505
506 seq_printf(m, ",mountproto=");
507 switch (sap->sa_family) {
508 case AF_INET:
509 switch (nfss->mountd_protocol) {
510 case IPPROTO_UDP:
511 seq_printf(m, RPCBIND_NETID_UDP);
512 break;
513 case IPPROTO_TCP:
514 seq_printf(m, RPCBIND_NETID_TCP);
515 break;
516 default:
517 if (showdefaults)
518 seq_printf(m, "auto");
519 }
520 break;
521 case AF_INET6:
522 switch (nfss->mountd_protocol) {
523 case IPPROTO_UDP:
524 seq_printf(m, RPCBIND_NETID_UDP6);
525 break;
526 case IPPROTO_TCP:
527 seq_printf(m, RPCBIND_NETID_TCP6);
528 break;
529 default:
530 if (showdefaults)
531 seq_printf(m, "auto");
532 }
533 break;
534 default:
535 if (showdefaults)
536 seq_printf(m, "auto");
537 }
538}
539
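[Editor's note: nfs_show_mountd_netid() above reduces to a (family, protocol) -> netid mapping; the strings are the standard rpcbind netids ("udp"/"tcp"/"udp6"/"tcp6"), with "auto" printed only when defaults are requested. A compact user-space sketch:]

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

static const char *mountd_netid(int family, int protocol)
{
	switch (family) {
	case AF_INET:
		if (protocol == IPPROTO_UDP) return "udp";
		if (protocol == IPPROTO_TCP) return "tcp";
		break;
	case AF_INET6:
		if (protocol == IPPROTO_UDP) return "udp6";
		if (protocol == IPPROTO_TCP) return "tcp6";
		break;
	}
	return "auto";	/* unknown combination; shown only with showdefaults */
}

int main(void)
{
	printf("mountproto=%s\n", mountd_netid(AF_INET6, IPPROTO_TCP));
	return 0;
}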
495static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, 540static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
496 int showdefaults) 541 int showdefaults)
497{ 542{
@@ -505,7 +550,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
505 } 550 }
506 case AF_INET6: { 551 case AF_INET6: {
507 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; 552 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
508 seq_printf(m, ",mountaddr=%pI6", &sin6->sin6_addr); 553 seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr);
509 break; 554 break;
510 } 555 }
511 default: 556 default:
@@ -518,17 +563,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
518 if (nfss->mountd_port || showdefaults) 563 if (nfss->mountd_port || showdefaults)
519 seq_printf(m, ",mountport=%u", nfss->mountd_port); 564 seq_printf(m, ",mountport=%u", nfss->mountd_port);
520 565
521 switch (nfss->mountd_protocol) { 566 nfs_show_mountd_netid(m, nfss, showdefaults);
522 case IPPROTO_UDP:
523 seq_printf(m, ",mountproto=udp");
524 break;
525 case IPPROTO_TCP:
526 seq_printf(m, ",mountproto=tcp");
527 break;
528 default:
529 if (showdefaults)
530 seq_printf(m, ",mountproto=auto");
531 }
532} 567}
533 568
534/* 569/*
@@ -578,7 +613,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
578 seq_puts(m, nfs_infop->nostr); 613 seq_puts(m, nfs_infop->nostr);
579 } 614 }
580 seq_printf(m, ",proto=%s", 615 seq_printf(m, ",proto=%s",
581 rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO)); 616 rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
582 if (version == 4) { 617 if (version == 4) {
583 if (nfss->port != NFS_PORT) 618 if (nfss->port != NFS_PORT)
584 seq_printf(m, ",port=%u", nfss->port); 619 seq_printf(m, ",port=%u", nfss->port);
@@ -714,8 +749,6 @@ static void nfs_umount_begin(struct super_block *sb)
714 struct nfs_server *server; 749 struct nfs_server *server;
715 struct rpc_clnt *rpc; 750 struct rpc_clnt *rpc;
716 751
717 lock_kernel();
718
719 server = NFS_SB(sb); 752 server = NFS_SB(sb);
720 /* -EIO all pending I/O */ 753 /* -EIO all pending I/O */
721 rpc = server->client_acl; 754 rpc = server->client_acl;
@@ -724,8 +757,6 @@ static void nfs_umount_begin(struct super_block *sb)
724 rpc = server->client; 757 rpc = server->client;
725 if (!IS_ERR(rpc)) 758 if (!IS_ERR(rpc))
726 rpc_killall_tasks(rpc); 759 rpc_killall_tasks(rpc);
727
728 unlock_kernel();
729} 760}
730 761
731static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) 762static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
@@ -734,8 +765,6 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
734 765
735 data = kzalloc(sizeof(*data), GFP_KERNEL); 766 data = kzalloc(sizeof(*data), GFP_KERNEL);
736 if (data) { 767 if (data) {
737 data->rsize = NFS_MAX_FILE_IO_SIZE;
738 data->wsize = NFS_MAX_FILE_IO_SIZE;
739 data->acregmin = NFS_DEF_ACREGMIN; 768 data->acregmin = NFS_DEF_ACREGMIN;
740 data->acregmax = NFS_DEF_ACREGMAX; 769 data->acregmax = NFS_DEF_ACREGMAX;
741 data->acdirmin = NFS_DEF_ACDIRMIN; 770 data->acdirmin = NFS_DEF_ACDIRMIN;
@@ -887,6 +916,8 @@ static int nfs_parse_mount_options(char *raw,
887{ 916{
888 char *p, *string, *secdata; 917 char *p, *string, *secdata;
889 int rc, sloppy = 0, invalid_option = 0; 918 int rc, sloppy = 0, invalid_option = 0;
919 unsigned short protofamily = AF_UNSPEC;
920 unsigned short mountfamily = AF_UNSPEC;
890 921
891 if (!raw) { 922 if (!raw) {
892 dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); 923 dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -1232,12 +1263,17 @@ static int nfs_parse_mount_options(char *raw,
1232 token = match_token(string, 1263 token = match_token(string,
1233 nfs_xprt_protocol_tokens, args); 1264 nfs_xprt_protocol_tokens, args);
1234 1265
1266 protofamily = AF_INET;
1235 switch (token) { 1267 switch (token) {
1268 case Opt_xprt_udp6:
1269 protofamily = AF_INET6;
1236 case Opt_xprt_udp: 1270 case Opt_xprt_udp:
1237 mnt->flags &= ~NFS_MOUNT_TCP; 1271 mnt->flags &= ~NFS_MOUNT_TCP;
1238 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1272 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1239 kfree(string); 1273 kfree(string);
1240 break; 1274 break;
1275 case Opt_xprt_tcp6:
1276 protofamily = AF_INET6;
1241 case Opt_xprt_tcp: 1277 case Opt_xprt_tcp:
1242 mnt->flags |= NFS_MOUNT_TCP; 1278 mnt->flags |= NFS_MOUNT_TCP;
1243 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1279 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
@@ -1265,10 +1301,15 @@ static int nfs_parse_mount_options(char *raw,
1265 nfs_xprt_protocol_tokens, args); 1301 nfs_xprt_protocol_tokens, args);
1266 kfree(string); 1302 kfree(string);
1267 1303
1304 mountfamily = AF_INET;
1268 switch (token) { 1305 switch (token) {
1306 case Opt_xprt_udp6:
1307 mountfamily = AF_INET6;
1269 case Opt_xprt_udp: 1308 case Opt_xprt_udp:
1270 mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; 1309 mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
1271 break; 1310 break;
1311 case Opt_xprt_tcp6:
1312 mountfamily = AF_INET6;
1272 case Opt_xprt_tcp: 1313 case Opt_xprt_tcp:
1273 mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; 1314 mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
1274 break; 1315 break;
@@ -1367,8 +1408,33 @@ static int nfs_parse_mount_options(char *raw,
1367 if (!sloppy && invalid_option) 1408 if (!sloppy && invalid_option)
1368 return 0; 1409 return 0;
1369 1410
1411 /*
1412 * verify that any proto=/mountproto= options match the address
1413 * families in the addr=/mountaddr= options.
1414 */
1415 if (protofamily != AF_UNSPEC &&
1416 protofamily != mnt->nfs_server.address.ss_family)
1417 goto out_proto_mismatch;
1418
1419 if (mountfamily != AF_UNSPEC) {
1420 if (mnt->mount_server.addrlen) {
1421 if (mountfamily != mnt->mount_server.address.ss_family)
1422 goto out_mountproto_mismatch;
1423 } else {
1424 if (mountfamily != mnt->nfs_server.address.ss_family)
1425 goto out_mountproto_mismatch;
1426 }
1427 }
1428
1370 return 1; 1429 return 1;
1371 1430
1431out_mountproto_mismatch:
1432 printk(KERN_INFO "NFS: mount server address does not match mountproto= "
1433 "option\n");
1434 return 0;
1435out_proto_mismatch:
1436 printk(KERN_INFO "NFS: server address does not match proto= option\n");
1437 return 0;
1372out_invalid_address: 1438out_invalid_address:
1373 printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); 1439 printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
1374 return 0; 1440 return 0;
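[Editor's note: the new validation above enforces that proto= agrees with the addr= family, and mountproto= with mountaddr= (falling back to addr= when no separate mount address was parsed). A sketch of the rule; using AF_UNSPEC for "not specified" approximates the kernel's mount_server.addrlen check.]

#include <stdio.h>
#include <sys/socket.h>

static int families_consistent(int protofamily, int addrfamily,
			       int mountfamily, int mountaddrfamily)
{
	if (protofamily != AF_UNSPEC && protofamily != addrfamily)
		return 0;			/* proto= vs addr= mismatch */
	if (mountfamily != AF_UNSPEC) {
		int want = (mountaddrfamily != AF_UNSPEC) ? mountaddrfamily
							  : addrfamily;
		if (mountfamily != want)
			return 0;		/* mountproto= mismatch */
	}
	return 1;
}

int main(void)
{
	/* proto=tcp6 with an IPv4 addr= must be rejected. */
	printf("%d\n", families_consistent(AF_INET6, AF_INET,
					   AF_UNSPEC, AF_UNSPEC));
	return 0;
}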
@@ -1881,7 +1947,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1881 if (data == NULL) 1947 if (data == NULL)
1882 return -ENOMEM; 1948 return -ENOMEM;
1883 1949
1884 lock_kernel();
1885 /* fill out struct with values from existing mount */ 1950 /* fill out struct with values from existing mount */
1886 data->flags = nfss->flags; 1951 data->flags = nfss->flags;
1887 data->rsize = nfss->rsize; 1952 data->rsize = nfss->rsize;
@@ -1907,7 +1972,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1907 error = nfs_compare_remount_data(nfss, data); 1972 error = nfs_compare_remount_data(nfss, data);
1908out: 1973out:
1909 kfree(data); 1974 kfree(data);
1910 unlock_kernel();
1911 return error; 1975 return error;
1912} 1976}
1913 1977
@@ -2151,7 +2215,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2151 } else { 2215 } else {
2152 error = nfs_bdi_register(server); 2216 error = nfs_bdi_register(server);
2153 if (error) 2217 if (error)
2154 goto error_splat_super; 2218 goto error_splat_bdi;
2155 } 2219 }
2156 2220
2157 if (!s->s_root) { 2221 if (!s->s_root) {
@@ -2193,11 +2257,25 @@ out_err_nosb:
2193error_splat_root: 2257error_splat_root:
2194 dput(mntroot); 2258 dput(mntroot);
2195error_splat_super: 2259error_splat_super:
2260 if (server && !s->s_root)
2261 bdi_unregister(&server->backing_dev_info);
2262error_splat_bdi:
2196 deactivate_locked_super(s); 2263 deactivate_locked_super(s);
2197 goto out; 2264 goto out;
2198} 2265}
2199 2266
2200/* 2267/*
2268 * Ensure that we unregister the bdi before kill_anon_super
2269 * releases the device name
2270 */
2271static void nfs_put_super(struct super_block *s)
2272{
2273 struct nfs_server *server = NFS_SB(s);
2274
2275 bdi_unregister(&server->backing_dev_info);
2276}
2277
2278/*
2201 * Destroy an NFS2/3 superblock 2279 * Destroy an NFS2/3 superblock
2202 */ 2280 */
2203static void nfs_kill_super(struct super_block *s) 2281static void nfs_kill_super(struct super_block *s)
@@ -2205,7 +2283,6 @@ static void nfs_kill_super(struct super_block *s)
2205 struct nfs_server *server = NFS_SB(s); 2283 struct nfs_server *server = NFS_SB(s);
2206 2284
2207 kill_anon_super(s); 2285 kill_anon_super(s);
2208 bdi_unregister(&server->backing_dev_info);
2209 nfs_fscache_release_super_cookie(s); 2286 nfs_fscache_release_super_cookie(s);
2210 nfs_free_server(server); 2287 nfs_free_server(server);
2211} 2288}
@@ -2253,7 +2330,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
2253 } else { 2330 } else {
2254 error = nfs_bdi_register(server); 2331 error = nfs_bdi_register(server);
2255 if (error) 2332 if (error)
2256 goto error_splat_super; 2333 goto error_splat_bdi;
2257 } 2334 }
2258 2335
2259 if (!s->s_root) { 2336 if (!s->s_root) {
@@ -2290,6 +2367,9 @@ out_err_noserver:
2290 return error; 2367 return error;
2291 2368
2292error_splat_super: 2369error_splat_super:
2370 if (server && !s->s_root)
2371 bdi_unregister(&server->backing_dev_info);
2372error_splat_bdi:
2293 deactivate_locked_super(s); 2373 deactivate_locked_super(s);
2294 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); 2374 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
2295 return error; 2375 return error;
@@ -2505,7 +2585,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2505 } else { 2585 } else {
2506 error = nfs_bdi_register(server); 2586 error = nfs_bdi_register(server);
2507 if (error) 2587 if (error)
2508 goto error_splat_super; 2588 goto error_splat_bdi;
2509 } 2589 }
2510 2590
2511 if (!s->s_root) { 2591 if (!s->s_root) {
@@ -2543,6 +2623,9 @@ out_free:
2543error_splat_root: 2623error_splat_root:
2544 dput(mntroot); 2624 dput(mntroot);
2545error_splat_super: 2625error_splat_super:
2626 if (server && !s->s_root)
2627 bdi_unregister(&server->backing_dev_info);
2628error_splat_bdi:
2546 deactivate_locked_super(s); 2629 deactivate_locked_super(s);
2547 goto out; 2630 goto out;
2548} 2631}
@@ -2738,7 +2821,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
2738 } else { 2821 } else {
2739 error = nfs_bdi_register(server); 2822 error = nfs_bdi_register(server);
2740 if (error) 2823 if (error)
2741 goto error_splat_super; 2824 goto error_splat_bdi;
2742 } 2825 }
2743 2826
2744 if (!s->s_root) { 2827 if (!s->s_root) {
@@ -2774,6 +2857,9 @@ out_err_noserver:
2774 return error; 2857 return error;
2775 2858
2776error_splat_super: 2859error_splat_super:
2860 if (server && !s->s_root)
2861 bdi_unregister(&server->backing_dev_info);
2862error_splat_bdi:
2777 deactivate_locked_super(s); 2863 deactivate_locked_super(s);
2778 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); 2864 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
2779 return error; 2865 return error;
@@ -2820,7 +2906,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2820 } else { 2906 } else {
2821 error = nfs_bdi_register(server); 2907 error = nfs_bdi_register(server);
2822 if (error) 2908 if (error)
2823 goto error_splat_super; 2909 goto error_splat_bdi;
2824 } 2910 }
2825 2911
2826 if (!s->s_root) { 2912 if (!s->s_root) {
@@ -2856,6 +2942,9 @@ out_err_noserver:
2856 return error; 2942 return error;
2857 2943
2858error_splat_super: 2944error_splat_super:
2945 if (server && !s->s_root)
2946 bdi_unregister(&server->backing_dev_info);
2947error_splat_bdi:
2859 deactivate_locked_super(s); 2948 deactivate_locked_super(s);
2860 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 2949 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
2861 return error; 2950 return error;
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 412738dbfbc7..05c9e02f4153 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -19,7 +19,6 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/stat.h> 20#include <linux/stat.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/slab.h>
23#include <linux/string.h> 22#include <linux/string.h>
24#include <linux/namei.h> 23#include <linux/namei.h>
25 24
@@ -50,7 +49,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
50 struct page *page; 49 struct page *page;
51 void *err; 50 void *err;
52 51
53 err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping)); 52 err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
54 if (err) 53 if (err)
55 goto read_failed; 54 goto read_failed;
56 page = read_cache_page(&inode->i_data, 0, 55 page = read_cache_page(&inode->i_data, 0,
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index b62481dabae9..ad4d2e787b20 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -15,70 +15,64 @@
15 15
16#include "callback.h" 16#include "callback.h"
17 17
18#ifdef CONFIG_NFS_V4
18static const int nfs_set_port_min = 0; 19static const int nfs_set_port_min = 0;
19static const int nfs_set_port_max = 65535; 20static const int nfs_set_port_max = 65535;
21#endif
20static struct ctl_table_header *nfs_callback_sysctl_table; 22static struct ctl_table_header *nfs_callback_sysctl_table;
21 23
22static ctl_table nfs_cb_sysctls[] = { 24static ctl_table nfs_cb_sysctls[] = {
23#ifdef CONFIG_NFS_V4 25#ifdef CONFIG_NFS_V4
24 { 26 {
25 .ctl_name = CTL_UNNUMBERED,
26 .procname = "nfs_callback_tcpport", 27 .procname = "nfs_callback_tcpport",
27 .data = &nfs_callback_set_tcpport, 28 .data = &nfs_callback_set_tcpport,
28 .maxlen = sizeof(int), 29 .maxlen = sizeof(int),
29 .mode = 0644, 30 .mode = 0644,
30 .proc_handler = &proc_dointvec_minmax, 31 .proc_handler = proc_dointvec_minmax,
31 .extra1 = (int *)&nfs_set_port_min, 32 .extra1 = (int *)&nfs_set_port_min,
32 .extra2 = (int *)&nfs_set_port_max, 33 .extra2 = (int *)&nfs_set_port_max,
33 }, 34 },
34 { 35 {
35 .ctl_name = CTL_UNNUMBERED,
36 .procname = "idmap_cache_timeout", 36 .procname = "idmap_cache_timeout",
37 .data = &nfs_idmap_cache_timeout, 37 .data = &nfs_idmap_cache_timeout,
38 .maxlen = sizeof(int), 38 .maxlen = sizeof(int),
39 .mode = 0644, 39 .mode = 0644,
40 .proc_handler = &proc_dointvec_jiffies, 40 .proc_handler = proc_dointvec_jiffies,
41 .strategy = &sysctl_jiffies,
42 }, 41 },
43#endif 42#endif
44 { 43 {
45 .ctl_name = CTL_UNNUMBERED,
46 .procname = "nfs_mountpoint_timeout", 44 .procname = "nfs_mountpoint_timeout",
47 .data = &nfs_mountpoint_expiry_timeout, 45 .data = &nfs_mountpoint_expiry_timeout,
48 .maxlen = sizeof(nfs_mountpoint_expiry_timeout), 46 .maxlen = sizeof(nfs_mountpoint_expiry_timeout),
49 .mode = 0644, 47 .mode = 0644,
50 .proc_handler = &proc_dointvec_jiffies, 48 .proc_handler = proc_dointvec_jiffies,
51 .strategy = &sysctl_jiffies,
52 }, 49 },
53 { 50 {
54 .ctl_name = CTL_UNNUMBERED,
55 .procname = "nfs_congestion_kb", 51 .procname = "nfs_congestion_kb",
56 .data = &nfs_congestion_kb, 52 .data = &nfs_congestion_kb,
57 .maxlen = sizeof(nfs_congestion_kb), 53 .maxlen = sizeof(nfs_congestion_kb),
58 .mode = 0644, 54 .mode = 0644,
59 .proc_handler = &proc_dointvec, 55 .proc_handler = proc_dointvec,
60 }, 56 },
61 { .ctl_name = 0 } 57 { }
62}; 58};
63 59
64static ctl_table nfs_cb_sysctl_dir[] = { 60static ctl_table nfs_cb_sysctl_dir[] = {
65 { 61 {
66 .ctl_name = CTL_UNNUMBERED,
67 .procname = "nfs", 62 .procname = "nfs",
68 .mode = 0555, 63 .mode = 0555,
69 .child = nfs_cb_sysctls, 64 .child = nfs_cb_sysctls,
70 }, 65 },
71 { .ctl_name = 0 } 66 { }
72}; 67};
73 68
74static ctl_table nfs_cb_sysctl_root[] = { 69static ctl_table nfs_cb_sysctl_root[] = {
75 { 70 {
76 .ctl_name = CTL_FS,
77 .procname = "fs", 71 .procname = "fs",
78 .mode = 0555, 72 .mode = 0555,
79 .child = nfs_cb_sysctl_dir, 73 .child = nfs_cb_sysctl_dir,
80 }, 74 },
81 { .ctl_name = 0 } 75 { }
82}; 76};
83 77
84int nfs_register_sysctl(void) 78int nfs_register_sysctl(void)
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1064c91ae810..6da3d3ff6edd 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -83,7 +83,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
83 struct inode *dir = data->dir; 83 struct inode *dir = data->dir;
84 84
85 if (!NFS_PROTO(dir)->unlink_done(task, dir)) 85 if (!NFS_PROTO(dir)->unlink_done(task, dir))
86 nfs4_restart_rpc(task, NFS_SERVER(dir)->nfs_client); 86 nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client);
87} 87}
88 88
89/** 89/**
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53eb26c16b50..53ff70e23993 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc)
178{ 178{
179 if (wbc->for_reclaim) 179 if (wbc->for_reclaim)
180 return FLUSH_HIGHPRI | FLUSH_STABLE; 180 return FLUSH_HIGHPRI | FLUSH_STABLE;
181 if (wbc->for_kupdate) 181 if (wbc->for_kupdate || wbc->for_background)
182 return FLUSH_LOWPRI; 182 return FLUSH_LOWPRI;
183 return 0; 183 return 0;
184} 184}
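[Editor's note: the one-line change above makes background writeback use the same low-priority flush as kupdate-style writeback. Sketch of the resulting priority mapping; the FLUSH_* bit values here are illustrative, not the kernel's.]

#include <stdio.h>

#define FLUSH_STABLE	(1 << 0)	/* illustrative bit assignments */
#define FLUSH_LOWPRI	(1 << 1)
#define FLUSH_HIGHPRI	(1 << 2)

struct writeback_control { int for_reclaim, for_kupdate, for_background; };

static int wb_priority(const struct writeback_control *wbc)
{
	if (wbc->for_reclaim)
		return FLUSH_HIGHPRI | FLUSH_STABLE;
	if (wbc->for_kupdate || wbc->for_background)
		return FLUSH_LOWPRI;
	return 0;
}

int main(void)
{
	struct writeback_control background = { 0, 0, 1 };

	printf("background flush priority: %d\n", wb_priority(&background));
	return 0;
}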
@@ -438,6 +438,7 @@ nfs_mark_request_commit(struct nfs_page *req)
438 radix_tree_tag_set(&nfsi->nfs_page_tree, 438 radix_tree_tag_set(&nfsi->nfs_page_tree,
439 req->wb_index, 439 req->wb_index,
440 NFS_PAGE_TAG_COMMIT); 440 NFS_PAGE_TAG_COMMIT);
441 nfsi->ncommit++;
441 spin_unlock(&inode->i_lock); 442 spin_unlock(&inode->i_lock);
442 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 443 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
443 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); 444 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
@@ -501,57 +502,6 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
501} 502}
502#endif 503#endif
503 504
504/*
505 * Wait for a request to complete.
506 *
507 * Interruptible by fatal signals only.
508 */
509static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
510{
511 struct nfs_inode *nfsi = NFS_I(inode);
512 struct nfs_page *req;
513 pgoff_t idx_end, next;
514 unsigned int res = 0;
515 int error;
516
517 if (npages == 0)
518 idx_end = ~0;
519 else
520 idx_end = idx_start + npages - 1;
521
522 next = idx_start;
523 while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) {
524 if (req->wb_index > idx_end)
525 break;
526
527 next = req->wb_index + 1;
528 BUG_ON(!NFS_WBACK_BUSY(req));
529
530 kref_get(&req->wb_kref);
531 spin_unlock(&inode->i_lock);
532 error = nfs_wait_on_request(req);
533 nfs_release_request(req);
534 spin_lock(&inode->i_lock);
535 if (error < 0)
536 return error;
537 res++;
538 }
539 return res;
540}
541
542static void nfs_cancel_commit_list(struct list_head *head)
543{
544 struct nfs_page *req;
545
546 while(!list_empty(head)) {
547 req = nfs_list_entry(head->next);
548 nfs_list_remove_request(req);
549 nfs_clear_request_commit(req);
550 nfs_inode_remove_request(req);
551 nfs_unlock_request(req);
552 }
553}
554
555#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 505#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
556static int 506static int
557nfs_need_commit(struct nfs_inode *nfsi) 507nfs_need_commit(struct nfs_inode *nfsi)
@@ -573,11 +523,17 @@ static int
573nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 523nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
574{ 524{
575 struct nfs_inode *nfsi = NFS_I(inode); 525 struct nfs_inode *nfsi = NFS_I(inode);
526 int ret;
576 527
577 if (!nfs_need_commit(nfsi)) 528 if (!nfs_need_commit(nfsi))
578 return 0; 529 return 0;
579 530
580 return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); 531 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
532 if (ret > 0)
533 nfsi->ncommit -= ret;
534 if (nfs_need_commit(NFS_I(inode)))
535 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
536 return ret;
581} 537}
582#else 538#else
583static inline int nfs_need_commit(struct nfs_inode *nfsi) 539static inline int nfs_need_commit(struct nfs_inode *nfsi)
@@ -642,9 +598,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
642 spin_lock(&inode->i_lock); 598 spin_lock(&inode->i_lock);
643 } 599 }
644 600
645 if (nfs_clear_request_commit(req)) 601 if (nfs_clear_request_commit(req) &&
646 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, 602 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
647 req->wb_index, NFS_PAGE_TAG_COMMIT); 603 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL)
604 NFS_I(inode)->ncommit--;
648 605
649 /* Okay, the request matches. Update the region */ 606 /* Okay, the request matches. Update the region */
650 if (offset < req->wb_offset) { 607 if (offset < req->wb_offset) {
@@ -774,7 +731,7 @@ int nfs_updatepage(struct file *file, struct page *page,
774 */ 731 */
775 if (nfs_write_pageuptodate(page, inode) && 732 if (nfs_write_pageuptodate(page, inode) &&
776 inode->i_flock == NULL && 733 inode->i_flock == NULL &&
777 !(file->f_flags & O_SYNC)) { 734 !(file->f_flags & O_DSYNC)) {
778 count = max(count + offset, nfs_page_length(page)); 735 count = max(count + offset, nfs_page_length(page));
779 offset = 0; 736 offset = 0;
780 } 737 }
@@ -1216,7 +1173,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1216 */ 1173 */
1217 argp->stable = NFS_FILE_SYNC; 1174 argp->stable = NFS_FILE_SYNC;
1218 } 1175 }
1219 nfs4_restart_rpc(task, server->nfs_client); 1176 nfs_restart_rpc(task, server->nfs_client);
1220 return -EAGAIN; 1177 return -EAGAIN;
1221 } 1178 }
1222 if (time_before(complain, jiffies)) { 1179 if (time_before(complain, jiffies)) {
@@ -1228,13 +1185,12 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1228 /* Can't do anything about it except throw an error. */ 1185 /* Can't do anything about it except throw an error. */
1229 task->tk_status = -EIO; 1186 task->tk_status = -EIO;
1230 } 1187 }
1231 nfs4_sequence_free_slot(server->nfs_client, &data->res.seq_res);
1232 return 0; 1188 return 0;
1233} 1189}
1234 1190
1235 1191
1236#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1192#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1237void nfs_commitdata_release(void *data) 1193static void nfs_commitdata_release(void *data)
1238{ 1194{
1239 struct nfs_write_data *wdata = data; 1195 struct nfs_write_data *wdata = data;
1240 1196
@@ -1392,7 +1348,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
1392 .rpc_release = nfs_commit_release, 1348 .rpc_release = nfs_commit_release,
1393}; 1349};
1394 1350
1395int nfs_commit_inode(struct inode *inode, int how) 1351static int nfs_commit_inode(struct inode *inode, int how)
1396{ 1352{
1397 LIST_HEAD(head); 1353 LIST_HEAD(head);
1398 int res; 1354 int res;
@@ -1407,92 +1363,51 @@ int nfs_commit_inode(struct inode *inode, int how)
1407 } 1363 }
1408 return res; 1364 return res;
1409} 1365}
1410#else
1411static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1412{
1413 return 0;
1414}
1415#endif
1416 1366
1417long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) 1367static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1418{ 1368{
1419 struct inode *inode = mapping->host; 1369 struct nfs_inode *nfsi = NFS_I(inode);
1420 pgoff_t idx_start, idx_end; 1370 int flags = FLUSH_SYNC;
1421 unsigned int npages = 0; 1371 int ret = 0;
1422 LIST_HEAD(head); 1372
1423 int nocommit = how & FLUSH_NOCOMMIT; 1373 /* Don't commit yet if this is a non-blocking flush and there are
1424 long pages, ret; 1374 * lots of outstanding writes for this mapping.
1425 1375 */
1426 /* FIXME */ 1376 if (wbc->sync_mode == WB_SYNC_NONE &&
1427 if (wbc->range_cyclic) 1377 nfsi->ncommit <= (nfsi->npages >> 1))
1428 idx_start = 0; 1378 goto out_mark_dirty;
1429 else { 1379
1430 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; 1380 if (wbc->nonblocking || wbc->for_background)
1431 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; 1381 flags = 0;
1432 if (idx_end > idx_start) { 1382 ret = nfs_commit_inode(inode, flags);
1433 pgoff_t l_npages = 1 + idx_end - idx_start; 1383 if (ret >= 0) {
1434 npages = l_npages; 1384 if (wbc->sync_mode == WB_SYNC_NONE) {
1435 if (sizeof(npages) != sizeof(l_npages) && 1385 if (ret < wbc->nr_to_write)
1436 (pgoff_t)npages != l_npages) 1386 wbc->nr_to_write -= ret;
1437 npages = 0; 1387 else
1388 wbc->nr_to_write = 0;
1438 } 1389 }
1390 return 0;
1439 } 1391 }
1440 how &= ~FLUSH_NOCOMMIT; 1392out_mark_dirty:
1441 spin_lock(&inode->i_lock); 1393 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1442 do {
1443 ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
1444 if (ret != 0)
1445 continue;
1446 if (nocommit)
1447 break;
1448 pages = nfs_scan_commit(inode, &head, idx_start, npages);
1449 if (pages == 0)
1450 break;
1451 if (how & FLUSH_INVALIDATE) {
1452 spin_unlock(&inode->i_lock);
1453 nfs_cancel_commit_list(&head);
1454 ret = pages;
1455 spin_lock(&inode->i_lock);
1456 continue;
1457 }
1458 pages += nfs_scan_commit(inode, &head, 0, 0);
1459 spin_unlock(&inode->i_lock);
1460 ret = nfs_commit_list(inode, &head, how);
1461 spin_lock(&inode->i_lock);
1462
1463 } while (ret >= 0);
1464 spin_unlock(&inode->i_lock);
1465 return ret; 1394 return ret;
1466} 1395}
1467 1396#else
1468static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) 1397static int nfs_commit_inode(struct inode *inode, int how)
1469{ 1398{
1470 int ret;
1471
1472 ret = nfs_writepages(mapping, wbc);
1473 if (ret < 0)
1474 goto out;
1475 ret = nfs_sync_mapping_wait(mapping, wbc, how);
1476 if (ret < 0)
1477 goto out;
1478 return 0; 1399 return 0;
1479out:
1480 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1481 return ret;
1482} 1400}
1483 1401
1484/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */ 1402static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1485static int nfs_write_mapping(struct address_space *mapping, int how)
1486{ 1403{
1487 struct writeback_control wbc = { 1404 return 0;
1488 .bdi = mapping->backing_dev_info, 1405}
1489 .sync_mode = WB_SYNC_ALL, 1406#endif
1490 .nr_to_write = LONG_MAX,
1491 .range_start = 0,
1492 .range_end = LLONG_MAX,
1493 };
1494 1407
1495 return __nfs_write_mapping(mapping, &wbc, how); 1408int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1409{
1410 return nfs_commit_unstable_pages(inode, wbc);
1496} 1411}
1497 1412
1498/* 1413/*
@@ -1500,37 +1415,26 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1500 */ 1415 */
1501int nfs_wb_all(struct inode *inode) 1416int nfs_wb_all(struct inode *inode)
1502{ 1417{
1503 return nfs_write_mapping(inode->i_mapping, 0); 1418 struct writeback_control wbc = {
1504} 1419 .sync_mode = WB_SYNC_ALL,
1420 .nr_to_write = LONG_MAX,
1421 .range_start = 0,
1422 .range_end = LLONG_MAX,
1423 };
1505 1424
1506int nfs_wb_nocommit(struct inode *inode) 1425 return sync_inode(inode, &wbc);
1507{
1508 return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
1509} 1426}
1510 1427
1511int nfs_wb_page_cancel(struct inode *inode, struct page *page) 1428int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1512{ 1429{
1513 struct nfs_page *req; 1430 struct nfs_page *req;
1514 loff_t range_start = page_offset(page);
1515 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1516 struct writeback_control wbc = {
1517 .bdi = page->mapping->backing_dev_info,
1518 .sync_mode = WB_SYNC_ALL,
1519 .nr_to_write = LONG_MAX,
1520 .range_start = range_start,
1521 .range_end = range_end,
1522 };
1523 int ret = 0; 1431 int ret = 0;
1524 1432
1525 BUG_ON(!PageLocked(page)); 1433 BUG_ON(!PageLocked(page));
1526 for (;;) { 1434 for (;;) {
1527 req = nfs_page_find_request(page); 1435 req = nfs_page_find_request(page);
1528 if (req == NULL) 1436 if (req == NULL)
1529 goto out;
1530 if (test_bit(PG_CLEAN, &req->wb_flags)) {
1531 nfs_release_request(req);
1532 break; 1437 break;
1533 }
1534 if (nfs_lock_request_dontget(req)) { 1438 if (nfs_lock_request_dontget(req)) {
1535 nfs_inode_remove_request(req); 1439 nfs_inode_remove_request(req);
1536 /* 1440 /*
@@ -1542,55 +1446,56 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1542 break; 1446 break;
1543 } 1447 }
1544 ret = nfs_wait_on_request(req); 1448 ret = nfs_wait_on_request(req);
1449 nfs_release_request(req);
1545 if (ret < 0) 1450 if (ret < 0)
1546 goto out; 1451 break;
1547 } 1452 }
1548 if (!PagePrivate(page))
1549 return 0;
1550 ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE);
1551out:
1552 return ret; 1453 return ret;
1553} 1454}
1554 1455
1555static int nfs_wb_page_priority(struct inode *inode, struct page *page, 1456/*
1556 int how) 1457 * Write back all requests on one page - we do this before reading it.
1458 */
1459int nfs_wb_page(struct inode *inode, struct page *page)
1557{ 1460{
1558 loff_t range_start = page_offset(page); 1461 loff_t range_start = page_offset(page);
1559 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); 1462 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1560 struct writeback_control wbc = { 1463 struct writeback_control wbc = {
1561 .bdi = page->mapping->backing_dev_info,
1562 .sync_mode = WB_SYNC_ALL, 1464 .sync_mode = WB_SYNC_ALL,
1563 .nr_to_write = LONG_MAX, 1465 .nr_to_write = 0,
1564 .range_start = range_start, 1466 .range_start = range_start,
1565 .range_end = range_end, 1467 .range_end = range_end,
1566 }; 1468 };
1469 struct nfs_page *req;
1470 int need_commit;
1567 int ret; 1471 int ret;
1568 1472
1569 do { 1473 while(PagePrivate(page)) {
1570 if (clear_page_dirty_for_io(page)) { 1474 if (clear_page_dirty_for_io(page)) {
1571 ret = nfs_writepage_locked(page, &wbc); 1475 ret = nfs_writepage_locked(page, &wbc);
1572 if (ret < 0) 1476 if (ret < 0)
1573 goto out_error; 1477 goto out_error;
1574 } else if (!PagePrivate(page)) 1478 }
1479 req = nfs_find_and_lock_request(page);
1480 if (!req)
1575 break; 1481 break;
1576 ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); 1482 if (IS_ERR(req)) {
1577 if (ret < 0) 1483 ret = PTR_ERR(req);
1578 goto out_error; 1484 goto out_error;
1579 } while (PagePrivate(page)); 1485 }
1486 need_commit = test_bit(PG_CLEAN, &req->wb_flags);
1487 nfs_clear_page_tag_locked(req);
1488 if (need_commit) {
1489 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1490 if (ret < 0)
1491 goto out_error;
1492 }
1493 }
1580 return 0; 1494 return 0;
1581out_error: 1495out_error:
1582 __mark_inode_dirty(inode, I_DIRTY_PAGES);
1583 return ret; 1496 return ret;
1584} 1497}
1585 1498
1586/*
1587 * Write back all requests on one page - we do this before reading it.
1588 */
1589int nfs_wb_page(struct inode *inode, struct page* page)
1590{
1591 return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
1592}
1593
1594#ifdef CONFIG_MIGRATION 1499#ifdef CONFIG_MIGRATION
1595int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1500int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1596 struct page *page) 1501 struct page *page)
@@ -1598,8 +1503,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1598 struct nfs_page *req; 1503 struct nfs_page *req;
1599 int ret; 1504 int ret;
1600 1505
1601 if (PageFsCache(page)) 1506 nfs_fscache_release_page(page, GFP_KERNEL);
1602 nfs_fscache_release_page(page, GFP_KERNEL);
1603 1507
1604 req = nfs_find_and_lock_request(page); 1508 req = nfs_find_and_lock_request(page);
1605 ret = PTR_ERR(req); 1509 ret = PTR_ERR(req);
@@ -1612,15 +1516,16 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1612 if (ret) 1516 if (ret)
1613 goto out_unlock; 1517 goto out_unlock;
1614 page_cache_get(newpage); 1518 page_cache_get(newpage);
1519 spin_lock(&mapping->host->i_lock);
1615 req->wb_page = newpage; 1520 req->wb_page = newpage;
1616 SetPagePrivate(newpage); 1521 SetPagePrivate(newpage);
1617 set_page_private(newpage, page_private(page)); 1522 set_page_private(newpage, (unsigned long)req);
1618 ClearPagePrivate(page); 1523 ClearPagePrivate(page);
1619 set_page_private(page, 0); 1524 set_page_private(page, 0);
1525 spin_unlock(&mapping->host->i_lock);
1620 page_cache_release(page); 1526 page_cache_release(page);
1621out_unlock: 1527out_unlock:
1622 nfs_clear_page_tag_locked(req); 1528 nfs_clear_page_tag_locked(req);
1623 nfs_release_request(req);
1624out: 1529out:
1625 return ret; 1530 return ret;
1626} 1531}
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 04133aacb1e5..fc1c52571c03 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/gfp.h>
25#include <linux/sunrpc/xdr.h> 26#include <linux/sunrpc/xdr.h>
26#include <linux/nfsacl.h> 27#include <linux/nfsacl.h>
27#include <linux/nfs3.h> 28#include <linux/nfs3.h>
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index 8f9a20556f79..bf9cbd242ddd 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -7,8 +7,6 @@
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/file.h> 8#include <linux/file.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/sunrpc/svc.h>
11#include <linux/nfsd/nfsd.h>
12#include <linux/nfsd/syscall.h> 10#include <linux/nfsd/syscall.h>
13#include <linux/cred.h> 11#include <linux/cred.h>
14#include <linux/sched.h> 12#include <linux/sched.h>
@@ -38,10 +36,9 @@ static struct file *do_open(char *name, int flags)
38 return ERR_PTR(error); 36 return ERR_PTR(error);
39 37
40 if (flags == O_RDWR) 38 if (flags == O_RDWR)
41 error = may_open(&nd.path, MAY_READ|MAY_WRITE, 39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
42 FMODE_READ|FMODE_WRITE);
43 else 40 else
44 error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE); 41 error = may_open(&nd.path, MAY_WRITE, flags);
45 42
46 if (!error) 43 if (!error)
47 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 44 return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 36fcabbf5186..79717a40daba 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -1,15 +1,7 @@
1/* 1/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */
2 * linux/fs/nfsd/auth.c
3 *
4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
5 */
6 2
7#include <linux/types.h>
8#include <linux/sched.h> 3#include <linux/sched.h>
9#include <linux/sunrpc/svc.h> 4#include "nfsd.h"
10#include <linux/sunrpc/svcauth.h>
11#include <linux/nfsd/nfsd.h>
12#include <linux/nfsd/export.h>
13#include "auth.h" 5#include "auth.h"
14 6
15int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) 7int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
new file mode 100644
index 000000000000..d892be61016c
--- /dev/null
+++ b/fs/nfsd/cache.h
@@ -0,0 +1,83 @@
1/*
2 * Request reply cache. This was heavily inspired by the
3 * implementation in 4.3BSD/4.4BSD.
4 *
5 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
6 */
7
8#ifndef NFSCACHE_H
9#define NFSCACHE_H
10
11#include <linux/sunrpc/svc.h>
12
13/*
14 * Representation of a reply cache entry.
15 */
16struct svc_cacherep {
17 struct hlist_node c_hash;
18 struct list_head c_lru;
19
20 unsigned char c_state, /* unused, inprog, done */
21 c_type, /* status, buffer */
22 c_secure : 1; /* req came from port < 1024 */
23 struct sockaddr_in c_addr;
24 __be32 c_xid;
25 u32 c_prot;
26 u32 c_proc;
27 u32 c_vers;
28 unsigned long c_timestamp;
29 union {
30 struct kvec u_vec;
31 __be32 u_status;
32 } c_u;
33};
34
35#define c_replvec c_u.u_vec
36#define c_replstat c_u.u_status
37
38/* cache entry states */
39enum {
40 RC_UNUSED,
41 RC_INPROG,
42 RC_DONE
43};
44
45/* return values */
46enum {
47 RC_DROPIT,
48 RC_REPLY,
49 RC_DOIT,
50 RC_INTR
51};
52
53/*
54 * Cache types.
55 * We may want to add more types one day, e.g. for diropres and
56 * attrstat replies. Using cache entries with fixed length instead
57 * of buffer pointers may be more efficient.
58 */
59enum {
60 RC_NOCACHE,
61 RC_REPLSTAT,
62 RC_REPLBUFF,
63};
64
65/*
66 * If requests are retransmitted within this interval, they're dropped.
67 */
68#define RC_DELAY (HZ/5)
69
70int nfsd_reply_cache_init(void);
71void nfsd_reply_cache_shutdown(void);
72int nfsd_cache_lookup(struct svc_rqst *, int);
73void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
74
75#ifdef CONFIG_NFSD_V4
76void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp);
77#else /* CONFIG_NFSD_V4 */
78static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
79{
80}
81#endif /* CONFIG_NFSD_V4 */
82
83#endif /* NFSCACHE_H */
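The header above only declares the reply-cache entry points. A minimal sketch of how a dispatcher is typically expected to drive them follows; this is a hypothetical caller inferred from the declared signatures and return codes, not code from this diff.

/* cachetype is one of RC_NOCACHE, RC_REPLSTAT, RC_REPLBUFF */
static int handle_request(struct svc_rqst *rqstp, int cachetype,
			  __be32 *statp)
{
	switch (nfsd_cache_lookup(rqstp, cachetype)) {
	case RC_DOIT:		/* genuinely new request: execute it */
		break;
	case RC_REPLY:		/* duplicate: cached reply already sent */
		return 1;
	default:		/* RC_DROPIT / RC_INTR: drop silently */
		return 0;
	}

	/* ... run the NFS procedure, encoding its reply after *statp ... */

	/* record the reply so a retransmission hits the cache */
	nfsd_cache_update(rqstp, cachetype, statp + 1);
	return 1;
}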
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c1c9e035d4a4..872a5ef550c7 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,7 +1,5 @@
1#define MSNFS /* HACK HACK */ 1#define MSNFS /* HACK HACK */
2/* 2/*
3 * linux/fs/nfsd/export.c
4 *
5 * NFS exporting and validation. 3 * NFS exporting and validation.
6 * 4 *
7 * We maintain a list of clients, each of which has a list of 5 * We maintain a list of clients, each of which has a list of
@@ -14,29 +12,17 @@
14 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> 12 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
15 */ 13 */
16 14
17#include <linux/unistd.h>
18#include <linux/slab.h> 15#include <linux/slab.h>
19#include <linux/stat.h>
20#include <linux/in.h>
21#include <linux/seq_file.h>
22#include <linux/syscalls.h>
23#include <linux/rwsem.h>
24#include <linux/dcache.h>
25#include <linux/namei.h> 16#include <linux/namei.h>
26#include <linux/mount.h>
27#include <linux/hash.h>
28#include <linux/module.h> 17#include <linux/module.h>
29#include <linux/exportfs.h> 18#include <linux/exportfs.h>
30 19
31#include <linux/sunrpc/svc.h>
32#include <linux/nfsd/nfsd.h>
33#include <linux/nfsd/nfsfh.h>
34#include <linux/nfsd/syscall.h> 20#include <linux/nfsd/syscall.h>
35#include <linux/lockd/bind.h>
36#include <linux/sunrpc/msg_prot.h>
37#include <linux/sunrpc/gss_api.h>
38#include <net/ipv6.h> 21#include <net/ipv6.h>
39 22
23#include "nfsd.h"
24#include "nfsfh.h"
25
40#define NFSDDBG_FACILITY NFSDDBG_EXPORT 26#define NFSDDBG_FACILITY NFSDDBG_EXPORT
41 27
42typedef struct auth_domain svc_client; 28typedef struct auth_domain svc_client;
@@ -369,16 +355,25 @@ static struct svc_export *svc_export_update(struct svc_export *new,
369 struct svc_export *old); 355 struct svc_export *old);
370static struct svc_export *svc_export_lookup(struct svc_export *); 356static struct svc_export *svc_export_lookup(struct svc_export *);
371 357
372static int check_export(struct inode *inode, int flags, unsigned char *uuid) 358static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
373{ 359{
374 360
375 /* We currently export only dirs and regular files. 361 /*
376 * This is what umountd does. 362 * We currently export only dirs, regular files, and (for v4
363 * pseudoroot) symlinks.
377 */ 364 */
378 if (!S_ISDIR(inode->i_mode) && 365 if (!S_ISDIR(inode->i_mode) &&
366 !S_ISLNK(inode->i_mode) &&
379 !S_ISREG(inode->i_mode)) 367 !S_ISREG(inode->i_mode))
380 return -ENOTDIR; 368 return -ENOTDIR;
381 369
370 /*
371 * Mountd should never pass down a writeable V4ROOT export, but,
372 * just to make sure:
373 */
374 if (*flags & NFSEXP_V4ROOT)
375 *flags |= NFSEXP_READONLY;
376
382 /* There are two requirements on a filesystem to be exportable. 377 /* There are two requirements on a filesystem to be exportable.
383 * 1: We must be able to identify the filesystem from a number. 378 * 1: We must be able to identify the filesystem from a number.
384 * either a device number (so FS_REQUIRES_DEV needed) 379 * either a device number (so FS_REQUIRES_DEV needed)
@@ -387,7 +382,7 @@ static int check_export(struct inode *inode, int flags, unsigned char *uuid)
387 * This means that s_export_op must be set. 382 * This means that s_export_op must be set.
388 */ 383 */
389 if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && 384 if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) &&
390 !(flags & NFSEXP_FSID) && 385 !(*flags & NFSEXP_FSID) &&
391 uuid == NULL) { 386 uuid == NULL) {
392 dprintk("exp_export: export of non-dev fs without fsid\n"); 387 dprintk("exp_export: export of non-dev fs without fsid\n");
393 return -EINVAL; 388 return -EINVAL;
@@ -602,7 +597,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
602 goto out4; 597 goto out4;
603 } 598 }
604 599
605 err = check_export(exp.ex_path.dentry->d_inode, exp.ex_flags, 600 err = check_export(exp.ex_path.dentry->d_inode, &exp.ex_flags,
606 exp.ex_uuid); 601 exp.ex_uuid);
607 if (err) 602 if (err)
608 goto out4; 603 goto out4;
@@ -1041,7 +1036,7 @@ exp_export(struct nfsctl_export *nxp)
1041 goto finish; 1036 goto finish;
1042 } 1037 }
1043 1038
1044 err = check_export(path.dentry->d_inode, nxp->ex_flags, NULL); 1039 err = check_export(path.dentry->d_inode, &nxp->ex_flags, NULL);
1045 if (err) goto finish; 1040 if (err) goto finish;
1046 1041
1047 err = -ENOMEM; 1042 err = -ENOMEM;
@@ -1320,6 +1315,15 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
1320 return exp; 1315 return exp;
1321} 1316}
1322 1317
1318static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp)
1319{
1320 u32 fsidv[2];
1321
1322 mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
1323
1324 return rqst_exp_find(rqstp, FSID_NUM, fsidv);
1325}
1326
1323/* 1327/*
1324 * Called when we need the filehandle for the root of the pseudofs, 1328 * Called when we need the filehandle for the root of the pseudofs,
1325 * for a given NFSv4 client. The root is defined to be the 1329 * for a given NFSv4 client. The root is defined to be the
@@ -1330,11 +1334,8 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
1330{ 1334{
1331 struct svc_export *exp; 1335 struct svc_export *exp;
1332 __be32 rv; 1336 __be32 rv;
1333 u32 fsidv[2];
1334
1335 mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
1336 1337
1337 exp = rqst_exp_find(rqstp, FSID_NUM, fsidv); 1338 exp = find_fsidzero_export(rqstp);
1338 if (IS_ERR(exp)) 1339 if (IS_ERR(exp))
1339 return nfserrno(PTR_ERR(exp)); 1340 return nfserrno(PTR_ERR(exp));
1340 rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); 1341 rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
@@ -1425,6 +1426,7 @@ static struct flags {
1425 { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, 1426 { NFSEXP_CROSSMOUNT, {"crossmnt", ""}},
1426 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1427 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
1427 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, 1428 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
1429 { NFSEXP_V4ROOT, {"v4root", ""}},
1428#ifdef MSNFS 1430#ifdef MSNFS
1429 { NFSEXP_MSNFS, {"msnfs", ""}}, 1431 { NFSEXP_MSNFS, {"msnfs", ""}},
1430#endif 1432#endif
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index b2786a5f9afe..0c6d81670137 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/lockd.c
3 *
4 * This file contains all the stubs needed when communicating with lockd. 2 * This file contains all the stubs needed when communicating with lockd.
5 * This level of indirection is necessary so we can run nfsd+lockd without 3 * This level of indirection is necessary so we can run nfsd+lockd without
6 * requiring the nfs client to be compiled in/loaded, and vice versa. 4 * requiring the nfs client to be compiled in/loaded, and vice versa.
@@ -8,14 +6,10 @@
8 * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> 6 * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
9 */ 7 */
10 8
11#include <linux/types.h>
12#include <linux/fs.h>
13#include <linux/file.h> 9#include <linux/file.h>
14#include <linux/mount.h>
15#include <linux/sunrpc/clnt.h>
16#include <linux/sunrpc/svc.h>
17#include <linux/nfsd/nfsd.h>
18#include <linux/lockd/bind.h> 10#include <linux/lockd/bind.h>
11#include "nfsd.h"
12#include "vfs.h"
19 13
20#define NFSDDBG_FACILITY NFSDDBG_LOCKD 14#define NFSDDBG_FACILITY NFSDDBG_LOCKD
21 15
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 4e3219e84116..6aa5590c3679 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -1,19 +1,16 @@
1/* 1/*
2 * linux/fs/nfsd/nfs2acl.c
3 *
4 * Process version 2 NFSACL requests. 2 * Process version 2 NFSACL requests.
5 * 3 *
6 * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> 4 * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
7 */ 5 */
8 6
9#include <linux/sunrpc/svc.h> 7#include "nfsd.h"
10#include <linux/nfs.h> 8/* FIXME: nfsacl.h is a broken header */
11#include <linux/nfsd/nfsd.h>
12#include <linux/nfsd/cache.h>
13#include <linux/nfsd/xdr.h>
14#include <linux/nfsd/xdr3.h>
15#include <linux/posix_acl.h>
16#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
11#include "cache.h"
12#include "xdr3.h"
13#include "vfs.h"
17 14
18#define NFSDDBG_FACILITY NFSDDBG_PROC 15#define NFSDDBG_FACILITY NFSDDBG_PROC
19#define RETURN_STATUS(st) { resp->status = (st); return (st); } 16#define RETURN_STATUS(st) { resp->status = (st); return (st); }
@@ -217,6 +214,16 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
217 * XDR encode functions 214 * XDR encode functions
218 */ 215 */
219 216
217/*
218 * There must be an encoding function for void results so svc_process
219 * will work properly.
220 */
221int
222nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
223{
224 return xdr_ressize_check(rqstp, p);
225}
226
220/* GETACL */ 227/* GETACL */
221static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, 228static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
222 struct nfsd3_getaclres *resp) 229 struct nfsd3_getaclres *resp)
@@ -308,7 +315,6 @@ static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p,
308} 315}
309 316
310#define nfsaclsvc_decode_voidargs NULL 317#define nfsaclsvc_decode_voidargs NULL
311#define nfsaclsvc_encode_voidres NULL
312#define nfsaclsvc_release_void NULL 318#define nfsaclsvc_release_void NULL
313#define nfsd3_fhandleargs nfsd_fhandle 319#define nfsd3_fhandleargs nfsd_fhandle
314#define nfsd3_attrstatres nfsd_attrstat 320#define nfsd3_attrstatres nfsd_attrstat
@@ -346,5 +352,5 @@ struct svc_version nfsd_acl_version2 = {
346 .vs_proc = nfsd_acl_procedures2, 352 .vs_proc = nfsd_acl_procedures2,
347 .vs_dispatch = nfsd_dispatch, 353 .vs_dispatch = nfsd_dispatch,
348 .vs_xdrsize = NFS3_SVC_XDRSIZE, 354 .vs_xdrsize = NFS3_SVC_XDRSIZE,
349 .vs_hidden = 1, 355 .vs_hidden = 0,
350}; 356};
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 9981dbb377a3..a596e9d987e4 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -1,18 +1,16 @@
1/* 1/*
2 * linux/fs/nfsd/nfs3acl.c
3 *
4 * Process version 3 NFSACL requests. 2 * Process version 3 NFSACL requests.
5 * 3 *
6 * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> 4 * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
7 */ 5 */
8 6
9#include <linux/sunrpc/svc.h> 7#include "nfsd.h"
10#include <linux/nfs3.h> 8/* FIXME: nfsacl.h is a broken header */
11#include <linux/nfsd/nfsd.h>
12#include <linux/nfsd/cache.h>
13#include <linux/nfsd/xdr3.h>
14#include <linux/posix_acl.h>
15#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
11#include "cache.h"
12#include "xdr3.h"
13#include "vfs.h"
16 14
17#define RETURN_STATUS(st) { resp->status = (st); return (st); } 15#define RETURN_STATUS(st) { resp->status = (st); return (st); }
18 16
@@ -264,6 +262,6 @@ struct svc_version nfsd_acl_version3 = {
264 .vs_proc = nfsd_acl_procedures3, 262 .vs_proc = nfsd_acl_procedures3,
265 .vs_dispatch = nfsd_dispatch, 263 .vs_dispatch = nfsd_dispatch,
266 .vs_xdrsize = NFS3_SVC_XDRSIZE, 264 .vs_xdrsize = NFS3_SVC_XDRSIZE,
267 .vs_hidden = 1, 265 .vs_hidden = 0,
268}; 266};
269 267
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index a713c418a922..3d68f45a37b9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -1,30 +1,16 @@
1/* 1/*
2 * linux/fs/nfsd/nfs3proc.c
3 *
4 * Process version 3 NFS requests. 2 * Process version 3 NFS requests.
5 * 3 *
6 * Copyright (C) 1996, 1997, 1998 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1996, 1997, 1998 Olaf Kirch <okir@monad.swb.de>
7 */ 5 */
8 6
9#include <linux/linkage.h>
10#include <linux/time.h>
11#include <linux/errno.h>
12#include <linux/fs.h> 7#include <linux/fs.h>
13#include <linux/ext2_fs.h> 8#include <linux/ext2_fs.h>
14#include <linux/stat.h>
15#include <linux/fcntl.h>
16#include <linux/net.h>
17#include <linux/in.h>
18#include <linux/unistd.h>
19#include <linux/slab.h>
20#include <linux/major.h>
21#include <linux/magic.h> 9#include <linux/magic.h>
22 10
23#include <linux/sunrpc/svc.h> 11#include "cache.h"
24#include <linux/nfsd/nfsd.h> 12#include "xdr3.h"
25#include <linux/nfsd/cache.h> 13#include "vfs.h"
26#include <linux/nfsd/xdr3.h>
27#include <linux/nfs3.h>
28 14
29#define NFSDDBG_FACILITY NFSDDBG_PROC 15#define NFSDDBG_FACILITY NFSDDBG_PROC
30 16
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index d0a2ce1b4324..2a533a0af2a9 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/nfs3xdr.c
3 *
4 * XDR support for nfsd/protocol version 3. 2 * XDR support for nfsd/protocol version 3.
5 * 3 *
6 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
@@ -8,19 +6,8 @@
8 * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()! 6 * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()!
9 */ 7 */
10 8
11#include <linux/types.h>
12#include <linux/time.h>
13#include <linux/nfs3.h>
14#include <linux/list.h>
15#include <linux/spinlock.h>
16#include <linux/dcache.h>
17#include <linux/namei.h> 9#include <linux/namei.h>
18#include <linux/mm.h> 10#include "xdr3.h"
19#include <linux/vfs.h>
20#include <linux/sunrpc/xdr.h>
21#include <linux/sunrpc/svc.h>
22#include <linux/nfsd/nfsd.h>
23#include <linux/nfsd/xdr3.h>
24#include "auth.h" 11#include "auth.h"
25 12
26#define NFSDDBG_FACILITY NFSDDBG_XDR 13#define NFSDDBG_FACILITY NFSDDBG_XDR
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 725d02f210e2..e48052615159 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * fs/nfs4acl/acl.c
3 *
4 * Common NFSv4 ACL handling code. 2 * Common NFSv4 ACL handling code.
5 * 3 *
6 * Copyright (c) 2002, 2003 The Regents of the University of Michigan. 4 * Copyright (c) 2002, 2003 The Regents of the University of Michigan.
@@ -36,15 +34,8 @@
36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 */ 35 */
38 36
39#include <linux/string.h>
40#include <linux/slab.h> 37#include <linux/slab.h>
41#include <linux/list.h>
42#include <linux/types.h>
43#include <linux/fs.h>
44#include <linux/module.h>
45#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
46#include <linux/posix_acl.h>
47#include <linux/nfs4.h>
48#include <linux/nfs4_acl.h> 39#include <linux/nfs4_acl.h>
49 40
50 41
@@ -389,7 +380,7 @@ sort_pacl(struct posix_acl *pacl)
389 sort_pacl_range(pacl, 1, i-1); 380 sort_pacl_range(pacl, 1, i-1);
390 381
391 BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ); 382 BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ);
392 j = i++; 383 j = ++i;
393 while (pacl->a_entries[j].e_tag == ACL_GROUP) 384 while (pacl->a_entries[j].e_tag == ACL_GROUP)
394 j++; 385 j++;
395 sort_pacl_range(pacl, i, j-1); 386 sort_pacl_range(pacl, i, j-1);
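The one-character change above fixes an off-by-one. With i indexing the ACL_GROUP_OBJ entry before the statement:

/*
 * old:	j = i++;   =>  j == i_old, i == i_old + 1
 *	The ACL_GROUP scan starts at the ACL_GROUP_OBJ entry itself,
 *	whose tag is not ACL_GROUP, so the while loop never advances j;
 *	sort_pacl_range(pacl, i, j - 1) then gets an inverted range and
 *	the ACL_GROUP entries are never sorted.
 *
 * new:	j = ++i;   =>  j == i == i_old + 1
 *	The scan starts at the first candidate ACL_GROUP entry, j runs
 *	past the end of that run, and [i, j - 1] covers exactly the
 *	entries that need sorting.
 */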
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 24e8d78f8dde..7e32bd394e86 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/nfs4callback.c
3 *
4 * Copyright (c) 2001 The Regents of the University of Michigan. 2 * Copyright (c) 2001 The Regents of the University of Michigan.
5 * All rights reserved. 3 * All rights reserved.
6 * 4 *
@@ -33,22 +31,10 @@
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */ 32 */
35 33
36#include <linux/module.h>
37#include <linux/list.h>
38#include <linux/inet.h>
39#include <linux/errno.h>
40#include <linux/delay.h>
41#include <linux/sched.h>
42#include <linux/kthread.h>
43#include <linux/sunrpc/xdr.h>
44#include <linux/sunrpc/svc.h>
45#include <linux/sunrpc/clnt.h> 34#include <linux/sunrpc/clnt.h>
46#include <linux/sunrpc/svcsock.h> 35#include <linux/slab.h>
47#include <linux/nfsd/nfsd.h> 36#include "nfsd.h"
48#include <linux/nfsd/state.h> 37#include "state.h"
49#include <linux/sunrpc/sched.h>
50#include <linux/nfs4.h>
51#include <linux/sunrpc/xprtsock.h>
52 38
53#define NFSDDBG_FACILITY NFSDDBG_PROC 39#define NFSDDBG_FACILITY NFSDDBG_PROC
54 40
@@ -540,6 +526,8 @@ static struct rpc_cred *callback_cred;
540 526
541int set_callback_cred(void) 527int set_callback_cred(void)
542{ 528{
529 if (callback_cred)
530 return 0;
543 callback_cred = rpc_lookup_machine_cred(); 531 callback_cred = rpc_lookup_machine_cred();
544 if (!callback_cred) 532 if (!callback_cred)
545 return -ENOMEM; 533 return -ENOMEM;
@@ -557,7 +545,8 @@ void do_probe_callback(struct nfs4_client *clp)
557 }; 545 };
558 int status; 546 int status;
559 547
560 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT, 548 status = rpc_call_async(cb->cb_client, &msg,
549 RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
561 &nfsd4_cb_probe_ops, (void *)clp); 550 &nfsd4_cb_probe_ops, (void *)clp);
562 if (status) { 551 if (status) {
563 warn_no_callback_path(clp, status); 552 warn_no_callback_path(clp, status);
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index ba2c199592fd..c78dbf493424 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * fs/nfsd/nfs4idmap.c
3 *
4 * Mapping of UID/GIDs to name and vice versa. 2 * Mapping of UID/GIDs to name and vice versa.
5 * 3 *
6 * Copyright (c) 2002, 2003 The Regents of the University of 4 * Copyright (c) 2002, 2003 The Regents of the University of
@@ -35,22 +33,10 @@
35 */ 33 */
36 34
37#include <linux/module.h> 35#include <linux/module.h>
38#include <linux/init.h>
39
40#include <linux/mm.h>
41#include <linux/errno.h>
42#include <linux/string.h>
43#include <linux/sunrpc/clnt.h>
44#include <linux/nfs.h>
45#include <linux/nfs4.h>
46#include <linux/nfs_fs.h>
47#include <linux/nfs_page.h>
48#include <linux/sunrpc/cache.h>
49#include <linux/nfsd_idmap.h> 36#include <linux/nfsd_idmap.h>
50#include <linux/list.h>
51#include <linux/time.h>
52#include <linux/seq_file.h> 37#include <linux/seq_file.h>
53#include <linux/sunrpc/svcauth.h> 38#include <linux/sched.h>
39#include <linux/slab.h>
54 40
55/* 41/*
56 * Cache entry 42 * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index bebc0c2e1b0a..2ab9e8501bfe 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * fs/nfsd/nfs4proc.c
3 *
4 * Server-side procedures for NFSv4. 2 * Server-side procedures for NFSv4.
5 * 3 *
6 * Copyright (c) 2002 The Regents of the University of Michigan. 4 * Copyright (c) 2002 The Regents of the University of Michigan.
@@ -34,20 +32,12 @@
34 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 */ 34 */
37
38#include <linux/param.h>
39#include <linux/major.h>
40#include <linux/slab.h>
41#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/slab.h>
42 37
43#include <linux/sunrpc/svc.h> 38#include "cache.h"
44#include <linux/nfsd/nfsd.h> 39#include "xdr4.h"
45#include <linux/nfsd/cache.h> 40#include "vfs.h"
46#include <linux/nfs4.h>
47#include <linux/nfsd/state.h>
48#include <linux/nfsd/xdr4.h>
49#include <linux/nfs4_acl.h>
50#include <linux/sunrpc/gss_api.h>
51 41
52#define NFSDDBG_FACILITY NFSDDBG_PROC 42#define NFSDDBG_FACILITY NFSDDBG_PROC
53 43
@@ -170,7 +160,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
170 accmode |= NFSD_MAY_READ; 160 accmode |= NFSD_MAY_READ;
171 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) 161 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
172 accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC); 162 accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC);
173 if (open->op_share_deny & NFS4_SHARE_DENY_WRITE) 163 if (open->op_share_deny & NFS4_SHARE_DENY_READ)
174 accmode |= NFSD_MAY_WRITE; 164 accmode |= NFSD_MAY_WRITE;
175 165
176 status = fh_verify(rqstp, current_fh, S_IFREG, accmode); 166 status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index b5348405046b..7a9ae3254a4b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1,6 +1,4 @@
1/* 1/*
2* linux/fs/nfsd/nfs4recover.c
3*
4* Copyright (c) 2004 The Regents of the University of Michigan. 2* Copyright (c) 2004 The Regents of the University of Michigan.
5* All rights reserved. 3* All rights reserved.
6* 4*
@@ -33,20 +31,15 @@
33* 31*
34*/ 32*/
35 33
36#include <linux/err.h>
37#include <linux/sunrpc/svc.h>
38#include <linux/nfsd/nfsd.h>
39#include <linux/nfs4.h>
40#include <linux/nfsd/state.h>
41#include <linux/nfsd/xdr4.h>
42#include <linux/param.h>
43#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/slab.h>
44#include <linux/namei.h> 36#include <linux/namei.h>
45#include <asm/uaccess.h>
46#include <linux/scatterlist.h>
47#include <linux/crypto.h> 37#include <linux/crypto.h>
48#include <linux/sched.h> 38#include <linux/sched.h>
49#include <linux/mount.h> 39
40#include "nfsd.h"
41#include "state.h"
42#include "vfs.h"
50 43
51#define NFSDDBG_FACILITY NFSDDBG_PROC 44#define NFSDDBG_FACILITY NFSDDBG_PROC
52 45
@@ -127,9 +120,7 @@ out_no_tfm:
127static void 120static void
128nfsd4_sync_rec_dir(void) 121nfsd4_sync_rec_dir(void)
129{ 122{
130 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 123 vfs_fsync(NULL, rec_dir.dentry, 0);
131 nfsd_sync_dir(rec_dir.dentry);
132 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
133} 124}
134 125
135int 126int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2153f9bdbebd..6a8fedaa4f55 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1,6 +1,4 @@
1/* 1/*
2* linux/fs/nfsd/nfs4state.c
3*
4* Copyright (c) 2001 The Regents of the University of Michigan. 2* Copyright (c) 2001 The Regents of the University of Michigan.
5* All rights reserved. 3* All rights reserved.
6* 4*
@@ -34,28 +32,15 @@
34* 32*
35*/ 33*/
36 34
37#include <linux/param.h>
38#include <linux/major.h>
39#include <linux/slab.h>
40
41#include <linux/sunrpc/svc.h>
42#include <linux/nfsd/nfsd.h>
43#include <linux/nfsd/cache.h>
44#include <linux/file.h> 35#include <linux/file.h>
45#include <linux/mount.h>
46#include <linux/workqueue.h>
47#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
48#include <linux/kthread.h> 37#include <linux/slab.h>
49#include <linux/nfs4.h>
50#include <linux/nfsd/state.h>
51#include <linux/nfsd/xdr4.h>
52#include <linux/namei.h> 38#include <linux/namei.h>
53#include <linux/swap.h> 39#include <linux/swap.h>
54#include <linux/mutex.h>
55#include <linux/lockd/bind.h>
56#include <linux/module.h>
57#include <linux/sunrpc/svcauth_gss.h> 40#include <linux/sunrpc/svcauth_gss.h>
58#include <linux/sunrpc/clnt.h> 41#include <linux/sunrpc/clnt.h>
42#include "xdr4.h"
43#include "vfs.h"
59 44
60#define NFSDDBG_FACILITY NFSDDBG_PROC 45#define NFSDDBG_FACILITY NFSDDBG_PROC
61 46
@@ -477,13 +462,14 @@ static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
477 462
478/* 463/*
479 * fchan holds the client values on input, and the server values on output 464 * fchan holds the client values on input, and the server values on output
465 * sv_max_mesg is the maximum payload plus one page for overhead.
480 */ 466 */
481static int init_forechannel_attrs(struct svc_rqst *rqstp, 467static int init_forechannel_attrs(struct svc_rqst *rqstp,
482 struct nfsd4_channel_attrs *session_fchan, 468 struct nfsd4_channel_attrs *session_fchan,
483 struct nfsd4_channel_attrs *fchan) 469 struct nfsd4_channel_attrs *fchan)
484{ 470{
485 int status = 0; 471 int status = 0;
486 __u32 maxcount = svc_max_payload(rqstp); 472 __u32 maxcount = nfsd_serv->sv_max_mesg;
487 473
488 /* headerpadsz set to zero in encode routine */ 474 /* headerpadsz set to zero in encode routine */
489 475
@@ -523,6 +509,15 @@ free_session_slots(struct nfsd4_session *ses)
523 kfree(ses->se_slots[i]); 509 kfree(ses->se_slots[i]);
524} 510}
525 511
512/*
513 * We don't actually need to cache the rpc and session headers, so we
514 * can allocate a little less for each slot:
515 */
516static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
517{
518 return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
519}
520
526static int 521static int
527alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, 522alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
528 struct nfsd4_create_session *cses) 523 struct nfsd4_create_session *cses)
@@ -554,7 +549,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
554 memcpy(new, &tmp, sizeof(*new)); 549 memcpy(new, &tmp, sizeof(*new));
555 550
556 /* allocate each struct nfsd4_slot and data cache in one piece */ 551 /* allocate each struct nfsd4_slot and data cache in one piece */
557 cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; 552 cachesize = slot_bytes(&new->se_fchannel);
558 for (i = 0; i < new->se_fchannel.maxreqs; i++) { 553 for (i = 0; i < new->se_fchannel.maxreqs; i++) {
559 sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL); 554 sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
560 if (!sp) 555 if (!sp)
@@ -628,10 +623,12 @@ void
628free_session(struct kref *kref) 623free_session(struct kref *kref)
629{ 624{
630 struct nfsd4_session *ses; 625 struct nfsd4_session *ses;
626 int mem;
631 627
632 ses = container_of(kref, struct nfsd4_session, se_ref); 628 ses = container_of(kref, struct nfsd4_session, se_ref);
633 spin_lock(&nfsd_drc_lock); 629 spin_lock(&nfsd_drc_lock);
634 nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE; 630 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
631 nfsd_drc_mem_used -= mem;
635 spin_unlock(&nfsd_drc_lock); 632 spin_unlock(&nfsd_drc_lock);
636 free_session_slots(ses); 633 free_session_slots(ses);
637 kfree(ses); 634 kfree(ses);
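slot_bytes() gives slot allocation and the global DRC accounting a single source of truth. The invariant, as visible in the two hunks above (the reservation side is presumably updated the same way elsewhere in the patch, outside this excerpt):

/*
 * Each slot is allocated as
 *	kzalloc(sizeof(struct nfsd4_slot) + slot_bytes(&fchan), GFP_KERNEL)
 * and free_session() releases
 *	fchan.maxreqs * slot_bytes(&fchan)
 * from nfsd_drc_mem_used, where slot_bytes() excludes the
 * NFSD_MIN_HDR_SEQ_SZ bytes of RPC and SEQUENCE header that are
 * never cached.
 */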
@@ -2002,7 +1999,9 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access)
2002{ 1999{
2003 if (share_access & NFS4_SHARE_ACCESS_WRITE) { 2000 if (share_access & NFS4_SHARE_ACCESS_WRITE) {
2004 drop_file_write_access(filp); 2001 drop_file_write_access(filp);
2002 spin_lock(&filp->f_lock);
2005 filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; 2003 filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
2004 spin_unlock(&filp->f_lock);
2006 } 2005 }
2007} 2006}
2008 2007
@@ -2404,11 +2403,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2404 2403
2405 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); 2404 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
2406 2405
2407 dprintk("NFSD: delegation stateid=(%08x/%08x/%08x/%08x)\n\n", 2406 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
2408 dp->dl_stateid.si_boot, 2407 STATEID_VAL(&dp->dl_stateid));
2409 dp->dl_stateid.si_stateownerid,
2410 dp->dl_stateid.si_fileid,
2411 dp->dl_stateid.si_generation);
2412out: 2408out:
2413 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS 2409 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS
2414 && flag == NFS4_OPEN_DELEGATE_NONE 2410 && flag == NFS4_OPEN_DELEGATE_NONE
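STATEID_FMT and STATEID_VAL are not defined anywhere in this excerpt; judging by the four %08x fields they replace throughout these hunks, the new private header presumably carries something like:

/* Assumed definitions (not shown in this diff): */
#define STATEID_FMT	"(%08x/%08x/%08x/%08x)"
#define STATEID_VAL(s)	(s)->si_boot, (s)->si_stateownerid, \
			(s)->si_fileid, (s)->si_generation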
@@ -2487,8 +2483,10 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2487 } 2483 }
2488 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2484 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
2489 2485
2490 if (nfsd4_has_session(&resp->cstate)) 2486 if (nfsd4_has_session(&resp->cstate)) {
2491 open->op_stateowner->so_confirmed = 1; 2487 open->op_stateowner->so_confirmed = 1;
2488 nfsd4_create_clid_dir(open->op_stateowner->so_client);
2489 }
2492 2490
2493 /* 2491 /*
2494 * Attempt to hand out a delegation. No error return, because the 2492 * Attempt to hand out a delegation. No error return, because the
@@ -2498,9 +2496,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2498 2496
2499 status = nfs_ok; 2497 status = nfs_ok;
2500 2498
2501 dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n", 2499 dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
2502 stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid, 2500 STATEID_VAL(&stp->st_stateid));
2503 stp->st_stateid.si_fileid, stp->st_stateid.si_generation);
2504out: 2501out:
2505 if (fp) 2502 if (fp)
2506 put_nfs4_file(fp); 2503 put_nfs4_file(fp);
@@ -2666,9 +2663,8 @@ STALE_STATEID(stateid_t *stateid)
2666{ 2663{
2667 if (time_after((unsigned long)boot_time, 2664 if (time_after((unsigned long)boot_time,
2668 (unsigned long)stateid->si_boot)) { 2665 (unsigned long)stateid->si_boot)) {
2669 dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n", 2666 dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
2670 stateid->si_boot, stateid->si_stateownerid, 2667 STATEID_VAL(stateid));
2671 stateid->si_fileid, stateid->si_generation);
2672 return 1; 2668 return 1;
2673 } 2669 }
2674 return 0; 2670 return 0;
@@ -2680,9 +2676,8 @@ EXPIRED_STATEID(stateid_t *stateid)
2680 if (time_before((unsigned long)boot_time, 2676 if (time_before((unsigned long)boot_time,
2681 ((unsigned long)stateid->si_boot)) && 2677 ((unsigned long)stateid->si_boot)) &&
2682 time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) { 2678 time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
2683 dprintk("NFSD: expired stateid (%08x/%08x/%08x/%08x)!\n", 2679 dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
2684 stateid->si_boot, stateid->si_stateownerid, 2680 STATEID_VAL(stateid));
2685 stateid->si_fileid, stateid->si_generation);
2686 return 1; 2681 return 1;
2687 } 2682 }
2688 return 0; 2683 return 0;
@@ -2696,9 +2691,8 @@ stateid_error_map(stateid_t *stateid)
2696 if (EXPIRED_STATEID(stateid)) 2691 if (EXPIRED_STATEID(stateid))
2697 return nfserr_expired; 2692 return nfserr_expired;
2698 2693
2699 dprintk("NFSD: bad stateid (%08x/%08x/%08x/%08x)!\n", 2694 dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
2700 stateid->si_boot, stateid->si_stateownerid, 2695 STATEID_VAL(stateid));
2701 stateid->si_fileid, stateid->si_generation);
2702 return nfserr_bad_stateid; 2696 return nfserr_bad_stateid;
2703} 2697}
2704 2698
@@ -2884,10 +2878,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2884 struct svc_fh *current_fh = &cstate->current_fh; 2878 struct svc_fh *current_fh = &cstate->current_fh;
2885 __be32 status; 2879 __be32 status;
2886 2880
2887 dprintk("NFSD: preprocess_seqid_op: seqid=%d " 2881 dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
2888 "stateid = (%08x/%08x/%08x/%08x)\n", seqid, 2882 seqid, STATEID_VAL(stateid));
2889 stateid->si_boot, stateid->si_stateownerid, stateid->si_fileid,
2890 stateid->si_generation);
2891 2883
2892 *stpp = NULL; 2884 *stpp = NULL;
2893 *sopp = NULL; 2885 *sopp = NULL;
@@ -3019,12 +3011,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3019 sop->so_confirmed = 1; 3011 sop->so_confirmed = 1;
3020 update_stateid(&stp->st_stateid); 3012 update_stateid(&stp->st_stateid);
3021 memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); 3013 memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t));
3022 dprintk("NFSD: nfsd4_open_confirm: success, seqid=%d " 3014 dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
3023 "stateid=(%08x/%08x/%08x/%08x)\n", oc->oc_seqid, 3015 __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid));
3024 stp->st_stateid.si_boot,
3025 stp->st_stateid.si_stateownerid,
3026 stp->st_stateid.si_fileid,
3027 stp->st_stateid.si_generation);
3028 3016
3029 nfsd4_create_clid_dir(sop->so_client); 3017 nfsd4_create_clid_dir(sop->so_client);
3030out: 3018out:
@@ -3283,9 +3271,8 @@ find_delegation_stateid(struct inode *ino, stateid_t *stid)
3283 struct nfs4_file *fp; 3271 struct nfs4_file *fp;
3284 struct nfs4_delegation *dl; 3272 struct nfs4_delegation *dl;
3285 3273
3286 dprintk("NFSD:find_delegation_stateid stateid=(%08x/%08x/%08x/%08x)\n", 3274 dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__,
3287 stid->si_boot, stid->si_stateownerid, 3275 STATEID_VAL(stid));
3288 stid->si_fileid, stid->si_generation);
3289 3276
3290 fp = find_file(ino); 3277 fp = find_file(ino);
3291 if (!fp) 3278 if (!fp)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0fbd50cee1f6..e1703175ee28 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,24 +40,17 @@
40 * at the end of nfs4svc_decode_compoundargs. 40 * at the end of nfs4svc_decode_compoundargs.
41 */ 41 */
42 42
43#include <linux/param.h> 43#include <linux/slab.h>
44#include <linux/smp.h>
45#include <linux/fs.h>
46#include <linux/namei.h> 44#include <linux/namei.h>
47#include <linux/vfs.h> 45#include <linux/statfs.h>
48#include <linux/utsname.h> 46#include <linux/utsname.h>
49#include <linux/sunrpc/xdr.h>
50#include <linux/sunrpc/svc.h>
51#include <linux/sunrpc/clnt.h>
52#include <linux/nfsd/nfsd.h>
53#include <linux/nfsd/state.h>
54#include <linux/nfsd/xdr4.h>
55#include <linux/nfsd_idmap.h> 47#include <linux/nfsd_idmap.h>
56#include <linux/nfs4.h>
57#include <linux/nfs4_acl.h> 48#include <linux/nfs4_acl.h>
58#include <linux/sunrpc/gss_api.h>
59#include <linux/sunrpc/svcauth_gss.h> 49#include <linux/sunrpc/svcauth_gss.h>
60 50
51#include "xdr4.h"
52#include "vfs.h"
53
61#define NFSDDBG_FACILITY NFSDDBG_XDR 54#define NFSDDBG_FACILITY NFSDDBG_XDR
62 55
63/* 56/*
@@ -1442,7 +1435,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1442 } 1435 }
1443 op->opnum = ntohl(*argp->p++); 1436 op->opnum = ntohl(*argp->p++);
1444 1437
1445 if (op->opnum >= OP_ACCESS && op->opnum < ops->nops) 1438 if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
1446 op->status = ops->decoders[op->opnum](argp, &op->u); 1439 op->status = ops->decoders[op->opnum](argp, &op->u);
1447 else { 1440 else {
1448 op->opnum = OP_ILLEGAL; 1441 op->opnum = OP_ILLEGAL;
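On the opnum check above: the bound is now the protocol's own opcode range rather than the nops count of the per-minorversion decoder table.

/*
 * Assumed constants (from the NFSv4 opcode enum in nfs4.h):
 *	FIRST_NFS4_OP	OP_ACCESS, the lowest COMPOUND opcode
 *	LAST_NFS4_OP	the highest opcode this server knows about
 * Anything outside [FIRST_NFS4_OP, LAST_NFS4_OP] is rewritten to
 * OP_ILLEGAL before decoders[] is indexed.
 */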
@@ -1536,7 +1529,7 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
1536 } } while (0); 1529 } } while (0);
1537 1530
1538/* Encode as an array of strings the string given with components 1531/* Encode as an array of strings the string given with components
1539 * seperated @sep. 1532 * separated @sep.
1540 */ 1533 */
1541static __be32 nfsd4_encode_components(char sep, char *components, 1534static __be32 nfsd4_encode_components(char sep, char *components,
1542 __be32 **pp, int *buflen) 1535 __be32 **pp, int *buflen)
@@ -2129,9 +2122,15 @@ out_acl:
2129 * and this is the root of a cross-mounted filesystem. 2122 * and this is the root of a cross-mounted filesystem.
2130 */ 2123 */
2131 if (ignore_crossmnt == 0 && 2124 if (ignore_crossmnt == 0 &&
2132 exp->ex_path.mnt->mnt_root->d_inode == dentry->d_inode) { 2125 dentry == exp->ex_path.mnt->mnt_root) {
2133 err = vfs_getattr(exp->ex_path.mnt->mnt_parent, 2126 struct path path = exp->ex_path;
2134 exp->ex_path.mnt->mnt_mountpoint, &stat); 2127 path_get(&path);
2128 while (follow_up(&path)) {
2129 if (path.dentry != path.mnt->mnt_root)
2130 break;
2131 }
2132 err = vfs_getattr(path.mnt, path.dentry, &stat);
2133 path_put(&path);
2135 if (err) 2134 if (err)
2136 goto out_nfserr; 2135 goto out_nfserr;
2137 } 2136 }
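The replacement for the removed mnt_parent/mnt_mountpoint dereference climbs the mount stack through the public API: follow_up() moves a struct path from a mount's root to the mountpoint covering it in the parent mount, and the loop keeps climbing while that mountpoint is itself a mount root (stacked mounts).

/*
 * Worked example (assumed layout): mount B stacked on top of
 * mount A, which covers /export. Starting at B's root:
 *
 *	follow_up(&path)  ->  A's root    (a mnt_root: keep climbing)
 *	follow_up(&path)  ->  /export     (not a mnt_root: stop)
 *
 * vfs_getattr() then reports the covering mountpoint's attributes
 * without poking at mnt_parent/mnt_mountpoint directly.
 */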
@@ -2204,11 +2203,14 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
2204 * we will not follow the cross mount and will fill the attributes 2203
2205 * directly from the mountpoint dentry. 2204 * directly from the mountpoint dentry.
2206 */ 2205 */
2207 if (d_mountpoint(dentry) && !attributes_need_mount(cd->rd_bmval)) 2206 if (nfsd_mountpoint(dentry, exp)) {
2208 ignore_crossmnt = 1;
2209 else if (d_mountpoint(dentry)) {
2210 int err; 2207 int err;
2211 2208
2209 if (!(exp->ex_flags & NFSEXP_V4ROOT)
2210 && !attributes_need_mount(cd->rd_bmval)) {
2211 ignore_crossmnt = 1;
2212 goto out_encode;
2213 }
2212 /* 2214 /*
2213 * Why the heck aren't we just using nfsd_lookup?? 2215 * Why the heck aren't we just using nfsd_lookup??
2214 * Different "."/".." handling? Something else? 2216 * Different "."/".." handling? Something else?
@@ -2224,6 +2226,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
2224 goto out_put; 2226 goto out_put;
2225 2227
2226 } 2228 }
2229out_encode:
2227 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, 2230 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
2228 cd->rd_rqstp, ignore_crossmnt); 2231 cd->rd_rqstp, ignore_crossmnt);
2229out_put: 2232out_put:
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 4638635c5d87..4666a209678a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/nfscache.c
3 *
4 * Request reply cache. This is currently a global cache, but this may 2 * Request reply cache. This is currently a global cache, but this may
5 * change in the future and be a per-client cache. 3 * change in the future and be a per-client cache.
6 * 4 *
@@ -10,16 +8,10 @@
10 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 8 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
11 */ 9 */
12 10
13#include <linux/kernel.h>
14#include <linux/time.h>
15#include <linux/slab.h> 11#include <linux/slab.h>
16#include <linux/string.h>
17#include <linux/spinlock.h>
18#include <linux/list.h>
19 12
20#include <linux/sunrpc/svc.h> 13#include "nfsd.h"
21#include <linux/nfsd/nfsd.h> 14#include "cache.h"
22#include <linux/nfsd/cache.h>
23 15
24/* Size of reply cache. Common values are: 16/* Size of reply cache. Common values are:
25 * 4.3BSD: 128 17 * 4.3BSD: 128
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5c01fc148ce8..e3591073098f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1,46 +1,21 @@
1/* 1/*
2 * linux/fs/nfsd/nfsctl.c
3 *
4 * Syscall interface to knfsd. 2 * Syscall interface to knfsd.
5 * 3 *
6 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
7 */ 5 */
8 6
9#include <linux/module.h>
10
11#include <linux/linkage.h>
12#include <linux/time.h>
13#include <linux/errno.h>
14#include <linux/fs.h>
15#include <linux/namei.h>
16#include <linux/fcntl.h>
17#include <linux/net.h>
18#include <linux/in.h>
19#include <linux/syscalls.h>
20#include <linux/unistd.h>
21#include <linux/slab.h> 7#include <linux/slab.h>
22#include <linux/proc_fs.h> 8#include <linux/namei.h>
23#include <linux/seq_file.h>
24#include <linux/pagemap.h>
25#include <linux/init.h>
26#include <linux/inet.h>
27#include <linux/string.h>
28#include <linux/ctype.h> 9#include <linux/ctype.h>
29 10
30#include <linux/nfs.h>
31#include <linux/nfsd_idmap.h> 11#include <linux/nfsd_idmap.h>
32#include <linux/lockd/bind.h>
33#include <linux/sunrpc/svc.h>
34#include <linux/sunrpc/svcsock.h> 12#include <linux/sunrpc/svcsock.h>
35#include <linux/nfsd/nfsd.h>
36#include <linux/nfsd/cache.h>
37#include <linux/nfsd/xdr.h>
38#include <linux/nfsd/syscall.h> 13#include <linux/nfsd/syscall.h>
39#include <linux/lockd/lockd.h> 14#include <linux/lockd/lockd.h>
40#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
41 16
42#include <asm/uaccess.h> 17#include "nfsd.h"
43#include <net/ipv6.h> 18#include "cache.h"
44 19
45/* 20/*
46 * We have a single directory with 9 nodes in it. 21 * We have a single directory with 9 nodes in it.
@@ -55,6 +30,7 @@ enum {
55 NFSD_Getfd, 30 NFSD_Getfd,
56 NFSD_Getfs, 31 NFSD_Getfs,
57 NFSD_List, 32 NFSD_List,
33 NFSD_Export_features,
58 NFSD_Fh, 34 NFSD_Fh,
59 NFSD_FO_UnlockIP, 35 NFSD_FO_UnlockIP,
60 NFSD_FO_UnlockFS, 36 NFSD_FO_UnlockFS,
@@ -173,6 +149,24 @@ static const struct file_operations exports_operations = {
173 .owner = THIS_MODULE, 149 .owner = THIS_MODULE,
174}; 150};
175 151
152static int export_features_show(struct seq_file *m, void *v)
153{
154 seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS);
155 return 0;
156}
157
158static int export_features_open(struct inode *inode, struct file *file)
159{
160 return single_open(file, export_features_show, NULL);
161}
162
163static struct file_operations export_features_operations = {
164 .open = export_features_open,
165 .read = seq_read,
166 .llseek = seq_lseek,
167 .release = single_release,
168};
169
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); 170extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); 171extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
178 172
@@ -995,6 +989,7 @@ static ssize_t __write_ports_delfd(char *buf)
995static ssize_t __write_ports_addxprt(char *buf) 989static ssize_t __write_ports_addxprt(char *buf)
996{ 990{
997 char transport[16]; 991 char transport[16];
992 struct svc_xprt *xprt;
998 int port, err; 993 int port, err;
999 994
1000 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 995 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
@@ -1009,13 +1004,24 @@ static ssize_t __write_ports_addxprt(char *buf)
1009 1004
1010 err = svc_create_xprt(nfsd_serv, transport, 1005 err = svc_create_xprt(nfsd_serv, transport,
1011 PF_INET, port, SVC_SOCK_ANONYMOUS); 1006 PF_INET, port, SVC_SOCK_ANONYMOUS);
1012 if (err < 0) { 1007 if (err < 0)
1013 /* Give a reasonable perror msg for bad transport string */ 1008 goto out_err;
1014 if (err == -ENOENT) 1009
1015 err = -EPROTONOSUPPORT; 1010 err = svc_create_xprt(nfsd_serv, transport,
1016 return err; 1011 PF_INET6, port, SVC_SOCK_ANONYMOUS);
1017 } 1012 if (err < 0 && err != -EAFNOSUPPORT)
1013 goto out_close;
1018 return 0; 1014 return 0;
1015out_close:
1016 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
1017 if (xprt != NULL) {
1018 svc_close_xprt(xprt);
1019 svc_xprt_put(xprt);
1020 }
1021out_err:
1022 /* Decrease the count, but don't shut down the service */
1023 nfsd_serv->sv_nrthreads--;
1024 return err;
1019} 1025}
1020 1026
1021/* 1027/*
@@ -1330,6 +1336,8 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1330 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, 1336 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
1331 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, 1337 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
1332 [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, 1338 [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
1339 [NFSD_Export_features] = {"export_features",
1340 &export_features_operations, S_IRUGO},
1333 [NFSD_FO_UnlockIP] = {"unlock_ip", 1341 [NFSD_FO_UnlockIP] = {"unlock_ip",
1334 &transaction_ops, S_IWUSR|S_IRUSR}, 1342 &transaction_ops, S_IWUSR|S_IRUSR},
1335 [NFSD_FO_UnlockFS] = {"unlock_filesystem", 1343 [NFSD_FO_UnlockFS] = {"unlock_filesystem",
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
new file mode 100644
index 000000000000..e942a1aaac92
--- /dev/null
+++ b/fs/nfsd/nfsd.h
@@ -0,0 +1,338 @@
1/*
2 * Hodge-podge collection of knfsd-related stuff.
3 * I will sort this out later.
4 *
5 * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
6 */
7
8#ifndef LINUX_NFSD_NFSD_H
9#define LINUX_NFSD_NFSD_H
10
11#include <linux/types.h>
12#include <linux/mount.h>
13
14#include <linux/nfsd/debug.h>
15#include <linux/nfsd/export.h>
16#include <linux/nfsd/stats.h>
17/*
18 * nfsd version
19 */
20#define NFSD_SUPPORTED_MINOR_VERSION 1
21
22struct readdir_cd {
23 __be32 err; /* 0, nfserr, or nfserr_eof */
24};
25
26
27extern struct svc_program nfsd_program;
28extern struct svc_version nfsd_version2, nfsd_version3,
29 nfsd_version4;
30extern u32 nfsd_supported_minorversion;
31extern struct mutex nfsd_mutex;
32extern struct svc_serv *nfsd_serv;
33extern spinlock_t nfsd_drc_lock;
34extern unsigned int nfsd_drc_max_mem;
35extern unsigned int nfsd_drc_mem_used;
36
37extern const struct seq_operations nfs_exports_op;
38
39/*
40 * Function prototypes.
41 */
42int nfsd_svc(unsigned short port, int nrservs);
43int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
44
45int nfsd_nrthreads(void);
46int nfsd_nrpools(void);
47int nfsd_get_nrthreads(int n, int *);
48int nfsd_set_nrthreads(int n, int *);
49
50#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
51#ifdef CONFIG_NFSD_V2_ACL
52extern struct svc_version nfsd_acl_version2;
53#else
54#define nfsd_acl_version2 NULL
55#endif
56#ifdef CONFIG_NFSD_V3_ACL
57extern struct svc_version nfsd_acl_version3;
58#else
59#define nfsd_acl_version3 NULL
60#endif
61#endif
62
63enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
64int nfsd_vers(int vers, enum vers_op change);
65int nfsd_minorversion(u32 minorversion, enum vers_op change);
66void nfsd_reset_versions(void);
67int nfsd_create_serv(void);
68
69extern int nfsd_max_blksize;
70
71static inline int nfsd_v4client(struct svc_rqst *rq)
72{
73 return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
74}
75
76/*
77 * NFSv4 State
78 */
79#ifdef CONFIG_NFSD_V4
80extern unsigned int max_delegations;
81int nfs4_state_init(void);
82void nfsd4_free_slabs(void);
83int nfs4_state_start(void);
84void nfs4_state_shutdown(void);
85time_t nfs4_lease_time(void);
86void nfs4_reset_lease(time_t leasetime);
87int nfs4_reset_recoverydir(char *recdir);
88#else
89static inline int nfs4_state_init(void) { return 0; }
90static inline void nfsd4_free_slabs(void) { }
91static inline int nfs4_state_start(void) { return 0; }
92static inline void nfs4_state_shutdown(void) { }
93static inline time_t nfs4_lease_time(void) { return 0; }
94static inline void nfs4_reset_lease(time_t leasetime) { }
95static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
96#endif
97
98/*
99 * lockd binding
100 */
101void nfsd_lockd_init(void);
102void nfsd_lockd_shutdown(void);
103
104
105/*
106 * These macros provide pre-xdr'ed values for faster operation.
107 */
108#define nfs_ok cpu_to_be32(NFS_OK)
109#define nfserr_perm cpu_to_be32(NFSERR_PERM)
110#define nfserr_noent cpu_to_be32(NFSERR_NOENT)
111#define nfserr_io cpu_to_be32(NFSERR_IO)
112#define nfserr_nxio cpu_to_be32(NFSERR_NXIO)
113#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN)
114#define nfserr_acces cpu_to_be32(NFSERR_ACCES)
115#define nfserr_exist cpu_to_be32(NFSERR_EXIST)
116#define nfserr_xdev cpu_to_be32(NFSERR_XDEV)
117#define nfserr_nodev cpu_to_be32(NFSERR_NODEV)
118#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR)
119#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR)
120#define nfserr_inval cpu_to_be32(NFSERR_INVAL)
121#define nfserr_fbig cpu_to_be32(NFSERR_FBIG)
122#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC)
123#define nfserr_rofs cpu_to_be32(NFSERR_ROFS)
124#define nfserr_mlink cpu_to_be32(NFSERR_MLINK)
125#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP)
126#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG)
127#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY)
128#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT)
129#define nfserr_stale cpu_to_be32(NFSERR_STALE)
130#define nfserr_remote cpu_to_be32(NFSERR_REMOTE)
131#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH)
132#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE)
133#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC)
134#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE)
135#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP)
136#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL)
137#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT)
138#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE)
139#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX)
140#define nfserr_denied cpu_to_be32(NFSERR_DENIED)
141#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK)
142#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED)
143#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE)
144#define nfserr_same cpu_to_be32(NFSERR_SAME)
145#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE)
146#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID)
147#define nfserr_resource cpu_to_be32(NFSERR_RESOURCE)
148#define nfserr_moved cpu_to_be32(NFSERR_MOVED)
149#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE)
150#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH)
151#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED)
152#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID)
153#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID)
154#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID)
155#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID)
156#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK)
157#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME)
158#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH)
159#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
160#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
161#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
162#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
163#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
164#define nfserr_grace cpu_to_be32(NFSERR_GRACE)
165#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE)
166#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD)
167#define nfserr_badname cpu_to_be32(NFSERR_BADNAME)
168#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN)
169#define nfserr_locked cpu_to_be32(NFSERR_LOCKED)
170#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC)
171#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE)
172#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT)
173#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST)
174#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION)
175#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT)
176#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY)
177#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
178#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED)
179#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY)
180#define nfserr_layouttrylater cpu_to_be32(NFS4ERR_LAYOUTTRYLATER)
181#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE)
182#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT)
183#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT)
184#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE)
185#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED)
186#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS)
187#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG)
188#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG)
189#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE)
190#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP)
191#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND)
192#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS)
193#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION)
194#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP)
195#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY)
196#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE)
197#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY)
198#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT)
199#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION)
200#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP)
201#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT)
202#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP)
203#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED)
204#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE)
205#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL)
206#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG)
207#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT)
208#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
209
210/* error codes for internal use */
211/* If a request fails due to kmalloc failure, it gets dropped.
212 * The client should resend eventually.
213 */
214#define nfserr_dropit cpu_to_be32(30000)
215/* end-of-file indicator in readdir */
216#define nfserr_eof cpu_to_be32(30001)
217/* replay detected */
218#define nfserr_replay_me cpu_to_be32(11001)
219/* nfs41 replay detected */
220#define nfserr_replay_cache cpu_to_be32(11002)
221
222/* Check for dir entries '.' and '..' */
223#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.'))
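/* Editor's illustration (not part of the original header): how the macro
 * evaluates for a few directory-entry names of length l:
 *	isdotent(".", 1)	-> true
 *	isdotent("..", 2)	-> true
 *	isdotent(".a", 2)	-> false (second character is not '.')
 *	isdotent("...", 3)	-> false (length is not below 3)
 */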
224
225/*
226 * Time of server startup
227 */
228extern struct timeval nfssvc_boot;
229
230#ifdef CONFIG_NFSD_V4
231
232/* Before processing a COMPOUND operation, we have to check that there
233 * is enough space in the buffer for XDR encode to succeed. Otherwise,
234 * we might process an operation with side effects, and be unable to
235 * tell the client that the operation succeeded.
236 *
237 * COMPOUND_SLACK_SPACE - the minimum number of bytes of buffer space
238 * needed to encode an "ordinary" _successful_ operation. (GETATTR,
239 * READ, READDIR, and READLINK have their own buffer checks.) If we
240 * fall below this level, we fail the next operation with NFS4ERR_RESOURCE.
241 *
242 * COMPOUND_ERR_SLACK_SPACE - the minimum number of bytes of buffer space
243 * needed to encode an operation which has failed with NFS4ERR_RESOURCE.
244 * Care is taken to ensure that we never fall below this level for any
245 * reason.
246 */
247#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
248#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
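/*
 * Editor's sketch of the rule described above (hypothetical helper, not
 * the kernel's actual encoder interface; 'buf_end' and 'p' stand for the
 * reply-buffer limit and the current encode position): refuse to start
 * an operation once fewer than COMPOUND_SLACK_SPACE bytes remain, so
 * that an NFS4ERR_RESOURCE reply (COMPOUND_ERR_SLACK_SPACE bytes) is
 * always still encodable.
 */
static inline __be32 check_compound_slack(char *buf_end, char *p)
{
	if (buf_end - p < COMPOUND_SLACK_SPACE)
		return nfserr_resource;
	return nfs_ok;
}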
249
250#define NFSD_LEASE_TIME (nfs4_lease_time())
251#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */
252
253/*
254 * The following attributes are currently not supported by the NFSv4 server:
255 * ARCHIVE (deprecated anyway)
256 * HIDDEN (unlikely to be supported any time soon)
257 * MIMETYPE (unlikely to be supported any time soon)
258 * QUOTA_* (will be supported in a forthcoming patch)
259 * SYSTEM (unlikely to be supported any time soon)
260 * TIME_BACKUP (unlikely to be supported any time soon)
261 * TIME_CREATE (unlikely to be supported any time soon)
262 */
263#define NFSD4_SUPPORTED_ATTRS_WORD0 \
264(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \
265 | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \
266 | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \
267 | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \
268 | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \
269 | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \
270 | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \
271 | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS \
272 | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \
273 | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL)
274
275#define NFSD4_SUPPORTED_ATTRS_WORD1 \
276(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \
277 | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \
278 | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \
279 | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \
280 | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \
281 | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
282
283#define NFSD4_SUPPORTED_ATTRS_WORD2 0
284
285#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
286 NFSD4_SUPPORTED_ATTRS_WORD0
287
288#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
289 NFSD4_SUPPORTED_ATTRS_WORD1
290
291#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
292 (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
293
294static inline u32 nfsd_suppattrs0(u32 minorversion)
295{
296 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
297 : NFSD4_SUPPORTED_ATTRS_WORD0;
298}
299
300static inline u32 nfsd_suppattrs1(u32 minorversion)
301{
302 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1
303 : NFSD4_SUPPORTED_ATTRS_WORD1;
304}
305
306static inline u32 nfsd_suppattrs2(u32 minorversion)
307{
308 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2
309 : NFSD4_SUPPORTED_ATTRS_WORD2;
310}
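/*
 * Editor's sketch (the helper name is illustrative): a client-supplied
 * attribute bitmap is acceptable only if it requests no bits outside
 * the words advertised above for the request's minorversion.
 */
static inline int nfsd_attrs_supported(u32 minorversion, u32 w0, u32 w1, u32 w2)
{
	return !(w0 & ~nfsd_suppattrs0(minorversion)) &&
	       !(w1 & ~nfsd_suppattrs1(minorversion)) &&
	       !(w2 & ~nfsd_suppattrs2(minorversion));
}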
311
312/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
313#define NFSD_WRITEONLY_ATTRS_WORD1 \
314(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
315
316/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
317#define NFSD_WRITEABLE_ATTRS_WORD0 \
318(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL )
319#define NFSD_WRITEABLE_ATTRS_WORD1 \
320(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
321 | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
322#define NFSD_WRITEABLE_ATTRS_WORD2 0
323
324#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
325 NFSD_WRITEABLE_ATTRS_WORD0
326/*
327 * we currently store the exclusive create verifier in the v_{a,m}time
328 * attributes so the client can't set these at create time using EXCLUSIVE4_1
329 */
330#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \
331 (NFSD_WRITEABLE_ATTRS_WORD1 & \
332 ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET))
333#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \
334 NFSD_WRITEABLE_ATTRS_WORD2
335
336#endif /* CONFIG_NFSD_V4 */
337
338#endif /* LINUX_NFSD_NFSD_H */
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 01965b2f3a76..55c8e63af0be 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/nfsfh.c
3 *
4 * NFS server file handle treatment. 2 * NFS server file handle treatment.
5 * 3 *
6 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
@@ -9,19 +7,11 @@
9 * ... and again Southern-Winter 2001 to support export_operations 7 * ... and again Southern-Winter 2001 to support export_operations
10 */ 8 */
11 9
12#include <linux/slab.h>
13#include <linux/fs.h>
14#include <linux/unistd.h>
15#include <linux/string.h>
16#include <linux/stat.h>
17#include <linux/dcache.h>
18#include <linux/exportfs.h> 10#include <linux/exportfs.h>
19#include <linux/mount.h>
20 11
21#include <linux/sunrpc/clnt.h>
22#include <linux/sunrpc/svc.h>
23#include <linux/sunrpc/svcauth_gss.h> 12#include <linux/sunrpc/svcauth_gss.h>
24#include <linux/nfsd/nfsd.h> 13#include "nfsd.h"
14#include "vfs.h"
25#include "auth.h" 15#include "auth.h"
26 16
27#define NFSDDBG_FACILITY NFSDDBG_FH 17#define NFSDDBG_FACILITY NFSDDBG_FH
@@ -96,8 +86,10 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
96static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, 86static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
97 struct svc_export *exp) 87 struct svc_export *exp)
98{ 88{
89 int flags = nfsexp_flags(rqstp, exp);
90
99 /* Check if the request originated from a secure port. */ 91 /* Check if the request originated from a secure port. */
100 if (!rqstp->rq_secure && EX_SECURE(exp)) { 92 if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) {
101 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); 93 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
102 dprintk(KERN_WARNING 94 dprintk(KERN_WARNING
103 "nfsd: request from insecure port %s!\n", 95 "nfsd: request from insecure port %s!\n",
@@ -109,6 +101,36 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
109 return nfserrno(nfsd_setuser(rqstp, exp)); 101 return nfserrno(nfsd_setuser(rqstp, exp));
110} 102}
111 103
104static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
105 struct dentry *dentry, struct svc_export *exp)
106{
107 if (!(exp->ex_flags & NFSEXP_V4ROOT))
108 return nfs_ok;
109 /*
110 * v2/v3 clients have no need for the V4ROOT export--they use
111 * the mount protocol instead; also, further V4ROOT checks may be
112 * in v4-specific code, in which case v2/v3 clients could bypass
113 * them.
114 */
115 if (!nfsd_v4client(rqstp))
116 return nfserr_stale;
117 /*
118 * We're exposing only the directories and symlinks that have to be
119 * traversed on the way to real exports:
120 */
121 if (unlikely(!S_ISDIR(dentry->d_inode->i_mode) &&
122 !S_ISLNK(dentry->d_inode->i_mode)))
123 return nfserr_stale;
124 /*
125 * A pseudoroot export gives permission to access only one
126 * single directory; the kernel has to make another upcall
127 * before granting access to anything else under it:
128 */
129 if (unlikely(dentry != exp->ex_path.dentry))
130 return nfserr_stale;
131 return nfs_ok;
132}
133
112/* 134/*
113 * Use the given filehandle to look up the corresponding export and 135 * Use the given filehandle to look up the corresponding export and
114 * dentry. On success, the results are used to set fh_export and 136 * dentry. On success, the results are used to set fh_export and
@@ -232,14 +254,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
232 goto out; 254 goto out;
233 } 255 }
234 256
235 if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
236 error = nfsd_setuser_and_check_port(rqstp, exp);
237 if (error) {
238 dput(dentry);
239 goto out;
240 }
241 }
242
243 if (S_ISDIR(dentry->d_inode->i_mode) && 257 if (S_ISDIR(dentry->d_inode->i_mode) &&
244 (dentry->d_flags & DCACHE_DISCONNECTED)) { 258 (dentry->d_flags & DCACHE_DISCONNECTED)) {
245 printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", 259 printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n",
@@ -294,28 +308,32 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
294 error = nfsd_set_fh_dentry(rqstp, fhp); 308 error = nfsd_set_fh_dentry(rqstp, fhp);
295 if (error) 309 if (error)
296 goto out; 310 goto out;
297 dentry = fhp->fh_dentry;
298 exp = fhp->fh_export;
299 } else {
300 /*
301 * just rechecking permissions
302 * (e.g. nfsproc_create calls fh_verify, then nfsd_create
303 * does as well)
304 */
305 dprintk("nfsd: fh_verify - just checking\n");
306 dentry = fhp->fh_dentry;
307 exp = fhp->fh_export;
308 /*
309 * Set user creds for this exportpoint; necessary even
310 * in the "just checking" case because this may be a
311 * filehandle that was created by fh_compose, and that
312 * is about to be used in another nfsv4 compound
313 * operation.
314 */
315 error = nfsd_setuser_and_check_port(rqstp, exp);
316 if (error)
317 goto out;
318 } 311 }
312 dentry = fhp->fh_dentry;
313 exp = fhp->fh_export;
314 /*
315 * We still have to do all these permission checks, even when
316 * fh_dentry is already set:
317 * - fh_verify may be called multiple times with different
318 * "access" arguments (e.g. nfsd_proc_create calls
319 * fh_verify(...,NFSD_MAY_EXEC) first, then later (in
320 * nfsd_create) calls fh_verify(...,NFSD_MAY_CREATE).
321 * - in the NFSv4 case, the filehandle may have been filled
322 * in by fh_compose, and given a dentry, but further
323 * compound operations performed with that filehandle
324 * still need permissions checks. In the worst case, a
325 * mountpoint crossing may have changed the export
326 * options, and we may now need to use a different uid
327 * (for example, if different id-squashing options are in
328 * effect on the new filesystem).
329 */
330 error = check_pseudo_root(rqstp, dentry, exp);
331 if (error)
332 goto out;
333
334 error = nfsd_setuser_and_check_port(rqstp, exp);
335 if (error)
336 goto out;
319 337
320 error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); 338 error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type);
321 if (error) 339 if (error)
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
new file mode 100644
index 000000000000..cdfb8c6a4206
--- /dev/null
+++ b/fs/nfsd/nfsfh.h
@@ -0,0 +1,208 @@
1/* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */
2
3#ifndef _LINUX_NFSD_FH_INT_H
4#define _LINUX_NFSD_FH_INT_H
5
6#include <linux/nfsd/nfsfh.h>
7
8enum nfsd_fsid {
9 FSID_DEV = 0,
10 FSID_NUM,
11 FSID_MAJOR_MINOR,
12 FSID_ENCODE_DEV,
13 FSID_UUID4_INUM,
14 FSID_UUID8,
15 FSID_UUID16,
16 FSID_UUID16_INUM,
17};
18
19enum fsid_source {
20 FSIDSOURCE_DEV,
21 FSIDSOURCE_FSID,
22 FSIDSOURCE_UUID,
23};
24extern enum fsid_source fsid_source(struct svc_fh *fhp);
25
26
27/* This might look a little large to "inline" but in all calls except
28 * one, 'vers' is constant so most of the function disappears.
29 */
30static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
31 u32 fsid, unsigned char *uuid)
32{
33 u32 *up;
34 switch(vers) {
35 case FSID_DEV:
36 fsidv[0] = htonl((MAJOR(dev)<<16) |
37 MINOR(dev));
38 fsidv[1] = ino_t_to_u32(ino);
39 break;
40 case FSID_NUM:
41 fsidv[0] = fsid;
42 break;
43 case FSID_MAJOR_MINOR:
44 fsidv[0] = htonl(MAJOR(dev));
45 fsidv[1] = htonl(MINOR(dev));
46 fsidv[2] = ino_t_to_u32(ino);
47 break;
48
49 case FSID_ENCODE_DEV:
50 fsidv[0] = new_encode_dev(dev);
51 fsidv[1] = ino_t_to_u32(ino);
52 break;
53
54 case FSID_UUID4_INUM:
55 /* 4 byte fsid and inode number */
56 up = (u32*)uuid;
57 fsidv[0] = ino_t_to_u32(ino);
58 fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3];
59 break;
60
61 case FSID_UUID8:
62 /* 8 byte fsid */
63 up = (u32*)uuid;
64 fsidv[0] = up[0] ^ up[2];
65 fsidv[1] = up[1] ^ up[3];
66 break;
67
68 case FSID_UUID16:
69 /* 16 byte fsid - NFSv3+ only */
70 memcpy(fsidv, uuid, 16);
71 break;
72
73 case FSID_UUID16_INUM:
74 /* 8 byte inode and 16 byte fsid */
75 *(u64*)fsidv = (u64)ino;
76 memcpy(fsidv+2, uuid, 16);
77 break;
78 default: BUG();
79 }
80}
81
82static inline int key_len(int type)
83{
84 switch(type) {
85 case FSID_DEV: return 8;
86 case FSID_NUM: return 4;
87 case FSID_MAJOR_MINOR: return 12;
88 case FSID_ENCODE_DEV: return 8;
89 case FSID_UUID4_INUM: return 8;
90 case FSID_UUID8: return 8;
91 case FSID_UUID16: return 16;
92 case FSID_UUID16_INUM: return 24;
93 default: return 0;
94 }
95}
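/*
 * Editor's sketch of how mk_fsid() and key_len() pair up (hypothetical
 * helper; 'fh_auth' stands for the filehandle area being filled, and
 * uuid is omitted here, so only the non-UUID fsid types are valid):
 * build the fsid, then copy exactly the number of bytes that type
 * occupies.
 */
static inline void fh_fill_fsid(u32 *fh_auth, int fsid_type, dev_t dev, ino_t ino)
{
	u32 fsidv[6];	/* large enough for FSID_UUID16_INUM (24 bytes) */

	mk_fsid(fsid_type, fsidv, dev, ino, 0, NULL);
	memcpy(fh_auth, fsidv, key_len(fsid_type));
}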
96
97/*
98 * Shorthand for dprintk()'s
99 */
100extern char * SVCFH_fmt(struct svc_fh *fhp);
101
102/*
103 * Function prototypes
104 */
105__be32 fh_verify(struct svc_rqst *, struct svc_fh *, int, int);
106__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *);
107__be32 fh_update(struct svc_fh *);
108void fh_put(struct svc_fh *);
109
110static __inline__ struct svc_fh *
111fh_copy(struct svc_fh *dst, struct svc_fh *src)
112{
113 WARN_ON(src->fh_dentry || src->fh_locked);
114
115 *dst = *src;
116 return dst;
117}
118
119static inline void
120fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
121{
122 dst->fh_size = src->fh_size;
123 memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
124}
125
126static __inline__ struct svc_fh *
127fh_init(struct svc_fh *fhp, int maxsize)
128{
129 memset(fhp, 0, sizeof(*fhp));
130 fhp->fh_maxsize = maxsize;
131 return fhp;
132}
133
134#ifdef CONFIG_NFSD_V3
135/*
136 * Fill in the pre_op attr for the wcc data
137 */
138static inline void
139fill_pre_wcc(struct svc_fh *fhp)
140{
141 struct inode *inode;
142
143 inode = fhp->fh_dentry->d_inode;
144 if (!fhp->fh_pre_saved) {
145 fhp->fh_pre_mtime = inode->i_mtime;
146 fhp->fh_pre_ctime = inode->i_ctime;
147 fhp->fh_pre_size = inode->i_size;
148 fhp->fh_pre_change = inode->i_version;
149 fhp->fh_pre_saved = 1;
150 }
151}
152
153extern void fill_post_wcc(struct svc_fh *);
154#else
155#define fill_pre_wcc(ignored)
156#define fill_post_wcc(notused)
157#endif /* CONFIG_NFSD_V3 */
158
159
160/*
161 * Lock a file handle/inode
162 * NOTE: both fh_lock and fh_unlock are done "by hand" in
163 * vfs.c:nfsd_rename, as it needs to grab two i_mutexes at once,
164 * so any changes here should be reflected there.
165 */
166
167static inline void
168fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
169{
170 struct dentry *dentry = fhp->fh_dentry;
171 struct inode *inode;
172
173 BUG_ON(!dentry);
174
175 if (fhp->fh_locked) {
176 printk(KERN_WARNING "fh_lock: %s/%s already locked!\n",
177 dentry->d_parent->d_name.name, dentry->d_name.name);
178 return;
179 }
180
181 inode = dentry->d_inode;
182 mutex_lock_nested(&inode->i_mutex, subclass);
183 fill_pre_wcc(fhp);
184 fhp->fh_locked = 1;
185}
186
187static inline void
188fh_lock(struct svc_fh *fhp)
189{
190 fh_lock_nested(fhp, I_MUTEX_NORMAL);
191}
192
193/*
194 * Unlock a file handle/inode
195 */
196static inline void
197fh_unlock(struct svc_fh *fhp)
198{
199 BUG_ON(!fhp->fh_dentry);
200
201 if (fhp->fh_locked) {
202 fill_post_wcc(fhp);
203 mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
204 fhp->fh_locked = 0;
205 }
206}
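/*
 * Editor's usage note: directory-modifying callers typically bracket the
 * operation like this, so that fh_lock() snapshots the pre-op attributes
 * and fh_unlock() fills the post-op attributes for NFSv3 WCC data:
 *
 *	fh_lock(fhp);
 *	... create/unlink/etc. under fhp->fh_dentry ...
 *	fh_unlock(fhp);
 */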
207
208#endif /* _LINUX_NFSD_FH_INT_H */
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0eb9c820b7a6..a047ad6111ef 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -1,29 +1,14 @@
1/* 1/*
2 * nfsproc2.c Process version 2 NFS requests.
3 * linux/fs/nfsd/nfs2proc.c
4 *
5 * Process version 2 NFS requests. 2 * Process version 2 NFS requests.
6 * 3 *
7 * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
8 */ 5 */
9 6
10#include <linux/linkage.h>
11#include <linux/time.h>
12#include <linux/errno.h>
13#include <linux/fs.h>
14#include <linux/stat.h>
15#include <linux/fcntl.h>
16#include <linux/net.h>
17#include <linux/in.h>
18#include <linux/namei.h> 7#include <linux/namei.h>
19#include <linux/unistd.h>
20#include <linux/slab.h>
21 8
22#include <linux/sunrpc/clnt.h> 9#include "cache.h"
23#include <linux/sunrpc/svc.h> 10#include "xdr.h"
24#include <linux/nfsd/nfsd.h> 11#include "vfs.h"
25#include <linux/nfsd/cache.h>
26#include <linux/nfsd/xdr.h>
27 12
28typedef struct svc_rqst svc_rqst; 13typedef struct svc_rqst svc_rqst;
29typedef struct svc_buf svc_buf; 14typedef struct svc_buf svc_buf;
@@ -758,6 +743,7 @@ nfserrno (int errno)
758 { nfserr_io, -ETXTBSY }, 743 { nfserr_io, -ETXTBSY },
759 { nfserr_notsupp, -EOPNOTSUPP }, 744 { nfserr_notsupp, -EOPNOTSUPP },
760 { nfserr_toosmall, -ETOOSMALL }, 745 { nfserr_toosmall, -ETOOSMALL },
746 { nfserr_serverfault, -ESERVERFAULT },
761 }; 747 };
762 int i; 748 int i;
763 749
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 67ea83eedd43..171699eb07c8 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/nfssvc.c
3 *
4 * Central processing for nfsd. 2 * Central processing for nfsd.
5 * 3 *
6 * Authors: Olaf Kirch (okir@monad.swb.de) 4 * Authors: Olaf Kirch (okir@monad.swb.de)
@@ -8,33 +6,19 @@
8 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> 6 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
9 */ 7 */
10 8
11#include <linux/module.h>
12#include <linux/sched.h> 9#include <linux/sched.h>
13#include <linux/time.h>
14#include <linux/errno.h>
15#include <linux/nfs.h>
16#include <linux/in.h>
17#include <linux/uio.h>
18#include <linux/unistd.h>
19#include <linux/slab.h>
20#include <linux/smp.h>
21#include <linux/freezer.h> 10#include <linux/freezer.h>
22#include <linux/fs_struct.h> 11#include <linux/fs_struct.h>
23#include <linux/kthread.h>
24#include <linux/swap.h> 12#include <linux/swap.h>
25 13
26#include <linux/sunrpc/types.h>
27#include <linux/sunrpc/stats.h> 14#include <linux/sunrpc/stats.h>
28#include <linux/sunrpc/svc.h>
29#include <linux/sunrpc/svcsock.h> 15#include <linux/sunrpc/svcsock.h>
30#include <linux/sunrpc/cache.h>
31#include <linux/nfsd/nfsd.h>
32#include <linux/nfsd/stats.h>
33#include <linux/nfsd/cache.h>
34#include <linux/nfsd/syscall.h>
35#include <linux/lockd/bind.h> 16#include <linux/lockd/bind.h>
36#include <linux/nfsacl.h> 17#include <linux/nfsacl.h>
37#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include "nfsd.h"
20#include "cache.h"
21#include "vfs.h"
38 22
39#define NFSDDBG_FACILITY NFSDDBG_SVC 23#define NFSDDBG_FACILITY NFSDDBG_SVC
40 24
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index afd08e2c90a5..4ce005dbf3e6 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -1,20 +1,10 @@
1/* 1/*
2 * linux/fs/nfsd/nfsxdr.c
3 *
4 * XDR support for nfsd 2 * XDR support for nfsd
5 * 3 *
6 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
7 */ 5 */
8 6
9#include <linux/types.h> 7#include "xdr.h"
10#include <linux/time.h>
11#include <linux/nfs.h>
12#include <linux/vfs.h>
13#include <linux/sunrpc/xdr.h>
14#include <linux/sunrpc/svc.h>
15#include <linux/nfsd/nfsd.h>
16#include <linux/nfsd/xdr.h>
17#include <linux/mm.h>
18#include "auth.h" 8#include "auth.h"
19 9
20#define NFSDDBG_FACILITY NFSDDBG_XDR 10#define NFSDDBG_FACILITY NFSDDBG_XDR
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
new file mode 100644
index 000000000000..fefeae27f25e
--- /dev/null
+++ b/fs/nfsd/state.h
@@ -0,0 +1,408 @@
1/*
2 * Copyright (c) 2001 The Regents of the University of Michigan.
3 * All rights reserved.
4 *
5 * Kendrick Smith <kmsmith@umich.edu>
6 * Andy Adamson <andros@umich.edu>
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. Neither the name of the University nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
22 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
23 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
28 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 */
34
35#ifndef _NFSD4_STATE_H
36#define _NFSD4_STATE_H
37
38#include <linux/nfsd/nfsfh.h>
39#include "nfsfh.h"
40
41typedef struct {
42 u32 cl_boot;
43 u32 cl_id;
44} clientid_t;
45
46typedef struct {
47 u32 so_boot;
48 u32 so_stateownerid;
49 u32 so_fileid;
50} stateid_opaque_t;
51
52typedef struct {
53 u32 si_generation;
54 stateid_opaque_t si_opaque;
55} stateid_t;
56#define si_boot si_opaque.so_boot
57#define si_stateownerid si_opaque.so_stateownerid
58#define si_fileid si_opaque.so_fileid
59
60#define STATEID_FMT "(%08x/%08x/%08x/%08x)"
61#define STATEID_VAL(s) \
62 (s)->si_boot, \
63 (s)->si_stateownerid, \
64 (s)->si_fileid, \
65 (s)->si_generation
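/* Editor's usage note: the pair above is meant for debug output, e.g.
 *	dprintk("stateid = " STATEID_FMT "\n", STATEID_VAL(&stateid));
 */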
66
67struct nfsd4_cb_sequence {
68 /* args/res */
69 u32 cbs_minorversion;
70 struct nfs4_client *cbs_clp;
71};
72
73struct nfs4_delegation {
74 struct list_head dl_perfile;
75 struct list_head dl_perclnt;
76 struct list_head dl_recall_lru; /* delegation recalled */
77 atomic_t dl_count; /* ref count */
78 struct nfs4_client *dl_client;
79 struct nfs4_file *dl_file;
80 struct file_lock *dl_flock;
81 struct file *dl_vfs_file;
82 u32 dl_type;
83 time_t dl_time;
84/* For recall: */
85 u32 dl_ident;
86 stateid_t dl_stateid;
87 struct knfsd_fh dl_fh;
88 int dl_retries;
89};
90
91/* client delegation callback info */
92struct nfs4_cb_conn {
93 /* SETCLIENTID info */
94 struct sockaddr_storage cb_addr;
95 size_t cb_addrlen;
96 u32 cb_prog;
97 u32 cb_minorversion;
98 u32 cb_ident; /* minorversion 0 only */
99 /* RPC client info */
100 atomic_t cb_set; /* successful CB_NULL call */
101 struct rpc_clnt * cb_client;
102};
103
104/* Maximum number of slots per session. 160 is useful for long haul TCP */
105#define NFSD_MAX_SLOTS_PER_SESSION 160
106/* Maximum number of operations per session compound */
107#define NFSD_MAX_OPS_PER_COMPOUND 16
108/* Maximum cached-reply size, in bytes, per session slot */
109#define NFSD_SLOT_CACHE_SIZE 1024
110/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
111#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32
112#define NFSD_MAX_MEM_PER_SESSION \
113 (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE)
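/* Editor's note: with the defaults above this works out to
 * 32 * 1024 = 32768 bytes (32 KiB) of cached reply data per session.
 */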
114
115struct nfsd4_slot {
116 bool sl_inuse;
117 bool sl_cachethis;
118 u16 sl_opcnt;
119 u32 sl_seqid;
120 __be32 sl_status;
121 u32 sl_datalen;
122 char sl_data[];
123};
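/*
 * Editor's sketch of the replay decision these fields support (logic
 * simplified; the real check must also handle misordered and in-use
 * slots): a retransmitted request reuses its slot's sequence id, so a
 * matching seqid means the reply cached in sl_data can be returned
 * instead of re-executing the compound, provided sl_cachethis was set;
 * otherwise the server answers with nfserr_retry_uncached_rep.
 */
static inline int nfsd4_slot_is_replay(struct nfsd4_slot *slot, u32 seqid)
{
	return seqid == slot->sl_seqid;
}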
124
125struct nfsd4_channel_attrs {
126 u32 headerpadsz;
127 u32 maxreq_sz;
128 u32 maxresp_sz;
129 u32 maxresp_cached;
130 u32 maxops;
131 u32 maxreqs;
132 u32 nr_rdma_attrs;
133 u32 rdma_attrs;
134};
135
136struct nfsd4_create_session {
137 clientid_t clientid;
138 struct nfs4_sessionid sessionid;
139 u32 seqid;
140 u32 flags;
141 struct nfsd4_channel_attrs fore_channel;
142 struct nfsd4_channel_attrs back_channel;
143 u32 callback_prog;
144 u32 uid;
145 u32 gid;
146};
147
148/* The single slot clientid cache structure */
149struct nfsd4_clid_slot {
150 u32 sl_seqid;
151 __be32 sl_status;
152 struct nfsd4_create_session sl_cr_ses;
153};
154
155struct nfsd4_session {
156 struct kref se_ref;
157 struct list_head se_hash; /* hash by sessionid */
158 struct list_head se_perclnt;
159 u32 se_flags;
160 struct nfs4_client *se_client; /* for expire_client */
161 struct nfs4_sessionid se_sessionid;
162 struct nfsd4_channel_attrs se_fchannel;
163 struct nfsd4_channel_attrs se_bchannel;
164 struct nfsd4_slot *se_slots[]; /* forward channel slots */
165};
166
167static inline void
168nfsd4_put_session(struct nfsd4_session *ses)
169{
170 extern void free_session(struct kref *kref);
171 kref_put(&ses->se_ref, free_session);
172}
173
174static inline void
175nfsd4_get_session(struct nfsd4_session *ses)
176{
177 kref_get(&ses->se_ref);
178}
179
180/* formatted contents of nfs4_sessionid */
181struct nfsd4_sessionid {
182 clientid_t clientid;
183 u32 sequence;
184 u32 reserved;
185};
186
187#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */
188
189/*
190 * struct nfs4_client - one per client. Clientids live here.
191 * o Each nfs4_client is hashed by clientid.
192 *
193 * o Each nfs4_client is also hashed by name
194 * (the opaque quantity initially sent by the client to identify itself).
195 *
196 * o cl_perclient list is used to ensure no dangling stateowner references
197 * when we expire the nfs4_client
198 */
199struct nfs4_client {
200 struct list_head cl_idhash; /* hash by cl_clientid.id */
201 struct list_head cl_strhash; /* hash by cl_name */
202 struct list_head cl_openowners;
203 struct list_head cl_delegations;
204 struct list_head cl_lru; /* tail queue */
205 struct xdr_netobj cl_name; /* id generated by client */
206 char cl_recdir[HEXDIR_LEN]; /* recovery dir */
207 nfs4_verifier cl_verifier; /* generated by client */
208 time_t cl_time; /* time of last lease renewal */
209 struct sockaddr_storage cl_addr; /* client ipaddress */
210 u32 cl_flavor; /* setclientid pseudoflavor */
211 char *cl_principal; /* setclientid principal name */
212 struct svc_cred cl_cred; /* setclientid principal */
213 clientid_t cl_clientid; /* generated by server */
214 nfs4_verifier cl_confirm; /* generated by server */
215 struct nfs4_cb_conn cl_cb_conn; /* callback info */
216 atomic_t cl_count; /* ref count */
217 u32 cl_firststate; /* recovery dir creation */
218
219 /* for nfs41 */
220 struct list_head cl_sessions;
221 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
222 u32 cl_exchange_flags;
223 struct nfs4_sessionid cl_sessionid;
224
225 /* for nfs41 callbacks */
226 /* We currently support a single back channel with a single slot */
227 unsigned long cl_cb_slot_busy;
228 u32 cl_cb_seq_nr;
229 struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */
230 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
231 /* wait here for slots */
232};
233
234/* struct nfs4_client_reclaim
235 * one per old client. Populates reclaim_str_hashtbl. Filled from conf_id_hashtbl
236 * upon lease reset, or from upcall to state_daemon (to read in state
237 * from non-volatile storage) upon reboot.
238 */
239struct nfs4_client_reclaim {
240 struct list_head cr_strhash; /* hash by cr_name */
241 char cr_recdir[HEXDIR_LEN]; /* recover dir */
242};
243
244static inline void
245update_stateid(stateid_t *stateid)
246{
247 stateid->si_generation++;
248}
249
250/* A reasonable value for REPLAY_ISIZE was estimated as follows:
251 * The OPEN response, typically the largest, requires
252 * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) +
253 * 4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) +
254 * 20(deleg. space limit) + ~32(deleg. ace) = 112 bytes
255 */
256
257#define NFSD4_REPLAY_ISIZE 112
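/* Editor's check of the estimate above:
 * 4 + 8 + 20 + 4 + 8 + 4 + 8 + 4 + 20 + 32 = 112, matching the value
 * chosen here and the size of rp_ibuf below.
 */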
258
259/*
260 * Replay buffer, where the result of the last seqid-mutating operation
261 * is cached.
262 */
263struct nfs4_replay {
264 __be32 rp_status;
265 unsigned int rp_buflen;
266 char *rp_buf;
267 unsigned int rp_allocated;
268 struct knfsd_fh rp_openfh;
269 char rp_ibuf[NFSD4_REPLAY_ISIZE];
270};
271
272/*
273* nfs4_stateowner can either be an open_owner, or a lock_owner
274*
275* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[]
276* for lock_owner
277* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[]
278* for lock_owner
279* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client
280* struct is reaped.
281* so_perfilestate: heads the list of nfs4_stateid (either open or lock)
282* and is used to ensure no dangling nfs4_stateid references when we
283* release a stateowner.
284* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when
285* close is called to reap associated byte-range locks
286* so_close_lru: (open) stateowner is placed on this list instead of being
287* reaped (when so_perfilestate is empty) to hold the last close replay.
288* reaped by the laundromat thread after the lease period.
289*/
290struct nfs4_stateowner {
291 struct kref so_ref;
292 struct list_head so_idhash; /* hash by so_id */
293 struct list_head so_strhash; /* hash by op_name */
294 struct list_head so_perclient;
295 struct list_head so_stateids;
296 struct list_head so_perstateid; /* for lockowners only */
297 struct list_head so_close_lru; /* tail queue */
298 time_t so_time; /* time of placement on so_close_lru */
299 int so_is_open_owner; /* 1=openowner,0=lockowner */
300 u32 so_id;
301 struct nfs4_client * so_client;
302 /* after increment in ENCODE_SEQID_OP_TAIL, represents the next
303 * sequence id expected from the client: */
304 u32 so_seqid;
305 struct xdr_netobj so_owner; /* open owner name */
306 int so_confirmed; /* successful OPEN_CONFIRM? */
307 struct nfs4_replay so_replay;
308};
309
310/*
311* nfs4_file: a file opened by some number of (open) nfs4_stateowners.
312* o fi_perfile list is used to search for conflicting
313* share_access, share_deny on the file.
314*/
315struct nfs4_file {
316 atomic_t fi_ref;
317 struct list_head fi_hash; /* hash by "struct inode *" */
318 struct list_head fi_stateids;
319 struct list_head fi_delegations;
320 struct inode *fi_inode;
321 u32 fi_id; /* used with stateowner->so_id
322 * for stateid_hashtbl hash */
323 bool fi_had_conflict;
324};
325
326/*
327* nfs4_stateid can either be an open stateid or (eventually) a lock stateid
328*
329* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file
330*
331* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry
332* st_perfile: file_hashtbl[] entry.
333* st_perfile_state: nfs4_stateowner->so_perfilestate
334* st_perlockowner: (open stateid) list of lock nfs4_stateowners
335* st_access_bmap: used only for open stateid
336* st_deny_bmap: used only for open stateid
337* st_openstp: open stateid lock stateid was derived from
338*
339* XXX: open stateids and lock stateids have diverged sufficiently that
340* we should consider defining separate structs for the two cases.
341*/
342
343struct nfs4_stateid {
344 struct list_head st_hash;
345 struct list_head st_perfile;
346 struct list_head st_perstateowner;
347 struct list_head st_lockowners;
348 struct nfs4_stateowner * st_stateowner;
349 struct nfs4_file * st_file;
350 stateid_t st_stateid;
351 struct file * st_vfs_file;
352 unsigned long st_access_bmap;
353 unsigned long st_deny_bmap;
354 struct nfs4_stateid * st_openstp;
355};
356
357/* flags for preprocess_seqid_op() */
358#define HAS_SESSION 0x00000001
359#define CONFIRM 0x00000002
360#define OPEN_STATE 0x00000004
361#define LOCK_STATE 0x00000008
362#define RD_STATE 0x00000010
363#define WR_STATE 0x00000020
364#define CLOSE_STATE 0x00000040
365
366#define seqid_mutating_err(err) \
367 (((err) != nfserr_stale_clientid) && \
368 ((err) != nfserr_bad_seqid) && \
369 ((err) != nfserr_stale_stateid) && \
370 ((err) != nfserr_bad_stateid))
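/*
 * Editor's sketch of how the predicate is used (hypothetical helper;
 * compare the ENCODE_SEQID_OP_TAIL comment in struct nfs4_stateowner
 * above): most replies advance the owner's expected sequence id and are
 * cached for replay, but the four errors excluded above leave so_seqid
 * untouched.
 */
static inline void bump_seqid_on(struct nfs4_stateowner *sop, __be32 err)
{
	if (seqid_mutating_err(err))
		sop->so_seqid++;
}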
371
372struct nfsd4_compound_state;
373
374extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
375 stateid_t *stateid, int flags, struct file **filp);
376extern void nfs4_lock_state(void);
377extern void nfs4_unlock_state(void);
378extern int nfs4_in_grace(void);
379extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
380extern void put_nfs4_client(struct nfs4_client *clp);
381extern void nfs4_free_stateowner(struct kref *kref);
382extern int set_callback_cred(void);
383extern void nfsd4_probe_callback(struct nfs4_client *clp);
384extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
385extern void nfs4_put_delegation(struct nfs4_delegation *dp);
386extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
387extern void nfsd4_init_recdir(char *recdir_name);
388extern int nfsd4_recdir_load(void);
389extern void nfsd4_shutdown_recdir(void);
390extern int nfs4_client_to_reclaim(const char *name);
391extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
392extern void nfsd4_recdir_purge_old(void);
393extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
394extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
395
396static inline void
397nfs4_put_stateowner(struct nfs4_stateowner *so)
398{
399 kref_put(&so->so_ref, nfs4_free_stateowner);
400}
401
402static inline void
403nfs4_get_stateowner(struct nfs4_stateowner *so)
404{
405 kref_get(&so->so_ref);
406}
407
408#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 71944cddf680..5232d3e8fb2f 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/fs/nfsd/stats.c
3 *
4 * procfs-based user access to knfsd statistics 2 * procfs-based user access to knfsd statistics
5 * 3 *
6 * /proc/net/rpc/nfsd 4 * /proc/net/rpc/nfsd
@@ -23,18 +21,13 @@
23 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> 21 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
24 */ 22 */
25 23
26#include <linux/kernel.h>
27#include <linux/time.h>
28#include <linux/proc_fs.h>
29#include <linux/seq_file.h> 24#include <linux/seq_file.h>
30#include <linux/stat.h>
31#include <linux/module.h> 25#include <linux/module.h>
32
33#include <linux/sunrpc/svc.h>
34#include <linux/sunrpc/stats.h> 26#include <linux/sunrpc/stats.h>
35#include <linux/nfsd/nfsd.h>
36#include <linux/nfsd/stats.h> 27#include <linux/nfsd/stats.h>
37 28
29#include "nfsd.h"
30
38struct nfsd_stats nfsdstats; 31struct nfsd_stats nfsdstats;
39struct svc_stat nfsd_svcstats = { 32struct svc_stat nfsd_svcstats = {
40 .program = &nfsd_program, 33 .program = &nfsd_program,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a293f0273263..6dd5f1970e01 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,7 +1,5 @@
1#define MSNFS /* HACK HACK */ 1#define MSNFS /* HACK HACK */
2/* 2/*
3 * linux/fs/nfsd/vfs.c
4 *
5 * File operations used by nfsd. Some of these have been ripped from 3 * File operations used by nfsd. Some of these have been ripped from
6 * other parts of the kernel because they weren't exported, others 4 * other parts of the kernel because they weren't exported, others
7 * are partial duplicates with added or changed functionality. 5 * are partial duplicates with added or changed functionality.
@@ -16,48 +14,33 @@
16 * Zerocopy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp> 14 * Zerocopy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
17 */ 15 */
18 16
19#include <linux/string.h>
20#include <linux/time.h>
21#include <linux/errno.h>
22#include <linux/fs.h> 17#include <linux/fs.h>
23#include <linux/file.h> 18#include <linux/file.h>
24#include <linux/mount.h>
25#include <linux/major.h>
26#include <linux/splice.h> 19#include <linux/splice.h>
27#include <linux/proc_fs.h>
28#include <linux/stat.h>
29#include <linux/fcntl.h> 20#include <linux/fcntl.h>
30#include <linux/net.h>
31#include <linux/unistd.h>
32#include <linux/slab.h>
33#include <linux/pagemap.h>
34#include <linux/in.h>
35#include <linux/module.h>
36#include <linux/namei.h> 21#include <linux/namei.h>
37#include <linux/vfs.h>
38#include <linux/delay.h> 22#include <linux/delay.h>
39#include <linux/sunrpc/svc.h>
40#include <linux/nfsd/nfsd.h>
41#ifdef CONFIG_NFSD_V3
42#include <linux/nfs3.h>
43#include <linux/nfsd/xdr3.h>
44#endif /* CONFIG_NFSD_V3 */
45#include <linux/nfsd/nfsfh.h>
46#include <linux/quotaops.h>
47#include <linux/fsnotify.h> 23#include <linux/fsnotify.h>
48#include <linux/posix_acl.h>
49#include <linux/posix_acl_xattr.h> 24#include <linux/posix_acl_xattr.h>
50#include <linux/xattr.h> 25#include <linux/xattr.h>
26#include <linux/jhash.h>
27#include <linux/ima.h>
28#include <linux/slab.h>
29#include <asm/uaccess.h>
30#include <linux/exportfs.h>
31#include <linux/writeback.h>
32
33#ifdef CONFIG_NFSD_V3
34#include "xdr3.h"
35#endif /* CONFIG_NFSD_V3 */
36
51#ifdef CONFIG_NFSD_V4 37#ifdef CONFIG_NFSD_V4
52#include <linux/nfs4.h>
53#include <linux/nfs4_acl.h> 38#include <linux/nfs4_acl.h>
54#include <linux/nfsd_idmap.h> 39#include <linux/nfsd_idmap.h>
55#include <linux/security.h>
56#endif /* CONFIG_NFSD_V4 */ 40#endif /* CONFIG_NFSD_V4 */
57#include <linux/jhash.h>
58#include <linux/ima.h>
59 41
60#include <asm/uaccess.h> 42#include "nfsd.h"
43#include "vfs.h"
61 44
62#define NFSDDBG_FACILITY NFSDDBG_FILEOP 45#define NFSDDBG_FACILITY NFSDDBG_FILEOP
63 46
@@ -89,12 +72,6 @@ struct raparm_hbucket {
89#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) 72#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
90static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; 73static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE];
91 74
92static inline int
93nfsd_v4client(struct svc_rqst *rq)
94{
95 return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
96}
97
98/* 75/*
99 * Called from nfsd_lookup and encode_dirent. Check if we have crossed 76 * Called from nfsd_lookup and encode_dirent. Check if we have crossed
100 * a mount point. 77 * a mount point.
@@ -116,8 +93,16 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
116 93
117 exp2 = rqst_exp_get_by_name(rqstp, &path); 94 exp2 = rqst_exp_get_by_name(rqstp, &path);
118 if (IS_ERR(exp2)) { 95 if (IS_ERR(exp2)) {
119 if (PTR_ERR(exp2) != -ENOENT) 96 err = PTR_ERR(exp2);
120 err = PTR_ERR(exp2); 97 /*
98 * We normally allow NFS clients to continue
99 * "underneath" a mountpoint that is not exported.
100 * The exception is V4ROOT, where no traversal is ever
101 * allowed without an explicit export of the new
102 * directory.
103 */
104 if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT))
105 err = 0;
121 path_put(&path); 106 path_put(&path);
122 goto out; 107 goto out;
123 } 108 }
@@ -141,6 +126,53 @@ out:
141 return err; 126 return err;
142} 127}
143 128
129static void follow_to_parent(struct path *path)
130{
131 struct dentry *dp;
132
133 while (path->dentry == path->mnt->mnt_root && follow_up(path))
134 ;
135 dp = dget_parent(path->dentry);
136 dput(path->dentry);
137 path->dentry = dp;
138}
139
140static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp)
141{
142 struct svc_export *exp2;
143 struct path path = {.mnt = mntget((*exp)->ex_path.mnt),
144 .dentry = dget(dparent)};
145
146 follow_to_parent(&path);
147
148 exp2 = rqst_exp_parent(rqstp, &path);
149 if (PTR_ERR(exp2) == -ENOENT) {
150 *dentryp = dget(dparent);
151 } else if (IS_ERR(exp2)) {
152 path_put(&path);
153 return PTR_ERR(exp2);
154 } else {
155 *dentryp = dget(path.dentry);
156 exp_put(*exp);
157 *exp = exp2;
158 }
159 path_put(&path);
160 return 0;
161}
162
163/*
164 * For nfsd purposes, we treat V4ROOT exports as though there was an
165 * export at *every* directory.
166 */
167int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
168{
169 if (d_mountpoint(dentry))
170 return 1;
171 if (!(exp->ex_flags & NFSEXP_V4ROOT))
172 return 0;
173 return dentry->d_inode != NULL;
174}
175
144__be32 176__be32
145nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, 177nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
146 const char *name, unsigned int len, 178 const char *name, unsigned int len,
@@ -169,35 +201,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
169 dentry = dget(dparent); 201 dentry = dget(dparent);
170 else if (dparent != exp->ex_path.dentry) 202 else if (dparent != exp->ex_path.dentry)
171 dentry = dget_parent(dparent); 203 dentry = dget_parent(dparent);
172 else if (!EX_NOHIDE(exp)) 204 else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp))
173 dentry = dget(dparent); /* .. == . just like at / */ 205 dentry = dget(dparent); /* .. == . just like at / */
174 else { 206 else {
175 /* checking mountpoint crossing is very different when stepping up */ 207 /* checking mountpoint crossing is very different when stepping up */
176 struct svc_export *exp2 = NULL; 208 host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry);
177 struct dentry *dp; 209 if (host_err)
178 struct path path = {.mnt = mntget(exp->ex_path.mnt),
179 .dentry = dget(dparent)};
180
181 while (path.dentry == path.mnt->mnt_root &&
182 follow_up(&path))
183 ;
184 dp = dget_parent(path.dentry);
185 dput(path.dentry);
186 path.dentry = dp;
187
188 exp2 = rqst_exp_parent(rqstp, &path);
189 if (PTR_ERR(exp2) == -ENOENT) {
190 dentry = dget(dparent);
191 } else if (IS_ERR(exp2)) {
192 host_err = PTR_ERR(exp2);
193 path_put(&path);
194 goto out_nfserr; 210 goto out_nfserr;
195 } else {
196 dentry = dget(path.dentry);
197 exp_put(exp);
198 exp = exp2;
199 }
200 path_put(&path);
201 } 211 }
202 } else { 212 } else {
203 fh_lock(fhp); 213 fh_lock(fhp);
@@ -208,7 +218,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
208 /* 218 /*
209 * check if we have crossed a mount point ... 219 * check if we have crossed a mount point ...
210 */ 220 */
211 if (d_mountpoint(dentry)) { 221 if (nfsd_mountpoint(dentry, exp)) {
212 if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { 222 if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
213 dput(dentry); 223 dput(dentry);
214 goto out_nfserr; 224 goto out_nfserr;
@@ -263,6 +273,32 @@ out:
263 return err; 273 return err;
264} 274}
265 275
276/*
277 * Commit metadata changes to stable storage.
278 */
279static int
280commit_metadata(struct svc_fh *fhp)
281{
282 struct inode *inode = fhp->fh_dentry->d_inode;
283 const struct export_operations *export_ops = inode->i_sb->s_export_op;
284 int error = 0;
285
286 if (!EX_ISSYNC(fhp->fh_export))
287 return 0;
288
289 if (export_ops->commit_metadata) {
290 error = export_ops->commit_metadata(inode);
291 } else {
292 struct writeback_control wbc = {
293 .sync_mode = WB_SYNC_ALL,
294 .nr_to_write = 0, /* metadata only */
295 };
296
297 error = sync_inode(inode, &wbc);
298 }
299
300 return error;
301}
266 302
267/* 303/*
268 * Set various file attributes. 304 * Set various file attributes.
@@ -353,7 +389,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
353 * If we are changing the size of the file, then 389 * If we are changing the size of the file, then
354 * we need to break all leases. 390 * we need to break all leases.
355 */ 391 */
356 host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); 392 host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
357 if (host_err == -EWOULDBLOCK) 393 if (host_err == -EWOULDBLOCK)
358 host_err = -ETIMEDOUT; 394 host_err = -ETIMEDOUT;
359 if (host_err) /* ENOMEM or EWOULDBLOCK */ 395 if (host_err) /* ENOMEM or EWOULDBLOCK */
@@ -369,7 +405,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
369 put_write_access(inode); 405 put_write_access(inode);
370 goto out_nfserr; 406 goto out_nfserr;
371 } 407 }
372 vfs_dq_init(inode);
373 } 408 }
374 409
375 /* sanitize the mode change */ 410 /* sanitize the mode change */
@@ -726,7 +761,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
726 * Check to see if there are any leases on this file. 761 * Check to see if there are any leases on this file.
727 * This may block while leases are broken. 762 * This may block while leases are broken.
728 */ 763 */
729 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0)); 764 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
730 if (host_err == -EWOULDBLOCK) 765 if (host_err == -EWOULDBLOCK)
731 host_err = -ETIMEDOUT; 766 host_err = -ETIMEDOUT;
732 if (host_err) /* NOMEM or WOULDBLOCK */ 767 if (host_err) /* NOMEM or WOULDBLOCK */
@@ -737,15 +772,13 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
737 flags = O_RDWR|O_LARGEFILE; 772 flags = O_RDWR|O_LARGEFILE;
738 else 773 else
739 flags = O_WRONLY|O_LARGEFILE; 774 flags = O_WRONLY|O_LARGEFILE;
740
741 vfs_dq_init(inode);
742 } 775 }
743 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), 776 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
744 flags, current_cred()); 777 flags, current_cred());
745 if (IS_ERR(*filp)) 778 if (IS_ERR(*filp))
746 host_err = PTR_ERR(*filp); 779 host_err = PTR_ERR(*filp);
747 else 780 else
748 ima_counts_get(*filp); 781 host_err = ima_file_check(*filp, access);
749out_nfserr: 782out_nfserr:
750 err = nfserrno(host_err); 783 err = nfserrno(host_err);
751out: 784out:
@@ -763,46 +796,6 @@ nfsd_close(struct file *filp)
763} 796}
764 797
765/* 798/*
766 * Sync a file
767 * As this calls fsync (not fdatasync) there is no need for a write_inode
768 * after it.
769 */
770static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
771 const struct file_operations *fop)
772{
773 struct inode *inode = dp->d_inode;
774 int (*fsync) (struct file *, struct dentry *, int);
775 int err;
776
777 err = filemap_fdatawrite(inode->i_mapping);
778 if (err == 0 && fop && (fsync = fop->fsync))
779 err = fsync(filp, dp, 0);
780 if (err == 0)
781 err = filemap_fdatawait(inode->i_mapping);
782
783 return err;
784}
785
786static int
787nfsd_sync(struct file *filp)
788{
789 int err;
790 struct inode *inode = filp->f_path.dentry->d_inode;
791 dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
792 mutex_lock(&inode->i_mutex);
793 err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
794 mutex_unlock(&inode->i_mutex);
795
796 return err;
797}
798
799int
800nfsd_sync_dir(struct dentry *dp)
801{
802 return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
803}
804
805/*
806 * Obtain the readahead parameters for the file 799 * Obtain the readahead parameters for the file
807 * specified by (dev, ino). 800 * specified by (dev, ino).
808 */ 801 */
@@ -1005,7 +998,7 @@ static int wait_for_concurrent_writes(struct file *file)
1005 998
1006 if (inode->i_state & I_DIRTY) { 999 if (inode->i_state & I_DIRTY) {
1007 dprintk("nfsd: write sync %d\n", task_pid_nr(current)); 1000 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
1008 err = nfsd_sync(file); 1001 err = vfs_fsync(file, file->f_path.dentry, 0);
1009 } 1002 }
1010 last_ino = inode->i_ino; 1003 last_ino = inode->i_ino;
1011 last_dev = inode->i_sb->s_dev; 1004 last_dev = inode->i_sb->s_dev;
@@ -1153,8 +1146,9 @@ out:
1153#ifdef CONFIG_NFSD_V3 1146#ifdef CONFIG_NFSD_V3
1154/* 1147/*
1155 * Commit all pending writes to stable storage. 1148 * Commit all pending writes to stable storage.
1156 * Strictly speaking, we could sync just the indicated file region here, 1149 *
1157 * but there's currently no way we can ask the VFS to do so. 1150 * Note: we only guarantee that data that lies within the range specified
1151 * by the 'offset' and 'count' parameters will be synced.
1158 * 1152 *
1159 * Unfortunately we cannot lock the file to make sure we return full WCC 1153 * Unfortunately we cannot lock the file to make sure we return full WCC
1160 * data to the client, as locking happens lower down in the filesystem. 1154 * data to the client, as locking happens lower down in the filesystem.
@@ -1164,23 +1158,32 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1164 loff_t offset, unsigned long count) 1158 loff_t offset, unsigned long count)
1165{ 1159{
1166 struct file *file; 1160 struct file *file;
1167 __be32 err; 1161 loff_t end = LLONG_MAX;
1162 __be32 err = nfserr_inval;
1168 1163
1169 if ((u64)count > ~(u64)offset) 1164 if (offset < 0)
1170 return nfserr_inval; 1165 goto out;
1166 if (count != 0) {
1167 end = offset + (loff_t)count - 1;
1168 if (end < offset)
1169 goto out;
1170 }
1171 1171
1172 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); 1172 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
1173 if (err) 1173 if (err)
1174 return err; 1174 goto out;
1175 if (EX_ISSYNC(fhp->fh_export)) { 1175 if (EX_ISSYNC(fhp->fh_export)) {
1176 if (file->f_op && file->f_op->fsync) { 1176 int err2 = vfs_fsync_range(file, file->f_path.dentry,
1177 err = nfserrno(nfsd_sync(file)); 1177 offset, end, 0);
1178 } else { 1178
1179 if (err2 != -EINVAL)
1180 err = nfserrno(err2);
1181 else
1179 err = nfserr_notsupp; 1182 err = nfserr_notsupp;
1180 }
1181 } 1183 }
1182 1184
1183 nfsd_close(file); 1185 nfsd_close(file);
1186out:
1184 return err; 1187 return err;
1185} 1188}
1186#endif /* CONFIG_NFSD_V3 */ 1189#endif /* CONFIG_NFSD_V3 */
@@ -1333,12 +1336,14 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1333 goto out_nfserr; 1336 goto out_nfserr;
1334 } 1337 }
1335 1338
1336 if (EX_ISSYNC(fhp->fh_export)) { 1339 err = nfsd_create_setattr(rqstp, resfhp, iap);
1337 err = nfserrno(nfsd_sync_dir(dentry));
1338 write_inode_now(dchild->d_inode, 1);
1339 }
1340 1340
1341 err2 = nfsd_create_setattr(rqstp, resfhp, iap); 1341 /*
1342 * nfsd_setattr already committed the child. Transactional filesystems
1343 * had a chance to commit changes for both parent and child
1344 * simultaneously, making the following commit_metadata a noop.
1345 */
1346 err2 = nfserrno(commit_metadata(fhp));
1342 if (err2) 1347 if (err2)
1343 err = err2; 1348 err = err2;
1344 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1349 mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1370,7 +1375,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1370 struct dentry *dentry, *dchild = NULL; 1375 struct dentry *dentry, *dchild = NULL;
1371 struct inode *dirp; 1376 struct inode *dirp;
1372 __be32 err; 1377 __be32 err;
1373 __be32 err2;
1374 int host_err; 1378 int host_err;
1375 __u32 v_mtime=0, v_atime=0; 1379 __u32 v_mtime=0, v_atime=0;
1376 1380
@@ -1465,11 +1469,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1465 if (created) 1469 if (created)
1466 *created = 1; 1470 *created = 1;
1467 1471
1468 if (EX_ISSYNC(fhp->fh_export)) {
1469 err = nfserrno(nfsd_sync_dir(dentry));
1470 /* setattr will sync the child (or not) */
1471 }
1472
1473 nfsd_check_ignore_resizing(iap); 1472 nfsd_check_ignore_resizing(iap);
1474 1473
1475 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1474 if (createmode == NFS3_CREATE_EXCLUSIVE) {
@@ -1484,9 +1483,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1484 } 1483 }
1485 1484
1486 set_attr: 1485 set_attr:
1487 err2 = nfsd_create_setattr(rqstp, resfhp, iap); 1486 err = nfsd_create_setattr(rqstp, resfhp, iap);
1488 if (err2) 1487
1489 err = err2; 1488 /*
1489 * nfsd_setattr already committed the child (and possibly also the parent).
1490 */
1491 if (!err)
1492 err = nfserrno(commit_metadata(fhp));
1490 1493
1491 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1494 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1492 /* 1495 /*
@@ -1601,12 +1604,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1601 } 1604 }
1602 } else 1605 } else
1603 host_err = vfs_symlink(dentry->d_inode, dnew, path); 1606 host_err = vfs_symlink(dentry->d_inode, dnew, path);
1604
1605 if (!host_err) {
1606 if (EX_ISSYNC(fhp->fh_export))
1607 host_err = nfsd_sync_dir(dentry);
1608 }
1609 err = nfserrno(host_err); 1607 err = nfserrno(host_err);
1608 if (!err)
1609 err = nfserrno(commit_metadata(fhp));
1610 fh_unlock(fhp); 1610 fh_unlock(fhp);
1611 1611
1612 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1612 mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1668,11 +1668,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1668 } 1668 }
1669 host_err = vfs_link(dold, dirp, dnew); 1669 host_err = vfs_link(dold, dirp, dnew);
1670 if (!host_err) { 1670 if (!host_err) {
1671 if (EX_ISSYNC(ffhp->fh_export)) { 1671 err = nfserrno(commit_metadata(ffhp));
1672 err = nfserrno(nfsd_sync_dir(ddir)); 1672 if (!err)
1673 write_inode_now(dest, 1); 1673 err = nfserrno(commit_metadata(tfhp));
1674 }
1675 err = 0;
1676 } else { 1674 } else {
1677 if (host_err == -EXDEV && rqstp->rq_vers == 2) 1675 if (host_err == -EXDEV && rqstp->rq_vers == 2)
1678 err = nfserr_acces; 1676 err = nfserr_acces;
@@ -1768,10 +1766,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1768 goto out_dput_new; 1766 goto out_dput_new;
1769 1767
1770 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1768 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1771 if (!host_err && EX_ISSYNC(tfhp->fh_export)) { 1769 if (!host_err) {
1772 host_err = nfsd_sync_dir(tdentry); 1770 host_err = commit_metadata(tfhp);
1773 if (!host_err) 1771 if (!host_err)
1774 host_err = nfsd_sync_dir(fdentry); 1772 host_err = commit_metadata(ffhp);
1775 } 1773 }
1776 1774
1777 mnt_drop_write(ffhp->fh_export->ex_path.mnt); 1775 mnt_drop_write(ffhp->fh_export->ex_path.mnt);
@@ -1852,12 +1850,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1852 1850
1853 dput(rdentry); 1851 dput(rdentry);
1854 1852
1855 if (host_err) 1853 if (!host_err)
1856 goto out_drop; 1854 host_err = commit_metadata(fhp);
1857 if (EX_ISSYNC(fhp->fh_export))
1858 host_err = nfsd_sync_dir(dentry);
1859 1855
1860out_drop:
1861 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1856 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1862out_nfserr: 1857out_nfserr:
1863 err = nfserrno(host_err); 1858 err = nfserrno(host_err);
@@ -2124,8 +2119,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2124 */ 2119 */
2125 path.mnt = exp->ex_path.mnt; 2120 path.mnt = exp->ex_path.mnt;
2126 path.dentry = dentry; 2121 path.dentry = dentry;
2127 err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
2128 IMA_COUNT_LEAVE);
2129nfsd_out: 2122nfsd_out:
2130 return err? nfserrno(err) : 0; 2123 return err? nfserrno(err) : 0;
2131} 2124}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
new file mode 100644
index 000000000000..4b1de0a9ea75
--- /dev/null
+++ b/fs/nfsd/vfs.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
3 */
4
5#ifndef LINUX_NFSD_VFS_H
6#define LINUX_NFSD_VFS_H
7
8#include "nfsfh.h"
9
10/*
11 * Flags for nfsd_permission
12 */
13#define NFSD_MAY_NOP 0
14#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */
15#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */
16#define NFSD_MAY_READ 4 /* == MAY_READ */
17#define NFSD_MAY_SATTR 8
18#define NFSD_MAY_TRUNC 16
19#define NFSD_MAY_LOCK 32
20#define NFSD_MAY_OWNER_OVERRIDE 64
21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file */
22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
23
24#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
25#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
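/* Editor's usage note: the flags are single bits and combine by OR; for
 * example, a read open that should succeed for the file's owner even
 * without read permission would pass
 *	NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE
 * to nfsd_permission().
 */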
26
27/*
28 * Callback function for readdir
29 */
30typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
31
32/* nfsd/vfs.c */
33int fh_lock_parent(struct svc_fh *, struct dentry *);
34int nfsd_racache_init(int);
35void nfsd_racache_shutdown(void);
36int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
37 struct svc_export **expp);
38__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
39 const char *, unsigned int, struct svc_fh *);
40__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
41 const char *, unsigned int,
42 struct svc_export **, struct dentry **);
43__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
44 struct iattr *, int, time_t);
45int nfsd_mountpoint(struct dentry *, struct svc_export *);
46#ifdef CONFIG_NFSD_V4
47__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
48 struct nfs4_acl *);
49int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
50#endif /* CONFIG_NFSD_V4 */
51__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
52 char *name, int len, struct iattr *attrs,
53 int type, dev_t rdev, struct svc_fh *res);
54#ifdef CONFIG_NFSD_V3
55__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
56__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *,
57 char *name, int len, struct iattr *attrs,
58 struct svc_fh *res, int createmode,
59 u32 *verifier, int *truncp, int *created);
60__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
61 loff_t, unsigned long);
62#endif /* CONFIG_NFSD_V3 */
63__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int,
64 int, struct file **);
65void nfsd_close(struct file *);
66__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *,
67 loff_t, struct kvec *, int, unsigned long *);
68__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
69 loff_t, struct kvec *,int, unsigned long *, int *);
70__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
71 char *, int *);
72__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
73 char *name, int len, char *path, int plen,
74 struct svc_fh *res, struct iattr *);
75__be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
76 char *, int, struct svc_fh *);
77__be32 nfsd_rename(struct svc_rqst *,
78 struct svc_fh *, char *, int,
79 struct svc_fh *, char *, int);
80__be32 nfsd_remove(struct svc_rqst *,
81 struct svc_fh *, char *, int);
82__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
83 char *name, int len);
84int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
85 unsigned long size);
86__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
87 loff_t *, struct readdir_cd *, filldir_t);
88__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
89 struct kstatfs *, int access);
90
91int nfsd_notify_change(struct inode *, struct iattr *);
92__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
93 struct dentry *, int);
94int nfsd_sync_dir(struct dentry *dp);
95
96#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
97struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
98int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
99#endif
100
101#endif /* LINUX_NFSD_VFS_H */
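NFSD_MAY_CREATE and NFSD_MAY_REMOVE above are just OR-ed combinations of the primitive NFSD_MAY_* bits: creating needs search (EXEC) plus modify (WRITE) on the parent, removal additionally allows truncation. A standalone toy sketch of how such a bitmask permission check composes (all names here are invented stand-ins, not the nfsd implementation):

#include <stdio.h>

#define MAY_EXEC	1
#define MAY_WRITE	2
#define MAY_READ	4
#define MAY_TRUNC	16

#define MAY_CREATE	(MAY_EXEC | MAY_WRITE)
#define MAY_REMOVE	(MAY_EXEC | MAY_WRITE | MAY_TRUNC)

/* A request passes only if every bit it asks for has been granted. */
static int check(unsigned granted, unsigned wanted)
{
	return (granted & wanted) == wanted;
}

int main(void)
{
	unsigned granted = MAY_EXEC | MAY_WRITE | MAY_READ;

	printf("create: %s\n", check(granted, MAY_CREATE) ? "ok" : "denied");
	printf("remove: %s\n", check(granted, MAY_REMOVE) ? "ok" : "denied");
	return 0;
}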
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
new file mode 100644
index 000000000000..53b1863dd8f6
--- /dev/null
+++ b/fs/nfsd/xdr.h
@@ -0,0 +1,173 @@
1/* XDR types for nfsd. This is mainly a typing exercise. */
2
3#ifndef LINUX_NFSD_H
4#define LINUX_NFSD_H
5
6#include <linux/vfs.h>
7#include "nfsd.h"
8#include "nfsfh.h"
9
10struct nfsd_fhandle {
11 struct svc_fh fh;
12};
13
14struct nfsd_sattrargs {
15 struct svc_fh fh;
16 struct iattr attrs;
17};
18
19struct nfsd_diropargs {
20 struct svc_fh fh;
21 char * name;
22 unsigned int len;
23};
24
25struct nfsd_readargs {
26 struct svc_fh fh;
27 __u32 offset;
28 __u32 count;
29 int vlen;
30};
31
32struct nfsd_writeargs {
33 svc_fh fh;
34 __u32 offset;
35 int len;
36 int vlen;
37};
38
39struct nfsd_createargs {
40 struct svc_fh fh;
41 char * name;
42 unsigned int len;
43 struct iattr attrs;
44};
45
46struct nfsd_renameargs {
47 struct svc_fh ffh;
48 char * fname;
49 unsigned int flen;
50 struct svc_fh tfh;
51 char * tname;
52 unsigned int tlen;
53};
54
55struct nfsd_readlinkargs {
56 struct svc_fh fh;
57 char * buffer;
58};
59
60struct nfsd_linkargs {
61 struct svc_fh ffh;
62 struct svc_fh tfh;
63 char * tname;
64 unsigned int tlen;
65};
66
67struct nfsd_symlinkargs {
68 struct svc_fh ffh;
69 char * fname;
70 unsigned int flen;
71 char * tname;
72 unsigned int tlen;
73 struct iattr attrs;
74};
75
76struct nfsd_readdirargs {
77 struct svc_fh fh;
78 __u32 cookie;
79 __u32 count;
80 __be32 * buffer;
81};
82
83struct nfsd_attrstat {
84 struct svc_fh fh;
85 struct kstat stat;
86};
87
88struct nfsd_diropres {
89 struct svc_fh fh;
90 struct kstat stat;
91};
92
93struct nfsd_readlinkres {
94 int len;
95};
96
97struct nfsd_readres {
98 struct svc_fh fh;
99 unsigned long count;
100 struct kstat stat;
101};
102
103struct nfsd_readdirres {
104 int count;
105
106 struct readdir_cd common;
107 __be32 * buffer;
108 int buflen;
109 __be32 * offset;
110};
111
112struct nfsd_statfsres {
113 struct kstatfs stats;
114};
115
116/*
117 * Storage requirements for XDR arguments and results.
118 */
119union nfsd_xdrstore {
120 struct nfsd_sattrargs sattr;
121 struct nfsd_diropargs dirop;
122 struct nfsd_readargs read;
123 struct nfsd_writeargs write;
124 struct nfsd_createargs create;
125 struct nfsd_renameargs rename;
126 struct nfsd_linkargs link;
127 struct nfsd_symlinkargs symlink;
128 struct nfsd_readdirargs readdir;
129};
130
131#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore)
132
133
134int nfssvc_decode_void(struct svc_rqst *, __be32 *, void *);
135int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
136int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *,
137 struct nfsd_sattrargs *);
138int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *,
139 struct nfsd_diropargs *);
140int nfssvc_decode_readargs(struct svc_rqst *, __be32 *,
141 struct nfsd_readargs *);
142int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *,
143 struct nfsd_writeargs *);
144int nfssvc_decode_createargs(struct svc_rqst *, __be32 *,
145 struct nfsd_createargs *);
146int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *,
147 struct nfsd_renameargs *);
148int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *,
149 struct nfsd_readlinkargs *);
150int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *,
151 struct nfsd_linkargs *);
152int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *,
153 struct nfsd_symlinkargs *);
154int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *,
155 struct nfsd_readdirargs *);
156int nfssvc_encode_void(struct svc_rqst *, __be32 *, void *);
157int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *, struct nfsd_attrstat *);
158int nfssvc_encode_diropres(struct svc_rqst *, __be32 *, struct nfsd_diropres *);
159int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *, struct nfsd_readlinkres *);
160int nfssvc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd_readres *);
161int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *, struct nfsd_statfsres *);
162int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *, struct nfsd_readdirres *);
163
164int nfssvc_encode_entry(void *, const char *name,
165 int namlen, loff_t offset, u64 ino, unsigned int);
166
167int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
168
169/* Helper functions for NFSv2 ACL code */
170__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp);
171__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp);
172
173#endif /* LINUX_NFSD_H */
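NFS2_SVC_XDRSIZE sizes the per-request decode scratch area as a union over every argument struct, so one allocation fits any NFSv2 procedure. A standalone sketch of why a union gives the size of its largest member (toy struct names, not the real layouts):

#include <stdio.h>

/* Toy stand-ins for two argument structs of different sizes. */
struct read_args  { unsigned offset, count; };
struct write_args { unsigned offset; int len, vlen; char pad[64]; };

/* One scratch union serves every procedure: its size is that of the
 * largest member (possibly rounded up for alignment). */
union xdrstore {
	struct read_args  read;
	struct write_args write;
};

int main(void)
{
	printf("union=%zu read=%zu write=%zu\n", sizeof(union xdrstore),
	       sizeof(struct read_args), sizeof(struct write_args));
	return 0;
}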
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
new file mode 100644
index 000000000000..7df980eb0562
--- /dev/null
+++ b/fs/nfsd/xdr3.h
@@ -0,0 +1,344 @@
1/*
2 * XDR types for NFSv3 in nfsd.
3 *
4 * Copyright (C) 1996-1998, Olaf Kirch <okir@monad.swb.de>
5 */
6
7#ifndef _LINUX_NFSD_XDR3_H
8#define _LINUX_NFSD_XDR3_H
9
10#include "xdr.h"
11
12struct nfsd3_sattrargs {
13 struct svc_fh fh;
14 struct iattr attrs;
15 int check_guard;
16 time_t guardtime;
17};
18
19struct nfsd3_diropargs {
20 struct svc_fh fh;
21 char * name;
22 unsigned int len;
23};
24
25struct nfsd3_accessargs {
26 struct svc_fh fh;
27 unsigned int access;
28};
29
30struct nfsd3_readargs {
31 struct svc_fh fh;
32 __u64 offset;
33 __u32 count;
34 int vlen;
35};
36
37struct nfsd3_writeargs {
38 svc_fh fh;
39 __u64 offset;
40 __u32 count;
41 int stable;
42 __u32 len;
43 int vlen;
44};
45
46struct nfsd3_createargs {
47 struct svc_fh fh;
48 char * name;
49 unsigned int len;
50 int createmode;
51 struct iattr attrs;
52 __be32 * verf;
53};
54
55struct nfsd3_mknodargs {
56 struct svc_fh fh;
57 char * name;
58 unsigned int len;
59 __u32 ftype;
60 __u32 major, minor;
61 struct iattr attrs;
62};
63
64struct nfsd3_renameargs {
65 struct svc_fh ffh;
66 char * fname;
67 unsigned int flen;
68 struct svc_fh tfh;
69 char * tname;
70 unsigned int tlen;
71};
72
73struct nfsd3_readlinkargs {
74 struct svc_fh fh;
75 char * buffer;
76};
77
78struct nfsd3_linkargs {
79 struct svc_fh ffh;
80 struct svc_fh tfh;
81 char * tname;
82 unsigned int tlen;
83};
84
85struct nfsd3_symlinkargs {
86 struct svc_fh ffh;
87 char * fname;
88 unsigned int flen;
89 char * tname;
90 unsigned int tlen;
91 struct iattr attrs;
92};
93
94struct nfsd3_readdirargs {
95 struct svc_fh fh;
96 __u64 cookie;
97 __u32 dircount;
98 __u32 count;
99 __be32 * verf;
100 __be32 * buffer;
101};
102
103struct nfsd3_commitargs {
104 struct svc_fh fh;
105 __u64 offset;
106 __u32 count;
107};
108
109struct nfsd3_getaclargs {
110 struct svc_fh fh;
111 int mask;
112};
113
114struct posix_acl;
115struct nfsd3_setaclargs {
116 struct svc_fh fh;
117 int mask;
118 struct posix_acl *acl_access;
119 struct posix_acl *acl_default;
120};
121
122struct nfsd3_attrstat {
123 __be32 status;
124 struct svc_fh fh;
125 struct kstat stat;
126};
127
128/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */
129struct nfsd3_diropres {
130 __be32 status;
131 struct svc_fh dirfh;
132 struct svc_fh fh;
133};
134
135struct nfsd3_accessres {
136 __be32 status;
137 struct svc_fh fh;
138 __u32 access;
139};
140
141struct nfsd3_readlinkres {
142 __be32 status;
143 struct svc_fh fh;
144 __u32 len;
145};
146
147struct nfsd3_readres {
148 __be32 status;
149 struct svc_fh fh;
150 unsigned long count;
151 int eof;
152};
153
154struct nfsd3_writeres {
155 __be32 status;
156 struct svc_fh fh;
157 unsigned long count;
158 int committed;
159};
160
161struct nfsd3_renameres {
162 __be32 status;
163 struct svc_fh ffh;
164 struct svc_fh tfh;
165};
166
167struct nfsd3_linkres {
168 __be32 status;
169 struct svc_fh tfh;
170 struct svc_fh fh;
171};
172
173struct nfsd3_readdirres {
174 __be32 status;
175 struct svc_fh fh;
176 int count;
177 __be32 verf[2];
178
179 struct readdir_cd common;
180 __be32 * buffer;
181 int buflen;
182 __be32 * offset;
183 __be32 * offset1;
184 struct svc_rqst * rqstp;
185
186};
187
188struct nfsd3_fsstatres {
189 __be32 status;
190 struct kstatfs stats;
191 __u32 invarsec;
192};
193
194struct nfsd3_fsinfores {
195 __be32 status;
196 __u32 f_rtmax;
197 __u32 f_rtpref;
198 __u32 f_rtmult;
199 __u32 f_wtmax;
200 __u32 f_wtpref;
201 __u32 f_wtmult;
202 __u32 f_dtpref;
203 __u64 f_maxfilesize;
204 __u32 f_properties;
205};
206
207struct nfsd3_pathconfres {
208 __be32 status;
209 __u32 p_link_max;
210 __u32 p_name_max;
211 __u32 p_no_trunc;
212 __u32 p_chown_restricted;
213 __u32 p_case_insensitive;
214 __u32 p_case_preserving;
215};
216
217struct nfsd3_commitres {
218 __be32 status;
219 struct svc_fh fh;
220};
221
222struct nfsd3_getaclres {
223 __be32 status;
224 struct svc_fh fh;
225 int mask;
226 struct posix_acl *acl_access;
227 struct posix_acl *acl_default;
228};
229
230/* dummy type for release */
231struct nfsd3_fhandle_pair {
232 __u32 dummy;
233 struct svc_fh fh1;
234 struct svc_fh fh2;
235};
236
237/*
238 * Storage requirements for XDR arguments and results.
239 */
240union nfsd3_xdrstore {
241 struct nfsd3_sattrargs sattrargs;
242 struct nfsd3_diropargs diropargs;
243 struct nfsd3_readargs readargs;
244 struct nfsd3_writeargs writeargs;
245 struct nfsd3_createargs createargs;
246 struct nfsd3_renameargs renameargs;
247 struct nfsd3_linkargs linkargs;
248 struct nfsd3_symlinkargs symlinkargs;
249 struct nfsd3_readdirargs readdirargs;
250 struct nfsd3_diropres diropres;
251 struct nfsd3_accessres accessres;
252 struct nfsd3_readlinkres readlinkres;
253 struct nfsd3_readres readres;
254 struct nfsd3_writeres writeres;
255 struct nfsd3_renameres renameres;
256 struct nfsd3_linkres linkres;
257 struct nfsd3_readdirres readdirres;
258 struct nfsd3_fsstatres fsstatres;
259 struct nfsd3_fsinfores fsinfores;
260 struct nfsd3_pathconfres pathconfres;
261 struct nfsd3_commitres commitres;
262 struct nfsd3_getaclres getaclres;
263};
264
265#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore)
266
267int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
268int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *,
269 struct nfsd3_sattrargs *);
270int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *,
271 struct nfsd3_diropargs *);
272int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *,
273 struct nfsd3_accessargs *);
274int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *,
275 struct nfsd3_readargs *);
276int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *,
277 struct nfsd3_writeargs *);
278int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *,
279 struct nfsd3_createargs *);
280int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *,
281 struct nfsd3_createargs *);
282int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *,
283 struct nfsd3_mknodargs *);
284int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *,
285 struct nfsd3_renameargs *);
286int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *,
287 struct nfsd3_readlinkargs *);
288int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *,
289 struct nfsd3_linkargs *);
290int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *,
291 struct nfsd3_symlinkargs *);
292int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *,
293 struct nfsd3_readdirargs *);
294int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *,
295 struct nfsd3_readdirargs *);
296int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *,
297 struct nfsd3_commitargs *);
298int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
299int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *,
300 struct nfsd3_attrstat *);
301int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *,
302 struct nfsd3_attrstat *);
303int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *,
304 struct nfsd3_diropres *);
305int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *,
306 struct nfsd3_accessres *);
307int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *,
308 struct nfsd3_readlinkres *);
309int nfs3svc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd3_readres *);
310int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *, struct nfsd3_writeres *);
311int nfs3svc_encode_createres(struct svc_rqst *, __be32 *,
312 struct nfsd3_diropres *);
313int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *,
314 struct nfsd3_renameres *);
315int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *,
316 struct nfsd3_linkres *);
317int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *,
318 struct nfsd3_readdirres *);
319int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *,
320 struct nfsd3_fsstatres *);
321int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *,
322 struct nfsd3_fsinfores *);
323int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *,
324 struct nfsd3_pathconfres *);
325int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *,
326 struct nfsd3_commitres *);
327
328int nfs3svc_release_fhandle(struct svc_rqst *, __be32 *,
329 struct nfsd3_attrstat *);
330int nfs3svc_release_fhandle2(struct svc_rqst *, __be32 *,
331 struct nfsd3_fhandle_pair *);
332int nfs3svc_encode_entry(void *, const char *name,
333 int namlen, loff_t offset, u64 ino,
334 unsigned int);
335int nfs3svc_encode_entry_plus(void *, const char *name,
336 int namlen, loff_t offset, u64 ino,
337 unsigned int);
338/* Helper functions for NFSv3 ACL code */
339__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p,
340 struct svc_fh *fhp);
341__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp);
342
343
344#endif /* _LINUX_NFSD_XDR3_H */
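nfsd3_readdirargs above carries two limits, dircount and count, because READDIRPLUS bounds the directory entry data and the total encoded reply separately. A toy model of encoding entries under both budgets (all sizes and the XDR rounding are rough stand-ins, invented for illustration):

#include <stdio.h>
#include <string.h>

struct budget { unsigned dircount, count; };

/* Charge one entry against both budgets; stop once either runs out. */
static int emit_entry(struct budget *b, const char *name, unsigned attr_bytes)
{
	unsigned dirbytes = 24 + (((unsigned)strlen(name) + 3) & ~3u);

	if (dirbytes > b->dircount || dirbytes + attr_bytes > b->count)
		return 0;
	b->dircount -= dirbytes;
	b->count -= dirbytes + attr_bytes;
	return 1;
}

int main(void)
{
	struct budget b = { .dircount = 64, .count = 200 };
	const char *names[] = { "alpha", "beta", "gamma", "delta" };

	for (unsigned i = 0; i < 4; i++)
		printf("%s: %s\n", names[i],
		       emit_entry(&b, names[i], 88) ? "sent" : "deferred");
	return 0;
}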
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
new file mode 100644
index 000000000000..efa337739534
--- /dev/null
+++ b/fs/nfsd/xdr4.h
@@ -0,0 +1,562 @@
1/*
2 * Server-side types for NFSv4.
3 *
4 * Copyright (c) 2002 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Kendrick Smith <kmsmith@umich.edu>
8 * Andy Adamson <andros@umich.edu>
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
24 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
25 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
30 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 *
35 */
36
37#ifndef _LINUX_NFSD_XDR4_H
38#define _LINUX_NFSD_XDR4_H
39
40#include "state.h"
41#include "nfsd.h"
42
43#define NFSD4_MAX_TAGLEN 128
44#define XDR_LEN(n) (((n) + 3) & ~3)
45
46struct nfsd4_compound_state {
47 struct svc_fh current_fh;
48 struct svc_fh save_fh;
49 struct nfs4_stateowner *replay_owner;
50 /* For sessions DRC */
51 struct nfsd4_session *session;
52 struct nfsd4_slot *slot;
53 __be32 *datap;
54 size_t iovlen;
55 u32 minorversion;
56 u32 status;
57};
58
59static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs)
60{
61 return cs->slot != NULL;
62}
63
64struct nfsd4_change_info {
65 u32 atomic;
66 bool change_supported;
67 u32 before_ctime_sec;
68 u32 before_ctime_nsec;
69 u64 before_change;
70 u32 after_ctime_sec;
71 u32 after_ctime_nsec;
72 u64 after_change;
73};
74
75struct nfsd4_access {
76 u32 ac_req_access; /* request */
77 u32 ac_supported; /* response */
78 u32 ac_resp_access; /* response */
79};
80
81struct nfsd4_close {
82 u32 cl_seqid; /* request */
83 stateid_t cl_stateid; /* request+response */
84 struct nfs4_stateowner * cl_stateowner; /* response */
85};
86
87struct nfsd4_commit {
88 u64 co_offset; /* request */
89 u32 co_count; /* request */
90 nfs4_verifier co_verf; /* response */
91};
92
93struct nfsd4_create {
94 u32 cr_namelen; /* request */
95 char * cr_name; /* request */
96 u32 cr_type; /* request */
97 union { /* request */
98 struct {
99 u32 namelen;
100 char *name;
101 } link; /* NF4LNK */
102 struct {
103 u32 specdata1;
104 u32 specdata2;
105 } dev; /* NF4BLK, NF4CHR */
106 } u;
107 u32 cr_bmval[3]; /* request */
108 struct iattr cr_iattr; /* request */
109 struct nfsd4_change_info cr_cinfo; /* response */
110 struct nfs4_acl *cr_acl;
111};
112#define cr_linklen u.link.namelen
113#define cr_linkname u.link.name
114#define cr_specdata1 u.dev.specdata1
115#define cr_specdata2 u.dev.specdata2
116
117struct nfsd4_delegreturn {
118 stateid_t dr_stateid;
119};
120
121struct nfsd4_getattr {
122 u32 ga_bmval[3]; /* request */
123 struct svc_fh *ga_fhp; /* response */
124};
125
126struct nfsd4_link {
127 u32 li_namelen; /* request */
128 char * li_name; /* request */
129 struct nfsd4_change_info li_cinfo; /* response */
130};
131
132struct nfsd4_lock_denied {
133 clientid_t ld_clientid;
134 struct nfs4_stateowner *ld_sop;
135 u64 ld_start;
136 u64 ld_length;
137 u32 ld_type;
138};
139
140struct nfsd4_lock {
141 /* request */
142 u32 lk_type;
143 u32 lk_reclaim; /* boolean */
144 u64 lk_offset;
145 u64 lk_length;
146 u32 lk_is_new;
147 union {
148 struct {
149 u32 open_seqid;
150 stateid_t open_stateid;
151 u32 lock_seqid;
152 clientid_t clientid;
153 struct xdr_netobj owner;
154 } new;
155 struct {
156 stateid_t lock_stateid;
157 u32 lock_seqid;
158 } old;
159 } v;
160
161 /* response */
162 union {
163 struct {
164 stateid_t stateid;
165 } ok;
166 struct nfsd4_lock_denied denied;
167 } u;
168 /* The lk_replay_owner is the open owner in the open_to_lock_owner
169 * case and the lock owner otherwise: */
170 struct nfs4_stateowner *lk_replay_owner;
171};
172#define lk_new_open_seqid v.new.open_seqid
173#define lk_new_open_stateid v.new.open_stateid
174#define lk_new_lock_seqid v.new.lock_seqid
175#define lk_new_clientid v.new.clientid
176#define lk_new_owner v.new.owner
177#define lk_old_lock_stateid v.old.lock_stateid
178#define lk_old_lock_seqid v.old.lock_seqid
179
180#define lk_rflags u.ok.rflags
181#define lk_resp_stateid u.ok.stateid
182#define lk_denied u.denied
183
184
185struct nfsd4_lockt {
186 u32 lt_type;
187 clientid_t lt_clientid;
188 struct xdr_netobj lt_owner;
189 u64 lt_offset;
190 u64 lt_length;
191 struct nfs4_stateowner * lt_stateowner;
192 struct nfsd4_lock_denied lt_denied;
193};
194
195
196struct nfsd4_locku {
197 u32 lu_type;
198 u32 lu_seqid;
199 stateid_t lu_stateid;
200 u64 lu_offset;
201 u64 lu_length;
202 struct nfs4_stateowner *lu_stateowner;
203};
204
205
206struct nfsd4_lookup {
207 u32 lo_len; /* request */
208 char * lo_name; /* request */
209};
210
211struct nfsd4_putfh {
212 u32 pf_fhlen; /* request */
213 char *pf_fhval; /* request */
214};
215
216struct nfsd4_open {
217 u32 op_claim_type; /* request */
218 struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */
219 u32 op_delegate_type; /* request - CLAIM_PREV only */
220 stateid_t op_delegate_stateid; /* request - response */
221 u32 op_create; /* request */
222 u32 op_createmode; /* request */
223 u32 op_bmval[3]; /* request */
224 struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
225 nfs4_verifier verf; /* EXCLUSIVE4 */
226 clientid_t op_clientid; /* request */
227 struct xdr_netobj op_owner; /* request */
228 u32 op_seqid; /* request */
229 u32 op_share_access; /* request */
230 u32 op_share_deny; /* request */
231 stateid_t op_stateid; /* response */
232 u32 op_recall; /* recall */
233 struct nfsd4_change_info op_cinfo; /* response */
234 u32 op_rflags; /* response */
235 int op_truncate; /* used during processing */
236 struct nfs4_stateowner *op_stateowner; /* used during processing */
237 struct nfs4_acl *op_acl;
238};
239#define op_iattr iattr
240#define op_verf verf
241
242struct nfsd4_open_confirm {
243 stateid_t oc_req_stateid /* request */;
244 u32 oc_seqid /* request */;
245 stateid_t oc_resp_stateid /* response */;
246 struct nfs4_stateowner * oc_stateowner; /* response */
247};
248
249struct nfsd4_open_downgrade {
250 stateid_t od_stateid;
251 u32 od_seqid;
252 u32 od_share_access;
253 u32 od_share_deny;
254 struct nfs4_stateowner *od_stateowner;
255};
256
257
258struct nfsd4_read {
259 stateid_t rd_stateid; /* request */
260 u64 rd_offset; /* request */
261 u32 rd_length; /* request */
262 int rd_vlen;
263 struct file *rd_filp;
264
265 struct svc_rqst *rd_rqstp; /* response */
266 struct svc_fh * rd_fhp; /* response */
267};
268
269struct nfsd4_readdir {
270 u64 rd_cookie; /* request */
271 nfs4_verifier rd_verf; /* request */
272 u32 rd_dircount; /* request */
273 u32 rd_maxcount; /* request */
274 u32 rd_bmval[3]; /* request */
275 struct svc_rqst *rd_rqstp; /* response */
276 struct svc_fh * rd_fhp; /* response */
277
278 struct readdir_cd common;
279 __be32 * buffer;
280 int buflen;
281 __be32 * offset;
282};
283
284struct nfsd4_release_lockowner {
285 clientid_t rl_clientid;
286 struct xdr_netobj rl_owner;
287};
288struct nfsd4_readlink {
289 struct svc_rqst *rl_rqstp; /* request */
290 struct svc_fh * rl_fhp; /* request */
291};
292
293struct nfsd4_remove {
294 u32 rm_namelen; /* request */
295 char * rm_name; /* request */
296 struct nfsd4_change_info rm_cinfo; /* response */
297};
298
299struct nfsd4_rename {
300 u32 rn_snamelen; /* request */
301 char * rn_sname; /* request */
302 u32 rn_tnamelen; /* request */
303 char * rn_tname; /* request */
304 struct nfsd4_change_info rn_sinfo; /* response */
305 struct nfsd4_change_info rn_tinfo; /* response */
306};
307
308struct nfsd4_secinfo {
309 u32 si_namelen; /* request */
310 char *si_name; /* request */
311 struct svc_export *si_exp; /* response */
312};
313
314struct nfsd4_setattr {
315 stateid_t sa_stateid; /* request */
316 u32 sa_bmval[3]; /* request */
317 struct iattr sa_iattr; /* request */
318 struct nfs4_acl *sa_acl;
319};
320
321struct nfsd4_setclientid {
322 nfs4_verifier se_verf; /* request */
323 u32 se_namelen; /* request */
324 char * se_name; /* request */
325 u32 se_callback_prog; /* request */
326 u32 se_callback_netid_len; /* request */
327 char * se_callback_netid_val; /* request */
328 u32 se_callback_addr_len; /* request */
329 char * se_callback_addr_val; /* request */
330 u32 se_callback_ident; /* request */
331 clientid_t se_clientid; /* response */
332 nfs4_verifier se_confirm; /* response */
333};
334
335struct nfsd4_setclientid_confirm {
336 clientid_t sc_clientid;
337 nfs4_verifier sc_confirm;
338};
339
340/* also used for NVERIFY */
341struct nfsd4_verify {
342 u32 ve_bmval[3]; /* request */
343 u32 ve_attrlen; /* request */
344 char * ve_attrval; /* request */
345};
346
347struct nfsd4_write {
348 stateid_t wr_stateid; /* request */
349 u64 wr_offset; /* request */
350 u32 wr_stable_how; /* request */
351 u32 wr_buflen; /* request */
352 int wr_vlen;
353
354 u32 wr_bytes_written; /* response */
355 u32 wr_how_written; /* response */
356 nfs4_verifier wr_verifier; /* response */
357};
358
359struct nfsd4_exchange_id {
360 nfs4_verifier verifier;
361 struct xdr_netobj clname;
362 u32 flags;
363 clientid_t clientid;
364 u32 seqid;
365 int spa_how;
366};
367
368struct nfsd4_sequence {
369 struct nfs4_sessionid sessionid; /* request/response */
370 u32 seqid; /* request/response */
371 u32 slotid; /* request/response */
372 u32 maxslots; /* request/response */
373 u32 cachethis; /* request */
374#if 0
375 u32 target_maxslots; /* response */
376 u32 status_flags; /* response */
377#endif /* not yet */
378};
379
380struct nfsd4_destroy_session {
381 struct nfs4_sessionid sessionid;
382};
383
384struct nfsd4_op {
385 int opnum;
386 __be32 status;
387 union {
388 struct nfsd4_access access;
389 struct nfsd4_close close;
390 struct nfsd4_commit commit;
391 struct nfsd4_create create;
392 struct nfsd4_delegreturn delegreturn;
393 struct nfsd4_getattr getattr;
394 struct svc_fh * getfh;
395 struct nfsd4_link link;
396 struct nfsd4_lock lock;
397 struct nfsd4_lockt lockt;
398 struct nfsd4_locku locku;
399 struct nfsd4_lookup lookup;
400 struct nfsd4_verify nverify;
401 struct nfsd4_open open;
402 struct nfsd4_open_confirm open_confirm;
403 struct nfsd4_open_downgrade open_downgrade;
404 struct nfsd4_putfh putfh;
405 struct nfsd4_read read;
406 struct nfsd4_readdir readdir;
407 struct nfsd4_readlink readlink;
408 struct nfsd4_remove remove;
409 struct nfsd4_rename rename;
410 clientid_t renew;
411 struct nfsd4_secinfo secinfo;
412 struct nfsd4_setattr setattr;
413 struct nfsd4_setclientid setclientid;
414 struct nfsd4_setclientid_confirm setclientid_confirm;
415 struct nfsd4_verify verify;
416 struct nfsd4_write write;
417 struct nfsd4_release_lockowner release_lockowner;
418
419 /* NFSv4.1 */
420 struct nfsd4_exchange_id exchange_id;
421 struct nfsd4_create_session create_session;
422 struct nfsd4_destroy_session destroy_session;
423 struct nfsd4_sequence sequence;
424 } u;
425 struct nfs4_replay * replay;
426};
427
428struct nfsd4_compoundargs {
429 /* scratch variables for XDR decode */
430 __be32 * p;
431 __be32 * end;
432 struct page ** pagelist;
433 int pagelen;
434 __be32 tmp[8];
435 __be32 * tmpp;
436 struct tmpbuf {
437 struct tmpbuf *next;
438 void (*release)(const void *);
439 void *buf;
440 } *to_free;
441
442 struct svc_rqst *rqstp;
443
444 u32 taglen;
445 char * tag;
446 u32 minorversion;
447 u32 opcnt;
448 struct nfsd4_op *ops;
449 struct nfsd4_op iops[8];
450};
451
452struct nfsd4_compoundres {
453 /* scratch variables for XDR encode */
454 __be32 * p;
455 __be32 * end;
456 struct xdr_buf * xbuf;
457 struct svc_rqst * rqstp;
458
459 u32 taglen;
460 char * tag;
461 u32 opcnt;
462 __be32 * tagp; /* tag, opcount encode location */
463 struct nfsd4_compound_state cstate;
464};
465
466static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp)
467{
468 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
469 return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE;
470}
471
472static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
473{
474 return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp);
475}
476
477#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs)
478
479static inline void
480set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
481{
482 BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved);
483 cinfo->atomic = 1;
484 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
485 if (cinfo->change_supported) {
486 cinfo->before_change = fhp->fh_pre_change;
487 cinfo->after_change = fhp->fh_post_change;
488 } else {
489 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
490 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
491 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
492 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
493 }
494}
495
496int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
497int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
498 struct nfsd4_compoundargs *);
499int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
500 struct nfsd4_compoundres *);
501void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
502void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
503__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
504 struct dentry *dentry, __be32 *buffer, int *countp,
505 u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
506extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
507 struct nfsd4_compound_state *,
508 struct nfsd4_setclientid *setclid);
509extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
510 struct nfsd4_compound_state *,
511 struct nfsd4_setclientid_confirm *setclientid_confirm);
512extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
513extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
514 struct nfsd4_sequence *seq);
515extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
516 struct nfsd4_compound_state *,
517		struct nfsd4_exchange_id *);
518extern __be32 nfsd4_create_session(struct svc_rqst *,
519 struct nfsd4_compound_state *,
520 struct nfsd4_create_session *);
521extern __be32 nfsd4_sequence(struct svc_rqst *,
522 struct nfsd4_compound_state *,
523 struct nfsd4_sequence *);
524extern __be32 nfsd4_destroy_session(struct svc_rqst *,
525 struct nfsd4_compound_state *,
526 struct nfsd4_destroy_session *);
527extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
528 struct nfsd4_open *open);
529extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
530 struct svc_fh *current_fh, struct nfsd4_open *open);
531extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
532 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
533extern __be32 nfsd4_close(struct svc_rqst *rqstp,
534 struct nfsd4_compound_state *,
535 struct nfsd4_close *close);
536extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp,
537 struct nfsd4_compound_state *,
538 struct nfsd4_open_downgrade *od);
539extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
540 struct nfsd4_lock *lock);
541extern __be32 nfsd4_lockt(struct svc_rqst *rqstp,
542 struct nfsd4_compound_state *,
543 struct nfsd4_lockt *lockt);
544extern __be32 nfsd4_locku(struct svc_rqst *rqstp,
545 struct nfsd4_compound_state *,
546 struct nfsd4_locku *locku);
547extern __be32
548nfsd4_release_lockowner(struct svc_rqst *rqstp,
549 struct nfsd4_compound_state *,
550 struct nfsd4_release_lockowner *rlockowner);
551extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *);
552extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp,
553 struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr);
554extern __be32 nfsd4_renew(struct svc_rqst *rqstp,
555 struct nfsd4_compound_state *, clientid_t *clid);
556#endif
557
558/*
559 * Local variables:
560 * c-basic-offset: 8
561 * End:
562 */
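set_change_info() above fills the NFSv4 change_info4 pair: it prefers the inode's change counter when IS_I_VERSION() holds and falls back to ctime otherwise, with atomic set because the pre- and post-op values are sampled around a single locked operation. A standalone toy model of that selection (the sec<<32|nsec packing is illustrative only, not the on-the-wire encoding):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct change_info {
	bool atomic;			/* pre/post sampled around one op */
	bool change_supported;		/* a real change counter exists */
	uint64_t before_change, after_change;
};

static void fill_change_info(struct change_info *ci, const uint64_t *ctr_pre,
			     const uint64_t *ctr_post, struct timespec pre,
			     struct timespec post)
{
	ci->atomic = true;
	ci->change_supported = ctr_pre && ctr_post;
	if (ci->change_supported) {
		ci->before_change = *ctr_pre;
		ci->after_change = *ctr_post;
	} else {
		/* fall back to ctime, packed as sec<<32 | nsec */
		ci->before_change =
			((uint64_t)pre.tv_sec << 32) | (uint32_t)pre.tv_nsec;
		ci->after_change =
			((uint64_t)post.tv_sec << 32) | (uint32_t)post.tv_nsec;
	}
}

int main(void)
{
	uint64_t pre = 41, post = 42;
	struct timespec t0 = {0}, t1 = {0};
	struct change_info ci;

	fill_change_info(&ci, &pre, &post, t0, t1);
	printf("supported=%d before=%llu after=%llu\n", ci.change_supported,
	       (unsigned long long)ci.before_change,
	       (unsigned long long)ci.after_change);
	return 0;
}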
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index d69e6ae59251..8d6356a804f3 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -26,6 +26,7 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include "mdt.h"
 #include "alloc.h"
 
@@ -142,29 +143,75 @@ static void nilfs_palloc_desc_block_init(struct inode *inode,
 	}
 }
 
+static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
+				  int create,
+				  void (*init_block)(struct inode *,
+						     struct buffer_head *,
+						     void *),
+				  struct buffer_head **bhp,
+				  struct nilfs_bh_assoc *prev,
+				  spinlock_t *lock)
+{
+	int ret;
+
+	spin_lock(lock);
+	if (prev->bh && blkoff == prev->blkoff) {
+		get_bh(prev->bh);
+		*bhp = prev->bh;
+		spin_unlock(lock);
+		return 0;
+	}
+	spin_unlock(lock);
+
+	ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp);
+	if (!ret) {
+		spin_lock(lock);
+		/*
+		 * The following code must be safe for change of the
+		 * cache contents during the get block call.
+		 */
+		brelse(prev->bh);
+		get_bh(*bhp);
+		prev->bh = *bhp;
+		prev->blkoff = blkoff;
+		spin_unlock(lock);
+	}
+	return ret;
+}
+
 static int nilfs_palloc_get_desc_block(struct inode *inode,
 				       unsigned long group,
 				       int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode,
-				   nilfs_palloc_desc_blkoff(inode, group),
-				   create, nilfs_palloc_desc_block_init, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_desc_blkoff(inode, group),
+				      create, nilfs_palloc_desc_block_init,
+				      bhp, &cache->prev_desc, &cache->lock);
 }
 
 static int nilfs_palloc_get_bitmap_block(struct inode *inode,
 					 unsigned long group,
 					 int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode,
-				   nilfs_palloc_bitmap_blkoff(inode, group),
-				   create, NULL, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_bitmap_blkoff(inode, group),
+				      create, NULL, bhp,
+				      &cache->prev_bitmap, &cache->lock);
 }
 
 int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
 				 int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
-				   create, NULL, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_entry_blkoff(inode, nr),
+				      create, NULL, bhp,
+				      &cache->prev_entry, &cache->lock);
 }
 
 static struct nilfs_palloc_group_desc *
@@ -176,13 +223,6 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
 	       group % nilfs_palloc_groups_per_desc_block(inode);
 }
 
-static unsigned char *
-nilfs_palloc_block_get_bitmap(const struct inode *inode,
-			      const struct buffer_head *bh, void *kaddr)
-{
-	return (unsigned char *)(kaddr + bh_offset(bh));
-}
-
 void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 				   const struct buffer_head *bh, void *kaddr)
 {
@@ -289,8 +329,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 		if (ret < 0)
 			goto out_desc;
 		bitmap_kaddr = kmap(bitmap_bh->b_page);
-		bitmap = nilfs_palloc_block_get_bitmap(
-			inode, bitmap_bh, bitmap_kaddr);
+		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 		pos = nilfs_palloc_find_available_slot(
 			inode, group, group_offset, bitmap,
 			entries_per_group);
@@ -351,8 +390,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
 	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
-	bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
-					       bitmap_kaddr);
+	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
@@ -385,8 +423,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
 	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
-	bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
-					       bitmap_kaddr);
+	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
 		printk(KERN_WARNING "%s: entry numer %llu already freed\n",
@@ -472,8 +509,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 		desc = nilfs_palloc_block_get_group_desc(
 			inode, group, desc_bh, desc_kaddr);
 		bitmap_kaddr = kmap(bitmap_bh->b_page);
-		bitmap = nilfs_palloc_block_get_bitmap(
-			inode, bitmap_bh, bitmap_kaddr);
+		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 		for (j = i, n = 0;
 		     (j < nitems) && nilfs_palloc_group_is_in(inode, group,
 							      entry_nrs[j]);
@@ -502,3 +538,30 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 	}
 	return 0;
 }
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+			      struct nilfs_palloc_cache *cache)
+{
+	NILFS_MDT(inode)->mi_palloc_cache = cache;
+	spin_lock_init(&cache->lock);
+}
+
+void nilfs_palloc_clear_cache(struct inode *inode)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	spin_lock(&cache->lock);
+	brelse(cache->prev_desc.bh);
+	brelse(cache->prev_bitmap.bh);
+	brelse(cache->prev_entry.bh);
+	cache->prev_desc.bh = NULL;
+	cache->prev_bitmap.bh = NULL;
+	cache->prev_entry.bh = NULL;
+	spin_unlock(&cache->lock);
+}
+
+void nilfs_palloc_destroy_cache(struct inode *inode)
+{
+	nilfs_palloc_clear_cache(inode);
+	NILFS_MDT(inode)->mi_palloc_cache = NULL;
+}
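The alloc.c changes above add a single-slot lookup cache per metadata file: nilfs_palloc_get_block() first checks the remembered (blkoff, bh) pair under a spinlock and only falls back to nilfs_mdt_get_block() on a miss, re-taking the lock to install the result. A user-space sketch of the same pattern, with a mutex and manual refcounts standing in for get_bh()/brelse() (all names invented for illustration):

#include <pthread.h>
#include <stdlib.h>

struct obj { unsigned long key; int refs; };

struct slot_cache {
	pthread_mutex_t lock;
	struct obj *prev;	/* the cache holds one reference */
};

/* Stands in for nilfs_mdt_get_block(): expensive, may block. */
static struct obj *slow_lookup(unsigned long key)
{
	struct obj *o = malloc(sizeof(*o));

	o->key = key;
	o->refs = 1;		/* caller's reference */
	return o;
}

static struct obj *cached_lookup(struct slot_cache *c, unsigned long key)
{
	struct obj *o;

	pthread_mutex_lock(&c->lock);
	if (c->prev && c->prev->key == key) {
		c->prev->refs++;		/* fast path: cache hit */
		o = c->prev;
		pthread_mutex_unlock(&c->lock);
		return o;
	}
	pthread_mutex_unlock(&c->lock);

	o = slow_lookup(key);			/* lock dropped: may sleep */

	pthread_mutex_lock(&c->lock);
	if (c->prev && --c->prev->refs == 0)
		free(c->prev);			/* drop old cached object */
	o->refs++;				/* cache takes its own ref */
	c->prev = o;
	pthread_mutex_unlock(&c->lock);
	return o;
}

int main(void)
{
	struct slot_cache c = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct obj *a = cached_lookup(&c, 7);	/* miss: slow path */
	struct obj *b = cached_lookup(&c, 7);	/* hit: fast path */

	return (a == b) ? 0 : 1;
}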
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 4ace5475c2c7..5cccf874d692 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -42,7 +42,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
 				   const struct buffer_head *, void *);
 
 /**
- * nilfs_palloc_req - persistent alloctor request and reply
+ * nilfs_palloc_req - persistent allocator request and reply
  * @pr_entry_nr: entry number (vblocknr or inode number)
  * @pr_desc_bh: buffer head of the buffer containing block group descriptors
  * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
@@ -69,4 +69,25 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
 #define nilfs_clear_bit_atomic ext2_clear_bit_atomic
 #define nilfs_find_next_zero_bit ext2_find_next_zero_bit
 
+/*
+ * persistent object allocator cache
+ */
+
+struct nilfs_bh_assoc {
+	unsigned long blkoff;
+	struct buffer_head *bh;
+};
+
+struct nilfs_palloc_cache {
+	spinlock_t lock;
+	struct nilfs_bh_assoc prev_desc;
+	struct nilfs_bh_assoc prev_bitmap;
+	struct nilfs_bh_assoc prev_entry;
+};
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+			      struct nilfs_palloc_cache *cache);
+void nilfs_palloc_clear_cache(struct inode *inode);
+void nilfs_palloc_destroy_cache(struct inode *inode);
+
 #endif /* _NILFS_ALLOC_H */
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 08834df6ec68..effdbdbe6c11 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -402,19 +402,11 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
 {
 	inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-	if (NILFS_MDT(bmap->b_inode))
-		nilfs_mdt_mark_dirty(bmap->b_inode);
-	else
-		mark_inode_dirty(bmap->b_inode);
 }
 
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
 {
 	inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-	if (NILFS_MDT(bmap->b_inode))
-		nilfs_mdt_mark_dirty(bmap->b_inode);
-	else
-		mark_inode_dirty(bmap->b_inode);
 }
 
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
@@ -425,8 +417,8 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
 
 	key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
 					 bmap->b_inode->i_blkbits);
-	for (pbh = page_buffers(bh->b_page); pbh != bh;
-	     pbh = pbh->b_this_page, key++);
+	for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page)
+		key++;
 
 	return key;
 }
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 84c25382f8e3..447ce47a3306 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -27,6 +27,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mm.h>
 #include <linux/backing-dev.h>
+#include <linux/gfp.h>
 #include "nilfs.h"
 #include "mdt.h"
 #include "dat.h"
@@ -68,9 +69,34 @@ void nilfs_btnode_cache_clear(struct address_space *btnc)
 	truncate_inode_pages(btnc, 0);
 }
 
+struct buffer_head *
+nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
+{
+	struct inode *inode = NILFS_BTNC_I(btnc);
+	struct buffer_head *bh;
+
+	bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+	if (unlikely(!bh))
+		return NULL;
+
+	if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
+		     buffer_dirty(bh))) {
+		brelse(bh);
+		BUG();
+	}
+	memset(bh->b_data, 0, 1 << inode->i_blkbits);
+	bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+	bh->b_blocknr = blocknr;
+	set_buffer_mapped(bh);
+	set_buffer_uptodate(bh);
+
+	unlock_page(bh->b_page);
+	page_cache_release(bh->b_page);
+	return bh;
+}
+
 int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
-			      sector_t pblocknr, struct buffer_head **pbh,
-			      int newblk)
+			      sector_t pblocknr, struct buffer_head **pbh)
 {
 	struct buffer_head *bh;
 	struct inode *inode = NILFS_BTNC_I(btnc);
@@ -81,19 +107,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 		return -ENOMEM;
 
 	err = -EEXIST; /* internal code */
-	if (newblk) {
-		if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
-			     buffer_dirty(bh))) {
-			brelse(bh);
-			BUG();
-		}
-		memset(bh->b_data, 0, 1 << inode->i_blkbits);
-		bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
-		bh->b_blocknr = blocknr;
-		set_buffer_mapped(bh);
-		set_buffer_uptodate(bh);
-		goto found;
-	}
 
 	if (buffer_uptodate(bh) || buffer_dirty(bh))
 		goto found;
@@ -135,27 +148,6 @@ out_locked:
 	return err;
 }
 
-int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
-		     sector_t pblocknr, struct buffer_head **pbh, int newblk)
-{
-	struct buffer_head *bh;
-	int err;
-
-	err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
-	if (err == -EEXIST) /* internal code (cache hit) */
-		return 0;
-	if (unlikely(err))
-		return err;
-
-	bh = *pbh;
-	wait_on_buffer(bh);
-	if (!buffer_uptodate(bh)) {
-		brelse(bh);
-		return -EIO;
-	}
-	return 0;
-}
-
 /**
  * nilfs_btnode_delete - delete B-tree node buffer
  * @bh: buffer to be deleted
@@ -244,12 +236,13 @@ retry:
 		unlock_page(obh->b_page);
 	}
 
-	err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
-	if (likely(!err)) {
-		BUG_ON(nbh == obh);
-		ctxt->newbh = nbh;
-	}
-	return err;
+	nbh = nilfs_btnode_create_block(btnc, newkey);
+	if (!nbh)
+		return -ENOMEM;
+
+	BUG_ON(nbh == obh);
+	ctxt->newbh = nbh;
+	return 0;
 
  failed_unlock:
 	unlock_page(obh->b_page);
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 3e2275172ed6..07da83f07712 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -40,10 +40,10 @@ struct nilfs_btnode_chkey_ctxt {
 void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
+struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
+					      __u64 blocknr);
 int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
-			      struct buffer_head **, int);
-int nilfs_btnode_get(struct address_space *, __u64, sector_t,
-		     struct buffer_head **, int);
+			      struct buffer_head **);
 void nilfs_btnode_delete(struct buffer_head *);
 int nilfs_btnode_prepare_change_key(struct address_space *,
 				    struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index e25b507a474f..7cdd98b8d514 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -114,7 +114,18 @@ static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
 {
 	struct address_space *btnc =
 		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-	return nilfs_btnode_get(btnc, ptr, 0, bhp, 0);
+	int err;
+
+	err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
+	if (err)
+		return err == -EEXIST ? 0 : err;
+
+	wait_on_buffer(*bhp);
+	if (!buffer_uptodate(*bhp)) {
+		brelse(*bhp);
+		return -EIO;
+	}
+	return 0;
 }
 
 static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
@@ -122,12 +133,15 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
 {
 	struct address_space *btnc =
 		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-	int ret;
+	struct buffer_head *bh;
 
-	ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1);
-	if (!ret)
-		set_buffer_nilfs_volatile(*bhp);
-	return ret;
+	bh = nilfs_btnode_create_block(btnc, ptr);
+	if (!bh)
+		return -ENOMEM;
+
+	set_buffer_nilfs_volatile(bh);
+	*bhp = bh;
+	return 0;
 }
 
 static inline int
@@ -444,6 +458,18 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
 		nilfs_btree_get_nonroot_node(path, level);
 }
 
+static inline int
+nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
+{
+	if (unlikely(nilfs_btree_node_get_level(node) != level)) {
+		dump_stack();
+		printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n",
+		       nilfs_btree_node_get_level(node), level);
+		return 1;
+	}
+	return 0;
+}
+
 static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 				 struct nilfs_btree_path *path,
 				 __u64 key, __u64 *ptrp, int minlevel)
@@ -467,7 +493,8 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(node));
+		if (nilfs_btree_bad_node(node, level))
+			return -EINVAL;
 		if (!found)
 			found = nilfs_btree_node_lookup(node, key, &index);
 		else
@@ -512,7 +539,8 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(node));
+		if (nilfs_btree_bad_node(node, level))
+			return -EINVAL;
 		index = nilfs_btree_node_get_nchildren(node) - 1;
 		ptr = nilfs_btree_node_get_ptr(btree, node, index);
 		path[level].bp_index = index;
@@ -638,13 +666,11 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 {
 	if (level < nilfs_btree_height(btree) - 1) {
 		do {
-			lock_buffer(path[level].bp_bh);
 			nilfs_btree_node_set_key(
 				nilfs_btree_get_nonroot_node(path, level),
 				path[level].bp_index, key);
 			if (!buffer_dirty(path[level].bp_bh))
 				nilfs_btnode_mark_dirty(path[level].bp_bh);
-			unlock_buffer(path[level].bp_bh);
 		} while ((path[level].bp_index == 0) &&
 			 (++level < nilfs_btree_height(btree) - 1));
 	}
@@ -663,13 +689,11 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node;
 
 	if (level < nilfs_btree_height(btree) - 1) {
-		lock_buffer(path[level].bp_bh);
 		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
-		unlock_buffer(path[level].bp_bh);
 
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
@@ -689,9 +713,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node, *left;
 	int nchildren, lnchildren, n, move;
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -712,9 +733,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(node, 0));
 
@@ -740,9 +758,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node, *right;
 	int nchildren, rnchildren, n, move;
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -763,9 +778,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
763 if (!buffer_dirty(path[level].bp_sib_bh)) 778 if (!buffer_dirty(path[level].bp_sib_bh))
764 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 779 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
765 780
766 unlock_buffer(path[level].bp_bh);
767 unlock_buffer(path[level].bp_sib_bh);
768
769 path[level + 1].bp_index++; 781 path[level + 1].bp_index++;
770 nilfs_btree_promote_key(btree, path, level + 1, 782 nilfs_btree_promote_key(btree, path, level + 1,
771 nilfs_btree_node_get_key(right, 0)); 783 nilfs_btree_node_get_key(right, 0));
@@ -794,9 +806,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
794 __u64 newptr; 806 __u64 newptr;
795 int nchildren, n, move; 807 int nchildren, n, move;
796 808
797 lock_buffer(path[level].bp_bh);
798 lock_buffer(path[level].bp_sib_bh);
799
800 node = nilfs_btree_get_nonroot_node(path, level); 809 node = nilfs_btree_get_nonroot_node(path, level);
801 right = nilfs_btree_get_sib_node(path, level); 810 right = nilfs_btree_get_sib_node(path, level);
802 nchildren = nilfs_btree_node_get_nchildren(node); 811 nchildren = nilfs_btree_node_get_nchildren(node);
@@ -815,9 +824,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
815 if (!buffer_dirty(path[level].bp_sib_bh)) 824 if (!buffer_dirty(path[level].bp_sib_bh))
816 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 825 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
817 826
818 unlock_buffer(path[level].bp_bh);
819 unlock_buffer(path[level].bp_sib_bh);
820
821 newkey = nilfs_btree_node_get_key(right, 0); 827 newkey = nilfs_btree_node_get_key(right, 0);
822 newptr = path[level].bp_newreq.bpr_ptr; 828 newptr = path[level].bp_newreq.bpr_ptr;
823 829
@@ -852,8 +858,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
852 struct nilfs_btree_node *root, *child; 858 struct nilfs_btree_node *root, *child;
853 int n; 859 int n;
854 860
855 lock_buffer(path[level].bp_sib_bh);
856
857 root = nilfs_btree_get_root(btree); 861 root = nilfs_btree_get_root(btree);
858 child = nilfs_btree_get_sib_node(path, level); 862 child = nilfs_btree_get_sib_node(path, level);
859 863
@@ -865,8 +869,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
865 if (!buffer_dirty(path[level].bp_sib_bh)) 869 if (!buffer_dirty(path[level].bp_sib_bh))
866 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 870 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
867 871
868 unlock_buffer(path[level].bp_sib_bh);
869
870 path[level].bp_bh = path[level].bp_sib_bh; 872 path[level].bp_bh = path[level].bp_sib_bh;
871 path[level].bp_sib_bh = NULL; 873 path[level].bp_sib_bh = NULL;
872 874
@@ -1023,11 +1025,9 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1023 1025
1024 stats->bs_nblocks++; 1026 stats->bs_nblocks++;
1025 1027
1026 lock_buffer(bh);
1027 nilfs_btree_node_init(btree, 1028 nilfs_btree_node_init(btree,
1028 (struct nilfs_btree_node *)bh->b_data, 1029 (struct nilfs_btree_node *)bh->b_data,
1029 0, level, 0, NULL, NULL); 1030 0, level, 0, NULL, NULL);
1030 unlock_buffer(bh);
1031 path[level].bp_sib_bh = bh; 1031 path[level].bp_sib_bh = bh;
1032 path[level].bp_op = nilfs_btree_split; 1032 path[level].bp_op = nilfs_btree_split;
1033 } 1033 }
@@ -1052,10 +1052,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1052 if (ret < 0) 1052 if (ret < 0)
1053 goto err_out_curr_node; 1053 goto err_out_curr_node;
1054 1054
1055 lock_buffer(bh);
1056 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, 1055 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
1057 0, level, 0, NULL, NULL); 1056 0, level, 0, NULL, NULL);
1058 unlock_buffer(bh);
1059 path[level].bp_sib_bh = bh; 1057 path[level].bp_sib_bh = bh;
1060 path[level].bp_op = nilfs_btree_grow; 1058 path[level].bp_op = nilfs_btree_grow;
1061 1059
@@ -1154,13 +1152,11 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1154 struct nilfs_btree_node *node; 1152 struct nilfs_btree_node *node;
1155 1153
1156 if (level < nilfs_btree_height(btree) - 1) { 1154 if (level < nilfs_btree_height(btree) - 1) {
1157 lock_buffer(path[level].bp_bh);
1158 node = nilfs_btree_get_nonroot_node(path, level); 1155 node = nilfs_btree_get_nonroot_node(path, level);
1159 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1156 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1160 path[level].bp_index); 1157 path[level].bp_index);
1161 if (!buffer_dirty(path[level].bp_bh)) 1158 if (!buffer_dirty(path[level].bp_bh))
1162 nilfs_btnode_mark_dirty(path[level].bp_bh); 1159 nilfs_btnode_mark_dirty(path[level].bp_bh);
1163 unlock_buffer(path[level].bp_bh);
1164 if (path[level].bp_index == 0) 1160 if (path[level].bp_index == 0)
1165 nilfs_btree_promote_key(btree, path, level + 1, 1161 nilfs_btree_promote_key(btree, path, level + 1,
1166 nilfs_btree_node_get_key(node, 0)); 1162 nilfs_btree_node_get_key(node, 0));
@@ -1180,9 +1176,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1180 1176
1181 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1177 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1182 1178
1183 lock_buffer(path[level].bp_bh);
1184 lock_buffer(path[level].bp_sib_bh);
1185
1186 node = nilfs_btree_get_nonroot_node(path, level); 1179 node = nilfs_btree_get_nonroot_node(path, level);
1187 left = nilfs_btree_get_sib_node(path, level); 1180 left = nilfs_btree_get_sib_node(path, level);
1188 nchildren = nilfs_btree_node_get_nchildren(node); 1181 nchildren = nilfs_btree_node_get_nchildren(node);
@@ -1197,9 +1190,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1197 if (!buffer_dirty(path[level].bp_sib_bh)) 1190 if (!buffer_dirty(path[level].bp_sib_bh))
1198 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 1191 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1199 1192
1200 unlock_buffer(path[level].bp_bh);
1201 unlock_buffer(path[level].bp_sib_bh);
1202
1203 nilfs_btree_promote_key(btree, path, level + 1, 1193 nilfs_btree_promote_key(btree, path, level + 1,
1204 nilfs_btree_node_get_key(node, 0)); 1194 nilfs_btree_node_get_key(node, 0));
1205 1195
@@ -1217,9 +1207,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1217 1207
1218 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1208 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1219 1209
1220 lock_buffer(path[level].bp_bh);
1221 lock_buffer(path[level].bp_sib_bh);
1222
1223 node = nilfs_btree_get_nonroot_node(path, level); 1210 node = nilfs_btree_get_nonroot_node(path, level);
1224 right = nilfs_btree_get_sib_node(path, level); 1211 right = nilfs_btree_get_sib_node(path, level);
1225 nchildren = nilfs_btree_node_get_nchildren(node); 1212 nchildren = nilfs_btree_node_get_nchildren(node);
@@ -1234,9 +1221,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1234 if (!buffer_dirty(path[level].bp_sib_bh)) 1221 if (!buffer_dirty(path[level].bp_sib_bh))
1235 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 1222 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1236 1223
1237 unlock_buffer(path[level].bp_bh);
1238 unlock_buffer(path[level].bp_sib_bh);
1239
1240 path[level + 1].bp_index++; 1224 path[level + 1].bp_index++;
1241 nilfs_btree_promote_key(btree, path, level + 1, 1225 nilfs_btree_promote_key(btree, path, level + 1,
1242 nilfs_btree_node_get_key(right, 0)); 1226 nilfs_btree_node_get_key(right, 0));
@@ -1255,9 +1239,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1255 1239
1256 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1240 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1257 1241
1258 lock_buffer(path[level].bp_bh);
1259 lock_buffer(path[level].bp_sib_bh);
1260
1261 node = nilfs_btree_get_nonroot_node(path, level); 1242 node = nilfs_btree_get_nonroot_node(path, level);
1262 left = nilfs_btree_get_sib_node(path, level); 1243 left = nilfs_btree_get_sib_node(path, level);
1263 1244
@@ -1268,9 +1249,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1268 if (!buffer_dirty(path[level].bp_sib_bh)) 1249 if (!buffer_dirty(path[level].bp_sib_bh))
1269 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 1250 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1270 1251
1271 unlock_buffer(path[level].bp_bh);
1272 unlock_buffer(path[level].bp_sib_bh);
1273
1274 nilfs_btnode_delete(path[level].bp_bh); 1252 nilfs_btnode_delete(path[level].bp_bh);
1275 path[level].bp_bh = path[level].bp_sib_bh; 1253 path[level].bp_bh = path[level].bp_sib_bh;
1276 path[level].bp_sib_bh = NULL; 1254 path[level].bp_sib_bh = NULL;
@@ -1286,9 +1264,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1286 1264
1287 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1265 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1288 1266
1289 lock_buffer(path[level].bp_bh);
1290 lock_buffer(path[level].bp_sib_bh);
1291
1292 node = nilfs_btree_get_nonroot_node(path, level); 1267 node = nilfs_btree_get_nonroot_node(path, level);
1293 right = nilfs_btree_get_sib_node(path, level); 1268 right = nilfs_btree_get_sib_node(path, level);
1294 1269
@@ -1299,9 +1274,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1299 if (!buffer_dirty(path[level].bp_bh)) 1274 if (!buffer_dirty(path[level].bp_bh))
1300 nilfs_btnode_mark_dirty(path[level].bp_bh); 1275 nilfs_btnode_mark_dirty(path[level].bp_bh);
1301 1276
1302 unlock_buffer(path[level].bp_bh);
1303 unlock_buffer(path[level].bp_sib_bh);
1304
1305 nilfs_btnode_delete(path[level].bp_sib_bh); 1277 nilfs_btnode_delete(path[level].bp_sib_bh);
1306 path[level].bp_sib_bh = NULL; 1278 path[level].bp_sib_bh = NULL;
1307 path[level + 1].bp_index++; 1279 path[level + 1].bp_index++;
@@ -1316,7 +1288,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
1316 1288
1317 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1289 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1318 1290
1319 lock_buffer(path[level].bp_bh);
1320 root = nilfs_btree_get_root(btree); 1291 root = nilfs_btree_get_root(btree);
1321 child = nilfs_btree_get_nonroot_node(path, level); 1292 child = nilfs_btree_get_nonroot_node(path, level);
1322 1293
@@ -1324,7 +1295,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
1324 nilfs_btree_node_set_level(root, level); 1295 nilfs_btree_node_set_level(root, level);
1325 n = nilfs_btree_node_get_nchildren(child); 1296 n = nilfs_btree_node_get_nchildren(child);
1326 nilfs_btree_node_move_left(btree, root, child, n); 1297 nilfs_btree_node_move_left(btree, root, child, n);
1327 unlock_buffer(path[level].bp_bh);
1328 1298
1329 nilfs_btnode_delete(path[level].bp_bh); 1299 nilfs_btnode_delete(path[level].bp_bh);
1330 path[level].bp_bh = NULL; 1300 path[level].bp_bh = NULL;
@@ -1699,7 +1669,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1699 nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); 1669 nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
1700 1670
1701 /* create child node at level 1 */ 1671 /* create child node at level 1 */
1702 lock_buffer(bh);
1703 node = (struct nilfs_btree_node *)bh->b_data; 1672 node = (struct nilfs_btree_node *)bh->b_data;
1704 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); 1673 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
1705 nilfs_btree_node_insert(btree, node, 1674 nilfs_btree_node_insert(btree, node,
@@ -1709,7 +1678,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1709 if (!nilfs_bmap_dirty(bmap)) 1678 if (!nilfs_bmap_dirty(bmap))
1710 nilfs_bmap_set_dirty(bmap); 1679 nilfs_bmap_set_dirty(bmap);
1711 1680
1712 unlock_buffer(bh);
1713 brelse(bh); 1681 brelse(bh);
1714 1682
1715 /* create root node at level 2 */ 1683 /* create root node at level 2 */
@@ -2050,7 +2018,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
2050 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 2018 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
2051 level < NILFS_BTREE_LEVEL_MAX; 2019 level < NILFS_BTREE_LEVEL_MAX;
2052 level++) 2020 level++)
2053 list_splice(&lists[level], listp->prev); 2021 list_splice_tail(&lists[level], listp);
2054} 2022}
2055 2023
2056static int nilfs_btree_assign_p(struct nilfs_btree *btree, 2024static int nilfs_btree_assign_p(struct nilfs_btree *btree,
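Besides dropping the buffer locks, the btree.c hunks replace list_splice(&lists[level], listp->prev) with list_splice_tail(&lists[level], listp); the two are equivalent, since splicing right after listp->prev is exactly splicing at the tail of the list headed by listp. A minimal userspace sketch of that equivalence, with a hand-rolled stand-in for the kernel's struct list_head (all names below are illustrative, not from the patch):

#include <stdio.h>

/* Toy doubly linked list mimicking the kernel's struct list_head. */
struct list_head { struct list_head *next, *prev; };

static void init_list(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *head)
{
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

/* Insert all nodes of @src between @prev and @next. */
static void splice(struct list_head *src, struct list_head *prev,
                   struct list_head *next)
{
    if (src->next == src)
        return;                 /* empty source list */
    src->next->prev = prev;
    prev->next = src->next;
    src->prev->next = next;
    next->prev = src->prev;
}

/* list_splice(src, pos): insert right after @pos. */
static void my_list_splice(struct list_head *src, struct list_head *pos)
{
    splice(src, pos, pos->next);
}

/* list_splice_tail(src, head): insert just before @head, i.e. at the tail. */
static void my_list_splice_tail(struct list_head *src, struct list_head *head)
{
    splice(src, head->prev, head);
}

struct item { struct list_head link; int v; };

int main(void)
{
    struct item a = { .v = 1 }, b = { .v = 2 }, c = { .v = 3 };
    struct list_head head, src, *p;

    init_list(&head);
    init_list(&src);
    list_add_tail(&a.link, &head);      /* head: [1]    */
    list_add_tail(&b.link, &src);
    list_add_tail(&c.link, &src);       /* src:  [2, 3] */

    /* Old form: my_list_splice(&src, head.prev);
     * New form below; both yield head: [1, 2, 3]. */
    my_list_splice_tail(&src, &head);

    for (p = head.next; p != &head; p = p->next)
        printf("%d ", ((struct item *)p)->v);   /* prints: 1 2 3 */
    printf("\n");
    return 0;
}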
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 0e72bbbc6b64..4b82d84ade75 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -34,28 +34,6 @@ struct nilfs_btree;
 struct nilfs_btree_path;

 /**
- * struct nilfs_btree_node - B-tree node
- * @bn_flags: flags
- * @bn_level: level
- * @bn_nchildren: number of children
- * @bn_pad: padding
- */
-struct nilfs_btree_node {
-    __u8 bn_flags;
-    __u8 bn_level;
-    __le16 bn_nchildren;
-    __le32 bn_pad;
-};
-
-/* flags */
-#define NILFS_BTREE_NODE_ROOT 0x01
-
-/* level */
-#define NILFS_BTREE_LEVEL_DATA 0
-#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1)
-#define NILFS_BTREE_LEVEL_MAX 14
-
-/**
  * struct nilfs_btree - B-tree structure
  * @bt_bmap: bmap base structure
  */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 3f5d5d06f53c..18737818db63 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -328,19 +328,24 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
             tnicps += nicps;
             nilfs_mdt_mark_buffer_dirty(cp_bh);
             nilfs_mdt_mark_dirty(cpfile);
-            if (!nilfs_cpfile_is_in_first(cpfile, cno) &&
-                (count = nilfs_cpfile_block_sub_valid_checkpoints(
-                    cpfile, cp_bh, kaddr, nicps)) == 0) {
-                /* make hole */
-                kunmap_atomic(kaddr, KM_USER0);
-                brelse(cp_bh);
-                ret = nilfs_cpfile_delete_checkpoint_block(
-                    cpfile, cno);
-                if (ret == 0)
-                    continue;
-                printk(KERN_ERR "%s: cannot delete block\n",
-                       __func__);
-                break;
+            if (!nilfs_cpfile_is_in_first(cpfile, cno)) {
+                count =
+                    nilfs_cpfile_block_sub_valid_checkpoints(
+                        cpfile, cp_bh, kaddr, nicps);
+                if (count == 0) {
+                    /* make hole */
+                    kunmap_atomic(kaddr, KM_USER0);
+                    brelse(cp_bh);
+                    ret =
+                        nilfs_cpfile_delete_checkpoint_block(
+                            cpfile, cno);
+                    if (ret == 0)
+                        continue;
+                    printk(KERN_ERR
+                           "%s: cannot delete block\n",
+                           __func__);
+                    break;
+                }
             }
         }

@@ -926,3 +931,29 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
     up_read(&NILFS_MDT(cpfile)->mi_sem);
     return ret;
 }
+
+/**
+ * nilfs_cpfile_read - read cpfile inode
+ * @cpfile: cpfile inode
+ * @raw_inode: on-disk cpfile inode
+ */
+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
+{
+    return nilfs_read_inode_common(cpfile, raw_inode);
+}
+
+/**
+ * nilfs_cpfile_new - create cpfile
+ * @nilfs: nilfs object
+ * @cpsize: size of a checkpoint entry
+ */
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize)
+{
+    struct inode *cpfile;
+
+    cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0);
+    if (cpfile)
+        nilfs_mdt_set_entry_size(cpfile, cpsize,
+                     sizeof(struct nilfs_cpfile_header));
+    return cpfile;
+}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index debea896e701..bc0809e0ab43 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,4 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
                 size_t);

+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode);
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize);
+
 #endif	/* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 1ff8e15bd36b..013146755683 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -33,6 +33,16 @@
 #define NILFS_CNO_MIN	((__u64)1)
 #define NILFS_CNO_MAX	(~(__u64)0)

+struct nilfs_dat_info {
+    struct nilfs_mdt_info mi;
+    struct nilfs_palloc_cache palloc_cache;
+};
+
+static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
+{
+    return (struct nilfs_dat_info *)NILFS_MDT(dat);
+}
+
 static int nilfs_dat_prepare_entry(struct inode *dat,
                    struct nilfs_palloc_req *req, int create)
 {
@@ -278,7 +288,7 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
  * @vblocknrs and @nitems.
  *
  * Return Value: On success, 0 is returned. On error, one of the following
- * nagative error codes is returned.
+ * negative error codes is returned.
  *
  * %-EIO - I/O error.
  *
@@ -378,8 +388,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
         ret = -ENOENT;
         goto out;
     }
-    if (blocknrp != NULL)
-        *blocknrp = blocknr;
+    *blocknrp = blocknr;

  out:
     kunmap_atomic(kaddr, KM_USER0);
@@ -425,3 +434,40 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,

     return nvi;
 }
+
+/**
+ * nilfs_dat_read - read dat inode
+ * @dat: dat inode
+ * @raw_inode: on-disk dat inode
+ */
+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
+{
+    return nilfs_read_inode_common(dat, raw_inode);
+}
+
+/**
+ * nilfs_dat_new - create dat file
+ * @nilfs: nilfs object
+ * @entry_size: size of a dat entry
+ */
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size)
+{
+    static struct lock_class_key dat_lock_key;
+    struct inode *dat;
+    struct nilfs_dat_info *di;
+    int err;
+
+    dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di));
+    if (dat) {
+        err = nilfs_palloc_init_blockgroup(dat, entry_size);
+        if (unlikely(err)) {
+            nilfs_mdt_destroy(dat);
+            return NULL;
+        }
+
+        di = NILFS_DAT_I(dat);
+        lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
+        nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+    }
+    return dat;
+}
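nilfs_dat_new() above relies on a container idiom: struct nilfs_dat_info embeds the generic struct nilfs_mdt_info as its first member, nilfs_mdt_new() is asked to allocate sizeof(*di) bytes through its new size argument, and NILFS_DAT_I() recovers the private struct by casting the base pointer. A userspace sketch of that idiom (the type and function names below are stand-ins, not the nilfs API):

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the nilfs types; the real ones live in mdt.h and dat.c. */
struct mdt_info { unsigned entry_size; };

struct dat_info {
    struct mdt_info mi;     /* base object, must stay the first member */
    int palloc_cache;       /* private extension, as in nilfs_dat_info */
};

/* The constructor only knows the base type but is told the full size,
 * mirroring nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di)). */
static struct mdt_info *mdt_new(size_t objsz)
{
    if (objsz < sizeof(struct mdt_info))
        objsz = sizeof(struct mdt_info);    /* like kzalloc(max(...)) */
    return calloc(1, objsz);
}

/* Mirrors NILFS_DAT_I(): the downcast is safe because mi is first. */
static struct dat_info *DAT_I(struct mdt_info *mi)
{
    return (struct dat_info *)mi;
}

int main(void)
{
    struct mdt_info *mi = mdt_new(sizeof(struct dat_info));
    struct dat_info *di;

    if (!mi)
        return 1;
    di = DAT_I(mi);
    di->palloc_cache = 42;
    printf("%u %d\n", di->mi.entry_size, di->palloc_cache);  /* 0 42 */
    free(mi);
    return 0;
}

The ifile hunks below use the same layout rule (struct nilfs_ifile_info / NILFS_IFILE_I), and the mdt.c hunks supply the allocation half with kzalloc(max(sizeof(*mi), objsz), GFP_NOFS).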
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index 406070d3ff49..d31c3aab0efe 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,4 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
 ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);

+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode);
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size);
+
 #endif	/* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index e097099bfc8f..85c89dfc71f0 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -99,9 +99,9 @@ static int nilfs_prepare_chunk(struct page *page,
                   NULL, nilfs_get_block);
 }

-static int nilfs_commit_chunk(struct page *page,
-                  struct address_space *mapping,
-                  unsigned from, unsigned to)
+static void nilfs_commit_chunk(struct page *page,
+                   struct address_space *mapping,
+                   unsigned from, unsigned to)
 {
     struct inode *dir = mapping->host;
     struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
@@ -112,15 +112,13 @@ static int nilfs_commit_chunk(struct page *page,

     nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
     copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
-    if (pos + copied > dir->i_size) {
+    if (pos + copied > dir->i_size)
         i_size_write(dir, pos + copied);
-        mark_inode_dirty(dir);
-    }
     if (IS_DIRSYNC(dir))
         nilfs_set_transaction_flag(NILFS_TI_SYNC);
     err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
+    WARN_ON(err); /* do not happen */
     unlock_page(page);
-    return err;
 }

 static void nilfs_check_page(struct page *page)
@@ -226,7 +224,7 @@ fail:
  * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
  */
 static int
-nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de)
+nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
 {
     if (len != de->name_len)
         return 0;
@@ -351,11 +349,11 @@ done:
  * Entry is guaranteed to be valid.
  */
 struct nilfs_dir_entry *
-nilfs_find_entry(struct inode *dir, struct dentry *dentry,
+nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
          struct page **res_page)
 {
-    const char *name = dentry->d_name.name;
-    int namelen = dentry->d_name.len;
+    const unsigned char *name = qstr->name;
+    int namelen = qstr->len;
     unsigned reclen = NILFS_DIR_REC_LEN(namelen);
     unsigned long start, n;
     unsigned long npages = dir_pages(dir);
@@ -398,7 +396,7 @@ nilfs_find_entry(struct inode *dir, struct dentry *dentry,
     /* next page is past the blocks we've got */
     if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
         nilfs_error(dir->i_sb, __func__,
-                "dir %lu size %lld exceeds block cout %llu",
+                "dir %lu size %lld exceeds block count %llu",
                 dir->i_ino, dir->i_size,
                 (unsigned long long)dir->i_blocks);
         goto out;
@@ -426,13 +424,13 @@ struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
     return de;
 }

-ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry)
+ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
 {
     ino_t res = 0;
     struct nilfs_dir_entry *de;
     struct page *page;

-    de = nilfs_find_entry(dir, dentry, &page);
+    de = nilfs_find_entry(dir, qstr, &page);
     if (de) {
         res = le64_to_cpu(de->inode);
         kunmap(page);
@@ -455,11 +453,10 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
     BUG_ON(err);
     de->inode = cpu_to_le64(inode->i_ino);
     nilfs_set_de_type(de, inode);
-    err = nilfs_commit_chunk(page, mapping, from, to);
+    nilfs_commit_chunk(page, mapping, from, to);
     nilfs_put_page(page);
     dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 /*	NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
-    mark_inode_dirty(dir);
 }

 /*
@@ -468,7 +465,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 {
     struct inode *dir = dentry->d_parent->d_inode;
-    const char *name = dentry->d_name.name;
+    const unsigned char *name = dentry->d_name.name;
     int namelen = dentry->d_name.len;
     unsigned chunk_size = nilfs_chunk_size(dir);
     unsigned reclen = NILFS_DIR_REC_LEN(namelen);
@@ -548,10 +545,10 @@ got_it:
     memcpy(de->name, name, namelen);
     de->inode = cpu_to_le64(inode->i_ino);
     nilfs_set_de_type(de, inode);
-    err = nilfs_commit_chunk(page, page->mapping, from, to);
+    nilfs_commit_chunk(page, page->mapping, from, to);
     dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 /*	NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
-    mark_inode_dirty(dir);
+    nilfs_mark_inode_dirty(dir);
     /* OFFSET_CACHE */
 out_put:
     nilfs_put_page(page);
@@ -595,10 +592,9 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
     if (pde)
         pde->rec_len = cpu_to_le16(to - from);
     dir->inode = 0;
-    err = nilfs_commit_chunk(page, mapping, from, to);
+    nilfs_commit_chunk(page, mapping, from, to);
     inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 /*	NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
-    mark_inode_dirty(inode);
 out:
     nilfs_put_page(page);
     return err;
@@ -640,7 +636,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
     memcpy(de->name, "..\0", 4);
     nilfs_set_de_type(de, inode);
     kunmap_atomic(kaddr, KM_USER0);
-    err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
+    nilfs_commit_chunk(page, mapping, 0, chunk_size);
 fail:
     page_cache_release(page);
     return err;
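The dir.c hunks above narrow nilfs_find_entry()/nilfs_inode_by_name() to take a const struct qstr * (a counted name) instead of a whole dentry, which lets a caller fabricate a name on the stack, as the namei.c hunks later do for "..". An illustrative userspace sketch of the idea (struct qstr mirrors the kernel structure; everything else below is a stand-in):

#include <stdio.h>
#include <string.h>

/* Stand-in for the kernel's struct qstr: a counted, not NUL-delimited name. */
struct qstr {
    const unsigned char *name;
    unsigned int len;
};

/* Shape of the reworked nilfs_match(): compare a counted name against a
 * directory entry's name/len pair. */
static int match(const struct qstr *q, const unsigned char *de_name,
                 unsigned int de_len)
{
    return q->len == de_len && memcmp(q->name, de_name, de_len) == 0;
}

int main(void)
{
    /* Mirrors nilfs_get_parent() after the patch: a stack qstr replaces
     * the old dummy struct dentry that carried only a name. */
    struct qstr dotdot = { .name = (const unsigned char *)"..", .len = 2 };

    printf("%d\n", match(&dotdot, (const unsigned char *)"..", 2)); /* 1 */
    printf("%d\n", match(&dotdot, (const unsigned char *)".", 1));  /* 0 */
    return 0;
}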
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index d369ac718277..236753df5cdf 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -51,11 +51,11 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
     struct nilfs_direct *direct;
     __u64 ptr;

-    direct = (struct nilfs_direct *)bmap;
-    if ((key > NILFS_DIRECT_KEY_MAX) ||
-        (level != 1) ||	/* XXX: use macro for level 1 */
-        ((ptr = nilfs_direct_get_ptr(direct, key)) ==
-         NILFS_BMAP_INVALID_PTR))
+    direct = (struct nilfs_direct *)bmap;  /* XXX: use macro for level 1 */
+    if (key > NILFS_DIRECT_KEY_MAX || level != 1)
+        return -ENOENT;
+    ptr = nilfs_direct_get_ptr(direct, key);
+    if (ptr == NILFS_BMAP_INVALID_PTR)
         return -ENOENT;

     if (ptrp != NULL)
@@ -73,9 +73,10 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
     sector_t blocknr;
     int ret, cnt;

-    if (key > NILFS_DIRECT_KEY_MAX ||
-        (ptr = nilfs_direct_get_ptr(direct, key)) ==
-        NILFS_BMAP_INVALID_PTR)
+    if (key > NILFS_DIRECT_KEY_MAX)
+        return -ENOENT;
+    ptr = nilfs_direct_get_ptr(direct, key);
+    if (ptr == NILFS_BMAP_INVALID_PTR)
         return -ENOENT;

     if (NILFS_BMAP_USE_VBN(bmap)) {
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
index 93383c5cee90..dd5f7e0a95f6 100644
--- a/fs/nilfs2/gcdat.c
+++ b/fs/nilfs2/gcdat.c
@@ -61,6 +61,8 @@ void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)

     nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);

+    nilfs_palloc_clear_cache(dat);
+    nilfs_palloc_clear_cache(gcdat);
     nilfs_clear_dirty_pages(mapping);
     nilfs_copy_back_pages(mapping, gmapping);
     /* note: mdt dirty flags should be cleared by segctor. */
@@ -79,6 +81,7 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
     gcdat->i_state = I_CLEAR;
     gii->i_flags = 0;

+    nilfs_palloc_clear_cache(gcdat);
     truncate_inode_pages(gcdat->i_mapping, 0);
     truncate_inode_pages(&gii->i_btnode_cache, 0);
 }
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e6de0a27ab5d..145f03cd7d3e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,10 +28,10 @@
  * gcinodes), and this file provides lookup function of the dummy
  * inodes and their buffer read function.
  *
- * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it
+ * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
  * has to treat blocks that belong to a same file but have different
  * checkpoint numbers.  To avoid interference among generations, dummy
- * inodes are managed separatly from actual inodes, and their lookup
+ * inodes are managed separately from actual inodes, and their lookup
  * function (nilfs_gc_iget) is designed to be specified with a
  * checkpoint number argument as well as an inode number.
  *
@@ -45,6 +45,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
 #include "page.h"
@@ -149,7 +150,7 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
             __u64 vbn, struct buffer_head **out_bh)
 {
     int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
-                        vbn ? : pbn, pbn, out_bh, 0);
+                        vbn ? : pbn, pbn, out_bh);
     if (ret == -EEXIST) /* internal code (cache hit) */
         ret = 0;
     return ret;
@@ -212,9 +213,10 @@ void nilfs_destroy_gccache(struct the_nilfs *nilfs)
 static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
                    __u64 cno)
 {
-    struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
+    struct inode *inode;
     struct nilfs_inode_info *ii;

+    inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
     if (!inode)
         return NULL;

@@ -265,7 +267,6 @@ struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
  */
 void nilfs_clear_gcinode(struct inode *inode)
 {
-    nilfs_mdt_clear(inode);
     nilfs_mdt_destroy(inode);
 }

diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index de86401f209f..922d9dd42c8f 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -29,6 +29,17 @@
 #include "alloc.h"
 #include "ifile.h"

+
+struct nilfs_ifile_info {
+    struct nilfs_mdt_info mi;
+    struct nilfs_palloc_cache palloc_cache;
+};
+
+static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile)
+{
+    return (struct nilfs_ifile_info *)NILFS_MDT(ifile);
+}
+
 /**
  * nilfs_ifile_create_inode - create a new disk inode
  * @ifile: ifile inode
@@ -148,3 +159,27 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
     }
     return err;
 }
+
+/**
+ * nilfs_ifile_new - create inode file
+ * @sbi: nilfs_sb_info struct
+ * @inode_size: size of an inode
+ */
+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size)
+{
+    struct inode *ifile;
+    int err;
+
+    ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO,
+                  sizeof(struct nilfs_ifile_info));
+    if (ifile) {
+        err = nilfs_palloc_init_blockgroup(ifile, inode_size);
+        if (unlikely(err)) {
+            nilfs_mdt_destroy(ifile);
+            return NULL;
+        }
+        nilfs_palloc_setup_cache(ifile,
+                     &NILFS_IFILE_I(ifile)->palloc_cache);
+    }
+    return ifile;
+}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index ecc3ba76db47..cbca32e498f2 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,4 +49,6 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);

+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size);
+
 #endif	/* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2a0a5a3ac134..0957b58f909d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -22,6 +22,7 @@
  */

 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
 #include <linux/uio.h>
@@ -97,6 +98,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
             nilfs_transaction_abort(inode->i_sb);
             goto out;
         }
+        nilfs_mark_inode_dirty(inode);
         nilfs_transaction_commit(inode->i_sb); /* never fails */
         /* Error handling should be detailed */
         set_buffer_new(bh_result);
@@ -322,7 +324,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
        nilfs_init_acl(), proper cancellation of
        above jobs should be considered */

-    mark_inode_dirty(inode);
     return inode;

  failed_acl:
@@ -525,7 +526,6 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)

     raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);

-    /* The buffer is guarded with lock_buffer() by the caller */
     if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
         memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
     set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
@@ -599,6 +599,7 @@ void nilfs_truncate(struct inode *inode)
     if (IS_SYNC(inode))
         nilfs_set_transaction_flag(NILFS_TI_SYNC);

+    nilfs_mark_inode_dirty(inode);
     nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
     nilfs_transaction_commit(sb);
     /* May construct a logical segment and may fail in sync mode.
@@ -623,6 +624,7 @@ void nilfs_delete_inode(struct inode *inode)
     truncate_inode_pages(&inode->i_data, 0);

     nilfs_truncate_bmap(ii, 0);
+    nilfs_mark_inode_dirty(inode);
     nilfs_free_inode(inode);
     /* nilfs_free_inode() marks inode buffer dirty */
     if (IS_SYNC(inode))
@@ -745,9 +747,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
                 "failed to reget inode block.\n");
         return err;
     }
-    lock_buffer(ibh);
     nilfs_update_inode(inode, ibh);
-    unlock_buffer(ibh);
     nilfs_mdt_mark_buffer_dirty(ibh);
     nilfs_mdt_mark_dirty(sbi->s_ifile);
     brelse(ibh);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f6af76042d80..c2ff1b306012 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -23,9 +23,11 @@
 #include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/smp_lock.h>	/* lock_kernel(), unlock_kernel() */
+#include <linux/slab.h>
 #include <linux/capability.h>	/* capable() */
 #include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
 #include <linux/vmalloc.h>
+#include <linux/mount.h>	/* mnt_want_write(), mnt_drop_write() */
 #include <linux/nilfs2_fs.h>
 #include "nilfs.h"
 #include "segment.h"
@@ -107,20 +109,28 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,

     if (!capable(CAP_SYS_ADMIN))
         return -EPERM;
+
+    ret = mnt_want_write(filp->f_path.mnt);
+    if (ret)
+        return ret;
+
+    ret = -EFAULT;
     if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
-        return -EFAULT;
+        goto out;

     mutex_lock(&nilfs->ns_mount_mutex);
+
     nilfs_transaction_begin(inode->i_sb, &ti, 0);
     ret = nilfs_cpfile_change_cpmode(
         cpfile, cpmode.cm_cno, cpmode.cm_mode);
-    if (unlikely(ret < 0)) {
+    if (unlikely(ret < 0))
         nilfs_transaction_abort(inode->i_sb);
-        mutex_unlock(&nilfs->ns_mount_mutex);
-        return ret;
-    }
-    nilfs_transaction_commit(inode->i_sb); /* never fails */
+    else
+        nilfs_transaction_commit(inode->i_sb); /* never fails */
+
     mutex_unlock(&nilfs->ns_mount_mutex);
+out:
+    mnt_drop_write(filp->f_path.mnt);
     return ret;
 }

@@ -135,16 +145,23 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,

     if (!capable(CAP_SYS_ADMIN))
         return -EPERM;
+
+    ret = mnt_want_write(filp->f_path.mnt);
+    if (ret)
+        return ret;
+
+    ret = -EFAULT;
     if (copy_from_user(&cno, argp, sizeof(cno)))
-        return -EFAULT;
+        goto out;

     nilfs_transaction_begin(inode->i_sb, &ti, 0);
     ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
-    if (unlikely(ret < 0)) {
+    if (unlikely(ret < 0))
         nilfs_transaction_abort(inode->i_sb);
-        return ret;
-    }
-    nilfs_transaction_commit(inode->i_sb); /* never fails */
+    else
+        nilfs_transaction_commit(inode->i_sb); /* never fails */
+out:
+    mnt_drop_write(filp->f_path.mnt);
     return ret;
 }

@@ -480,7 +497,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
                       unsigned int cmd, void __user *argp)
 {
     struct nilfs_argv argv[5];
-    const static size_t argsz[5] = {
+    static const size_t argsz[5] = {
         sizeof(struct nilfs_vdesc),
         sizeof(struct nilfs_period),
         sizeof(__u64),
@@ -496,12 +513,19 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
     if (!capable(CAP_SYS_ADMIN))
         return -EPERM;

+    ret = mnt_want_write(filp->f_path.mnt);
+    if (ret)
+        return ret;
+
+    ret = -EFAULT;
     if (copy_from_user(argv, argp, sizeof(argv)))
-        return -EFAULT;
+        goto out;

+    ret = -EINVAL;
     nsegs = argv[4].v_nmembs;
     if (argv[4].v_size != argsz[4])
-        return -EINVAL;
+        goto out;
+
     /*
      * argv[4] points to segment numbers this ioctl cleans.  We
      * use kmalloc() for its buffer because memory used for the
@@ -509,9 +533,10 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
      */
     kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
                    nsegs * sizeof(__u64));
-    if (IS_ERR(kbufs[4]))
-        return PTR_ERR(kbufs[4]);
-
+    if (IS_ERR(kbufs[4])) {
+        ret = PTR_ERR(kbufs[4]);
+        goto out;
+    }
     nilfs = NILFS_SB(inode->i_sb)->s_nilfs;

     for (n = 0; n < 4; n++) {
@@ -563,10 +588,12 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
     nilfs_remove_all_gcinode(nilfs);
     clear_nilfs_gc_running(nilfs);

- out_free:
+out_free:
     while (--n >= 0)
         vfree(kbufs[n]);
     kfree(kbufs[4]);
+out:
+    mnt_drop_write(filp->f_path.mnt);
     return ret;
 }

@@ -575,13 +602,17 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
 {
     __u64 cno;
     int ret;
+    struct the_nilfs *nilfs;

     ret = nilfs_construct_segment(inode->i_sb);
     if (ret < 0)
         return ret;

     if (argp != NULL) {
-        cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
+        nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+        down_read(&nilfs->ns_segctor_sem);
+        cno = nilfs->ns_cno - 1;
+        up_read(&nilfs->ns_segctor_sem);
         if (copy_to_user(argp, &cno, sizeof(cno)))
             return -EFAULT;
     }
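Each modifying ioctl above now brackets its work with mnt_want_write()/mnt_drop_write() and funnels every failure through a single out label, so the write reference on the mount cannot leak on an error path. A compile-and-run sketch of that control-flow shape, with userspace stubs standing in for the kernel helpers (only the helper names come from the patch; the bodies below are fakes):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Userspace stand-ins for the kernel helpers named in the patch; the real
 * mnt_want_write()/mnt_drop_write() pin the mount writable. */
static int mnt_want_write(void *mnt) { (void)mnt; return 0; }
static void mnt_drop_write(void *mnt) { (void)mnt; }

/* Returns nonzero on "fault", like the kernel's copy_from_user(). */
static unsigned long copy_from_user(void *dst, const void *src, size_t n)
{
    if (!src)
        return n;       /* simulate a faulting user pointer */
    memcpy(dst, src, n);
    return 0;
}

/* Shape of nilfs_ioctl_delete_checkpoint() after the patch: take the write
 * reference first, then route every failure through "out" so the reference
 * is always dropped exactly once. */
static int ioctl_delete_checkpoint(void *mnt, const void *argp)
{
    unsigned long long cno;
    int ret;

    ret = mnt_want_write(mnt);
    if (ret)
        return ret;

    ret = -EFAULT;
    if (copy_from_user(&cno, argp, sizeof(cno)))
        goto out;

    printf("deleting checkpoint %llu\n", cno); /* transaction body */
    ret = 0;
out:
    mnt_drop_write(mnt);
    return ret;
}

int main(void)
{
    unsigned long long cno = 7;

    printf("ok path -> %d\n", ioctl_delete_checkpoint(NULL, &cno));
    printf("fault path -> %d\n", ioctl_delete_checkpoint(NULL, NULL));
    return 0;
}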
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index f6326112d647..024be8c35bb6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -26,6 +26,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "segment.h"
 #include "page.h"
@@ -186,7 +187,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 }

 static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
-                struct buffer_head **out_bh)
+                int readahead, struct buffer_head **out_bh)
 {
     struct buffer_head *first_bh, *bh;
     unsigned long blkoff;
@@ -200,16 +201,18 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
     if (unlikely(err))
         goto failed;

-    blkoff = block + 1;
-    for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
-        err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
-        if (likely(!err || err == -EEXIST))
-            brelse(bh);
-        else if (err != -EBUSY)
-            break; /* abort readahead if bmap lookup failed */
-
-        if (!buffer_locked(first_bh))
-            goto out_no_wait;
+    if (readahead) {
+        blkoff = block + 1;
+        for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
+            err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
+            if (likely(!err || err == -EEXIST))
+                brelse(bh);
+            else if (err != -EBUSY)
+                break;
+                /* abort readahead if bmap lookup failed */
+            if (!buffer_locked(first_bh))
+                goto out_no_wait;
+        }
     }

     wait_on_buffer(first_bh);
@@ -263,7 +266,7 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,

     /* Should be rewritten with merging nilfs_mdt_read_block() */
  retry:
-    ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
+    ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh);
     if (!create || ret != -ENOENT)
         return ret;

@@ -371,7 +374,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
     struct buffer_head *bh;
     int err;

-    err = nilfs_mdt_read_block(inode, block, &bh);
+    err = nilfs_mdt_read_block(inode, block, 0, &bh);
     if (unlikely(err))
         return err;
     nilfs_mark_buffer_dirty(bh);
@@ -445,9 +448,17 @@ static const struct file_operations def_mdt_fops;
  * longer than those of the super block structs; they may continue for
  * several consecutive mounts/umounts.  This would need discussions.
  */
+/**
+ * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
+ * @nilfs: nilfs object
+ * @sb: super block instance the metadata file belongs to
+ * @ino: inode number
+ * @gfp_mask: gfp mask for data pages
+ * @objsz: size of the private object attached to inode->i_private
+ */
 struct inode *
 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
-             ino_t ino, gfp_t gfp_mask)
+             ino_t ino, gfp_t gfp_mask, size_t objsz)
 {
     struct inode *inode = nilfs_alloc_inode_common(nilfs);

@@ -455,8 +466,9 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
         return NULL;
     else {
         struct address_space * const mapping = &inode->i_data;
-        struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
+        struct nilfs_mdt_info *mi;

+        mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
         if (!mi) {
             nilfs_destroy_inode(inode);
             return NULL;
@@ -513,11 +525,11 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 }

 struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
-                ino_t ino)
+                ino_t ino, size_t objsz)
 {
-    struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
-                           NILFS_MDT_GFP);
+    struct inode *inode;

+    inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
     if (!inode)
         return NULL;

@@ -544,14 +556,15 @@ void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
         &NILFS_I(orig)->i_btnode_cache;
 }

-void nilfs_mdt_clear(struct inode *inode)
+static void nilfs_mdt_clear(struct inode *inode)
 {
     struct nilfs_inode_info *ii = NILFS_I(inode);

     invalidate_mapping_pages(inode->i_mapping, 0, -1);
     truncate_inode_pages(inode->i_mapping, 0);

-    nilfs_bmap_clear(ii->i_bmap);
+    if (test_bit(NILFS_I_BMAP, &ii->i_state))
+        nilfs_bmap_clear(ii->i_bmap);
     nilfs_btnode_cache_clear(&ii->i_btnode_cache);
 }

@@ -559,6 +572,10 @@ void nilfs_mdt_destroy(struct inode *inode)
 {
     struct nilfs_mdt_info *mdi = NILFS_MDT(inode);

+    if (mdi->mi_palloc_cache)
+        nilfs_palloc_destroy_cache(inode);
+    nilfs_mdt_clear(inode);
+
     kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
     kfree(mdi);
     nilfs_destroy_inode(inode);
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 431599733c9b..6c4bbb0470fc 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -36,6 +36,7 @@
  * @mi_entry_size: size of an entry
  * @mi_first_entry_offset: offset to the first entry
  * @mi_entries_per_block: number of entries in a block
+ * @mi_palloc_cache: persistent object allocator cache
  * @mi_blocks_per_group: number of blocks in a group
  * @mi_blocks_per_desc_block: number of blocks per descriptor block
  */
@@ -46,6 +47,7 @@ struct nilfs_mdt_info {
     unsigned		mi_entry_size;
     unsigned		mi_first_entry_offset;
     unsigned long		mi_entries_per_block;
+    struct nilfs_palloc_cache *mi_palloc_cache;
     unsigned long		mi_blocks_per_group;
     unsigned long		mi_blocks_per_desc_block;
 };
@@ -74,11 +76,11 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);

-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
+struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
+                size_t);
 struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
-                   ino_t, gfp_t);
+                   ino_t, gfp_t, size_t);
 void nilfs_mdt_destroy(struct inode *);
-void nilfs_mdt_clear(struct inode *);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
 void nilfs_mdt_set_shadow(struct inode *, struct inode *);

@@ -104,21 +106,4 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode)
 #define nilfs_mdt_bgl_lock(inode, bg) \
     (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)

-
-static inline int
-nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
-                unsigned n)
-{
-    return nilfs_read_inode_common(
-        inode, (struct nilfs_inode *)(bh->b_data + n));
-}
-
-static inline void
-nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
-                 unsigned n)
-{
-    nilfs_write_inode_common(
-        inode, (struct nilfs_inode *)(bh->b_data + n), 1);
-}
-
 #endif	/* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ed02e886fa79..ad6ed2cf19b4 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -67,7 +67,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
     if (dentry->d_name.len > NILFS_NAME_LEN)
         return ERR_PTR(-ENAMETOOLONG);

-    ino = nilfs_inode_by_name(dir, dentry);
+    ino = nilfs_inode_by_name(dir, &dentry->d_name);
     inode = NULL;
     if (ino) {
         inode = nilfs_iget(dir->i_sb, ino);
@@ -81,10 +81,7 @@ struct dentry *nilfs_get_parent(struct dentry *child)
 {
     unsigned long ino;
     struct inode *inode;
-    struct dentry dotdot;
-
-    dotdot.d_name.name = "..";
-    dotdot.d_name.len = 2;
+    struct qstr dotdot = {.name = "..", .len = 2};

     ino = nilfs_inode_by_name(child->d_inode, &dotdot);
     if (!ino)
@@ -120,7 +117,7 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
         inode->i_op = &nilfs_file_inode_operations;
         inode->i_fop = &nilfs_file_operations;
         inode->i_mapping->a_ops = &nilfs_aops;
-        mark_inode_dirty(inode);
+        nilfs_mark_inode_dirty(inode);
         err = nilfs_add_nondir(dentry, inode);
     }
     if (!err)
@@ -148,7 +145,7 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
     err = PTR_ERR(inode);
     if (!IS_ERR(inode)) {
         init_special_inode(inode, inode->i_mode, rdev);
-        mark_inode_dirty(inode);
+        nilfs_mark_inode_dirty(inode);
         err = nilfs_add_nondir(dentry, inode);
     }
     if (!err)
@@ -188,7 +185,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
         goto out_fail;

     /* mark_inode_dirty(inode); */
-    /* nilfs_new_inode() and page_symlink() do this */
+    /* page_symlink() do this */

     err = nilfs_add_nondir(dentry, inode);
 out:
@@ -200,7 +197,8 @@ out:
     return err;

 out_fail:
-    inode_dec_link_count(inode);
+    drop_nlink(inode);
+    nilfs_mark_inode_dirty(inode);
     iput(inode);
     goto out;
 }
@@ -245,7 +243,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
     if (err)
         return err;

-    inode_inc_link_count(dir);
+    inc_nlink(dir);

     inode = nilfs_new_inode(dir, S_IFDIR | mode);
     err = PTR_ERR(inode);
@@ -256,7 +254,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
     inode->i_fop = &nilfs_dir_operations;
     inode->i_mapping->a_ops = &nilfs_aops;

-    inode_inc_link_count(inode);
+    inc_nlink(inode);

     err = nilfs_make_empty(inode, dir);
     if (err)
@@ -266,6 +264,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
     if (err)
         goto out_fail;

+    nilfs_mark_inode_dirty(inode);
     d_instantiate(dentry, inode);
 out:
     if (!err)
@@ -276,28 +275,25 @@ out:
     return err;

 out_fail:
-    inode_dec_link_count(inode);
-    inode_dec_link_count(inode);
+    drop_nlink(inode);
+    drop_nlink(inode);
+    nilfs_mark_inode_dirty(inode);
     iput(inode);
 out_dir:
-    inode_dec_link_count(dir);
+    drop_nlink(dir);
+    nilfs_mark_inode_dirty(dir);
     goto out;
 }

-static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
 {
     struct inode *inode;
     struct nilfs_dir_entry *de;
     struct page *page;
-    struct nilfs_transaction_info ti;
     int err;

-    err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
-    if (err)
-        return err;
-
     err = -ENOENT;
-    de = nilfs_find_entry(dir, dentry, &page);
+    de = nilfs_find_entry(dir, &dentry->d_name, &page);
     if (!de)
         goto out;

@@ -317,12 +313,28 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
         goto out;

     inode->i_ctime = dir->i_ctime;
-    inode_dec_link_count(inode);
+    drop_nlink(inode);
     err = 0;
 out:
-    if (!err)
+    return err;
+}
+
+static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+    struct nilfs_transaction_info ti;
+    int err;
+
+    err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
+    if (err)
+        return err;
+
+    err = nilfs_do_unlink(dir, dentry);
+
+    if (!err) {
+        nilfs_mark_inode_dirty(dir);
+        nilfs_mark_inode_dirty(dentry->d_inode);
         err = nilfs_transaction_commit(dir->i_sb);
-    else
+    } else
         nilfs_transaction_abort(dir->i_sb);

     return err;
@@ -340,11 +352,13 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)

     err = -ENOTEMPTY;
     if (nilfs_empty_dir(inode)) {
-        err = nilfs_unlink(dir, dentry);
+        err = nilfs_do_unlink(dir, dentry);
         if (!err) {
             inode->i_size = 0;
-            inode_dec_link_count(inode);
-            inode_dec_link_count(dir);
+            drop_nlink(inode);
+            nilfs_mark_inode_dirty(inode);
+            drop_nlink(dir);
+            nilfs_mark_inode_dirty(dir);
         }
     }
     if (!err)
@@ -372,7 +386,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
372 return err; 386 return err;
373 387
374 err = -ENOENT; 388 err = -ENOENT;
375 old_de = nilfs_find_entry(old_dir, old_dentry, &old_page); 389 old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page);
376 if (!old_de) 390 if (!old_de)
377 goto out; 391 goto out;
378 392
@@ -392,45 +406,51 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
392 goto out_dir; 406 goto out_dir;
393 407
394 err = -ENOENT; 408 err = -ENOENT;
395 new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); 409 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
396 if (!new_de) 410 if (!new_de)
397 goto out_dir; 411 goto out_dir;
398 inode_inc_link_count(old_inode); 412 inc_nlink(old_inode);
399 nilfs_set_link(new_dir, new_de, new_page, old_inode); 413 nilfs_set_link(new_dir, new_de, new_page, old_inode);
414 nilfs_mark_inode_dirty(new_dir);
400 new_inode->i_ctime = CURRENT_TIME; 415 new_inode->i_ctime = CURRENT_TIME;
401 if (dir_de) 416 if (dir_de)
402 drop_nlink(new_inode); 417 drop_nlink(new_inode);
403 inode_dec_link_count(new_inode); 418 drop_nlink(new_inode);
419 nilfs_mark_inode_dirty(new_inode);
404 } else { 420 } else {
405 if (dir_de) { 421 if (dir_de) {
406 err = -EMLINK; 422 err = -EMLINK;
407 if (new_dir->i_nlink >= NILFS_LINK_MAX) 423 if (new_dir->i_nlink >= NILFS_LINK_MAX)
408 goto out_dir; 424 goto out_dir;
409 } 425 }
410 inode_inc_link_count(old_inode); 426 inc_nlink(old_inode);
411 err = nilfs_add_link(new_dentry, old_inode); 427 err = nilfs_add_link(new_dentry, old_inode);
412 if (err) { 428 if (err) {
413 inode_dec_link_count(old_inode); 429 drop_nlink(old_inode);
430 nilfs_mark_inode_dirty(old_inode);
414 goto out_dir; 431 goto out_dir;
415 } 432 }
416 if (dir_de) 433 if (dir_de) {
417 inode_inc_link_count(new_dir); 434 inc_nlink(new_dir);
435 nilfs_mark_inode_dirty(new_dir);
436 }
418 } 437 }
419 438
420 /* 439 /*
421 * Like most other Unix systems, set the ctime for inodes on a 440 * Like most other Unix systems, set the ctime for inodes on a
422 * rename. 441 * rename.
423 * inode_dec_link_count() will mark the inode dirty.
424 */ 442 */
425 old_inode->i_ctime = CURRENT_TIME; 443 old_inode->i_ctime = CURRENT_TIME;
426 444
427 nilfs_delete_entry(old_de, old_page); 445 nilfs_delete_entry(old_de, old_page);
428 inode_dec_link_count(old_inode); 446 drop_nlink(old_inode);
429 447
430 if (dir_de) { 448 if (dir_de) {
431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir); 449 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
432 inode_dec_link_count(old_dir); 450 drop_nlink(old_dir);
433 } 451 }
452 nilfs_mark_inode_dirty(old_dir);
453 nilfs_mark_inode_dirty(old_inode);
434 454
435 err = nilfs_transaction_commit(old_dir->i_sb); 455 err = nilfs_transaction_commit(old_dir->i_sb);
436 return err; 456 return err;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 4da6f67e9a91..8723e5bfd071 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -217,10 +217,10 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
217 217
218/* dir.c */ 218/* dir.c */
219extern int nilfs_add_link(struct dentry *, struct inode *); 219extern int nilfs_add_link(struct dentry *, struct inode *);
220extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *); 220extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
221extern int nilfs_make_empty(struct inode *, struct inode *); 221extern int nilfs_make_empty(struct inode *, struct inode *);
222extern struct nilfs_dir_entry * 222extern struct nilfs_dir_entry *
223nilfs_find_entry(struct inode *, struct dentry *, struct page **); 223nilfs_find_entry(struct inode *, const struct qstr *, struct page **);
224extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); 224extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
225extern int nilfs_empty_dir(struct inode *); 225extern int nilfs_empty_dir(struct inode *);
226extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); 226extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a2692bbc7b50..8de3e1e48130 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -29,6 +29,7 @@
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/pagevec.h> 31#include <linux/pagevec.h>
32#include <linux/gfp.h>
32#include "nilfs.h" 33#include "nilfs.h"
33#include "page.h" 34#include "page.h"
34#include "mdt.h" 35#include "mdt.h"
@@ -292,7 +293,7 @@ void nilfs_free_private_page(struct page *page)
292 * @src: source page 293 * @src: source page
293 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. 294 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
294 * 295 *
295 * This fuction is for both data pages and btnode pages. The dirty flag 296 * This function is for both data pages and btnode pages. The dirty flag
296 * should be treated by caller. The page must not be under i/o. 297 * should be treated by caller. The page must not be under i/o.
297 * Both src and dst page must be locked 298 * Both src and dst page must be locked
298 */ 299 */
@@ -388,7 +389,7 @@ repeat:
388} 389}
389 390
390/** 391/**
391 * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache 392 * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
392 * @dmap: destination page cache 393 * @dmap: destination page cache
393 * @smap: source page cache 394 * @smap: source page cache
394 * 395 *
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 6dc83591d118..ba43146f3c30 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -23,6 +23,7 @@
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/swap.h> 25#include <linux/swap.h>
26#include <linux/slab.h>
26#include <linux/crc32.h> 27#include <linux/crc32.h>
27#include "nilfs.h" 28#include "nilfs.h"
28#include "segment.h" 29#include "segment.h"
@@ -39,7 +40,6 @@ enum {
39 NILFS_SEG_FAIL_IO, 40 NILFS_SEG_FAIL_IO,
40 NILFS_SEG_FAIL_MAGIC, 41 NILFS_SEG_FAIL_MAGIC,
41 NILFS_SEG_FAIL_SEQ, 42 NILFS_SEG_FAIL_SEQ,
42 NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
43 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, 43 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
44 NILFS_SEG_FAIL_CHECKSUM_FULL, 44 NILFS_SEG_FAIL_CHECKSUM_FULL,
45 NILFS_SEG_FAIL_CONSISTENCY, 45 NILFS_SEG_FAIL_CONSISTENCY,
@@ -71,10 +71,6 @@ static int nilfs_warn_segment_error(int err)
71 printk(KERN_WARNING 71 printk(KERN_WARNING
72 "NILFS warning: Sequence number mismatch\n"); 72 "NILFS warning: Sequence number mismatch\n");
73 break; 73 break;
74 case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
75 printk(KERN_WARNING
76 "NILFS warning: Checksum error in segment summary\n");
77 break;
78 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: 74 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
79 printk(KERN_WARNING 75 printk(KERN_WARNING
80 "NILFS warning: Checksum error in super root\n"); 76 "NILFS warning: Checksum error in super root\n");
@@ -206,19 +202,15 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
206 * @pseg_start: start disk block number of partial segment 202 * @pseg_start: start disk block number of partial segment
207 * @seg_seq: sequence number requested 203 * @seg_seq: sequence number requested
208 * @ssi: pointer to nilfs_segsum_info struct to store information 204 * @ssi: pointer to nilfs_segsum_info struct to store information
209 * @full_check: full check flag
210 * (0: only checks segment summary CRC, 1: data CRC)
211 */ 205 */
212static int 206static int
213load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, 207load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
214 u64 seg_seq, struct nilfs_segsum_info *ssi, 208 u64 seg_seq, struct nilfs_segsum_info *ssi)
215 int full_check)
216{ 209{
217 struct buffer_head *bh_sum; 210 struct buffer_head *bh_sum;
218 struct nilfs_segment_summary *sum; 211 struct nilfs_segment_summary *sum;
219 unsigned long offset, nblock; 212 unsigned long nblock;
220 u64 check_bytes; 213 u32 crc;
221 u32 crc, crc_sum;
222 int ret = NILFS_SEG_FAIL_IO; 214 int ret = NILFS_SEG_FAIL_IO;
223 215
224 bh_sum = sb_bread(sbi->s_super, pseg_start); 216 bh_sum = sb_bread(sbi->s_super, pseg_start);
@@ -237,34 +229,24 @@ load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
237 ret = NILFS_SEG_FAIL_SEQ; 229 ret = NILFS_SEG_FAIL_SEQ;
238 goto failed; 230 goto failed;
239 } 231 }
240 if (full_check) {
241 offset = sizeof(sum->ss_datasum);
242 check_bytes =
243 ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
244 nblock = ssi->nblocks;
245 crc_sum = le32_to_cpu(sum->ss_datasum);
246 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
247 } else { /* only checks segment summary */
248 offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
249 check_bytes = ssi->sumbytes;
250 nblock = ssi->nsumblk;
251 crc_sum = le32_to_cpu(sum->ss_sumsum);
252 ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
253 }
254 232
233 nblock = ssi->nblocks;
255 if (unlikely(nblock == 0 || 234 if (unlikely(nblock == 0 ||
256 nblock > sbi->s_nilfs->ns_blocks_per_segment)) { 235 nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
257 /* This limits the number of blocks read in the CRC check */ 236 /* This limits the number of blocks read in the CRC check */
258 ret = NILFS_SEG_FAIL_CONSISTENCY; 237 ret = NILFS_SEG_FAIL_CONSISTENCY;
259 goto failed; 238 goto failed;
260 } 239 }
261 if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes, 240 if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
241 ((u64)nblock << sbi->s_super->s_blocksize_bits),
262 pseg_start, nblock)) { 242 pseg_start, nblock)) {
263 ret = NILFS_SEG_FAIL_IO; 243 ret = NILFS_SEG_FAIL_IO;
264 goto failed; 244 goto failed;
265 } 245 }
266 if (crc == crc_sum) 246 if (crc == le32_to_cpu(sum->ss_datasum))
267 ret = 0; 247 ret = 0;
248 else
249 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
268 failed: 250 failed:
269 brelse(bh_sum); 251 brelse(bh_sum);
270 out: 252 out:
@@ -598,7 +580,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
598 580
599 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { 581 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
600 582
601 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); 583 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
602 if (ret) { 584 if (ret) {
603 if (ret == NILFS_SEG_FAIL_IO) { 585 if (ret == NILFS_SEG_FAIL_IO) {
604 err = -EIO; 586 err = -EIO;
@@ -770,14 +752,8 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
770 nilfs_finish_roll_forward(nilfs, sbi, ri); 752 nilfs_finish_roll_forward(nilfs, sbi, ri);
771 } 753 }
772 754
773 nilfs_detach_checkpoint(sbi);
774 return 0;
775
776 failed: 755 failed:
777 nilfs_detach_checkpoint(sbi); 756 nilfs_detach_checkpoint(sbi);
778 nilfs_mdt_clear(nilfs->ns_cpfile);
779 nilfs_mdt_clear(nilfs->ns_sufile);
780 nilfs_mdt_clear(nilfs->ns_dat);
781 return err; 757 return err;
782} 758}
783 759
@@ -804,6 +780,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
804 struct nilfs_segsum_info ssi; 780 struct nilfs_segsum_info ssi;
805 sector_t pseg_start, pseg_end, sr_pseg_start = 0; 781 sector_t pseg_start, pseg_end, sr_pseg_start = 0;
806 sector_t seg_start, seg_end; /* range of full segment (block number) */ 782 sector_t seg_start, seg_end; /* range of full segment (block number) */
783 sector_t b, end;
807 u64 seg_seq; 784 u64 seg_seq;
808 __u64 segnum, nextnum = 0; 785 __u64 segnum, nextnum = 0;
809 __u64 cno; 786 __u64 cno;
@@ -819,9 +796,14 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
819 /* Calculate range of segment */ 796 /* Calculate range of segment */
820 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); 797 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
821 798
799 /* Read ahead segment */
800 b = seg_start;
801 while (b <= seg_end)
802 sb_breadahead(sbi->s_super, b++);
803
822 for (;;) { 804 for (;;) {
823 /* Load segment summary */ 805 /* Load segment summary */
824 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); 806 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
825 if (ret) { 807 if (ret) {
826 if (ret == NILFS_SEG_FAIL_IO) 808 if (ret == NILFS_SEG_FAIL_IO)
827 goto failed; 809 goto failed;
@@ -841,14 +823,20 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
841 ri->ri_nextnum = nextnum; 823 ri->ri_nextnum = nextnum;
842 empty_seg = 0; 824 empty_seg = 0;
843 825
826 if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) {
827 /* This will never happen because a superblock
828 (last_segment) always points to a pseg
829 having a super root. */
830 ret = NILFS_SEG_FAIL_CONSISTENCY;
831 goto failed;
832 }
833
834 if (pseg_start == seg_start) {
835 nilfs_get_segment_range(nilfs, nextnum, &b, &end);
836 while (b <= end)
837 sb_breadahead(sbi->s_super, b++);
838 }
844 if (!NILFS_SEG_HAS_SR(&ssi)) { 839 if (!NILFS_SEG_HAS_SR(&ssi)) {
845 if (!scan_newer) {
846 /* This will never happen because a superblock
847 (last_segment) always points to a pseg
848 having a super root. */
849 ret = NILFS_SEG_FAIL_CONSISTENCY;
850 goto failed;
851 }
852 if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) { 840 if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
853 ri->ri_lsegs_start = pseg_start; 841 ri->ri_lsegs_start = pseg_start;
854 ri->ri_lsegs_start_seq = seg_seq; 842 ri->ri_lsegs_start_seq = seg_seq;
@@ -919,7 +907,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
919 907
920 super_root_found: 908 super_root_found:
921 /* Updating pointers relating to the latest checkpoint */ 909 /* Updating pointers relating to the latest checkpoint */
922 list_splice(&segments, ri->ri_used_segments.prev); 910 list_splice_tail(&segments, &ri->ri_used_segments);
923 nilfs->ns_last_pseg = sr_pseg_start; 911 nilfs->ns_last_pseg = sr_pseg_start;
924 nilfs->ns_last_seq = nilfs->ns_seg_seq; 912 nilfs->ns_last_seq = nilfs->ns_seg_seq;
925 nilfs->ns_last_cno = ri->ri_cno; 913 nilfs->ns_last_cno = ri->ri_cno;
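
The recovery.c hunks above add explicit read-ahead so that scanning a segment's partial segments does not degrade into synchronous single-block reads. A sketch of the same pattern in isolation, assuming only <linux/buffer_head.h>; example_readahead_segment() is hypothetical:

#include <linux/buffer_head.h>

static void example_readahead_segment(struct super_block *sb,
				      sector_t seg_start, sector_t seg_end)
{
	sector_t b;

	/* Queue asynchronous reads for every block of the segment;
	 * the subsequent sb_bread() calls then mostly hit the page
	 * cache instead of blocking on the device. */
	for (b = seg_start; b <= seg_end; b++)
		sb_breadahead(sb, b);
}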
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index e6d9e37fa241..17851f77f739 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -24,10 +24,28 @@
24#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
25#include <linux/writeback.h> 25#include <linux/writeback.h>
26#include <linux/crc32.h> 26#include <linux/crc32.h>
27#include <linux/backing-dev.h>
28#include <linux/slab.h>
27#include "page.h" 29#include "page.h"
28#include "segbuf.h" 30#include "segbuf.h"
29 31
30 32
33struct nilfs_write_info {
34 struct the_nilfs *nilfs;
35 struct bio *bio;
36 int start, end; /* The region to be submitted */
37 int rest_blocks;
38 int max_pages;
39 int nr_vecs;
40 sector_t blocknr;
41};
42
43
44static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
45 struct the_nilfs *nilfs);
46static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
47
48
31static struct kmem_cache *nilfs_segbuf_cachep; 49static struct kmem_cache *nilfs_segbuf_cachep;
32 50
33static void nilfs_segbuf_init_once(void *obj) 51static void nilfs_segbuf_init_once(void *obj)
@@ -63,6 +81,11 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
63 INIT_LIST_HEAD(&segbuf->sb_list); 81 INIT_LIST_HEAD(&segbuf->sb_list);
64 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); 82 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
65 INIT_LIST_HEAD(&segbuf->sb_payload_buffers); 83 INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
84
85 init_completion(&segbuf->sb_bio_event);
86 atomic_set(&segbuf->sb_err, 0);
87 segbuf->sb_nbio = 0;
88
66 return segbuf; 89 return segbuf;
67} 90}
68 91
@@ -83,6 +106,22 @@ void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
83 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; 106 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
84} 107}
85 108
109/**
110 * nilfs_segbuf_map_cont - map a new log behind a given log
111 * @segbuf: new segment buffer
112 * @prev: segment buffer containing a log to be continued
113 */
114void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
115 struct nilfs_segment_buffer *prev)
116{
117 segbuf->sb_segnum = prev->sb_segnum;
118 segbuf->sb_fseg_start = prev->sb_fseg_start;
119 segbuf->sb_fseg_end = prev->sb_fseg_end;
120 segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks;
121 segbuf->sb_rest_blocks =
122 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
123}
124
86void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, 125void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
87 __u64 nextnum, struct the_nilfs *nilfs) 126 __u64 nextnum, struct the_nilfs *nilfs)
88{ 127{
@@ -132,13 +171,11 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
132 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); 171 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
133 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; 172 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
134 segbuf->sb_sum.ctime = ctime; 173 segbuf->sb_sum.ctime = ctime;
135
136 segbuf->sb_io_error = 0;
137 return 0; 174 return 0;
138} 175}
139 176
140/* 177/*
141 * Setup segument summary 178 * Setup segment summary
142 */ 179 */
143void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) 180void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
144{ 181{
@@ -219,7 +256,7 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
219 raw_sum->ss_datasum = cpu_to_le32(crc); 256 raw_sum->ss_datasum = cpu_to_le32(crc);
220} 257}
221 258
222void nilfs_release_buffers(struct list_head *list) 259static void nilfs_release_buffers(struct list_head *list)
223{ 260{
224 struct buffer_head *bh, *n; 261 struct buffer_head *bh, *n;
225 262
@@ -241,13 +278,69 @@ void nilfs_release_buffers(struct list_head *list)
241 } 278 }
242} 279}
243 280
281static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
282{
283 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
284 nilfs_release_buffers(&segbuf->sb_payload_buffers);
285}
286
287/*
288 * Iterators for segment buffers
289 */
290void nilfs_clear_logs(struct list_head *logs)
291{
292 struct nilfs_segment_buffer *segbuf;
293
294 list_for_each_entry(segbuf, logs, sb_list)
295 nilfs_segbuf_clear(segbuf);
296}
297
298void nilfs_truncate_logs(struct list_head *logs,
299 struct nilfs_segment_buffer *last)
300{
301 struct nilfs_segment_buffer *n, *segbuf;
302
303 segbuf = list_prepare_entry(last, logs, sb_list);
304 list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) {
305 list_del_init(&segbuf->sb_list);
306 nilfs_segbuf_clear(segbuf);
307 nilfs_segbuf_free(segbuf);
308 }
309}
310
311int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
312{
313 struct nilfs_segment_buffer *segbuf;
314 int ret = 0;
315
316 list_for_each_entry(segbuf, logs, sb_list) {
317 ret = nilfs_segbuf_write(segbuf, nilfs);
318 if (ret)
319 break;
320 }
321 return ret;
322}
323
324int nilfs_wait_on_logs(struct list_head *logs)
325{
326 struct nilfs_segment_buffer *segbuf;
327 int err, ret = 0;
328
329 list_for_each_entry(segbuf, logs, sb_list) {
330 err = nilfs_segbuf_wait(segbuf);
331 if (err && !ret)
332 ret = err;
333 }
334 return ret;
335}
336
244/* 337/*
245 * BIO operations 338 * BIO operations
246 */ 339 */
247static void nilfs_end_bio_write(struct bio *bio, int err) 340static void nilfs_end_bio_write(struct bio *bio, int err)
248{ 341{
249 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 342 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
250 struct nilfs_write_info *wi = bio->bi_private; 343 struct nilfs_segment_buffer *segbuf = bio->bi_private;
251 344
252 if (err == -EOPNOTSUPP) { 345 if (err == -EOPNOTSUPP) {
253 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 346 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
@@ -256,21 +349,22 @@ static void nilfs_end_bio_write(struct bio *bio, int err)
256 } 349 }
257 350
258 if (!uptodate) 351 if (!uptodate)
259 atomic_inc(&wi->err); 352 atomic_inc(&segbuf->sb_err);
260 353
261 bio_put(bio); 354 bio_put(bio);
262 complete(&wi->bio_event); 355 complete(&segbuf->sb_bio_event);
263} 356}
264 357
265static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) 358static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
359 struct nilfs_write_info *wi, int mode)
266{ 360{
267 struct bio *bio = wi->bio; 361 struct bio *bio = wi->bio;
268 int err; 362 int err;
269 363
270 if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) { 364 if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) {
271 wait_for_completion(&wi->bio_event); 365 wait_for_completion(&segbuf->sb_bio_event);
272 wi->nbio--; 366 segbuf->sb_nbio--;
273 if (unlikely(atomic_read(&wi->err))) { 367 if (unlikely(atomic_read(&segbuf->sb_err))) {
274 bio_put(bio); 368 bio_put(bio);
275 err = -EIO; 369 err = -EIO;
276 goto failed; 370 goto failed;
@@ -278,7 +372,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
278 } 372 }
279 373
280 bio->bi_end_io = nilfs_end_bio_write; 374 bio->bi_end_io = nilfs_end_bio_write;
281 bio->bi_private = wi; 375 bio->bi_private = segbuf;
282 bio_get(bio); 376 bio_get(bio);
283 submit_bio(mode, bio); 377 submit_bio(mode, bio);
284 if (bio_flagged(bio, BIO_EOPNOTSUPP)) { 378 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
@@ -286,7 +380,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
286 err = -EOPNOTSUPP; 380 err = -EOPNOTSUPP;
287 goto failed; 381 goto failed;
288 } 382 }
289 wi->nbio++; 383 segbuf->sb_nbio++;
290 bio_put(bio); 384 bio_put(bio);
291 385
292 wi->bio = NULL; 386 wi->bio = NULL;
@@ -301,17 +395,15 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
301} 395}
302 396
303/** 397/**
304 * nilfs_alloc_seg_bio - allocate a bio for writing segment. 398 * nilfs_alloc_seg_bio - allocate a new bio for writing log
305 * @sb: super block 399 * @nilfs: nilfs object
306 * @start: beginning disk block number of this BIO. 400 * @start: start block number of the bio
307 * @nr_vecs: request size of page vector. 401 * @nr_vecs: request size of page vector.
308 * 402 *
309 * alloc_seg_bio() allocates a new BIO structure and initialize it.
310 *
311 * Return Value: On success, pointer to the struct bio is returned. 403 * Return Value: On success, pointer to the struct bio is returned.
312 * On error, NULL is returned. 404 * On error, NULL is returned.
313 */ 405 */
314static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, 406static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
315 int nr_vecs) 407 int nr_vecs)
316{ 408{
317 struct bio *bio; 409 struct bio *bio;
@@ -322,36 +414,33 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
322 bio = bio_alloc(GFP_NOIO, nr_vecs); 414 bio = bio_alloc(GFP_NOIO, nr_vecs);
323 } 415 }
324 if (likely(bio)) { 416 if (likely(bio)) {
325 bio->bi_bdev = sb->s_bdev; 417 bio->bi_bdev = nilfs->ns_bdev;
326 bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9); 418 bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9);
327 } 419 }
328 return bio; 420 return bio;
329} 421}
330 422
331void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, 423static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
332 struct nilfs_write_info *wi) 424 struct nilfs_write_info *wi)
333{ 425{
334 wi->bio = NULL; 426 wi->bio = NULL;
335 wi->rest_blocks = segbuf->sb_sum.nblocks; 427 wi->rest_blocks = segbuf->sb_sum.nblocks;
336 wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev); 428 wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev);
337 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); 429 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
338 wi->start = wi->end = 0; 430 wi->start = wi->end = 0;
339 wi->nbio = 0;
340 wi->blocknr = segbuf->sb_pseg_start; 431 wi->blocknr = segbuf->sb_pseg_start;
341
342 atomic_set(&wi->err, 0);
343 init_completion(&wi->bio_event);
344} 432}
345 433
346static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh, 434static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
347 int mode) 435 struct nilfs_write_info *wi,
436 struct buffer_head *bh, int mode)
348{ 437{
349 int len, err; 438 int len, err;
350 439
351 BUG_ON(wi->nr_vecs <= 0); 440 BUG_ON(wi->nr_vecs <= 0);
352 repeat: 441 repeat:
353 if (!wi->bio) { 442 if (!wi->bio) {
354 wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end, 443 wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end,
355 wi->nr_vecs); 444 wi->nr_vecs);
356 if (unlikely(!wi->bio)) 445 if (unlikely(!wi->bio))
357 return -ENOMEM; 446 return -ENOMEM;
@@ -363,76 +452,83 @@ static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
363 return 0; 452 return 0;
364 } 453 }
365 /* bio is FULL */ 454 /* bio is FULL */
366 err = nilfs_submit_seg_bio(wi, mode); 455 err = nilfs_segbuf_submit_bio(segbuf, wi, mode);
367 /* never submit current bh */ 456 /* never submit current bh */
368 if (likely(!err)) 457 if (likely(!err))
369 goto repeat; 458 goto repeat;
370 return err; 459 return err;
371} 460}
372 461
373int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, 462/**
374 struct nilfs_write_info *wi) 463 * nilfs_segbuf_write - submit write requests of a log
464 * @segbuf: buffer storing a log to be written
465 * @nilfs: nilfs object
466 *
467 * Return Value: On Success, 0 is returned. On Error, one of the following
468 * negative error code is returned.
469 *
470 * %-EIO - I/O error
471 *
472 * %-ENOMEM - Insufficient memory available.
473 */
474static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
475 struct the_nilfs *nilfs)
375{ 476{
477 struct nilfs_write_info wi;
376 struct buffer_head *bh; 478 struct buffer_head *bh;
377 int res, rw = WRITE; 479 int res = 0, rw = WRITE;
480
481 wi.nilfs = nilfs;
482 nilfs_segbuf_prepare_write(segbuf, &wi);
378 483
379 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { 484 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
380 res = nilfs_submit_bh(wi, bh, rw); 485 res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
381 if (unlikely(res)) 486 if (unlikely(res))
382 goto failed_bio; 487 goto failed_bio;
383 } 488 }
384 489
385 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { 490 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
386 res = nilfs_submit_bh(wi, bh, rw); 491 res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
387 if (unlikely(res)) 492 if (unlikely(res))
388 goto failed_bio; 493 goto failed_bio;
389 } 494 }
390 495
391 if (wi->bio) { 496 if (wi.bio) {
392 /* 497 /*
393 * Last BIO is always sent through the following 498 * Last BIO is always sent through the following
394 * submission. 499 * submission.
395 */ 500 */
396 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 501 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
397 res = nilfs_submit_seg_bio(wi, rw); 502 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
398 if (unlikely(res))
399 goto failed_bio;
400 } 503 }
401 504
402 res = 0;
403 out:
404 return res;
405
406 failed_bio: 505 failed_bio:
407 atomic_inc(&wi->err); 506 return res;
408 goto out;
409} 507}
410 508
411/** 509/**
412 * nilfs_segbuf_wait - wait for completion of requested BIOs 510 * nilfs_segbuf_wait - wait for completion of requested BIOs
413 * @wi: nilfs_write_info 511 * @segbuf: segment buffer
414 * 512 *
415 * Return Value: On Success, 0 is returned. On Error, one of the following 513 * Return Value: On Success, 0 is returned. On Error, one of the following
416 * negative error code is returned. 514 * negative error code is returned.
417 * 515 *
418 * %-EIO - I/O error 516 * %-EIO - I/O error
419 */ 517 */
420int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf, 518static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
421 struct nilfs_write_info *wi)
422{ 519{
423 int err = 0; 520 int err = 0;
424 521
425 if (!wi->nbio) 522 if (!segbuf->sb_nbio)
426 return 0; 523 return 0;
427 524
428 do { 525 do {
429 wait_for_completion(&wi->bio_event); 526 wait_for_completion(&segbuf->sb_bio_event);
430 } while (--wi->nbio > 0); 527 } while (--segbuf->sb_nbio > 0);
431 528
432 if (unlikely(atomic_read(&wi->err) > 0)) { 529 if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
433 printk(KERN_ERR "NILFS: IO error writing segment\n"); 530 printk(KERN_ERR "NILFS: IO error writing segment\n");
434 err = -EIO; 531 err = -EIO;
435 segbuf->sb_io_error = 1;
436 } 532 }
437 return err; 533 return err;
438} 534}
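
The segbuf.c rework above moves bio accounting from the transient nilfs_write_info onto the segment buffer itself: sb_nbio counts in-flight bios, each completion signals sb_bio_event, and errors accumulate in sb_err. A condensed sketch of that wait pattern; example_wait_for_bios() is hypothetical, the real code being nilfs_segbuf_wait() above:

static int example_wait_for_bios(struct nilfs_segment_buffer *segbuf)
{
	/* Drain one completion per submitted bio, then collapse any
	 * per-bio failures into a single -EIO. */
	while (segbuf->sb_nbio > 0) {
		wait_for_completion(&segbuf->sb_bio_event);
		segbuf->sb_nbio--;
	}
	return atomic_read(&segbuf->sb_err) ? -EIO : 0;
}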
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 0c3076f4e592..94dfd3517bc0 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -27,7 +27,6 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/bio.h> 28#include <linux/bio.h>
29#include <linux/completion.h> 29#include <linux/completion.h>
30#include <linux/backing-dev.h>
31 30
32/** 31/**
33 * struct nilfs_segsum_info - On-memory segment summary 32 * struct nilfs_segsum_info - On-memory segment summary
@@ -77,7 +76,9 @@ struct nilfs_segsum_info {
77 * @sb_rest_blocks: Number of residual blocks in the current segment 76 * @sb_rest_blocks: Number of residual blocks in the current segment
78 * @sb_segsum_buffers: List of buffers for segment summaries 77 * @sb_segsum_buffers: List of buffers for segment summaries
79 * @sb_payload_buffers: List of buffers for segment payload 78 * @sb_payload_buffers: List of buffers for segment payload
80 * @sb_io_error: I/O error status 79 * @sb_nbio: Number of flying bio requests
80 * @sb_err: I/O error status
81 * @sb_bio_event: Completion event of log writing
81 */ 82 */
82struct nilfs_segment_buffer { 83struct nilfs_segment_buffer {
83 struct super_block *sb_super; 84 struct super_block *sb_super;
@@ -96,7 +97,9 @@ struct nilfs_segment_buffer {
96 struct list_head sb_payload_buffers; /* including super root */ 97 struct list_head sb_payload_buffers; /* including super root */
97 98
98 /* io status */ 99 /* io status */
99 int sb_io_error; 100 int sb_nbio;
101 atomic_t sb_err;
102 struct completion sb_bio_event;
100}; 103};
101 104
102#define NILFS_LIST_SEGBUF(head) \ 105#define NILFS_LIST_SEGBUF(head) \
@@ -125,6 +128,8 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
125void nilfs_segbuf_free(struct nilfs_segment_buffer *); 128void nilfs_segbuf_free(struct nilfs_segment_buffer *);
126void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, 129void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
127 struct the_nilfs *); 130 struct the_nilfs *);
131void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
132 struct nilfs_segment_buffer *prev);
128void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, 133void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
129 struct the_nilfs *); 134 struct the_nilfs *);
130int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); 135int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
@@ -161,41 +166,15 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
161 segbuf->sb_sum.nfileblk++; 166 segbuf->sb_sum.nfileblk++;
162} 167}
163 168
164void nilfs_release_buffers(struct list_head *); 169void nilfs_clear_logs(struct list_head *logs);
170void nilfs_truncate_logs(struct list_head *logs,
171 struct nilfs_segment_buffer *last);
172int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
173int nilfs_wait_on_logs(struct list_head *logs);
165 174
166static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) 175static inline void nilfs_destroy_logs(struct list_head *logs)
167{ 176{
168 nilfs_release_buffers(&segbuf->sb_segsum_buffers); 177 nilfs_truncate_logs(logs, NULL);
169 nilfs_release_buffers(&segbuf->sb_payload_buffers);
170} 178}
171 179
172struct nilfs_write_info {
173 struct bio *bio;
174 int start, end; /* The region to be submitted */
175 int rest_blocks;
176 int max_pages;
177 int nr_vecs;
178 sector_t blocknr;
179
180 int nbio;
181 atomic_t err;
182 struct completion bio_event;
183 /* completion event of segment write */
184
185 /*
186 * The following fields must be set explicitly
187 */
188 struct super_block *sb;
189 struct backing_dev_info *bdi; /* backing dev info */
190 struct buffer_head *bh_sr;
191};
192
193
194void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
195 struct nilfs_write_info *);
196int nilfs_segbuf_write(struct nilfs_segment_buffer *,
197 struct nilfs_write_info *);
198int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
199 struct nilfs_write_info *);
200
201#endif /* _NILFS_SEGBUF_H */ 180#endif /* _NILFS_SEGBUF_H */
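
segbuf.h now exports list-level helpers, so callers operate on a whole list of logs rather than driving a nilfs_write_info by hand. A hypothetical usage sketch under that API; example_flush_logs() is not part of the patch:

static int example_flush_logs(struct list_head *logs,
			      struct the_nilfs *nilfs)
{
	/* Submit every log on the list, wait for the bios to finish,
	 * and tear the list down if anything failed. */
	int err = nilfs_write_logs(logs, nilfs);

	if (!err)
		err = nilfs_wait_on_logs(logs);
	if (err)
		nilfs_destroy_logs(logs);
	return err;
}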
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6eff66a070d5..6a7dbd8451db 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -32,6 +32,7 @@
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/pagevec.h> 34#include <linux/pagevec.h>
35#include <linux/slab.h>
35#include "nilfs.h" 36#include "nilfs.h"
36#include "btnode.h" 37#include "btnode.h"
37#include "page.h" 38#include "page.h"
@@ -141,7 +142,7 @@ int nilfs_init_transaction_cache(void)
141} 142}
142 143
143/** 144/**
144 * nilfs_detroy_transaction_cache - destroy the cache for transaction info 145 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 * 146 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct 147 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info. 148 * nilfs_transaction_info.
@@ -201,7 +202,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
201 * This function allocates a nilfs_transaction_info struct to keep context 202 * This function allocates a nilfs_transaction_info struct to keep context
202 * information on it. It is initialized and hooked onto the current task in 203 * information on it. It is initialized and hooked onto the current task in
203 * the outermost call. If a pre-allocated struct is given to @ti, it is used 204 * the outermost call. If a pre-allocated struct is given to @ti, it is used
204 * instead; othewise a new struct is assigned from a slab. 205 * instead; otherwise a new struct is assigned from a slab.
205 * 206 *
206 * When @vacancy_check flag is set, this function will check the amount of 207 * When @vacancy_check flag is set, this function will check the amount of
207 * free space, and will wait for the GC to reclaim disk space if low capacity. 208 * free space, and will wait for the GC to reclaim disk space if low capacity.
@@ -974,12 +975,12 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
974 nilfs->ns_nongc_ctime : sci->sc_seg_ctime); 975 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
975 raw_sr->sr_flags = 0; 976 raw_sr->sr_flags = 0;
976 977
977 nilfs_mdt_write_inode_direct( 978 nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr +
978 nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz)); 979 NILFS_SR_DAT_OFFSET(isz), 1);
979 nilfs_mdt_write_inode_direct( 980 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
980 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz)); 981 NILFS_SR_CPFILE_OFFSET(isz), 1);
981 nilfs_mdt_write_inode_direct( 982 nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
982 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz)); 983 NILFS_SR_SUFILE_OFFSET(isz), 1);
983} 984}
984 985
985static void nilfs_redirty_inodes(struct list_head *head) 986static void nilfs_redirty_inodes(struct list_head *head)
@@ -1273,73 +1274,75 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1273 return err; 1274 return err;
1274} 1275}
1275 1276
1276static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum) 1277/**
1277{ 1278 * nilfs_segctor_begin_construction - setup segment buffer to make a new log
1278 struct buffer_head *bh_su; 1279 * @sci: nilfs_sc_info
1279 struct nilfs_segment_usage *raw_su; 1280 * @nilfs: nilfs object
1280 int err; 1281 */
1281
1282 err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
1283 if (unlikely(err))
1284 return err;
1285 nilfs_mdt_mark_buffer_dirty(bh_su);
1286 nilfs_mdt_mark_dirty(sufile);
1287 nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
1288 return 0;
1289}
1290
1291static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, 1282static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
1292 struct the_nilfs *nilfs) 1283 struct the_nilfs *nilfs)
1293{ 1284{
1294 struct nilfs_segment_buffer *segbuf, *n; 1285 struct nilfs_segment_buffer *segbuf, *prev;
1295 __u64 nextnum; 1286 __u64 nextnum;
1296 int err; 1287 int err, alloc = 0;
1297 1288
1298 if (list_empty(&sci->sc_segbufs)) { 1289 segbuf = nilfs_segbuf_new(sci->sc_super);
1299 segbuf = nilfs_segbuf_new(sci->sc_super); 1290 if (unlikely(!segbuf))
1300 if (unlikely(!segbuf)) 1291 return -ENOMEM;
1301 return -ENOMEM;
1302 list_add(&segbuf->sb_list, &sci->sc_segbufs);
1303 } else
1304 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1305 1292
1306 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset, 1293 if (list_empty(&sci->sc_write_logs)) {
1307 nilfs); 1294 nilfs_segbuf_map(segbuf, nilfs->ns_segnum,
1295 nilfs->ns_pseg_offset, nilfs);
1296 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
1297 nilfs_shift_to_next_segment(nilfs);
1298 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
1299 }
1300
1301 segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
1302 nextnum = nilfs->ns_nextnum;
1308 1303
1309 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { 1304 if (nilfs->ns_segnum == nilfs->ns_nextnum)
1310 nilfs_shift_to_next_segment(nilfs); 1305 /* Start from the head of a new full segment */
1311 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); 1306 alloc++;
1307 } else {
1308 /* Continue logs */
1309 prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
1310 nilfs_segbuf_map_cont(segbuf, prev);
1311 segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq;
1312 nextnum = prev->sb_nextnum;
1313
1314 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
1315 nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
1316 segbuf->sb_sum.seg_seq++;
1317 alloc++;
1318 }
1312 } 1319 }
1313 sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
1314 1320
1315 err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum); 1321 err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum);
1316 if (unlikely(err)) 1322 if (err)
1317 return err; 1323 goto failed;
1318 1324
1319 if (nilfs->ns_segnum == nilfs->ns_nextnum) { 1325 if (alloc) {
1320 /* Start from the head of a new full segment */
1321 err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); 1326 err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
1322 if (unlikely(err)) 1327 if (err)
1323 return err; 1328 goto failed;
1324 } else 1329 }
1325 nextnum = nilfs->ns_nextnum;
1326
1327 segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
1328 nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); 1330 nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
1329 1331
1330 /* truncating segment buffers */ 1332 BUG_ON(!list_empty(&sci->sc_segbufs));
1331 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, 1333 list_add_tail(&segbuf->sb_list, &sci->sc_segbufs);
1332 sb_list) { 1334 sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
1333 list_del_init(&segbuf->sb_list);
1334 nilfs_segbuf_free(segbuf);
1335 }
1336 return 0; 1335 return 0;
1336
1337 failed:
1338 nilfs_segbuf_free(segbuf);
1339 return err;
1337} 1340}
1338 1341
1339static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, 1342static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1340 struct the_nilfs *nilfs, int nadd) 1343 struct the_nilfs *nilfs, int nadd)
1341{ 1344{
1342 struct nilfs_segment_buffer *segbuf, *prev, *n; 1345 struct nilfs_segment_buffer *segbuf, *prev;
1343 struct inode *sufile = nilfs->ns_sufile; 1346 struct inode *sufile = nilfs->ns_sufile;
1344 __u64 nextnextnum; 1347 __u64 nextnextnum;
1345 LIST_HEAD(list); 1348 LIST_HEAD(list);
@@ -1352,7 +1355,7 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1352 * not be dirty. The following call ensures that the buffer is dirty 1355 * not be dirty. The following call ensures that the buffer is dirty
1353 * and will pin the buffer on memory until the sufile is written. 1356 * and will pin the buffer on memory until the sufile is written.
1354 */ 1357 */
1355 err = nilfs_touch_segusage(sufile, prev->sb_nextnum); 1358 err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum);
1356 if (unlikely(err)) 1359 if (unlikely(err))
1357 return err; 1360 return err;
1358 1361
@@ -1378,33 +1381,33 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1378 list_add_tail(&segbuf->sb_list, &list); 1381 list_add_tail(&segbuf->sb_list, &list);
1379 prev = segbuf; 1382 prev = segbuf;
1380 } 1383 }
1381 list_splice(&list, sci->sc_segbufs.prev); 1384 list_splice_tail(&list, &sci->sc_segbufs);
1382 return 0; 1385 return 0;
1383 1386
1384 failed_segbuf: 1387 failed_segbuf:
1385 nilfs_segbuf_free(segbuf); 1388 nilfs_segbuf_free(segbuf);
1386 failed: 1389 failed:
1387 list_for_each_entry_safe(segbuf, n, &list, sb_list) { 1390 list_for_each_entry(segbuf, &list, sb_list) {
1388 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); 1391 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1389 WARN_ON(ret); /* never fails */ 1392 WARN_ON(ret); /* never fails */
1390 list_del_init(&segbuf->sb_list);
1391 nilfs_segbuf_free(segbuf);
1392 } 1393 }
1394 nilfs_destroy_logs(&list);
1393 return err; 1395 return err;
1394} 1396}
1395 1397
1396static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci, 1398static void nilfs_free_incomplete_logs(struct list_head *logs,
1397 struct the_nilfs *nilfs) 1399 struct the_nilfs *nilfs)
1398{ 1400{
1399 struct nilfs_segment_buffer *segbuf; 1401 struct nilfs_segment_buffer *segbuf, *prev;
1400 int ret, done = 0; 1402 struct inode *sufile = nilfs->ns_sufile;
1403 int ret;
1401 1404
1402 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); 1405 segbuf = NILFS_FIRST_SEGBUF(logs);
1403 if (nilfs->ns_nextnum != segbuf->sb_nextnum) { 1406 if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
1404 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); 1407 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1405 WARN_ON(ret); /* never fails */ 1408 WARN_ON(ret); /* never fails */
1406 } 1409 }
1407 if (segbuf->sb_io_error) { 1410 if (atomic_read(&segbuf->sb_err)) {
1408 /* Case 1: The first segment failed */ 1411 /* Case 1: The first segment failed */
1409 if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) 1412 if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
1410 /* Case 1a: Partial segment appended into an existing 1413 /* Case 1a: Partial segment appended into an existing
@@ -1413,106 +1416,54 @@ static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
1413 segbuf->sb_fseg_end); 1416 segbuf->sb_fseg_end);
1414 else /* Case 1b: New full segment */ 1417 else /* Case 1b: New full segment */
1415 set_nilfs_discontinued(nilfs); 1418 set_nilfs_discontinued(nilfs);
1416 done++;
1417 } 1419 }
1418 1420
1419 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { 1421 prev = segbuf;
1420 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); 1422 list_for_each_entry_continue(segbuf, logs, sb_list) {
1421 WARN_ON(ret); /* never fails */ 1423 if (prev->sb_nextnum != segbuf->sb_nextnum) {
1422 if (!done && segbuf->sb_io_error) { 1424 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1423 if (segbuf->sb_segnum != nilfs->ns_nextnum) 1425 WARN_ON(ret); /* never fails */
1424 /* Case 2: extended segment (!= next) failed */
1425 nilfs_sufile_set_error(nilfs->ns_sufile,
1426 segbuf->sb_segnum);
1427 done++;
1428 }
1429 }
1430}
1431
1432static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
1433{
1434 struct nilfs_segment_buffer *segbuf;
1435
1436 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
1437 nilfs_segbuf_clear(segbuf);
1438 sci->sc_super_root = NULL;
1439}
1440
1441static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
1442{
1443 struct nilfs_segment_buffer *segbuf;
1444
1445 while (!list_empty(&sci->sc_segbufs)) {
1446 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1447 list_del_init(&segbuf->sb_list);
1448 nilfs_segbuf_free(segbuf);
1449 }
1450 /* sci->sc_curseg = NULL; */
1451}
1452
1453static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
1454 struct the_nilfs *nilfs, int err)
1455{
1456 if (unlikely(err)) {
1457 nilfs_segctor_free_incomplete_segments(sci, nilfs);
1458 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1459 int ret;
1460
1461 ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1462 sci->sc_freesegs,
1463 sci->sc_nfreesegs,
1464 NULL);
1465 WARN_ON(ret); /* do not happen */
1466 } 1426 }
1427 if (atomic_read(&segbuf->sb_err) &&
1428 segbuf->sb_segnum != nilfs->ns_nextnum)
1429 /* Case 2: extended segment (!= next) failed */
1430 nilfs_sufile_set_error(sufile, segbuf->sb_segnum);
1431 prev = segbuf;
1467 } 1432 }
1468 nilfs_segctor_clear_segment_buffers(sci);
1469} 1433}
1470 1434
1471static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, 1435static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
1472 struct inode *sufile) 1436 struct inode *sufile)
1473{ 1437{
1474 struct nilfs_segment_buffer *segbuf; 1438 struct nilfs_segment_buffer *segbuf;
1475 struct buffer_head *bh_su;
1476 struct nilfs_segment_usage *raw_su;
1477 unsigned long live_blocks; 1439 unsigned long live_blocks;
1478 int ret; 1440 int ret;
1479 1441
1480 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { 1442 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1481 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1482 &raw_su, &bh_su);
1483 WARN_ON(ret); /* always succeed because bh_su is dirty */
1484 live_blocks = segbuf->sb_sum.nblocks + 1443 live_blocks = segbuf->sb_sum.nblocks +
1485 (segbuf->sb_pseg_start - segbuf->sb_fseg_start); 1444 (segbuf->sb_pseg_start - segbuf->sb_fseg_start);
1486 raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime); 1445 ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
1487 raw_su->su_nblocks = cpu_to_le32(live_blocks); 1446 live_blocks,
1488 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, 1447 sci->sc_seg_ctime);
1489 bh_su); 1448 WARN_ON(ret); /* always succeed because the segusage is dirty */
1490 } 1449 }
1491} 1450}
1492 1451
1493static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci, 1452static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile)
1494 struct inode *sufile)
1495{ 1453{
1496 struct nilfs_segment_buffer *segbuf; 1454 struct nilfs_segment_buffer *segbuf;
1497 struct buffer_head *bh_su;
1498 struct nilfs_segment_usage *raw_su;
1499 int ret; 1455 int ret;
1500 1456
1501 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); 1457 segbuf = NILFS_FIRST_SEGBUF(logs);
1502 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, 1458 ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
1503 &raw_su, &bh_su); 1459 segbuf->sb_pseg_start -
1504 WARN_ON(ret); /* always succeed because bh_su is dirty */ 1460 segbuf->sb_fseg_start, 0);
1505 raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start - 1461 WARN_ON(ret); /* always succeed because the segusage is dirty */
1506 segbuf->sb_fseg_start);
1507 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
1508 1462
1509 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { 1463 list_for_each_entry_continue(segbuf, logs, sb_list) {
1510 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, 1464 ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
1511 &raw_su, &bh_su); 1465 0, 0);
1512 WARN_ON(ret); /* always succeed */ 1466 WARN_ON(ret); /* always succeed */
1513 raw_su->su_nblocks = 0;
1514 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1515 bh_su);
1516 } 1467 }
1517} 1468}
1518 1469
@@ -1520,17 +1471,15 @@ static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
1520 struct nilfs_segment_buffer *last, 1471 struct nilfs_segment_buffer *last,
1521 struct inode *sufile) 1472 struct inode *sufile)
1522{ 1473{
1523 struct nilfs_segment_buffer *segbuf = last, *n; 1474 struct nilfs_segment_buffer *segbuf = last;
1524 int ret; 1475 int ret;
1525 1476
1526 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, 1477 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1527 sb_list) {
1528 list_del_init(&segbuf->sb_list);
1529 sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; 1478 sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
1530 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); 1479 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1531 WARN_ON(ret); 1480 WARN_ON(ret);
1532 nilfs_segbuf_free(segbuf);
1533 } 1481 }
1482 nilfs_truncate_logs(&sci->sc_segbufs, last);
1534} 1483}
1535 1484
1536 1485
@@ -1562,6 +1511,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1562 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) 1511 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1563 break; 1512 break;
1564 1513
1514 nilfs_clear_logs(&sci->sc_segbufs);
1515
1516 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1517 if (unlikely(err))
1518 return err;
1519
1565 if (sci->sc_stage.flags & NILFS_CF_SUFREED) { 1520 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1566 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, 1521 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1567 sci->sc_freesegs, 1522 sci->sc_freesegs,
@@ -1569,12 +1524,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1569 NULL); 1524 NULL);
1570 WARN_ON(err); /* do not happen */ 1525 WARN_ON(err); /* do not happen */
1571 } 1526 }
1572 nilfs_segctor_clear_segment_buffers(sci);
1573
1574 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1575 if (unlikely(err))
1576 return err;
1577
1578 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); 1527 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1579 sci->sc_stage = prev_stage; 1528 sci->sc_stage = prev_stage;
1580 } 1529 }
@@ -1814,26 +1763,13 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1814} 1763}
1815 1764
1816static int nilfs_segctor_write(struct nilfs_sc_info *sci, 1765static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1817 struct backing_dev_info *bdi) 1766 struct the_nilfs *nilfs)
1818{ 1767{
1819 struct nilfs_segment_buffer *segbuf; 1768 int ret;
1820 struct nilfs_write_info wi;
1821 int err, res;
1822
1823 wi.sb = sci->sc_super;
1824 wi.bh_sr = sci->sc_super_root;
1825 wi.bdi = bdi;
1826
1827 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1828 nilfs_segbuf_prepare_write(segbuf, &wi);
1829 err = nilfs_segbuf_write(segbuf, &wi);
1830 1769
1831 res = nilfs_segbuf_wait(segbuf, &wi); 1770 ret = nilfs_write_logs(&sci->sc_segbufs, nilfs);
1832 err = err ? : res; 1771 list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
1833 if (err) 1772 return ret;
1834 return err;
1835 }
1836 return 0;
1837} 1773}
1838 1774
1839static void __nilfs_end_page_io(struct page *page, int err) 1775static void __nilfs_end_page_io(struct page *page, int err)
@@ -1911,15 +1847,17 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1911 } 1847 }
1912} 1848}
1913 1849
1914static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, 1850static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
1915 struct page *failed_page, int err) 1851 struct buffer_head *bh_sr, int err)
1916{ 1852{
1917 struct nilfs_segment_buffer *segbuf; 1853 struct nilfs_segment_buffer *segbuf;
1918 struct page *bd_page = NULL, *fs_page = NULL; 1854 struct page *bd_page = NULL, *fs_page = NULL;
1855 struct buffer_head *bh;
1919 1856
1920 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { 1857 if (list_empty(logs))
1921 struct buffer_head *bh; 1858 return;
1922 1859
1860 list_for_each_entry(segbuf, logs, sb_list) {
1923 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, 1861 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1924 b_assoc_buffers) { 1862 b_assoc_buffers) {
1925 if (bh->b_page != bd_page) { 1863 if (bh->b_page != bd_page) {
@@ -1931,7 +1869,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1931 1869
1932 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1870 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1933 b_assoc_buffers) { 1871 b_assoc_buffers) {
1934 if (bh == sci->sc_super_root) { 1872 if (bh == bh_sr) {
1935 if (bh->b_page != bd_page) { 1873 if (bh->b_page != bd_page) {
1936 end_page_writeback(bd_page); 1874 end_page_writeback(bd_page);
1937 bd_page = bh->b_page; 1875 bd_page = bh->b_page;
@@ -1941,7 +1879,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1941 if (bh->b_page != fs_page) { 1879 if (bh->b_page != fs_page) {
1942 nilfs_end_page_io(fs_page, err); 1880 nilfs_end_page_io(fs_page, err);
1943 if (fs_page && fs_page == failed_page) 1881 if (fs_page && fs_page == failed_page)
1944 goto done; 1882 return;
1945 fs_page = bh->b_page; 1883 fs_page = bh->b_page;
1946 } 1884 }
1947 } 1885 }
@@ -1950,8 +1888,33 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1950 end_page_writeback(bd_page); 1888 end_page_writeback(bd_page);
1951 1889
1952 nilfs_end_page_io(fs_page, err); 1890 nilfs_end_page_io(fs_page, err);
1953 done: 1891}
1892
1893static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1894 struct the_nilfs *nilfs, int err)
1895{
1896 LIST_HEAD(logs);
1897 int ret;
1898
1899 list_splice_tail_init(&sci->sc_write_logs, &logs);
1900 ret = nilfs_wait_on_logs(&logs);
1901 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err);
1902
1903 list_splice_tail_init(&sci->sc_segbufs, &logs);
1904 nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
1905 nilfs_free_incomplete_logs(&logs, nilfs);
1954 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err); 1906 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
1907
1908 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1909 ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1910 sci->sc_freesegs,
1911 sci->sc_nfreesegs,
1912 NULL);
1913 WARN_ON(ret); /* do not happen */
1914 }
1915
1916 nilfs_destroy_logs(&logs);
1917 sci->sc_super_root = NULL;
1955} 1918}
1956 1919
1957static void nilfs_set_next_segment(struct the_nilfs *nilfs, 1920static void nilfs_set_next_segment(struct the_nilfs *nilfs,
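The new nilfs_segctor_abort_construction() above drains both log lists onto a private list head before tearing anything down, so the cleanup helpers operate on logs the segment constructor no longer owns. A condensed sketch of that splice-then-destroy shape, with error handling trimmed; all helpers named here appear in this diff:

static void abort_construction_sketch(struct nilfs_sc_info *sci,
                                      struct the_nilfs *nilfs, int err)
{
        LIST_HEAD(logs);        /* private list owned by this function */
        int ret;

        /* take over logs already submitted to disk, wait for them,
         * and roll their pages back with the failure code */
        list_splice_tail_init(&sci->sc_write_logs, &logs);
        ret = nilfs_wait_on_logs(&logs);
        nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? ret : err);

        /* take over logs that were never written and undo their
         * segment-usage reservations */
        list_splice_tail_init(&sci->sc_segbufs, &logs);
        nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
        nilfs_free_incomplete_logs(&logs, nilfs);

        /* everything now lives on "logs"; free it in one pass */
        nilfs_destroy_logs(&logs);
}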
@@ -1969,11 +1932,10 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1969{ 1932{
1970 struct nilfs_segment_buffer *segbuf; 1933 struct nilfs_segment_buffer *segbuf;
1971 struct page *bd_page = NULL, *fs_page = NULL; 1934 struct page *bd_page = NULL, *fs_page = NULL;
1972 struct nilfs_sb_info *sbi = sci->sc_sbi; 1935 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
1973 struct the_nilfs *nilfs = sbi->s_nilfs;
1974 int update_sr = (sci->sc_super_root != NULL); 1936 int update_sr = (sci->sc_super_root != NULL);
1975 1937
1976 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { 1938 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
1977 struct buffer_head *bh; 1939 struct buffer_head *bh;
1978 1940
1979 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, 1941 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
@@ -2046,21 +2008,34 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
2046 2008
2047 sci->sc_nblk_inc += sci->sc_nblk_this_inc; 2009 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
2048 2010
2049 segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs); 2011 segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
2050 nilfs_set_next_segment(nilfs, segbuf); 2012 nilfs_set_next_segment(nilfs, segbuf);
2051 2013
2052 if (update_sr) { 2014 if (update_sr) {
2053 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, 2015 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2054 segbuf->sb_sum.seg_seq, nilfs->ns_cno++); 2016 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2055 sbi->s_super->s_dirt = 1; 2017 set_nilfs_sb_dirty(nilfs);
2056 2018
2057 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 2019 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2058 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); 2020 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2059 set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); 2021 set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2022 nilfs_segctor_clear_metadata_dirty(sci);
2060 } else 2023 } else
2061 clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); 2024 clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2062} 2025}
2063 2026
2027static int nilfs_segctor_wait(struct nilfs_sc_info *sci)
2028{
2029 int ret;
2030
2031 ret = nilfs_wait_on_logs(&sci->sc_write_logs);
2032 if (!ret) {
2033 nilfs_segctor_complete_write(sci);
2034 nilfs_destroy_logs(&sci->sc_write_logs);
2035 }
2036 return ret;
2037}
2038
2064static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci, 2039static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2065 struct nilfs_sb_info *sbi) 2040 struct nilfs_sb_info *sbi)
2066{ 2041{
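Taken together, the reworked nilfs_segctor_write() and the new nilfs_segctor_wait() split one construction pass into a submit phase and a completion phase: logs are built on sc_segbufs, spliced onto sc_write_logs while in flight, and destroyed only once nilfs_wait_on_logs() confirms they reached disk. A sketch of a single pass under that scheme, not a literal copy of nilfs_segctor_do_construct():

static int construct_pass_sketch(struct nilfs_sc_info *sci,
                                 struct the_nilfs *nilfs)
{
        int err;

        /* submit: logs collected on sci->sc_segbufs go to disk and are
         * spliced onto sci->sc_write_logs, freeing sc_segbufs for the
         * next collection pass */
        err = nilfs_segctor_write(sci, nilfs);
        if (err)
                return err;

        /* complete: block until the in-flight logs are on disk, end
         * page writeback, and release the segment buffers */
        return nilfs_segctor_wait(sci);
}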
@@ -2173,7 +2148,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2173 /* Avoid empty segment */ 2148 /* Avoid empty segment */
2174 if (sci->sc_stage.scnt == NILFS_ST_DONE && 2149 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2175 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { 2150 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
2176 nilfs_segctor_end_construction(sci, nilfs, 1); 2151 nilfs_segctor_abort_construction(sci, nilfs, 1);
2177 goto out; 2152 goto out;
2178 } 2153 }
2179 2154
@@ -2187,7 +2162,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2187 if (has_sr) { 2162 if (has_sr) {
2188 err = nilfs_segctor_fill_in_checkpoint(sci); 2163 err = nilfs_segctor_fill_in_checkpoint(sci);
2189 if (unlikely(err)) 2164 if (unlikely(err))
2190 goto failed_to_make_up; 2165 goto failed_to_write;
2191 2166
2192 nilfs_segctor_fill_in_super_root(sci, nilfs); 2167 nilfs_segctor_fill_in_super_root(sci, nilfs);
2193 } 2168 }
@@ -2195,47 +2170,51 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2195 2170
2196 /* Write partial segments */ 2171 /* Write partial segments */
2197 err = nilfs_segctor_prepare_write(sci, &failed_page); 2172 err = nilfs_segctor_prepare_write(sci, &failed_page);
2198 if (unlikely(err)) 2173 if (err) {
2174 nilfs_abort_logs(&sci->sc_segbufs, failed_page,
2175 sci->sc_super_root, err);
2199 goto failed_to_write; 2176 goto failed_to_write;
2200 2177 }
2201 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); 2178 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
2202 2179
2203 err = nilfs_segctor_write(sci, nilfs->ns_bdi); 2180 err = nilfs_segctor_write(sci, nilfs);
2204 if (unlikely(err)) 2181 if (unlikely(err))
2205 goto failed_to_write; 2182 goto failed_to_write;
2206 2183
2207 nilfs_segctor_complete_write(sci); 2184 if (sci->sc_stage.scnt == NILFS_ST_DONE ||
2208 2185 nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
2209 /* Commit segments */ 2186 /*
2210 if (has_sr) 2187 * At this point, we avoid double buffering
2211 nilfs_segctor_clear_metadata_dirty(sci); 2188 * for blocksize < pagesize because page dirty
2212 2189 * flag is turned off during write and dirty
2213 nilfs_segctor_end_construction(sci, nilfs, 0); 2190 * buffers are not properly collected for
2214 2191 * pages crossing over segments.
2192 */
2193 err = nilfs_segctor_wait(sci);
2194 if (err)
2195 goto failed_to_write;
2196 }
2215 } while (sci->sc_stage.scnt != NILFS_ST_DONE); 2197 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2216 2198
2199 sci->sc_super_root = NULL;
2200
2217 out: 2201 out:
2218 nilfs_segctor_destroy_segment_buffers(sci);
2219 nilfs_segctor_check_out_files(sci, sbi); 2202 nilfs_segctor_check_out_files(sci, sbi);
2220 return err; 2203 return err;
2221 2204
2222 failed_to_write: 2205 failed_to_write:
2223 nilfs_segctor_abort_write(sci, failed_page, err);
2224 nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
2225
2226 failed_to_make_up:
2227 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) 2206 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2228 nilfs_redirty_inodes(&sci->sc_dirty_files); 2207 nilfs_redirty_inodes(&sci->sc_dirty_files);
2229 2208
2230 failed: 2209 failed:
2231 if (nilfs_doing_gc()) 2210 if (nilfs_doing_gc())
2232 nilfs_redirty_inodes(&sci->sc_gc_inodes); 2211 nilfs_redirty_inodes(&sci->sc_gc_inodes);
2233 nilfs_segctor_end_construction(sci, nilfs, err); 2212 nilfs_segctor_abort_construction(sci, nilfs, err);
2234 goto out; 2213 goto out;
2235} 2214}
2236 2215
2237/** 2216/**
2238 * nilfs_secgtor_start_timer - set timer of background write 2217 * nilfs_segctor_start_timer - set timer of background write
2239 * @sci: nilfs_sc_info 2218 * @sci: nilfs_sc_info
2240 * 2219 *
2241 * If the timer has already been set, it ignores the new request. 2220 * If the timer has already been set, it ignores the new request.
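The conditional wait added to the construction loop (see the in-line comment in the hunk above) means the submit and completion phases only overlap on filesystems whose block size equals the page size; otherwise every pass drains the write queue before collecting again. Worked out for the common 4 KiB page case, assuming PAGE_CACHE_SHIFT == 12:

/*
 * With 4 KiB pages (PAGE_CACHE_SHIFT == 12):
 *
 *   blocksize 4096 -> ns_blocksize_bits == 12
 *      -> nilfs_segctor_wait() runs only once the stage reaches
 *         NILFS_ST_DONE, so intermediate passes pipeline their writes
 *
 *   blocksize 1024 -> ns_blocksize_bits == 10
 *      -> nilfs_segctor_wait() runs after every pass; no double
 *         buffering, avoiding the dirty-page collection problem the
 *         comment above describes for pages spanning segments
 */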
@@ -2440,43 +2419,43 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2440 return err; 2419 return err;
2441} 2420}
2442 2421
2443struct nilfs_segctor_req {
2444 int mode;
2445 __u32 seq_accepted;
2446 int sc_err; /* construction failure */
2447 int sb_err; /* super block writeback failure */
2448};
2449
2450#define FLUSH_FILE_BIT (0x1) /* data file only */ 2422#define FLUSH_FILE_BIT (0x1) /* data file only */
2451#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ 2423#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
2452 2424
2453static void nilfs_segctor_accept(struct nilfs_sc_info *sci, 2425/**
2454 struct nilfs_segctor_req *req) 2426 * nilfs_segctor_accept - record accepted sequence count of log-write requests
2427 * @sci: segment constructor object
2428 */
2429static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
2455{ 2430{
2456 req->sc_err = req->sb_err = 0;
2457 spin_lock(&sci->sc_state_lock); 2431 spin_lock(&sci->sc_state_lock);
2458 req->seq_accepted = sci->sc_seq_request; 2432 sci->sc_seq_accepted = sci->sc_seq_request;
2459 spin_unlock(&sci->sc_state_lock); 2433 spin_unlock(&sci->sc_state_lock);
2460 2434
2461 if (sci->sc_timer) 2435 if (sci->sc_timer)
2462 del_timer_sync(sci->sc_timer); 2436 del_timer_sync(sci->sc_timer);
2463} 2437}
2464 2438
2465static void nilfs_segctor_notify(struct nilfs_sc_info *sci, 2439/**
2466 struct nilfs_segctor_req *req) 2440 * nilfs_segctor_notify - notify the result of request to caller threads
2441 * @sci: segment constructor object
2442 * @mode: mode of log forming
2443 * @err: error code to be notified
2444 */
2445static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2467{ 2446{
2468 /* Clear requests (even when the construction failed) */ 2447 /* Clear requests (even when the construction failed) */
2469 spin_lock(&sci->sc_state_lock); 2448 spin_lock(&sci->sc_state_lock);
2470 2449
2471 if (req->mode == SC_LSEG_SR) { 2450 if (mode == SC_LSEG_SR) {
2472 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; 2451 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2473 sci->sc_seq_done = req->seq_accepted; 2452 sci->sc_seq_done = sci->sc_seq_accepted;
2474 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); 2453 nilfs_segctor_wakeup(sci, err);
2475 sci->sc_flush_request = 0; 2454 sci->sc_flush_request = 0;
2476 } else { 2455 } else {
2477 if (req->mode == SC_FLUSH_FILE) 2456 if (mode == SC_FLUSH_FILE)
2478 sci->sc_flush_request &= ~FLUSH_FILE_BIT; 2457 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2479 else if (req->mode == SC_FLUSH_DAT) 2458 else if (mode == SC_FLUSH_DAT)
2480 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2459 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2481 2460
2482 /* re-enable timer if checkpoint creation was not done */ 2461 /* re-enable timer if checkpoint creation was not done */
@@ -2487,30 +2466,37 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2487 spin_unlock(&sci->sc_state_lock); 2466 spin_unlock(&sci->sc_state_lock);
2488} 2467}
2489 2468
2490static int nilfs_segctor_construct(struct nilfs_sc_info *sci, 2469/**
2491 struct nilfs_segctor_req *req) 2470 * nilfs_segctor_construct - form logs and write them to disk
2471 * @sci: segment constructor object
2472 * @mode: mode of log forming
2473 */
2474static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2492{ 2475{
2493 struct nilfs_sb_info *sbi = sci->sc_sbi; 2476 struct nilfs_sb_info *sbi = sci->sc_sbi;
2494 struct the_nilfs *nilfs = sbi->s_nilfs; 2477 struct the_nilfs *nilfs = sbi->s_nilfs;
2495 int err = 0; 2478 int err = 0;
2496 2479
2480 nilfs_segctor_accept(sci);
2481
2497 if (nilfs_discontinued(nilfs)) 2482 if (nilfs_discontinued(nilfs))
2498 req->mode = SC_LSEG_SR; 2483 mode = SC_LSEG_SR;
2499 if (!nilfs_segctor_confirm(sci)) { 2484 if (!nilfs_segctor_confirm(sci))
2500 err = nilfs_segctor_do_construct(sci, req->mode); 2485 err = nilfs_segctor_do_construct(sci, mode);
2501 req->sc_err = err; 2486
2502 }
2503 if (likely(!err)) { 2487 if (likely(!err)) {
2504 if (req->mode != SC_FLUSH_DAT) 2488 if (mode != SC_FLUSH_DAT)
2505 atomic_set(&nilfs->ns_ndirtyblks, 0); 2489 atomic_set(&nilfs->ns_ndirtyblks, 0);
2506 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2490 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2507 nilfs_discontinued(nilfs)) { 2491 nilfs_discontinued(nilfs)) {
2508 down_write(&nilfs->ns_sem); 2492 down_write(&nilfs->ns_sem);
2509 req->sb_err = nilfs_commit_super(sbi, 2493 err = nilfs_commit_super(
2510 nilfs_altsb_need_update(nilfs)); 2494 sbi, nilfs_altsb_need_update(nilfs));
2511 up_write(&nilfs->ns_sem); 2495 up_write(&nilfs->ns_sem);
2512 } 2496 }
2513 } 2497 }
2498
2499 nilfs_segctor_notify(sci, mode, err);
2514 return err; 2500 return err;
2515} 2501}
2516 2502
@@ -2541,7 +2527,6 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2541 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2527 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2542 struct the_nilfs *nilfs = sbi->s_nilfs; 2528 struct the_nilfs *nilfs = sbi->s_nilfs;
2543 struct nilfs_transaction_info ti; 2529 struct nilfs_transaction_info ti;
2544 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2545 int err; 2530 int err;
2546 2531
2547 if (unlikely(!sci)) 2532 if (unlikely(!sci))
@@ -2559,13 +2544,11 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2559 2544
2560 sci->sc_freesegs = kbufs[4]; 2545 sci->sc_freesegs = kbufs[4];
2561 sci->sc_nfreesegs = argv[4].v_nmembs; 2546 sci->sc_nfreesegs = argv[4].v_nmembs;
2562 list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev); 2547 list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
2563 2548
2564 for (;;) { 2549 for (;;) {
2565 nilfs_segctor_accept(sci, &req); 2550 err = nilfs_segctor_construct(sci, SC_LSEG_SR);
2566 err = nilfs_segctor_construct(sci, &req);
2567 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); 2551 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2568 nilfs_segctor_notify(sci, &req);
2569 2552
2570 if (likely(!err)) 2553 if (likely(!err))
2571 break; 2554 break;
@@ -2575,6 +2558,16 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2575 set_current_state(TASK_INTERRUPTIBLE); 2558 set_current_state(TASK_INTERRUPTIBLE);
2576 schedule_timeout(sci->sc_interval); 2559 schedule_timeout(sci->sc_interval);
2577 } 2560 }
2561 if (nilfs_test_opt(sbi, DISCARD)) {
2562 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
2563 sci->sc_nfreesegs);
2564 if (ret) {
2565 printk(KERN_WARNING
2566 "NILFS warning: error %d on discard request, "
2567 "turning discards off for the device\n", ret);
2568 nilfs_clear_opt(sbi, DISCARD);
2569 }
2570 }
2578 2571
2579 out_unlock: 2572 out_unlock:
2580 sci->sc_freesegs = NULL; 2573 sci->sc_freesegs = NULL;
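The block added above wires the cleaner into the new discard support: after segments are freed, nilfs_clean_segments() forwards them to the device, and the first failure clears the DISCARD flag so this mount stops issuing requests. A condensed sketch of the disable-on-error shape; note that nilfs_clear_opt() only touches the in-memory mount options, so the effect is per-mount, not persistent:

if (nilfs_test_opt(sbi, DISCARD)) {
        int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
                                         sci->sc_nfreesegs);
        if (ret)                        /* e.g. the device rejected it */
                nilfs_clear_opt(sbi, DISCARD);  /* in-memory only; a
                                                 * remount can retry */
}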
@@ -2588,13 +2581,9 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2588{ 2581{
2589 struct nilfs_sb_info *sbi = sci->sc_sbi; 2582 struct nilfs_sb_info *sbi = sci->sc_sbi;
2590 struct nilfs_transaction_info ti; 2583 struct nilfs_transaction_info ti;
2591 struct nilfs_segctor_req req = { .mode = mode };
2592 2584
2593 nilfs_transaction_lock(sbi, &ti, 0); 2585 nilfs_transaction_lock(sbi, &ti, 0);
2594 2586 nilfs_segctor_construct(sci, mode);
2595 nilfs_segctor_accept(sci, &req);
2596 nilfs_segctor_construct(sci, &req);
2597 nilfs_segctor_notify(sci, &req);
2598 2587
2599 /* 2588 /*
2600 * Unclosed segment should be retried. We do this using sc_timer. 2589 * Unclosed segment should be retried. We do this using sc_timer.
@@ -2650,6 +2639,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2650static int nilfs_segctor_thread(void *arg) 2639static int nilfs_segctor_thread(void *arg)
2651{ 2640{
2652 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2641 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2642 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2653 struct timer_list timer; 2643 struct timer_list timer;
2654 int timeout = 0; 2644 int timeout = 0;
2655 2645
@@ -2695,7 +2685,6 @@ static int nilfs_segctor_thread(void *arg)
2695 } else { 2685 } else {
2696 DEFINE_WAIT(wait); 2686 DEFINE_WAIT(wait);
2697 int should_sleep = 1; 2687 int should_sleep = 1;
2698 struct the_nilfs *nilfs;
2699 2688
2700 prepare_to_wait(&sci->sc_wait_daemon, &wait, 2689 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2701 TASK_INTERRUPTIBLE); 2690 TASK_INTERRUPTIBLE);
@@ -2716,8 +2705,8 @@ static int nilfs_segctor_thread(void *arg)
2716 finish_wait(&sci->sc_wait_daemon, &wait); 2705 finish_wait(&sci->sc_wait_daemon, &wait);
2717 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2706 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2718 time_after_eq(jiffies, sci->sc_timer->expires)); 2707 time_after_eq(jiffies, sci->sc_timer->expires));
2719 nilfs = sci->sc_sbi->s_nilfs; 2708
2720 if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) 2709 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2721 set_nilfs_discontinued(nilfs); 2710 set_nilfs_discontinued(nilfs);
2722 } 2711 }
2723 goto loop; 2712 goto loop;
@@ -2788,6 +2777,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2788 spin_lock_init(&sci->sc_state_lock); 2777 spin_lock_init(&sci->sc_state_lock);
2789 INIT_LIST_HEAD(&sci->sc_dirty_files); 2778 INIT_LIST_HEAD(&sci->sc_dirty_files);
2790 INIT_LIST_HEAD(&sci->sc_segbufs); 2779 INIT_LIST_HEAD(&sci->sc_segbufs);
2780 INIT_LIST_HEAD(&sci->sc_write_logs);
2791 INIT_LIST_HEAD(&sci->sc_gc_inodes); 2781 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2792 INIT_LIST_HEAD(&sci->sc_copied_buffers); 2782 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2793 2783
@@ -2811,12 +2801,9 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2811 do { 2801 do {
2812 struct nilfs_sb_info *sbi = sci->sc_sbi; 2802 struct nilfs_sb_info *sbi = sci->sc_sbi;
2813 struct nilfs_transaction_info ti; 2803 struct nilfs_transaction_info ti;
2814 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2815 2804
2816 nilfs_transaction_lock(sbi, &ti, 0); 2805 nilfs_transaction_lock(sbi, &ti, 0);
2817 nilfs_segctor_accept(sci, &req); 2806 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2818 ret = nilfs_segctor_construct(sci, &req);
2819 nilfs_segctor_notify(sci, &req);
2820 nilfs_transaction_unlock(sbi); 2807 nilfs_transaction_unlock(sbi);
2821 2808
2822 } while (ret && retrycount-- > 0); 2809 } while (ret && retrycount-- > 0);
@@ -2843,7 +2830,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2843 || sci->sc_seq_request != sci->sc_seq_done); 2830 || sci->sc_seq_request != sci->sc_seq_done);
2844 spin_unlock(&sci->sc_state_lock); 2831 spin_unlock(&sci->sc_state_lock);
2845 2832
2846 if (flag || nilfs_segctor_confirm(sci)) 2833 if (flag || !nilfs_segctor_confirm(sci))
2847 nilfs_segctor_write_out(sci); 2834 nilfs_segctor_write_out(sci);
2848 2835
2849 WARN_ON(!list_empty(&sci->sc_copied_buffers)); 2836 WARN_ON(!list_empty(&sci->sc_copied_buffers));
@@ -2855,6 +2842,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2855 } 2842 }
2856 2843
2857 WARN_ON(!list_empty(&sci->sc_segbufs)); 2844 WARN_ON(!list_empty(&sci->sc_segbufs));
2845 WARN_ON(!list_empty(&sci->sc_write_logs));
2858 2846
2859 down_write(&sbi->s_nilfs->ns_segctor_sem); 2847 down_write(&sbi->s_nilfs->ns_segctor_sem);
2860 2848
@@ -2866,7 +2854,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2866 * @sbi: nilfs_sb_info 2854 * @sbi: nilfs_sb_info
2867 * 2855 *
2868 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2856 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2869 * initilizes it, and starts the segment constructor. 2857 * initializes it, and starts the segment constructor.
2870 * 2858 *
2871 * Return Value: On success, 0 is returned. On error, one of the following 2859 * Return Value: On success, 0 is returned. On error, one of the following
2872 * negative error code is returned. 2860 * negative error code is returned.
@@ -2878,8 +2866,15 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2878 struct the_nilfs *nilfs = sbi->s_nilfs; 2866 struct the_nilfs *nilfs = sbi->s_nilfs;
2879 int err; 2867 int err;
2880 2868
2881 /* Each field of nilfs_segctor is cleared through the initialization 2869 if (NILFS_SC(sbi)) {
2882 of super-block info */ 2870 /*
2871 * This happens if the filesystem was remounted
2872 * read/write after nilfs_error degenerated it into a
2873 * read-only mount.
2874 */
2875 nilfs_detach_segment_constructor(sbi);
2876 }
2877
2883 sbi->s_sc_info = nilfs_segctor_new(sbi); 2878 sbi->s_sc_info = nilfs_segctor_new(sbi);
2884 if (!sbi->s_sc_info) 2879 if (!sbi->s_sc_info)
2885 return -ENOMEM; 2880 return -ENOMEM;
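The guard added to nilfs_attach_segment_constructor() pairs with the super.c hunk further down, where nilfs_error() no longer detaches the constructor itself. After an error degrades the mount to read-only, a later read/write remount therefore finds a stale constructor and must replace it; condensed:

/* sketch of the remount-after-error path this guard handles */
if (NILFS_SC(sbi))              /* constructor left over from the
                                 * degraded (read-only) period */
        nilfs_detach_segment_constructor(sbi);

sbi->s_sc_info = nilfs_segctor_new(sbi);        /* start a fresh one */
if (!sbi->s_sc_info)
        return -ENOMEM;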
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 0d2a475a741b..82dfd6a686b9 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -30,7 +30,7 @@
30#include "sb.h" 30#include "sb.h"
31 31
32/** 32/**
33 * struct nilfs_recovery_info - Recovery infomation 33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status 34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root 35 * @ri_super_root: Block number of the last super root
36 * @ri_cno: Number of the last checkpoint 36 * @ri_cno: Number of the last checkpoint
@@ -71,7 +71,7 @@ struct nilfs_recovery_info {
71 */ 71 */
72struct nilfs_cstage { 72struct nilfs_cstage {
73 int scnt; 73 int scnt;
74 unsigned flags; 74 unsigned flags;
75 struct nilfs_inode_info *dirty_file_ptr; 75 struct nilfs_inode_info *dirty_file_ptr;
76 struct nilfs_inode_info *gc_inode_ptr; 76 struct nilfs_inode_info *gc_inode_ptr;
77}; 77};
@@ -97,6 +97,7 @@ struct nilfs_segsum_pointer {
97 * @sc_dsync_start: start byte offset of data pages 97 * @sc_dsync_start: start byte offset of data pages
98 * @sc_dsync_end: end byte offset of data pages (inclusive) 98 * @sc_dsync_end: end byte offset of data pages (inclusive)
99 * @sc_segbufs: List of segment buffers 99 * @sc_segbufs: List of segment buffers
100 * @sc_write_logs: List of segment buffers to hold logs under writing
100 * @sc_segbuf_nblocks: Number of available blocks in segment buffers. 101 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
101 * @sc_curseg: Current segment buffer 102 * @sc_curseg: Current segment buffer
102 * @sc_super_root: Pointer to the super root buffer 103 * @sc_super_root: Pointer to the super root buffer
@@ -115,6 +116,7 @@ struct nilfs_segsum_pointer {
115 * @sc_wait_daemon: Daemon wait queue 116 * @sc_wait_daemon: Daemon wait queue
116 * @sc_wait_task: Start/end wait queue to control segctord task 117 * @sc_wait_task: Start/end wait queue to control segctord task
117 * @sc_seq_request: Request counter 118 * @sc_seq_request: Request counter
119 * @sc_seq_accepted: Accepted request count
118 * @sc_seq_done: Completion counter 120 * @sc_seq_done: Completion counter
119 * @sc_sync: Request of explicit sync operation 121 * @sc_sync: Request of explicit sync operation
120 * @sc_interval: Timeout value of background construction 122 * @sc_interval: Timeout value of background construction
@@ -143,6 +145,7 @@ struct nilfs_sc_info {
143 145
144 /* Segment buffers */ 146 /* Segment buffers */
145 struct list_head sc_segbufs; 147 struct list_head sc_segbufs;
148 struct list_head sc_write_logs;
146 unsigned long sc_segbuf_nblocks; 149 unsigned long sc_segbuf_nblocks;
147 struct nilfs_segment_buffer *sc_curseg; 150 struct nilfs_segment_buffer *sc_curseg;
148 struct buffer_head *sc_super_root; 151 struct buffer_head *sc_super_root;
@@ -167,6 +170,7 @@ struct nilfs_sc_info {
167 wait_queue_head_t sc_wait_task; 170 wait_queue_head_t sc_wait_task;
168 171
169 __u32 sc_seq_request; 172 __u32 sc_seq_request;
173 __u32 sc_seq_accepted;
170 __u32 sc_seq_done; 174 __u32 sc_seq_done;
171 175
172 int sc_sync; 176 int sc_sync;
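The new sc_seq_accepted field completes a three-counter handshake between requesters and the segctor thread; all three counters are updated under sc_state_lock. Schematically, with names as in the struct above:

/*
 *  requester                      segctor thread
 *  ---------                      --------------
 *  sc_seq_request++               sc_seq_accepted = sc_seq_request
 *  kick the thread                   (nilfs_segctor_accept)
 *  sleep until sc_seq_done        ... build and write logs ...
 *  catches up with the number     sc_seq_done = sc_seq_accepted,
 *  it requested                   wake requesters
 *                                    (nilfs_segctor_notify, SC_LSEG_SR)
 */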
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 37994d4a59cc..3c6cc6005c2e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,7 +18,7 @@
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 * 19 *
20 * Written by Koji Sato <koji@osrg.net>. 20 * Written by Koji Sato <koji@osrg.net>.
21 * Rivised by Ryusuke Konishi <ryusuke@osrg.net>. 21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
22 */ 22 */
23 23
24#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -31,6 +31,16 @@
31#include "sufile.h" 31#include "sufile.h"
32 32
33 33
34struct nilfs_sufile_info {
35 struct nilfs_mdt_info mi;
36 unsigned long ncleansegs;
37};
38
39static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile)
40{
41 return (struct nilfs_sufile_info *)NILFS_MDT(sufile);
42}
43
34static inline unsigned long 44static inline unsigned long
35nilfs_sufile_segment_usages_per_block(const struct inode *sufile) 45nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
36{ 46{
@@ -62,14 +72,6 @@ nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
62 max - curr + 1); 72 max - curr + 1);
63} 73}
64 74
65static inline struct nilfs_sufile_header *
66nilfs_sufile_block_get_header(const struct inode *sufile,
67 struct buffer_head *bh,
68 void *kaddr)
69{
70 return kaddr + bh_offset(bh);
71}
72
73static struct nilfs_segment_usage * 75static struct nilfs_segment_usage *
74nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, 76nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
75 struct buffer_head *bh, void *kaddr) 77 struct buffer_head *bh, void *kaddr)
@@ -110,6 +112,15 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
110} 112}
111 113
112/** 114/**
115 * nilfs_sufile_get_ncleansegs - return the number of clean segments
116 * @sufile: inode of segment usage file
117 */
118unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile)
119{
120 return NILFS_SUI(sufile)->ncleansegs;
121}
122
123/**
113 * nilfs_sufile_updatev - modify multiple segment usages at a time 124 * nilfs_sufile_updatev - modify multiple segment usages at a time
114 * @sufile: inode of segment usage file 125 * @sufile: inode of segment usage file
115 * @segnumv: array of segment numbers 126 * @segnumv: array of segment numbers
@@ -270,7 +281,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
270 if (ret < 0) 281 if (ret < 0)
271 goto out_sem; 282 goto out_sem;
272 kaddr = kmap_atomic(header_bh->b_page, KM_USER0); 283 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
273 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); 284 header = kaddr + bh_offset(header_bh);
274 ncleansegs = le64_to_cpu(header->sh_ncleansegs); 285 ncleansegs = le64_to_cpu(header->sh_ncleansegs);
275 last_alloc = le64_to_cpu(header->sh_last_alloc); 286 last_alloc = le64_to_cpu(header->sh_last_alloc);
276 kunmap_atomic(kaddr, KM_USER0); 287 kunmap_atomic(kaddr, KM_USER0);
@@ -302,13 +313,13 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
302 kunmap_atomic(kaddr, KM_USER0); 313 kunmap_atomic(kaddr, KM_USER0);
303 314
304 kaddr = kmap_atomic(header_bh->b_page, KM_USER0); 315 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
305 header = nilfs_sufile_block_get_header( 316 header = kaddr + bh_offset(header_bh);
306 sufile, header_bh, kaddr);
307 le64_add_cpu(&header->sh_ncleansegs, -1); 317 le64_add_cpu(&header->sh_ncleansegs, -1);
308 le64_add_cpu(&header->sh_ndirtysegs, 1); 318 le64_add_cpu(&header->sh_ndirtysegs, 1);
309 header->sh_last_alloc = cpu_to_le64(segnum); 319 header->sh_last_alloc = cpu_to_le64(segnum);
310 kunmap_atomic(kaddr, KM_USER0); 320 kunmap_atomic(kaddr, KM_USER0);
311 321
322 NILFS_SUI(sufile)->ncleansegs--;
312 nilfs_mdt_mark_buffer_dirty(header_bh); 323 nilfs_mdt_mark_buffer_dirty(header_bh);
313 nilfs_mdt_mark_buffer_dirty(su_bh); 324 nilfs_mdt_mark_buffer_dirty(su_bh);
314 nilfs_mdt_mark_dirty(sufile); 325 nilfs_mdt_mark_dirty(sufile);
@@ -351,6 +362,8 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
351 kunmap_atomic(kaddr, KM_USER0); 362 kunmap_atomic(kaddr, KM_USER0);
352 363
353 nilfs_sufile_mod_counter(header_bh, -1, 1); 364 nilfs_sufile_mod_counter(header_bh, -1, 1);
365 NILFS_SUI(sufile)->ncleansegs--;
366
354 nilfs_mdt_mark_buffer_dirty(su_bh); 367 nilfs_mdt_mark_buffer_dirty(su_bh);
355 nilfs_mdt_mark_dirty(sufile); 368 nilfs_mdt_mark_dirty(sufile);
356} 369}
@@ -380,6 +393,8 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
380 kunmap_atomic(kaddr, KM_USER0); 393 kunmap_atomic(kaddr, KM_USER0);
381 394
382 nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); 395 nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
396 NILFS_SUI(sufile)->ncleansegs -= clean;
397
383 nilfs_mdt_mark_buffer_dirty(su_bh); 398 nilfs_mdt_mark_buffer_dirty(su_bh);
384 nilfs_mdt_mark_dirty(sufile); 399 nilfs_mdt_mark_dirty(sufile);
385} 400}
@@ -409,79 +424,65 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
409 nilfs_mdt_mark_buffer_dirty(su_bh); 424 nilfs_mdt_mark_buffer_dirty(su_bh);
410 425
411 nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); 426 nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
427 NILFS_SUI(sufile)->ncleansegs++;
428
412 nilfs_mdt_mark_dirty(sufile); 429 nilfs_mdt_mark_dirty(sufile);
413} 430}
414 431
415/** 432/**
416 * nilfs_sufile_get_segment_usage - get a segment usage 433 * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty
417 * @sufile: inode of segment usage file 434 * @sufile: inode of segment usage file
418 * @segnum: segment number 435 * @segnum: segment number
419 * @sup: pointer to segment usage
420 * @bhp: pointer to buffer head
421 *
422 * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
423 * specified by @segnum.
424 *
425 * Return Value: On success, 0 is returned, and the segment usage and the
426 * buffer head of the buffer on which the segment usage is located are stored
427 * in the place pointed by @sup and @bhp, respectively. On error, one of the
428 * following negative error codes is returned.
429 *
430 * %-EIO - I/O error.
431 *
432 * %-ENOMEM - Insufficient amount of memory available.
433 *
434 * %-EINVAL - Invalid segment usage number.
435 */ 436 */
436int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum, 437int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
437 struct nilfs_segment_usage **sup,
438 struct buffer_head **bhp)
439{ 438{
440 struct buffer_head *bh; 439 struct buffer_head *bh;
441 struct nilfs_segment_usage *su;
442 void *kaddr;
443 int ret; 440 int ret;
444 441
445 /* segnum is 0 origin */ 442 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
446 if (segnum >= nilfs_sufile_get_nsegments(sufile)) 443 if (!ret) {
447 return -EINVAL; 444 nilfs_mdt_mark_buffer_dirty(bh);
448 down_write(&NILFS_MDT(sufile)->mi_sem); 445 nilfs_mdt_mark_dirty(sufile);
449 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
450 if (ret < 0)
451 goto out_sem;
452 kaddr = kmap(bh->b_page);
453 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
454 if (nilfs_segment_usage_error(su)) {
455 kunmap(bh->b_page);
456 brelse(bh); 446 brelse(bh);
457 ret = -EINVAL;
458 goto out_sem;
459 } 447 }
460
461 if (sup != NULL)
462 *sup = su;
463 *bhp = bh;
464
465 out_sem:
466 up_write(&NILFS_MDT(sufile)->mi_sem);
467 return ret; 448 return ret;
468} 449}
469 450
470/** 451/**
471 * nilfs_sufile_put_segment_usage - put a segment usage 452 * nilfs_sufile_set_segment_usage - set usage of a segment
472 * @sufile: inode of segment usage file 453 * @sufile: inode of segment usage file
473 * @segnum: segment number 454 * @segnum: segment number
474 * @bh: buffer head 455 * @nblocks: number of live blocks in the segment
475 * 456 * @modtime: modification time (optional)
476 * Description: nilfs_sufile_put_segment_usage() releases the segment usage
477 * specified by @segnum. @bh must be the buffer head which have been returned
478 * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
479 */ 457 */
480void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum, 458int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
481 struct buffer_head *bh) 459 unsigned long nblocks, time_t modtime)
482{ 460{
483 kunmap(bh->b_page); 461 struct buffer_head *bh;
462 struct nilfs_segment_usage *su;
463 void *kaddr;
464 int ret;
465
466 down_write(&NILFS_MDT(sufile)->mi_sem);
467 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
468 if (ret < 0)
469 goto out_sem;
470
471 kaddr = kmap_atomic(bh->b_page, KM_USER0);
472 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
473 WARN_ON(nilfs_segment_usage_error(su));
474 if (modtime)
475 su->su_lastmod = cpu_to_le64(modtime);
476 su->su_nblocks = cpu_to_le32(nblocks);
477 kunmap_atomic(kaddr, KM_USER0);
478
479 nilfs_mdt_mark_buffer_dirty(bh);
480 nilfs_mdt_mark_dirty(sufile);
484 brelse(bh); 481 brelse(bh);
482
483 out_sem:
484 up_write(&NILFS_MDT(sufile)->mi_sem);
485 return ret;
485} 486}
486 487
487/** 488/**
@@ -515,7 +516,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
515 goto out_sem; 516 goto out_sem;
516 517
517 kaddr = kmap_atomic(header_bh->b_page, KM_USER0); 518 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
518 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); 519 header = kaddr + bh_offset(header_bh);
519 sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); 520 sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
520 sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); 521 sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
521 sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); 522 sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
@@ -532,33 +533,6 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
532 return ret; 533 return ret;
533} 534}
534 535
535/**
536 * nilfs_sufile_get_ncleansegs - get the number of clean segments
537 * @sufile: inode of segment usage file
538 * @nsegsp: pointer to the number of clean segments
539 *
540 * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean
541 * segments.
542 *
543 * Return Value: On success, 0 is returned and the number of clean segments is
544 * stored in the place pointed by @nsegsp. On error, one of the following
545 * negative error codes is returned.
546 *
547 * %-EIO - I/O error.
548 *
549 * %-ENOMEM - Insufficient amount of memory available.
550 */
551int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
552{
553 struct nilfs_sustat sustat;
554 int ret;
555
556 ret = nilfs_sufile_get_stat(sufile, &sustat);
557 if (ret == 0)
558 *nsegsp = sustat.ss_ncleansegs;
559 return ret;
560}
561
562void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, 536void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
563 struct buffer_head *header_bh, 537 struct buffer_head *header_bh,
564 struct buffer_head *su_bh) 538 struct buffer_head *su_bh)
@@ -577,8 +551,10 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
577 nilfs_segment_usage_set_error(su); 551 nilfs_segment_usage_set_error(su);
578 kunmap_atomic(kaddr, KM_USER0); 552 kunmap_atomic(kaddr, KM_USER0);
579 553
580 if (suclean) 554 if (suclean) {
581 nilfs_sufile_mod_counter(header_bh, -1, 0); 555 nilfs_sufile_mod_counter(header_bh, -1, 0);
556 NILFS_SUI(sufile)->ncleansegs--;
557 }
582 nilfs_mdt_mark_buffer_dirty(su_bh); 558 nilfs_mdt_mark_buffer_dirty(su_bh);
583 nilfs_mdt_mark_dirty(sufile); 559 nilfs_mdt_mark_dirty(sufile);
584} 560}
@@ -657,3 +633,48 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
657 up_read(&NILFS_MDT(sufile)->mi_sem); 633 up_read(&NILFS_MDT(sufile)->mi_sem);
658 return ret; 634 return ret;
659} 635}
636
637/**
638 * nilfs_sufile_read - read sufile inode
639 * @sufile: sufile inode
640 * @raw_inode: on-disk sufile inode
641 */
642int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode)
643{
644 struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
645 struct buffer_head *header_bh;
646 struct nilfs_sufile_header *header;
647 void *kaddr;
648 int ret;
649
650 ret = nilfs_read_inode_common(sufile, raw_inode);
651 if (ret < 0)
652 return ret;
653
654 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
655 if (!ret) {
656 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
657 header = kaddr + bh_offset(header_bh);
658 sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
659 kunmap_atomic(kaddr, KM_USER0);
660 brelse(header_bh);
661 }
662 return ret;
663}
664
665/**
666 * nilfs_sufile_new - create sufile
667 * @nilfs: nilfs object
668 * @susize: size of a segment usage entry
669 */
670struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize)
671{
672 struct inode *sufile;
673
674 sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO,
675 sizeof(struct nilfs_sufile_info));
676 if (sufile)
677 nilfs_mdt_set_entry_size(sufile, susize,
678 sizeof(struct nilfs_sufile_header));
679 return sufile;
680}
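The sufile interface change is easiest to see from the caller's side: the old get/put pair handed out a kmapped entry for the caller to edit, while the new calls keep mapping, dirtying and locking inside sufile.c, and the clean-segment count is served from the counter cached by nilfs_sufile_read(). A hypothetical caller, before and after; sketch only, with the surrounding error handling elided:

struct nilfs_segment_usage *su;
struct buffer_head *bh;
unsigned long nclean;
int err;

/* before: the caller edits the mapped segment usage entry itself */
err = nilfs_sufile_get_segment_usage(sufile, segnum, &su, &bh);
if (!err) {
        su->su_lastmod = cpu_to_le64(modtime);
        su->su_nblocks = cpu_to_le32(nblocks);
        nilfs_sufile_put_segment_usage(sufile, segnum, bh);
}

/* after: one call, no exposed mapping */
err = nilfs_sufile_set_segment_usage(sufile, segnum, nblocks, modtime);

/* and the clean-segment count no longer costs buffer I/O */
nclean = nilfs_sufile_get_ncleansegs(sufile);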
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 0e99e5c0bd0f..15163b8aff7d 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -34,14 +34,13 @@ static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; 34 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
35} 35}
36 36
37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
38
37int nilfs_sufile_alloc(struct inode *, __u64 *); 39int nilfs_sufile_alloc(struct inode *, __u64 *);
38int nilfs_sufile_get_segment_usage(struct inode *, __u64, 40int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
39 struct nilfs_segment_usage **, 41int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
40 struct buffer_head **); 42 unsigned long nblocks, time_t modtime);
41void nilfs_sufile_put_segment_usage(struct inode *, __u64,
42 struct buffer_head *);
43int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); 43int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
44int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
45ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, 44ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
46 size_t); 45 size_t);
47 46
@@ -62,6 +61,9 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
62void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, 61void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
63 struct buffer_head *); 62 struct buffer_head *);
64 63
64int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode);
65struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize);
66
65/** 67/**
66 * nilfs_sufile_scrap - make a segment garbage 68 * nilfs_sufile_scrap - make a segment garbage
67 * @sufile: inode of segment usage file 69 * @sufile: inode of segment usage file
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 644e66727dd0..0cdbc5e7655a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -96,9 +96,6 @@ void nilfs_error(struct super_block *sb, const char *function,
96 if (!(sb->s_flags & MS_RDONLY)) { 96 if (!(sb->s_flags & MS_RDONLY)) {
97 struct the_nilfs *nilfs = sbi->s_nilfs; 97 struct the_nilfs *nilfs = sbi->s_nilfs;
98 98
99 if (!nilfs_test_opt(sbi, ERRORS_CONT))
100 nilfs_detach_segment_constructor(sbi);
101
102 down_write(&nilfs->ns_sem); 99 down_write(&nilfs->ns_sem);
103 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { 100 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
104 nilfs->ns_mount_state |= NILFS_ERROR_FS; 101 nilfs->ns_mount_state |= NILFS_ERROR_FS;
@@ -301,7 +298,7 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
301 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 298 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
302 nilfs->ns_sbwtime[1] = t; 299 nilfs->ns_sbwtime[1] = t;
303 } 300 }
304 sbi->s_super->s_dirt = 0; 301 clear_nilfs_sb_dirty(nilfs);
305 return nilfs_sync_super(sbi, dupsb); 302 return nilfs_sync_super(sbi, dupsb);
306} 303}
307 304
@@ -345,7 +342,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
345 err = nilfs_construct_segment(sb); 342 err = nilfs_construct_segment(sb);
346 343
347 down_write(&nilfs->ns_sem); 344 down_write(&nilfs->ns_sem);
348 if (sb->s_dirt) 345 if (nilfs_sb_dirty(nilfs))
349 nilfs_commit_super(sbi, 1); 346 nilfs_commit_super(sbi, 1);
350 up_write(&nilfs->ns_sem); 347 up_write(&nilfs->ns_sem);
351 348
@@ -363,14 +360,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
363 list_add(&sbi->s_list, &nilfs->ns_supers); 360 list_add(&sbi->s_list, &nilfs->ns_supers);
364 up_write(&nilfs->ns_super_sem); 361 up_write(&nilfs->ns_super_sem);
365 362
366 sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO); 363 sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
367 if (!sbi->s_ifile) 364 if (!sbi->s_ifile)
368 return -ENOMEM; 365 return -ENOMEM;
369 366
370 err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
371 if (unlikely(err))
372 goto failed;
373
374 down_read(&nilfs->ns_segctor_sem); 367 down_read(&nilfs->ns_segctor_sem);
375 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, 368 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
376 &bh_cp); 369 &bh_cp);
@@ -411,7 +404,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
411{ 404{
412 struct the_nilfs *nilfs = sbi->s_nilfs; 405 struct the_nilfs *nilfs = sbi->s_nilfs;
413 406
414 nilfs_mdt_clear(sbi->s_ifile);
415 nilfs_mdt_destroy(sbi->s_ifile); 407 nilfs_mdt_destroy(sbi->s_ifile);
416 sbi->s_ifile = NULL; 408 sbi->s_ifile = NULL;
417 down_write(&nilfs->ns_super_sem); 409 down_write(&nilfs->ns_super_sem);
@@ -419,22 +411,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
419 up_write(&nilfs->ns_super_sem); 411 up_write(&nilfs->ns_super_sem);
420} 412}
421 413
422static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
423{
424 struct the_nilfs *nilfs = sbi->s_nilfs;
425 int err = 0;
426
427 down_write(&nilfs->ns_sem);
428 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
429 nilfs->ns_mount_state |= NILFS_VALID_FS;
430 err = nilfs_commit_super(sbi, 1);
431 if (likely(!err))
432 printk(KERN_INFO "NILFS: recovery complete.\n");
433 }
434 up_write(&nilfs->ns_sem);
435 return err;
436}
437
438static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) 414static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
439{ 415{
440 struct super_block *sb = dentry->d_sb; 416 struct super_block *sb = dentry->d_sb;
@@ -460,7 +436,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
460 /* 436 /*
461 * Compute the overhead 437 * Compute the overhead
462 * 438 *
463 * When distributing meta data blocks outside semgent structure, 439 * When distributing meta data blocks outside segment structure,
464 * We must count them as the overhead. 440 * We must count them as the overhead.
465 */ 441 */
466 overhead = 0; 442 overhead = 0;
@@ -490,7 +466,7 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
490 struct nilfs_sb_info *sbi = NILFS_SB(sb); 466 struct nilfs_sb_info *sbi = NILFS_SB(sb);
491 467
492 if (!nilfs_test_opt(sbi, BARRIER)) 468 if (!nilfs_test_opt(sbi, BARRIER))
493 seq_printf(seq, ",barrier=off"); 469 seq_printf(seq, ",nobarrier");
494 if (nilfs_test_opt(sbi, SNAPSHOT)) 470 if (nilfs_test_opt(sbi, SNAPSHOT))
495 seq_printf(seq, ",cp=%llu", 471 seq_printf(seq, ",cp=%llu",
496 (unsigned long long int)sbi->s_snapshot_cno); 472 (unsigned long long int)sbi->s_snapshot_cno);
@@ -500,6 +476,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
500 seq_printf(seq, ",errors=panic"); 476 seq_printf(seq, ",errors=panic");
501 if (nilfs_test_opt(sbi, STRICT_ORDER)) 477 if (nilfs_test_opt(sbi, STRICT_ORDER))
502 seq_printf(seq, ",order=strict"); 478 seq_printf(seq, ",order=strict");
479 if (nilfs_test_opt(sbi, NORECOVERY))
480 seq_printf(seq, ",norecovery");
481 if (nilfs_test_opt(sbi, DISCARD))
482 seq_printf(seq, ",discard");
503 483
504 return 0; 484 return 0;
505} 485}
@@ -568,33 +548,22 @@ static const struct export_operations nilfs_export_ops = {
568 548
569enum { 549enum {
570 Opt_err_cont, Opt_err_panic, Opt_err_ro, 550 Opt_err_cont, Opt_err_panic, Opt_err_ro,
571 Opt_barrier, Opt_snapshot, Opt_order, 551 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
572 Opt_err, 552 Opt_discard, Opt_err,
573}; 553};
574 554
575static match_table_t tokens = { 555static match_table_t tokens = {
576 {Opt_err_cont, "errors=continue"}, 556 {Opt_err_cont, "errors=continue"},
577 {Opt_err_panic, "errors=panic"}, 557 {Opt_err_panic, "errors=panic"},
578 {Opt_err_ro, "errors=remount-ro"}, 558 {Opt_err_ro, "errors=remount-ro"},
579 {Opt_barrier, "barrier=%s"}, 559 {Opt_nobarrier, "nobarrier"},
580 {Opt_snapshot, "cp=%u"}, 560 {Opt_snapshot, "cp=%u"},
581 {Opt_order, "order=%s"}, 561 {Opt_order, "order=%s"},
562 {Opt_norecovery, "norecovery"},
563 {Opt_discard, "discard"},
582 {Opt_err, NULL} 564 {Opt_err, NULL}
583}; 565};
584 566
585static int match_bool(substring_t *s, int *result)
586{
587 int len = s->to - s->from;
588
589 if (strncmp(s->from, "on", len) == 0)
590 *result = 1;
591 else if (strncmp(s->from, "off", len) == 0)
592 *result = 0;
593 else
594 return 1;
595 return 0;
596}
597
598static int parse_options(char *options, struct super_block *sb) 567static int parse_options(char *options, struct super_block *sb)
599{ 568{
600 struct nilfs_sb_info *sbi = NILFS_SB(sb); 569 struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -612,13 +581,8 @@ static int parse_options(char *options, struct super_block *sb)
612 581
613 token = match_token(p, tokens, args); 582 token = match_token(p, tokens, args);
614 switch (token) { 583 switch (token) {
615 case Opt_barrier: 584 case Opt_nobarrier:
616 if (match_bool(&args[0], &option)) 585 nilfs_clear_opt(sbi, BARRIER);
617 return 0;
618 if (option)
619 nilfs_set_opt(sbi, BARRIER);
620 else
621 nilfs_clear_opt(sbi, BARRIER);
622 break; 586 break;
623 case Opt_order: 587 case Opt_order:
624 if (strcmp(args[0].from, "relaxed") == 0) 588 if (strcmp(args[0].from, "relaxed") == 0)
@@ -647,6 +611,12 @@ static int parse_options(char *options, struct super_block *sb)
647 sbi->s_snapshot_cno = option; 611 sbi->s_snapshot_cno = option;
648 nilfs_set_opt(sbi, SNAPSHOT); 612 nilfs_set_opt(sbi, SNAPSHOT);
649 break; 613 break;
614 case Opt_norecovery:
615 nilfs_set_opt(sbi, NORECOVERY);
616 break;
617 case Opt_discard:
618 nilfs_set_opt(sbi, DISCARD);
619 break;
650 default: 620 default:
651 printk(KERN_ERR 621 printk(KERN_ERR
652 "NILFS: Unrecognized mount option \"%s\"\n", p); 622 "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -672,9 +642,7 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
672 int mnt_count = le16_to_cpu(sbp->s_mnt_count); 642 int mnt_count = le16_to_cpu(sbp->s_mnt_count);
673 643
674 /* nilfs->sem must be locked by the caller. */ 644 /* nilfs->sem must be locked by the caller. */
675 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { 645 if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
676 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
677 } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
678 printk(KERN_WARNING 646 printk(KERN_WARNING
679 "NILFS warning: mounting fs with errors\n"); 647 "NILFS warning: mounting fs with errors\n");
680#if 0 648#if 0
@@ -782,11 +750,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
782 sb->s_root = NULL; 750 sb->s_root = NULL;
783 sb->s_time_gran = 1; 751 sb->s_time_gran = 1;
784 752
785 if (!nilfs_loaded(nilfs)) { 753 err = load_nilfs(nilfs, sbi);
786 err = load_nilfs(nilfs, sbi); 754 if (err)
787 if (err) 755 goto failed_sbi;
788 goto failed_sbi; 756
789 }
790 cno = nilfs_last_cno(nilfs); 757 cno = nilfs_last_cno(nilfs);
791 758
792 if (sb->s_flags & MS_RDONLY) { 759 if (sb->s_flags & MS_RDONLY) {
@@ -854,12 +821,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
854 up_write(&nilfs->ns_sem); 821 up_write(&nilfs->ns_sem);
855 } 822 }
856 823
857 err = nilfs_mark_recovery_complete(sbi);
858 if (unlikely(err)) {
859 printk(KERN_ERR "NILFS: recovery failed.\n");
860 goto failed_root;
861 }
862
863 down_write(&nilfs->ns_super_sem); 824 down_write(&nilfs->ns_super_sem);
864 if (!nilfs_test_opt(sbi, SNAPSHOT)) 825 if (!nilfs_test_opt(sbi, SNAPSHOT))
865 nilfs->ns_current = sbi; 826 nilfs->ns_current = sbi;
@@ -867,10 +828,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
867 828
868 return 0; 829 return 0;
869 830
870 failed_root:
871 dput(sb->s_root);
872 sb->s_root = NULL;
873
874 failed_segctor: 831 failed_segctor:
875 nilfs_detach_segment_constructor(sbi); 832 nilfs_detach_segment_constructor(sbi);
876 833
@@ -909,12 +866,20 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
909 if ((*flags & MS_RDONLY) && 866 if ((*flags & MS_RDONLY) &&
910 sbi->s_snapshot_cno != old_opts.snapshot_cno) { 867 sbi->s_snapshot_cno != old_opts.snapshot_cno) {
911 printk(KERN_WARNING "NILFS (device %s): couldn't " 868 printk(KERN_WARNING "NILFS (device %s): couldn't "
912 "remount to a different snapshot. \n", 869 "remount to a different snapshot.\n",
913 sb->s_id); 870 sb->s_id);
914 err = -EINVAL; 871 err = -EINVAL;
915 goto restore_opts; 872 goto restore_opts;
916 } 873 }
917 874
875 if (!nilfs_valid_fs(nilfs)) {
876 printk(KERN_WARNING "NILFS (device %s): couldn't "
877 "remount because the filesystem is in an "
878 "incomplete recovery state.\n", sb->s_id);
879 err = -EINVAL;
880 goto restore_opts;
881 }
882
918 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 883 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
919 goto out; 884 goto out;
920 if (*flags & MS_RDONLY) { 885 if (*flags & MS_RDONLY) {
@@ -1156,8 +1121,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1156 /* Abandoning the newly allocated superblock */ 1121 /* Abandoning the newly allocated superblock */
1157 mutex_unlock(&nilfs->ns_mount_mutex); 1122 mutex_unlock(&nilfs->ns_mount_mutex);
1158 put_nilfs(nilfs); 1123 put_nilfs(nilfs);
1159 up_write(&s->s_umount); 1124 deactivate_locked_super(s);
1160 deactivate_super(s);
1161 /* 1125 /*
1162 * deactivate_super() invokes close_bdev_exclusive(). 1126 * deactivate_super() invokes close_bdev_exclusive().
1163 * We must finish all post-cleaning before this call; 1127 * We must finish all post-cleaning before this call;
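The the_nilfs.c changes below move the whole recovery decision into load_nilfs(), including the VALID_FS commit that used to live in nilfs_mark_recovery_complete(). Its resulting decision tree, condensed from the hunks that follow:

/*
 * load_nilfs() after this series:
 *
 *   already loaded      -> ok if VALID_FS, or read-only + norecovery;
 *                          otherwise -EINVAL (half-recovered state)
 *   VALID_FS set        -> skip recovery entirely
 *   read-only mount     -> norecovery: mount as-is, skip roll-forward
 *                          device read-only: -EROFS (recovery writes)
 *                          else: drop MS_RDONLY, recover, restore flags
 *   read/write mount    -> norecovery: -EINVAL (refused)
 *                          else: recover, set VALID_FS, commit super
 */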
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ad391a8c3e7e..33871f7e4f01 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -146,13 +146,9 @@ void put_nilfs(struct the_nilfs *nilfs)
146 146
147 might_sleep(); 147 might_sleep();
148 if (nilfs_loaded(nilfs)) { 148 if (nilfs_loaded(nilfs)) {
149 nilfs_mdt_clear(nilfs->ns_sufile);
150 nilfs_mdt_destroy(nilfs->ns_sufile); 149 nilfs_mdt_destroy(nilfs->ns_sufile);
151 nilfs_mdt_clear(nilfs->ns_cpfile);
152 nilfs_mdt_destroy(nilfs->ns_cpfile); 150 nilfs_mdt_destroy(nilfs->ns_cpfile);
153 nilfs_mdt_clear(nilfs->ns_dat);
154 nilfs_mdt_destroy(nilfs->ns_dat); 151 nilfs_mdt_destroy(nilfs->ns_dat);
155 /* XXX: how and when to clear nilfs->ns_gc_dat? */
156 nilfs_mdt_destroy(nilfs->ns_gc_dat); 152 nilfs_mdt_destroy(nilfs->ns_gc_dat);
157 } 153 }
158 if (nilfs_init(nilfs)) { 154 if (nilfs_init(nilfs)) {
@@ -166,7 +162,6 @@ void put_nilfs(struct the_nilfs *nilfs)
166static int nilfs_load_super_root(struct the_nilfs *nilfs, 162static int nilfs_load_super_root(struct the_nilfs *nilfs,
167 struct nilfs_sb_info *sbi, sector_t sr_block) 163 struct nilfs_sb_info *sbi, sector_t sr_block)
168{ 164{
169 static struct lock_class_key dat_lock_key;
170 struct buffer_head *bh_sr; 165 struct buffer_head *bh_sr;
171 struct nilfs_super_root *raw_sr; 166 struct nilfs_super_root *raw_sr;
172 struct nilfs_super_block **sbp = nilfs->ns_sbp; 167 struct nilfs_super_block **sbp = nilfs->ns_sbp;
@@ -187,51 +182,36 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
187 inode_size = nilfs->ns_inode_size; 182 inode_size = nilfs->ns_inode_size;
188 183
189 err = -ENOMEM; 184 err = -ENOMEM;
190 nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); 185 nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size);
191 if (unlikely(!nilfs->ns_dat)) 186 if (unlikely(!nilfs->ns_dat))
192 goto failed; 187 goto failed;
193 188
194 nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); 189 nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size);
195 if (unlikely(!nilfs->ns_gc_dat)) 190 if (unlikely(!nilfs->ns_gc_dat))
196 goto failed_dat; 191 goto failed_dat;
197 192
198 nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO); 193 nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size);
199 if (unlikely(!nilfs->ns_cpfile)) 194 if (unlikely(!nilfs->ns_cpfile))
200 goto failed_gc_dat; 195 goto failed_gc_dat;
201 196
202 nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO); 197 nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
203 if (unlikely(!nilfs->ns_sufile)) 198 if (unlikely(!nilfs->ns_sufile))
204 goto failed_cpfile; 199 goto failed_cpfile;
205 200
206 err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
207 if (unlikely(err))
208 goto failed_sufile;
209
210 err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
211 if (unlikely(err))
212 goto failed_sufile;
213
214 lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
215 lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
216
217 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat); 201 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
218 nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
219 sizeof(struct nilfs_cpfile_header));
220 nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
221 sizeof(struct nilfs_sufile_header));
222 202
223 err = nilfs_mdt_read_inode_direct( 203 err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
224 nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size)); 204 NILFS_SR_DAT_OFFSET(inode_size));
225 if (unlikely(err)) 205 if (unlikely(err))
226 goto failed_sufile; 206 goto failed_sufile;
227 207
228 err = nilfs_mdt_read_inode_direct( 208 err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
229 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size)); 209 NILFS_SR_CPFILE_OFFSET(inode_size));
230 if (unlikely(err)) 210 if (unlikely(err))
231 goto failed_sufile; 211 goto failed_sufile;
232 212
233 err = nilfs_mdt_read_inode_direct( 213 err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
234 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size)); 214 NILFS_SR_SUFILE_OFFSET(inode_size));
235 if (unlikely(err)) 215 if (unlikely(err))
236 goto failed_sufile; 216 goto failed_sufile;
237 217
@@ -281,29 +261,30 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
281 struct nilfs_recovery_info ri; 261 struct nilfs_recovery_info ri;
282 unsigned int s_flags = sbi->s_super->s_flags; 262 unsigned int s_flags = sbi->s_super->s_flags;
283 int really_read_only = bdev_read_only(nilfs->ns_bdev); 263 int really_read_only = bdev_read_only(nilfs->ns_bdev);
284 unsigned valid_fs; 264 int valid_fs = nilfs_valid_fs(nilfs);
285 int err = 0; 265 int err;
286
287 nilfs_init_recovery_info(&ri);
288 266
289 down_write(&nilfs->ns_sem); 267 if (nilfs_loaded(nilfs)) {
290 valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); 268 if (valid_fs ||
291 up_write(&nilfs->ns_sem); 269 ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
270 return 0;
271 printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
272 "recovery state.\n");
273 return -EINVAL;
274 }
292 275
293 if (!valid_fs && (s_flags & MS_RDONLY)) { 276 if (!valid_fs) {
294 printk(KERN_INFO "NILFS: INFO: recovery " 277 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
295 "required for readonly filesystem.\n"); 278 if (s_flags & MS_RDONLY) {
296 if (really_read_only) { 279 printk(KERN_INFO "NILFS: INFO: recovery "
297 printk(KERN_ERR "NILFS: write access " 280 "required for readonly filesystem.\n");
298 "unavailable, cannot proceed.\n"); 281 printk(KERN_INFO "NILFS: write access will "
299 err = -EROFS; 282 "be enabled during recovery.\n");
300 goto failed;
301 } 283 }
302 printk(KERN_INFO "NILFS: write access will "
303 "be enabled during recovery.\n");
304 sbi->s_super->s_flags &= ~MS_RDONLY;
305 } 284 }
306 285
286 nilfs_init_recovery_info(&ri);
287
307 err = nilfs_search_super_root(nilfs, sbi, &ri); 288 err = nilfs_search_super_root(nilfs, sbi, &ri);
308 if (unlikely(err)) { 289 if (unlikely(err)) {
309 printk(KERN_ERR "NILFS: error searching super root.\n"); 290 printk(KERN_ERR "NILFS: error searching super root.\n");
@@ -316,19 +297,56 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
316 goto failed; 297 goto failed;
317 } 298 }
318 299
319 if (!valid_fs) { 300 if (valid_fs)
320 err = nilfs_recover_logical_segments(nilfs, sbi, &ri); 301 goto skip_recovery;
321 if (unlikely(err)) { 302
322 nilfs_mdt_destroy(nilfs->ns_cpfile); 303 if (s_flags & MS_RDONLY) {
323 nilfs_mdt_destroy(nilfs->ns_sufile); 304 if (nilfs_test_opt(sbi, NORECOVERY)) {
324 nilfs_mdt_destroy(nilfs->ns_dat); 305 printk(KERN_INFO "NILFS: norecovery option specified. "
325 goto failed; 306 "skipping roll-forward recovery\n");
307 goto skip_recovery;
326 } 308 }
327 if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED) 309 if (really_read_only) {
328 sbi->s_super->s_dirt = 1; 310 printk(KERN_ERR "NILFS: write access "
311 "unavailable, cannot proceed.\n");
312 err = -EROFS;
313 goto failed_unload;
314 }
315 sbi->s_super->s_flags &= ~MS_RDONLY;
316 } else if (nilfs_test_opt(sbi, NORECOVERY)) {
317 printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
318 "option was specified for a read/write mount\n");
319 err = -EINVAL;
320 goto failed_unload;
321 }
322
323 err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
324 if (err)
325 goto failed_unload;
326
327 down_write(&nilfs->ns_sem);
328 nilfs->ns_mount_state |= NILFS_VALID_FS;
329 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
330 err = nilfs_commit_super(sbi, 1);
331 up_write(&nilfs->ns_sem);
332
333 if (err) {
334 printk(KERN_ERR "NILFS: failed to update super block. "
335 "recovery unfinished.\n");
336 goto failed_unload;
329 } 337 }
338 printk(KERN_INFO "NILFS: recovery complete.\n");
330 339
340 skip_recovery:
331 set_nilfs_loaded(nilfs); 341 set_nilfs_loaded(nilfs);
342 nilfs_clear_recovery_info(&ri);
343 sbi->s_super->s_flags = s_flags;
344 return 0;
345
346 failed_unload:
347 nilfs_mdt_destroy(nilfs->ns_cpfile);
348 nilfs_mdt_destroy(nilfs->ns_sufile);
349 nilfs_mdt_destroy(nilfs->ns_dat);
332 350
333 failed: 351 failed:
334 nilfs_clear_recovery_info(&ri); 352 nilfs_clear_recovery_info(&ri);
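The reworked load_nilfs() reduces to a single mount-time policy: a valid filesystem skips recovery; a read-only mount either skips it as well (norecovery) or temporarily clears MS_RDONLY, failing with -EROFS when the device itself cannot be written; and norecovery on a read/write mount is refused with -EINVAL. A minimal sketch restating that decision as a stand-alone function (identifiers are illustrative, not the kernel code):

enum recovery_action { DO_RECOVERY, SKIP_RECOVERY, FAIL_EROFS, FAIL_EINVAL };

static enum recovery_action recovery_policy(int valid_fs, int rdonly,
					    int norecovery, int really_read_only)
{
	if (valid_fs)
		return SKIP_RECOVERY;		/* clean fs: nothing to replay */
	if (rdonly) {
		if (norecovery)
			return SKIP_RECOVERY;	/* mount the stale state as-is */
		if (really_read_only)
			return FAIL_EROFS;	/* device cannot be written */
		return DO_RECOVERY;		/* MS_RDONLY dropped during replay */
	}
	if (norecovery)
		return FAIL_EINVAL;		/* norecovery + read/write refused */
	return DO_RECOVERY;
}

int main(void)
{
	/* read-only mount of an unclean fs on a writable device */
	return recovery_policy(0, 1, 0, 0) == DO_RECOVERY ? 0 : 1;
}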
@@ -368,7 +386,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
368 386
369 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); 387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
370 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { 388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
371 printk(KERN_ERR "NILFS: too short segment. \n"); 389 printk(KERN_ERR "NILFS: too short segment.\n");
372 return -EINVAL; 390 return -EINVAL;
373 } 391 }
374 392
@@ -628,34 +646,65 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
628 goto out; 646 goto out;
629} 647}
630 648
649int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
650 size_t nsegs)
651{
652 sector_t seg_start, seg_end;
653 sector_t start = 0, nblocks = 0;
654 unsigned int sects_per_block;
655 __u64 *sn;
656 int ret = 0;
657
658 sects_per_block = (1 << nilfs->ns_blocksize_bits) /
659 bdev_logical_block_size(nilfs->ns_bdev);
660 for (sn = segnump; sn < segnump + nsegs; sn++) {
661 nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end);
662
663 if (!nblocks) {
664 start = seg_start;
665 nblocks = seg_end - seg_start + 1;
666 } else if (start + nblocks == seg_start) {
667 nblocks += seg_end - seg_start + 1;
668 } else {
669 ret = blkdev_issue_discard(nilfs->ns_bdev,
670 start * sects_per_block,
671 nblocks * sects_per_block,
672 GFP_NOFS,
673 DISCARD_FL_BARRIER);
674 if (ret < 0)
675 return ret;
676 nblocks = 0;
677 }
678 }
679 if (nblocks)
680 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block,
682 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER);
684 return ret;
685}
686
631int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 687int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
632{ 688{
633 struct inode *dat = nilfs_dat_inode(nilfs); 689 struct inode *dat = nilfs_dat_inode(nilfs);
634 unsigned long ncleansegs; 690 unsigned long ncleansegs;
635 int err;
636 691
637 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 692 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
638 err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs); 693 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
639 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 694 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
640 if (likely(!err)) 695 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
641 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; 696 return 0;
642 return err;
643} 697}
644 698
645int nilfs_near_disk_full(struct the_nilfs *nilfs) 699int nilfs_near_disk_full(struct the_nilfs *nilfs)
646{ 700{
647 struct inode *sufile = nilfs->ns_sufile;
648 unsigned long ncleansegs, nincsegs; 701 unsigned long ncleansegs, nincsegs;
649 int ret;
650 702
651 ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs); 703 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
652 if (likely(!ret)) { 704 nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
653 nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / 705 nilfs->ns_blocks_per_segment + 1;
654 nilfs->ns_blocks_per_segment + 1; 706
655 if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs) 707 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
656 ret++;
657 }
658 return ret;
659} 708}
660 709
661/** 710/**
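nilfs_discard_segments() batches physically contiguous segments into a single blkdev_issue_discard() call instead of issuing one request per segment. One thing worth noting: in the code as shown, a segment that breaks contiguity only flushes the pending run (nblocks is reset to zero) and is itself never recorded, so it appears to escape being discarded. The coalescing idea, restarted correctly at the breaking segment, as a runnable userspace sketch with the discard call stubbed out:

#include <stdio.h>

struct seg_range { unsigned long long start, end; };	/* block numbers */

/* Illustrative stand-in for blkdev_issue_discard(). */
static int issue_discard(unsigned long long start, unsigned long long nblocks)
{
	printf("discard: start=%llu nblocks=%llu\n", start, nblocks);
	return 0;
}

/*
 * Coalesce adjacent segment ranges the way nilfs_discard_segments()
 * does, but open a new run at a segment that breaks contiguity
 * instead of dropping it.
 */
static int discard_segments(const struct seg_range *seg, int nsegs)
{
	unsigned long long start = 0, nblocks = 0;
	int i, ret = 0;

	for (i = 0; i < nsegs; i++) {
		if (!nblocks) {
			start = seg[i].start;
			nblocks = seg[i].end - seg[i].start + 1;
		} else if (start + nblocks == seg[i].start) {
			/* contiguous with the pending run: extend it */
			nblocks += seg[i].end - seg[i].start + 1;
		} else {
			ret = issue_discard(start, nblocks);
			if (ret < 0)
				return ret;
			start = seg[i].start;	/* open a new run here */
			nblocks = seg[i].end - seg[i].start + 1;
		}
	}
	if (nblocks)	/* flush the final run */
		ret = issue_discard(start, nblocks);
	return ret;
}

int main(void)
{
	/* two contiguous segments, then a detached one */
	const struct seg_range segs[] = { {0, 7}, {8, 15}, {32, 39} };

	return discard_segments(segs, 3);
}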
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 20abd55881e0..1ab974533697 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/slab.h>
32#include "sb.h" 33#include "sb.h"
33 34
34/* the_nilfs struct */ 35/* the_nilfs struct */
@@ -38,6 +39,7 @@ enum {
38 the latest checkpoint was loaded */ 39 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 40 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40 THE_NILFS_GC_RUNNING, /* gc process is running */ 41 THE_NILFS_GC_RUNNING, /* gc process is running */
42 THE_NILFS_SB_DIRTY, /* super block is dirty */
41}; 43};
42 44
43/** 45/**
@@ -197,6 +199,7 @@ THE_NILFS_FNS(INIT, init)
197THE_NILFS_FNS(LOADED, loaded) 199THE_NILFS_FNS(LOADED, loaded)
198THE_NILFS_FNS(DISCONTINUED, discontinued) 200THE_NILFS_FNS(DISCONTINUED, discontinued)
199THE_NILFS_FNS(GC_RUNNING, gc_running) 201THE_NILFS_FNS(GC_RUNNING, gc_running)
202THE_NILFS_FNS(SB_DIRTY, sb_dirty)
200 203
201/* Minimum interval of periodical update of superblocks (in seconds) */ 204/* Minimum interval of periodical update of superblocks (in seconds) */
202#define NILFS_SB_FREQ 10 205#define NILFS_SB_FREQ 10
@@ -221,6 +224,7 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *);
221void put_nilfs(struct the_nilfs *); 224void put_nilfs(struct the_nilfs *);
222int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 225int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
223int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 226int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
227int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
224int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 228int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
225struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); 229struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
226int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); 230int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
@@ -258,6 +262,16 @@ static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
258 kfree(sbi); 262 kfree(sbi);
259} 263}
260 264
265static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
266{
267 unsigned valid_fs;
268
269 down_read(&nilfs->ns_sem);
270 valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
271 up_read(&nilfs->ns_sem);
272 return valid_fs;
273}
274
261static inline void 275static inline void
262nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, 276nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
263 sector_t *seg_start, sector_t *seg_end) 277 sector_t *seg_start, sector_t *seg_end)
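The header half of the patch adds THE_NILFS_SB_DIRTY and lets THE_NILFS_FNS(SB_DIRTY, sb_dirty) generate set_nilfs_sb_dirty()/clear_nilfs_sb_dirty()/nilfs_sb_dirty(), as it already does for LOADED and GC_RUNNING. A plausible reconstruction of that generator pattern as self-contained C11, with atomic fetch-or/fetch-and standing in for the kernel's set_bit()/clear_bit() (the exact macro lives in the_nilfs.h and may differ in detail):

#include <stdatomic.h>
#include <stdio.h>

struct nilfs_flags { atomic_ulong ns_flags; };	/* stand-in for the_nilfs */

/* Generate set_/clear_/test helpers for one state bit. */
#define NILFS_FLAG_FNS(bit, name)					\
static inline void set_nilfs_##name(struct nilfs_flags *n)		\
{ atomic_fetch_or(&n->ns_flags, 1UL << (bit)); }			\
static inline void clear_nilfs_##name(struct nilfs_flags *n)		\
{ atomic_fetch_and(&n->ns_flags, ~(1UL << (bit))); }			\
static inline int nilfs_##name(struct nilfs_flags *n)			\
{ return !!(atomic_load(&n->ns_flags) & (1UL << (bit))); }

NILFS_FLAG_FNS(4, sb_dirty)	/* bit index illustrative */

int main(void)
{
	struct nilfs_flags n = { 0 };

	set_nilfs_sb_dirty(&n);
	printf("sb_dirty=%d\n", nilfs_sb_dirty(&n));
	clear_nilfs_sb_dirty(&n);
	printf("sb_dirty=%d\n", nilfs_sb_dirty(&n));
	return 0;
}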
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 037e878e03fc..fcc2f064af83 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/dcache.h> 19#include <linux/dcache.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/gfp.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/srcu.h> 24#include <linux/srcu.h>
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85aada2..0399bcbe09c8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -87,7 +87,6 @@
87#include <linux/kernel.h> 87#include <linux/kernel.h>
88#include <linux/module.h> 88#include <linux/module.h>
89#include <linux/mutex.h> 89#include <linux/mutex.h>
90#include <linux/slab.h>
91#include <linux/spinlock.h> 90#include <linux/spinlock.h>
92#include <linux/writeback.h> /* for inode_lock */ 91#include <linux/writeback.h> /* for inode_lock */
93 92
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index c9ee67b442e1..1afb0a10229f 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -121,7 +121,7 @@ static int idr_callback(int id, void *p, void *data)
121 if (warned) 121 if (warned)
122 return 0; 122 return 0;
123 123
124 warned = false; 124 warned = true;
125 entry = p; 125 entry = p;
126 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 126 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
127 127
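The one-word change above turns idr_callback()'s guard into a real warn-once latch; with warned = false the early return could never trigger, so the warning fired on every leaked entry instead of once. The pattern in isolation:

#include <stdio.h>
#include <stdbool.h>

/*
 * Warn-once latch as fixed in idr_callback(): the guard must be set
 * to true after the first pass, otherwise the early return never
 * fires and the warning repeats on every call.
 */
static void warn_once(const char *msg)
{
	static bool warned;

	if (warned)
		return;
	warned = true;		/* the fix: latch the guard */
	fprintf(stderr, "warning: %s\n", msg);
}

int main(void)
{
	warn_once("leaked entry");	/* prints */
	warn_once("leaked entry");	/* silent */
	return 0;
}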
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index dcd2040d330c..472cdf29ef82 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -29,14 +29,12 @@
29#include <linux/init.h> /* module_init */ 29#include <linux/init.h> /* module_init */
30#include <linux/inotify.h> 30#include <linux/inotify.h>
31#include <linux/kernel.h> /* roundup() */ 31#include <linux/kernel.h> /* roundup() */
32#include <linux/magic.h> /* superblock magic number */
33#include <linux/mount.h> /* mntget */
34#include <linux/namei.h> /* LOOKUP_FOLLOW */ 32#include <linux/namei.h> /* LOOKUP_FOLLOW */
35#include <linux/path.h> /* struct path */
36#include <linux/sched.h> /* struct user */ 33#include <linux/sched.h> /* struct user */
37#include <linux/slab.h> /* struct kmem_cache */ 34#include <linux/slab.h> /* struct kmem_cache */
38#include <linux/syscalls.h> 35#include <linux/syscalls.h>
39#include <linux/types.h> 36#include <linux/types.h>
37#include <linux/anon_inodes.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/poll.h> 39#include <linux/poll.h>
42#include <linux/wait.h> 40#include <linux/wait.h>
@@ -45,8 +43,6 @@
45 43
46#include <asm/ioctls.h> 44#include <asm/ioctls.h>
47 45
48static struct vfsmount *inotify_mnt __read_mostly;
49
50/* these are configurable via /proc/sys/fs/inotify/ */ 46/* these are configurable via /proc/sys/fs/inotify/ */
51static int inotify_max_user_instances __read_mostly; 47static int inotify_max_user_instances __read_mostly;
52static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
@@ -69,36 +65,30 @@ static int zero;
69 65
70ctl_table inotify_table[] = { 66ctl_table inotify_table[] = {
71 { 67 {
72 .ctl_name = INOTIFY_MAX_USER_INSTANCES,
73 .procname = "max_user_instances", 68 .procname = "max_user_instances",
74 .data = &inotify_max_user_instances, 69 .data = &inotify_max_user_instances,
75 .maxlen = sizeof(int), 70 .maxlen = sizeof(int),
76 .mode = 0644, 71 .mode = 0644,
77 .proc_handler = &proc_dointvec_minmax, 72 .proc_handler = proc_dointvec_minmax,
78 .strategy = &sysctl_intvec,
79 .extra1 = &zero, 73 .extra1 = &zero,
80 }, 74 },
81 { 75 {
82 .ctl_name = INOTIFY_MAX_USER_WATCHES,
83 .procname = "max_user_watches", 76 .procname = "max_user_watches",
84 .data = &inotify_max_user_watches, 77 .data = &inotify_max_user_watches,
85 .maxlen = sizeof(int), 78 .maxlen = sizeof(int),
86 .mode = 0644, 79 .mode = 0644,
87 .proc_handler = &proc_dointvec_minmax, 80 .proc_handler = proc_dointvec_minmax,
88 .strategy = &sysctl_intvec,
89 .extra1 = &zero, 81 .extra1 = &zero,
90 }, 82 },
91 { 83 {
92 .ctl_name = INOTIFY_MAX_QUEUED_EVENTS,
93 .procname = "max_queued_events", 84 .procname = "max_queued_events",
94 .data = &inotify_max_queued_events, 85 .data = &inotify_max_queued_events,
95 .maxlen = sizeof(int), 86 .maxlen = sizeof(int),
96 .mode = 0644, 87 .mode = 0644,
97 .proc_handler = &proc_dointvec_minmax, 88 .proc_handler = proc_dointvec_minmax,
98 .strategy = &sysctl_intvec,
99 .extra1 = &zero 89 .extra1 = &zero
100 }, 90 },
101 { .ctl_name = 0 } 91 { }
102}; 92};
103#endif /* CONFIG_SYSCTL */ 93#endif /* CONFIG_SYSCTL */
104 94
@@ -558,7 +548,7 @@ retry:
558 548
559 spin_lock(&group->inotify_data.idr_lock); 549 spin_lock(&group->inotify_data.idr_lock);
560 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, 550 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
561 group->inotify_data.last_wd, 551 group->inotify_data.last_wd+1,
562 &tmp_ientry->wd); 552 &tmp_ientry->wd);
563 spin_unlock(&group->inotify_data.idr_lock); 553 spin_unlock(&group->inotify_data.idr_lock);
564 if (ret) { 554 if (ret) {
@@ -638,7 +628,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
638 628
639 spin_lock_init(&group->inotify_data.idr_lock); 629 spin_lock_init(&group->inotify_data.idr_lock);
640 idr_init(&group->inotify_data.idr); 630 idr_init(&group->inotify_data.idr);
641 group->inotify_data.last_wd = 1; 631 group->inotify_data.last_wd = 0;
642 group->inotify_data.user = user; 632 group->inotify_data.user = user;
643 group->inotify_data.fa = NULL; 633 group->inotify_data.fa = NULL;
644 634
@@ -651,8 +641,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
651{ 641{
652 struct fsnotify_group *group; 642 struct fsnotify_group *group;
653 struct user_struct *user; 643 struct user_struct *user;
654 struct file *filp; 644 int ret;
655 int fd, ret;
656 645
657 /* Check the IN_* constants for consistency. */ 646 /* Check the IN_* constants for consistency. */
658 BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); 647 BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
@@ -661,16 +650,6 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
661 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) 650 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
662 return -EINVAL; 651 return -EINVAL;
663 652
664 fd = get_unused_fd_flags(flags & O_CLOEXEC);
665 if (fd < 0)
666 return fd;
667
668 filp = get_empty_filp();
669 if (!filp) {
670 ret = -ENFILE;
671 goto out_put_fd;
672 }
673
674 user = get_current_user(); 653 user = get_current_user();
675 if (unlikely(atomic_read(&user->inotify_devs) >= 654 if (unlikely(atomic_read(&user->inotify_devs) >=
676 inotify_max_user_instances)) { 655 inotify_max_user_instances)) {
@@ -685,25 +664,16 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
685 goto out_free_uid; 664 goto out_free_uid;
686 } 665 }
687 666
688 filp->f_op = &inotify_fops;
689 filp->f_path.mnt = mntget(inotify_mnt);
690 filp->f_path.dentry = dget(inotify_mnt->mnt_root);
691 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
692 filp->f_mode = FMODE_READ;
693 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
694 filp->private_data = group;
695
696 atomic_inc(&user->inotify_devs); 667 atomic_inc(&user->inotify_devs);
697 668
698 fd_install(fd, filp); 669 ret = anon_inode_getfd("inotify", &inotify_fops, group,
699 670 O_RDONLY | flags);
700 return fd; 671 if (ret >= 0)
672 return ret;
701 673
674 atomic_dec(&user->inotify_devs);
702out_free_uid: 675out_free_uid:
703 free_uid(user); 676 free_uid(user);
704 put_filp(filp);
705out_put_fd:
706 put_unused_fd(fd);
707 return ret; 677 return ret;
708} 678}
709 679
@@ -747,10 +717,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
747 717
748 /* create/update an inode mark */ 718 /* create/update an inode mark */
749 ret = inotify_update_watch(group, inode, mask); 719 ret = inotify_update_watch(group, inode, mask);
750 if (unlikely(ret))
751 goto path_put_and_out;
752
753path_put_and_out:
754 path_put(&path); 720 path_put(&path);
755fput_and_out: 721fput_and_out:
756 fput_light(filp, fput_needed); 722 fput_light(filp, fput_needed);
@@ -794,20 +760,6 @@ out:
794 return ret; 760 return ret;
795} 761}
796 762
797static int
798inotify_get_sb(struct file_system_type *fs_type, int flags,
799 const char *dev_name, void *data, struct vfsmount *mnt)
800{
801 return get_sb_pseudo(fs_type, "inotify", NULL,
802 INOTIFYFS_SUPER_MAGIC, mnt);
803}
804
805static struct file_system_type inotify_fs_type = {
806 .name = "inotifyfs",
807 .get_sb = inotify_get_sb,
808 .kill_sb = kill_anon_super,
809};
810
811/* 763/*
812 * inotify_user_setup - Our initialization function. Note that we cannot return 764
813 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 765 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
@@ -815,16 +767,6 @@ static struct file_system_type inotify_fs_type = {
815 */ 767 */
816static int __init inotify_user_setup(void) 768static int __init inotify_user_setup(void)
817{ 769{
818 int ret;
819
820 ret = register_filesystem(&inotify_fs_type);
821 if (unlikely(ret))
822 panic("inotify: register_filesystem returned %d!\n", ret);
823
824 inotify_mnt = kern_mount(&inotify_fs_type);
825 if (IS_ERR(inotify_mnt))
826 panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
827
828 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 770 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
829 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 771 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
830 772
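With anon_inode_getfd() allocating the fd, the struct file, and an anonymous inode in one call, inotify can drop its private inotifyfs superblock, the hand-rolled get_unused_fd_flags()/get_empty_filp() sequence, and the matching unwind labels; note that the atomic_inc of user->inotify_devs now happens before the call and is undone only if it fails. None of this is visible from userspace. A minimal, runnable consumer of the unchanged syscall interface (Linux with glibc assumed):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	int fd, wd;

	/* Same syscall as before the patch; only the in-kernel file
	 * setup moved from a private inotifyfs mount to anon_inode_getfd(). */
	fd = inotify_init1(IN_CLOEXEC);
	if (fd < 0) {
		perror("inotify_init1");
		return EXIT_FAILURE;
	}
	wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	if (wd < 0) {
		perror("inotify_add_watch");
		return EXIT_FAILURE;
	}
	len = read(fd, buf, sizeof(buf));	/* blocks until an event arrives */
	if (len > 0) {
		const struct inotify_event *ev =
			(const struct inotify_event *)buf;

		printf("wd=%d mask=0x%x name=%s\n", ev->wd, ev->mask,
		       ev->len ? ev->name : "");
	}
	close(fd);
	return EXIT_SUCCESS;
}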
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
deleted file mode 100644
index 37c11e194372..000000000000
--- a/fs/ntfs/ChangeLog
+++ /dev/null
@@ -1,1702 +0,0 @@
1ToDo/Notes:
2 - Find and fix bugs.
3 - The only places in the kernel where a file is resized are
4 ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is
5 held. Just have to be careful in read-/writepage and other helpers
6 not running under i_mutex that we play nice. Also need to be careful
7 with initialized_size extension in ntfs_file_write*() and writepage.
8 UPDATE: The only things that need to be checked are the compressed
9 write and the other attribute resize/write cases like index
10 attributes, etc. For now none of these are implemented so are safe.
11 - Implement filling in of holes in aops.c::ntfs_writepage() and its
12 helpers.
13 - Implement mft.c::sync_mft_mirror_umount(). We currently will just
14 leave the volume dirty on umount if the final iput(vol->mft_ino)
15 causes a write of any mirrored mft records due to the mft mirror
16 inode having been discarded already. Whether this can actually ever
17 happen is unclear however so it is worth waiting until someone hits
18 the problem.
19
202.1.29 - Fix a deadlock at mount time.
21
22 - During mount the VFS holds s_umount lock on the superblock. So when
23 we try to empty the journal $LogFile contents by calling
24 ntfs_attr_set() when the machine does not have much memory and the
25 journal is large ntfs_attr_set() results in the VM trying to balance
26 dirty pages which in turn tries to take the s_umount lock and thus we
27 get a deadlock. The solution is to not use ntfs_attr_set() and
28 instead do the zeroing by hand at the block level rather than page
29 cache level.
30 - Fix sparse warnings.
31
322.1.28 - Fix a deadlock.
33
34 - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey
35 Vlasov for the report and detailed analysis of the deadlock. The fix
36 involved getting rid of ntfs_put_inode() altogether and hence NTFS no
37 longer has a ->put_inode super operation.
38
392.1.27 - Various bug fixes and cleanups.
40
41 - Fix two compiler warnings on Alpha. Thanks to Andrew Morton for
42 reporting them.
43 - Fix an (innocent) off-by-one error in the runlist code.
44 - Fix a buggette in a "should be impossible" case handling where we
45 continued the attribute lookup loop instead of aborting it.
46 - Use buffer_migrate_page() for the ->migratepage function of all ntfs
47 address space operations.
48 - Fix comparison of $MFT and $MFTMirr to not bail out when there are
49 unused, invalid mft records which are the same in both $MFT and
50 $MFTMirr.
51 - Add support for sparse files which have a compression unit of 0.
52 - Remove all the make_bad_inode() calls. This should only be called
53 from read inode and new inode code paths.
54 - Limit name length in fs/ntfs/unistr.c::ntfs_nlstoucs() to maximum
55 allowed by NTFS, i.e. 255 Unicode characters, not including the
56 terminating NULL (which is not stored on disk).
57 - Improve comments on file attribute flags in fs/ntfs/layout.h.
58 - Fix a bug in fs/ntfs/inode.c::ntfs_read_locked_index_inode() where we
59 forgot to update a temporary variable so loading index inodes which
60 have an index allocation attribute failed.
61 - Add a missing call to flush_dcache_mft_record_page() in
62 fs/ntfs/inode.c::ntfs_write_inode().
63 - Handle the recently introduced -ENAMETOOLONG return value from
64 fs/ntfs/unistr.c::ntfs_nlstoucs() in fs/ntfs/namei.c::ntfs_lookup().
65 - Semaphore to mutex conversion. (Ingo Molnar)
66
672.1.26 - Minor bug fixes and updates.
68
69 - Fix a potential overflow in file.c where a cast to s64 was missing in
70 a left shift of a page index.
71 - The struct inode has had its i_sem semaphore changed to a mutex named
72 i_mutex.
73 - We have struct kmem_cache now so use it instead of the typedef
74 kmem_cache_t. (Pekka Enberg)
75 - Implement support for sector sizes above 512 bytes (up to the maximum
76 supported by NTFS which is 4096 bytes).
77 - Do more detailed reporting of why we cannot mount read-write by
78 special casing the VOLUME_MODIFIED_BY_CHKDSK flag.
79 - Miscellaneous updates to layout.h.
80 - Cope with attribute list attribute having invalid flags. Windows
81 copes with this and even chkdsk does not detect or fix this so we
82 have to cope with it, too. Thanks to Pawel Kot for reporting the
83 problem.
84
852.1.25 - (Almost) fully implement write(2) and truncate(2).
86
87 - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
88 {__,}ntfs_cluster_free() to also take an optional attribute search
89 context as argument. This allows calling these functions with the
90 mft record mapped. Update all callers.
91 - Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock()
92 error handling by passing in the active search context when calling
93 ntfs_cluster_free().
94 - Change ntfs_cluster_alloc() to take an extra boolean parameter
95 specifying whether the clusters are being allocated to extend an
96 attribute or to fill a hole.
97 - Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc()
98 with @is_extension set to TRUE and remove the runlist terminator
99 fixup code as this is now done by ntfs_cluster_alloc().
100 - Change ntfs_attr_make_non_resident to take the attribute value size
101 as an extra parameter. This is needed since we need to know the size
102 before we can map the mft record and our callers always know it. The
103 reason we cannot simply read the size from the vfs inode i_size is
104 that this is not necessarily uptodate. This happens when
105 ntfs_attr_make_non_resident() is called in the ->truncate call path.
106 - Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks
107 which is zero for a resident attribute but should no longer be zero
108 once the attribute is non-resident as it then has real clusters
109 allocated.
110 - Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to
111 extend the allocation of an attributes. Optionally, the data size,
112 but not the initialized size can be extended, too.
113 - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports
114 uncompressed and unencrypted files and it never creates sparse files
115 at least for the moment (making a file sparse requires us to modify
116 its directory entries and we do not support directory operations at
117 the moment). Also, support for highly fragmented files, i.e. ones
118 whose data attribute is split across multiple extents, is severly
119 limited. When such a case is encountered, EOPNOTSUPP is returned.
120 - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes
121 the initial implementation of file truncation. Now both open(2)ing
122 a file with the O_TRUNC flag and the {,f}truncate(2) system calls
123 will resize a file appropriately. The limitations are that only
124 uncompressed and unencrypted files are supported. Also, there is
125 only very limited support for highly fragmented files (the ones whose
126 $DATA attribute is split into multiple attribute extents).
127 - In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited()
128 and cond_resched() in the main loop as we could be dirtying a lot of
129 pages and this ensures we play nice with the VM and the system as a
130 whole.
131 - Implement file operations ->write, ->aio_write, ->writev for regular
132 files. This replaces the old use of generic_file_write(), et al and
133 the address space operations ->prepare_write and ->commit_write.
134 This means that both sparse and non-sparse (unencrypted and
135 uncompressed) files can now be extended using the normal write(2)
136 code path. There are two limitations at present and these are that
137 we never create sparse files and that we only have limited support
138 for highly fragmented files, i.e. ones whose data attribute is split
139 across multiple extents. When such a case is encountered,
140 EOPNOTSUPP is returned.
141 - $EA attributes can be both resident and non-resident.
142 - Use %z for size_t to fix compilation warnings. (Andrew Morton)
143 - Fix compilation warnings with gcc-4.0.2 on SUSE 10.0.
144 - Document extended attribute ($EA) NEED_EA flag. (Based on libntfs
145 patch by Yura Pakhuchiy.)
146
1472.1.24 - Lots of bug fixes and support more clean journal states.
148
149 - Support journals ($LogFile) which have been modified by chkdsk. This
150 means users can boot into Windows after we marked the volume dirty.
151 The Windows boot will run chkdsk and then reboot. The user can then
152 immediately boot into Linux rather than having to do a full Windows
153 boot first before rebooting into Linux and we will recognize such a
154 journal and empty it as it is clean by definition. Note, this only
155 works if chkdsk left the journal in an obviously clean state.
156 - Support journals ($LogFile) with only one restart page as well as
157 journals with two different restart pages. We sanity check both and
158 either use the only sane one or the more recent one of the two in the
159 case that both are valid.
160 - Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
161 ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
162 hence cannot fail.
163 - Use ntfs_malloc_nofs_nofail() in the two critical regions in
164 fs/ntfs/runlist.c::ntfs_runlists_merge(). This means we no longer
165 need to panic() if the allocation fails as it now cannot fail.
166 - Fix two nasty runlist merging bugs that had gone unnoticed so far.
167 Thanks to Stefano Picerno for the bug report.
168 - Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
169 - Fix handling of valid but empty mapping pairs array in
170 fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
171 - Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
172 messages and include the inode number. Thanks to Yura Pakhuchiy for
173 pointing this out.
174 - Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
175 length is zero.
176 - Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
177 specified hole into a runlist.
178 - Fix a bug in fs/ntfs/index.c::ntfs_index_lookup(). When the returned
179 index entry is in the index root, we forgot to set the @ir pointer in
180 the index context. Thanks to Yura Pakhuchiy for finding this bug.
181 - Remove bogus setting of PageError in ntfs_read_compressed_block().
182 - Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
183 - Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
184 access to the allocated size in the ntfs inode with the size lock.
185 - Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
186 return LCN_ENOENT when there is no runlist and the allocated size is
187 zero.
188 - Fix load_attribute_list() to handle the case of a NULL runlist.
189 - Fix handling of sparse attributes in ntfs_attr_make_non_resident().
190 - Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
191 to ensure that these functions are never called for compressed or
192 encrypted attributes.
193 - Fix cluster (de)allocators to work when the runlist is NULL and more
194 importantly to take a locked runlist rather than them locking it
195 which leads to lock reversal.
196 - Truncate {a,c,m}time to the ntfs supported time granularity when
197 updating the times in the inode in ntfs_setattr().
198 - Fixup handling of sparse, compressed, and encrypted attributes in
199 fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(),
200 fs/ntfs/aops.c::ntfs_{read,write}page().
201 - Make ntfs_write_block() not instantiate sparse blocks if they contain
202 only zeroes.
203 - Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
204 lock protection over the buffer submission for i/o which allows the
205 removal of the get_bh()/put_bh() pairs for each buffer.
206 - Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
207 where a concurrent truncate has truncated the runlist under our feet.
208 - Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
209 - In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
210 in the first buffer head instead of a driver global spin lock to
211 improve scalability.
212 - Minor fix to error handling and error message display in
213 fs/ntfs/aops.c::ntfs_prepare_nonresident_write().
214 - Change the mount options {u,f,d}mask to always parse the number as
215 an octal number to conform to how chmod(1) works, too. Thanks to
216 Giuseppe Bilotta and Horst von Brand for pointing out the errors of
217 my ways.
218 - Fix various bugs in the runlist merging code. (Based on libntfs
219 changes by Richard Russon.)
220 - Fix sparse warnings that have crept in over time.
221 - Change ntfs_cluster_free() to require a write locked runlist on entry
222 since we otherwise get into a lock reversal deadlock if a read locked
223 runlist is passed in. In the process also change it to take an ntfs
224 inode instead of a vfs inode as parameter.
225 - Fix the definition of the CHKD ntfs record magic. It had an off by
226 two error causing it to be CHKB instead of CHKD.
227 - Fix a stupid bug in __ntfs_bitmap_set_bits_in_run() which caused the
228 count to become negative and hence we had a wild memset() scribbling
229 all over the system's RAM.
230
2312.1.23 - Implement extension of resident files and make writing safe as well as
232 many bug fixes, cleanups, and enhancements...
233
234 - Add printk rate limiting for ntfs_warning() and ntfs_error() when
235 compiled without debug. This avoids a possible denial of service
236 attack. Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
237 out.
238 - Fix compilation warnings on ia64. (Randy Dunlap)
239 - Use i_size_{read,write}() instead of reading i_size by hand and cache
240 the value where appropriate.
241 - Add size_lock to the ntfs_inode structure. This is an rw spinlock
242 and it locks against access to the inode sizes. Note, ->size_lock
243 is also accessed from irq context so you must use the _irqsave and
244 _irqrestore lock and unlock functions, respectively. Protect all
245 accesses to allocated_size, initialized_size, and compressed_size.
246 - Minor optimization to fs/ntfs/super.c::ntfs_statfs() and its helpers.
247 - Implement extension of resident files in the regular file write code
248 paths (fs/ntfs/aops.c::ntfs_{prepare,commit}_write()). At present
249 this only works until the data attribute becomes too big for the mft
250 record after which we abort the write returning -EOPNOTSUPP from
251 ntfs_prepare_write().
252 - Add disable_sparse mount option together with a per volume sparse
253 enable bit which is set appropriately and a per inode sparse disable
254 bit which is preset on some system file inodes as appropriate.
255 - Enforce that sparse support is disabled on NTFS volumes pre 3.0.
256 - Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in
257 the creation of the unmapped runlist element for the base attribute
258 extent.
259 - Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking
260 helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist().
261 This allows us to map runlist fragments with the runlist lock already
262 held without having to drop and reacquire it around the call. Adapt
263 all callers.
264 - Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked
265 runlist. This allows us to find runlist elements with the runlist
266 lock already held without having to drop and reacquire it around the
267 call. Adapt all callers.
268 - Change time to u64 in time.h::ntfs2utc() as it otherwise generates a
269 warning in the do_div() call on sparc32. Thanks to Meelis Roos for
270 the report and analysis of the warning.
271 - Fix a nasty runlist merge bug when merging two holes.
272 - Set the ntfs_inode->allocated_size to the real allocated size in the
273 mft record for resident attributes (fs/ntfs/inode.c).
274 - Small readability cleanup to use "a" instead of "ctx->attr"
275 everywhere (fs/ntfs/inode.c).
276 - Make fs/ntfs/namei.c::ntfs_get_{parent,dentry} static and move the
277 definition of ntfs_export_ops from fs/ntfs/super.c to namei.c. Also,
278 declare ntfs_export_ops in fs/ntfs/ntfs.h.
279 - Correct sparse file handling. The compressed values need to be
280 checked and set in the ntfs inode as done for compressed files and
281 the compressed size needs to be used for vfs inode->i_blocks instead
282 of the allocated size, again, as done for compressed files.
283 - Add AT_EA in addition to AT_DATA to whitelist for being allowed to be
284 non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident().
285 - Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new
286 write code.
287 - Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after
288 dropping the read lock and taking the write lock we were not checking
289 whether someone else did not already do the work we wanted to do.
290 - Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to
291 ntfs_attr_find_vcn_nolock() and update all callers.
292 - Add fs/ntfs/attrib.[hc]::ntfs_attr_make_non_resident().
293 - Fix sign of various error return values to be negative in
294 fs/ntfs/lcnalloc.c.
295 - Modify ->readpage and ->writepage (fs/ntfs/aops.c) so they detect and
296 handle the case where an attribute is converted from resident to
297 non-resident by a concurrent file write.
298 - Remove checks for NULL before calling kfree() since kfree() does the
299 checking itself. (Jesper Juhl)
300 - Some utilities modify the boot sector but do not update the checksum.
301 Thus, relax the checking in fs/ntfs/super.c::is_boot_sector_ntfs() to
302 only emit a warning when the checksum is incorrect rather than
303 refusing the mount. Thanks to Bernd Casimir for pointing this
304 problem out.
305 - Update attribute definition handling.
306 - Add NTFS_MAX_CLUSTER_SIZE and NTFS_MAX_PAGES_PER_CLUSTER constants.
307 - Use NTFS_MAX_CLUSTER_SIZE in super.c instead of hard coding 0x10000.
308 - Use MAX_BUF_PER_PAGE instead of variable sized array allocation for
309 better code generation and one less sparse warning in fs/ntfs/aops.c.
310 - Remove spurious void pointer casts from fs/ntfs/. (Pekka Enberg)
311 - Use C99 style structure initialization after memory allocation where
312 possible (fs/ntfs/{attrib.c,index.c,super.c}). Thanks to Al Viro and
313 Pekka Enberg.
314 - Stamp the transaction log ($UsnJrnl), aka user space journal, if it
315 is active on the volume and we are mounting read-write or remounting
316 from read-only to read-write.
317 - Fix a bug in address space operations error recovery code paths where
318 if the runlist was not mapped at all and a mapping error occurred we
319 would leave the runlist locked on exit to the function so that the
320 next access to the same file would try to take the lock and deadlock.
321 - Detect the case when Windows has been suspended to disk on the volume
322 to be mounted and if this is the case do not allow (re)mounting
323 read-write. This is done by parsing hiberfil.sys if present.
324 - Fix several occurrences of a bug where we would perform 'var & ~const'
325 with a 64-bit variable and an int, i.e. 32-bit, constant. This causes
326 the higher order 32-bits of the 64-bit variable to be zeroed. To fix
327 this cast the 'const' to the same 64-bit type as 'var'.
328 - Change the runlist terminator of the newly allocated cluster(s) to
329 LCN_ENOENT in ntfs_attr_make_non_resident(). Otherwise the runlist
330 code gets confused.
331 - Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs()
332 and ntfs_mapping_pairs_build() to allow the runlist encoding to be
333 partial which is desirable when filling holes in sparse attributes.
334 Update all callers.
335 - Change ntfs_map_runlist_nolock() to only decompress the mapping pairs
336 if the requested vcn is inside it. Otherwise we get into problems
337 when we try to map an out of bounds vcn because we then try to map
338 the already mapped runlist fragment which causes
339 ntfs_mapping_pairs_decompress() to fail and return error. Update
340 ntfs_attr_find_vcn_nolock() accordingly.
341 - Fix a nasty deadlock that appeared in recent kernels.
342 The situation: VFS inode X on a mounted ntfs volume is dirty. For
343 same inode X, the ntfs_inode is dirty and thus corresponding on-disk
344 inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
345 to the table of inodes, i.e. $MFT, inode 0.
346 What happens:
347 Process 1: sys_sync()/umount()/whatever... calls
348 __sync_single_inode() for $MFT -> do_writepages() -> write_page for
349 the dirty page containing the on-disk inode X, the page is now locked
350 -> ntfs_write_mst_block() which clears PageUptodate() on the page to
351 prevent anyone else getting hold of it whilst it does the write out.
352 This is necessary as the on-disk inode needs "fixups" applied before
353 the write to disk which are removed again after the write and
354 PageUptodate is then set again. It then analyses the page looking
355 for dirty on-disk inodes and when it finds one it calls
356 ntfs_may_write_mft_record() to see if it is safe to write this
357 on-disk inode. This then calls ilookup5() to check if the
358 corresponding VFS inode is in icache(). This in turn calls ifind()
359 which waits on the inode lock via wait_on_inode whilst holding the
360 global inode_lock.
361 Process 2: pdflush results in a call to __sync_single_inode for the
362 same VFS inode X on the ntfs volume. This locks the inode (I_LOCK)
363 then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
364 read_cache_page() for the page (in page cache of table of inodes
365 $MFT, inode 0) containing the on-disk inode. This page has
366 PageUptodate() clear because of Process 1 (see above) so
367 read_cache_page() blocks when it tries to take the page lock for the
368 page so it can call ntfs_read_page().
369 Thus Process 1 is holding the page lock on the page containing the
370 on-disk inode X and it is waiting on the inode X to be unlocked in
371 ifind() so it can write the page out and then unlock the page.
372 And Process 2 is holding the inode lock on inode X and is waiting for
373 the page to be unlocked so it can call ntfs_readpage() or discover
374 that Process 1 set PageUptodate() again and use the page.
375 Thus we have a deadlock due to ifind() waiting on the inode lock.
376 The solution: The fix is to use the newly introduced
377 ilookup5_nowait() which does not wait on the inode's lock and hence
378 avoids the deadlock. This is safe as we do not care about the VFS
379 inode and only use the fact that it is in the VFS inode cache and the
380 fact that the vfs and ntfs inodes are one struct in memory to find
381 the ntfs inode in memory if present. Also, the ntfs inode has its
382 own locking so it does not matter if the vfs inode is locked.
383 - Fix bug in mft record writing where we forgot to set the device in
384 the buffers when mapping them after the VM had discarded them.
385 Thanks to Martin MOKREJŠ for the bug report.
386
3872.1.22 - Many bug and race fixes and error handling improvements.
388
389 - Improve error handling in fs/ntfs/inode.c::ntfs_truncate().
390 - Change fs/ntfs/inode.c::ntfs_truncate() to return an error code
391 instead of void and provide a helper ntfs_truncate_vfs() for the
392 vfs ->truncate method.
393 - Add a new ntfs inode flag NInoTruncateFailed() and modify
394 fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
395 - Fix min_size and max_size definitions in ATTR_DEF structure in
396 fs/ntfs/layout.h to be signed.
397 - Add attribute definition handling helpers to fs/ntfs/attrib.[hc]:
398 ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and
399 ntfs_attr_can_be_resident(), which in turn use the new private helper
400 ntfs_attr_find_in_attrdef().
401 - In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the
402 mapping->private_lock around the dirtying of the buffer heads
403 analogous to the way it is done in __set_page_dirty_buffers().
404 - Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at
405 mount time as this cannot work with the current implementation.
406 - Check for location of attribute name and improve error handling in
407 general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends.
408 - In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside
409 i_size, i.e. race with truncate, invalidate the buffers on the page
410 so that they become freeable and hence the page does not leak.
411 - Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge(). (Adrian
412 Bunk)
413 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in
414 a NULL pointer dereference in the error code path when a corrupt
415 attribute was found. (Thanks to Domen Puncer for the bug report.)
416 - Add MODULE_VERSION() to fs/ntfs/super.c.
417 - Make several functions and variables static. (Adrian Bunk)
418 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates
419 buffers for the page if they are not present and then marks the
420 buffers belonging to the ntfs record dirty. This causes the buffers
421 to become busy and hence they are safe from removal until the page
422 has been written out.
423 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the
424 error handling code path that resulted in a BUG() due to trying to
425 unmap an extent mft record when the mapping of it had failed and it
426 thus was not mapped. (Thanks to Ken MacFerrin for the bug report.)
427 - Drop the runlist lock after the vcn has been read in
428 fs/ntfs/lcnalloc.c::__ntfs_cluster_free().
429 - Rewrite handling of multi sector transfer errors. We now do not set
430 PageError() when such errors are detected in the async i/o handler
431 fs/ntfs/aops.c::ntfs_end_buffer_async_read(). All users of mst
432 protected attributes now check the magic of each ntfs record as they
433 use it and act appropriately. This has the effect of making errors
434 granular per ntfs record rather than per page which solves the case
435 where we cannot access any of the ntfs records in a page when a
436 single one of them had an mst error. (Thanks to Ken MacFerrin for
437 the bug report.)
438 - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
439 where we failed to release i_mutex on the $Quota/$Q attribute inode.
440 - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
441 - Add mapping of unmapped buffers to all remaining code paths, i.e.
442 fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
443 and write_mft_record_nolock(). From now on we require that the
444 complete runlist for the mft mirror is always mapped into memory.
445 - Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror().
446 - Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block().
447 - Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a
448 resident attribute will be smaller than a page which makes the code
449 simpler. Also make the code more tolerant to concurrent ->truncate.
450
4512.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
452
453 - Implement extent mft record deallocation
454 fs/ntfs/mft.c::ntfs_extent_mft_record_free().
455 - Split runlist related functions off from attrib.[hc] to runlist.[hc].
456 - Add vol->mft_data_pos and initialize it at mount time.
457 - Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
458 ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
459 ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
460 ntfs_runlists_merge() and adapt all callers.
461 - Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
462 ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
463 and ntfs_mapping_pairs_build(), adapted from libntfs.
464 - Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
465 static and add a declaration for it to lcnalloc.h.
466 - Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
467 inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
468 cluster bitmap lock for the duration of the call.
469 - Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
470 - Implement the equivalent of memset() for an ntfs attribute in
471 fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
472 fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
473 - Remove unnecessary casts from LCN_* constants.
474 - Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
475 - Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
476 change MFT_RECORD to contain the NTFS 3.1+ specific fields.
477 - Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
478 marks all buffers belonging to an ntfs record dirty, followed by
479 marking the page the ntfs record is in dirty and also marking the vfs
480 inode containing the ntfs record dirty (I_DIRTY_PAGES).
481 - Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
482 new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
483 longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
484 - Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
485 include errors.
486 - Move the typedefs for runlist_element and runlist from types.h to
487 runlist.h and fix resulting include errors.
488 - Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
489 - Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
490 mark_ntfs_record_dirty() which also changes the behaviour in that we
491 now set the buffers belonging to the mft record dirty as well as the
492 page itself.
493 - Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
494 to cope with the fact that there now are dirty buffers in mft pages.
495 - Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
496 mark_ntfs_record_dirty() and thus to set the buffers belonging to the
497 mft record dirty as well as the page itself.
498 - Fix compiler warnings on x86-64 in fs/ntfs/dir.c. (Randy Dunlap,
499 slightly modified by me)
500 - Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
501 the mft record is already locked and otherwise behaves the same way
502 as fs/ntfs/mft.c::map_mft_record().
503 - Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
504 writes the mft record if the buffers belonging to it are dirty.
505 Otherwise we assume that it was written out by other means already.
506 - Attempting to write outside initialized size is _not_ a bug so remove
507 the bug check from fs/ntfs/aops.c::ntfs_write_mst_block(). It is in
508 fact required to write outside initialized size when preparing to
509 extend the initialized size.
510 - Map the page instead of using page_address() before writing to it in
511 fs/ntfs/aops.c::ntfs_mft_writepage().
512 - Provide exclusion between opening an inode / mapping an mft record
513 and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
514 by setting the page not uptodate throughout ntfs_mft_writepage().
515 - Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
516 to ensure no one can see the page whilst the mst fixups are applied.
517 - Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
518 checks if an mft record may be written out safely obtaining any
519 necessary locks in the process. This is used by
520 fs/ntfs/aops.c::ntfs_write_mst_block().
521 - Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
522 writing mft records and improve its error handling in the process.
523 Now if any of the records in the page fail to be written out, all
524 other records will be written out instead of aborting completely.
525 - Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
526 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
527 ntfs_mst_aops for all inodes which are NInoMstProtected() and
528 ntfs_aops for all other inodes.
529 - Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
530 ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
531 no longer require an ntfs inode to be present. Update all callers.
532 - Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
533 - Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
534 to ensure no one can see the page whilst the mst fixups are applied.
535 - Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
536 fs/ntfs/mft.c::try_map_mft_record().
537 - Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
538 with the ntfs inode which contains the page rather than the ntfs
539 inode the mft record of which is in the page.
540 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
541 index inode bitmap inode release code from there to
542 fs/ntfs/inode.c::ntfs_clear_big_inode(). (Thanks to Christoph
543 Hellwig for spotting this.)
544 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
545 inode semaphore around the code that sets ni->itype.index.bmp_ino to
546 NULL and reorganize the code to optimize it a bit. (Thanks to
547 Christoph Hellwig for spotting this.)
548 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
549 ntfs inode as a parameter as this is confusing and misleading and the
550 needed ntfs inode is available via NTFS_I(page->mapping->host).
551 Adapt all callers to this change.
552 - Modify fs/ntfs/mft.c::write_mft_record_nolock() and
553 fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
554 of the first buffer in a record and to take this as the ntfs record
555 dirty state. We cannot look at the dirty state for subsequent
556 buffers because we might be racing with
557 fs/ntfs/aops.c::mark_ntfs_record_dirty().
558 - Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
559 inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
560 add a declaration for it to inode.h. Fix some compilation issues
561 that resulted due to #includes and header file interdependencies.
562 - Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
563 - Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
564 - Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
565 record sequence number if it is specified (i.e. not zero).
566 - Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
567 functions used by it.
568 - Update Documentation/filesystems/ntfs.txt with instructions on how to
569 use the Device-Mapper driver with NTFS ftdisk/LDM raid. This removes
570 the linear raid problem with the Software RAID / MD driver when one
571 or more of the devices has an odd number of sectors.
572
5732.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
574
575 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
576 where we did not clear ctx->al_entry but it was still set due to
577 changes in ntfs_attr_lookup() and ntfs_external_attr_find() in
578 particular.
579 - Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find()
580 where we forgot to unmap the extent mft record when we had finished
581 enumerating an attribute which caused a bug check to trigger when the
582 VFS calls ->clear_inode.
583
5842.1.19 - Many cleanups, improvements, and a minor bug fix.
585
586 - Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to
587 change the uid, gid, and mode of an inode as we do not support NTFS
588 ACLs yet.
589 - Remove BKL use from ntfs_setattr() syncing up with the rest of the
590 kernel.
591 - Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir()
592 and ntfs_filldir() as per suggestion from Al Viro.
593 - Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds.
594 - Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the
595 inode size has changed and to only output an error if so.
596 - Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size().
597 - Add le{16,32,64} as well as sle{16,32,64} data types to
598 fs/ntfs/types.h.
599 - Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h.
600 - Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN,
601 respectively, to fs/ntfs/types.h.
602 - Update endianness conversion macros in fs/ntfs/endian.h to use the
603 new types as appropriate.
604 - Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c
605 and index.c.
606 - Add leMFT_REF data type to fs/ntfs/layout.h.
607 - Update all NTFS header files with the new little endian data types.
608 Affected files are fs/ntfs/layout.h, logfile.h, and time.h.
609 - Do proper type casting when using ntfs_is_*_recordp() in
610 fs/ntfs/logfile.c, mft.c, and super.c.
611 - Fix all the sparse bitwise warnings. Had to change all the typedef
612 enums storing little endian values to simple enums plus a typedef for
613 the datatype to make sparse happy.
614 - Fix a bug found by the new sparse bitwise warnings where the default
615 upcase table was defined as a pointer to wchar_t rather than ntfschar
616 in fs/ntfs/ntfs.h and super.c.
617 - Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro.
618
2.1.18 - Fix scheduling latencies at mount time as well as an endianness bug.

	- Remove vol->nr_mft_records as it was pretty meaningless and optimize
	  the calculation of total/free inodes as used by statfs().
	- Fix scheduling latencies in ntfs_fill_super() by dropping the BKL
	  because the code itself is using the ntfs_lock semaphore which
	  provides safe locking. (Ingo Molnar)
	- Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that
	  could occur in the future for when we start closing/freeing extent
	  inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after
	  we free it.
	- Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as
	  find_external_attr() to ntfs_external_attr_find() to cleanup the
	  namespace a bit and to be more consistent with libntfs.
	- Rename {{re,}init,get,put}_attr_search_ctx() to
	  ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type
	  attr_search_context to ntfs_attr_search_ctx.
	- Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching
	  for the attribute list attribute itself.
	- Fix endianness bug in ntfs_external_attr_find().
	- Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT
	  if the attribute is not found, and -EIO on real error. In the case
	  of -ENOENT, the search context is updated to describe the attribute
	  before which the attribute being searched for would need to be
	  inserted if such an action were to be desired and in the case of
	  ntfs_external_attr_find() the search context is also updated to
	  indicate the attribute list entry before which the attribute list
	  entry of the attribute being searched for would need to be inserted
	  if such an action were to be desired. Also make ntfs_find_attr()
	  static and remove its prototype from attrib.h as it is not used
	  anywhere other than attrib.c. Update ntfs_attr_lookup() and all
	  callers of ntfs_{external,}attr_{find,lookup}() for the new return
	  values. (See the sketch after this entry.)
	- Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode().

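	To illustrate the new return convention, a hypothetical caller might
	look like this (argument list abbreviated and illustrative; see
	fs/ntfs/attrib.h for the real prototypes):

		ntfs_attr_search_ctx *ctx;
		int err;

		/* ctx set up beforehand via ntfs_attr_get_search_ctx(). */
		err = ntfs_attr_lookup(AT_DATA, NULL, 0, CASE_SENSITIVE, 0,
				NULL, 0, ctx);
		if (!err) {
			/* Found: ctx->attr points at the attribute record. */
		} else if (err == -ENOENT) {
			/*
			 * Not found: ctx now describes the attribute in
			 * front of which the one searched for would need
			 * to be inserted.
			 */
		} else {
			/* err == -EIO: real error, e.g. corrupt mft record. */
		}
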
2.1.17 - Fix bugs in mount time error code paths and other updates.

	- Implement bitmap modification code (fs/ntfs/bitmap.[hc]). This
	  includes functions to set/clear a single bit or a run of bits. (A
	  sketch of the helpers follows this entry.)
	- Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked
	  runlist element containing a particular vcn. It also takes care of
	  mapping any needed runlist fragments.
	- Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]).
	- Load attribute definition table from $AttrDef at mount time.
	- Fix bugs in mount time error code paths involving (de)allocation of
	  the default and volume upcase tables.
	- Remove ntfs_nr_mounts as it is no longer used.

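	The bitmap helpers might plausibly have the following shape (a
	hypothetical sketch; the real prototypes live in fs/ntfs/bitmap.h and
	may differ). The bitmap attribute is accessed through its attribute
	inode vi via the page cache:

		/* Set or clear the single bit @bit in the bitmap of @vi. */
		int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit);
		int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit);

		/* Set or clear the run of @count bits starting at @start_bit. */
		int ntfs_bitmap_set_bits_in_run(struct inode *vi,
				const s64 start_bit, const s64 count);
		int ntfs_bitmap_clear_bits_in_run(struct inode *vi,
				const s64 start_bit, const s64 count);
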
2.1.16 - Implement access time updates, file sync, async io, and read/writev.

	- Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c).
	  This is done by setting the appropriate file operations pointers to
	  the generic helper functions provided by mm/filemap.c. (See the
	  sketch after this entry.)
	- Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c)
	  and directories (fs/ntfs/dir.c).
	- Add support for {a,m,c}time updates to inode.c::ntfs_write_inode().
	  Note, except for the root directory and any other system files opened
	  by the user, the system files will not have their access times
	  updated as they are only accessed at the inode level and hence the
	  file level functions which cause the times to be updated are never
	  invoked.

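	A minimal sketch of what wiring in the generic helpers looks like
	(illustrative only, using C99 initializers; the real table is in
	fs/ntfs/file.c, the driver method names here are assumptions, and the
	exact set of members depends on the kernel version):

		struct file_operations ntfs_file_ops = {
			.llseek		= generic_file_llseek,
			.read		= generic_file_read,
			.readv		= generic_file_readv,
			.aio_read	= generic_file_aio_read,
			.mmap		= generic_file_mmap,
			.open		= ntfs_file_open,
			.fsync		= ntfs_file_fsync,
		};
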
2.1.15 - Invalidate quotas when (re)mounting read-write.

	- Add new element itype.index.collation_rule to the ntfs inode
	  structure and set it appropriately in ntfs_read_locked_inode().
	- Implement a new inode type "index" to allow efficient access to the
	  indices found in various system files and adapt inode handling
	  accordingly (fs/ntfs/inode.[hc]). An index inode is essentially an
	  attribute inode (NInoAttr() is true) with an attribute type of
	  AT_INDEX_ALLOCATION. As such, it is no longer allowed to call
	  ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as
	  there would be no way to distinguish between normal attribute inodes
	  and index inodes. The function to obtain an index inode is
	  ntfs_index_iget() and it uses the helper function
	  ntfs_read_locked_index_inode(). Note, we do not overload
	  ntfs_attr_iget() as indices consist of multiple attributes so using
	  ntfs_attr_iget() to obtain an index inode would be confusing.
	- Ensure that there is no overflow when doing page->index <<
	  PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c.
	- Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page()
	  and ntfs_read_block().
	- Use case sensitive attribute lookups instead of case insensitive ones.
	- Lock all page cache pages belonging to mst protected attributes while
	  accessing them to ensure we never see corrupt data while the page is
	  under writeout.
	- Add framework for generic ntfs collation (fs/ntfs/collation.[hc]).
	  We have ntfs_is_collation_rule_supported() to check if the collation
	  rule you want to use is supported and ntfs_collation() which actually
	  collates two data items. We currently only support COLLATION_BINARY
	  and COLLATION_NTOFS_ULONG but support for other collation rules will
	  be added as the need arises.
	- Add a new type, ntfs_index_context, to allow retrieval of an index
	  entry using the corresponding index key. To get an index context,
	  use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put().
	  This also adds a new slab cache for the index contexts. To lookup a
	  key in an index inode, use ntfs_index_lookup(). After modifying an
	  index entry, call ntfs_index_entry_flush_dcache_page() followed by
	  ntfs_index_entry_mark_dirty() to ensure the changes are written out
	  to disk. For details see fs/ntfs/index.[hc]. Note, at present, if
	  an index entry is in the index allocation attribute rather than the
	  index root attribute it will not be written out (you will get a
	  warning message about discarded changes instead). (A usage sketch
	  follows this entry.)
	- Load the quota file ($Quota) and check if quota tracking is enabled
	  and if so, mark the quotas out of date. This causes Windows to
	  rescan the volume on boot and update all quota entries.
	- Add a set_page_dirty address space operation for ntfs_m[fs]t_aops.
	  It is simply set to __set_page_dirty_nobuffers() to make sure that
	  running set_page_dirty() on a page containing mft/ntfs records will
	  not affect the dirty state of the page buffers.
	- Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all
	  buffers that are inside the ntfs record in the page dirty after which
	  it sets the page dirty. This allows ->writepage to only write the
	  dirty index records rather than having to write all the records in
	  the page. Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to
	  use this rather than __set_page_dirty_nobuffers().
	- Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the
	  writing of page cache pages belonging to mst protected attributes
	  like the index allocation attribute in directory indices and other
	  indices like $Quota/$Q, etc. This means that the quota is now marked
	  out of date on all volumes rather than only on ones where the quota
	  defaults entry is in the index root attribute of the $Quota/$Q index.

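	A sketch of the index context usage pattern described above (function
	names as given in this entry; the surrounding declarations and error
	handling are illustrative):

		ntfs_index_context *ictx;
		int err;

		/* idx_ni is the index inode, key/key_len the index key. */
		ictx = ntfs_index_ctx_get(idx_ni);
		if (!ictx)
			return -ENOMEM;
		err = ntfs_index_lookup(key, key_len, ictx);
		if (!err) {
			/* Modify the found index entry here, then... */
			ntfs_index_entry_flush_dcache_page(ictx);
			ntfs_index_entry_mark_dirty(ictx);
		}
		ntfs_index_ctx_put(ictx);
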
2.1.14 - Fix an NFSd caused deadlock reported by several users.

	- Modify fs/ntfs/dir.c::ntfs_readdir() to copy the index root attribute
	  value to a buffer so that we can put the search context and unmap the
	  mft record before calling the filldir() callback. We need to do this
	  because of NFSd which calls ->lookup() from its filldir() callback,
	  and this causes NTFS to deadlock: ntfs_lookup() maps the mft record
	  of the directory and since ntfs_readdir() has got it mapped already
	  ntfs_lookup() deadlocks.

2.1.13 - Enable overwriting of resident files and housekeeping of system files.

	- Implement writing of mft records (fs/ntfs/mft.[hc]), which includes
	  keeping the mft mirror in sync with the mft when mirrored mft records
	  are written. The functions are write_mft_record{,_nolock}(). The
	  implementation is quite rudimentary for now with lots of things not
	  implemented yet but I am not sure any of them can actually occur so
	  I will wait for people to hit each one and only then implement it.
	- Commit open system inodes at umount time. This should make it
	  virtually impossible for sync_mft_mirror_umount() to ever be needed.
	- Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the
	  ntfs super operations. This gives us inode writing via the VFS inode
	  dirty code paths. Note: Access time updates are not implemented yet.
	- Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make
	  fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus
	  finally enabling resident file overwrite! (-8 This also includes a
	  placeholder for ->writepage (ntfs_mft_writepage()), which for now
	  just redirties the page and returns. Also, at umount time, we for
	  now throw away all mft data page cache pages after the last call to
	  ntfs_commit_inode() in the hope that all inodes will have been
	  written out by then and hence no dirty (meta)data will be lost. We
	  also check for this case and emit an error message telling the user
	  to run chkdsk.
	- Use set_page_writeback() and end_page_writeback() in the resident
	  attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise
	  the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the
	  page is clean.
	- Implement ntfs_mft_writepage() so it now checks if any of the mft
	  records in the page are dirty and if so redirties the page and
	  returns. Otherwise it just returns (after doing set_page_writeback(),
	  unlock_page(), and end_page_writeback(), without which the radix-tree
	  tag PAGECACHE_TAG_DIRTY would remain set even though the page is
	  clean), thus allowing the VM to do with the page as it pleases. Also,
	  at umount time, now only throw away dirty mft (meta)data pages if
	  dirty inodes are present and ask the user to email us if they see
	  this happening.
	- Add functions ntfs_{clear,set}_volume_flags(), to modify the volume
	  information flags (fs/ntfs/super.c).
	- Mark the volume dirty when (re)mounting read-write and mark it clean
	  when unmounting or remounting read-only. If any volume errors are
	  found, the volume is left marked dirty to force chkdsk to run.
	- Add code to set the NT4 compatibility flag when (re)mounting
	  read-write for newer NTFS versions but leave it commented out for now
	  since we do not make any modifications that are NTFS 1.2 specific yet
	  and since setting this flag breaks Captive-NTFS which is not nice.
	  This code must be enabled once we start writing NTFS 1.2 specific
	  changes, otherwise the Windows NTFS driver might crash / cause
	  corruption.

2.1.12 - Fix the second fix to the decompression engine and some cleanups.

	- Add a new address space operations struct, ntfs_mst_aops, for mst
	  protected attributes. This is because the default ntfs_aops do not
	  make sense with mst protected data and were they to write anything to
	  such an attribute they would cause data corruption so we provide
	  ntfs_mst_aops which does not have any write related operations set.
	- Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also
	  includes an adapted ntfs_commit_inode() and an implementation of
	  ntfs_write_inode() which for now just cleans dirty inodes without
	  writing them (it does emit a warning that this is happening).
	- Undo the second decompression engine fix (see 2.1.9 release ChangeLog
	  entry) as it was only fixing a theoretical bug but at the same time
	  it badly broke the handling of sparse and uncompressed compression
	  blocks.

2.1.11 - Driver internal cleanups.

	- Only build logfile.o if building the driver with read-write support.
	- Really final white space cleanups.
	- Use generic_ffs() instead of ffs() in logfile.c which allows the
	  log_page_size variable to be optimized by gcc into a constant.
	- Rename uchar_t to ntfschar everywhere as uchar_t is unsigned 1-byte
	  char as defined by POSIX and as found on some systems.

2.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags.

	- Finish off the white space cleanups (remove trailing spaces, etc).
	- Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing
	  the kludges around the first iget(). Instead of (re)setting ->s_op
	  we have the $MFT inode set up by explicit new_inode() / set ->i_ino /
	  insert_inode_hash() / call ntfs_read_inode_mount() directly. This
	  kills the need for second super_operations and allows us to return
	  an error from ntfs_read_inode_mount() without resorting to ugly
	  "poisoning" tricks. (Al Viro)
	- Force read-only (re)mounting if any of the following bits are set in
	  the volume information flags:
		VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE,
		VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY,
		VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK
	  To make this easier we define VOLUME_MUST_MOUNT_RO_MASK with all the
	  above bits set so the test is made easy. (See the sketch after this
	  entry.)

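	The combined mask and its use might look roughly like this (a sketch;
	in the real fs/ntfs/layout.h the flag constants are little endian
	values, which is omitted here for brevity):

		#define VOLUME_MUST_MOUNT_RO_MASK	\
			(VOLUME_IS_DIRTY		| \
			 VOLUME_RESIZE_LOG_FILE		| \
			 VOLUME_UPGRADE_ON_MOUNT	| \
			 VOLUME_DELETE_USN_UNDERWAY	| \
			 VOLUME_REPAIR_OBJECT_ID	| \
			 VOLUME_MODIFIED_BY_CHKDSK)

		if (vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
			/* Refuse read-write, force a read-only (re)mount. */
		}
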
2.1.9 - Fix two bugs in decompression engine.

	- Fix a bug where we would not always detect that we have reached the
	  end of a compression block because we were ending at minus one byte
	  which is effectively the same as being at the end. The fix is to
	  check whether the uncompressed buffer has been fully filled and if so
	  we assume we have reached the end of the compression block. A big
	  thank you to Marcin Gibuła for the bug report, the assistance in
	  tracking down the bug and testing the fix.
	- Fix a possible bug where when a compressed read is truncated to the
	  end of the file, the offset inside the last page was not truncated.

2.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups.

	- Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c.
	- Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and
	  utc2ntfs() to work with struct timespec instead of time_t on the
	  Linux UTC time side thus preserving the full precision of the NTFS
	  time and only losing up to 99 nanoseconds in the Linux UTC time.
	  (A sketch of such a conversion follows this entry.)
	- Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions
	  static inline.
	- Remove unused ntfs_dirty_inode().
	- Cleanup super operations declaration in fs/ntfs/super.c.
	- Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
	- Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to
	  fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty.
	- Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to
	  fs/ntfs/inode.h so they can be used elsewhere.
	- Determine the mft mirror size as the number of mirrored mft records
	  and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c).
	- Load the mft mirror at mount time and compare the mft records stored
	  in it to the ones in the mft. Force a read-only mount if the two do
	  not match (fs/ntfs/super.c).
	- Fix type casting related warnings on 64-bit architectures. Thanks
	  to Meelis Roos for reporting them.
	- Move %L to %ll as %L is floating point and %ll is integer which is
	  what we want.
	- Read the journal ($LogFile) and determine if the volume has been
	  shut down cleanly and force a read-only mount if not (fs/ntfs/super.c
	  and fs/ntfs/logfile.c). This is a little bit of a crude check in
	  that we only look at the restart areas and not at the actual log
	  records so that there will be a very small number of cases where we
	  think that a volume is dirty when in fact it is clean. This should
	  only affect volumes that have not been shut down cleanly and did not
	  have any pending, non-check-pointed i/o.
	- If the $LogFile indicates a clean shutdown and a read-write (re)mount
	  is requested, empty $LogFile by overwriting it with 0xff bytes to
	  ensure that Windows cannot cause data corruption by replaying a stale
	  journal after Linux has written to the volume.

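	NTFS timestamps count 100-nanosecond intervals since 1601-01-01, so a
	struct timespec (1ns resolution) can represent an NTFS time exactly,
	while going the other way truncates to the nearest 100ns boundary,
	i.e. loses at most 99ns. A minimal sketch of the NTFS to Linux
	direction (assuming NTFS_TIME_OFFSET is the number of 100ns intervals
	between 1601 and the Unix epoch; the real code is in fs/ntfs/time.h
	and may differ):

		static inline struct timespec ntfs2utc(const sle64 time)
		{
			struct timespec ts;
			/* 100ns intervals since the Unix epoch. */
			u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);

			/* Split into whole seconds and leftover 100ns units. */
			ts.tv_nsec = do_div(t, 10000000) * 100;
			ts.tv_sec = t;
			return ts;
		}
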
2.1.7 - Enable NFS exporting of mounted NTFS volumes.

	- Set i_generation in the VFS inode from the seq_no of the NTFS inode.
	- Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc.
	- Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the
	  default doesn't allow inode number 0 which is a valid inode on NTFS
	  and even if it did allow that it uses iget() instead of ntfs_iget()
	  which makes it useless for us.
	- Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the
	  default just returns -EACCES which is not very useful.
	- Define export operations (->s_export_op) for NTFS (ntfs_export_ops)
	  and set them up in the super block at mount time (super.c). This
	  allows mounted NTFS volumes to be exported via NFS. (See the sketch
	  after this entry.)
	- Add missing return -EOPNOTSUPP; in
	  fs/ntfs/aops.c::ntfs_commit_nonresident_write().
	- Enforce no atime and no dir atime updates at mount/remount time as
	  they are not implemented yet anyway.
	- Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to
	  after a NULL check. Thanks to Dave Jones for pointing this out.

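	A sketch of the export operations wiring (the two methods are the ones
	named above; the exact set of struct export_operations members depends
	on the kernel version):

		static struct export_operations ntfs_export_ops = {
			.get_parent	= ntfs_get_parent,
			.get_dentry	= ntfs_get_dentry,
		};

		/* At mount time, e.g. in ntfs_fill_super(): */
		sb->s_export_op = &ntfs_export_ops;
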
2.1.6 - Fix minor bug in handling of compressed directories.

	- Fix bug in handling of compressed directories. A compressed
	  directory is not really compressed so when we set the ->i_blocks
	  field of a compressed directory inode we were setting it from the
	  non-existing field ni->itype.compressed.size which gave random
	  results... For directories we now always use ni->allocated_size.

2.1.5 - Fix minor bug in attribute list attribute handling.

	- Fix bug in attribute list handling. Actually it is not as much a bug
	  as too much protection in that we were not allowing attribute lists
	  which waste space on disk while Windows XP clearly allows it and in
	  fact creates such attribute lists so our driver was failing.
	- Update NTFS documentation ready for 2.6 kernel release.

2.1.4 - Reduce compiler requirements.

	- Remove all uses of unnamed structs and unions in the driver to make
	  old and newer gcc versions happy. Makes it a bit uglier IMO but at
	  least people will stop hassling me about it.

2.1.3 - Important bug fixes in corner cases.

	- super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit
	  clusters. (Philipp Thomas)
	- attrib.c::load_attribute_list(): Fix bug when initialized_size is a
	  multiple of the block_size but not the cluster size. (Szabolcs
	  Szakacsits)

2.1.2 - Important bug fixes alleviating the hangs in statfs.

	- Fix buggy free cluster and free inode determination logic.

2.1.1 - Minor updates.

	- Add handling for initialized_size != data_size in compressed files.
	- Reduce function local stack usage from 0x3d4 bytes to just noise in
	  fs/ntfs/upcase.c. (Randy Dunlap)
	- Remove compiler warnings for newer gcc.
	- Pages are no longer kmapped by mm/filemap.c::generic_file_write()
	  around calls to ->{prepare,commit}_write. Adapt NTFS appropriately
	  in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using
	  kmap_atomic(KM_USER0).

2.1.0 - First steps towards write support: implement file overwrite.

	- Add configuration option for developmental write support with an
	  appropriately scary configuration help text.
	- Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its
	  helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based
	  overwriting of existing files on ntfs. Note: Resident files are
	  only written into memory, and not written out to disk at present, so
	  avoid writing to files smaller than about 1kiB.
	- Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its
	  helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their
	  counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and
	  fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also,
	  add generic_file_write() to the ntfs file operations (fs/ntfs/file.c).
	  This enables write(2) based overwriting of existing files on ntfs.
	  Note: As with mmap(2) based overwriting, resident files are only
	  written into memory, and not written out to disk at present, so avoid
	  writing to files smaller than about 1kiB.
	- Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and
	  ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for
	  files with the purpose of intercepting and aborting all i_size
	  changes which we do not support yet. ntfs_truncate() actually only
	  emits a warning message but AFAICS our interception of i_size changes
	  elsewhere means ntfs_truncate() never gets called for i_size changes.
	  It is only called from generic_file_write() when we fail in
	  ntfs_prepare_{,nonresident_}write() in order to discard any
	  instantiated buffers beyond i_size. Thus i_size is not actually
	  changed so our warning message is enough. Unfortunately it is not
	  possible to easily determine if i_size is being changed or not hence
	  we just emit an appropriately worded error message.

2.0.25 - Small bug fixes and cleanups.

	- Unlock the page in an out of memory error code path in
	  fs/ntfs/aops.c::ntfs_read_block().
	- If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page,
	  just unlock the page and return. (This can happen due to ->writepage
	  clearing PageUptodate() during write out of MstProtected()
	  attributes.)
	- Remove leaked write code again.

2.0.24 - Cleanups.

	- Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects
	  inside BUG_ON(). (Adam J. Richter)
	- Split logical OR expressions inside BUG_ON() into individual BUG_ON()
	  calls for improved debugging. (Adam J. Richter) (See the example
	  after this entry.)
	- Add errors flag to the ntfs volume state, accessed via
	  NVol{,Set,Clear}Errors(vol).
	- Do not allow read-write remounts of read-only volumes with errors.
	- Clarify comment for ntfs file operation sendfile which was added by
	  Christoph Hellwig a while ago (just using generic_file_sendfile())
	  to say that ntfs ->sendfile is only used for the case where the
	  source data is on the ntfs partition and the destination is
	  somewhere else, i.e. nothing we need to concern ourselves with.
	- Add generic_file_write() as our ntfs file write operation.

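	A hypothetical before/after example of the BUG_ON() changes above
	(illustrative code, not taken from the driver):

		/* Before: side effect and OR-ed conditions inside BUG_ON(). */
		BUG_ON(do_setup(x) < 0 || y < 0);

		/* After: no side effects, and one condition per BUG_ON() so
		 * the reported line number pinpoints which check fired. */
		err = do_setup(x);
		BUG_ON(err < 0);
		BUG_ON(y < 0);
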
2.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures).

	- Massive internal locking changes to mft record locking. Fixes lock
	  recursion and replaces the mrec_lock read/write semaphore with a
	  mutex. Also removes the now superfluous mft_count. This fixes several
	  race conditions and deadlocks, especially in the future write code.
	- Fix ntfs over loopback for compressed files by adding an
	  optimization barrier. (gcc was screwing up otherwise?)
	- Miscellaneous cleanups all over the code and a fix or two in error
	  handling code paths.
	  Thanks go to Christoph Hellwig for pointing out the following two:
	- Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
	- Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too.

2.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers.

	- Change fs/ntfs/dir.c::ntfs_readdir() to only read/write ->f_pos once
	  at entry/exit respectively.
	- Use C99 initializers for structures.
	- Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block().

2.0.21 - Check for, and refuse to work with too large files/directories/volumes.

	- Limit volume size at mount time to 2TiB on architectures where
	  unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()).
	  This is the most we can do without overflowing the 32-bit limit of
	  the block device size imposed on us by sb_bread() and sb_getblk()
	  for the time being. (2^32 blocks of 512 bytes each is 2TiB.)
	- Limit file/directory size at open() time to 16TiB on architectures
	  where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and
	  fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without
	  overflowing the page cache page index. (2^32 pages of 4kiB each is
	  16TiB.)

2.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir.

	- Move the directory index bitmap to use an attribute inode instead of
	  having special fields for it inside the ntfs inode structure. This
	  means that the index bitmaps now use the page cache for i/o, too,
	  and also as a side effect we get support for non-resident index
	  bitmaps for free.
	- Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and
	  fix a page leak that manifested itself in some cases.
	- Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the
	  index bitmap inode on the final iput().

2.0.19 - Fix race condition, improvements, and optimizations in i/o interface.

	- Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block()
	  to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well.
	- Drop the "file" from ntfs_file_read_compressed_block().
	- Rename fs/ntfs/aops.c::ntfs_enb_buffer_read_async() to
	  ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
	- Update ntfs_end_buffer_async_read() with the improved logic from
	  its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply
	  further logic improvements to better determine when we set PageError.
	- Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to
	  check for the buffers being uptodate first in line with the updated
	  fs/buffer.c::block_read_full_page(). This plugs a small race
	  condition.

2.0.18 - Fix race condition in reading of compressed files.

	- There was a narrow window between checking a buffer head for being
	  uptodate and locking it in ntfs_file_read_compressed_block(). We now
	  lock the buffer and then check whether it is uptodate or not.

2.0.17 - Cleanups and optimizations - shrinking the ToDo list.

	- Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error
	  code and update callers, i.e. ntfs_iget(), to pass that error code
	  up instead of just using -EIO.
	- Modifications to super.c to ensure that both mount and remount
	  cannot set any write related options when the driver is compiled
	  read-only.
	- Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to
	  cache the current runlist element. This should improve performance
	  when reading very large and/or very fragmented data.

2.0.16 - Convert access to $MFT/$BITMAP to attribute inode API.

	- Fix a stupid bug introduced in 2.0.15 where we were unmapping the
	  wrong inode in fs/ntfs/inode.c::ntfs_attr_iget().
	- Fix debugging check in fs/ntfs/aops.c::ntfs_read_block().
	- Convert $MFT/$BITMAP access to attribute inode API and remove all
	  remnants of the ugly mftbmp address space and operations hack. This
	  means we finally have only one readpage function as well as only one
	  async io completion handler. Yey! The mft bitmap is now just an
	  attribute inode and is accessed from vol->mftbmp_ino just as if it
	  were a normal file. Fake inodes rule. (-:

2.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups.

	- Fix silly bug in fs/ntfs/super.c::parse_options() which was causing
	  remounts to fail when the partition had an entry in /etc/fstab and
	  the entry specified the nls= option.
	- Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to
	  expand all the helper functions NVolFoo(), NVolSetFoo(), and
	  NVolClearFoo().
	- Move copyright statement from driver initialisation message to
	  module description (fs/ntfs/super.c). This makes the initialisation
	  message fit on one line and fits in better with rest of kernel.
	- Update fs/ntfs/attrib.c::map_run_list() to work on both real and
	  attribute inodes, and both for files and directories.
	- Implement fake attribute inodes allowing all attribute i/o to go via
	  the page cache and to use all the normal vfs/mm functionality:
	  - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode()
	    to fs/ntfs/inode.c.
	  - Add needed cleanup code to ntfs_clear_big_inode().
	- Merge address space operations for files and directories (aops.c),
	  now just have ntfs_aops:
	  - Rename:
	    end_buffer_read_attr_async() -> ntfs_end_buffer_read_async(),
	    ntfs_attr_read_block() -> ntfs_read_block(),
	    ntfs_file_read_page() -> ntfs_readpage().
	  - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and
	    attribute inodes, and both for files and directories.
	  - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage().

2.0.14 - Run list merging code cleanup, minor locking changes, typo fixes.

	- Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving
	  the locking out of super.c::get_nr_free_mft_records() and taking and
	  dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself.
	- Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with
	  current userspace ntfs library code. This means that if a merge
	  fails the original runlists are always left unmodified instead of
	  being silently corrupted.
	- Misc typo fixes.

2.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups.

	- Remove nr_mft_bits and the now superfluous union with nr_mft_records
	  from ntfs_volume structure.
	- Remove nr_lcn_bits and the now superfluous union with nr_clusters
	  from ntfs_volume structure.
	- Use iget5_locked() and friends instead of conventional iget(). Wrap
	  the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget()
	  to use ntfs_iget(). Leave only one iget() call at mount time so we
	  don't need an ntfs_iget_mount().
	- Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an
	  additional argument.

2.0.12 - Initial cleanup of address space operations following 2.0.11 changes.

	- Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and
	  fs/ntfs/aops.c::end_buffer_read_file_async() into one function
	  fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected()
	  to determine whether to apply mst fixups or not.
	- Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block()
	  and fs/ntfs/aops.c::ntfs_mst_readpage() into one function
	  fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper
	  fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from
	  the VFS readpage function prototype to the ntfs_attr_read_block()
	  function prototype.

2.0.11 - Initial preparations for fake inode based attribute i/o.

	- Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and
	  do some macro magic (adapted from include/linux/buffer_head.h) to
	  expand all the helper functions NInoFoo(), NInoSetFoo(), and
	  NInoClearFoo(). (See the sketch after this entry.)
	- Add new flag to ntfs_inode_state_bits: NI_Sparse.
	- Add new fields to ntfs_inode structure to allow use of fake inodes
	  for attribute i/o: type, name, name_len. Also add new state bits:
	  NI_Attr, which, if set, indicates the inode is a fake inode, and
	  NI_MstProtected, which, if set, indicates the attribute uses multi
	  sector transfer protection, i.e. fixups need to be applied after
	  reads and before/after writes.
	- Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to
	  ntfs_{new,clear,destroy}_extent_inode() and update callers.
	- Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode()
	  instead of ntfs_destroy_extent_inode().
	- Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode().
	- Make all operations on ntfs inode state bits use the NIno* functions.
	- Set up the new ntfs inode fields and state bits in
	  fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of
	  allocated memory to __ntfs_clear_inode().
	- Cleanup ntfs_inode structure a bit for better ordering of elements
	  w.r.t. their size to allow better packing of the structure in memory.

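	The macro magic borrowed from include/linux/buffer_head.h might look
	roughly like this (a hypothetical reconstruction; the real macros in
	fs/ntfs/inode.h may differ in detail):

		#define NINO_FNS(flag)					\
		static inline int NIno##flag(ntfs_inode *ni)		\
		{							\
			return test_bit(NI_##flag, &(ni)->state);	\
		}							\
		static inline void NInoSet##flag(ntfs_inode *ni)	\
		{							\
			set_bit(NI_##flag, &(ni)->state);		\
		}							\
		static inline void NInoClear##flag(ntfs_inode *ni)	\
		{							\
			clear_bit(NI_##flag, &(ni)->state);		\
		}

		/* Expands to NInoDirty(), NInoSetDirty(), NInoClearDirty(). */
		NINO_FNS(Dirty)
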
2.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume.

	- Add check at mount time to verify that the number of inodes on the
	  volume does not exceed 2^32 - 1, which is the maximum allowed for
	  NTFS according to Microsoft.
	- Change mft_no member of ntfs_inode structure to be unsigned long.
	  Update all users. This makes ntfs_inode->mft_no just a copy of struct
	  inode->i_ino. But we can't just always use struct inode->i_ino and
	  remove mft_no because extent inodes do not have an attached struct
	  inode.

2.0.9 - Decompression engine now uses a single buffer and other cleanups.

	- Change decompression engine to use a single buffer protected by a
	  spin lock instead of per-CPU buffers. (Rusty Russell)
	- Do not update cb_pos when handling a partial final page during
	  decompression of a sparse compression block, as the value is later
	  reset without being read/used. (Rusty Russell)
	- Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew
	  Morton)
	- Change buffer size in ntfs_readdir()/ntfs_filldir() to use
	  NLS_MAX_CHARSET_SIZE which makes the buffers almost 1kiB each but
	  it also makes everything safer so it is a good thing.
	- Miscellaneous minor cleanups to comments.

2.0.8 - Major updates for handling of case sensitivity and dcache aliasing.

	Big thanks go to Al Viro and other inhabitants of #kernel for investing
	their time to discuss the case sensitivity and dcache aliasing issues.

	- Remove unused source file fs/ntfs/attraops.c.
	- Remove show_inodes mount option(s), thus dropping support for
	  displaying of short file names.
	- Remove deprecated mount option posix.
	- Restore show_sys_files mount option.
	- Add new mount option case_sensitive, to determine if the driver
	  treats file names as case sensitive or not. If case sensitive, create
	  file names in the POSIX namespace. Otherwise create file names in the
	  LONG/WIN32 namespace. Note, files remain accessible via their short
	  file name, if it exists.
	- Remove really dumb logic bug in boot sector recovery code.
	- Fix dcache aliasing issues wrt short/long file names via changes
	  to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and
	  fs/ntfs/namei.c::ntfs_lookup():
	  - Add additional argument to ntfs_lookup_inode_by_name() in which we
	    return information about the matching file name if the case is not
	    matching or the match is a short file name. See comments above the
	    function definition for details.
	  - Change ntfs_lookup() to only create dcache entries for the correctly
	    cased file name and only for the WIN32 namespace counterpart of DOS
	    namespace file names. This ensures we have only one dentry per
	    directory and also removes all dcache aliasing issues between short
	    and long file names once we add write support. See comments above
	    function for details.
	- Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls().

2.0.7 - Minor cleanups and updates for changes in core kernel code.

	- Remove much of the NULL struct element initializers.
	- Various updates to make compatible with recent kernels.
	- Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h
	  in fs/ntfs/ntfs.h instead.
	- Remove no longer needed KERNEL_VERSION checks. We are now in the
	  kernel proper so they are no longer needed.

2.0.6 - Major bugfix to make compatible with other kernel changes.

	- Initialize the mftbmp address space properly now that there are more
	  fields in the struct address_space. This was leading to hangs and
	  oopses on umount since 2.5.12 because of changes to other parts of
	  the kernel. We probably want a kernel generic init_address_space()
	  function...
	- Drop BKL from ntfs_readdir() after consultation with Al Viro. The
	  only caller of ->readdir() is vfs_readdir() which holds i_mutex
	  during the call, and i_mutex is sufficient protection against changes
	  in the directory inode (including ->i_size).
	- Use generic_file_llseek() for directories (as opposed to
	  default_llseek()) as this downs i_mutex instead of the BKL which is
	  what we now need for exclusion against ->f_pos changes considering we
	  no longer take the BKL in ntfs_readdir().

2.0.5 - Major bugfix. Buffer overflow in extent inode handling.

	- No need to set old blocksize in super.c::ntfs_fill_super() as the
	  VFS does so via invocation of deactivate_super() calling
	  fs->fill_super() calling block_kill_super() which does it.
	- BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds)
	  -> Do we really need it? I don't think so as we have exclusion on
	  the directory ntfs_inode rw_semaphore mrec_lock. We might have to
	  move the ->f_pos accesses under the mrec_lock though. Check this...
	- Fix really, really, really stupid buffer overflow in extent inode
	  handling in mft.c::map_extent_mft_record().

2.0.4 - Cleanups and updates for kernel 2.5.11.

	- Add documentation on how to use the MD driver to be able to use NTFS
	  stripe and volume sets in Linux and generally cleanup documentation
	  a bit.
	  Remove all uses of kdev_t in favour of struct block_device *:
	- Change compress.c::ntfs_file_read_compressed_block() to use
	  sb_getblk() instead of getblk().
	- Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead
	  of get_hardsect_size().
	- No need to get old blocksize in super.c::ntfs_fill_super() as
	  fs/super.c::get_sb_bdev() already does this.
	- Set bh->b_bdev instead of bh->b_dev throughout aops.c.

2.0.3 - Small bug fixes, cleanups, and performance improvements.

	- Remove some dead code from mft.c.
	- Optimize readpage and read_block functions throughout aops.c so that
	  only initialized blocks are read. Non-initialized ones have their
	  buffer head mapped, zeroed, and set up to date, without scheduling
	  any i/o. Thanks to Al Viro for advice on how to avoid the device i/o.
	  Thanks go to Andrew Morton for spotting the below:
	- Fix buglet in allocate_compression_buffers() error code path.
	- Call flush_dcache_page() after modifying page cache page contents in
	  ntfs_file_readpage().
	- Check for existence of page buffers throughout aops.c before calling
	  create_empty_buffers(). This happens when an I/O error occurs and the
	  read is retried. (It also happens once writing is implemented, so it
	  needed doing anyway, but I had left it for later...)
	- Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in
	  readpage and read_block functions. Reasoning same as above (i.e. I/O
	  error retries and future write code paths.)

2.0.2 - Minor updates and cleanups.

	- Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup
	  and cleanup the code a bit, removing the unused size parameter.
	- Change default fmask to 0177 and update documentation.
	- Change attrib.c::get_attr_search_ctx() to return the search context
	  directly instead of taking the address of a pointer. A return value
	  of NULL means the allocation failed. Updated all callers
	  appropriately.
	- Update to 2.5.9 kernel (preserving backwards compatibility) by
	  replacing all occurrences of page->buffers with page_buffers(page).
	- Fix minor bugs in runlist merging, also minor cleanup.
	- Updates to bootsector layout and mft mirror contents descriptions.
	- Small bug fix in error detection in unistr.c and some cleanups.
	- Grow name buffer allocations in unistr.c in aligned multiples of 64
	  bytes.

2.0.1 - Minor updates.

	- Make default umask correspond to documentation.
	- Improve documentation.
	- Set default mode to include execute bit. The {u,f,d}mask can be used
	  to take it away if desired. This allows binaries to be executed from
	  a mounted ntfs partition.

2.0.0 - New version number. Remove TNG from the name. Now in the kernel.

	- Add kill_super, just keeping up with the vfs changes in the kernel.
	- Repeat some changes from tng-0.0.8 that somehow got lost on the way
	  from the CVS import into BitKeeper.
	- Begin to implement proper handling of allocated_size vs
	  initialized_size vs data_size (i.e. i_size). Done are
	  mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(),
	  and attrib.c::load_attribute_list().
	- Lock the runlist in attrib.c::load_attribute_list() while using it.
	- Fix memory leak in ntfs_file_read_compressed_block() and generally
	  clean up compress.c a little, removing some uncommented/unused debug
	  code.
	- Tidy up dir.c a little bit.
	- Don't bother getting the runlist in inode.c::ntfs_read_inode().
	- Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage()
	  creating aops.c::ntfs_mst_readpage(), improving the handling of
	  holes and overflow in the process and implementing the correct
	  equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself.
	  I am aiming for correctness at the moment. Modularisation can come
	  later.
	- Rename aops.c::end_buffer_read_index_async() to
	  end_buffer_read_mst_async() and optimize the overflow checking and
	  handling.
	- Use the host of the mftbmp address space mapping to hold the ntfs
	  volume. This is needed so the async i/o completion handler can
	  retrieve a pointer to the volume. Hopefully this will not cause
	  problems elsewhere in the kernel... Otherwise will need to use a
	  fake inode.
	- Complete implementation of proper handling of allocated_size vs
	  initialized_size vs data_size (i.e. i_size) in whole driver.
	  Basically aops.c is now completely rewritten.
	- Change NTFS driver name to just NTFS and set version number to 2.0.0
	  to make a clear distinction from the old driver which is still on
	  version 1.1.22.

tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/

	- Replace bdevname(sb->s_dev) with sb->s_id.
	- Remove now superfluous new-line characters in all callers of
	  ntfs_debug().
	- Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for
	  directories. Without this the "find" utility gets very upset which is
	  fair enough as Linux/Unix do not support directory hard links.
	- Further runlist merging work. (Richard Russon)
	- Backwards compatibility for gcc-2.95. (Richard Russon)
	- Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
	- Convert to new filesystem declaration using ->ntfs_get_sb() and
	  replacing ntfs_read_super() with ntfs_fill_super().
	- Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
	  overflow on 32-bit architectures.
	- Cleanup upcase loading code to use ntfs_(un)map_page().
	- Disable/reenable preemption in critical sections of the compression
	  engine.
	- Replace device size determination in ntfs_fill_super() with
	  sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous
	  function super.c::get_nr_blocks().
	- Implement a mount time option (show_inodes) allowing choice of which
	  types of inode names readdir() returns and modify ntfs_filldir()
	  accordingly. There are several parameters to show_inodes:
		system: system files
		win32: long file names (including POSIX file names) [DEFAULT]
		long: same as win32
		dos: short file names only (excluding POSIX file names)
		short: same as dos
		posix: same as both win32 and dos
		all: all file names
	  Note that the options are additive, i.e. specifying:
		-o show_inodes=system,show_inodes=win32,show_inodes=dos
	  is the same as specifying:
		-o show_inodes=all
	  Note that the "posix" and "all" options will show all directory
	  names, BUT the link count on each directory inode entry is set to 1,
	  due to Linux not supporting directory hard links. This may well
	  confuse some userspace applications, since the directory names will
	  have the same inode numbers. Thus it is NOT advisable to use the
	  "posix" or "all" options. We provide them only for completeness sake.
	- Add copies of allocated_size, initialized_size, and compressed_size to
	  the ntfs inode structure and set them up in
	  inode.c::ntfs_read_inode(). These reflect the unnamed data attribute
	  for files and the index allocation attribute for directories.
	- Add copies of allocated_size and initialized_size to ntfs inode for
	  $BITMAP attribute of large directories and set them up in
	  inode.c::ntfs_read_inode().
	- Add copies of allocated_size and initialized_size to ntfs volume for
	  $BITMAP attribute of $MFT and set them up in
	  super.c::load_system_files().
	- Parse deprecated ntfs driver options (iocharset, show_sys_files,
	  posix, and utf8) and tell user what the new options to use are. Note
	  we still do support them but they will be removed with kernel 2.7.x.
	- Change all occurrences of integer long long printf formatting to hex
	  as printk() will not support long long integer format if/when the
	  div64 patch goes into the kernel.
	- Make slab caches have stable names and change the names to what they
	  were intended to be. These changes are required/made possible by the
	  new slab cache name handling which removes the length limitation by
	  requiring the caller of kmem_cache_create() to supply a stable name
	  which is then referenced but not copied.
	- Rename run_list structure to run_list_element and create a new
	  run_list structure containing a pointer to a run_list_element
	  structure and a read/write semaphore. Adapt all users of runlists
	  to new scheme and take and release the lock as needed. This fixes a
	  nasty race as the run_list changes even when inodes are locked for
	  reading and even when the inode isn't locked at all, so we really
	  needed the serialization. We use a semaphore rather than a spinlock
	  as memory allocations can sleep and doing everything GFP_ATOMIC
	  would be silly.
	- Cleanup read_inode() removing all code checking for lowest_vcn != 0.
	  This can never happen due to the nature of lookup_attr() and how we
	  support attribute lists. If it did happen it would imply the inode
	  being corrupt.
	- Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as
	  bad if found.
	- Update to 2.5.6-pre2 changes in struct address_space.
	- Use parent_ino() when accessing d_parent inode number in dir.c.
	- Import Sourceforge CVS repository into BitKeeper repository:
		http://linux-ntfs.bkbits.net/ntfs-tng-2.5
	- Update fs/Makefile, fs/Config.help, fs/Config.in, and
	  Documentation/filesystems/ntfs.txt for NTFS TNG.
	- Create kernel configuration option controlling whether debugging
	  is enabled or not.
	- Add the required export of end_buffer_io_sync() from the patches
	  directory to the kernel code.
	- Update inode.c::ntfs_show_options() with show_inodes mount option.
	- Update errors mount option.

tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only!

	- Cleanup mft.c and its debug/error output in particular. Fix a minor
	  bug in mapping of extent inodes. Update all the comments to fit all
	  the recent code changes.
	- Modify vcn_to_lcn() to cope with entirely unmapped runlists.
	- Cleanups in compress.c, mostly comments and folding help.
	- Implement attrib.c::map_run_list() as a generic helper.
	- Make compress.c::ntfs_file_read_compressed_block() use map_run_list()
	  thus making code shorter and enabling attribute list support.
	- Cleanup incorrect use of [su]64 with %L printf format specifier in
	  all source files. Type casts to [unsigned] long long added to correct
	  the mismatches (important for architectures which have long long not
	  being 64 bits).
	- Merge async io completion handlers for directory indexes and $MFT
	  data into one by setting the index_block_size{_bits} of the ntfs
	  inode for $MFT to the mft_record_size{_bits} of the ntfs_volume.
	- Cleanup aops.c, update comments.
	- Make ntfs_file_get_block() use map_run_list() so all files now
	  support attribute lists.
	- Make ntfs_dir_readpage() almost verbatim copy of
	  block_read_full_page() by using ntfs_file_get_block() with only real
	  difference being the use of our own async io completion handler
	  rather than the default one, thus reducing the amount of code and
	  automatically enabling attribute list support for directory indices.
	- Fix bug in load_attribute_list() - forgot to call brelse in error
	  code path.
	- Change parameters to find_attr() and lookup_attr(). We no longer
	  pass in the upcase table and its length. These can be gotten from
	  ctx->ntfs_ino->vol->upcase{_len}. Update all callers.
	- Cleanups in attrib.c.
	- Implement merging of runlists, attrib.c::merge_run_lists() and its
	  helpers. (Richard Russon)
	- Attribute lists part 2, attribute extents and multi part runlists:
	  enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of
	  further runlist parts via attrib.c::map_run_list().
	- Tiny endianness bug fix in decompress_mapping_pairs().

tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements.

	- Enable encrypted directories. (Their index root is marked encrypted
	  to indicate that new files in that directory should be created
	  encrypted.)
	- Fix bug in NInoBmpNonResident() macro. (Cut and paste error.)
	- Enable $Extend system directory. Most (if not all) extended system
	  files do not have unnamed data attributes so ntfs_read_inode() had to
	  special case them but that is ok, as the special casing recovery
	  happens inside an error code path so there is zero slow down in the
	  normal fast path. The special casing is done by introducing a new
	  function inode.c::ntfs_is_extended_system_file() which checks if any
	  of the hard links in the inode point to $Extend as being their parent
	  directory and if they do we assume this is an extended system file.
	- Create a sysctl/proc interface to allow {dis,en}abling of debug output
	  when compiled with -DDEBUG. Default is debug messages to be disabled.
	  To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug
	  (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl
	  interface is enabled). Inspired by old ntfs driver.
	- Add debug_msgs insmod/kernel boot parameter to set whether debug
	  messages are {dis,en}abled. This is useful to enable debug messages
	  during ntfs initialization and is the only way to activate debugging
	  when the sysctl interface is not enabled.
	- Cleanup debug output in various places.
	- Remove all dollar signs ($) from the source (except comments) to
	  enable compilation on architectures whose gcc compiler does not
	  support dollar signs in the names of variables/constants. Attribute
	  types now start with AT_ instead of $ and $I30 is now just I30.
	- Cleanup ntfs_lookup() and add consistency check of sequence numbers.
	- Load complete runlist for $MFT/$BITMAP during mount and cleanup
	  access functions. This means we now cope with $MFT/$BITMAP being
	  spread across several mft records.
	- Disable modification of mft_zone_multiplier on remount. We can always
	  reenable this later on if we really want to, but we will need to make
	  sure we readjust the mft_zone size / layout accordingly.

tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments.

	- Use sb_set_blocksize() instead of set_blocksize() and verify the
	  return value.
	- Use sb_bread() instead of bread() throughout.
	- Add index_vcn_size{_bits} to ntfs_inode structure to store the size
	  of a directory index block vcn. Apply resulting simplifications in
	  dir.c everywhere.
	- Fix a small bug somewhere (but forgot what it was).
	- Change ntfs_{debug,error,warning} to enable gcc to do type checking
	  on the printf-format parameter list and fix bugs reported by gcc
	  as a result. (Richard Russon)
	- Move inode allocation strategy to Al's new stuff but maintain the
	  divorce of ntfs_inode from struct inode. To achieve this we have two
	  separate slab caches, one for big ntfs inodes containing a struct
	  inode and pure ntfs inodes and at the same time fix some faulty
	  error code paths in ntfs_read_inode().
	- Show mount options in proc (inode.c::ntfs_show_options()).

tng-0.0.4 - Big changes, getting in line with Al Viro's comments.

	- Modified (un)map_mft_record functions to be common for read and write
	  case. To specify which is which, added extra parameter at front of
	  parameter list. Pass either READ or WRITE to this, each has the
	  obvious meaning.
	- General cleanups to allow for easier folding in vi.
	- attrib.c::decompress_mapping_pairs() now accepts the old runlist
	  argument, and invokes attrib.c::merge_run_lists() to merge the old
	  and the new runlists.
	- Removed attrib.c::find_first_attr().
	- Implemented loading of attribute list and complete runlist for $MFT.
	  This means we now cope with $MFT being spread across several mft
	  records.
	- Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax.
	- Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels.
	- Make ntfs_volume be allocated via kmalloc() instead of using a slab
	  cache. There are too few ntfs_volume structures at any one time
	  to justify a private slab cache.
	- Fix bogus kmap() use in async io completion. Now use kmap_atomic().
	  Use KM_BIO_IRQ on advice from IRC/kernel...
	- Use ntfs_map_page() in map_mft_record() and create ->readpage method
	  for reading $MFT (ntfs_mft_readpage). In the process create dedicated
	  address space operations (ntfs_mft_aops) for $MFT inode mapping. Also
	  removed the now superfluous exports from the kernel core patch.
	- Fix a bug where kfree() was used instead of ntfs_free().
	- Change map_mft_record() to take ntfs_inode as argument instead of
	  vfs inode. Ditto for unmap_mft_record(). Adapt all callers.
	- Add pointer to ntfs_volume to ntfs_inode.
	- Add mft record number and sequence number to ntfs_inode. Stop using
	  i_ino and i_generation for in-driver purposes.
	- Implement attrib.c::merge_run_lists(). (Richard Russon)
	- Remove use of proper inodes by extent inodes. Move i_ino and
	  i_generation to ntfs_inode to do this. Apply simplifications that
	  result and remove iget_no_wait(), etc.
	- Pass ntfs_inode everywhere in the driver (used to be struct inode).
	- Add reference counting in ntfs_inode for the ntfs inode itself and
	  for the mapped mft record.
	- Extend mft record mapping so we can (un)map extent mft records (new
	  functions (un)map_extent_mft_record), and so mappings are reference
	  counted and don't have to happen twice if already mapped - just ref
	  count increases.
	- Add -o iocharset as alias to -o nls for backwards compatibility.
	- The latest core patch is now tiny. In fact just a single additional
	  export is necessary over the base kernel.

1609tng-0.0.3 - Cleanups, enhancements, bug fixes.
1610
1611 - Work on attrib.c::decompress_mapping_pairs() to detect base extents
1612 and setup the runlist appropriately using knowledge provided by the
1613 sizes in the base attribute record.
1614 - Balance the get_/put_attr_search_ctx() calls so we don't leak memory
1615 any more.
1616 - Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single
1617 page or use vmalloc depending on the amount of memory requested.
1618 - Cleanup error output. The __FUNCTION__ "(): " is now added
1619 automatically. Introduced a new header file debug.h to support this
1620 and also moved ntfs_debug() function into it.
1621 - Make reading of compressed files more intelligent and especially get
1622 rid of the vmalloc_nofs() from readpage(). This now uses per CPU
1623 buffers (allocated at first mount with cluster size <= 4kiB and
1624 deallocated on last umount with cluster size <= 4kiB), and
1625 asynchronous io for the compressed data using a list of buffer heads.
1626 Er, we use synchronous io as async io only works on whole pages
1627 covered by buffers and not on individual buffer heads...
1628 - Bug fix for reading compressed files with sparse compression blocks.
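
      The ntfs_malloc_nofs()/ntfs_free() pair works roughly as below; a
      simplified sketch (the real helpers also sanity check the size), not
      the verbatim code:

          #include <linux/slab.h>
          #include <linux/vmalloc.h>

          static inline void *ntfs_malloc_nofs(unsigned long size)
          {
                  if (size <= PAGE_SIZE)  /* fits in one page: kmalloc() */
                          return kmalloc(PAGE_SIZE, GFP_NOFS);
                  return __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
          }

          static inline void ntfs_free(void *addr)
          {
                  if ((unsigned long)addr >= VMALLOC_START &&
                      (unsigned long)addr < VMALLOC_END)
                          vfree(addr);    /* came from __vmalloc() */
                  else
                          kfree(addr);    /* came from kmalloc() */
          }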
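
      And the automatic "__FUNCTION__ (): " prefixing is just a macro
      wrapper, as in this sketch of the debug.h idea:

          #define ntfs_error(sb, fmt, args...) \
                  __ntfs_error(__FUNCTION__, sb, fmt, ##args)

          /* __ntfs_error() prepends "function(): " to the formatted
           * message before printing it. */
          extern void __ntfs_error(const char *function,
                          const struct super_block *sb, const char *fmt, ...);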
1629
1630tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs.
1631
1632 - Fixed handling of directories when cluster size exceeds index block
1633 size.
1634 - Hide DOS-only namespace directory entries from readdir() but allow
1635 them in lookup(). This should fix the problem that Linux doesn't
1636 support directory hard links, while still allowing access to entries
1637 via their short file name. This also has the benefit of mimicking
1638 what Windows users are used to, so it is the ideal solution.
1639 - Implemented sync_page everywhere so no more hangs in D state when
1640 waiting for a page.
1641 - Stop using bforget() in favour of brelse().
1642 - Stop locking buffers unnecessarily.
1643 - Implemented compressed files (inode->mapping contains uncompressed
1644 data, raw compressed data is currently bread() into a vmalloc()ed
1645 memory buffer).
1646 - Enable compressed directories. (Their index root is marked compressed
1647 to indicate that new files in that directory should be created
1648 compressed.)
1649 - Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning
1650      functions so long messages cannot overrun the buffer. (Thanks to Will Dyson for pointing this out; see the sketch after this list.)
1651 - Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and
1652 ntfs_sb_info) out of the common inode and super_block structures and
1653 started using the generic_ip and generic_sbp pointers instead. This
1654 makes ntfs entirely private with respect to the kernel tree.
1655 - Detect the compiler version and abort with an error message if gcc
1656      older than 2.96 is used.
1657 - Fix bug in name comparison function in unistr.c.
1658 - Implement attribute lists part 1, the infrastructure: search contexts
1659      and operations, find_external_attr() and lookup_attr(), and make the
1660      code use the infrastructure.
1661 - Fix stupid buffer overflow bug that became apparent on larger run
1662      lists containing attributes.
1663 - Fix bugs in readdir() that became apparent on larger directories.
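
      The vsnprintf change above is the classic bounded-formatting fix; a
      sketch (the real functions also serialise access to the static buffer
      with a lock and name the device in the message):

          #include <linux/kernel.h>   /* printk(), vsnprintf(), va_list */

          static char err_buf[1024];

          void __ntfs_error(const char *function,
                          const struct super_block *sb, const char *fmt, ...)
          {
                  va_list args;

                  va_start(args, fmt);
                  /* Unlike vsprintf(), this cannot overrun err_buf. */
                  vsnprintf(err_buf, sizeof(err_buf), fmt, args);
                  va_end(args);
                  printk(KERN_ERR "NTFS-fs error: %s(): %s\n",
                                  function, err_buf);
          }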
1664
1665 The driver is now really useful and survives the test
1666 find . -type f -exec md5sum "{}" \;
1667      without any error messages on an over-1GiB partition with >16k
1668 files on it, including compressed files and directories and many files
1669 and directories with attribute lists.
1670
1671tng-0.0.1 - The first useful version.
1672
1673 - Added ntfs_lookup().
1674 - Added default upcase generation and handling.
1675 - Added compile options to be shown on module init.
1676 - Many bug fixes that were "hidden" before.
1677 - Update to latest kernel.
1678 - Added ntfs_readdir().
1679 - Added file operations for mmap(), read(), open() and llseek(). We just
1680      use the generic ones. The whole point of implementing the readpage()
1681      methods and, where possible, get_block() callbacks is that this
1682      allows us to make use of the generic high level methods provided
1683      by the kernel. (See the sketch after this list.)
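
      Concretely, the wiring looks like the sketch below (names illustrative
      for the 2.5.x kernels of the time, not the verbatim driver tables):

          struct file_operations ntfs_file_ops = {
                  .llseek = generic_file_llseek,
                  .read   = generic_file_read,   /* served via readpage() */
                  .mmap   = generic_file_mmap,   /* ditto, via the page cache */
                  .open   = generic_file_open,
          };

          struct address_space_operations ntfs_aops = {
                  .readpage = ntfs_readpage,     /* the one method we supply */
          };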
1684
1685      The driver is now actually useful! Yey. (-: It undoubtedly has bugs
1686      though, and it doesn't implement accessing compressed files yet. Also,
1687      accessing files with attribute list attributes is not implemented yet
1688      either. But for small or simple filesystems it should work and allow
1689      you to list directories, use stat on directory entries and the file
1690      system, and open, read, mmap and llseek around in files. A big
1691      milestone has been reached!
1692
1693tng-0.0.0 - Initial version tag.
1694
1695 Initial driver implementation. The driver can mount and umount simple
1696 NTFS filesystems (i.e. ones without attribute lists in the system
1697 files). If the mount fails there might be problems in the error handling
1698 code paths, so be warned. Otherwise it seems to be loading the system
1699 files nicely and the mft record read mapping/unmapping seems to be
1700      working nicely, too. Proof of concept for keeping inode metadata and
1701      non-resident file unnamed stream data in the page cache is thus
1702      complete.
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfce53cb65d7..c3c2c7ac9020 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/gfp.h>
26#include <linux/mm.h> 27#include <linux/mm.h>
27#include <linux/pagemap.h> 28#include <linux/pagemap.h>
28#include <linux/swap.h> 29#include <linux/swap.h>
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 50d3b0c258e3..f5094ee224c1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27 28
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 9669541d0119..6551c7cbad92 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
25#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/slab.h>
28 29
29#include "attrib.h" 30#include "attrib.h"
30#include "inode.h" 31#include "inode.h"
@@ -927,7 +928,7 @@ lock_retry_remap:
927 return 0; 928 return 0;
928 929
929 ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? 930 ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
930 "EOVERFLOW" : (!err ? "EIO" : "unkown error")); 931 "EOVERFLOW" : (!err ? "EIO" : "unknown error"));
931 return err < 0 ? err : -EIO; 932 return err < 0 ? err : -EIO;
932 933
933read_err: 934read_err:
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 5a9e34475e37..fe44d3feee4a 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24 25
25#include "dir.h" 26#include "dir.h"
26#include "aops.h" 27#include "aops.h"
@@ -1545,7 +1546,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
1545 write_inode_now(bmp_vi, !datasync); 1546 write_inode_now(bmp_vi, !datasync);
1546 iput(bmp_vi); 1547 iput(bmp_vi);
1547 } 1548 }
1548 ret = ntfs_write_inode(vi, 1); 1549 ret = __ntfs_write_inode(vi, 1);
1549 write_inode_now(vi, !datasync); 1550 write_inode_now(vi, !datasync);
1550 err = sync_blockdev(vi->i_sb->s_bdev); 1551 err = sync_blockdev(vi->i_sb->s_bdev);
1551 if (unlikely(err && !ret)) 1552 if (unlikely(err && !ret))
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 663c0e341f8b..8804f093ba75 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/gfp.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/pagevec.h> 25#include <linux/pagevec.h>
25#include <linux/sched.h> 26#include <linux/sched.h>
@@ -399,7 +400,7 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
399 * @cached_page: allocated but as yet unused page 400 * @cached_page: allocated but as yet unused page
400 * @lru_pvec: lru-buffering pagevec of caller 401 * @lru_pvec: lru-buffering pagevec of caller
401 * 402 *
402 * Obtain @nr_pages locked page cache pages from the mapping @maping and 403 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
403 * starting at index @index. 404 * starting at index @index.
404 * 405 *
405 * If a page is newly created, increment its refcount and add it to the 406 * If a page is newly created, increment its refcount and add it to the
@@ -1281,7 +1282,7 @@ rl_not_mapped_enoent:
1281 1282
1282/* 1283/*
1283 * Copy as much as we can into the pages and return the number of bytes which 1284 * Copy as much as we can into the pages and return the number of bytes which
1284 * were sucessfully copied. If a fault is encountered then clear the pages 1285 * were successfully copied. If a fault is encountered then clear the pages
1285 * out to (ofs + bytes) and return the number of bytes which were copied. 1286 * out to (ofs + bytes) and return the number of bytes which were copied.
1286 */ 1287 */
1287static inline size_t ntfs_copy_from_user(struct page **pages, 1288static inline size_t ntfs_copy_from_user(struct page **pages,
@@ -2182,7 +2183,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
2182 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2183 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
2183 BUG_ON(S_ISDIR(vi->i_mode)); 2184 BUG_ON(S_ISDIR(vi->i_mode));
2184 if (!datasync || !NInoNonResident(NTFS_I(vi))) 2185 if (!datasync || !NInoNonResident(NTFS_I(vi)))
2185 ret = ntfs_write_inode(vi, 1); 2186 ret = __ntfs_write_inode(vi, 1);
2186 write_inode_now(vi, !datasync); 2187 write_inode_now(vi, !datasync);
2187 /* 2188 /*
2188 * NOTE: If we were to use mapping->private_list (see ext2 and 2189 * NOTE: If we were to use mapping->private_list (see ext2 and
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2194eff49743..096c135691ae 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,6 +19,8 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/slab.h>
23
22#include "aops.h" 24#include "aops.h"
23#include "collate.h" 25#include "collate.h"
24#include "debug.h" 26#include "debug.h"
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 9938034762cc..4b57fb1eac2a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -530,7 +530,7 @@ err_corrupt_attr:
530 * the ntfs inode. 530 * the ntfs inode.
531 * 531 *
532 * Q: What locks are held when the function is called? 532 * Q: What locks are held when the function is called?
533 * A: i_state has I_LOCK set, hence the inode is locked, also 533 * A: i_state has I_NEW set, hence the inode is locked, also
534 * i_count is set to 1, so it is not going to go away 534 * i_count is set to 1, so it is not going to go away
535 * i_flags is set to 0 and we have no business touching it. Only an ioctl() 535 * i_flags is set to 0 and we have no business touching it. Only an ioctl()
536 * is allowed to write to them. We should of course be honouring them but 536 * is allowed to write to them. We should of course be honouring them but
@@ -1207,7 +1207,7 @@ err_out:
1207 * necessary fields in @vi as well as initializing the ntfs inode. 1207 * necessary fields in @vi as well as initializing the ntfs inode.
1208 * 1208 *
1209 * Q: What locks are held when the function is called? 1209 * Q: What locks are held when the function is called?
1210 * A: i_state has I_LOCK set, hence the inode is locked, also 1210 * A: i_state has I_NEW set, hence the inode is locked, also
1211 * i_count is set to 1, so it is not going to go away 1211 * i_count is set to 1, so it is not going to go away
1212 * 1212 *
1213 * Return 0 on success and -errno on error. In the error case, the inode will 1213 * Return 0 on success and -errno on error. In the error case, the inode will
@@ -1474,7 +1474,7 @@ err_out:
1474 * normal directory inodes. 1474 * normal directory inodes.
1475 * 1475 *
1476 * Q: What locks are held when the function is called? 1476 * Q: What locks are held when the function is called?
1477 * A: i_state has I_LOCK set, hence the inode is locked, also 1477 * A: i_state has I_NEW set, hence the inode is locked, also
1478 * i_count is set to 1, so it is not going to go away 1478 * i_count is set to 1, so it is not going to go away
1479 * 1479 *
1480 * Return 0 on success and -errno on error. In the error case, the inode will 1480 * Return 0 on success and -errno on error. In the error case, the inode will
@@ -2957,7 +2957,7 @@ out:
2957 * 2957 *
2958 * Return 0 on success and -errno on error. 2958 * Return 0 on success and -errno on error.
2959 */ 2959 */
2960int ntfs_write_inode(struct inode *vi, int sync) 2960int __ntfs_write_inode(struct inode *vi, int sync)
2961{ 2961{
2962 sle64 nt; 2962 sle64 nt;
2963 ntfs_inode *ni = NTFS_I(vi); 2963 ntfs_inode *ni = NTFS_I(vi);
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 117eaf8032a3..9a113544605d 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi);
307 307
308extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); 308extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
309 309
310extern int ntfs_write_inode(struct inode *vi, int sync); 310extern int __ntfs_write_inode(struct inode *vi, int sync);
311 311
312static inline void ntfs_commit_inode(struct inode *vi) 312static inline void ntfs_commit_inode(struct inode *vi)
313{ 313{
314 if (!is_bad_inode(vi)) 314 if (!is_bad_inode(vi))
315 ntfs_write_inode(vi, 1); 315 __ntfs_write_inode(vi, 1);
316 return; 316 return;
317} 317}
318 318
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 89b02985c054..4dadcdf3d451 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -338,7 +338,7 @@ err_out:
338 * copy of the complete multi sector transfer deprotected page. On failure, 338 * copy of the complete multi sector transfer deprotected page. On failure,
339 * *@wrp is undefined. 339 * *@wrp is undefined.
340 * 340 *
341 * Simillarly, if @lsn is not NULL, on succes *@lsn will be set to the current 341 * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current
342 * logfile lsn according to this restart page. On failure, *@lsn is undefined. 342 * logfile lsn according to this restart page. On failure, *@lsn is undefined.
343 * 343 *
344 * The following error codes are defined: 344 * The following error codes are defined:
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 1caa0ef0b2bb..b572b6727181 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24#include <linux/swap.h> 25#include <linux/swap.h>
25 26
26#include "attrib.h" 27#include "attrib.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2ca00153b6ec..358273e59ade 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,6 +23,7 @@
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/exportfs.h> 24#include <linux/exportfs.h>
25#include <linux/security.h> 25#include <linux/security.h>
26#include <linux/slab.h>
26 27
27#include "attrib.h" 28#include "attrib.h"
28#include "debug.h" 29#include "debug.h"
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 80b04770e8e9..0de1db6cddbf 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,6 +31,7 @@
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
34#include <linux/bitmap.h>
34 35
35#include "sysctl.h" 36#include "sysctl.h"
36#include "logfile.h" 37#include "logfile.h"
@@ -39,6 +40,7 @@
39#include "dir.h" 40#include "dir.h"
40#include "debug.h" 41#include "debug.h"
41#include "index.h" 42#include "index.h"
43#include "inode.h"
42#include "aops.h" 44#include "aops.h"
43#include "layout.h" 45#include "layout.h"
44#include "malloc.h" 46#include "malloc.h"
@@ -2457,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb)
2457static s64 get_nr_free_clusters(ntfs_volume *vol) 2459static s64 get_nr_free_clusters(ntfs_volume *vol)
2458{ 2460{
2459 s64 nr_free = vol->nr_clusters; 2461 s64 nr_free = vol->nr_clusters;
2460 u32 *kaddr;
2461 struct address_space *mapping = vol->lcnbmp_ino->i_mapping; 2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
2462 struct page *page; 2463 struct page *page;
2463 pgoff_t index, max_index; 2464 pgoff_t index, max_index;
@@ -2476,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2476 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", 2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
2477 max_index, PAGE_CACHE_SIZE / 4); 2478 max_index, PAGE_CACHE_SIZE / 4);
2478 for (index = 0; index < max_index; index++) { 2479 for (index = 0; index < max_index; index++) {
2479 unsigned int i; 2480 unsigned long *kaddr;
2481
2480 /* 2482 /*
2481 * Read the page from page cache, getting it from backing store 2483 * Read the page from page cache, getting it from backing store
2482 * if necessary, and increment the use count. 2484 * if necessary, and increment the use count.
@@ -2489,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2489 nr_free -= PAGE_CACHE_SIZE * 8; 2491 nr_free -= PAGE_CACHE_SIZE * 8;
2490 continue; 2492 continue;
2491 } 2493 }
2492 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2494 kaddr = kmap_atomic(page, KM_USER0);
2493 /* 2495 /*
2494 * For each 4 bytes, subtract the number of set bits. If this 2496 * Subtract the number of set bits. If this
2495 * is the last page and it is partial we don't really care as 2497 * is the last page and it is partial we don't really care as
2496 * it just means we do a little extra work but it won't affect 2498 * it just means we do a little extra work but it won't affect
2497 * the result as all out of range bytes are set to zero by 2499 * the result as all out of range bytes are set to zero by
2498 * ntfs_readpage(). 2500 * ntfs_readpage().
2499 */ 2501 */
2500 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2502 nr_free -= bitmap_weight(kaddr,
2501 nr_free -= (s64)hweight32(kaddr[i]); 2503 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2502 kunmap_atomic(kaddr, KM_USER0); 2504 kunmap_atomic(kaddr, KM_USER0);
2503 page_cache_release(page); 2505 page_cache_release(page);
2504 } 2506 }
@@ -2537,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2537static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, 2539static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2538 s64 nr_free, const pgoff_t max_index) 2540 s64 nr_free, const pgoff_t max_index)
2539{ 2541{
2540 u32 *kaddr;
2541 struct address_space *mapping = vol->mftbmp_ino->i_mapping; 2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping;
2542 struct page *page; 2543 struct page *page;
2543 pgoff_t index; 2544 pgoff_t index;
@@ -2547,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2547 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " 2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
2548 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); 2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
2549 for (index = 0; index < max_index; index++) { 2550 for (index = 0; index < max_index; index++) {
2550 unsigned int i; 2551 unsigned long *kaddr;
2552
2551 /* 2553 /*
2552 * Read the page from page cache, getting it from backing store 2554 * Read the page from page cache, getting it from backing store
2553 * if necessary, and increment the use count. 2555 * if necessary, and increment the use count.
@@ -2560,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2560 nr_free -= PAGE_CACHE_SIZE * 8; 2562 nr_free -= PAGE_CACHE_SIZE * 8;
2561 continue; 2563 continue;
2562 } 2564 }
2563 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2565 kaddr = kmap_atomic(page, KM_USER0);
2564 /* 2566 /*
2565 * For each 4 bytes, subtract the number of set bits. If this 2567 * Subtract the number of set bits. If this
2566 * is the last page and it is partial we don't really care as 2568 * is the last page and it is partial we don't really care as
2567 * it just means we do a little extra work but it won't affect 2569 * it just means we do a little extra work but it won't affect
2568 * the result as all out of range bytes are set to zero by 2570 * the result as all out of range bytes are set to zero by
2569 * ntfs_readpage(). 2571 * ntfs_readpage().
2570 */ 2572 */
2571 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2573 nr_free -= bitmap_weight(kaddr,
2572 nr_free -= (s64)hweight32(kaddr[i]); 2574 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2573 kunmap_atomic(kaddr, KM_USER0); 2575 kunmap_atomic(kaddr, KM_USER0);
2574 page_cache_release(page); 2576 page_cache_release(page);
2575 } 2577 }
@@ -2662,6 +2664,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
2662 return 0; 2664 return 0;
2663} 2665}
2664 2666
2667#ifdef NTFS_RW
2668static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
2669{
2670 return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
2671}
2672#endif
2673
2665/** 2674/**
2666 * The complete super operations. 2675 * The complete super operations.
2667 */ 2676 */
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 9ef85e628fe1..79a89184cb5e 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -36,12 +36,11 @@
36/* Definition of the ntfs sysctl. */ 36/* Definition of the ntfs sysctl. */
37static ctl_table ntfs_sysctls[] = { 37static ctl_table ntfs_sysctls[] = {
38 { 38 {
39 .ctl_name = CTL_UNNUMBERED, /* Binary and text IDs. */
40 .procname = "ntfs-debug", 39 .procname = "ntfs-debug",
41 .data = &debug_msgs, /* Data pointer and size. */ 40 .data = &debug_msgs, /* Data pointer and size. */
42 .maxlen = sizeof(debug_msgs), 41 .maxlen = sizeof(debug_msgs),
43 .mode = 0644, /* Mode, proc handler. */ 42 .mode = 0644, /* Mode, proc handler. */
44 .proc_handler = &proc_dointvec 43 .proc_handler = proc_dointvec
45 }, 44 },
46 {} 45 {}
47}; 46};
@@ -49,7 +48,6 @@ static ctl_table ntfs_sysctls[] = {
49/* Define the parent directory /proc/sys/fs. */ 48/* Define the parent directory /proc/sys/fs. */
50static ctl_table sysctls_root[] = { 49static ctl_table sysctls_root[] = {
51 { 50 {
52 .ctl_name = CTL_FS,
53 .procname = "fs", 51 .procname = "fs",
54 .mode = 0555, 52 .mode = 0555,
55 .child = ntfs_sysctls 53 .child = ntfs_sysctls
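
A note on the get_nr_free_clusters()/__get_nr_free_mft_records() hunks in
fs/ntfs/super.c above: kaddr changes from u32 * to unsigned long * because
bitmap_weight() walks the buffer in unsigned long words. An illustration of
the equivalence (not verbatim driver code):

	#include <linux/bitmap.h>
	#include <linux/pagemap.h>	/* PAGE_CACHE_SIZE */

	/* Illustrative only: subtract the set bits of one mapped bitmap
	 * page from a running count of free units. */
	static s64 subtract_used(const unsigned long *kaddr, s64 nr_free)
	{
		/* The old loop did this one 32-bit word at a time with
		 * hweight32(); bitmap_weight() takes an unsigned long
		 * buffer and a count of bits, covering the page in one
		 * call. */
		return nr_free - bitmap_weight(kaddr,
				PAGE_CACHE_SIZE * BITS_PER_BYTE);
	}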
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 701b7a3a872e..0d840669698e 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -6,6 +6,7 @@ config OCFS2_FS
6 select CRC32 6 select CRC32
7 select QUOTA 7 select QUOTA
8 select QUOTA_TREE 8 select QUOTA_TREE
9 select FS_POSIX_ACL
9 help 10 help
10 OCFS2 is a general purpose extent based shared disk cluster file 11 OCFS2 is a general purpose extent based shared disk cluster file
11 system with many similarities to ext3. It supports 64 bit inode 12 system with many similarities to ext3. It supports 64 bit inode
@@ -74,12 +75,3 @@ config OCFS2_DEBUG_FS
74 This option will enable expensive consistency checks. Enable 75 This option will enable expensive consistency checks. Enable
75 this option for debugging only as it is likely to decrease 76 this option for debugging only as it is likely to decrease
76 performance of the filesystem. 77 performance of the filesystem.
77
78config OCFS2_FS_POSIX_ACL
79 bool "OCFS2 POSIX Access Control Lists"
80 depends on OCFS2_FS
81 select FS_POSIX_ACL
82 default n
83 help
84 Posix Access Control Lists (ACLs) support permissions for users and
85 groups beyond the owner/group/world scheme.
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 31f25ce32c97..791c0886c060 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -39,16 +39,14 @@ ocfs2-objs := \
39 ver.o \ 39 ver.o \
40 quota_local.o \ 40 quota_local.o \
41 quota_global.o \ 41 quota_global.o \
42 xattr.o 42 xattr.o \
43 43 acl.o
44ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
45ocfs2-objs += acl.o
46endif
47 44
48ocfs2_stackglue-objs := stackglue.o 45ocfs2_stackglue-objs := stackglue.o
49ocfs2_stack_o2cb-objs := stack_o2cb.o 46ocfs2_stack_o2cb-objs := stack_o2cb.o
50ocfs2_stack_user-objs := stack_user.o 47ocfs2_stack_user-objs := stack_user.o
51 48
49obj-$(CONFIG_OCFS2_FS) += dlmfs/
52# cluster/ is always needed when OCFS2_FS for masklog support 50# cluster/ is always needed when OCFS2_FS for masklog support
53obj-$(CONFIG_OCFS2_FS) += cluster/ 51obj-$(CONFIG_OCFS2_FS) += cluster/
54obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ 52obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index fbeaec762103..e13fc9e8fcdc 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h>
24#include <linux/string.h> 25#include <linux/string.h>
25 26
26#define MLOG_MASK_PREFIX ML_INODE 27#define MLOG_MASK_PREFIX ML_INODE
@@ -30,6 +31,8 @@
30#include "alloc.h" 31#include "alloc.h"
31#include "dlmglue.h" 32#include "dlmglue.h"
32#include "file.h" 33#include "file.h"
34#include "inode.h"
35#include "journal.h"
33#include "ocfs2_fs.h" 36#include "ocfs2_fs.h"
34 37
35#include "xattr.h" 38#include "xattr.h"
@@ -98,15 +101,11 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
98 int type, 101 int type,
99 struct buffer_head *di_bh) 102 struct buffer_head *di_bh)
100{ 103{
101 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
102 int name_index; 104 int name_index;
103 char *value = NULL; 105 char *value = NULL;
104 struct posix_acl *acl; 106 struct posix_acl *acl;
105 int retval; 107 int retval;
106 108
107 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
108 return NULL;
109
110 switch (type) { 109 switch (type) {
111 case ACL_TYPE_ACCESS: 110 case ACL_TYPE_ACCESS:
112 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; 111 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -170,6 +169,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
170} 169}
171 170
172/* 171/*
172 * Helper function to set i_mode in memory and disk. Some call paths
173 * will not have di_bh or a journal handle to pass, in which case it
174 * will create it's own.
175 */
176static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
177 handle_t *handle, umode_t new_mode)
178{
179 int ret, commit_handle = 0;
180 struct ocfs2_dinode *di;
181
182 if (di_bh == NULL) {
183 ret = ocfs2_read_inode_block(inode, &di_bh);
184 if (ret) {
185 mlog_errno(ret);
186 goto out;
187 }
188 } else
189 get_bh(di_bh);
190
191 if (handle == NULL) {
192 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
193 OCFS2_INODE_UPDATE_CREDITS);
194 if (IS_ERR(handle)) {
195 ret = PTR_ERR(handle);
196 mlog_errno(ret);
197 goto out_brelse;
198 }
199
200 commit_handle = 1;
201 }
202
203 di = (struct ocfs2_dinode *)di_bh->b_data;
204 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
205 OCFS2_JOURNAL_ACCESS_WRITE);
206 if (ret) {
207 mlog_errno(ret);
208 goto out_commit;
209 }
210
211 inode->i_mode = new_mode;
212 di->i_mode = cpu_to_le16(inode->i_mode);
213
214 ocfs2_journal_dirty(handle, di_bh);
215
216out_commit:
217 if (commit_handle)
218 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
219out_brelse:
220 brelse(di_bh);
221out:
222 return ret;
223}
224
225/*
173 * Set the access or default ACL of an inode. 226 * Set the access or default ACL of an inode.
174 */ 227 */
175static int ocfs2_set_acl(handle_t *handle, 228static int ocfs2_set_acl(handle_t *handle,
@@ -197,9 +250,14 @@ static int ocfs2_set_acl(handle_t *handle,
197 if (ret < 0) 250 if (ret < 0)
198 return ret; 251 return ret;
199 else { 252 else {
200 inode->i_mode = mode;
201 if (ret == 0) 253 if (ret == 0)
202 acl = NULL; 254 acl = NULL;
255
256 ret = ocfs2_acl_set_mode(inode, di_bh,
257 handle, mode);
258 if (ret)
259 return ret;
260
203 } 261 }
204 } 262 }
205 break; 263 break;
@@ -287,6 +345,7 @@ int ocfs2_init_acl(handle_t *handle,
287 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
288 struct posix_acl *acl = NULL; 346 struct posix_acl *acl = NULL;
289 int ret = 0; 347 int ret = 0;
348 mode_t mode;
290 349
291 if (!S_ISLNK(inode->i_mode)) { 350 if (!S_ISLNK(inode->i_mode)) {
292 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 351 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -295,12 +354,17 @@ int ocfs2_init_acl(handle_t *handle,
295 if (IS_ERR(acl)) 354 if (IS_ERR(acl))
296 return PTR_ERR(acl); 355 return PTR_ERR(acl);
297 } 356 }
298 if (!acl) 357 if (!acl) {
299 inode->i_mode &= ~current_umask(); 358 mode = inode->i_mode & ~current_umask();
359 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
360 if (ret) {
361 mlog_errno(ret);
362 goto cleanup;
363 }
364 }
300 } 365 }
301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 366 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
302 struct posix_acl *clone; 367 struct posix_acl *clone;
303 mode_t mode;
304 368
305 if (S_ISDIR(inode->i_mode)) { 369 if (S_ISDIR(inode->i_mode)) {
306 ret = ocfs2_set_acl(handle, inode, di_bh, 370 ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -317,7 +381,7 @@ int ocfs2_init_acl(handle_t *handle,
317 mode = inode->i_mode; 381 mode = inode->i_mode;
318 ret = posix_acl_create_masq(clone, &mode); 382 ret = posix_acl_create_masq(clone, &mode);
319 if (ret >= 0) { 383 if (ret >= 0) {
320 inode->i_mode = mode; 384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
321 if (ret > 0) { 385 if (ret > 0) {
322 ret = ocfs2_set_acl(handle, inode, 386 ret = ocfs2_set_acl(handle, inode,
323 di_bh, ACL_TYPE_ACCESS, 387 di_bh, ACL_TYPE_ACCESS,
@@ -331,13 +395,14 @@ cleanup:
331 return ret; 395 return ret;
332} 396}
333 397
334static size_t ocfs2_xattr_list_acl_access(struct inode *inode, 398static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry,
335 char *list, 399 char *list,
336 size_t list_len, 400 size_t list_len,
337 const char *name, 401 const char *name,
338 size_t name_len) 402 size_t name_len,
403 int type)
339{ 404{
340 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 405 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
341 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); 406 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
342 407
343 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 408 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
@@ -348,13 +413,14 @@ static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
348 return size; 413 return size;
349} 414}
350 415
351static size_t ocfs2_xattr_list_acl_default(struct inode *inode, 416static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry,
352 char *list, 417 char *list,
353 size_t list_len, 418 size_t list_len,
354 const char *name, 419 const char *name,
355 size_t name_len) 420 size_t name_len,
421 int type)
356{ 422{
357 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 423 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
358 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); 424 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
359 425
360 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 426 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
@@ -365,19 +431,19 @@ static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
365 return size; 431 return size;
366} 432}
367 433
368static int ocfs2_xattr_get_acl(struct inode *inode, 434static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name,
369 int type, 435 void *buffer, size_t size, int type)
370 void *buffer,
371 size_t size)
372{ 436{
373 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 437 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
374 struct posix_acl *acl; 438 struct posix_acl *acl;
375 int ret; 439 int ret;
376 440
441 if (strcmp(name, "") != 0)
442 return -EINVAL;
377 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 443 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
378 return -EOPNOTSUPP; 444 return -EOPNOTSUPP;
379 445
380 acl = ocfs2_get_acl(inode, type); 446 acl = ocfs2_get_acl(dentry->d_inode, type);
381 if (IS_ERR(acl)) 447 if (IS_ERR(acl))
382 return PTR_ERR(acl); 448 return PTR_ERR(acl);
383 if (acl == NULL) 449 if (acl == NULL)
@@ -388,35 +454,16 @@ static int ocfs2_xattr_get_acl(struct inode *inode,
388 return ret; 454 return ret;
389} 455}
390 456
391static int ocfs2_xattr_get_acl_access(struct inode *inode, 457static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
392 const char *name, 458 const void *value, size_t size, int flags, int type)
393 void *buffer,
394 size_t size)
395{
396 if (strcmp(name, "") != 0)
397 return -EINVAL;
398 return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
399}
400
401static int ocfs2_xattr_get_acl_default(struct inode *inode,
402 const char *name,
403 void *buffer,
404 size_t size)
405{
406 if (strcmp(name, "") != 0)
407 return -EINVAL;
408 return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
409}
410
411static int ocfs2_xattr_set_acl(struct inode *inode,
412 int type,
413 const void *value,
414 size_t size)
415{ 459{
460 struct inode *inode = dentry->d_inode;
416 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 461 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
417 struct posix_acl *acl; 462 struct posix_acl *acl;
418 int ret = 0; 463 int ret = 0;
419 464
465 if (strcmp(name, "") != 0)
466 return -EINVAL;
420 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 467 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
421 return -EOPNOTSUPP; 468 return -EOPNOTSUPP;
422 469
@@ -442,38 +489,18 @@ cleanup:
442 return ret; 489 return ret;
443} 490}
444 491
445static int ocfs2_xattr_set_acl_access(struct inode *inode,
446 const char *name,
447 const void *value,
448 size_t size,
449 int flags)
450{
451 if (strcmp(name, "") != 0)
452 return -EINVAL;
453 return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
454}
455
456static int ocfs2_xattr_set_acl_default(struct inode *inode,
457 const char *name,
458 const void *value,
459 size_t size,
460 int flags)
461{
462 if (strcmp(name, "") != 0)
463 return -EINVAL;
464 return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
465}
466
467struct xattr_handler ocfs2_xattr_acl_access_handler = { 492struct xattr_handler ocfs2_xattr_acl_access_handler = {
468 .prefix = POSIX_ACL_XATTR_ACCESS, 493 .prefix = POSIX_ACL_XATTR_ACCESS,
494 .flags = ACL_TYPE_ACCESS,
469 .list = ocfs2_xattr_list_acl_access, 495 .list = ocfs2_xattr_list_acl_access,
470 .get = ocfs2_xattr_get_acl_access, 496 .get = ocfs2_xattr_get_acl,
471 .set = ocfs2_xattr_set_acl_access, 497 .set = ocfs2_xattr_set_acl,
472}; 498};
473 499
474struct xattr_handler ocfs2_xattr_acl_default_handler = { 500struct xattr_handler ocfs2_xattr_acl_default_handler = {
475 .prefix = POSIX_ACL_XATTR_DEFAULT, 501 .prefix = POSIX_ACL_XATTR_DEFAULT,
502 .flags = ACL_TYPE_DEFAULT,
476 .list = ocfs2_xattr_list_acl_default, 503 .list = ocfs2_xattr_list_acl_default,
477 .get = ocfs2_xattr_get_acl_default, 504 .get = ocfs2_xattr_get_acl,
478 .set = ocfs2_xattr_set_acl_default, 505 .set = ocfs2_xattr_set_acl,
479}; 506};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 8f6389ed4da5..5c5d31f05853 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,8 +26,6 @@ struct ocfs2_acl_entry {
26 __le32 e_id; 26 __le32 e_id;
27}; 27};
28 28
29#ifdef CONFIG_OCFS2_FS_POSIX_ACL
30
31extern int ocfs2_check_acl(struct inode *, int); 29extern int ocfs2_check_acl(struct inode *, int);
32extern int ocfs2_acl_chmod(struct inode *); 30extern int ocfs2_acl_chmod(struct inode *);
33extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, 31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
@@ -35,24 +33,4 @@ extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
35 struct ocfs2_alloc_context *, 33 struct ocfs2_alloc_context *,
36 struct ocfs2_alloc_context *); 34 struct ocfs2_alloc_context *);
37 35
38#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
39
40#define ocfs2_check_acl NULL
41static inline int ocfs2_acl_chmod(struct inode *inode)
42{
43 return 0;
44}
45static inline int ocfs2_init_acl(handle_t *handle,
46 struct inode *inode,
47 struct inode *dir,
48 struct buffer_head *di_bh,
49 struct buffer_head *dir_bh,
50 struct ocfs2_alloc_context *meta_ac,
51 struct ocfs2_alloc_context *data_ac)
52{
53 return 0;
54}
55
56#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
57
58#endif /* OCFS2_ACL_H */ 36#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 38a42f5d59ff..9f8bd913c51e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1050,7 +1050,8 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1051 eb->h_blkno = cpu_to_le64(first_blkno); 1051 eb->h_blkno = cpu_to_le64(first_blkno);
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1054 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1055 eb->h_list.l_count = 1056 eb->h_list.l_count =
1056 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1765,9 +1766,9 @@ set_and_inc:
1765 * 1766 *
1766 * The array index of the subtree root is passed back. 1767 * The array index of the subtree root is passed back.
1767 */ 1768 */
1768static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, 1769int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1769 struct ocfs2_path *left, 1770 struct ocfs2_path *left,
1770 struct ocfs2_path *right) 1771 struct ocfs2_path *right)
1771{ 1772{
1772 int i = 0; 1773 int i = 0;
1773 1774
@@ -2398,7 +2399,7 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2398 * 2399 *
2399 * The array is assumed to be large enough to hold an entire path (tree depth). 2400 * The array is assumed to be large enough to hold an entire path (tree depth).
2400 * 2401 *
2401 * Upon succesful return from this function: 2402 * Upon successful return from this function:
2402 * 2403 *
2403 * - The 'right_path' array will contain a path to the leaf block 2404 * - The 'right_path' array will contain a path to the leaf block
2404 * whose range contains e_cpos. 2405 * whose range contains e_cpos.
@@ -2872,8 +2873,8 @@ out:
2872 * This looks similar, but is subtly different to 2873 * This looks similar, but is subtly different to
2873 * ocfs2_find_cpos_for_left_leaf(). 2874 * ocfs2_find_cpos_for_left_leaf().
2874 */ 2875 */
2875static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 2876int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2876 struct ocfs2_path *path, u32 *cpos) 2877 struct ocfs2_path *path, u32 *cpos)
2877{ 2878{
2878 int i, j, ret = 0; 2879 int i, j, ret = 0;
2879 u64 blkno; 2880 u64 blkno;
@@ -5712,7 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5712 goto out; 5713 goto out;
5713 } 5714 }
5714 5715
5715 vfs_dq_free_space_nodirty(inode, 5716 dquot_free_space_nodirty(inode,
5716 ocfs2_clusters_to_bytes(inode->i_sb, len)); 5717 ocfs2_clusters_to_bytes(inode->i_sb, len));
5717 5718
5718 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); 5719 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
@@ -6037,7 +6038,7 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
6037 if (status < 0) 6038 if (status < 0)
6038 mlog_errno(status); 6039 mlog_errno(status);
6039 else 6040 else
6040 ocfs2_init_inode_steal_slot(osb); 6041 ocfs2_init_steal_slots(osb);
6041 6042
6042 mlog_exit(status); 6043 mlog_exit(status);
6043} 6044}
@@ -6935,7 +6936,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6935 goto bail; 6936 goto bail;
6936 } 6937 }
6937 6938
6938 vfs_dq_free_space_nodirty(inode, 6939 dquot_free_space_nodirty(inode,
6939 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); 6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6940 spin_lock(&OCFS2_I(inode)->ip_lock); 6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6941 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
@@ -7190,8 +7191,8 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
7190 * wait on them - the truncate_inode_pages() call later will 7191 * wait on them - the truncate_inode_pages() call later will
7191 * do that for us. 7192 * do that for us.
7192 */ 7193 */
7193 ret = do_sync_mapping_range(inode->i_mapping, range_start, 7194 ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
7194 range_end - 1, SYNC_FILE_RANGE_WRITE); 7195 range_end - 1);
7195 if (ret) 7196 if (ret)
7196 mlog_errno(ret); 7197 mlog_errno(ret);
7197 7198
@@ -7300,11 +7301,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7300 unsigned int page_end; 7301 unsigned int page_end;
7301 u64 phys; 7302 u64 phys;
7302 7303
7303 if (vfs_dq_alloc_space_nodirty(inode, 7304 ret = dquot_alloc_space_nodirty(inode,
7304 ocfs2_clusters_to_bytes(osb->sb, 1))) { 7305 ocfs2_clusters_to_bytes(osb->sb, 1));
7305 ret = -EDQUOT; 7306 if (ret)
7306 goto out_commit; 7307 goto out_commit;
7307 }
7308 did_quota = 1; 7308 did_quota = 1;
7309 7309
7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
@@ -7380,7 +7380,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7380 7380
7381out_commit: 7381out_commit:
7382 if (ret < 0 && did_quota) 7382 if (ret < 0 && did_quota)
7383 vfs_dq_free_space_nodirty(inode, 7383 dquot_free_space_nodirty(inode,
7384 ocfs2_clusters_to_bytes(osb->sb, 1)); 7384 ocfs2_clusters_to_bytes(osb->sb, 1));
7385 7385
7386 ocfs2_commit_trans(osb, handle); 7386 ocfs2_commit_trans(osb, handle);
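
The vfs_dq_* to dquot_* conversions in the hunks above all follow one
pattern: the new helpers return an errno directly instead of a boolean, so
callers propagate it rather than hard-coding -EDQUOT. A sketch, with a
hypothetical worker function standing in for the cluster allocation:

	#include <linux/quotaops.h>

	static int example_alloc(struct inode *inode, s64 bytes)
	{
		int ret;

		ret = dquot_alloc_space_nodirty(inode, bytes);
		if (ret)	/* already an errno, e.g. -EDQUOT */
			return ret;

		ret = claim_the_space(inode);	/* hypothetical worker */
		if (ret)	/* roll the quota charge back on failure */
			dquot_free_space_nodirty(inode, bytes);
		return ret;
	}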
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 9c122d574464..1db4359ccb90 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_t *handle,
317int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, 317int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
318 handle_t *handle, 318 handle_t *handle,
319 struct ocfs2_path *path); 319 struct ocfs2_path *path);
320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
321 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
323 struct ocfs2_path *left,
324 struct ocfs2_path *right);
320#endif /* OCFS2_ALLOC_H */ 325#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5e..21441ddb5506 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
547 * 547 *
548 * called like this: dio->get_blocks(dio->inode, fs_startblk, 548 * called like this: dio->get_blocks(dio->inode, fs_startblk,
549 * fs_count, map_bh, dio->rw == WRITE); 549 * fs_count, map_bh, dio->rw == WRITE);
550 *
551 * Note that we never bother to allocate blocks here, and thus ignore the
552 * create argument.
550 */ 553 */
551static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 554static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
552 struct buffer_head *bh_result, int create) 555 struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
563 566
564 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 567 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
565 568
566 /*
567 * Any write past EOF is not allowed because we'd be extending.
568 */
569 if (create && (iblock + max_blocks) > inode_blocks) {
570 ret = -EIO;
571 goto bail;
572 }
573
574 /* This figures out the size of the next contiguous block, and 569 /* This figures out the size of the next contiguous block, and
575 * our logical offset */ 570 * our logical offset */
576 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 571 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,17 +577,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
582 goto bail; 577 goto bail;
583 } 578 }
584 579
585 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { 580 /* We should already CoW the refcounted extent in case of create. */
586 ocfs2_error(inode->i_sb, 581 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
587 "Inode %llu has a hole at block %llu\n",
588 (unsigned long long)OCFS2_I(inode)->ip_blkno,
589 (unsigned long long)iblock);
590 ret = -EROFS;
591 goto bail;
592 }
593 582
594 /* We should already CoW the refcounted extent. */
595 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
596 /* 583 /*
597 * get_more_blocks() expects us to describe a hole by clearing 584 * get_more_blocks() expects us to describe a hole by clearing
598 * the mapped bit on bh_result(). 585 * the mapped bit on bh_result().
@@ -601,20 +588,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
601 */ 588 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 589 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno); 590 map_bh(bh_result, inode->i_sb, p_blkno);
604 else { 591 else
605 /*
606 * ocfs2_prepare_inode_for_write() should have caught
607 * the case where we'd be filling a hole and triggered
608 * a buffered write instead.
609 */
610 if (create) {
611 ret = -EIO;
612 mlog_errno(ret);
613 goto bail;
614 }
615
616 clear_buffer_mapped(bh_result); 592 clear_buffer_mapped(bh_result);
617 }
618 593
619 /* make sure we don't map more than max_blocks blocks here as 594 /* make sure we don't map more than max_blocks blocks here as
620 that's all the kernel will handle at this point. */ 595 that's all the kernel will handle at this point. */
@@ -625,7 +600,7 @@ bail:
625 return ret; 600 return ret;
626} 601}
627 602
628/* 603/*
629 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 604 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
630 * particularly interested in the aio/dio case. Like the core uses 605 * particularly interested in the aio/dio case. Like the core uses
631 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 606 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
@@ -696,7 +671,7 @@ static ssize_t ocfs2_direct_IO(int rw,
696 671
697 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 672 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
698 inode->i_sb->s_bdev, iov, offset, 673 inode->i_sb->s_bdev, iov, offset,
699 nr_segs, 674 nr_segs,
700 ocfs2_direct_IO_get_blocks, 675 ocfs2_direct_IO_get_blocks,
701 ocfs2_dio_end_io); 676 ocfs2_dio_end_io);
702 677
@@ -1789,10 +1764,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1789 1764
1790 wc->w_handle = handle; 1765 wc->w_handle = handle;
1791 1766
1792 if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, 1767 if (clusters_to_alloc) {
1793 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { 1768 ret = dquot_alloc_space_nodirty(inode,
1794 ret = -EDQUOT; 1769 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1795 goto out_commit; 1770 if (ret)
1771 goto out_commit;
1796 } 1772 }
1797 /* 1773 /*
1798 * We don't want this to fail in ocfs2_write_end(), so do it 1774 * We don't want this to fail in ocfs2_write_end(), so do it
@@ -1835,7 +1811,7 @@ success:
1835 return 0; 1811 return 0;
1836out_quota: 1812out_quota:
1837 if (clusters_to_alloc) 1813 if (clusters_to_alloc)
1838 vfs_dq_free_space(inode, 1814 dquot_free_space(inode,
1839 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); 1815 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1840out_commit: 1816out_commit:
1841 ocfs2_commit_trans(osb, handle); 1817 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index a1163b8b417c..b7428c5d0d3b 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -47,7 +47,7 @@
47 * Calculate the bit offset in the hamming code buffer based on the bit's 47 * Calculate the bit offset in the hamming code buffer based on the bit's
48 * offset in the data buffer. Since the hamming code reserves all 48 * offset in the data buffer. Since the hamming code reserves all
49 * power-of-two bits for parity, the data bit number and the code bit 49 * power-of-two bits for parity, the data bit number and the code bit
50 * number are offest by all the parity bits beforehand. 50 * number are offset by all the parity bits beforehand.
51 * 51 *
52 * Recall that bit numbers in hamming code are 1-based. This function 52 * Recall that bit numbers in hamming code are 1-based. This function
53 * takes the 0-based data bit from the caller. 53 * takes the 0-based data bit from the caller.
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d43d34a1dd31..ecebb2276790 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
@@ -368,7 +367,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
368 } 367 }
369 ocfs2_metadata_cache_io_unlock(ci); 368 ocfs2_metadata_cache_io_unlock(ci);
370 369
371 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 370 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
372 (unsigned long long)block, nr, 371 (unsigned long long)block, nr,
373 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes", 372 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
374 flags); 373 flags);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index c452d116b892..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h> 36#include <linux/debugfs.h>
37#include <linux/slab.h>
37 38
38#include "heartbeat.h" 39#include "heartbeat.h"
39#include "tcp.h" 40#include "tcp.h"
@@ -78,7 +79,7 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
78 79
79unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 80unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
80 81
81/* Only sets a new threshold if there are no active regions. 82/* Only sets a new threshold if there are no active regions.
82 * 83 *
83 * No locking or otherwise interesting code is required for reading 84 * No locking or otherwise interesting code is required for reading
84 * o2hb_dead_threshold as it can't change once regions are active and 85 * o2hb_dead_threshold as it can't change once regions are active and
@@ -170,13 +171,14 @@ static void o2hb_write_timeout(struct work_struct *work)
170 171
171 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " 172 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
172 "milliseconds\n", reg->hr_dev_name, 173 "milliseconds\n", reg->hr_dev_name,
173 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 174 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
174 o2quo_disk_timeout(); 175 o2quo_disk_timeout();
175} 176}
176 177
177static void o2hb_arm_write_timeout(struct o2hb_region *reg) 178static void o2hb_arm_write_timeout(struct o2hb_region *reg)
178{ 179{
179 mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); 180 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
181 O2HB_MAX_WRITE_TIMEOUT_MS);
180 182
181 cancel_delayed_work(&reg->hr_write_timeout_work); 183 cancel_delayed_work(&reg->hr_write_timeout_work);
182 reg->hr_last_timeout_start = jiffies; 184 reg->hr_last_timeout_start = jiffies;
@@ -623,7 +625,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
623 "seq %llu last %llu changed %u equal %u\n", 625 "seq %llu last %llu changed %u equal %u\n",
624 slot->ds_node_num, (long long)slot->ds_last_generation, 626 slot->ds_node_num, (long long)slot->ds_last_generation,
625 le32_to_cpu(hb_block->hb_cksum), 627 le32_to_cpu(hb_block->hb_cksum),
626 (unsigned long long)le64_to_cpu(hb_block->hb_seq), 628 (unsigned long long)le64_to_cpu(hb_block->hb_seq),
627 (unsigned long long)slot->ds_last_time, slot->ds_changed_samples, 629 (unsigned long long)slot->ds_last_time, slot->ds_changed_samples,
628 slot->ds_equal_samples); 630 slot->ds_equal_samples);
629 631
@@ -874,7 +876,8 @@ static int o2hb_thread(void *data)
874 do_gettimeofday(&after_hb); 876 do_gettimeofday(&after_hb);
875 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 877 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
876 878
877 mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", 879 mlog(ML_HEARTBEAT,
880 "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
878 before_hb.tv_sec, (unsigned long) before_hb.tv_usec, 881 before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
879 after_hb.tv_sec, (unsigned long) after_hb.tv_usec, 882 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
880 elapsed_msec); 883 elapsed_msec);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 1cd2934de615..3bb928a2bf7d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -112,6 +112,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT), 114 define_mask(REFCOUNT),
115 define_mask(BASTS),
115 define_mask(ERROR), 116 define_mask(ERROR),
116 define_mask(NOTICE), 117 define_mask(NOTICE),
117 define_mask(KTHREAD), 118 define_mask(KTHREAD),
@@ -135,7 +136,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
135 return mlog_mask_store(mlog_attr->mask, buf, count); 136 return mlog_mask_store(mlog_attr->mask, buf, count);
136} 137}
137 138
138static struct sysfs_ops mlog_attr_ops = { 139static const struct sysfs_ops mlog_attr_ops = {
139 .show = mlog_show, 140 .show = mlog_show,
140 .store = mlog_store, 141 .store = mlog_store,
141}; 142};
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 9b4d11726cf2..3dfddbec32f2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -114,6 +114,7 @@
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ 116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */
117/* bits that are infrequently given and frequently matched in the high word */ 118/* bits that are infrequently given and frequently matched in the high word */
118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
119#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
@@ -194,9 +195,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
194 * previous token if args expands to nothing. 195 * previous token if args expands to nothing.
195 */ 196 */
196#define __mlog_printk(level, fmt, args...) \ 197#define __mlog_printk(level, fmt, args...) \
197 printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current), \ 198 printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
198 __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ 199 task_pid_nr(current), __mlog_cpu_guess, \
199 ##args) 200 __PRETTY_FUNCTION__, __LINE__ , ##args)
200 201
201#define mlog(mask, fmt, args...) do { \ 202#define mlog(mask, fmt, args...) do { \
202 u64 __m = MLOG_MASK_PREFIX | (mask); \ 203 u64 __m = MLOG_MASK_PREFIX | (mask); \
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index da794bc07a6c..a3f150e52b02 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -294,10 +294,10 @@ static int sc_seq_show(struct seq_file *seq, void *v)
294 if (sc->sc_sock) { 294 if (sc->sc_sock) {
295 inet = inet_sk(sc->sc_sock->sk); 295 inet = inet_sk(sc->sc_sock->sk);
296 /* the stack's structs aren't sparse endian clean */ 296 /* the stack's structs aren't sparse endian clean */
297 saddr = (__force __be32)inet->saddr; 297 saddr = (__force __be32)inet->inet_saddr;
298 daddr = (__force __be32)inet->daddr; 298 daddr = (__force __be32)inet->inet_daddr;
299 sport = (__force __be16)inet->sport; 299 sport = (__force __be16)inet->inet_sport;
300 dport = (__force __be16)inet->dport; 300 dport = (__force __be16)inet->inet_dport;
301 } 301 }
302 302
303 /* XXX sigh, inet-> doesn't have sparse annotation so any 303 /* XXX sigh, inet-> doesn't have sparse annotation so any
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 7ee6188bc79a..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 */ 20 */
21 21
22#include <linux/slab.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/configfs.h> 25#include <linux/configfs.h>
@@ -35,6 +36,10 @@
35 * cluster references throughout where nodes are looked up */ 36 * cluster references throughout where nodes are looked up */
36struct o2nm_cluster *o2nm_single_cluster = NULL; 37struct o2nm_cluster *o2nm_single_cluster = NULL;
37 38
39char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
40 "reset", /* O2NM_FENCE_RESET */
41 "panic", /* O2NM_FENCE_PANIC */
42};
38 43
39struct o2nm_node *o2nm_get_node_by_num(u8 node_num) 44struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
40{ 45{
@@ -579,6 +584,43 @@ static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
579 return o2nm_cluster_attr_write(page, count, 584 return o2nm_cluster_attr_write(page, count,
580 &cluster->cl_reconnect_delay_ms); 585 &cluster->cl_reconnect_delay_ms);
581} 586}
587
588static ssize_t o2nm_cluster_attr_fence_method_read(
589 struct o2nm_cluster *cluster, char *page)
590{
591 ssize_t ret = 0;
592
593 if (cluster)
594 ret = sprintf(page, "%s\n",
595 o2nm_fence_method_desc[cluster->cl_fence_method]);
596 return ret;
597}
598
599static ssize_t o2nm_cluster_attr_fence_method_write(
600 struct o2nm_cluster *cluster, const char *page, size_t count)
601{
602 unsigned int i;
603
604 if (page[count - 1] != '\n')
605 goto bail;
606
607 for (i = 0; i < O2NM_FENCE_METHODS; ++i) {
608 if (count != strlen(o2nm_fence_method_desc[i]) + 1)
609 continue;
610 if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
611 continue;
612 if (cluster->cl_fence_method != i) {
613 printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
614 o2nm_fence_method_desc[i]);
615 cluster->cl_fence_method = i;
616 }
617 return count;
618 }
619
620bail:
621 return -EINVAL;
622}
623
582static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { 624static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
583 .attr = { .ca_owner = THIS_MODULE, 625 .attr = { .ca_owner = THIS_MODULE,
584 .ca_name = "idle_timeout_ms", 626 .ca_name = "idle_timeout_ms",
@@ -603,10 +645,19 @@ static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
603 .store = o2nm_cluster_attr_reconnect_delay_ms_write, 645 .store = o2nm_cluster_attr_reconnect_delay_ms_write,
604}; 646};
605 647
648static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
649 .attr = { .ca_owner = THIS_MODULE,
650 .ca_name = "fence_method",
651 .ca_mode = S_IRUGO | S_IWUSR },
652 .show = o2nm_cluster_attr_fence_method_read,
653 .store = o2nm_cluster_attr_fence_method_write,
654};
655
606static struct configfs_attribute *o2nm_cluster_attrs[] = { 656static struct configfs_attribute *o2nm_cluster_attrs[] = {
607 &o2nm_cluster_attr_idle_timeout_ms.attr, 657 &o2nm_cluster_attr_idle_timeout_ms.attr,
608 &o2nm_cluster_attr_keepalive_delay_ms.attr, 658 &o2nm_cluster_attr_keepalive_delay_ms.attr,
609 &o2nm_cluster_attr_reconnect_delay_ms.attr, 659 &o2nm_cluster_attr_reconnect_delay_ms.attr,
660 &o2nm_cluster_attr_fence_method.attr,
610 NULL, 661 NULL,
611}; 662};
612static ssize_t o2nm_cluster_show(struct config_item *item, 663static ssize_t o2nm_cluster_show(struct config_item *item,
@@ -778,6 +829,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
778 cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; 829 cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
779 cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; 830 cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
780 cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; 831 cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
832 cluster->cl_fence_method = O2NM_FENCE_RESET;
781 833
782 ret = &cluster->cl_group; 834 ret = &cluster->cl_group;
783 o2nm_single_cluster = cluster; 835 o2nm_single_cluster = cluster;
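
Note: the new fence_method attribute is a plain configfs file. A hypothetical userspace sketch of selecting the panic method; the configfs mount point and the cluster name "mycluster" are assumptions, not part of this patch:

#include <fcntl.h>
#include <unistd.h>

int set_fence_method(void)
{
	int fd = open("/sys/kernel/config/cluster/mycluster/fence_method",
		      O_WRONLY);
	if (fd < 0)
		return -1;
	/* the write handler above requires the trailing newline and matches
	 * the method name case-insensitively; "reset\n" restores the default */
	if (write(fd, "panic\n", 6) != 6) {
		close(fd);
		return -1;
	}
	return close(fd);
}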
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index c992ea0da4ad..09ea2d388bbb 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -33,6 +33,12 @@
33#include <linux/configfs.h> 33#include <linux/configfs.h>
34#include <linux/rbtree.h> 34#include <linux/rbtree.h>
35 35
36enum o2nm_fence_method {
37 O2NM_FENCE_RESET = 0,
38 O2NM_FENCE_PANIC,
39 O2NM_FENCE_METHODS, /* Number of fence methods */
40};
41
36struct o2nm_node { 42struct o2nm_node {
37 spinlock_t nd_lock; 43 spinlock_t nd_lock;
38 struct config_item nd_item; 44 struct config_item nd_item;
@@ -58,6 +64,7 @@ struct o2nm_cluster {
58 unsigned int cl_idle_timeout_ms; 64 unsigned int cl_idle_timeout_ms;
59 unsigned int cl_keepalive_delay_ms; 65 unsigned int cl_keepalive_delay_ms;
60 unsigned int cl_reconnect_delay_ms; 66 unsigned int cl_reconnect_delay_ms;
67 enum o2nm_fence_method cl_fence_method;
61 68
62 /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ 69 /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
63 unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 70 unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index bbacf7da48a4..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
44 * and if they're the last, they fire off the decision. 44 * and if they're the last, they fire off the decision.
45 */ 45 */
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/slab.h>
48#include <linux/workqueue.h> 47#include <linux/workqueue.h>
49#include <linux/reboot.h> 48#include <linux/reboot.h>
50 49
@@ -74,8 +73,20 @@ static void o2quo_fence_self(void)
74 * threads can still schedule, etc, etc */ 73 * threads can still schedule, etc, etc */
75 o2hb_stop_all_regions(); 74 o2hb_stop_all_regions();
76 75
77 printk("ocfs2 is very sorry to be fencing this system by restarting\n"); 76 switch (o2nm_single_cluster->cl_fence_method) {
78 emergency_restart(); 77 case O2NM_FENCE_PANIC:
78 panic("*** ocfs2 is very sorry to be fencing this system by "
79 "panicing ***\n");
80 break;
81 default:
82 WARN_ON(o2nm_single_cluster->cl_fence_method >=
83 O2NM_FENCE_METHODS);
84 case O2NM_FENCE_RESET:
85 printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
86 "system by restarting ***\n");
87 emergency_restart();
88 break;
89 }
79} 90}
80 91
81/* Indicate that a timeout occurred on a heartbeat region write. The 92/* Indicate that a timeout occurred on a heartbeat region write. The
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 334f231a422c..73e743eea2c8 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -72,9 +72,9 @@
72 72
73#include "tcp_internal.h" 73#include "tcp_internal.h"
74 74
75#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u" 75#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
76#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ 76#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
77 NIPQUAD(sc->sc_node->nd_ipv4_address), \ 77 &sc->sc_node->nd_ipv4_address, \
78 ntohs(sc->sc_node->nd_ipv4_port) 78 ntohs(sc->sc_node->nd_ipv4_port)
79 79
80/* 80/*
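
Note: the SC_NODEF_FMT change above swaps the old NIPQUAD() expansion for the %pI4 specifier, which takes a pointer to the big-endian address. A small sketch, independent of this patch:

#include <linux/kernel.h>
#include <linux/types.h>

static void print_peer(__be32 addr, unsigned int port)
{
	/* %pI4 dereferences the pointer and prints the dotted-quad form,
	 * e.g. "node at 192.168.0.1:7777" */
	printk(KERN_INFO "node at %pI4:%u\n", &addr, port);
}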
@@ -485,7 +485,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
485 } 485 }
486 486
487 if (was_valid && !valid) { 487 if (was_valid && !valid) {
488 printk(KERN_INFO "o2net: no longer connected to " 488 printk(KERN_NOTICE "o2net: no longer connected to "
489 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 489 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
490 o2net_complete_nodes_nsw(nn); 490 o2net_complete_nodes_nsw(nn);
491 } 491 }
@@ -493,7 +493,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
493 if (!was_valid && valid) { 493 if (!was_valid && valid) {
494 o2quo_conn_up(o2net_num_from_nn(nn)); 494 o2quo_conn_up(o2net_num_from_nn(nn));
495 cancel_delayed_work(&nn->nn_connect_expired); 495 cancel_delayed_work(&nn->nn_connect_expired);
496 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", 496 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
497 o2nm_this_node() > sc->sc_node->nd_num ? 497 o2nm_this_node() > sc->sc_node->nd_num ?
498 "connected to" : "accepted connection from", 498 "connected to" : "accepted connection from",
499 SC_NODEF_ARGS(sc)); 499 SC_NODEF_ARGS(sc));
@@ -930,7 +930,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
930 cond_resched(); 930 cond_resched();
931 continue; 931 continue;
932 } 932 }
933 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 933 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
934 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); 934 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
935 o2net_ensure_shutdown(nn, sc, 0); 935 o2net_ensure_shutdown(nn, sc, 0);
936 break; 936 break;
@@ -1476,14 +1476,14 @@ static void o2net_idle_timer(unsigned long data)
1476 1476
1477 do_gettimeofday(&now); 1477 do_gettimeofday(&now);
1478 1478
1479 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1479 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1480 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1480 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1481 o2net_idle_timeout() / 1000, 1481 o2net_idle_timeout() / 1000,
1482 o2net_idle_timeout() % 1000); 1482 o2net_idle_timeout() % 1000);
1483 mlog(ML_NOTICE, "here are some times that might help debug the " 1483 mlog(ML_NOTICE, "here are some times that might help debug the "
1484 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1484 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1485 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1485 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
1486 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 1486 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
1487 now.tv_sec, (long) now.tv_usec, 1487 now.tv_sec, (long) now.tv_usec,
1488 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, 1488 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
1489 sc->sc_tv_advance_start.tv_sec, 1489 sc->sc_tv_advance_start.tv_sec,
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 8d58cfe410b1..96fa7ebc530c 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -32,10 +32,10 @@
32 * on their number */ 32 * on their number */
33#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) 33#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
34 34
35/* 35/*
36 * This version number represents quite a lot, unfortunately. It not 36 * This version number represents quite a lot, unfortunately. It not
37 * only represents the raw network message protocol on the wire but also 37 * only represents the raw network message protocol on the wire but also
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * With version 11, we separate out the filesystem locking portion. The 41 * With version 11, we separate out the filesystem locking portion. The
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 28c3ec238796..efd77d071c80 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2439,7 +2439,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2440 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num); 2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2964 goto out; 2964 goto out;
2965 } 2965 }
2966 2966
2967 if (vfs_dq_alloc_space_nodirty(dir, 2967 ret = dquot_alloc_space_nodirty(dir,
2968 ocfs2_clusters_to_bytes(osb->sb, 2968 ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
2969 alloc + dx_alloc))) { 2969 if (ret)
2970 ret = -EDQUOT;
2971 goto out_commit; 2970 goto out_commit;
2972 }
2973 did_quota = 1; 2971 did_quota = 1;
2974 2972
2975 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2973 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
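
Note: the same quota conversion repeats throughout the rest of dir.c. Since dquot_alloc_space_nodirty() returns an errno directly, the hand-rolled -EDQUOT mapping disappears. A sketch of the resulting pattern, with illustrative names:

#include <linux/fs.h>
#include <linux/quotaops.h>

static int example_quota_alloc(struct inode *dir, qsize_t bytes)
{
	/* the return value is already -EDQUOT, -ENOSPC, etc., so the
	 * caller can jump straight to its error path on nonzero */
	return dquot_alloc_space_nodirty(dir, bytes);
}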
@@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3178 3176
3179out_commit: 3177out_commit:
3180 if (ret < 0 && did_quota) 3178 if (ret < 0 && did_quota)
3181 vfs_dq_free_space_nodirty(dir, bytes_allocated); 3179 dquot_free_space_nodirty(dir, bytes_allocated);
3182 3180
3183 ocfs2_commit_trans(osb, handle); 3181 ocfs2_commit_trans(osb, handle);
3184 3182
@@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3221 if (extend) { 3219 if (extend) {
3222 u32 offset = OCFS2_I(dir)->ip_clusters; 3220 u32 offset = OCFS2_I(dir)->ip_clusters;
3223 3221
3224 if (vfs_dq_alloc_space_nodirty(dir, 3222 status = dquot_alloc_space_nodirty(dir,
3225 ocfs2_clusters_to_bytes(sb, 1))) { 3223 ocfs2_clusters_to_bytes(sb, 1));
3226 status = -EDQUOT; 3224 if (status)
3227 goto bail; 3225 goto bail;
3228 }
3229 did_quota = 1; 3226 did_quota = 1;
3230 3227
3231 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, 3228 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
@@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3254 status = 0; 3251 status = 0;
3255bail: 3252bail:
3256 if (did_quota && status < 0) 3253 if (did_quota && status < 0)
3257 vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); 3254 dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
3258 mlog_exit(status); 3255 mlog_exit(status);
3259 return status; 3256 return status;
3260} 3257}
@@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3889 goto out; 3886 goto out;
3890 } 3887 }
3891 3888
3892 if (vfs_dq_alloc_space_nodirty(dir, 3889 ret = dquot_alloc_space_nodirty(dir,
3893 ocfs2_clusters_to_bytes(dir->i_sb, 1))) { 3890 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3894 ret = -EDQUOT; 3891 if (ret)
3895 goto out_commit; 3892 goto out_commit;
3896 }
3897 did_quota = 1; 3893 did_quota = 1;
3898 3894
3899 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, 3895 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
@@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3983 3979
3984out_commit: 3980out_commit:
3985 if (ret < 0 && did_quota) 3981 if (ret < 0 && did_quota)
3986 vfs_dq_free_space_nodirty(dir, 3982 dquot_free_space_nodirty(dir,
3987 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 3983 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3988 3984
3989 ocfs2_commit_trans(osb, handle); 3985 ocfs2_commit_trans(osb, handle);
@@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4165 goto out; 4161 goto out;
4166 } 4162 }
4167 4163
4168 if (vfs_dq_alloc_space_nodirty(dir, 4164 ret = dquot_alloc_space_nodirty(dir,
4169 ocfs2_clusters_to_bytes(osb->sb, 1))) { 4165 ocfs2_clusters_to_bytes(osb->sb, 1));
4170 ret = -EDQUOT; 4166 if (ret)
4171 goto out_commit; 4167 goto out_commit;
4172 }
4173 did_quota = 1; 4168 did_quota = 1;
4174 4169
4175 /* 4170 /*
@@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4229 4224
4230out_commit: 4225out_commit:
4231 if (ret < 0 && did_quota) 4226 if (ret < 0 && did_quota)
4232 vfs_dq_free_space_nodirty(dir, 4227 dquot_free_space_nodirty(dir,
4233 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 4228 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4234 4229
4235 ocfs2_commit_trans(osb, handle); 4230 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 190361375700..dcebf0d920fa 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,8 +1,7 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
7 7
8ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index b5786a787fab..3cfa114aa391 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -95,7 +95,7 @@ const char *dlm_errname(enum dlm_status err);
95 mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \ 95 mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \
96} while (0) 96} while (0)
97 97
98#define DLM_LKSB_UNUSED1 0x01 98#define DLM_LKSB_UNUSED1 0x01
99#define DLM_LKSB_PUT_LVB 0x02 99#define DLM_LKSB_PUT_LVB 0x02
100#define DLM_LKSB_GET_LVB 0x04 100#define DLM_LKSB_GET_LVB 0x04
101#define DLM_LKSB_UNUSED2 0x08 101#define DLM_LKSB_UNUSED2 0x08
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 01cf8cc3d286..a795eb91f4ea 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -123,7 +122,7 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
123 dlm_lock_put(lock); 122 dlm_lock_put(lock);
124 /* free up the reserved bast that we are cancelling. 123 /* free up the reserved bast that we are cancelling.
125 * guaranteed that this will not be the last reserved 124 * guaranteed that this will not be the last reserved
126 * ast because *both* an ast and a bast were reserved 125 * ast because *both* an ast and a bast were reserved
127 * to get to this point. the res->spinlock will not be 126 * to get to this point. the res->spinlock will not be
128 * taken here */ 127 * taken here */
129 dlm_lockres_release_ast(dlm, res); 128 dlm_lockres_release_ast(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index ca96bce50e18..90803b47cd8c 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -396,7 +395,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
396 /* instead of logging the same network error over 395 /* instead of logging the same network error over
397 * and over, sleep here and wait for the heartbeat 396 * and over, sleep here and wait for the heartbeat
398 * to notice the node is dead. times out after 5s. */ 397 * to notice the node is dead. times out after 5s. */
399 dlm_wait_for_node_death(dlm, res->owner, 398 dlm_wait_for_node_death(dlm, res->owner,
400 DLM_NODE_DEATH_WAIT_MAX); 399 DLM_NODE_DEATH_WAIT_MAX);
401 ret = DLM_RECOVERING; 400 ret = DLM_RECOVERING;
402 mlog(0, "node %u died so returning DLM_RECOVERING " 401 mlog(0, "node %u died so returning DLM_RECOVERING "
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 42b0bad7a612..0cd24cf54396 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -102,7 +102,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
102 assert_spin_locked(&res->spinlock); 102 assert_spin_locked(&res->spinlock);
103 103
104 stringify_lockname(res->lockname.name, res->lockname.len, 104 stringify_lockname(res->lockname.name, res->lockname.len,
105 buf, sizeof(buf) - 1); 105 buf, sizeof(buf));
106 printk("lockres: %s, owner=%u, state=%u\n", 106 printk("lockres: %s, owner=%u, state=%u\n",
107 buf, res->owner, res->state); 107 buf, res->owner, res->state);
108 printk(" last used: %lu, refcnt: %u, on purge list: %s\n", 108 printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 0334000676d3..988c9055fd4e 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -816,7 +816,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
816 } 816 }
817 817
818 /* Once the dlm ctxt is marked as leaving then we don't want 818 /* Once the dlm ctxt is marked as leaving then we don't want
819 * to be put in someone's domain map. 819 * to be put in someone's domain map.
820 * Also, explicitly disallow joining at certain troublesome 820 * Also, explicitly disallow joining at certain troublesome
821 * times (ie. during recovery). */ 821 * times (ie. during recovery). */
822 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { 822 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 437698e9465f..733337772671 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -269,7 +269,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
269 } 269 }
270 dlm_revert_pending_lock(res, lock); 270 dlm_revert_pending_lock(res, lock);
271 dlm_lock_put(lock); 271 dlm_lock_put(lock);
272 } else if (dlm_is_recovery_lock(res->lockname.name, 272 } else if (dlm_is_recovery_lock(res->lockname.name,
273 res->lockname.len)) { 273 res->lockname.len)) {
274 /* special case for the $RECOVERY lock. 274 /* special case for the $RECOVERY lock.
275 * there will never be an AST delivered to put 275 * there will never be an AST delivered to put
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 83bcaf266b35..9289b4357d27 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -366,7 +366,7 @@ void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
366 struct dlm_master_list_entry *mle; 366 struct dlm_master_list_entry *mle;
367 367
368 assert_spin_locked(&dlm->spinlock); 368 assert_spin_locked(&dlm->spinlock);
369 369
370 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { 370 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
371 if (node_up) 371 if (node_up)
372 dlm_mle_node_up(dlm, mle, NULL, idx); 372 dlm_mle_node_up(dlm, mle, NULL, idx);
@@ -833,7 +833,7 @@ lookup:
833 __dlm_insert_mle(dlm, mle); 833 __dlm_insert_mle(dlm, mle);
834 834
835 /* still holding the dlm spinlock, check the recovery map 835 /* still holding the dlm spinlock, check the recovery map
836 * to see if there are any nodes that still need to be 836 * to see if there are any nodes that still need to be
837 * considered. these will not appear in the mle nodemap 837 * considered. these will not appear in the mle nodemap
838 * but they might own this lockres. wait on them. */ 838 * but they might own this lockres. wait on them. */
839 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 839 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
@@ -883,7 +883,7 @@ redo_request:
883 msleep(500); 883 msleep(500);
884 } 884 }
885 continue; 885 continue;
886 } 886 }
887 887
888 dlm_kick_recovery_thread(dlm); 888 dlm_kick_recovery_thread(dlm);
889 msleep(1000); 889 msleep(1000);
@@ -939,8 +939,8 @@ wait:
939 res->lockname.name, blocked); 939 res->lockname.name, blocked);
940 if (++tries > 20) { 940 if (++tries > 20) {
941 mlog(ML_ERROR, "%s:%.*s: spinning on " 941 mlog(ML_ERROR, "%s:%.*s: spinning on "
942 "dlm_wait_for_lock_mastery, blocked=%d\n", 942 "dlm_wait_for_lock_mastery, blocked=%d\n",
943 dlm->name, res->lockname.len, 943 dlm->name, res->lockname.len,
944 res->lockname.name, blocked); 944 res->lockname.name, blocked);
945 dlm_print_one_lock_resource(res); 945 dlm_print_one_lock_resource(res);
946 dlm_print_one_mle(mle); 946 dlm_print_one_mle(mle);
@@ -1029,7 +1029,7 @@ recheck:
1029 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); 1029 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
1030 b = (mle->type == DLM_MLE_BLOCK); 1030 b = (mle->type == DLM_MLE_BLOCK);
1031 if ((*blocked && !b) || (!*blocked && b)) { 1031 if ((*blocked && !b) || (!*blocked && b)) {
1032 mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 1032 mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
1033 dlm->name, res->lockname.len, res->lockname.name, 1033 dlm->name, res->lockname.len, res->lockname.name,
1034 *blocked, b); 1034 *blocked, b);
1035 *blocked = b; 1035 *blocked = b;
@@ -1602,7 +1602,7 @@ send_response:
1602 } 1602 }
1603 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", 1603 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1604 dlm->node_num, res->lockname.len, res->lockname.name); 1604 dlm->node_num, res->lockname.len, res->lockname.name);
1605 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 1605 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
1606 DLM_ASSERT_MASTER_MLE_CLEANUP); 1606 DLM_ASSERT_MASTER_MLE_CLEANUP);
1607 if (ret < 0) { 1607 if (ret < 0) {
1608 mlog(ML_ERROR, "failed to dispatch assert master work\n"); 1608 mlog(ML_ERROR, "failed to dispatch assert master work\n");
@@ -1701,7 +1701,7 @@ again:
1701 1701
1702 if (r & DLM_ASSERT_RESPONSE_REASSERT) { 1702 if (r & DLM_ASSERT_RESPONSE_REASSERT) {
1703 mlog(0, "%.*s: node %u create mles on other " 1703 mlog(0, "%.*s: node %u create mles on other "
1704 "nodes and requests a re-assert\n", 1704 "nodes and requests a re-assert\n",
1705 namelen, lockname, to); 1705 namelen, lockname, to);
1706 reassert = 1; 1706 reassert = 1;
1707 } 1707 }
@@ -1812,7 +1812,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1812 spin_unlock(&dlm->master_lock); 1812 spin_unlock(&dlm->master_lock);
1813 spin_unlock(&dlm->spinlock); 1813 spin_unlock(&dlm->spinlock);
1814 goto done; 1814 goto done;
1815 } 1815 }
1816 } 1816 }
1817 } 1817 }
1818 spin_unlock(&dlm->master_lock); 1818 spin_unlock(&dlm->master_lock);
@@ -1875,7 +1875,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1875ok: 1875ok:
1876 spin_unlock(&res->spinlock); 1876 spin_unlock(&res->spinlock);
1877 } 1877 }
1878 spin_unlock(&dlm->spinlock);
1879 1878
1880 // mlog(0, "woo! got an assert_master from node %u!\n", 1879 // mlog(0, "woo! got an assert_master from node %u!\n",
1881 // assert->node_idx); 1880 // assert->node_idx);
@@ -1883,7 +1882,7 @@ ok:
1883 int extra_ref = 0; 1882 int extra_ref = 0;
1884 int nn = -1; 1883 int nn = -1;
1885 int rr, err = 0; 1884 int rr, err = 0;
1886 1885
1887 spin_lock(&mle->spinlock); 1886 spin_lock(&mle->spinlock);
1888 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) 1887 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
1889 extra_ref = 1; 1888 extra_ref = 1;
@@ -1891,7 +1890,7 @@ ok:
1891 /* MASTER mle: if any bits set in the response map 1890 /* MASTER mle: if any bits set in the response map
1892 * then the calling node needs to re-assert to clear 1891 * then the calling node needs to re-assert to clear
1893 * up nodes that this node contacted */ 1892 * up nodes that this node contacted */
1894 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, 1893 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
1895 nn+1)) < O2NM_MAX_NODES) { 1894 nn+1)) < O2NM_MAX_NODES) {
1896 if (nn != dlm->node_num && nn != assert->node_idx) 1895 if (nn != dlm->node_num && nn != assert->node_idx)
1897 master_request = 1; 1896 master_request = 1;
@@ -1926,7 +1925,6 @@ ok:
1926 /* master is known, detach if not already detached. 1925 /* master is known, detach if not already detached.
1927 * ensures that only one assert_master call will happen 1926 * ensures that only one assert_master call will happen
1928 * on this mle. */ 1927 * on this mle. */
1929 spin_lock(&dlm->spinlock);
1930 spin_lock(&dlm->master_lock); 1928 spin_lock(&dlm->master_lock);
1931 1929
1932 rr = atomic_read(&mle->mle_refs.refcount); 1930 rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1957,6 @@ ok:
1959 __dlm_put_mle(mle); 1957 __dlm_put_mle(mle);
1960 } 1958 }
1961 spin_unlock(&dlm->master_lock); 1959 spin_unlock(&dlm->master_lock);
1962 spin_unlock(&dlm->spinlock);
1963 } else if (res) { 1960 } else if (res) {
1964 if (res->owner != assert->node_idx) { 1961 if (res->owner != assert->node_idx) {
1965 mlog(0, "assert_master from %u, but current " 1962 mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1964,7 @@ ok:
1967 res->owner, namelen, name); 1964 res->owner, namelen, name);
1968 } 1965 }
1969 } 1966 }
1967 spin_unlock(&dlm->spinlock);
1970 1968
1971done: 1969done:
1972 ret = 0; 1970 ret = 0;
@@ -2002,7 +2000,7 @@ kill:
2002 __dlm_print_one_lock_resource(res); 2000 __dlm_print_one_lock_resource(res);
2003 spin_unlock(&res->spinlock); 2001 spin_unlock(&res->spinlock);
2004 spin_unlock(&dlm->spinlock); 2002 spin_unlock(&dlm->spinlock);
2005 *ret_data = (void *)res; 2003 *ret_data = (void *)res;
2006 dlm_put(dlm); 2004 dlm_put(dlm);
2007 return -EINVAL; 2005 return -EINVAL;
2008} 2006}
@@ -2040,10 +2038,10 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
2040 item->u.am.request_from = request_from; 2038 item->u.am.request_from = request_from;
2041 item->u.am.flags = flags; 2039 item->u.am.flags = flags;
2042 2040
2043 if (ignore_higher) 2041 if (ignore_higher)
2044 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, 2042 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
2045 res->lockname.name); 2043 res->lockname.name);
2046 2044
2047 spin_lock(&dlm->work_lock); 2045 spin_lock(&dlm->work_lock);
2048 list_add_tail(&item->list, &dlm->work_list); 2046 list_add_tail(&item->list, &dlm->work_list);
2049 spin_unlock(&dlm->work_lock); 2047 spin_unlock(&dlm->work_lock);
@@ -2133,7 +2131,7 @@ put:
2133 * think that $RECOVERY is currently mastered by a dead node. If so, 2131 * think that $RECOVERY is currently mastered by a dead node. If so,
2134 * we wait a short time to allow that node to get notified by its own 2132 * we wait a short time to allow that node to get notified by its own
2135 * heartbeat stack, then check again. All $RECOVERY lock resources 2133 * heartbeat stack, then check again. All $RECOVERY lock resources
2136 * mastered by dead nodes are purged when the heartbeat callback is 2134 * mastered by dead nodes are purged when the heartbeat callback is
2137 * fired, so we can know for sure that it is safe to continue once 2135 * fired, so we can know for sure that it is safe to continue once
2138 * the node returns a live node or no node. */ 2136 * the node returns a live node or no node. */
2139static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, 2137static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
@@ -2174,7 +2172,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
2174 ret = -EAGAIN; 2172 ret = -EAGAIN;
2175 } 2173 }
2176 spin_unlock(&dlm->spinlock); 2174 spin_unlock(&dlm->spinlock);
2177 mlog(0, "%s: reco lock master is %u\n", dlm->name, 2175 mlog(0, "%s: reco lock master is %u\n", dlm->name,
2178 master); 2176 master);
2179 break; 2177 break;
2180 } 2178 }
@@ -2586,7 +2584,7 @@ fail:
2586 * is complete everywhere. if the target dies while this is 2584 * is complete everywhere. if the target dies while this is
2587 * going on, some nodes could potentially see the target as the 2585 * going on, some nodes could potentially see the target as the
2588 * master, so it is important that my recovery finds the migration 2586 * master, so it is important that my recovery finds the migration
2589 * mle and sets the master to UNKNONWN. */ 2587 * mle and sets the master to UNKNOWN. */
2590 2588
2591 2589
2592 /* wait for new node to assert master */ 2590 /* wait for new node to assert master */
@@ -2602,7 +2600,7 @@ fail:
2602 2600
2603 mlog(0, "%s:%.*s: timed out during migration\n", 2601 mlog(0, "%s:%.*s: timed out during migration\n",
2604 dlm->name, res->lockname.len, res->lockname.name); 2602 dlm->name, res->lockname.len, res->lockname.name);
2605 /* avoid hang during shutdown when migrating lockres 2603 /* avoid hang during shutdown when migrating lockres
2606 * to a node which also goes down */ 2604 * to a node which also goes down */
2607 if (dlm_is_node_dead(dlm, target)) { 2605 if (dlm_is_node_dead(dlm, target)) {
2608 mlog(0, "%s:%.*s: expected migration " 2606 mlog(0, "%s:%.*s: expected migration "
@@ -2738,7 +2736,7 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2738 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); 2736 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2739 spin_unlock(&res->spinlock); 2737 spin_unlock(&res->spinlock);
2740 2738
2741 /* target has died, so make the caller break out of the 2739 /* target has died, so make the caller break out of the
2742 * wait_event, but caller must recheck the domain_map */ 2740 * wait_event, but caller must recheck the domain_map */
2743 spin_lock(&dlm->spinlock); 2741 spin_lock(&dlm->spinlock);
2744 if (!test_bit(mig_target, dlm->domain_map)) 2742 if (!test_bit(mig_target, dlm->domain_map))
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index d9fa3d22e17c..b4f99de2caf3 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -310,7 +310,7 @@ static int dlm_recovery_thread(void *data)
310 mlog(0, "dlm thread running for %s...\n", dlm->name); 310 mlog(0, "dlm thread running for %s...\n", dlm->name);
311 311
312 while (!kthread_should_stop()) { 312 while (!kthread_should_stop()) {
313 if (dlm_joined(dlm)) { 313 if (dlm_domain_fully_joined(dlm)) {
314 status = dlm_do_recovery(dlm); 314 status = dlm_do_recovery(dlm);
315 if (status == -EAGAIN) { 315 if (status == -EAGAIN) {
316 /* do not sleep, recheck immediately. */ 316 /* do not sleep, recheck immediately. */
@@ -1050,7 +1050,7 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
1050 if (lock->ml.node == dead_node) { 1050 if (lock->ml.node == dead_node) {
1051 mlog(0, "AHA! there was " 1051 mlog(0, "AHA! there was "
1052 "a $RECOVERY lock for dead " 1052 "a $RECOVERY lock for dead "
1053 "node %u (%s)!\n", 1053 "node %u (%s)!\n",
1054 dead_node, dlm->name); 1054 dead_node, dlm->name);
1055 list_del_init(&lock->list); 1055 list_del_init(&lock->list);
1056 dlm_lock_put(lock); 1056 dlm_lock_put(lock);
@@ -1164,6 +1164,39 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
1164 mres->master = master; 1164 mres->master = master;
1165} 1165}
1166 1166
1167static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock,
1168 struct dlm_migratable_lockres *mres,
1169 int queue)
1170{
1171 if (!lock->lksb)
1172 return;
1173
1174 /* Ignore lvb in all locks in the blocked list */
1175 if (queue == DLM_BLOCKED_LIST)
1176 return;
1177
1178 /* Only consider lvbs in locks with granted EX or PR lock levels */
1179 if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE)
1180 return;
1181
1182 if (dlm_lvb_is_empty(mres->lvb)) {
1183 memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
1184 return;
1185 }
1186
1187 /* Ensure the lvb copied for migration matches in other valid locks */
1188 if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))
1189 return;
1190
1191 mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, "
1192 "node=%u\n",
1193 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
1194 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
1195 lock->lockres->lockname.len, lock->lockres->lockname.name,
1196 lock->ml.node);
1197 dlm_print_one_lock_resource(lock->lockres);
1198 BUG();
1199}
1167 1200
1168/* returns 1 if this lock fills the network structure, 1201/* returns 1 if this lock fills the network structure,
1169 * 0 otherwise */ 1202 * 0 otherwise */
@@ -1181,20 +1214,7 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
1181 ml->list = queue; 1214 ml->list = queue;
1182 if (lock->lksb) { 1215 if (lock->lksb) {
1183 ml->flags = lock->lksb->flags; 1216 ml->flags = lock->lksb->flags;
1184 /* send our current lvb */ 1217 dlm_prepare_lvb_for_migration(lock, mres, queue);
1185 if (ml->type == LKM_EXMODE ||
1186 ml->type == LKM_PRMODE) {
1187 /* if it is already set, this had better be a PR
1188 * and it has to match */
1189 if (!dlm_lvb_is_empty(mres->lvb) &&
1190 (ml->type == LKM_EXMODE ||
1191 memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
1192 mlog(ML_ERROR, "mismatched lvbs!\n");
1193 dlm_print_one_lock_resource(lock->lockres);
1194 BUG();
1195 }
1196 memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
1197 }
1198 } 1218 }
1199 ml->node = lock->ml.node; 1219 ml->node = lock->ml.node;
1200 mres->num_locks++; 1220 mres->num_locks++;
@@ -1730,6 +1750,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1730 struct dlm_lock *lock = NULL; 1750 struct dlm_lock *lock = NULL;
1731 u8 from = O2NM_MAX_NODES; 1751 u8 from = O2NM_MAX_NODES;
1732 unsigned int added = 0; 1752 unsigned int added = 0;
1753 __be64 c;
1733 1754
1734 mlog(0, "running %d locks for this lockres\n", mres->num_locks); 1755 mlog(0, "running %d locks for this lockres\n", mres->num_locks);
1735 for (i=0; i<mres->num_locks; i++) { 1756 for (i=0; i<mres->num_locks; i++) {
@@ -1777,19 +1798,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1777 /* lock is always created locally first, and 1798 /* lock is always created locally first, and
1778 * destroyed locally last. it must be on the list */ 1799 * destroyed locally last. it must be on the list */
1779 if (!lock) { 1800 if (!lock) {
1780 __be64 c = ml->cookie; 1801 c = ml->cookie;
1781 mlog(ML_ERROR, "could not find local lock " 1802 mlog(ML_ERROR, "Could not find local lock "
1782 "with cookie %u:%llu!\n", 1803 "with cookie %u:%llu, node %u, "
1804 "list %u, flags 0x%x, type %d, "
1805 "conv %d, highest blocked %d\n",
1783 dlm_get_lock_cookie_node(be64_to_cpu(c)), 1806 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1784 dlm_get_lock_cookie_seq(be64_to_cpu(c))); 1807 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1808 ml->node, ml->list, ml->flags, ml->type,
1809 ml->convert_type, ml->highest_blocked);
1810 __dlm_print_one_lock_resource(res);
1811 BUG();
1812 }
1813
1814 if (lock->ml.node != ml->node) {
1815 c = lock->ml.cookie;
1816 mlog(ML_ERROR, "Mismatched node# in lock "
1817 "cookie %u:%llu, name %.*s, node %u\n",
1818 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1819 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1820 res->lockname.len, res->lockname.name,
1821 lock->ml.node);
1822 c = ml->cookie;
1823 mlog(ML_ERROR, "Migrate lock cookie %u:%llu, "
1824 "node %u, list %u, flags 0x%x, type %d, "
1825 "conv %d, highest blocked %d\n",
1826 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1827 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1828 ml->node, ml->list, ml->flags, ml->type,
1829 ml->convert_type, ml->highest_blocked);
1785 __dlm_print_one_lock_resource(res); 1830 __dlm_print_one_lock_resource(res);
1786 BUG(); 1831 BUG();
1787 } 1832 }
1788 BUG_ON(lock->ml.node != ml->node);
1789 1833
1790 if (tmpq != queue) { 1834 if (tmpq != queue) {
1791 mlog(0, "lock was on %u instead of %u for %.*s\n", 1835 c = ml->cookie;
1792 j, ml->list, res->lockname.len, res->lockname.name); 1836 mlog(0, "Lock cookie %u:%llu was on list %u "
1837 "instead of list %u for %.*s\n",
1838 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1839 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1840 j, ml->list, res->lockname.len,
1841 res->lockname.name);
1842 __dlm_print_one_lock_resource(res);
1793 spin_unlock(&res->spinlock); 1843 spin_unlock(&res->spinlock);
1794 continue; 1844 continue;
1795 } 1845 }
@@ -1839,7 +1889,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1839 * the lvb. */ 1889 * the lvb. */
1840 memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); 1890 memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
1841 } else { 1891 } else {
1842 /* otherwise, the node is sending its 1892 /* otherwise, the node is sending its
1843 * most recent valid lvb info */ 1893 * most recent valid lvb info */
1844 BUG_ON(ml->type != LKM_EXMODE && 1894 BUG_ON(ml->type != LKM_EXMODE &&
1845 ml->type != LKM_PRMODE); 1895 ml->type != LKM_PRMODE);
@@ -1886,7 +1936,7 @@ skip_lvb:
1886 spin_lock(&res->spinlock); 1936 spin_lock(&res->spinlock);
1887 list_for_each_entry(lock, queue, list) { 1937 list_for_each_entry(lock, queue, list) {
1888 if (lock->ml.cookie == ml->cookie) { 1938 if (lock->ml.cookie == ml->cookie) {
1889 __be64 c = lock->ml.cookie; 1939 c = lock->ml.cookie;
1890 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " 1940 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
1891 "exists on this lockres!\n", dlm->name, 1941 "exists on this lockres!\n", dlm->name,
1892 res->lockname.len, res->lockname.name, 1942 res->lockname.len, res->lockname.name,
@@ -2114,7 +2164,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2114 assert_spin_locked(&res->spinlock); 2164 assert_spin_locked(&res->spinlock);
2115 2165
2116 if (res->owner == dlm->node_num) 2166 if (res->owner == dlm->node_num)
2117 /* if this node owned the lockres, and if the dead node 2167 /* if this node owned the lockres, and if the dead node
2118 * had an EX when he died, blank out the lvb */ 2168 * had an EX when he died, blank out the lvb */
2119 search_node = dead_node; 2169 search_node = dead_node;
2120 else { 2170 else {
@@ -2152,7 +2202,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2152 2202
2153 /* this node is the lockres master: 2203 /* this node is the lockres master:
2154 * 1) remove any stale locks for the dead node 2204 * 1) remove any stale locks for the dead node
2155 * 2) if the dead node had an EX when he died, blank out the lvb 2205 * 2) if the dead node had an EX when he died, blank out the lvb
2156 */ 2206 */
2157 assert_spin_locked(&dlm->spinlock); 2207 assert_spin_locked(&dlm->spinlock);
2158 assert_spin_locked(&res->spinlock); 2208 assert_spin_locked(&res->spinlock);
@@ -2193,7 +2243,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2193 mlog(0, "%s:%.*s: freed %u locks for dead node %u, " 2243 mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
2194 "dropping ref from lockres\n", dlm->name, 2244 "dropping ref from lockres\n", dlm->name,
2195 res->lockname.len, res->lockname.name, freed, dead_node); 2245 res->lockname.len, res->lockname.name, freed, dead_node);
2196 BUG_ON(!test_bit(dead_node, res->refmap)); 2246 if (!test_bit(dead_node, res->refmap)) {
2247 mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, "
2248 "but ref was not set\n", dlm->name,
2249 res->lockname.len, res->lockname.name, freed, dead_node);
2250 __dlm_print_one_lock_resource(res);
2251 }
2197 dlm_lockres_clear_refmap_bit(dead_node, res); 2252 dlm_lockres_clear_refmap_bit(dead_node, res);
2198 } else if (test_bit(dead_node, res->refmap)) { 2253 } else if (test_bit(dead_node, res->refmap)) {
2199 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2254 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2260,7 +2315,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2260 } 2315 }
2261 spin_unlock(&res->spinlock); 2316 spin_unlock(&res->spinlock);
2262 continue; 2317 continue;
2263 } 2318 }
2264 spin_lock(&res->spinlock); 2319 spin_lock(&res->spinlock);
2265 /* zero the lvb if necessary */ 2320 /* zero the lvb if necessary */
2266 dlm_revalidate_lvb(dlm, res, dead_node); 2321 dlm_revalidate_lvb(dlm, res, dead_node);
@@ -2411,7 +2466,7 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
2411 * this function on each node racing to become the recovery 2466 * this function on each node racing to become the recovery
2412 * master will not stop attempting this until either: 2467 * master will not stop attempting this until either:
2413 * a) this node gets the EX (and becomes the recovery master), 2468 * a) this node gets the EX (and becomes the recovery master),
2414 * or b) dlm->reco.new_master gets set to some nodenum 2469 * or b) dlm->reco.new_master gets set to some nodenum
2415 * != O2NM_INVALID_NODE_NUM (another node will do the reco). 2470 * != O2NM_INVALID_NODE_NUM (another node will do the reco).
2416 * so each time a recovery master is needed, the entire cluster 2471 * so each time a recovery master is needed, the entire cluster
2417 * will sync at this point. if the new master dies, that will 2472 * will sync at this point. if the new master dies, that will
@@ -2424,7 +2479,7 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
2424 2479
2425 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", 2480 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
2426 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); 2481 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
2427again: 2482again:
2428 memset(&lksb, 0, sizeof(lksb)); 2483 memset(&lksb, 0, sizeof(lksb));
2429 2484
2430 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, 2485 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
@@ -2437,8 +2492,8 @@ again:
2437 if (ret == DLM_NORMAL) { 2492 if (ret == DLM_NORMAL) {
2438 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", 2493 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
2439 dlm->name, dlm->node_num); 2494 dlm->name, dlm->node_num);
2440 2495
2441 /* got the EX lock. check to see if another node 2496 /* got the EX lock. check to see if another node
2442 * just became the reco master */ 2497 * just became the reco master */
2443 if (dlm_reco_master_ready(dlm)) { 2498 if (dlm_reco_master_ready(dlm)) {
2444 mlog(0, "%s: got reco EX lock, but %u will " 2499 mlog(0, "%s: got reco EX lock, but %u will "
@@ -2451,12 +2506,12 @@ again:
2451 /* see if recovery was already finished elsewhere */ 2506 /* see if recovery was already finished elsewhere */
2452 spin_lock(&dlm->spinlock); 2507 spin_lock(&dlm->spinlock);
2453 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { 2508 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
2454 status = -EINVAL; 2509 status = -EINVAL;
2455 mlog(0, "%s: got reco EX lock, but " 2510 mlog(0, "%s: got reco EX lock, but "
2456 "node got recovered already\n", dlm->name); 2511 "node got recovered already\n", dlm->name);
2457 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { 2512 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2458 mlog(ML_ERROR, "%s: new master is %u " 2513 mlog(ML_ERROR, "%s: new master is %u "
2459 "but no dead node!\n", 2514 "but no dead node!\n",
2460 dlm->name, dlm->reco.new_master); 2515 dlm->name, dlm->reco.new_master);
2461 BUG(); 2516 BUG();
2462 } 2517 }
@@ -2468,7 +2523,7 @@ again:
2468 * set the master and send the messages to begin recovery */ 2523 * set the master and send the messages to begin recovery */
2469 if (!status) { 2524 if (!status) {
2470 mlog(0, "%s: dead=%u, this=%u, sending " 2525 mlog(0, "%s: dead=%u, this=%u, sending "
2471 "begin_reco now\n", dlm->name, 2526 "begin_reco now\n", dlm->name,
2472 dlm->reco.dead_node, dlm->node_num); 2527 dlm->reco.dead_node, dlm->node_num);
2473 status = dlm_send_begin_reco_message(dlm, 2528 status = dlm_send_begin_reco_message(dlm,
2474 dlm->reco.dead_node); 2529 dlm->reco.dead_node);
@@ -2501,7 +2556,7 @@ again:
2501 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", 2556 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
2502 dlm->name, dlm->node_num); 2557 dlm->name, dlm->node_num);
2503 /* another node is master. wait on 2558 /* another node is master. wait on
2504 * reco.new_master != O2NM_INVALID_NODE_NUM 2559 * reco.new_master != O2NM_INVALID_NODE_NUM
2505 * for at most one second */ 2560 * for at most one second */
2506 wait_event_timeout(dlm->dlm_reco_thread_wq, 2561 wait_event_timeout(dlm->dlm_reco_thread_wq,
2507 dlm_reco_master_ready(dlm), 2562 dlm_reco_master_ready(dlm),
@@ -2589,9 +2644,23 @@ retry:
2589 "begin reco msg (%d)\n", dlm->name, nodenum, ret); 2644 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2590 ret = 0; 2645 ret = 0;
2591 } 2646 }
2647
2648 /*
2649 * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8,
2650 * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN.
2651 * We are handling both for compatibility reasons.
2652 */
2653 if (ret == -EAGAIN || ret == EAGAIN) {
2654 mlog(0, "%s: trying to start recovery of node "
2655 "%u, but node %u is waiting for last recovery "
2656 "to complete, backoff for a bit\n", dlm->name,
2657 dead_node, nodenum);
2658 msleep(100);
2659 goto retry;
2660 }
2592 if (ret < 0) { 2661 if (ret < 0) {
2593 struct dlm_lock_resource *res; 2662 struct dlm_lock_resource *res;
2594 /* this is now a serious problem, possibly ENOMEM 2663 /* this is now a serious problem, possibly ENOMEM
2595 * in the network stack. must retry */ 2664 * in the network stack. must retry */
2596 mlog_errno(ret); 2665 mlog_errno(ret);
2597 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2666 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
@@ -2604,18 +2673,10 @@ retry:
2604 } else { 2673 } else {
2605 mlog(ML_ERROR, "recovery lock not found\n"); 2674 mlog(ML_ERROR, "recovery lock not found\n");
2606 } 2675 }
2607 /* sleep for a bit in hopes that we can avoid 2676 /* sleep for a bit in hopes that we can avoid
2608 * another ENOMEM */ 2677 * another ENOMEM */
2609 msleep(100); 2678 msleep(100);
2610 goto retry; 2679 goto retry;
2611 } else if (ret == EAGAIN) {
2612 mlog(0, "%s: trying to start recovery of node "
2613 "%u, but node %u is waiting for last recovery "
2614 "to complete, backoff for a bit\n", dlm->name,
2615 dead_node, nodenum);
2616 /* TODO Look into replacing msleep with cond_resched() */
2617 msleep(100);
2618 goto retry;
2619 } 2680 }
2620 } 2681 }
2621 2682
@@ -2639,7 +2700,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2639 dlm->name, br->node_idx, br->dead_node, 2700 dlm->name, br->node_idx, br->dead_node,
2640 dlm->reco.dead_node, dlm->reco.new_master); 2701 dlm->reco.dead_node, dlm->reco.new_master);
2641 spin_unlock(&dlm->spinlock); 2702 spin_unlock(&dlm->spinlock);
2642 return EAGAIN; 2703 return -EAGAIN;
2643 } 2704 }
2644 spin_unlock(&dlm->spinlock); 2705 spin_unlock(&dlm->spinlock);
2645 2706
@@ -2664,7 +2725,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2664 } 2725 }
2665 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { 2726 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
2666 mlog(ML_NOTICE, "%s: dead_node previously set to %u, " 2727 mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
2667 "node %u changing it to %u\n", dlm->name, 2728 "node %u changing it to %u\n", dlm->name,
2668 dlm->reco.dead_node, br->node_idx, br->dead_node); 2729 dlm->reco.dead_node, br->node_idx, br->dead_node);
2669 } 2730 }
2670 dlm_set_reco_master(dlm, br->node_idx); 2731 dlm_set_reco_master(dlm, br->node_idx);
@@ -2730,8 +2791,8 @@ stage2:
2730 if (ret < 0) { 2791 if (ret < 0) {
2731 mlog_errno(ret); 2792 mlog_errno(ret);
2732 if (dlm_is_host_down(ret)) { 2793 if (dlm_is_host_down(ret)) {
2733 /* this has no effect on this recovery 2794 /* this has no effect on this recovery
2734 * session, so set the status to zero to 2795 * session, so set the status to zero to
2735 * finish out the last recovery */ 2796 * finish out the last recovery */
2736 mlog(ML_ERROR, "node %u went down after this " 2797 mlog(ML_ERROR, "node %u went down after this "
2737 "node finished recovery.\n", nodenum); 2798 "node finished recovery.\n", nodenum);
@@ -2768,7 +2829,7 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2768 mlog(0, "%s: node %u finalizing recovery stage%d of " 2829 mlog(0, "%s: node %u finalizing recovery stage%d of "
2769 "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, 2830 "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
2770 fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); 2831 fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
2771 2832
2772 spin_lock(&dlm->spinlock); 2833 spin_lock(&dlm->spinlock);
2773 2834
2774 if (dlm->reco.new_master != fr->node_idx) { 2835 if (dlm->reco.new_master != fr->node_idx) {
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..11a6d1fd1d35 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 00f53b2aea76..b47c1b92b82b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -190,8 +189,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
190 actions &= ~(DLM_UNLOCK_REMOVE_LOCK| 189 actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
191 DLM_UNLOCK_REGRANT_LOCK| 190 DLM_UNLOCK_REGRANT_LOCK|
192 DLM_UNLOCK_CLEAR_CONVERT_TYPE); 191 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
193 } else if (status == DLM_RECOVERING || 192 } else if (status == DLM_RECOVERING ||
194 status == DLM_MIGRATING || 193 status == DLM_MIGRATING ||
195 status == DLM_FORWARD) { 194 status == DLM_FORWARD) {
196 /* must clear the actions because this unlock 195 /* must clear the actions because this unlock
197 * is about to be retried. cannot free or do 196 * is about to be retried. cannot free or do
@@ -661,14 +660,14 @@ retry:
661 if (call_ast) { 660 if (call_ast) {
662 mlog(0, "calling unlockast(%p, %d)\n", data, status); 661 mlog(0, "calling unlockast(%p, %d)\n", data, status);
663 if (is_master) { 662 if (is_master) {
664 /* it is possible that there is one last bast 663 /* it is possible that there is one last bast
665 * pending. make sure it is flushed, then 664 * pending. make sure it is flushed, then
666 * call the unlockast. 665 * call the unlockast.
667 * not an issue if this is mastered remotely, 666 * not an issue if this is mastered remotely,
668 * since this lock has been removed from the 667 * since this lock has been removed from the
669 * lockres queues and cannot be found. */ 668 * lockres queues and cannot be found. */
670 dlm_kick_thread(dlm, NULL); 669 dlm_kick_thread(dlm, NULL);
671 wait_event(dlm->ast_wq, 670 wait_event(dlm->ast_wq,
672 dlm_lock_basts_flushed(dlm, lock)); 671 dlm_lock_basts_flushed(dlm, lock));
673 } 672 }
674 (*unlockast)(data, status); 673 (*unlockast)(data, status);
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 000000000000..df69b4856d0d
--- /dev/null
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
1EXTRA_CFLAGS += -Ifs/ocfs2
2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4
5ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 02bf17808bdc..1b0de157a08c 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -43,24 +43,17 @@
43#include <linux/init.h> 43#include <linux/init.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/backing-dev.h> 45#include <linux/backing-dev.h>
46#include <linux/poll.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
49 50#include "stackglue.h"
50#include "cluster/nodemanager.h"
51#include "cluster/heartbeat.h"
52#include "cluster/tcp.h"
53
54#include "dlmapi.h"
55
56#include "userdlm.h" 51#include "userdlm.h"
57
58#include "dlmfsver.h" 52#include "dlmfsver.h"
59 53
60#define MLOG_MASK_PREFIX ML_DLMFS 54#define MLOG_MASK_PREFIX ML_DLMFS
61#include "cluster/masklog.h" 55#include "cluster/masklog.h"
62 56
63#include "ocfs2_lockingver.h"
64 57
65static const struct super_operations dlmfs_ops; 58static const struct super_operations dlmfs_ops;
66static const struct file_operations dlmfs_file_operations; 59static const struct file_operations dlmfs_file_operations;
@@ -71,15 +64,46 @@ static struct kmem_cache *dlmfs_inode_cache;
71 64
72struct workqueue_struct *user_dlm_worker; 65struct workqueue_struct *user_dlm_worker;
73 66
67
68
74/* 69/*
75 * This is the userdlmfs locking protocol version. 70 * These are the ABI capabilities of dlmfs.
71 *
72 * Over time, dlmfs has added some features that were not part of the
73 * initial ABI. Unfortunately, some of these features are not detectable
74 * via standard usage. For example, Linux's default poll always returns
75 * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
76 * added poll support. Instead, we provide this list of new capabilities.
77 *
78 * Capabilities is a read-only attribute. We do it as a module parameter
79 * so we can discover it whether dlmfs is built in, loaded, or even not
80 * loaded.
76 * 81 *
77 * See fs/ocfs2/dlmglue.c for more details on locking versions. 82 * The ABI features are local to this machine's dlmfs mount. This is
83 * distinct from the locking protocol, which is concerned with inter-node
84 * interaction.
85 *
86 * Capabilities:
87 * - bast : POLLIN against the file descriptor of a held lock
88 * signifies a bast fired on the lock.
78 */ 89 */
79static const struct dlm_protocol_version user_locking_protocol = { 90#define DLMFS_CAPABILITIES "bast stackglue"
80 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, 91extern int param_set_dlmfs_capabilities(const char *val,
81 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, 92 struct kernel_param *kp)
82}; 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
95 return -EINVAL;
96}
97static int param_get_dlmfs_capabilities(char *buffer,
98 struct kernel_param *kp)
99{
100 return strlcpy(buffer, DLMFS_CAPABILITIES,
101 strlen(DLMFS_CAPABILITIES) + 1);
102}
103module_param_call(capabilities, param_set_dlmfs_capabilities,
104 param_get_dlmfs_capabilities, NULL, 0444);
105MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
106
83 107
84/* 108/*
85 * decodes a set of open flags into a valid lock level and a set of flags. 109 * decodes a set of open flags into a valid lock level and a set of flags.
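The capabilities export above relies on a standard trick for read-only module parameters: module_param_call() is given a setter that always fails and a getter that emits a fixed string, so the value is visible under /sys/module/.../parameters whether or not it can ever change. A minimal self-contained sketch of the same pattern (module and parameter names here are hypothetical):

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/moduleparam.h>
	#include <linux/string.h>

	#define DEMO_CAPS "featA featB"

	/* Reject every write, making the parameter effectively immutable. */
	static int demo_set_caps(const char *val, struct kernel_param *kp)
	{
		printk(KERN_ERR "%s: read-only parameter\n", kp->name);
		return -EINVAL;
	}

	/* sysfs hands us a page-sized buffer; return the bytes we wrote. */
	static int demo_get_caps(char *buffer, struct kernel_param *kp)
	{
		return strlcpy(buffer, DEMO_CAPS, strlen(DEMO_CAPS) + 1);
	}

	module_param_call(capabilities, demo_set_caps, demo_get_caps, NULL, 0444);
	MODULE_PARM_DESC(capabilities, DEMO_CAPS);
	MODULE_LICENSE("GPL");

Userspace can then probe with cat /sys/module/<module>/parameters/capabilities and look for "bast" before relying on poll(2).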
@@ -179,13 +203,46 @@ static int dlmfs_file_release(struct inode *inode,
179 return 0; 203 return 0;
180} 204}
181 205
206/*
207 * We do ->setattr() just to override size changes. Our size is the size
208 * of the LVB and nothing else.
209 */
210static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
211{
212 int error;
213 struct inode *inode = dentry->d_inode;
214
215 attr->ia_valid &= ~ATTR_SIZE;
216 error = inode_change_ok(inode, attr);
217 if (!error)
218 error = inode_setattr(inode, attr);
219
220 return error;
221}
222
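From userspace, the effect of masking ATTR_SIZE is that truncate(2) on a lock file succeeds but changes nothing; the inode's size stays pinned to the LVB size. A hypothetical demonstration (the mount point and lock path are illustrative):

	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		struct stat st;

		if (truncate("/dlm/mydomain/mylock", 0) == 0)
			puts("truncate(2) returned success...");
		if (stat("/dlm/mydomain/mylock", &st) == 0)
			printf("...but st_size is still %lld\n",
			       (long long)st.st_size);
		return 0;
	}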
223static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
224{
225 int event = 0;
226 struct inode *inode = file->f_path.dentry->d_inode;
227 struct dlmfs_inode_private *ip = DLMFS_I(inode);
228
229 poll_wait(file, &ip->ip_lockres.l_event, wait);
230
231 spin_lock(&ip->ip_lockres.l_lock);
232 if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
233 event = POLLIN | POLLRDNORM;
234 spin_unlock(&ip->ip_lockres.l_lock);
235
236 return event;
237}
238
182static ssize_t dlmfs_file_read(struct file *filp, 239static ssize_t dlmfs_file_read(struct file *filp,
183 char __user *buf, 240 char __user *buf,
184 size_t count, 241 size_t count,
185 loff_t *ppos) 242 loff_t *ppos)
186{ 243{
187 int bytes_left; 244 int bytes_left;
188 ssize_t readlen; 245 ssize_t readlen, got;
189 char *lvb_buf; 246 char *lvb_buf;
190 struct inode *inode = filp->f_path.dentry->d_inode; 247 struct inode *inode = filp->f_path.dentry->d_inode;
191 248
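dlmfs_file_poll() above is what the new "bast" capability advertises: POLLIN on a held lock's file descriptor means another node is blocked on the lock. A hedged userspace sketch (paths are illustrative; per dlmfs convention, O_RDONLY takes a shared lock that is dropped on close):

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dlm/mydomain/mylock", O_RDONLY);
		struct pollfd pfd;

		if (fd < 0)
			return 1;
		pfd.fd = fd;
		pfd.events = POLLIN;

		/* Block until a bast fires, i.e. someone wants this lock. */
		if (poll(&pfd, 1, -1) == 1 && (pfd.revents & POLLIN))
			puts("lock is blocking another node; releasing");
		close(fd);	/* dropping the fd releases the lock */
		return 0;
	}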
@@ -211,9 +268,13 @@ static ssize_t dlmfs_file_read(struct file *filp,
211 if (!lvb_buf) 268 if (!lvb_buf)
212 return -ENOMEM; 269 return -ENOMEM;
213 270
214 user_dlm_read_lvb(inode, lvb_buf, readlen); 271 got = user_dlm_read_lvb(inode, lvb_buf, readlen);
215 bytes_left = __copy_to_user(buf, lvb_buf, readlen); 272 if (got) {
216 readlen -= bytes_left; 273 BUG_ON(got != readlen);
274 bytes_left = __copy_to_user(buf, lvb_buf, readlen);
275 readlen -= bytes_left;
276 } else
277 readlen = 0;
217 278
218 kfree(lvb_buf); 279 kfree(lvb_buf);
219 280
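With user_dlm_read_lvb() now returning the number of bytes copied, a read(2) of a lock file can legitimately return 0 when the value block is not valid (for instance after the previous EX holder died without writing it). A hypothetical caller should treat that as "no LVB", not as EOF of a once-valid value:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char lvb[64];	/* DLM_LVB_LEN is the real upper bound */
		int fd = open("/dlm/mydomain/mylock", O_RDONLY);
		ssize_t n;

		if (fd < 0)
			return 1;
		n = read(fd, lvb, sizeof(lvb));
		if (n == 0)
			fprintf(stderr, "LVB not valid on this lock\n");
		else if (n > 0)
			printf("read %zd LVB bytes\n", n);
		close(fd);
		return 0;
	}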
@@ -272,7 +333,7 @@ static void dlmfs_init_once(void *foo)
272 struct dlmfs_inode_private *ip = 333 struct dlmfs_inode_private *ip =
273 (struct dlmfs_inode_private *) foo; 334 (struct dlmfs_inode_private *) foo;
274 335
275 ip->ip_dlm = NULL; 336 ip->ip_conn = NULL;
276 ip->ip_parent = NULL; 337 ip->ip_parent = NULL;
277 338
278 inode_init_once(&ip->ip_vfs_inode); 339 inode_init_once(&ip->ip_vfs_inode);
@@ -314,14 +375,14 @@ static void dlmfs_clear_inode(struct inode *inode)
314 goto clear_fields; 375 goto clear_fields;
315 } 376 }
316 377
317 mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); 378 mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
318 /* we must be a directory. If required, lets unregister the 379 /* we must be a directory. If required, lets unregister the
319 * dlm context now. */ 380 * dlm context now. */
320 if (ip->ip_dlm) 381 if (ip->ip_conn)
321 user_dlm_unregister_context(ip->ip_dlm); 382 user_dlm_unregister(ip->ip_conn);
322clear_fields: 383clear_fields:
323 ip->ip_parent = NULL; 384 ip->ip_parent = NULL;
324 ip->ip_dlm = NULL; 385 ip->ip_conn = NULL;
325} 386}
326 387
327static struct backing_dev_info dlmfs_backing_dev_info = { 388static struct backing_dev_info dlmfs_backing_dev_info = {
@@ -371,7 +432,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
371 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 432 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
372 433
373 ip = DLMFS_I(inode); 434 ip = DLMFS_I(inode);
374 ip->ip_dlm = DLMFS_I(parent)->ip_dlm; 435 ip->ip_conn = DLMFS_I(parent)->ip_conn;
375 436
376 switch (mode & S_IFMT) { 437 switch (mode & S_IFMT) {
377 default: 438 default:
@@ -425,13 +486,12 @@ static int dlmfs_mkdir(struct inode * dir,
425 struct inode *inode = NULL; 486 struct inode *inode = NULL;
426 struct qstr *domain = &dentry->d_name; 487 struct qstr *domain = &dentry->d_name;
427 struct dlmfs_inode_private *ip; 488 struct dlmfs_inode_private *ip;
428 struct dlm_ctxt *dlm; 489 struct ocfs2_cluster_connection *conn;
429 struct dlm_protocol_version proto = user_locking_protocol;
430 490
431 mlog(0, "mkdir %.*s\n", domain->len, domain->name); 491 mlog(0, "mkdir %.*s\n", domain->len, domain->name);
432 492
433 /* verify that we have a proper domain */ 493 /* verify that we have a proper domain */
434 if (domain->len >= O2NM_MAX_NAME_LEN) { 494 if (domain->len >= GROUP_NAME_MAX) {
435 status = -EINVAL; 495 status = -EINVAL;
436 mlog(ML_ERROR, "invalid domain name for directory.\n"); 496 mlog(ML_ERROR, "invalid domain name for directory.\n");
437 goto bail; 497 goto bail;
@@ -446,14 +506,14 @@ static int dlmfs_mkdir(struct inode * dir,
446 506
447 ip = DLMFS_I(inode); 507 ip = DLMFS_I(inode);
448 508
449 dlm = user_dlm_register_context(domain, &proto); 509 conn = user_dlm_register(domain);
450 if (IS_ERR(dlm)) { 510 if (IS_ERR(conn)) {
451 status = PTR_ERR(dlm); 511 status = PTR_ERR(conn);
452 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", 512 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
453 status, domain->len, domain->name); 513 status, domain->len, domain->name);
454 goto bail; 514 goto bail;
455 } 515 }
456 ip->ip_dlm = dlm; 516 ip->ip_conn = conn;
457 517
458 inc_nlink(dir); 518 inc_nlink(dir);
459 d_instantiate(dentry, inode); 519 d_instantiate(dentry, inode);
@@ -549,6 +609,7 @@ static int dlmfs_fill_super(struct super_block * sb,
549static const struct file_operations dlmfs_file_operations = { 609static const struct file_operations dlmfs_file_operations = {
550 .open = dlmfs_file_open, 610 .open = dlmfs_file_open,
551 .release = dlmfs_file_release, 611 .release = dlmfs_file_release,
612 .poll = dlmfs_file_poll,
552 .read = dlmfs_file_read, 613 .read = dlmfs_file_read,
553 .write = dlmfs_file_write, 614 .write = dlmfs_file_write,
554}; 615};
@@ -576,6 +637,7 @@ static const struct super_operations dlmfs_ops = {
576 637
577static const struct inode_operations dlmfs_file_inode_operations = { 638static const struct inode_operations dlmfs_file_inode_operations = {
578 .getattr = simple_getattr, 639 .getattr = simple_getattr,
640 .setattr = dlmfs_file_setattr,
579}; 641};
580 642
581static int dlmfs_get_sb(struct file_system_type *fs_type, 643static int dlmfs_get_sb(struct file_system_type *fs_type,
@@ -620,6 +682,7 @@ static int __init init_dlmfs_fs(void)
620 } 682 }
621 cleanup_worker = 1; 683 cleanup_worker = 1;
622 684
685 user_dlm_set_locking_protocol();
623 status = register_filesystem(&dlmfs_fs_type); 686 status = register_filesystem(&dlmfs_fs_type);
624bail: 687bail:
625 if (status) { 688 if (status) {
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
index a733b3321f83..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlmfs/dlmfsver.c
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
index f35eadbed25c..f35eadbed25c 100644
--- a/fs/ocfs2/dlm/dlmfsver.h
+++ b/fs/ocfs2/dlmfs/dlmfsver.h
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 4cb1d3dae250..0499e3fb7bdb 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -34,18 +34,19 @@
34#include <linux/types.h> 34#include <linux/types.h>
35#include <linux/crc32.h> 35#include <linux/crc32.h>
36 36
37 37#include "ocfs2_lockingver.h"
38#include "cluster/nodemanager.h" 38#include "stackglue.h"
39#include "cluster/heartbeat.h"
40#include "cluster/tcp.h"
41
42#include "dlmapi.h"
43
44#include "userdlm.h" 39#include "userdlm.h"
45 40
46#define MLOG_MASK_PREFIX ML_DLMFS 41#define MLOG_MASK_PREFIX ML_DLMFS
47#include "cluster/masklog.h" 42#include "cluster/masklog.h"
48 43
44
45static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
46{
47 return container_of(lksb, struct user_lock_res, l_lksb);
48}
49
49static inline int user_check_wait_flag(struct user_lock_res *lockres, 50static inline int user_check_wait_flag(struct user_lock_res *lockres,
50 int flag) 51 int flag)
51{ 52{
@@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
73} 74}
74 75
75/* I heart container_of... */ 76/* I heart container_of... */
76static inline struct dlm_ctxt * 77static inline struct ocfs2_cluster_connection *
77dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) 78cluster_connection_from_user_lockres(struct user_lock_res *lockres)
78{ 79{
79 struct dlmfs_inode_private *ip; 80 struct dlmfs_inode_private *ip;
80 81
81 ip = container_of(lockres, 82 ip = container_of(lockres,
82 struct dlmfs_inode_private, 83 struct dlmfs_inode_private,
83 ip_lockres); 84 ip_lockres);
84 return ip->ip_dlm; 85 return ip->ip_conn;
85} 86}
86 87
87static struct inode * 88static struct inode *
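The two container_of() helpers above are the heart of this conversion: the stackglue callbacks no longer carry an opaque astarg, so the lksb must be embedded by value in the structure that owns it, and the callback climbs back out. A self-contained sketch with stand-in types (the real ones are struct ocfs2_dlm_lksb and struct user_lock_res):

	#include <linux/kernel.h>

	struct demo_lksb {
		int status;
	};

	struct demo_lock_res {
		const char *name;
		struct demo_lksb lksb;	/* embedded by value, never a pointer */
	};

	static inline struct demo_lock_res *demo_lksb_to_res(struct demo_lksb *lksb)
	{
		return container_of(lksb, struct demo_lock_res, lksb);
	}

	/* Shape of a post-conversion AST: only the lksb comes in. */
	static void demo_lock_ast(struct demo_lksb *lksb)
	{
		struct demo_lock_res *res = demo_lksb_to_res(lksb);

		pr_info("AST for %s, status %d\n", res->name, res->lksb.status);
	}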
@@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
103} 104}
104 105
105#define user_log_dlm_error(_func, _stat, _lockres) do { \ 106#define user_log_dlm_error(_func, _stat, _lockres) do { \
106 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ 107 mlog(ML_ERROR, "Dlm error %d while calling %s on " \
107 "resource %.*s: %s\n", dlm_errname(_stat), _func, \ 108 "resource %.*s\n", _stat, _func, \
108 _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \ 109 _lockres->l_namelen, _lockres->l_name); \
109} while (0) 110} while (0)
110 111
111/* WARNING: This function lives in a world where the only three lock 112/* WARNING: This function lives in a world where the only three lock
@@ -113,34 +114,35 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
113 * lock types are added. */ 114 * lock types are added. */
114static inline int user_highest_compat_lock_level(int level) 115static inline int user_highest_compat_lock_level(int level)
115{ 116{
116 int new_level = LKM_EXMODE; 117 int new_level = DLM_LOCK_EX;
117 118
118 if (level == LKM_EXMODE) 119 if (level == DLM_LOCK_EX)
119 new_level = LKM_NLMODE; 120 new_level = DLM_LOCK_NL;
120 else if (level == LKM_PRMODE) 121 else if (level == DLM_LOCK_PR)
121 new_level = LKM_PRMODE; 122 new_level = DLM_LOCK_PR;
122 return new_level; 123 return new_level;
123} 124}
124 125
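user_highest_compat_lock_level() encodes one column of the classic DLM compatibility matrix, restricted to the three modes dlmfs uses: NL conflicts with nothing, PR is shared among readers, EX excludes everything. The same table as a symmetric check, a sketch using the fs/dlm mode constants:

	#include <linux/dlmconstants.h>

	static int demo_modes_compatible(int a, int b)
	{
		if (a == DLM_LOCK_NL || b == DLM_LOCK_NL)
			return 1;
		return a == DLM_LOCK_PR && b == DLM_LOCK_PR;
	}

Given a blocking mode, the helper above simply picks the highest of NL/PR/EX that demo_modes_compatible() would accept against it.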
125static void user_ast(void *opaque) 126static void user_ast(struct ocfs2_dlm_lksb *lksb)
126{ 127{
127 struct user_lock_res *lockres = opaque; 128 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
128 struct dlm_lockstatus *lksb; 129 int status;
129 130
130 mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, 131 mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
131 lockres->l_name); 132 lockres->l_namelen, lockres->l_name, lockres->l_level,
133 lockres->l_requested);
132 134
133 spin_lock(&lockres->l_lock); 135 spin_lock(&lockres->l_lock);
134 136
135 lksb = &(lockres->l_lksb); 137 status = ocfs2_dlm_lock_status(&lockres->l_lksb);
136 if (lksb->status != DLM_NORMAL) { 138 if (status) {
137 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", 139 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
138 lksb->status, lockres->l_namelen, lockres->l_name); 140 status, lockres->l_namelen, lockres->l_name);
139 spin_unlock(&lockres->l_lock); 141 spin_unlock(&lockres->l_lock);
140 return; 142 return;
141 } 143 }
142 144
143 mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, 145 mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
144 "Lockres %.*s, requested ivmode. flags 0x%x\n", 146 "Lockres %.*s, requested ivmode. flags 0x%x\n",
145 lockres->l_namelen, lockres->l_name, lockres->l_flags); 147 lockres->l_namelen, lockres->l_name, lockres->l_flags);
146 148
@@ -148,13 +150,13 @@ static void user_ast(void *opaque)
148 if (lockres->l_requested < lockres->l_level) { 150 if (lockres->l_requested < lockres->l_level) {
149 if (lockres->l_requested <= 151 if (lockres->l_requested <=
150 user_highest_compat_lock_level(lockres->l_blocking)) { 152 user_highest_compat_lock_level(lockres->l_blocking)) {
151 lockres->l_blocking = LKM_NLMODE; 153 lockres->l_blocking = DLM_LOCK_NL;
152 lockres->l_flags &= ~USER_LOCK_BLOCKED; 154 lockres->l_flags &= ~USER_LOCK_BLOCKED;
153 } 155 }
154 } 156 }
155 157
156 lockres->l_level = lockres->l_requested; 158 lockres->l_level = lockres->l_requested;
157 lockres->l_requested = LKM_IVMODE; 159 lockres->l_requested = DLM_LOCK_IV;
158 lockres->l_flags |= USER_LOCK_ATTACHED; 160 lockres->l_flags |= USER_LOCK_ATTACHED;
159 lockres->l_flags &= ~USER_LOCK_BUSY; 161 lockres->l_flags &= ~USER_LOCK_BUSY;
160 162
@@ -193,11 +195,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
193 return; 195 return;
194 196
195 switch (lockres->l_blocking) { 197 switch (lockres->l_blocking) {
196 case LKM_EXMODE: 198 case DLM_LOCK_EX:
197 if (!lockres->l_ex_holders && !lockres->l_ro_holders) 199 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
198 queue = 1; 200 queue = 1;
199 break; 201 break;
200 case LKM_PRMODE: 202 case DLM_LOCK_PR:
201 if (!lockres->l_ex_holders) 203 if (!lockres->l_ex_holders)
202 queue = 1; 204 queue = 1;
203 break; 205 break;
@@ -209,12 +211,12 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
209 __user_dlm_queue_lockres(lockres); 211 __user_dlm_queue_lockres(lockres);
210} 212}
211 213
212static void user_bast(void *opaque, int level) 214static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
213{ 215{
214 struct user_lock_res *lockres = opaque; 216 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
215 217
216 mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", 218 mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
217 lockres->l_namelen, lockres->l_name, level); 219 lockres->l_namelen, lockres->l_name, level, lockres->l_level);
218 220
219 spin_lock(&lockres->l_lock); 221 spin_lock(&lockres->l_lock);
220 lockres->l_flags |= USER_LOCK_BLOCKED; 222 lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -227,15 +229,15 @@ static void user_bast(void *opaque, int level)
227 wake_up(&lockres->l_event); 229 wake_up(&lockres->l_event);
228} 230}
229 231
230static void user_unlock_ast(void *opaque, enum dlm_status status) 232static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
231{ 233{
232 struct user_lock_res *lockres = opaque; 234 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
233 235
234 mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, 236 mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
235 lockres->l_name); 237 lockres->l_namelen, lockres->l_name, lockres->l_flags);
236 238
237 if (status != DLM_NORMAL && status != DLM_CANCELGRANT) 239 if (status)
238 mlog(ML_ERROR, "Dlm returns status %d\n", status); 240 mlog(ML_ERROR, "dlm returns status %d\n", status);
239 241
240 spin_lock(&lockres->l_lock); 242 spin_lock(&lockres->l_lock);
241 /* The teardown flag gets set early during the unlock process, 243 /* The teardown flag gets set early during the unlock process,
@@ -243,7 +245,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
243 * for a concurrent cancel. */ 245 * for a concurrent cancel. */
244 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN 246 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
245 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { 247 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
246 lockres->l_level = LKM_IVMODE; 248 lockres->l_level = DLM_LOCK_IV;
247 } else if (status == DLM_CANCELGRANT) { 249 } else if (status == DLM_CANCELGRANT) {
248 /* We tried to cancel a convert request, but it was 250 /* We tried to cancel a convert request, but it was
249 * already granted. Don't clear the busy flag - the 251 * already granted. Don't clear the busy flag - the
@@ -254,7 +256,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
254 } else { 256 } else {
255 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); 257 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
256 /* Cancel succeeded, we want to re-queue */ 258 /* Cancel succeeded, we want to re-queue */
257 lockres->l_requested = LKM_IVMODE; /* cancel an 259 lockres->l_requested = DLM_LOCK_IV; /* cancel an
258 * upconvert 260 * upconvert
259 * request. */ 261 * request. */
260 lockres->l_flags &= ~USER_LOCK_IN_CANCEL; 262 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
@@ -271,6 +273,21 @@ out_noclear:
271 wake_up(&lockres->l_event); 273 wake_up(&lockres->l_event);
272} 274}
273 275
276/*
277 * This is the userdlmfs locking protocol version.
278 *
279 * See fs/ocfs2/dlmglue.c for more details on locking versions.
280 */
281static struct ocfs2_locking_protocol user_dlm_lproto = {
282 .lp_max_version = {
283 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
284 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
285 },
286 .lp_lock_ast = user_ast,
287 .lp_blocking_ast = user_bast,
288 .lp_unlock_ast = user_unlock_ast,
289};
290
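This struct is effectively a vtable the cluster stack calls back through; dlmfs supplies its three handlers and the stack plugin decides when they fire. A stand-in sketch of the dispatch shape (types simplified and names illustrative; the real plumbing lives in stackglue):

	struct demo_lksb {
		int status;
	};

	struct demo_locking_protocol {
		void (*lp_lock_ast)(struct demo_lksb *lksb);
		void (*lp_blocking_ast)(struct demo_lksb *lksb, int level);
		void (*lp_unlock_ast)(struct demo_lksb *lksb, int error);
	};

	/* What a stack plugin does, in effect, when a grant completes. */
	static void demo_grant_complete(struct demo_locking_protocol *proto,
					struct demo_lksb *lksb)
	{
		lksb->status = 0;
		proto->lp_lock_ast(lksb);
	}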
274static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) 291static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
275{ 292{
276 struct inode *inode; 293 struct inode *inode;
@@ -283,10 +300,10 @@ static void user_dlm_unblock_lock(struct work_struct *work)
283 int new_level, status; 300 int new_level, status;
284 struct user_lock_res *lockres = 301 struct user_lock_res *lockres =
285 container_of(work, struct user_lock_res, l_work); 302 container_of(work, struct user_lock_res, l_work);
286 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 303 struct ocfs2_cluster_connection *conn =
304 cluster_connection_from_user_lockres(lockres);
287 305
288 mlog(0, "processing lockres %.*s\n", lockres->l_namelen, 306 mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
289 lockres->l_name);
290 307
291 spin_lock(&lockres->l_lock); 308 spin_lock(&lockres->l_lock);
292 309
@@ -304,17 +321,23 @@ static void user_dlm_unblock_lock(struct work_struct *work)
304 * flag, and finally we might get another bast which re-queues 321 * flag, and finally we might get another bast which re-queues
305 * us before our ast for the downconvert is called. */ 322 * us before our ast for the downconvert is called. */
306 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { 323 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
324 mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
325 lockres->l_namelen, lockres->l_name);
307 spin_unlock(&lockres->l_lock); 326 spin_unlock(&lockres->l_lock);
308 goto drop_ref; 327 goto drop_ref;
309 } 328 }
310 329
311 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 330 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
331 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
332 lockres->l_namelen, lockres->l_name);
312 spin_unlock(&lockres->l_lock); 333 spin_unlock(&lockres->l_lock);
313 goto drop_ref; 334 goto drop_ref;
314 } 335 }
315 336
316 if (lockres->l_flags & USER_LOCK_BUSY) { 337 if (lockres->l_flags & USER_LOCK_BUSY) {
317 if (lockres->l_flags & USER_LOCK_IN_CANCEL) { 338 if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
339 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
340 lockres->l_namelen, lockres->l_name);
318 spin_unlock(&lockres->l_lock); 341 spin_unlock(&lockres->l_lock);
319 goto drop_ref; 342 goto drop_ref;
320 } 343 }
@@ -322,32 +345,31 @@ static void user_dlm_unblock_lock(struct work_struct *work)
322 lockres->l_flags |= USER_LOCK_IN_CANCEL; 345 lockres->l_flags |= USER_LOCK_IN_CANCEL;
323 spin_unlock(&lockres->l_lock); 346 spin_unlock(&lockres->l_lock);
324 347
325 status = dlmunlock(dlm, 348 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
326 &lockres->l_lksb, 349 DLM_LKF_CANCEL);
327 LKM_CANCEL, 350 if (status)
328 user_unlock_ast, 351 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
329 lockres);
330 if (status != DLM_NORMAL)
331 user_log_dlm_error("dlmunlock", status, lockres);
332 goto drop_ref; 352 goto drop_ref;
333 } 353 }
334 354
335 /* If there are still incompat holders, we can exit safely 355 /* If there are still incompat holders, we can exit safely
336 * without worrying about re-queueing this lock as that will 356 * without worrying about re-queueing this lock as that will
337 * happen on the last call to user_cluster_unlock. */ 357 * happen on the last call to user_cluster_unlock. */
338 if ((lockres->l_blocking == LKM_EXMODE) 358 if ((lockres->l_blocking == DLM_LOCK_EX)
339 && (lockres->l_ex_holders || lockres->l_ro_holders)) { 359 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
340 spin_unlock(&lockres->l_lock); 360 spin_unlock(&lockres->l_lock);
341 mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", 361 mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
342 lockres->l_ro_holders, lockres->l_ex_holders); 362 lockres->l_namelen, lockres->l_name,
363 lockres->l_ex_holders, lockres->l_ro_holders);
343 goto drop_ref; 364 goto drop_ref;
344 } 365 }
345 366
346 if ((lockres->l_blocking == LKM_PRMODE) 367 if ((lockres->l_blocking == DLM_LOCK_PR)
347 && lockres->l_ex_holders) { 368 && lockres->l_ex_holders) {
348 spin_unlock(&lockres->l_lock); 369 spin_unlock(&lockres->l_lock);
349 mlog(0, "can't downconvert for pr: ex = %u\n", 370 mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
350 lockres->l_ex_holders); 371 lockres->l_namelen, lockres->l_name,
372 lockres->l_ex_holders);
351 goto drop_ref; 373 goto drop_ref;
352 } 374 }
353 375
@@ -355,22 +377,17 @@ static void user_dlm_unblock_lock(struct work_struct *work)
355 new_level = user_highest_compat_lock_level(lockres->l_blocking); 377 new_level = user_highest_compat_lock_level(lockres->l_blocking);
356 lockres->l_requested = new_level; 378 lockres->l_requested = new_level;
357 lockres->l_flags |= USER_LOCK_BUSY; 379 lockres->l_flags |= USER_LOCK_BUSY;
358 mlog(0, "Downconvert lock from %d to %d\n", 380 mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
359 lockres->l_level, new_level); 381 lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
360 spin_unlock(&lockres->l_lock); 382 spin_unlock(&lockres->l_lock);
361 383
362 /* need lock downconvert request now... */ 384 /* need lock downconvert request now... */
363 status = dlmlock(dlm, 385 status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
364 new_level, 386 DLM_LKF_CONVERT|DLM_LKF_VALBLK,
365 &lockres->l_lksb, 387 lockres->l_name,
366 LKM_CONVERT|LKM_VALBLK, 388 lockres->l_namelen);
367 lockres->l_name, 389 if (status) {
368 lockres->l_namelen, 390 user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
369 user_ast,
370 lockres,
371 user_bast);
372 if (status != DLM_NORMAL) {
373 user_log_dlm_error("dlmlock", status, lockres);
374 user_recover_from_dlm_error(lockres); 391 user_recover_from_dlm_error(lockres);
375 } 392 }
376 393
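The old o2dlm LKM_* flags correspond directly to the fs/dlm-style DLM_LKF_* constants that stackglue expects, which is why the conversion above is largely mechanical. A trivial sketch using the real constants from <linux/dlmconstants.h>:

	#include <linux/dlmconstants.h>

	/* A few of the mappings this patch performs:
	 *   LKM_CONVERT -> DLM_LKF_CONVERT   (modify an existing lock)
	 *   LKM_VALBLK  -> DLM_LKF_VALBLK    (read/write the LVB)
	 *   LKM_CANCEL  -> DLM_LKF_CANCEL    (cancel an in-flight convert)
	 *   LKM_NOQUEUE -> DLM_LKF_NOQUEUE   (fail with -EAGAIN, don't wait)
	 */
	static inline unsigned int demo_downconvert_flags(void)
	{
		/* Convert an existing lock and ship its value block along. */
		return DLM_LKF_CONVERT | DLM_LKF_VALBLK;
	}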
@@ -382,10 +399,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
382 int level) 399 int level)
383{ 400{
384 switch(level) { 401 switch(level) {
385 case LKM_EXMODE: 402 case DLM_LOCK_EX:
386 lockres->l_ex_holders++; 403 lockres->l_ex_holders++;
387 break; 404 break;
388 case LKM_PRMODE: 405 case DLM_LOCK_PR:
389 lockres->l_ro_holders++; 406 lockres->l_ro_holders++;
390 break; 407 break;
391 default: 408 default:
@@ -410,20 +427,19 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
410 int lkm_flags) 427 int lkm_flags)
411{ 428{
412 int status, local_flags; 429 int status, local_flags;
413 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 430 struct ocfs2_cluster_connection *conn =
431 cluster_connection_from_user_lockres(lockres);
414 432
415 if (level != LKM_EXMODE && 433 if (level != DLM_LOCK_EX &&
416 level != LKM_PRMODE) { 434 level != DLM_LOCK_PR) {
417 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 435 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
418 lockres->l_namelen, lockres->l_name); 436 lockres->l_namelen, lockres->l_name);
419 status = -EINVAL; 437 status = -EINVAL;
420 goto bail; 438 goto bail;
421 } 439 }
422 440
423 mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", 441 mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
424 lockres->l_namelen, lockres->l_name, 442 lockres->l_namelen, lockres->l_name, level, lkm_flags);
425 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
426 lkm_flags);
427 443
428again: 444again:
429 if (signal_pending(current)) { 445 if (signal_pending(current)) {
@@ -457,35 +473,26 @@ again:
457 } 473 }
458 474
459 if (level > lockres->l_level) { 475 if (level > lockres->l_level) {
460 local_flags = lkm_flags | LKM_VALBLK; 476 local_flags = lkm_flags | DLM_LKF_VALBLK;
461 if (lockres->l_level != LKM_IVMODE) 477 if (lockres->l_level != DLM_LOCK_IV)
462 local_flags |= LKM_CONVERT; 478 local_flags |= DLM_LKF_CONVERT;
463 479
464 lockres->l_requested = level; 480 lockres->l_requested = level;
465 lockres->l_flags |= USER_LOCK_BUSY; 481 lockres->l_flags |= USER_LOCK_BUSY;
466 spin_unlock(&lockres->l_lock); 482 spin_unlock(&lockres->l_lock);
467 483
468 BUG_ON(level == LKM_IVMODE); 484 BUG_ON(level == DLM_LOCK_IV);
469 BUG_ON(level == LKM_NLMODE); 485 BUG_ON(level == DLM_LOCK_NL);
470 486
471 /* call dlm_lock to upgrade lock now */ 487 /* call dlm_lock to upgrade lock now */
472 status = dlmlock(dlm, 488 status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
473 level, 489 local_flags, lockres->l_name,
474 &lockres->l_lksb, 490 lockres->l_namelen);
475 local_flags, 491 if (status) {
476 lockres->l_name, 492 if ((lkm_flags & DLM_LKF_NOQUEUE) &&
477 lockres->l_namelen, 493 (status != -EAGAIN))
478 user_ast, 494 user_log_dlm_error("ocfs2_dlm_lock",
479 lockres, 495 status, lockres);
480 user_bast);
481 if (status != DLM_NORMAL) {
482 if ((lkm_flags & LKM_NOQUEUE) &&
483 (status == DLM_NOTQUEUED))
484 status = -EAGAIN;
485 else {
486 user_log_dlm_error("dlmlock", status, lockres);
487 status = -EINVAL;
488 }
489 user_recover_from_dlm_error(lockres); 496 user_recover_from_dlm_error(lockres);
490 goto bail; 497 goto bail;
491 } 498 }
@@ -506,11 +513,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
506 int level) 513 int level)
507{ 514{
508 switch(level) { 515 switch(level) {
509 case LKM_EXMODE: 516 case DLM_LOCK_EX:
510 BUG_ON(!lockres->l_ex_holders); 517 BUG_ON(!lockres->l_ex_holders);
511 lockres->l_ex_holders--; 518 lockres->l_ex_holders--;
512 break; 519 break;
513 case LKM_PRMODE: 520 case DLM_LOCK_PR:
514 BUG_ON(!lockres->l_ro_holders); 521 BUG_ON(!lockres->l_ro_holders);
515 lockres->l_ro_holders--; 522 lockres->l_ro_holders--;
516 break; 523 break;
@@ -522,8 +529,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
522void user_dlm_cluster_unlock(struct user_lock_res *lockres, 529void user_dlm_cluster_unlock(struct user_lock_res *lockres,
523 int level) 530 int level)
524{ 531{
525 if (level != LKM_EXMODE && 532 if (level != DLM_LOCK_EX &&
526 level != LKM_PRMODE) { 533 level != DLM_LOCK_PR) {
527 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 534 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
528 lockres->l_namelen, lockres->l_name); 535 lockres->l_namelen, lockres->l_name);
529 return; 536 return;
@@ -540,33 +547,40 @@ void user_dlm_write_lvb(struct inode *inode,
540 unsigned int len) 547 unsigned int len)
541{ 548{
542 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 549 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
543 char *lvb = lockres->l_lksb.lvb; 550 char *lvb;
544 551
545 BUG_ON(len > DLM_LVB_LEN); 552 BUG_ON(len > DLM_LVB_LEN);
546 553
547 spin_lock(&lockres->l_lock); 554 spin_lock(&lockres->l_lock);
548 555
549 BUG_ON(lockres->l_level < LKM_EXMODE); 556 BUG_ON(lockres->l_level < DLM_LOCK_EX);
557 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
550 memcpy(lvb, val, len); 558 memcpy(lvb, val, len);
551 559
552 spin_unlock(&lockres->l_lock); 560 spin_unlock(&lockres->l_lock);
553} 561}
554 562
555void user_dlm_read_lvb(struct inode *inode, 563ssize_t user_dlm_read_lvb(struct inode *inode,
556 char *val, 564 char *val,
557 unsigned int len) 565 unsigned int len)
558{ 566{
559 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 567 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
560 char *lvb = lockres->l_lksb.lvb; 568 char *lvb;
569 ssize_t ret = len;
561 570
562 BUG_ON(len > DLM_LVB_LEN); 571 BUG_ON(len > DLM_LVB_LEN);
563 572
564 spin_lock(&lockres->l_lock); 573 spin_lock(&lockres->l_lock);
565 574
566 BUG_ON(lockres->l_level < LKM_PRMODE); 575 BUG_ON(lockres->l_level < DLM_LOCK_PR);
567 memcpy(val, lvb, len); 576 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
577 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
578 memcpy(val, lvb, len);
579 } else
580 ret = 0;
568 581
569 spin_unlock(&lockres->l_lock); 582 spin_unlock(&lockres->l_lock);
583 return ret;
570} 584}
571 585
572void user_dlm_lock_res_init(struct user_lock_res *lockres, 586void user_dlm_lock_res_init(struct user_lock_res *lockres,
@@ -576,9 +590,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
576 590
577 spin_lock_init(&lockres->l_lock); 591 spin_lock_init(&lockres->l_lock);
578 init_waitqueue_head(&lockres->l_event); 592 init_waitqueue_head(&lockres->l_event);
579 lockres->l_level = LKM_IVMODE; 593 lockres->l_level = DLM_LOCK_IV;
580 lockres->l_requested = LKM_IVMODE; 594 lockres->l_requested = DLM_LOCK_IV;
581 lockres->l_blocking = LKM_IVMODE; 595 lockres->l_blocking = DLM_LOCK_IV;
582 596
583 /* should have been checked before getting here. */ 597 /* should have been checked before getting here. */
584 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); 598 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
@@ -592,9 +606,10 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
592int user_dlm_destroy_lock(struct user_lock_res *lockres) 606int user_dlm_destroy_lock(struct user_lock_res *lockres)
593{ 607{
594 int status = -EBUSY; 608 int status = -EBUSY;
595 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 609 struct ocfs2_cluster_connection *conn =
610 cluster_connection_from_user_lockres(lockres);
596 611
597 mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); 612 mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
598 613
599 spin_lock(&lockres->l_lock); 614 spin_lock(&lockres->l_lock);
600 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 615 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
@@ -627,14 +642,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
627 lockres->l_flags |= USER_LOCK_BUSY; 642 lockres->l_flags |= USER_LOCK_BUSY;
628 spin_unlock(&lockres->l_lock); 643 spin_unlock(&lockres->l_lock);
629 644
630 status = dlmunlock(dlm, 645 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
631 &lockres->l_lksb, 646 if (status) {
632 LKM_VALBLK, 647 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
633 user_unlock_ast,
634 lockres);
635 if (status != DLM_NORMAL) {
636 user_log_dlm_error("dlmunlock", status, lockres);
637 status = -EINVAL;
638 goto bail; 648 goto bail;
639 } 649 }
640 650
@@ -645,32 +655,34 @@ bail:
645 return status; 655 return status;
646} 656}
647 657
648struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 658static void user_dlm_recovery_handler_noop(int node_num,
649 struct dlm_protocol_version *proto) 659 void *recovery_data)
650{ 660{
651 struct dlm_ctxt *dlm; 661 /* We ignore recovery events */
652 u32 dlm_key; 662 return;
653 char *domain; 663}
654
655 domain = kmalloc(name->len + 1, GFP_NOFS);
656 if (!domain) {
657 mlog_errno(-ENOMEM);
658 return ERR_PTR(-ENOMEM);
659 }
660 664
661 dlm_key = crc32_le(0, name->name, name->len); 665void user_dlm_set_locking_protocol(void)
666{
667 ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
668}
662 669
663 snprintf(domain, name->len + 1, "%.*s", name->len, name->name); 670struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
671{
672 int rc;
673 struct ocfs2_cluster_connection *conn;
664 674
665 dlm = dlm_register_domain(domain, dlm_key, proto); 675 rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
666 if (IS_ERR(dlm)) 676 &user_dlm_lproto,
667 mlog_errno(PTR_ERR(dlm)); 677 user_dlm_recovery_handler_noop,
678 NULL, &conn);
679 if (rc)
680 mlog_errno(rc);
668 681
669 kfree(domain); 682 return rc ? ERR_PTR(rc) : conn;
670 return dlm;
671} 683}
672 684
673void user_dlm_unregister_context(struct dlm_ctxt *dlm) 685void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
674{ 686{
675 dlm_unregister_domain(dlm); 687 ocfs2_cluster_disconnect(conn, 0);
676} 688}
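user_dlm_register() folds an integer errno into its pointer return with the standard ERR_PTR()/IS_ERR()/PTR_ERR() trio, which is why dlmfs_mkdir() earlier in the patch can test IS_ERR(conn). A minimal sketch of the convention (demo_connect is a hypothetical stand-in for the real connect call):

	#include <linux/err.h>
	#include <linux/errno.h>

	struct demo_conn {
		int id;
	};

	static struct demo_conn demo_the_conn = { .id = 1 };

	static struct demo_conn *demo_connect(int make_it_fail)
	{
		if (make_it_fail)
			return ERR_PTR(-ENOTCONN);	/* errno hides in the pointer */
		return &demo_the_conn;
	}

	static int demo_caller(void)
	{
		struct demo_conn *conn = demo_connect(0);

		if (IS_ERR(conn))
			return PTR_ERR(conn);		/* recover the errno */
		return conn->id;
	}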
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0c3cc03c61fa..3b42d79531d7 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -57,7 +57,7 @@ struct user_lock_res {
57 int l_level; 57 int l_level;
58 unsigned int l_ro_holders; 58 unsigned int l_ro_holders;
59 unsigned int l_ex_holders; 59 unsigned int l_ex_holders;
60 struct dlm_lockstatus l_lksb; 60 struct ocfs2_dlm_lksb l_lksb;
61 61
62 int l_requested; 62 int l_requested;
63 int l_blocking; 63 int l_blocking;
@@ -80,15 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
80void user_dlm_write_lvb(struct inode *inode, 80void user_dlm_write_lvb(struct inode *inode,
81 const char *val, 81 const char *val,
82 unsigned int len); 82 unsigned int len);
83void user_dlm_read_lvb(struct inode *inode, 83ssize_t user_dlm_read_lvb(struct inode *inode,
84 char *val, 84 char *val,
85 unsigned int len); 85 unsigned int len);
86struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 86struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
87 struct dlm_protocol_version *proto); 87void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
88void user_dlm_unregister_context(struct dlm_ctxt *dlm); 88void user_dlm_set_locking_protocol(void);
89 89
90struct dlmfs_inode_private { 90struct dlmfs_inode_private {
91 struct dlm_ctxt *ip_dlm; 91 struct ocfs2_cluster_connection *ip_conn;
92 92
93 struct user_lock_res ip_lockres; /* unused for directories. */ 93 struct user_lock_res ip_lockres; /* unused for directories. */
94 struct inode *ip_parent; 94 struct inode *ip_parent;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0d38d67194cb..50c4ee805da4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -297,6 +297,11 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
298} 298}
299 299
300static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
301{
302 return container_of(lksb, struct ocfs2_lock_res, l_lksb);
303}
304
300static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 305static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
301{ 306{
302 BUG_ON(!ocfs2_is_inode_lock(lockres)); 307 BUG_ON(!ocfs2_is_inode_lock(lockres));
@@ -875,6 +880,14 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
875 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 880 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
876 881
877 lockres->l_level = lockres->l_requested; 882 lockres->l_level = lockres->l_requested;
883
884 /*
885 * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
886 * the OCFS2_LOCK_BUSY flag to prevent the dc thread from
887 * downconverting the lock before the upconvert has fully completed.
888 */
889 lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
890
878 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 891 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
879 892
880 mlog_exit_void(); 893 mlog_exit_void();
@@ -907,8 +920,6 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
907 920
908 assert_spin_locked(&lockres->l_lock); 921 assert_spin_locked(&lockres->l_lock);
909 922
910 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
911
912 if (level > lockres->l_blocking) { 923 if (level > lockres->l_blocking) {
913 /* only schedule a downconvert if we haven't already scheduled 924 /* only schedule a downconvert if we haven't already scheduled
914 * one that goes low enough to satisfy the level we're 925 * one that goes low enough to satisfy the level we're
@@ -921,6 +932,13 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
921 lockres->l_blocking = level; 932 lockres->l_blocking = level;
922 } 933 }
923 934
935 mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
936 lockres->l_name, level, lockres->l_level, lockres->l_blocking,
937 needs_downconvert);
938
939 if (needs_downconvert)
940 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
941
924 mlog_exit(needs_downconvert); 942 mlog_exit(needs_downconvert);
925 return needs_downconvert; 943 return needs_downconvert;
926} 944}
@@ -1031,18 +1049,17 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
1031 return lockres->l_pending_gen; 1049 return lockres->l_pending_gen;
1032} 1050}
1033 1051
1034 1052static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
1035static void ocfs2_blocking_ast(void *opaque, int level)
1036{ 1053{
1037 struct ocfs2_lock_res *lockres = opaque; 1054 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1038 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1055 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1039 int needs_downconvert; 1056 int needs_downconvert;
1040 unsigned long flags; 1057 unsigned long flags;
1041 1058
1042 BUG_ON(level <= DLM_LOCK_NL); 1059 BUG_ON(level <= DLM_LOCK_NL);
1043 1060
1044 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", 1061 mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
1045 lockres->l_name, level, lockres->l_level, 1062 "type %s\n", lockres->l_name, level, lockres->l_level,
1046 ocfs2_lock_type_string(lockres->l_type)); 1063 ocfs2_lock_type_string(lockres->l_type));
1047 1064
1048 /* 1065 /*
@@ -1063,9 +1080,9 @@ static void ocfs2_blocking_ast(void *opaque, int level)
1063 ocfs2_wake_downconvert_thread(osb); 1080 ocfs2_wake_downconvert_thread(osb);
1064} 1081}
1065 1082
1066static void ocfs2_locking_ast(void *opaque) 1083static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
1067{ 1084{
1068 struct ocfs2_lock_res *lockres = opaque; 1085 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1069 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1086 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1070 unsigned long flags; 1087 unsigned long flags;
1071 int status; 1088 int status;
@@ -1086,6 +1103,10 @@ static void ocfs2_locking_ast(void *opaque)
1086 return; 1103 return;
1087 } 1104 }
1088 1105
1106 mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
1107 "level %d => %d\n", lockres->l_name, lockres->l_action,
1108 lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
1109
1089 switch(lockres->l_action) { 1110 switch(lockres->l_action) {
1090 case OCFS2_AST_ATTACH: 1111 case OCFS2_AST_ATTACH:
1091 ocfs2_generic_handle_attach_action(lockres); 1112 ocfs2_generic_handle_attach_action(lockres);
@@ -1098,8 +1119,8 @@ static void ocfs2_locking_ast(void *opaque)
1098 ocfs2_generic_handle_downconvert_action(lockres); 1119 ocfs2_generic_handle_downconvert_action(lockres);
1099 break; 1120 break;
1100 default: 1121 default:
1101 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " 1122 mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
1102 "lockres flags = 0x%lx, unlock action: %u\n", 1123 "flags 0x%lx, unlock: %u\n",
1103 lockres->l_name, lockres->l_action, lockres->l_flags, 1124 lockres->l_name, lockres->l_action, lockres->l_flags,
1104 lockres->l_unlock_action); 1125 lockres->l_unlock_action);
1105 BUG(); 1126 BUG();
@@ -1125,6 +1146,88 @@ out:
1125 spin_unlock_irqrestore(&lockres->l_lock, flags); 1146 spin_unlock_irqrestore(&lockres->l_lock, flags);
1126} 1147}
1127 1148
1149static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1150{
1151 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1152 unsigned long flags;
1153
1154 mlog_entry_void();
1155
1156 mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
1157 lockres->l_name, lockres->l_unlock_action);
1158
1159 spin_lock_irqsave(&lockres->l_lock, flags);
1160 if (error) {
1161 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
1162 "unlock_action %d\n", error, lockres->l_name,
1163 lockres->l_unlock_action);
1164 spin_unlock_irqrestore(&lockres->l_lock, flags);
1165 mlog_exit_void();
1166 return;
1167 }
1168
1169 switch(lockres->l_unlock_action) {
1170 case OCFS2_UNLOCK_CANCEL_CONVERT:
1171 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
1172 lockres->l_action = OCFS2_AST_INVALID;
1173 /* Downconvert thread may have requeued this lock, we
1174 * need to wake it. */
1175 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
1176 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
1177 break;
1178 case OCFS2_UNLOCK_DROP_LOCK:
1179 lockres->l_level = DLM_LOCK_IV;
1180 break;
1181 default:
1182 BUG();
1183 }
1184
1185 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1187 wake_up(&lockres->l_event);
1188 spin_unlock_irqrestore(&lockres->l_lock, flags);
1189
1190 mlog_exit_void();
1191}
1192
1193/*
1194 * This is the filesystem locking protocol. It provides the lock handling
1195 * hooks for the underlying DLM. It has a maximum version number.
1196 * The version number allows interoperability with systems running at
1197 * the same major number and an equal or smaller minor number.
1198 *
1199 * Whenever the filesystem does new things with locks (adds or removes a
1200 * lock, orders them differently, does different things underneath a lock),
1201 * the version must be changed. The protocol is negotiated when joining
1202 * the dlm domain. A node may join the domain if its major version is
1203 * identical to all other nodes and its minor version is greater than
1204 * or equal to all other nodes. When its minor version is greater than
1205 * the other nodes, it will run at the minor version specified by the
1206 * other nodes.
1207 *
1208 * If a locking change is made that will not be compatible with older
1209 * versions, the major number must be increased and the minor version set
1210 * to zero. If a change merely adds a behavior that can be disabled when
1211 * speaking to older versions, the minor version must be increased. If a
1212 * change adds a fully backwards compatible change (eg, LVB changes that
1213 * are just ignored by older versions), the version does not need to be
1214 * updated.
1215 */
1216static struct ocfs2_locking_protocol lproto = {
1217 .lp_max_version = {
1218 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
1219 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
1220 },
1221 .lp_lock_ast = ocfs2_locking_ast,
1222 .lp_blocking_ast = ocfs2_blocking_ast,
1223 .lp_unlock_ast = ocfs2_unlock_ast,
1224};
1225
1226void ocfs2_set_locking_protocol(void)
1227{
1228 ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
1229}
1230
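The join rule in the comment above reduces to two comparisons. Here is the same logic as a hedged sketch (struct and function names are illustrative; the field names follow struct ocfs2_protocol_version):

	#include <linux/errno.h>
	#include <linux/types.h>

	struct demo_proto_version {
		u8 pv_major;
		u8 pv_minor;
	};

	/* May 'joiner' enter a domain already running at 'running'? */
	static int demo_may_join(const struct demo_proto_version *running,
				 const struct demo_proto_version *joiner)
	{
		if (joiner->pv_major != running->pv_major)
			return -EINVAL;		/* majors must match exactly */
		if (joiner->pv_minor < running->pv_minor)
			return -EINVAL;		/* joiner is too old */
		return 0;			/* joiner runs at running->pv_minor */
	}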
1128static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 1231static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1129 int convert) 1232 int convert)
1130{ 1233{
@@ -1133,6 +1236,7 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1133 mlog_entry_void(); 1236 mlog_entry_void();
1134 spin_lock_irqsave(&lockres->l_lock, flags); 1237 spin_lock_irqsave(&lockres->l_lock, flags);
1135 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 1238 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1239 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
1136 if (convert) 1240 if (convert)
1137 lockres->l_action = OCFS2_AST_INVALID; 1241 lockres->l_action = OCFS2_AST_INVALID;
1138 else 1242 else
@@ -1179,8 +1283,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
1179 &lockres->l_lksb, 1283 &lockres->l_lksb,
1180 dlm_flags, 1284 dlm_flags,
1181 lockres->l_name, 1285 lockres->l_name,
1182 OCFS2_LOCK_ID_MAX_LEN - 1, 1286 OCFS2_LOCK_ID_MAX_LEN - 1);
1183 lockres);
1184 lockres_clear_pending(lockres, gen, osb); 1287 lockres_clear_pending(lockres, gen, osb);
1185 if (ret) { 1288 if (ret) {
1186 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1289 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1323,13 +1426,13 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
1323again: 1426again:
1324 wait = 0; 1427 wait = 0;
1325 1428
1429 spin_lock_irqsave(&lockres->l_lock, flags);
1430
1326 if (catch_signals && signal_pending(current)) { 1431 if (catch_signals && signal_pending(current)) {
1327 ret = -ERESTARTSYS; 1432 ret = -ERESTARTSYS;
1328 goto out; 1433 goto unlock;
1329 } 1434 }
1330 1435
1331 spin_lock_irqsave(&lockres->l_lock, flags);
1332
1333 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, 1436 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
1334 "Cluster lock called on freeing lockres %s! flags " 1437 "Cluster lock called on freeing lockres %s! flags "
1335 "0x%lx\n", lockres->l_name, lockres->l_flags); 1438 "0x%lx\n", lockres->l_name, lockres->l_flags);
@@ -1346,6 +1449,25 @@ again:
1346 goto unlock; 1449 goto unlock;
1347 } 1450 }
1348 1451
1452 if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
1453 /*
1454 * We've upconverted. If the lock now has a level we can
1455 * work with, we take it. If, however, the lock is not at the
1456 * required level, we go thru the full cycle. One way this could
1457 * happen is if a process requesting an upconvert to PR is
1458 * closely followed by another requesting upconvert to an EX.
1459 * If the process requesting EX lands here, we want it to
1460 * continue attempting to upconvert and let the process
1461 * requesting PR take the lock.
1462 * If multiple processes request upconvert to PR, the first one
1463 * here will take the lock. The others will have to go thru the
1464 * OCFS2_LOCK_BLOCKED check to ensure that there is no pending
1465 * downconvert request.
1466 */
1467 if (level <= lockres->l_level)
1468 goto update_holders;
1469 }
1470
1349 if (lockres->l_flags & OCFS2_LOCK_BLOCKED && 1471 if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
1350 !ocfs2_may_continue_on_blocked_lock(lockres, level)) { 1472 !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
1351 /* is the lock is currently blocked on behalf of 1473 /* is the lock is currently blocked on behalf of
@@ -1383,7 +1505,7 @@ again:
1383 BUG_ON(level == DLM_LOCK_IV); 1505 BUG_ON(level == DLM_LOCK_IV);
1384 BUG_ON(level == DLM_LOCK_NL); 1506 BUG_ON(level == DLM_LOCK_NL);
1385 1507
1386 mlog(0, "lock %s, convert from %d to level = %d\n", 1508 mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
1387 lockres->l_name, lockres->l_level, level); 1509 lockres->l_name, lockres->l_level, level);
1388 1510
1389 /* call dlm_lock to upgrade lock now */ 1511 /* call dlm_lock to upgrade lock now */
@@ -1392,8 +1514,7 @@ again:
1392 &lockres->l_lksb, 1514 &lockres->l_lksb,
1393 lkm_flags, 1515 lkm_flags,
1394 lockres->l_name, 1516 lockres->l_name,
1395 OCFS2_LOCK_ID_MAX_LEN - 1, 1517 OCFS2_LOCK_ID_MAX_LEN - 1);
1396 lockres);
1397 lockres_clear_pending(lockres, gen, osb); 1518 lockres_clear_pending(lockres, gen, osb);
1398 if (ret) { 1519 if (ret) {
1399 if (!(lkm_flags & DLM_LKF_NOQUEUE) || 1520 if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
@@ -1416,11 +1537,14 @@ again:
1416 goto again; 1537 goto again;
1417 } 1538 }
1418 1539
1540update_holders:
1419 /* Ok, if we get here then we're good to go. */ 1541 /* Ok, if we get here then we're good to go. */
1420 ocfs2_inc_holders(lockres, level); 1542 ocfs2_inc_holders(lockres, level);
1421 1543
1422 ret = 0; 1544 ret = 0;
1423unlock: 1545unlock:
1546 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
1547
1424 spin_unlock_irqrestore(&lockres->l_lock, flags); 1548 spin_unlock_irqrestore(&lockres->l_lock, flags);
1425out: 1549out:
1426 /* 1550 /*
@@ -1757,7 +1881,7 @@ out:
1757 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of 1881 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1758 * flock() calls. The locking approach this requires is sufficiently 1882 * flock() calls. The locking approach this requires is sufficiently
1759 * different from all other cluster lock types that we implement a 1883 * different from all other cluster lock types that we implement a
1760 * seperate path to the "low-level" dlm calls. In particular: 1884 * separate path to the "low-level" dlm calls. In particular:
1761 * 1885 *
1762 * - No optimization of lock levels is done - we take at exactly 1886 * - No optimization of lock levels is done - we take at exactly
1763 * what's been requested. 1887 * what's been requested.
@@ -1827,8 +1951,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1827 spin_unlock_irqrestore(&lockres->l_lock, flags); 1951 spin_unlock_irqrestore(&lockres->l_lock, flags);
1828 1952
1829 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, 1953 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
1830 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, 1954 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
1831 lockres);
1832 if (ret) { 1955 if (ret) {
1833 if (!trylock || (ret != -EAGAIN)) { 1956 if (!trylock || (ret != -EAGAIN)) {
1834 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1957 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1855,7 +1978,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1855 * outstanding lock request, so a cancel convert is 1978 * outstanding lock request, so a cancel convert is
1856 * required. We intentionally overwrite 'ret' - if the 1979 * required. We intentionally overwrite 'ret' - if the
1857 * cancel fails and the lock was granted, it's easier 1980 * cancel fails and the lock was granted, it's easier
1858 * to just bubble sucess back up to the user. 1981 * to just bubble success back up to the user.
1859 */ 1982 */
1860 ret = ocfs2_flock_handle_signal(lockres, level); 1983 ret = ocfs2_flock_handle_signal(lockres, level);
1861 } else if (!ret && (level > lockres->l_level)) { 1984 } else if (!ret && (level > lockres->l_level)) {
@@ -2957,7 +3080,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2957 status = ocfs2_cluster_connect(osb->osb_cluster_stack, 3080 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2958 osb->uuid_str, 3081 osb->uuid_str,
2959 strlen(osb->uuid_str), 3082 strlen(osb->uuid_str),
2960 ocfs2_do_node_down, osb, 3083 &lproto, ocfs2_do_node_down, osb,
2961 &conn); 3084 &conn);
2962 if (status) { 3085 if (status) {
2963 mlog_errno(status); 3086 mlog_errno(status);
@@ -3024,50 +3147,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
3024 mlog_exit_void(); 3147 mlog_exit_void();
3025} 3148}
3026 3149
3027static void ocfs2_unlock_ast(void *opaque, int error)
3028{
3029 struct ocfs2_lock_res *lockres = opaque;
3030 unsigned long flags;
3031
3032 mlog_entry_void();
3033
3034 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
3035 lockres->l_unlock_action);
3036
3037 spin_lock_irqsave(&lockres->l_lock, flags);
3038 if (error) {
3039 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
3040 "unlock_action %d\n", error, lockres->l_name,
3041 lockres->l_unlock_action);
3042 spin_unlock_irqrestore(&lockres->l_lock, flags);
3043 mlog_exit_void();
3044 return;
3045 }
3046
3047 switch(lockres->l_unlock_action) {
3048 case OCFS2_UNLOCK_CANCEL_CONVERT:
3049 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
3050 lockres->l_action = OCFS2_AST_INVALID;
3051 /* Downconvert thread may have requeued this lock, we
3052 * need to wake it. */
3053 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
3054 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
3055 break;
3056 case OCFS2_UNLOCK_DROP_LOCK:
3057 lockres->l_level = DLM_LOCK_IV;
3058 break;
3059 default:
3060 BUG();
3061 }
3062
3063 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
3064 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
3065 wake_up(&lockres->l_event);
3066 spin_unlock_irqrestore(&lockres->l_lock, flags);
3067
3068 mlog_exit_void();
3069}
3070
3071static int ocfs2_drop_lock(struct ocfs2_super *osb, 3150static int ocfs2_drop_lock(struct ocfs2_super *osb,
3072 struct ocfs2_lock_res *lockres) 3151 struct ocfs2_lock_res *lockres)
3073{ 3152{
@@ -3135,8 +3214,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
3135 3214
3136 mlog(0, "lock %s\n", lockres->l_name); 3215 mlog(0, "lock %s\n", lockres->l_name);
3137 3216
3138 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags, 3217 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
3139 lockres);
3140 if (ret) { 3218 if (ret) {
3141 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3219 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3142 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 3220 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
@@ -3155,7 +3233,7 @@ out:
3155/* Mark the lockres as being dropped. It will no longer be 3233/* Mark the lockres as being dropped. It will no longer be
3156 * queued if blocking, but we still may have to wait on it 3234 * queued if blocking, but we still may have to wait on it
3157 * being dequeued from the downconvert thread before we can consider 3235 * being dequeued from the downconvert thread before we can consider
3158 * it safe to drop. 3236 * it safe to drop.
3159 * 3237 *
3160 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 3238 * You can *not* attempt to call cluster_lock on this lockres anymore. */
3161void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) 3239void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
@@ -3244,13 +3322,20 @@ static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
3244 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); 3322 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
3245 3323
3246 if (lockres->l_level <= new_level) { 3324 if (lockres->l_level <= new_level) {
3247 mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n", 3325 mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
3248 lockres->l_level, new_level); 3326 "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
3327 "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
3328 new_level, list_empty(&lockres->l_blocked_list),
3329 list_empty(&lockres->l_mask_waiters), lockres->l_type,
3330 lockres->l_flags, lockres->l_ro_holders,
3331 lockres->l_ex_holders, lockres->l_action,
3332 lockres->l_unlock_action, lockres->l_requested,
3333 lockres->l_blocking, lockres->l_pending_gen);
3249 BUG(); 3334 BUG();
3250 } 3335 }
3251 3336
3252 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", 3337 mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
3253 lockres->l_name, new_level, lockres->l_blocking); 3338 lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
3254 3339
3255 lockres->l_action = OCFS2_AST_DOWNCONVERT; 3340 lockres->l_action = OCFS2_AST_DOWNCONVERT;
3256 lockres->l_requested = new_level; 3341 lockres->l_requested = new_level;
@@ -3269,6 +3354,9 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3269 3354
3270 mlog_entry_void(); 3355 mlog_entry_void();
3271 3356
3357 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
3358 lockres->l_level, new_level);
3359
3272 if (lvb) 3360 if (lvb)
3273 dlm_flags |= DLM_LKF_VALBLK; 3361 dlm_flags |= DLM_LKF_VALBLK;
3274 3362
@@ -3277,8 +3365,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3277 &lockres->l_lksb, 3365 &lockres->l_lksb,
3278 dlm_flags, 3366 dlm_flags,
3279 lockres->l_name, 3367 lockres->l_name,
3280 OCFS2_LOCK_ID_MAX_LEN - 1, 3368 OCFS2_LOCK_ID_MAX_LEN - 1);
3281 lockres);
3282 lockres_clear_pending(lockres, generation, osb); 3369 lockres_clear_pending(lockres, generation, osb);
3283 if (ret) { 3370 if (ret) {
3284 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 3371 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -3299,14 +3386,12 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3299 assert_spin_locked(&lockres->l_lock); 3386 assert_spin_locked(&lockres->l_lock);
3300 3387
3301 mlog_entry_void(); 3388 mlog_entry_void();
3302 mlog(0, "lock %s\n", lockres->l_name);
3303 3389
3304 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { 3390 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
3305 /* If we're already trying to cancel a lock conversion 3391 /* If we're already trying to cancel a lock conversion
3306 * then just drop the spinlock and allow the caller to 3392 * then just drop the spinlock and allow the caller to
3307 * requeue this lock. */ 3393 * requeue this lock. */
3308 3394 mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
3309 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
3310 return 0; 3395 return 0;
3311 } 3396 }
3312 3397
@@ -3321,6 +3406,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3321 "lock %s, invalid flags: 0x%lx\n", 3406 "lock %s, invalid flags: 0x%lx\n",
3322 lockres->l_name, lockres->l_flags); 3407 lockres->l_name, lockres->l_flags);
3323 3408
3409 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3410
3324 return 1; 3411 return 1;
3325} 3412}
3326 3413
@@ -3330,16 +3417,15 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
3330 int ret; 3417 int ret;
3331 3418
3332 mlog_entry_void(); 3419 mlog_entry_void();
3333 mlog(0, "lock %s\n", lockres->l_name);
3334 3420
3335 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, 3421 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
3336 DLM_LKF_CANCEL, lockres); 3422 DLM_LKF_CANCEL);
3337 if (ret) { 3423 if (ret) {
3338 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3424 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3339 ocfs2_recover_from_dlm_error(lockres, 0); 3425 ocfs2_recover_from_dlm_error(lockres, 0);
3340 } 3426 }
3341 3427
3342 mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name); 3428 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3343 3429
3344 mlog_exit(ret); 3430 mlog_exit(ret);
3345 return ret; 3431 return ret;
@@ -3352,6 +3438,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
3352 unsigned long flags; 3438 unsigned long flags;
3353 int blocking; 3439 int blocking;
3354 int new_level; 3440 int new_level;
3441 int level;
3355 int ret = 0; 3442 int ret = 0;
3356 int set_lvb = 0; 3443 int set_lvb = 0;
3357 unsigned int gen; 3444 unsigned int gen;
@@ -3360,9 +3447,17 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
3360 3447
3361 spin_lock_irqsave(&lockres->l_lock, flags); 3448 spin_lock_irqsave(&lockres->l_lock, flags);
3362 3449
3363 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
3364
3365recheck: 3450recheck:
3451 /*
3452 * Is it still blocking? If not, we have no more work to do.
3453 */
3454 if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
3455 BUG_ON(lockres->l_blocking != DLM_LOCK_NL);
3456 spin_unlock_irqrestore(&lockres->l_lock, flags);
3457 ret = 0;
3458 goto leave;
3459 }
3460
3366 if (lockres->l_flags & OCFS2_LOCK_BUSY) { 3461 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
3367 /* XXX 3462 /* XXX
3368 * This is a *big* race. The OCFS2_LOCK_PENDING flag 3463 * This is a *big* race. The OCFS2_LOCK_PENDING flag
@@ -3387,8 +3482,11 @@ recheck:
3387 * at the same time they set OCFS2_DLM_BUSY. They must 3482 * at the same time they set OCFS2_DLM_BUSY. They must
3388 * clear OCFS2_DLM_PENDING after dlm_lock() returns. 3483 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
3389 */ 3484 */
3390 if (lockres->l_flags & OCFS2_LOCK_PENDING) 3485 if (lockres->l_flags & OCFS2_LOCK_PENDING) {
3486 mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
3487 lockres->l_name);
3391 goto leave_requeue; 3488 goto leave_requeue;
3489 }
3392 3490
3393 ctl->requeue = 1; 3491 ctl->requeue = 1;
3394 ret = ocfs2_prepare_cancel_convert(osb, lockres); 3492 ret = ocfs2_prepare_cancel_convert(osb, lockres);
@@ -3401,31 +3499,70 @@ recheck:
3401 goto leave; 3499 goto leave;
3402 } 3500 }
3403 3501
3502 /*
3503 * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is
3504 * set when the ast is received for an upconvert just before the
3505 * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
3506 * on the heels of the ast, we want to delay the downconvert just
3507 * enough to allow the up requestor to do its task. Because this
3508 * lock is in the blocked queue, the lock will be downconverted
3509 * as soon as the requestor is done with the lock.
3510 */
3511 if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
3512 goto leave_requeue;
3513
3514 /*
3515 * How can we block and yet be at NL? We were trying to upconvert
3516 * from NL and got canceled. The code comes back here, and now
3517 * we notice and clear BLOCKING.
3518 */
3519 if (lockres->l_level == DLM_LOCK_NL) {
3520 BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
3521 mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
3522 lockres->l_blocking = DLM_LOCK_NL;
3523 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
3524 spin_unlock_irqrestore(&lockres->l_lock, flags);
3525 goto leave;
3526 }
3527
3404 /* if we're blocking an exclusive and we have *any* holders, 3528 /* if we're blocking an exclusive and we have *any* holders,
3405 * then requeue. */ 3529 * then requeue. */
3406 if ((lockres->l_blocking == DLM_LOCK_EX) 3530 if ((lockres->l_blocking == DLM_LOCK_EX)
3407 && (lockres->l_ex_holders || lockres->l_ro_holders)) 3531 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
3532 mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
3533 lockres->l_name, lockres->l_ex_holders,
3534 lockres->l_ro_holders);
3408 goto leave_requeue; 3535 goto leave_requeue;
3536 }
3409 3537
3410 /* If it's a PR we're blocking, then only 3538 /* If it's a PR we're blocking, then only
3411 * requeue if we've got any EX holders */ 3539 * requeue if we've got any EX holders */
3412 if (lockres->l_blocking == DLM_LOCK_PR && 3540 if (lockres->l_blocking == DLM_LOCK_PR &&
3413 lockres->l_ex_holders) 3541 lockres->l_ex_holders) {
3542 mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
3543 lockres->l_name, lockres->l_ex_holders);
3414 goto leave_requeue; 3544 goto leave_requeue;
3545 }
3415 3546
3416 /* 3547 /*
3417 * Can we get a lock in this state if the holder counts are 3548 * Can we get a lock in this state if the holder counts are
3418 * zero? The meta data unblock code used to check this. 3549 * zero? The meta data unblock code used to check this.
3419 */ 3550 */
3420 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 3551 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
3421 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) 3552 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
3553 mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
3554 lockres->l_name);
3422 goto leave_requeue; 3555 goto leave_requeue;
3556 }
3423 3557
3424 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); 3558 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
3425 3559
3426 if (lockres->l_ops->check_downconvert 3560 if (lockres->l_ops->check_downconvert
3427 && !lockres->l_ops->check_downconvert(lockres, new_level)) 3561 && !lockres->l_ops->check_downconvert(lockres, new_level)) {
3562 mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
3563 lockres->l_name);
3428 goto leave_requeue; 3564 goto leave_requeue;
3565 }
3429 3566
3430 /* If we get here, then we know that there are no more 3567 /* If we get here, then we know that there are no more
3431 * incompatible holders (and anyone asking for an incompatible 3568 * incompatible holders (and anyone asking for an incompatible
@@ -3438,17 +3575,24 @@ recheck:
3438 * may sleep, so we save off a copy of what we're blocking as 3575 * may sleep, so we save off a copy of what we're blocking as
3439 * it may change while we're not holding the spin lock. */ 3576 * it may change while we're not holding the spin lock. */
3440 blocking = lockres->l_blocking; 3577 blocking = lockres->l_blocking;
3578 level = lockres->l_level;
3441 spin_unlock_irqrestore(&lockres->l_lock, flags); 3579 spin_unlock_irqrestore(&lockres->l_lock, flags);
3442 3580
3443 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); 3581 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
3444 3582
3445 if (ctl->unblock_action == UNBLOCK_STOP_POST) 3583 if (ctl->unblock_action == UNBLOCK_STOP_POST) {
3584 mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
3585 lockres->l_name);
3446 goto leave; 3586 goto leave;
3587 }
3447 3588
3448 spin_lock_irqsave(&lockres->l_lock, flags); 3589 spin_lock_irqsave(&lockres->l_lock, flags);
3449 if (blocking != lockres->l_blocking) { 3590 if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
3450 /* If this changed underneath us, then we can't drop 3591 /* If this changed underneath us, then we can't drop
3451 * it just yet. */ 3592 * it just yet. */
3593 mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
3594 "Recheck\n", lockres->l_name, blocking,
3595 lockres->l_blocking, level, lockres->l_level);
3452 goto recheck; 3596 goto recheck;
3453 } 3597 }
3454 3598
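Because the downconvert worker may sleep, ocfs2_unblock_lock() uses a snapshot-and-recheck pattern, and this hunk widens the snapshot: both l_blocking and (newly) l_level are recorded under the spinlock, and if either has moved by the time the lock is retaken, control jumps back to recheck: rather than acting on stale state. Condensed control flow, a fragment sketch with do_downconvert_work() standing in for the ->downconvert_worker() callback:

    spin_lock_irqsave(&lockres->l_lock, flags);
    recheck:
            if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
                    /* a cancel already cleared BLOCKED: nothing to do */
                    spin_unlock_irqrestore(&lockres->l_lock, flags);
                    goto leave;
            }
            blocking = lockres->l_blocking;          /* snapshot ... */
            level = lockres->l_level;                /* ... both fields */
            spin_unlock_irqrestore(&lockres->l_lock, flags);

            do_downconvert_work(lockres, blocking);  /* may sleep */

            spin_lock_irqsave(&lockres->l_lock, flags);
            if (blocking != lockres->l_blocking || level != lockres->l_level)
                    goto recheck;                    /* state moved underneath us */

The same rework replaces the old entry BUG_ON(!BLOCKED) with the graceful early exit shown in the hunk, since a cancelled upconvert can legitimately leave the lock unblocked by the time the downconvert thread gets here.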
@@ -3843,45 +3987,6 @@ void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
3843 ocfs2_cluster_unlock(osb, lockres, level); 3987 ocfs2_cluster_unlock(osb, lockres, level);
3844} 3988}
3845 3989
3846/*
3847 * This is the filesystem locking protocol. It provides the lock handling
3848 * hooks for the underlying DLM. It has a maximum version number.
3849 * The version number allows interoperability with systems running at
3850 * the same major number and an equal or smaller minor number.
3851 *
3852 * Whenever the filesystem does new things with locks (adds or removes a
3853 * lock, orders them differently, does different things underneath a lock),
3854 * the version must be changed. The protocol is negotiated when joining
3855 * the dlm domain. A node may join the domain if its major version is
3856 * identical to all other nodes and its minor version is greater than
3857 * or equal to all other nodes. When its minor version is greater than
3858 * the other nodes, it will run at the minor version specified by the
3859 * other nodes.
3860 *
3861 * If a locking change is made that will not be compatible with older
3862 * versions, the major number must be increased and the minor version set
3863 * to zero. If a change merely adds a behavior that can be disabled when
3864 * speaking to older versions, the minor version must be increased. If a
3865 * change adds a fully backwards compatible change (eg, LVB changes that
3866 * are just ignored by older versions), the version does not need to be
3867 * updated.
3868 */
3869static struct ocfs2_locking_protocol lproto = {
3870 .lp_max_version = {
3871 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
3872 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
3873 },
3874 .lp_lock_ast = ocfs2_locking_ast,
3875 .lp_blocking_ast = ocfs2_blocking_ast,
3876 .lp_unlock_ast = ocfs2_unlock_ast,
3877};
3878
3879void ocfs2_set_locking_protocol(void)
3880{
3881 ocfs2_stack_glue_set_locking_protocol(&lproto);
3882}
3883
3884
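The removed block relocates rather than disappears: judging by the matching removals here, the protocol definition and its AST thunks move out to the stack-glue layer together. The join rule its comment describes can be stated as a tiny standalone check (illustrative only, not the kernel's actual negotiation code):

    struct proto_version { unsigned char major, minor; };

    /* A node may join iff majors match and its minor is >= the
     * domain's; it then runs at the domain's (lower) minor. */
    static int may_join(struct proto_version mine, struct proto_version domain)
    {
            if (mine.major != domain.major)
                    return 0;
            return mine.minor >= domain.minor;
    }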
3885static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 3990static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3886 struct ocfs2_lock_res *lockres) 3991 struct ocfs2_lock_res *lockres)
3887{ 3992{
@@ -3898,7 +4003,7 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3898 BUG_ON(!lockres); 4003 BUG_ON(!lockres);
3899 BUG_ON(!lockres->l_ops); 4004 BUG_ON(!lockres->l_ops);
3900 4005
3901 mlog(0, "lockres %s blocked.\n", lockres->l_name); 4006 mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
3902 4007
3903 /* Detect whether a lock has been marked as going away while 4008 /* Detect whether a lock has been marked as going away while
3904 * the downconvert thread was processing other things. A lock can 4009 * the downconvert thread was processing other things. A lock can
@@ -3921,7 +4026,7 @@ unqueue:
3921 } else 4026 } else
3922 ocfs2_schedule_blocked_lock(osb, lockres); 4027 ocfs2_schedule_blocked_lock(osb, lockres);
3923 4028
3924 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, 4029 mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
3925 ctl.requeue ? "yes" : "no"); 4030 ctl.requeue ? "yes" : "no");
3926 spin_unlock_irqrestore(&lockres->l_lock, flags); 4031 spin_unlock_irqrestore(&lockres->l_lock, flags);
3927 4032
@@ -3943,7 +4048,7 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3943 /* Do not schedule a lock for downconvert when it's on 4048 /* Do not schedule a lock for downconvert when it's on
3944 * the way to destruction - any nodes wanting access 4049 * the way to destruction - any nodes wanting access
3945 * to the resource will get it soon. */ 4050 * to the resource will get it soon. */
3946 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", 4051 mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
3947 lockres->l_name, lockres->l_flags); 4052 lockres->l_name, lockres->l_flags);
3948 return; 4053 return;
3949 } 4054 }
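Throughout this file, mlog(0, ...) calls on the blocking-AST paths become mlog(ML_BASTS, ...), so the whole class of messages can be switched on or silenced with one mask bit instead of being compiled-in noise. A conceptual model of such a masked log macro (not the real cluster/masklog.h machinery):

    #define MY_ML_BASTS   (1UL << 14)      /* hypothetical mask bit */
    static unsigned long my_log_mask;      /* toggled from userspace */

    #define my_mlog(mask, fmt, ...)                           \
            do {                                              \
                    if (my_log_mask & (mask))                 \
                            printk(fmt, ##__VA_ARGS__);       \
            } while (0)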
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 15713cbb865c..19ad145d2af3 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -239,7 +239,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
239 mlog(0, "Encoding parent: blkno: %llu, generation: %u\n", 239 mlog(0, "Encoding parent: blkno: %llu, generation: %u\n",
240 (unsigned long long)blkno, generation); 240 (unsigned long long)blkno, generation);
241 } 241 }
242 242
243 *max_len = len; 243 *max_len = len;
244 244
245bail: 245bail:
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 843db64e9d4a..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/slab.h>
27#include <linux/types.h> 28#include <linux/types.h>
28#include <linux/fiemap.h> 29#include <linux/fiemap.h>
29 30
@@ -37,6 +38,7 @@
37#include "extent_map.h" 38#include "extent_map.h"
38#include "inode.h" 39#include "inode.h"
39#include "super.h" 40#include "super.h"
41#include "symlink.h"
40 42
41#include "buffer_head_io.h" 43#include "buffer_head_io.h"
42 44
@@ -191,7 +193,7 @@ static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
191 emi->ei_clusters += ins->ei_clusters; 193 emi->ei_clusters += ins->ei_clusters;
192 return 1; 194 return 1;
193 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && 195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
194 (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys && 196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
195 ins->ei_flags == emi->ei_flags) { 197 ins->ei_flags == emi->ei_flags) {
196 emi->ei_phys = ins->ei_phys; 198 emi->ei_phys = ins->ei_phys;
197 emi->ei_cpos = ins->ei_cpos; 199 emi->ei_cpos = ins->ei_cpos;
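The one-character fix above deserves a spelled-out version: merging ins onto the front of an existing cached extent emi requires adjacency in both the physical range and the logical (cpos) range, but the old code tested the logical end against emi->ei_phys, so unrelated extents could be fused. A standalone sketch of the corrected test, with field names mirroring the ocfs2 ones:

    #include <stdint.h>

    struct ext { uint32_t cpos, phys, clusters, flags; };

    /* May `ins` be glued onto the front of `emi`? */
    static int can_front_merge(const struct ext *emi, const struct ext *ins)
    {
            return ins->phys + ins->clusters == emi->phys   /* physically adjacent */
                && ins->cpos + ins->clusters == emi->cpos   /* logically adjacent (the fix) */
                && ins->flags == emi->flags;                /* identical extent flags */
    }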
@@ -452,7 +454,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
452 if (i == -1) { 454 if (i == -1) {
453 /* 455 /*
454 * Holes can be larger than the maximum size of an 456 * Holes can be larger than the maximum size of an
455 * extent, so we return their lengths in a seperate 457 * extent, so we return their lengths in a separate
456 * field. 458 * field.
457 */ 459 */
458 if (hole_len) { 460 if (hole_len) {
@@ -703,6 +705,12 @@ out:
703 return ret; 705 return ret;
704} 706}
705 707
708/*
 709 * The name ocfs2_fiemap_inline() may be a little misleading: it
 710 * handles fiemap not only for inline-data files but also for fast
 711 * symlinks, because the two are identical as far as extent mapping
 712 * is concerned.
713 */
706static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, 714static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
707 struct fiemap_extent_info *fieinfo, 715 struct fiemap_extent_info *fieinfo,
708 u64 map_start) 716 u64 map_start)
@@ -715,11 +723,18 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
715 struct ocfs2_inode_info *oi = OCFS2_I(inode); 723 struct ocfs2_inode_info *oi = OCFS2_I(inode);
716 724
717 di = (struct ocfs2_dinode *)di_bh->b_data; 725 di = (struct ocfs2_dinode *)di_bh->b_data;
718 id_count = le16_to_cpu(di->id2.i_data.id_count); 726 if (ocfs2_inode_is_fast_symlink(inode))
727 id_count = ocfs2_fast_symlink_chars(inode->i_sb);
728 else
729 id_count = le16_to_cpu(di->id2.i_data.id_count);
719 730
720 if (map_start < id_count) { 731 if (map_start < id_count) {
721 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; 732 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
722 phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); 733 if (ocfs2_inode_is_fast_symlink(inode))
734 phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
735 else
736 phys += offsetof(struct ocfs2_dinode,
737 id2.i_data.id_data);
723 738
724 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, 739 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
725 flags); 740 flags);
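Fast symlinks keep their target string in the dinode itself, just at a different offset than inline file data, which is all the branching above amounts to. Condensed into one unit (a sketch; locking and error handling omitted, variables as in the hunk):

    u64 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;  /* dinode block */
    u32 count;

    if (ocfs2_inode_is_fast_symlink(inode)) {
            count = ocfs2_fast_symlink_chars(inode->i_sb);
            phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
    } else {
            count = le16_to_cpu(di->id2.i_data.id_count);
            phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
    }
    /* report [phys, phys + count) as the single inline extent */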
@@ -756,9 +771,10 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
756 down_read(&OCFS2_I(inode)->ip_alloc_sem); 771 down_read(&OCFS2_I(inode)->ip_alloc_sem);
757 772
758 /* 773 /*
759 * Handle inline-data separately. 774 * Handle inline-data and fast symlink separately.
760 */ 775 */
761 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 776 if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
777 ocfs2_inode_is_fast_symlink(inode)) {
762 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); 778 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
763 goto out_unlock; 779 goto out_unlock;
764 } 780 }
@@ -786,6 +802,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
786 fe_flags = 0; 802 fe_flags = 0;
787 if (rec.e_flags & OCFS2_EXT_UNWRITTEN) 803 if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
788 fe_flags |= FIEMAP_EXTENT_UNWRITTEN; 804 fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
805 if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
806 fe_flags |= FIEMAP_EXTENT_SHARED;
789 if (is_last) 807 if (is_last)
790 fe_flags |= FIEMAP_EXTENT_LAST; 808 fe_flags |= FIEMAP_EXTENT_LAST;
791 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; 809 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
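With OCFS2_EXT_REFCOUNTED now translated to FIEMAP_EXTENT_SHARED, userspace can see which ranges of a reflinked file are still CoW-shared. A small userspace sketch of reading that flag back through the generic FIEMAP ioctl (standard kernel UAPI, nothing ocfs2-specific assumed):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    /* Returns 1 if the first extent covering `start` is CoW-shared. */
    static int first_extent_shared(int fd, __u64 start, __u64 len)
    {
            char buf[sizeof(struct fiemap) + sizeof(struct fiemap_extent)];
            struct fiemap *fm = (struct fiemap *)buf;

            memset(buf, 0, sizeof(buf));
            fm->fm_start = start;
            fm->fm_length = len;
            fm->fm_extent_count = 1;
            if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0 || !fm->fm_mapped_extents)
                    return 0;
            return !!(fm->fm_extents[0].fe_flags & FIEMAP_EXTENT_SHARED);
    }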
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index de059f490586..17947dc8341e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
109 109
110 if (file->f_mode & FMODE_WRITE)
111 dquot_initialize(inode);
112
110 spin_lock(&oi->ip_lock); 113 spin_lock(&oi->ip_lock);
111 114
112 /* Check that the inode hasn't been wiped from disk by another 115 /* Check that the inode hasn't been wiped from disk by another
@@ -629,11 +632,10 @@ restart_all:
629 } 632 }
630 633
631restarted_transaction: 634restarted_transaction:
632 if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 635 status = dquot_alloc_space_nodirty(inode,
633 clusters_to_add))) { 636 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
634 status = -EDQUOT; 637 if (status)
635 goto leave; 638 goto leave;
636 }
637 did_quota = 1; 639 did_quota = 1;
638 640
 639 /* reserve a write to the file entry early on - that way if we 641
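These hunks are part of the tree-wide quota API migration: the old vfs_dq_* wrappers returned a truth value that every caller translated to -EDQUOT by hand, while the dquot_* replacements return an errno directly; the dquot_initialize() calls added in the open/setattr/delete paths attach quota structures before the first quota-affecting operation. The calling-convention change, schematically (bytes stands in for the ocfs2_clusters_to_bytes() expression used above):

    /* before: boolean wrapper, caller invents the errno */
    if (vfs_dq_alloc_space_nodirty(inode, bytes)) {
            status = -EDQUOT;
            goto leave;
    }

    /* after: the API reports its own error (-EDQUOT, ...) */
    status = dquot_alloc_space_nodirty(inode, bytes);
    if (status)
            goto leave;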
@@ -674,7 +676,7 @@ restarted_transaction:
674 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 676 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
675 spin_unlock(&OCFS2_I(inode)->ip_lock); 677 spin_unlock(&OCFS2_I(inode)->ip_lock);
676 /* Release unused quota reservation */ 678 /* Release unused quota reservation */
677 vfs_dq_free_space(inode, 679 dquot_free_space(inode,
678 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 680 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
679 did_quota = 0; 681 did_quota = 0;
680 682
@@ -710,7 +712,7 @@ restarted_transaction:
710 712
711leave: 713leave:
712 if (status < 0 && did_quota) 714 if (status < 0 && did_quota)
713 vfs_dq_free_space(inode, 715 dquot_free_space(inode,
714 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 716 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
715 if (handle) { 717 if (handle) {
716 ocfs2_commit_trans(osb, handle); 718 ocfs2_commit_trans(osb, handle);
@@ -749,7 +751,7 @@ static int ocfs2_write_zero_page(struct inode *inode,
749 int ret; 751 int ret;
750 752
751 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 753 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
752 /* ugh. in prepare/commit_write, if from==to==start of block, we 754 /* ugh. in prepare/commit_write, if from==to==start of block, we
753 ** skip the prepare. make sure we never send an offset for the start 755 ** skip the prepare. make sure we never send an offset for the start
754 ** of a block 756 ** of a block
755 */ 757 */
@@ -978,6 +980,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
978 980
979 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 981 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
980 if (size_change) { 982 if (size_change) {
983 dquot_initialize(inode);
984
981 status = ocfs2_rw_lock(inode, 1); 985 status = ocfs2_rw_lock(inode, 1);
982 if (status < 0) { 986 if (status < 0) {
983 mlog_errno(status); 987 mlog_errno(status);
@@ -993,10 +997,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
993 } 997 }
994 998
995 if (size_change && attr->ia_size != i_size_read(inode)) { 999 if (size_change && attr->ia_size != i_size_read(inode)) {
996 if (attr->ia_size > sb->s_maxbytes) { 1000 status = inode_newsize_ok(inode, attr->ia_size);
997 status = -EFBIG; 1001 if (status)
998 goto bail_unlock; 1002 goto bail_unlock;
999 }
1000 1003
1001 if (i_size_read(inode) > attr->ia_size) { 1004 if (i_size_read(inode) > attr->ia_size) {
1002 if (ocfs2_should_order_data(inode)) { 1005 if (ocfs2_should_order_data(inode)) {
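inode_newsize_ok() subsumes the open-coded s_maxbytes test: for a size-extending change it enforces both the process's RLIMIT_FSIZE and sb->s_maxbytes, returning -EFBIG where appropriate, so the caller only propagates its status:

    status = inode_newsize_ok(inode, attr->ia_size);
    if (status)              /* typically -EFBIG */
            goto bail_unlock;
    /* size acceptable; continue with the truncate or extend */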
@@ -1021,7 +1024,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1021 /* 1024 /*
1022 * Gather pointers to quota structures so that allocation / 1025 * Gather pointers to quota structures so that allocation /
1023 * freeing of quota structures happens here and not inside 1026 * freeing of quota structures happens here and not inside
1024 * vfs_dq_transfer() where we have problems with lock ordering 1027 * dquot_transfer() where we have problems with lock ordering
1025 */ 1028 */
1026 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1029 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1027 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1030 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1054,7 +1057,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1054 mlog_errno(status); 1057 mlog_errno(status);
1055 goto bail_unlock; 1058 goto bail_unlock;
1056 } 1059 }
1057 status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 1060 status = dquot_transfer(inode, attr);
1058 if (status < 0) 1061 if (status < 0)
1059 goto bail_commit; 1062 goto bail_commit;
1060 } else { 1063 } else {
@@ -1772,13 +1775,14 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1772 loff_t *ppos, 1775 loff_t *ppos,
1773 size_t count, 1776 size_t count,
1774 int appending, 1777 int appending,
1775 int *direct_io) 1778 int *direct_io,
1779 int *has_refcount)
1776{ 1780{
1777 int ret = 0, meta_level = 0; 1781 int ret = 0, meta_level = 0;
1778 struct inode *inode = dentry->d_inode; 1782 struct inode *inode = dentry->d_inode;
1779 loff_t saved_pos, end; 1783 loff_t saved_pos, end;
1780 1784
1781 /* 1785 /*
1782 * We start with a read level meta lock and only jump to an ex 1786 * We start with a read level meta lock and only jump to an ex
1783 * if we need to make modifications here. 1787 * if we need to make modifications here.
1784 */ 1788 */
@@ -1833,6 +1837,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1833 saved_pos, 1837 saved_pos,
1834 count, 1838 count,
1835 &meta_level); 1839 &meta_level);
1840 if (has_refcount)
1841 *has_refcount = 1;
1842 if (direct_io)
1843 *direct_io = 0;
1836 } 1844 }
1837 1845
1838 if (ret < 0) { 1846 if (ret < 0) {
@@ -1899,7 +1907,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1899 loff_t pos) 1907 loff_t pos)
1900{ 1908{
1901 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 1909 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1902 int can_do_direct; 1910 int can_do_direct, has_refcount = 0;
1903 ssize_t written = 0; 1911 ssize_t written = 0;
1904 size_t ocount; /* original count */ 1912 size_t ocount; /* original count */
1905 size_t count; /* after file limit checks */ 1913 size_t count; /* after file limit checks */
@@ -1942,7 +1950,7 @@ relock:
1942 can_do_direct = direct_io; 1950 can_do_direct = direct_io;
1943 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 1951 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1944 iocb->ki_left, appending, 1952 iocb->ki_left, appending,
1945 &can_do_direct); 1953 &can_do_direct, &has_refcount);
1946 if (ret < 0) { 1954 if (ret < 0) {
1947 mlog_errno(ret); 1955 mlog_errno(ret);
1948 goto out; 1956 goto out;
@@ -2006,14 +2014,16 @@ out_dio:
2006 /* buffered aio wouldn't have proper lock coverage today */ 2014 /* buffered aio wouldn't have proper lock coverage today */
2007 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2015 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2008 2016
2009 if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { 2017 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2018 ((file->f_flags & O_DIRECT) && has_refcount)) {
2010 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2019 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2011 pos + count - 1); 2020 pos + count - 1);
2012 if (ret < 0) 2021 if (ret < 0)
2013 written = ret; 2022 written = ret;
2014 2023
2015 if (!ret && (old_size != i_size_read(inode) || 2024 if (!ret && (old_size != i_size_read(inode) ||
2016 old_clusters != OCFS2_I(inode)->ip_clusters)) { 2025 old_clusters != OCFS2_I(inode)->ip_clusters ||
2026 has_refcount)) {
2017 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2027 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2018 if (ret < 0) 2028 if (ret < 0)
2019 written = ret; 2029 written = ret;
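The has_refcount plumbing exists because an O_DIRECT write into a refcounted extent triggers copy-on-write, and the new extent mapping lives only in journaled metadata; without a commit, a crash could lose a mapping the application believed stable. Such writes are therefore treated like O_DSYNC: flush the data range and force a jbd2 commit. The decision, restated as a hedged pseudo-C condition (variable names invented):

    bool need_flush  = (o_dsync && !direct_io)        /* classic O_DSYNC     */
                    || IS_SYNC(inode)                 /* sync inode or mount */
                    || (direct_io && has_refcount);   /* CoW under O_DIRECT  */

    bool need_commit = need_flush
                    && (size_changed || clusters_changed || has_refcount);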
@@ -2024,7 +2034,7 @@ out_dio:
2024 pos + count - 1); 2034 pos + count - 1);
2025 } 2035 }
2026 2036
2027 /* 2037 /*
2028 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2038 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2029 * function pointer which is called when o_direct io completes so that 2039 * function pointer which is called when o_direct io completes so that
2030 * it can unlock our rw lock. (it's the clustered equivalent of 2040 * it can unlock our rw lock. (it's the clustered equivalent of
@@ -2034,7 +2044,7 @@ out_dio:
2034 * async dio is going to do it in the future or an end_io after an 2044 * async dio is going to do it in the future or an end_io after an
2035 * error has already done it. 2045 * error has already done it.
2036 */ 2046 */
2037 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2047 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2038 rw_level = -1; 2048 rw_level = -1;
2039 have_alloc_sem = 0; 2049 have_alloc_sem = 0;
2040 } 2050 }
@@ -2062,7 +2072,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2062 int ret; 2072 int ret;
2063 2073
2064 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2074 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
2065 sd->total_len, 0, NULL); 2075 sd->total_len, 0, NULL, NULL);
2066 if (ret < 0) { 2076 if (ret < 0) {
2067 mlog_errno(ret); 2077 mlog_errno(ret);
2068 return ret; 2078 return ret;
@@ -2189,7 +2199,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2189 goto bail; 2199 goto bail;
2190 } 2200 }
2191 2201
2192 /* 2202 /*
2193 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2203 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2194 * need locks to protect pending reads from racing with truncate. 2204 * need locks to protect pending reads from racing with truncate.
2195 */ 2205 */
@@ -2211,10 +2221,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2211 * We're fine letting folks race truncates and extending 2221 * We're fine letting folks race truncates and extending
2212 * writes with read across the cluster, just like they can 2222 * writes with read across the cluster, just like they can
2213 * locally. Hence no rw_lock during read. 2223 * locally. Hence no rw_lock during read.
2214 * 2224 *
2215 * Take and drop the meta data lock to update inode fields 2225 * Take and drop the meta data lock to update inode fields
2216 * like i_size. This allows the checks down below 2226 * like i_size. This allows the checks down below
2217 * generic_file_aio_read() a chance of actually working. 2227 * generic_file_aio_read() a chance of actually working.
2218 */ 2228 */
2219 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2229 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2220 if (ret < 0) { 2230 if (ret < 0) {
@@ -2239,7 +2249,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2239bail: 2249bail:
2240 if (have_alloc_sem) 2250 if (have_alloc_sem)
2241 up_read(&inode->i_alloc_sem); 2251 up_read(&inode->i_alloc_sem);
2242 if (rw_level != -1) 2252 if (rw_level != -1)
2243 ocfs2_rw_unlock(inode, rw_level); 2253 ocfs2_rw_unlock(inode, rw_level);
2244 mlog_exit(ret); 2254 mlog_exit(ret);
2245 2255
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h> 29#include <linux/highmem.h>
31 30
32#define MLOG_MASK_PREFIX ML_SUPER 31#define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0297fb8982b8..07cc8bb68b6d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/quotaops.h> 30#include <linux/quotaops.h>
@@ -475,7 +474,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
475 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { 474 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
476 status = ocfs2_try_open_lock(inode, 0); 475 status = ocfs2_try_open_lock(inode, 0);
477 if (status) { 476 if (status) {
478 make_bad_inode(inode); 477 make_bad_inode(inode);
479 return status; 478 return status;
480 } 479 }
481 } 480 }
@@ -665,7 +664,7 @@ static int ocfs2_remove_inode(struct inode *inode,
665 } 664 }
666 665
667 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 666 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
668 vfs_dq_free_inode(inode); 667 dquot_free_inode(inode);
669 668
670 status = ocfs2_free_dinode(handle, inode_alloc_inode, 669 status = ocfs2_free_dinode(handle, inode_alloc_inode,
671 inode_alloc_bh, di); 670 inode_alloc_bh, di);
@@ -684,7 +683,7 @@ bail:
684 return status; 683 return status;
685} 684}
686 685
687/* 686/*
688 * Serialize with orphan dir recovery. If the process doing 687 * Serialize with orphan dir recovery. If the process doing
689 * recovery on this orphan dir does an iget() with the dir 688 * recovery on this orphan dir does an iget() with the dir
690 * i_mutex held, we'll deadlock here. Instead we detect this 689 * i_mutex held, we'll deadlock here. Instead we detect this
@@ -891,6 +890,21 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
891 /* Do some basic inode verification... */ 890 /* Do some basic inode verification... */
892 di = (struct ocfs2_dinode *) di_bh->b_data; 891 di = (struct ocfs2_dinode *) di_bh->b_data;
893 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { 892 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
893 /*
894 * Inodes in the orphan dir must have ORPHANED_FL. The only
895 * inodes that come back out of the orphan dir are reflink
896 * targets. A reflink target may be moved out of the orphan
897 * dir between the time we scan the directory and the time we
898 * process it. This would lead to HAS_REFCOUNT_FL being set but
899 * ORPHANED_FL not.
900 */
901 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
902 mlog(0, "Reflinked inode %llu is no longer orphaned. "
903 "it shouldn't be deleted\n",
904 (unsigned long long)oi->ip_blkno);
905 goto bail;
906 }
907
894 /* for lack of a better error? */ 908 /* for lack of a better error? */
895 status = -EEXIST; 909 status = -EEXIST;
896 mlog(ML_ERROR, 910 mlog(ML_ERROR,
@@ -971,6 +985,8 @@ void ocfs2_delete_inode(struct inode *inode)
971 goto bail; 985 goto bail;
972 } 986 }
973 987
988 dquot_initialize(inode);
989
974 if (!ocfs2_inode_is_valid_to_delete(inode)) { 990 if (!ocfs2_inode_is_valid_to_delete(inode)) {
975 /* It's probably not necessary to truncate_inode_pages 991 /* It's probably not necessary to truncate_inode_pages
976 * here but we do it for safety anyway (it will most 992 * here but we do it for safety anyway (it will most
@@ -1087,6 +1103,8 @@ void ocfs2_clear_inode(struct inode *inode)
1087 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1103 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1088 "Inode=%lu\n", inode->i_ino); 1104 "Inode=%lu\n", inode->i_ino);
1089 1105
1106 dquot_drop(inode);
1107
 1090 /* To prevent remote deletes we hold open lock before, now it 1108
1091 * is time to unlock PR and EX open locks. */ 1109 * is time to unlock PR and EX open locks. */
1092 ocfs2_open_unlock(inode); 1110 ocfs2_open_unlock(inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 31fbb0619510..7d9d9c132cef 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/compat.h>
10 11
11#define MLOG_MASK_PREFIX ML_INODE 12#define MLOG_MASK_PREFIX ML_INODE
12#include <cluster/masklog.h> 13#include <cluster/masklog.h>
@@ -181,6 +182,10 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
181#ifdef CONFIG_COMPAT 182#ifdef CONFIG_COMPAT
182long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) 183long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
183{ 184{
185 bool preserve;
186 struct reflink_arguments args;
187 struct inode *inode = file->f_path.dentry->d_inode;
188
184 switch (cmd) { 189 switch (cmd) {
185 case OCFS2_IOC32_GETFLAGS: 190 case OCFS2_IOC32_GETFLAGS:
186 cmd = OCFS2_IOC_GETFLAGS; 191 cmd = OCFS2_IOC_GETFLAGS;
@@ -195,8 +200,15 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
195 case OCFS2_IOC_GROUP_EXTEND: 200 case OCFS2_IOC_GROUP_EXTEND:
196 case OCFS2_IOC_GROUP_ADD: 201 case OCFS2_IOC_GROUP_ADD:
197 case OCFS2_IOC_GROUP_ADD64: 202 case OCFS2_IOC_GROUP_ADD64:
198 case OCFS2_IOC_REFLINK:
199 break; 203 break;
204 case OCFS2_IOC_REFLINK:
205 if (copy_from_user(&args, (struct reflink_arguments *)arg,
206 sizeof(args)))
207 return -EFAULT;
208 preserve = (args.preserve != 0);
209
210 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
211 compat_ptr(args.new_path), preserve);
200 default: 212 default:
201 return -ENOIOCTLCMD; 213 return -ENOIOCTLCMD;
202 } 214 }
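OCFS2_IOC_REFLINK can no longer share the pass-through path because struct reflink_arguments embeds user pointers: a 32-bit caller stores 32-bit values that the 64-bit kernel must widen with compat_ptr() before calling the native handler. The pattern, as applied above:

    /* sketch of the compat thunk for pointer-carrying ioctls */
    if (copy_from_user(&args, (struct reflink_arguments *)arg, sizeof(args)))
            return -EFAULT;
    preserve = (args.preserve != 0);

    /* compat_ptr() converts each 32-bit user pointer to a full one */
    return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
                               compat_ptr(args.new_path), preserve);

The remaining commands stay in the fall-through list because their arguments are flat integers with identical 32/64-bit layouts.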
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index cf9a5ee30fef..0cd5323bd3f0 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -7,10 +7,10 @@
7 * 7 *
8 */ 8 */
9 9
10#ifndef OCFS2_IOCTL_H 10#ifndef OCFS2_IOCTL_PROTO_H
11#define OCFS2_IOCTL_H 11#define OCFS2_IOCTL_PROTO_H
12 12
13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); 14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
15 15
16#endif /* OCFS2_IOCTL_H */ 16#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 54c16b66327e..9336c60e3a36 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -659,7 +659,7 @@ static int __ocfs2_journal_access(handle_t *handle,
659 659
660 default: 660 default:
661 status = -EINVAL; 661 status = -EINVAL;
662 mlog(ML_ERROR, "Uknown access type!\n"); 662 mlog(ML_ERROR, "Unknown access type!\n");
663 } 663 }
664 if (!status && ocfs2_meta_ecc(osb) && triggers) 664 if (!status && ocfs2_meta_ecc(osb) && triggers)
665 jbd2_journal_set_triggers(bh, &triggers->ot_triggers); 665 jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
@@ -2034,7 +2034,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
2034 status = -ENOENT; 2034 status = -ENOENT;
2035 mlog_errno(status); 2035 mlog_errno(status);
2036 return status; 2036 return status;
2037 } 2037 }
2038 2038
2039 mutex_lock(&orphan_dir_inode->i_mutex); 2039 mutex_lock(&orphan_dir_inode->i_mutex);
2040 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); 2040 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ac10f83edb95..c983715d8d8c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -476,7 +476,7 @@ out_mutex:
476 476
477out: 477out:
478 if (!status) 478 if (!status)
479 ocfs2_init_inode_steal_slot(osb); 479 ocfs2_init_steal_slots(osb);
480 mlog_exit(status); 480 mlog_exit(status);
481 return status; 481 return status;
482} 482}
@@ -872,8 +872,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
872 (unsigned long long)la_start_blk, 872 (unsigned long long)la_start_blk,
873 (unsigned long long)blkno); 873 (unsigned long long)blkno);
874 874
875 status = ocfs2_free_clusters(handle, main_bm_inode, 875 status = ocfs2_release_clusters(handle,
876 main_bm_bh, blkno, count); 876 main_bm_inode,
877 main_bm_bh, blkno,
878 count);
877 if (status < 0) { 879 if (status < 0) {
878 mlog_errno(status); 880 mlog_errno(status);
879 goto bail; 881 goto bail;
@@ -984,8 +986,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
984 } 986 }
985 987
986retry_enospc: 988retry_enospc:
987 (*ac)->ac_bits_wanted = osb->local_alloc_bits; 989 (*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
988
989 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 990 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
990 if (status == -ENOSPC) { 991 if (status == -ENOSPC) {
991 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == 992 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1061,6 +1062,7 @@ retry_enospc:
1061 OCFS2_LA_DISABLED) 1062 OCFS2_LA_DISABLED)
1062 goto bail; 1063 goto bail;
1063 1064
1065 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1064 status = ocfs2_claim_clusters(osb, handle, ac, 1066 status = ocfs2_claim_clusters(osb, handle, ac,
1065 osb->local_alloc_bits, 1067 osb->local_alloc_bits,
1066 &cluster_off, 1068 &cluster_off,
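The localalloc.c changes target the ENOSPC retry path: ocfs2_recalc_la_window() may shrink the local alloc window, but the allocation context still advertised the stale ac_bits_wanted, so each attempt now resets it to the default window size before reserving or claiming again. Condensed (a sketch; cluster_count is assumed, since the hunk is cut off before that argument):

    retry_enospc:
            ac->ac_bits_wanted = osb->local_alloc_default_bits;  /* refresh per attempt */
            status = ocfs2_claim_clusters(osb, handle, ac,
                                          osb->local_alloc_bits,
                                          &cluster_off, &cluster_count);
            if (status == -ENOSPC &&
                ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) !=
                OCFS2_LA_DISABLED)
                    goto retry_enospc;      /* window shrunk; try again */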
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
133 133
134 if (!(fl->fl_flags & FL_POSIX)) 134 if (!(fl->fl_flags & FL_POSIX))
135 return -ENOLCK; 135 return -ENOLCK;
136 if (__mandatory_lock(inode)) 136 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
137 return -ENOLCK; 137 return -ENOLCK;
138 138
139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); 139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
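The one-line locks.c fix closes an unlock trap: with mandatory locking in effect on an inode (its mode can change after a lock was acquired), ocfs2_lock() rejected every request with -ENOLCK, including F_UNLCK, leaving a holder unable to release its own lock. Unlock requests must always pass through:

    if (!(fl->fl_flags & FL_POSIX))
            return -ENOLCK;
    /* mandatory locking is unsupported, but never refuse an unlock */
    if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
            return -ENOLCK;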
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..7898bd3a99f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/uio.h> 30#include <linux/uio.h>
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f010b22b1c44..b1eb50ae4097 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 84static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 85 handle_t *handle,
86 struct inode *inode, 86 struct inode *inode,
87 struct ocfs2_dinode *fe, 87 struct buffer_head *fe_bh,
88 char *name, 88 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 89 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 90 struct inode *orphan_dir_inode);
@@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
212 } else 212 } else
213 inode->i_gid = current_fsgid(); 213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode; 214 inode->i_mode = mode;
215 vfs_dq_init(inode); 215 dquot_initialize(inode);
216 return inode; 216 return inode;
217} 217}
218 218
@@ -244,6 +244,8 @@ static int ocfs2_mknod(struct inode *dir,
244 (unsigned long)dev, dentry->d_name.len, 244 (unsigned long)dev, dentry->d_name.len,
245 dentry->d_name.name); 245 dentry->d_name.name);
246 246
247 dquot_initialize(dir);
248
247 /* get our super block */ 249 /* get our super block */
248 osb = OCFS2_SB(dir->i_sb); 250 osb = OCFS2_SB(dir->i_sb);
249 251
@@ -348,13 +350,9 @@ static int ocfs2_mknod(struct inode *dir,
348 goto leave; 350 goto leave;
349 } 351 }
350 352
351 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 353 status = dquot_alloc_inode(inode);
352 * to be called. */ 354 if (status)
353 if (sb_any_quota_active(osb->sb) &&
354 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
355 status = -EDQUOT;
356 goto leave; 355 goto leave;
357 }
358 did_quota_inode = 1; 356 did_quota_inode = 1;
359 357
360 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, 358 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
@@ -431,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir,
431 status = 0; 429 status = 0;
432leave: 430leave:
433 if (status < 0 && did_quota_inode) 431 if (status < 0 && did_quota_inode)
434 vfs_dq_free_inode(inode); 432 dquot_free_inode(inode);
435 if (handle) 433 if (handle)
436 ocfs2_commit_trans(osb, handle); 434 ocfs2_commit_trans(osb, handle);
437 435
@@ -636,6 +634,8 @@ static int ocfs2_link(struct dentry *old_dentry,
636 if (S_ISDIR(inode->i_mode)) 634 if (S_ISDIR(inode->i_mode))
637 return -EPERM; 635 return -EPERM;
638 636
637 dquot_initialize(dir);
638
639 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); 639 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
640 if (err < 0) { 640 if (err < 0) {
641 if (err != -ENOENT) 641 if (err != -ENOENT)
@@ -791,6 +791,8 @@ static int ocfs2_unlink(struct inode *dir,
791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
792 dentry->d_name.len, dentry->d_name.name); 792 dentry->d_name.len, dentry->d_name.name);
793 793
794 dquot_initialize(dir);
795
794 BUG_ON(dentry->d_parent->d_inode != dir); 796 BUG_ON(dentry->d_parent->d_inode != dir);
795 797
796 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); 798 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -877,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir,
877 fe = (struct ocfs2_dinode *) fe_bh->b_data; 879 fe = (struct ocfs2_dinode *) fe_bh->b_data;
878 880
879 if (inode_is_unlinkable(inode)) { 881 if (inode_is_unlinkable(inode)) {
880 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 882 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
881 &orphan_insert, orphan_dir); 883 &orphan_insert, orphan_dir);
882 if (status < 0) { 884 if (status < 0) {
883 mlog_errno(status); 885 mlog_errno(status);
@@ -1051,6 +1053,9 @@ static int ocfs2_rename(struct inode *old_dir,
1051 old_dentry->d_name.len, old_dentry->d_name.name, 1053 old_dentry->d_name.len, old_dentry->d_name.name,
1052 new_dentry->d_name.len, new_dentry->d_name.name); 1054 new_dentry->d_name.len, new_dentry->d_name.name);
1053 1055
1056 dquot_initialize(old_dir);
1057 dquot_initialize(new_dir);
1058
1054 osb = OCFS2_SB(old_dir->i_sb); 1059 osb = OCFS2_SB(old_dir->i_sb);
1055 1060
1056 if (new_inode) { 1061 if (new_inode) {
@@ -1295,7 +1300,7 @@ static int ocfs2_rename(struct inode *old_dir,
1295 if (S_ISDIR(new_inode->i_mode) || 1300 if (S_ISDIR(new_inode->i_mode) ||
1296 (ocfs2_read_links_count(newfe) == 1)) { 1301 (ocfs2_read_links_count(newfe) == 1)) {
1297 status = ocfs2_orphan_add(osb, handle, new_inode, 1302 status = ocfs2_orphan_add(osb, handle, new_inode,
1298 newfe, orphan_name, 1303 newfe_bh, orphan_name,
1299 &orphan_insert, orphan_dir); 1304 &orphan_insert, orphan_dir);
1300 if (status < 0) { 1305 if (status < 0) {
1301 mlog_errno(status); 1306 mlog_errno(status);
@@ -1599,6 +1604,8 @@ static int ocfs2_symlink(struct inode *dir,
1599 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1604 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1600 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1605 dentry, symname, dentry->d_name.len, dentry->d_name.name);
1601 1606
1607 dquot_initialize(dir);
1608
1602 sb = dir->i_sb; 1609 sb = dir->i_sb;
1603 osb = OCFS2_SB(sb); 1610 osb = OCFS2_SB(sb);
1604 1611
@@ -1688,13 +1695,9 @@ static int ocfs2_symlink(struct inode *dir,
1688 goto bail; 1695 goto bail;
1689 } 1696 }
1690 1697
1691 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 1698 status = dquot_alloc_inode(inode);
1692 * to be called. */ 1699 if (status)
1693 if (sb_any_quota_active(osb->sb) &&
1694 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
1695 status = -EDQUOT;
1696 goto bail; 1700 goto bail;
1697 }
1698 did_quota_inode = 1; 1701 did_quota_inode = 1;
1699 1702
1700 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, 1703 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
@@ -1716,11 +1719,10 @@ static int ocfs2_symlink(struct inode *dir,
1716 u32 offset = 0; 1719 u32 offset = 0;
1717 1720
1718 inode->i_op = &ocfs2_symlink_inode_operations; 1721 inode->i_op = &ocfs2_symlink_inode_operations;
1719 if (vfs_dq_alloc_space_nodirty(inode, 1722 status = dquot_alloc_space_nodirty(inode,
1720 ocfs2_clusters_to_bytes(osb->sb, 1))) { 1723 ocfs2_clusters_to_bytes(osb->sb, 1));
1721 status = -EDQUOT; 1724 if (status)
1722 goto bail; 1725 goto bail;
1723 }
1724 did_quota = 1; 1726 did_quota = 1;
1725 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, 1727 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1726 new_fe_bh, 1728 new_fe_bh,
@@ -1788,10 +1790,10 @@ static int ocfs2_symlink(struct inode *dir,
1788 d_instantiate(dentry, inode); 1790 d_instantiate(dentry, inode);
1789bail: 1791bail:
1790 if (status < 0 && did_quota) 1792 if (status < 0 && did_quota)
1791 vfs_dq_free_space_nodirty(inode, 1793 dquot_free_space_nodirty(inode,
1792 ocfs2_clusters_to_bytes(osb->sb, 1)); 1794 ocfs2_clusters_to_bytes(osb->sb, 1));
1793 if (status < 0 && did_quota_inode) 1795 if (status < 0 && did_quota_inode)
1794 vfs_dq_free_inode(inode); 1796 dquot_free_inode(inode);
1795 if (handle) 1797 if (handle)
1796 ocfs2_commit_trans(osb, handle); 1798 ocfs2_commit_trans(osb, handle);
1797 1799
@@ -1909,7 +1911,7 @@ leave:
1909static int ocfs2_orphan_add(struct ocfs2_super *osb, 1911static int ocfs2_orphan_add(struct ocfs2_super *osb,
1910 handle_t *handle, 1912 handle_t *handle,
1911 struct inode *inode, 1913 struct inode *inode,
1912 struct ocfs2_dinode *fe, 1914 struct buffer_head *fe_bh,
1913 char *name, 1915 char *name,
1914 struct ocfs2_dir_lookup_result *lookup, 1916 struct ocfs2_dir_lookup_result *lookup,
1915 struct inode *orphan_dir_inode) 1917 struct inode *orphan_dir_inode)
@@ -1917,6 +1919,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1917 struct buffer_head *orphan_dir_bh = NULL; 1919 struct buffer_head *orphan_dir_bh = NULL;
1918 int status = 0; 1920 int status = 0;
1919 struct ocfs2_dinode *orphan_fe; 1921 struct ocfs2_dinode *orphan_fe;
1922 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1920 1923
1921 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1924 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1922 1925
@@ -1957,6 +1960,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1957 goto leave; 1960 goto leave;
1958 } 1961 }
1959 1962
1963 /*
1964 * We're going to journal the change of i_flags and i_orphaned_slot.
 1965 * It's safe even though some callers may duplicate the journaling;
 1966 * journaling within this function simply makes the logic more
 1967 * straightforward.
1968 */
1969 status = ocfs2_journal_access_di(handle,
1970 INODE_CACHE(inode),
1971 fe_bh,
1972 OCFS2_JOURNAL_ACCESS_WRITE);
1973 if (status < 0) {
1974 mlog_errno(status);
1975 goto leave;
1976 }
1977
1960 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 1978 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
1961 1979
1962 /* Record which orphan dir our inode now resides 1980 /* Record which orphan dir our inode now resides
@@ -1964,6 +1982,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1964 * dir to lock. */ 1982 * dir to lock. */
1965 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 1983 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
1966 1984
1985 ocfs2_journal_dirty(handle, fe_bh);
1986
1967 mlog(0, "Inode %llu orphaned in slot %d\n", 1987 mlog(0, "Inode %llu orphaned in slot %d\n",
1968 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1988 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
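ocfs2_orphan_add() used to modify fe->i_flags and fe->i_orphaned_slot without declaring the buffer to the journal, which is the real reason its signature changes from a bare dinode to the buffer_head. What the new code applies is the standard jbd2 three-step discipline:

    status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
                                     OCFS2_JOURNAL_ACCESS_WRITE);
    if (status < 0)
            goto leave;        /* buffer not claimed: do not touch it */

    le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);      /* modify in memory */
    fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);

    ocfs2_journal_dirty(handle, fe_bh);                 /* tie it to the handle */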
1969 1989
@@ -2099,15 +2119,12 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2099 goto leave; 2119 goto leave;
2100 } 2120 }
2101 2121
2102 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 2122 status = dquot_alloc_inode(inode);
2103 * to be called. */ 2123 if (status)
2104 if (sb_any_quota_active(osb->sb) &&
2105 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
2106 status = -EDQUOT;
2107 goto leave; 2124 goto leave;
2108 }
2109 did_quota_inode = 1; 2125 did_quota_inode = 1;
2110 2126
2127 inode->i_nlink = 0;
2111 /* do the real work now. */ 2128 /* do the real work now. */
2112 status = ocfs2_mknod_locked(osb, dir, inode, 2129 status = ocfs2_mknod_locked(osb, dir, inode,
2113 0, &new_di_bh, parent_di_bh, handle, 2130 0, &new_di_bh, parent_di_bh, handle,
@@ -2124,7 +2141,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2124 } 2141 }
2125 2142
2126 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2143 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2127 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, 2144 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2128 &orphan_insert, orphan_dir); 2145 &orphan_insert, orphan_dir);
2129 if (status < 0) { 2146 if (status < 0) {
2130 mlog_errno(status); 2147 mlog_errno(status);
@@ -2136,9 +2153,10 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2136 if (status < 0) 2153 if (status < 0)
2137 mlog_errno(status); 2154 mlog_errno(status);
2138 2155
2156 insert_inode_hash(inode);
2139leave: 2157leave:
2140 if (status < 0 && did_quota_inode) 2158 if (status < 0 && did_quota_inode)
2141 vfs_dq_free_inode(inode); 2159 dquot_free_inode(inode);
2142 if (handle) 2160 if (handle)
2143 ocfs2_commit_trans(osb, handle); 2161 ocfs2_commit_trans(osb, handle);
2144 2162
@@ -2267,6 +2285,8 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2267 di = (struct ocfs2_dinode *)di_bh->b_data; 2285 di = (struct ocfs2_dinode *)di_bh->b_data;
2268 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); 2286 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
2269 di->i_orphaned_slot = 0; 2287 di->i_orphaned_slot = 0;
2288 inode->i_nlink = 1;
2289 ocfs2_set_links_count(di, inode->i_nlink);
2270 ocfs2_journal_dirty(handle, di_bh); 2290 ocfs2_journal_dirty(handle, di_bh);
2271 2291
2272 status = ocfs2_add_entry(handle, dentry, inode, 2292 status = ocfs2_add_entry(handle, dentry, inode,
@@ -2284,7 +2304,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2284 goto out_commit; 2304 goto out_commit;
2285 } 2305 }
2286 2306
2287 insert_inode_hash(inode);
2288 dentry->d_op = &ocfs2_dentry_ops; 2307 dentry->d_op = &ocfs2_dentry_ops;
2289 d_instantiate(dentry, inode); 2308 d_instantiate(dentry, inode);
2290 status = 0; 2309 status = 0;
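Taken together, the namei.c nlink edits give an orphaned reflink target an honest link count for its whole lifecycle: i_nlink is 0 while the inode lives only in the orphan dir, and becomes 1 (mirrored to disk via ocfs2_set_links_count()) only once it gains a real directory entry; insert_inode_hash() moves earlier for the same reason, so the inode is reachable as soon as it exists. Schematically (a sketch, details elided):

    /* creation into the orphan dir */
    inode->i_nlink = 0;                    /* no real directory entry yet */
    /* ... allocate the dinode, add the orphan dir entry ... */
    insert_inode_hash(inode);              /* now visible to iget/igrab */

    /* later: moving it out to a real name */
    inode->i_nlink = 1;
    ocfs2_set_links_count(di, inode->i_nlink);   /* keep the dinode in sync */
    ocfs2_journal_dirty(handle, di_bh);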
@@ -2326,4 +2345,5 @@ const struct inode_operations ocfs2_dir_iops = {
2326 .getxattr = generic_getxattr, 2345 .getxattr = generic_getxattr,
2327 .listxattr = ocfs2_listxattr, 2346 .listxattr = ocfs2_listxattr,
2328 .removexattr = generic_removexattr, 2347 .removexattr = generic_removexattr,
2348 .fiemap = ocfs2_fiemap,
2329}; 2349};
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d963d8638709..adf5e2ebc2c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -42,6 +42,7 @@
42 42
43#include "ocfs2_fs.h" 43#include "ocfs2_fs.h"
44#include "ocfs2_lockid.h" 44#include "ocfs2_lockid.h"
45#include "ocfs2_ioctl.h"
45 46
46/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
47#include "blockcheck.h" 48#include "blockcheck.h"
@@ -136,6 +137,10 @@ enum ocfs2_unlock_action {
136#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a 137#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a
137 call to dlm_lock. Only 138 call to dlm_lock. Only
138 exists with BUSY set. */ 139 exists with BUSY set. */
140#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread
141 * from downconverting
142 * before the upconvert
143 * has completed */
139 144
140struct ocfs2_lock_res_ops; 145struct ocfs2_lock_res_ops;
141 146
@@ -155,7 +160,7 @@ struct ocfs2_lock_res {
155 int l_level; 160 int l_level;
156 unsigned int l_ro_holders; 161 unsigned int l_ro_holders;
157 unsigned int l_ex_holders; 162 unsigned int l_ex_holders;
158 union ocfs2_dlm_lksb l_lksb; 163 struct ocfs2_dlm_lksb l_lksb;
159 164
160 /* used from AST/BAST funcs. */ 165 /* used from AST/BAST funcs. */
161 enum ocfs2_ast_action l_action; 166 enum ocfs2_ast_action l_action;
@@ -245,9 +250,11 @@ enum ocfs2_mount_options
245 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ 250 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
246 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ 251 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
247 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ 252 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
248 OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */ 253 OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */
249 OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */ 254 OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access
250 OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ 255 control lists */
256 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
257 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
251}; 258};
252 259
253#define OCFS2_OSB_SOFT_RO 0x0001 260#define OCFS2_OSB_SOFT_RO 0x0001
@@ -299,7 +306,9 @@ struct ocfs2_super
299 u32 s_next_generation; 306 u32 s_next_generation;
300 unsigned long osb_flags; 307 unsigned long osb_flags;
301 s16 s_inode_steal_slot; 308 s16 s_inode_steal_slot;
309 s16 s_meta_steal_slot;
302 atomic_t s_num_inodes_stolen; 310 atomic_t s_num_inodes_stolen;
311 atomic_t s_num_meta_stolen;
303 312
304 unsigned long s_mount_opt; 313 unsigned long s_mount_opt;
305 unsigned int s_atime_quantum; 314 unsigned int s_atime_quantum;
@@ -754,35 +763,18 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
754 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
755} 764}
756 765
757static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) 766static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
758{ 767{
759 spin_lock(&osb->osb_lock); 768 ext2_set_bit(bit, bitmap);
760 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
761 spin_unlock(&osb->osb_lock);
762 atomic_set(&osb->s_num_inodes_stolen, 0);
763} 769}
770#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
764 771
765static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb, 772static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
766 s16 slot)
767{ 773{
768 spin_lock(&osb->osb_lock); 774 ext2_clear_bit(bit, bitmap);
769 osb->s_inode_steal_slot = slot;
770 spin_unlock(&osb->osb_lock);
771}
772
773static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
774{
775 s16 slot;
776
777 spin_lock(&osb->osb_lock);
778 slot = osb->s_inode_steal_slot;
779 spin_unlock(&osb->osb_lock);
780
781 return slot;
782} 775}
776#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
783 777
784#define ocfs2_set_bit ext2_set_bit
785#define ocfs2_clear_bit ext2_clear_bit
786#define ocfs2_test_bit ext2_test_bit 778#define ocfs2_test_bit ext2_test_bit
787#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 779#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
788#define ocfs2_find_next_bit ext2_find_next_bit 780#define ocfs2_find_next_bit ext2_find_next_bit
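
The _ocfs2_set_bit()/_ocfs2_clear_bit() wrappers above exist so the unavoidable (unsigned long *) cast happens in exactly one place: ocfs2 bitmaps live in on-disk structures typed as byte arrays, while ext2_set_bit()/ext2_clear_bit() take word pointers. A hedged sketch of a call site (the quota chunk fields are real, the helper itself is hypothetical):

    /* dqc_bitmap is a __u8 array in struct ocfs2_local_disk_chunk; the
     * ocfs2_set_bit() macro's cast adapts it to ext2_set_bit() for us. */
    static void example_mark_entry_used(struct ocfs2_local_disk_chunk *dchunk,
                                        unsigned int entry)
    {
            ocfs2_set_bit(entry, dchunk->dqc_bitmap);
            le32_add_cpu(&dchunk->dqc_free, -1);   /* one fewer free slot */
    }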
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e9431e4a5e7c..bb37218a7978 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -254,63 +254,6 @@
254 * refcount tree */ 254 * refcount tree */
255 255
256/* 256/*
257 * ioctl commands
258 */
259#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
260#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
261#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
262#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
263
264/*
265 * Space reservation / allocation / free ioctls and argument structure
266 * are designed to be compatible with XFS.
267 *
268 * ALLOCSP* and FREESP* are not and will never be supported, but are
269 * included here for completeness.
270 */
271struct ocfs2_space_resv {
272 __s16 l_type;
273 __s16 l_whence;
274 __s64 l_start;
275 __s64 l_len; /* len == 0 means until end of file */
276 __s32 l_sysid;
277 __u32 l_pid;
278 __s32 l_pad[4]; /* reserve area */
279};
280
281#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
282#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
283#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
284#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
285#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
286#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
287#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
288#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
289
290/* Used to pass group descriptor data when online resize is done */
291struct ocfs2_new_group_input {
292 __u64 group; /* Group descriptor's blkno. */
293 __u32 clusters; /* Total number of clusters in this group */
294 __u32 frees; /* Total free clusters in this group */
295 __u16 chain; /* Chain for this group */
296 __u16 reserved1;
297 __u32 reserved2;
298};
299
300#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
301#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
302#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
303
304/* Used to pass 2 file names to reflink. */
305struct reflink_arguments {
306 __u64 old_path;
307 __u64 new_path;
308 __u64 preserve;
309};
310#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
311
312
313/*
314 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 257 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
315 */ 258 */
316#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 259#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
@@ -1202,7 +1145,7 @@ struct ocfs2_local_disk_dqinfo {
1202/* Header of one chunk of a quota file */ 1145/* Header of one chunk of a quota file */
1203struct ocfs2_local_disk_chunk { 1146struct ocfs2_local_disk_chunk {
1204 __le32 dqc_free; /* Number of free entries in the bitmap */ 1147 __le32 dqc_free; /* Number of free entries in the bitmap */
1205 u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding 1148 __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
1206 * chunk of quota file */ 1149 * chunk of quota file */
1207}; 1150};
1208 1151
@@ -1417,9 +1360,16 @@ static inline int ocfs2_fast_symlink_chars(int blocksize)
1417 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); 1360 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
1418} 1361}
1419 1362
1420static inline int ocfs2_max_inline_data(int blocksize) 1363static inline int ocfs2_max_inline_data_with_xattr(int blocksize,
1364 struct ocfs2_dinode *di)
1421{ 1365{
1422 return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data); 1366 if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL))
1367 return blocksize -
1368 offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
1369 di->i_xattr_inline_size;
1370 else
1371 return blocksize -
1372 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
1423} 1373}
1424 1374
1425static inline int ocfs2_extent_recs_per_inode(int blocksize) 1375static inline int ocfs2_extent_recs_per_inode(int blocksize)
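
To see what ocfs2_max_inline_data_with_xattr() computes, here is a standalone arithmetic sketch. The layout and sizes are invented for illustration (the real offsets come from struct ocfs2_dinode); only the subtraction pattern matches the helper above:

    #include <stddef.h>
    #include <stdio.h>

    /* Toy stand-in for struct ocfs2_dinode; 192 bytes of fixed fields
     * before the inline-data area is an assumption, not the real value. */
    struct toy_dinode {
            char header[192];
            char id_data[];      /* inline file data starts here */
    };

    int main(void)
    {
            int blocksize = 4096;
            int xattr_inline_size = 256;  /* assumed OCFS2_INLINE_XATTR_FL case */
            int plain = blocksize - (int)offsetof(struct toy_dinode, id_data);
            int with_xattr = plain - xattr_inline_size;

            /* prints 3904 and 3648 with the toy numbers above */
            printf("inline data: %d bytes (%d with inline xattrs)\n",
                   plain, with_xattr);
            return 0;
    }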
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 000000000000..2d3420af1a83
--- /dev/null
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,79 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_ioctl.h
5 *
6 * Defines OCFS2 ioctls.
7 *
8 * Copyright (C) 2010 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_IOCTL_H
21#define OCFS2_IOCTL_H
22
23/*
24 * ioctl commands
25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
30
31/*
32 * Space reservation / allocation / free ioctls and argument structure
33 * are designed to be compatible with XFS.
34 *
35 * ALLOCSP* and FREESP* are not and will never be supported, but are
36 * included here for completeness.
37 */
38struct ocfs2_space_resv {
39 __s16 l_type;
40 __s16 l_whence;
41 __s64 l_start;
42 __s64 l_len; /* len == 0 means until end of file */
43 __s32 l_sysid;
44 __u32 l_pid;
45 __s32 l_pad[4]; /* reserve area */
46};
47
48#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
49#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
50#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
51#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
52#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
53#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
54#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
55#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
56
57/* Used to pass group descriptor data when online resize is done */
58struct ocfs2_new_group_input {
59 __u64 group; /* Group descriptor's blkno. */
60 __u32 clusters; /* Total number of clusters in this group */
61 __u32 frees; /* Total free clusters in this group */
62 __u16 chain; /* Chain for this group */
63 __u16 reserved1;
64 __u32 reserved2;
65};
66
67#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
68#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
69#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
70
71/* Used to pass 2 file names to reflink. */
72struct reflink_arguments {
73 __u64 old_path;
74 __u64 new_path;
75 __u64 preserve;
76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78
79#endif /* OCFS2_IOCTL_H */
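
Since these definitions are now exported in their own header, a userspace program can drive them directly. A minimal, hedged sketch (error handling trimmed; the RESVSP semantics follow the XFS-compatible layout described above):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "ocfs2_ioctl.h"

    int main(int argc, char **argv)
    {
            long flags;
            struct ocfs2_space_resv sr;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDWR);
            if (fd < 0)
                    return 1;

            if (ioctl(fd, OCFS2_IOC_GETFLAGS, &flags) == 0)
                    printf("inode flags: %#lx\n", flags);

            /* reserve 1MB at offset 0 without changing i_size */
            memset(&sr, 0, sizeof(sr));
            sr.l_whence = 0;              /* offsets relative to file start */
            sr.l_start = 0;
            sr.l_len = 1024 * 1024;
            if (ioctl(fd, OCFS2_IOC_RESVSP64, &sr) != 0)
                    perror("OCFS2_IOC_RESVSP64");

            close(fd);
            return 0;
    }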
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 82d5eeac0fff..2e45c8d2ea7e 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -23,6 +23,8 @@
23/* 23/*
24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for 24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for
25 * more details. 25 * more details.
26 *
27 * 1.0 - Initial locking version from ocfs2 1.4.
26 */ 28 */
27#define OCFS2_LOCKING_PROTOCOL_MAJOR 1 29#define OCFS2_LOCKING_PROTOCOL_MAJOR 1
28#define OCFS2_LOCKING_PROTOCOL_MINOR 0 30#define OCFS2_LOCKING_PROTOCOL_MINOR 0
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index e5df9d170b0c..123bc520a2c0 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -17,10 +17,6 @@
17 17
18#include "ocfs2.h" 18#include "ocfs2.h"
19 19
20/* Common stuff */
21/* id number of quota format */
22#define QFMT_OCFS2 3
23
24/* 20/*
25 * In-memory structures 21 * In-memory structures
26 */ 22 */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b437dc0c4cad..ab42a74c7539 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h> 9#include <linux/dqblk_qtree.h>
@@ -851,13 +852,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
851} 852}
852 853
853const struct dquot_operations ocfs2_quota_operations = { 854const struct dquot_operations ocfs2_quota_operations = {
854 .initialize = dquot_initialize,
855 .drop = dquot_drop,
856 .alloc_space = dquot_alloc_space,
857 .alloc_inode = dquot_alloc_inode,
858 .free_space = dquot_free_space,
859 .free_inode = dquot_free_inode,
860 .transfer = dquot_transfer,
861 .write_dquot = ocfs2_write_dquot, 855 .write_dquot = ocfs2_write_dquot,
862 .acquire_dquot = ocfs2_acquire_dquot, 856 .acquire_dquot = ocfs2_acquire_dquot,
863 .release_dquot = ocfs2_release_dquot, 857 .release_dquot = ocfs2_release_dquot,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 1a2c50a759fa..9ad49305f450 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
3 */ 3 */
4 4
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/module.h> 9#include <linux/module.h>
@@ -457,7 +458,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
457 break; 458 break;
458 } 459 }
459 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; 460 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
460 for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { 461 for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
461 qbh = NULL; 462 qbh = NULL;
462 status = ocfs2_read_quota_block(lqinode, 463 status = ocfs2_read_quota_block(lqinode,
463 ol_dqblk_block(sb, chunk, bit), 464 ol_dqblk_block(sb, chunk, bit),
@@ -1325,7 +1326,7 @@ out:
1325 return status; 1326 return status;
1326} 1327}
1327 1328
1328static struct quota_format_ops ocfs2_format_ops = { 1329static const struct quota_format_ops ocfs2_format_ops = {
1329 .check_quota_file = ocfs2_local_check_quota_file, 1330 .check_quota_file = ocfs2_local_check_quota_file,
1330 .read_file_info = ocfs2_local_read_info, 1331 .read_file_info = ocfs2_local_read_info,
1331 .write_file_info = ocfs2_global_write_info, 1332 .write_file_info = ocfs2_global_write_info,
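
The for_each_bit to for_each_set_bit change above is a tree-wide rename; the arguments and semantics are identical. A kernel-context sketch (not a standalone program; handle_entry() is hypothetical):

    unsigned int bit;
    DECLARE_BITMAP(map, 128);

    bitmap_zero(map, 128);
    set_bit(0, map);
    set_bit(2, map);

    /* visits bit indexes 0 and 2, exactly as for_each_bit() used to */
    for_each_set_bit(bit, map, 128)
            handle_entry(bit);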
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3a0df7a1b810..bd96f6c7877e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h> 40#include <linux/slab.h>
42#include <linux/writeback.h> 41#include <linux/writeback.h>
43#include <linux/pagevec.h> 42#include <linux/pagevec.h>
@@ -276,7 +275,7 @@ static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
276 spin_unlock(&osb->osb_lock); 275 spin_unlock(&osb->osb_lock);
277} 276}
278 277
279void ocfs2_kref_remove_refcount_tree(struct kref *kref) 278static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
280{ 279{
281 struct ocfs2_refcount_tree *tree = 280 struct ocfs2_refcount_tree *tree =
282 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); 281 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
@@ -524,23 +523,6 @@ out:
524 return ret; 523 return ret;
525} 524}
526 525
527int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
528 struct ocfs2_refcount_tree **ret_tree,
529 struct buffer_head **ref_bh)
530{
531 int ret;
532 u64 ref_blkno;
533
534 ret = ocfs2_get_refcount_block(inode, &ref_blkno);
535 if (ret) {
536 mlog_errno(ret);
537 return ret;
538 }
539
540 return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
541 rw, ret_tree, ref_bh);
542}
543
544void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, 526void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
545 struct ocfs2_refcount_tree *tree, int rw) 527 struct ocfs2_refcount_tree *tree, int rw)
546{ 528{
@@ -643,7 +625,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
643 rb = (struct ocfs2_refcount_block *)new_bh->b_data; 625 rb = (struct ocfs2_refcount_block *)new_bh->b_data;
644 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
645 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
646 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
647 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 629 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
648 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 630 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
649 rb->rf_blkno = cpu_to_le64(first_blkno); 631 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -969,6 +951,103 @@ out:
969} 951}
970 952
971/* 953/*
954 * Find the end range for a leaf refcount block indicated by
955 * el->l_recs[index].e_blkno.
956 */
957static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
958 struct buffer_head *ref_root_bh,
959 struct ocfs2_extent_block *eb,
960 struct ocfs2_extent_list *el,
961 int index, u32 *cpos_end)
962{
963 int ret, i, subtree_root;
964 u32 cpos;
965 u64 blkno;
966 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
967 struct ocfs2_path *left_path = NULL, *right_path = NULL;
968 struct ocfs2_extent_tree et;
969 struct ocfs2_extent_list *tmp_el;
970
971 if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
972 /*
973 * We have an extent rec after index, so just use the e_cpos
974 * of the next extent rec.
975 */
976 *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
977 return 0;
978 }
979
980 if (!eb || (eb && !eb->h_next_leaf_blk)) {
981 /*
982 * We are the last extent rec, so any high cpos should
983 * be stored in this leaf refcount block.
984 */
985 *cpos_end = UINT_MAX;
986 return 0;
987 }
988
989 /*
990 * If the extent block isn't the last one, we have to find
991 * the subtree root between this extent block and the next
992 * leaf extent block and get the corresponding e_cpos from
993 * the subtree root. Otherwise we may corrupt the b-tree.
994 */
995 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
996
997 left_path = ocfs2_new_path_from_et(&et);
998 if (!left_path) {
999 ret = -ENOMEM;
1000 mlog_errno(ret);
1001 goto out;
1002 }
1003
1004 cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
1005 ret = ocfs2_find_path(ci, left_path, cpos);
1006 if (ret) {
1007 mlog_errno(ret);
1008 goto out;
1009 }
1010
1011 right_path = ocfs2_new_path_from_path(left_path);
1012 if (!right_path) {
1013 ret = -ENOMEM;
1014 mlog_errno(ret);
1015 goto out;
1016 }
1017
1018 ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
1019 if (ret) {
1020 mlog_errno(ret);
1021 goto out;
1022 }
1023
1024 ret = ocfs2_find_path(ci, right_path, cpos);
1025 if (ret) {
1026 mlog_errno(ret);
1027 goto out;
1028 }
1029
1030 subtree_root = ocfs2_find_subtree_root(&et, left_path,
1031 right_path);
1032
1033 tmp_el = left_path->p_node[subtree_root].el;
1034 blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
1035 for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
1036 if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
1037 *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
1038 break;
1039 }
1040 }
1041
1042 BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
1043
1044out:
1045 ocfs2_free_path(left_path);
1046 ocfs2_free_path(right_path);
1047 return ret;
1048}
1049
1050/*
972 * Given a cpos and len, try to find the refcount record which contains cpos. 1051 * Given a cpos and len, try to find the refcount record which contains cpos.
973 * 1. If cpos can be found in one refcount record, return the record. 1052 * 1. If cpos can be found in one refcount record, return the record.
974 * 2. If cpos can't be found, return a fake record which start from cpos 1053 * 2. If cpos can't be found, return a fake record which start from cpos
@@ -983,10 +1062,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
983 struct buffer_head **ret_bh) 1062 struct buffer_head **ret_bh)
984{ 1063{
985 int ret = 0, i, found; 1064 int ret = 0, i, found;
986 u32 low_cpos; 1065 u32 low_cpos, uninitialized_var(cpos_end);
987 struct ocfs2_extent_list *el; 1066 struct ocfs2_extent_list *el;
988 struct ocfs2_extent_rec *tmp, *rec = NULL; 1067 struct ocfs2_extent_rec *rec = NULL;
989 struct ocfs2_extent_block *eb; 1068 struct ocfs2_extent_block *eb = NULL;
990 struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; 1069 struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
991 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1070 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
992 struct ocfs2_refcount_block *rb = 1071 struct ocfs2_refcount_block *rb =
@@ -1034,12 +1113,16 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
1034 } 1113 }
1035 } 1114 }
1036 1115
1037 /* adjust len when we have ocfs2_extent_rec after it. */ 1116 if (found) {
1038 if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { 1117 ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
1039 tmp = &el->l_recs[i+1]; 1118 eb, el, i, &cpos_end);
1119 if (ret) {
1120 mlog_errno(ret);
1121 goto out;
1122 }
1040 1123
1041 if (le32_to_cpu(tmp->e_cpos) < cpos + len) 1124 if (cpos_end < low_cpos + len)
1042 len = le32_to_cpu(tmp->e_cpos) - cpos; 1125 len = cpos_end - low_cpos;
1043 } 1126 }
1044 1127
1045 ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), 1128 ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
@@ -1246,7 +1329,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1246 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); 1329 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1247 1330
1248 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1331 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1249 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1332 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1250 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1333 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1251 new_rb->rf_blkno = cpu_to_le64(blkno); 1334 new_rb->rf_blkno = cpu_to_le64(blkno);
1252 new_rb->rf_cpos = cpu_to_le32(0); 1335 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1418,7 +1501,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1418 1501
1419 /* change old and new rl_used accordingly. */ 1502 /* change old and new rl_used accordingly. */
1420 le16_add_cpu(&rl->rl_used, -num_moved); 1503 le16_add_cpu(&rl->rl_used, -num_moved);
1421 new_rl->rl_used = cpu_to_le32(num_moved); 1504 new_rl->rl_used = cpu_to_le16(num_moved);
1422 1505
1423 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), 1506 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1424 sizeof(struct ocfs2_refcount_rec), 1507 sizeof(struct ocfs2_refcount_rec),
@@ -1492,7 +1575,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1492 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1575 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1493 memset(new_rb, 0, sb->s_blocksize); 1576 memset(new_rb, 0, sb->s_blocksize);
1494 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1577 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1495 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1578 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1496 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1579 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1497 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1580 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1498 new_rb->rf_blkno = cpu_to_le64(blkno); 1581 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1797,7 +1880,8 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1797 recs_need++; 1880 recs_need++;
1798 1881
1799 /* If the leaf block doesn't have enough records, expand it. */ 1882
1800 if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) { 1883 if (le16_to_cpu(rf_list->rl_used) + recs_need >
1884 le16_to_cpu(rf_list->rl_count)) {
1801 struct ocfs2_refcount_rec tmp_rec; 1885 struct ocfs2_refcount_rec tmp_rec;
1802 u64 cpos = le64_to_cpu(orig_rec->r_cpos); 1886 u64 cpos = le64_to_cpu(orig_rec->r_cpos);
1803 len = le32_to_cpu(orig_rec->r_clusters); 1887 len = le32_to_cpu(orig_rec->r_clusters);
@@ -1859,7 +1943,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1859 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); 1943 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
1860 le64_add_cpu(&tail_rec->r_cpos, 1944 le64_add_cpu(&tail_rec->r_cpos,
1861 le32_to_cpu(tail_rec->r_clusters) - len); 1945 le32_to_cpu(tail_rec->r_clusters) - len);
1862 tail_rec->r_clusters = le32_to_cpu(len); 1946 tail_rec->r_clusters = cpu_to_le32(len);
1863 } 1947 }
1864 1948
1865 /* 1949 /*
@@ -2431,7 +2515,7 @@ out:
2431 * we are going to touch and whether we need to create new blocks. 2515
2432 * 2516
2433 * Normally the refcount blocks storing these refcounts should be 2517
2434 * continguous also, so that we can get the number easily. 2518 * contiguous also, so that we can get the number easily.
2435 * As for meta_ac, we will at most split 2 refcount records and add 2519
2436 * 2 more refcount blocks, so just check it in a rough way. 2520
2437 * 2521 *
@@ -2860,7 +2944,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2860 2944
2861 while (offset < end) { 2945 while (offset < end) {
2862 page_index = offset >> PAGE_CACHE_SHIFT; 2946 page_index = offset >> PAGE_CACHE_SHIFT;
2863 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 2947 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
2864 if (map_end > end) 2948 if (map_end > end)
2865 map_end = end; 2949 map_end = end;
2866 2950
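
The (loff_t) cast above matters on 32-bit kernels, where page-index arithmetic wraps once the byte offset passes 4GB. A standalone demonstration of the truncation (4K pages assumed):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t page_index = 0x100000;  /* page 1M => byte offset 4GB */
            unsigned int shift = 12;         /* assuming PAGE_CACHE_SHIFT == 12 */

            /* 32-bit arithmetic wraps: yields 0x1000 instead of 4GB + 4K */
            uint64_t bad = (uint32_t)((page_index + 1) << shift);
            /* widening before the shift preserves the high bits */
            uint64_t good = ((uint64_t)page_index + 1) << shift;

            printf("truncated: %#llx  correct: %#llx\n",
                   (unsigned long long)bad, (unsigned long long)good);
            return 0;
    }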
@@ -2872,8 +2956,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2872 2956
2873 page = grab_cache_page(mapping, page_index); 2957 page = grab_cache_page(mapping, page_index);
2874 2958
2875 /* This page can't be dirtied before we CoW it out. */ 2959 /*
2876 BUG_ON(PageDirty(page)); 2960 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
2961 * can't be dirtied before we CoW it out.
2962 */
2963 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2964 BUG_ON(PageDirty(page));
2877 2965
2878 if (!PageUptodate(page)) { 2966 if (!PageUptodate(page)) {
2879 ret = block_read_full_page(page, ocfs2_get_block); 2967 ret = block_read_full_page(page, ocfs2_get_block);
@@ -3085,7 +3173,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3085 3173
3086 while (offset < end) { 3174 while (offset < end) {
3087 page_index = offset >> PAGE_CACHE_SHIFT; 3175 page_index = offset >> PAGE_CACHE_SHIFT;
3088 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 3176 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
3089 if (map_end > end) 3177 if (map_end > end)
3090 map_end = end; 3178 map_end = end;
3091 3179
@@ -3840,8 +3928,7 @@ static int ocfs2_add_refcounted_extent(struct inode *inode,
3840 } 3928 }
3841 3929
3842 ret = ocfs2_insert_extent(handle, et, cpos, 3930 ret = ocfs2_insert_extent(handle, et, cpos,
3843 cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, 3931 ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
3844 p_cluster)),
3845 num_clusters, ext_flags, meta_ac); 3932 num_clusters, ext_flags, meta_ac);
3846 if (ret) { 3933 if (ret) {
3847 mlog_errno(ret); 3934 mlog_errno(ret);
@@ -3987,6 +4074,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
3987 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 4074 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
3988 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 4075 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
3989 i_size_write(t_inode, size); 4076 i_size_write(t_inode, size);
4077 t_inode->i_blocks = s_inode->i_blocks;
3990 4078
3991 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
3992 di->i_clusters = s_di->i_clusters; 4080 di->i_clusters = s_di->i_clusters;
@@ -4253,8 +4341,8 @@ static int ocfs2_user_path_parent(const char __user *path,
4253 * @new_dentry: target dentry 4341 * @new_dentry: target dentry
4254 * @preserve: if true, preserve all file attributes 4342 * @preserve: if true, preserve all file attributes
4255 */ 4343 */
4256int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, 4344static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4257 struct dentry *new_dentry, bool preserve) 4345 struct dentry *new_dentry, bool preserve)
4258{ 4346{
4259 struct inode *inode = old_dentry->d_inode; 4347 struct inode *inode = old_dentry->d_inode;
4260 int error; 4348 int error;
@@ -4302,7 +4390,7 @@ int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4302 } 4390 }
4303 4391
4304 mutex_lock(&inode->i_mutex); 4392 mutex_lock(&inode->i_mutex);
4305 vfs_dq_init(dir); 4393 dquot_initialize(dir);
4306 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); 4394 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4307 mutex_unlock(&inode->i_mutex); 4395 mutex_unlock(&inode->i_mutex);
4308 if (!error) 4396 if (!error)
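
Two of the hunks above (rl_used and tail_rec->r_clusters) fix the same class of endianness bug: a converter of the wrong width, or the wrong direction, on a little-endian disk field. A userspace analogue of the rule, using glibc's <endian.h> (the field names are stand-ins for the __le16/__le32 disk fields):

    #include <endian.h>
    #include <stdint.h>

    static uint16_t rl_used_disk;      /* stand-in for a __le16 field */
    static uint32_t r_clusters_disk;   /* stand-in for a __le32 field */

    static void example_store(uint16_t num_moved, uint32_t len)
    {
            /* width must match the field: htole16 for 16-bit storage */
            rl_used_disk = htole16(num_moved);
            /* direction must match the assignment: storing to disk is
             * host-to-little-endian, not little-endian-to-host */
            r_clusters_disk = htole32(len);
    }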
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index e49c41050264..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/slab.h>
22#include <linux/module.h> 23#include <linux/module.h>
23 24
24/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ 25/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
@@ -161,24 +162,23 @@ static int dlm_status_to_errno(enum dlm_status status)
161 162
162static void o2dlm_lock_ast_wrapper(void *astarg) 163static void o2dlm_lock_ast_wrapper(void *astarg)
163{ 164{
164 BUG_ON(o2cb_stack.sp_proto == NULL); 165 struct ocfs2_dlm_lksb *lksb = astarg;
165 166
166 o2cb_stack.sp_proto->lp_lock_ast(astarg); 167 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
167} 168}
168 169
169static void o2dlm_blocking_ast_wrapper(void *astarg, int level) 170static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
170{ 171{
171 BUG_ON(o2cb_stack.sp_proto == NULL); 172 struct ocfs2_dlm_lksb *lksb = astarg;
172 173
173 o2cb_stack.sp_proto->lp_blocking_ast(astarg, level); 174 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
174} 175}
175 176
176static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) 177static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
177{ 178{
179 struct ocfs2_dlm_lksb *lksb = astarg;
178 int error = dlm_status_to_errno(status); 180 int error = dlm_status_to_errno(status);
179 181
180 BUG_ON(o2cb_stack.sp_proto == NULL);
181
182 /* 182 /*
183 * In o2dlm, you can get both the lock_ast() for the lock being 183 * In o2dlm, you can get both the lock_ast() for the lock being
184 * granted and the unlock_ast() for the CANCEL failing. A 184 * granted and the unlock_ast() for the CANCEL failing. A
@@ -193,16 +193,15 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
193 if (status == DLM_CANCELGRANT) 193 if (status == DLM_CANCELGRANT)
194 return; 194 return;
195 195
196 o2cb_stack.sp_proto->lp_unlock_ast(astarg, error); 196 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
197} 197}
198 198
199static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, 199static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
200 int mode, 200 int mode,
201 union ocfs2_dlm_lksb *lksb, 201 struct ocfs2_dlm_lksb *lksb,
202 u32 flags, 202 u32 flags,
203 void *name, 203 void *name,
204 unsigned int namelen, 204 unsigned int namelen)
205 void *astarg)
206{ 205{
207 enum dlm_status status; 206 enum dlm_status status;
208 int o2dlm_mode = mode_to_o2dlm(mode); 207 int o2dlm_mode = mode_to_o2dlm(mode);
@@ -211,28 +210,27 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
211 210
212 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, 211 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
213 o2dlm_flags, name, namelen, 212 o2dlm_flags, name, namelen,
214 o2dlm_lock_ast_wrapper, astarg, 213 o2dlm_lock_ast_wrapper, lksb,
215 o2dlm_blocking_ast_wrapper); 214 o2dlm_blocking_ast_wrapper);
216 ret = dlm_status_to_errno(status); 215 ret = dlm_status_to_errno(status);
217 return ret; 216 return ret;
218} 217}
219 218
220static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, 219static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
221 union ocfs2_dlm_lksb *lksb, 220 struct ocfs2_dlm_lksb *lksb,
222 u32 flags, 221 u32 flags)
223 void *astarg)
224{ 222{
225 enum dlm_status status; 223 enum dlm_status status;
226 int o2dlm_flags = flags_to_o2dlm(flags); 224 int o2dlm_flags = flags_to_o2dlm(flags);
227 int ret; 225 int ret;
228 226
229 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, 227 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
230 o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg); 228 o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
231 ret = dlm_status_to_errno(status); 229 ret = dlm_status_to_errno(status);
232 return ret; 230 return ret;
233} 231}
234 232
235static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 233static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
236{ 234{
237 return dlm_status_to_errno(lksb->lksb_o2dlm.status); 235 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
238} 236}
@@ -242,17 +240,17 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
242 * contents, it will zero out the LVB. Thus the caller can always trust 240 * contents, it will zero out the LVB. Thus the caller can always trust
243 * the contents. 241 * the contents.
244 */ 242 */
245static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 243static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
246{ 244{
247 return 1; 245 return 1;
248} 246}
249 247
250static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) 248static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
251{ 249{
252 return (void *)(lksb->lksb_o2dlm.lvb); 250 return (void *)(lksb->lksb_o2dlm.lvb);
253} 251}
254 252
255static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb) 253static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256{ 254{
257 dlm_print_one_lock(lksb->lksb_o2dlm.lockid); 255 dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
258} 256}
@@ -277,10 +275,10 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
277 u32 dlm_key; 275 u32 dlm_key;
278 struct dlm_ctxt *dlm; 276 struct dlm_ctxt *dlm;
279 struct o2dlm_private *priv; 277 struct o2dlm_private *priv;
280 struct dlm_protocol_version dlm_version; 278 struct dlm_protocol_version fs_version;
281 279
282 BUG_ON(conn == NULL); 280 BUG_ON(conn == NULL);
283 BUG_ON(o2cb_stack.sp_proto == NULL); 281 BUG_ON(conn->cc_proto == NULL);
284 282
285 /* for now we only have one cluster/node, make sure we see it 283 /* for now we only have one cluster/node, make sure we see it
286 * in the heartbeat universe */ 284 * in the heartbeat universe */
@@ -304,18 +302,18 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
304 /* used by the dlm code to make message headers unique, each 302 /* used by the dlm code to make message headers unique, each
305 * node in this domain must agree on this. */ 303 * node in this domain must agree on this. */
306 dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); 304 dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
307 dlm_version.pv_major = conn->cc_version.pv_major; 305 fs_version.pv_major = conn->cc_version.pv_major;
308 dlm_version.pv_minor = conn->cc_version.pv_minor; 306 fs_version.pv_minor = conn->cc_version.pv_minor;
309 307
310 dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version); 308 dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version);
311 if (IS_ERR(dlm)) { 309 if (IS_ERR(dlm)) {
312 rc = PTR_ERR(dlm); 310 rc = PTR_ERR(dlm);
313 mlog_errno(rc); 311 mlog_errno(rc);
314 goto out_free; 312 goto out_free;
315 } 313 }
316 314
317 conn->cc_version.pv_major = dlm_version.pv_major; 315 conn->cc_version.pv_major = fs_version.pv_major;
318 conn->cc_version.pv_minor = dlm_version.pv_minor; 316 conn->cc_version.pv_minor = fs_version.pv_minor;
319 conn->cc_lockspace = dlm; 317 conn->cc_lockspace = dlm;
320 318
321 dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); 319 dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index ff4c798a5635..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,11 +21,11 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
25#include <linux/reboot.h> 26#include <linux/reboot.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27 28
28#include "ocfs2.h" /* For struct ocfs2_lock_res */
29#include "stackglue.h" 29#include "stackglue.h"
30 30
31#include <linux/dlm_plock.h> 31#include <linux/dlm_plock.h>
@@ -63,8 +63,8 @@
63 * negotiated by the client. The client negotiates based on the maximum 63 * negotiated by the client. The client negotiates based on the maximum
64 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major 64 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
65 * number from the "SETV" message must match 65 * number from the "SETV" message must match
66 * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number 66 * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
67 * must be less than or equal to ...->lp_max_version.pv_minor. 67 * must be less than or equal to ...sp_max_proto.pv_minor.
68 * 68 *
69 * Once this information has been set, mounts will be allowed. From this 69 * Once this information has been set, mounts will be allowed. From this
70 * point on, the "DOWN" message can be sent for node down notification. 70 * point on, the "DOWN" message can be sent for node down notification.
@@ -401,7 +401,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
401 char *ptr = NULL; 401 char *ptr = NULL;
402 struct ocfs2_control_private *p = file->private_data; 402 struct ocfs2_control_private *p = file->private_data;
403 struct ocfs2_protocol_version *max = 403 struct ocfs2_protocol_version *max =
404 &ocfs2_user_plugin.sp_proto->lp_max_version; 404 &ocfs2_user_plugin.sp_max_proto;
405 405
406 if (ocfs2_control_get_handshake_state(file) != 406 if (ocfs2_control_get_handshake_state(file) !=
407 OCFS2_CONTROL_HANDSHAKE_PROTOCOL) 407 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -664,18 +664,10 @@ static void ocfs2_control_exit(void)
664 -rc); 664 -rc);
665} 665}
666 666
667static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
668{
669 struct ocfs2_lock_res *res = astarg;
670 return &res->l_lksb.lksb_fsdlm;
671}
672
673static void fsdlm_lock_ast_wrapper(void *astarg) 667static void fsdlm_lock_ast_wrapper(void *astarg)
674{ 668{
675 struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); 669 struct ocfs2_dlm_lksb *lksb = astarg;
676 int status = lksb->sb_status; 670 int status = lksb->lksb_fsdlm.sb_status;
677
678 BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
679 671
680 /* 672 /*
681 * For now we're punting on the issue of other non-standard errors 673 * For now we're punting on the issue of other non-standard errors
@@ -688,25 +680,24 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
688 */ 680 */
689 681
690 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) 682 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
691 ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0); 683 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
692 else 684 else
693 ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg); 685 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
694} 686}
695 687
696static void fsdlm_blocking_ast_wrapper(void *astarg, int level) 688static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
697{ 689{
698 BUG_ON(ocfs2_user_plugin.sp_proto == NULL); 690 struct ocfs2_dlm_lksb *lksb = astarg;
699 691
700 ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level); 692 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
701} 693}
702 694
703static int user_dlm_lock(struct ocfs2_cluster_connection *conn, 695static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
704 int mode, 696 int mode,
705 union ocfs2_dlm_lksb *lksb, 697 struct ocfs2_dlm_lksb *lksb,
706 u32 flags, 698 u32 flags,
707 void *name, 699 void *name,
708 unsigned int namelen, 700 unsigned int namelen)
709 void *astarg)
710{ 701{
711 int ret; 702 int ret;
712 703
@@ -716,36 +707,35 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
716 707
717 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, 708 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
718 flags|DLM_LKF_NODLCKWT, name, namelen, 0, 709 flags|DLM_LKF_NODLCKWT, name, namelen, 0,
719 fsdlm_lock_ast_wrapper, astarg, 710 fsdlm_lock_ast_wrapper, lksb,
720 fsdlm_blocking_ast_wrapper); 711 fsdlm_blocking_ast_wrapper);
721 return ret; 712 return ret;
722} 713}
723 714
724static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, 715static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
725 union ocfs2_dlm_lksb *lksb, 716 struct ocfs2_dlm_lksb *lksb,
726 u32 flags, 717 u32 flags)
727 void *astarg)
728{ 718{
729 int ret; 719 int ret;
730 720
731 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, 721 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
732 flags, &lksb->lksb_fsdlm, astarg); 722 flags, &lksb->lksb_fsdlm, lksb);
733 return ret; 723 return ret;
734} 724}
735 725
736static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 726static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
737{ 727{
738 return lksb->lksb_fsdlm.sb_status; 728 return lksb->lksb_fsdlm.sb_status;
739} 729}
740 730
741static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 731static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
742{ 732{
743 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID; 733 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
744 734
745 return !invalid; 735 return !invalid;
746} 736}
747 737
748static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) 738static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
749{ 739{
750 if (!lksb->lksb_fsdlm.sb_lvbptr) 740 if (!lksb->lksb_fsdlm.sb_lvbptr)
751 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + 741 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
@@ -753,7 +743,7 @@ static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
753 return (void *)(lksb->lksb_fsdlm.sb_lvbptr); 743 return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
754} 744}
755 745
756static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 746static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
757{ 747{
758} 748}
759 749
@@ -814,7 +804,7 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
814static int user_cluster_connect(struct ocfs2_cluster_connection *conn) 804static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
815{ 805{
816 dlm_lockspace_t *fsdlm; 806 dlm_lockspace_t *fsdlm;
817 struct ocfs2_live_connection *control; 807 struct ocfs2_live_connection *uninitialized_var(control);
818 int rc = 0; 808 int rc = 0;
819 809
820 BUG_ON(conn == NULL); 810 BUG_ON(conn == NULL);
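
The SETV handshake described above enforces the usual major/minor rule: the major number must match exactly, and the requested minor may not exceed the advertised maximum. A sketch of that check (the function name is hypothetical; the struct is the real ocfs2_protocol_version):

    static int example_proto_compatible(struct ocfs2_protocol_version *req,
                                        struct ocfs2_protocol_version *max)
    {
            /* major must match exactly; minor may be lower or equal */
            return req->pv_major == max->pv_major &&
                   req->pv_minor <= max->pv_minor;
    }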
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 3f2f1c45b7b6..39abf89697ed 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -36,7 +36,7 @@
36#define OCFS2_STACK_PLUGIN_USER "user" 36#define OCFS2_STACK_PLUGIN_USER "user"
37#define OCFS2_MAX_HB_CTL_PATH 256 37#define OCFS2_MAX_HB_CTL_PATH 256
38 38
39static struct ocfs2_locking_protocol *lproto; 39static struct ocfs2_protocol_version locking_max_version;
40static DEFINE_SPINLOCK(ocfs2_stack_lock); 40static DEFINE_SPINLOCK(ocfs2_stack_lock);
41static LIST_HEAD(ocfs2_stack_list); 41static LIST_HEAD(ocfs2_stack_list);
42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; 42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
@@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
176 spin_lock(&ocfs2_stack_lock); 176 spin_lock(&ocfs2_stack_lock);
177 if (!ocfs2_stack_lookup(plugin->sp_name)) { 177 if (!ocfs2_stack_lookup(plugin->sp_name)) {
178 plugin->sp_count = 0; 178 plugin->sp_count = 0;
179 plugin->sp_proto = lproto; 179 plugin->sp_max_proto = locking_max_version;
180 list_add(&plugin->sp_list, &ocfs2_stack_list); 180 list_add(&plugin->sp_list, &ocfs2_stack_list);
181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", 181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
182 plugin->sp_name); 182 plugin->sp_name);
@@ -213,77 +213,76 @@ void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
213} 213}
214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); 214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
215 215
216void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) 216void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
217{ 217{
218 struct ocfs2_stack_plugin *p; 218 struct ocfs2_stack_plugin *p;
219 219
220 BUG_ON(proto == NULL);
221
222 spin_lock(&ocfs2_stack_lock); 220 spin_lock(&ocfs2_stack_lock);
223 BUG_ON(active_stack != NULL); 221 if (memcmp(max_proto, &locking_max_version,
222 sizeof(struct ocfs2_protocol_version))) {
223 BUG_ON(locking_max_version.pv_major != 0);
224 224
225 lproto = proto; 225 locking_max_version = *max_proto;
226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) { 226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
227 p->sp_proto = lproto; 227 p->sp_max_proto = locking_max_version;
228 }
228 } 229 }
229
230 spin_unlock(&ocfs2_stack_lock); 230 spin_unlock(&ocfs2_stack_lock);
231} 231}
232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); 232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
233 233
234 234
235/* 235/*
236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take 236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
237 * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the 237 * for the ast and bast functions. They will pass the lksb to the ast
238 * underlying stack plugins need to pilfer the lksb off of the lock_res. 238 * and bast. The caller can wrap the lksb with their own structure to
239 * If some other structure needs to be passed as an astarg, the plugins 239 * get more information.
240 * will need to be given a different avenue to the lksb.
241 */ 240 */
242int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 241int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
243 int mode, 242 int mode,
244 union ocfs2_dlm_lksb *lksb, 243 struct ocfs2_dlm_lksb *lksb,
245 u32 flags, 244 u32 flags,
246 void *name, 245 void *name,
247 unsigned int namelen, 246 unsigned int namelen)
248 struct ocfs2_lock_res *astarg)
249{ 247{
250 BUG_ON(lproto == NULL); 248 if (!lksb->lksb_conn)
251 249 lksb->lksb_conn = conn;
250 else
251 BUG_ON(lksb->lksb_conn != conn);
252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, 252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
253 name, namelen, astarg); 253 name, namelen);
254} 254}
255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); 255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
256 256
257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
258 union ocfs2_dlm_lksb *lksb, 258 struct ocfs2_dlm_lksb *lksb,
259 u32 flags, 259 u32 flags)
260 struct ocfs2_lock_res *astarg)
261{ 260{
262 BUG_ON(lproto == NULL); 261 BUG_ON(lksb->lksb_conn == NULL);
263 262
264 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg); 263 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
265} 264}
266EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); 265EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
267 266
268int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 267int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
269{ 268{
270 return active_stack->sp_ops->lock_status(lksb); 269 return active_stack->sp_ops->lock_status(lksb);
271} 270}
272EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); 271EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
273 272
274int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 273int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
275{ 274{
276 return active_stack->sp_ops->lvb_valid(lksb); 275 return active_stack->sp_ops->lvb_valid(lksb);
277} 276}
278EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); 277EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
279 278
280void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) 279void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
281{ 280{
282 return active_stack->sp_ops->lock_lvb(lksb); 281 return active_stack->sp_ops->lock_lvb(lksb);
283} 282}
284EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); 283EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
285 284
286void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 285void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
287{ 286{
288 active_stack->sp_ops->dump_lksb(lksb); 287 active_stack->sp_ops->dump_lksb(lksb);
289} 288}
@@ -312,6 +311,7 @@ EXPORT_SYMBOL_GPL(ocfs2_plock);
312int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
313 const char *group, 312 const char *group,
314 int grouplen, 313 int grouplen,
314 struct ocfs2_locking_protocol *lproto,
315 void (*recovery_handler)(int node_num, 315 void (*recovery_handler)(int node_num,
316 void *recovery_data), 316 void *recovery_data),
317 void *recovery_data, 317 void *recovery_data,
@@ -329,6 +329,12 @@ int ocfs2_cluster_connect(const char *stack_name,
329 goto out; 329 goto out;
330 } 330 }
331 331
332 if (memcmp(&lproto->lp_max_version, &locking_max_version,
333 sizeof(struct ocfs2_protocol_version))) {
334 rc = -EINVAL;
335 goto out;
336 }
337
332 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), 338 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
333 GFP_KERNEL); 339 GFP_KERNEL);
334 if (!new_conn) { 340 if (!new_conn) {
@@ -341,6 +347,7 @@ int ocfs2_cluster_connect(const char *stack_name,
341 new_conn->cc_recovery_handler = recovery_handler; 347 new_conn->cc_recovery_handler = recovery_handler;
342 new_conn->cc_recovery_data = recovery_data; 348 new_conn->cc_recovery_data = recovery_data;
343 349
350 new_conn->cc_proto = lproto;
344 /* Start the new connection at our maximum compatibility level */ 351 /* Start the new connection at our maximum compatibility level */
345 new_conn->cc_version = lproto->lp_max_version; 352 new_conn->cc_version = lproto->lp_max_version;
346 353
@@ -366,6 +373,24 @@ out:
366} 373}
367EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); 374EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
368 375
376/* The caller will ensure all nodes have the same cluster stack */
377int ocfs2_cluster_connect_agnostic(const char *group,
378 int grouplen,
379 struct ocfs2_locking_protocol *lproto,
380 void (*recovery_handler)(int node_num,
381 void *recovery_data),
382 void *recovery_data,
383 struct ocfs2_cluster_connection **conn)
384{
385 char *stack_name = NULL;
386
387 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
390 recovery_handler, recovery_data, conn);
391}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393
369/* If hangup_pending is 0, the stack driver will be dropped */ 394/* If hangup_pending is 0, the stack driver will be dropped */
370int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 395int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
371 int hangup_pending) 396 int hangup_pending)
@@ -453,10 +478,10 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
453 ssize_t ret = 0; 478 ssize_t ret = 0;
454 479
455 spin_lock(&ocfs2_stack_lock); 480 spin_lock(&ocfs2_stack_lock);
456 if (lproto) 481 if (locking_max_version.pv_major)
457 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", 482 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
458 lproto->lp_max_version.pv_major, 483 locking_max_version.pv_major,
459 lproto->lp_max_version.pv_minor); 484 locking_max_version.pv_minor);
460 spin_unlock(&ocfs2_stack_lock); 485 spin_unlock(&ocfs2_stack_lock);
461 486
462 return ret; 487 return ret;
@@ -620,51 +645,46 @@ error:
620 645
621static ctl_table ocfs2_nm_table[] = { 646static ctl_table ocfs2_nm_table[] = {
622 { 647 {
623 .ctl_name = 1,
624 .procname = "hb_ctl_path", 648 .procname = "hb_ctl_path",
625 .data = ocfs2_hb_ctl_path, 649 .data = ocfs2_hb_ctl_path,
626 .maxlen = OCFS2_MAX_HB_CTL_PATH, 650 .maxlen = OCFS2_MAX_HB_CTL_PATH,
627 .mode = 0644, 651 .mode = 0644,
628 .proc_handler = &proc_dostring, 652 .proc_handler = proc_dostring,
629 .strategy = &sysctl_string,
630 }, 653 },
631 { .ctl_name = 0 } 654 { }
632}; 655};
633 656
634static ctl_table ocfs2_mod_table[] = { 657static ctl_table ocfs2_mod_table[] = {
635 { 658 {
636 .ctl_name = FS_OCFS2_NM,
637 .procname = "nm", 659 .procname = "nm",
638 .data = NULL, 660 .data = NULL,
639 .maxlen = 0, 661 .maxlen = 0,
640 .mode = 0555, 662 .mode = 0555,
641 .child = ocfs2_nm_table 663 .child = ocfs2_nm_table
642 }, 664 },
643 { .ctl_name = 0} 665 { }
644}; 666};
645 667
646static ctl_table ocfs2_kern_table[] = { 668static ctl_table ocfs2_kern_table[] = {
647 { 669 {
648 .ctl_name = FS_OCFS2,
649 .procname = "ocfs2", 670 .procname = "ocfs2",
650 .data = NULL, 671 .data = NULL,
651 .maxlen = 0, 672 .maxlen = 0,
652 .mode = 0555, 673 .mode = 0555,
653 .child = ocfs2_mod_table 674 .child = ocfs2_mod_table
654 }, 675 },
655 { .ctl_name = 0} 676 { }
656}; 677};
657 678
658static ctl_table ocfs2_root_table[] = { 679static ctl_table ocfs2_root_table[] = {
659 { 680 {
660 .ctl_name = CTL_FS,
661 .procname = "fs", 681 .procname = "fs",
662 .data = NULL, 682 .data = NULL,
663 .maxlen = 0, 683 .maxlen = 0,
664 .mode = 0555, 684 .mode = 0555,
665 .child = ocfs2_kern_table 685 .child = ocfs2_kern_table
666 }, 686 },
667 { .ctl_name = 0 } 687 { }
668}; 688};
669 689
670static struct ctl_table_header *ocfs2_table_header = NULL; 690static struct ctl_table_header *ocfs2_table_header = NULL;
@@ -690,7 +710,10 @@ static int __init ocfs2_stack_glue_init(void)
690 710
691static void __exit ocfs2_stack_glue_exit(void) 711static void __exit ocfs2_stack_glue_exit(void)
692{ 712{
693 lproto = NULL; 713 memset(&locking_max_version, 0,
714 sizeof(struct ocfs2_protocol_version));
715 locking_max_version.pv_major = 0;
716 locking_max_version.pv_minor = 0;
694 ocfs2_sysfs_exit(); 717 ocfs2_sysfs_exit();
695 if (ocfs2_table_header) 718 if (ocfs2_table_header)
696 unregister_sysctl_table(ocfs2_table_header); 719 unregister_sysctl_table(ocfs2_table_header);
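
A hedged sketch of a caller of the new ocfs2_cluster_connect_agnostic() (all example_* names are invented): the protocol's lp_max_version must equal the version previously registered through ocfs2_stack_glue_set_max_proto_version(), or ocfs2_cluster_connect() rejects the connection with -EINVAL via the memcmp() above.

    static struct ocfs2_locking_protocol example_proto = {
            .lp_max_version  = { .pv_major = 1, .pv_minor = 0 },
            .lp_lock_ast     = example_lock_ast,
            .lp_blocking_ast = example_blocking_ast,
            .lp_unlock_ast   = example_unlock_ast,
    };

    static int example_connect(struct ocfs2_cluster_connection **conn)
    {
            /* NULL stack name means "use whatever stack is configured" */
            return ocfs2_cluster_connect_agnostic("examplegrp", 10,
                                                  &example_proto,
                                                  example_recovery, NULL,
                                                  conn);
    }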
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 03a44d60eac9..8ce7398ae1d2 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -56,17 +56,6 @@ struct ocfs2_protocol_version {
56}; 56};
57 57
58/* 58/*
59 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
60 */
61struct ocfs2_locking_protocol {
62 struct ocfs2_protocol_version lp_max_version;
63 void (*lp_lock_ast)(void *astarg);
64 void (*lp_blocking_ast)(void *astarg, int level);
65 void (*lp_unlock_ast)(void *astarg, int error);
66};
67
68
69/*
70 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only 59 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
71 * has a pointer to separately allocated lvb space. This struct exists only to 60 * has a pointer to separately allocated lvb space. This struct exists only to
72 * include in the lksb union to make space for a combined dlm_lksb and lvb. 61 * include in the lksb union to make space for a combined dlm_lksb and lvb.
@@ -81,12 +70,27 @@ struct fsdlm_lksb_plus_lvb {
81 * size of the union is known. Lock status structures are embedded in 70 * size of the union is known. Lock status structures are embedded in
82 * ocfs2 inodes. 71 * ocfs2 inodes.
83 */ 72 */
84union ocfs2_dlm_lksb { 73struct ocfs2_cluster_connection;
85 struct dlm_lockstatus lksb_o2dlm; 74struct ocfs2_dlm_lksb {
86 struct dlm_lksb lksb_fsdlm; 75 union {
87 struct fsdlm_lksb_plus_lvb padding; 76 struct dlm_lockstatus lksb_o2dlm;
77 struct dlm_lksb lksb_fsdlm;
78 struct fsdlm_lksb_plus_lvb padding;
79 };
80 struct ocfs2_cluster_connection *lksb_conn;
81};
82
83/*
84 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
85 */
86struct ocfs2_locking_protocol {
87 struct ocfs2_protocol_version lp_max_version;
88 void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
89 void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
90 void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
88}; 91};
89 92
93
90/* 94/*
91 * A cluster connection. Mostly opaque to ocfs2, the connection holds 95 * A cluster connection. Mostly opaque to ocfs2, the connection holds
92 * state for the underlying stack. ocfs2 does use cc_version to determine 96 * state for the underlying stack. ocfs2 does use cc_version to determine
@@ -96,6 +100,7 @@ struct ocfs2_cluster_connection {
96 char cc_name[GROUP_NAME_MAX]; 100 char cc_name[GROUP_NAME_MAX];
97 int cc_namelen; 101 int cc_namelen;
98 struct ocfs2_protocol_version cc_version; 102 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto;
99 void (*cc_recovery_handler)(int node_num, void *recovery_data); 104 void (*cc_recovery_handler)(int node_num, void *recovery_data);
100 void *cc_recovery_data; 105 void *cc_recovery_data;
101 void *cc_lockspace; 106 void *cc_lockspace;
@@ -155,27 +160,29 @@ struct ocfs2_stack_operations {
155 * 160 *
156 * ast and bast functions are not part of the call because the 161 * ast and bast functions are not part of the call because the
157 * stack will likely want to wrap ast and bast calls before passing 162 * stack will likely want to wrap ast and bast calls before passing
158 * them to stack->sp_proto. 163 * them to stack->sp_proto. There is no astarg. The lksb will
164 * be passed back to the ast and bast functions. The caller can
165 * use this to find their object.
159 */ 166 */
160 int (*dlm_lock)(struct ocfs2_cluster_connection *conn, 167 int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
161 int mode, 168 int mode,
162 union ocfs2_dlm_lksb *lksb, 169 struct ocfs2_dlm_lksb *lksb,
163 u32 flags, 170 u32 flags,
164 void *name, 171 void *name,
165 unsigned int namelen, 172 unsigned int namelen);
166 void *astarg);
167 173
168 /* 174 /*
169 * Call the underlying dlm unlock function. The ->dlm_unlock() 175 * Call the underlying dlm unlock function. The ->dlm_unlock()
170 * function should convert the flags as appropriate. 176 * function should convert the flags as appropriate.
171 * 177 *
172 * The unlock ast is not passed, as the stack will want to wrap 178 * The unlock ast is not passed, as the stack will want to wrap
173 * it before calling stack->sp_proto->lp_unlock_ast(). 179 * it before calling stack->sp_proto->lp_unlock_ast(). There is
180 * no astarg. The lksb will be passed back to the unlock ast
181 * function. The caller can use this to find their object.
174 */ 182 */
175 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, 183 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
176 union ocfs2_dlm_lksb *lksb, 184 struct ocfs2_dlm_lksb *lksb,
177 u32 flags, 185 u32 flags);
178 void *astarg);
179 186
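Dropping astarg works because the lksb is embedded in the caller's lock resource, so the handlers can recover their object with container_of(). A minimal sketch, assuming an embedding field named l_lksb (the example_ names are illustrative):

	struct example_lock_res {
		struct ocfs2_dlm_lksb	l_lksb;
		/* ... fs-private state ... */
	};

	static void example_lock_ast(struct ocfs2_dlm_lksb *lksb)
	{
		struct example_lock_res *res =
			container_of(lksb, struct example_lock_res, l_lksb);

		/* wake waiters, update res state, etc. */
	}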
180 /* 187 /*
181 * Return the status of the current lock status block. The fs 188 * Return the status of the current lock status block. The fs
@@ -183,17 +190,17 @@ struct ocfs2_stack_operations {
183 * callback pulls out the stack-specific lksb, converts the status 190 * callback pulls out the stack-specific lksb, converts the status
184 * to a proper errno, and returns it. 191 * to a proper errno, and returns it.
185 */ 192 */
186 int (*lock_status)(union ocfs2_dlm_lksb *lksb); 193 int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
187 194
188 /* 195 /*
189 * Return non-zero if the LVB is valid. 196 * Return non-zero if the LVB is valid.
190 */ 197 */
191 int (*lvb_valid)(union ocfs2_dlm_lksb *lksb); 198 int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
192 199
193 /* 200 /*
194 * Pull the lvb pointer off of the stack-specific lksb. 201 * Pull the lvb pointer off of the stack-specific lksb.
195 */ 202 */
196 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 203 void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
197 204
198 /* 205 /*
199 * Cluster-aware posix locks 206 * Cluster-aware posix locks
@@ -210,7 +217,7 @@ struct ocfs2_stack_operations {
210	 * This is an optional debugging hook. If provided, the 217	 * This is an optional debugging hook. If provided, the
211	 * stack can dump debugging information about this lock. 218	 * stack can dump debugging information about this lock.
212 */ 219 */
213 void (*dump_lksb)(union ocfs2_dlm_lksb *lksb); 220 void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
214}; 221};
215 222
216/* 223/*
@@ -226,7 +233,7 @@ struct ocfs2_stack_plugin {
226 /* These are managed by the stackglue code. */ 233 /* These are managed by the stackglue code. */
227 struct list_head sp_list; 234 struct list_head sp_list;
228 unsigned int sp_count; 235 unsigned int sp_count;
229 struct ocfs2_locking_protocol *sp_proto; 236 struct ocfs2_protocol_version sp_max_proto;
230}; 237};
231 238
232 239
@@ -234,10 +241,22 @@ struct ocfs2_stack_plugin {
234int ocfs2_cluster_connect(const char *stack_name, 241int ocfs2_cluster_connect(const char *stack_name,
235 const char *group, 242 const char *group,
236 int grouplen, 243 int grouplen,
244 struct ocfs2_locking_protocol *lproto,
237 void (*recovery_handler)(int node_num, 245 void (*recovery_handler)(int node_num,
238 void *recovery_data), 246 void *recovery_data),
239 void *recovery_data, 247 void *recovery_data,
240 struct ocfs2_cluster_connection **conn); 248 struct ocfs2_cluster_connection **conn);
249/*
250 * Used by callers that don't store their stack name. They must ensure
251 * all nodes have the same stack.
252 */
253int ocfs2_cluster_connect_agnostic(const char *group,
254 int grouplen,
255 struct ocfs2_locking_protocol *lproto,
256 void (*recovery_handler)(int node_num,
257 void *recovery_data),
258 void *recovery_data,
259 struct ocfs2_cluster_connection **conn);
241int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
242 int hangup_pending); 261 int hangup_pending);
243void ocfs2_cluster_hangup(const char *group, int grouplen); 262void ocfs2_cluster_hangup(const char *group, int grouplen);
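A hedged usage sketch of the new agnostic connect path, for a caller that records no stack name (all example_ identifiers are illustrative):

	static struct ocfs2_locking_protocol example_proto = {
		.lp_max_version	 = { .pv_major = 1, .pv_minor = 0 },
		.lp_lock_ast	 = example_lock_ast,
		.lp_blocking_ast = example_blocking_ast,
		.lp_unlock_ast	 = example_unlock_ast,
	};

	static void example_recovery(int node_num, void *recovery_data)
	{
		/* node node_num died; recover its state */
	}

	static int example_connect(struct ocfs2_cluster_connection **conn)
	{
		/* No stack name is checked: every node must run the same stack. */
		return ocfs2_cluster_connect_agnostic("example_group",
						      strlen("example_group"),
						      &example_proto,
						      example_recovery,
						      NULL, conn);
	}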
@@ -246,26 +265,24 @@ int ocfs2_cluster_this_node(unsigned int *node);
246struct ocfs2_lock_res; 265struct ocfs2_lock_res;
247int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
248 int mode, 267 int mode,
249 union ocfs2_dlm_lksb *lksb, 268 struct ocfs2_dlm_lksb *lksb,
250 u32 flags, 269 u32 flags,
251 void *name, 270 void *name,
252 unsigned int namelen, 271 unsigned int namelen);
253 struct ocfs2_lock_res *astarg);
254int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 272int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
255 union ocfs2_dlm_lksb *lksb, 273 struct ocfs2_dlm_lksb *lksb,
256 u32 flags, 274 u32 flags);
257 struct ocfs2_lock_res *astarg);
258 275
259int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); 276int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
260int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb); 277int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
261void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 278void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
262void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 279void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
263 280
264int ocfs2_stack_supports_plocks(void); 281int ocfs2_stack_supports_plocks(void);
265int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, 282int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
266 struct file *file, int cmd, struct file_lock *fl); 283 struct file *file, int cmd, struct file_lock *fl);
267 284
268void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); 285void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
269 286
270 287
271/* Used by stack plugins */ 288/* Used by stack plugins */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c30b644d9572..19ba00f28547 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -51,7 +51,7 @@
51#define ALLOC_NEW_GROUP 0x1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2 52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
53 53
54#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
@@ -95,13 +95,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
95 struct buffer_head *group_bh, 95 struct buffer_head *group_bh,
96 unsigned int bit_off, 96 unsigned int bit_off,
97 unsigned int num_bits); 97 unsigned int num_bits);
98static inline int ocfs2_block_group_clear_bits(handle_t *handle,
99 struct inode *alloc_inode,
100 struct ocfs2_group_desc *bg,
101 struct buffer_head *group_bh,
102 unsigned int bit_off,
103 unsigned int num_bits);
104
105static int ocfs2_relink_block_group(handle_t *handle, 98static int ocfs2_relink_block_group(handle_t *handle,
106 struct inode *alloc_inode, 99 struct inode *alloc_inode,
107 struct buffer_head *fe_bh, 100 struct buffer_head *fe_bh,
@@ -152,7 +145,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
152 145
153#define do_error(fmt, ...) \ 146#define do_error(fmt, ...) \
154 do{ \ 147 do{ \
155 if (clean_error) \ 148 if (resize) \
156 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 149 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
157 else \ 150 else \
158 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 151 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
160 153
161static int ocfs2_validate_gd_self(struct super_block *sb, 154static int ocfs2_validate_gd_self(struct super_block *sb,
162 struct buffer_head *bh, 155 struct buffer_head *bh,
163 int clean_error) 156 int resize)
164{ 157{
165 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 158 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
166 159
@@ -211,7 +204,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
211static int ocfs2_validate_gd_parent(struct super_block *sb, 204static int ocfs2_validate_gd_parent(struct super_block *sb,
212 struct ocfs2_dinode *di, 205 struct ocfs2_dinode *di,
213 struct buffer_head *bh, 206 struct buffer_head *bh,
214 int clean_error) 207 int resize)
215{ 208{
216 unsigned int max_bits; 209 unsigned int max_bits;
217 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 210 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +226,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 return -EINVAL; 226 return -EINVAL;
234 } 227 }
235 228
236	if (le16_to_cpu(gd->bg_chain) >= 229	/* In resize, we may encounter the case where bg_chain == cl_next_free_rec. */
237 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 230 if ((le16_to_cpu(gd->bg_chain) >
231 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
232 ((le16_to_cpu(gd->bg_chain) ==
233 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
238 do_error("Group descriptor #%llu has bad chain %u", 234 do_error("Group descriptor #%llu has bad chain %u",
239 (unsigned long long)bh->b_blocknr, 235 (unsigned long long)bh->b_blocknr,
240 le16_to_cpu(gd->bg_chain)); 236 le16_to_cpu(gd->bg_chain));
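Spelled out, the new test rejects (a sketch equivalent to the code above):

	int chain = le16_to_cpu(gd->bg_chain);
	int next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
	int bad_chain = (chain > next_free) ||
			(chain == next_free && !resize);

Equality is tolerated only during resize, when a freshly added group may sit on the chain slot that is about to become valid.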
@@ -637,12 +633,113 @@ bail:
637 return status; 633 return status;
638} 634}
639 635
636static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
637{
638 spin_lock(&osb->osb_lock);
639 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
640 spin_unlock(&osb->osb_lock);
641 atomic_set(&osb->s_num_inodes_stolen, 0);
642}
643
644static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
645{
646 spin_lock(&osb->osb_lock);
647 osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
648 spin_unlock(&osb->osb_lock);
649 atomic_set(&osb->s_num_meta_stolen, 0);
650}
651
652void ocfs2_init_steal_slots(struct ocfs2_super *osb)
653{
654 ocfs2_init_inode_steal_slot(osb);
655 ocfs2_init_meta_steal_slot(osb);
656}
657
658static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
659{
660 spin_lock(&osb->osb_lock);
661 if (type == INODE_ALLOC_SYSTEM_INODE)
662 osb->s_inode_steal_slot = slot;
663 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
664 osb->s_meta_steal_slot = slot;
665 spin_unlock(&osb->osb_lock);
666}
667
668static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
669{
670 int slot = OCFS2_INVALID_SLOT;
671
672 spin_lock(&osb->osb_lock);
673 if (type == INODE_ALLOC_SYSTEM_INODE)
674 slot = osb->s_inode_steal_slot;
675 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
676 slot = osb->s_meta_steal_slot;
677 spin_unlock(&osb->osb_lock);
678
679 return slot;
680}
681
682static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
683{
684 return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
685}
686
687static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
688{
689 return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
690}
691
692static int ocfs2_steal_resource(struct ocfs2_super *osb,
693 struct ocfs2_alloc_context *ac,
694 int type)
695{
696 int i, status = -ENOSPC;
697 int slot = __ocfs2_get_steal_slot(osb, type);
698
699	/* Start stealing resources from the first slot after ours. */
700 if (slot == OCFS2_INVALID_SLOT)
701 slot = osb->slot_num + 1;
702
703 for (i = 0; i < osb->max_slots; i++, slot++) {
704 if (slot == osb->max_slots)
705 slot = 0;
706
707 if (slot == osb->slot_num)
708 continue;
709
710 status = ocfs2_reserve_suballoc_bits(osb, ac,
711 type,
712 (u32)slot, NULL,
713 NOT_ALLOC_NEW_GROUP);
714 if (status >= 0) {
715 __ocfs2_set_steal_slot(osb, slot, type);
716 break;
717 }
718
719 ocfs2_free_ac_resource(ac);
720 }
721
722 return status;
723}
724
725static int ocfs2_steal_inode(struct ocfs2_super *osb,
726 struct ocfs2_alloc_context *ac)
727{
728 return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
729}
730
731static int ocfs2_steal_meta(struct ocfs2_super *osb,
732 struct ocfs2_alloc_context *ac)
733{
734 return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
735}
736
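The helpers above generalize the old inode-only stealing to any suballocator type. A hedged sketch of the per-superblock state they depend on (the real fields live in struct ocfs2_super, modified in ocfs2.h and not shown in this diff):

	/* illustrative only -- see struct ocfs2_super in ocfs2.h */
	spinlock_t	osb_lock;		/* guards the slot fields */
	int		s_inode_steal_slot;	/* slot we last stole inodes from */
	int		s_meta_steal_slot;	/* slot we last stole metadata from */
	atomic_t	s_num_inodes_stolen;	/* reset when we retry our own slot */
	atomic_t	s_num_meta_stolen;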
640int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, 737int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
641 int blocks, 738 int blocks,
642 struct ocfs2_alloc_context **ac) 739 struct ocfs2_alloc_context **ac)
643{ 740{
644 int status; 741 int status;
645 u32 slot; 742 int slot = ocfs2_get_meta_steal_slot(osb);
646 743
647 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 744 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
648 if (!(*ac)) { 745 if (!(*ac)) {
@@ -653,12 +750,34 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
653 750
654 (*ac)->ac_bits_wanted = blocks; 751 (*ac)->ac_bits_wanted = blocks;
655 (*ac)->ac_which = OCFS2_AC_USE_META; 752 (*ac)->ac_which = OCFS2_AC_USE_META;
656 slot = osb->slot_num;
657 (*ac)->ac_group_search = ocfs2_block_group_search; 753 (*ac)->ac_group_search = ocfs2_block_group_search;
658 754
755 if (slot != OCFS2_INVALID_SLOT &&
756 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
757 goto extent_steal;
758
759 atomic_set(&osb->s_num_meta_stolen, 0);
659 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 760 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
660 EXTENT_ALLOC_SYSTEM_INODE, 761 EXTENT_ALLOC_SYSTEM_INODE,
661 slot, NULL, ALLOC_NEW_GROUP); 762 (u32)osb->slot_num, NULL,
763 ALLOC_NEW_GROUP);
764
765
766 if (status >= 0) {
767 status = 0;
768 if (slot != OCFS2_INVALID_SLOT)
769 ocfs2_init_meta_steal_slot(osb);
770 goto bail;
771 } else if (status < 0 && status != -ENOSPC) {
772 mlog_errno(status);
773 goto bail;
774 }
775
776 ocfs2_free_ac_resource(*ac);
777
778extent_steal:
779 status = ocfs2_steal_meta(osb, *ac);
780 atomic_inc(&osb->s_num_meta_stolen);
662 if (status < 0) { 781 if (status < 0) {
663 if (status != -ENOSPC) 782 if (status != -ENOSPC)
664 mlog_errno(status); 783 mlog_errno(status);
@@ -685,43 +804,11 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
685 ac); 804 ac);
686} 805}
687 806
688static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
689 struct ocfs2_alloc_context *ac)
690{
691 int i, status = -ENOSPC;
692 s16 slot = ocfs2_get_inode_steal_slot(osb);
693
694 /* Start to steal inodes from the first slot after ours. */
695 if (slot == OCFS2_INVALID_SLOT)
696 slot = osb->slot_num + 1;
697
698 for (i = 0; i < osb->max_slots; i++, slot++) {
699 if (slot == osb->max_slots)
700 slot = 0;
701
702 if (slot == osb->slot_num)
703 continue;
704
705 status = ocfs2_reserve_suballoc_bits(osb, ac,
706 INODE_ALLOC_SYSTEM_INODE,
707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
709 if (status >= 0) {
710 ocfs2_set_inode_steal_slot(osb, slot);
711 break;
712 }
713
714 ocfs2_free_ac_resource(ac);
715 }
716
717 return status;
718}
719
720int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 807int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
721 struct ocfs2_alloc_context **ac) 808 struct ocfs2_alloc_context **ac)
722{ 809{
723 int status; 810 int status;
724 s16 slot = ocfs2_get_inode_steal_slot(osb); 811 int slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group; 812 u64 alloc_group;
726 813
727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 814 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
@@ -754,14 +841,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
754 * need to check our slots to see whether there is some space for us. 841 * need to check our slots to see whether there is some space for us.
755 */ 842 */
756 if (slot != OCFS2_INVALID_SLOT && 843 if (slot != OCFS2_INVALID_SLOT &&
757 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL) 844 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
758 goto inode_steal; 845 goto inode_steal;
759 846
760 atomic_set(&osb->s_num_inodes_stolen, 0); 847 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group; 848 alloc_group = osb->osb_inode_alloc_group;
762 status = ocfs2_reserve_suballoc_bits(osb, *ac, 849 status = ocfs2_reserve_suballoc_bits(osb, *ac,
763 INODE_ALLOC_SYSTEM_INODE, 850 INODE_ALLOC_SYSTEM_INODE,
764 osb->slot_num, 851 (u32)osb->slot_num,
765 &alloc_group, 852 &alloc_group,
766 ALLOC_NEW_GROUP | 853 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL); 854 ALLOC_GROUPS_FROM_GLOBAL);
@@ -789,7 +876,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
789 ocfs2_free_ac_resource(*ac); 876 ocfs2_free_ac_resource(*ac);
790 877
791inode_steal: 878inode_steal:
792 status = ocfs2_steal_inode_from_other_nodes(osb, *ac); 879 status = ocfs2_steal_inode(osb, *ac);
793 atomic_inc(&osb->s_num_inodes_stolen); 880 atomic_inc(&osb->s_num_inodes_stolen);
794 if (status < 0) { 881 if (status < 0) {
795 if (status != -ENOSPC) 882 if (status != -ENOSPC)
@@ -1884,18 +1971,18 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1884 bits_wanted, cluster_start, num_clusters); 1971 bits_wanted, cluster_start, num_clusters);
1885} 1972}
1886 1973
1887static inline int ocfs2_block_group_clear_bits(handle_t *handle, 1974static int ocfs2_block_group_clear_bits(handle_t *handle,
1888 struct inode *alloc_inode, 1975 struct inode *alloc_inode,
1889 struct ocfs2_group_desc *bg, 1976 struct ocfs2_group_desc *bg,
1890 struct buffer_head *group_bh, 1977 struct buffer_head *group_bh,
1891 unsigned int bit_off, 1978 unsigned int bit_off,
1892 unsigned int num_bits) 1979 unsigned int num_bits,
1980 void (*undo_fn)(unsigned int bit,
1981 unsigned long *bmap))
1893{ 1982{
1894 int status; 1983 int status;
1895 unsigned int tmp; 1984 unsigned int tmp;
1896 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1897 struct ocfs2_group_desc *undo_bg = NULL; 1985 struct ocfs2_group_desc *undo_bg = NULL;
1898 int cluster_bitmap = 0;
1899 1986
1900 mlog_entry_void(); 1987 mlog_entry_void();
1901 1988
@@ -1905,20 +1992,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1905 1992
1906 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 1993 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1907 1994
1908 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1995 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
1909 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1910
1911 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1996 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1912 group_bh, journal_type); 1997 group_bh,
1998 undo_fn ?
1999 OCFS2_JOURNAL_ACCESS_UNDO :
2000 OCFS2_JOURNAL_ACCESS_WRITE);
1913 if (status < 0) { 2001 if (status < 0) {
1914 mlog_errno(status); 2002 mlog_errno(status);
1915 goto bail; 2003 goto bail;
1916 } 2004 }
1917 2005
1918 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2006 if (undo_fn) {
1919 cluster_bitmap = 1;
1920
1921 if (cluster_bitmap) {
1922 jbd_lock_bh_state(group_bh); 2007 jbd_lock_bh_state(group_bh);
1923 undo_bg = (struct ocfs2_group_desc *) 2008 undo_bg = (struct ocfs2_group_desc *)
1924 bh2jh(group_bh)->b_committed_data; 2009 bh2jh(group_bh)->b_committed_data;
@@ -1929,13 +2014,13 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1929 while(tmp--) { 2014 while(tmp--) {
1930 ocfs2_clear_bit((bit_off + tmp), 2015 ocfs2_clear_bit((bit_off + tmp),
1931 (unsigned long *) bg->bg_bitmap); 2016 (unsigned long *) bg->bg_bitmap);
1932 if (cluster_bitmap) 2017 if (undo_fn)
1933 ocfs2_set_bit(bit_off + tmp, 2018 undo_fn(bit_off + tmp,
1934 (unsigned long *) undo_bg->bg_bitmap); 2019 (unsigned long *) undo_bg->bg_bitmap);
1935 } 2020 }
1936 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2021 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1937 2022
1938 if (cluster_bitmap) 2023 if (undo_fn)
1939 jbd_unlock_bh_state(group_bh); 2024 jbd_unlock_bh_state(group_bh);
1940 2025
1941 status = ocfs2_journal_dirty(handle, group_bh); 2026 status = ocfs2_journal_dirty(handle, group_bh);
@@ -1948,12 +2033,14 @@ bail:
1948/* 2033/*
1949 * expects the suballoc inode to already be locked. 2034 * expects the suballoc inode to already be locked.
1950 */ 2035 */
1951int ocfs2_free_suballoc_bits(handle_t *handle, 2036static int _ocfs2_free_suballoc_bits(handle_t *handle,
1952 struct inode *alloc_inode, 2037 struct inode *alloc_inode,
1953 struct buffer_head *alloc_bh, 2038 struct buffer_head *alloc_bh,
1954 unsigned int start_bit, 2039 unsigned int start_bit,
1955 u64 bg_blkno, 2040 u64 bg_blkno,
1956 unsigned int count) 2041 unsigned int count,
2042 void (*undo_fn)(unsigned int bit,
2043 unsigned long *bitmap))
1957{ 2044{
1958 int status = 0; 2045 int status = 0;
1959 u32 tmp_used; 2046 u32 tmp_used;
@@ -1988,7 +2075,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1988 2075
1989 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2076 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1990 group, group_bh, 2077 group, group_bh,
1991 start_bit, count); 2078 start_bit, count, undo_fn);
1992 if (status < 0) { 2079 if (status < 0) {
1993 mlog_errno(status); 2080 mlog_errno(status);
1994 goto bail; 2081 goto bail;
@@ -2019,6 +2106,17 @@ bail:
2019 return status; 2106 return status;
2020} 2107}
2021 2108
2109int ocfs2_free_suballoc_bits(handle_t *handle,
2110 struct inode *alloc_inode,
2111 struct buffer_head *alloc_bh,
2112 unsigned int start_bit,
2113 u64 bg_blkno,
2114 unsigned int count)
2115{
2116 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2117 start_bit, bg_blkno, count, NULL);
2118}
2119
2022int ocfs2_free_dinode(handle_t *handle, 2120int ocfs2_free_dinode(handle_t *handle,
2023 struct inode *inode_alloc_inode, 2121 struct inode *inode_alloc_inode,
2024 struct buffer_head *inode_alloc_bh, 2122 struct buffer_head *inode_alloc_bh,
@@ -2032,11 +2130,13 @@ int ocfs2_free_dinode(handle_t *handle,
2032 inode_alloc_bh, bit, bg_blkno, 1); 2130 inode_alloc_bh, bit, bg_blkno, 1);
2033} 2131}
2034 2132
2035int ocfs2_free_clusters(handle_t *handle, 2133static int _ocfs2_free_clusters(handle_t *handle,
2036 struct inode *bitmap_inode, 2134 struct inode *bitmap_inode,
2037 struct buffer_head *bitmap_bh, 2135 struct buffer_head *bitmap_bh,
2038 u64 start_blk, 2136 u64 start_blk,
2039 unsigned int num_clusters) 2137 unsigned int num_clusters,
2138 void (*undo_fn)(unsigned int bit,
2139 unsigned long *bitmap))
2040{ 2140{
2041 int status; 2141 int status;
2042 u16 bg_start_bit; 2142 u16 bg_start_bit;
@@ -2063,9 +2163,9 @@ int ocfs2_free_clusters(handle_t *handle,
2063 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2163 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2064 (unsigned long long)bg_blkno, bg_start_bit); 2164 (unsigned long long)bg_blkno, bg_start_bit);
2065 2165
2066 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2166 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2067 bg_start_bit, bg_blkno, 2167 bg_start_bit, bg_blkno,
2068 num_clusters); 2168 num_clusters, undo_fn);
2069 if (status < 0) { 2169 if (status < 0) {
2070 mlog_errno(status); 2170 mlog_errno(status);
2071 goto out; 2171 goto out;
@@ -2079,6 +2179,32 @@ out:
2079 return status; 2179 return status;
2080} 2180}
2081 2181
2182int ocfs2_free_clusters(handle_t *handle,
2183 struct inode *bitmap_inode,
2184 struct buffer_head *bitmap_bh,
2185 u64 start_blk,
2186 unsigned int num_clusters)
2187{
2188 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2189 start_blk, num_clusters,
2190 _ocfs2_set_bit);
2191}
2192
2193/*
2194 * Give never-used clusters back to the global bitmap. We don't need
2195 * to protect these bits in the undo buffer.
2196 */
2197int ocfs2_release_clusters(handle_t *handle,
2198 struct inode *bitmap_inode,
2199 struct buffer_head *bitmap_bh,
2200 u64 start_blk,
2201 unsigned int num_clusters)
2202{
2203 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2204 start_blk, num_clusters,
2205 _ocfs2_clear_bit);
2206}
2207
2082static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2208static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2083{ 2209{
2084 printk("Block Group:\n"); 2210 printk("Block Group:\n");
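The undo_fn callback controls what happens to the journal's committed copy of the bitmap. Clusters freed from live files must stay allocated in that copy until the transaction commits, so a crash cannot hand them out while committed metadata still references them; never-used clusters carry no such risk. A hedged sketch of the two callbacks, assumed to wrap the existing ocfs2_set_bit()/ocfs2_clear_bit() helpers in the undo_fn signature:

	static void _ocfs2_set_bit(unsigned int bit, unsigned long *bmap)
	{
		ocfs2_set_bit(bit, bmap);	/* keep allocated in the undo copy */
	}

	static void _ocfs2_clear_bit(unsigned int bit, unsigned long *bmap)
	{
		ocfs2_clear_bit(bit, bmap);	/* never used: no undo protection */
	}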
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8c9a78a43164..e0f46df357e6 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,7 @@ struct ocfs2_alloc_context {
56 is the same as ~0 - unlimited */ 56 is the same as ~0 - unlimited */
57}; 57};
58 58
59void ocfs2_init_steal_slots(struct ocfs2_super *osb);
59void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); 60void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
60static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) 61static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
61{ 62{
@@ -126,6 +127,11 @@ int ocfs2_free_clusters(handle_t *handle,
126 struct buffer_head *bitmap_bh, 127 struct buffer_head *bitmap_bh,
127 u64 start_blk, 128 u64 start_blk,
128 unsigned int num_clusters); 129 unsigned int num_clusters);
130int ocfs2_release_clusters(handle_t *handle,
131 struct inode *bitmap_inode,
132 struct buffer_head *bitmap_bh,
133 u64 start_blk,
134 unsigned int num_clusters);
129 135
130static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) 136static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
131{ 137{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 14f47d2bfe02..dee03197a494 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -69,6 +69,7 @@
69#include "xattr.h" 69#include "xattr.h"
70#include "quota.h" 70#include "quota.h"
71#include "refcounttree.h" 71#include "refcounttree.h"
72#include "suballoc.h"
72 73
73#include "buffer_head_io.h" 74#include "buffer_head_io.h"
74 75
@@ -100,6 +101,8 @@ struct mount_options
100static int ocfs2_parse_options(struct super_block *sb, char *options, 101static int ocfs2_parse_options(struct super_block *sb, char *options,
101 struct mount_options *mopt, 102 struct mount_options *mopt,
102 int is_remount); 103 int is_remount);
104static int ocfs2_check_set_options(struct super_block *sb,
105 struct mount_options *options);
103static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt); 106static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
104static void ocfs2_put_super(struct super_block *sb); 107static void ocfs2_put_super(struct super_block *sb);
105static int ocfs2_mount_volume(struct super_block *sb); 108static int ocfs2_mount_volume(struct super_block *sb);
@@ -299,9 +302,12 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
299 302
300 spin_lock(&osb->osb_lock); 303 spin_lock(&osb->osb_lock);
301 out += snprintf(buf + out, len - out, 304 out += snprintf(buf + out, len - out,
302 "%10s => Slot: %d NumStolen: %d\n", "Steal", 305 "%10s => InodeSlot: %d StolenInodes: %d, "
306 "MetaSlot: %d StolenMeta: %d\n", "Steal",
303 osb->s_inode_steal_slot, 307 osb->s_inode_steal_slot,
304 atomic_read(&osb->s_num_inodes_stolen)); 308 atomic_read(&osb->s_num_inodes_stolen),
309 osb->s_meta_steal_slot,
310 atomic_read(&osb->s_num_meta_stolen));
305 spin_unlock(&osb->osb_lock); 311 spin_unlock(&osb->osb_lock);
306 312
307 out += snprintf(buf + out, len - out, "OrphanScan => "); 313 out += snprintf(buf + out, len - out, "OrphanScan => ");
@@ -600,7 +606,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
600 606
601 lock_kernel(); 607 lock_kernel();
602 608
603 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 609 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
610 !ocfs2_check_set_options(sb, &parsed_options)) {
604 ret = -EINVAL; 611 ret = -EINVAL;
605 goto out; 612 goto out;
606 } 613 }
@@ -691,8 +698,6 @@ unlock_osb:
691 if (!ret) { 698 if (!ret) {
692 /* Only save off the new mount options in case of a successful 699 /* Only save off the new mount options in case of a successful
693 * remount. */ 700 * remount. */
694 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
695 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
696 osb->s_mount_opt = parsed_options.mount_opt; 701 osb->s_mount_opt = parsed_options.mount_opt;
697 osb->s_atime_quantum = parsed_options.atime_quantum; 702 osb->s_atime_quantum = parsed_options.atime_quantum;
698 osb->preferred_slot = parsed_options.slot; 703 osb->preferred_slot = parsed_options.slot;
@@ -701,6 +706,10 @@ unlock_osb:
701 706
702 if (!ocfs2_is_hard_readonly(osb)) 707 if (!ocfs2_is_hard_readonly(osb))
703 ocfs2_set_journal_params(osb); 708 ocfs2_set_journal_params(osb);
709
710 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
711 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
712 MS_POSIXACL : 0);
704 } 713 }
705out: 714out:
706 unlock_kernel(); 715 unlock_kernel();
@@ -1011,31 +1020,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1011 brelse(bh); 1020 brelse(bh);
1012 bh = NULL; 1021 bh = NULL;
1013 1022
1014 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) 1023 if (!ocfs2_check_set_options(sb, &parsed_options)) {
1015 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1024 status = -EINVAL;
1016 1025 goto read_super_error;
1026 }
1017 osb->s_mount_opt = parsed_options.mount_opt; 1027 osb->s_mount_opt = parsed_options.mount_opt;
1018 osb->s_atime_quantum = parsed_options.atime_quantum; 1028 osb->s_atime_quantum = parsed_options.atime_quantum;
1019 osb->preferred_slot = parsed_options.slot; 1029 osb->preferred_slot = parsed_options.slot;
1020 osb->osb_commit_interval = parsed_options.commit_interval; 1030 osb->osb_commit_interval = parsed_options.commit_interval;
1021 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 1031 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
1022 osb->local_alloc_bits = osb->local_alloc_default_bits; 1032 osb->local_alloc_bits = osb->local_alloc_default_bits;
1023 if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
1024 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1025 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1026 status = -EINVAL;
1027 mlog(ML_ERROR, "User quotas were requested, but this "
1028 "filesystem does not have the feature enabled.\n");
1029 goto read_super_error;
1030 }
1031 if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
1032 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1033 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1034 status = -EINVAL;
1035 mlog(ML_ERROR, "Group quotas were requested, but this "
1036 "filesystem does not have the feature enabled.\n");
1037 goto read_super_error;
1038 }
1039 1033
1040 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1034 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1041 if (status) 1035 if (status)
@@ -1072,7 +1066,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1072 "file system, but write access is " 1066 "file system, but write access is "
1073 "unavailable.\n"); 1067 "unavailable.\n");
1074 else 1068 else
1075 mlog_errno(status); 1069 mlog_errno(status);
1076 goto read_super_error; 1070 goto read_super_error;
1077 } 1071 }
1078 1072
@@ -1245,6 +1239,40 @@ static struct file_system_type ocfs2_fs_type = {
1245 .next = NULL 1239 .next = NULL
1246}; 1240};
1247 1241
1242static int ocfs2_check_set_options(struct super_block *sb,
1243 struct mount_options *options)
1244{
1245 if (options->mount_opt & OCFS2_MOUNT_USRQUOTA &&
1246 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1247 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1248 mlog(ML_ERROR, "User quotas were requested, but this "
1249 "filesystem does not have the feature enabled.\n");
1250 return 0;
1251 }
1252 if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA &&
1253 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1254 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1255 mlog(ML_ERROR, "Group quotas were requested, but this "
1256 "filesystem does not have the feature enabled.\n");
1257 return 0;
1258 }
1259 if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL &&
1260 !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) {
1261 mlog(ML_ERROR, "ACL support requested but extended attributes "
1262 "feature is not enabled\n");
1263 return 0;
1264 }
1265 /* No ACL setting specified? Use XATTR feature... */
1266 if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL |
1267 OCFS2_MOUNT_NO_POSIX_ACL))) {
1268 if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR))
1269 options->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1270 else
1271 options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1272 }
1273 return 1;
1274}
1275
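The net effect of ocfs2_check_set_options() on ACLs, as a summary table derived from the code above:

	/*
	 * mount option   INCOMPAT_XATTR    result
	 * -o acl         set               POSIX_ACL set, MS_POSIXACL on
	 * -o acl         clear             mount/remount fails (-EINVAL)
	 * -o noacl       set or clear      NO_POSIX_ACL set, MS_POSIXACL off
	 * (neither)      set               defaults to acl
	 * (neither)      clear             defaults to noacl
	 */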
1248static int ocfs2_parse_options(struct super_block *sb, 1276static int ocfs2_parse_options(struct super_block *sb,
1249 char *options, 1277 char *options,
1250 struct mount_options *mopt, 1278 struct mount_options *mopt,
@@ -1392,40 +1420,19 @@ static int ocfs2_parse_options(struct super_block *sb,
1392 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1420 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
1393 break; 1421 break;
1394 case Opt_usrquota: 1422 case Opt_usrquota:
1395 /* We check only on remount, otherwise features
1396 * aren't yet initialized. */
1397 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1398 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1399 mlog(ML_ERROR, "User quota requested but "
1400 "filesystem feature is not set\n");
1401 status = 0;
1402 goto bail;
1403 }
1404 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; 1423 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
1405 break; 1424 break;
1406 case Opt_grpquota: 1425 case Opt_grpquota:
1407 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1408 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1409 mlog(ML_ERROR, "Group quota requested but "
1410 "filesystem feature is not set\n");
1411 status = 0;
1412 goto bail;
1413 }
1414 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1426 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1415 break; 1427 break;
1416#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1417 case Opt_acl: 1428 case Opt_acl:
1418 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1429 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1430 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
1419 break; 1431 break;
1420 case Opt_noacl: 1432 case Opt_noacl:
1433 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1421 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1434 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1422 break; 1435 break;
1423#else
1424 case Opt_acl:
1425 case Opt_noacl:
1426 printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
1427 break;
1428#endif
1429 default: 1436 default:
1430 mlog(ML_ERROR, 1437 mlog(ML_ERROR,
1431 "Unrecognized mount option \"%s\" " 1438 "Unrecognized mount option \"%s\" "
@@ -1502,12 +1509,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1502 if (opts & OCFS2_MOUNT_INODE64) 1509 if (opts & OCFS2_MOUNT_INODE64)
1503 seq_printf(s, ",inode64"); 1510 seq_printf(s, ",inode64");
1504 1511
1505#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1506 if (opts & OCFS2_MOUNT_POSIX_ACL) 1512 if (opts & OCFS2_MOUNT_POSIX_ACL)
1507 seq_printf(s, ",acl"); 1513 seq_printf(s, ",acl");
1508 else 1514 else
1509 seq_printf(s, ",noacl"); 1515 seq_printf(s, ",noacl");
1510#endif
1511 1516
1512 return 0; 1517 return 0;
1513} 1518}
@@ -1996,7 +2001,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1996 osb->blocked_lock_count = 0; 2001 osb->blocked_lock_count = 0;
1997 spin_lock_init(&osb->osb_lock); 2002 spin_lock_init(&osb->osb_lock);
1998 spin_lock_init(&osb->osb_xattr_lock); 2003 spin_lock_init(&osb->osb_xattr_lock);
1999 ocfs2_init_inode_steal_slot(osb); 2004 ocfs2_init_steal_slots(osb);
2000 2005
2001 atomic_set(&osb->alloc_stats.moves, 0); 2006 atomic_set(&osb->alloc_stats.moves, 0);
2002 atomic_set(&osb->alloc_stats.local_data, 0); 2007 atomic_set(&osb->alloc_stats.local_data, 0);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index e3421030a69f..32499d213fc4 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -137,20 +137,20 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
137 } 137 }
138 138
139 memcpy(link, target, len); 139 memcpy(link, target, len);
140 nd_set_link(nd, link);
141 140
142bail: 141bail:
142 nd_set_link(nd, status ? ERR_PTR(status) : link);
143 brelse(bh); 143 brelse(bh);
144 144
145 mlog_exit(status); 145 mlog_exit(status);
146 return status ? ERR_PTR(status) : link; 146 return NULL;
147} 147}
148 148
149static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 149static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
150{ 150{
151 char *link = cookie; 151 char *link = nd_get_link(nd);
152 152 if (!IS_ERR(link))
153 kfree(link); 153 kfree(link);
154} 154}
155 155
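The fix keeps follow_link and put_link symmetric: whatever ocfs2_fast_follow_link() stores through nd_set_link(), including an ERR_PTR() on failure, is exactly what ocfs2_fast_put_link() reads back and must test before freeing. A minimal sketch of that contract (example_ names are illustrative):

	static void *example_follow_link(struct dentry *dentry,
					 struct nameidata *nd)
	{
		char *link = kzalloc(PATH_MAX, GFP_NOFS);

		if (!link)
			link = ERR_PTR(-ENOMEM);
		nd_set_link(nd, link);
		return NULL;	/* no cookie; put_link uses nd_get_link() */
	}

	static void example_put_link(struct dentry *dentry,
				     struct nameidata *nd, void *cookie)
	{
		char *link = nd_get_link(nd);

		if (!IS_ERR(link))
			kfree(link);
	}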
156const struct inode_operations ocfs2_symlink_inode_operations = { 156const struct inode_operations ocfs2_symlink_inode_operations = {
@@ -163,6 +163,7 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
163 .getxattr = generic_getxattr, 163 .getxattr = generic_getxattr,
164 .listxattr = ocfs2_listxattr, 164 .listxattr = ocfs2_listxattr,
165 .removexattr = generic_removexattr, 165 .removexattr = generic_removexattr,
166 .fiemap = ocfs2_fiemap,
166}; 167};
167const struct inode_operations ocfs2_fast_symlink_inode_operations = { 168const struct inode_operations ocfs2_fast_symlink_inode_operations = {
168 .readlink = ocfs2_readlink, 169 .readlink = ocfs2_readlink,
@@ -174,4 +175,5 @@ const struct inode_operations ocfs2_fast_symlink_inode_operations = {
174 .getxattr = generic_getxattr, 175 .getxattr = generic_getxattr,
175 .listxattr = ocfs2_listxattr, 176 .listxattr = ocfs2_listxattr,
176 .removexattr = generic_removexattr, 177 .removexattr = generic_removexattr,
178 .fiemap = ocfs2_fiemap,
177}; 179};
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#define MLOG_MASK_PREFIX ML_INODE 30#define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index c61369342a27..a0a120e82b97 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -267,8 +267,8 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
267} 267}
268 268
269/* Warning: even if it returns true, this does *not* guarantee that 269/* Warning: even if it returns true, this does *not* guarantee that
270 * the block is stored in our inode metadata cache. 270 * the block is stored in our inode metadata cache.
271 * 271 *
272 * This can be called under lock_buffer() 272 * This can be called under lock_buffer()
273 */ 273 */
274int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, 274int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index fe3419068df2..3e7773089b96 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -98,10 +98,8 @@ static struct ocfs2_xattr_def_value_root def_xv = {
98 98
99struct xattr_handler *ocfs2_xattr_handlers[] = { 99struct xattr_handler *ocfs2_xattr_handlers[] = {
100 &ocfs2_xattr_user_handler, 100 &ocfs2_xattr_user_handler,
101#ifdef CONFIG_OCFS2_FS_POSIX_ACL
102 &ocfs2_xattr_acl_access_handler, 101 &ocfs2_xattr_acl_access_handler,
103 &ocfs2_xattr_acl_default_handler, 102 &ocfs2_xattr_acl_default_handler,
104#endif
105 &ocfs2_xattr_trusted_handler, 103 &ocfs2_xattr_trusted_handler,
106 &ocfs2_xattr_security_handler, 104 &ocfs2_xattr_security_handler,
107 NULL 105 NULL
@@ -109,21 +107,20 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
109 107
110static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 108static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
111 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 109 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
112#ifdef CONFIG_OCFS2_FS_POSIX_ACL
113 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] 110 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
114 = &ocfs2_xattr_acl_access_handler, 111 = &ocfs2_xattr_acl_access_handler,
115 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] 112 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
116 = &ocfs2_xattr_acl_default_handler, 113 = &ocfs2_xattr_acl_default_handler,
117#endif
118 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, 114 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
119 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, 115 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
120}; 116};
121 117
122struct ocfs2_xattr_info { 118struct ocfs2_xattr_info {
123 int name_index; 119 int xi_name_index;
124 const char *name; 120 const char *xi_name;
125 const void *value; 121 int xi_name_len;
126 size_t value_len; 122 const void *xi_value;
123 size_t xi_value_len;
127}; 124};
128 125
129struct ocfs2_xattr_search { 126struct ocfs2_xattr_search {
@@ -141,6 +138,115 @@ struct ocfs2_xattr_search {
141 int not_found; 138 int not_found;
142}; 139};
143 140
141/* Operations on struct ocfs2_xa_entry */
142struct ocfs2_xa_loc;
143struct ocfs2_xa_loc_operations {
144 /*
145 * Journal functions
146 */
147 int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
148 int type);
149 void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
150
151 /*
152 * Return a pointer to the appropriate buffer in loc->xl_storage
153 * at the given offset from loc->xl_header.
154 */
155 void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
156
157 /* Can we reuse the existing entry for the new value? */
158 int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
159 struct ocfs2_xattr_info *xi);
160
161 /* How much space is needed for the new value? */
162 int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
163 struct ocfs2_xattr_info *xi);
164
165 /*
166 * Return the offset of the first name+value pair. This is
167 * the start of our downward-filling free space.
168 */
169 int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
170
171 /*
172 * Remove the name+value at this location. Do whatever is
173 * appropriate with the remaining name+value pairs.
174 */
175 void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
176
177 /* Fill xl_entry with a new entry */
178 void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
179
180 /* Add name+value storage to an entry */
181 void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
182
183 /*
184 * Initialize the value buf's access and bh fields for this entry.
185 * ocfs2_xa_fill_value_buf() will handle the xv pointer.
186 */
187 void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
188 struct ocfs2_xattr_value_buf *vb);
189};
190
191/*
192 * Describes an xattr entry location. This is a memory structure
193 * tracking the on-disk structure.
194 */
195struct ocfs2_xa_loc {
196 /* This xattr belongs to this inode */
197 struct inode *xl_inode;
198
199 /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
200 struct ocfs2_xattr_header *xl_header;
201
202 /* Bytes from xl_header to the end of the storage */
203 int xl_size;
204
205 /*
206 * The ocfs2_xattr_entry this location describes. If this is
207 * NULL, this location describes the on-disk structure where it
208 * would have been.
209 */
210 struct ocfs2_xattr_entry *xl_entry;
211
212 /*
213 * Internal housekeeping
214 */
215
216 /* Buffer(s) containing this entry */
217 void *xl_storage;
218
219 /* Operations on the storage backing this location */
220 const struct ocfs2_xa_loc_operations *xl_ops;
221};
222
223/*
224 * Convenience functions to calculate how much space is needed for a
225 * given name+value pair
226 */
227static int namevalue_size(int name_len, uint64_t value_len)
228{
229 if (value_len > OCFS2_XATTR_INLINE_SIZE)
230 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
231 else
232 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
233}
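A worked sizing example (OCFS2_XATTR_SIZE() pads to a 4-byte boundary; the numbers here are illustrative):

	/*
	 * name "user.data", name_len = 9, padded to 12:
	 *
	 *   inline 20-byte value:
	 *     namevalue_size(9, 20) = 12 + 20 = 32 bytes
	 *   value larger than OCFS2_XATTR_INLINE_SIZE:
	 *     namevalue_size(9, big) = 12 + OCFS2_XATTR_ROOT_SIZE
	 *     (the name plus a value tree root; data lives in extents)
	 */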
234
235static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
236{
237 return namevalue_size(xi->xi_name_len, xi->xi_value_len);
238}
239
240static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
241{
242 u64 value_len = le64_to_cpu(xe->xe_value_size);
243
244 BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
245 ocfs2_xattr_is_local(xe));
246 return namevalue_size(xe->xe_name_len, value_len);
247}
248
249
144static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, 250static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
145 struct ocfs2_xattr_header *xh, 251 struct ocfs2_xattr_header *xh,
146 int index, 252 int index,
@@ -205,8 +311,6 @@ static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
205 int offset, 311 int offset,
206 struct ocfs2_xattr_value_root **xv, 312 struct ocfs2_xattr_value_root **xv,
207 struct buffer_head **bh); 313 struct buffer_head **bh);
208static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
209 const void *value, size_t size, int flags);
210 314
211static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) 315static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
212{ 316{
@@ -218,14 +322,6 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
218 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); 322 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
219} 323}
220 324
221static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
222{
223 u16 len = sb->s_blocksize -
224 offsetof(struct ocfs2_xattr_header, xh_entries);
225
226 return len / sizeof(struct ocfs2_xattr_entry);
227}
228
229#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) 325#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
230#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) 326#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
231#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) 327#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
@@ -469,35 +565,22 @@ static u32 ocfs2_xattr_name_hash(struct inode *inode,
469 return hash; 565 return hash;
470} 566}
471 567
472/* 568static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
473 * ocfs2_xattr_hash_entry()
474 *
475 * Compute the hash of an extended attribute.
476 */
477static void ocfs2_xattr_hash_entry(struct inode *inode,
478 struct ocfs2_xattr_header *header,
479 struct ocfs2_xattr_entry *entry)
480{ 569{
481 u32 hash = 0; 570 return namevalue_size(name_len, value_len) +
482 char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); 571 sizeof(struct ocfs2_xattr_entry);
483
484 hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
485 entry->xe_name_hash = cpu_to_le32(hash);
486
487 return;
488} 572}
489 573
490static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) 574static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
491{ 575{
492 int size = 0; 576 return namevalue_size_xi(xi) +
493 577 sizeof(struct ocfs2_xattr_entry);
494 if (value_len <= OCFS2_XATTR_INLINE_SIZE) 578}
495 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
496 else
497 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
498 size += sizeof(struct ocfs2_xattr_entry);
499 579
500 return size; 580static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
581{
582 return namevalue_size_xe(xe) +
583 sizeof(struct ocfs2_xattr_entry);
501} 584}
502 585
503int ocfs2_calc_security_init(struct inode *dir, 586int ocfs2_calc_security_init(struct inode *dir,
@@ -1314,452 +1397,897 @@ out:
1314 return ret; 1397 return ret;
1315} 1398}
1316 1399
1317static int ocfs2_xattr_cleanup(struct inode *inode, 1400static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
1318 handle_t *handle, 1401 int num_entries)
1319 struct ocfs2_xattr_info *xi,
1320 struct ocfs2_xattr_search *xs,
1321 struct ocfs2_xattr_value_buf *vb,
1322 size_t offs)
1323{ 1402{
1324 int ret = 0; 1403 int free_space;
1325 size_t name_len = strlen(xi->name);
1326 void *val = xs->base + offs;
1327 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1328 1404
1329 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1405 if (!needed_space)
1330 OCFS2_JOURNAL_ACCESS_WRITE); 1406 return 0;
1331 if (ret) {
1332 mlog_errno(ret);
1333 goto out;
1334 }
1335 /* Decrease xattr count */
1336 le16_add_cpu(&xs->header->xh_count, -1);
1337 /* Remove the xattr entry and tree root which has already be set*/
1338 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
1339 memset(val, 0, size);
1340 1407
1341 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1408 free_space = free_start -
1342 if (ret < 0) 1409 sizeof(struct ocfs2_xattr_header) -
1343 mlog_errno(ret); 1410 (num_entries * sizeof(struct ocfs2_xattr_entry)) -
1344out: 1411 OCFS2_XATTR_HEADER_GAP;
1345 return ret; 1412 if (free_space < 0)
1413 return -EIO;
1414 if (free_space < needed_space)
1415 return -ENOSPC;
1416
1417 return 0;
1346} 1418}
1347 1419
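The arithmetic in ocfs2_xa_check_space_helper() reflects the storage geometry: entries grow upward from the header while name+value pairs grow downward from the end, and free_start marks the lowest pair. The space left between the two regions is (sketch):

	free_space = free_start
		     - sizeof(struct ocfs2_xattr_header)
		     - num_entries * sizeof(struct ocfs2_xattr_entry)
		     - OCFS2_XATTR_HEADER_GAP;	/* slack kept between regions */

A negative result indicates on-disk inconsistency (-EIO); merely insufficient space is -ENOSPC.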
1348static int ocfs2_xattr_update_entry(struct inode *inode, 1420static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
1349 handle_t *handle, 1421 int type)
1350 struct ocfs2_xattr_info *xi,
1351 struct ocfs2_xattr_search *xs,
1352 struct ocfs2_xattr_value_buf *vb,
1353 size_t offs)
1354{ 1422{
1355 int ret; 1423 return loc->xl_ops->xlo_journal_access(handle, loc, type);
1424}
1356 1425
1357 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1426static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
1358 OCFS2_JOURNAL_ACCESS_WRITE); 1427{
1359 if (ret) { 1428 loc->xl_ops->xlo_journal_dirty(handle, loc);
1360 mlog_errno(ret); 1429}
1361 goto out;
1362 }
1363 1430
1364 xs->here->xe_name_offset = cpu_to_le16(offs); 1431/* Give a pointer into the storage for the given offset */
1365 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1432static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
1366 if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) 1433{
1367 ocfs2_xattr_set_local(xs->here, 1); 1434 BUG_ON(offset >= loc->xl_size);
1368 else 1435 return loc->xl_ops->xlo_offset_pointer(loc, offset);
1369 ocfs2_xattr_set_local(xs->here, 0); 1436}
1370 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1371 1437
1372 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1438/*
1373 if (ret < 0) 1439 * Wipe the name+value pair and allow the storage to reclaim it. This
1374 mlog_errno(ret); 1440 * must be followed by either removal of the entry or a call to
1375out: 1441 * ocfs2_xa_add_namevalue().
1376 return ret; 1442 */
1443static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
1444{
1445 loc->xl_ops->xlo_wipe_namevalue(loc);
1377} 1446}
1378 1447
1379/* 1448/*
1380 * ocfs2_xattr_set_value_outside() 1449 * Find lowest offset to a name+value pair. This is the start of our
1381 * 1450 * downward-growing free space.
1382 * Set large size value in B tree.
1383 */ 1451 */
1384static int ocfs2_xattr_set_value_outside(struct inode *inode, 1452static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
1385 struct ocfs2_xattr_info *xi,
1386 struct ocfs2_xattr_search *xs,
1387 struct ocfs2_xattr_set_ctxt *ctxt,
1388 struct ocfs2_xattr_value_buf *vb,
1389 size_t offs)
1390{ 1453{
1391 size_t name_len = strlen(xi->name); 1454 return loc->xl_ops->xlo_get_free_start(loc);
1392 void *val = xs->base + offs; 1455}
1393 struct ocfs2_xattr_value_root *xv = NULL;
1394 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1395 int ret = 0;
1396 1456
1397 memset(val, 0, size); 1457/* Can we reuse loc->xl_entry for xi? */
1398 memcpy(val, xi->name, name_len); 1458static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
1399 xv = (struct ocfs2_xattr_value_root *) 1459 struct ocfs2_xattr_info *xi)
1400 (val + OCFS2_XATTR_SIZE(name_len)); 1460{
1401 xv->xr_clusters = 0; 1461 return loc->xl_ops->xlo_can_reuse(loc, xi);
1402 xv->xr_last_eb_blk = 0; 1462}
1403 xv->xr_list.l_tree_depth = 0; 1463
1404 xv->xr_list.l_count = cpu_to_le16(1); 1464/* How much free space is needed to set the new value */
1405 xv->xr_list.l_next_free_rec = 0; 1465static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
1406 vb->vb_xv = xv; 1466 struct ocfs2_xattr_info *xi)
1407 1467{
1408 ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt); 1468 return loc->xl_ops->xlo_check_space(loc, xi);
1409 if (ret < 0) { 1469}
1410 mlog_errno(ret); 1470
1411 return ret; 1471static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1472{
1473 loc->xl_ops->xlo_add_entry(loc, name_hash);
1474 loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
1475 /*
1476 * We can't leave the new entry's xe_name_offset at zero or
1477 * add_namevalue() will go nuts. We set it to the size of our
1478 * storage so that it can never be less than any other entry.
1479 */
1480 loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
1481}
1482
1483static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
1484 struct ocfs2_xattr_info *xi)
1485{
1486 int size = namevalue_size_xi(xi);
1487 int nameval_offset;
1488 char *nameval_buf;
1489
1490 loc->xl_ops->xlo_add_namevalue(loc, size);
1491 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1492 loc->xl_entry->xe_name_len = xi->xi_name_len;
1493 ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
1494 ocfs2_xattr_set_local(loc->xl_entry,
1495 xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
1496
1497 nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1498 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
1499 memset(nameval_buf, 0, size);
1500 memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
1501}
1502
1503static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
1504 struct ocfs2_xattr_value_buf *vb)
1505{
1506 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1507 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1508
1509 /* Value bufs are for value trees */
1510 BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
1511 BUG_ON(namevalue_size_xe(loc->xl_entry) !=
1512 (name_size + OCFS2_XATTR_ROOT_SIZE));
1513
1514 loc->xl_ops->xlo_fill_value_buf(loc, vb);
1515 vb->vb_xv =
1516 (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
1517 nameval_offset +
1518 name_size);
1519}
1520
1521static int ocfs2_xa_block_journal_access(handle_t *handle,
1522 struct ocfs2_xa_loc *loc, int type)
1523{
1524 struct buffer_head *bh = loc->xl_storage;
1525 ocfs2_journal_access_func access;
1526
1527 if (loc->xl_size == (bh->b_size -
1528 offsetof(struct ocfs2_xattr_block,
1529 xb_attrs.xb_header)))
1530 access = ocfs2_journal_access_xb;
1531 else
1532 access = ocfs2_journal_access_di;
1533 return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
1534}
1535
1536static void ocfs2_xa_block_journal_dirty(handle_t *handle,
1537 struct ocfs2_xa_loc *loc)
1538{
1539 struct buffer_head *bh = loc->xl_storage;
1540
1541 ocfs2_journal_dirty(handle, bh);
1542}
1543
1544static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
1545 int offset)
1546{
1547 return (char *)loc->xl_header + offset;
1548}
1549
1550static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
1551 struct ocfs2_xattr_info *xi)
1552{
1553 /*
1554 * Block storage is strict. If the sizes aren't exact, we will
1555 * remove the old one and reinsert the new.
1556 */
1557 return namevalue_size_xe(loc->xl_entry) ==
1558 namevalue_size_xi(xi);
1559}
1560
1561static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
1562{
1563 struct ocfs2_xattr_header *xh = loc->xl_header;
1564 int i, count = le16_to_cpu(xh->xh_count);
1565 int offset, free_start = loc->xl_size;
1566
1567 for (i = 0; i < count; i++) {
1568 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1569 if (offset < free_start)
1570 free_start = offset;
1412 } 1571 }
1413 ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs); 1572
1414 if (ret < 0) { 1573 return free_start;
1415 mlog_errno(ret); 1574}
1416 return ret; 1575
1576static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
1577 struct ocfs2_xattr_info *xi)
1578{
1579 int count = le16_to_cpu(loc->xl_header->xh_count);
1580 int free_start = ocfs2_xa_get_free_start(loc);
1581 int needed_space = ocfs2_xi_entry_usage(xi);
1582
1583 /*
1584 * Block storage will reclaim the original entry before inserting
1585 * the new value, so we only need the difference. If the new
1586 * entry is smaller than the old one, we don't need anything.
1587 */
1588 if (loc->xl_entry) {
1589 /* Don't need space if we're reusing! */
1590 if (ocfs2_xa_can_reuse_entry(loc, xi))
1591 needed_space = 0;
1592 else
1593 needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
1417 } 1594 }
1418 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, 1595 if (needed_space < 0)
1419 xi->value, xi->value_len); 1596 needed_space = 0;
1420 if (ret < 0) 1597 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1421 mlog_errno(ret); 1598}
1422 1599
1423 return ret; 1600/*
1601 * Block storage for xattrs keeps the name+value pairs compacted. When
1602 * we remove one, we have to shift any that preceded it towards the end.
1603 */
1604static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1605{
1606 int i, offset;
1607 int namevalue_offset, first_namevalue_offset, namevalue_size;
1608 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1609 struct ocfs2_xattr_header *xh = loc->xl_header;
1610 int count = le16_to_cpu(xh->xh_count);
1611
1612 namevalue_offset = le16_to_cpu(entry->xe_name_offset);
1613 namevalue_size = namevalue_size_xe(entry);
1614 first_namevalue_offset = ocfs2_xa_get_free_start(loc);
1615
1616 /* Shift the name+value pairs */
1617 memmove((char *)xh + first_namevalue_offset + namevalue_size,
1618 (char *)xh + first_namevalue_offset,
1619 namevalue_offset - first_namevalue_offset);
1620 memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
1621
1622 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 if (offset <= namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size);
1628 }
1629
1630 /*
1631 * Note that we don't update xh_free_start or xh_name_value_len
1632 * because they're not used in block-stored xattrs.
1633 */
1634}
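
The memmove here is easier to see on a toy buffer. In this sketch (illustrative offsets, not ocfs2 structures), pairs are packed at the end of the region; wiping the middle pair slides everything below it toward the end. In the real code, each xe_name_offset at or below the hole is then bumped by the wiped size:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* 12-byte region, pairs packed at the end: AA@5, BBB@7, CC@10. */
	char buf[13] = ".....AABBBCC";
	int first = 5;		/* lowest used offset (the "free start") */
	int off = 7, size = 3;	/* the pair being wiped: BBB */

	/* Slide everything below the hole toward the end of the region. */
	memmove(buf + first + size, buf + first, off - first);
	/* Clear the space that opened up at the bottom. */
	memset(buf + first, '.', size);

	printf("%s\n", buf);	/* prints "........AACC" */
	return 0;
}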
1635
1636static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1637{
1638 int count = le16_to_cpu(loc->xl_header->xh_count);
1639 loc->xl_entry = &(loc->xl_header->xh_entries[count]);
1640 le16_add_cpu(&loc->xl_header->xh_count, 1);
1641 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1642}
1643
1644static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1645{
1646 int free_start = ocfs2_xa_get_free_start(loc);
1647
1648 loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
1649}
1650
1651static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
1652 struct ocfs2_xattr_value_buf *vb)
1653{
1654 struct buffer_head *bh = loc->xl_storage;
1655
1656 if (loc->xl_size == (bh->b_size -
1657 offsetof(struct ocfs2_xattr_block,
1658 xb_attrs.xb_header)))
1659 vb->vb_access = ocfs2_journal_access_xb;
1660 else
1661 vb->vb_access = ocfs2_journal_access_di;
1662 vb->vb_bh = bh;
1424} 1663}
1425 1664
1426/* 1665/*
1427 * ocfs2_xattr_set_entry_local() 1666 * Operations for xattrs stored in blocks. This includes inline inode
1428 * 1667 * storage and unindexed ocfs2_xattr_blocks.
1429 * Set, replace or remove extended attribute in local.
1430 */ 1668 */
1431static void ocfs2_xattr_set_entry_local(struct inode *inode, 1669static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
1432 struct ocfs2_xattr_info *xi, 1670 .xlo_journal_access = ocfs2_xa_block_journal_access,
1433 struct ocfs2_xattr_search *xs, 1671 .xlo_journal_dirty = ocfs2_xa_block_journal_dirty,
1434 struct ocfs2_xattr_entry *last, 1672 .xlo_offset_pointer = ocfs2_xa_block_offset_pointer,
1435 size_t min_offs) 1673 .xlo_check_space = ocfs2_xa_block_check_space,
1674 .xlo_can_reuse = ocfs2_xa_block_can_reuse,
1675 .xlo_get_free_start = ocfs2_xa_block_get_free_start,
1676 .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue,
1677 .xlo_add_entry = ocfs2_xa_block_add_entry,
1678 .xlo_add_namevalue = ocfs2_xa_block_add_namevalue,
1679 .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf,
1680};
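
This ops table is the heart of the refactor: the generic ocfs2_xa_* wrappers earlier in the hunk simply dispatch through loc->xl_ops, so block and bucket storage share one code path. A minimal userspace sketch of that pattern, with hypothetical simplified types:

#include <stdio.h>

struct xa_loc;

/* Hypothetical, pared-down analogue of ocfs2_xa_loc_operations. */
struct xa_loc_ops {
	int (*get_free_start)(struct xa_loc *loc);
};

struct xa_loc {
	const struct xa_loc_ops *ops;
	int size;
};

/* Storage-specific behavior, like ocfs2_xa_block_get_free_start(). */
static int block_get_free_start(struct xa_loc *loc)
{
	return loc->size;
}

static const struct xa_loc_ops block_ops = {
	.get_free_start = block_get_free_start,
};

/* Generic wrapper, like ocfs2_xa_get_free_start(). */
static int xa_get_free_start(struct xa_loc *loc)
{
	return loc->ops->get_free_start(loc);
}

int main(void)
{
	struct xa_loc loc = { .ops = &block_ops, .size = 4096 };

	printf("free start: %d\n", xa_get_free_start(&loc));
	return 0;
}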
1681
1682static int ocfs2_xa_bucket_journal_access(handle_t *handle,
1683 struct ocfs2_xa_loc *loc, int type)
1436{ 1684{
1437 size_t name_len = strlen(xi->name); 1685 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1438 int i;
1439 1686
1440 if (xi->value && xs->not_found) { 1687 return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
1441 /* Insert the new xattr entry. */ 1688}
1442 le16_add_cpu(&xs->header->xh_count, 1); 1689
1443 ocfs2_xattr_set_type(last, xi->name_index); 1690static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
1444 ocfs2_xattr_set_local(last, 1); 1691 struct ocfs2_xa_loc *loc)
1445 last->xe_name_len = name_len; 1692{
1446 } else { 1693 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1447 void *first_val; 1694
1448 void *val; 1695 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
1449 size_t offs, size; 1696}
1450 1697
1451 first_val = xs->base + min_offs; 1698static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
1452 offs = le16_to_cpu(xs->here->xe_name_offset); 1699 int offset)
1453 val = xs->base + offs; 1700{
1454 1701 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1455 if (le64_to_cpu(xs->here->xe_value_size) > 1702 int block, block_offset;
1456 OCFS2_XATTR_INLINE_SIZE) 1703
1457 size = OCFS2_XATTR_SIZE(name_len) + 1704 /* The header is at the front of the bucket */
1458 OCFS2_XATTR_ROOT_SIZE; 1705 block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
1706 block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
1707
1708 return bucket_block(bucket, block) + block_offset;
1709}
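
Buckets span several blocks, so a bucket-relative offset must be split into a block index and an intra-block offset. A worked example of the arithmetic above, assuming 4096-byte blocks:

#include <stdio.h>

int main(void)
{
	const int blocksize_bits = 12;		/* log2(4096) */
	const int blocksize = 1 << blocksize_bits;
	int offset = 5000;			/* bucket-relative offset */

	int block = offset >> blocksize_bits;	/* 1 */
	int block_offset = offset % blocksize;	/* 904 */

	printf("block %d, offset %d\n", block, block_offset);
	return 0;
}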
1710
1711static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
1712 struct ocfs2_xattr_info *xi)
1713{
1714 return namevalue_size_xe(loc->xl_entry) >=
1715 namevalue_size_xi(xi);
1716}
1717
1718static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
1719{
1720 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1721 return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
1722}
1723
1724static int ocfs2_bucket_align_free_start(struct super_block *sb,
1725 int free_start, int size)
1726{
1727 /*
1728 * We need to make sure that the name+value pair fits within
1729 * one block.
1730 */
1731 if (((free_start - size) >> sb->s_blocksize_bits) !=
1732 ((free_start - 1) >> sb->s_blocksize_bits))
1733 free_start -= free_start % sb->s_blocksize;
1734
1735 return free_start;
1736}
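
A standalone version of this straddle check makes the behavior concrete. The pair occupies [free_start - size, free_start); if its first and last bytes land in different blocks, free_start is pulled back to the previous block boundary. A sketch assuming 4096-byte blocks:

#include <stdio.h>

static int align_free_start(int free_start, int size)
{
	const int blocksize_bits = 12;
	const int blocksize = 1 << blocksize_bits;

	if (((free_start - size) >> blocksize_bits) !=
	    ((free_start - 1) >> blocksize_bits))
		free_start -= free_start % blocksize;

	return free_start;
}

int main(void)
{
	/* 4100 - 200 = 3900 is in block 0 but byte 4099 is in block 1: realign. */
	printf("%d\n", align_free_start(4100, 200));	/* prints 4096 */
	/* 4000 - 200 = 3800 and byte 3999 are both in block 0: unchanged. */
	printf("%d\n", align_free_start(4000, 200));	/* prints 4000 */
	return 0;
}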
1737
1738static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
1739 struct ocfs2_xattr_info *xi)
1740{
1741 int rc;
1742 int count = le16_to_cpu(loc->xl_header->xh_count);
1743 int free_start = ocfs2_xa_get_free_start(loc);
1744 int needed_space = ocfs2_xi_entry_usage(xi);
1745 int size = namevalue_size_xi(xi);
1746 struct super_block *sb = loc->xl_inode->i_sb;
1747
1748 /*
1749 * Bucket storage does not reclaim name+value pairs it cannot
1750 * reuse. They live as holes until the bucket fills, and then
1751 * the bucket is defragmented. However, the bucket can reclaim
1752 * the ocfs2_xattr_entry.
1753 */
1754 if (loc->xl_entry) {
1755 /* Don't need space if we're reusing! */
1756 if (ocfs2_xa_can_reuse_entry(loc, xi))
1757 needed_space = 0;
1459 else 1758 else
1460 size = OCFS2_XATTR_SIZE(name_len) + 1759 needed_space -= sizeof(struct ocfs2_xattr_entry);
1461 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 1760 }
1462 1761 BUG_ON(needed_space < 0);
1463 if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
1464 OCFS2_XATTR_SIZE(xi->value_len)) {
1465 /* The old and the new value have the
1466 same size. Just replace the value. */
1467 ocfs2_xattr_set_local(xs->here, 1);
1468 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1469 /* Clear value bytes. */
1470 memset(val + OCFS2_XATTR_SIZE(name_len),
1471 0,
1472 OCFS2_XATTR_SIZE(xi->value_len));
1473 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1474 xi->value,
1475 xi->value_len);
1476 return;
1477 }
1478 /* Remove the old name+value. */
1479 memmove(first_val + size, first_val, val - first_val);
1480 memset(first_val, 0, size);
1481 xs->here->xe_name_hash = 0;
1482 xs->here->xe_name_offset = 0;
1483 ocfs2_xattr_set_local(xs->here, 1);
1484 xs->here->xe_value_size = 0;
1485
1486 min_offs += size;
1487
1488 /* Adjust all value offsets. */
1489 last = xs->header->xh_entries;
1490 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1491 size_t o = le16_to_cpu(last->xe_name_offset);
1492
1493 if (o < offs)
1494 last->xe_name_offset = cpu_to_le16(o + size);
1495 last += 1;
1496 }
1497 1762
1498 if (!xi->value) { 1763 if (free_start < size) {
1499 /* Remove the old entry. */ 1764 if (needed_space)
1500 last -= 1; 1765 return -ENOSPC;
1501 memmove(xs->here, xs->here + 1, 1766 } else {
1502 (void *)last - (void *)xs->here); 1767 /*
1503 memset(last, 0, sizeof(struct ocfs2_xattr_entry)); 1768 * First we check if it would fit in the first place.
1504 le16_add_cpu(&xs->header->xh_count, -1); 1769 * Below, we align the free start to a block. This may
1505 } 1770 * slide us below the minimum gap. By checking unaligned
1771 * first, we avoid that error.
1772 */
1773 rc = ocfs2_xa_check_space_helper(needed_space, free_start,
1774 count);
1775 if (rc)
1776 return rc;
1777 free_start = ocfs2_bucket_align_free_start(sb, free_start,
1778 size);
1506 } 1779 }
1507 if (xi->value) { 1780 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1508 /* Insert the new name+value. */ 1781}
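
ocfs2_xi_entry_usage() and ocfs2_xa_check_space_helper() are defined earlier in the file; a toy model of the accounting, under the assumption that the helper simply compares the gap between the entry array and free_start against the need, with an assumed 16-byte entry size:

#include <stdio.h>

#define ENTRY_SIZE	16	/* assumed sizeof(struct ocfs2_xattr_entry) */
#define ENOSPC_ERR	(-28)	/* -ENOSPC */

/* Assumed shape of ocfs2_xa_check_space_helper(), for illustration. */
static int check_space_helper(int needed, int free_start, int count)
{
	int free_space = free_start - count * ENTRY_SIZE;

	return free_space >= needed ? 0 : ENOSPC_ERR;
}

int main(void)
{
	int needed = ENTRY_SIZE + 24;	/* new entry plus a 24-byte name+value pair */

	/*
	 * Overwriting an entry the bucket cannot reuse: the entry slot is
	 * reclaimed, but the old name+value hole is not (it waits for a
	 * defragment), so only ENTRY_SIZE comes off the bill.
	 */
	needed -= ENTRY_SIZE;
	printf("%d\n", check_space_helper(needed, 128, 4));	/* 0: 64 >= 24 */
	return 0;
}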
1509 size_t size = OCFS2_XATTR_SIZE(name_len) + 1782
1510 OCFS2_XATTR_SIZE(xi->value_len); 1783static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
1511 void *val = xs->base + min_offs - size; 1784{
1785 le16_add_cpu(&loc->xl_header->xh_name_value_len,
1786 -namevalue_size_xe(loc->xl_entry));
1787}
1788
1789static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1790{
1791 struct ocfs2_xattr_header *xh = loc->xl_header;
1792 int count = le16_to_cpu(xh->xh_count);
1793 int low = 0, high = count - 1, tmp;
1794 struct ocfs2_xattr_entry *tmp_xe;
1512 1795
1513 xs->here->xe_name_offset = cpu_to_le16(min_offs - size); 1796 /*
1514 memset(val, 0, size); 1797 * We keep buckets sorted by name_hash, so we need to find
1515 memcpy(val, xi->name, name_len); 1798 * our insert place.
1516 memcpy(val + OCFS2_XATTR_SIZE(name_len), 1799 */
1517 xi->value, 1800 while (low <= high && count) {
1518 xi->value_len); 1801 tmp = (low + high) / 2;
1519 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1802 tmp_xe = &xh->xh_entries[tmp];
1520 ocfs2_xattr_set_local(xs->here, 1); 1803
1521 ocfs2_xattr_hash_entry(inode, xs->header, xs->here); 1804 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
1805 low = tmp + 1;
1806 else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
1807 high = tmp - 1;
1808 else {
1809 low = tmp;
1810 break;
1811 }
1522 } 1812 }
1523 1813
1524 return; 1814 if (low != count)
1815 memmove(&xh->xh_entries[low + 1],
1816 &xh->xh_entries[low],
1817 ((count - low) * sizeof(struct ocfs2_xattr_entry)));
1818
1819 le16_add_cpu(&xh->xh_count, 1);
1820 loc->xl_entry = &xh->xh_entries[low];
1821 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1822}
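
The search above keeps xh_entries ordered by xe_name_hash. The same insert, reduced to a standalone sketch over plain hash values:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Find the slot with a binary search, then memmove the tail up by one. */
static void sorted_insert(uint32_t *entries, int *count, uint32_t name_hash)
{
	int low = 0, high = *count - 1, tmp;

	while (low <= high && *count) {
		tmp = (low + high) / 2;
		if (name_hash > entries[tmp])
			low = tmp + 1;
		else if (name_hash < entries[tmp])
			high = tmp - 1;
		else {
			low = tmp;
			break;
		}
	}

	if (low != *count)
		memmove(&entries[low + 1], &entries[low],
			(*count - low) * sizeof(entries[0]));

	entries[low] = name_hash;
	(*count)++;
}

int main(void)
{
	uint32_t entries[8] = { 10, 20, 40 };
	int count = 3, i;

	sorted_insert(entries, &count, 30);
	for (i = 0; i < count; i++)
		printf("%u ", entries[i]);	/* 10 20 30 40 */
	printf("\n");
	return 0;
}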
1823
1824static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1825{
1826 int free_start = ocfs2_xa_get_free_start(loc);
1827 struct ocfs2_xattr_header *xh = loc->xl_header;
1828 struct super_block *sb = loc->xl_inode->i_sb;
1829 int nameval_offset;
1830
1831 free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
1832 nameval_offset = free_start - size;
1833 loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
1834 xh->xh_free_start = cpu_to_le16(nameval_offset);
1835 le16_add_cpu(&xh->xh_name_value_len, size);
1836
1837}
1838
1839static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
1840 struct ocfs2_xattr_value_buf *vb)
1841{
1842 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1843 struct super_block *sb = loc->xl_inode->i_sb;
1844 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1845 int size = namevalue_size_xe(loc->xl_entry);
1846 int block_offset = nameval_offset >> sb->s_blocksize_bits;
1847
1848 /* Values are not allowed to straddle block boundaries */
1849 BUG_ON(block_offset !=
1850 ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
1851 /* We expect the bucket to be filled in */
1852 BUG_ON(!bucket->bu_bhs[block_offset]);
1853
1854 vb->vb_access = ocfs2_journal_access;
1855 vb->vb_bh = bucket->bu_bhs[block_offset];
1856}
1857
1858/* Operations for xattrs stored in buckets. */
1859static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
1860 .xlo_journal_access = ocfs2_xa_bucket_journal_access,
1861 .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty,
1862 .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer,
1863 .xlo_check_space = ocfs2_xa_bucket_check_space,
1864 .xlo_can_reuse = ocfs2_xa_bucket_can_reuse,
1865 .xlo_get_free_start = ocfs2_xa_bucket_get_free_start,
1866 .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue,
1867 .xlo_add_entry = ocfs2_xa_bucket_add_entry,
1868 .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue,
1869 .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf,
1870};
1871
1872static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
1873{
1874 struct ocfs2_xattr_value_buf vb;
1875
1876 if (ocfs2_xattr_is_local(loc->xl_entry))
1877 return 0;
1878
1879 ocfs2_xa_fill_value_buf(loc, &vb);
1880 return le32_to_cpu(vb.vb_xv->xr_clusters);
1881}
1882
1883static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
1884 struct ocfs2_xattr_set_ctxt *ctxt)
1885{
1886 int trunc_rc, access_rc;
1887 struct ocfs2_xattr_value_buf vb;
1888
1889 ocfs2_xa_fill_value_buf(loc, &vb);
1890 trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
1891 ctxt);
1892
1893 /*
1894 * The caller of ocfs2_xa_value_truncate() has already called
1895 * ocfs2_xa_journal_access on the loc. However, the truncate code
1896 * calls ocfs2_extend_trans(). This may commit the previous
1897 * transaction and open a new one. If this is a bucket, truncate
1898 * could leave only vb->vb_bh set up for journaling. Meanwhile,
1899 * the caller is expecting to dirty the entire bucket. So we must
1900 * reset the journal work. We do this even if truncate has failed,
1901 * as it could have failed after committing the extend.
1902 */
1903 access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
1904 OCFS2_JOURNAL_ACCESS_WRITE);
1905
1906 /* Errors in truncate take precedence */
1907 return trunc_rc ? trunc_rc : access_rc;
1908}
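
The control flow here is a small but important pattern: the journal re-access runs unconditionally, and the truncate's error wins if both steps fail. Reduced to a sketch:

#include <stdio.h>

static int do_truncate_step(int fail) { return fail ? -5 /* -EIO */ : 0; }
static int redo_journal_access(void) { return 0; }

/* Cleanup runs even when the main step failed; first error wins. */
static int value_truncate(int fail)
{
	int trunc_rc = do_truncate_step(fail);
	int access_rc = redo_journal_access();	/* unconditionally */

	return trunc_rc ? trunc_rc : access_rc;
}

int main(void)
{
	printf("%d %d\n", value_truncate(0), value_truncate(1));	/* 0 -5 */
	return 0;
}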
1909
1910static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
1911{
1912 int index, count;
1913 struct ocfs2_xattr_header *xh = loc->xl_header;
1914 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1915
1916 ocfs2_xa_wipe_namevalue(loc);
1917 loc->xl_entry = NULL;
1918
1919 le16_add_cpu(&xh->xh_count, -1);
1920 count = le16_to_cpu(xh->xh_count);
1921
1922 /*
1923 * Only zero out the entry if there are more remaining. This is
1924 * important for an empty bucket, as it keeps track of the
1925 * bucket's hash value. It doesn't hurt empty block storage.
1926 */
1927 if (count) {
1928 index = ((char *)entry - (char *)&xh->xh_entries) /
1929 sizeof(struct ocfs2_xattr_entry);
1930 memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
1931 (count - index) * sizeof(struct ocfs2_xattr_entry));
1932 memset(&xh->xh_entries[count], 0,
1933 sizeof(struct ocfs2_xattr_entry));
1934 }
1525} 1935}
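
The index recovery and compaction above, as a standalone sketch over a plain array (illustrative types, not the kernel structures):

#include <stdio.h>
#include <string.h>

struct entry { unsigned int hash; };

int main(void)
{
	struct entry entries[4] = { {1}, {2}, {3}, {4} };
	struct entry *victim = &entries[1];
	int count = 4;
	int i, index = (int)(victim - entries);	/* pointer difference: 1 */

	/* Close the gap, then zero the now-unused tail slot. */
	memmove(&entries[index], &entries[index + 1],
		(count - 1 - index) * sizeof(struct entry));
	count--;
	memset(&entries[count], 0, sizeof(struct entry));

	for (i = 0; i < count; i++)
		printf("%u ", entries[i].hash);	/* 1 3 4 */
	printf("\n");
	return 0;
}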
1526 1936
1527/* 1937/*
1528 * ocfs2_xattr_set_entry() 1938 * If we have a problem adjusting the size of an external value during
1939 * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
1940 * in an intermediate state. For example, the value may be partially
1941 * truncated.
1529 * 1942 *
1530 * Set extended attribute entry into inode or block. 1943 * If the value tree hasn't changed, the extend/truncate went nowhere.
1944 * We have nothing to do. The caller can treat it as a straight error.
1531 * 1945 *
1532 * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, 1946 * If the value tree got partially truncated, we now have a corrupted
1533 * We first insert the tree root (ocfs2_xattr_value_root) with set_entry_local(), 1947 * extended attribute. We're going to wipe its entry and leak the
1534 * then set the value in the B-tree with set_value_outside(). 1948 * clusters. Better to leak some storage than leave a corrupt entry.
1949 *
1950 * If the value tree grew, it obviously didn't grow enough for the
1951 * new entry. We're not going to try and reclaim those clusters either.
1952 * If there was already an external value there (orig_clusters != 0),
1953 * the new clusters are attached safely and we can just leave the old
1954 * value in place. If there was no external value there, we remove
1955 * the entry.
1956 *
1957 * This way, the xattr block we store in the journal will be consistent.
1958 * If the size change broke because of the journal, no changes will hit
1959 * disk anyway.
1535 */ 1960 */
1536static int ocfs2_xattr_set_entry(struct inode *inode, 1961static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
1537 struct ocfs2_xattr_info *xi, 1962 const char *what,
1538 struct ocfs2_xattr_search *xs, 1963 unsigned int orig_clusters)
1539 struct ocfs2_xattr_set_ctxt *ctxt, 1964{
1540 int flag) 1965 unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
1541{ 1966 char *nameval_buf = ocfs2_xa_offset_pointer(loc,
1542 struct ocfs2_xattr_entry *last; 1967 le16_to_cpu(loc->xl_entry->xe_name_offset));
1543 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1968
1544 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 1969 if (new_clusters < orig_clusters) {
1545 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); 1970 mlog(ML_ERROR,
1546 size_t size_l = 0; 1971 "Partial truncate while %s xattr %.*s. Leaking "
1547 handle_t *handle = ctxt->handle; 1972 "%u clusters and removing the entry\n",
1548 int free, i, ret; 1973 what, loc->xl_entry->xe_name_len, nameval_buf,
1549 struct ocfs2_xattr_info xi_l = { 1974 orig_clusters - new_clusters);
1550 .name_index = xi->name_index, 1975 ocfs2_xa_remove_entry(loc);
1551 .name = xi->name, 1976 } else if (!orig_clusters) {
1552 .value = xi->value, 1977 mlog(ML_ERROR,
1553 .value_len = xi->value_len, 1978 "Unable to allocate an external value for xattr "
1554 }; 1979 "%.*s safely. Leaking %u clusters and removing the "
1555 struct ocfs2_xattr_value_buf vb = { 1980 "entry\n",
1556 .vb_bh = xs->xattr_bh, 1981 loc->xl_entry->xe_name_len, nameval_buf,
1557 .vb_access = ocfs2_journal_access_di, 1982 new_clusters - orig_clusters);
1558 }; 1983 ocfs2_xa_remove_entry(loc);
1984 } else if (new_clusters > orig_clusters)
1985 mlog(ML_ERROR,
1986 "Unable to grow xattr %.*s safely. %u new clusters "
1987 "have been added, but the value will not be "
1988 "modified\n",
1989 loc->xl_entry->xe_name_len, nameval_buf,
1990 new_clusters - orig_clusters);
1991}
1992
1993static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
1994 struct ocfs2_xattr_set_ctxt *ctxt)
1995{
1996 int rc = 0;
1997 unsigned int orig_clusters;
1998
1999 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2000 orig_clusters = ocfs2_xa_value_clusters(loc);
2001 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2002 if (rc) {
2003 mlog_errno(rc);
2004 /*
2005 * Since this is remove, we can return 0 if
2006 * ocfs2_xa_cleanup_value_truncate() is going to
2007 * wipe the entry anyway. So we check the
2008 * cluster count as well.
2009 */
2010 if (orig_clusters != ocfs2_xa_value_clusters(loc))
2011 rc = 0;
2012 ocfs2_xa_cleanup_value_truncate(loc, "removing",
2013 orig_clusters);
2014 if (rc)
2015 goto out;
2016 }
2017 }
1559 2018
1560 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2019 ocfs2_xa_remove_entry(loc);
1561 BUG_ON(xs->xattr_bh == xs->inode_bh);
1562 vb.vb_access = ocfs2_journal_access_xb;
1563 } else
1564 BUG_ON(xs->xattr_bh != xs->inode_bh);
1565 2020
1566 /* Compute min_offs, last and free space. */ 2021out:
1567 last = xs->header->xh_entries; 2022 return rc;
2023}
1568 2024
1569 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { 2025static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
1570 size_t offs = le16_to_cpu(last->xe_name_offset); 2026{
1571 if (offs < min_offs) 2027 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1572 min_offs = offs; 2028 char *nameval_buf;
1573 last += 1;
1574 }
1575 2029
1576 free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; 2030 nameval_buf = ocfs2_xa_offset_pointer(loc,
1577 if (free < 0) 2031 le16_to_cpu(loc->xl_entry->xe_name_offset));
1578 return -EIO; 2032 memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE);
2033}
1579 2034
1580 if (!xs->not_found) { 2035/*
1581 size_t size = 0; 2036 * Take an existing entry and make it ready for the new value. This
1582 if (ocfs2_xattr_is_local(xs->here)) 2037 * won't allocate space, but it may free space. It should be ready for
1583 size = OCFS2_XATTR_SIZE(name_len) + 2038 * ocfs2_xa_prepare_entry() to finish the work.
1584 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 2039 */
1585 else 2040static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
1586 size = OCFS2_XATTR_SIZE(name_len) + 2041 struct ocfs2_xattr_info *xi,
1587 OCFS2_XATTR_ROOT_SIZE; 2042 struct ocfs2_xattr_set_ctxt *ctxt)
1588 free += (size + sizeof(struct ocfs2_xattr_entry)); 2043{
1589 } 2044 int rc = 0;
1590 /* Check free space in inode or block */ 2045 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
1591 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2046 unsigned int orig_clusters;
1592 if (free < sizeof(struct ocfs2_xattr_entry) + 2047 char *nameval_buf;
1593 OCFS2_XATTR_SIZE(name_len) + 2048 int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
1594 OCFS2_XATTR_ROOT_SIZE) { 2049 int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
1595 ret = -ENOSPC; 2050
1596 goto out; 2051 BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
2052 name_size);
2053
2054 nameval_buf = ocfs2_xa_offset_pointer(loc,
2055 le16_to_cpu(loc->xl_entry->xe_name_offset));
2056 if (xe_local) {
2057 memset(nameval_buf + name_size, 0,
2058 namevalue_size_xe(loc->xl_entry) - name_size);
2059 if (!xi_local)
2060 ocfs2_xa_install_value_root(loc);
2061 } else {
2062 orig_clusters = ocfs2_xa_value_clusters(loc);
2063 if (xi_local) {
2064 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2065 if (rc < 0)
2066 mlog_errno(rc);
2067 else
2068 memset(nameval_buf + name_size, 0,
2069 namevalue_size_xe(loc->xl_entry) -
2070 name_size);
2071 } else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
2072 xi->xi_value_len) {
2073 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
2074 ctxt);
2075 if (rc < 0)
2076 mlog_errno(rc);
1597 } 2077 }
1598 size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 2078
1599 xi_l.value = (void *)&def_xv; 2079 if (rc) {
1600 xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; 2080 ocfs2_xa_cleanup_value_truncate(loc, "reusing",
1601 } else if (xi->value) { 2081 orig_clusters);
1602 if (free < sizeof(struct ocfs2_xattr_entry) +
1603 OCFS2_XATTR_SIZE(name_len) +
1604 OCFS2_XATTR_SIZE(xi->value_len)) {
1605 ret = -ENOSPC;
1606 goto out; 2082 goto out;
1607 } 2083 }
1608 } 2084 }
1609 2085
1610 if (!xs->not_found) { 2086 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1611 /* For existing extended attribute */ 2087 ocfs2_xattr_set_local(loc->xl_entry, xi_local);
1612 size_t size = OCFS2_XATTR_SIZE(name_len) +
1613 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1614 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1615 void *val = xs->base + offs;
1616 2088
1617 if (ocfs2_xattr_is_local(xs->here) && size == size_l) { 2089out:
1618 /* Replace existing local xattr with tree root */ 2090 return rc;
1619 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, 2091}
1620 ctxt, &vb, offs);
1621 if (ret < 0)
1622 mlog_errno(ret);
1623 goto out;
1624 } else if (!ocfs2_xattr_is_local(xs->here)) {
1625 /* For existing xattr which has value outside */
1626 vb.vb_xv = (struct ocfs2_xattr_value_root *)
1627 (val + OCFS2_XATTR_SIZE(name_len));
1628 2092
1629 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2093/*
1630 /* 2094 * Prepares loc->xl_entry to receive the new xattr. This includes
1631 * If the new value must also be stored outside, 2095 * properly setting up the name+value pair region. If loc->xl_entry
1632 * first truncate the old value to the new length, 2096 * already exists, it will take care of modifying it appropriately.
1633 * then set the new value with set_value_outside(). 2097 *
1634 */ 2098 * Note that this modifies the data. You did journal_access already,
1635 ret = ocfs2_xattr_value_truncate(inode, 2099 * right?
1636 &vb, 2100 */
1637 xi->value_len, 2101static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
1638 ctxt); 2102 struct ocfs2_xattr_info *xi,
1639 if (ret < 0) { 2103 u32 name_hash,
1640 mlog_errno(ret); 2104 struct ocfs2_xattr_set_ctxt *ctxt)
1641 goto out; 2105{
1642 } 2106 int rc = 0;
2107 unsigned int orig_clusters;
2108 __le64 orig_value_size = 0;
1643 2109
1644 ret = ocfs2_xattr_update_entry(inode, 2110 rc = ocfs2_xa_check_space(loc, xi);
1645 handle, 2111 if (rc)
1646 xi, 2112 goto out;
1647 xs,
1648 &vb,
1649 offs);
1650 if (ret < 0) {
1651 mlog_errno(ret);
1652 goto out;
1653 }
1654 2113
1655 ret = __ocfs2_xattr_set_value_outside(inode, 2114 if (loc->xl_entry) {
1656 handle, 2115 if (ocfs2_xa_can_reuse_entry(loc, xi)) {
1657 &vb, 2116 orig_value_size = loc->xl_entry->xe_value_size;
1658 xi->value, 2117 rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
1659 xi->value_len); 2118 if (rc)
1660 if (ret < 0) 2119 goto out;
1661 mlog_errno(ret); 2120 goto alloc_value;
2121 }
2122
2123 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2124 orig_clusters = ocfs2_xa_value_clusters(loc);
2125 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2126 if (rc) {
2127 mlog_errno(rc);
2128 ocfs2_xa_cleanup_value_truncate(loc,
2129 "overwriting",
2130 orig_clusters);
1662 goto out; 2131 goto out;
1663 } else {
1664 /*
1665 * If the new value will be stored locally,
1666 * just truncate the old value to zero.
1667 */
1668 ret = ocfs2_xattr_value_truncate(inode,
1669 &vb,
1670 0,
1671 ctxt);
1672 if (ret < 0)
1673 mlog_errno(ret);
1674 } 2132 }
1675 } 2133 }
2134 ocfs2_xa_wipe_namevalue(loc);
2135 } else
2136 ocfs2_xa_add_entry(loc, name_hash);
2137
2138 /*
2139 * If we get here, we have a blank entry. Fill it. We grow our
2140 * name+value pair back from the end.
2141 */
2142 ocfs2_xa_add_namevalue(loc, xi);
2143 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2144 ocfs2_xa_install_value_root(loc);
2145
2146alloc_value:
2147 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2148 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) {
2151 /*
2152 * If we tried to grow an existing external value,
2153 * ocfs2_xa_cleanuP-value_truncate() is going to
2154 * let it stand. We have to restore its original
2155 * value size.
2156 */
2157 loc->xl_entry->xe_value_size = orig_value_size;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters);
2160 mlog_errno(rc);
2161 }
1676 } 2162 }
1677 2163
1678 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, 2164out:
2165 return rc;
2166}
2167
2168/*
2169 * Store the value portion of the name+value pair. This will skip
2170 * values that are stored externally. Their tree roots were set up
2171 * by ocfs2_xa_prepare_entry().
2172 */
2173static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
2174 struct ocfs2_xattr_info *xi,
2175 struct ocfs2_xattr_set_ctxt *ctxt)
2176{
2177 int rc = 0;
2178 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
2179 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
2180 char *nameval_buf;
2181 struct ocfs2_xattr_value_buf vb;
2182
2183 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
2184 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2185 ocfs2_xa_fill_value_buf(loc, &vb);
2186 rc = __ocfs2_xattr_set_value_outside(loc->xl_inode,
2187 ctxt->handle, &vb,
2188 xi->xi_value,
2189 xi->xi_value_len);
2190 } else
2191 memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
2192
2193 return rc;
2194}
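
The branch here mirrors the on-disk split: small values sit inline right after the (padded) name, large ones leave only a tree root in the pair and write through the value buf. A toy model, assuming OCFS2_XATTR_INLINE_SIZE is 80 and with the external cluster I/O stubbed out:

#include <stdio.h>
#include <string.h>

#define INLINE_SIZE	80	/* assumed value of OCFS2_XATTR_INLINE_SIZE */

/* Only the inline/external dispatch is modeled here. */
static void store_value(char *nameval_buf, int name_size,
			const char *value, int value_len)
{
	if (value_len > INLINE_SIZE)
		printf("external: %d bytes go through the value tree\n",
		       value_len);
	else
		memcpy(nameval_buf + name_size, value, value_len);
}

int main(void)
{
	char buf[128] = { 0 };

	store_value(buf, 8, "small", 6);	/* 6 includes the NUL */
	printf("inline: %s\n", buf + 8);
	store_value(buf, 8, "", 200);		/* only the length matters */
	return 0;
}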
2195
2196static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
2197 struct ocfs2_xattr_info *xi,
2198 struct ocfs2_xattr_set_ctxt *ctxt)
2199{
2200 int ret;
2201 u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
2202 xi->xi_name_len);
2203
2204 ret = ocfs2_xa_journal_access(ctxt->handle, loc,
1679 OCFS2_JOURNAL_ACCESS_WRITE); 2205 OCFS2_JOURNAL_ACCESS_WRITE);
1680 if (ret) { 2206 if (ret) {
1681 mlog_errno(ret); 2207 mlog_errno(ret);
1682 goto out; 2208 goto out;
1683 } 2209 }
1684 2210
1685 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1686 ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
1687 OCFS2_JOURNAL_ACCESS_WRITE);
1688 if (ret) {
1689 mlog_errno(ret);
1690 goto out;
1691 }
1692 }
1693
1694 /* 2211 /*
1695 * Set the value locally, including setting the tree root locally. 2212 * From here on out, everything is going to modify the buffer a
1696 * This is the first step for value sizes > INLINE_SIZE. 2213 * little. Errors are going to leave the xattr header in a
2214 * sane state. Thus, even with errors we dirty the sucker.
1697 */ 2215 */
1698 ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
1699 2216
1700 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2217 /* Don't worry, we are never called with !xi_value and !xl_entry */
1701 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 2218 if (!xi->xi_value) {
1702 if (ret < 0) { 2219 ret = ocfs2_xa_remove(loc, ctxt);
1703 mlog_errno(ret); 2220 goto out_dirty;
1704 goto out;
1705 }
1706 } 2221 }
1707 2222
1708 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && 2223 ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
1709 (flag & OCFS2_INLINE_XATTR_FL)) { 2224 if (ret) {
1710 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2225 if (ret != -ENOSPC)
1711 unsigned int xattrsize = osb->s_xattr_inline_size; 2226 mlog_errno(ret);
1712 2227 goto out_dirty;
1713 /*
1714 * Adjust extent record count or inline data size
1715 * to reserve space for extended attribute.
1716 */
1717 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1718 struct ocfs2_inline_data *idata = &di->id2.i_data;
1719 le16_add_cpu(&idata->id_count, -xattrsize);
1720 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
1721 struct ocfs2_extent_list *el = &di->id2.i_list;
1722 le16_add_cpu(&el->l_count, -(xattrsize /
1723 sizeof(struct ocfs2_extent_rec)));
1724 }
1725 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
1726 } 2228 }
1727 /* Update xattr flag */
1728 spin_lock(&oi->ip_lock);
1729 oi->ip_dyn_features |= flag;
1730 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1731 spin_unlock(&oi->ip_lock);
1732 2229
1733 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 2230 ret = ocfs2_xa_store_value(loc, xi, ctxt);
1734 if (ret < 0) 2231 if (ret)
1735 mlog_errno(ret); 2232 mlog_errno(ret);
1736 2233
1737 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2234out_dirty:
1738 /* 2235 ocfs2_xa_journal_dirty(ctxt->handle, loc);
1739 * Set the value outside, in the B-tree.
1740 * This is the second step for value sizes > INLINE_SIZE.
1741 */
1742 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1743 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
1744 &vb, offs);
1745 if (ret < 0) {
1746 int ret2;
1747 2236
1748 mlog_errno(ret);
1749 /*
1750 * If set value outside failed, we have to clean
1751 * the junk tree root we have already set in local.
1752 */
1753 ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
1754 xi, xs, &vb, offs);
1755 if (ret2 < 0)
1756 mlog_errno(ret2);
1757 }
1758 }
1759out: 2237out:
1760 return ret; 2238 return ret;
1761} 2239}
1762 2240
2241static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
2242 struct inode *inode,
2243 struct buffer_head *bh,
2244 struct ocfs2_xattr_entry *entry)
2245{
2246 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2247
2248 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
2249
2250 loc->xl_inode = inode;
2251 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2252 loc->xl_storage = bh;
2253 loc->xl_entry = entry;
2254 loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
2255 loc->xl_header =
2256 (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
2257 loc->xl_size);
2258}
2259
2260static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
2261 struct inode *inode,
2262 struct buffer_head *bh,
2263 struct ocfs2_xattr_entry *entry)
2264{
2265 struct ocfs2_xattr_block *xb =
2266 (struct ocfs2_xattr_block *)bh->b_data;
2267
2268 BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
2269
2270 loc->xl_inode = inode;
2271 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2272 loc->xl_storage = bh;
2273 loc->xl_header = &(xb->xb_attrs.xb_header);
2274 loc->xl_entry = entry;
2275 loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
2276 xb_attrs.xb_header);
2277}
2278
2279static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
2280 struct ocfs2_xattr_bucket *bucket,
2281 struct ocfs2_xattr_entry *entry)
2282{
2283 loc->xl_inode = bucket->bu_inode;
2284 loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
2285 loc->xl_storage = bucket;
2286 loc->xl_header = bucket_xh(bucket);
2287 loc->xl_entry = entry;
2288 loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
2289}
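
Note how the dinode initializer aims the header at the tail of the inode block, while the xattr-block initializer uses everything past the block's own header. A worked example of the dinode case, with illustrative sizes:

#include <stdio.h>

int main(void)
{
	size_t b_size = 4096;	/* inode block size, illustrative */
	size_t xl_size = 256;	/* di->i_xattr_inline_size, illustrative */
	size_t header_offset = b_size - xl_size;

	/* The inline xattr area occupies the last xl_size bytes. */
	printf("inline xattr area: bytes %zu..%zu\n",
	       header_offset, b_size - 1);	/* 3840..4095 */
	return 0;
}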
2290
1763/* 2291/*
1764 * In xattr remove, if it is stored outside and refcounted, we may have 2292 * In xattr remove, if it is stored outside and refcounted, we may have
1765 * the chance to split the refcount tree. So need the allocators. 2293 * the chance to split the refcount tree. So need the allocators.
@@ -2155,6 +2683,55 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
2155 return 0; 2683 return 0;
2156} 2684}
2157 2685
2686static int ocfs2_xattr_ibody_init(struct inode *inode,
2687 struct buffer_head *di_bh,
2688 struct ocfs2_xattr_set_ctxt *ctxt)
2689{
2690 int ret;
2691 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2692 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2693 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2694 unsigned int xattrsize = osb->s_xattr_inline_size;
2695
2696 if (!ocfs2_xattr_has_space_inline(inode, di)) {
2697 ret = -ENOSPC;
2698 goto out;
2699 }
2700
2701 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
2702 OCFS2_JOURNAL_ACCESS_WRITE);
2703 if (ret) {
2704 mlog_errno(ret);
2705 goto out;
2706 }
2707
2708 /*
2709 * Adjust extent record count or inline data size
2710 * to reserve space for extended attribute.
2711 */
2712 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2713 struct ocfs2_inline_data *idata = &di->id2.i_data;
2714 le16_add_cpu(&idata->id_count, -xattrsize);
2715 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
2716 struct ocfs2_extent_list *el = &di->id2.i_list;
2717 le16_add_cpu(&el->l_count, -(xattrsize /
2718 sizeof(struct ocfs2_extent_rec)));
2719 }
2720 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
2721
2722 spin_lock(&oi->ip_lock);
2723 oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock);
2726
2727 ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 if (ret < 0)
2729 mlog_errno(ret);
2730
2731out:
2732 return ret;
2733}
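
The reservation arithmetic here is simple but easy to miss: the inline xattr area is carved out of space the inode would otherwise use for extent records (or inline data). A sketch with assumed sizes:

#include <stdio.h>

int main(void)
{
	int xattrsize = 256;		/* osb->s_xattr_inline_size, illustrative */
	int extent_rec_size = 16;	/* assumed sizeof(struct ocfs2_extent_rec) */

	/* l_count shrinks by 256 / 16 = 16 records; for inline data,
	 * id_count shrinks by the full 256 bytes instead. */
	printf("l_count shrinks by %d records\n", xattrsize / extent_rec_size);
	return 0;
}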
2734
2158/* 2735/*
2159 * ocfs2_xattr_ibody_set() 2736 * ocfs2_xattr_ibody_set()
2160 * 2737 *
@@ -2166,9 +2743,10 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2166 struct ocfs2_xattr_search *xs, 2743 struct ocfs2_xattr_search *xs,
2167 struct ocfs2_xattr_set_ctxt *ctxt) 2744 struct ocfs2_xattr_set_ctxt *ctxt)
2168{ 2745{
2746 int ret;
2169 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2747 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2170 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2748 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2171 int ret; 2749 struct ocfs2_xa_loc loc;
2172 2750
2173 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) 2751 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
2174 return -ENOSPC; 2752 return -ENOSPC;
@@ -2181,8 +2759,25 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2181 } 2759 }
2182 } 2760 }
2183 2761
2184 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2762 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
2185 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); 2763 ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
2764 if (ret) {
2765 if (ret != -ENOSPC)
2766 mlog_errno(ret);
2767 goto out;
2768 }
2769 }
2770
2771 ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
2772 xs->not_found ? NULL : xs->here);
2773 ret = ocfs2_xa_set(&loc, xi, ctxt);
2774 if (ret) {
2775 if (ret != -ENOSPC)
2776 mlog_errno(ret);
2777 goto out;
2778 }
2779 xs->here = loc.xl_entry;
2780
2186out: 2781out:
2187 up_write(&oi->ip_alloc_sem); 2782 up_write(&oi->ip_alloc_sem);
2188 2783
@@ -2242,12 +2837,11 @@ cleanup:
2242 return ret; 2837 return ret;
2243} 2838}
2244 2839
2245static int ocfs2_create_xattr_block(handle_t *handle, 2840static int ocfs2_create_xattr_block(struct inode *inode,
2246 struct inode *inode,
2247 struct buffer_head *inode_bh, 2841 struct buffer_head *inode_bh,
2248 struct ocfs2_alloc_context *meta_ac, 2842 struct ocfs2_xattr_set_ctxt *ctxt,
2249 struct buffer_head **ret_bh, 2843 int indexed,
2250 int indexed) 2844 struct buffer_head **ret_bh)
2251{ 2845{
2252 int ret; 2846 int ret;
2253 u16 suballoc_bit_start; 2847 u16 suballoc_bit_start;
@@ -2258,14 +2852,14 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2258 struct buffer_head *new_bh = NULL; 2852 struct buffer_head *new_bh = NULL;
2259 struct ocfs2_xattr_block *xblk; 2853 struct ocfs2_xattr_block *xblk;
2260 2854
2261 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh, 2855 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
2262 OCFS2_JOURNAL_ACCESS_CREATE); 2856 inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
2263 if (ret < 0) { 2857 if (ret < 0) {
2264 mlog_errno(ret); 2858 mlog_errno(ret);
2265 goto end; 2859 goto end;
2266 } 2860 }
2267 2861
2268 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 2862 ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
2269 &suballoc_bit_start, &num_got, 2863 &suballoc_bit_start, &num_got,
2270 &first_blkno); 2864 &first_blkno);
2271 if (ret < 0) { 2865 if (ret < 0) {
@@ -2276,7 +2870,7 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2276 new_bh = sb_getblk(inode->i_sb, first_blkno); 2870 new_bh = sb_getblk(inode->i_sb, first_blkno);
2277 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); 2871 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2278 2872
2279 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), 2873 ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
2280 new_bh, 2874 new_bh,
2281 OCFS2_JOURNAL_ACCESS_CREATE); 2875 OCFS2_JOURNAL_ACCESS_CREATE);
2282 if (ret < 0) { 2876 if (ret < 0) {
@@ -2288,11 +2882,10 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2288 xblk = (struct ocfs2_xattr_block *)new_bh->b_data; 2882 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
2289 memset(xblk, 0, inode->i_sb->s_blocksize); 2883 memset(xblk, 0, inode->i_sb->s_blocksize);
2290 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2291 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); 2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2292 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2293 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); 2887 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2294 xblk->xb_blkno = cpu_to_le64(first_blkno); 2888 xblk->xb_blkno = cpu_to_le64(first_blkno);
2295
2296 if (indexed) { 2889 if (indexed) {
2297 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
2298 xr->xt_clusters = cpu_to_le32(1); 2891 xr->xt_clusters = cpu_to_le32(1);
@@ -2303,14 +2896,17 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2303 xr->xt_list.l_next_free_rec = cpu_to_le16(1); 2896 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2304 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); 2897 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
2305 } 2898 }
2899 ocfs2_journal_dirty(ctxt->handle, new_bh);
2306 2900
2307 ret = ocfs2_journal_dirty(handle, new_bh); 2901 /* Add it to the inode */
2308 if (ret < 0) {
2309 mlog_errno(ret);
2310 goto end;
2311 }
2312 di->i_xattr_loc = cpu_to_le64(first_blkno); 2902 di->i_xattr_loc = cpu_to_le64(first_blkno);
2313 ocfs2_journal_dirty(handle, inode_bh); 2903
2904 spin_lock(&OCFS2_I(inode)->ip_lock);
2905 OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
2906 di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
2907 spin_unlock(&OCFS2_I(inode)->ip_lock);
2908
2909 ocfs2_journal_dirty(ctxt->handle, inode_bh);
2314 2910
2315 *ret_bh = new_bh; 2911 *ret_bh = new_bh;
2316 new_bh = NULL; 2912 new_bh = NULL;
@@ -2332,13 +2928,13 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2332 struct ocfs2_xattr_set_ctxt *ctxt) 2928 struct ocfs2_xattr_set_ctxt *ctxt)
2333{ 2929{
2334 struct buffer_head *new_bh = NULL; 2930 struct buffer_head *new_bh = NULL;
2335 handle_t *handle = ctxt->handle;
2336 struct ocfs2_xattr_block *xblk = NULL; 2931 struct ocfs2_xattr_block *xblk = NULL;
2337 int ret; 2932 int ret;
2933 struct ocfs2_xa_loc loc;
2338 2934
2339 if (!xs->xattr_bh) { 2935 if (!xs->xattr_bh) {
2340 ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, 2936 ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
2341 ctxt->meta_ac, &new_bh, 0); 2937 0, &new_bh);
2342 if (ret) { 2938 if (ret) {
2343 mlog_errno(ret); 2939 mlog_errno(ret);
2344 goto end; 2940 goto end;
@@ -2354,21 +2950,25 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2354 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2950 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2355 2951
2356 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { 2952 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
2357 /* Set extended attribute into external block */ 2953 ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
2358 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2954 xs->not_found ? NULL : xs->here);
2359 OCFS2_HAS_XATTR_FL);
2360 if (!ret || ret != -ENOSPC)
2361 goto end;
2362 2955
2363 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2956 ret = ocfs2_xa_set(&loc, xi, ctxt);
2364 if (ret) 2957 if (!ret)
2958 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC)
2365 goto end; 2960 goto end;
2961 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
2963 if (ret)
2964 goto end;
2965 }
2366 } 2966 }
2367 2967
2368 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); 2968 if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
2969 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
2369 2970
2370end: 2971end:
2371
2372 return ret; 2972 return ret;
2373} 2973}
2374 2974
@@ -2377,7 +2977,6 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2377 struct ocfs2_xattr_info *xi, 2977 struct ocfs2_xattr_info *xi,
2378 struct ocfs2_xattr_search *xs) 2978 struct ocfs2_xattr_search *xs)
2379{ 2979{
2380 u64 value_size;
2381 struct ocfs2_xattr_entry *last; 2980 struct ocfs2_xattr_entry *last;
2382 int free, i; 2981 int free, i;
2383 size_t min_offs = xs->end - xs->base; 2982 size_t min_offs = xs->end - xs->base;
@@ -2400,13 +2999,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2400 2999
2401 BUG_ON(!xs->not_found); 3000 BUG_ON(!xs->not_found);
2402 3001
2403 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3002 if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
2404 value_size = OCFS2_XATTR_ROOT_SIZE;
2405 else
2406 value_size = OCFS2_XATTR_SIZE(xi->value_len);
2407
2408 if (free >= sizeof(struct ocfs2_xattr_entry) +
2409 OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
2410 return 1; 3003 return 1;
2411 3004
2412 return 0; 3005 return 0;
@@ -2430,7 +3023,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2430 char *base = NULL; 3023 char *base = NULL;
2431 int name_offset, name_len = 0; 3024 int name_offset, name_len = 0;
2432 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, 3025 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2433 xi->value_len); 3026 xi->xi_value_len);
2434 u64 value_size; 3027 u64 value_size;
2435 3028
2436 /* 3029 /*
@@ -2438,14 +3031,14 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2438 * No matter whether we replace an old one or add a new one, 3031 * No matter whether we replace an old one or add a new one,
2439 * we need this for writing. 3032 * we need this for writing.
2440 */ 3033 */
2441 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3034 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2442 credits += new_clusters * 3035 credits += new_clusters *
2443 ocfs2_clusters_to_blocks(inode->i_sb, 1); 3036 ocfs2_clusters_to_blocks(inode->i_sb, 1);
2444 3037
2445 if (xis->not_found && xbs->not_found) { 3038 if (xis->not_found && xbs->not_found) {
2446 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3039 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2447 3040
2448 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3041 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2449 clusters_add += new_clusters; 3042 clusters_add += new_clusters;
2450 credits += ocfs2_calc_extend_credits(inode->i_sb, 3043 credits += ocfs2_calc_extend_credits(inode->i_sb,
2451 &def_xv.xv.xr_list, 3044 &def_xv.xv.xr_list,
@@ -2490,7 +3083,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2490 * The credits for removing the value tree will be extended 3083 * The credits for removing the value tree will be extended
2491 * by ocfs2_remove_extent itself. 3084 * by ocfs2_remove_extent itself.
2492 */ 3085 */
2493 if (!xi->value) { 3086 if (!xi->xi_value) {
2494 if (!ocfs2_xattr_is_local(xe)) 3087 if (!ocfs2_xattr_is_local(xe))
2495 credits += ocfs2_remove_extent_credits(inode->i_sb); 3088 credits += ocfs2_remove_extent_credits(inode->i_sb);
2496 3089
@@ -2520,7 +3113,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2520 } 3113 }
2521 } 3114 }
2522 3115
2523 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3116 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2524 /* the new values will be stored outside. */ 3117 /* the new values will be stored outside. */
2525 u32 old_clusters = 0; 3118 u32 old_clusters = 0;
2526 3119
@@ -2553,9 +3146,10 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2553 * value, we don't need any allocation, otherwise we have 3146 * value, we don't need any allocation, otherwise we have
2554 * to guess metadata allocation. 3147 * to guess metadata allocation.
2555 */ 3148 */
2556 if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) || 3149 if ((ocfs2_xattr_is_local(xe) &&
3150 (value_size >= xi->xi_value_len)) ||
2557 (!ocfs2_xattr_is_local(xe) && 3151 (!ocfs2_xattr_is_local(xe) &&
2558 OCFS2_XATTR_ROOT_SIZE >= xi->value_len)) 3152 OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
2559 goto out; 3153 goto out;
2560 } 3154 }
2561 3155
@@ -2645,7 +3239,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2645 3239
2646 meta_add += extra_meta; 3240 meta_add += extra_meta;
2647 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 3241 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2648 "credits = %d\n", xi->name, meta_add, clusters_add, *credits); 3242 "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
2649 3243
2650 if (meta_add) { 3244 if (meta_add) {
2651 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, 3245 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -2685,7 +3279,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2685{ 3279{
2686 int ret = 0, credits, old_found; 3280 int ret = 0, credits, old_found;
2687 3281
2688 if (!xi->value) { 3282 if (!xi->xi_value) {
2689 /* Remove existing extended attribute */ 3283 /* Remove existing extended attribute */
2690 if (!xis->not_found) 3284 if (!xis->not_found)
2691 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); 3285 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
@@ -2699,8 +3293,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2699 * If succeed and that extended attribute existing in 3293 * If succeed and that extended attribute existing in
2700 * external block, then we will remove it. 3294 * external block, then we will remove it.
2701 */ 3295 */
2702 xi->value = NULL; 3296 xi->xi_value = NULL;
2703 xi->value_len = 0; 3297 xi->xi_value_len = 0;
2704 3298
2705 old_found = xis->not_found; 3299 old_found = xis->not_found;
2706 xis->not_found = -ENODATA; 3300 xis->not_found = -ENODATA;
@@ -2728,8 +3322,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2728 } else if (ret == -ENOSPC) { 3322 } else if (ret == -ENOSPC) {
2729 if (di->i_xattr_loc && !xbs->xattr_bh) { 3323 if (di->i_xattr_loc && !xbs->xattr_bh) {
2730 ret = ocfs2_xattr_block_find(inode, 3324 ret = ocfs2_xattr_block_find(inode,
2731 xi->name_index, 3325 xi->xi_name_index,
2732 xi->name, xbs); 3326 xi->xi_name, xbs);
2733 if (ret) 3327 if (ret)
2734 goto out; 3328 goto out;
2735 3329
@@ -2768,8 +3362,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2768 * If succeed and that extended attribute 3362 * If succeed and that extended attribute
2769 * existing in inode, we will remove it. 3363 * existing in inode, we will remove it.
2770 */ 3364 */
2771 xi->value = NULL; 3365 xi->xi_value = NULL;
2772 xi->value_len = 0; 3366 xi->xi_value_len = 0;
2773 xbs->not_found = -ENODATA; 3367 xbs->not_found = -ENODATA;
2774 ret = ocfs2_calc_xattr_set_need(inode, 3368 ret = ocfs2_calc_xattr_set_need(inode,
2775 di, 3369 di,
@@ -2835,10 +3429,11 @@ int ocfs2_xattr_set_handle(handle_t *handle,
2835 int ret; 3429 int ret;
2836 3430
2837 struct ocfs2_xattr_info xi = { 3431 struct ocfs2_xattr_info xi = {
2838 .name_index = name_index, 3432 .xi_name_index = name_index,
2839 .name = name, 3433 .xi_name = name,
2840 .value = value, 3434 .xi_name_len = strlen(name),
2841 .value_len = value_len, 3435 .xi_value = value,
3436 .xi_value_len = value_len,
2842 }; 3437 };
2843 3438
2844 struct ocfs2_xattr_search xis = { 3439 struct ocfs2_xattr_search xis = {
@@ -2918,10 +3513,11 @@ int ocfs2_xattr_set(struct inode *inode,
2918 struct ocfs2_refcount_tree *ref_tree = NULL; 3513 struct ocfs2_refcount_tree *ref_tree = NULL;
2919 3514
2920 struct ocfs2_xattr_info xi = { 3515 struct ocfs2_xattr_info xi = {
2921 .name_index = name_index, 3516 .xi_name_index = name_index,
2922 .name = name, 3517 .xi_name = name,
2923 .value = value, 3518 .xi_name_len = strlen(name),
2924 .value_len = value_len, 3519 .xi_value = value,
3520 .xi_value_len = value_len,
2925 }; 3521 };
2926 3522
2927 struct ocfs2_xattr_search xis = { 3523 struct ocfs2_xattr_search xis = {
@@ -3765,7 +4361,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3765 struct ocfs2_xattr_bucket *bucket) 4361 struct ocfs2_xattr_bucket *bucket)
3766{ 4362{
3767 int ret, i; 4363 int ret, i;
3768 size_t end, offset, len, value_len; 4364 size_t end, offset, len;
3769 struct ocfs2_xattr_header *xh; 4365 struct ocfs2_xattr_header *xh;
3770 char *entries, *buf, *bucket_buf = NULL; 4366 char *entries, *buf, *bucket_buf = NULL;
3771 u64 blkno = bucket_blkno(bucket); 4367 u64 blkno = bucket_blkno(bucket);
@@ -3819,12 +4415,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3819 end = OCFS2_XATTR_BUCKET_SIZE; 4415 end = OCFS2_XATTR_BUCKET_SIZE;
3820 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { 4416 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
3821 offset = le16_to_cpu(xe->xe_name_offset); 4417 offset = le16_to_cpu(xe->xe_name_offset);
3822 if (ocfs2_xattr_is_local(xe)) 4418 len = namevalue_size_xe(xe);
3823 value_len = OCFS2_XATTR_SIZE(
3824 le64_to_cpu(xe->xe_value_size));
3825 else
3826 value_len = OCFS2_XATTR_ROOT_SIZE;
3827 len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
3828 4419
3829 /* 4420 /*
3830 * We must make sure that the name/value pair 4421 * We must make sure that the name/value pair
@@ -4013,7 +4604,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4013 int new_bucket_head) 4604 int new_bucket_head)
4014{ 4605{
4015 int ret, i; 4606 int ret, i;
4016 int count, start, len, name_value_len = 0, xe_len, name_offset = 0; 4607 int count, start, len, name_value_len = 0, name_offset = 0;
4017 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; 4608 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
4018 struct ocfs2_xattr_header *xh; 4609 struct ocfs2_xattr_header *xh;
4019 struct ocfs2_xattr_entry *xe; 4610 struct ocfs2_xattr_entry *xe;
@@ -4104,13 +4695,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4104 name_value_len = 0; 4695 name_value_len = 0;
4105 for (i = 0; i < start; i++) { 4696 for (i = 0; i < start; i++) {
4106 xe = &xh->xh_entries[i]; 4697 xe = &xh->xh_entries[i];
4107 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); 4698 name_value_len += namevalue_size_xe(xe);
4108 if (ocfs2_xattr_is_local(xe))
4109 xe_len +=
4110 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4111 else
4112 xe_len += OCFS2_XATTR_ROOT_SIZE;
4113 name_value_len += xe_len;
4114 if (le16_to_cpu(xe->xe_name_offset) < name_offset) 4699 if (le16_to_cpu(xe->xe_name_offset) < name_offset)
4115 name_offset = le16_to_cpu(xe->xe_name_offset); 4700 name_offset = le16_to_cpu(xe->xe_name_offset);
4116 } 4701 }
@@ -4140,12 +4725,6 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4140 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); 4725 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4141 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 4726 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4142 xe = &xh->xh_entries[i]; 4727 xe = &xh->xh_entries[i];
4143 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
4144 if (ocfs2_xattr_is_local(xe))
4145 xe_len +=
4146 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4147 else
4148 xe_len += OCFS2_XATTR_ROOT_SIZE;
4149 if (le16_to_cpu(xe->xe_name_offset) < 4728 if (le16_to_cpu(xe->xe_name_offset) <
4150 le16_to_cpu(xh->xh_free_start)) 4729 le16_to_cpu(xh->xh_free_start))
4151 xh->xh_free_start = xe->xe_name_offset; 4730 xh->xh_free_start = xe->xe_name_offset;
@@ -4757,195 +5336,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
4757} 5336}
4758 5337
4759/* 5338/*
4760 * Handle the normal xattr set, including replace, delete and new.
4761 *
4762 * Note: "local" indicates the real data's locality. So we can't
4763 * just infer its bucket locality from its length.
4764 */
4765static void ocfs2_xattr_set_entry_normal(struct inode *inode,
4766 struct ocfs2_xattr_info *xi,
4767 struct ocfs2_xattr_search *xs,
4768 u32 name_hash,
4769 int local)
4770{
4771 struct ocfs2_xattr_entry *last, *xe;
4772 int name_len = strlen(xi->name);
4773 struct ocfs2_xattr_header *xh = xs->header;
4774 u16 count = le16_to_cpu(xh->xh_count), start;
4775 size_t blocksize = inode->i_sb->s_blocksize;
4776 char *val;
4777 size_t offs, size, new_size;
4778
4779 last = &xh->xh_entries[count];
4780 if (!xs->not_found) {
4781 xe = xs->here;
4782 offs = le16_to_cpu(xe->xe_name_offset);
4783 if (ocfs2_xattr_is_local(xe))
4784 size = OCFS2_XATTR_SIZE(name_len) +
4785 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4786 else
4787 size = OCFS2_XATTR_SIZE(name_len) +
4788 OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
4789
4790 /*
4791 * If the new value will be stored outside, xi->value has been
4792 * initialized as an empty ocfs2_xattr_value_root, and the same
4793 * goes with xi->value_len, so we can set new_size safely here.
4794 * See ocfs2_xattr_set_in_bucket.
4795 */
4796 new_size = OCFS2_XATTR_SIZE(name_len) +
4797 OCFS2_XATTR_SIZE(xi->value_len);
4798
4799 le16_add_cpu(&xh->xh_name_value_len, -size);
4800 if (xi->value) {
4801 if (new_size > size)
4802 goto set_new_name_value;
4803
4804 /* Now replace the old value with new one. */
4805 if (local)
4806 xe->xe_value_size = cpu_to_le64(xi->value_len);
4807 else
4808 xe->xe_value_size = 0;
4809
4810 val = ocfs2_xattr_bucket_get_val(inode,
4811 xs->bucket, offs);
4812 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
4813 size - OCFS2_XATTR_SIZE(name_len));
4814 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
4815 memcpy(val + OCFS2_XATTR_SIZE(name_len),
4816 xi->value, xi->value_len);
4817
4818 le16_add_cpu(&xh->xh_name_value_len, new_size);
4819 ocfs2_xattr_set_local(xe, local);
4820 return;
4821 } else {
4822 /*
4823 * Remove the old entry if there is more than one.
4824 * We don't remove the last entry so that we can
4825 * use it to indicate the hash value of the empty
4826 * bucket.
4827 */
4828 last -= 1;
4829 le16_add_cpu(&xh->xh_count, -1);
4830 if (xh->xh_count) {
4831 memmove(xe, xe + 1,
4832 (void *)last - (void *)xe);
4833 memset(last, 0,
4834 sizeof(struct ocfs2_xattr_entry));
4835 } else
4836 xh->xh_free_start =
4837 cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4838
4839 return;
4840 }
4841 } else {
4842 /* find a new entry for insert. */
4843 int low = 0, high = count - 1, tmp;
4844 struct ocfs2_xattr_entry *tmp_xe;
4845
4846 while (low <= high && count) {
4847 tmp = (low + high) / 2;
4848 tmp_xe = &xh->xh_entries[tmp];
4849
4850 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
4851 low = tmp + 1;
4852 else if (name_hash <
4853 le32_to_cpu(tmp_xe->xe_name_hash))
4854 high = tmp - 1;
4855 else {
4856 low = tmp;
4857 break;
4858 }
4859 }
4860
4861 xe = &xh->xh_entries[low];
4862 if (low != count)
4863 memmove(xe + 1, xe, (void *)last - (void *)xe);
4864
4865 le16_add_cpu(&xh->xh_count, 1);
4866 memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
4867 xe->xe_name_hash = cpu_to_le32(name_hash);
4868 xe->xe_name_len = name_len;
4869 ocfs2_xattr_set_type(xe, xi->name_index);
4870 }
4871
4872set_new_name_value:
4873 /* Insert the new name+value. */
4874 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
4875
4876 /*
4877 * We must make sure that the name/value pair
4878 * exists in the same block.
4879 */
4880 offs = le16_to_cpu(xh->xh_free_start);
4881 start = offs - size;
4882
4883 if (start >> inode->i_sb->s_blocksize_bits !=
4884 (offs - 1) >> inode->i_sb->s_blocksize_bits) {
4885 offs = offs - offs % blocksize;
4886 xh->xh_free_start = cpu_to_le16(offs);
4887 }
4888
4889 val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
4890 xe->xe_name_offset = cpu_to_le16(offs - size);
4891
4892 memset(val, 0, size);
4893 memcpy(val, xi->name, name_len);
4894 memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
4895
4896 xe->xe_value_size = cpu_to_le64(xi->value_len);
4897 ocfs2_xattr_set_local(xe, local);
4898 xs->here = xe;
4899 le16_add_cpu(&xh->xh_free_start, -size);
4900 le16_add_cpu(&xh->xh_name_value_len, size);
4901
4902 return;
4903}
4904
4905/*
4906 * Set the xattr entry in the specified bucket.
4907 * The bucket is indicated by xs->bucket and it should have enough
4908 * space for the xattr insertion.
4909 */
4910static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4911 handle_t *handle,
4912 struct ocfs2_xattr_info *xi,
4913 struct ocfs2_xattr_search *xs,
4914 u32 name_hash,
4915 int local)
4916{
4917 int ret;
4918 u64 blkno;
4919
4920 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4921 (unsigned long)xi->value_len, xi->name_index,
4922 (unsigned long long)bucket_blkno(xs->bucket));
4923
4924 if (!xs->bucket->bu_bhs[1]) {
4925 blkno = bucket_blkno(xs->bucket);
4926 ocfs2_xattr_bucket_relse(xs->bucket);
4927 ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
4928 if (ret) {
4929 mlog_errno(ret);
4930 goto out;
4931 }
4932 }
4933
4934 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4935 OCFS2_JOURNAL_ACCESS_WRITE);
4936 if (ret < 0) {
4937 mlog_errno(ret);
4938 goto out;
4939 }
4940
4941 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4942 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4943
4944out:
4945 return ret;
4946}
4947
4948/*
4949 * Truncate the specified xe_off entry in xattr bucket. 5339 * Truncate the specified xe_off entry in xattr bucket.
4950 * bucket is indicated by header_bh and len is the new length. 5340 * bucket is indicated by header_bh and len is the new length.
4951 * Both the ocfs2_xattr_value_root and the entry will be updated here. 5341 * Both the ocfs2_xattr_value_root and the entry will be updated here.
@@ -5015,66 +5405,6 @@ out:
5015 return ret; 5405 return ret;
5016} 5406}
5017 5407
5018static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
5019 struct ocfs2_xattr_search *xs,
5020 int len,
5021 struct ocfs2_xattr_set_ctxt *ctxt)
5022{
5023 int ret, offset;
5024 struct ocfs2_xattr_entry *xe = xs->here;
5025 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
5026
5027 BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
5028
5029 offset = xe - xh->xh_entries;
5030 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
5031 offset, len, ctxt);
5032 if (ret)
5033 mlog_errno(ret);
5034
5035 return ret;
5036}
5037
5038static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
5039 handle_t *handle,
5040 struct ocfs2_xattr_search *xs,
5041 char *val,
5042 int value_len)
5043{
5044 int ret, offset, block_off;
5045 struct ocfs2_xattr_value_root *xv;
5046 struct ocfs2_xattr_entry *xe = xs->here;
5047 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5048 void *base;
5049 struct ocfs2_xattr_value_buf vb = {
5050 .vb_access = ocfs2_journal_access,
5051 };
5052
5053 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
5054
5055 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
5056 xe - xh->xh_entries,
5057 &block_off,
5058 &offset);
5059 if (ret) {
5060 mlog_errno(ret);
5061 goto out;
5062 }
5063
5064 base = bucket_block(xs->bucket, block_off);
5065 xv = (struct ocfs2_xattr_value_root *)(base + offset +
5066 OCFS2_XATTR_SIZE(xe->xe_name_len));
5067
5068 vb.vb_xv = xv;
5069 vb.vb_bh = xs->bucket->bu_bhs[block_off];
5070 ret = __ocfs2_xattr_set_value_outside(inode, handle,
5071 &vb, val, value_len);
5072 if (ret)
5073 mlog_errno(ret);
5074out:
5075 return ret;
5076}
5077
5078static int ocfs2_rm_xattr_cluster(struct inode *inode, 5408static int ocfs2_rm_xattr_cluster(struct inode *inode,
5079 struct buffer_head *root_bh, 5409 struct buffer_head *root_bh,
5080 u64 blkno, 5410 u64 blkno,
@@ -5173,128 +5503,6 @@ out:
5173 return ret; 5503 return ret;
5174} 5504}
5175 5505
5176static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
5177 handle_t *handle,
5178 struct ocfs2_xattr_search *xs)
5179{
5180 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5181 struct ocfs2_xattr_entry *last = &xh->xh_entries[
5182 le16_to_cpu(xh->xh_count) - 1];
5183 int ret = 0;
5184
5185 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
5186 OCFS2_JOURNAL_ACCESS_WRITE);
5187 if (ret) {
5188 mlog_errno(ret);
5189 return;
5190 }
5191
5192 /* Remove the old entry. */
5193 memmove(xs->here, xs->here + 1,
5194 (void *)last - (void *)xs->here);
5195 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
5196 le16_add_cpu(&xh->xh_count, -1);
5197
5198 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
5199}
5200
5201/*
5202 * Set the xattr name/value in the bucket specified in xs.
5203 *
5204 * As the new value in xi may be stored in the bucket or in an outside cluster,
5205 * we divide the whole process into 4 steps:
5206 * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
5207 * 2. truncate the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
5208 * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
5209 * 4. If the clusters for the new outside value can't be allocated, we need
5210 * to free the xattr we allocated in set.
5211 */
5212static int ocfs2_xattr_set_in_bucket(struct inode *inode,
5213 struct ocfs2_xattr_info *xi,
5214 struct ocfs2_xattr_search *xs,
5215 struct ocfs2_xattr_set_ctxt *ctxt)
5216{
5217 int ret, local = 1;
5218 size_t value_len;
5219 char *val = (char *)xi->value;
5220 struct ocfs2_xattr_entry *xe = xs->here;
5221 u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
5222 strlen(xi->name));
5223
5224 if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
5225 /*
5226 * We need to truncate the xattr storage first.
5227 *
5228 * If both the old and new values are stored in an
5229 * outside block, we only need to truncate
5230 * the storage and then set the value outside.
5231 *
5232 * If the new value should be stored within the block,
5233 * we should free all the outside blocks first; the
5234 * modification to the xattr block will be done
5235 * by the following steps.
5236 */
5237 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
5238 value_len = xi->value_len;
5239 else
5240 value_len = 0;
5241
5242 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5243 value_len,
5244 ctxt);
5245 if (ret)
5246 goto out;
5247
5248 if (value_len)
5249 goto set_value_outside;
5250 }
5251
5252 value_len = xi->value_len;
5253 /* So we have to handle the inside block change now. */
5254 if (value_len > OCFS2_XATTR_INLINE_SIZE) {
5255 /*
5256 * If the new value will be stored outside the block,
5257 * initialize a new empty value root and insert it first.
5258 */
5259 local = 0;
5260 xi->value = &def_xv;
5261 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
5262 }
5263
5264 ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
5265 name_hash, local);
5266 if (ret) {
5267 mlog_errno(ret);
5268 goto out;
5269 }
5270
5271 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
5272 goto out;
5273
5274 /* allocate the space now for the outside block storage. */
5275 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5276 value_len, ctxt);
5277 if (ret) {
5278 mlog_errno(ret);
5279
5280 if (xs->not_found) {
5281 /*
5282 * We can't allocate enough clusters for outside
5283 * storage and we have already allocated the xattr,
5284 * so we need to remove it.
5285 */
5286 ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
5287 }
5288 goto out;
5289 }
5290
5291set_value_outside:
5292 ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
5293 xs, val, value_len);
5294out:
5295 return ret;
5296}
5297
5298/* 5506/*
5299 * check whether the xattr bucket is filled up with the same hash value. 5507 * check whether the xattr bucket is filled up with the same hash value.
5300 * If we want to insert the xattr with the same hash, return -ENOSPC. 5508 * If we want to insert the xattr with the same hash, return -ENOSPC.
@@ -5323,156 +5531,116 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
5323 return 0; 5531 return 0;
5324} 5532}
5325 5533
5326static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 5534/*
5327 struct ocfs2_xattr_info *xi, 5535 * Try to set the entry in the current bucket. If we fail, the caller
5328 struct ocfs2_xattr_search *xs, 5536 * will handle getting us another bucket.
5329 struct ocfs2_xattr_set_ctxt *ctxt) 5537 */
5538static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
5539 struct ocfs2_xattr_info *xi,
5540 struct ocfs2_xattr_search *xs,
5541 struct ocfs2_xattr_set_ctxt *ctxt)
5330{ 5542{
5331 struct ocfs2_xattr_header *xh; 5543 int ret;
5332 struct ocfs2_xattr_entry *xe; 5544 struct ocfs2_xa_loc loc;
5333 u16 count, header_size, xh_free_start;
5334 int free, max_free, need, old;
5335 size_t value_size = 0, name_len = strlen(xi->name);
5336 size_t blocksize = inode->i_sb->s_blocksize;
5337 int ret, allocation = 0;
5338
5339 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
5340
5341try_again:
5342 xh = xs->header;
5343 count = le16_to_cpu(xh->xh_count);
5344 xh_free_start = le16_to_cpu(xh->xh_free_start);
5345 header_size = sizeof(struct ocfs2_xattr_header) +
5346 count * sizeof(struct ocfs2_xattr_entry);
5347 max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
5348 le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
5349
5350 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
5351 "of %u which exceed block size\n",
5352 (unsigned long long)bucket_blkno(xs->bucket),
5353 header_size);
5354 5545
5355 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) 5546 mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
5356 value_size = OCFS2_XATTR_ROOT_SIZE;
5357 else if (xi->value)
5358 value_size = OCFS2_XATTR_SIZE(xi->value_len);
5359 5547
5360 if (xs->not_found) 5548 ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
5361 need = sizeof(struct ocfs2_xattr_entry) + 5549 xs->not_found ? NULL : xs->here);
5362 OCFS2_XATTR_SIZE(name_len) + value_size; 5550 ret = ocfs2_xa_set(&loc, xi, ctxt);
5363 else { 5551 if (!ret) {
5364 need = value_size + OCFS2_XATTR_SIZE(name_len); 5552 xs->here = loc.xl_entry;
5553 goto out;
5554 }
5555 if (ret != -ENOSPC) {
5556 mlog_errno(ret);
5557 goto out;
5558 }
5365 5559
5366 /* 5560 /* Ok, we need space. Let's try defragmenting the bucket. */
5367 * We only replace the old value if the new length is smaller 5561 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5368 * than the old one. Otherwise we will allocate new space in the 5562 xs->bucket);
5369 * bucket to store it. 5563 if (ret) {
5370 */ 5564 mlog_errno(ret);
5371 xe = xs->here; 5565 goto out;
5372 if (ocfs2_xattr_is_local(xe)) 5566 }
5373 old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
5374 else
5375 old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
5376 5567
5377 if (old >= value_size) 5568 ret = ocfs2_xa_set(&loc, xi, ctxt);
5378 need = 0; 5569 if (!ret) {
5570 xs->here = loc.xl_entry;
5571 goto out;
5379 } 5572 }
5573 if (ret != -ENOSPC)
5574 mlog_errno(ret);
5380 5575
5381 free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
5382 /*
5383 * We need to make sure the new name/value pair
5384 * can exist in the same block.
5385 */
5386 if (xh_free_start % blocksize < need)
5387 free -= xh_free_start % blocksize;
5388
5389 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
5390 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
5391 " %u\n", xs->not_found,
5392 (unsigned long long)bucket_blkno(xs->bucket),
5393 free, need, max_free, le16_to_cpu(xh->xh_free_start),
5394 le16_to_cpu(xh->xh_name_value_len));
5395
5396 if (free < need ||
5397 (xs->not_found &&
5398 count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
5399 if (need <= max_free &&
5400 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
5401 /*
5402 * We can create the space by defragmenting. Since only the
5403 * name/value will be moved, the xe shouldn't be changed
5404 * in xs.
5405 */
5406 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5407 xs->bucket);
5408 if (ret) {
5409 mlog_errno(ret);
5410 goto out;
5411 }
5412 5576
5413 xh_free_start = le16_to_cpu(xh->xh_free_start); 5577out:
5414 free = xh_free_start - header_size 5578 mlog_exit(ret);
5415 - OCFS2_XATTR_HEADER_GAP; 5579 return ret;
5416 if (xh_free_start % blocksize < need) 5580}
5417 free -= xh_free_start % blocksize;
5418 5581
5419 if (free >= need) 5582static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
5420 goto xattr_set; 5583 struct ocfs2_xattr_info *xi,
5584 struct ocfs2_xattr_search *xs,
5585 struct ocfs2_xattr_set_ctxt *ctxt)
5586{
5587 int ret;
5421 5588
5422 mlog(0, "Can't get enough space for xattr insert by " 5589 mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
5423 "defragment. Need %u bytes, but we have %d, so "
5424 "allocate new bucket for it.\n", need, free);
5425 }
5426 5590
5427 /* 5591 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5428 * We have to add new buckets or clusters and one 5592 if (!ret)
5429 * allocation should leave us enough space for insert. 5593 goto out;
5430 */ 5594 if (ret != -ENOSPC) {
5431 BUG_ON(allocation); 5595 mlog_errno(ret);
5596 goto out;
5597 }
5432 5598
5433 /* 5599 /* Ack, need more space. Let's try to get another bucket! */
5434 * We do not allow for overlapping ranges between buckets. And
5435 * the maximum number of collisions we will allow for then is
5436 * one bucket's worth, so check here whether we need to
5437 * add a new bucket for the insert.
5438 */
5439 ret = ocfs2_check_xattr_bucket_collision(inode,
5440 xs->bucket,
5441 xi->name);
5442 if (ret) {
5443 mlog_errno(ret);
5444 goto out;
5445 }
5446 5600
5447 ret = ocfs2_add_new_xattr_bucket(inode, 5601 /*
5448 xs->xattr_bh, 5602 * We do not allow for overlapping ranges between buckets. And
5603 * the maximum number of collisions we will allow for then is
5604 * one bucket's worth, so check here whether we need to
5605 * add a new bucket for the insert.
5606 */
5607 ret = ocfs2_check_xattr_bucket_collision(inode,
5449 xs->bucket, 5608 xs->bucket,
5450 ctxt); 5609 xi->xi_name);
5451 if (ret) { 5610 if (ret) {
5452 mlog_errno(ret); 5611 mlog_errno(ret);
5453 goto out; 5612 goto out;
5454 } 5613 }
5455 5614
5456 /* 5615 ret = ocfs2_add_new_xattr_bucket(inode,
5457 * ocfs2_add_new_xattr_bucket() will have updated 5616 xs->xattr_bh,
5458 * xs->bucket if it moved, but it will not have updated 5617 xs->bucket,
5459 * any of the other search fields. Thus, we drop it and 5618 ctxt);
5460 * re-search. Everything should be cached, so it'll be 5619 if (ret) {
5461 * quick. 5620 mlog_errno(ret);
5462 */ 5621 goto out;
5463 ocfs2_xattr_bucket_relse(xs->bucket);
5464 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5465 xi->name_index,
5466 xi->name, xs);
5467 if (ret && ret != -ENODATA)
5468 goto out;
5469 xs->not_found = ret;
5470 allocation = 1;
5471 goto try_again;
5472 } 5622 }
5473 5623
5474xattr_set: 5624 /*
5475 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt); 5625 * ocfs2_add_new_xattr_bucket() will have updated
5626 * xs->bucket if it moved, but it will not have updated
5627 * any of the other search fields. Thus, we drop it and
5628 * re-search. Everything should be cached, so it'll be
5629 * quick.
5630 */
5631 ocfs2_xattr_bucket_relse(xs->bucket);
5632 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5633 xi->xi_name_index,
5634 xi->xi_name, xs);
5635 if (ret && ret != -ENODATA)
5636 goto out;
5637 xs->not_found = ret;
5638
5639 /* Ok, we have a new bucket, let's try again */
5640 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5641 if (ret && (ret != -ENOSPC))
5642 mlog_errno(ret);
5643
5476out: 5644out:
5477 mlog_exit(ret); 5645 mlog_exit(ret);
5478 return ret; 5646 return ret;
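The rewritten bucket path funnels every add, replace, and delete through ocfs2_xa_set() against an ocfs2_xa_loc location descriptor, retrying once after a defragment on -ENOSPC. Only the xl_entry field is visible in this hunk; the sketch below is a guess at the shape such a descriptor implies, not the actual definition:

struct ocfs2_xa_loc {
	struct ocfs2_xattr_header *xl_header;	/* containing header (assumed) */
	struct ocfs2_xattr_entry *xl_entry;	/* existing entry, or NULL on
						 * insert -- see the xs->here
						 * handling above */
	void *xl_storage;			/* backing bucket/block (assumed) */
	const struct ocfs2_xa_loc_operations *xl_ops;	/* storage-specific
							 * ops (assumed) */
};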
@@ -5684,7 +5852,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode,
5684 * refcount tree, and make the original extent become 3. So we will need 5852 * refcount tree, and make the original extent become 3. So we will need
5685 * 2 * cluster more extent recs at most. 5853 * 2 * cluster more extent recs at most.
5686 */ 5854 */
5687 if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) { 5855 if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
5688 5856
5689 ret = ocfs2_refcounted_xattr_delete_need(inode, 5857 ret = ocfs2_refcounted_xattr_delete_need(inode,
5690 &(*ref_tree)->rf_ci, 5858 &(*ref_tree)->rf_ci,
@@ -6066,7 +6234,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
6066 * to the extent block, so just calculate a maximum record num. 6234 * to the extent block, so just calculate a maximum record num.
6067 */ 6235 */
6068 if (!xv->xr_list.l_tree_depth) 6236 if (!xv->xr_list.l_tree_depth)
6069 *num_recs += xv->xr_list.l_next_free_rec; 6237 *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec);
6070 else 6238 else
6071 *num_recs += ocfs2_clusters_for_bytes(sb, 6239 *num_recs += ocfs2_clusters_for_bytes(sb,
6072 XATTR_SIZE_MAX); 6240 XATTR_SIZE_MAX);
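The one-line fix above matters on big-endian hosts: l_next_free_rec is a little-endian on-disk field, and using it raw silently byte-swaps the record count. A small illustration (snippet only):

__le16 on_disk = cpu_to_le16(3);	/* stored as bytes 03 00 */
u16 recs = le16_to_cpu(on_disk);	/* 3 on any host */
u16 raw  = (__force u16)on_disk;	/* 3 on little-endian, 0x0300 = 768
					 * on big-endian */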
@@ -6360,33 +6528,33 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6360 int indexed) 6528 int indexed)
6361{ 6529{
6362 int ret; 6530 int ret;
6363 handle_t *handle;
6364 struct ocfs2_alloc_context *meta_ac;
6365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6531 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6532 struct ocfs2_xattr_set_ctxt ctxt;
6366 6533
6367 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6534 memset(&ctxt, 0, sizeof(ctxt));
6535 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
6368 if (ret < 0) { 6536 if (ret < 0) {
6369 mlog_errno(ret); 6537 mlog_errno(ret);
6370 return ret; 6538 return ret;
6371 } 6539 }
6372 6540
6373 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); 6541 ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
6374 if (IS_ERR(handle)) { 6542 if (IS_ERR(ctxt.handle)) {
6375 ret = PTR_ERR(handle); 6543 ret = PTR_ERR(ctxt.handle);
6376 mlog_errno(ret); 6544 mlog_errno(ret);
6377 goto out; 6545 goto out;
6378 } 6546 }
6379 6547
6380 mlog(0, "create new xattr block for inode %llu, index = %d\n", 6548 mlog(0, "create new xattr block for inode %llu, index = %d\n",
6381 (unsigned long long)fe_bh->b_blocknr, indexed); 6549 (unsigned long long)fe_bh->b_blocknr, indexed);
6382 ret = ocfs2_create_xattr_block(handle, inode, fe_bh, 6550 ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
6383 meta_ac, ret_bh, indexed); 6551 ret_bh);
6384 if (ret) 6552 if (ret)
6385 mlog_errno(ret); 6553 mlog_errno(ret);
6386 6554
6387 ocfs2_commit_trans(osb, handle); 6555 ocfs2_commit_trans(osb, ctxt.handle);
6388out: 6556out:
6389 ocfs2_free_alloc_context(meta_ac); 6557 ocfs2_free_alloc_context(ctxt.meta_ac);
6390 return ret; 6558 return ret;
6391} 6559}
6392 6560
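This hunk repackages the transaction handle and the metadata reservation into an ocfs2_xattr_set_ctxt so ocfs2_create_xattr_block() takes one context argument. Judging from the two fields touched here, the context is presumably along these lines; the last two members are assumptions:

struct ocfs2_xattr_set_ctxt {
	handle_t *handle;			/* journal transaction */
	struct ocfs2_alloc_context *meta_ac;	/* metadata block reservation */
	struct ocfs2_alloc_context *data_ac;	/* data reservation (assumed) */
	struct ocfs2_cached_dealloc_ctxt dealloc; /* deferred frees (assumed) */
};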
@@ -6978,9 +7146,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
6978 7146
6979 ret = ocfs2_init_security_get(inode, dir, &si); 7147 ret = ocfs2_init_security_get(inode, dir, &si);
6980 if (!ret) { 7148 if (!ret) {
6981 ret = ocfs2_xattr_security_set(inode, si.name, 7149 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
6982 si.value, si.value_len, 7150 si.name, si.value, si.value_len,
6983 XATTR_CREATE); 7151 XATTR_CREATE);
6984 if (ret) { 7152 if (ret) {
6985 mlog_errno(ret); 7153 mlog_errno(ret);
6986 goto leave; 7154 goto leave;
@@ -7008,9 +7176,9 @@ leave:
7008/* 7176/*
7009 * 'security' attributes support 7177 * 'security' attributes support
7010 */ 7178 */
7011static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, 7179static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
7012 size_t list_size, const char *name, 7180 size_t list_size, const char *name,
7013 size_t name_len) 7181 size_t name_len, int type)
7014{ 7182{
7015 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; 7183 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
7016 const size_t total_len = prefix_len + name_len + 1; 7184 const size_t total_len = prefix_len + name_len + 1;
@@ -7023,23 +7191,23 @@ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
7023 return total_len; 7191 return total_len;
7024} 7192}
7025 7193
7026static int ocfs2_xattr_security_get(struct inode *inode, const char *name, 7194static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
7027 void *buffer, size_t size) 7195 void *buffer, size_t size, int type)
7028{ 7196{
7029 if (strcmp(name, "") == 0) 7197 if (strcmp(name, "") == 0)
7030 return -EINVAL; 7198 return -EINVAL;
7031 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name, 7199 return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
7032 buffer, size); 7200 name, buffer, size);
7033} 7201}
7034 7202
7035static int ocfs2_xattr_security_set(struct inode *inode, const char *name, 7203static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
7036 const void *value, size_t size, int flags) 7204 const void *value, size_t size, int flags, int type)
7037{ 7205{
7038 if (strcmp(name, "") == 0) 7206 if (strcmp(name, "") == 0)
7039 return -EINVAL; 7207 return -EINVAL;
7040 7208
7041 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, 7209 return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
7042 size, flags); 7210 name, value, size, flags);
7043} 7211}
7044 7212
7045int ocfs2_init_security_get(struct inode *inode, 7213int ocfs2_init_security_get(struct inode *inode,
@@ -7076,9 +7244,9 @@ struct xattr_handler ocfs2_xattr_security_handler = {
7076/* 7244/*
7077 * 'trusted' attributes support 7245 * 'trusted' attributes support
7078 */ 7246 */
7079static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, 7247static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
7080 size_t list_size, const char *name, 7248 size_t list_size, const char *name,
7081 size_t name_len) 7249 size_t name_len, int type)
7082{ 7250{
7083 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; 7251 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
7084 const size_t total_len = prefix_len + name_len + 1; 7252 const size_t total_len = prefix_len + name_len + 1;
@@ -7091,23 +7259,23 @@ static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
7091 return total_len; 7259 return total_len;
7092} 7260}
7093 7261
7094static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name, 7262static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
7095 void *buffer, size_t size) 7263 void *buffer, size_t size, int type)
7096{ 7264{
7097 if (strcmp(name, "") == 0) 7265 if (strcmp(name, "") == 0)
7098 return -EINVAL; 7266 return -EINVAL;
7099 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, 7267 return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
7100 buffer, size); 7268 name, buffer, size);
7101} 7269}
7102 7270
7103static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name, 7271static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
7104 const void *value, size_t size, int flags) 7272 const void *value, size_t size, int flags, int type)
7105{ 7273{
7106 if (strcmp(name, "") == 0) 7274 if (strcmp(name, "") == 0)
7107 return -EINVAL; 7275 return -EINVAL;
7108 7276
7109 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, 7277 return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
7110 size, flags); 7278 name, value, size, flags);
7111} 7279}
7112 7280
7113struct xattr_handler ocfs2_xattr_trusted_handler = { 7281struct xattr_handler ocfs2_xattr_trusted_handler = {
@@ -7120,13 +7288,13 @@ struct xattr_handler ocfs2_xattr_trusted_handler = {
7120/* 7288/*
7121 * 'user' attributes support 7289 * 'user' attributes support
7122 */ 7290 */
7123static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, 7291static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
7124 size_t list_size, const char *name, 7292 size_t list_size, const char *name,
7125 size_t name_len) 7293 size_t name_len, int type)
7126{ 7294{
7127 const size_t prefix_len = XATTR_USER_PREFIX_LEN; 7295 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
7128 const size_t total_len = prefix_len + name_len + 1; 7296 const size_t total_len = prefix_len + name_len + 1;
7129 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 7297 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
7130 7298
7131 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) 7299 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
7132 return 0; 7300 return 0;
@@ -7139,31 +7307,31 @@ static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
7139 return total_len; 7307 return total_len;
7140} 7308}
7141 7309
7142static int ocfs2_xattr_user_get(struct inode *inode, const char *name, 7310static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
7143 void *buffer, size_t size) 7311 void *buffer, size_t size, int type)
7144{ 7312{
7145 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 7313 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
7146 7314
7147 if (strcmp(name, "") == 0) 7315 if (strcmp(name, "") == 0)
7148 return -EINVAL; 7316 return -EINVAL;
7149 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) 7317 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
7150 return -EOPNOTSUPP; 7318 return -EOPNOTSUPP;
7151 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, 7319 return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name,
7152 buffer, size); 7320 buffer, size);
7153} 7321}
7154 7322
7155static int ocfs2_xattr_user_set(struct inode *inode, const char *name, 7323static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
7156 const void *value, size_t size, int flags) 7324 const void *value, size_t size, int flags, int type)
7157{ 7325{
7158 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 7326 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
7159 7327
7160 if (strcmp(name, "") == 0) 7328 if (strcmp(name, "") == 0)
7161 return -EINVAL; 7329 return -EINVAL;
7162 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) 7330 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
7163 return -EOPNOTSUPP; 7331 return -EOPNOTSUPP;
7164 7332
7165 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, 7333 return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER,
7166 size, flags); 7334 name, value, size, flags);
7167} 7335}
7168 7336
7169struct xattr_handler ocfs2_xattr_user_handler = { 7337struct xattr_handler ocfs2_xattr_user_handler = {
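All three handler groups above now take a dentry plus a handler-private type argument instead of an inode. The struct body is cut off by the diff context; for orientation, the 'user' handler is presumably wired up along these lines (a sketch consistent with the callbacks shown):

struct xattr_handler ocfs2_xattr_user_handler = {
	.prefix	= XATTR_USER_PREFIX,
	.list	= ocfs2_xattr_user_list,
	.get	= ocfs2_xattr_user_get,
	.set	= ocfs2_xattr_user_set,
};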
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 08e36389f56d..abd72a47f520 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -40,10 +40,8 @@ struct ocfs2_security_xattr_info {
40extern struct xattr_handler ocfs2_xattr_user_handler; 40extern struct xattr_handler ocfs2_xattr_user_handler;
41extern struct xattr_handler ocfs2_xattr_trusted_handler; 41extern struct xattr_handler ocfs2_xattr_trusted_handler;
42extern struct xattr_handler ocfs2_xattr_security_handler; 42extern struct xattr_handler ocfs2_xattr_security_handler;
43#ifdef CONFIG_OCFS2_FS_POSIX_ACL
44extern struct xattr_handler ocfs2_xattr_acl_access_handler; 43extern struct xattr_handler ocfs2_xattr_acl_access_handler;
45extern struct xattr_handler ocfs2_xattr_acl_default_handler; 44extern struct xattr_handler ocfs2_xattr_acl_default_handler;
46#endif
47extern struct xattr_handler *ocfs2_xattr_handlers[]; 45extern struct xattr_handler *ocfs2_xattr_handlers[];
48 46
49ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index e1c0ec0ae989..082234581d05 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -85,7 +85,7 @@ out:
85} 85}
86 86
87/* 87/*
88 * Tries to allocate exactly one block. Returns true if sucessful. 88 * Tries to allocate exactly one block. Returns true if successful.
89 */ 89 */
90int omfs_allocate_block(struct super_block *sb, u64 block) 90int omfs_allocate_block(struct super_block *sb, u64 block)
91{ 91{
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index f3b7c1541f3a..c82af6acc2e7 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -6,11 +6,13 @@
6#include <linux/version.h> 6#include <linux/version.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include <linux/fs.h> 10#include <linux/fs.h>
10#include <linux/vfs.h> 11#include <linux/vfs.h>
11#include <linux/parser.h> 12#include <linux/parser.h>
12#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
13#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/writeback.h>
14#include <linux/crc-itu-t.h> 16#include <linux/crc-itu-t.h>
15#include "omfs.h" 17#include "omfs.h"
16 18
@@ -89,7 +91,7 @@ static void omfs_update_checksums(struct omfs_inode *oi)
89 oi->i_head.h_check_xor = xor; 91 oi->i_head.h_check_xor = xor;
90} 92}
91 93
92static int omfs_write_inode(struct inode *inode, int wait) 94static int __omfs_write_inode(struct inode *inode, int wait)
93{ 95{
94 struct omfs_inode *oi; 96 struct omfs_inode *oi;
95 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); 97 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
@@ -162,9 +164,14 @@ out:
162 return ret; 164 return ret;
163} 165}
164 166
167static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc)
168{
169 return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
170}
171
165int omfs_sync_inode(struct inode *inode) 172int omfs_sync_inode(struct inode *inode)
166{ 173{
167 return omfs_write_inode(inode, 1); 174 return __omfs_write_inode(inode, 1);
168} 175}
169 176
170/* 177/*
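The split keeps omfs_sync_inode() synchronous while letting writeback drive omfs_write_inode() with a writeback_control. Roughly, the two entry points map onto __omfs_write_inode() like this (illustrative snippet):

struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL };
omfs_write_inode(inode, &wbc);	/* wait == 1, same as omfs_sync_inode() */
wbc.sync_mode = WB_SYNC_NONE;
omfs_write_inode(inode, &wbc);	/* wait == 0: asynchronous writeout */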
diff --git a/fs/open.c b/fs/open.c
index 4f01e06227c6..74e5cd9f718e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -8,10 +8,8 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/file.h> 9#include <linux/file.h>
10#include <linux/fdtable.h> 10#include <linux/fdtable.h>
11#include <linux/quotaops.h>
12#include <linux/fsnotify.h> 11#include <linux/fsnotify.h>
13#include <linux/module.h> 12#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/tty.h> 13#include <linux/tty.h>
16#include <linux/namei.h> 14#include <linux/namei.h>
17#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
@@ -21,6 +19,7 @@
21#include <linux/mount.h> 19#include <linux/mount.h>
22#include <linux/vfs.h> 20#include <linux/vfs.h>
23#include <linux/fcntl.h> 21#include <linux/fcntl.h>
22#include <linux/slab.h>
24#include <asm/uaccess.h> 23#include <asm/uaccess.h>
25#include <linux/fs.h> 24#include <linux/fs.h>
26#include <linux/personality.h> 25#include <linux/personality.h>
@@ -30,6 +29,9 @@
30#include <linux/audit.h> 29#include <linux/audit.h>
31#include <linux/falloc.h> 30#include <linux/falloc.h>
32#include <linux/fs_struct.h> 31#include <linux/fs_struct.h>
32#include <linux/ima.h>
33
34#include "internal.h"
33 35
34int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) 36int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
35{ 37{
@@ -268,17 +270,15 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
268 * Make sure that there are no leases. get_write_access() protects 270 * Make sure that there are no leases. get_write_access() protects
269 * against the truncate racing with a lease-granting setlease(). 271 * against the truncate racing with a lease-granting setlease().
270 */ 272 */
271 error = break_lease(inode, FMODE_WRITE); 273 error = break_lease(inode, O_WRONLY);
272 if (error) 274 if (error)
273 goto put_write_and_out; 275 goto put_write_and_out;
274 276
275 error = locks_verify_truncate(inode, NULL, length); 277 error = locks_verify_truncate(inode, NULL, length);
276 if (!error) 278 if (!error)
277 error = security_path_truncate(&path, length, 0); 279 error = security_path_truncate(&path, length, 0);
278 if (!error) { 280 if (!error)
279 vfs_dq_init(inode);
280 error = do_truncate(path.dentry, length, 0, NULL); 281 error = do_truncate(path.dentry, length, 0, NULL);
281 }
282 282
283put_write_and_out: 283put_write_and_out:
284 put_write_access(inode); 284 put_write_access(inode);
@@ -587,6 +587,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
587 error = -EPERM; 587 error = -EPERM;
588 if (!capable(CAP_SYS_CHROOT)) 588 if (!capable(CAP_SYS_CHROOT))
589 goto dput_and_out; 589 goto dput_and_out;
590 error = security_path_chroot(&path);
591 if (error)
592 goto dput_and_out;
590 593
591 set_fs_root(current->fs, &path); 594 set_fs_root(current->fs, &path);
592 error = 0; 595 error = 0;
@@ -617,11 +620,15 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
617 if (err) 620 if (err)
618 goto out_putf; 621 goto out_putf;
619 mutex_lock(&inode->i_mutex); 622 mutex_lock(&inode->i_mutex);
623 err = security_path_chmod(dentry, file->f_vfsmnt, mode);
624 if (err)
625 goto out_unlock;
620 if (mode == (mode_t) -1) 626 if (mode == (mode_t) -1)
621 mode = inode->i_mode; 627 mode = inode->i_mode;
622 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); 628 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
623 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 629 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
624 err = notify_change(dentry, &newattrs); 630 err = notify_change(dentry, &newattrs);
631out_unlock:
625 mutex_unlock(&inode->i_mutex); 632 mutex_unlock(&inode->i_mutex);
626 mnt_drop_write(file->f_path.mnt); 633 mnt_drop_write(file->f_path.mnt);
627out_putf: 634out_putf:
@@ -646,11 +653,15 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
646 if (error) 653 if (error)
647 goto dput_and_out; 654 goto dput_and_out;
648 mutex_lock(&inode->i_mutex); 655 mutex_lock(&inode->i_mutex);
656 error = security_path_chmod(path.dentry, path.mnt, mode);
657 if (error)
658 goto out_unlock;
649 if (mode == (mode_t) -1) 659 if (mode == (mode_t) -1)
650 mode = inode->i_mode; 660 mode = inode->i_mode;
651 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); 661 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
652 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 662 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
653 error = notify_change(path.dentry, &newattrs); 663 error = notify_change(path.dentry, &newattrs);
664out_unlock:
654 mutex_unlock(&inode->i_mutex); 665 mutex_unlock(&inode->i_mutex);
655 mnt_drop_write(path.mnt); 666 mnt_drop_write(path.mnt);
656dput_and_out: 667dput_and_out:
@@ -664,9 +675,9 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
664 return sys_fchmodat(AT_FDCWD, filename, mode); 675 return sys_fchmodat(AT_FDCWD, filename, mode);
665} 676}
666 677
667static int chown_common(struct dentry * dentry, uid_t user, gid_t group) 678static int chown_common(struct path *path, uid_t user, gid_t group)
668{ 679{
669 struct inode *inode = dentry->d_inode; 680 struct inode *inode = path->dentry->d_inode;
670 int error; 681 int error;
671 struct iattr newattrs; 682 struct iattr newattrs;
672 683
@@ -683,7 +694,9 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
683 newattrs.ia_valid |= 694 newattrs.ia_valid |=
684 ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; 695 ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
685 mutex_lock(&inode->i_mutex); 696 mutex_lock(&inode->i_mutex);
686 error = notify_change(dentry, &newattrs); 697 error = security_path_chown(path, user, group);
698 if (!error)
699 error = notify_change(path->dentry, &newattrs);
687 mutex_unlock(&inode->i_mutex); 700 mutex_unlock(&inode->i_mutex);
688 701
689 return error; 702 return error;
@@ -700,7 +713,7 @@ SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
700 error = mnt_want_write(path.mnt); 713 error = mnt_want_write(path.mnt);
701 if (error) 714 if (error)
702 goto out_release; 715 goto out_release;
703 error = chown_common(path.dentry, user, group); 716 error = chown_common(&path, user, group);
704 mnt_drop_write(path.mnt); 717 mnt_drop_write(path.mnt);
705out_release: 718out_release:
706 path_put(&path); 719 path_put(&path);
@@ -725,7 +738,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
725 error = mnt_want_write(path.mnt); 738 error = mnt_want_write(path.mnt);
726 if (error) 739 if (error)
727 goto out_release; 740 goto out_release;
728 error = chown_common(path.dentry, user, group); 741 error = chown_common(&path, user, group);
729 mnt_drop_write(path.mnt); 742 mnt_drop_write(path.mnt);
730out_release: 743out_release:
731 path_put(&path); 744 path_put(&path);
@@ -744,7 +757,7 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group
744 error = mnt_want_write(path.mnt); 757 error = mnt_want_write(path.mnt);
745 if (error) 758 if (error)
746 goto out_release; 759 goto out_release;
747 error = chown_common(path.dentry, user, group); 760 error = chown_common(&path, user, group);
748 mnt_drop_write(path.mnt); 761 mnt_drop_write(path.mnt);
749out_release: 762out_release:
750 path_put(&path); 763 path_put(&path);
@@ -767,7 +780,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
767 goto out_fput; 780 goto out_fput;
768 dentry = file->f_path.dentry; 781 dentry = file->f_path.dentry;
769 audit_inode(NULL, dentry); 782 audit_inode(NULL, dentry);
770 error = chown_common(dentry, user, group); 783 error = chown_common(&file->f_path, user, group);
771 mnt_drop_write(file->f_path.mnt); 784 mnt_drop_write(file->f_path.mnt);
772out_fput: 785out_fput:
773 fput(file); 786 fput(file);
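chown_common() switches from a bare dentry to a struct path because the new security_path_chown() hook needs the vfsmount as well as the dentry, and struct path carries both:

struct path {
	struct vfsmount *mnt;	/* the mount the object is reached through */
	struct dentry *dentry;	/* the object itself */
};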
@@ -805,15 +818,14 @@ static inline int __get_file_write_access(struct inode *inode,
805} 818}
806 819
807static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, 820static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
808 int flags, struct file *f, 821 struct file *f,
809 int (*open)(struct inode *, struct file *), 822 int (*open)(struct inode *, struct file *),
810 const struct cred *cred) 823 const struct cred *cred)
811{ 824{
812 struct inode *inode; 825 struct inode *inode;
813 int error; 826 int error;
814 827
815 f->f_flags = flags; 828 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
816 f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK |
817 FMODE_PREAD | FMODE_PWRITE; 829 FMODE_PREAD | FMODE_PWRITE;
818 inode = dentry->d_inode; 830 inode = dentry->d_inode;
819 if (f->f_mode & FMODE_WRITE) { 831 if (f->f_mode & FMODE_WRITE) {
@@ -842,6 +854,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
842 if (error) 854 if (error)
843 goto cleanup_all; 855 goto cleanup_all;
844 } 856 }
857 ima_counts_get(f);
845 858
846 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 859 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
847 860
@@ -913,7 +926,6 @@ struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry
913 if (IS_ERR(dentry)) 926 if (IS_ERR(dentry))
914 goto out_err; 927 goto out_err;
915 nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), 928 nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt),
916 nd->intent.open.flags - 1,
917 nd->intent.open.file, 929 nd->intent.open.file,
918 open, cred); 930 open, cred);
919out: 931out:
@@ -932,7 +944,7 @@ EXPORT_SYMBOL_GPL(lookup_instantiate_filp);
932 * 944 *
933 * Note that this function destroys the original nameidata 945 * Note that this function destroys the original nameidata
934 */ 946 */
935struct file *nameidata_to_filp(struct nameidata *nd, int flags) 947struct file *nameidata_to_filp(struct nameidata *nd)
936{ 948{
937 const struct cred *cred = current_cred(); 949 const struct cred *cred = current_cred();
938 struct file *filp; 950 struct file *filp;
@@ -941,7 +953,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags)
941 filp = nd->intent.open.file; 953 filp = nd->intent.open.file;
942 /* Has the filesystem initialised the file for us? */ 954 /* Has the filesystem initialised the file for us? */
943 if (filp->f_path.dentry == NULL) 955 if (filp->f_path.dentry == NULL)
944 filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp, 956 filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
945 NULL, cred); 957 NULL, cred);
946 else 958 else
947 path_put(&nd->path); 959 path_put(&nd->path);
@@ -980,7 +992,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
980 return ERR_PTR(error); 992 return ERR_PTR(error);
981 } 993 }
982 994
983 return __dentry_open(dentry, mnt, flags, f, NULL, cred); 995 f->f_flags = flags;
996 return __dentry_open(dentry, mnt, f, NULL, cred);
984} 997}
985EXPORT_SYMBOL(dentry_open); 998EXPORT_SYMBOL(dentry_open);
986 999
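The (flags + 1) & O_ACCMODE expression deleted from __dentry_open() survives as the OPEN_FMODE() helper used above. Assuming it matches the code it replaces, its definition is roughly:

/* O_RDONLY = 0, O_WRONLY = 1, O_RDWR = 2; adding 1 turns these into
 * the FMODE_READ (1) / FMODE_WRITE (2) bit pattern. */
#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE))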
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e10cbad..e238ab23a9e7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -16,6 +16,7 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/slab.h>
19#include <linux/kmod.h> 20#include <linux/kmod.h>
20#include <linux/ctype.h> 21#include <linux/ctype.h>
21#include <linux/genhd.h> 22#include <linux/genhd.h>
@@ -226,6 +227,13 @@ ssize_t part_alignment_offset_show(struct device *dev,
226 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); 227 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
227} 228}
228 229
230ssize_t part_discard_alignment_show(struct device *dev,
231 struct device_attribute *attr, char *buf)
232{
233 struct hd_struct *p = dev_to_part(dev);
234 return sprintf(buf, "%u\n", p->discard_alignment);
235}
236
229ssize_t part_stat_show(struct device *dev, 237ssize_t part_stat_show(struct device *dev,
230 struct device_attribute *attr, char *buf) 238 struct device_attribute *attr, char *buf)
231{ 239{
@@ -288,6 +296,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
288static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 296static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
289static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 297static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
290static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 298static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
299static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
300 NULL);
291static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 301static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
292static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); 302static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
293#ifdef CONFIG_FAIL_MAKE_REQUEST 303#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -300,6 +310,7 @@ static struct attribute *part_attrs[] = {
300 &dev_attr_start.attr, 310 &dev_attr_start.attr,
301 &dev_attr_size.attr, 311 &dev_attr_size.attr,
302 &dev_attr_alignment_offset.attr, 312 &dev_attr_alignment_offset.attr,
313 &dev_attr_discard_alignment.attr,
303 &dev_attr_stat.attr, 314 &dev_attr_stat.attr,
304 &dev_attr_inflight.attr, 315 &dev_attr_inflight.attr,
305#ifdef CONFIG_FAIL_MAKE_REQUEST 316#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -402,7 +413,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
402 pdev = part_to_dev(p); 413 pdev = part_to_dev(p);
403 414
404 p->start_sect = start; 415 p->start_sect = start;
405 p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); 416 p->alignment_offset =
417 queue_limit_alignment_offset(&disk->queue->limits, start);
418 p->discard_alignment =
419 queue_limit_discard_alignment(&disk->queue->limits, start);
406 p->nr_sects = len; 420 p->nr_sects = len;
407 p->partno = partno; 421 p->partno = partno;
408 p->policy = get_disk_ro(disk); 422 p->policy = get_disk_ro(disk);
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 038a6022152f..91babdae7587 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -1,7 +1,9 @@
1/************************************************************ 1/************************************************************
2 * EFI GUID Partition Table handling 2 * EFI GUID Partition Table handling
3 * Per Intel EFI Specification v1.02 3 *
4 * http://developer.intel.com/technology/efi/efi.htm 4 * http://www.uefi.org/specs/
5 * http://www.intel.com/technology/efi/
6 *
5 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> 7 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
6 * Copyright 2000,2001,2002,2004 Dell Inc. 8 * Copyright 2000,2001,2002,2004 Dell Inc.
7 * 9 *
@@ -92,6 +94,8 @@
92 * 94 *
93 ************************************************************/ 95 ************************************************************/
94#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/math64.h>
98#include <linux/slab.h>
95#include "check.h" 99#include "check.h"
96#include "efi.h" 100#include "efi.h"
97 101
@@ -141,7 +145,8 @@ last_lba(struct block_device *bdev)
141{ 145{
142 if (!bdev || !bdev->bd_inode) 146 if (!bdev || !bdev->bd_inode)
143 return 0; 147 return 0;
144 return (bdev->bd_inode->i_size >> 9) - 1ULL; 148 return div_u64(bdev->bd_inode->i_size,
149 bdev_logical_block_size(bdev)) - 1ULL;
145} 150}
146 151
147static inline int 152static inline int
@@ -188,6 +193,7 @@ static size_t
188read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) 193read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
189{ 194{
190 size_t totalreadcount = 0; 195 size_t totalreadcount = 0;
196 sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
191 197
192 if (!bdev || !buffer || lba > last_lba(bdev)) 198 if (!bdev || !buffer || lba > last_lba(bdev))
193 return 0; 199 return 0;
@@ -195,7 +201,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
195 while (count) { 201 while (count) {
196 int copied = 512; 202 int copied = 512;
197 Sector sect; 203 Sector sect;
198 unsigned char *data = read_dev_sector(bdev, lba++, &sect); 204 unsigned char *data = read_dev_sector(bdev, n++, &sect);
199 if (!data) 205 if (!data)
200 break; 206 break;
201 if (copied > count) 207 if (copied > count)
@@ -257,15 +263,16 @@ static gpt_header *
257alloc_read_gpt_header(struct block_device *bdev, u64 lba) 263alloc_read_gpt_header(struct block_device *bdev, u64 lba)
258{ 264{
259 gpt_header *gpt; 265 gpt_header *gpt;
266 unsigned ssz = bdev_logical_block_size(bdev);
267
260 if (!bdev) 268 if (!bdev)
261 return NULL; 269 return NULL;
262 270
263 gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL); 271 gpt = kzalloc(ssz, GFP_KERNEL);
264 if (!gpt) 272 if (!gpt)
265 return NULL; 273 return NULL;
266 274
267 if (read_lba(bdev, lba, (u8 *) gpt, 275 if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
268 sizeof (gpt_header)) < sizeof (gpt_header)) {
269 kfree(gpt); 276 kfree(gpt);
270 gpt=NULL; 277 gpt=NULL;
271 return NULL; 278 return NULL;
@@ -601,6 +608,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
601 gpt_header *gpt = NULL; 608 gpt_header *gpt = NULL;
602 gpt_entry *ptes = NULL; 609 gpt_entry *ptes = NULL;
603 u32 i; 610 u32 i;
611 unsigned ssz = bdev_logical_block_size(bdev) / 512;
604 612
605 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { 613 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
606 kfree(gpt); 614 kfree(gpt);
@@ -611,13 +619,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
611 pr_debug("GUID Partition Table is valid! Yea!\n"); 619 pr_debug("GUID Partition Table is valid! Yea!\n");
612 620
613 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { 621 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
622 u64 start = le64_to_cpu(ptes[i].starting_lba);
623 u64 size = le64_to_cpu(ptes[i].ending_lba) -
624 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
625
614 if (!is_pte_valid(&ptes[i], last_lba(bdev))) 626 if (!is_pte_valid(&ptes[i], last_lba(bdev)))
615 continue; 627 continue;
616 628
617 put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba), 629 put_partition(state, i+1, start * ssz, size * ssz);
618 (le64_to_cpu(ptes[i].ending_lba) -
619 le64_to_cpu(ptes[i].starting_lba) +
620 1ULL));
621 630
622 /* If this is a RAID volume, tell md */ 631 /* If this is a RAID volume, tell md */
623 if (!efi_guidcmp(ptes[i].partition_type_guid, 632 if (!efi_guidcmp(ptes[i].partition_type_guid,
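Every efi.c change above scales GPT LBAs by the device's logical block size instead of hard-coding 512-byte sectors. A worked example with illustrative numbers for a 4096-byte-sector device:

unsigned ssz_bytes = 4096;	/* bdev_logical_block_size(bdev) */
unsigned ssz = ssz_bytes / 512;	/* 8 kernel sectors per LBA */
u64 gpt_lba = 2;		/* e.g. the partition entry array */
sector_t n = gpt_lba * ssz;	/* read_dev_sector() offset: 16 */
u64 part_lbas = 1000;
u64 part_sects = part_lbas * ssz; /* put_partition() length: 8000 */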
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 2cc89d0475bf..6998b589abf9 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -37,7 +37,6 @@
37#define EFI_PMBR_OSTYPE_EFI 0xEF 37#define EFI_PMBR_OSTYPE_EFI 0xEF
38#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE 38#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
39 39
40#define GPT_BLOCK_SIZE 512
41#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL 40#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
42#define GPT_HEADER_REVISION_V1 0x00010000 41#define GPT_HEADER_REVISION_V1 0x00010000
43#define GPT_PRIMARY_PARTITION_TABLE_LBA 1 42#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@@ -79,7 +78,12 @@ typedef struct _gpt_header {
79 __le32 num_partition_entries; 78 __le32 num_partition_entries;
80 __le32 sizeof_partition_entry; 79 __le32 sizeof_partition_entry;
81 __le32 partition_entry_array_crc32; 80 __le32 partition_entry_array_crc32;
82 u8 reserved2[GPT_BLOCK_SIZE - 92]; 81
82 /* The rest of the logical block is reserved by UEFI and must be zero.
83 * EFI standard handles this by:
84 *
85 * uint8_t reserved2[ BlockSize - 92 ];
86 */
83} __attribute__ ((packed)) gpt_header; 87} __attribute__ ((packed)) gpt_header;
84 88
85typedef struct _gpt_entry_attributes { 89typedef struct _gpt_entry_attributes {
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..90be97f1f5a8 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
31 */ 31 */
32#include <asm/unaligned.h> 32#include <asm/unaligned.h>
33 33
34#define SYS_IND(p) (get_unaligned(&p->sys_ind)) 34#define SYS_IND(p) get_unaligned(&p->sys_ind)
35#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
36 le32_to_cpu(__a); \
37 })
38 35
39#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \ 36static inline sector_t nr_sects(struct partition *p)
40 le32_to_cpu(__a); \ 37{
41 }) 38 return (sector_t)get_unaligned_le32(&p->nr_sects);
39}
40
41static inline sector_t start_sect(struct partition *p)
42{
43 return (sector_t)get_unaligned_le32(&p->start_sect);
44}
42 45
43static inline int is_extended_partition(struct partition *p) 46static inline int is_extended_partition(struct partition *p)
44{ 47{
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
104 107
105static void 108static void
106parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109parse_extended(struct parsed_partitions *state, struct block_device *bdev,
107 u32 first_sector, u32 first_size) 110 sector_t first_sector, sector_t first_size)
108{ 111{
109 struct partition *p; 112 struct partition *p;
110 Sector sect; 113 Sector sect;
111 unsigned char *data; 114 unsigned char *data;
112 u32 this_sector, this_size; 115 sector_t this_sector, this_size;
113 int sector_size = bdev_logical_block_size(bdev) / 512; 116 sector_t sector_size = bdev_logical_block_size(bdev) / 512;
114 int loopct = 0; /* number of links followed 117 int loopct = 0; /* number of links followed
115 without finding a data partition */ 118 without finding a data partition */
116 int i; 119 int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
145 * First process the data partition(s) 148 * First process the data partition(s)
146 */ 149 */
147 for (i=0; i<4; i++, p++) { 150 for (i=0; i<4; i++, p++) {
148 u32 offs, size, next; 151 sector_t offs, size, next;
149 if (!NR_SECTS(p) || is_extended_partition(p)) 152 if (!nr_sects(p) || is_extended_partition(p))
150 continue; 153 continue;
151 154
152 /* Check the 3rd and 4th entries - 155 /* Check the 3rd and 4th entries -
153 these sometimes contain random garbage */ 156 these sometimes contain random garbage */
154 offs = START_SECT(p)*sector_size; 157 offs = start_sect(p)*sector_size;
155 size = NR_SECTS(p)*sector_size; 158 size = nr_sects(p)*sector_size;
156 next = this_sector + offs; 159 next = this_sector + offs;
157 if (i >= 2) { 160 if (i >= 2) {
158 if (offs + size > this_size) 161 if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
179 */ 182 */
180 p -= 4; 183 p -= 4;
181 for (i=0; i<4; i++, p++) 184 for (i=0; i<4; i++, p++)
182 if (NR_SECTS(p) && is_extended_partition(p)) 185 if (nr_sects(p) && is_extended_partition(p))
183 break; 186 break;
184 if (i == 4) 187 if (i == 4)
185 goto done; /* nothing left to do */ 188 goto done; /* nothing left to do */
186 189
187 this_sector = first_sector + START_SECT(p) * sector_size; 190 this_sector = first_sector + start_sect(p) * sector_size;
188 this_size = NR_SECTS(p) * sector_size; 191 this_size = nr_sects(p) * sector_size;
189 put_dev_sector(sect); 192 put_dev_sector(sect);
190 } 193 }
191done: 194done:
@@ -197,7 +200,7 @@ done:
197 200
198static void 201static void
199parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
200 u32 offset, u32 size, int origin) 203 sector_t offset, sector_t size, int origin)
201{ 204{
202#ifdef CONFIG_SOLARIS_X86_PARTITION 205#ifdef CONFIG_SOLARIS_X86_PARTITION
203 Sector sect; 206 Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
244 */ 247 */
245static void 248static void
246parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 249parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
247 u32 offset, u32 size, int origin, char *flavour, 250 sector_t offset, sector_t size, int origin, char *flavour,
248 int max_partitions) 251 int max_partitions)
249{ 252{
250 Sector sect; 253 Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
263 if (le16_to_cpu(l->d_npartitions) < max_partitions) 266 if (le16_to_cpu(l->d_npartitions) < max_partitions)
264 max_partitions = le16_to_cpu(l->d_npartitions); 267 max_partitions = le16_to_cpu(l->d_npartitions);
265 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { 268 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
266 u32 bsd_start, bsd_size; 269 sector_t bsd_start, bsd_size;
267 270
268 if (state->next == state->limit) 271 if (state->next == state->limit)
269 break; 272 break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
290 293
291static void 294static void
292parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
293 u32 offset, u32 size, int origin) 296 sector_t offset, sector_t size, int origin)
294{ 297{
295#ifdef CONFIG_BSD_DISKLABEL 298#ifdef CONFIG_BSD_DISKLABEL
296 parse_bsd(state, bdev, offset, size, origin, 299 parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
300 303
301static void 304static void
302parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
303 u32 offset, u32 size, int origin) 306 sector_t offset, sector_t size, int origin)
304{ 307{
305#ifdef CONFIG_BSD_DISKLABEL 308#ifdef CONFIG_BSD_DISKLABEL
306 parse_bsd(state, bdev, offset, size, origin, 309 parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
310 313
311static void 314static void
312parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
313 u32 offset, u32 size, int origin) 316 sector_t offset, sector_t size, int origin)
314{ 317{
315#ifdef CONFIG_BSD_DISKLABEL 318#ifdef CONFIG_BSD_DISKLABEL
316 parse_bsd(state, bdev, offset, size, origin, 319 parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
324 */ 327 */
325static void 328static void
326parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 329parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
327 u32 offset, u32 size, int origin) 330 sector_t offset, sector_t size, int origin)
328{ 331{
329#ifdef CONFIG_UNIXWARE_DISKLABEL 332#ifdef CONFIG_UNIXWARE_DISKLABEL
330 Sector sect; 333 Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
348 351
349 if (p->s_label != UNIXWARE_FS_UNUSED) 352 if (p->s_label != UNIXWARE_FS_UNUSED)
350 put_partition(state, state->next++, 353 put_partition(state, state->next++,
351 START_SECT(p), NR_SECTS(p)); 354 le32_to_cpu(p->start_sect),
355 le32_to_cpu(p->nr_sects));
352 p++; 356 p++;
353 } 357 }
354 put_dev_sector(sect); 358 put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
363 */ 367 */
364static void 368static void
365parse_minix(struct parsed_partitions *state, struct block_device *bdev, 369parse_minix(struct parsed_partitions *state, struct block_device *bdev,
366 u32 offset, u32 size, int origin) 370 sector_t offset, sector_t size, int origin)
367{ 371{
368#ifdef CONFIG_MINIX_SUBPARTITION 372#ifdef CONFIG_MINIX_SUBPARTITION
369 Sector sect; 373 Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
390 /* add each partition in use */ 394 /* add each partition in use */
391 if (SYS_IND(p) == MINIX_PARTITION) 395 if (SYS_IND(p) == MINIX_PARTITION)
392 put_partition(state, state->next++, 396 put_partition(state, state->next++,
393 START_SECT(p), NR_SECTS(p)); 397 start_sect(p), nr_sects(p));
394 } 398 }
395 printk(" >\n"); 399 printk(" >\n");
396 } 400 }
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
401static struct { 405static struct {
402 unsigned char id; 406 unsigned char id;
403 void (*parse)(struct parsed_partitions *, struct block_device *, 407 void (*parse)(struct parsed_partitions *, struct block_device *,
404 u32, u32, int); 408 sector_t, sector_t, int);
405} subtypes[] = { 409} subtypes[] = {
406 {FREEBSD_PARTITION, parse_freebsd}, 410 {FREEBSD_PARTITION, parse_freebsd},
407 {NETBSD_PARTITION, parse_netbsd}, 411 {NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
415 419
416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
417{ 421{
418 int sector_size = bdev_logical_block_size(bdev) / 512; 422 sector_t sector_size = bdev_logical_block_size(bdev) / 512;
419 Sector sect; 423 Sector sect;
420 unsigned char *data; 424 unsigned char *data;
421 struct partition *p; 425 struct partition *p;
@@ -483,14 +487,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
483 487
484 state->next = 5; 488 state->next = 5;
485 for (slot = 1 ; slot <= 4 ; slot++, p++) { 489 for (slot = 1 ; slot <= 4 ; slot++, p++) {
486 u32 start = START_SECT(p)*sector_size; 490 sector_t start = start_sect(p)*sector_size;
487 u32 size = NR_SECTS(p)*sector_size; 491 sector_t size = nr_sects(p)*sector_size;
488 if (!size) 492 if (!size)
489 continue; 493 continue;
490 if (is_extended_partition(p)) { 494 if (is_extended_partition(p)) {
491 /* prevent someone doing mkfs or mkswap on an 495 /*
492 extended partition, but leave room for LILO */ 496 * prevent someone doing mkfs or mkswap on an
493 put_partition(state, slot, start, size == 1 ? 1 : 2); 497 * extended partition, but leave room for LILO
 498 * FIXME: this reserves one logical sector for devices with
 499 * sectors larger than 512 bytes, which may not be enough or proper.
500 */
501 sector_t n = 2;
502 n = min(size, max(sector_size, n));
503 put_partition(state, slot, start, n);
504
494 printk(" <"); 505 printk(" <");
495 parse_extended(state, bdev, start, size); 506 parse_extended(state, bdev, start, size);
496 printk(" >"); 507 printk(" >");
@@ -513,7 +524,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
513 unsigned char id = SYS_IND(p); 524 unsigned char id = SYS_IND(p);
514 int n; 525 int n;
515 526
516 if (!NR_SECTS(p)) 527 if (!nr_sects(p))
517 continue; 528 continue;
518 529
519 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) 530 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +532,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
521 532
522 if (!subtypes[n].parse) 533 if (!subtypes[n].parse)
523 continue; 534 continue;
524 subtypes[n].parse(state, bdev, START_SECT(p)*sector_size, 535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
525 NR_SECTS(p)*sector_size, slot); 536 nr_sects(p)*sector_size, slot);
526 } 537 }
527 put_dev_sector(sect); 538 put_dev_sector(sect);
528 return 1; 539 return 1;
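The msdos.c hunks above retire the SYS_IND()/NR_SECTS()/START_SECT() statement-expression macros in favour of inline helpers that return sector_t, widening offset and size arithmetic past 32 bits and reading the packed, potentially unaligned MBR fields with get_unaligned_le32(). A minimal userspace sketch of the same pattern follows; the struct layout and the values are illustrative stand-ins, not the kernel's definitions.

#include <stdint.h>
#include <stdio.h>

struct mbr_entry {                 /* simplified stand-in, not the kernel's */
    uint8_t boot_ind;
    uint8_t chs_start[3];
    uint8_t sys_ind;
    uint8_t chs_end[3];
    uint8_t start_sect[4];         /* little-endian, possibly unaligned */
    uint8_t nr_sects[4];           /* little-endian, possibly unaligned */
} __attribute__((packed));

static uint32_t get_unaligned_le32(const uint8_t *b)
{
    /* byte-wise assembly is safe at any alignment and host endianness */
    return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
           (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

int main(void)
{
    struct mbr_entry e = {
        .start_sect = { 0x00, 0x08, 0x00, 0x00 },   /* 2048 */
        .nr_sects   = { 0x00, 0x00, 0x10, 0x00 },   /* 1048576 */
    };
    uint64_t sector_size = 8;      /* 4096-byte logical sectors */

    /* widen before multiplying, as the sector_t helpers now do */
    uint64_t start = (uint64_t)get_unaligned_le32(e.start_sect) * sector_size;
    uint64_t size  = (uint64_t)get_unaligned_le32(e.nr_sects) * sector_size;

    printf("start=%llu size=%llu\n",
           (unsigned long long)start, (unsigned long long)size);
    return 0;
}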
diff --git a/fs/pipe.c b/fs/pipe.c
index ae17d026aaa3..37ba29ff3158 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -906,17 +906,6 @@ void free_pipe_info(struct inode *inode)
906} 906}
907 907
908static struct vfsmount *pipe_mnt __read_mostly; 908static struct vfsmount *pipe_mnt __read_mostly;
909static int pipefs_delete_dentry(struct dentry *dentry)
910{
911 /*
912 * At creation time, we pretended this dentry was hashed
913 * (by clearing DCACHE_UNHASHED bit in d_flags)
914 * At delete time, we restore the truth : not hashed.
915 * (so that dput() can proceed correctly)
916 */
917 dentry->d_flags |= DCACHE_UNHASHED;
918 return 0;
919}
920 909
921/* 910/*
922 * pipefs_dname() is called from d_path(). 911 * pipefs_dname() is called from d_path().
@@ -928,7 +917,6 @@ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
928} 917}
929 918
930static const struct dentry_operations pipefs_dentry_operations = { 919static const struct dentry_operations pipefs_dentry_operations = {
931 .d_delete = pipefs_delete_dentry,
932 .d_dname = pipefs_dname, 920 .d_dname = pipefs_dname,
933}; 921};
934 922
@@ -974,7 +962,7 @@ struct file *create_write_pipe(int flags)
974 int err; 962 int err;
975 struct inode *inode; 963 struct inode *inode;
976 struct file *f; 964 struct file *f;
977 struct dentry *dentry; 965 struct path path;
978 struct qstr name = { .name = "" }; 966 struct qstr name = { .name = "" };
979 967
980 err = -ENFILE; 968 err = -ENFILE;
@@ -983,21 +971,16 @@ struct file *create_write_pipe(int flags)
983 goto err; 971 goto err;
984 972
985 err = -ENOMEM; 973 err = -ENOMEM;
986 dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); 974 path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
987 if (!dentry) 975 if (!path.dentry)
988 goto err_inode; 976 goto err_inode;
977 path.mnt = mntget(pipe_mnt);
989 978
990 dentry->d_op = &pipefs_dentry_operations; 979 path.dentry->d_op = &pipefs_dentry_operations;
991 /* 980 d_instantiate(path.dentry, inode);
992 * We dont want to publish this dentry into global dentry hash table.
993 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
994 * This permits a working /proc/$pid/fd/XXX on pipes
995 */
996 dentry->d_flags &= ~DCACHE_UNHASHED;
997 d_instantiate(dentry, inode);
998 981
999 err = -ENFILE; 982 err = -ENFILE;
1000 f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipefifo_fops); 983 f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
1001 if (!f) 984 if (!f)
1002 goto err_dentry; 985 goto err_dentry;
1003 f->f_mapping = inode->i_mapping; 986 f->f_mapping = inode->i_mapping;
@@ -1009,7 +992,7 @@ struct file *create_write_pipe(int flags)
1009 992
1010 err_dentry: 993 err_dentry:
1011 free_pipe_info(inode); 994 free_pipe_info(inode);
1012 dput(dentry); 995 path_put(&path);
1013 return ERR_PTR(err); 996 return ERR_PTR(err);
1014 997
1015 err_inode: 998 err_inode:
@@ -1028,20 +1011,14 @@ void free_write_pipe(struct file *f)
1028 1011
1029struct file *create_read_pipe(struct file *wrf, int flags) 1012struct file *create_read_pipe(struct file *wrf, int flags)
1030{ 1013{
1031 struct file *f = get_empty_filp(); 1014 /* Grab pipe from the writer */
1015 struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
1016 &read_pipefifo_fops);
1032 if (!f) 1017 if (!f)
1033 return ERR_PTR(-ENFILE); 1018 return ERR_PTR(-ENFILE);
1034 1019
1035 /* Grab pipe from the writer */
1036 f->f_path = wrf->f_path;
1037 path_get(&wrf->f_path); 1020 path_get(&wrf->f_path);
1038 f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
1039
1040 f->f_pos = 0;
1041 f->f_flags = O_RDONLY | (flags & O_NONBLOCK); 1021 f->f_flags = O_RDONLY | (flags & O_NONBLOCK);
1042 f->f_op = &read_pipefifo_fops;
1043 f->f_mode = FMODE_READ;
1044 f->f_version = 0;
1045 1022
1046 return f; 1023 return f;
1047} 1024}
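The pipe.c change switches create_write_pipe() to carry a struct path (dentry plus vfsmount) and lets create_read_pipe() clone the writer's f_path through alloc_file() instead of hand-filling an empty file. The sketch below models the reference discipline this relies on, with toy types standing in for the kernel's path_get()/path_put().

#include <stdio.h>

struct path { int refs; };                     /* toy stand-in */
struct file { struct path *p; };

static void path_get(struct path *p) { p->refs++; }

static void path_put(struct path *p)
{
    if (--p->refs == 0)
        printf("last reference dropped, path freed\n");
}

int main(void)
{
    struct path pipe_path = { .refs = 1 };     /* held by the write end */
    struct file wr = { .p = &pipe_path };
    struct file rd = { .p = wr.p };            /* read end shares the path */

    path_get(rd.p);                            /* so it takes its own ref */
    path_put(rd.p);                            /* close the reader */
    path_put(wr.p);                            /* close the writer: freed */
    return 0;
}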
diff --git a/fs/pnode.c b/fs/pnode.c
index 8d5f392ec3d3..5cc564a83149 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -86,7 +86,7 @@ static int do_make_slave(struct vfsmount *mnt)
86 86
87 /* 87 /*
88 * slave 'mnt' to a peer mount that has the 88 * slave 'mnt' to a peer mount that has the
89 * same root dentry. If none is available than 89 * same root dentry. If none is available then
90 * slave it to anything that is available. 90 * slave it to anything that is available.
91 */ 91 */
92 while ((peer_mnt = next_peer(peer_mnt)) != mnt && 92 while ((peer_mnt = next_peer(peer_mnt)) != mnt &&
@@ -147,6 +147,11 @@ void change_mnt_propagation(struct vfsmount *mnt, int type)
147 * get the next mount in the propagation tree. 147 * get the next mount in the propagation tree.
148 * @m: the mount seen last 148 * @m: the mount seen last
149 * @origin: the original mount from where the tree walk initiated 149 * @origin: the original mount from where the tree walk initiated
150 *
151 * Note that peer groups form contiguous segments of slave lists.
152 * We rely on that in get_source() to be able to find out if
 153 * a vfsmount found while iterating with propagation_next() is
154 * a peer of one we'd found earlier.
150 */ 155 */
151static struct vfsmount *propagation_next(struct vfsmount *m, 156static struct vfsmount *propagation_next(struct vfsmount *m,
152 struct vfsmount *origin) 157 struct vfsmount *origin)
@@ -186,10 +191,6 @@ static struct vfsmount *get_source(struct vfsmount *dest,
186{ 191{
187 struct vfsmount *p_last_src = NULL; 192 struct vfsmount *p_last_src = NULL;
188 struct vfsmount *p_last_dest = NULL; 193 struct vfsmount *p_last_dest = NULL;
189 *type = CL_PROPAGATION;
190
191 if (IS_MNT_SHARED(dest))
192 *type |= CL_MAKE_SHARED;
193 194
194 while (last_dest != dest->mnt_master) { 195 while (last_dest != dest->mnt_master) {
195 p_last_dest = last_dest; 196 p_last_dest = last_dest;
@@ -202,13 +203,18 @@ static struct vfsmount *get_source(struct vfsmount *dest,
202 do { 203 do {
203 p_last_dest = next_peer(p_last_dest); 204 p_last_dest = next_peer(p_last_dest);
204 } while (IS_MNT_NEW(p_last_dest)); 205 } while (IS_MNT_NEW(p_last_dest));
206 /* is that a peer of the earlier? */
207 if (dest == p_last_dest) {
208 *type = CL_MAKE_SHARED;
209 return p_last_src;
210 }
205 } 211 }
206 212 /* slave of the earlier, then */
207 if (dest != p_last_dest) { 213 *type = CL_SLAVE;
208 *type |= CL_SLAVE; 214 /* beginning of peer group among the slaves? */
209 return last_src; 215 if (IS_MNT_SHARED(dest))
210 } else 216 *type |= CL_MAKE_SHARED;
211 return p_last_src; 217 return last_src;
212} 218}
213 219
214/* 220/*
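The new comment above documents the invariant get_source() now exploits: peer groups occupy contiguous runs on a slave list, so a walk can classify each mount by comparing it with its neighbour. A toy illustration of that contiguity test, with made-up data rather than real vfsmounts:

#include <stdio.h>

struct mnt { int peer_group; };                /* toy stand-in */

int main(void)
{
    /* a slave list: two peer groups, each stored contiguously */
    struct mnt slaves[] = { {1}, {1}, {1}, {2}, {2} };
    int n = sizeof(slaves) / sizeof(slaves[0]);
    int i;

    for (i = 1; i < n; i++) {
        if (slaves[i].peer_group == slaves[i - 1].peer_group)
            printf("slave %d: peer of the one before it\n", i);
        else
            printf("slave %d: starts a new peer group\n", i);
    }
    return 0;
}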
diff --git a/fs/pnode.h b/fs/pnode.h
index 958665d662af..1ea4ae1efcd3 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -21,12 +21,11 @@
21#define CL_SLAVE 0x02 21#define CL_SLAVE 0x02
22#define CL_COPY_ALL 0x04 22#define CL_COPY_ALL 0x04
23#define CL_MAKE_SHARED 0x08 23#define CL_MAKE_SHARED 0x08
24#define CL_PROPAGATION 0x10 24#define CL_PRIVATE 0x10
25#define CL_PRIVATE 0x20
26 25
27static inline void set_mnt_shared(struct vfsmount *mnt) 26static inline void set_mnt_shared(struct vfsmount *mnt)
28{ 27{
29 mnt->mnt_flags &= ~MNT_PNODE_MASK; 28 mnt->mnt_flags &= ~MNT_SHARED_MASK;
30 mnt->mnt_flags |= MNT_SHARED; 29 mnt->mnt_flags |= MNT_SHARED;
31} 30}
32 31
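With CL_PROPAGATION gone, set_mnt_shared() now clears the whole MNT_SHARED_MASK before setting MNT_SHARED. The clear-then-set idiom, shown with invented flag values (the real MNT_* constants live in the mount headers):

#include <stdio.h>

#define MNT_SHARED      0x10    /* invented values for illustration */
#define MNT_UNBINDABLE  0x20
#define MNT_SHARED_MASK (MNT_SHARED | MNT_UNBINDABLE)

int main(void)
{
    unsigned int flags = MNT_UNBINDABLE;   /* previously unbindable */

    flags &= ~MNT_SHARED_MASK;             /* wipe all propagation state */
    flags |= MNT_SHARED;                   /* then mark shared */
    printf("flags = %#x\n", flags);        /* 0x10: shared, nothing else */
    return 0;
}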
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 822c2d506518..e51f2ec2c5e5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
68#include <linux/hugetlb.h> 68#include <linux/hugetlb.h>
69#include <linux/pagemap.h> 69#include <linux/pagemap.h>
70#include <linux/swap.h> 70#include <linux/swap.h>
71#include <linux/slab.h>
72#include <linux/smp.h> 71#include <linux/smp.h>
73#include <linux/signal.h> 72#include <linux/signal.h>
74#include <linux/highmem.h> 73#include <linux/highmem.h>
@@ -134,13 +133,16 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
134 * simple bit tests. 133 * simple bit tests.
135 */ 134 */
136static const char *task_state_array[] = { 135static const char *task_state_array[] = {
137 "R (running)", /* 0 */ 136 "R (running)", /* 0 */
138 "S (sleeping)", /* 1 */ 137 "S (sleeping)", /* 1 */
139 "D (disk sleep)", /* 2 */ 138 "D (disk sleep)", /* 2 */
140 "T (stopped)", /* 4 */ 139 "T (stopped)", /* 4 */
141 "T (tracing stop)", /* 8 */ 140 "t (tracing stop)", /* 8 */
142 "Z (zombie)", /* 16 */ 141 "Z (zombie)", /* 16 */
143 "X (dead)" /* 32 */ 142 "X (dead)", /* 32 */
143 "x (dead)", /* 64 */
144 "K (wakekill)", /* 128 */
145 "W (waking)", /* 256 */
144}; 146};
145 147
146static inline const char *get_task_state(struct task_struct *tsk) 148static inline const char *get_task_state(struct task_struct *tsk)
@@ -148,6 +150,8 @@ static inline const char *get_task_state(struct task_struct *tsk)
148 unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; 150 unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state;
149 const char **p = &task_state_array[0]; 151 const char **p = &task_state_array[0];
150 152
153 BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array));
154
151 while (state) { 155 while (state) {
152 p++; 156 p++;
153 state >>= 1; 157 state >>= 1;
@@ -265,8 +269,10 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
265 blocked = p->blocked; 269 blocked = p->blocked;
266 collect_sigign_sigcatch(p, &ignored, &caught); 270 collect_sigign_sigcatch(p, &ignored, &caught);
267 num_threads = atomic_read(&p->signal->count); 271 num_threads = atomic_read(&p->signal->count);
272 rcu_read_lock(); /* FIXME: is this correct? */
268 qsize = atomic_read(&__task_cred(p)->user->sigpending); 273 qsize = atomic_read(&__task_cred(p)->user->sigpending);
269 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; 274 rcu_read_unlock();
275 qlim = task_rlimit(p, RLIMIT_SIGPENDING);
270 unlock_task_sighand(p, &flags); 276 unlock_task_sighand(p, &flags);
271 } 277 }
272 278
@@ -322,93 +328,15 @@ static inline void task_context_switch_counts(struct seq_file *m,
322 p->nivcsw); 328 p->nivcsw);
323} 329}
324 330
325#ifdef CONFIG_MMU 331static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
326
327struct stack_stats {
328 struct vm_area_struct *vma;
329 unsigned long startpage;
330 unsigned long usage;
331};
332
333static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr,
334 unsigned long end, struct mm_walk *walk)
335{
336 struct stack_stats *ss = walk->private;
337 struct vm_area_struct *vma = ss->vma;
338 pte_t *pte, ptent;
339 spinlock_t *ptl;
340 int ret = 0;
341
342 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
343 for (; addr != end; pte++, addr += PAGE_SIZE) {
344 ptent = *pte;
345
346#ifdef CONFIG_STACK_GROWSUP
347 if (pte_present(ptent) || is_swap_pte(ptent))
348 ss->usage = addr - ss->startpage + PAGE_SIZE;
349#else
350 if (pte_present(ptent) || is_swap_pte(ptent)) {
351 ss->usage = ss->startpage - addr + PAGE_SIZE;
352 pte++;
353 ret = 1;
354 break;
355 }
356#endif
357 }
358 pte_unmap_unlock(pte - 1, ptl);
359 cond_resched();
360 return ret;
361}
362
363static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma,
364 struct task_struct *task)
365{
366 struct stack_stats ss;
367 struct mm_walk stack_walk = {
368 .pmd_entry = stack_usage_pte_range,
369 .mm = vma->vm_mm,
370 .private = &ss,
371 };
372
373 if (!vma->vm_mm || is_vm_hugetlb_page(vma))
374 return 0;
375
376 ss.vma = vma;
377 ss.startpage = task->stack_start & PAGE_MASK;
378 ss.usage = 0;
379
380#ifdef CONFIG_STACK_GROWSUP
381 walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end,
382 &stack_walk);
383#else
384 walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE,
385 &stack_walk);
386#endif
387 return ss.usage;
388}
389
390static inline void task_show_stack_usage(struct seq_file *m,
391 struct task_struct *task)
392{
393 struct vm_area_struct *vma;
394 struct mm_struct *mm = get_task_mm(task);
395
396 if (mm) {
397 down_read(&mm->mmap_sem);
398 vma = find_vma(mm, task->stack_start);
399 if (vma)
400 seq_printf(m, "Stack usage:\t%lu kB\n",
401 get_stack_usage_in_bytes(vma, task) >> 10);
402
403 up_read(&mm->mmap_sem);
404 mmput(mm);
405 }
406}
407#else
408static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
409{ 332{
333 seq_printf(m, "Cpus_allowed:\t");
334 seq_cpumask(m, &task->cpus_allowed);
335 seq_printf(m, "\n");
336 seq_printf(m, "Cpus_allowed_list:\t");
337 seq_cpumask_list(m, &task->cpus_allowed);
338 seq_printf(m, "\n");
410} 339}
411#endif /* CONFIG_MMU */
412 340
413int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 341int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
414 struct pid *pid, struct task_struct *task) 342 struct pid *pid, struct task_struct *task)
@@ -424,12 +352,12 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
424 } 352 }
425 task_sig(m, task); 353 task_sig(m, task);
426 task_cap(m, task); 354 task_cap(m, task);
355 task_cpus_allowed(m, task);
427 cpuset_task_status_allowed(m, task); 356 cpuset_task_status_allowed(m, task);
428#if defined(CONFIG_S390) 357#if defined(CONFIG_S390)
429 task_show_regs(m, task); 358 task_show_regs(m, task);
430#endif 359#endif
431 task_context_switch_counts(m, task); 360 task_context_switch_counts(m, task);
432 task_show_stack_usage(m, task);
433 return 0; 361 return 0;
434} 362}
435 363
@@ -491,24 +419,21 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
491 cutime = sig->cutime; 419 cutime = sig->cutime;
492 cstime = sig->cstime; 420 cstime = sig->cstime;
493 cgtime = sig->cgtime; 421 cgtime = sig->cgtime;
494 rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; 422 rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
495 423
496 /* add up live thread stats at the group level */ 424 /* add up live thread stats at the group level */
497 if (whole) { 425 if (whole) {
498 struct task_cputime cputime;
499 struct task_struct *t = task; 426 struct task_struct *t = task;
500 do { 427 do {
501 min_flt += t->min_flt; 428 min_flt += t->min_flt;
502 maj_flt += t->maj_flt; 429 maj_flt += t->maj_flt;
503 gtime = cputime_add(gtime, task_gtime(t)); 430 gtime = cputime_add(gtime, t->gtime);
504 t = next_thread(t); 431 t = next_thread(t);
505 } while (t != task); 432 } while (t != task);
506 433
507 min_flt += sig->min_flt; 434 min_flt += sig->min_flt;
508 maj_flt += sig->maj_flt; 435 maj_flt += sig->maj_flt;
509 thread_group_cputime(task, &cputime); 436 thread_group_times(task, &utime, &stime);
510 utime = cputime.utime;
511 stime = cputime.stime;
512 gtime = cputime_add(gtime, sig->gtime); 437 gtime = cputime_add(gtime, sig->gtime);
513 } 438 }
514 439
@@ -524,9 +449,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
524 if (!whole) { 449 if (!whole) {
525 min_flt = task->min_flt; 450 min_flt = task->min_flt;
526 maj_flt = task->maj_flt; 451 maj_flt = task->maj_flt;
527 utime = task_utime(task); 452 task_times(task, &utime, &stime);
528 stime = task_stime(task); 453 gtime = task->gtime;
529 gtime = task_gtime(task);
530 } 454 }
531 455
532 /* scale priority and nice values from timeslices to -20..20 */ 456 /* scale priority and nice values from timeslices to -20..20 */
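The array.c hunks extend task_state_array for the new dead and wakekill states, lowercase the tracing-stop letter to "t", and add a BUILD_BUG_ON() tying the array length to TASK_STATE_MAX. get_task_state() picks an entry by shifting the state word right until it empties; a standalone sketch of that walk, with an abridged copy of the table:

#include <stdio.h>

static const char *state_names[] = {
    "R (running)",        /*  0 */
    "S (sleeping)",       /*  1 */
    "D (disk sleep)",     /*  2 */
    "T (stopped)",        /*  4 */
    "t (tracing stop)",   /*  8 */
    "Z (zombie)",         /* 16 */
    "X (dead)",           /* 32 */
};

static const char *name_of(unsigned int state)
{
    const char **p = &state_names[0];

    while (state) {        /* same loop shape as get_task_state() */
        p++;
        state >>= 1;
    }
    return *p;
}

int main(void)
{
    printf("%s\n", name_of(0));    /* R (running) */
    printf("%s\n", name_of(4));    /* T (stopped) */
    printf("%s\n", name_of(16));   /* Z (zombie) */
    return 0;
}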
diff --git a/fs/proc/base.c b/fs/proc/base.c
index af643b5aefe8..7621db800a74 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
81#include <linux/elf.h> 81#include <linux/elf.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/fs_struct.h> 83#include <linux/fs_struct.h>
84#include <linux/slab.h>
84#include "internal.h" 85#include "internal.h"
85 86
86/* NOTE: 87/* NOTE:
@@ -442,12 +443,13 @@ static const struct file_operations proc_lstats_operations = {
442unsigned long badness(struct task_struct *p, unsigned long uptime); 443unsigned long badness(struct task_struct *p, unsigned long uptime);
443static int proc_oom_score(struct task_struct *task, char *buffer) 444static int proc_oom_score(struct task_struct *task, char *buffer)
444{ 445{
445 unsigned long points; 446 unsigned long points = 0;
446 struct timespec uptime; 447 struct timespec uptime;
447 448
448 do_posix_clock_monotonic_gettime(&uptime); 449 do_posix_clock_monotonic_gettime(&uptime);
449 read_lock(&tasklist_lock); 450 read_lock(&tasklist_lock);
450 points = badness(task->group_leader, uptime.tv_sec); 451 if (pid_alive(task))
452 points = badness(task, uptime.tv_sec);
451 read_unlock(&tasklist_lock); 453 read_unlock(&tasklist_lock);
452 return sprintf(buffer, "%lu\n", points); 454 return sprintf(buffer, "%lu\n", points);
453} 455}
@@ -647,17 +649,11 @@ static int mounts_release(struct inode *inode, struct file *file)
647static unsigned mounts_poll(struct file *file, poll_table *wait) 649static unsigned mounts_poll(struct file *file, poll_table *wait)
648{ 650{
649 struct proc_mounts *p = file->private_data; 651 struct proc_mounts *p = file->private_data;
650 struct mnt_namespace *ns = p->ns;
651 unsigned res = POLLIN | POLLRDNORM; 652 unsigned res = POLLIN | POLLRDNORM;
652 653
653 poll_wait(file, &ns->poll, wait); 654 poll_wait(file, &p->ns->poll, wait);
654 655 if (mnt_had_events(p))
655 spin_lock(&vfsmount_lock);
656 if (p->event != ns->event) {
657 p->event = ns->event;
658 res |= POLLERR | POLLPRI; 656 res |= POLLERR | POLLPRI;
659 }
660 spin_unlock(&vfsmount_lock);
661 657
662 return res; 658 return res;
663} 659}
@@ -1095,8 +1091,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1095 if (!capable(CAP_AUDIT_CONTROL)) 1091 if (!capable(CAP_AUDIT_CONTROL))
1096 return -EPERM; 1092 return -EPERM;
1097 1093
1098 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) 1094 rcu_read_lock();
1095 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
1096 rcu_read_unlock();
1099 return -EPERM; 1097 return -EPERM;
1098 }
1099 rcu_read_unlock();
1100 1100
1101 if (count >= PAGE_SIZE) 1101 if (count >= PAGE_SIZE)
1102 count = PAGE_SIZE - 1; 1102 count = PAGE_SIZE - 1;
@@ -1265,6 +1265,72 @@ static const struct file_operations proc_pid_sched_operations = {
1265 1265
1266#endif 1266#endif
1267 1267
1268static ssize_t comm_write(struct file *file, const char __user *buf,
1269 size_t count, loff_t *offset)
1270{
1271 struct inode *inode = file->f_path.dentry->d_inode;
1272 struct task_struct *p;
1273 char buffer[TASK_COMM_LEN];
1274
1275 memset(buffer, 0, sizeof(buffer));
1276 if (count > sizeof(buffer) - 1)
1277 count = sizeof(buffer) - 1;
1278 if (copy_from_user(buffer, buf, count))
1279 return -EFAULT;
1280
1281 p = get_proc_task(inode);
1282 if (!p)
1283 return -ESRCH;
1284
1285 if (same_thread_group(current, p))
1286 set_task_comm(p, buffer);
1287 else
1288 count = -EINVAL;
1289
1290 put_task_struct(p);
1291
1292 return count;
1293}
1294
1295static int comm_show(struct seq_file *m, void *v)
1296{
1297 struct inode *inode = m->private;
1298 struct task_struct *p;
1299
1300 p = get_proc_task(inode);
1301 if (!p)
1302 return -ESRCH;
1303
1304 task_lock(p);
1305 seq_printf(m, "%s\n", p->comm);
1306 task_unlock(p);
1307
1308 put_task_struct(p);
1309
1310 return 0;
1311}
1312
1313static int comm_open(struct inode *inode, struct file *filp)
1314{
1315 int ret;
1316
1317 ret = single_open(filp, comm_show, NULL);
1318 if (!ret) {
1319 struct seq_file *m = filp->private_data;
1320
1321 m->private = inode;
1322 }
1323 return ret;
1324}
1325
1326static const struct file_operations proc_pid_set_comm_operations = {
1327 .open = comm_open,
1328 .read = seq_read,
1329 .write = comm_write,
1330 .llseek = seq_lseek,
1331 .release = single_release,
1332};
1333
1268/* 1334/*
1269 * We added or removed a vma mapping the executable. The vmas are only mapped 1335 * We added or removed a vma mapping the executable. The vmas are only mapped
1270 * during exec and are not mapped with the mmap system call. 1336 * during exec and are not mapped with the mmap system call.
@@ -1353,7 +1419,6 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
1353 goto out; 1419 goto out;
1354 1420
1355 error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); 1421 error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
1356 nd->last_type = LAST_BIND;
1357out: 1422out:
1358 return ERR_PTR(error); 1423 return ERR_PTR(error);
1359} 1424}
@@ -2200,7 +2265,7 @@ static const struct inode_operations proc_attr_dir_inode_operations = {
2200 2265
2201#endif 2266#endif
2202 2267
2203#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 2268#ifdef CONFIG_ELF_CORE
2204static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, 2269static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
2205 size_t count, loff_t *ppos) 2270 size_t count, loff_t *ppos)
2206{ 2271{
@@ -2304,16 +2369,30 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2304{ 2369{
2305 struct pid_namespace *ns = dentry->d_sb->s_fs_info; 2370 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2306 pid_t tgid = task_tgid_nr_ns(current, ns); 2371 pid_t tgid = task_tgid_nr_ns(current, ns);
2307 char tmp[PROC_NUMBUF]; 2372 char *name = ERR_PTR(-ENOENT);
2308 if (!tgid) 2373 if (tgid) {
2309 return ERR_PTR(-ENOENT); 2374 name = __getname();
2310 sprintf(tmp, "%d", task_tgid_nr_ns(current, ns)); 2375 if (!name)
2311 return ERR_PTR(vfs_follow_link(nd,tmp)); 2376 name = ERR_PTR(-ENOMEM);
2377 else
2378 sprintf(name, "%d", tgid);
2379 }
2380 nd_set_link(nd, name);
2381 return NULL;
2382}
2383
2384static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2385 void *cookie)
2386{
2387 char *s = nd_get_link(nd);
2388 if (!IS_ERR(s))
2389 __putname(s);
2312} 2390}
2313 2391
2314static const struct inode_operations proc_self_inode_operations = { 2392static const struct inode_operations proc_self_inode_operations = {
2315 .readlink = proc_self_readlink, 2393 .readlink = proc_self_readlink,
2316 .follow_link = proc_self_follow_link, 2394 .follow_link = proc_self_follow_link,
2395 .put_link = proc_self_put_link,
2317}; 2396};
2318 2397
2319/* 2398/*
@@ -2504,6 +2583,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2504#ifdef CONFIG_SCHED_DEBUG 2583#ifdef CONFIG_SCHED_DEBUG
2505 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2584 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2506#endif 2585#endif
2586 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2507#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2587#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2508 INF("syscall", S_IRUSR, proc_pid_syscall), 2588 INF("syscall", S_IRUSR, proc_pid_syscall),
2509#endif 2589#endif
@@ -2556,7 +2636,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2556#ifdef CONFIG_FAULT_INJECTION 2636#ifdef CONFIG_FAULT_INJECTION
2557 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), 2637 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2558#endif 2638#endif
2559#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 2639#ifdef CONFIG_ELF_CORE
2560 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), 2640 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
2561#endif 2641#endif
2562#ifdef CONFIG_TASK_IO_ACCOUNTING 2642#ifdef CONFIG_TASK_IO_ACCOUNTING
@@ -2838,6 +2918,7 @@ static const struct pid_entry tid_base_stuff[] = {
2838#ifdef CONFIG_SCHED_DEBUG 2918#ifdef CONFIG_SCHED_DEBUG
2839 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2919 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2840#endif 2920#endif
2921 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2841#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2922#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2842 INF("syscall", S_IRUSR, proc_pid_syscall), 2923 INF("syscall", S_IRUSR, proc_pid_syscall),
2843#endif 2924#endif
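The largest base.c addition is a per-task comm file: comm_write() renames a task when the writer is in the same thread group, and comm_show() prints the name through seq_file. Assuming a kernel that carries this patch, the file can be exercised from userspace like so:

#include <stdio.h>

int main(void)
{
    char name[32];
    FILE *f = fopen("/proc/self/comm", "r+");

    if (!f)
        return 1;                  /* pre-patch kernel: no such file */
    fputs("renamed", f);           /* truncated to TASK_COMM_LEN - 1 bytes */
    fflush(f);
    rewind(f);
    if (fgets(name, sizeof(name), f))
        printf("comm is now: %s", name);   /* "renamed" plus newline */
    fclose(f);
    return 0;
}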
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index fa678abc9db1..43c127490606 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -13,6 +13,7 @@
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include <linux/mount.h> 17#include <linux/mount.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/idr.h> 19#include <linux/idr.h>
@@ -291,19 +292,17 @@ static const struct inode_operations proc_file_inode_operations = {
291 * returns the struct proc_dir_entry for "/proc/tty/driver", and 292 * returns the struct proc_dir_entry for "/proc/tty/driver", and
292 * returns "serial" in residual. 293 * returns "serial" in residual.
293 */ 294 */
294static int xlate_proc_name(const char *name, 295static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
295 struct proc_dir_entry **ret, const char **residual) 296 const char **residual)
296{ 297{
297 const char *cp = name, *next; 298 const char *cp = name, *next;
298 struct proc_dir_entry *de; 299 struct proc_dir_entry *de;
299 int len; 300 int len;
300 int rtn = 0;
301 301
302 de = *ret; 302 de = *ret;
303 if (!de) 303 if (!de)
304 de = &proc_root; 304 de = &proc_root;
305 305
306 spin_lock(&proc_subdir_lock);
307 while (1) { 306 while (1) {
308 next = strchr(cp, '/'); 307 next = strchr(cp, '/');
309 if (!next) 308 if (!next)
@@ -315,16 +314,25 @@ static int xlate_proc_name(const char *name,
315 break; 314 break;
316 } 315 }
317 if (!de) { 316 if (!de) {
318 rtn = -ENOENT; 317 WARN(1, "name '%s'\n", name);
319 goto out; 318 return -ENOENT;
320 } 319 }
321 cp += len + 1; 320 cp += len + 1;
322 } 321 }
323 *residual = cp; 322 *residual = cp;
324 *ret = de; 323 *ret = de;
325out: 324 return 0;
325}
326
327static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
328 const char **residual)
329{
330 int rv;
331
332 spin_lock(&proc_subdir_lock);
333 rv = __xlate_proc_name(name, ret, residual);
326 spin_unlock(&proc_subdir_lock); 334 spin_unlock(&proc_subdir_lock);
327 return rtn; 335 return rv;
328} 336}
329 337
330static DEFINE_IDA(proc_inum_ida); 338static DEFINE_IDA(proc_inum_ida);
@@ -429,7 +437,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
429 unsigned int ino; 437 unsigned int ino;
430 438
431 ino = de->low_ino; 439 ino = de->low_ino;
432 de_get(de); 440 pde_get(de);
433 spin_unlock(&proc_subdir_lock); 441 spin_unlock(&proc_subdir_lock);
434 error = -EINVAL; 442 error = -EINVAL;
435 inode = proc_get_inode(dir->i_sb, ino, de); 443 inode = proc_get_inode(dir->i_sb, ino, de);
@@ -445,7 +453,7 @@ out_unlock:
445 return NULL; 453 return NULL;
446 } 454 }
447 if (de) 455 if (de)
448 de_put(de); 456 pde_put(de);
449 return ERR_PTR(error); 457 return ERR_PTR(error);
450} 458}
451 459
@@ -509,17 +517,17 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
509 struct proc_dir_entry *next; 517 struct proc_dir_entry *next;
510 518
511 /* filldir passes info to user space */ 519 /* filldir passes info to user space */
512 de_get(de); 520 pde_get(de);
513 spin_unlock(&proc_subdir_lock); 521 spin_unlock(&proc_subdir_lock);
514 if (filldir(dirent, de->name, de->namelen, filp->f_pos, 522 if (filldir(dirent, de->name, de->namelen, filp->f_pos,
515 de->low_ino, de->mode >> 12) < 0) { 523 de->low_ino, de->mode >> 12) < 0) {
516 de_put(de); 524 pde_put(de);
517 goto out; 525 goto out;
518 } 526 }
519 spin_lock(&proc_subdir_lock); 527 spin_lock(&proc_subdir_lock);
520 filp->f_pos++; 528 filp->f_pos++;
521 next = de->next; 529 next = de->next;
522 de_put(de); 530 pde_put(de);
523 de = next; 531 de = next;
524 } while (de); 532 } while (de);
525 spin_unlock(&proc_subdir_lock); 533 spin_unlock(&proc_subdir_lock);
@@ -662,6 +670,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
662 } 670 }
663 return ent; 671 return ent;
664} 672}
673EXPORT_SYMBOL(proc_symlink);
665 674
666struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, 675struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
667 struct proc_dir_entry *parent) 676 struct proc_dir_entry *parent)
@@ -700,6 +709,7 @@ struct proc_dir_entry *proc_mkdir(const char *name,
700{ 709{
701 return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); 710 return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
702} 711}
712EXPORT_SYMBOL(proc_mkdir);
703 713
704struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, 714struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
705 struct proc_dir_entry *parent) 715 struct proc_dir_entry *parent)
@@ -728,6 +738,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
728 } 738 }
729 return ent; 739 return ent;
730} 740}
741EXPORT_SYMBOL(create_proc_entry);
731 742
732struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, 743struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
733 struct proc_dir_entry *parent, 744 struct proc_dir_entry *parent,
@@ -762,8 +773,9 @@ out_free:
762out: 773out:
763 return NULL; 774 return NULL;
764} 775}
776EXPORT_SYMBOL(proc_create_data);
765 777
766void free_proc_entry(struct proc_dir_entry *de) 778static void free_proc_entry(struct proc_dir_entry *de)
767{ 779{
768 unsigned int ino = de->low_ino; 780 unsigned int ino = de->low_ino;
769 781
@@ -777,6 +789,12 @@ void free_proc_entry(struct proc_dir_entry *de)
777 kfree(de); 789 kfree(de);
778} 790}
779 791
792void pde_put(struct proc_dir_entry *pde)
793{
794 if (atomic_dec_and_test(&pde->count))
795 free_proc_entry(pde);
796}
797
780/* 798/*
781 * Remove a /proc entry and free it if it's not currently in use. 799 * Remove a /proc entry and free it if it's not currently in use.
782 */ 800 */
@@ -787,11 +805,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
787 const char *fn = name; 805 const char *fn = name;
788 int len; 806 int len;
789 807
790 if (xlate_proc_name(name, &parent, &fn) != 0) 808 spin_lock(&proc_subdir_lock);
809 if (__xlate_proc_name(name, &parent, &fn) != 0) {
810 spin_unlock(&proc_subdir_lock);
791 return; 811 return;
812 }
792 len = strlen(fn); 813 len = strlen(fn);
793 814
794 spin_lock(&proc_subdir_lock);
795 for (p = &parent->subdir; *p; p=&(*p)->next ) { 815 for (p = &parent->subdir; *p; p=&(*p)->next ) {
796 if (proc_match(len, fn, *p)) { 816 if (proc_match(len, fn, *p)) {
797 de = *p; 817 de = *p;
@@ -801,8 +821,10 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
801 } 821 }
802 } 822 }
803 spin_unlock(&proc_subdir_lock); 823 spin_unlock(&proc_subdir_lock);
804 if (!de) 824 if (!de) {
825 WARN(1, "name '%s'\n", name);
805 return; 826 return;
827 }
806 828
807 spin_lock(&de->pde_unload_lock); 829 spin_lock(&de->pde_unload_lock);
808 /* 830 /*
@@ -845,6 +867,6 @@ continue_removing:
845 WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " 867 WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory "
846 "'%s/%s', leaking at least '%s'\n", __func__, 868 "'%s/%s', leaking at least '%s'\n", __func__,
847 de->parent->name, de->name, de->subdir->name); 869 de->parent->name, de->name, de->subdir->name);
848 if (atomic_dec_and_test(&de->count)) 870 pde_put(de);
849 free_proc_entry(de);
850} 871}
872EXPORT_SYMBOL(remove_proc_entry);
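generic.c folds de_get()/de_put() into pde_get()/pde_put(), moves the EXPORT_SYMBOLs next to their definitions, and hoists proc_subdir_lock out of the name translation so remove_proc_entry() holds it across both lookup and unlink. The put side is the classic free-on-last-reference pattern; a portable C11 rendering of it, not the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct pde {
    atomic_int count;
    const char *name;
};

static struct pde *pde_get(struct pde *p)
{
    atomic_fetch_add(&p->count, 1);
    return p;
}

static void pde_put(struct pde *p)
{
    /* the thread that drops the last reference frees, exactly once */
    if (atomic_fetch_sub(&p->count, 1) == 1) {
        printf("freeing %s\n", p->name);
        free(p);
    }
}

int main(void)
{
    struct pde *p = malloc(sizeof(*p));

    if (!p)
        return 1;
    atomic_init(&p->count, 1);             /* creator's reference */
    p->name = "demo";
    pde_get(p);                            /* e.g. held across a lock drop */
    pde_put(p);                            /* that user is done */
    pde_put(p);                            /* creator is done: freed */
    return 0;
}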
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d78ade305541..d35b23238fb1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -18,35 +18,13 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/slab.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24 25
25#include "internal.h" 26#include "internal.h"
26 27
27struct proc_dir_entry *de_get(struct proc_dir_entry *de)
28{
29 atomic_inc(&de->count);
30 return de;
31}
32
33/*
34 * Decrements the use count and checks for deferred deletion.
35 */
36void de_put(struct proc_dir_entry *de)
37{
38 if (!atomic_read(&de->count)) {
39 printk("de_put: entry %s already free!\n", de->name);
40 return;
41 }
42
43 if (atomic_dec_and_test(&de->count))
44 free_proc_entry(de);
45}
46
47/*
48 * Decrement the use count of the proc_dir_entry.
49 */
50static void proc_delete_inode(struct inode *inode) 28static void proc_delete_inode(struct inode *inode)
51{ 29{
52 struct proc_dir_entry *de; 30 struct proc_dir_entry *de;
@@ -59,7 +37,7 @@ static void proc_delete_inode(struct inode *inode)
59 /* Let go of any associated proc directory entry */ 37 /* Let go of any associated proc directory entry */
60 de = PROC_I(inode)->pde; 38 de = PROC_I(inode)->pde;
61 if (de) 39 if (de)
62 de_put(de); 40 pde_put(de);
63 if (PROC_I(inode)->sysctl) 41 if (PROC_I(inode)->sysctl)
64 sysctl_head_put(PROC_I(inode)->sysctl); 42 sysctl_head_put(PROC_I(inode)->sysctl);
65 clear_inode(inode); 43 clear_inode(inode);
@@ -480,7 +458,7 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
480 } 458 }
481 unlock_new_inode(inode); 459 unlock_new_inode(inode);
482 } else 460 } else
483 de_put(de); 461 pde_put(de);
484 return inode; 462 return inode;
485} 463}
486 464
@@ -495,7 +473,7 @@ int proc_fill_super(struct super_block *s)
495 s->s_op = &proc_sops; 473 s->s_op = &proc_sops;
496 s->s_time_gran = 1; 474 s->s_time_gran = 1;
497 475
498 de_get(&proc_root); 476 pde_get(&proc_root);
499 root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); 477 root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
500 if (!root_inode) 478 if (!root_inode)
501 goto out_no_root; 479 goto out_no_root;
@@ -509,6 +487,6 @@ int proc_fill_super(struct super_block *s)
509out_no_root: 487out_no_root:
510 printk("proc_read_super: get root inode failed\n"); 488 printk("proc_read_super: get root inode failed\n");
511 iput(root_inode); 489 iput(root_inode);
512 de_put(&proc_root); 490 pde_put(&proc_root);
513 return -ENOMEM; 491 return -ENOMEM;
514} 492}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 753ca37002c8..1f24a3eddd12 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,8 +61,6 @@ extern const struct file_operations proc_pagemap_operations;
61extern const struct file_operations proc_net_operations; 61extern const struct file_operations proc_net_operations;
62extern const struct inode_operations proc_net_inode_operations; 62extern const struct inode_operations proc_net_inode_operations;
63 63
64void free_proc_entry(struct proc_dir_entry *de);
65
66void proc_init_inodecache(void); 64void proc_init_inodecache(void);
67 65
68static inline struct pid *proc_pid(struct inode *inode) 66static inline struct pid *proc_pid(struct inode *inode)
@@ -101,8 +99,12 @@ unsigned long task_vsize(struct mm_struct *);
101int task_statm(struct mm_struct *, int *, int *, int *, int *); 99int task_statm(struct mm_struct *, int *, int *, int *, int *);
102void task_mem(struct seq_file *, struct mm_struct *); 100void task_mem(struct seq_file *, struct mm_struct *);
103 101
104struct proc_dir_entry *de_get(struct proc_dir_entry *de); 102static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
105void de_put(struct proc_dir_entry *de); 103{
104 atomic_inc(&pde->count);
105 return pde;
106}
107void pde_put(struct proc_dir_entry *pde);
106 108
107extern struct vfsmount *proc_mnt; 109extern struct vfsmount *proc_mnt;
108int proc_fill_super(struct super_block *); 110int proc_fill_super(struct super_block *);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..19979a2ce272 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -19,6 +19,7 @@
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/bootmem.h> 20#include <linux/bootmem.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/slab.h>
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/io.h> 24#include <asm/io.h>
24#include <linux/list.h> 25#include <linux/list.h>
@@ -490,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
490 } 491 }
491 read_unlock(&kclist_lock); 492 read_unlock(&kclist_lock);
492 493
493 if (m == NULL) { 494 if (&m->list == &kclist_head) {
494 if (clear_user(buffer, tsz)) 495 if (clear_user(buffer, tsz))
495 return -EFAULT; 496 return -EFAULT;
496 } else if (is_vmalloc_or_module_addr((void *)start)) { 497 } else if (is_vmalloc_or_module_addr((void *)start)) {
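The kcore.c one-liner fixes a common list pitfall: when list_for_each_entry() runs off the end, the cursor is not NULL, it points at the container of the list head, so the old "m == NULL" test never fired after a full traversal. A self-contained demonstration with a minimal list implementation:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct kc_entry { long start; struct list_head list; };

int main(void)
{
    struct list_head head = { &head, &head };
    struct kc_entry a = { .start = 1 };
    struct kc_entry *m;

    /* insert a after head */
    a.list.next = head.next;
    a.list.prev = &head;
    head.next->prev = &a.list;
    head.next = &a.list;

    /* open-coded list_for_each_entry(m, &head, list), searching for 42 */
    for (m = container_of(head.next, struct kc_entry, list);
         &m->list != &head;
         m = container_of(m->list.next, struct kc_entry, list))
        if (m->start == 42)
            break;

    if (m == NULL)
        printf("never reached: the cursor is not NULL\n");
    if (&m->list == &head)
        printf("correct 'not found' test, as in the fix\n");
    return 0;
}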
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 7ca78346d3f0..cfe90a48a6e8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -12,37 +12,37 @@
12#include <linux/poll.h> 12#include <linux/poll.h>
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/syslog.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/io.h> 18#include <asm/io.h>
18 19
19extern wait_queue_head_t log_wait; 20extern wait_queue_head_t log_wait;
20 21
21extern int do_syslog(int type, char __user *bug, int count);
22
23static int kmsg_open(struct inode * inode, struct file * file) 22static int kmsg_open(struct inode * inode, struct file * file)
24{ 23{
25 return do_syslog(1,NULL,0); 24 return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
26} 25}
27 26
28static int kmsg_release(struct inode * inode, struct file * file) 27static int kmsg_release(struct inode * inode, struct file * file)
29{ 28{
30 (void) do_syslog(0,NULL,0); 29 (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
31 return 0; 30 return 0;
32} 31}
33 32
34static ssize_t kmsg_read(struct file *file, char __user *buf, 33static ssize_t kmsg_read(struct file *file, char __user *buf,
35 size_t count, loff_t *ppos) 34 size_t count, loff_t *ppos)
36{ 35{
37 if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0)) 36 if ((file->f_flags & O_NONBLOCK) &&
37 !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
38 return -EAGAIN; 38 return -EAGAIN;
39 return do_syslog(2, buf, count); 39 return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
40} 40}
41 41
42static unsigned int kmsg_poll(struct file *file, poll_table *wait) 42static unsigned int kmsg_poll(struct file *file, poll_table *wait)
43{ 43{
44 poll_wait(file, &log_wait, wait); 44 poll_wait(file, &log_wait, wait);
45 if (do_syslog(9, NULL, 0)) 45 if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
46 return POLLIN | POLLRDNORM; 46 return POLLIN | POLLRDNORM;
47 return 0; 47 return 0;
48} 48}
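kmsg.c drops the bare do_syslog() numbers for named actions from the new <linux/syslog.h>, plus an explicit SYSLOG_FROM_FILE origin. The mapping below is read straight off this hunk (a sketch; consult the header for the full set):

#include <stdio.h>

enum syslog_action {
    SYSLOG_ACTION_CLOSE       = 0,   /* was do_syslog(0, ...) */
    SYSLOG_ACTION_OPEN        = 1,   /* was do_syslog(1, ...) */
    SYSLOG_ACTION_READ        = 2,   /* was do_syslog(2, ...) */
    SYSLOG_ACTION_SIZE_UNREAD = 9,   /* was do_syslog(9, ...) */
};

int main(void)
{
    printf("open=%d read=%d unread=%d close=%d\n",
           SYSLOG_ACTION_OPEN, SYSLOG_ACTION_READ,
           SYSLOG_ACTION_SIZE_UNREAD, SYSLOG_ACTION_CLOSE);
    return 0;
}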
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe115..b1822dde55c2 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
21#include <linux/mmzone.h> 21#include <linux/mmzone.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/seq_file.h> 25#include <linux/seq_file.h>
27#include <linux/hugetlb.h> 26#include <linux/hugetlb.h>
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 5033ce0d254b..180cf5a0bd67 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -8,6 +8,7 @@
8#include <linux/proc_fs.h> 8#include <linux/proc_fs.h>
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/hugetlb.h> 10#include <linux/hugetlb.h>
11#include <linux/kernel-page-flags.h>
11#include <asm/uaccess.h> 12#include <asm/uaccess.h>
12#include "internal.h" 13#include "internal.h"
13 14
@@ -71,52 +72,12 @@ static const struct file_operations proc_kpagecount_operations = {
71 * physical page flags. 72 * physical page flags.
72 */ 73 */
73 74
74/* These macros are used to decouple internal flags from exported ones */
75
76#define KPF_LOCKED 0
77#define KPF_ERROR 1
78#define KPF_REFERENCED 2
79#define KPF_UPTODATE 3
80#define KPF_DIRTY 4
81#define KPF_LRU 5
82#define KPF_ACTIVE 6
83#define KPF_SLAB 7
84#define KPF_WRITEBACK 8
85#define KPF_RECLAIM 9
86#define KPF_BUDDY 10
87
88/* 11-20: new additions in 2.6.31 */
89#define KPF_MMAP 11
90#define KPF_ANON 12
91#define KPF_SWAPCACHE 13
92#define KPF_SWAPBACKED 14
93#define KPF_COMPOUND_HEAD 15
94#define KPF_COMPOUND_TAIL 16
95#define KPF_HUGE 17
96#define KPF_UNEVICTABLE 18
97#define KPF_HWPOISON 19
98#define KPF_NOPAGE 20
99
100#define KPF_KSM 21
101
102/* kernel hacking assistances
103 * WARNING: subject to change, never rely on them!
104 */
105#define KPF_RESERVED 32
106#define KPF_MLOCKED 33
107#define KPF_MAPPEDTODISK 34
108#define KPF_PRIVATE 35
109#define KPF_PRIVATE_2 36
110#define KPF_OWNER_PRIVATE 37
111#define KPF_ARCH 38
112#define KPF_UNCACHED 39
113
114static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) 75static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
115{ 76{
116 return ((kflags >> kbit) & 1) << ubit; 77 return ((kflags >> kbit) & 1) << ubit;
117} 78}
118 79
119static u64 get_uflags(struct page *page) 80u64 stable_page_flags(struct page *page)
120{ 81{
121 u64 k; 82 u64 k;
122 u64 u; 83 u64 u;
@@ -219,7 +180,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
219 else 180 else
220 ppage = NULL; 181 ppage = NULL;
221 182
222 if (put_user(get_uflags(ppage), out)) { 183 if (put_user(stable_page_flags(ppage), out)) {
223 ret = -EFAULT; 184 ret = -EFAULT;
224 break; 185 break;
225 } 186 }
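page.c exports the KPF_* numbering through <linux/kernel-page-flags.h> and renames get_uflags() to stable_page_flags(), keeping kpf_copy_bit() as the translator between kernel-internal bit positions and the stable userspace ABI. The helper in isolation, with illustrative bit numbers:

#include <stdint.h>
#include <stdio.h>

static inline uint64_t kpf_copy_bit(uint64_t kflags, int ubit, int kbit)
{
    return ((kflags >> kbit) & 1) << ubit;
}

int main(void)
{
    /* pretend the in-kernel "locked" bit sits at 12 while the exported
     * KPF_LOCKED position is 0; both numbers are illustrative */
    uint64_t kflags = 1ULL << 12;
    uint64_t u = kpf_copy_bit(kflags, 0, 12);

    printf("exported flags: %#llx\n", (unsigned long long)u);  /* 0x1 */
    return 0;
}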
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 7ba79a54948c..ce94801f48ca 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -7,44 +7,50 @@
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/time.h> 8#include <linux/time.h>
9#include <linux/proc_fs.h> 9#include <linux/proc_fs.h>
10#include <linux/seq_file.h>
10#include <linux/stat.h> 11#include <linux/stat.h>
11#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/of.h>
14#include <linux/module.h>
15#include <linux/slab.h>
12#include <asm/prom.h> 16#include <asm/prom.h>
13#include <asm/uaccess.h> 17#include <asm/uaccess.h>
14#include "internal.h" 18#include "internal.h"
15 19
16#ifndef HAVE_ARCH_DEVTREE_FIXUPS
17static inline void set_node_proc_entry(struct device_node *np, 20static inline void set_node_proc_entry(struct device_node *np,
18 struct proc_dir_entry *de) 21 struct proc_dir_entry *de)
19{ 22{
20} 23#ifdef HAVE_ARCH_DEVTREE_FIXUPS
24 np->pde = de;
21#endif 25#endif
26}
22 27
23static struct proc_dir_entry *proc_device_tree; 28static struct proc_dir_entry *proc_device_tree;
24 29
25/* 30/*
26 * Supply data on a read from /proc/device-tree/node/property. 31 * Supply data on a read from /proc/device-tree/node/property.
27 */ 32 */
28static int property_read_proc(char *page, char **start, off_t off, 33static int property_proc_show(struct seq_file *m, void *v)
29 int count, int *eof, void *data)
30{ 34{
31 struct property *pp = data; 35 struct property *pp = m->private;
32 int n;
33 36
34 if (off >= pp->length) { 37 seq_write(m, pp->value, pp->length);
35 *eof = 1; 38 return 0;
36 return 0;
37 }
38 n = pp->length - off;
39 if (n > count)
40 n = count;
41 else
42 *eof = 1;
43 memcpy(page, (char *)pp->value + off, n);
44 *start = page;
45 return n;
46} 39}
47 40
41static int property_proc_open(struct inode *inode, struct file *file)
42{
43 return single_open(file, property_proc_show, PDE(inode)->data);
44}
45
46static const struct file_operations property_proc_fops = {
47 .owner = THIS_MODULE,
48 .open = property_proc_open,
49 .read = seq_read,
50 .llseek = seq_lseek,
51 .release = single_release,
52};
53
48/* 54/*
49 * For a node with a name like "gc@10", we make symlinks called "gc" 55 * For a node with a name like "gc@10", we make symlinks called "gc"
50 * and "@10" to it. 56 * and "@10" to it.
@@ -63,10 +69,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
63 * Unfortunately proc_register puts each new entry 69 * Unfortunately proc_register puts each new entry
64 * at the beginning of the list. So we rearrange them. 70 * at the beginning of the list. So we rearrange them.
65 */ 71 */
66 ent = create_proc_read_entry(name, 72 ent = proc_create_data(name,
67 strncmp(name, "security-", 9) 73 strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
68 ? S_IRUGO : S_IRUSR, de, 74 de, &property_proc_fops, pp);
69 property_read_proc, pp);
70 if (ent == NULL) 75 if (ent == NULL)
71 return NULL; 76 return NULL;
72 77
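proc_devtree.c converts the property files from the legacy read_proc interface to seq_file: property_proc_show() just seq_write()s the whole value and the seq_file core handles offsets and EOF, which is exactly the bookkeeping the removed code did by hand. A userspace rendering of that removed contract, reconstructed from the deleted lines, shows what the conversion saves:

#include <stdio.h>
#include <string.h>

/* old-style contract: copy at most count bytes starting at off */
static int property_read_proc(char *page, long off, int count,
                              int *eof, const char *value, int length)
{
    int n;

    if (off >= length) {
        *eof = 1;
        return 0;
    }
    n = length - off;
    if (n > count)
        n = count;
    else
        *eof = 1;
    memcpy(page, value + off, n);
    return n;
}

int main(void)
{
    const char value[] = "board-model-0";   /* DT props are NUL-terminated */
    char page[4];
    int eof = 0;
    long off = 0;
    int n;

    while (!eof && (n = property_read_proc(page, off, sizeof(page),
                                           &eof, value, sizeof(value))) > 0) {
        fwrite(page, 1, n, stdout);
        off += n;
    }
    putchar('\n');
    return 0;
}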
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c38..9020ac15baaa 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/proc_fs.h> 15#include <linux/proc_fs.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/module.h> 20#include <linux/module.h>
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f667e8aeabdf..6ff9981f0a18 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -48,7 +48,7 @@ out:
48static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) 48static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
49{ 49{
50 int len; 50 int len;
51 for ( ; p->ctl_name || p->procname; p++) { 51 for ( ; p->procname; p++) {
52 52
53 if (!p->procname) 53 if (!p->procname)
54 continue; 54 continue;
@@ -218,7 +218,7 @@ static int scan(struct ctl_table_header *head, ctl_table *table,
218 void *dirent, filldir_t filldir) 218 void *dirent, filldir_t filldir)
219{ 219{
220 220
221 for (; table->ctl_name || table->procname; table++, (*pos)++) { 221 for (; table->procname; table++, (*pos)++) {
222 int res; 222 int res;
223 223
224 /* Can't do anything without a proc name */ 224 /* Can't do anything without a proc name */
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b080b791d9e3..757c069f2a65 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -220,9 +220,3 @@ void pid_ns_release_proc(struct pid_namespace *ns)
220{ 220{
221 mntput(ns->proc_mnt); 221 mntput(ns->proc_mnt);
222} 222}
223
224EXPORT_SYMBOL(proc_symlink);
225EXPORT_SYMBOL(proc_mkdir);
226EXPORT_SYMBOL(create_proc_entry);
227EXPORT_SYMBOL(proc_create_data);
228EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c6d70a..bf31b03fc275 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
1#include <linux/cpumask.h> 1#include <linux/cpumask.h>
2#include <linux/fs.h> 2#include <linux/fs.h>
3#include <linux/gfp.h>
4#include <linux/init.h> 3#include <linux/init.h>
5#include <linux/interrupt.h> 4#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 5#include <linux/kernel_stat.h>
@@ -27,7 +26,7 @@ static int show_stat(struct seq_file *p, void *v)
27 int i, j; 26 int i, j;
28 unsigned long jif; 27 unsigned long jif;
29 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; 28 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
30 cputime64_t guest; 29 cputime64_t guest, guest_nice;
31 u64 sum = 0; 30 u64 sum = 0;
32 u64 sum_softirq = 0; 31 u64 sum_softirq = 0;
33 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; 32 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -36,7 +35,7 @@ static int show_stat(struct seq_file *p, void *v)
36 35
37 user = nice = system = idle = iowait = 36 user = nice = system = idle = iowait =
38 irq = softirq = steal = cputime64_zero; 37 irq = softirq = steal = cputime64_zero;
39 guest = cputime64_zero; 38 guest = guest_nice = cputime64_zero;
40 getboottime(&boottime); 39 getboottime(&boottime);
41 jif = boottime.tv_sec; 40 jif = boottime.tv_sec;
42 41
@@ -51,6 +50,8 @@ static int show_stat(struct seq_file *p, void *v)
51 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 50 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
52 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); 51 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
53 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); 52 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
53 guest_nice = cputime64_add(guest_nice,
54 kstat_cpu(i).cpustat.guest_nice);
54 for_each_irq_nr(j) { 55 for_each_irq_nr(j) {
55 sum += kstat_irqs_cpu(j, i); 56 sum += kstat_irqs_cpu(j, i);
56 } 57 }
@@ -65,7 +66,8 @@ static int show_stat(struct seq_file *p, void *v)
65 } 66 }
66 sum += arch_irq_stat(); 67 sum += arch_irq_stat();
67 68
68 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", 69 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu "
70 "%llu\n",
69 (unsigned long long)cputime64_to_clock_t(user), 71 (unsigned long long)cputime64_to_clock_t(user),
70 (unsigned long long)cputime64_to_clock_t(nice), 72 (unsigned long long)cputime64_to_clock_t(nice),
71 (unsigned long long)cputime64_to_clock_t(system), 73 (unsigned long long)cputime64_to_clock_t(system),
@@ -74,7 +76,8 @@ static int show_stat(struct seq_file *p, void *v)
74 (unsigned long long)cputime64_to_clock_t(irq), 76 (unsigned long long)cputime64_to_clock_t(irq),
75 (unsigned long long)cputime64_to_clock_t(softirq), 77 (unsigned long long)cputime64_to_clock_t(softirq),
76 (unsigned long long)cputime64_to_clock_t(steal), 78 (unsigned long long)cputime64_to_clock_t(steal),
77 (unsigned long long)cputime64_to_clock_t(guest)); 79 (unsigned long long)cputime64_to_clock_t(guest),
80 (unsigned long long)cputime64_to_clock_t(guest_nice));
78 for_each_online_cpu(i) { 81 for_each_online_cpu(i) {
79 82
80 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ 83 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -88,8 +91,10 @@ static int show_stat(struct seq_file *p, void *v)
88 softirq = kstat_cpu(i).cpustat.softirq; 91 softirq = kstat_cpu(i).cpustat.softirq;
89 steal = kstat_cpu(i).cpustat.steal; 92 steal = kstat_cpu(i).cpustat.steal;
90 guest = kstat_cpu(i).cpustat.guest; 93 guest = kstat_cpu(i).cpustat.guest;
94 guest_nice = kstat_cpu(i).cpustat.guest_nice;
91 seq_printf(p, 95 seq_printf(p,
92 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", 96 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
97 "%llu\n",
93 i, 98 i,
94 (unsigned long long)cputime64_to_clock_t(user), 99 (unsigned long long)cputime64_to_clock_t(user),
95 (unsigned long long)cputime64_to_clock_t(nice), 100 (unsigned long long)cputime64_to_clock_t(nice),
@@ -99,7 +104,8 @@ static int show_stat(struct seq_file *p, void *v)
99 (unsigned long long)cputime64_to_clock_t(irq), 104 (unsigned long long)cputime64_to_clock_t(irq),
100 (unsigned long long)cputime64_to_clock_t(softirq), 105 (unsigned long long)cputime64_to_clock_t(softirq),
101 (unsigned long long)cputime64_to_clock_t(steal), 106 (unsigned long long)cputime64_to_clock_t(steal),
102 (unsigned long long)cputime64_to_clock_t(guest)); 107 (unsigned long long)cputime64_to_clock_t(guest),
108 (unsigned long long)cputime64_to_clock_t(guest_nice));
103 } 109 }
104 seq_printf(p, "intr %llu", (unsigned long long)sum); 110 seq_printf(p, "intr %llu", (unsigned long long)sum);
105 111
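
The hunks above extend both the aggregate "cpu" line and the per-cpu lines of /proc/stat with a tenth column, guest_nice. A minimal userspace sketch (not part of the patch) of reading it, assuming the field order shown in the patched seq_printf; on kernels without this change fscanf matches only nine fields:

#include <stdio.h>

int main(void)
{
	unsigned long long user, nice, system, idle, iowait,
			   irq, softirq, steal, guest, guest_nice;
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 1;
	/* the aggregate "cpu" line comes first in /proc/stat */
	if (fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		   &user, &nice, &system, &idle, &iowait,
		   &irq, &softirq, &steal, &guest, &guest_nice) == 10)
		printf("guest=%llu guest_nice=%llu\n", guest, guest_nice);
	fclose(f);
	return 0;
}
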
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2a1bef9203c6..caf0337dff73 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
4#include <linux/seq_file.h> 4#include <linux/seq_file.h>
5#include <linux/highmem.h> 5#include <linux/highmem.h>
6#include <linux/ptrace.h> 6#include <linux/ptrace.h>
7#include <linux/slab.h>
7#include <linux/pagemap.h> 8#include <linux/pagemap.h>
8#include <linux/mempolicy.h> 9#include <linux/mempolicy.h>
9#include <linux/swap.h> 10#include <linux/swap.h>
@@ -16,7 +17,7 @@
16 17
17void task_mem(struct seq_file *m, struct mm_struct *mm) 18void task_mem(struct seq_file *m, struct mm_struct *mm)
18{ 19{
19 unsigned long data, text, lib; 20 unsigned long data, text, lib, swap;
20 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 21 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
21 22
22 /* 23 /*
@@ -36,6 +37,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
36 data = mm->total_vm - mm->shared_vm - mm->stack_vm; 37 data = mm->total_vm - mm->shared_vm - mm->stack_vm;
37 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 38 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
38 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 39 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
40 swap = get_mm_counter(mm, MM_SWAPENTS);
39 seq_printf(m, 41 seq_printf(m,
40 "VmPeak:\t%8lu kB\n" 42 "VmPeak:\t%8lu kB\n"
41 "VmSize:\t%8lu kB\n" 43 "VmSize:\t%8lu kB\n"
@@ -46,7 +48,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
46 "VmStk:\t%8lu kB\n" 48 "VmStk:\t%8lu kB\n"
47 "VmExe:\t%8lu kB\n" 49 "VmExe:\t%8lu kB\n"
48 "VmLib:\t%8lu kB\n" 50 "VmLib:\t%8lu kB\n"
49 "VmPTE:\t%8lu kB\n", 51 "VmPTE:\t%8lu kB\n"
52 "VmSwap:\t%8lu kB\n",
50 hiwater_vm << (PAGE_SHIFT-10), 53 hiwater_vm << (PAGE_SHIFT-10),
51 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), 54 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
52 mm->locked_vm << (PAGE_SHIFT-10), 55 mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +57,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 total_rss << (PAGE_SHIFT-10), 57 total_rss << (PAGE_SHIFT-10),
55 data << (PAGE_SHIFT-10), 58 data << (PAGE_SHIFT-10),
56 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 59 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
57 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); 60 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
61 swap << (PAGE_SHIFT-10));
58} 62}
59 63
60unsigned long task_vsize(struct mm_struct *mm) 64unsigned long task_vsize(struct mm_struct *mm)
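
The task_mem() hunk above adds a VmSwap line to /proc/pid/status, fed by the new MM_SWAPENTS counter. A minimal sketch (not part of the patch) of reading it back from userspace:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmSwap:", 7))
			fputs(line, stdout);	/* e.g. "VmSwap:       0 kB" */
	fclose(f);
	return 0;
}
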
@@ -65,11 +69,11 @@ unsigned long task_vsize(struct mm_struct *mm)
65int task_statm(struct mm_struct *mm, int *shared, int *text, 69int task_statm(struct mm_struct *mm, int *shared, int *text,
66 int *data, int *resident) 70 int *data, int *resident)
67{ 71{
68 *shared = get_mm_counter(mm, file_rss); 72 *shared = get_mm_counter(mm, MM_FILEPAGES);
69 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 73 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
70 >> PAGE_SHIFT; 74 >> PAGE_SHIFT;
71 *data = mm->total_vm - mm->shared_vm; 75 *data = mm->total_vm - mm->shared_vm;
72 *resident = *shared + get_mm_counter(mm, anon_rss); 76 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
73 return mm->total_vm; 77 return mm->total_vm;
74} 78}
75 79
@@ -361,12 +365,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
361 if (!pte_present(ptent)) 365 if (!pte_present(ptent))
362 continue; 366 continue;
363 367
364 mss->resident += PAGE_SIZE;
365
366 page = vm_normal_page(vma, addr, ptent); 368 page = vm_normal_page(vma, addr, ptent);
367 if (!page) 369 if (!page)
368 continue; 370 continue;
369 371
372 mss->resident += PAGE_SIZE;
370 /* Accumulate the size in pages that have been accessed. */ 373 /* Accumulate the size in pages that have been accessed. */
371 if (pte_young(ptent) || PageReferenced(page)) 374 if (pte_young(ptent) || PageReferenced(page))
372 mss->referenced += PAGE_SIZE; 375 mss->referenced += PAGE_SIZE;
@@ -404,6 +407,7 @@ static int show_smap(struct seq_file *m, void *v)
404 407
405 memset(&mss, 0, sizeof mss); 408 memset(&mss, 0, sizeof mss);
406 mss.vma = vma; 409 mss.vma = vma;
410 /* mmap_sem is held in m_start */
407 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 411 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
408 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); 412 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
409 413
@@ -550,7 +554,8 @@ const struct file_operations proc_clear_refs_operations = {
550}; 554};
551 555
552struct pagemapread { 556struct pagemapread {
553 u64 __user *out, *end; 557 int pos, len;
558 u64 *buffer;
554}; 559};
555 560
556#define PM_ENTRY_BYTES sizeof(u64) 561#define PM_ENTRY_BYTES sizeof(u64)
@@ -573,10 +578,8 @@ struct pagemapread {
573static int add_to_pagemap(unsigned long addr, u64 pfn, 578static int add_to_pagemap(unsigned long addr, u64 pfn,
574 struct pagemapread *pm) 579 struct pagemapread *pm)
575{ 580{
576 if (put_user(pfn, pm->out)) 581 pm->buffer[pm->pos++] = pfn;
577 return -EFAULT; 582 if (pm->pos >= pm->len)
578 pm->out++;
579 if (pm->out >= pm->end)
580 return PM_END_OF_BUFFER; 583 return PM_END_OF_BUFFER;
581 return 0; 584 return 0;
582} 585}
@@ -650,6 +653,50 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
650 return err; 653 return err;
651} 654}
652 655
656static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
657{
658 u64 pme = 0;
659 if (pte_present(pte))
660 pme = PM_PFRAME(pte_pfn(pte) + offset)
661 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
662 return pme;
663}
664
665static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
666 unsigned long end, struct mm_walk *walk)
667{
668 struct vm_area_struct *vma;
669 struct pagemapread *pm = walk->private;
670 struct hstate *hs = NULL;
671 int err = 0;
672
673 vma = find_vma(walk->mm, addr);
674 if (vma)
675 hs = hstate_vma(vma);
676 for (; addr != end; addr += PAGE_SIZE) {
677 u64 pfn = PM_NOT_PRESENT;
678
679 if (vma && (addr >= vma->vm_end)) {
680 vma = find_vma(walk->mm, addr);
681 if (vma)
682 hs = hstate_vma(vma);
683 }
684
685 if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
686 /* calculate pfn of the "raw" page in the hugepage. */
687 int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
688 pfn = huge_pte_to_pagemap_entry(*pte, offset);
689 }
690 err = add_to_pagemap(addr, pfn, pm);
691 if (err)
692 return err;
693 }
694
695 cond_resched();
696
697 return err;
698}
699
653/* 700/*
654 * /proc/pid/pagemap - an array mapping virtual pages to pfns 701 * /proc/pid/pagemap - an array mapping virtual pages to pfns
655 * 702 *
@@ -674,21 +721,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
674 * determine which areas of memory are actually mapped and llseek to 721 * determine which areas of memory are actually mapped and llseek to
675 * skip over unmapped regions. 722 * skip over unmapped regions.
676 */ 723 */
724#define PAGEMAP_WALK_SIZE (PMD_SIZE)
677static ssize_t pagemap_read(struct file *file, char __user *buf, 725static ssize_t pagemap_read(struct file *file, char __user *buf,
678 size_t count, loff_t *ppos) 726 size_t count, loff_t *ppos)
679{ 727{
680 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 728 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
681 struct page **pages, *page;
682 unsigned long uaddr, uend;
683 struct mm_struct *mm; 729 struct mm_struct *mm;
684 struct pagemapread pm; 730 struct pagemapread pm;
685 int pagecount;
686 int ret = -ESRCH; 731 int ret = -ESRCH;
687 struct mm_walk pagemap_walk = {}; 732 struct mm_walk pagemap_walk = {};
688 unsigned long src; 733 unsigned long src;
689 unsigned long svpfn; 734 unsigned long svpfn;
690 unsigned long start_vaddr; 735 unsigned long start_vaddr;
691 unsigned long end_vaddr; 736 unsigned long end_vaddr;
737 int copied = 0;
692 738
693 if (!task) 739 if (!task)
694 goto out; 740 goto out;
@@ -711,37 +757,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
711 if (!mm) 757 if (!mm)
712 goto out_task; 758 goto out_task;
713 759
714 760 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
715 uaddr = (unsigned long)buf & PAGE_MASK; 761 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
716 uend = (unsigned long)(buf + count);
717 pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
718 ret = 0;
719 if (pagecount == 0)
720 goto out_mm;
721 pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
722 ret = -ENOMEM; 762 ret = -ENOMEM;
723 if (!pages) 763 if (!pm.buffer)
724 goto out_mm; 764 goto out_mm;
725 765
726 down_read(&current->mm->mmap_sem);
727 ret = get_user_pages(current, current->mm, uaddr, pagecount,
728 1, 0, pages, NULL);
729 up_read(&current->mm->mmap_sem);
730
731 if (ret < 0)
732 goto out_free;
733
734 if (ret != pagecount) {
735 pagecount = ret;
736 ret = -EFAULT;
737 goto out_pages;
738 }
739
740 pm.out = (u64 __user *)buf;
741 pm.end = (u64 __user *)(buf + count);
742
743 pagemap_walk.pmd_entry = pagemap_pte_range; 766 pagemap_walk.pmd_entry = pagemap_pte_range;
744 pagemap_walk.pte_hole = pagemap_pte_hole; 767 pagemap_walk.pte_hole = pagemap_pte_hole;
768 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
745 pagemap_walk.mm = mm; 769 pagemap_walk.mm = mm;
746 pagemap_walk.private = &pm; 770 pagemap_walk.private = &pm;
747 771
@@ -760,23 +784,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
760 * user buffer is tracked in "pm", and the walk 784 * user buffer is tracked in "pm", and the walk
761 * will stop when we hit the end of the buffer. 785 * will stop when we hit the end of the buffer.
762 */ 786 */
763 ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk); 787 ret = 0;
764 if (ret == PM_END_OF_BUFFER) 788 while (count && (start_vaddr < end_vaddr)) {
765 ret = 0; 789 int len;
766 /* don't need mmap_sem for these, but this looks cleaner */ 790 unsigned long end;
767 *ppos += (char __user *)pm.out - buf; 791
768 if (!ret) 792 pm.pos = 0;
769 ret = (char __user *)pm.out - buf; 793 end = start_vaddr + PAGEMAP_WALK_SIZE;
770 794 /* overflow ? */
771out_pages: 795 if (end < start_vaddr || end > end_vaddr)
772 for (; pagecount; pagecount--) { 796 end = end_vaddr;
773 page = pages[pagecount-1]; 797 down_read(&mm->mmap_sem);
774 if (!PageReserved(page)) 798 ret = walk_page_range(start_vaddr, end, &pagemap_walk);
775 SetPageDirty(page); 799 up_read(&mm->mmap_sem);
776 page_cache_release(page); 800 start_vaddr = end;
801
802 len = min(count, PM_ENTRY_BYTES * pm.pos);
803 if (copy_to_user(buf, pm.buffer, len) < 0) {
804 ret = -EFAULT;
805 goto out_free;
806 }
807 copied += len;
808 buf += len;
809 count -= len;
777 } 810 }
811 *ppos += copied;
812 if (!ret || ret == PM_END_OF_BUFFER)
813 ret = copied;
814
778out_free: 815out_free:
779 kfree(pages); 816 kfree(pm.buffer);
780out_mm: 817out_mm:
781 mmput(mm); 818 mmput(mm);
782out_task: 819out_task:
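
With the rewrite above, pagemap_read() walks the address space in PAGEMAP_WALK_SIZE windows, fills a kernel-side buffer of 8-byte entries (PM_ENTRY_BYTES), and copies it out, instead of pinning the caller's own pages with get_user_pages(). The userspace ABI is unchanged: one u64 per virtual page, indexed by (vaddr / page size) * 8. A sketch (not part of the patch) probing one of this process's own pages, assuming 4 KiB pages and the bit layout used in this file (bit 63 present, low bits PFN):

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint64_t entry;
	uintptr_t vaddr = (uintptr_t)&entry;	/* probe our own stack page */
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return 1;
	if (pread(fd, &entry, sizeof(entry),
		  (vaddr / 4096) * sizeof(entry)) == sizeof(entry))
		printf("present=%d pfn=%llu\n",
		       (int)(entry >> 63),
		       (unsigned long long)(entry & ((1ULL << 55) - 1)));
	close(fd);
	return 0;
}
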
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 8f5c05d3dbd3..46d4b5d72bd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
5#include <linux/fs_struct.h> 5#include <linux/fs_struct.h>
6#include <linux/mount.h> 6#include <linux/mount.h>
7#include <linux/ptrace.h> 7#include <linux/ptrace.h>
8#include <linux/slab.h>
8#include <linux/seq_file.h> 9#include <linux/seq_file.h>
9#include "internal.h" 10#include "internal.h"
10 11
@@ -110,9 +111,13 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
110 } 111 }
111 } 112 }
112 113
113 size += (*text = mm->end_code - mm->start_code); 114 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
114 size += (*data = mm->start_stack - mm->start_data); 115 >> PAGE_SHIFT;
116 *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK))
117 >> PAGE_SHIFT;
115 up_read(&mm->mmap_sem); 118 up_read(&mm->mmap_sem);
119 size >>= PAGE_SHIFT;
120 size += *text + *data;
116 *resident = size; 121 *resident = size;
117 return size; 122 return size;
118} 123}
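
The nommu task_statm() above now reports text and data in pages, rounded the same way as the MMU version in task_mmu.c. A sketch (not part of the patch) of that rounding, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long start = 0x1010, end = 0x3010;

	/* 0x1000..0x4000 spans three whole pages */
	printf("%lu pages\n",
	       (PAGE_ALIGN(end) - (start & PAGE_MASK)) / PAGE_SIZE);
	return 0;
}
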
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d39..9fbc99ec799a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
12#include <linux/user.h> 12#include <linux/user.h>
13#include <linux/elf.h> 13#include <linux/elf.h>
14#include <linux/elfcore.h> 14#include <linux/elfcore.h>
15#include <linux/slab.h>
15#include <linux/highmem.h> 16#include <linux/highmem.h>
16#include <linux/bootmem.h> 17#include <linux/bootmem.h>
17#include <linux/init.h> 18#include <linux/init.h>
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 0afba069d567..22e0d60e53ef 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -17,13 +17,6 @@
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include "qnx4.h" 18#include "qnx4.h"
19 19
20#if 0
21int qnx4_new_block(struct super_block *sb)
22{
23 return 0;
24}
25#endif /* 0 */
26
27static void count_bits(register const char *bmPart, register int size, 20static void count_bits(register const char *bmPart, register int size,
28 int *const tf) 21 int *const tf)
29{ 22{
@@ -35,22 +28,7 @@ static void count_bits(register const char *bmPart, register int size,
35 } 28 }
36 do { 29 do {
37 b = *bmPart++; 30 b = *bmPart++;
38 if ((b & 1) == 0) 31 tot += 8 - hweight8(b);
39 tot++;
40 if ((b & 2) == 0)
41 tot++;
42 if ((b & 4) == 0)
43 tot++;
44 if ((b & 8) == 0)
45 tot++;
46 if ((b & 16) == 0)
47 tot++;
48 if ((b & 32) == 0)
49 tot++;
50 if ((b & 64) == 0)
51 tot++;
52 if ((b & 128) == 0)
53 tot++;
54 size--; 32 size--;
55 } while (size != 0); 33 } while (size != 0);
56 *tf = tot; 34 *tf = tot;
@@ -67,7 +45,7 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb)
67 45
68 while (total < size) { 46 while (total < size) {
69 if ((bh = sb_bread(sb, start + offset)) == NULL) { 47 if ((bh = sb_bread(sb, start + offset)) == NULL) {
70 printk("qnx4: I/O error in counting free blocks\n"); 48 printk(KERN_ERR "qnx4: I/O error in counting free blocks\n");
71 break; 49 break;
72 } 50 }
73 count_bits(bh->b_data, size - total, &total_free); 51 count_bits(bh->b_data, size - total, &total_free);
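
count_bits() above replaces eight explicit bit tests with 8 - hweight8(b): free blocks are the zero bits in a bitmap byte. A userspace sketch (not part of the patch) of the same identity, with GCC's __builtin_popcount standing in for hweight8():

#include <stdio.h>

static int free_bits(unsigned char b)
{
	return 8 - __builtin_popcount(b);	/* zero bits = free blocks */
}

int main(void)
{
	printf("%d\n", free_bits(0x00));	/* 8: all free */
	printf("%d\n", free_bits(0xf0));	/* 4 */
	printf("%d\n", free_bits(0xff));	/* 0: none free */
	return 0;
}
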
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 86cc39cb1398..6f30c3d5bcbf 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -26,8 +26,8 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
26 int ix, ino; 26 int ix, ino;
27 int size; 27 int size;
28 28
29 QNX4DEBUG(("qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); 29 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
30 QNX4DEBUG(("filp->f_pos = %ld\n", (long) filp->f_pos)); 30 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos));
31 31
32 lock_kernel(); 32 lock_kernel();
33 33
@@ -50,7 +50,7 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
50 size = QNX4_NAME_MAX; 50 size = QNX4_NAME_MAX;
51 51
52 if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) { 52 if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) {
53 QNX4DEBUG(("qnx4_readdir:%.*s\n", size, de->di_fname)); 53 QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
54 if ( ( de->di_status & QNX4_FILE_LINK ) == 0 ) 54 if ( ( de->di_status & QNX4_FILE_LINK ) == 0 )
55 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; 55 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
56 else { 56 else {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index d2cd1798d8c4..277575ddc05c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -64,25 +64,7 @@ static struct buffer_head *qnx4_getblk(struct inode *inode, int nr,
64 result = sb_getblk(inode->i_sb, nr); 64 result = sb_getblk(inode->i_sb, nr);
65 return result; 65 return result;
66 } 66 }
67 if (!create) { 67 return NULL;
68 return NULL;
69 }
70#if 0
71 tmp = qnx4_new_block(inode->i_sb);
72 if (!tmp) {
73 return NULL;
74 }
75 result = sb_getblk(inode->i_sb, tmp);
76 if (tst) {
77 qnx4_free_block(inode->i_sb, tmp);
78 brelse(result);
79 goto repeat;
80 }
81 tst = tmp;
82#endif
83 inode->i_ctime = CURRENT_TIME_SEC;
84 mark_inode_dirty(inode);
85 return result;
86} 68}
87 69
88struct buffer_head *qnx4_bread(struct inode *inode, int block, int create) 70struct buffer_head *qnx4_bread(struct inode *inode, int block, int create)
@@ -107,14 +89,12 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h
107{ 89{
108 unsigned long phys; 90 unsigned long phys;
109 91
110 QNX4DEBUG(("qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock)); 92 QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock));
111 93
112 phys = qnx4_block_map( inode, iblock ); 94 phys = qnx4_block_map( inode, iblock );
113 if ( phys ) { 95 if ( phys ) {
114 // logical block is before EOF 96 // logical block is before EOF
115 map_bh(bh, inode->i_sb, phys); 97 map_bh(bh, inode->i_sb, phys);
116 } else if ( create ) {
117 // to be done.
118 } 98 }
119 return 0; 99 return 0;
120} 100}
@@ -142,12 +122,12 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
142 // read next xtnt block. 122 // read next xtnt block.
143 bh = sb_bread(inode->i_sb, i_xblk - 1); 123 bh = sb_bread(inode->i_sb, i_xblk - 1);
144 if ( !bh ) { 124 if ( !bh ) {
145 QNX4DEBUG(("qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1)); 125 QNX4DEBUG((KERN_ERR "qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1));
146 return -EIO; 126 return -EIO;
147 } 127 }
148 xblk = (struct qnx4_xblk*)bh->b_data; 128 xblk = (struct qnx4_xblk*)bh->b_data;
149 if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) { 129 if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) {
150 QNX4DEBUG(("qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk)); 130 QNX4DEBUG((KERN_ERR "qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk));
151 return -EIO; 131 return -EIO;
152 } 132 }
153 } 133 }
@@ -168,7 +148,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
168 brelse( bh ); 148 brelse( bh );
169 } 149 }
170 150
171 QNX4DEBUG(("qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block)); 151 QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block));
172 return block; 152 return block;
173} 153}
174 154
@@ -209,7 +189,7 @@ static const char *qnx4_checkroot(struct super_block *sb)
209 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') { 189 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') {
210 return "no qnx4 filesystem (no root dir)."; 190 return "no qnx4 filesystem (no root dir).";
211 } else { 191 } else {
212 QNX4DEBUG(("QNX4 filesystem found on dev %s.\n", sb->s_id)); 192 QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
213 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; 193 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
214 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); 194 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
215 for (j = 0; j < rl; j++) { 195 for (j = 0; j < rl; j++) {
@@ -220,8 +200,9 @@ static const char *qnx4_checkroot(struct super_block *sb)
220 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) { 200 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) {
221 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); 201 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
222 if (rootdir->di_fname != NULL) { 202 if (rootdir->di_fname != NULL) {
223 QNX4DEBUG(("Rootdir entry found : [%s]\n", rootdir->di_fname)); 203 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
224 if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) { 204 if (!strcmp(rootdir->di_fname,
205 QNX4_BMNAME)) {
225 found = 1; 206 found = 1;
226 qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL ); 207 qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
227 if (!qnx4_sb(sb)->BitMap) { 208 if (!qnx4_sb(sb)->BitMap) {
@@ -265,12 +246,12 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
265 if we don't belong here... */ 246 if we don't belong here... */
266 bh = sb_bread(s, 1); 247 bh = sb_bread(s, 1);
267 if (!bh) { 248 if (!bh) {
268 printk("qnx4: unable to read the superblock\n"); 249 printk(KERN_ERR "qnx4: unable to read the superblock\n");
269 goto outnobh; 250 goto outnobh;
270 } 251 }
271 if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) { 252 if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) {
272 if (!silent) 253 if (!silent)
273 printk("qnx4: wrong fsid in superblock.\n"); 254 printk(KERN_ERR "qnx4: wrong fsid in superblock.\n");
274 goto out; 255 goto out;
275 } 256 }
276 s->s_op = &qnx4_sops; 257 s->s_op = &qnx4_sops;
@@ -284,14 +265,14 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
284 errmsg = qnx4_checkroot(s); 265 errmsg = qnx4_checkroot(s);
285 if (errmsg != NULL) { 266 if (errmsg != NULL) {
286 if (!silent) 267 if (!silent)
287 printk("qnx4: %s\n", errmsg); 268 printk(KERN_ERR "qnx4: %s\n", errmsg);
288 goto out; 269 goto out;
289 } 270 }
290 271
291 /* does root not have inode number QNX4_ROOT_INO ?? */ 272 /* does root not have inode number QNX4_ROOT_INO ?? */
292 root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK); 273 root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK);
293 if (IS_ERR(root)) { 274 if (IS_ERR(root)) {
294 printk("qnx4: get inode failed\n"); 275 printk(KERN_ERR "qnx4: get inode failed\n");
295 ret = PTR_ERR(root); 276 ret = PTR_ERR(root);
296 goto out; 277 goto out;
297 } 278 }
@@ -374,7 +355,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
374 qnx4_inode = qnx4_raw_inode(inode); 355 qnx4_inode = qnx4_raw_inode(inode);
375 inode->i_mode = 0; 356 inode->i_mode = 0;
376 357
377 QNX4DEBUG(("Reading inode : [%d]\n", ino)); 358 QNX4DEBUG((KERN_INFO "reading inode : [%d]\n", ino));
378 if (!ino) { 359 if (!ino) {
379 printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is " 360 printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is "
380 "out of range\n", 361 "out of range\n",
@@ -385,7 +366,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
385 block = ino / QNX4_INODES_PER_BLOCK; 366 block = ino / QNX4_INODES_PER_BLOCK;
386 367
387 if (!(bh = sb_bread(sb, block))) { 368 if (!(bh = sb_bread(sb, block))) {
388 printk("qnx4: major problem: unable to read inode from dev " 369 printk(KERN_ERR "qnx4: major problem: unable to read inode from dev "
389 "%s\n", sb->s_id); 370 "%s\n", sb->s_id);
390 iget_failed(inode); 371 iget_failed(inode);
391 return ERR_PTR(-EIO); 372 return ERR_PTR(-EIO);
@@ -499,7 +480,7 @@ static int __init init_qnx4_fs(void)
499 return err; 480 return err;
500 } 481 }
501 482
502 printk("QNX4 filesystem 0.2.3 registered.\n"); 483 printk(KERN_INFO "QNX4 filesystem 0.2.3 registered.\n");
503 return 0; 484 return 0;
504} 485}
505 486
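
The qnx4_checkroot() hunk above swaps strncmp(..., sizeof QNX4_BMNAME) for strcmp(). Since sizeof of a string literal counts the terminating NUL, the bounded compare already included the terminator, so the two are equivalent for NUL-terminated names. A sketch (not part of the patch; ".bitmap" as the value of QNX4_BMNAME is an assumption here):

#include <assert.h>
#include <string.h>

#define QNX4_BMNAME ".bitmap"	/* assumed value, for illustration */

int main(void)
{
	const char *name = ".bitmap";

	/* a bounded compare over sizeof(lit) bytes includes the NUL */
	assert(!strncmp(name, QNX4_BMNAME, sizeof QNX4_BMNAME) ==
	       !strcmp(name, QNX4_BMNAME));
	return 0;
}
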
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index ae1e7edbacd6..58703ebba879 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -30,7 +30,7 @@ static int qnx4_match(int len, const char *name,
30 int namelen, thislen; 30 int namelen, thislen;
31 31
32 if (bh == NULL) { 32 if (bh == NULL) {
33 printk("qnx4: matching unassigned buffer !\n"); 33 printk(KERN_WARNING "qnx4: matching unassigned buffer !\n");
34 return 0; 34 return 0;
35 } 35 }
36 de = (struct qnx4_inode_entry *) (bh->b_data + *offset); 36 de = (struct qnx4_inode_entry *) (bh->b_data + *offset);
@@ -66,7 +66,7 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,
66 66
67 *res_dir = NULL; 67 *res_dir = NULL;
68 if (!dir->i_sb) { 68 if (!dir->i_sb) {
69 printk("qnx4: no superblock on dir.\n"); 69 printk(KERN_WARNING "qnx4: no superblock on dir.\n");
70 return NULL; 70 return NULL;
71 } 71 }
72 bh = NULL; 72 bh = NULL;
@@ -124,7 +124,7 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
124 foundinode = qnx4_iget(dir->i_sb, ino); 124 foundinode = qnx4_iget(dir->i_sb, ino);
125 if (IS_ERR(foundinode)) { 125 if (IS_ERR(foundinode)) {
126 unlock_kernel(); 126 unlock_kernel();
127 QNX4DEBUG(("qnx4: lookup->iget -> error %ld\n", 127 QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
128 PTR_ERR(foundinode))); 128 PTR_ERR(foundinode)));
129 return ERR_CAST(foundinode); 129 return ERR_CAST(foundinode);
130 } 130 }
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 8047e01ef46b..dad7fb247ddc 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -17,7 +17,7 @@ config QUOTA
17 17
18config QUOTA_NETLINK_INTERFACE 18config QUOTA_NETLINK_INTERFACE
19 bool "Report quota messages through netlink interface" 19 bool "Report quota messages through netlink interface"
20 depends on QUOTA && NET 20 depends on QUOTACTL && NET
21 help 21 help
22 If you say Y here, quota warnings (about exceeding softlimit, reaching 22 If you say Y here, quota warnings (about exceeding softlimit, reaching
23 hardlimit, etc.) will be reported through netlink interface. If unsure, 23 hardlimit, etc.) will be reported through netlink interface. If unsure,
@@ -46,14 +46,21 @@ config QFMT_V1
46 format say Y here. 46 format say Y here.
47 47
48config QFMT_V2 48config QFMT_V2
49 tristate "Quota format v2 support" 49 tristate "Quota format vfsv0 and vfsv1 support"
50 depends on QUOTA 50 depends on QUOTA
51 select QUOTA_TREE 51 select QUOTA_TREE
52 help 52 help
53 This quota format allows using quotas with 32-bit UIDs/GIDs. If you 53 This config option enables kernel support for vfsv0 and vfsv1 quota
 54 need this functionality say Y here. 54 formats. Both these formats support 32-bit UIDs/GIDs and the vfsv1 format
55 also supports 64-bit inode and block quota limits. If you need this
56 functionality say Y here.
55 57
56config QUOTACTL 58config QUOTACTL
57 bool 59 bool
58 depends on XFS_QUOTA || QUOTA 60 depends on XFS_QUOTA || QUOTA
59 default y 61 default y
62
63config QUOTACTL_COMPAT
64 bool
65 depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT
66 default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 68d4f6dc0578..5f9e9e276af0 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o
3obj-$(CONFIG_QFMT_V2) += quota_v2.o 3obj-$(CONFIG_QFMT_V2) += quota_v2.o
4obj-$(CONFIG_QUOTA_TREE) += quota_tree.o 4obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
5obj-$(CONFIG_QUOTACTL) += quota.o 5obj-$(CONFIG_QUOTACTL) += quota.o
6obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o
7obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
new file mode 100644
index 000000000000..fb1892fe3e56
--- /dev/null
+++ b/fs/quota/compat.c
@@ -0,0 +1,118 @@
1
2#include <linux/syscalls.h>
3#include <linux/compat.h>
4#include <linux/quotaops.h>
5
6/*
7 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
8 * and is necessary due to alignment problems.
9 */
10struct compat_if_dqblk {
11 compat_u64 dqb_bhardlimit;
12 compat_u64 dqb_bsoftlimit;
13 compat_u64 dqb_curspace;
14 compat_u64 dqb_ihardlimit;
15 compat_u64 dqb_isoftlimit;
16 compat_u64 dqb_curinodes;
17 compat_u64 dqb_btime;
18 compat_u64 dqb_itime;
19 compat_uint_t dqb_valid;
20};
21
22/* XFS structures */
23struct compat_fs_qfilestat {
24 compat_u64 dqb_bhardlimit;
25 compat_u64 qfs_nblks;
26 compat_uint_t qfs_nextents;
27};
28
29struct compat_fs_quota_stat {
30 __s8 qs_version;
31 __u16 qs_flags;
32 __s8 qs_pad;
33 struct compat_fs_qfilestat qs_uquota;
34 struct compat_fs_qfilestat qs_gquota;
35 compat_uint_t qs_incoredqs;
36 compat_int_t qs_btimelimit;
37 compat_int_t qs_itimelimit;
38 compat_int_t qs_rtbtimelimit;
39 __u16 qs_bwarnlimit;
40 __u16 qs_iwarnlimit;
41};
42
43asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
44 qid_t id, void __user *addr)
45{
46 unsigned int cmds;
47 struct if_dqblk __user *dqblk;
48 struct compat_if_dqblk __user *compat_dqblk;
49 struct fs_quota_stat __user *fsqstat;
50 struct compat_fs_quota_stat __user *compat_fsqstat;
51 compat_uint_t data;
52 u16 xdata;
53 long ret;
54
55 cmds = cmd >> SUBCMDSHIFT;
56
57 switch (cmds) {
58 case Q_GETQUOTA:
59 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
60 compat_dqblk = addr;
61 ret = sys_quotactl(cmd, special, id, dqblk);
62 if (ret)
63 break;
64 if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
65 get_user(data, &dqblk->dqb_valid) ||
66 put_user(data, &compat_dqblk->dqb_valid))
67 ret = -EFAULT;
68 break;
69 case Q_SETQUOTA:
70 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
71 compat_dqblk = addr;
72 ret = -EFAULT;
73 if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
74 get_user(data, &compat_dqblk->dqb_valid) ||
75 put_user(data, &dqblk->dqb_valid))
76 break;
77 ret = sys_quotactl(cmd, special, id, dqblk);
78 break;
79 case Q_XGETQSTAT:
80 fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
81 compat_fsqstat = addr;
82 ret = sys_quotactl(cmd, special, id, fsqstat);
83 if (ret)
84 break;
85 ret = -EFAULT;
86 /* Copying qs_version, qs_flags, qs_pad */
87 if (copy_in_user(compat_fsqstat, fsqstat,
88 offsetof(struct compat_fs_quota_stat, qs_uquota)))
89 break;
90 /* Copying qs_uquota */
91 if (copy_in_user(&compat_fsqstat->qs_uquota,
92 &fsqstat->qs_uquota,
93 sizeof(compat_fsqstat->qs_uquota)) ||
94 get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
95 put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
96 break;
97 /* Copying qs_gquota */
98 if (copy_in_user(&compat_fsqstat->qs_gquota,
99 &fsqstat->qs_gquota,
100 sizeof(compat_fsqstat->qs_gquota)) ||
101 get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
102 put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
103 break;
104 /* Copying the rest */
105 if (copy_in_user(&compat_fsqstat->qs_incoredqs,
106 &fsqstat->qs_incoredqs,
107 sizeof(struct compat_fs_quota_stat) -
108 offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
109 get_user(xdata, &fsqstat->qs_iwarnlimit) ||
110 put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
111 break;
112 ret = 0;
113 break;
114 default:
115 ret = sys_quotactl(cmd, special, id, addr);
116 }
117 return ret;
118}
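
The compat shim above exists because i386 aligns u64 to 4 bytes while x86_64 aligns it to 8, so the same quota structs carry different padding in the two ABIs; compat_u64 keeps the 32-bit alignment, and the handler copies through a kernel-allocated native struct, fixing up the fields that shift. A userspace sketch (not part of the patch) of the layout difference, emulating compat_u64 with a typedef (GCC permits an aligned attribute on a typedef to lower alignment):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t compat_u64 __attribute__((aligned(4)));

struct native_blk {		/* 64-bit ABI layout */
	uint64_t hardlimit;
	uint32_t valid;		/* + 4 bytes of tail padding */
};

struct compat_blk {		/* 32-bit (i386) ABI layout */
	compat_u64 hardlimit;
	uint32_t valid;		/* no tail padding */
};

int main(void)
{
	printf("native: %zu bytes\n", sizeof(struct native_blk));	/* 16 */
	printf("compat: %zu bytes\n", sizeof(struct compat_blk));	/* 12 */
	return 0;
}
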
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 39b49c42a7ed..e0b870f4749f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -77,10 +77,6 @@
77#include <linux/capability.h> 77#include <linux/capability.h>
78#include <linux/quotaops.h> 78#include <linux/quotaops.h>
79#include <linux/writeback.h> /* for inode_lock, oddly enough.. */ 79#include <linux/writeback.h> /* for inode_lock, oddly enough.. */
80#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
81#include <net/netlink.h>
82#include <net/genetlink.h>
83#endif
84 80
85#include <asm/uaccess.h> 81#include <asm/uaccess.h>
86 82
@@ -104,9 +100,13 @@
104 * 100 *
105 * Any operation working on dquots via inode pointers must hold dqptr_sem. If 101 * Any operation working on dquots via inode pointers must hold dqptr_sem. If
106 * operation is just reading pointers from inode (or not using them at all) the 102 * operation is just reading pointers from inode (or not using them at all) the
107 * read lock is enough. If pointers are altered function must hold write lock 103 * read lock is enough. If pointers are altered function must hold write lock.
 108 * (these locking rules also apply for S_NOQUOTA flag in the inode - note that 104 * Special care needs to be taken about the S_NOQUOTA inode flag (marking that
109 * for altering the flag i_mutex is also needed). 105 * inode is a quota file). Functions adding pointers from inode to dquots have
106 * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they
107 * have to do all pointer modifications before dropping dqptr_sem. This makes
108 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
109 * then drops all pointers to dquots from an inode.
110 * 110 *
111 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced 111 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
112 * from inodes (dquot_alloc_space() and such don't check the dq_lock). 112 * from inodes (dquot_alloc_space() and such don't check the dq_lock).
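
The reworded comment above pins down the S_NOQUOTA protocol: quotaon sets the flag first and then drops the inode's dquot pointers, while anyone attaching pointers must re-check the flag under dqptr_sem and finish every pointer update before releasing it. A userspace pthread analogue (not kernel code, just a sketch of the ordering):

#include <pthread.h>
#include <stddef.h>

static pthread_rwlock_t dqptr_sem = PTHREAD_RWLOCK_INITIALIZER;
static int s_noquota;		/* stand-in for the S_NOQUOTA inode flag */
static void *i_dquot;		/* stand-in for inode->i_dquot[] */

static void attach_dquot(void *dq)
{
	pthread_rwlock_wrlock(&dqptr_sem);
	if (!s_noquota)		/* re-check the flag under the lock */
		i_dquot = dq;	/* all pointer updates before unlock */
	pthread_rwlock_unlock(&dqptr_sem);
}

static void quota_on_file(void)
{
	pthread_rwlock_wrlock(&dqptr_sem);
	s_noquota = 1;		/* set the flag first ... */
	i_dquot = NULL;		/* ... then drop the pointers */
	pthread_rwlock_unlock(&dqptr_sem);
}

int main(void)
{
	quota_on_file();
	attach_dquot(&dqptr_sem);	/* refused: flag already set */
	return i_dquot != NULL;
}
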
@@ -229,6 +229,9 @@ static struct hlist_head *dquot_hash;
229struct dqstats dqstats; 229struct dqstats dqstats;
230EXPORT_SYMBOL(dqstats); 230EXPORT_SYMBOL(dqstats);
231 231
232static qsize_t inode_get_rsv_space(struct inode *inode);
233static void __dquot_initialize(struct inode *inode, int type);
234
232static inline unsigned int 235static inline unsigned int
233hashfn(const struct super_block *sb, unsigned int id, int type) 236hashfn(const struct super_block *sb, unsigned int id, int type)
234{ 237{
@@ -327,6 +330,30 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
327} 330}
328EXPORT_SYMBOL(dquot_mark_dquot_dirty); 331EXPORT_SYMBOL(dquot_mark_dquot_dirty);
329 332
333/* Dirtify all the dquots - this can block when journalling */
334static inline int mark_all_dquot_dirty(struct dquot * const *dquot)
335{
336 int ret, err, cnt;
337
338 ret = err = 0;
339 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
340 if (dquot[cnt])
341 /* Even in case of error we have to continue */
342 ret = mark_dquot_dirty(dquot[cnt]);
343 if (!err)
344 err = ret;
345 }
346 return err;
347}
348
349static inline void dqput_all(struct dquot **dquot)
350{
351 unsigned int cnt;
352
353 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
354 dqput(dquot[cnt]);
355}
356
330/* This function needs dq_list_lock */ 357/* This function needs dq_list_lock */
331static inline int clear_dquot_dirty(struct dquot *dquot) 358static inline int clear_dquot_dirty(struct dquot *dquot)
332{ 359{
@@ -544,7 +571,7 @@ out:
544} 571}
545EXPORT_SYMBOL(dquot_scan_active); 572EXPORT_SYMBOL(dquot_scan_active);
546 573
547int vfs_quota_sync(struct super_block *sb, int type) 574int vfs_quota_sync(struct super_block *sb, int type, int wait)
548{ 575{
549 struct list_head *dirty; 576 struct list_head *dirty;
550 struct dquot *dquot; 577 struct dquot *dquot;
@@ -589,6 +616,33 @@ int vfs_quota_sync(struct super_block *sb, int type)
589 spin_unlock(&dq_list_lock); 616 spin_unlock(&dq_list_lock);
590 mutex_unlock(&dqopt->dqonoff_mutex); 617 mutex_unlock(&dqopt->dqonoff_mutex);
591 618
619 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
620 return 0;
621
 622 /* This is not very clever (or fast) but currently I don't know of
 623 * any other simple way of getting quota data to disk, and we must get
 624 * it there for userspace to see it... */
625 if (sb->s_op->sync_fs)
626 sb->s_op->sync_fs(sb, 1);
627 sync_blockdev(sb->s_bdev);
628
629 /*
630 * Now when everything is written we can discard the pagecache so
631 * that userspace sees the changes.
632 */
633 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
634 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
635 if (type != -1 && cnt != type)
636 continue;
637 if (!sb_has_quota_active(sb, cnt))
638 continue;
639 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
640 I_MUTEX_QUOTA);
641 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
642 mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
643 }
644 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
645
592 return 0; 646 return 0;
593} 647}
594EXPORT_SYMBOL(vfs_quota_sync); 648EXPORT_SYMBOL(vfs_quota_sync);
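
vfs_quota_sync() above gains a wait argument: when set (and quota lives in regular files rather than filesystem-internal storage), it forces the dquots to disk via sync_fs/sync_blockdev and then truncates the quota files' pagecache so userspace reads see fresh data. From userspace this path is reached through quotactl(2); a sketch (not part of the patch; /dev/sda1 is a placeholder device):

#include <stdio.h>
#include <sys/quota.h>

int main(void)
{
	/* ask the kernel to sync quota state for one filesystem */
	if (quotactl(QCMD(Q_SYNC, USRQUOTA), "/dev/sda1", 0, NULL))
		perror("quotactl");
	return 0;
}
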
@@ -820,11 +874,14 @@ static int dqinit_needed(struct inode *inode, int type)
820static void add_dquot_ref(struct super_block *sb, int type) 874static void add_dquot_ref(struct super_block *sb, int type)
821{ 875{
822 struct inode *inode, *old_inode = NULL; 876 struct inode *inode, *old_inode = NULL;
877 int reserved = 0;
823 878
824 spin_lock(&inode_lock); 879 spin_lock(&inode_lock);
825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 880 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
826 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 881 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
827 continue; 882 continue;
883 if (unlikely(inode_get_rsv_space(inode) > 0))
884 reserved = 1;
828 if (!atomic_read(&inode->i_writecount)) 885 if (!atomic_read(&inode->i_writecount))
829 continue; 886 continue;
830 if (!dqinit_needed(inode, type)) 887 if (!dqinit_needed(inode, type))
@@ -834,7 +891,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
834 spin_unlock(&inode_lock); 891 spin_unlock(&inode_lock);
835 892
836 iput(old_inode); 893 iput(old_inode);
837 sb->dq_op->initialize(inode, type); 894 __dquot_initialize(inode, type);
838 /* We hold a reference to 'inode' so it couldn't have been 895 /* We hold a reference to 'inode' so it couldn't have been
839 * removed from s_inodes list while we dropped the inode_lock. 896 * removed from s_inodes list while we dropped the inode_lock.
840 * We cannot iput the inode now as we can be holding the last 897 * We cannot iput the inode now as we can be holding the last
@@ -845,6 +902,12 @@ static void add_dquot_ref(struct super_block *sb, int type)
845 } 902 }
846 spin_unlock(&inode_lock); 903 spin_unlock(&inode_lock);
847 iput(old_inode); 904 iput(old_inode);
905
906 if (reserved) {
907 printk(KERN_WARNING "VFS (%s): Writes happened before quota"
 908 " was turned on, so quota information is probably "
909 "inconsistent. Please run quotacheck(8).\n", sb->s_id);
910 }
848} 911}
849 912
850/* 913/*
@@ -958,10 +1021,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
958/* 1021/*
959 * Claim reserved quota space 1022 * Claim reserved quota space
960 */ 1023 */
961static void dquot_claim_reserved_space(struct dquot *dquot, 1024static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
962 qsize_t number)
963{ 1025{
964 WARN_ON(dquot->dq_dqb.dqb_rsvspace < number); 1026 if (dquot->dq_dqb.dqb_rsvspace < number) {
1027 WARN_ON_ONCE(1);
1028 number = dquot->dq_dqb.dqb_rsvspace;
1029 }
965 dquot->dq_dqb.dqb_curspace += number; 1030 dquot->dq_dqb.dqb_curspace += number;
966 dquot->dq_dqb.dqb_rsvspace -= number; 1031 dquot->dq_dqb.dqb_rsvspace -= number;
967} 1032}
@@ -969,7 +1034,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot,
969static inline 1034static inline
970void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) 1035void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
971{ 1036{
972 dquot->dq_dqb.dqb_rsvspace -= number; 1037 if (dquot->dq_dqb.dqb_rsvspace >= number)
1038 dquot->dq_dqb.dqb_rsvspace -= number;
1039 else {
1040 WARN_ON_ONCE(1);
1041 dquot->dq_dqb.dqb_rsvspace = 0;
1042 }
973} 1043}
974 1044
975static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) 1045static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
@@ -1071,73 +1141,6 @@ static void print_warning(struct dquot *dquot, const int warntype)
1071} 1141}
1072#endif 1142#endif
1073 1143
1074#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
1075
1076/* Netlink family structure for quota */
1077static struct genl_family quota_genl_family = {
1078 .id = GENL_ID_GENERATE,
1079 .hdrsize = 0,
1080 .name = "VFS_DQUOT",
1081 .version = 1,
1082 .maxattr = QUOTA_NL_A_MAX,
1083};
1084
1085/* Send warning to userspace about user which exceeded quota */
1086static void send_warning(const struct dquot *dquot, const char warntype)
1087{
1088 static atomic_t seq;
1089 struct sk_buff *skb;
1090 void *msg_head;
1091 int ret;
1092 int msg_size = 4 * nla_total_size(sizeof(u32)) +
1093 2 * nla_total_size(sizeof(u64));
1094
1095 /* We have to allocate using GFP_NOFS as we are called from a
1096 * filesystem performing write and thus further recursion into
1097 * the fs to free some data could cause deadlocks. */
1098 skb = genlmsg_new(msg_size, GFP_NOFS);
1099 if (!skb) {
1100 printk(KERN_ERR
1101 "VFS: Not enough memory to send quota warning.\n");
1102 return;
1103 }
1104 msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
1105 &quota_genl_family, 0, QUOTA_NL_C_WARNING);
1106 if (!msg_head) {
1107 printk(KERN_ERR
1108 "VFS: Cannot store netlink header in quota warning.\n");
1109 goto err_out;
1110 }
1111 ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, dquot->dq_type);
1112 if (ret)
1113 goto attr_err_out;
1114 ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, dquot->dq_id);
1115 if (ret)
1116 goto attr_err_out;
1117 ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
1118 if (ret)
1119 goto attr_err_out;
1120 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR,
1121 MAJOR(dquot->dq_sb->s_dev));
1122 if (ret)
1123 goto attr_err_out;
1124 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR,
1125 MINOR(dquot->dq_sb->s_dev));
1126 if (ret)
1127 goto attr_err_out;
1128 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
1129 if (ret)
1130 goto attr_err_out;
1131 genlmsg_end(skb, msg_head);
1132
1133 genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
1134 return;
1135attr_err_out:
1136 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
1137err_out:
1138 kfree_skb(skb);
1139}
1140#endif
1141/* 1144/*
1142 * Write warnings to the console and send warning messages over netlink. 1145 * Write warnings to the console and send warning messages over netlink.
1143 * 1146 *
@@ -1145,18 +1148,20 @@ err_out:
1145 */ 1148 */
1146static void flush_warnings(struct dquot *const *dquots, char *warntype) 1149static void flush_warnings(struct dquot *const *dquots, char *warntype)
1147{ 1150{
1151 struct dquot *dq;
1148 int i; 1152 int i;
1149 1153
1150 for (i = 0; i < MAXQUOTAS; i++) 1154 for (i = 0; i < MAXQUOTAS; i++) {
1151 if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN && 1155 dq = dquots[i];
1152 !warning_issued(dquots[i], warntype[i])) { 1156 if (dq && warntype[i] != QUOTA_NL_NOWARN &&
1157 !warning_issued(dq, warntype[i])) {
1153#ifdef CONFIG_PRINT_QUOTA_WARNING 1158#ifdef CONFIG_PRINT_QUOTA_WARNING
1154 print_warning(dquots[i], warntype[i]); 1159 print_warning(dq, warntype[i]);
1155#endif
1156#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
1157 send_warning(dquots[i], warntype[i]);
1158#endif 1160#endif
1161 quota_send_warning(dq->dq_type, dq->dq_id,
1162 dq->dq_sb->s_dev, warntype[i]);
1159 } 1163 }
1164 }
1160} 1165}
1161 1166
1162static int ignore_hardlimit(struct dquot *dquot) 1167static int ignore_hardlimit(struct dquot *dquot)
@@ -1176,13 +1181,13 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1176 *warntype = QUOTA_NL_NOWARN; 1181 *warntype = QUOTA_NL_NOWARN;
1177 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || 1182 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1178 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1183 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1179 return QUOTA_OK; 1184 return 0;
1180 1185
1181 if (dquot->dq_dqb.dqb_ihardlimit && 1186 if (dquot->dq_dqb.dqb_ihardlimit &&
1182 newinodes > dquot->dq_dqb.dqb_ihardlimit && 1187 newinodes > dquot->dq_dqb.dqb_ihardlimit &&
1183 !ignore_hardlimit(dquot)) { 1188 !ignore_hardlimit(dquot)) {
1184 *warntype = QUOTA_NL_IHARDWARN; 1189 *warntype = QUOTA_NL_IHARDWARN;
1185 return NO_QUOTA; 1190 return -EDQUOT;
1186 } 1191 }
1187 1192
1188 if (dquot->dq_dqb.dqb_isoftlimit && 1193 if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1191,7 +1196,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1191 get_seconds() >= dquot->dq_dqb.dqb_itime && 1196 get_seconds() >= dquot->dq_dqb.dqb_itime &&
1192 !ignore_hardlimit(dquot)) { 1197 !ignore_hardlimit(dquot)) {
1193 *warntype = QUOTA_NL_ISOFTLONGWARN; 1198 *warntype = QUOTA_NL_ISOFTLONGWARN;
1194 return NO_QUOTA; 1199 return -EDQUOT;
1195 } 1200 }
1196 1201
1197 if (dquot->dq_dqb.dqb_isoftlimit && 1202 if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1202,7 +1207,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1202 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; 1207 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
1203 } 1208 }
1204 1209
1205 return QUOTA_OK; 1210 return 0;
1206} 1211}
1207 1212
1208/* needs dq_data_lock */ 1213/* needs dq_data_lock */
@@ -1214,7 +1219,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1214 *warntype = QUOTA_NL_NOWARN; 1219 *warntype = QUOTA_NL_NOWARN;
1215 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || 1220 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
1216 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1221 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1217 return QUOTA_OK; 1222 return 0;
1218 1223
1219 tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace 1224 tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
1220 + space; 1225 + space;
@@ -1224,7 +1229,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1224 !ignore_hardlimit(dquot)) { 1229 !ignore_hardlimit(dquot)) {
1225 if (!prealloc) 1230 if (!prealloc)
1226 *warntype = QUOTA_NL_BHARDWARN; 1231 *warntype = QUOTA_NL_BHARDWARN;
1227 return NO_QUOTA; 1232 return -EDQUOT;
1228 } 1233 }
1229 1234
1230 if (dquot->dq_dqb.dqb_bsoftlimit && 1235 if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1234,7 +1239,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1234 !ignore_hardlimit(dquot)) { 1239 !ignore_hardlimit(dquot)) {
1235 if (!prealloc) 1240 if (!prealloc)
1236 *warntype = QUOTA_NL_BSOFTLONGWARN; 1241 *warntype = QUOTA_NL_BSOFTLONGWARN;
1237 return NO_QUOTA; 1242 return -EDQUOT;
1238 } 1243 }
1239 1244
1240 if (dquot->dq_dqb.dqb_bsoftlimit && 1245 if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1250,10 +1255,10 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1250 * We don't allow preallocation to exceed softlimit so exceeding will 1255 * We don't allow preallocation to exceed softlimit so exceeding will
1251 * be always printed 1256 * be always printed
1252 */ 1257 */
1253 return NO_QUOTA; 1258 return -EDQUOT;
1254 } 1259 }
1255 1260
1256 return QUOTA_OK; 1261 return 0;
1257} 1262}
1258 1263
1259static int info_idq_free(struct dquot *dquot, qsize_t inodes) 1264static int info_idq_free(struct dquot *dquot, qsize_t inodes)
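
The check_idq()/check_bdq() hunks above move from the private QUOTA_OK/NO_QUOTA convention to the kernel-wide one: 0 on success, a negative errno (-EDQUOT) on failure, so callers can test and propagate the value directly. A sketch (not part of the patch) of the convention:

#include <errno.h>
#include <stdio.h>
#include <string.h>

static int check_limit(unsigned long long cur, unsigned long long hard)
{
	return (hard && cur > hard) ? -EDQUOT : 0;	/* 0 == success */
}

int main(void)
{
	int ret = check_limit(10, 5);

	if (ret)	/* callers just test and propagate */
		printf("alloc failed: %s\n", strerror(-ret));
	return 0;
}
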
@@ -1287,25 +1292,32 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
1287 return QUOTA_NL_BHARDBELOW; 1292 return QUOTA_NL_BHARDBELOW;
1288 return QUOTA_NL_NOWARN; 1293 return QUOTA_NL_NOWARN;
1289} 1294}
1295
1290/* 1296/*
1291 * Initialize quota pointers in inode 1297 * Initialize quota pointers in inode
1292 * We do things in a bit complicated way but by that we avoid calling 1298 *
1293 * dqget() and thus filesystem callbacks under dqptr_sem. 1299 * We do things in a bit complicated way but by that we avoid calling
1300 * dqget() and thus filesystem callbacks under dqptr_sem.
1301 *
1302 * It is better to call this function outside of any transaction as it
1303 * might need a lot of space in journal for dquot structure allocation.
1294 */ 1304 */
1295int dquot_initialize(struct inode *inode, int type) 1305static void __dquot_initialize(struct inode *inode, int type)
1296{ 1306{
1297 unsigned int id = 0; 1307 unsigned int id = 0;
1298 int cnt, ret = 0; 1308 int cnt;
1299 struct dquot *got[MAXQUOTAS] = { NULL, NULL }; 1309 struct dquot *got[MAXQUOTAS];
1300 struct super_block *sb = inode->i_sb; 1310 struct super_block *sb = inode->i_sb;
1311 qsize_t rsv;
1301 1312
1302 /* First test before acquiring mutex - solves deadlocks when we 1313 /* First test before acquiring mutex - solves deadlocks when we
1303 * re-enter the quota code and are already holding the mutex */ 1314 * re-enter the quota code and are already holding the mutex */
1304 if (IS_NOQUOTA(inode)) 1315 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1305 return 0; 1316 return;
1306 1317
1307 /* First get references to structures we might need. */ 1318 /* First get references to structures we might need. */
1308 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1319 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1320 got[cnt] = NULL;
1309 if (type != -1 && cnt != type) 1321 if (type != -1 && cnt != type)
1310 continue; 1322 continue;
1311 switch (cnt) { 1323 switch (cnt) {
@@ -1320,7 +1332,6 @@ int dquot_initialize(struct inode *inode, int type)
1320 } 1332 }
1321 1333
1322 down_write(&sb_dqopt(sb)->dqptr_sem); 1334 down_write(&sb_dqopt(sb)->dqptr_sem);
1323 /* Having dqptr_sem we know NOQUOTA flags can't be altered... */
1324 if (IS_NOQUOTA(inode)) 1335 if (IS_NOQUOTA(inode))
1325 goto out_err; 1336 goto out_err;
1326 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1337 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1332,21 +1343,31 @@ int dquot_initialize(struct inode *inode, int type)
1332 if (!inode->i_dquot[cnt]) { 1343 if (!inode->i_dquot[cnt]) {
1333 inode->i_dquot[cnt] = got[cnt]; 1344 inode->i_dquot[cnt] = got[cnt];
1334 got[cnt] = NULL; 1345 got[cnt] = NULL;
1346 /*
1347 * Make quota reservation system happy if someone
1348 * did a write before quota was turned on
1349 */
1350 rsv = inode_get_rsv_space(inode);
1351 if (unlikely(rsv))
1352 dquot_resv_space(inode->i_dquot[cnt], rsv);
1335 } 1353 }
1336 } 1354 }
1337out_err: 1355out_err:
1338 up_write(&sb_dqopt(sb)->dqptr_sem); 1356 up_write(&sb_dqopt(sb)->dqptr_sem);
1339 /* Drop unused references */ 1357 /* Drop unused references */
1340 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1358 dqput_all(got);
1341 dqput(got[cnt]); 1359}
1342 return ret; 1360
1361void dquot_initialize(struct inode *inode)
1362{
1363 __dquot_initialize(inode, -1);
1343} 1364}
1344EXPORT_SYMBOL(dquot_initialize); 1365EXPORT_SYMBOL(dquot_initialize);
1345 1366
1346/* 1367/*
1347 * Release all quotas referenced by inode 1368 * Release all quotas referenced by inode
1348 */ 1369 */
1349int dquot_drop(struct inode *inode) 1370static void __dquot_drop(struct inode *inode)
1350{ 1371{
1351 int cnt; 1372 int cnt;
1352 struct dquot *put[MAXQUOTAS]; 1373 struct dquot *put[MAXQUOTAS];
@@ -1357,54 +1378,128 @@ int dquot_drop(struct inode *inode)
1357 inode->i_dquot[cnt] = NULL; 1378 inode->i_dquot[cnt] = NULL;
1358 } 1379 }
1359 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1380 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1381 dqput_all(put);
1382}
1360 1383
1361 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1384void dquot_drop(struct inode *inode)
1362 dqput(put[cnt]); 1385{
1363 return 0; 1386 int cnt;
1387
1388 if (IS_NOQUOTA(inode))
1389 return;
1390
1391 /*
1392 * Test before calling to rule out calls from proc and such
1393 * where we are not allowed to block. Note that this is
 1394 * actually a reliable test even without the lock - the caller
 1395 * must ensure that nobody can come after the DQUOT_DROP and
1396 * add quota pointers back anyway.
1397 */
1398 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1399 if (inode->i_dquot[cnt])
1400 break;
1401 }
1402
1403 if (cnt < MAXQUOTAS)
1404 __dquot_drop(inode);
1364} 1405}
1365EXPORT_SYMBOL(dquot_drop); 1406EXPORT_SYMBOL(dquot_drop);
1366 1407
1367/* Wrapper to remove references to quota structures from inode */ 1408/*
1368void vfs_dq_drop(struct inode *inode) 1409 * inode_reserved_space is managed internally by quota, and protected by
1369{ 1410 * i_lock similar to i_blocks+i_bytes.
1370 /* Here we can get arbitrary inode from clear_inode() so we have 1411 */
1371 * to be careful. OTOH we don't need locking as quota operations 1412static qsize_t *inode_reserved_space(struct inode * inode)
1372 * are allowed to change only at mount time */ 1413{
 1373 if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op 1414 /* Filesystem must explicitly define its own method in order to use
1374 && inode->i_sb->dq_op->drop) { 1415 * quota reservation interface */
1375 int cnt; 1416 BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
1376 /* Test before calling to rule out calls from proc and such 1417 return inode->i_sb->dq_op->get_reserved_space(inode);
1377 * where we are not allowed to block. Note that this is 1418}
1378 * actually reliable test even without the lock - the caller 1419
1379 * must assure that nobody can come after the DQUOT_DROP and 1420void inode_add_rsv_space(struct inode *inode, qsize_t number)
1380 * add quota pointers back anyway */ 1421{
1381 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1422 spin_lock(&inode->i_lock);
1382 if (inode->i_dquot[cnt]) 1423 *inode_reserved_space(inode) += number;
1383 break; 1424 spin_unlock(&inode->i_lock);
1384 if (cnt < MAXQUOTAS) 1425}
1385 inode->i_sb->dq_op->drop(inode); 1426EXPORT_SYMBOL(inode_add_rsv_space);
1386 } 1427
1387} 1428void inode_claim_rsv_space(struct inode *inode, qsize_t number)
1388EXPORT_SYMBOL(vfs_dq_drop); 1429{
1430 spin_lock(&inode->i_lock);
1431 *inode_reserved_space(inode) -= number;
1432 __inode_add_bytes(inode, number);
1433 spin_unlock(&inode->i_lock);
1434}
1435EXPORT_SYMBOL(inode_claim_rsv_space);
1436
1437void inode_sub_rsv_space(struct inode *inode, qsize_t number)
1438{
1439 spin_lock(&inode->i_lock);
1440 *inode_reserved_space(inode) -= number;
1441 spin_unlock(&inode->i_lock);
1442}
1443EXPORT_SYMBOL(inode_sub_rsv_space);
1444
1445static qsize_t inode_get_rsv_space(struct inode *inode)
1446{
1447 qsize_t ret;
1448
1449 if (!inode->i_sb->dq_op->get_reserved_space)
1450 return 0;
1451 spin_lock(&inode->i_lock);
1452 ret = *inode_reserved_space(inode);
1453 spin_unlock(&inode->i_lock);
1454 return ret;
1455}
1456
1457static void inode_incr_space(struct inode *inode, qsize_t number,
1458 int reserve)
1459{
1460 if (reserve)
1461 inode_add_rsv_space(inode, number);
1462 else
1463 inode_add_bytes(inode, number);
1464}
1465
1466static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1467{
1468 if (reserve)
1469 inode_sub_rsv_space(inode, number);
1470 else
1471 inode_sub_bytes(inode, number);
1472}
1389 1473
1390/* 1474/*
 1391 * Following four functions update i_blocks+i_bytes fields and 1475 * This function updates i_blocks+i_bytes fields and quota information
1392 * quota information (together with appropriate checks) 1476 * (together with appropriate checks).
1393 * NOTE: We absolutely rely on the fact that caller dirties 1477 *
1394 * the inode (usually macros in quotaops.h care about this) and 1478 * NOTE: We absolutely rely on the fact that caller dirties the inode
1395 * holds a handle for the current transaction so that dquot write and 1479 * (usually helpers in quotaops.h care about this) and holds a handle for
1396 * inode write go into the same transaction. 1480 * the current transaction so that dquot write and inode write go into the
1481 * same transaction.
1397 */ 1482 */
1398 1483
1399/* 1484/*
1400 * This operation can block, but only after everything is updated 1485 * This operation can block, but only after everything is updated
1401 */ 1486 */
1402int __dquot_alloc_space(struct inode *inode, qsize_t number, 1487int __dquot_alloc_space(struct inode *inode, qsize_t number,
1403 int warn, int reserve) 1488 int warn, int reserve)
1404{ 1489{
1405 int cnt, ret = QUOTA_OK; 1490 int cnt, ret = 0;
1406 char warntype[MAXQUOTAS]; 1491 char warntype[MAXQUOTAS];
1407 1492
1493 /*
1494 * First test before acquiring mutex - solves deadlocks when we
1495 * re-enter the quota code and are already holding the mutex
1496 */
1497 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1498 inode_incr_space(inode, number, reserve);
1499 goto out;
1500 }
1501
1502 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1408 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1503 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1409 warntype[cnt] = QUOTA_NL_NOWARN; 1504 warntype[cnt] = QUOTA_NL_NOWARN;
1410 1505
@@ -1412,10 +1507,11 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1412 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1507 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1413 if (!inode->i_dquot[cnt]) 1508 if (!inode->i_dquot[cnt])
1414 continue; 1509 continue;
1415 if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) 1510 ret = check_bdq(inode->i_dquot[cnt], number, !warn,
1416 == NO_QUOTA) { 1511 warntype+cnt);
1417 ret = NO_QUOTA; 1512 if (ret) {
1418 goto out_unlock; 1513 spin_unlock(&dq_data_lock);
1514 goto out_flush_warn;
1419 } 1515 }
1420 } 1516 }
1421 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1517 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1426,131 +1522,73 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1426 else 1522 else
1427 dquot_incr_space(inode->i_dquot[cnt], number); 1523 dquot_incr_space(inode->i_dquot[cnt], number);
1428 } 1524 }
1429 if (!reserve) 1525 inode_incr_space(inode, number, reserve);
1430 inode_add_bytes(inode, number);
1431out_unlock:
1432 spin_unlock(&dq_data_lock); 1526 spin_unlock(&dq_data_lock);
1433 flush_warnings(inode->i_dquot, warntype);
1434 return ret;
1435}
1436
1437int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
1438{
1439 int cnt, ret = QUOTA_OK;
1440
1441 /*
1442 * First test before acquiring mutex - solves deadlocks when we
1443 * re-enter the quota code and are already holding the mutex
1444 */
1445 if (IS_NOQUOTA(inode)) {
1446 inode_add_bytes(inode, number);
1447 goto out;
1448 }
1449 1527
1450 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1528 if (reserve)
1451 if (IS_NOQUOTA(inode)) { 1529 goto out_flush_warn;
1452 inode_add_bytes(inode, number); 1530 mark_all_dquot_dirty(inode->i_dquot);
1453 goto out_unlock; 1531out_flush_warn:
1454 } 1532 flush_warnings(inode->i_dquot, warntype);
1455
1456 ret = __dquot_alloc_space(inode, number, warn, 0);
1457 if (ret == NO_QUOTA)
1458 goto out_unlock;
1459
1460 /* Dirtify all the dquots - this can block when journalling */
1461 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1462 if (inode->i_dquot[cnt])
1463 mark_dquot_dirty(inode->i_dquot[cnt]);
1464out_unlock:
1465 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1466out:
1467 return ret;
1468}
1469EXPORT_SYMBOL(dquot_alloc_space);
1470
1471int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
1472{
1473 int ret = QUOTA_OK;
1474
1475 if (IS_NOQUOTA(inode))
1476 goto out;
1477
1478 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1479 if (IS_NOQUOTA(inode))
1480 goto out_unlock;
1481
1482 ret = __dquot_alloc_space(inode, number, warn, 1);
1483out_unlock:
1484 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1533 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1485out: 1534out:
1486 return ret; 1535 return ret;
1487} 1536}
1488EXPORT_SYMBOL(dquot_reserve_space); 1537EXPORT_SYMBOL(__dquot_alloc_space);
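
Note the changed calling convention: __dquot_alloc_space() now returns 0 on
success and a negative errno on failure (presumably -EDQUOT, coming from
check_bdq()), replacing QUOTA_OK/NO_QUOTA, and the warn flag is inverted into
a no-warn argument internally. A hedged sketch of a filesystem block
allocation path using it (hypothetical caller):

	static int example_alloc_blocks(struct inode *inode, qsize_t bytes)
	{
		int ret;

		/* warn=1: emit quota warnings; reserve=0: charge space immediately */
		ret = __dquot_alloc_space(inode, bytes, 1, 0);
		if (ret)
			return ret;	/* over quota; nothing was accounted */

		/* ... allocate the blocks on disk ... */
		return 0;
	}
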
1489 1538
1490/* 1539/*
1491 * This operation can block, but only after everything is updated 1540 * This operation can block, but only after everything is updated
1492 */ 1541 */
1493int dquot_alloc_inode(const struct inode *inode, qsize_t number) 1542int dquot_alloc_inode(const struct inode *inode)
1494{ 1543{
1495 int cnt, ret = NO_QUOTA; 1544 int cnt, ret = 0;
1496 char warntype[MAXQUOTAS]; 1545 char warntype[MAXQUOTAS];
1497 1546
1498 /* First test before acquiring mutex - solves deadlocks when we 1547 /* First test before acquiring mutex - solves deadlocks when we
1499 * re-enter the quota code and are already holding the mutex */ 1548 * re-enter the quota code and are already holding the mutex */
1500 if (IS_NOQUOTA(inode)) 1549 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1501 return QUOTA_OK; 1550 return 0;
1502 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1551 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1503 warntype[cnt] = QUOTA_NL_NOWARN; 1552 warntype[cnt] = QUOTA_NL_NOWARN;
1504 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1553 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1505 if (IS_NOQUOTA(inode)) {
1506 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1507 return QUOTA_OK;
1508 }
1509 spin_lock(&dq_data_lock); 1554 spin_lock(&dq_data_lock);
1510 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1555 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1511 if (!inode->i_dquot[cnt]) 1556 if (!inode->i_dquot[cnt])
1512 continue; 1557 continue;
1513 if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) 1558 ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt);
1514 == NO_QUOTA) 1559 if (ret)
1515 goto warn_put_all; 1560 goto warn_put_all;
1516 } 1561 }
1517 1562
1518 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1563 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1519 if (!inode->i_dquot[cnt]) 1564 if (!inode->i_dquot[cnt])
1520 continue; 1565 continue;
1521 dquot_incr_inodes(inode->i_dquot[cnt], number); 1566 dquot_incr_inodes(inode->i_dquot[cnt], 1);
1522 } 1567 }
1523 ret = QUOTA_OK; 1568
1524warn_put_all: 1569warn_put_all:
1525 spin_unlock(&dq_data_lock); 1570 spin_unlock(&dq_data_lock);
1526 if (ret == QUOTA_OK) 1571 if (ret == 0)
1527 /* Dirtify all the dquots - this can block when journalling */ 1572 mark_all_dquot_dirty(inode->i_dquot);
1528 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1529 if (inode->i_dquot[cnt])
1530 mark_dquot_dirty(inode->i_dquot[cnt]);
1531 flush_warnings(inode->i_dquot, warntype); 1573 flush_warnings(inode->i_dquot, warntype);
1532 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1574 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1533 return ret; 1575 return ret;
1534} 1576}
1535EXPORT_SYMBOL(dquot_alloc_inode); 1577EXPORT_SYMBOL(dquot_alloc_inode);
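
dquot_alloc_inode() drops its number argument since every caller allocated
exactly one inode at a time; the return value likewise moves from
QUOTA_OK/NO_QUOTA to 0/errno. A sketch of an updated create path
(hypothetical caller):

	static int example_create(struct inode *inode)
	{
		int ret;

		ret = dquot_alloc_inode(inode);	/* was: dquot_alloc_inode(inode, 1) */
		if (ret)
			return ret;		/* inode quota exceeded */
		/* ... continue creating the file ... */
		return 0;
	}
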
1536 1578
1537int dquot_claim_space(struct inode *inode, qsize_t number) 1579/*
1580 * Convert in-memory reserved quotas to real consumed quotas
1581 */
1582int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1538{ 1583{
1539 int cnt; 1584 int cnt;
1540 int ret = QUOTA_OK;
1541 1585
1542 if (IS_NOQUOTA(inode)) { 1586 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1543 inode_add_bytes(inode, number); 1587 inode_claim_rsv_space(inode, number);
1544 goto out; 1588 return 0;
1545 } 1589 }
1546 1590
1547 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1591 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1548 if (IS_NOQUOTA(inode)) {
1549 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1550 inode_add_bytes(inode, number);
1551 goto out;
1552 }
1553
1554 spin_lock(&dq_data_lock); 1592 spin_lock(&dq_data_lock);
1555 /* Claim reserved quotas to allocated quotas */ 1593 /* Claim reserved quotas to allocated quotas */
1556 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1594 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1559,191 +1597,129 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
1559 number); 1597 number);
1560 } 1598 }
1561 /* Update inode bytes */ 1599 /* Update inode bytes */
1562 inode_add_bytes(inode, number); 1600 inode_claim_rsv_space(inode, number);
1563 spin_unlock(&dq_data_lock); 1601 spin_unlock(&dq_data_lock);
1564 /* Dirtify all the dquots - this can block when journalling */ 1602 mark_all_dquot_dirty(inode->i_dquot);
1565 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1566 if (inode->i_dquot[cnt])
1567 mark_dquot_dirty(inode->i_dquot[cnt]);
1568 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1603 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1569out: 1604 return 0;
1570 return ret;
1571}
1572EXPORT_SYMBOL(dquot_claim_space);
1573
1574/*
1575 * Release reserved quota space
1576 */
1577void dquot_release_reserved_space(struct inode *inode, qsize_t number)
1578{
1579 int cnt;
1580
1581 if (IS_NOQUOTA(inode))
1582 goto out;
1583
1584 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1585 if (IS_NOQUOTA(inode))
1586 goto out_unlock;
1587
1588 spin_lock(&dq_data_lock);
1589 /* Release reserved dquots */
1590 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1591 if (inode->i_dquot[cnt])
1592 dquot_free_reserved_space(inode->i_dquot[cnt], number);
1593 }
1594 spin_unlock(&dq_data_lock);
1595
1596out_unlock:
1597 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1598out:
1599 return;
1600} 1605}
1601EXPORT_SYMBOL(dquot_release_reserved_space); 1606EXPORT_SYMBOL(dquot_claim_space_nodirty);
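
Together with the reserve flag of __dquot_alloc_space(), this gives
delayed-allocation filesystems a two-phase pattern: reserve at write time,
claim when the blocks are finally allocated. A sketch under that assumption
(hypothetical caller; the _nodirty suffix suggests the caller is expected to
dirty the inode itself):

	static int example_delalloc_write(struct inode *inode, qsize_t bytes)
	{
		int ret;

		/* Phase 1: warn=1, reserve=1 - charge only the reserved counters. */
		ret = __dquot_alloc_space(inode, bytes, 1, 1);
		if (ret)
			return ret;

		/* ... later, at writeback time, once real blocks exist ... */

		/* Phase 2: move the reservation into consumed quota and i_bytes. */
		dquot_claim_space_nodirty(inode, bytes);
		return 0;
	}
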
1602 1607
1603/* 1608/*
1604 * This operation can block, but only after everything is updated 1609 * This operation can block, but only after everything is updated
1605 */ 1610 */
1606int dquot_free_space(struct inode *inode, qsize_t number) 1611void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
1607{ 1612{
1608 unsigned int cnt; 1613 unsigned int cnt;
1609 char warntype[MAXQUOTAS]; 1614 char warntype[MAXQUOTAS];
1610 1615
1611 /* First test before acquiring mutex - solves deadlocks when we 1616 /* First test before acquiring mutex - solves deadlocks when we
1612 * re-enter the quota code and are already holding the mutex */ 1617 * re-enter the quota code and are already holding the mutex */
1613 if (IS_NOQUOTA(inode)) { 1618 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1614out_sub: 1619 inode_decr_space(inode, number, reserve);
1615 inode_sub_bytes(inode, number); 1620 return;
1616 return QUOTA_OK;
1617 } 1621 }
1618 1622
1619 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1623 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1620 /* Now recheck reliably when holding dqptr_sem */
1621 if (IS_NOQUOTA(inode)) {
1622 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1623 goto out_sub;
1624 }
1625 spin_lock(&dq_data_lock); 1624 spin_lock(&dq_data_lock);
1626 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1625 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1627 if (!inode->i_dquot[cnt]) 1626 if (!inode->i_dquot[cnt])
1628 continue; 1627 continue;
1629 warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); 1628 warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
1630 dquot_decr_space(inode->i_dquot[cnt], number); 1629 if (reserve)
1630 dquot_free_reserved_space(inode->i_dquot[cnt], number);
1631 else
1632 dquot_decr_space(inode->i_dquot[cnt], number);
1631 } 1633 }
1632 inode_sub_bytes(inode, number); 1634 inode_decr_space(inode, number, reserve);
1633 spin_unlock(&dq_data_lock); 1635 spin_unlock(&dq_data_lock);
1634 /* Dirtify all the dquots - this can block when journalling */ 1636
1635 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1637 if (reserve)
1636 if (inode->i_dquot[cnt]) 1638 goto out_unlock;
1637 mark_dquot_dirty(inode->i_dquot[cnt]); 1639 mark_all_dquot_dirty(inode->i_dquot);
1640out_unlock:
1638 flush_warnings(inode->i_dquot, warntype); 1641 flush_warnings(inode->i_dquot, warntype);
1639 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1642 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1640 return QUOTA_OK;
1641} 1643}
1642EXPORT_SYMBOL(dquot_free_space); 1644EXPORT_SYMBOL(__dquot_free_space);
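
The two old entry points dquot_free_space() and
dquot_release_reserved_space() collapse into one worker distinguished by the
reserve flag; the old names presumably survive as thin wrappers (likely in
quotaops.h). A sketch of what such wrappers would look like:

	static inline void example_free_space(struct inode *inode, qsize_t nr)
	{
		__dquot_free_space(inode, nr, 0);	/* really allocated space */
	}

	static inline void example_release_reservation(struct inode *inode, qsize_t nr)
	{
		__dquot_free_space(inode, nr, 1);	/* reserved-only space */
	}
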
1643 1645
1644/* 1646/*
1645 * This operation can block, but only after everything is updated 1647 * This operation can block, but only after everything is updated
1646 */ 1648 */
1647int dquot_free_inode(const struct inode *inode, qsize_t number) 1649void dquot_free_inode(const struct inode *inode)
1648{ 1650{
1649 unsigned int cnt; 1651 unsigned int cnt;
1650 char warntype[MAXQUOTAS]; 1652 char warntype[MAXQUOTAS];
1651 1653
1652 /* First test before acquiring mutex - solves deadlocks when we 1654 /* First test before acquiring mutex - solves deadlocks when we
1653 * re-enter the quota code and are already holding the mutex */ 1655 * re-enter the quota code and are already holding the mutex */
1654 if (IS_NOQUOTA(inode)) 1656 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1655 return QUOTA_OK; 1657 return;
1656 1658
1657 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1659 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1658 /* Now recheck reliably when holding dqptr_sem */
1659 if (IS_NOQUOTA(inode)) {
1660 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1661 return QUOTA_OK;
1662 }
1663 spin_lock(&dq_data_lock); 1660 spin_lock(&dq_data_lock);
1664 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1661 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1665 if (!inode->i_dquot[cnt]) 1662 if (!inode->i_dquot[cnt])
1666 continue; 1663 continue;
1667 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); 1664 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1);
1668 dquot_decr_inodes(inode->i_dquot[cnt], number); 1665 dquot_decr_inodes(inode->i_dquot[cnt], 1);
1669 } 1666 }
1670 spin_unlock(&dq_data_lock); 1667 spin_unlock(&dq_data_lock);
1671 /* Dirtify all the dquots - this can block when journalling */ 1668 mark_all_dquot_dirty(inode->i_dquot);
1672 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1673 if (inode->i_dquot[cnt])
1674 mark_dquot_dirty(inode->i_dquot[cnt]);
1675 flush_warnings(inode->i_dquot, warntype); 1669 flush_warnings(inode->i_dquot, warntype);
1676 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1670 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1677 return QUOTA_OK;
1678} 1671}
1679EXPORT_SYMBOL(dquot_free_inode); 1672EXPORT_SYMBOL(dquot_free_inode);
1680 1673
1681/* 1674/*
1682 * call back function, get reserved quota space from underlying fs
1683 */
1684qsize_t dquot_get_reserved_space(struct inode *inode)
1685{
1686 qsize_t reserved_space = 0;
1687
1688 if (sb_any_quota_active(inode->i_sb) &&
1689 inode->i_sb->dq_op->get_reserved_space)
1690 reserved_space = inode->i_sb->dq_op->get_reserved_space(inode);
1691 return reserved_space;
1692}
1693
1694/*
1695 * Transfer the number of inodes and blocks from one diskquota to another. 1675 * Transfer the number of inodes and blocks from one diskquota to another.
1696 * 1676 *
1697 * This operation can block, but only after everything is updated 1677 * This operation can block, but only after everything is updated
1698 * A transaction must be started when entering this function. 1678 * A transaction must be started when entering this function.
1699 */ 1679 */
1700int dquot_transfer(struct inode *inode, struct iattr *iattr) 1680static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask)
1701{ 1681{
1702 qsize_t space, cur_space; 1682 qsize_t space, cur_space;
1703 qsize_t rsv_space = 0; 1683 qsize_t rsv_space = 0;
1704 struct dquot *transfer_from[MAXQUOTAS]; 1684 struct dquot *transfer_from[MAXQUOTAS];
1705 struct dquot *transfer_to[MAXQUOTAS]; 1685 struct dquot *transfer_to[MAXQUOTAS];
1706 int cnt, ret = QUOTA_OK; 1686 int cnt, ret = 0;
1707 int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
1708 chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
1709 char warntype_to[MAXQUOTAS]; 1687 char warntype_to[MAXQUOTAS];
1710 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1688 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
1711 1689
1712 /* First test before acquiring mutex - solves deadlocks when we 1690 /* First test before acquiring mutex - solves deadlocks when we
1713 * re-enter the quota code and are already holding the mutex */ 1691 * re-enter the quota code and are already holding the mutex */
1714 if (IS_NOQUOTA(inode)) 1692 if (IS_NOQUOTA(inode))
1715 return QUOTA_OK; 1693 return 0;
1716 /* Initialize the arrays */ 1694 /* Initialize the arrays */
1717 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1695 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1718 transfer_from[cnt] = NULL; 1696 transfer_from[cnt] = NULL;
1719 transfer_to[cnt] = NULL; 1697 transfer_to[cnt] = NULL;
1720 warntype_to[cnt] = QUOTA_NL_NOWARN; 1698 warntype_to[cnt] = QUOTA_NL_NOWARN;
1721 } 1699 }
1722 if (chuid) 1700 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1723 transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid, 1701 if (mask & (1 << cnt))
1724 USRQUOTA); 1702 transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
1725 if (chgid) 1703 }
1726 transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid,
1727 GRPQUOTA);
1728
1729 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1704 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1730 /* Now recheck reliably when holding dqptr_sem */
1731 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1705 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1732 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1706 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1733 goto put_all; 1707 goto put_all;
1734 } 1708 }
1735 spin_lock(&dq_data_lock); 1709 spin_lock(&dq_data_lock);
1736 cur_space = inode_get_bytes(inode); 1710 cur_space = inode_get_bytes(inode);
1737 rsv_space = dquot_get_reserved_space(inode); 1711 rsv_space = inode_get_rsv_space(inode);
1738 space = cur_space + rsv_space; 1712 space = cur_space + rsv_space;
1739 /* Build the transfer_from list and check the limits */ 1713 /* Build the transfer_from list and check the limits */
1740 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1714 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1741 if (!transfer_to[cnt]) 1715 if (!transfer_to[cnt])
1742 continue; 1716 continue;
1743 transfer_from[cnt] = inode->i_dquot[cnt]; 1717 transfer_from[cnt] = inode->i_dquot[cnt];
1744 if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == 1718 ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
1745 NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, 1719 if (ret)
1746 warntype_to + cnt) == NO_QUOTA) 1720 goto over_quota;
1721 ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt);
1722 if (ret)
1747 goto over_quota; 1723 goto over_quota;
1748 } 1724 }
1749 1725
@@ -1778,25 +1754,18 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1778 spin_unlock(&dq_data_lock); 1754 spin_unlock(&dq_data_lock);
1779 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1755 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1780 1756
1781 /* Dirtify all the dquots - this can block when journalling */ 1757 mark_all_dquot_dirty(transfer_from);
1782 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1758 mark_all_dquot_dirty(transfer_to);
1783 if (transfer_from[cnt]) 1759 /* The reference we got is transferred to the inode */
1784 mark_dquot_dirty(transfer_from[cnt]); 1760 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1785 if (transfer_to[cnt]) { 1761 transfer_to[cnt] = NULL;
1786 mark_dquot_dirty(transfer_to[cnt]);
1787 /* The reference we got is transferred to the inode */
1788 transfer_to[cnt] = NULL;
1789 }
1790 }
1791warn_put_all: 1762warn_put_all:
1792 flush_warnings(transfer_to, warntype_to); 1763 flush_warnings(transfer_to, warntype_to);
1793 flush_warnings(transfer_from, warntype_from_inodes); 1764 flush_warnings(transfer_from, warntype_from_inodes);
1794 flush_warnings(transfer_from, warntype_from_space); 1765 flush_warnings(transfer_from, warntype_from_space);
1795put_all: 1766put_all:
1796 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1767 dqput_all(transfer_from);
1797 dqput(transfer_from[cnt]); 1768 dqput_all(transfer_to);
1798 dqput(transfer_to[cnt]);
1799 }
1800 return ret; 1769 return ret;
1801over_quota: 1770over_quota:
1802 spin_unlock(&dq_data_lock); 1771 spin_unlock(&dq_data_lock);
@@ -1804,22 +1773,32 @@ over_quota:
1804 /* Clear dquot pointers we don't want to dqput() */ 1773 /* Clear dquot pointers we don't want to dqput() */
1805 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1774 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1806 transfer_from[cnt] = NULL; 1775 transfer_from[cnt] = NULL;
1807 ret = NO_QUOTA;
1808 goto warn_put_all; 1776 goto warn_put_all;
1809} 1777}
1810EXPORT_SYMBOL(dquot_transfer);
1811 1778
1812/* Wrapper for transferring ownership of an inode */ 1779/* Wrapper for transferring ownership of an inode for uid/gid only
1813int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) 1780 * Called from FSXXX_setattr()
1781 */
1782int dquot_transfer(struct inode *inode, struct iattr *iattr)
1814{ 1783{
1784 qid_t chid[MAXQUOTAS];
1785 unsigned long mask = 0;
1786
1787 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) {
1788 mask |= 1 << USRQUOTA;
1789 chid[USRQUOTA] = iattr->ia_uid;
1790 }
1791 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) {
1792 mask |= 1 << GRPQUOTA;
1793 chid[GRPQUOTA] = iattr->ia_gid;
1794 }
1815 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { 1795 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
1816 vfs_dq_init(inode); 1796 dquot_initialize(inode);
1817 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) 1797 return __dquot_transfer(inode, chid, mask);
1818 return 1;
1819 } 1798 }
1820 return 0; 1799 return 0;
1821} 1800}
1822EXPORT_SYMBOL(vfs_dq_transfer); 1801EXPORT_SYMBOL(dquot_transfer);
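
Per the comment above, filesystems now call dquot_transfer() directly from
their setattr methods instead of going through the ->transfer operation. A
minimal sketch of such a call site (hypothetical filesystem; error handling
abbreviated):

	static int example_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		/* Transfer quota before the uid/gid actually change on the inode. */
		if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
		    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
			error = dquot_transfer(inode, attr);
			if (error)
				return error;
		}
		/* ... apply the remaining attribute changes ... */
		return 0;
	}
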
1823 1802
1824/* 1803/*
1825 * Write info of quota file to disk 1804 * Write info of quota file to disk
@@ -1840,13 +1819,6 @@ EXPORT_SYMBOL(dquot_commit_info);
1840 * Definitions of diskquota operations. 1819 * Definitions of diskquota operations.
1841 */ 1820 */
1842const struct dquot_operations dquot_operations = { 1821const struct dquot_operations dquot_operations = {
1843 .initialize = dquot_initialize,
1844 .drop = dquot_drop,
1845 .alloc_space = dquot_alloc_space,
1846 .alloc_inode = dquot_alloc_inode,
1847 .free_space = dquot_free_space,
1848 .free_inode = dquot_free_inode,
1849 .transfer = dquot_transfer,
1850 .write_dquot = dquot_commit, 1822 .write_dquot = dquot_commit,
1851 .acquire_dquot = dquot_acquire, 1823 .acquire_dquot = dquot_acquire,
1852 .release_dquot = dquot_release, 1824 .release_dquot = dquot_release,
@@ -1857,6 +1829,20 @@ const struct dquot_operations dquot_operations = {
1857}; 1829};
1858 1830
1859/* 1831/*
1832 * Generic helper for ->open on filesystems supporting disk quotas.
1833 */
1834int dquot_file_open(struct inode *inode, struct file *file)
1835{
1836 int error;
1837
1838 error = generic_file_open(inode, file);
1839 if (!error && (file->f_mode & FMODE_WRITE))
1840 dquot_initialize(inode);
1841 return error;
1842}
1843EXPORT_SYMBOL(dquot_file_open);
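
A quota-aware filesystem adopts the new helper by pointing its ->open at
dquot_file_open() rather than open-coding generic_file_open() plus the dquot
initialization. A sketch (hypothetical file_operations; other methods
elided):

	static const struct file_operations example_file_ops = {
		.open	= dquot_file_open,	/* initializes dquots on writable opens */
		/* .read, .write, .fsync, ... as before */
	};
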
1844
1845/*
1860 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1846 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1861 */ 1847 */
1862int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) 1848int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
@@ -2035,11 +2021,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2035 } 2021 }
2036 2022
2037 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { 2023 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
2038 /* As we bypass the pagecache we must now flush the inode so 2024 /* As we bypass the pagecache we must now flush all the
2039 * that we see all the changes from userspace... */ 2025 * dirty data and invalidate caches so that the kernel sees
2040 write_inode_now(inode, 1); 2026 * changes from userspace. It is not enough to just flush
2041 /* And now flush the block cache so that kernel sees the 2027 * the quota file since if blocksize < pagesize, invalidation
2042 * changes */ 2028 * of the cache could fail because of other unrelated dirty
2029 * data */
2030 sync_filesystem(sb);
2043 invalidate_bdev(sb->s_bdev); 2031 invalidate_bdev(sb->s_bdev);
2044 } 2032 }
2045 mutex_lock(&dqopt->dqonoff_mutex); 2033 mutex_lock(&dqopt->dqonoff_mutex);
@@ -2052,14 +2040,16 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2052 /* We don't want quota and atime on quota files (deadlocks 2040 /* We don't want quota and atime on quota files (deadlocks
2053 * possible). Also nobody should write to the file - we use 2041 * possible). Also nobody should write to the file - we use
2054 * special IO operations which ignore the immutable bit. */ 2042 * special IO operations which ignore the immutable bit. */
2055 down_write(&dqopt->dqptr_sem);
2056 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 2043 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2057 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | 2044 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
2058 S_NOQUOTA); 2045 S_NOQUOTA);
2059 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; 2046 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
2060 mutex_unlock(&inode->i_mutex); 2047 mutex_unlock(&inode->i_mutex);
2061 up_write(&dqopt->dqptr_sem); 2048 /*
2062 sb->dq_op->drop(inode); 2049 * When S_NOQUOTA is set, remove dquot references as no more
2050 * references can be added
2051 */
2052 __dquot_drop(inode);
2063 } 2053 }
2064 2054
2065 error = -EIO; 2055 error = -EIO;
@@ -2095,14 +2085,12 @@ out_file_init:
2095 iput(inode); 2085 iput(inode);
2096out_lock: 2086out_lock:
2097 if (oldflags != -1) { 2087 if (oldflags != -1) {
2098 down_write(&dqopt->dqptr_sem);
2099 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 2088 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2100 /* Set the flags back (in the case of accidental quotaon() 2089 /* Set the flags back (in the case of accidental quotaon()
2101 * on a wrong file we don't want to mess up the flags) */ 2090 * on a wrong file we don't want to mess up the flags) */
2102 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); 2091 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
2103 inode->i_flags |= oldflags; 2092 inode->i_flags |= oldflags;
2104 mutex_unlock(&inode->i_mutex); 2093 mutex_unlock(&inode->i_mutex);
2105 up_write(&dqopt->dqptr_sem);
2106 } 2094 }
2107 mutex_unlock(&dqopt->dqonoff_mutex); 2095 mutex_unlock(&dqopt->dqonoff_mutex);
2108out_fmt: 2096out_fmt:
@@ -2233,7 +2221,9 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
2233 struct dentry *dentry; 2221 struct dentry *dentry;
2234 int error; 2222 int error;
2235 2223
2224 mutex_lock(&sb->s_root->d_inode->i_mutex);
2236 dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); 2225 dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
2226 mutex_unlock(&sb->s_root->d_inode->i_mutex);
2237 if (IS_ERR(dentry)) 2227 if (IS_ERR(dentry))
2238 return PTR_ERR(dentry); 2228 return PTR_ERR(dentry);
2239 2229
@@ -2473,100 +2463,89 @@ const struct quotactl_ops vfs_quotactl_ops = {
2473 2463
2474static ctl_table fs_dqstats_table[] = { 2464static ctl_table fs_dqstats_table[] = {
2475 { 2465 {
2476 .ctl_name = FS_DQ_LOOKUPS,
2477 .procname = "lookups", 2466 .procname = "lookups",
2478 .data = &dqstats.lookups, 2467 .data = &dqstats.lookups,
2479 .maxlen = sizeof(int), 2468 .maxlen = sizeof(int),
2480 .mode = 0444, 2469 .mode = 0444,
2481 .proc_handler = &proc_dointvec, 2470 .proc_handler = proc_dointvec,
2482 }, 2471 },
2483 { 2472 {
2484 .ctl_name = FS_DQ_DROPS,
2485 .procname = "drops", 2473 .procname = "drops",
2486 .data = &dqstats.drops, 2474 .data = &dqstats.drops,
2487 .maxlen = sizeof(int), 2475 .maxlen = sizeof(int),
2488 .mode = 0444, 2476 .mode = 0444,
2489 .proc_handler = &proc_dointvec, 2477 .proc_handler = proc_dointvec,
2490 }, 2478 },
2491 { 2479 {
2492 .ctl_name = FS_DQ_READS,
2493 .procname = "reads", 2480 .procname = "reads",
2494 .data = &dqstats.reads, 2481 .data = &dqstats.reads,
2495 .maxlen = sizeof(int), 2482 .maxlen = sizeof(int),
2496 .mode = 0444, 2483 .mode = 0444,
2497 .proc_handler = &proc_dointvec, 2484 .proc_handler = proc_dointvec,
2498 }, 2485 },
2499 { 2486 {
2500 .ctl_name = FS_DQ_WRITES,
2501 .procname = "writes", 2487 .procname = "writes",
2502 .data = &dqstats.writes, 2488 .data = &dqstats.writes,
2503 .maxlen = sizeof(int), 2489 .maxlen = sizeof(int),
2504 .mode = 0444, 2490 .mode = 0444,
2505 .proc_handler = &proc_dointvec, 2491 .proc_handler = proc_dointvec,
2506 }, 2492 },
2507 { 2493 {
2508 .ctl_name = FS_DQ_CACHE_HITS,
2509 .procname = "cache_hits", 2494 .procname = "cache_hits",
2510 .data = &dqstats.cache_hits, 2495 .data = &dqstats.cache_hits,
2511 .maxlen = sizeof(int), 2496 .maxlen = sizeof(int),
2512 .mode = 0444, 2497 .mode = 0444,
2513 .proc_handler = &proc_dointvec, 2498 .proc_handler = proc_dointvec,
2514 }, 2499 },
2515 { 2500 {
2516 .ctl_name = FS_DQ_ALLOCATED,
2517 .procname = "allocated_dquots", 2501 .procname = "allocated_dquots",
2518 .data = &dqstats.allocated_dquots, 2502 .data = &dqstats.allocated_dquots,
2519 .maxlen = sizeof(int), 2503 .maxlen = sizeof(int),
2520 .mode = 0444, 2504 .mode = 0444,
2521 .proc_handler = &proc_dointvec, 2505 .proc_handler = proc_dointvec,
2522 }, 2506 },
2523 { 2507 {
2524 .ctl_name = FS_DQ_FREE,
2525 .procname = "free_dquots", 2508 .procname = "free_dquots",
2526 .data = &dqstats.free_dquots, 2509 .data = &dqstats.free_dquots,
2527 .maxlen = sizeof(int), 2510 .maxlen = sizeof(int),
2528 .mode = 0444, 2511 .mode = 0444,
2529 .proc_handler = &proc_dointvec, 2512 .proc_handler = proc_dointvec,
2530 }, 2513 },
2531 { 2514 {
2532 .ctl_name = FS_DQ_SYNCS,
2533 .procname = "syncs", 2515 .procname = "syncs",
2534 .data = &dqstats.syncs, 2516 .data = &dqstats.syncs,
2535 .maxlen = sizeof(int), 2517 .maxlen = sizeof(int),
2536 .mode = 0444, 2518 .mode = 0444,
2537 .proc_handler = &proc_dointvec, 2519 .proc_handler = proc_dointvec,
2538 }, 2520 },
2539#ifdef CONFIG_PRINT_QUOTA_WARNING 2521#ifdef CONFIG_PRINT_QUOTA_WARNING
2540 { 2522 {
2541 .ctl_name = FS_DQ_WARNINGS,
2542 .procname = "warnings", 2523 .procname = "warnings",
2543 .data = &flag_print_warnings, 2524 .data = &flag_print_warnings,
2544 .maxlen = sizeof(int), 2525 .maxlen = sizeof(int),
2545 .mode = 0644, 2526 .mode = 0644,
2546 .proc_handler = &proc_dointvec, 2527 .proc_handler = proc_dointvec,
2547 }, 2528 },
2548#endif 2529#endif
2549 { .ctl_name = 0 }, 2530 { },
2550}; 2531};
2551 2532
2552static ctl_table fs_table[] = { 2533static ctl_table fs_table[] = {
2553 { 2534 {
2554 .ctl_name = FS_DQSTATS,
2555 .procname = "quota", 2535 .procname = "quota",
2556 .mode = 0555, 2536 .mode = 0555,
2557 .child = fs_dqstats_table, 2537 .child = fs_dqstats_table,
2558 }, 2538 },
2559 { .ctl_name = 0 }, 2539 { },
2560}; 2540};
2561 2541
2562static ctl_table sys_table[] = { 2542static ctl_table sys_table[] = {
2563 { 2543 {
2564 .ctl_name = CTL_FS,
2565 .procname = "fs", 2544 .procname = "fs",
2566 .mode = 0555, 2545 .mode = 0555,
2567 .child = fs_table, 2546 .child = fs_table,
2568 }, 2547 },
2569 { .ctl_name = 0 }, 2548 { },
2570}; 2549};
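
With the .ctl_name fields gone, these tables are matched by procname alone
and the terminating entries shrink to { }. Registration itself is unchanged;
a sketch of how the table tree is hooked up (as dquot_init() presumably does
elsewhere in this file):

	static struct ctl_table_header *example_sysctl_register(void)
	{
		/* exposes /proc/sys/fs/quota/{lookups,drops,reads,...} */
		return register_sysctl_table(sys_table);
	}
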
2571 2550
2572static int __init dquot_init(void) 2551static int __init dquot_init(void)
@@ -2607,12 +2586,6 @@ static int __init dquot_init(void)
2607 2586
2608 register_shrinker(&dqcache_shrinker); 2587 register_shrinker(&dqcache_shrinker);
2609 2588
2610#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
2611 if (genl_register_family(&quota_genl_family) != 0)
2612 printk(KERN_ERR
2613 "VFS: Failed to create quota netlink interface.\n");
2614#endif
2615
2616 return 0; 2589 return 0;
2617} 2590}
2618module_init(dquot_init); 2591module_init(dquot_init);
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
new file mode 100644
index 000000000000..d67908b407d9
--- /dev/null
+++ b/fs/quota/netlink.c
@@ -0,0 +1,96 @@
1
2#include <linux/cred.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/kernel.h>
6#include <linux/quotaops.h>
7#include <linux/sched.h>
8#include <linux/slab.h>
9#include <net/netlink.h>
10#include <net/genetlink.h>
11
12/* Netlink family structure for quota */
13static struct genl_family quota_genl_family = {
14 .id = GENL_ID_GENERATE,
15 .hdrsize = 0,
16 .name = "VFS_DQUOT",
17 .version = 1,
18 .maxattr = QUOTA_NL_A_MAX,
19};
20
21/**
22 * quota_send_warning - Send warning to userspace about exceeded quota
23 * @type: The quota type: USRQUOTA, GRPQUOTA,...
24 * @id: The user or group id of the quota that was exceeded
25 * @dev: The device on which the fs is mounted (sb->s_dev)
26 * @warntype: The type of the warning: QUOTA_NL_...
27 *
28 * This can be used by filesystems (including those which don't use
29 * dquot) to send a message to userspace relating to quota limits.
30 *
31 */
32
33void quota_send_warning(short type, unsigned int id, dev_t dev,
34 const char warntype)
35{
36 static atomic_t seq;
37 struct sk_buff *skb;
38 void *msg_head;
39 int ret;
40 int msg_size = 4 * nla_total_size(sizeof(u32)) +
41 2 * nla_total_size(sizeof(u64));
42
43 /* We have to allocate using GFP_NOFS as we are called from a
44 * filesystem performing write and thus further recursion into
45 * the fs to free some data could cause deadlocks. */
46 skb = genlmsg_new(msg_size, GFP_NOFS);
47 if (!skb) {
48 printk(KERN_ERR
49 "VFS: Not enough memory to send quota warning.\n");
50 return;
51 }
52 msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
53 &quota_genl_family, 0, QUOTA_NL_C_WARNING);
54 if (!msg_head) {
55 printk(KERN_ERR
56 "VFS: Cannot store netlink header in quota warning.\n");
57 goto err_out;
58 }
59 ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
60 if (ret)
61 goto attr_err_out;
62 ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
63 if (ret)
64 goto attr_err_out;
65 ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
66 if (ret)
67 goto attr_err_out;
68 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
69 if (ret)
70 goto attr_err_out;
71 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
72 if (ret)
73 goto attr_err_out;
74 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
75 if (ret)
76 goto attr_err_out;
77 genlmsg_end(skb, msg_head);
78
79 genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
80 return;
81attr_err_out:
82 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
83err_out:
84 kfree_skb(skb);
85}
86EXPORT_SYMBOL(quota_send_warning);
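
A filesystem that tracks quotas itself (without dquot) would call this when a
limit is crossed, for example on a hard block-limit hit. A hedged sketch with
hypothetical values:

	static void example_report_over_quota(struct super_block *sb, uid_t uid)
	{
		/* QUOTA_NL_BHARDWARN: block hard limit exceeded for this user */
		quota_send_warning(USRQUOTA, uid, sb->s_dev, QUOTA_NL_BHARDWARN);
	}
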
87
88static int __init quota_init(void)
89{
90 if (genl_register_family(&quota_genl_family) != 0)
91 printk(KERN_ERR
92 "VFS: Failed to create quota netlink interface.\n");
93 return 0;
94}
95
96module_init(quota_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95c5b42384b2..95388f9b7356 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,7 +10,6 @@
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <asm/current.h> 11#include <asm/current.h>
12#include <asm/uaccess.h> 12#include <asm/uaccess.h>
13#include <linux/compat.h>
14#include <linux/kernel.h> 13#include <linux/kernel.h>
15#include <linux/security.h> 14#include <linux/security.h>
16#include <linux/syscalls.h> 15#include <linux/syscalls.h>
@@ -18,218 +17,205 @@
18#include <linux/capability.h> 17#include <linux/capability.h>
19#include <linux/quotaops.h> 18#include <linux/quotaops.h>
20#include <linux/types.h> 19#include <linux/types.h>
20#include <linux/writeback.h>
21 21
22/* Check validity of generic quotactl commands */ 22static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
23static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, 23 qid_t id)
24 qid_t id)
25{ 24{
26 if (type >= MAXQUOTAS)
27 return -EINVAL;
28 if (!sb && cmd != Q_SYNC)
29 return -ENODEV;
30 /* Is operation supported? */
31 if (sb && !sb->s_qcop)
32 return -ENOSYS;
33
34 switch (cmd) { 25 switch (cmd) {
35 case Q_GETFMT: 26 /* these commands do not require any special privileges */
36 break; 27 case Q_GETFMT:
37 case Q_QUOTAON: 28 case Q_SYNC:
38 if (!sb->s_qcop->quota_on) 29 case Q_GETINFO:
39 return -ENOSYS; 30 case Q_XGETQSTAT:
40 break; 31 case Q_XQUOTASYNC:
41 case Q_QUOTAOFF: 32 break;
42 if (!sb->s_qcop->quota_off) 33 /* allow querying information for dquots we "own" */
43 return -ENOSYS; 34 case Q_GETQUOTA:
44 break; 35 case Q_XGETQUOTA:
45 case Q_SETINFO: 36 if ((type == USRQUOTA && current_euid() == id) ||
46 if (!sb->s_qcop->set_info) 37 (type == GRPQUOTA && in_egroup_p(id)))
47 return -ENOSYS;
48 break;
49 case Q_GETINFO:
50 if (!sb->s_qcop->get_info)
51 return -ENOSYS;
52 break;
53 case Q_SETQUOTA:
54 if (!sb->s_qcop->set_dqblk)
55 return -ENOSYS;
56 break;
57 case Q_GETQUOTA:
58 if (!sb->s_qcop->get_dqblk)
59 return -ENOSYS;
60 break;
61 case Q_SYNC:
62 if (sb && !sb->s_qcop->quota_sync)
63 return -ENOSYS;
64 break; 38 break;
65 default: 39 /*FALLTHROUGH*/
66 return -EINVAL; 40 default:
41 if (!capable(CAP_SYS_ADMIN))
42 return -EPERM;
67 } 43 }
68 44
69 /* Is quota turned on for commands which need it? */ 45 return security_quotactl(cmd, type, id, sb);
70 switch (cmd) { 46}
71 case Q_GETFMT:
72 case Q_GETINFO:
73 case Q_SETINFO:
74 case Q_SETQUOTA:
75 case Q_GETQUOTA:
76 /* This is just an informative test so we are satisfied
77 * without the lock */
78 if (!sb_has_quota_active(sb, type))
79 return -ESRCH;
80 }
81 47
82 /* Check privileges */ 48static int quota_sync_all(int type)
83 if (cmd == Q_GETQUOTA) { 49{
84 if (((type == USRQUOTA && current_euid() != id) || 50 struct super_block *sb;
85 (type == GRPQUOTA && !in_egroup_p(id))) && 51 int ret;
86 !capable(CAP_SYS_ADMIN)) 52
87 return -EPERM; 53 if (type >= MAXQUOTAS)
54 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret)
57 return ret;
58
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
88 } 74 }
89 else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) 75 spin_unlock(&sb_lock);
90 if (!capable(CAP_SYS_ADMIN))
91 return -EPERM;
92 76
93 return 0; 77 return 0;
94} 78}
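
quota_sync_all() is reached from userspace when quotactl(2) is invoked with
Q_SYNC and no device argument, as the syscall change further below shows. A
sketch of such a userspace caller (hypothetical example program):

	#include <stddef.h>
	#include <sys/quota.h>

	int main(void)
	{
		/* special == NULL selects the iterate-all-superblocks path */
		return quotactl(QCMD(Q_SYNC, USRQUOTA), NULL, 0, NULL);
	}
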
95 79
96/* Check validity of XFS Quota Manager commands */ 80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
97static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, 81 void __user *addr)
98 qid_t id)
99{ 82{
100 if (type >= XQM_MAXQUOTAS) 83 char *pathname;
101 return -EINVAL; 84 int ret = -ENOSYS;
102 if (!sb) 85
103 return -ENODEV; 86 pathname = getname(addr);
104 if (!sb->s_qcop) 87 if (IS_ERR(pathname))
105 return -ENOSYS; 88 return PTR_ERR(pathname);
89 if (sb->s_qcop->quota_on)
90 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
91 putname(pathname);
92 return ret;
93}
106 94
107 switch (cmd) { 95static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
108 case Q_XQUOTAON: 96{
109 case Q_XQUOTAOFF: 97 __u32 fmt;
110 case Q_XQUOTARM:
111 if (!sb->s_qcop->set_xstate)
112 return -ENOSYS;
113 break;
114 case Q_XGETQSTAT:
115 if (!sb->s_qcop->get_xstate)
116 return -ENOSYS;
117 break;
118 case Q_XSETQLIM:
119 if (!sb->s_qcop->set_xquota)
120 return -ENOSYS;
121 break;
122 case Q_XGETQUOTA:
123 if (!sb->s_qcop->get_xquota)
124 return -ENOSYS;
125 break;
126 case Q_XQUOTASYNC:
127 if (!sb->s_qcop->quota_sync)
128 return -ENOSYS;
129 break;
130 default:
131 return -EINVAL;
132 }
133 98
134 /* Check privileges */ 99 down_read(&sb_dqopt(sb)->dqptr_sem);
135 if (cmd == Q_XGETQUOTA) { 100 if (!sb_has_quota_active(sb, type)) {
136 if (((type == XQM_USRQUOTA && current_euid() != id) || 101 up_read(&sb_dqopt(sb)->dqptr_sem);
137 (type == XQM_GRPQUOTA && !in_egroup_p(id))) && 102 return -ESRCH;
138 !capable(CAP_SYS_ADMIN))
139 return -EPERM;
140 } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) {
141 if (!capable(CAP_SYS_ADMIN))
142 return -EPERM;
143 } 103 }
104 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
105 up_read(&sb_dqopt(sb)->dqptr_sem);
106 if (copy_to_user(addr, &fmt, sizeof(fmt)))
107 return -EFAULT;
108 return 0;
109}
110
111static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
112{
113 struct if_dqinfo info;
114 int ret;
115
116 if (!sb_has_quota_active(sb, type))
117 return -ESRCH;
118 if (!sb->s_qcop->get_info)
119 return -ENOSYS;
120 ret = sb->s_qcop->get_info(sb, type, &info);
121 if (!ret && copy_to_user(addr, &info, sizeof(info)))
122 return -EFAULT;
123 return ret;
124}
125
126static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
127{
128 struct if_dqinfo info;
129
130 if (copy_from_user(&info, addr, sizeof(info)))
131 return -EFAULT;
132 if (!sb_has_quota_active(sb, type))
133 return -ESRCH;
134 if (!sb->s_qcop->set_info)
135 return -ENOSYS;
136 return sb->s_qcop->set_info(sb, type, &info);
137}
138
139static int quota_getquota(struct super_block *sb, int type, qid_t id,
140 void __user *addr)
141{
142 struct if_dqblk idq;
143 int ret;
144 144
145 if (!sb_has_quota_active(sb, type))
146 return -ESRCH;
147 if (!sb->s_qcop->get_dqblk)
148 return -ENOSYS;
149 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
150 if (ret)
151 return ret;
152 if (copy_to_user(addr, &idq, sizeof(idq)))
153 return -EFAULT;
145 return 0; 154 return 0;
146} 155}
147 156
148static int check_quotactl_valid(struct super_block *sb, int type, int cmd, 157static int quota_setquota(struct super_block *sb, int type, qid_t id,
149 qid_t id) 158 void __user *addr)
150{ 159{
151 int error; 160 struct if_dqblk idq;
152 161
153 if (XQM_COMMAND(cmd)) 162 if (copy_from_user(&idq, addr, sizeof(idq)))
154 error = xqm_quotactl_valid(sb, type, cmd, id); 163 return -EFAULT;
155 else 164 if (!sb_has_quota_active(sb, type))
156 error = generic_quotactl_valid(sb, type, cmd, id); 165 return -ESRCH;
157 if (!error) 166 if (!sb->s_qcop->set_dqblk)
158 error = security_quotactl(cmd, type, id, sb); 167 return -ENOSYS;
159 return error; 168 return sb->s_qcop->set_dqblk(sb, type, id, &idq);
160} 169}
161 170
162#ifdef CONFIG_QUOTA 171static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
163void sync_quota_sb(struct super_block *sb, int type)
164{ 172{
165 int cnt; 173 __u32 flags;
166 174
167 if (!sb->s_qcop->quota_sync) 175 if (copy_from_user(&flags, addr, sizeof(flags)))
168 return; 176 return -EFAULT;
177 if (!sb->s_qcop->set_xstate)
178 return -ENOSYS;
179 return sb->s_qcop->set_xstate(sb, flags, cmd);
180}
181
182static int quota_getxstate(struct super_block *sb, void __user *addr)
183{
184 struct fs_quota_stat fqs;
185 int ret;
169 186
170 sb->s_qcop->quota_sync(sb, type); 187 if (!sb->s_qcop->get_xstate)
188 return -ENOSYS;
189 ret = sb->s_qcop->get_xstate(sb, &fqs);
190 if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
191 return -EFAULT;
192 return ret;
193}
171 194
172 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) 195static int quota_setxquota(struct super_block *sb, int type, qid_t id,
173 return; 196 void __user *addr)
174 /* This is not very clever (and fast) but currently I don't know about 197{
175 * any other simple way of getting quota data to disk and we must get 198 struct fs_disk_quota fdq;
176 * them there for userspace to be visible... */
177 if (sb->s_op->sync_fs)
178 sb->s_op->sync_fs(sb, 1);
179 sync_blockdev(sb->s_bdev);
180 199
181 /* 200 if (copy_from_user(&fdq, addr, sizeof(fdq)))
182 * Now when everything is written we can discard the pagecache so 201 return -EFAULT;
183 * that userspace sees the changes. 202 if (!sb->s_qcop->set_xquota)
184 */ 203 return -ENOSYS;
185 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 204 return sb->s_qcop->set_xquota(sb, type, id, &fdq);
186 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
187 if (type != -1 && cnt != type)
188 continue;
189 if (!sb_has_quota_active(sb, cnt))
190 continue;
191 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
192 I_MUTEX_QUOTA);
193 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
194 mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
195 }
196 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
197} 205}
198#endif
199 206
200static void sync_dquots(int type) 207static int quota_getxquota(struct super_block *sb, int type, qid_t id,
208 void __user *addr)
201{ 209{
202 struct super_block *sb; 210 struct fs_disk_quota fdq;
203 int cnt; 211 int ret;
204 212
205 spin_lock(&sb_lock); 213 if (!sb->s_qcop->get_xquota)
206restart: 214 return -ENOSYS;
207 list_for_each_entry(sb, &super_blocks, s_list) { 215 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
208 /* This test just improves performance so it needn't be 216 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
209 * reliable... */ 217 return -EFAULT;
210 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 218 return ret;
211 if (type != -1 && type != cnt)
212 continue;
213 if (!sb_has_quota_active(sb, cnt))
214 continue;
215 if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
216 list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
217 continue;
218 break;
219 }
220 if (cnt == MAXQUOTAS)
221 continue;
222 sb->s_count++;
223 spin_unlock(&sb_lock);
224 down_read(&sb->s_umount);
225 if (sb->s_root)
226 sync_quota_sb(sb, type);
227 up_read(&sb->s_umount);
228 spin_lock(&sb_lock);
229 if (__put_super_and_need_restart(sb))
230 goto restart;
231 }
232 spin_unlock(&sb_lock);
233} 219}
234 220
235/* Copy parameters and call proper function */ 221/* Copy parameters and call proper function */
@@ -238,117 +224,55 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
238{ 224{
239 int ret; 225 int ret;
240 226
227 if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS))
228 return -EINVAL;
229 if (!sb->s_qcop)
230 return -ENOSYS;
231
232 ret = check_quotactl_permission(sb, type, cmd, id);
233 if (ret < 0)
234 return ret;
235
241 switch (cmd) { 236 switch (cmd) {
242 case Q_QUOTAON: { 237 case Q_QUOTAON:
243 char *pathname; 238 return quota_quotaon(sb, type, cmd, id, addr);
244 239 case Q_QUOTAOFF:
245 pathname = getname(addr); 240 if (!sb->s_qcop->quota_off)
246 if (IS_ERR(pathname)) 241 return -ENOSYS;
247 return PTR_ERR(pathname); 242 return sb->s_qcop->quota_off(sb, type, 0);
248 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); 243 case Q_GETFMT:
249 putname(pathname); 244 return quota_getfmt(sb, type, addr);
250 return ret; 245 case Q_GETINFO:
251 } 246 return quota_getinfo(sb, type, addr);
252 case Q_QUOTAOFF: 247 case Q_SETINFO:
253 return sb->s_qcop->quota_off(sb, type, 0); 248 return quota_setinfo(sb, type, addr);
254 249 case Q_GETQUOTA:
255 case Q_GETFMT: { 250 return quota_getquota(sb, type, id, addr);
256 __u32 fmt; 251 case Q_SETQUOTA:
257 252 return quota_setquota(sb, type, id, addr);
258 down_read(&sb_dqopt(sb)->dqptr_sem); 253 case Q_SYNC:
259 if (!sb_has_quota_active(sb, type)) { 254 if (!sb->s_qcop->quota_sync)
260 up_read(&sb_dqopt(sb)->dqptr_sem); 255 return -ENOSYS;
261 return -ESRCH; 256 return sb->s_qcop->quota_sync(sb, type, 1);
262 } 257 case Q_XQUOTAON:
263 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; 258 case Q_XQUOTAOFF:
264 up_read(&sb_dqopt(sb)->dqptr_sem); 259 case Q_XQUOTARM:
265 if (copy_to_user(addr, &fmt, sizeof(fmt))) 260 return quota_setxstate(sb, cmd, addr);
266 return -EFAULT; 261 case Q_XGETQSTAT:
267 return 0; 262 return quota_getxstate(sb, addr);
268 } 263 case Q_XSETQLIM:
269 case Q_GETINFO: { 264 return quota_setxquota(sb, type, id, addr);
270 struct if_dqinfo info; 265 case Q_XGETQUOTA:
271 266 return quota_getxquota(sb, type, id, addr);
272 ret = sb->s_qcop->get_info(sb, type, &info); 267 case Q_XQUOTASYNC:
273 if (ret) 268 /* caller already holds s_umount */
274 return ret; 269 if (sb->s_flags & MS_RDONLY)
275 if (copy_to_user(addr, &info, sizeof(info))) 270 return -EROFS;
276 return -EFAULT; 271 writeback_inodes_sb(sb);
277 return 0; 272 return 0;
278 } 273 default:
279 case Q_SETINFO: { 274 return -EINVAL;
280 struct if_dqinfo info;
281
282 if (copy_from_user(&info, addr, sizeof(info)))
283 return -EFAULT;
284 return sb->s_qcop->set_info(sb, type, &info);
285 }
286 case Q_GETQUOTA: {
287 struct if_dqblk idq;
288
289 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
290 if (ret)
291 return ret;
292 if (copy_to_user(addr, &idq, sizeof(idq)))
293 return -EFAULT;
294 return 0;
295 }
296 case Q_SETQUOTA: {
297 struct if_dqblk idq;
298
299 if (copy_from_user(&idq, addr, sizeof(idq)))
300 return -EFAULT;
301 return sb->s_qcop->set_dqblk(sb, type, id, &idq);
302 }
303 case Q_SYNC:
304 if (sb)
305 sync_quota_sb(sb, type);
306 else
307 sync_dquots(type);
308 return 0;
309
310 case Q_XQUOTAON:
311 case Q_XQUOTAOFF:
312 case Q_XQUOTARM: {
313 __u32 flags;
314
315 if (copy_from_user(&flags, addr, sizeof(flags)))
316 return -EFAULT;
317 return sb->s_qcop->set_xstate(sb, flags, cmd);
318 }
319 case Q_XGETQSTAT: {
320 struct fs_quota_stat fqs;
321
322 if ((ret = sb->s_qcop->get_xstate(sb, &fqs)))
323 return ret;
324 if (copy_to_user(addr, &fqs, sizeof(fqs)))
325 return -EFAULT;
326 return 0;
327 }
328 case Q_XSETQLIM: {
329 struct fs_disk_quota fdq;
330
331 if (copy_from_user(&fdq, addr, sizeof(fdq)))
332 return -EFAULT;
333 return sb->s_qcop->set_xquota(sb, type, id, &fdq);
334 }
335 case Q_XGETQUOTA: {
336 struct fs_disk_quota fdq;
337
338 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
339 if (ret)
340 return ret;
341 if (copy_to_user(addr, &fdq, sizeof(fdq)))
342 return -EFAULT;
343 return 0;
344 }
345 case Q_XQUOTASYNC:
346 return sb->s_qcop->quota_sync(sb, type);
347 /* We never reach here unless validity check is broken */
348 default:
349 BUG();
350 } 275 }
351 return 0;
352} 276}
353 277
354/* 278/*
@@ -395,133 +319,23 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
395 cmds = cmd >> SUBCMDSHIFT; 319 cmds = cmd >> SUBCMDSHIFT;
396 type = cmd & SUBCMDMASK; 320 type = cmd & SUBCMDMASK;
397 321
398 if (cmds != Q_SYNC || special) { 322 /*
399 sb = quotactl_block(special); 323 * As a special case Q_SYNC can be called without a specific device.
400 if (IS_ERR(sb)) 324 * It will iterate all superblocks that have quota enabled and call
401 return PTR_ERR(sb); 325 * the sync action on each of them.
326 */
327 if (!special) {
328 if (cmds == Q_SYNC)
329 return quota_sync_all(type);
330 return -ENODEV;
402 } 331 }
403 332
404 ret = check_quotactl_valid(sb, type, cmds, id); 333 sb = quotactl_block(special);
405 if (ret >= 0) 334 if (IS_ERR(sb))
406 ret = do_quotactl(sb, type, cmds, id, addr); 335 return PTR_ERR(sb);
407 if (sb)
408 drop_super(sb);
409 336
410 return ret; 337 ret = do_quotactl(sb, type, cmds, id, addr);
411}
412 338
413#if defined(CONFIG_COMPAT_FOR_U64_ALIGNMENT) 339 drop_super(sb);
414/*
415 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
416 * and is necessary due to alignment problems.
417 */
418struct compat_if_dqblk {
419 compat_u64 dqb_bhardlimit;
420 compat_u64 dqb_bsoftlimit;
421 compat_u64 dqb_curspace;
422 compat_u64 dqb_ihardlimit;
423 compat_u64 dqb_isoftlimit;
424 compat_u64 dqb_curinodes;
425 compat_u64 dqb_btime;
426 compat_u64 dqb_itime;
427 compat_uint_t dqb_valid;
428};
429
430/* XFS structures */
431struct compat_fs_qfilestat {
432 compat_u64 dqb_bhardlimit;
433 compat_u64 qfs_nblks;
434 compat_uint_t qfs_nextents;
435};
436
437struct compat_fs_quota_stat {
438 __s8 qs_version;
439 __u16 qs_flags;
440 __s8 qs_pad;
441 struct compat_fs_qfilestat qs_uquota;
442 struct compat_fs_qfilestat qs_gquota;
443 compat_uint_t qs_incoredqs;
444 compat_int_t qs_btimelimit;
445 compat_int_t qs_itimelimit;
446 compat_int_t qs_rtbtimelimit;
447 __u16 qs_bwarnlimit;
448 __u16 qs_iwarnlimit;
449};
450
451asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
452 qid_t id, void __user *addr)
453{
454 unsigned int cmds;
455 struct if_dqblk __user *dqblk;
456 struct compat_if_dqblk __user *compat_dqblk;
457 struct fs_quota_stat __user *fsqstat;
458 struct compat_fs_quota_stat __user *compat_fsqstat;
459 compat_uint_t data;
460 u16 xdata;
461 long ret;
462
463 cmds = cmd >> SUBCMDSHIFT;
464
465 switch (cmds) {
466 case Q_GETQUOTA:
467 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
468 compat_dqblk = addr;
469 ret = sys_quotactl(cmd, special, id, dqblk);
470 if (ret)
471 break;
472 if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
473 get_user(data, &dqblk->dqb_valid) ||
474 put_user(data, &compat_dqblk->dqb_valid))
475 ret = -EFAULT;
476 break;
477 case Q_SETQUOTA:
478 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
479 compat_dqblk = addr;
480 ret = -EFAULT;
481 if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
482 get_user(data, &compat_dqblk->dqb_valid) ||
483 put_user(data, &dqblk->dqb_valid))
484 break;
485 ret = sys_quotactl(cmd, special, id, dqblk);
486 break;
487 case Q_XGETQSTAT:
488 fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
489 compat_fsqstat = addr;
490 ret = sys_quotactl(cmd, special, id, fsqstat);
491 if (ret)
492 break;
493 ret = -EFAULT;
494 /* Copying qs_version, qs_flags, qs_pad */
495 if (copy_in_user(compat_fsqstat, fsqstat,
496 offsetof(struct compat_fs_quota_stat, qs_uquota)))
497 break;
498 /* Copying qs_uquota */
499 if (copy_in_user(&compat_fsqstat->qs_uquota,
500 &fsqstat->qs_uquota,
501 sizeof(compat_fsqstat->qs_uquota)) ||
502 get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
503 put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
504 break;
505 /* Copying qs_gquota */
506 if (copy_in_user(&compat_fsqstat->qs_gquota,
507 &fsqstat->qs_gquota,
508 sizeof(compat_fsqstat->qs_gquota)) ||
509 get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
510 put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
511 break;
512 /* Copying the rest */
513 if (copy_in_user(&compat_fsqstat->qs_incoredqs,
514 &fsqstat->qs_incoredqs,
515 sizeof(struct compat_fs_quota_stat) -
516 offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
517 get_user(xdata, &fsqstat->qs_iwarnlimit) ||
518 put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
519 break;
520 ret = 0;
521 break;
522 default:
523 ret = sys_quotactl(cmd, special, id, addr);
524 }
525 return ret; 340 return ret;
526} 341}
527#endif
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 0edcf42b1778..2ae757e9c008 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -204,7 +204,7 @@ out:
204 return ret; 204 return ret;
205} 205}
206 206
207static struct quota_format_ops v1_format_ops = { 207static const struct quota_format_ops v1_format_ops = {
208 .check_quota_file = v1_check_quota_file, 208 .check_quota_file = v1_check_quota_file,
209 .read_file_info = v1_read_file_info, 209 .read_file_info = v1_read_file_info,
210 .write_file_info = v1_write_file_info, 210 .write_file_info = v1_write_file_info,
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index a5475fb1ae44..e3da02f4986f 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -23,14 +23,23 @@ MODULE_LICENSE("GPL");
23 23
24#define __QUOTA_V2_PARANOIA 24#define __QUOTA_V2_PARANOIA
25 25
26static void v2_mem2diskdqb(void *dp, struct dquot *dquot); 26static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot);
27static void v2_disk2memdqb(struct dquot *dquot, void *dp); 27static void v2r0_disk2memdqb(struct dquot *dquot, void *dp);
28static int v2_is_id(void *dp, struct dquot *dquot); 28static int v2r0_is_id(void *dp, struct dquot *dquot);
29 29static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot);
30static struct qtree_fmt_operations v2_qtree_ops = { 30static void v2r1_disk2memdqb(struct dquot *dquot, void *dp);
31 .mem2disk_dqblk = v2_mem2diskdqb, 31static int v2r1_is_id(void *dp, struct dquot *dquot);
32 .disk2mem_dqblk = v2_disk2memdqb, 32
33 .is_id = v2_is_id, 33static struct qtree_fmt_operations v2r0_qtree_ops = {
34 .mem2disk_dqblk = v2r0_mem2diskdqb,
35 .disk2mem_dqblk = v2r0_disk2memdqb,
36 .is_id = v2r0_is_id,
37};
38
39static struct qtree_fmt_operations v2r1_qtree_ops = {
40 .mem2disk_dqblk = v2r1_mem2diskdqb,
41 .disk2mem_dqblk = v2r1_disk2memdqb,
42 .is_id = v2r1_is_id,
34}; 43};
35 44
36#define QUOTABLOCK_BITS 10 45#define QUOTABLOCK_BITS 10
@@ -46,23 +55,33 @@ static inline qsize_t v2_qbtos(qsize_t blocks)
46 return blocks << QUOTABLOCK_BITS; 55 return blocks << QUOTABLOCK_BITS;
47} 56}
48 57
58static int v2_read_header(struct super_block *sb, int type,
59 struct v2_disk_dqheader *dqhead)
60{
61 ssize_t size;
62
63 size = sb->s_op->quota_read(sb, type, (char *)dqhead,
64 sizeof(struct v2_disk_dqheader), 0);
65 if (size != sizeof(struct v2_disk_dqheader)) {
66 printk(KERN_WARNING "quota_v2: Failed header read:"
67 " expected=%zd got=%zd\n",
68 sizeof(struct v2_disk_dqheader), size);
69 return 0;
70 }
71 return 1;
72}
73
49/* Check whether given file is really vfsv0 quotafile */ 74/* Check whether given file is really vfsv0 quotafile */
50static int v2_check_quota_file(struct super_block *sb, int type) 75static int v2_check_quota_file(struct super_block *sb, int type)
51{ 76{
52 struct v2_disk_dqheader dqhead; 77 struct v2_disk_dqheader dqhead;
53 ssize_t size;
54 static const uint quota_magics[] = V2_INITQMAGICS; 78 static const uint quota_magics[] = V2_INITQMAGICS;
55 static const uint quota_versions[] = V2_INITQVERSIONS; 79 static const uint quota_versions[] = V2_INITQVERSIONS;
56 80
57 size = sb->s_op->quota_read(sb, type, (char *)&dqhead, 81 if (!v2_read_header(sb, type, &dqhead))
58 sizeof(struct v2_disk_dqheader), 0);
59 if (size != sizeof(struct v2_disk_dqheader)) {
60 printk("quota_v2: failed read expected=%zd got=%zd\n",
61 sizeof(struct v2_disk_dqheader), size);
62 return 0; 82 return 0;
63 }
64 if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || 83 if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
65 le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) 84 le32_to_cpu(dqhead.dqh_version) > quota_versions[type])
66 return 0; 85 return 0;
67 return 1; 86 return 1;
68} 87}
@@ -71,14 +90,23 @@ static int v2_check_quota_file(struct super_block *sb, int type)
71static int v2_read_file_info(struct super_block *sb, int type) 90static int v2_read_file_info(struct super_block *sb, int type)
72{ 91{
73 struct v2_disk_dqinfo dinfo; 92 struct v2_disk_dqinfo dinfo;
93 struct v2_disk_dqheader dqhead;
74 struct mem_dqinfo *info = sb_dqinfo(sb, type); 94 struct mem_dqinfo *info = sb_dqinfo(sb, type);
75 struct qtree_mem_dqinfo *qinfo; 95 struct qtree_mem_dqinfo *qinfo;
76 ssize_t size; 96 ssize_t size;
97 unsigned int version;
98
99 if (!v2_read_header(sb, type, &dqhead))
100 return -1;
101 version = le32_to_cpu(dqhead.dqh_version);
102 if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) ||
103 (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1))
104 return -1;
77 105
78 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
79 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
80 if (size != sizeof(struct v2_disk_dqinfo)) { 108 if (size != sizeof(struct v2_disk_dqinfo)) {
81 printk(KERN_WARNING "Can't read info structure on device %s.\n", 109 printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
82 sb->s_id); 110 sb->s_id);
83 return -1; 111 return -1;
84 } 112 }
@@ -89,9 +117,15 @@ static int v2_read_file_info(struct super_block *sb, int type)
89 return -1; 117 return -1;
90 } 118 }
91 qinfo = info->dqi_priv; 119 qinfo = info->dqi_priv;
92 /* limits are stored as unsigned 32-bit data */ 120 if (version == 0) {
93 info->dqi_maxblimit = 0xffffffff; 121 /* limits are stored as unsigned 32-bit data */
94 info->dqi_maxilimit = 0xffffffff; 122 info->dqi_maxblimit = 0xffffffff;
123 info->dqi_maxilimit = 0xffffffff;
124 } else {
125 /* used space is stored as unsigned 64-bit value */
126 info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */
127 info->dqi_maxilimit = 0xffffffffffffffffULL;
128 }
95 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); 129 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
96 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); 130 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
97 info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); 131 info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
@@ -103,8 +137,13 @@ static int v2_read_file_info(struct super_block *sb, int type)
103 qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS; 137 qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
104 qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS; 138 qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
105 qinfo->dqi_qtree_depth = qtree_depth(qinfo); 139 qinfo->dqi_qtree_depth = qtree_depth(qinfo);
106 qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk); 140 if (version == 0) {
107 qinfo->dqi_ops = &v2_qtree_ops; 141 qinfo->dqi_entry_size = sizeof(struct v2r0_disk_dqblk);
142 qinfo->dqi_ops = &v2r0_qtree_ops;
143 } else {
144 qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk);
145 qinfo->dqi_ops = &v2r1_qtree_ops;
146 }
108 return 0; 147 return 0;
109} 148}
110 149
@@ -135,9 +174,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
135 return 0; 174 return 0;
136} 175}
137 176
138static void v2_disk2memdqb(struct dquot *dquot, void *dp) 177static void v2r0_disk2memdqb(struct dquot *dquot, void *dp)
139{ 178{
140 struct v2_disk_dqblk *d = dp, empty; 179 struct v2r0_disk_dqblk *d = dp, empty;
141 struct mem_dqblk *m = &dquot->dq_dqb; 180 struct mem_dqblk *m = &dquot->dq_dqb;
142 181
143 m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); 182 m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
@@ -149,15 +188,15 @@ static void v2_disk2memdqb(struct dquot *dquot, void *dp)
149 m->dqb_curspace = le64_to_cpu(d->dqb_curspace); 188 m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
150 m->dqb_btime = le64_to_cpu(d->dqb_btime); 189 m->dqb_btime = le64_to_cpu(d->dqb_btime);
151 /* We need to escape back the all-zero structure */ 190 /* We need to escape back the all-zero structure */
152 memset(&empty, 0, sizeof(struct v2_disk_dqblk)); 191 memset(&empty, 0, sizeof(struct v2r0_disk_dqblk));
153 empty.dqb_itime = cpu_to_le64(1); 192 empty.dqb_itime = cpu_to_le64(1);
154 if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk))) 193 if (!memcmp(&empty, dp, sizeof(struct v2r0_disk_dqblk)))
155 m->dqb_itime = 0; 194 m->dqb_itime = 0;
156} 195}
157 196
158static void v2_mem2diskdqb(void *dp, struct dquot *dquot) 197static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot)
159{ 198{
160 struct v2_disk_dqblk *d = dp; 199 struct v2r0_disk_dqblk *d = dp;
161 struct mem_dqblk *m = &dquot->dq_dqb; 200 struct mem_dqblk *m = &dquot->dq_dqb;
162 struct qtree_mem_dqinfo *info = 201 struct qtree_mem_dqinfo *info =
163 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 202 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
@@ -175,9 +214,60 @@ static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
175 d->dqb_itime = cpu_to_le64(1); 214 d->dqb_itime = cpu_to_le64(1);
176} 215}
177 216
178static int v2_is_id(void *dp, struct dquot *dquot) 217static int v2r0_is_id(void *dp, struct dquot *dquot)
179{ 218{
180 struct v2_disk_dqblk *d = dp; 219 struct v2r0_disk_dqblk *d = dp;
220 struct qtree_mem_dqinfo *info =
221 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
222
223 if (qtree_entry_unused(info, dp))
224 return 0;
225 return le32_to_cpu(d->dqb_id) == dquot->dq_id;
226}
227
228static void v2r1_disk2memdqb(struct dquot *dquot, void *dp)
229{
230 struct v2r1_disk_dqblk *d = dp, empty;
231 struct mem_dqblk *m = &dquot->dq_dqb;
232
233 m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
234 m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
235 m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
236 m->dqb_itime = le64_to_cpu(d->dqb_itime);
237 m->dqb_bhardlimit = v2_qbtos(le64_to_cpu(d->dqb_bhardlimit));
238 m->dqb_bsoftlimit = v2_qbtos(le64_to_cpu(d->dqb_bsoftlimit));
239 m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
240 m->dqb_btime = le64_to_cpu(d->dqb_btime);
241 /* We need to escape back the all-zero structure */
242 memset(&empty, 0, sizeof(struct v2r1_disk_dqblk));
243 empty.dqb_itime = cpu_to_le64(1);
244 if (!memcmp(&empty, dp, sizeof(struct v2r1_disk_dqblk)))
245 m->dqb_itime = 0;
246}
247
248static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot)
249{
250 struct v2r1_disk_dqblk *d = dp;
251 struct mem_dqblk *m = &dquot->dq_dqb;
252 struct qtree_mem_dqinfo *info =
253 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
254
255 d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
256 d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
257 d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
258 d->dqb_itime = cpu_to_le64(m->dqb_itime);
259 d->dqb_bhardlimit = cpu_to_le64(v2_stoqb(m->dqb_bhardlimit));
260 d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit));
261 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
262 d->dqb_btime = cpu_to_le64(m->dqb_btime);
263 d->dqb_id = cpu_to_le32(dquot->dq_id);
264 if (qtree_entry_unused(info, dp))
265 d->dqb_itime = cpu_to_le64(1);
266}
267
268static int v2r1_is_id(void *dp, struct dquot *dquot)
269{
270 struct v2r1_disk_dqblk *d = dp;
181 struct qtree_mem_dqinfo *info = 271 struct qtree_mem_dqinfo *info =
182 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 272 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
183 273
@@ -207,7 +297,7 @@ static int v2_free_file_info(struct super_block *sb, int type)
207 return 0; 297 return 0;
208} 298}
209 299
210static struct quota_format_ops v2_format_ops = { 300static const struct quota_format_ops v2_format_ops = {
211 .check_quota_file = v2_check_quota_file, 301 .check_quota_file = v2_check_quota_file,
212 .read_file_info = v2_read_file_info, 302 .read_file_info = v2_read_file_info,
213 .write_file_info = v2_write_file_info, 303 .write_file_info = v2_write_file_info,
@@ -217,20 +307,32 @@ static struct quota_format_ops v2_format_ops = {
217 .release_dqblk = v2_release_dquot, 307 .release_dqblk = v2_release_dquot,
218}; 308};
219 309
220static struct quota_format_type v2_quota_format = { 310static struct quota_format_type v2r0_quota_format = {
221 .qf_fmt_id = QFMT_VFS_V0, 311 .qf_fmt_id = QFMT_VFS_V0,
222 .qf_ops = &v2_format_ops, 312 .qf_ops = &v2_format_ops,
223 .qf_owner = THIS_MODULE 313 .qf_owner = THIS_MODULE
224}; 314};
225 315
316static struct quota_format_type v2r1_quota_format = {
317 .qf_fmt_id = QFMT_VFS_V1,
318 .qf_ops = &v2_format_ops,
319 .qf_owner = THIS_MODULE
320};
321
226static int __init init_v2_quota_format(void) 322static int __init init_v2_quota_format(void)
227{ 323{
228 return register_quota_format(&v2_quota_format); 324 int ret;
325
326 ret = register_quota_format(&v2r0_quota_format);
327 if (ret)
328 return ret;
329 return register_quota_format(&v2r1_quota_format);
229} 330}
230 331
231static void __exit exit_v2_quota_format(void) 332static void __exit exit_v2_quota_format(void)
232{ 333{
233 unregister_quota_format(&v2_quota_format); 334 unregister_quota_format(&v2r0_quota_format);
335 unregister_quota_format(&v2r1_quota_format);
234} 336}
235 337
236module_init(init_v2_quota_format); 338module_init(init_v2_quota_format);
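
Taken together, the quota_v2.c changes dispatch on the on-disk header version: revision 0 keeps 32-bit limits and the v2r0 entry layout, revision 1 uses 64-bit limits and the wider v2r1 layout. A hedged sketch of just that selection logic, with simplified stand-in types (pick_layout and its fields are illustrative, not kernel API):

	#include <stddef.h>
	#include <stdint.h>

	struct layout {
		size_t entry_size;	/* bytes per on-disk dquot entry */
		uint64_t max_limit;	/* largest representable limit */
	};

	static struct layout pick_layout(unsigned int version,
					 size_t r0_size, size_t r1_size)
	{
		struct layout l;

		if (version == 0) {
			l.entry_size = r0_size;			/* v2r0_disk_dqblk */
			l.max_limit = 0xffffffffULL;		/* 32-bit limits */
		} else {
			l.entry_size = r1_size;			/* v2r1_disk_dqblk */
			l.max_limit = 0xffffffffffffffffULL;	/* 64-bit limits */
		}
		return l;
	}
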
diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h
index 530fe580685c..f1966b42c2fd 100644
--- a/fs/quota/quotaio_v2.h
+++ b/fs/quota/quotaio_v2.h
@@ -17,8 +17,8 @@
17} 17}
18 18
19#define V2_INITQVERSIONS {\ 19#define V2_INITQVERSIONS {\
20 0, /* USRQUOTA */\ 20 1, /* USRQUOTA */\
21 0 /* GRPQUOTA */\ 21 1 /* GRPQUOTA */\
22} 22}
23 23
24/* First generic header */ 24/* First generic header */
@@ -32,7 +32,7 @@ struct v2_disk_dqheader {
32 * (as it appears on disk) - the file is a radix tree whose leaves point 32 * (as it appears on disk) - the file is a radix tree whose leaves point
33 * to blocks of these structures. 33 * to blocks of these structures.
34 */ 34 */
35struct v2_disk_dqblk { 35struct v2r0_disk_dqblk {
36 __le32 dqb_id; /* id this quota applies to */ 36 __le32 dqb_id; /* id this quota applies to */
37 __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ 37 __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */
38 __le32 dqb_isoftlimit; /* preferred inode limit */ 38 __le32 dqb_isoftlimit; /* preferred inode limit */
@@ -44,6 +44,19 @@ struct v2_disk_dqblk {
44 __le64 dqb_itime; /* time limit for excessive inode use */ 44 __le64 dqb_itime; /* time limit for excessive inode use */
45}; 45};
46 46
47struct v2r1_disk_dqblk {
48 __le32 dqb_id; /* id this quota applies to */
49 __le32 dqb_pad;
50 __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */
51 __le64 dqb_isoftlimit; /* preferred inode limit */
52 __le64 dqb_curinodes; /* current # allocated inodes */
53 __le64 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
54 __le64 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
55 __le64 dqb_curspace; /* current space occupied (in bytes) */
56 __le64 dqb_btime; /* time limit for excessive disk use */
57 __le64 dqb_itime; /* time limit for excessive inode use */
58};
59
47/* Header with type and version specific information */ 60/* Header with type and version specific information */
48struct v2_disk_dqinfo { 61struct v2_disk_dqinfo {
49 __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ 62 __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */
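
Since v2r1_disk_dqblk is an on-disk format, its size must not depend on compiler padding: a 32-bit id plus an explicit 32-bit pad followed by eight 64-bit fields gives exactly 72 bytes. A standalone check one could compile (plain C99 integer types stand in for the kernel's __le32/__le64):

	#include <stdint.h>

	struct v2r1_disk_dqblk_sketch {
		uint32_t dqb_id;
		uint32_t dqb_pad;	/* keeps the 64-bit fields aligned */
		uint64_t dqb_ihardlimit;
		uint64_t dqb_isoftlimit;
		uint64_t dqb_curinodes;
		uint64_t dqb_bhardlimit;
		uint64_t dqb_bsoftlimit;
		uint64_t dqb_curspace;
		uint64_t dqb_btime;
		uint64_t dqb_itime;
	};

	_Static_assert(sizeof(struct v2r1_disk_dqblk_sketch) == 72,
		       "unexpected padding in on-disk quota entry");
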
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 32fae4040ebf..5ea4ad81a429 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -21,6 +21,7 @@
21#include <linux/pagevec.h> 21#include <linux/pagevec.h>
22#include <linux/mman.h> 22#include <linux/mman.h>
23#include <linux/sched.h> 23#include <linux/sched.h>
24#include <linux/slab.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include "internal.h" 27#include "internal.h"
@@ -60,7 +61,7 @@ const struct inode_operations ramfs_file_inode_operations = {
60 */ 61 */
61int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) 62int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
62{ 63{
63 unsigned long npages, xpages, loop, limit; 64 unsigned long npages, xpages, loop;
64 struct page *pages; 65 struct page *pages;
65 unsigned order; 66 unsigned order;
66 void *data; 67 void *data;
@@ -123,30 +124,6 @@ add_error:
123 124
124/*****************************************************************************/ 125/*****************************************************************************/
125/* 126/*
126 * check that file shrinkage doesn't leave any VMAs dangling in midair
127 */
128static int ramfs_nommu_check_mappings(struct inode *inode,
129 size_t newsize, size_t size)
130{
131 struct vm_area_struct *vma;
132 struct prio_tree_iter iter;
133
134 /* search for VMAs that fall within the dead zone */
135 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
136 newsize >> PAGE_SHIFT,
137 (size + PAGE_SIZE - 1) >> PAGE_SHIFT
138 ) {
139 /* found one - only interested if it's shared out of the page
140 * cache */
141 if (vma->vm_flags & VM_SHARED)
142 return -ETXTBSY; /* not quite true, but near enough */
143 }
144
145 return 0;
146}
147
148/*****************************************************************************/
149/*
150 * 127 *
151 */ 128 */
152static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) 129static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
@@ -164,7 +141,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
164 141
165 /* check that a decrease in size doesn't cut off any shared mappings */ 142 /* check that a decrease in size doesn't cut off any shared mappings */
166 if (newsize < size) { 143 if (newsize < size) {
167 ret = ramfs_nommu_check_mappings(inode, newsize, size); 144 ret = nommu_shrink_inode_mappings(inode, size, newsize);
168 if (ret < 0) 145 if (ret < 0)
169 return ret; 146 return ret;
170 } 147 }
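
The removed ramfs_nommu_check_mappings() is replaced by the generic nommu_shrink_inode_mappings() helper, but its intent is visible in the deleted lines: refuse to shrink past any shared mapping of the dead range. A self-contained model of that check (the vma struct below is a stand-in for the kernel's VMA tree iteration):

	#include <errno.h>

	struct vma_model {
		unsigned long start_pg, end_pg;	/* mapped page range */
		int shared;			/* VM_SHARED equivalent */
	};

	/* returns -ETXTBSY if a shared mapping overlaps [new_pages, old_pages) */
	static int check_shrink(const struct vma_model *vmas, int n,
				unsigned long new_pages, unsigned long old_pages)
	{
		int i;

		for (i = 0; i < n; i++) {
			const struct vma_model *v = &vmas[i];

			/* only shared mappings pin pages out of the page cache */
			if (v->shared && v->start_pg < old_pages &&
			    v->end_pg > new_pages)
				return -ETXTBSY; /* not quite true, but near enough */
		}
		return 0;
	}
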
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a6090aa1a7c1..c94853473ca9 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,6 +35,7 @@
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/magic.h> 37#include <linux/magic.h>
38#include <linux/slab.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include "internal.h" 40#include "internal.h"
40 41
diff --git a/fs/read_write.c b/fs/read_write.c
index 3ac28987f22a..113386d6fd2d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -258,6 +258,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
258 init_sync_kiocb(&kiocb, filp); 258 init_sync_kiocb(&kiocb, filp);
259 kiocb.ki_pos = *ppos; 259 kiocb.ki_pos = *ppos;
260 kiocb.ki_left = len; 260 kiocb.ki_left = len;
261 kiocb.ki_nbytes = len;
261 262
262 for (;;) { 263 for (;;) {
263 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 264 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +314,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
313 init_sync_kiocb(&kiocb, filp); 314 init_sync_kiocb(&kiocb, filp);
314 kiocb.ki_pos = *ppos; 315 kiocb.ki_pos = *ppos;
315 kiocb.ki_left = len; 316 kiocb.ki_left = len;
317 kiocb.ki_nbytes = len;
316 318
317 for (;;) { 319 for (;;) {
318 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 320 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -826,8 +828,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
826 if (!(out_file->f_mode & FMODE_WRITE)) 828 if (!(out_file->f_mode & FMODE_WRITE))
827 goto fput_out; 829 goto fput_out;
828 retval = -EINVAL; 830 retval = -EINVAL;
829 if (!out_file->f_op || !out_file->f_op->sendpage)
830 goto fput_out;
831 in_inode = in_file->f_path.dentry->d_inode; 831 in_inode = in_file->f_path.dentry->d_inode;
832 out_inode = out_file->f_path.dentry->d_inode; 832 out_inode = out_file->f_path.dentry->d_inode;
833 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); 833 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
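
The do_sync_read/do_sync_write hunks set ki_nbytes alongside ki_left when building the synchronous kiocb, since parts of the aio path consult ki_nbytes rather than ki_left. A minimal model of keeping the two fields consistent (the struct mirrors only the relevant kiocb fields and is not the real one):

	#include <stddef.h>

	struct sync_kiocb_model {
		long long ki_pos;	/* file position */
		size_t ki_left;		/* bytes remaining */
		size_t ki_nbytes;	/* total bytes for this iocb */
	};

	static void init_model_kiocb(struct sync_kiocb_model *kiocb,
				     long long pos, size_t len)
	{
		kiocb->ki_pos = pos;
		kiocb->ki_left = len;
		kiocb->ki_nbytes = len;	/* must agree with ki_left at setup */
	}
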
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 7c5ab6330dd6..792b3cb2cd18 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -7,7 +7,11 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
7reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ 7reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
8 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ 8 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
9 hashes.o tail_conversion.o journal.o resize.o \ 9 hashes.o tail_conversion.o journal.o resize.o \
10 item_ops.o ioctl.o procfs.o xattr.o 10 item_ops.o ioctl.o xattr.o lock.o
11
12ifeq ($(CONFIG_REISERFS_PROC_INFO),y)
13reiserfs-objs += procfs.o
14endif
11 15
12ifeq ($(CONFIG_REISERFS_FS_XATTR),y) 16ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
13reiserfs-objs += xattr_user.o xattr_trusted.o 17reiserfs-objs += xattr_user.o xattr_trusted.o
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index e716161ab325..483442e66ed6 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -169,7 +169,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
169 return 0; // No free blocks in this bitmap 169 return 0; // No free blocks in this bitmap
170 } 170 }
171 171
172 /* search for a first zero bit -- beggining of a window */ 172 /* search for a first zero bit -- beginning of a window */
173 *beg = reiserfs_find_next_zero_le_bit 173 *beg = reiserfs_find_next_zero_le_bit
174 ((unsigned long *)(bh->b_data), boundary, *beg); 174 ((unsigned long *)(bh->b_data), boundary, *beg);
175 175
@@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
425 425
426 journal_mark_dirty(th, s, sbh); 426 journal_mark_dirty(th, s, sbh);
427 if (for_unformatted) 427 if (for_unformatted)
428 vfs_dq_free_block_nodirty(inode, 1); 428 dquot_free_block_nodirty(inode, 1);
429} 429}
430 430
431void reiserfs_free_block(struct reiserfs_transaction_handle *th, 431void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1049 amount_needed, hint->inode->i_uid); 1049 amount_needed, hint->inode->i_uid);
1050#endif 1050#endif
1051 quota_ret = 1051 quota_ret =
1052 vfs_dq_alloc_block_nodirty(hint->inode, amount_needed); 1052 dquot_alloc_block_nodirty(hint->inode, amount_needed);
1053 if (quota_ret) /* Quota exceeded? */ 1053 if (quota_ret) /* Quota exceeded? */
1054 return QUOTA_EXCEEDED; 1054 return QUOTA_EXCEEDED;
1055 if (hint->preallocate && hint->prealloc_size) { 1055 if (hint->preallocate && hint->prealloc_size) {
@@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1058 "reiserquota: allocating (prealloc) %d blocks id=%u", 1058 "reiserquota: allocating (prealloc) %d blocks id=%u",
1059 hint->prealloc_size, hint->inode->i_uid); 1059 hint->prealloc_size, hint->inode->i_uid);
1060#endif 1060#endif
1061 quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode, 1061 quota_ret = dquot_prealloc_block_nodirty(hint->inode,
1062 hint->prealloc_size); 1062 hint->prealloc_size);
1063 if (quota_ret) 1063 if (quota_ret)
1064 hint->preallocate = hint->prealloc_size = 0; 1064 hint->preallocate = hint->prealloc_size = 0;
@@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1092 hint->inode->i_uid); 1092 hint->inode->i_uid);
1093#endif 1093#endif
1094 /* Free not allocated blocks */ 1094 /* Free not allocated blocks */
1095 vfs_dq_free_block_nodirty(hint->inode, 1095 dquot_free_block_nodirty(hint->inode,
1096 amount_needed + hint->prealloc_size - 1096 amount_needed + hint->prealloc_size -
1097 nr_allocated); 1097 nr_allocated);
1098 } 1098 }
@@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1125 REISERFS_I(hint->inode)->i_prealloc_count, 1125 REISERFS_I(hint->inode)->i_prealloc_count,
1126 hint->inode->i_uid); 1126 hint->inode->i_uid);
1127#endif 1127#endif
1128 vfs_dq_free_block_nodirty(hint->inode, amount_needed + 1128 dquot_free_block_nodirty(hint->inode, amount_needed +
1129 hint->prealloc_size - nr_allocated - 1129 hint->prealloc_size - nr_allocated -
1130 REISERFS_I(hint->inode)-> 1130 REISERFS_I(hint->inode)->
1131 i_prealloc_count); 1131 i_prealloc_count);
@@ -1249,14 +1249,18 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
1249 else if (bitmap == 0) 1249 else if (bitmap == 0)
1250 block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; 1250 block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
1251 1251
1252 reiserfs_write_unlock(sb);
1252 bh = sb_bread(sb, block); 1253 bh = sb_bread(sb, block);
1254 reiserfs_write_lock(sb);
1253 if (bh == NULL) 1255 if (bh == NULL)
1254 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " 1256 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
1255 "reading failed", __func__, block); 1257 "reading failed", __func__, block);
1256 else { 1258 else {
1257 if (buffer_locked(bh)) { 1259 if (buffer_locked(bh)) {
1258 PROC_INFO_INC(sb, scan_bitmap.wait); 1260 PROC_INFO_INC(sb, scan_bitmap.wait);
1261 reiserfs_write_unlock(sb);
1259 __wait_on_buffer(bh); 1262 __wait_on_buffer(bh);
1263 reiserfs_write_lock(sb);
1260 } 1264 }
1261 BUG_ON(!buffer_uptodate(bh)); 1265 BUG_ON(!buffer_uptodate(bh));
1262 BUG_ON(atomic_read(&bh->b_count) == 0); 1266 BUG_ON(atomic_read(&bh->b_count) == 0);
@@ -1273,7 +1277,10 @@ int reiserfs_init_bitmap_cache(struct super_block *sb)
1273 struct reiserfs_bitmap_info *bitmap; 1277 struct reiserfs_bitmap_info *bitmap;
1274 unsigned int bmap_nr = reiserfs_bmap_count(sb); 1278 unsigned int bmap_nr = reiserfs_bmap_count(sb);
1275 1279
1280 /* Avoid lock recursion in fault case */
1281 reiserfs_write_unlock(sb);
1276 bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); 1282 bitmap = vmalloc(sizeof(*bitmap) * bmap_nr);
1283 reiserfs_write_lock(sb);
1277 if (bitmap == NULL) 1284 if (bitmap == NULL)
1278 return -ENOMEM; 1285 return -ENOMEM;
1279 1286
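
The recurring shape in the bitmap.c hunks is: drop the per-superblock write lock before anything that can sleep (sb_bread, __wait_on_buffer, vmalloc), then retake it afterwards, so other tasks are not stalled behind disk I/O. A reduced model of the bracket (all names below are stand-ins, not reiserfs API):

	struct sb_model { int locked; };

	static void write_lock_model(struct sb_model *sb)   { sb->locked = 1; }
	static void write_unlock_model(struct sb_model *sb) { sb->locked = 0; }

	static void blocking_read_model(struct sb_model *sb, unsigned long block)
	{
		(void)sb; (void)block;	/* pretend to sleep on disk I/O */
	}

	static void read_block_unlocked(struct sb_model *sb, unsigned long block)
	{
		write_unlock_model(sb);		/* reiserfs_write_unlock(sb) */
		blocking_read_model(sb, block);	/* sb_bread() may sleep */
		write_lock_model(sb);		/* reiserfs_write_lock(sb) */
	}
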
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 6d2668fdc384..f8a6075abf50 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -8,6 +8,7 @@
8#include <linux/reiserfs_fs.h> 8#include <linux/reiserfs_fs.h>
9#include <linux/stat.h> 9#include <linux/stat.h>
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/slab.h>
11#include <asm/uaccess.h> 12#include <asm/uaccess.h>
12 13
13extern const struct reiserfs_key MIN_KEY; 14extern const struct reiserfs_key MIN_KEY;
@@ -20,7 +21,7 @@ const struct file_operations reiserfs_dir_operations = {
20 .read = generic_read_dir, 21 .read = generic_read_dir,
21 .readdir = reiserfs_readdir, 22 .readdir = reiserfs_readdir,
22 .fsync = reiserfs_dir_fsync, 23 .fsync = reiserfs_dir_fsync,
23 .ioctl = reiserfs_ioctl, 24 .unlocked_ioctl = reiserfs_ioctl,
24#ifdef CONFIG_COMPAT 25#ifdef CONFIG_COMPAT
25 .compat_ioctl = reiserfs_compat_ioctl, 26 .compat_ioctl = reiserfs_compat_ioctl,
26#endif 27#endif
@@ -174,14 +175,22 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
174 // user space buffer is swapped out. At that time 175 // user space buffer is swapped out. At that time
175 // entry can move somewhere else 176 // entry can move somewhere else
176 memcpy(local_buf, d_name, d_reclen); 177 memcpy(local_buf, d_name, d_reclen);
178
179 /*
180 * Since filldir might sleep, we can release
181 * the write lock here for other waiters
182 */
183 reiserfs_write_unlock(inode->i_sb);
177 if (filldir 184 if (filldir
178 (dirent, local_buf, d_reclen, d_off, d_ino, 185 (dirent, local_buf, d_reclen, d_off, d_ino,
179 DT_UNKNOWN) < 0) { 186 DT_UNKNOWN) < 0) {
187 reiserfs_write_lock(inode->i_sb);
180 if (local_buf != small_buf) { 188 if (local_buf != small_buf) {
181 kfree(local_buf); 189 kfree(local_buf);
182 } 190 }
183 goto end; 191 goto end;
184 } 192 }
193 reiserfs_write_lock(inode->i_sb);
185 if (local_buf != small_buf) { 194 if (local_buf != small_buf) {
186 kfree(local_buf); 195 kfree(local_buf);
187 } 196 }
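
The readdir change applies the same discipline to a callback: filldir may sleep copying the entry to userspace, so the write lock is released across the call and retaken on both the error and the normal path. A compact standalone model of that control flow (lock helpers and the callback type are illustrative):

	struct dir_sb_model { int locked; };

	static void dir_lock(struct dir_sb_model *sb)   { sb->locked = 1; }
	static void dir_unlock(struct dir_sb_model *sb) { sb->locked = 0; }

	static int emit_entry(struct dir_sb_model *sb,
			      int (*filldir_cb)(const char *name),
			      const char *name)
	{
		int ret;

		dir_unlock(sb);		/* callback may sleep */
		ret = filldir_cb(name);
		dir_lock(sb);		/* retaken before either return path */
		return ret;		/* caller frees its buffer either way */
	}
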
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 128d3f7c8aa5..60c080440661 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -21,14 +21,6 @@
21#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23 23
24#ifdef CONFIG_REISERFS_CHECK
25
26struct tree_balance *cur_tb = NULL; /* detects whether more than one
27 copy of tb exists as a means
28 of checking whether schedule
29 is interrupting do_balance */
30#endif
31
32static inline void buffer_info_init_left(struct tree_balance *tb, 24static inline void buffer_info_init_left(struct tree_balance *tb,
33 struct buffer_info *bi) 25 struct buffer_info *bi)
34{ 26{
@@ -1840,11 +1832,12 @@ static int check_before_balancing(struct tree_balance *tb)
1840{ 1832{
1841 int retval = 0; 1833 int retval = 0;
1842 1834
1843 if (cur_tb) { 1835 if (REISERFS_SB(tb->tb_sb)->cur_tb) {
1844 reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule " 1836 reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
1845 "occurred based on cur_tb not being null at " 1837 "occurred based on cur_tb not being null at "
1846 "this point in code. do_balance cannot properly " 1838 "this point in code. do_balance cannot properly "
1847 "handle schedule occurring while it runs."); 1839 "handle concurrent tree accesses on a same "
1840 "mount point.");
1848 } 1841 }
1849 1842
1850 /* double check that buffers that we will modify are unlocked. (fix_nodes should already have 1843 /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
@@ -1986,7 +1979,7 @@ static inline void do_balance_starts(struct tree_balance *tb)
1986 "check");*/ 1979 "check");*/
1987 RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); 1980 RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
1988#ifdef CONFIG_REISERFS_CHECK 1981#ifdef CONFIG_REISERFS_CHECK
1989 cur_tb = tb; 1982 REISERFS_SB(tb->tb_sb)->cur_tb = tb;
1990#endif 1983#endif
1991} 1984}
1992 1985
@@ -1996,7 +1989,7 @@ static inline void do_balance_completed(struct tree_balance *tb)
1996#ifdef CONFIG_REISERFS_CHECK 1989#ifdef CONFIG_REISERFS_CHECK
1997 check_leaf_level(tb); 1990 check_leaf_level(tb);
1998 check_internal_levels(tb); 1991 check_internal_levels(tb);
1999 cur_tb = NULL; 1992 REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
2000#endif 1993#endif
2001 1994
2002 /* reiserfs_free_block is no longer schedule safe. So, we need to 1995 /* reiserfs_free_block is no longer schedule safe. So, we need to
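
The do_balan.c change retires the file-scope cur_tb debug pointer in favour of a field in the per-superblock info, so two mounts balancing concurrently no longer trip each other's schedule check. A reduced model of the scoping change (struct names are stand-ins):

	#include <stddef.h>

	struct tb_model;	/* stands in for struct tree_balance */

	struct sb_info_model {
		/* non-NULL while a balance runs on this superblock */
		struct tb_model *cur_tb;
	};

	static void balance_starts(struct sb_info_model *sbi, struct tb_model *tb)
	{
		sbi->cur_tb = tb;	/* was: a single global for all mounts */
	}

	static void balance_completed(struct sb_info_model *sbi)
	{
		sbi->cur_tb = NULL;
	}
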
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9f436668b7f8..1d9c12714c5c 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -284,12 +284,12 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
284const struct file_operations reiserfs_file_operations = { 284const struct file_operations reiserfs_file_operations = {
285 .read = do_sync_read, 285 .read = do_sync_read,
286 .write = reiserfs_file_write, 286 .write = reiserfs_file_write,
287 .ioctl = reiserfs_ioctl, 287 .unlocked_ioctl = reiserfs_ioctl,
288#ifdef CONFIG_COMPAT 288#ifdef CONFIG_COMPAT
289 .compat_ioctl = reiserfs_compat_ioctl, 289 .compat_ioctl = reiserfs_compat_ioctl,
290#endif 290#endif
291 .mmap = reiserfs_file_mmap, 291 .mmap = reiserfs_file_mmap,
292 .open = generic_file_open, 292 .open = dquot_file_open,
293 .release = reiserfs_file_release, 293 .release = reiserfs_file_release,
294 .fsync = reiserfs_sync_file, 294 .fsync = reiserfs_sync_file,
295 .aio_read = generic_file_aio_read, 295 .aio_read = generic_file_aio_read,
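
Both file_operations hunks move reiserfs_ioctl from the BKL-protected .ioctl slot to .unlocked_ioctl, whose prototype drops the explicit inode argument and returns long; the handler recovers the inode from the file and takes the filesystem's own lock, as the ioctl.c hunk further down shows. A sketch of the new shape (model types, not the kernel's):

	struct inode_model;
	struct file_model { struct inode_model *inode; };

	/* old: int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long) */
	static long ioctl_unlocked(struct file_model *filp, unsigned int cmd,
				   unsigned long arg)
	{
		struct inode_model *inode = filp->inode; /* no longer passed in */

		(void)inode; (void)cmd; (void)arg;
		/* take the subsystem's own lock here instead of the BKL */
		return 0;
	}
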
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 5e5a4e6fbaf8..1e4250bc3a6f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -35,6 +35,7 @@
35 **/ 35 **/
36 36
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/slab.h>
38#include <linux/string.h> 39#include <linux/string.h>
39#include <linux/reiserfs_fs.h> 40#include <linux/reiserfs_fs.h>
40#include <linux/buffer_head.h> 41#include <linux/buffer_head.h>
@@ -563,9 +564,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
563 return needed_nodes; 564 return needed_nodes;
564} 565}
565 566
566#ifdef CONFIG_REISERFS_CHECK
567extern struct tree_balance *cur_tb;
568#endif
569 567
570/* Set parameters for balancing. 568/* Set parameters for balancing.
571 * Performs write of results of analysis of balancing into structure tb, 569 * Performs write of results of analysis of balancing into structure tb,
@@ -834,7 +832,7 @@ static int get_empty_nodes(struct tree_balance *tb, int h)
834 RFALSE(buffer_dirty(new_bh) || 832 RFALSE(buffer_dirty(new_bh) ||
835 buffer_journaled(new_bh) || 833 buffer_journaled(new_bh) ||
836 buffer_journal_dirty(new_bh), 834 buffer_journal_dirty(new_bh),
837 "PAP-8140: journlaled or dirty buffer %b for the new block", 835 "PAP-8140: journaled or dirty buffer %b for the new block",
838 new_bh); 836 new_bh);
839 837
840 /* Put empty buffers into the array. */ 838 /* Put empty buffers into the array. */
@@ -1022,7 +1020,11 @@ static int get_far_parent(struct tree_balance *tb,
1022 /* Check whether the common parent is locked. */ 1020 /* Check whether the common parent is locked. */
1023 1021
1024 if (buffer_locked(*pcom_father)) { 1022 if (buffer_locked(*pcom_father)) {
1023
1024 /* Release the write lock while the buffer is busy */
1025 reiserfs_write_unlock(tb->tb_sb);
1025 __wait_on_buffer(*pcom_father); 1026 __wait_on_buffer(*pcom_father);
1027 reiserfs_write_lock(tb->tb_sb);
1026 if (FILESYSTEM_CHANGED_TB(tb)) { 1028 if (FILESYSTEM_CHANGED_TB(tb)) {
1027 brelse(*pcom_father); 1029 brelse(*pcom_father);
1028 return REPEAT_SEARCH; 1030 return REPEAT_SEARCH;
@@ -1927,7 +1929,9 @@ static int get_direct_parent(struct tree_balance *tb, int h)
1927 return REPEAT_SEARCH; 1929 return REPEAT_SEARCH;
1928 1930
1929 if (buffer_locked(bh)) { 1931 if (buffer_locked(bh)) {
1932 reiserfs_write_unlock(tb->tb_sb);
1930 __wait_on_buffer(bh); 1933 __wait_on_buffer(bh);
1934 reiserfs_write_lock(tb->tb_sb);
1931 if (FILESYSTEM_CHANGED_TB(tb)) 1935 if (FILESYSTEM_CHANGED_TB(tb))
1932 return REPEAT_SEARCH; 1936 return REPEAT_SEARCH;
1933 } 1937 }
@@ -1965,7 +1969,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
1965 tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> 1969 tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
1966 FL[h]); 1970 FL[h]);
1967 son_number = B_N_CHILD_NUM(tb->FL[h], child_position); 1971 son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
1972 reiserfs_write_unlock(sb);
1968 bh = sb_bread(sb, son_number); 1973 bh = sb_bread(sb, son_number);
1974 reiserfs_write_lock(sb);
1969 if (!bh) 1975 if (!bh)
1970 return IO_ERROR; 1976 return IO_ERROR;
1971 if (FILESYSTEM_CHANGED_TB(tb)) { 1977 if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2003,7 +2009,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
2003 child_position = 2009 child_position =
2004 (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0; 2010 (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
2005 son_number = B_N_CHILD_NUM(tb->FR[h], child_position); 2011 son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
2012 reiserfs_write_unlock(sb);
2006 bh = sb_bread(sb, son_number); 2013 bh = sb_bread(sb, son_number);
2014 reiserfs_write_lock(sb);
2007 if (!bh) 2015 if (!bh)
2008 return IO_ERROR; 2016 return IO_ERROR;
2009 if (FILESYSTEM_CHANGED_TB(tb)) { 2017 if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2278,7 +2286,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
2278 REPEAT_SEARCH : CARRY_ON; 2286 REPEAT_SEARCH : CARRY_ON;
2279 } 2287 }
2280#endif 2288#endif
2289 reiserfs_write_unlock(tb->tb_sb);
2281 __wait_on_buffer(locked); 2290 __wait_on_buffer(locked);
2291 reiserfs_write_lock(tb->tb_sb);
2282 if (FILESYSTEM_CHANGED_TB(tb)) 2292 if (FILESYSTEM_CHANGED_TB(tb))
2283 return REPEAT_SEARCH; 2293 return REPEAT_SEARCH;
2284 } 2294 }
@@ -2349,12 +2359,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
2349 2359
2350 /* if it is possible in indirect_to_direct conversion */ 2360 /* if it is possible in indirect_to_direct conversion */
2351 if (buffer_locked(tbS0)) { 2361 if (buffer_locked(tbS0)) {
2362 reiserfs_write_unlock(tb->tb_sb);
2352 __wait_on_buffer(tbS0); 2363 __wait_on_buffer(tbS0);
2364 reiserfs_write_lock(tb->tb_sb);
2353 if (FILESYSTEM_CHANGED_TB(tb)) 2365 if (FILESYSTEM_CHANGED_TB(tb))
2354 return REPEAT_SEARCH; 2366 return REPEAT_SEARCH;
2355 } 2367 }
2356#ifdef CONFIG_REISERFS_CHECK 2368#ifdef CONFIG_REISERFS_CHECK
2357 if (cur_tb) { 2369 if (REISERFS_SB(tb->tb_sb)->cur_tb) {
2358 print_cur_tb("fix_nodes"); 2370 print_cur_tb("fix_nodes");
2359 reiserfs_panic(tb->tb_sb, "PAP-8305", 2371 reiserfs_panic(tb->tb_sb, "PAP-8305",
2360 "there is pending do_balance"); 2372 "there is pending do_balance");
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a14d6cd9eeda..dc2c65e04853 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,6 +11,7 @@
11#include <linux/smp_lock.h> 11#include <linux/smp_lock.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/highmem.h> 13#include <linux/highmem.h>
14#include <linux/slab.h>
14#include <asm/uaccess.h> 15#include <asm/uaccess.h>
15#include <asm/unaligned.h> 16#include <asm/unaligned.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
@@ -31,11 +32,15 @@ void reiserfs_delete_inode(struct inode *inode)
31 JOURNAL_PER_BALANCE_CNT * 2 + 32 JOURNAL_PER_BALANCE_CNT * 2 +
32 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); 33 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
33 struct reiserfs_transaction_handle th; 34 struct reiserfs_transaction_handle th;
35 int depth;
34 int err; 36 int err;
35 37
38 if (!is_bad_inode(inode))
39 dquot_initialize(inode);
40
36 truncate_inode_pages(&inode->i_data, 0); 41 truncate_inode_pages(&inode->i_data, 0);
37 42
38 reiserfs_write_lock(inode->i_sb); 43 depth = reiserfs_write_lock_once(inode->i_sb);
39 44
40 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ 45 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
41 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ 46 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
@@ -53,7 +58,7 @@ void reiserfs_delete_inode(struct inode *inode)
53 * after delete_object so that quota updates go into the same transaction as 58 * after delete_object so that quota updates go into the same transaction as
54 * stat data deletion */ 59 * stat data deletion */
55 if (!err) 60 if (!err)
56 vfs_dq_free_inode(inode); 61 dquot_free_inode(inode);
57 62
58 if (journal_end(&th, inode->i_sb, jbegin_count)) 63 if (journal_end(&th, inode->i_sb, jbegin_count))
59 goto out; 64 goto out;
@@ -74,7 +79,7 @@ void reiserfs_delete_inode(struct inode *inode)
74 out: 79 out:
75 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ 80 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */
76 inode->i_blocks = 0; 81 inode->i_blocks = 0;
77 reiserfs_write_unlock(inode->i_sb); 82 reiserfs_write_unlock_once(inode->i_sb, depth);
78} 83}
79 84
80static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, 85static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
@@ -251,7 +256,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
251 struct cpu_key key; 256 struct cpu_key key;
252 struct buffer_head *bh; 257 struct buffer_head *bh;
253 struct item_head *ih, tmp_ih; 258 struct item_head *ih, tmp_ih;
254 int fs_gen;
255 b_blocknr_t blocknr; 259 b_blocknr_t blocknr;
256 char *p = NULL; 260 char *p = NULL;
257 int chars; 261 int chars;
@@ -265,7 +269,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
265 (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 269 (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
266 3); 270 3);
267 271
268 research:
269 result = search_for_position_by_key(inode->i_sb, &key, &path); 272 result = search_for_position_by_key(inode->i_sb, &key, &path);
270 if (result != POSITION_FOUND) { 273 if (result != POSITION_FOUND) {
271 pathrelse(&path); 274 pathrelse(&path);
@@ -340,7 +343,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
340 } 343 }
341 // read file tail into part of page 344 // read file tail into part of page
342 offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); 345 offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
343 fs_gen = get_generation(inode->i_sb);
344 copy_item_head(&tmp_ih, ih); 346 copy_item_head(&tmp_ih, ih);
345 347
346 /* we only want to kmap if we are reading the tail into the page. 348 /* we only want to kmap if we are reading the tail into the page.
@@ -348,13 +350,9 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
348 ** sure we need to. But, this means the item might move if 350 ** sure we need to. But, this means the item might move if
349 ** kmap schedules 351 ** kmap schedules
350 */ 352 */
351 if (!p) { 353 if (!p)
352 p = (char *)kmap(bh_result->b_page); 354 p = (char *)kmap(bh_result->b_page);
353 if (fs_changed(fs_gen, inode->i_sb) 355
354 && item_moved(&tmp_ih, &path)) {
355 goto research;
356 }
357 }
358 p += offset; 356 p += offset;
359 memset(p, 0, inode->i_sb->s_blocksize); 357 memset(p, 0, inode->i_sb->s_blocksize);
360 do { 358 do {
@@ -489,10 +487,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
489 disappeared */ 487 disappeared */
490 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { 488 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
491 int err; 489 int err;
492 lock_kernel(); 490
491 reiserfs_write_lock(inode->i_sb);
492
493 err = reiserfs_commit_for_inode(inode); 493 err = reiserfs_commit_for_inode(inode);
494 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 494 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
495 unlock_kernel(); 495
496 reiserfs_write_unlock(inode->i_sb);
497
496 if (err < 0) 498 if (err < 0)
497 ret = err; 499 ret = err;
498 } 500 }
@@ -601,6 +603,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
601 __le32 *item; 603 __le32 *item;
602 int done; 604 int done;
603 int fs_gen; 605 int fs_gen;
606 int lock_depth;
604 struct reiserfs_transaction_handle *th = NULL; 607 struct reiserfs_transaction_handle *th = NULL;
605 /* space reserved in transaction batch: 608 /* space reserved in transaction batch:
606 . 3 balancings in direct->indirect conversion 609 . 3 balancings in direct->indirect conversion
@@ -616,12 +619,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
616 loff_t new_offset = 619 loff_t new_offset =
617 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; 620 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
618 621
619 /* bad.... */ 622 lock_depth = reiserfs_write_lock_once(inode->i_sb);
620 reiserfs_write_lock(inode->i_sb);
621 version = get_inode_item_key_version(inode); 623 version = get_inode_item_key_version(inode);
622 624
623 if (!file_capable(inode, block)) { 625 if (!file_capable(inode, block)) {
624 reiserfs_write_unlock(inode->i_sb); 626 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
625 return -EFBIG; 627 return -EFBIG;
626 } 628 }
627 629
@@ -633,7 +635,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
633 /* find number of block-th logical block of the file */ 635 /* find number of block-th logical block of the file */
634 ret = _get_block_create_0(inode, block, bh_result, 636 ret = _get_block_create_0(inode, block, bh_result,
635 create | GET_BLOCK_READ_DIRECT); 637 create | GET_BLOCK_READ_DIRECT);
636 reiserfs_write_unlock(inode->i_sb); 638 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
637 return ret; 639 return ret;
638 } 640 }
639 /* 641 /*
@@ -751,7 +753,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
751 if (!dangle && th) 753 if (!dangle && th)
752 retval = reiserfs_end_persistent_transaction(th); 754 retval = reiserfs_end_persistent_transaction(th);
753 755
754 reiserfs_write_unlock(inode->i_sb); 756 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
755 757
756 /* the item was found, so new blocks were not added to the file 758 /* the item was found, so new blocks were not added to the file
757 ** there is no need to make sure the inode is updated with this 759 ** there is no need to make sure the inode is updated with this
@@ -935,7 +937,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
935 if (blocks_needed == 1) { 937 if (blocks_needed == 1) {
936 un = &unf_single; 938 un = &unf_single;
937 } else { 939 } else {
938 un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC); // We need to avoid scheduling. 940 un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS);
939 if (!un) { 941 if (!un) {
940 un = &unf_single; 942 un = &unf_single;
941 blocks_needed = 1; 943 blocks_needed = 1;
@@ -997,10 +999,16 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
997 if (retval) 999 if (retval)
998 goto failure; 1000 goto failure;
999 } 1001 }
1000 /* inserting indirect pointers for a hole can take a 1002 /*
1001 ** long time. reschedule if needed 1003 * inserting indirect pointers for a hole can take a
1004 * long time. reschedule if needed and also release the write
1005 * lock for others.
1002 */ 1006 */
1003 cond_resched(); 1007 if (need_resched()) {
1008 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1009 schedule();
1010 lock_depth = reiserfs_write_lock_once(inode->i_sb);
1011 }
1004 1012
1005 retval = search_for_position_by_key(inode->i_sb, &key, &path); 1013 retval = search_for_position_by_key(inode->i_sb, &key, &path);
1006 if (retval == IO_ERROR) { 1014 if (retval == IO_ERROR) {
@@ -1035,7 +1043,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
1035 retval = err; 1043 retval = err;
1036 } 1044 }
1037 1045
1038 reiserfs_write_unlock(inode->i_sb); 1046 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1039 reiserfs_check_path(&path); 1047 reiserfs_check_path(&path);
1040 return retval; 1048 return retval;
1041} 1049}
@@ -1493,9 +1501,11 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
1493 1501
1494 args.objectid = key->on_disk_key.k_objectid; 1502 args.objectid = key->on_disk_key.k_objectid;
1495 args.dirid = key->on_disk_key.k_dir_id; 1503 args.dirid = key->on_disk_key.k_dir_id;
1504 reiserfs_write_unlock(s);
1496 inode = iget5_locked(s, key->on_disk_key.k_objectid, 1505 inode = iget5_locked(s, key->on_disk_key.k_objectid,
1497 reiserfs_find_actor, reiserfs_init_locked_inode, 1506 reiserfs_find_actor, reiserfs_init_locked_inode,
1498 (void *)(&args)); 1507 (void *)(&args));
1508 reiserfs_write_lock(s);
1499 if (!inode) 1509 if (!inode)
1500 return ERR_PTR(-ENOMEM); 1510 return ERR_PTR(-ENOMEM);
1501 1511
@@ -1609,7 +1619,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1609** to properly mark inodes for datasync and such, but only actually 1619** to properly mark inodes for datasync and such, but only actually
1610** does something when called for a synchronous update. 1620** does something when called for a synchronous update.
1611*/ 1621*/
1612int reiserfs_write_inode(struct inode *inode, int do_sync) 1622int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1613{ 1623{
1614 struct reiserfs_transaction_handle th; 1624 struct reiserfs_transaction_handle th;
1615 int jbegin_count = 1; 1625 int jbegin_count = 1;
@@ -1621,7 +1631,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync)
1621 ** inode needs to reach disk for safety, and they can safely be 1631 ** inode needs to reach disk for safety, and they can safely be
1622 ** ignored because the altered inode has already been logged. 1632 ** ignored because the altered inode has already been logged.
1623 */ 1633 */
1624 if (do_sync && !(current->flags & PF_MEMALLOC)) { 1634 if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
1625 reiserfs_write_lock(inode->i_sb); 1635 reiserfs_write_lock(inode->i_sb);
1626 if (!journal_begin(&th, inode->i_sb, jbegin_count)) { 1636 if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1627 reiserfs_update_sd(&th, inode); 1637 reiserfs_update_sd(&th, inode);
@@ -1759,10 +1769,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1759 1769
1760 BUG_ON(!th->t_trans_id); 1770 BUG_ON(!th->t_trans_id);
1761 1771
1762 if (vfs_dq_alloc_inode(inode)) { 1772 dquot_initialize(inode);
1763 err = -EDQUOT; 1773 err = dquot_alloc_inode(inode);
1774 if (err)
1764 goto out_end_trans; 1775 goto out_end_trans;
1765 }
1766 if (!dir->i_nlink) { 1776 if (!dir->i_nlink) {
1767 err = -EPERM; 1777 err = -EPERM;
1768 goto out_bad_inode; 1778 goto out_bad_inode;
@@ -1953,12 +1963,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1953 INODE_PKEY(inode)->k_objectid = 0; 1963 INODE_PKEY(inode)->k_objectid = 0;
1954 1964
1955 /* Quota change must be inside a transaction for journaling */ 1965 /* Quota change must be inside a transaction for journaling */
1956 vfs_dq_free_inode(inode); 1966 dquot_free_inode(inode);
1957 1967
1958 out_end_trans: 1968 out_end_trans:
1959 journal_end(th, th->t_super, th->t_blocks_allocated); 1969 journal_end(th, th->t_super, th->t_blocks_allocated);
1960 /* Drop can be outside and it needs more credits so it's better to have it outside */ 1970 /* Drop can be outside and it needs more credits so it's better to have it outside */
1961 vfs_dq_drop(inode); 1971 dquot_drop(inode);
1962 inode->i_flags |= S_NOQUOTA; 1972 inode->i_flags |= S_NOQUOTA;
1963 make_bad_inode(inode); 1973 make_bad_inode(inode);
1964 1974
@@ -2072,8 +2082,9 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
2072 int error; 2082 int error;
2073 struct buffer_head *bh = NULL; 2083 struct buffer_head *bh = NULL;
2074 int err2; 2084 int err2;
2085 int lock_depth;
2075 2086
2076 reiserfs_write_lock(inode->i_sb); 2087 lock_depth = reiserfs_write_lock_once(inode->i_sb);
2077 2088
2078 if (inode->i_size > 0) { 2089 if (inode->i_size > 0) {
2079 error = grab_tail_page(inode, &page, &bh); 2090 error = grab_tail_page(inode, &page, &bh);
@@ -2142,14 +2153,17 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
2142 page_cache_release(page); 2153 page_cache_release(page);
2143 } 2154 }
2144 2155
2145 reiserfs_write_unlock(inode->i_sb); 2156 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2157
2146 return 0; 2158 return 0;
2147 out: 2159 out:
2148 if (page) { 2160 if (page) {
2149 unlock_page(page); 2161 unlock_page(page);
2150 page_cache_release(page); 2162 page_cache_release(page);
2151 } 2163 }
2152 reiserfs_write_unlock(inode->i_sb); 2164
2165 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2166
2153 return error; 2167 return error;
2154} 2168}
2155 2169
@@ -2531,6 +2545,12 @@ static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
2531 return reiserfs_write_full_page(page, wbc); 2545 return reiserfs_write_full_page(page, wbc);
2532} 2546}
2533 2547
2548static void reiserfs_truncate_failed_write(struct inode *inode)
2549{
2550 truncate_inode_pages(inode->i_mapping, inode->i_size);
2551 reiserfs_truncate_file(inode, 0);
2552}
2553
2534static int reiserfs_write_begin(struct file *file, 2554static int reiserfs_write_begin(struct file *file,
2535 struct address_space *mapping, 2555 struct address_space *mapping,
2536 loff_t pos, unsigned len, unsigned flags, 2556 loff_t pos, unsigned len, unsigned flags,
@@ -2597,6 +2617,8 @@ static int reiserfs_write_begin(struct file *file,
2597 if (ret) { 2617 if (ret) {
2598 unlock_page(page); 2618 unlock_page(page);
2599 page_cache_release(page); 2619 page_cache_release(page);
2620 /* Truncate allocated blocks */
2621 reiserfs_truncate_failed_write(inode);
2600 } 2622 }
2601 return ret; 2623 return ret;
2602} 2624}
@@ -2608,7 +2630,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
2608 int ret; 2630 int ret;
2609 int old_ref = 0; 2631 int old_ref = 0;
2610 2632
2633 reiserfs_write_unlock(inode->i_sb);
2611 reiserfs_wait_on_write_block(inode->i_sb); 2634 reiserfs_wait_on_write_block(inode->i_sb);
2635 reiserfs_write_lock(inode->i_sb);
2636
2612 fix_tail_page_for_writing(page); 2637 fix_tail_page_for_writing(page);
2613 if (reiserfs_transaction_running(inode->i_sb)) { 2638 if (reiserfs_transaction_running(inode->i_sb)) {
2614 struct reiserfs_transaction_handle *th; 2639 struct reiserfs_transaction_handle *th;
@@ -2664,6 +2689,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2664 int update_sd = 0; 2689 int update_sd = 0;
2665 struct reiserfs_transaction_handle *th; 2690 struct reiserfs_transaction_handle *th;
2666 unsigned start; 2691 unsigned start;
2692 int lock_depth = 0;
2693 bool locked = false;
2667 2694
2668 if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) 2695 if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
2669 pos ++; 2696 pos ++;
@@ -2689,10 +2716,10 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2689 ** transaction tracking stuff when the size changes. So, we have 2716 ** transaction tracking stuff when the size changes. So, we have
2690 ** to do the i_size updates here. 2717 ** to do the i_size updates here.
2691 */ 2718 */
2692 pos += copied; 2719 if (pos + copied > inode->i_size) {
2693 if (pos > inode->i_size) {
2694 struct reiserfs_transaction_handle myth; 2720 struct reiserfs_transaction_handle myth;
2695 reiserfs_write_lock(inode->i_sb); 2721 lock_depth = reiserfs_write_lock_once(inode->i_sb);
2722 locked = true;
2696 /* If the file has grown beyond the border where it 2723 /* If the file has grown beyond the border where it
2697 can have a tail, unmark it as needing a tail 2724 can have a tail, unmark it as needing a tail
2698 packing */ 2725 packing */
@@ -2703,12 +2730,11 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2703 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 2730 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2704 2731
2705 ret = journal_begin(&myth, inode->i_sb, 1); 2732 ret = journal_begin(&myth, inode->i_sb, 1);
2706 if (ret) { 2733 if (ret)
2707 reiserfs_write_unlock(inode->i_sb);
2708 goto journal_error; 2734 goto journal_error;
2709 } 2735
2710 reiserfs_update_inode_transaction(inode); 2736 reiserfs_update_inode_transaction(inode);
2711 inode->i_size = pos; 2737 inode->i_size = pos + copied;
2712 /* 2738 /*
2713 * this will just nest into our transaction. It's important 2739 * this will just nest into our transaction. It's important
2714 * to use mark_inode_dirty so the inode gets pushed around on the 2740 * to use mark_inode_dirty so the inode gets pushed around on the
@@ -2718,34 +2744,40 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2718 reiserfs_update_sd(&myth, inode); 2744 reiserfs_update_sd(&myth, inode);
2719 update_sd = 1; 2745 update_sd = 1;
2720 ret = journal_end(&myth, inode->i_sb, 1); 2746 ret = journal_end(&myth, inode->i_sb, 1);
2721 reiserfs_write_unlock(inode->i_sb);
2722 if (ret) 2747 if (ret)
2723 goto journal_error; 2748 goto journal_error;
2724 } 2749 }
2725 if (th) { 2750 if (th) {
2726 reiserfs_write_lock(inode->i_sb); 2751 if (!locked) {
2752 lock_depth = reiserfs_write_lock_once(inode->i_sb);
2753 locked = true;
2754 }
2727 if (!update_sd) 2755 if (!update_sd)
2728 mark_inode_dirty(inode); 2756 mark_inode_dirty(inode);
2729 ret = reiserfs_end_persistent_transaction(th); 2757 ret = reiserfs_end_persistent_transaction(th);
2730 reiserfs_write_unlock(inode->i_sb);
2731 if (ret) 2758 if (ret)
2732 goto out; 2759 goto out;
2733 } 2760 }
2734 2761
2735 out: 2762 out:
2763 if (locked)
2764 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2736 unlock_page(page); 2765 unlock_page(page);
2737 page_cache_release(page); 2766 page_cache_release(page);
2767
2768 if (pos + len > inode->i_size)
2769 reiserfs_truncate_failed_write(inode);
2770
2738 return ret == 0 ? copied : ret; 2771 return ret == 0 ? copied : ret;
2739 2772
2740 journal_error: 2773 journal_error:
2774 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2775 locked = false;
2741 if (th) { 2776 if (th) {
2742 reiserfs_write_lock(inode->i_sb);
2743 if (!update_sd) 2777 if (!update_sd)
2744 reiserfs_update_sd(th, inode); 2778 reiserfs_update_sd(th, inode);
2745 ret = reiserfs_end_persistent_transaction(th); 2779 ret = reiserfs_end_persistent_transaction(th);
2746 reiserfs_write_unlock(inode->i_sb);
2747 } 2780 }
2748
2749 goto out; 2781 goto out;
2750} 2782}
2751 2783
@@ -2758,7 +2790,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2758 int update_sd = 0; 2790 int update_sd = 0;
2759 struct reiserfs_transaction_handle *th = NULL; 2791 struct reiserfs_transaction_handle *th = NULL;
2760 2792
2793 reiserfs_write_unlock(inode->i_sb);
2761 reiserfs_wait_on_write_block(inode->i_sb); 2794 reiserfs_wait_on_write_block(inode->i_sb);
2795 reiserfs_write_lock(inode->i_sb);
2796
2762 if (reiserfs_transaction_running(inode->i_sb)) { 2797 if (reiserfs_transaction_running(inode->i_sb)) {
2763 th = current->journal_info; 2798 th = current->journal_info;
2764 } 2799 }
@@ -2770,7 +2805,6 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2770 */ 2805 */
2771 if (pos > inode->i_size) { 2806 if (pos > inode->i_size) {
2772 struct reiserfs_transaction_handle myth; 2807 struct reiserfs_transaction_handle myth;
2773 reiserfs_write_lock(inode->i_sb);
2774 /* If the file has grown beyond the border where it 2808 /* If the file has grown beyond the border where it
2775 can have a tail, unmark it as needing a tail 2809 can have a tail, unmark it as needing a tail
2776 packing */ 2810 packing */
@@ -2781,10 +2815,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2781 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 2815 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2782 2816
2783 ret = journal_begin(&myth, inode->i_sb, 1); 2817 ret = journal_begin(&myth, inode->i_sb, 1);
2784 if (ret) { 2818 if (ret)
2785 reiserfs_write_unlock(inode->i_sb);
2786 goto journal_error; 2819 goto journal_error;
2787 } 2820
2788 reiserfs_update_inode_transaction(inode); 2821 reiserfs_update_inode_transaction(inode);
2789 inode->i_size = pos; 2822 inode->i_size = pos;
2790 /* 2823 /*
@@ -2796,16 +2829,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2796 reiserfs_update_sd(&myth, inode); 2829 reiserfs_update_sd(&myth, inode);
2797 update_sd = 1; 2830 update_sd = 1;
2798 ret = journal_end(&myth, inode->i_sb, 1); 2831 ret = journal_end(&myth, inode->i_sb, 1);
2799 reiserfs_write_unlock(inode->i_sb);
2800 if (ret) 2832 if (ret)
2801 goto journal_error; 2833 goto journal_error;
2802 } 2834 }
2803 if (th) { 2835 if (th) {
2804 reiserfs_write_lock(inode->i_sb);
2805 if (!update_sd) 2836 if (!update_sd)
2806 mark_inode_dirty(inode); 2837 mark_inode_dirty(inode);
2807 ret = reiserfs_end_persistent_transaction(th); 2838 ret = reiserfs_end_persistent_transaction(th);
2808 reiserfs_write_unlock(inode->i_sb);
2809 if (ret) 2839 if (ret)
2810 goto out; 2840 goto out;
2811 } 2841 }
@@ -2815,11 +2845,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2815 2845
2816 journal_error: 2846 journal_error:
2817 if (th) { 2847 if (th) {
2818 reiserfs_write_lock(inode->i_sb);
2819 if (!update_sd) 2848 if (!update_sd)
2820 reiserfs_update_sd(th, inode); 2849 reiserfs_update_sd(th, inode);
2821 ret = reiserfs_end_persistent_transaction(th); 2850 ret = reiserfs_end_persistent_transaction(th);
2822 reiserfs_write_unlock(inode->i_sb);
2823 } 2851 }
2824 2852
2825 return ret; 2853 return ret;
@@ -3040,14 +3068,17 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3040int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) 3068int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3041{ 3069{
3042 struct inode *inode = dentry->d_inode; 3070 struct inode *inode = dentry->d_inode;
3043 int error;
3044 unsigned int ia_valid; 3071 unsigned int ia_valid;
3072 int depth;
3073 int error;
3045 3074
3046 /* must be turned off for recursive notify_change calls */ 3075 /* must be turned off for recursive notify_change calls */
3047 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); 3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3048 3077
3049 reiserfs_write_lock(inode->i_sb); 3078 depth = reiserfs_write_lock_once(inode->i_sb);
3050 if (attr->ia_valid & ATTR_SIZE) { 3079 if (attr->ia_valid & ATTR_SIZE) {
3080 dquot_initialize(inode);
3081
3051 /* version 2 items will be caught by the s_maxbytes check 3082 /* version 2 items will be caught by the s_maxbytes check
3052 ** done for us in vmtruncate 3083 ** done for us in vmtruncate
3053 */ 3084 */
@@ -3109,8 +3140,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3109 jbegin_count); 3140 jbegin_count);
3110 if (error) 3141 if (error)
3111 goto out; 3142 goto out;
3112 error = 3143 error = dquot_transfer(inode, attr);
3113 vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
3114 if (error) { 3144 if (error) {
3115 journal_end(&th, inode->i_sb, 3145 journal_end(&th, inode->i_sb,
3116 jbegin_count); 3146 jbegin_count);
@@ -3127,8 +3157,17 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3127 journal_end(&th, inode->i_sb, jbegin_count); 3157 journal_end(&th, inode->i_sb, jbegin_count);
3128 } 3158 }
3129 } 3159 }
3130 if (!error) 3160 if (!error) {
3161 /*
3162 * Relax the lock here: inode_setattr() may truncate the
3163 * inode pages and wait on page locks, and the owner of
3164 * such a page lock may itself need the reiserfs lock
3165 * to release it.
3166 */
3167 reiserfs_write_unlock_once(inode->i_sb, depth);
3131 error = inode_setattr(inode, attr); 3168 error = inode_setattr(inode, attr);
3169 depth = reiserfs_write_lock_once(inode->i_sb);
3170 }
3132 } 3171 }
3133 3172
3134 if (!error && reiserfs_posixacl(inode->i_sb)) { 3173 if (!error && reiserfs_posixacl(inode->i_sb)) {
@@ -3137,7 +3176,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3137 } 3176 }
3138 3177
3139 out: 3178 out:
3140 reiserfs_write_unlock(inode->i_sb); 3179 reiserfs_write_unlock_once(inode->i_sb, depth);
3180
3141 return error; 3181 return error;
3142} 3182}
3143 3183
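
The reiserfs_setattr() hunks above show the central pattern of this series: the new per-superblock mutex, unlike the BKL, is not dropped automatically across schedule(), so it has to be released explicitly around calls that can sleep on page locks. A minimal sketch of that pattern, using the reiserfs_write_lock_once()/reiserfs_write_unlock_once() primitives added in fs/reiserfs/lock.c further down (illustration only, not the verbatim kernel code):

    int depth;

    depth = reiserfs_write_lock_once(inode->i_sb);
    /* ... journaled work that requires the write lock ... */

    /*
     * inode_setattr() may truncate pages and wait on page locks,
     * and a page-lock owner may in turn need the reiserfs lock,
     * so drop it across the call to avoid a deadlock.
     */
    reiserfs_write_unlock_once(inode->i_sb, depth);
    error = inode_setattr(inode, attr);
    depth = reiserfs_write_lock_once(inode->i_sb);

    reiserfs_write_unlock_once(inode->i_sb, depth);
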
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 0ccc3fdda7bf..f53505de0712 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -13,44 +13,52 @@
13#include <linux/compat.h> 13#include <linux/compat.h>
14 14
15/* 15/*
16** reiserfs_ioctl - handler for ioctl for inode 16 * reiserfs_ioctl - handler for ioctl for inode
17** supported commands: 17 * supported commands:
18** 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect 18 * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
19** and prevent packing file (argument arg has to be non-zero) 19 * and prevent packing file (argument arg has to be non-zero)
20** 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION 20 * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
21** 3) That's all for a while ... 21 * 3) That's all for a while ...
22*/ 22 */
23int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 23long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 24{
25 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 26 unsigned int flags;
27 int err = 0; 27 int err = 0;
28 28
29 reiserfs_write_lock(inode->i_sb);
30
29 switch (cmd) { 31 switch (cmd) {
30 case REISERFS_IOC_UNPACK: 32 case REISERFS_IOC_UNPACK:
31 if (S_ISREG(inode->i_mode)) { 33 if (S_ISREG(inode->i_mode)) {
32 if (arg) 34 if (arg)
33 return reiserfs_unpack(inode, filp); 35 err = reiserfs_unpack(inode, filp);
34 else
35 return 0;
36 } else 36 } else
37 return -ENOTTY; 37 err = -ENOTTY;
38 /* following two cases are taken from fs/ext2/ioctl.c by Remy 38 break;
39 Card (card@masi.ibp.fr) */ 39 /*
40 * The following two cases are taken from fs/ext2/ioctl.c by Remy
41 * Card (card@masi.ibp.fr)
42 */
40 case REISERFS_IOC_GETFLAGS: 43 case REISERFS_IOC_GETFLAGS:
41 if (!reiserfs_attrs(inode->i_sb)) 44 if (!reiserfs_attrs(inode->i_sb)) {
42 return -ENOTTY; 45 err = -ENOTTY;
46 break;
47 }
43 48
44 flags = REISERFS_I(inode)->i_attrs; 49 flags = REISERFS_I(inode)->i_attrs;
45 i_attrs_to_sd_attrs(inode, (__u16 *) & flags); 50 i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
46 return put_user(flags, (int __user *)arg); 51 err = put_user(flags, (int __user *)arg);
52 break;
47 case REISERFS_IOC_SETFLAGS:{ 53 case REISERFS_IOC_SETFLAGS:{
48 if (!reiserfs_attrs(inode->i_sb)) 54 if (!reiserfs_attrs(inode->i_sb)) {
49 return -ENOTTY; 55 err = -ENOTTY;
56 break;
57 }
50 58
51 err = mnt_want_write(filp->f_path.mnt); 59 err = mnt_want_write(filp->f_path.mnt);
52 if (err) 60 if (err)
53 return err; 61 break;
54 62
55 if (!is_owner_or_cap(inode)) { 63 if (!is_owner_or_cap(inode)) {
56 err = -EPERM; 64 err = -EPERM;
@@ -90,16 +98,19 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
90 mark_inode_dirty(inode); 98 mark_inode_dirty(inode);
91setflags_out: 99setflags_out:
92 mnt_drop_write(filp->f_path.mnt); 100 mnt_drop_write(filp->f_path.mnt);
93 return err; 101 break;
94 } 102 }
95 case REISERFS_IOC_GETVERSION: 103 case REISERFS_IOC_GETVERSION:
96 return put_user(inode->i_generation, (int __user *)arg); 104 err = put_user(inode->i_generation, (int __user *)arg);
105 break;
97 case REISERFS_IOC_SETVERSION: 106 case REISERFS_IOC_SETVERSION:
98 if (!is_owner_or_cap(inode)) 107 if (!is_owner_or_cap(inode)) {
99 return -EPERM; 108 err = -EPERM;
109 break;
110 }
100 err = mnt_want_write(filp->f_path.mnt); 111 err = mnt_want_write(filp->f_path.mnt);
101 if (err) 112 if (err)
102 return err; 113 break;
103 if (get_user(inode->i_generation, (int __user *)arg)) { 114 if (get_user(inode->i_generation, (int __user *)arg)) {
104 err = -EFAULT; 115 err = -EFAULT;
105 goto setversion_out; 116 goto setversion_out;
@@ -108,19 +119,20 @@ setflags_out:
108 mark_inode_dirty(inode); 119 mark_inode_dirty(inode);
109setversion_out: 120setversion_out:
110 mnt_drop_write(filp->f_path.mnt); 121 mnt_drop_write(filp->f_path.mnt);
111 return err; 122 break;
112 default: 123 default:
113 return -ENOTTY; 124 err = -ENOTTY;
114 } 125 }
126
127 reiserfs_write_unlock(inode->i_sb);
128
129 return err;
115} 130}
116 131
117#ifdef CONFIG_COMPAT 132#ifdef CONFIG_COMPAT
118long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, 133long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
119 unsigned long arg) 134 unsigned long arg)
120{ 135{
121 struct inode *inode = file->f_path.dentry->d_inode;
122 int ret;
123
124 /* These are just misnamed, they actually get/put from/to user an int */ 136 /* These are just misnamed, they actually get/put from/to user an int */
125 switch (cmd) { 137 switch (cmd) {
126 case REISERFS_IOC32_UNPACK: 138 case REISERFS_IOC32_UNPACK:
@@ -141,10 +153,8 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
141 default: 153 default:
142 return -ENOIOCTLCMD; 154 return -ENOIOCTLCMD;
143 } 155 }
144 lock_kernel(); 156
145 ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); 157 return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
146 unlock_kernel();
147 return ret;
148} 158}
149#endif 159#endif
150 160
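
The ioctl conversion above also switches to a single-exit discipline: the write lock is taken once at entry, every early return becomes an err assignment plus break, and the lock is released at the one exit point. A sketch of the resulting shape, with a hypothetical command and helper standing in for the real cases:

    long example_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
    {
        struct inode *inode = filp->f_path.dentry->d_inode;
        int err = 0;

        reiserfs_write_lock(inode->i_sb);

        switch (cmd) {
        case EXAMPLE_IOC_CMD:                 /* hypothetical command */
            err = example_do_cmd(inode, arg); /* hypothetical helper */
            break;
        default:
            err = -ENOTTY;
        }

        reiserfs_write_unlock(inode->i_sb);
        return err;
    }
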
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 90622200b39c..19fbc810e8e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -50,6 +50,7 @@
50#include <linux/blkdev.h> 50#include <linux/blkdev.h>
51#include <linux/backing-dev.h> 51#include <linux/backing-dev.h>
52#include <linux/uaccess.h> 52#include <linux/uaccess.h>
53#include <linux/slab.h>
53 54
54#include <asm/system.h> 55#include <asm/system.h>
55 56
@@ -429,21 +430,6 @@ static void clear_prepared_bits(struct buffer_head *bh)
429 clear_buffer_journal_restore_dirty(bh); 430 clear_buffer_journal_restore_dirty(bh);
430} 431}
431 432
432/* utility function to force a BUG if it is called without the big
433** kernel lock held. caller is the string printed just before calling BUG()
434*/
435void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
436{
437#ifdef CONFIG_SMP
438 if (current->lock_depth < 0) {
439 reiserfs_panic(sb, "journal-1", "%s called without kernel "
440 "lock held", caller);
441 }
442#else
443 ;
444#endif
445}
446
447/* return a cnode with same dev, block number and size in table, or null if not found */ 433/* return a cnode with same dev, block number and size in table, or null if not found */
448static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct 434static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
449 super_block 435 super_block
@@ -556,7 +542,8 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
556static inline void lock_journal(struct super_block *sb) 542static inline void lock_journal(struct super_block *sb)
557{ 543{
558 PROC_INFO_INC(sb, journal.lock_journal); 544 PROC_INFO_INC(sb, journal.lock_journal);
559 mutex_lock(&SB_JOURNAL(sb)->j_mutex); 545
546 reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
560} 547}
561 548
562/* unlock the current transaction */ 549/* unlock the current transaction */
@@ -708,7 +695,9 @@ static void check_barrier_completion(struct super_block *s,
708 disable_barrier(s); 695 disable_barrier(s);
709 set_buffer_uptodate(bh); 696 set_buffer_uptodate(bh);
710 set_buffer_dirty(bh); 697 set_buffer_dirty(bh);
698 reiserfs_write_unlock(s);
711 sync_dirty_buffer(bh); 699 sync_dirty_buffer(bh);
700 reiserfs_write_lock(s);
712 } 701 }
713} 702}
714 703
@@ -996,8 +985,13 @@ static int reiserfs_async_progress_wait(struct super_block *s)
996{ 985{
997 DEFINE_WAIT(wait); 986 DEFINE_WAIT(wait);
998 struct reiserfs_journal *j = SB_JOURNAL(s); 987 struct reiserfs_journal *j = SB_JOURNAL(s);
999 if (atomic_read(&j->j_async_throttle)) 988
989 if (atomic_read(&j->j_async_throttle)) {
990 reiserfs_write_unlock(s);
1000 congestion_wait(BLK_RW_ASYNC, HZ / 10); 991 congestion_wait(BLK_RW_ASYNC, HZ / 10);
992 reiserfs_write_lock(s);
993 }
994
1001 return 0; 995 return 0;
1002} 996}
1003 997
@@ -1043,7 +1037,8 @@ static int flush_commit_list(struct super_block *s,
1043 } 1037 }
1044 1038
1045 /* make sure nobody is trying to flush this one at the same time */ 1039 /* make sure nobody is trying to flush this one at the same time */
1046 mutex_lock(&jl->j_commit_mutex); 1040 reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
1041
1047 if (!journal_list_still_alive(s, trans_id)) { 1042 if (!journal_list_still_alive(s, trans_id)) {
1048 mutex_unlock(&jl->j_commit_mutex); 1043 mutex_unlock(&jl->j_commit_mutex);
1049 goto put_jl; 1044 goto put_jl;
@@ -1061,12 +1056,17 @@ static int flush_commit_list(struct super_block *s,
1061 1056
1062 if (!list_empty(&jl->j_bh_list)) { 1057 if (!list_empty(&jl->j_bh_list)) {
1063 int ret; 1058 int ret;
1064 unlock_kernel(); 1059
1060 /*
1061 * We might sleep in numerous places inside
1062 * write_ordered_buffers. Relax the write lock.
1063 */
1064 reiserfs_write_unlock(s);
1065 ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, 1065 ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
1066 journal, jl, &jl->j_bh_list); 1066 journal, jl, &jl->j_bh_list);
1067 if (ret < 0 && retval == 0) 1067 if (ret < 0 && retval == 0)
1068 retval = ret; 1068 retval = ret;
1069 lock_kernel(); 1069 reiserfs_write_lock(s);
1070 } 1070 }
1071 BUG_ON(!list_empty(&jl->j_bh_list)); 1071 BUG_ON(!list_empty(&jl->j_bh_list));
1072 /* 1072 /*
@@ -1085,8 +1085,11 @@ static int flush_commit_list(struct super_block *s,
1085 SB_ONDISK_JOURNAL_SIZE(s); 1085 SB_ONDISK_JOURNAL_SIZE(s);
1086 tbh = journal_find_get_block(s, bn); 1086 tbh = journal_find_get_block(s, bn);
1087 if (tbh) { 1087 if (tbh) {
1088 if (buffer_dirty(tbh)) 1088 if (buffer_dirty(tbh)) {
1089 ll_rw_block(WRITE, 1, &tbh) ; 1089 reiserfs_write_unlock(s);
1090 ll_rw_block(WRITE, 1, &tbh);
1091 reiserfs_write_lock(s);
1092 }
1090 put_bh(tbh) ; 1093 put_bh(tbh) ;
1091 } 1094 }
1092 } 1095 }
@@ -1114,12 +1117,19 @@ static int flush_commit_list(struct super_block *s,
1114 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + 1117 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
1115 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); 1118 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
1116 tbh = journal_find_get_block(s, bn); 1119 tbh = journal_find_get_block(s, bn);
1120
1121 reiserfs_write_unlock(s);
1117 wait_on_buffer(tbh); 1122 wait_on_buffer(tbh);
1123 reiserfs_write_lock(s);
1118 // since we're using ll_rw_blk above, it might have skipped over 1124 // since we're using ll_rw_blk above, it might have skipped over
1119 // a locked buffer. Double check here 1125 // a locked buffer. Double check here
1120 // 1126 //
1121 if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ 1127 /* redundant, sync_dirty_buffer() checks */
1128 if (buffer_dirty(tbh)) {
1129 reiserfs_write_unlock(s);
1122 sync_dirty_buffer(tbh); 1130 sync_dirty_buffer(tbh);
1131 reiserfs_write_lock(s);
1132 }
1123 if (unlikely(!buffer_uptodate(tbh))) { 1133 if (unlikely(!buffer_uptodate(tbh))) {
1124#ifdef CONFIG_REISERFS_CHECK 1134#ifdef CONFIG_REISERFS_CHECK
1125 reiserfs_warning(s, "journal-601", 1135 reiserfs_warning(s, "journal-601",
@@ -1143,10 +1153,15 @@ static int flush_commit_list(struct super_block *s,
1143 if (buffer_dirty(jl->j_commit_bh)) 1153 if (buffer_dirty(jl->j_commit_bh))
1144 BUG(); 1154 BUG();
1145 mark_buffer_dirty(jl->j_commit_bh) ; 1155 mark_buffer_dirty(jl->j_commit_bh) ;
1156 reiserfs_write_unlock(s);
1146 sync_dirty_buffer(jl->j_commit_bh) ; 1157 sync_dirty_buffer(jl->j_commit_bh) ;
1158 reiserfs_write_lock(s);
1147 } 1159 }
1148 } else 1160 } else {
1161 reiserfs_write_unlock(s);
1149 wait_on_buffer(jl->j_commit_bh); 1162 wait_on_buffer(jl->j_commit_bh);
1163 reiserfs_write_lock(s);
1164 }
1150 1165
1151 check_barrier_completion(s, jl->j_commit_bh); 1166 check_barrier_completion(s, jl->j_commit_bh);
1152 1167
@@ -1286,7 +1301,9 @@ static int _update_journal_header_block(struct super_block *sb,
1286 1301
1287 if (trans_id >= journal->j_last_flush_trans_id) { 1302 if (trans_id >= journal->j_last_flush_trans_id) {
1288 if (buffer_locked((journal->j_header_bh))) { 1303 if (buffer_locked((journal->j_header_bh))) {
1304 reiserfs_write_unlock(sb);
1289 wait_on_buffer((journal->j_header_bh)); 1305 wait_on_buffer((journal->j_header_bh));
1306 reiserfs_write_lock(sb);
1290 if (unlikely(!buffer_uptodate(journal->j_header_bh))) { 1307 if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
1291#ifdef CONFIG_REISERFS_CHECK 1308#ifdef CONFIG_REISERFS_CHECK
1292 reiserfs_warning(sb, "journal-699", 1309 reiserfs_warning(sb, "journal-699",
@@ -1312,12 +1329,16 @@ static int _update_journal_header_block(struct super_block *sb,
1312 disable_barrier(sb); 1329 disable_barrier(sb);
1313 goto sync; 1330 goto sync;
1314 } 1331 }
1332 reiserfs_write_unlock(sb);
1315 wait_on_buffer(journal->j_header_bh); 1333 wait_on_buffer(journal->j_header_bh);
1334 reiserfs_write_lock(sb);
1316 check_barrier_completion(sb, journal->j_header_bh); 1335 check_barrier_completion(sb, journal->j_header_bh);
1317 } else { 1336 } else {
1318 sync: 1337 sync:
1319 set_buffer_dirty(journal->j_header_bh); 1338 set_buffer_dirty(journal->j_header_bh);
1339 reiserfs_write_unlock(sb);
1320 sync_dirty_buffer(journal->j_header_bh); 1340 sync_dirty_buffer(journal->j_header_bh);
1341 reiserfs_write_lock(sb);
1321 } 1342 }
1322 if (!buffer_uptodate(journal->j_header_bh)) { 1343 if (!buffer_uptodate(journal->j_header_bh)) {
1323 reiserfs_warning(sb, "journal-837", 1344 reiserfs_warning(sb, "journal-837",
@@ -1409,7 +1430,7 @@ static int flush_journal_list(struct super_block *s,
1409 1430
1410 /* if flushall == 0, the lock is already held */ 1431 /* if flushall == 0, the lock is already held */
1411 if (flushall) { 1432 if (flushall) {
1412 mutex_lock(&journal->j_flush_mutex); 1433 reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1413 } else if (mutex_trylock(&journal->j_flush_mutex)) { 1434 } else if (mutex_trylock(&journal->j_flush_mutex)) {
1414 BUG(); 1435 BUG();
1415 } 1436 }
@@ -1553,7 +1574,11 @@ static int flush_journal_list(struct super_block *s,
1553 reiserfs_panic(s, "journal-1011", 1574 reiserfs_panic(s, "journal-1011",
1554 "cn->bh is NULL"); 1575 "cn->bh is NULL");
1555 } 1576 }
1577
1578 reiserfs_write_unlock(s);
1556 wait_on_buffer(cn->bh); 1579 wait_on_buffer(cn->bh);
1580 reiserfs_write_lock(s);
1581
1557 if (!cn->bh) { 1582 if (!cn->bh) {
1558 reiserfs_panic(s, "journal-1012", 1583 reiserfs_panic(s, "journal-1012",
1559 "cn->bh is NULL"); 1584 "cn->bh is NULL");
@@ -1769,7 +1794,7 @@ static int kupdate_transactions(struct super_block *s,
1769 struct reiserfs_journal *journal = SB_JOURNAL(s); 1794 struct reiserfs_journal *journal = SB_JOURNAL(s);
1770 chunk.nr = 0; 1795 chunk.nr = 0;
1771 1796
1772 mutex_lock(&journal->j_flush_mutex); 1797 reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1773 if (!journal_list_still_alive(s, orig_trans_id)) { 1798 if (!journal_list_still_alive(s, orig_trans_id)) {
1774 goto done; 1799 goto done;
1775 } 1800 }
@@ -1973,7 +1998,14 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1973 reiserfs_mounted_fs_count--; 1998 reiserfs_mounted_fs_count--;
1974 /* wait for all commits to finish */ 1999 /* wait for all commits to finish */
1975 cancel_delayed_work(&SB_JOURNAL(sb)->j_work); 2000 cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
2001
2002 /*
2003 * We must release the write lock here because
2004 * the workqueue job (flush_async_commits) needs this lock
2005 */
2006 reiserfs_write_unlock(sb);
1976 flush_workqueue(commit_wq); 2007 flush_workqueue(commit_wq);
2008
1977 if (!reiserfs_mounted_fs_count) { 2009 if (!reiserfs_mounted_fs_count) {
1978 destroy_workqueue(commit_wq); 2010 destroy_workqueue(commit_wq);
1979 commit_wq = NULL; 2011 commit_wq = NULL;
@@ -1981,6 +2013,8 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1981 2013
1982 free_journal_ram(sb); 2014 free_journal_ram(sb);
1983 2015
2016 reiserfs_write_lock(sb);
2017
1984 return 0; 2018 return 0;
1985} 2019}
1986 2020
@@ -2184,6 +2218,15 @@ static int journal_read_transaction(struct super_block *sb,
2184 brelse(d_bh); 2218 brelse(d_bh);
2185 return 1; 2219 return 1;
2186 } 2220 }
2221
2222 if (bdev_read_only(sb->s_bdev)) {
2223 reiserfs_warning(sb, "clm-2076",
2224 "device is readonly, unable to replay log");
2225 brelse(c_bh);
2226 brelse(d_bh);
2227 return -EROFS;
2228 }
2229
2187 trans_id = get_desc_trans_id(desc); 2230 trans_id = get_desc_trans_id(desc);
2188 /* now we know we've got a good transaction, and it was inside the valid time ranges */ 2231 /* now we know we've got a good transaction, and it was inside the valid time ranges */
2189 log_blocks = kmalloc(get_desc_trans_len(desc) * 2232 log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2243,7 +2286,11 @@ static int journal_read_transaction(struct super_block *sb,
2243 /* read in the log blocks, memcpy to the corresponding real block */ 2286 /* read in the log blocks, memcpy to the corresponding real block */
2244 ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); 2287 ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
2245 for (i = 0; i < get_desc_trans_len(desc); i++) { 2288 for (i = 0; i < get_desc_trans_len(desc); i++) {
2289
2290 reiserfs_write_unlock(sb);
2246 wait_on_buffer(log_blocks[i]); 2291 wait_on_buffer(log_blocks[i]);
2292 reiserfs_write_lock(sb);
2293
2247 if (!buffer_uptodate(log_blocks[i])) { 2294 if (!buffer_uptodate(log_blocks[i])) {
2248 reiserfs_warning(sb, "journal-1212", 2295 reiserfs_warning(sb, "journal-1212",
2249 "REPLAY FAILURE fsck required! " 2296 "REPLAY FAILURE fsck required! "
@@ -2422,12 +2469,6 @@ static int journal_read(struct super_block *sb)
2422 goto start_log_replay; 2469 goto start_log_replay;
2423 } 2470 }
2424 2471
2425 if (continue_replay && bdev_read_only(sb->s_bdev)) {
2426 reiserfs_warning(sb, "clm-2076",
2427 "device is readonly, unable to replay log");
2428 return -1;
2429 }
2430
2431 /* ok, there are transactions that need to be replayed. start with the first log block, find 2472 /* ok, there are transactions that need to be replayed. start with the first log block, find
2432 ** all the valid transactions, and pick out the oldest. 2473 ** all the valid transactions, and pick out the oldest.
2433 */ 2474 */
@@ -2722,11 +2763,18 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2722 struct reiserfs_journal *journal; 2763 struct reiserfs_journal *journal;
2723 struct reiserfs_journal_list *jl; 2764 struct reiserfs_journal_list *jl;
2724 char b[BDEVNAME_SIZE]; 2765 char b[BDEVNAME_SIZE];
2766 int ret;
2725 2767
2768 /*
2769 * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS
2770 * dependency inversion warnings.
2771 */
2772 reiserfs_write_unlock(sb);
2726 journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); 2773 journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal));
2727 if (!journal) { 2774 if (!journal) {
2728 reiserfs_warning(sb, "journal-1256", 2775 reiserfs_warning(sb, "journal-1256",
2729 "unable to get memory for journal structure"); 2776 "unable to get memory for journal structure");
2777 reiserfs_write_lock(sb);
2730 return 1; 2778 return 1;
2731 } 2779 }
2732 memset(journal, 0, sizeof(struct reiserfs_journal)); 2780 memset(journal, 0, sizeof(struct reiserfs_journal));
@@ -2735,10 +2783,12 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2735 INIT_LIST_HEAD(&journal->j_working_list); 2783 INIT_LIST_HEAD(&journal->j_working_list);
2736 INIT_LIST_HEAD(&journal->j_journal_list); 2784 INIT_LIST_HEAD(&journal->j_journal_list);
2737 journal->j_persistent_trans = 0; 2785 journal->j_persistent_trans = 0;
2738 if (reiserfs_allocate_list_bitmaps(sb, 2786 ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
2739 journal->j_list_bitmap, 2787 reiserfs_bmap_count(sb));
2740 reiserfs_bmap_count(sb))) 2788 reiserfs_write_lock(sb);
2789 if (ret)
2741 goto free_and_return; 2790 goto free_and_return;
2791
2742 allocate_bitmap_nodes(sb); 2792 allocate_bitmap_nodes(sb);
2743 2793
2744 /* reserved for journal area support */ 2794 /* reserved for journal area support */
@@ -2765,11 +2815,27 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2765 goto free_and_return; 2815 goto free_and_return;
2766 } 2816 }
2767 2817
2818 /*
2819 * We need to unlock here to avoid creating the following
2820 * dependency:
2821 * reiserfs_lock -> sysfs_mutex
2822 * Because the reiserfs mmap path creates the following dependency:
2823 * mm->mmap -> reiserfs_lock, hence we have
2824 * mm->mmap -> reiserfs_lock -> sysfs_mutex
2825 * This would end up in a circular dependency with the sysfs readdir
2826 * path, which does sysfs_mutex -> mm->mmap_sem.
2827 * This is fine because the reiserfs lock is useless in the mount path,
2828 * at least until we call journal_begin. We keep it for paranoid
2829 * reasons.
2830 */
2831 reiserfs_write_unlock(sb);
2768 if (journal_init_dev(sb, journal, j_dev_name) != 0) { 2832 if (journal_init_dev(sb, journal, j_dev_name) != 0) {
2833 reiserfs_write_lock(sb);
2769 reiserfs_warning(sb, "sh-462", 2834 reiserfs_warning(sb, "sh-462",
2770 "unable to initialize jornal device"); 2835 "unable to initialize jornal device");
2771 goto free_and_return; 2836 goto free_and_return;
2772 } 2837 }
2838 reiserfs_write_lock(sb);
2773 2839
2774 rs = SB_DISK_SUPER_BLOCK(sb); 2840 rs = SB_DISK_SUPER_BLOCK(sb);
2775 2841
@@ -2851,7 +2917,9 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2851 journal->j_mount_id = 10; 2917 journal->j_mount_id = 10;
2852 journal->j_state = 0; 2918 journal->j_state = 0;
2853 atomic_set(&(journal->j_jlock), 0); 2919 atomic_set(&(journal->j_jlock), 0);
2920 reiserfs_write_unlock(sb);
2854 journal->j_cnode_free_list = allocate_cnodes(num_cnodes); 2921 journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
2922 reiserfs_write_lock(sb);
2855 journal->j_cnode_free_orig = journal->j_cnode_free_list; 2923 journal->j_cnode_free_orig = journal->j_cnode_free_list;
2856 journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; 2924 journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
2857 journal->j_cnode_used = 0; 2925 journal->j_cnode_used = 0;
@@ -2881,8 +2949,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2881 } 2949 }
2882 2950
2883 reiserfs_mounted_fs_count++; 2951 reiserfs_mounted_fs_count++;
2884 if (reiserfs_mounted_fs_count <= 1) 2952 if (reiserfs_mounted_fs_count <= 1) {
2953 reiserfs_write_unlock(sb);
2885 commit_wq = create_workqueue("reiserfs"); 2954 commit_wq = create_workqueue("reiserfs");
2955 reiserfs_write_lock(sb);
2956 }
2886 2957
2887 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); 2958 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
2888 journal->j_work_sb = sb; 2959 journal->j_work_sb = sb;
@@ -2964,8 +3035,11 @@ static void queue_log_writer(struct super_block *s)
2964 init_waitqueue_entry(&wait, current); 3035 init_waitqueue_entry(&wait, current);
2965 add_wait_queue(&journal->j_join_wait, &wait); 3036 add_wait_queue(&journal->j_join_wait, &wait);
2966 set_current_state(TASK_UNINTERRUPTIBLE); 3037 set_current_state(TASK_UNINTERRUPTIBLE);
2967 if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) 3038 if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
3039 reiserfs_write_unlock(s);
2968 schedule(); 3040 schedule();
3041 reiserfs_write_lock(s);
3042 }
2969 __set_current_state(TASK_RUNNING); 3043 __set_current_state(TASK_RUNNING);
2970 remove_wait_queue(&journal->j_join_wait, &wait); 3044 remove_wait_queue(&journal->j_join_wait, &wait);
2971} 3045}
@@ -2982,7 +3056,9 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
2982 struct reiserfs_journal *journal = SB_JOURNAL(sb); 3056 struct reiserfs_journal *journal = SB_JOURNAL(sb);
2983 unsigned long bcount = journal->j_bcount; 3057 unsigned long bcount = journal->j_bcount;
2984 while (1) { 3058 while (1) {
3059 reiserfs_write_unlock(sb);
2985 schedule_timeout_uninterruptible(1); 3060 schedule_timeout_uninterruptible(1);
3061 reiserfs_write_lock(sb);
2986 journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; 3062 journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
2987 while ((atomic_read(&journal->j_wcount) > 0 || 3063 while ((atomic_read(&journal->j_wcount) > 0 ||
2988 atomic_read(&journal->j_jlock)) && 3064 atomic_read(&journal->j_jlock)) &&
@@ -3033,7 +3109,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3033 3109
3034 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { 3110 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
3035 unlock_journal(sb); 3111 unlock_journal(sb);
3112 reiserfs_write_unlock(sb);
3036 reiserfs_wait_on_write_block(sb); 3113 reiserfs_wait_on_write_block(sb);
3114 reiserfs_write_lock(sb);
3037 PROC_INFO_INC(sb, journal.journal_relock_writers); 3115 PROC_INFO_INC(sb, journal.journal_relock_writers);
3038 goto relock; 3116 goto relock;
3039 } 3117 }
@@ -3506,14 +3584,14 @@ static void flush_async_commits(struct work_struct *work)
3506 struct reiserfs_journal_list *jl; 3584 struct reiserfs_journal_list *jl;
3507 struct list_head *entry; 3585 struct list_head *entry;
3508 3586
3509 lock_kernel(); 3587 reiserfs_write_lock(sb);
3510 if (!list_empty(&journal->j_journal_list)) { 3588 if (!list_empty(&journal->j_journal_list)) {
3511 /* last entry is the youngest, commit it and you get everything */ 3589 /* last entry is the youngest, commit it and you get everything */
3512 entry = journal->j_journal_list.prev; 3590 entry = journal->j_journal_list.prev;
3513 jl = JOURNAL_LIST_ENTRY(entry); 3591 jl = JOURNAL_LIST_ENTRY(entry);
3514 flush_commit_list(sb, jl, 1); 3592 flush_commit_list(sb, jl, 1);
3515 } 3593 }
3516 unlock_kernel(); 3594 reiserfs_write_unlock(sb);
3517} 3595}
3518 3596
3519/* 3597/*
@@ -4041,7 +4119,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4041 * the new transaction is fully setup, and we've already flushed the 4119 * the new transaction is fully setup, and we've already flushed the
4042 * ordered bh list 4120 * ordered bh list
4043 */ 4121 */
4044 mutex_lock(&jl->j_commit_mutex); 4122 reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
4045 4123
4046 /* save the transaction id in case we need to commit it later */ 4124 /* save the transaction id in case we need to commit it later */
4047 commit_trans_id = jl->j_trans_id; 4125 commit_trans_id = jl->j_trans_id;
@@ -4156,7 +4234,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4156 next = cn->next; 4234 next = cn->next;
4157 free_cnode(sb, cn); 4235 free_cnode(sb, cn);
4158 cn = next; 4236 cn = next;
4237 reiserfs_write_unlock(sb);
4159 cond_resched(); 4238 cond_resched();
4239 reiserfs_write_lock(sb);
4160 } 4240 }
4161 4241
4162 /* we are done with both the c_bh and d_bh, but 4242 /* we are done with both the c_bh and d_bh, but
@@ -4203,10 +4283,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4203 * is lost. 4283 * is lost.
4204 */ 4284 */
4205 if (!list_empty(&jl->j_tail_bh_list)) { 4285 if (!list_empty(&jl->j_tail_bh_list)) {
4206 unlock_kernel(); 4286 reiserfs_write_unlock(sb);
4207 write_ordered_buffers(&journal->j_dirty_buffers_lock, 4287 write_ordered_buffers(&journal->j_dirty_buffers_lock,
4208 journal, jl, &jl->j_tail_bh_list); 4288 journal, jl, &jl->j_tail_bh_list);
4209 lock_kernel(); 4289 reiserfs_write_lock(sb);
4210 } 4290 }
4211 BUG_ON(!list_empty(&jl->j_tail_bh_list)); 4291 BUG_ON(!list_empty(&jl->j_tail_bh_list));
4212 mutex_unlock(&jl->j_commit_mutex); 4292 mutex_unlock(&jl->j_commit_mutex);
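
Nearly every journal.c hunk above applies one mechanical recipe: release the write lock before anything that may sleep on I/O or the scheduler (wait_on_buffer(), sync_dirty_buffer(), ll_rw_block() on a dirty buffer, schedule(), congestion_wait()), then retake it, and acquire nested mutexes through reiserfs_mutex_lock_safe() so the write lock is not held while blocking on them. A minimal sketch of the buffer-wait case, assuming s is the mounted superblock and bh a buffer already submitted for write:

    if (buffer_dirty(bh)) {
        /* sync_dirty_buffer() sleeps; don't hold the write lock across it */
        reiserfs_write_unlock(s);
        sync_dirty_buffer(bh);
        reiserfs_write_lock(s);
    }
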
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
new file mode 100644
index 000000000000..b87aa2c1afc1
--- /dev/null
+++ b/fs/reiserfs/lock.c
@@ -0,0 +1,97 @@
1#include <linux/reiserfs_fs.h>
2#include <linux/mutex.h>
3
4/*
5 * The previous reiserfs locking scheme was heavily based on
6 * the tricky properties of the Bkl:
7 *
8 * - it was acquired recursively by the same task
9 * - performance relied on the release-while-schedule() property
10 *
11 * Now that we replace it with a mutex, we still want to keep the same
12 * recursive property to avoid big changes in the code structure.
13 * We use our own lock_owner here because the owner field on a mutex
14 * is only available in SMP or mutex debugging; also, we only need this
15 * field for this mutex, with no need for a system-wide mutex facility.
16 *
17 * Also, this lock is often released before a call that could block because
18 * reiserfs performance was partially based on the release-while-schedule()
19 * property of the Bkl.
20 */
21void reiserfs_write_lock(struct super_block *s)
22{
23 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
24
25 if (sb_i->lock_owner != current) {
26 mutex_lock(&sb_i->lock);
27 sb_i->lock_owner = current;
28 }
29
30 /* No need to protect it, only the current task touches it */
31 sb_i->lock_depth++;
32}
33
34void reiserfs_write_unlock(struct super_block *s)
35{
36 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
37
38 /*
39 * Are we unlocking without even holding the lock?
40 * Such a situation must raise a BUG() if we don't want
41 * to corrupt the data.
42 */
43 BUG_ON(sb_i->lock_owner != current);
44
45 if (--sb_i->lock_depth == -1) {
46 sb_i->lock_owner = NULL;
47 mutex_unlock(&sb_i->lock);
48 }
49}
50
51/*
52 * If we already own the lock, just exit and don't increase the depth.
53 * Useful when we don't want to lock more than once.
54 *
55 * We always return the lock_depth we had before calling
56 * this function.
57 */
58int reiserfs_write_lock_once(struct super_block *s)
59{
60 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
61
62 if (sb_i->lock_owner != current) {
63 mutex_lock(&sb_i->lock);
64 sb_i->lock_owner = current;
65 return sb_i->lock_depth++;
66 }
67
68 return sb_i->lock_depth;
69}
70
71void reiserfs_write_unlock_once(struct super_block *s, int lock_depth)
72{
73 if (lock_depth == -1)
74 reiserfs_write_unlock(s);
75}
76
77/*
78 * Utility function to force a BUG if it is called without the superblock
79 * write lock held. caller is the string printed just before calling BUG()
80 */
81void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
82{
83 struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
84
85 if (sb_i->lock_depth < 0)
86 reiserfs_panic(sb, "%s called without write lock held",
87 caller);
88}
89
90#ifdef CONFIG_REISERFS_CHECK
91void reiserfs_lock_check_recursive(struct super_block *sb)
92{
93 struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
94
95 WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n");
96}
97#endif
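
The _once variants above exist for paths that can be entered either with or without the lock already held (lookup, setattr, dirty_inode): they record the depth at entry and only drop the mutex when the caller was the outermost acquirer. A sketch of the calling convention, following the definitions above:

    int depth;

    /* no-op recursion bump if this task already holds the lock */
    depth = reiserfs_write_lock_once(sb);

    /* ... work that must not take the lock a second time ... */

    /* releases the mutex only if we were the outermost acquirer */
    reiserfs_write_unlock_once(sb, depth);
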
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 271579128634..d0c43cb99ffc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/bitops.h> 15#include <linux/bitops.h>
16#include <linux/slab.h>
16#include <linux/reiserfs_fs.h> 17#include <linux/reiserfs_fs.h>
17#include <linux/reiserfs_acl.h> 18#include <linux/reiserfs_acl.h>
18#include <linux/reiserfs_xattr.h> 19#include <linux/reiserfs_xattr.h>
@@ -324,6 +325,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
324 struct nameidata *nd) 325 struct nameidata *nd)
325{ 326{
326 int retval; 327 int retval;
328 int lock_depth;
327 struct inode *inode = NULL; 329 struct inode *inode = NULL;
328 struct reiserfs_dir_entry de; 330 struct reiserfs_dir_entry de;
329 INITIALIZE_PATH(path_to_entry); 331 INITIALIZE_PATH(path_to_entry);
@@ -331,7 +333,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
331 if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) 333 if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
332 return ERR_PTR(-ENAMETOOLONG); 334 return ERR_PTR(-ENAMETOOLONG);
333 335
334 reiserfs_write_lock(dir->i_sb); 336 /*
337 * This may be called with or without the write lock held; be careful
338 * not to take it recursively, since we may want to release the lock
339 * before rescheduling.
340 */
341 lock_depth = reiserfs_write_lock_once(dir->i_sb);
342
335 de.de_gen_number_bit_string = NULL; 343 de.de_gen_number_bit_string = NULL;
336 retval = 344 retval =
337 reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, 345 reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
@@ -341,7 +349,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
341 inode = reiserfs_iget(dir->i_sb, 349 inode = reiserfs_iget(dir->i_sb,
342 (struct cpu_key *)&(de.de_dir_id)); 350 (struct cpu_key *)&(de.de_dir_id));
343 if (!inode || IS_ERR(inode)) { 351 if (!inode || IS_ERR(inode)) {
344 reiserfs_write_unlock(dir->i_sb); 352 reiserfs_write_unlock_once(dir->i_sb, lock_depth);
345 return ERR_PTR(-EACCES); 353 return ERR_PTR(-EACCES);
346 } 354 }
347 355
@@ -350,7 +358,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
350 if (IS_PRIVATE(dir)) 358 if (IS_PRIVATE(dir))
351 inode->i_flags |= S_PRIVATE; 359 inode->i_flags |= S_PRIVATE;
352 } 360 }
353 reiserfs_write_unlock(dir->i_sb); 361 reiserfs_write_unlock_once(dir->i_sb, lock_depth);
354 if (retval == IO_ERROR) { 362 if (retval == IO_ERROR) {
355 return ERR_PTR(-EIO); 363 return ERR_PTR(-EIO);
356 } 364 }
@@ -539,7 +547,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
539*/ 547*/
540static int drop_new_inode(struct inode *inode) 548static int drop_new_inode(struct inode *inode)
541{ 549{
542 vfs_dq_drop(inode); 550 dquot_drop(inode);
543 make_bad_inode(inode); 551 make_bad_inode(inode);
544 inode->i_flags |= S_NOQUOTA; 552 inode->i_flags |= S_NOQUOTA;
545 iput(inode); 553 iput(inode);
@@ -547,7 +555,7 @@ static int drop_new_inode(struct inode *inode)
547} 555}
548 556
549/* utility function that does setup for reiserfs_new_inode. 557/* utility function that does setup for reiserfs_new_inode.
550** vfs_dq_init needs lots of credits so it's better to have it 558** dquot_initialize needs lots of credits so it's better to have it
551** outside of a transaction, so we had to pull some bits of 559** outside of a transaction, so we had to pull some bits of
552** reiserfs_new_inode out into this func. 560** reiserfs_new_inode out into this func.
553*/ 561*/
@@ -570,7 +578,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
570 } else { 578 } else {
571 inode->i_gid = current_fsgid(); 579 inode->i_gid = current_fsgid();
572 } 580 }
573 vfs_dq_init(inode); 581 dquot_initialize(inode);
574 return 0; 582 return 0;
575} 583}
576 584
@@ -587,6 +595,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
587 struct reiserfs_transaction_handle th; 595 struct reiserfs_transaction_handle th;
588 struct reiserfs_security_handle security; 596 struct reiserfs_security_handle security;
589 597
598 dquot_initialize(dir);
599
590 if (!(inode = new_inode(dir->i_sb))) { 600 if (!(inode = new_inode(dir->i_sb))) {
591 return -ENOMEM; 601 return -ENOMEM;
592 } 602 }
@@ -659,6 +669,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
659 if (!new_valid_dev(rdev)) 669 if (!new_valid_dev(rdev))
660 return -EINVAL; 670 return -EINVAL;
661 671
672 dquot_initialize(dir);
673
662 if (!(inode = new_inode(dir->i_sb))) { 674 if (!(inode = new_inode(dir->i_sb))) {
663 return -ENOMEM; 675 return -ENOMEM;
664 } 676 }
@@ -725,12 +737,15 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
725 struct inode *inode; 737 struct inode *inode;
726 struct reiserfs_transaction_handle th; 738 struct reiserfs_transaction_handle th;
727 struct reiserfs_security_handle security; 739 struct reiserfs_security_handle security;
740 int lock_depth;
728 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ 741 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
729 int jbegin_count = 742 int jbegin_count =
730 JOURNAL_PER_BALANCE_CNT * 3 + 743 JOURNAL_PER_BALANCE_CNT * 3 +
731 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + 744 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
732 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); 745 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
733 746
747 dquot_initialize(dir);
748
734#ifdef DISPLACE_NEW_PACKING_LOCALITIES 749#ifdef DISPLACE_NEW_PACKING_LOCALITIES
735 /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ 750 /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */
736 REISERFS_I(dir)->new_packing_locality = 1; 751 REISERFS_I(dir)->new_packing_locality = 1;
@@ -748,7 +763,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
748 return retval; 763 return retval;
749 } 764 }
750 jbegin_count += retval; 765 jbegin_count += retval;
751 reiserfs_write_lock(dir->i_sb); 766 lock_depth = reiserfs_write_lock_once(dir->i_sb);
752 767
753 retval = journal_begin(&th, dir->i_sb, jbegin_count); 768 retval = journal_begin(&th, dir->i_sb, jbegin_count);
754 if (retval) { 769 if (retval) {
@@ -798,8 +813,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
798 d_instantiate(dentry, inode); 813 d_instantiate(dentry, inode);
799 unlock_new_inode(inode); 814 unlock_new_inode(inode);
800 retval = journal_end(&th, dir->i_sb, jbegin_count); 815 retval = journal_end(&th, dir->i_sb, jbegin_count);
801 out_failed: 816out_failed:
802 reiserfs_write_unlock(dir->i_sb); 817 reiserfs_write_unlock_once(dir->i_sb, lock_depth);
803 return retval; 818 return retval;
804} 819}
805 820
@@ -834,6 +849,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
834 JOURNAL_PER_BALANCE_CNT * 2 + 2 + 849 JOURNAL_PER_BALANCE_CNT * 2 + 2 +
835 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 850 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
836 851
852 dquot_initialize(dir);
853
837 reiserfs_write_lock(dir->i_sb); 854 reiserfs_write_lock(dir->i_sb);
838 retval = journal_begin(&th, dir->i_sb, jbegin_count); 855 retval = journal_begin(&th, dir->i_sb, jbegin_count);
839 if (retval) 856 if (retval)
@@ -913,6 +930,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
913 struct reiserfs_transaction_handle th; 930 struct reiserfs_transaction_handle th;
914 int jbegin_count; 931 int jbegin_count;
915 unsigned long savelink; 932 unsigned long savelink;
933 int depth;
934
935 dquot_initialize(dir);
916 936
917 inode = dentry->d_inode; 937 inode = dentry->d_inode;
918 938
@@ -924,7 +944,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
924 JOURNAL_PER_BALANCE_CNT * 2 + 2 + 944 JOURNAL_PER_BALANCE_CNT * 2 + 2 +
925 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 945 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
926 946
927 reiserfs_write_lock(dir->i_sb); 947 depth = reiserfs_write_lock_once(dir->i_sb);
928 retval = journal_begin(&th, dir->i_sb, jbegin_count); 948 retval = journal_begin(&th, dir->i_sb, jbegin_count);
929 if (retval) 949 if (retval)
930 goto out_unlink; 950 goto out_unlink;
@@ -985,7 +1005,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
985 1005
986 retval = journal_end(&th, dir->i_sb, jbegin_count); 1006 retval = journal_end(&th, dir->i_sb, jbegin_count);
987 reiserfs_check_path(&path); 1007 reiserfs_check_path(&path);
988 reiserfs_write_unlock(dir->i_sb); 1008 reiserfs_write_unlock_once(dir->i_sb, depth);
989 return retval; 1009 return retval;
990 1010
991 end_unlink: 1011 end_unlink:
@@ -995,7 +1015,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
995 if (err) 1015 if (err)
996 retval = err; 1016 retval = err;
997 out_unlink: 1017 out_unlink:
998 reiserfs_write_unlock(dir->i_sb); 1018 reiserfs_write_unlock_once(dir->i_sb, depth);
999 return retval; 1019 return retval;
1000} 1020}
1001 1021
@@ -1015,6 +1035,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
1015 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + 1035 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
1016 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); 1036 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
1017 1037
1038 dquot_initialize(parent_dir);
1039
1018 if (!(inode = new_inode(parent_dir->i_sb))) { 1040 if (!(inode = new_inode(parent_dir->i_sb))) {
1019 return -ENOMEM; 1041 return -ENOMEM;
1020 } 1042 }
@@ -1102,6 +1124,8 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1102 JOURNAL_PER_BALANCE_CNT * 3 + 1124 JOURNAL_PER_BALANCE_CNT * 3 +
1103 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 1125 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
1104 1126
1127 dquot_initialize(dir);
1128
1105 reiserfs_write_lock(dir->i_sb); 1129 reiserfs_write_lock(dir->i_sb);
1106 if (inode->i_nlink >= REISERFS_LINK_MAX) { 1130 if (inode->i_nlink >= REISERFS_LINK_MAX) {
1107 //FIXME: sd_nlink is 32 bit for new files 1131 //FIXME: sd_nlink is 32 bit for new files
@@ -1226,6 +1250,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1226 JOURNAL_PER_BALANCE_CNT * 3 + 5 + 1250 JOURNAL_PER_BALANCE_CNT * 3 + 5 +
1227 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); 1251 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
1228 1252
1253 dquot_initialize(old_dir);
1254 dquot_initialize(new_dir);
1255
1229 old_inode = old_dentry->d_inode; 1256 old_inode = old_dentry->d_inode;
1230 new_dentry_inode = new_dentry->d_inode; 1257 new_dentry_inode = new_dentry->d_inode;
1231 1258
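
The namei.c hunks replace the old vfs_dq_* hooks with direct dquot_*() calls and hoist dquot_initialize() to the top of each directory-modifying entry point (create, mknod, mkdir, rmdir, unlink, symlink, link, rename), before journal_begin(), since initializing dquots needs journal credits of its own and must not run inside the live transaction (see the comment kept in new_inode_init() above). A sketch of the resulting shape; the operation body and the jbegin_count value are placeholders:

    static int example_dir_op(struct inode *dir, struct dentry *dentry)
    {
        struct reiserfs_transaction_handle th;
        int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; /* placeholder */
        int retval;

        dquot_initialize(dir);          /* before the transaction begins */

        reiserfs_write_lock(dir->i_sb);
        retval = journal_begin(&th, dir->i_sb, jbegin_count);
        if (retval)
            goto out;

        /* ... journaled directory modification ... */

        retval = journal_end(&th, dir->i_sb, jbegin_count);
    out:
        reiserfs_write_unlock(dir->i_sb);
        return retval;
    }
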
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 536eacaeb710..adbc6f538515 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -349,10 +349,6 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
349 349
350 . */ 350 . */
351 351
352#ifdef CONFIG_REISERFS_CHECK
353extern struct tree_balance *cur_tb;
354#endif
355
356void __reiserfs_panic(struct super_block *sb, const char *id, 352void __reiserfs_panic(struct super_block *sb, const char *id,
357 const char *function, const char *fmt, ...) 353 const char *function, const char *fmt, ...)
358{ 354{
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 9229e5514a4e..7a9981196c1c 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -17,8 +17,6 @@
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19 19
20#ifdef CONFIG_REISERFS_PROC_INFO
21
22/* 20/*
23 * LOCKING: 21 * LOCKING:
24 * 22 *
@@ -48,14 +46,6 @@ static int show_version(struct seq_file *m, struct super_block *sb)
48 return 0; 46 return 0;
49} 47}
50 48
51int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset,
52 int count, int *eof, void *data)
53{
54 *start = buffer;
55 *eof = 1;
56 return 0;
57}
58
59#define SF( x ) ( r -> x ) 49#define SF( x ) ( r -> x )
60#define SFP( x ) SF( s_proc_info_data.x ) 50#define SFP( x ) SF( s_proc_info_data.x )
61#define SFPL( x ) SFP( x[ level ] ) 51#define SFPL( x ) SFP( x[ level ] )
@@ -538,19 +528,6 @@ int reiserfs_proc_info_done(struct super_block *sb)
538 return 0; 528 return 0;
539} 529}
540 530
541struct proc_dir_entry *reiserfs_proc_register_global(char *name,
542 read_proc_t * func)
543{
544 return (proc_info_root) ? create_proc_read_entry(name, 0,
545 proc_info_root,
546 func, NULL) : NULL;
547}
548
549void reiserfs_proc_unregister_global(const char *name)
550{
551 remove_proc_entry(name, proc_info_root);
552}
553
554int reiserfs_proc_info_global_init(void) 531int reiserfs_proc_info_global_init(void)
555{ 532{
556 if (proc_info_root == NULL) { 533 if (proc_info_root == NULL) {
@@ -572,48 +549,6 @@ int reiserfs_proc_info_global_done(void)
572 } 549 }
573 return 0; 550 return 0;
574} 551}
575
576/* REISERFS_PROC_INFO */
577#else
578
579int reiserfs_proc_info_init(struct super_block *sb)
580{
581 return 0;
582}
583int reiserfs_proc_info_done(struct super_block *sb)
584{
585 return 0;
586}
587
588struct proc_dir_entry *reiserfs_proc_register_global(char *name,
589 read_proc_t * func)
590{
591 return NULL;
592}
593
594void reiserfs_proc_unregister_global(const char *name)
595{;
596}
597
598int reiserfs_proc_info_global_init(void)
599{
600 return 0;
601}
602int reiserfs_proc_info_global_done(void)
603{
604 return 0;
605}
606
607int reiserfs_global_version_in_proc(char *buffer, char **start,
608 off_t offset,
609 int count, int *eof, void *data)
610{
611 return 0;
612}
613
614/* REISERFS_PROC_INFO */
615#endif
616
617/* 552/*
618 * Revision 1.1.8.2 2001/07/15 17:08:42 god 553 * Revision 1.1.8.2 2001/07/15 17:08:42 god
619 * . use get_super() in procfs.c 554 * . use get_super() in procfs.c
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 18b315d3d104..b3a94d20f0fc 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -141,7 +141,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
141 141
142 set_buffer_uptodate(bh); 142 set_buffer_uptodate(bh);
143 mark_buffer_dirty(bh); 143 mark_buffer_dirty(bh);
144 reiserfs_write_unlock(s);
144 sync_dirty_buffer(bh); 145 sync_dirty_buffer(bh);
146 reiserfs_write_lock(s);
145 // update bitmap_info stuff 147 // update bitmap_info stuff
146 bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; 148 bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
147 brelse(bh); 149 brelse(bh);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index d036ee5b1c81..313d39d639eb 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -222,9 +222,6 @@ static inline int bin_search(const void *key, /* Key to search for. */
222 return ITEM_NOT_FOUND; 222 return ITEM_NOT_FOUND;
223} 223}
224 224
225#ifdef CONFIG_REISERFS_CHECK
226extern struct tree_balance *cur_tb;
227#endif
228 225
229/* Minimal possible key. It is never in the tree. */ 226/* Minimal possible key. It is never in the tree. */
230const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; 227const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
@@ -519,25 +516,48 @@ static int is_tree_node(struct buffer_head *bh, int level)
519 516
520#define SEARCH_BY_KEY_READA 16 517#define SEARCH_BY_KEY_READA 16
521 518
522/* The function is NOT SCHEDULE-SAFE! */ 519/*
523static void search_by_key_reada(struct super_block *s, 520 * The function is NOT SCHEDULE-SAFE!
521 * It might unlock the write lock if we needed to wait for a block
522 * to be read. Note that in this case it won't re-acquire the lock, to
523 * avoid high contention resulting from too many lock requests; moreover,
524 * the caller (search_by_key) will perform other schedule-unsafe
525 * operations just after calling this function.
526 *
527 * @return true if the write lock was released
528 */
529static bool search_by_key_reada(struct super_block *s,
524 struct buffer_head **bh, 530 struct buffer_head **bh,
525 b_blocknr_t *b, int num) 531 b_blocknr_t *b, int num)
526{ 532{
527 int i, j; 533 int i, j;
534 bool unlocked = false;
528 535
529 for (i = 0; i < num; i++) { 536 for (i = 0; i < num; i++) {
530 bh[i] = sb_getblk(s, b[i]); 537 bh[i] = sb_getblk(s, b[i]);
531 } 538 }
539 /*
540 * We are going to read some blocks on which we
541 * hold a reference. It's safe: even though the blocks
542 * may be changed concurrently once we release the
543 * lock, that's still fine because we check later
544 * whether the tree changed.
545 */
532 for (j = 0; j < i; j++) { 546 for (j = 0; j < i; j++) {
533 /* 547 /*
534 * note, this needs attention if we are getting rid of the BKL 548 * note, this needs attention if we are getting rid of the BKL
535 * you have to make sure the prepared bit isn't set on this buffer 549 * you have to make sure the prepared bit isn't set on this buffer
536 */ 550 */
537 if (!buffer_uptodate(bh[j])) 551 if (!buffer_uptodate(bh[j])) {
552 if (!unlocked) {
553 reiserfs_write_unlock(s);
554 unlocked = true;
555 }
538 ll_rw_block(READA, 1, bh + j); 556 ll_rw_block(READA, 1, bh + j);
557 }
539 brelse(bh[j]); 558 brelse(bh[j]);
540 } 559 }
560 return unlocked;
541} 561}
542 562
543/************************************************************************** 563/**************************************************************************
@@ -625,11 +645,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
625 have a pointer to it. */ 645 have a pointer to it. */
626 if ((bh = last_element->pe_buffer = 646 if ((bh = last_element->pe_buffer =
627 sb_getblk(sb, block_number))) { 647 sb_getblk(sb, block_number))) {
648 bool unlocked = false;
649
628 if (!buffer_uptodate(bh) && reada_count > 1) 650 if (!buffer_uptodate(bh) && reada_count > 1)
629 search_by_key_reada(sb, reada_bh, 651 /* may unlock the write lock */
652 unlocked = search_by_key_reada(sb, reada_bh,
630 reada_blocks, reada_count); 653 reada_blocks, reada_count);
654 /*
655 * If we haven't already unlocked the write lock,
656 * then we need to do that here before reading
657 * the current block
658 */
659 if (!buffer_uptodate(bh) && !unlocked) {
660 reiserfs_write_unlock(sb);
661 unlocked = true;
662 }
631 ll_rw_block(READ, 1, &bh); 663 ll_rw_block(READ, 1, &bh);
632 wait_on_buffer(bh); 664 wait_on_buffer(bh);
665
666 if (unlocked)
667 reiserfs_write_lock(sb);
633 if (!buffer_uptodate(bh)) 668 if (!buffer_uptodate(bh))
634 goto io_error; 669 goto io_error;
635 } else { 670 } else {
@@ -673,7 +708,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
673 !key_in_buffer(search_path, key, sb), 708 !key_in_buffer(search_path, key, sb),
674 "PAP-5130: key is not in the buffer"); 709 "PAP-5130: key is not in the buffer");
675#ifdef CONFIG_REISERFS_CHECK 710#ifdef CONFIG_REISERFS_CHECK
676 if (cur_tb) { 711 if (REISERFS_SB(sb)->cur_tb) {
677 print_cur_tb("5140"); 712 print_cur_tb("5140");
678 reiserfs_panic(sb, "PAP-5140", 713 reiserfs_panic(sb, "PAP-5140",
679 "schedule occurred in do_balance!"); 714 "schedule occurred in do_balance!");
@@ -1024,7 +1059,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
1024 reiserfs_free_block(th, inode, block, 1); 1059 reiserfs_free_block(th, inode, block, 1);
1025 } 1060 }
1026 1061
1062 reiserfs_write_unlock(sb);
1027 cond_resched(); 1063 cond_resched();
1064 reiserfs_write_lock(sb);
1028 1065
1029 if (item_moved (&s_ih, path)) { 1066 if (item_moved (&s_ih, path)) {
1030 need_re_search = 1; 1067 need_re_search = 1;
@@ -1262,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
1262 "reiserquota delete_item(): freeing %u, id=%u type=%c", 1299 "reiserquota delete_item(): freeing %u, id=%u type=%c",
1263 quota_cut_bytes, inode->i_uid, head2type(&s_ih)); 1300 quota_cut_bytes, inode->i_uid, head2type(&s_ih));
1264#endif 1301#endif
1265 vfs_dq_free_space_nodirty(inode, quota_cut_bytes); 1302 dquot_free_space_nodirty(inode, quota_cut_bytes);
1266 1303
1267 /* Return deleted body length */ 1304 /* Return deleted body length */
1268 return ret_value; 1305 return ret_value;
@@ -1346,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
1346 quota_cut_bytes, inode->i_uid, 1383 quota_cut_bytes, inode->i_uid,
1347 key2type(key)); 1384 key2type(key));
1348#endif 1385#endif
1349 vfs_dq_free_space_nodirty(inode, 1386 dquot_free_space_nodirty(inode,
1350 quota_cut_bytes); 1387 quota_cut_bytes);
1351 } 1388 }
1352 break; 1389 break;
@@ -1696,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
1696 "reiserquota cut_from_item(): freeing %u id=%u type=%c", 1733 "reiserquota cut_from_item(): freeing %u id=%u type=%c",
1697 quota_cut_bytes, inode->i_uid, '?'); 1734 quota_cut_bytes, inode->i_uid, '?');
1698#endif 1735#endif
1699 vfs_dq_free_space_nodirty(inode, quota_cut_bytes); 1736 dquot_free_space_nodirty(inode, quota_cut_bytes);
1700 return ret_value; 1737 return ret_value;
1701} 1738}
1702 1739
@@ -1931,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
1931 key2type(&(key->on_disk_key))); 1968 key2type(&(key->on_disk_key)));
1932#endif 1969#endif
1933 1970
1934 if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) { 1971 retval = dquot_alloc_space_nodirty(inode, pasted_size);
1972 if (retval) {
1935 pathrelse(search_path); 1973 pathrelse(search_path);
1936 return -EDQUOT; 1974 return retval;
1937 } 1975 }
1938 init_tb_struct(th, &s_paste_balance, th->t_super, search_path, 1976 init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
1939 pasted_size); 1977 pasted_size);
@@ -1987,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
1987 pasted_size, inode->i_uid, 2025 pasted_size, inode->i_uid,
1988 key2type(&(key->on_disk_key))); 2026 key2type(&(key->on_disk_key)));
1989#endif 2027#endif
1990 vfs_dq_free_space_nodirty(inode, pasted_size); 2028 dquot_free_space_nodirty(inode, pasted_size);
1991 return retval; 2029 return retval;
1992} 2030}
1993 2031
@@ -2025,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2025#endif 2063#endif
2026 /* We can't dirty inode here. It would be immediately written but 2064 /* We can't dirty inode here. It would be immediately written but
2027 * appropriate stat item isn't inserted yet... */ 2065 * appropriate stat item isn't inserted yet... */
2028 if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) { 2066 retval = dquot_alloc_space_nodirty(inode, quota_bytes);
2067 if (retval) {
2029 pathrelse(path); 2068 pathrelse(path);
2030 return -EDQUOT; 2069 return retval;
2031 } 2070 }
2032 } 2071 }
2033 init_tb_struct(th, &s_ins_balance, th->t_super, path, 2072 init_tb_struct(th, &s_ins_balance, th->t_super, path,
@@ -2076,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2076 quota_bytes, inode->i_uid, head2type(ih)); 2115 quota_bytes, inode->i_uid, head2type(ih));
2077#endif 2116#endif
2078 if (inode) 2117 if (inode)
2079 vfs_dq_free_space_nodirty(inode, quota_bytes); 2118 dquot_free_space_nodirty(inode, quota_bytes);
2080 return retval; 2119 return retval;
2081} 2120}
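
The stree.c quota hunks also change the error convention: dquot_alloc_space_nodirty() returns an errno directly, so callers now propagate that value instead of mapping every failure to -EDQUOT as the old boolean vfs_dq_alloc_space_nodirty() forced them to. The pattern, as used in reiserfs_paste_into_item() and reiserfs_insert_item() above:

    retval = dquot_alloc_space_nodirty(inode, pasted_size);
    if (retval) {
        pathrelse(search_path);
        return retval;  /* propagate the errno from the quota code */
    }
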
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f0ad05f38022..59125fb36d42 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
@@ -246,7 +247,7 @@ static int finish_unfinished(struct super_block *s)
 			retval = remove_save_link_only(s, &save_link_key, 0);
 			continue;
 		}
-		vfs_dq_init(inode);
+		dquot_initialize(inode);
 
 		if (truncate && S_ISDIR(inode->i_mode)) {
 			/* We got a truncate request for a dir which is impossible.
@@ -465,7 +466,7 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
-	lock_kernel();
+	reiserfs_write_lock(s);
 
 	if (s->s_dirt)
 		reiserfs_write_super(s);
@@ -499,10 +500,10 @@ static void reiserfs_put_super(struct super_block *s)
 
 	reiserfs_proc_info_done(s);
 
+	reiserfs_write_unlock(s);
+	mutex_destroy(&REISERFS_SB(s)->lock);
 	kfree(s->s_fs_info);
 	s->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static struct kmem_cache *reiserfs_inode_cachep;
@@ -554,25 +555,33 @@ static void reiserfs_dirty_inode(struct inode *inode)
 	struct reiserfs_transaction_handle th;
 
 	int err = 0;
+	int lock_depth;
+
 	if (inode->i_sb->s_flags & MS_RDONLY) {
 		reiserfs_warning(inode->i_sb, "clm-6006",
 				 "writing inode %lu on readonly FS",
 				 inode->i_ino);
 		return;
 	}
-	reiserfs_write_lock(inode->i_sb);
+	lock_depth = reiserfs_write_lock_once(inode->i_sb);
 
 	/* this is really only used for atime updates, so they don't have
 	** to be included in O_SYNC or fsync
 	*/
 	err = journal_begin(&th, inode->i_sb, 1);
-	if (err) {
-		reiserfs_write_unlock(inode->i_sb);
-		return;
-	}
+	if (err)
+		goto out;
+
 	reiserfs_update_sd(&th, inode);
 	journal_end(&th, inode->i_sb, 1);
-	reiserfs_write_unlock(inode->i_sb);
+
+out:
+	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+}
+
+static void reiserfs_clear_inode(struct inode *inode)
+{
+	dquot_drop(inode);
 }
 
 #ifdef CONFIG_QUOTA
@@ -587,6 +596,7 @@ static const struct super_operations reiserfs_sops = {
 	.destroy_inode = reiserfs_destroy_inode,
 	.write_inode = reiserfs_write_inode,
 	.dirty_inode = reiserfs_dirty_inode,
+	.clear_inode = reiserfs_clear_inode,
 	.delete_inode = reiserfs_delete_inode,
 	.put_super = reiserfs_put_super,
 	.write_super = reiserfs_write_super,
@@ -613,13 +623,6 @@ static int reiserfs_write_info(struct super_block *, int);
 static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
 
 static const struct dquot_operations reiserfs_quota_operations = {
-	.initialize = dquot_initialize,
-	.drop = dquot_drop,
-	.alloc_space = dquot_alloc_space,
-	.alloc_inode = dquot_alloc_inode,
-	.free_space = dquot_free_space,
-	.free_inode = dquot_free_inode,
-	.transfer = dquot_transfer,
 	.write_dquot = reiserfs_write_dquot,
 	.acquire_dquot = reiserfs_acquire_dquot,
 	.release_dquot = reiserfs_release_dquot,
@@ -1168,11 +1171,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	unsigned int qfmt = 0;
 #ifdef CONFIG_QUOTA
 	int i;
+#endif
+
+	reiserfs_write_lock(s);
 
+#ifdef CONFIG_QUOTA
 	memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
 
-	lock_kernel();
 	rs = SB_DISK_SUPER_BLOCK(s);
 
 	if (!reiserfs_parse_options
@@ -1295,12 +1301,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 out_ok:
 	replace_mount_options(s, new_opts);
-	unlock_kernel();
+	reiserfs_write_unlock(s);
 	return 0;
 
 out_err:
 	kfree(new_opts);
-	unlock_kernel();
+	reiserfs_write_unlock(s);
 	return err;
 }
 
@@ -1404,7 +1410,9 @@ static int read_super_block(struct super_block *s, int offset)
 static int reread_meta_blocks(struct super_block *s)
 {
 	ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
+	reiserfs_write_unlock(s);
 	wait_on_buffer(SB_BUFFER_WITH_SB(s));
+	reiserfs_write_lock(s);
 	if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
 		reiserfs_warning(s, "reiserfs-2504", "error reading the super");
 		return 1;
@@ -1611,10 +1619,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		errval = -ENOMEM;
-		goto error;
-	}
+	if (!sbi)
+		return -ENOMEM;
 	s->s_fs_info = sbi;
 	/* Set default values for options: non-aggressive tails, RO on errors */
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
@@ -1627,6 +1633,20 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	/* setup default block allocator options */
 	reiserfs_init_alloc_options(s);
 
+	mutex_init(&REISERFS_SB(s)->lock);
+	REISERFS_SB(s)->lock_depth = -1;
+
+	/*
+	 * This function is called with the bkl, which also was the old
+	 * locking used here.
+	 * do_journal_begin() will soon check if we hold the lock (ie: was
+	 * the bkl). This is likely because do_journal_begin() has several
+	 * other callers; at this time, it doesn't seem to be necessary to
+	 * protect against anything.
+	 * Anyway, let's be conservative and lock for now.
+	 */
+	reiserfs_write_lock(s);
+
 	jdev_name = NULL;
 	if (reiserfs_parse_options
 	    (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
@@ -1852,6 +1872,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	init_waitqueue_head(&(sbi->s_wait));
 	spin_lock_init(&sbi->bitmap_lock);
 
+	reiserfs_write_unlock(s);
+
 	return (0);
 
 error:
@@ -1859,6 +1881,8 @@ error:
 		journal_release_error(NULL, s);
 	}
 
+	reiserfs_write_unlock(s);
+
 	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
@@ -2196,8 +2220,6 @@ static int __init init_reiserfs_fs(void)
 	}
 
 	reiserfs_proc_info_global_init();
-	reiserfs_proc_register_global("version",
-				      reiserfs_global_version_in_proc);
 
 	ret = register_filesystem(&reiserfs_fs_type);
 
@@ -2205,7 +2227,6 @@ static int __init init_reiserfs_fs(void)
 		return 0;
 	}
 
-	reiserfs_proc_unregister_global("version");
 	reiserfs_proc_info_global_done();
 	destroy_inodecache();
 
@@ -2214,7 +2235,6 @@ static int __init init_reiserfs_fs(void)
 
 static void __exit exit_reiserfs_fs(void)
 {
-	reiserfs_proc_unregister_global("version");
 	reiserfs_proc_info_global_done();
 	unregister_filesystem(&reiserfs_fs_type);
 	destroy_inodecache();
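
Taken together, the super.c hunks retire the BKL (lock_kernel()/unlock_kernel()) in favor of a per-superblock reiserfs write lock, and reiserfs_dirty_inode() switches to the reentrant *_once variants so it works whether or not the caller already holds the lock. A hedged sketch of that pattern, assuming only the lock-depth API visible in the diff:

	/* Illustrative only: take the write lock unless this task already
	 * holds it. reiserfs_write_lock_once() records the previous depth,
	 * and reiserfs_write_unlock_once() restores exactly that depth, so
	 * a nested callee never drops a lock its caller still needs. */
	static void example_journaled_op(struct super_block *sb)
	{
		int lock_depth;

		lock_depth = reiserfs_write_lock_once(sb);
		/* ... journal_begin()/journal_end() work goes here ... */
		reiserfs_write_unlock_once(sb, lock_depth);
	}
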
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6925b835a43b..4f9586bb7631 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -38,6 +38,7 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/errno.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
@@ -48,6 +49,7 @@
 #include <net/checksum.h>
 #include <linux/stat.h>
 #include <linux/quotaops.h>
+#include <linux/security.h>
 
 #define PRIVROOT_NAME ".reiserfs_priv"
 #define XAROOT_NAME "xattrs"
@@ -60,7 +62,6 @@
 static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 {
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 	return dir->i_op->create(dir, dentry, mode, NULL);
 }
 #endif
@@ -68,7 +69,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 	return dir->i_op->mkdir(dir, dentry, mode);
 }
 
@@ -80,9 +80,9 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int error;
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 
-	mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+	reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
+					I_MUTEX_CHILD, dir->i_sb);
 	error = dir->i_op->unlink(dir, dentry);
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
@@ -95,9 +95,9 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int error;
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 
-	mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+	reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
+					I_MUTEX_CHILD, dir->i_sb);
 	dentry_unhash(dentry);
 	error = dir->i_op->rmdir(dir, dentry);
 	if (!error)
@@ -234,16 +234,22 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 	if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
 		return 0;
 
+	reiserfs_write_unlock(inode->i_sb);
 	dir = open_xa_dir(inode, XATTR_REPLACE);
 	if (IS_ERR(dir)) {
 		err = PTR_ERR(dir);
+		reiserfs_write_lock(inode->i_sb);
 		goto out;
 	} else if (!dir->d_inode) {
 		err = 0;
+		reiserfs_write_lock(inode->i_sb);
 		goto out_dir;
 	}
 
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
+
+	reiserfs_write_lock(inode->i_sb);
+
 	buf.xadir = dir;
 	err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
 	while ((err == 0 || err == -ENOSPC) && buf.count) {
@@ -282,8 +288,9 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 		err = journal_begin(&th, inode->i_sb, blocks);
 		if (!err) {
 			int jerror;
-			mutex_lock_nested(&dir->d_parent->d_inode->i_mutex,
-					  I_MUTEX_XATTR);
+			reiserfs_mutex_lock_nested_safe(
+					  &dir->d_parent->d_inode->i_mutex,
+					  I_MUTEX_XATTR, inode->i_sb);
 			err = action(dir, data);
 			jerror = journal_end(&th, inode->i_sb, blocks);
 			mutex_unlock(&dir->d_parent->d_inode->i_mutex);
@@ -442,7 +449,9 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
 	}
 
 	if (dentry->d_inode) {
+		reiserfs_write_lock(inode->i_sb);
 		err = xattr_unlink(xadir->d_inode, dentry);
+		reiserfs_write_unlock(inode->i_sb);
 		update_ctime(inode);
 	}
 
@@ -476,15 +485,24 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 	if (get_inode_sd_version(inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	if (!buffer)
-		return lookup_and_delete_xattr(inode, name);
+	reiserfs_write_unlock(inode->i_sb);
+
+	if (!buffer) {
+		err = lookup_and_delete_xattr(inode, name);
+		reiserfs_write_lock(inode->i_sb);
+		return err;
+	}
 
 	dentry = xattr_lookup(inode, name, flags);
-	if (IS_ERR(dentry))
+	if (IS_ERR(dentry)) {
+		reiserfs_write_lock(inode->i_sb);
 		return PTR_ERR(dentry);
+	}
 
 	down_write(&REISERFS_I(inode)->i_xattr_sem);
 
+	reiserfs_write_lock(inode->i_sb);
+
 	xahash = xattr_hash(buffer, buffer_size);
 	while (buffer_pos < buffer_size || buffer_pos == 0) {
 		size_t chunk;
@@ -539,8 +557,12 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 			.ia_size = buffer_size,
 			.ia_valid = ATTR_SIZE | ATTR_CTIME,
 		};
+
+		reiserfs_write_unlock(inode->i_sb);
 		mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
 		down_write(&dentry->d_inode->i_alloc_sem);
+		reiserfs_write_lock(inode->i_sb);
+
 		err = reiserfs_setattr(dentry, &newattrs);
 		up_write(&dentry->d_inode->i_alloc_sem);
 		mutex_unlock(&dentry->d_inode->i_mutex);
@@ -726,15 +748,14 @@ ssize_t
 reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
		  size_t size)
 {
-	struct inode *inode = dentry->d_inode;
 	struct xattr_handler *handler;
 
-	handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
+	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
-	if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
+	if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	return handler->get(inode, name, buffer, size);
+	return handler->get(dentry, name, buffer, size, handler->flags);
 }
 
 /*
@@ -746,15 +767,14 @@ int
 reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
		  size_t size, int flags)
 {
-	struct inode *inode = dentry->d_inode;
 	struct xattr_handler *handler;
 
-	handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
+	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
-	if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
+	if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	return handler->set(inode, name, value, size, flags);
+	return handler->set(dentry, name, value, size, flags, handler->flags);
 }
 
 /*
@@ -764,21 +784,20 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
  */
 int reiserfs_removexattr(struct dentry *dentry, const char *name)
 {
-	struct inode *inode = dentry->d_inode;
 	struct xattr_handler *handler;
-	handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
+	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
-	if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
+	if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
+	return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags);
 }
 
 struct listxattr_buf {
 	size_t size;
 	size_t pos;
 	char *buf;
-	struct inode *inode;
+	struct dentry *dentry;
 };
 
 static int listxattr_filler(void *buf, const char *name, int namelen,
@@ -789,17 +808,19 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
 	if (name[0] != '.' ||
	    (namelen != 1 && (name[1] != '.' || namelen != 2))) {
 		struct xattr_handler *handler;
-		handler = find_xattr_handler_prefix(b->inode->i_sb->s_xattr,
+		handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
						    name);
 		if (!handler)	/* Unsupported xattr name */
 			return 0;
 		if (b->buf) {
-			size = handler->list(b->inode, b->buf + b->pos,
-					b->size, name, namelen);
+			size = handler->list(b->dentry, b->buf + b->pos,
-					b->size, name, namelen,
+					handler->flags);
 			if (size > b->size)
 				return -ERANGE;
 		} else {
-			size = handler->list(b->inode, NULL, 0, name, namelen);
+			size = handler->list(b->dentry, NULL, 0, name,
+					namelen, handler->flags);
 		}
 
 		b->pos += size;
@@ -820,7 +841,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 	int err = 0;
 	loff_t pos = 0;
 	struct listxattr_buf buf = {
-		.inode = dentry->d_inode,
+		.dentry = dentry,
 		.buf = buffer,
 		.size = buffer ? size : 0,
 	};
@@ -975,7 +996,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 	int err = 0;
 
 	/* If we don't have the privroot located yet - go find it */
-	mutex_lock(&s->s_root->d_inode->i_mutex);
+	reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
 	dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
				strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
@@ -1004,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 		goto error;
 
 	if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
-		mutex_lock(&s->s_root->d_inode->i_mutex);
+		reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
 		err = create_privroot(REISERFS_SB(s)->priv_root);
 		mutex_unlock(&s->s_root->d_inode->i_mutex);
 	}
 
 	if (privroot->d_inode) {
 		s->s_xattr = reiserfs_xattr_handlers;
-		mutex_lock(&privroot->d_inode->i_mutex);
+		reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s);
 		if (!REISERFS_SB(s)->xattr_root) {
 			struct dentry *dentry;
 			dentry = lookup_one_len(XAROOT_NAME, privroot,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 35d6e672a279..9cdb759645a9 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/reiserfs_acl.h>
@@ -15,8 +16,10 @@ static int reiserfs_set_acl(struct reiserfs_transaction_handle *th,
			    struct posix_acl *acl);
 
 static int
-xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
+posix_acl_set(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags, int type)
 {
+	struct inode *inode = dentry->d_inode;
 	struct posix_acl *acl;
 	int error, error2;
 	struct reiserfs_transaction_handle th;
@@ -60,15 +63,16 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
 }
 
 static int
-xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
+posix_acl_get(struct dentry *dentry, const char *name, void *buffer,
+		size_t size, int type)
 {
 	struct posix_acl *acl;
 	int error;
 
-	if (!reiserfs_posixacl(inode->i_sb))
+	if (!reiserfs_posixacl(dentry->d_sb))
 		return -EOPNOTSUPP;
 
-	acl = reiserfs_get_acl(inode, type);
+	acl = reiserfs_get_acl(dentry->d_inode, type);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl == NULL)
@@ -452,7 +456,9 @@ int reiserfs_acl_chmod(struct inode *inode)
 		return 0;
 	}
 
+	reiserfs_write_unlock(inode->i_sb);
 	acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
+	reiserfs_write_lock(inode->i_sb);
 	if (!acl)
 		return 0;
 	if (IS_ERR(acl))
@@ -482,30 +488,12 @@ int reiserfs_acl_chmod(struct inode *inode)
 	return error;
 }
 
-static int
-posix_acl_access_get(struct inode *inode, const char *name,
-		     void *buffer, size_t size)
-{
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1)
-		return -EINVAL;
-	return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-
-static int
-posix_acl_access_set(struct inode *inode, const char *name,
-		     const void *value, size_t size, int flags)
-{
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1)
-		return -EINVAL;
-	return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-
-static size_t posix_acl_access_list(struct inode *inode, char *list,
+static size_t posix_acl_access_list(struct dentry *dentry, char *list,
				    size_t list_size, const char *name,
-				    size_t name_len)
+				    size_t name_len, int type)
 {
 	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-	if (!reiserfs_posixacl(inode->i_sb))
+	if (!reiserfs_posixacl(dentry->d_sb))
 		return 0;
 	if (list && size <= list_size)
 		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -514,35 +502,18 @@ static size_t posix_acl_access_list(struct inode *inode, char *list,
 
 struct xattr_handler reiserfs_posix_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
-	.get = posix_acl_access_get,
-	.set = posix_acl_access_set,
+	.flags = ACL_TYPE_ACCESS,
+	.get = posix_acl_get,
+	.set = posix_acl_set,
 	.list = posix_acl_access_list,
 };
 
-static int
-posix_acl_default_get(struct inode *inode, const char *name,
-		      void *buffer, size_t size)
-{
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1)
-		return -EINVAL;
-	return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-
-static int
-posix_acl_default_set(struct inode *inode, const char *name,
-		      const void *value, size_t size, int flags)
-{
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1)
-		return -EINVAL;
-	return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
-
-static size_t posix_acl_default_list(struct inode *inode, char *list,
+static size_t posix_acl_default_list(struct dentry *dentry, char *list,
				     size_t list_size, const char *name,
-				     size_t name_len)
+				     size_t name_len, int type)
 {
 	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-	if (!reiserfs_posixacl(inode->i_sb))
+	if (!reiserfs_posixacl(dentry->d_sb))
 		return 0;
 	if (list && size <= list_size)
 		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -551,7 +522,8 @@ static size_t posix_acl_default_list(struct inode *inode, char *list,
 
 struct xattr_handler reiserfs_posix_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
-	.get = posix_acl_default_get,
-	.set = posix_acl_default_set,
+	.flags = ACL_TYPE_DEFAULT,
+	.get = posix_acl_get,
+	.set = posix_acl_set,
 	.list = posix_acl_default_list,
 };
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index a92c8792c0f6..7271a477c041 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -3,41 +3,43 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/security.h>
 #include <asm/uaccess.h>
 
 static int
-security_get(struct inode *inode, const char *name, void *buffer, size_t size)
+security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+		int handler_flags)
 {
 	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
 		return -EINVAL;
 
-	if (IS_PRIVATE(inode))
+	if (IS_PRIVATE(dentry->d_inode))
 		return -EPERM;
 
-	return reiserfs_xattr_get(inode, name, buffer, size);
+	return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
 }
 
 static int
-security_set(struct inode *inode, const char *name, const void *buffer,
-	     size_t size, int flags)
+security_set(struct dentry *dentry, const char *name, const void *buffer,
+	     size_t size, int flags, int handler_flags)
 {
 	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
 		return -EINVAL;
 
-	if (IS_PRIVATE(inode))
+	if (IS_PRIVATE(dentry->d_inode))
 		return -EPERM;
 
-	return reiserfs_xattr_set(inode, name, buffer, size, flags);
+	return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
 }
 
-static size_t security_list(struct inode *inode, char *list, size_t list_len,
-			    const char *name, size_t namelen)
+static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
+			    const char *name, size_t namelen, int handler_flags)
 {
 	const size_t len = namelen + 1;
 
-	if (IS_PRIVATE(inode))
+	if (IS_PRIVATE(dentry->d_inode))
 		return 0;
 
 	if (list && len <= list_len) {
@@ -75,7 +77,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 		return error;
 	}
 
-	if (sec->length) {
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
 		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
			 reiserfs_xattr_nblocks(inode, sec->length);
 		/* We don't want to count the directories twice if we have
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index a865042f75e2..5b08aaca3daf 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -8,36 +8,37 @@
 #include <asm/uaccess.h>
 
 static int
-trusted_get(struct inode *inode, const char *name, void *buffer, size_t size)
+trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+		int handler_flags)
 {
 	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
 		return -EPERM;
 
-	return reiserfs_xattr_get(inode, name, buffer, size);
+	return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
 }
 
 static int
-trusted_set(struct inode *inode, const char *name, const void *buffer,
-	    size_t size, int flags)
+trusted_set(struct dentry *dentry, const char *name, const void *buffer,
+	    size_t size, int flags, int handler_flags)
 {
 	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
 		return -EPERM;
 
-	return reiserfs_xattr_set(inode, name, buffer, size, flags);
+	return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
 }
 
-static size_t trusted_list(struct inode *inode, char *list, size_t list_size,
-			   const char *name, size_t name_len)
+static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
+			   const char *name, size_t name_len, int handler_flags)
 {
 	const size_t len = name_len + 1;
 
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
 		return 0;
 
 	if (list && len <= list_size) {
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index e3238dc4f3db..75d59c49b911 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -7,34 +7,35 @@
 #include <asm/uaccess.h>
 
 static int
-user_get(struct inode *inode, const char *name, void *buffer, size_t size)
+user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+		int handler_flags)
 {
 
 	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
 		return -EINVAL;
-	if (!reiserfs_xattrs_user(inode->i_sb))
+	if (!reiserfs_xattrs_user(dentry->d_sb))
 		return -EOPNOTSUPP;
-	return reiserfs_xattr_get(inode, name, buffer, size);
+	return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
 }
 
 static int
-user_set(struct inode *inode, const char *name, const void *buffer,
-	 size_t size, int flags)
+user_set(struct dentry *dentry, const char *name, const void *buffer,
+	 size_t size, int flags, int handler_flags)
 {
 	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
 		return -EINVAL;
 
-	if (!reiserfs_xattrs_user(inode->i_sb))
+	if (!reiserfs_xattrs_user(dentry->d_sb))
 		return -EOPNOTSUPP;
-	return reiserfs_xattr_set(inode, name, buffer, size, flags);
+	return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
 }
 
-static size_t user_list(struct inode *inode, char *list, size_t list_size,
-			const char *name, size_t name_len)
+static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
+			const char *name, size_t name_len, int handler_flags)
 {
 	const size_t len = name_len + 1;
 
-	if (!reiserfs_xattrs_user(inode->i_sb))
+	if (!reiserfs_xattrs_user(dentry->d_sb))
 		return 0;
 	if (list && len <= list_size) {
 		memcpy(list, name, name_len);
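
The reiserfs handler families above (POSIX ACL access/default, security, trusted, user) are all converted to the same new struct xattr_handler calling convention: get/set/list receive the dentry rather than a bare inode, plus the handler's own flags word as a trailing argument. A sketch of a handler written against the new signatures; the "example" names are hypothetical and only the shapes are taken from the diff:

	static int example_get(struct dentry *dentry, const char *name,
			       void *buffer, size_t size, int handler_flags)
	{
		/* handler_flags is the .flags value from the table below */
		if (IS_PRIVATE(dentry->d_inode))
			return -EPERM;
		return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
	}

	struct xattr_handler example_handler = {
		.prefix = "user.",	/* dispatched by prefix match */
		.flags  = 0,		/* handed back as handler_flags */
		.get    = example_get,
	};
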
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index c117fa80d1e9..42d213546894 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -544,6 +544,7 @@ error:
 error_rsb_inval:
 	ret = -EINVAL;
 error_rsb:
+	kfree(rsb);
 	return ret;
 }
 
diff --git a/fs/select.c b/fs/select.c
index fd38ce2e32e3..500a669f7790 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -691,6 +691,23 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
+#ifdef __ARCH_WANT_SYS_OLD_SELECT
+struct sel_arg_struct {
+	unsigned long n;
+	fd_set __user *inp, *outp, *exp;
+	struct timeval __user *tvp;
+};
+
+SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
+{
+	struct sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
+}
+#endif
+
 struct poll_list {
 	struct poll_list *next;
 	int len;
@@ -821,7 +838,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;
 
-	if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+	if (nfds > rlimit(RLIMIT_NOFILE))
 		return -EINVAL;
 
 	len = min_t(unsigned int, nfds, N_STACK_PPS);
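
The first hunk adds a common implementation of the legacy one-argument select() entry point: a few older 32-bit ABIs pass all five arguments packed in a single userspace struct, and the new SYSCALL_DEFINE1(old_select, ...) copies that struct in and forwards to sys_select(). Seen from userspace, the convention looks roughly like this sketch (the syscall number 82 is the historical i386 slot and is shown purely for illustration):

	#include <sys/select.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Userspace mirror of the kernel's sel_arg_struct shown above. */
	struct sel_arg_struct {
		unsigned long n;
		fd_set *inp, *outp, *exp;
		struct timeval *tvp;
	};

	static long old_select_call(int nfds, fd_set *r, fd_set *w, fd_set *e,
				    struct timeval *tv)
	{
		struct sel_arg_struct a = { nfds, r, w, e, tv };
		return syscall(82, &a);	/* one pointer, not five arguments */
	}
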
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eae7d9dbf3ff..e1f437be6c3c 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -674,7 +674,6 @@ struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 
 	return NULL;
 }
-
 EXPORT_SYMBOL(seq_list_start);
 
 struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
@@ -684,7 +683,6 @@ struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
 
 	return seq_list_start(head, pos - 1);
 }
-
 EXPORT_SYMBOL(seq_list_start_head);
 
 struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
@@ -695,5 +693,131 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
 	++*ppos;
 	return lh == head ? NULL : lh;
 }
-
 EXPORT_SYMBOL(seq_list_next);
+
+/**
+ * seq_hlist_start - start an iteration of a hlist
+ * @head: the head of the hlist
+ * @pos: the start position of the sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
+{
+	struct hlist_node *node;
+
+	hlist_for_each(node, head)
+		if (pos-- == 0)
+			return node;
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start);
+
+/**
+ * seq_hlist_start_head - start an iteration of a hlist
+ * @head: the head of the hlist
+ * @pos: the start position of the sequence
+ *
+ * Called at seq_file->op->start(). Call this function if you want to
+ * print a header at the top of the output.
+ */
+struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head);
+
+/**
+ * seq_hlist_next - move to the next position of the hlist
+ * @v: the current iterator
+ * @head: the head of the hlist
+ * @ppos: the current position
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
+				  loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return head->first;
+	else
+		return node->next;
+}
+EXPORT_SYMBOL(seq_hlist_next);
+
+/**
+ * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
+ * @head: the head of the hlist
+ * @pos: the start position of the sequence
+ *
+ * Called at seq_file->op->start().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
+				       loff_t pos)
+{
+	struct hlist_node *node;
+
+	__hlist_for_each_rcu(node, head)
+		if (pos-- == 0)
+			return node;
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_rcu);
+
+/**
+ * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
+ * @head: the head of the hlist
+ * @pos: the start position of the sequence
+ *
+ * Called at seq_file->op->start(). Call this function if you want to
+ * print a header at the top of the output.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
+					    loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start_rcu(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head_rcu);
+
+/**
+ * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
+ * @v: the current iterator
+ * @head: the head of the hlist
+ * @ppos: the current position
+ *
+ * Called at seq_file->op->next().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_next_rcu(void *v,
+				      struct hlist_head *head,
+				      loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return rcu_dereference(head->first);
+	else
+		return rcu_dereference(node->next);
+}
+EXPORT_SYMBOL(seq_hlist_next_rcu);
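
The new seq_hlist_* helpers mirror the existing seq_list_* ones for hash-list heads, including SEQ_START_TOKEN handling for printing a header row. A sketch of seq_operations built on them; example_lock and example_list are hypothetical stand-ins for a real subsystem's data:

	static void *example_start(struct seq_file *m, loff_t *pos)
	{
		spin_lock(&example_lock);
		/* returns SEQ_START_TOKEN at *pos == 0 so that show()
		 * can emit a header before the first entry */
		return seq_hlist_start_head(&example_list, *pos);
	}

	static void *example_next(struct seq_file *m, void *v, loff_t *pos)
	{
		return seq_hlist_next(v, &example_list, pos);
	}

	static void example_stop(struct seq_file *m, void *v)
	{
		spin_unlock(&example_lock);
	}

	static int example_show(struct seq_file *m, void *v)
	{
		if (v == SEQ_START_TOKEN) {
			seq_puts(m, "node\n");
			return 0;
		}
		seq_printf(m, "%p\n", v);	/* or container_of(v, ...) */
		return 0;
	}

For an RCU-managed list the same shape works with the _rcu variants, taking rcu_read_lock()/rcu_read_unlock() in start/stop instead of the spinlock.
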
diff --git a/fs/signalfd.c b/fs/signalfd.c
index b07565c94386..f329849ce3c0 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/list.h>
@@ -236,7 +237,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
		 * anon_inode_getfd() will install the fd.
		 */
 		ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx,
-				       flags & (O_CLOEXEC | O_NONBLOCK));
+				       O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK)));
 		if (ufd < 0)
 			kfree(ctx);
 	} else {
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 92d5e8ffb639..dbf6548bbf06 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -13,7 +13,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 6bd9b691a463..0e39a924f10a 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 00b2909bd469..54350b59046b 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,6 +15,7 @@
 #include <linux/pagemap.h>
 #include <linux/net.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 7394e9e17534..9313b6124a2e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 #include <linux/security.h>
+#include <linux/gfp.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -648,9 +649,11 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 	ret = buf->ops->confirm(pipe, buf);
 	if (!ret) {
 		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
-
+		if (file->f_op && file->f_op->sendpage)
 		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
+		else
+			ret = -EINVAL;
 	}
 
 	return ret;
@@ -1068,8 +1071,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(ret < 0))
 		return ret;
 
-	splice_write = out->f_op->splice_write;
-	if (!splice_write)
+	if (out->f_op && out->f_op->splice_write)
+		splice_write = out->f_op->splice_write;
+	else
 		splice_write = default_file_splice_write;
 
 	return splice_write(pipe, out, ppos, len, flags);
@@ -1093,8 +1097,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(ret < 0))
 		return ret;
 
-	splice_read = in->f_op->splice_read;
-	if (!splice_read)
+	if (in->f_op && in->f_op->splice_read)
+		splice_read = in->f_op->splice_read;
+	else
 		splice_read = default_file_splice_read;
 
 	return splice_read(in, ppos, pipe, len, flags);
@@ -1316,7 +1321,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_in)
 			return -ESPIPE;
 		if (off_out) {
-			if (out->f_op->llseek == no_llseek)
+			if (!out->f_op || !out->f_op->llseek ||
+			    out->f_op->llseek == no_llseek)
 				return -EINVAL;
 			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
@@ -1336,7 +1342,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_out)
 			return -ESPIPE;
 		if (off_in) {
-			if (in->f_op->llseek == no_llseek)
+			if (!in->f_op || !in->f_op->llseek ||
+			    in->f_op->llseek == no_llseek)
 				return -EINVAL;
 			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 70e3244fa30f..df8a19ef870d 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o
+squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2a7960310349..1cb0d81b164b 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -29,15 +29,14 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/mutex.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "decompressor.h"
 
 /*
  * Read the metadata block length, this is stored in the first two
@@ -153,72 +152,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	}
 
 	if (compressed) {
-		int zlib_err = 0, zlib_init = 0;
-
-		/*
-		 * Uncompress block.
-		 */
-
-		mutex_lock(&msblk->read_data_mutex);
-
-		msblk->stream.avail_out = 0;
-		msblk->stream.avail_in = 0;
-
-		bytes = length;
-		do {
-			if (msblk->stream.avail_in == 0 && k < b) {
-				avail = min(bytes, msblk->devblksize - offset);
-				bytes -= avail;
-				wait_on_buffer(bh[k]);
-				if (!buffer_uptodate(bh[k]))
-					goto release_mutex;
-
-				if (avail == 0) {
-					offset = 0;
-					put_bh(bh[k++]);
-					continue;
-				}
-
-				msblk->stream.next_in = bh[k]->b_data + offset;
-				msblk->stream.avail_in = avail;
-				offset = 0;
-			}
-
-			if (msblk->stream.avail_out == 0 && page < pages) {
-				msblk->stream.next_out = buffer[page++];
-				msblk->stream.avail_out = PAGE_CACHE_SIZE;
-			}
-
-			if (!zlib_init) {
-				zlib_err = zlib_inflateInit(&msblk->stream);
-				if (zlib_err != Z_OK) {
-					ERROR("zlib_inflateInit returned"
-						" unexpected result 0x%x,"
-						" srclength %d\n", zlib_err,
-						srclength);
-					goto release_mutex;
-				}
-				zlib_init = 1;
-			}
-
-			zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
-
-			if (msblk->stream.avail_in == 0 && k < b)
-				put_bh(bh[k++]);
-		} while (zlib_err == Z_OK);
-
-		if (zlib_err != Z_STREAM_END) {
-			ERROR("zlib_inflate error, data probably corrupt\n");
-			goto release_mutex;
-		}
-
-		zlib_err = zlib_inflateEnd(&msblk->stream);
-		if (zlib_err != Z_OK) {
-			ERROR("zlib_inflate error, data probably corrupt\n");
-			goto release_mutex;
-		}
-		length = msblk->stream.total_out;
-		mutex_unlock(&msblk->read_data_mutex);
+		length = squashfs_decompress(msblk, buffer, bh, b, offset,
+			length, srclength, pages);
+		if (length < 0)
+			goto read_failure;
 	} else {
 		/*
 		 * Block is uncompressed.
@@ -255,9 +192,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	kfree(bh);
 	return length;
 
-release_mutex:
-	mutex_unlock(&msblk->read_data_mutex);
-
 block_release:
 	for (; k < b; k++)
 		put_bh(bh[k]);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 40c98fa6b5d6..57314bee9059 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -51,7 +51,6 @@
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
-#include <linux/zlib.h>
 #include <linux/pagemap.h>
 
 #include "squashfs_fs.h"
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
new file mode 100644
index 000000000000..157478da6ac9
--- /dev/null
+++ b/fs/squashfs/decompressor.c
@@ -0,0 +1,68 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.c
+ */
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file (and decompressor.h) implements a decompressor framework for
+ * Squashfs, allowing multiple decompressors to be easily supported
+ */
+
+static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
+	NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
+};
+
+static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
+	NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
+};
+
+static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
+	NULL, NULL, NULL, 0, "unknown", 0
+};
+
+static const struct squashfs_decompressor *decompressor[] = {
+	&squashfs_zlib_comp_ops,
+	&squashfs_lzma_unsupported_comp_ops,
+	&squashfs_lzo_unsupported_comp_ops,
+	&squashfs_unknown_comp_ops
+};
+
+
+const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
+{
+	int i;
+
+	for (i = 0; decompressor[i]->id; i++)
+		if (id == decompressor[i]->id)
+			break;
+
+	return decompressor[i];
+}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
new file mode 100644
index 000000000000..7425f80783f6
--- /dev/null
+++ b/fs/squashfs/decompressor.h
@@ -0,0 +1,55 @@
+#ifndef DECOMPRESSOR_H
+#define DECOMPRESSOR_H
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.h
+ */
+
+struct squashfs_decompressor {
+	void	*(*init)(struct squashfs_sb_info *);
+	void	(*free)(void *);
+	int	(*decompress)(struct squashfs_sb_info *, void **,
+		struct buffer_head **, int, int, int, int, int);
+	int	id;
+	char	*name;
+	int	supported;
+};
+
+static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
+{
+	return msblk->decompressor->init(msblk);
+}
+
+static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
+	void *s)
+{
+	if (msblk->decompressor)
+		msblk->decompressor->free(s);
+}
+
+static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
+	void **buffer, struct buffer_head **bh, int b, int offset, int length,
+	int srclength, int pages)
+{
+	return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
+		length, srclength, pages);
+}
+#endif
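
With this header in place, a concrete decompressor is just an ops table matched by the id stored in the superblock; unsupported formats register a stub with .supported = 0 so mount can print a useful error. A hedged sketch of a real entry (the zlib implementation itself lives in the new zlib_wrapper.c added to the Makefile; the function bodies here are illustrative stand-ins, not the kernel's code):

	static void *example_init(struct squashfs_sb_info *msblk)
	{
		/* allocate per-mount decompressor state (e.g. a z_stream) */
		return kmalloc(sizeof(int), GFP_KERNEL);
	}

	static void example_free(void *strm)
	{
		kfree(strm);
	}

	static int example_uncompress(struct squashfs_sb_info *msblk,
		void **buffer, struct buffer_head **bh, int b, int offset,
		int length, int srclength, int pages)
	{
		/* fill the buffer[] pages from the compressed data in bh[]
		 * and return the number of bytes produced, or a negative
		 * value on corruption */
		return -EIO;
	}

	const struct squashfs_decompressor example_comp_ops = {
		.init = example_init,
		.free = example_free,
		.decompress = example_uncompress,
		.id = ZLIB_COMPRESSION,
		.name = "example",
		.supported = 1
	};
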
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 566b0eaed868..12b933ac6585 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -30,7 +30,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 2b1b8fe5e037..7f93d5a9ee05 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -39,7 +39,6 @@
 #include <linux/vfs.h>
 #include <linux/dcache.h>
 #include <linux/exportfs.h>
-#include <linux/zlib.h>
 #include <linux/slab.h>
 
 #include "squashfs_fs.h"
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 717767d831df..a25c5060bdcb 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -47,7 +47,6 @@
 #include <linux/string.h>
 #include <linux/pagemap.h>
 #include <linux/mutex.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index b5a2c15bbbc7..7c90bbd6879d 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -36,7 +36,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 3795b837ba28..b7f64bcd2b70 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -34,7 +34,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 9101dbde39ec..49daaf669e41 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,7 +40,6 @@
 
 #include <linux/fs.h>
 #include <linux/vfs.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 9e398653b22b..5266bd8ad932 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,7 +57,6 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/dcache.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 0e9feb6adf7e..fe2587af5512 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
 		u64, int);
 extern int squashfs_read_table(struct super_block *, void *, u64, int);
 
+/* decompressor.c */
+extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
+
 /* export.c */
 extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
 		unsigned int);
@@ -71,7 +74,7 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
 extern int squashfs_read_inode(struct inode *, long long);
 
 /*
- * Inodes and files operations
+ * Inodes, files and decompressor operations
  */
 
 /* dir.c */
@@ -88,3 +91,6 @@ extern const struct inode_operations squashfs_dir_inode_ops;
 
 /* symlink.c */
 extern const struct address_space_operations squashfs_symlink_aops;
+
+/* zlib_wrapper.c */
+extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 283daafc568e..79024245ea00 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -183,8 +183,6 @@
 #define SQUASHFS_MAX_FILE_SIZE		(1LL << \
 					(SQUASHFS_MAX_FILE_SIZE_LOG - 2))
 
-#define SQUASHFS_MARKER_BYTE		0xff
-
 /* meta index cache */
 #define SQUASHFS_META_INDEXES	(SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
 #define SQUASHFS_META_ENTRIES	127
@@ -211,7 +209,9 @@ struct meta_index {
 /*
  * definitions for structures on disk
  */
 #define ZLIB_COMPRESSION	1
+#define LZMA_COMPRESSION	2
+#define LZO_COMPRESSION		3
 
 struct squashfs_super_block {
 	__le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index c8c65614dd1c..2e77dc547e25 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -52,25 +52,25 @@ struct squashfs_cache_entry {
 };
 
 struct squashfs_sb_info {
+	const struct squashfs_decompressor	*decompressor;
 	int			devblksize;
 	int			devblksize_log2;
 	struct squashfs_cache	*block_cache;
 	struct squashfs_cache	*fragment_cache;
 	struct squashfs_cache	*read_page;
 	int			next_meta_index;
 	__le64			*id_table;
 	__le64			*fragment_index;
-	unsigned int		*fragment_index_2;
 	struct mutex		read_data_mutex;
 	struct mutex		meta_index_mutex;
 	struct meta_index	*meta_index;
-	z_stream		stream;
+	void			*stream;
 	__le64			*inode_lookup_table;
 	u64			inode_table;
 	u64			directory_table;
 	unsigned int		block_size;
 	unsigned short		block_log;
 	long long		bytes_used;
 	unsigned int		inodes;
 };
 #endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6c197ef53add..3550aec2f655 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -35,34 +35,41 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/zlib.h>
 #include <linux/magic.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "decompressor.h"
 
 static struct file_system_type squashfs_fs_type;
 static const struct super_operations squashfs_super_ops;
 
-static int supported_squashfs_filesystem(short major, short minor, short comp)
+static const struct squashfs_decompressor *supported_squashfs_filesystem(short
+	major, short minor, short id)
 {
+	const struct squashfs_decompressor *decompressor;
+
 	if (major < SQUASHFS_MAJOR) {
 		ERROR("Major/Minor mismatch, older Squashfs %d.%d "
			"filesystems are unsupported\n", major, minor);
-		return -EINVAL;
+		return NULL;
 	} else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
 		ERROR("Major/Minor mismatch, trying to mount newer "
			"%d.%d filesystem\n", major, minor);
 		ERROR("Please update your kernel\n");
-		return -EINVAL;
+		return NULL;
 	}
 
-	if (comp != ZLIB_COMPRESSION)
-		return -EINVAL;
+	decompressor = squashfs_lookup_decompressor(id);
+	if (!decompressor->supported) {
+		ERROR("Filesystem uses \"%s\" compression. This is not "
+			"supported\n", decompressor->name);
+		return NULL;
+	}
 
-	return 0;
+	return decompressor;
 }
 
 
@@ -87,13 +94,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	msblk = sb->s_fs_info;
 
-	msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
-		GFP_KERNEL);
-	if (msblk->stream.workspace == NULL) {
-		ERROR("Failed to allocate zlib workspace\n");
-		goto failure;
-	}
-
 	sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
 	if (sblk == NULL) {
 		ERROR("Failed to allocate squashfs_super_block\n");
@@ -120,25 +120,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+	err = -EINVAL;
+
 	/* Check it is a SQUASHFS superblock */
 	sb->s_magic = le32_to_cpu(sblk->s_magic);
 	if (sb->s_magic != SQUASHFS_MAGIC) {
 		if (!silent)
			ERROR("Can't find a SQUASHFS superblock on %s\n",
						bdevname(sb->s_bdev, b));
-		err = -EINVAL;
 		goto failed_mount;
 	}
 
-	/* Check the MAJOR & MINOR versions and compression type */
-	err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
+	/* Check the MAJOR & MINOR versions and lookup compression type */
+	msblk->decompressor = supported_squashfs_filesystem(
+			le16_to_cpu(sblk->s_major),
			le16_to_cpu(sblk->s_minor),
			le16_to_cpu(sblk->compression));
-	if (err < 0)
+	if (msblk->decompressor == NULL)
 		goto failed_mount;
 
-	err = -EINVAL;
-
 	/*
	 * Check if there's xattrs in the filesystem.  These are not
	 * supported in this version, so warn that they will be ignored.
@@ -205,6 +205,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	err = -ENOMEM;
 
+	msblk->stream = squashfs_decompressor_init(msblk);
+	if (msblk->stream == NULL)
+		goto failed_mount;
+
 	msblk->block_cache = squashfs_cache_init("metadata",
			SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
 	if (msblk->block_cache == NULL)
@@ -292,17 +296,16 @@ failed_mount:
 	squashfs_cache_delete(msblk->block_cache);
 	squashfs_cache_delete(msblk->fragment_cache);
 	squashfs_cache_delete(msblk->read_page);
+	squashfs_decompressor_free(msblk, msblk->stream);
 	kfree(msblk->inode_lookup_table);
 	kfree(msblk->fragment_index);
 	kfree(msblk->id_table);
-	kfree(msblk->stream.workspace);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	kfree(sblk);
 	return err;
 
 failure:
-	kfree(msblk->stream.workspace);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	return -ENOMEM;
@@ -346,10 +349,10 @@ static void squashfs_put_super(struct super_block *sb)
 		squashfs_cache_delete(sbi->block_cache);
 		squashfs_cache_delete(sbi->fragment_cache);
 		squashfs_cache_delete(sbi->read_page);
+		squashfs_decompressor_free(sbi, sbi->stream);
 		kfree(sbi->id_table);
 		kfree(sbi->fragment_index);
 		kfree(sbi->meta_index);
-		kfree(sbi->stream.workspace);
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 83d87880aac8..32b911f4ee39 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -33,10 +33,8 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
new file mode 100644
index 000000000000..15a03d0fb9f3
--- /dev/null
+++ b/fs/squashfs/zlib_wrapper.c
@@ -0,0 +1,151 @@
/*
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
 * Phillip Lougher <phillip@lougher.demon.co.uk>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2,
 * or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * zlib_wrapper.c
 */


#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/zlib.h>

#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
#include "decompressor.h"

static void *zlib_init(struct squashfs_sb_info *dummy)
{
	z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
	if (stream == NULL)
		goto failed;
	stream->workspace = kmalloc(zlib_inflate_workspacesize(),
		GFP_KERNEL);
	if (stream->workspace == NULL)
		goto failed;

	return stream;

failed:
	ERROR("Failed to allocate zlib workspace\n");
	kfree(stream);
	return NULL;
}


static void zlib_free(void *strm)
{
	z_stream *stream = strm;

	if (stream)
		kfree(stream->workspace);
	kfree(stream);
}


static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
	struct buffer_head **bh, int b, int offset, int length, int srclength,
	int pages)
{
	int zlib_err = 0, zlib_init = 0;
	int avail, bytes, k = 0, page = 0;
	z_stream *stream = msblk->stream;

	mutex_lock(&msblk->read_data_mutex);

	stream->avail_out = 0;
	stream->avail_in = 0;

	bytes = length;
	do {
		if (stream->avail_in == 0 && k < b) {
			avail = min(bytes, msblk->devblksize - offset);
			bytes -= avail;
			wait_on_buffer(bh[k]);
			if (!buffer_uptodate(bh[k]))
				goto release_mutex;

			if (avail == 0) {
				offset = 0;
				put_bh(bh[k++]);
				continue;
			}

			stream->next_in = bh[k]->b_data + offset;
			stream->avail_in = avail;
			offset = 0;
		}

		if (stream->avail_out == 0 && page < pages) {
			stream->next_out = buffer[page++];
			stream->avail_out = PAGE_CACHE_SIZE;
		}

		if (!zlib_init) {
			zlib_err = zlib_inflateInit(stream);
			if (zlib_err != Z_OK) {
				ERROR("zlib_inflateInit returned unexpected "
					"result 0x%x, srclength %d\n",
					zlib_err, srclength);
				goto release_mutex;
			}
			zlib_init = 1;
		}

		zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);

		if (stream->avail_in == 0 && k < b)
			put_bh(bh[k++]);
	} while (zlib_err == Z_OK);

	if (zlib_err != Z_STREAM_END) {
		ERROR("zlib_inflate error, data probably corrupt\n");
		goto release_mutex;
	}

	zlib_err = zlib_inflateEnd(stream);
	if (zlib_err != Z_OK) {
		ERROR("zlib_inflate error, data probably corrupt\n");
		goto release_mutex;
	}

	mutex_unlock(&msblk->read_data_mutex);
	return stream->total_out;

release_mutex:
	mutex_unlock(&msblk->read_data_mutex);

	for (; k < b; k++)
		put_bh(bh[k]);

	return -EIO;
}

const struct squashfs_decompressor squashfs_zlib_comp_ops = {
	.init = zlib_init,
	.free = zlib_free,
	.decompress = zlib_uncompress,
	.id = ZLIB_COMPRESSION,
	.name = "zlib",
	.supported = 1
};

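zlib_uncompress() is the standard incremental inflate pattern: input arrives in device-block-sized chunks, output is handed out one page at a time, and zlib_inflate() is called until Z_STREAM_END. (Note the local zlib_init flag shadows the zlib_init() function above; harmless, but easy to misread.) A hedged userspace sketch of the same pattern against flat buffers, using ordinary zlib rather than the kernel wrappers:

#include <string.h>
#include <zlib.h>

/* Inflate src[0..srclen) into dst[0..dstlen); returns bytes produced
 * or -1 on error.  Single-shot variant of the loop above. */
static long inflate_buffer(const unsigned char *src, size_t srclen,
			   unsigned char *dst, size_t dstlen)
{
	z_stream s;
	int err;

	memset(&s, 0, sizeof(s));
	if (inflateInit(&s) != Z_OK)
		return -1;

	s.next_in = (unsigned char *)src;
	s.avail_in = srclen;
	s.next_out = dst;
	s.avail_out = dstlen;

	do {
		err = inflate(&s, Z_SYNC_FLUSH);
	} while (err == Z_OK && s.avail_out > 0);

	inflateEnd(&s);
	return err == Z_STREAM_END ? (long)s.total_out : -1;
}
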
diff --git a/fs/stack.c b/fs/stack.c
index 67716f6a1a4a..4a6f7f440658 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -7,18 +7,63 @@
  * This function cannot be inlined since i_size_{read,write} is rather
  * heavy-weight on 32-bit systems
  */
-void fsstack_copy_inode_size(struct inode *dst, const struct inode *src)
+void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
 {
-	i_size_write(dst, i_size_read((struct inode *)src));
-	dst->i_blocks = src->i_blocks;
+	loff_t i_size;
+	blkcnt_t i_blocks;
+
+	/*
+	 * i_size_read() includes its own seqlocking and protection from
+	 * preemption (see include/linux/fs.h): we need nothing extra for
+	 * that here, and prefer to avoid nesting locks than attempt to keep
+	 * i_size and i_blocks in sync together.
+	 */
+	i_size = i_size_read(src);
+
+	/*
+	 * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to
+	 * keep the two halves of i_blocks in sync despite SMP or PREEMPT -
+	 * though stat's generic_fillattr() doesn't bother, and we won't be
+	 * applying quotas (where i_blocks does become important) at the
+	 * upper level.
+	 *
+	 * We don't actually know what locking is used at the lower level;
+	 * but if it's a filesystem that supports quotas, it will be using
+	 * i_lock as in inode_add_bytes().  tmpfs uses other locking, and
+	 * its 32-bit is (just) able to exceed 2TB i_size with the aid of
+	 * holes; but its i_blocks cannot carry into the upper long without
+	 * almost 2TB swap - let's ignore that case.
+	 */
+	if (sizeof(i_blocks) > sizeof(long))
+		spin_lock(&src->i_lock);
+	i_blocks = src->i_blocks;
+	if (sizeof(i_blocks) > sizeof(long))
+		spin_unlock(&src->i_lock);
+
+	/*
+	 * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for
+	 * fsstack_copy_inode_size() to hold some lock around
+	 * i_size_write(), otherwise i_size_read() may spin forever (see
+	 * include/linux/fs.h).  We don't necessarily hold i_mutex when this
+	 * is called, so take i_lock for that case.
+	 *
+	 * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the
+	 * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock
+	 * for that case too, and do both at once by combining the tests.
+	 *
+	 * There is none of this locking overhead in the 64-bit case.
+	 */
+	if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long))
+		spin_lock(&dst->i_lock);
+	i_size_write(dst, i_size);
+	dst->i_blocks = i_blocks;
+	if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long))
+		spin_unlock(&dst->i_lock);
 }
 EXPORT_SYMBOL_GPL(fsstack_copy_inode_size);
 
-/* copy all attributes; get_nlinks is optional way to override the i_nlink
- * copying
- */
-void fsstack_copy_attr_all(struct inode *dest, const struct inode *src,
-			   int (*get_nlinks)(struct inode *))
+/* copy all attributes */
+void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
 {
 	dest->i_mode = src->i_mode;
 	dest->i_uid = src->i_uid;
@@ -29,14 +74,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src,
 	dest->i_ctime = src->i_ctime;
 	dest->i_blkbits = src->i_blkbits;
 	dest->i_flags = src->i_flags;
-
-	/*
-	 * Update the nlinks AFTER updating the above fields, because the
-	 * get_links callback may depend on them.
-	 */
-	if (!get_nlinks)
-		dest->i_nlink = src->i_nlink;
-	else
-		dest->i_nlink = (*get_nlinks)(dest);
+	dest->i_nlink = src->i_nlink;
 }
 EXPORT_SYMBOL_GPL(fsstack_copy_attr_all);
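
The sizeof() guards above are compile-time constants, so the locking only exists where it is needed. A standalone sketch (not from the patch) of why the guards cost nothing on 64-bit:

#include <stdio.h>

int main(void)
{
	long word;
	long long counter;	/* stands in for a 64-bit blkcnt_t/loff_t */

	/* On LP64, sizeof(long long) == sizeof(long), so the kernel's
	 * "if (sizeof(i_blocks) > sizeof(long))" folds to if (0) and the
	 * spin_lock calls are compiled out; on ILP32 this prints 1 and
	 * the locking remains. */
	printf("%d\n", sizeof(counter) > sizeof(word));
	return 0;
}
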
diff --git a/fs/stat.c b/fs/stat.c
index 075694e31d8b..c4ecd52c5737 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -401,9 +401,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
 }
 #endif /* __ARCH_WANT_STAT64 */
 
-void inode_add_bytes(struct inode *inode, loff_t bytes)
+/* The caller is responsible for sufficient locking here (i.e. inode->i_lock) */
+void __inode_add_bytes(struct inode *inode, loff_t bytes)
 {
-	spin_lock(&inode->i_lock);
 	inode->i_blocks += bytes >> 9;
 	bytes &= 511;
 	inode->i_bytes += bytes;
@@ -411,6 +411,12 @@ void inode_add_bytes(struct inode *inode, loff_t bytes)
 		inode->i_blocks++;
 		inode->i_bytes -= 512;
 	}
+}
+
+void inode_add_bytes(struct inode *inode, loff_t bytes)
+{
+	spin_lock(&inode->i_lock);
+	__inode_add_bytes(inode, bytes);
 	spin_unlock(&inode->i_lock);
 }
 
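
Splitting out __inode_add_bytes() lets a caller that already holds i_lock fold the byte accounting into its own critical section instead of taking the lock twice. A hedged sketch of such a caller (the helper name is hypothetical):

static void charge_blocks_locked(struct inode *inode, loff_t bytes)
{
	spin_lock(&inode->i_lock);
	/* ... other i_lock-protected bookkeeping ... */
	__inode_add_bytes(inode, bytes);	/* no nested spin_lock */
	spin_unlock(&inode->i_lock);
}
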
diff --git a/fs/super.c b/fs/super.c
index 19eb70b374bc..f35ac6022109 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -568,7 +568,7 @@ out:
 int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
 	int retval;
-	int remount_rw;
+	int remount_rw, remount_ro;
 
 	if (sb->s_frozen != SB_UNFROZEN)
 		return -EBUSY;
@@ -583,9 +583,12 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	shrink_dcache_sb(sb);
 	sync_filesystem(sb);
 
+	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
+
 	/* If we are remounting RDONLY and current sb is read/write,
	   make sure there are no rw files opened */
-	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
+	if (remount_ro) {
 		if (force)
			mark_files_ro(sb);
 		else if (!fs_may_remount_ro(sb))
@@ -594,7 +597,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 		if (retval < 0 && retval != -ENOSYS)
			return -EBUSY;
 	}
-	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
@@ -604,6 +606,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
 	if (remount_rw)
 		vfs_dq_quota_on_remount(sb);
+	/*
+	 * Some filesystems modify their metadata via some other path than the
+	 * bdev buffer cache (eg. use a private mapping, or directories in
+	 * pagecache, etc).  Also file data modifications go via their own
+	 * mappings.  So if we remount read-only and then read the filesystem
+	 * from the bdev, we could get stale data, so invalidate it to give a
+	 * best effort at coherency.
+	 */
+	if (remount_ro && sb->s_bdev)
+		invalidate_bdev(sb->s_bdev);
 	return 0;
 }
 
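remount_ro is true only on a read/write to read-only transition, which is exactly when the bdev page cache may still hold data newer than what a raw reader of the device would otherwise see. A hedged userspace sketch of the operation that now triggers the invalidation:

#include <sys/mount.h>

/* Remount an existing mount read-only; source and fstype are ignored
 * for MS_REMOUNT.  With the change above, the kernel also invalidates
 * the backing device's cached pages for best-effort coherency. */
int remount_readonly(const char *target)
{
	return mount(NULL, target, NULL, MS_REMOUNT | MS_RDONLY, NULL);
}
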
@@ -901,8 +913,9 @@ int get_sb_single(struct file_system_type *fs_type,
 			return error;
 		}
 		s->s_flags |= MS_ACTIVE;
+	} else {
+		do_remount_sb(s, flags, data, 0);
 	}
-	do_remount_sb(s, flags, data, 0);
 	simple_set_mnt(mnt, s);
 	return 0;
 }
@@ -924,6 +937,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	if (!mnt)
 		goto out;
 
+	if (flags & MS_KERNMOUNT)
+		mnt->mnt_flags = MNT_INTERNAL;
+
 	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
 		secdata = alloc_secdata();
 		if (!secdata)
diff --git a/fs/sync.c b/fs/sync.c
index d104591b066b..fc5c3d75cf3c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
@@ -34,14 +35,14 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	if (!sb->s_bdi)
 		return 0;
 
-	/* Avoid doing twice syncing and cache pruning for quota sync */
-	if (!wait) {
-		writeout_quota_sb(sb, -1);
+	if (sb->s_qcop && sb->s_qcop->quota_sync)
+		sb->s_qcop->quota_sync(sb, -1, wait);
+
+	if (wait)
+		sync_inodes_sb(sb);
+	else
 		writeback_inodes_sb(sb);
-	} else {
-		sync_quota_sb(sb, -1);
-		sync_inodes_sb(sb);
-	}
+
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 	return __sync_blockdev(sb->s_bdev, wait);
@@ -295,10 +296,11 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
  */
 int generic_write_sync(struct file *file, loff_t pos, loff_t count)
 {
-	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
+	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
 		return 0;
 	return vfs_fsync_range(file, file->f_path.dentry, pos,
-			pos + count - 1, 1);
+			pos + count - 1,
+			(file->f_flags & __O_SYNC) ? 0 : 1);
 }
 EXPORT_SYMBOL(generic_write_sync);
 
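With O_SYNC and O_DSYNC now distinct flags, generic_write_sync() requests a full fsync (datasync argument 0) only for O_SYNC files, and the cheaper fdatasync behaviour for O_DSYNC. A hedged userspace sketch of the two open modes this distinguishes:

#include <fcntl.h>

int open_journals(int *data_fd, int *full_fd)
{
	/* O_DSYNC: each write flushes data plus only the metadata needed
	 * to retrieve it later (fdatasync semantics) */
	*data_fd = open("log.dat", O_WRONLY | O_CREAT | O_DSYNC, 0644);
	/* O_SYNC: additionally flushes non-essential metadata such as
	 * timestamps (full fsync semantics) */
	*full_fd = open("meta.dat", O_WRONLY | O_CREAT | O_SYNC, 0644);
	return (*data_fd < 0 || *full_fd < 0) ? -1 : 0;
}
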
@@ -354,6 +356,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
 {
 	int ret;
 	struct file *file;
+	struct address_space *mapping;
 	loff_t endbyte;			/* inclusive */
 	int fput_needed;
 	umode_t i_mode;
@@ -404,7 +407,28 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
			!S_ISLNK(i_mode))
 		goto out_put;
 
-	ret = do_sync_mapping_range(file->f_mapping, offset, endbyte, flags);
+	mapping = file->f_mapping;
+	if (!mapping) {
+		ret = -EINVAL;
+		goto out_put;
+	}
+
+	ret = 0;
+	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
+		ret = filemap_fdatawait_range(mapping, offset, endbyte);
+		if (ret < 0)
+			goto out_put;
+	}
+
+	if (flags & SYNC_FILE_RANGE_WRITE) {
+		ret = filemap_fdatawrite_range(mapping, offset, endbyte);
+		if (ret < 0)
+			goto out_put;
+	}
+
+	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
+		ret = filemap_fdatawait_range(mapping, offset, endbyte);
+
 out_put:
 	fput_light(file, fput_needed);
 out:
@@ -436,42 +460,3 @@ asmlinkage long SyS_sync_file_range2(long fd, long flags,
 }
 SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2);
 #endif
-
-/*
- * `endbyte' is inclusive
- */
-int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
-			  loff_t endbyte, unsigned int flags)
-{
-	int ret;
-
-	if (!mapping) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = 0;
-	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
-		ret = wait_on_page_writeback_range(mapping,
-					offset >> PAGE_CACHE_SHIFT,
-					endbyte >> PAGE_CACHE_SHIFT);
-		if (ret < 0)
-			goto out;
-	}
-
-	if (flags & SYNC_FILE_RANGE_WRITE) {
-		ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
-						WB_SYNC_ALL);
-		if (ret < 0)
-			goto out;
-	}
-
-	if (flags & SYNC_FILE_RANGE_WAIT_AFTER) {
-		ret = wait_on_page_writeback_range(mapping,
-					offset >> PAGE_CACHE_SHIFT,
-					endbyte >> PAGE_CACHE_SHIFT);
-	}
-out:
-	return ret;
-}
-EXPORT_SYMBOL_GPL(do_sync_mapping_range);
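
Inlining do_sync_mapping_range() leaves sync_file_range() decoding the same three flag bits, now via filemap_fdatawait_range()/filemap_fdatawrite_range(), which take byte offsets directly instead of page indices. A hedged sketch of the classic write-behind use of the syscall from userspace:

#define _GNU_SOURCE
#include <fcntl.h>

/* Kick off writeback for a finished chunk, waiting first for any
 * previous writeback of the same range - the usual streaming-write
 * idiom. */
int flush_chunk(int fd, off_t offset, off_t nbytes)
{
	return sync_file_range(fd, offset, nbytes,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}
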
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 60c702bc10ae..e9d293593e52 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -54,14 +54,14 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = -EIO;
 	if (attr->read)
 		rc = attr->read(kobj, attr, buffer, off, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -125,14 +125,14 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = -EIO;
 	if (attr->write)
 		rc = attr->write(kobj, attr, buffer, offset, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -184,12 +184,12 @@ static void bin_vma_open(struct vm_area_struct *vma)
 	if (!bb->vm_ops || !bb->vm_ops->open)
 		return;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return;
 
 	bb->vm_ops->open(vma);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 }
 
 static void bin_vma_close(struct vm_area_struct *vma)
@@ -201,12 +201,12 @@ static void bin_vma_close(struct vm_area_struct *vma)
 	if (!bb->vm_ops || !bb->vm_ops->close)
 		return;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return;
 
 	bb->vm_ops->close(vma);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 }
 
 static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -219,12 +219,12 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops || !bb->vm_ops->fault)
 		return VM_FAULT_SIGBUS;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
 	ret = bb->vm_ops->fault(vma, vmf);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -241,12 +241,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops->page_mkwrite)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
 	ret = bb->vm_ops->page_mkwrite(vma, vmf);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -261,12 +261,12 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
 	if (!bb->vm_ops || !bb->vm_ops->access)
 		return -EINVAL;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
 	ret = bb->vm_ops->access(vma, addr, buf, len, write);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -281,12 +281,12 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 	if (!bb->vm_ops || !bb->vm_ops->set_policy)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
 	ret = bb->vm_ops->set_policy(vma, new);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -301,12 +301,12 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
 	if (!bb->vm_ops || !bb->vm_ops->get_policy)
 		return vma->vm_policy;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return vma->vm_policy;
 
 	pol = bb->vm_ops->get_policy(vma, addr);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return pol;
 }
 
@@ -321,12 +321,12 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
 	if (!bb->vm_ops || !bb->vm_ops->migrate)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return 0;
 
 	ret = bb->vm_ops->migrate(vma, from, to, flags);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 #endif
@@ -356,7 +356,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 
 	/* need attr_sd for attr, its parent for kobj */
 	rc = -ENODEV;
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		goto out_unlock;
 
 	rc = -EINVAL;
@@ -384,7 +384,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 	bb->vm_ops = vma->vm_ops;
 	vma->vm_ops = &bin_vm_ops;
 out_put:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 out_unlock:
 	mutex_unlock(&bb->mutex);
 
@@ -399,7 +399,7 @@ static int open(struct inode * inode, struct file * file)
 	int error;
 
 	/* binary file operations requires both @sd and its parent */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	error = -EACCES;
@@ -426,11 +426,11 @@ static int open(struct inode * inode, struct file * file)
 	mutex_unlock(&sysfs_bin_lock);
 
 	/* open succeeded, put active references */
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return 0;
 
  err_out:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	kfree(bb);
 	return error;
 }
@@ -483,7 +483,8 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd)
  * @attr: attribute descriptor.
  */
 
-int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
+int sysfs_create_bin_file(struct kobject *kobj,
+			  const struct bin_attribute *attr)
 {
 	BUG_ON(!kobj || !kobj->sd || !attr);
 
@@ -497,7 +498,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
  * @attr: attribute descriptor.
  */
 
-void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr)
+void sysfs_remove_bin_file(struct kobject *kobj,
+			   const struct bin_attribute *attr)
 {
 	sysfs_hash_and_remove(kobj->sd, attr->attr.name);
 }
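
Every entry point in bin.c now brackets its callback with a single active reference on the attribute's own sysfs_dirent; the parent reference that sysfs_get_active_two() used to take is no longer required. A hedged sketch of the shared shape of these functions:

static int bin_op_template(struct sysfs_dirent *attr_sd)
{
	int ret;

	if (!sysfs_get_active(attr_sd))
		return -ENODEV;		/* node is going away */

	ret = 0;			/* ... invoke read/write/vm callback ... */

	sysfs_put_active(attr_sd);
	return ret;
}
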
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e0201837d244..590717861c7a 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -25,7 +25,6 @@
 #include "sysfs.h"
 
 DEFINE_MUTEX(sysfs_mutex);
-DEFINE_MUTEX(sysfs_rename_mutex);
 DEFINE_SPINLOCK(sysfs_assoc_lock);
 
 static DEFINE_SPINLOCK(sysfs_ino_lock);
@@ -85,46 +84,6 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 }
 
 /**
- * sysfs_get_dentry - get dentry for the given sysfs_dirent
- * @sd: sysfs_dirent of interest
- *
- * Get dentry for @sd.  Dentry is looked up if currently not
- * present.  This function descends from the root looking up
- * dentry for each step.
- *
- * LOCKING:
- * mutex_lock(sysfs_rename_mutex)
- *
- * RETURNS:
- * Pointer to found dentry on success, ERR_PTR() value on error.
- */
-struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
-{
-	struct dentry *dentry = dget(sysfs_sb->s_root);
-
-	while (dentry->d_fsdata != sd) {
-		struct sysfs_dirent *cur;
-		struct dentry *parent;
-
-		/* find the first ancestor which hasn't been looked up */
-		cur = sd;
-		while (cur->s_parent != dentry->d_fsdata)
-			cur = cur->s_parent;
-
-		/* look it up */
-		parent = dentry;
-		mutex_lock(&parent->d_inode->i_mutex);
-		dentry = lookup_one_noperm(cur->s_name, parent);
-		mutex_unlock(&parent->d_inode->i_mutex);
-		dput(parent);
-
-		if (IS_ERR(dentry))
-			break;
-	}
-	return dentry;
-}
-
-/**
  * sysfs_get_active - get an active reference to sysfs_dirent
  * @sd: sysfs_dirent to get an active reference to
  *
@@ -134,7 +93,7 @@ struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
  * RETURNS:
  * Pointer to @sd on success, NULL on failure.
  */
-static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
+struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 {
 	if (unlikely(!sd))
 		return NULL;
@@ -147,8 +106,10 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 		return NULL;
 
 	t = atomic_cmpxchg(&sd->s_active, v, v + 1);
-	if (likely(t == v))
+	if (likely(t == v)) {
+		rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
 		return sd;
+	}
 	if (t < 0)
 		return NULL;
 
@@ -163,7 +124,7 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
  * Put an active reference to @sd.  This function is noop if @sd
 * is NULL.
 */
-static void sysfs_put_active(struct sysfs_dirent *sd)
+void sysfs_put_active(struct sysfs_dirent *sd)
 {
 	struct completion *cmpl;
 	int v;
@@ -171,6 +132,7 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
 	if (unlikely(!sd))
 		return;
 
+	rwsem_release(&sd->dep_map, 1, _RET_IP_);
 	v = atomic_dec_return(&sd->s_active);
 	if (likely(v != SD_DEACTIVATED_BIAS))
 		return;
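The dep_map annotations teach lockdep to model the active-reference count as a reader-writer lock: every successful get is a read acquire, the put is the matching release, and draining the count in sysfs_deactivate() below is a write acquire. A hedged, self-contained sketch of that pattern with hypothetical names:

struct tracked_ref {
	atomic_t		count;
	struct lockdep_map	dep_map;
};

static int tracked_ref_get(struct tracked_ref *r)
{
	if (!atomic_inc_not_zero(&r->count))
		return 0;				/* already draining */
	rwsem_acquire_read(&r->dep_map, 0, 1, _RET_IP_);	/* "reader" side */
	return 1;
}

static void tracked_ref_put(struct tracked_ref *r)
{
	rwsem_release(&r->dep_map, 1, _RET_IP_);
	atomic_dec(&r->count);
}
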
@@ -183,45 +145,6 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
183} 145}
184 146
185/** 147/**
186 * sysfs_get_active_two - get active references to sysfs_dirent and parent
187 * @sd: sysfs_dirent of interest
188 *
189 * Get active reference to @sd and its parent. Parent's active
190 * reference is grabbed first. This function is noop if @sd is
191 * NULL.
192 *
193 * RETURNS:
194 * Pointer to @sd on success, NULL on failure.
195 */
196struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd)
197{
198 if (sd) {
199 if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent)))
200 return NULL;
201 if (unlikely(!sysfs_get_active(sd))) {
202 sysfs_put_active(sd->s_parent);
203 return NULL;
204 }
205 }
206 return sd;
207}
208
209/**
210 * sysfs_put_active_two - put active references to sysfs_dirent and parent
211 * @sd: sysfs_dirent of interest
212 *
213 * Put active references to @sd and its parent. This function is
214 * noop if @sd is NULL.
215 */
216void sysfs_put_active_two(struct sysfs_dirent *sd)
217{
218 if (sd) {
219 sysfs_put_active(sd);
220 sysfs_put_active(sd->s_parent);
221 }
222}
223
224/**
225 * sysfs_deactivate - deactivate sysfs_dirent 148 * sysfs_deactivate - deactivate sysfs_dirent
226 * @sd: sysfs_dirent to deactivate 149 * @sd: sysfs_dirent to deactivate
227 * 150 *
@@ -233,17 +156,27 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
233 int v; 156 int v;
234 157
235 BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); 158 BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
159
160 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
161 return;
162
236 sd->s_sibling = (void *)&wait; 163 sd->s_sibling = (void *)&wait;
237 164
165 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
238 /* atomic_add_return() is a mb(), put_active() will always see 166 /* atomic_add_return() is a mb(), put_active() will always see
239 * the updated sd->s_sibling. 167 * the updated sd->s_sibling.
240 */ 168 */
241 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); 169 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
242 170
243 if (v != SD_DEACTIVATED_BIAS) 171 if (v != SD_DEACTIVATED_BIAS) {
172 lock_contended(&sd->dep_map, _RET_IP_);
244 wait_for_completion(&wait); 173 wait_for_completion(&wait);
174 }
245 175
246 sd->s_sibling = NULL; 176 sd->s_sibling = NULL;
177
178 lock_acquired(&sd->dep_map, _RET_IP_);
179 rwsem_release(&sd->dep_map, 1, _RET_IP_);
247} 180}
248 181
249static int sysfs_alloc_ino(ino_t *pino) 182static int sysfs_alloc_ino(ino_t *pino)
@@ -298,7 +231,61 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
298 goto repeat; 231 goto repeat;
299} 232}
300 233
301static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) 234static int sysfs_dentry_delete(struct dentry *dentry)
235{
236 struct sysfs_dirent *sd = dentry->d_fsdata;
237 return !!(sd->s_flags & SYSFS_FLAG_REMOVED);
238}
239
240static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
241{
242 struct sysfs_dirent *sd = dentry->d_fsdata;
243 int is_dir;
244
245 mutex_lock(&sysfs_mutex);
246
247 /* The sysfs dirent has been deleted */
248 if (sd->s_flags & SYSFS_FLAG_REMOVED)
249 goto out_bad;
250
251 /* The sysfs dirent has been moved? */
252 if (dentry->d_parent->d_fsdata != sd->s_parent)
253 goto out_bad;
254
255 /* The sysfs dirent has been renamed */
256 if (strcmp(dentry->d_name.name, sd->s_name) != 0)
257 goto out_bad;
258
259 mutex_unlock(&sysfs_mutex);
260out_valid:
261 return 1;
262out_bad:
263 /* Remove the dentry from the dcache hashes.
264 * If this is a deleted dentry we use d_drop instead of d_delete
265 * so sysfs doesn't need to cope with negative dentries.
266 *
267 * If this is a dentry that has simply been renamed we
268 * use d_drop to remove it from the dcache lookup on its
269 * old parent. If this dentry persists later when a lookup
270 * is performed at its new name the dentry will be readded
271 * to the dcache hashes.
272 */
273 is_dir = (sysfs_type(sd) == SYSFS_DIR);
274 mutex_unlock(&sysfs_mutex);
275 if (is_dir) {
276 /* If we have submounts we must allow the vfs caches
277 * to lie about the state of the filesystem to prevent
278 * leaks and other nasty things.
279 */
280 if (have_submounts(dentry))
281 goto out_valid;
282 shrink_dcache_parent(dentry);
283 }
284 d_drop(dentry);
285 return 0;
286}
287
288static void sysfs_dentry_iput(struct dentry *dentry, struct inode *inode)
302{ 289{
303 struct sysfs_dirent * sd = dentry->d_fsdata; 290 struct sysfs_dirent * sd = dentry->d_fsdata;
304 291
@@ -307,7 +294,9 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
307} 294}
308 295
309static const struct dentry_operations sysfs_dentry_ops = { 296static const struct dentry_operations sysfs_dentry_ops = {
310 .d_iput = sysfs_d_iput, 297 .d_revalidate = sysfs_dentry_revalidate,
298 .d_delete = sysfs_dentry_delete,
299 .d_iput = sysfs_dentry_iput,
311}; 300};
312 301
313struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) 302struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
@@ -344,12 +333,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
344 return NULL; 333 return NULL;
345} 334}
346 335
347static int sysfs_ilookup_test(struct inode *inode, void *arg)
348{
349 struct sysfs_dirent *sd = arg;
350 return inode->i_ino == sd->s_ino;
351}
352
353/** 336/**
354 * sysfs_addrm_start - prepare for sysfs_dirent add/remove 337 * sysfs_addrm_start - prepare for sysfs_dirent add/remove
355 * @acxt: pointer to sysfs_addrm_cxt to be used 338 * @acxt: pointer to sysfs_addrm_cxt to be used
@@ -357,47 +340,20 @@ static int sysfs_ilookup_test(struct inode *inode, void *arg)
357 * 340 *
358 * This function is called when the caller is about to add or 341 * This function is called when the caller is about to add or
359 * remove sysfs_dirent under @parent_sd. This function acquires 342 * remove sysfs_dirent under @parent_sd. This function acquires
360 * sysfs_mutex, grabs inode for @parent_sd if available and lock 343 * sysfs_mutex. @acxt is used to keep and pass context to
361 * i_mutex of it. @acxt is used to keep and pass context to
362 * other addrm functions. 344 * other addrm functions.
363 * 345 *
364 * LOCKING: 346 * LOCKING:
365 * Kernel thread context (may sleep). sysfs_mutex is locked on 347 * Kernel thread context (may sleep). sysfs_mutex is locked on
366 * return. i_mutex of parent inode is locked on return if 348 * return.
367 * available.
368 */ 349 */
369void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, 350void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
370 struct sysfs_dirent *parent_sd) 351 struct sysfs_dirent *parent_sd)
371{ 352{
372 struct inode *inode;
373
374 memset(acxt, 0, sizeof(*acxt)); 353 memset(acxt, 0, sizeof(*acxt));
375 acxt->parent_sd = parent_sd; 354 acxt->parent_sd = parent_sd;
376 355
377 /* Lookup parent inode. inode initialization is protected by
378 * sysfs_mutex, so inode existence can be determined by
379 * looking up inode while holding sysfs_mutex.
380 */
381 mutex_lock(&sysfs_mutex); 356 mutex_lock(&sysfs_mutex);
382
383 inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
384 parent_sd);
385 if (inode) {
386 WARN_ON(inode->i_state & I_NEW);
387
388 /* parent inode available */
389 acxt->parent_inode = inode;
390
391 /* sysfs_mutex is below i_mutex in lock hierarchy.
392 * First, trylock i_mutex. If fails, unlock
393 * sysfs_mutex and lock them in order.
394 */
395 if (!mutex_trylock(&inode->i_mutex)) {
396 mutex_unlock(&sysfs_mutex);
397 mutex_lock(&inode->i_mutex);
398 mutex_lock(&sysfs_mutex);
399 }
400 }
401} 357}
402 358
403/** 359/**
@@ -422,18 +378,22 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
422 */ 378 */
423int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) 379int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
424{ 380{
381 struct sysfs_inode_attrs *ps_iattr;
382
425 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) 383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
426 return -EEXIST; 384 return -EEXIST;
427 385
428 sd->s_parent = sysfs_get(acxt->parent_sd); 386 sd->s_parent = sysfs_get(acxt->parent_sd);
429 387
430 if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
431 inc_nlink(acxt->parent_inode);
432
433 acxt->cnt++;
434
435 sysfs_link_sibling(sd); 388 sysfs_link_sibling(sd);
436 389
390 /* Update timestamps on the parent */
391 ps_iattr = acxt->parent_sd->s_iattr;
392 if (ps_iattr) {
393 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
394 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
395 }
396
437 return 0; 397 return 0;
438} 398}
439 399
@@ -512,70 +472,22 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
512 */ 472 */
513void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) 473void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
514{ 474{
475 struct sysfs_inode_attrs *ps_iattr;
476
515 BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED); 477 BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED);
516 478
517 sysfs_unlink_sibling(sd); 479 sysfs_unlink_sibling(sd);
518 480
481 /* Update timestamps on the parent */
482 ps_iattr = acxt->parent_sd->s_iattr;
483 if (ps_iattr) {
484 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
485 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
486 }
487
519 sd->s_flags |= SYSFS_FLAG_REMOVED; 488 sd->s_flags |= SYSFS_FLAG_REMOVED;
520 sd->s_sibling = acxt->removed; 489 sd->s_sibling = acxt->removed;
521 acxt->removed = sd; 490 acxt->removed = sd;
522
523 if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
524 drop_nlink(acxt->parent_inode);
525
526 acxt->cnt++;
527}
528
529/**
530 * sysfs_drop_dentry - drop dentry for the specified sysfs_dirent
531 * @sd: target sysfs_dirent
532 *
533 * Drop dentry for @sd. @sd must have been unlinked from its
534 * parent on entry to this function such that it can't be looked
535 * up anymore.
536 */
537static void sysfs_drop_dentry(struct sysfs_dirent *sd)
538{
539 struct inode *inode;
540 struct dentry *dentry;
541
542 inode = ilookup(sysfs_sb, sd->s_ino);
543 if (!inode)
544 return;
545
546 /* Drop any existing dentries associated with sd.
547 *
548 * For the dentry to be properly freed we need to grab a
549 * reference to the dentry under the dcache lock, unhash it,
550 * and then put it. The playing with the dentry count allows
551 * dput to immediately free the dentry if it is not in use.
552 */
553repeat:
554 spin_lock(&dcache_lock);
555 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
556 if (d_unhashed(dentry))
557 continue;
558 dget_locked(dentry);
559 spin_lock(&dentry->d_lock);
560 __d_drop(dentry);
561 spin_unlock(&dentry->d_lock);
562 spin_unlock(&dcache_lock);
563 dput(dentry);
564 goto repeat;
565 }
566 spin_unlock(&dcache_lock);
567
568 /* adjust nlink and update timestamp */
569 mutex_lock(&inode->i_mutex);
570
571 inode->i_ctime = CURRENT_TIME;
572 drop_nlink(inode);
573 if (sysfs_type(sd) == SYSFS_DIR)
574 drop_nlink(inode);
575
576 mutex_unlock(&inode->i_mutex);
577
578 iput(inode);
579} 491}
580 492
581/** 493/**
@@ -584,25 +496,15 @@ repeat:
584 * 496 *
585 * Finish up sysfs_dirent add/remove. Resources acquired by 497 * Finish up sysfs_dirent add/remove. Resources acquired by
586 * sysfs_addrm_start() are released and removed sysfs_dirents are 498 * sysfs_addrm_start() are released and removed sysfs_dirents are
587 * cleaned up. Timestamps on the parent inode are updated. 499 * cleaned up.
588 * 500 *
589 * LOCKING: 501 * LOCKING:
590 * All mutexes acquired by sysfs_addrm_start() are released. 502 * sysfs_mutex is released.
591 */ 503 */
592void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) 504void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
593{ 505{
594 /* release resources acquired by sysfs_addrm_start() */ 506 /* release resources acquired by sysfs_addrm_start() */
595 mutex_unlock(&sysfs_mutex); 507 mutex_unlock(&sysfs_mutex);
596 if (acxt->parent_inode) {
597 struct inode *inode = acxt->parent_inode;
598
599 /* if added/removed, update timestamps on the parent */
600 if (acxt->cnt)
601 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
602
603 mutex_unlock(&inode->i_mutex);
604 iput(inode);
605 }
606 508
607 /* kill removed sysfs_dirents */ 509 /* kill removed sysfs_dirents */
608 while (acxt->removed) { 510 while (acxt->removed) {
@@ -611,7 +513,6 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
611 acxt->removed = sd->s_sibling; 513 acxt->removed = sd->s_sibling;
612 sd->s_sibling = NULL; 514 sd->s_sibling = NULL;
613 515
614 sysfs_drop_dentry(sd);
615 sysfs_deactivate(sd); 516 sysfs_deactivate(sd);
616 unmap_bin_file(sd); 517 unmap_bin_file(sd);
617 sysfs_put(sd); 518 sysfs_put(sd);
@@ -744,17 +645,22 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
744 } 645 }
745 646
746 /* attach dentry and inode */ 647 /* attach dentry and inode */
747 inode = sysfs_get_inode(sd); 648 inode = sysfs_get_inode(dir->i_sb, sd);
748 if (!inode) { 649 if (!inode) {
749 ret = ERR_PTR(-ENOMEM); 650 ret = ERR_PTR(-ENOMEM);
750 goto out_unlock; 651 goto out_unlock;
751 } 652 }
752 653
753 /* instantiate and hash dentry */ 654 /* instantiate and hash dentry */
754 dentry->d_op = &sysfs_dentry_ops; 655 ret = d_find_alias(inode);
755 dentry->d_fsdata = sysfs_get(sd); 656 if (!ret) {
756 d_instantiate(dentry, inode); 657 dentry->d_op = &sysfs_dentry_ops;
757 d_rehash(dentry); 658 dentry->d_fsdata = sysfs_get(sd);
659 d_add(dentry, inode);
660 } else {
661 d_move(ret, dentry);
662 iput(inode);
663 }
758 664
759 out_unlock: 665 out_unlock:
760 mutex_unlock(&sysfs_mutex); 666 mutex_unlock(&sysfs_mutex);
@@ -763,7 +669,9 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
763 669
764const struct inode_operations sysfs_dir_inode_operations = { 670const struct inode_operations sysfs_dir_inode_operations = {
765 .lookup = sysfs_lookup, 671 .lookup = sysfs_lookup,
672 .permission = sysfs_permission,
766 .setattr = sysfs_setattr, 673 .setattr = sysfs_setattr,
674 .getattr = sysfs_getattr,
767 .setxattr = sysfs_setxattr, 675 .setxattr = sysfs_setxattr,
768}; 676};
769 677
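The lookup rework above stops unconditionally instantiating a fresh dentry and
instead reuses an existing alias when one is already attached to the inode. A
minimal sketch of that idiom under the dcache API of this era (names
hypothetical, locking and error handling elided):

        static struct dentry *lookup_reuse_alias(struct dentry *dentry,
                                                 struct inode *inode)
        {
                struct dentry *alias;

                /* Is some dentry already bound to this inode? */
                alias = d_find_alias(inode);
                if (!alias) {
                        /* No: bind and hash the dentry the VFS gave us;
                         * d_add() consumes our inode reference. */
                        d_add(dentry, inode);
                        return NULL;
                }
                /* Yes: move the existing alias into place. d_add() was
                 * never called, so drop the inode reference ourselves. */
                d_move(alias, dentry);
                iput(inode);
                return alias;
        }

Returning the alias rather than NULL tells the VFS to use it in place of the
dentry it passed in.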
@@ -826,141 +734,65 @@ void sysfs_remove_dir(struct kobject * kobj)
826 __sysfs_remove_dir(sd); 734 __sysfs_remove_dir(sd);
827} 735}
828 736
829int sysfs_rename_dir(struct kobject * kobj, const char *new_name) 737int sysfs_rename(struct sysfs_dirent *sd,
738 struct sysfs_dirent *new_parent_sd, const char *new_name)
830{ 739{
831 struct sysfs_dirent *sd = kobj->sd;
832 struct dentry *parent = NULL;
833 struct dentry *old_dentry = NULL, *new_dentry = NULL;
834 const char *dup_name = NULL; 740 const char *dup_name = NULL;
835 int error; 741 int error;
836 742
837 mutex_lock(&sysfs_rename_mutex); 743 mutex_lock(&sysfs_mutex);
838 744
839 error = 0; 745 error = 0;
840 if (strcmp(sd->s_name, new_name) == 0) 746 if ((sd->s_parent == new_parent_sd) &&
747 (strcmp(sd->s_name, new_name) == 0))
841 goto out; /* nothing to rename */ 748 goto out; /* nothing to rename */
842 749
843 /* get the original dentry */
844 old_dentry = sysfs_get_dentry(sd);
845 if (IS_ERR(old_dentry)) {
846 error = PTR_ERR(old_dentry);
847 old_dentry = NULL;
848 goto out;
849 }
850
851 parent = old_dentry->d_parent;
852
853 /* lock parent and get dentry for new name */
854 mutex_lock(&parent->d_inode->i_mutex);
855 mutex_lock(&sysfs_mutex);
856
857 error = -EEXIST; 750 error = -EEXIST;
858 if (sysfs_find_dirent(sd->s_parent, new_name)) 751 if (sysfs_find_dirent(new_parent_sd, new_name))
859 goto out_unlock; 752 goto out;
860
861 error = -ENOMEM;
862 new_dentry = d_alloc_name(parent, new_name);
863 if (!new_dentry)
864 goto out_unlock;
865 753
866 /* rename sysfs_dirent */ 754 /* rename sysfs_dirent */
867 error = -ENOMEM; 755 if (strcmp(sd->s_name, new_name) != 0) {
868 new_name = dup_name = kstrdup(new_name, GFP_KERNEL); 756 error = -ENOMEM;
869 if (!new_name) 757 new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
870 goto out_unlock; 758 if (!new_name)
871 759 goto out;
872 dup_name = sd->s_name; 760
873 sd->s_name = new_name; 761 dup_name = sd->s_name;
762 sd->s_name = new_name;
763 }
874 764
875 /* rename */ 765 /* Remove from old parent's list and insert into new parent's list. */
876 d_add(new_dentry, NULL); 766 if (sd->s_parent != new_parent_sd) {
877 d_move(old_dentry, new_dentry); 767 sysfs_unlink_sibling(sd);
768 sysfs_get(new_parent_sd);
769 sysfs_put(sd->s_parent);
770 sd->s_parent = new_parent_sd;
771 sysfs_link_sibling(sd);
772 }
878 773
879 error = 0; 774 error = 0;
880 out_unlock: 775 out:
881 mutex_unlock(&sysfs_mutex); 776 mutex_unlock(&sysfs_mutex);
882 mutex_unlock(&parent->d_inode->i_mutex);
883 kfree(dup_name); 777 kfree(dup_name);
884 dput(old_dentry);
885 dput(new_dentry);
886 out:
887 mutex_unlock(&sysfs_rename_mutex);
888 return error; 778 return error;
889} 779}
890 780
781int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
782{
783 return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name);
784}
785
891int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) 786int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
892{ 787{
893 struct sysfs_dirent *sd = kobj->sd; 788 struct sysfs_dirent *sd = kobj->sd;
894 struct sysfs_dirent *new_parent_sd; 789 struct sysfs_dirent *new_parent_sd;
895 struct dentry *old_parent, *new_parent = NULL;
896 struct dentry *old_dentry = NULL, *new_dentry = NULL;
897 int error;
898 790
899 mutex_lock(&sysfs_rename_mutex);
900 BUG_ON(!sd->s_parent); 791 BUG_ON(!sd->s_parent);
901 new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ? 792 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
902 new_parent_kobj->sd : &sysfs_root; 793 new_parent_kobj->sd : &sysfs_root;
903 794
904 error = 0; 795 return sysfs_rename(sd, new_parent_sd, sd->s_name);
905 if (sd->s_parent == new_parent_sd)
906 goto out; /* nothing to move */
907
908 /* get dentries */
909 old_dentry = sysfs_get_dentry(sd);
910 if (IS_ERR(old_dentry)) {
911 error = PTR_ERR(old_dentry);
912 old_dentry = NULL;
913 goto out;
914 }
915 old_parent = old_dentry->d_parent;
916
917 new_parent = sysfs_get_dentry(new_parent_sd);
918 if (IS_ERR(new_parent)) {
919 error = PTR_ERR(new_parent);
920 new_parent = NULL;
921 goto out;
922 }
923
924again:
925 mutex_lock(&old_parent->d_inode->i_mutex);
926 if (!mutex_trylock(&new_parent->d_inode->i_mutex)) {
927 mutex_unlock(&old_parent->d_inode->i_mutex);
928 goto again;
929 }
930 mutex_lock(&sysfs_mutex);
931
932 error = -EEXIST;
933 if (sysfs_find_dirent(new_parent_sd, sd->s_name))
934 goto out_unlock;
935
936 error = -ENOMEM;
937 new_dentry = d_alloc_name(new_parent, sd->s_name);
938 if (!new_dentry)
939 goto out_unlock;
940
941 error = 0;
942 d_add(new_dentry, NULL);
943 d_move(old_dentry, new_dentry);
944
945 /* Remove from old parent's list and insert into new parent's list. */
946 sysfs_unlink_sibling(sd);
947 sysfs_get(new_parent_sd);
948 drop_nlink(old_parent->d_inode);
949 sysfs_put(sd->s_parent);
950 sd->s_parent = new_parent_sd;
951 inc_nlink(new_parent->d_inode);
952 sysfs_link_sibling(sd);
953
954 out_unlock:
955 mutex_unlock(&sysfs_mutex);
956 mutex_unlock(&new_parent->d_inode->i_mutex);
957 mutex_unlock(&old_parent->d_inode->i_mutex);
958 out:
959 dput(new_parent);
960 dput(old_dentry);
961 dput(new_dentry);
962 mutex_unlock(&sysfs_rename_mutex);
963 return error;
964} 796}
965 797
966/* Relationship between s_mode and the DT_xxx types */ 798/* Relationship between s_mode and the DT_xxx types */
@@ -969,11 +801,46 @@ static inline unsigned char dt_type(struct sysfs_dirent *sd)
969 return (sd->s_mode >> 12) & 15; 801 return (sd->s_mode >> 12) & 15;
970} 802}
971 803
804static int sysfs_dir_release(struct inode *inode, struct file *filp)
805{
806 sysfs_put(filp->private_data);
807 return 0;
808}
809
810static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd,
811 ino_t ino, struct sysfs_dirent *pos)
812{
813 if (pos) {
814 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
815 pos->s_parent == parent_sd &&
816 ino == pos->s_ino;
817 sysfs_put(pos);
818 if (valid)
819 return pos;
820 }
821 pos = NULL;
822 if ((ino > 1) && (ino < INT_MAX)) {
823 pos = parent_sd->s_dir.children;
824 while (pos && (ino > pos->s_ino))
825 pos = pos->s_sibling;
826 }
827 return pos;
828}
829
830static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd,
831 ino_t ino, struct sysfs_dirent *pos)
832{
833 pos = sysfs_dir_pos(parent_sd, ino, pos);
834 if (pos)
835 pos = pos->s_sibling;
836 return pos;
837}
838
972static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 839static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
973{ 840{
974 struct dentry *dentry = filp->f_path.dentry; 841 struct dentry *dentry = filp->f_path.dentry;
975 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 842 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
976 struct sysfs_dirent *pos; 843 struct sysfs_dirent *pos = filp->private_data;
977 ino_t ino; 844 ino_t ino;
978 845
979 if (filp->f_pos == 0) { 846 if (filp->f_pos == 0) {
@@ -989,29 +856,31 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
989 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) 856 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
990 filp->f_pos++; 857 filp->f_pos++;
991 } 858 }
992 if ((filp->f_pos > 1) && (filp->f_pos < INT_MAX)) { 859 mutex_lock(&sysfs_mutex);
993 mutex_lock(&sysfs_mutex); 860 for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
994 861 pos;
995 /* Skip the dentries we have already reported */ 862 pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) {
996 pos = parent_sd->s_dir.children; 863 const char * name;
997 while (pos && (filp->f_pos > pos->s_ino)) 864 unsigned int type;
998 pos = pos->s_sibling; 865 int len, ret;
999 866
1000 for ( ; pos; pos = pos->s_sibling) { 867 name = pos->s_name;
1001 const char * name; 868 len = strlen(name);
1002 int len; 869 ino = pos->s_ino;
1003 870 type = dt_type(pos);
1004 name = pos->s_name; 871 filp->f_pos = ino;
1005 len = strlen(name); 872 filp->private_data = sysfs_get(pos);
1006 filp->f_pos = ino = pos->s_ino;
1007 873
1008 if (filldir(dirent, name, len, filp->f_pos, ino,
1009 dt_type(pos)) < 0)
1010 break;
1011 }
1012 if (!pos)
1013 filp->f_pos = INT_MAX;
1014 mutex_unlock(&sysfs_mutex); 874 mutex_unlock(&sysfs_mutex);
875 ret = filldir(dirent, name, len, filp->f_pos, ino, type);
876 mutex_lock(&sysfs_mutex);
877 if (ret < 0)
878 break;
879 }
880 mutex_unlock(&sysfs_mutex);
881 if ((filp->f_pos > 1) && !pos) { /* EOF */
882 filp->f_pos = INT_MAX;
883 filp->private_data = NULL;
1015 } 884 }
1016 return 0; 885 return 0;
1017} 886}
@@ -1020,5 +889,6 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
1020const struct file_operations sysfs_dir_operations = { 889const struct file_operations sysfs_dir_operations = {
1021 .read = generic_read_dir, 890 .read = generic_read_dir,
1022 .readdir = sysfs_readdir, 891 .readdir = sysfs_readdir,
892 .release = sysfs_dir_release,
1023 .llseek = generic_file_llseek, 893 .llseek = generic_file_llseek,
1024}; 894};
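The readdir loop above now drops sysfs_mutex around each filldir() call, which
may sleep while copying to userspace. The cursor pinned in filp->private_data
is what makes reacquiring the mutex safe: sysfs_dir_pos() revalidates it on
every re-entry. A condensed sketch of that revalidation, assuming only the
sysfs_dirent fields used above:

        /* A cached cursor may be trusted only if it has not been removed,
         * still hangs off the same parent, and still matches the f_pos
         * cookie (its inode number); otherwise the walk restarts from the
         * parent's child list. */
        static int cursor_still_valid(struct sysfs_dirent *parent_sd,
                                      ino_t ino, struct sysfs_dirent *pos)
        {
                return !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
                       pos->s_parent == parent_sd &&
                       pos->s_ino == ino;
        }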
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index f5ea4680f15f..e222b2582746 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -53,7 +53,7 @@ struct sysfs_buffer {
53 size_t count; 53 size_t count;
54 loff_t pos; 54 loff_t pos;
55 char * page; 55 char * page;
56 struct sysfs_ops * ops; 56 const struct sysfs_ops * ops;
57 struct mutex mutex; 57 struct mutex mutex;
58 int needs_read_fill; 58 int needs_read_fill;
59 int event; 59 int event;
@@ -75,7 +75,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
75{ 75{
76 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 76 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
77 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 77 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
78 struct sysfs_ops * ops = buffer->ops; 78 const struct sysfs_ops * ops = buffer->ops;
79 int ret = 0; 79 int ret = 0;
80 ssize_t count; 80 ssize_t count;
81 81
@@ -85,13 +85,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
85 return -ENOMEM; 85 return -ENOMEM;
86 86
87 /* need attr_sd for attr and ops, its parent for kobj */ 87 /* need attr_sd for attr and ops, its parent for kobj */
88 if (!sysfs_get_active_two(attr_sd)) 88 if (!sysfs_get_active(attr_sd))
89 return -ENODEV; 89 return -ENODEV;
90 90
91 buffer->event = atomic_read(&attr_sd->s_attr.open->event); 91 buffer->event = atomic_read(&attr_sd->s_attr.open->event);
92 count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page); 92 count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
93 93
94 sysfs_put_active_two(attr_sd); 94 sysfs_put_active(attr_sd);
95 95
96 /* 96 /*
97 * The code works fine with PAGE_SIZE return but it's likely to 97 * The code works fine with PAGE_SIZE return but it's likely to
@@ -199,16 +199,16 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
199{ 199{
200 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 200 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
201 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 201 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
202 struct sysfs_ops * ops = buffer->ops; 202 const struct sysfs_ops * ops = buffer->ops;
203 int rc; 203 int rc;
204 204
205 /* need attr_sd for attr and ops, its parent for kobj */ 205 /* need attr_sd for attr and ops, its parent for kobj */
206 if (!sysfs_get_active_two(attr_sd)) 206 if (!sysfs_get_active(attr_sd))
207 return -ENODEV; 207 return -ENODEV;
208 208
209 rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count); 209 rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
210 210
211 sysfs_put_active_two(attr_sd); 211 sysfs_put_active(attr_sd);
212 212
213 return rc; 213 return rc;
214} 214}
@@ -335,7 +335,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
335 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 335 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
336 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 336 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
337 struct sysfs_buffer *buffer; 337 struct sysfs_buffer *buffer;
338 struct sysfs_ops *ops; 338 const struct sysfs_ops *ops;
339 int error = -EACCES; 339 int error = -EACCES;
340 char *p; 340 char *p;
341 341
@@ -344,7 +344,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
344 memmove(last_sysfs_file, p, strlen(p) + 1); 344 memmove(last_sysfs_file, p, strlen(p) + 1);
345 345
346 /* need attr_sd for attr and ops, its parent for kobj */ 346 /* need attr_sd for attr and ops, its parent for kobj */
347 if (!sysfs_get_active_two(attr_sd)) 347 if (!sysfs_get_active(attr_sd))
348 return -ENODEV; 348 return -ENODEV;
349 349
350 /* every kobject with an attribute needs a ktype assigned */ 350 /* every kobject with an attribute needs a ktype assigned */
@@ -393,13 +393,13 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
393 goto err_free; 393 goto err_free;
394 394
395 /* open succeeded, put active references */ 395 /* open succeeded, put active references */
396 sysfs_put_active_two(attr_sd); 396 sysfs_put_active(attr_sd);
397 return 0; 397 return 0;
398 398
399 err_free: 399 err_free:
400 kfree(buffer); 400 kfree(buffer);
401 err_out: 401 err_out:
402 sysfs_put_active_two(attr_sd); 402 sysfs_put_active(attr_sd);
403 return error; 403 return error;
404} 404}
405 405
@@ -437,12 +437,12 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
437 struct sysfs_open_dirent *od = attr_sd->s_attr.open; 437 struct sysfs_open_dirent *od = attr_sd->s_attr.open;
438 438
439 /* need parent for the kobj, grab both */ 439 /* need parent for the kobj, grab both */
440 if (!sysfs_get_active_two(attr_sd)) 440 if (!sysfs_get_active(attr_sd))
441 goto trigger; 441 goto trigger;
442 442
443 poll_wait(filp, &od->poll, wait); 443 poll_wait(filp, &od->poll, wait);
444 444
445 sysfs_put_active_two(attr_sd); 445 sysfs_put_active(attr_sd);
446 446
447 if (buffer->event != atomic_read(&od->event)) 447 if (buffer->event != atomic_read(&od->event))
448 goto trigger; 448 goto trigger;
@@ -509,6 +509,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
509 if (!sd) 509 if (!sd)
510 return -ENOMEM; 510 return -ENOMEM;
511 sd->s_attr.attr = (void *)attr; 511 sd->s_attr.attr = (void *)attr;
512 sysfs_dirent_init_lockdep(sd);
512 513
513 sysfs_addrm_start(&acxt, dir_sd); 514 sysfs_addrm_start(&acxt, dir_sd);
514 rc = sysfs_add_one(&acxt, sd); 515 rc = sysfs_add_one(&acxt, sd);
@@ -542,6 +543,18 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
542 543
543} 544}
544 545
546int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
547{
548 int err = 0;
549 int i;
550
551 for (i = 0; ptr[i] && !err; i++)
552 err = sysfs_create_file(kobj, ptr[i]);
553 if (err)
554 while (--i >= 0)
555 sysfs_remove_file(kobj, ptr[i]);
556 return err;
557}
545 558
546/** 559/**
547 * sysfs_add_file_to_group - add an attribute file to a pre-existing group. 560 * sysfs_add_file_to_group - add an attribute file to a pre-existing group.
@@ -579,46 +592,23 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
579 */ 592 */
580int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) 593int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
581{ 594{
582 struct sysfs_dirent *victim_sd = NULL; 595 struct sysfs_dirent *sd;
583 struct dentry *victim = NULL;
584 struct inode * inode;
585 struct iattr newattrs; 596 struct iattr newattrs;
586 int rc; 597 int rc;
587 598
588 rc = -ENOENT; 599 mutex_lock(&sysfs_mutex);
589 victim_sd = sysfs_get_dirent(kobj->sd, attr->name);
590 if (!victim_sd)
591 goto out;
592 600
593 mutex_lock(&sysfs_rename_mutex); 601 rc = -ENOENT;
594 victim = sysfs_get_dentry(victim_sd); 602 sd = sysfs_find_dirent(kobj->sd, attr->name);
595 mutex_unlock(&sysfs_rename_mutex); 603 if (!sd)
596 if (IS_ERR(victim)) {
597 rc = PTR_ERR(victim);
598 victim = NULL;
599 goto out; 604 goto out;
600 }
601
602 inode = victim->d_inode;
603
604 mutex_lock(&inode->i_mutex);
605
606 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
607 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
608 newattrs.ia_ctime = current_fs_time(inode->i_sb);
609 rc = sysfs_setattr(victim, &newattrs);
610 605
611 if (rc == 0) { 606 newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO);
612 fsnotify_change(victim, newattrs.ia_valid); 607 newattrs.ia_valid = ATTR_MODE;
613 mutex_lock(&sysfs_mutex); 608 rc = sysfs_sd_setattr(sd, &newattrs);
614 victim_sd->s_mode = newattrs.ia_mode;
615 mutex_unlock(&sysfs_mutex);
616 }
617 609
618 mutex_unlock(&inode->i_mutex);
619 out: 610 out:
620 dput(victim); 611 mutex_unlock(&sysfs_mutex);
621 sysfs_put(victim_sd);
622 return rc; 612 return rc;
623} 613}
624EXPORT_SYMBOL_GPL(sysfs_chmod_file); 614EXPORT_SYMBOL_GPL(sysfs_chmod_file);
@@ -637,6 +627,12 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
637 sysfs_hash_and_remove(kobj->sd, attr->name); 627 sysfs_hash_and_remove(kobj->sd, attr->name);
638} 628}
639 629
630void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
631{
632 int i;
633 for (i = 0; ptr[i]; i++)
634 sysfs_remove_file(kobj, ptr[i]);
635}
640 636
641/** 637/**
642 * sysfs_remove_file_from_group - remove an attribute file from a group. 638 * sysfs_remove_file_from_group - remove an attribute file from a group.
@@ -755,3 +751,5 @@ EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
755 751
756EXPORT_SYMBOL_GPL(sysfs_create_file); 752EXPORT_SYMBOL_GPL(sysfs_create_file);
757EXPORT_SYMBOL_GPL(sysfs_remove_file); 753EXPORT_SYMBOL_GPL(sysfs_remove_file);
754EXPORT_SYMBOL_GPL(sysfs_remove_files);
755EXPORT_SYMBOL_GPL(sysfs_create_files);
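sysfs_create_files() walks a NULL-terminated array and removes the files it
already created if a later creation fails, so callers get all-or-nothing
semantics. A hedged usage sketch; the attribute objects are invented for
illustration, and reads/writes on them would be serviced by the owning
kobject's ktype sysfs_ops, which is not shown:

        static struct attribute ex_foo_attr = { .name = "foo", .mode = 0444 };
        static struct attribute ex_bar_attr = { .name = "bar", .mode = 0444 };

        static const struct attribute *ex_attrs[] = {
                &ex_foo_attr,
                &ex_bar_attr,
                NULL,           /* the array must be NULL-terminated */
        };

        /* On failure nothing is left behind in the directory. */
        error = sysfs_create_files(kobj, ex_attrs);
        ...
        sysfs_remove_files(kobj, ex_attrs);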
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e28cecf179f5..a4a0a9419711 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/xattr.h> 22#include <linux/xattr.h>
22#include <linux/security.h> 23#include <linux/security.h>
23#include "sysfs.h" 24#include "sysfs.h"
@@ -37,7 +38,9 @@ static struct backing_dev_info sysfs_backing_dev_info = {
37}; 38};
38 39
39static const struct inode_operations sysfs_inode_operations ={ 40static const struct inode_operations sysfs_inode_operations ={
41 .permission = sysfs_permission,
40 .setattr = sysfs_setattr, 42 .setattr = sysfs_setattr,
43 .getattr = sysfs_getattr,
41 .setxattr = sysfs_setxattr, 44 .setxattr = sysfs_setxattr,
42}; 45};
43 46
@@ -46,7 +49,7 @@ int __init sysfs_inode_init(void)
46 return bdi_init(&sysfs_backing_dev_info); 49 return bdi_init(&sysfs_backing_dev_info);
47} 50}
48 51
49struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) 52static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
50{ 53{
51 struct sysfs_inode_attrs *attrs; 54 struct sysfs_inode_attrs *attrs;
52 struct iattr *iattrs; 55 struct iattr *iattrs;
@@ -64,81 +67,101 @@ struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
64 67
65 return attrs; 68 return attrs;
66} 69}
67int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) 70
71int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr)
68{ 72{
69 struct inode * inode = dentry->d_inode;
70 struct sysfs_dirent * sd = dentry->d_fsdata;
71 struct sysfs_inode_attrs *sd_attrs; 73 struct sysfs_inode_attrs *sd_attrs;
72 struct iattr *iattrs; 74 struct iattr *iattrs;
73 unsigned int ia_valid = iattr->ia_valid; 75 unsigned int ia_valid = iattr->ia_valid;
76
77 sd_attrs = sd->s_iattr;
78
79 if (!sd_attrs) {
80 /* setting attributes for the first time, allocate now */
81 sd_attrs = sysfs_init_inode_attrs(sd);
82 if (!sd_attrs)
83 return -ENOMEM;
84 sd->s_iattr = sd_attrs;
85 }
86 /* attributes were changed at least once in past */
87 iattrs = &sd_attrs->ia_iattr;
88
89 if (ia_valid & ATTR_UID)
90 iattrs->ia_uid = iattr->ia_uid;
91 if (ia_valid & ATTR_GID)
92 iattrs->ia_gid = iattr->ia_gid;
93 if (ia_valid & ATTR_ATIME)
94 iattrs->ia_atime = iattr->ia_atime;
95 if (ia_valid & ATTR_MTIME)
96 iattrs->ia_mtime = iattr->ia_mtime;
97 if (ia_valid & ATTR_CTIME)
98 iattrs->ia_ctime = iattr->ia_ctime;
99 if (ia_valid & ATTR_MODE) {
100 umode_t mode = iattr->ia_mode;
101 iattrs->ia_mode = sd->s_mode = mode;
102 }
103 return 0;
104}
105
106int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
107{
108 struct inode *inode = dentry->d_inode;
109 struct sysfs_dirent *sd = dentry->d_fsdata;
74 int error; 110 int error;
75 111
76 if (!sd) 112 if (!sd)
77 return -EINVAL; 113 return -EINVAL;
78 114
79 sd_attrs = sd->s_iattr; 115 mutex_lock(&sysfs_mutex);
80
81 error = inode_change_ok(inode, iattr); 116 error = inode_change_ok(inode, iattr);
82 if (error) 117 if (error)
83 return error; 118 goto out;
84 119
85 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
86 121
87 error = inode_setattr(inode, iattr); 122 error = inode_setattr(inode, iattr);
88 if (error) 123 if (error)
89 return error; 124 goto out;
90 125
91 if (!sd_attrs) { 126 error = sysfs_sd_setattr(sd, iattr);
92 /* setting attributes for the first time, allocate now */ 127out:
93 sd_attrs = sysfs_init_inode_attrs(sd); 128 mutex_unlock(&sysfs_mutex);
94 if (!sd_attrs)
95 return -ENOMEM;
96 sd->s_iattr = sd_attrs;
97 } else {
98 /* attributes were changed at least once in past */
99 iattrs = &sd_attrs->ia_iattr;
100
101 if (ia_valid & ATTR_UID)
102 iattrs->ia_uid = iattr->ia_uid;
103 if (ia_valid & ATTR_GID)
104 iattrs->ia_gid = iattr->ia_gid;
105 if (ia_valid & ATTR_ATIME)
106 iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
107 inode->i_sb->s_time_gran);
108 if (ia_valid & ATTR_MTIME)
109 iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
110 inode->i_sb->s_time_gran);
111 if (ia_valid & ATTR_CTIME)
112 iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
113 inode->i_sb->s_time_gran);
114 if (ia_valid & ATTR_MODE) {
115 umode_t mode = iattr->ia_mode;
116
117 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
118 mode &= ~S_ISGID;
119 iattrs->ia_mode = sd->s_mode = mode;
120 }
121 }
122 return error; 129 return error;
123} 130}
124 131
132static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len)
133{
134 struct sysfs_inode_attrs *iattrs;
135 void *old_secdata;
136 size_t old_secdata_len;
137
138 iattrs = sd->s_iattr;
139 if (!iattrs)
140 iattrs = sysfs_init_inode_attrs(sd);
141 if (!iattrs)
142 return -ENOMEM;
143
144 old_secdata = iattrs->ia_secdata;
145 old_secdata_len = iattrs->ia_secdata_len;
146
147 iattrs->ia_secdata = *secdata;
148 iattrs->ia_secdata_len = *secdata_len;
149
150 *secdata = old_secdata;
151 *secdata_len = old_secdata_len;
152 return 0;
153}
154
125int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 155int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
126 size_t size, int flags) 156 size_t size, int flags)
127{ 157{
128 struct sysfs_dirent *sd = dentry->d_fsdata; 158 struct sysfs_dirent *sd = dentry->d_fsdata;
129 struct sysfs_inode_attrs *iattrs;
130 void *secdata; 159 void *secdata;
131 int error; 160 int error;
132 u32 secdata_len = 0; 161 u32 secdata_len = 0;
133 162
134 if (!sd) 163 if (!sd)
135 return -EINVAL; 164 return -EINVAL;
136 if (!sd->s_iattr)
137 sd->s_iattr = sysfs_init_inode_attrs(sd);
138 if (!sd->s_iattr)
139 return -ENOMEM;
140
141 iattrs = sd->s_iattr;
142 165
143 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { 166 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
144 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; 167 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
@@ -150,12 +173,13 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
150 &secdata, &secdata_len); 173 &secdata, &secdata_len);
151 if (error) 174 if (error)
152 goto out; 175 goto out;
153 if (iattrs->ia_secdata)
154 security_release_secctx(iattrs->ia_secdata,
155 iattrs->ia_secdata_len);
156 iattrs->ia_secdata = secdata;
157 iattrs->ia_secdata_len = secdata_len;
158 176
177 mutex_lock(&sysfs_mutex);
178 error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len);
179 mutex_unlock(&sysfs_mutex);
180
181 if (secdata)
182 security_release_secctx(secdata, secdata_len);
159 } else 183 } else
160 return -EINVAL; 184 return -EINVAL;
161out: 185out:
@@ -170,7 +194,6 @@ static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
170 194
171static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) 195static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
172{ 196{
173 inode->i_mode = iattr->ia_mode;
174 inode->i_uid = iattr->ia_uid; 197 inode->i_uid = iattr->ia_uid;
175 inode->i_gid = iattr->ia_gid; 198 inode->i_gid = iattr->ia_gid;
176 inode->i_atime = iattr->ia_atime; 199 inode->i_atime = iattr->ia_atime;
@@ -178,17 +201,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
178 inode->i_ctime = iattr->ia_ctime; 201 inode->i_ctime = iattr->ia_ctime;
179} 202}
180 203
181
182/*
183 * sysfs has a different i_mutex lock order behavior for i_mutex than other
184 * filesystems; sysfs i_mutex is called in many places with subsystem locks
185 * held. At the same time, many of the VFS locking rules do not apply to
186 * sysfs at all (cross directory rename for example). To untangle this mess
187 * (which gives false positives in lockdep), we're giving sysfs inodes their
188 * own class for i_mutex.
189 */
190static struct lock_class_key sysfs_inode_imutex_key;
191
192static int sysfs_count_nlink(struct sysfs_dirent *sd) 204static int sysfs_count_nlink(struct sysfs_dirent *sd)
193{ 205{
194 struct sysfs_dirent *child; 206 struct sysfs_dirent *child;
@@ -201,38 +213,55 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
201 return nr + 2; 213 return nr + 2;
202} 214}
203 215
216static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
217{
218 struct sysfs_inode_attrs *iattrs = sd->s_iattr;
219
220 inode->i_mode = sd->s_mode;
221 if (iattrs) {
222 /* sysfs_dirent has non-default attributes
223 * get them from persistent copy in sysfs_dirent
224 */
225 set_inode_attr(inode, &iattrs->ia_iattr);
226 security_inode_notifysecctx(inode,
227 iattrs->ia_secdata,
228 iattrs->ia_secdata_len);
229 }
230
231 if (sysfs_type(sd) == SYSFS_DIR)
232 inode->i_nlink = sysfs_count_nlink(sd);
233}
234
235int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
236{
237 struct sysfs_dirent *sd = dentry->d_fsdata;
238 struct inode *inode = dentry->d_inode;
239
240 mutex_lock(&sysfs_mutex);
241 sysfs_refresh_inode(sd, inode);
242 mutex_unlock(&sysfs_mutex);
243
244 generic_fillattr(inode, stat);
245 return 0;
246}
247
204static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) 248static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
205{ 249{
206 struct bin_attribute *bin_attr; 250 struct bin_attribute *bin_attr;
207 struct sysfs_inode_attrs *iattrs;
208 251
209 inode->i_private = sysfs_get(sd); 252 inode->i_private = sysfs_get(sd);
210 inode->i_mapping->a_ops = &sysfs_aops; 253 inode->i_mapping->a_ops = &sysfs_aops;
211 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 254 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
212 inode->i_op = &sysfs_inode_operations; 255 inode->i_op = &sysfs_inode_operations;
213 inode->i_ino = sd->s_ino;
214 lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
215 256
216 iattrs = sd->s_iattr; 257 set_default_inode_attr(inode, sd->s_mode);
217 if (iattrs) { 258 sysfs_refresh_inode(sd, inode);
218 /* sysfs_dirent has non-default attributes
219 * get them for the new inode from persistent copy
220 * in sysfs_dirent
221 */
222 set_inode_attr(inode, &iattrs->ia_iattr);
223 if (iattrs->ia_secdata)
224 security_inode_notifysecctx(inode,
225 iattrs->ia_secdata,
226 iattrs->ia_secdata_len);
227 } else
228 set_default_inode_attr(inode, sd->s_mode);
229 259
230 /* initialize inode according to type */ 260 /* initialize inode according to type */
231 switch (sysfs_type(sd)) { 261 switch (sysfs_type(sd)) {
232 case SYSFS_DIR: 262 case SYSFS_DIR:
233 inode->i_op = &sysfs_dir_inode_operations; 263 inode->i_op = &sysfs_dir_inode_operations;
234 inode->i_fop = &sysfs_dir_operations; 264 inode->i_fop = &sysfs_dir_operations;
235 inode->i_nlink = sysfs_count_nlink(sd);
236 break; 265 break;
237 case SYSFS_KOBJ_ATTR: 266 case SYSFS_KOBJ_ATTR:
238 inode->i_size = PAGE_SIZE; 267 inode->i_size = PAGE_SIZE;
@@ -255,6 +284,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
255 284
256/** 285/**
257 * sysfs_get_inode - get inode for sysfs_dirent 286 * sysfs_get_inode - get inode for sysfs_dirent
287 * @sb: super block
258 * @sd: sysfs_dirent to allocate inode for 288 * @sd: sysfs_dirent to allocate inode for
259 * 289 *
260 * Get inode for @sd. If such inode doesn't exist, a new inode 290 * Get inode for @sd. If such inode doesn't exist, a new inode
@@ -267,11 +297,11 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
267 * RETURNS: 297 * RETURNS:
268 * Pointer to allocated inode on success, NULL on failure. 298 * Pointer to allocated inode on success, NULL on failure.
269 */ 299 */
270struct inode * sysfs_get_inode(struct sysfs_dirent *sd) 300struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
271{ 301{
272 struct inode *inode; 302 struct inode *inode;
273 303
274 inode = iget_locked(sysfs_sb, sd->s_ino); 304 inode = iget_locked(sb, sd->s_ino);
275 if (inode && (inode->i_state & I_NEW)) 305 if (inode && (inode->i_state & I_NEW))
276 sysfs_init_inode(sd, inode); 306 sysfs_init_inode(sd, inode);
277 307
@@ -315,3 +345,14 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
315 else 345 else
316 return -ENOENT; 346 return -ENOENT;
317} 347}
348
349int sysfs_permission(struct inode *inode, int mask)
350{
351 struct sysfs_dirent *sd = inode->i_private;
352
353 mutex_lock(&sysfs_mutex);
354 sysfs_refresh_inode(sd, inode);
355 mutex_unlock(&sysfs_mutex);
356
357 return generic_permission(inode, mask, NULL);
358}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 49749955ccaf..776137828dca 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -18,12 +18,12 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/magic.h> 20#include <linux/magic.h>
21#include <linux/slab.h>
21 22
22#include "sysfs.h" 23#include "sysfs.h"
23 24
24 25
25static struct vfsmount *sysfs_mount; 26static struct vfsmount *sysfs_mount;
26struct super_block * sysfs_sb = NULL;
27struct kmem_cache *sysfs_dir_cachep; 27struct kmem_cache *sysfs_dir_cachep;
28 28
29static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
@@ -50,11 +50,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
50 sb->s_magic = SYSFS_MAGIC; 50 sb->s_magic = SYSFS_MAGIC;
51 sb->s_op = &sysfs_ops; 51 sb->s_op = &sysfs_ops;
52 sb->s_time_gran = 1; 52 sb->s_time_gran = 1;
53 sysfs_sb = sb;
54 53
55 /* get root inode, initialize and unlock it */ 54 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex); 55 mutex_lock(&sysfs_mutex);
57 inode = sysfs_get_inode(&sysfs_root); 56 inode = sysfs_get_inode(sb, &sysfs_root);
58 mutex_unlock(&sysfs_mutex); 57 mutex_unlock(&sysfs_mutex);
59 if (!inode) { 58 if (!inode) {
60 pr_debug("sysfs: could not get root inode\n"); 59 pr_debug("sysfs: could not get root inode\n");
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c5081ad77026..b93ec51fa7ac 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/gfp.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/kobject.h> 17#include <linux/kobject.h>
@@ -123,6 +124,44 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
123 sysfs_hash_and_remove(parent_sd, name); 124 sysfs_hash_and_remove(parent_sd, name);
124} 125}
125 126
127/**
128 * sysfs_rename_link - rename symlink in object's directory.
129 * @kobj: object we're acting for.
130 * @targ: object we're pointing to.
131 * @old: previous name of the symlink.
132 * @new: new name of the symlink.
133 *
134 * A helper function for the common rename symlink idiom.
135 */
136int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
137 const char *old, const char *new)
138{
139 struct sysfs_dirent *parent_sd, *sd = NULL;
140 int result;
141
142 if (!kobj)
143 parent_sd = &sysfs_root;
144 else
145 parent_sd = kobj->sd;
146
147 result = -ENOENT;
148 sd = sysfs_get_dirent(parent_sd, old);
149 if (!sd)
150 goto out;
151
152 result = -EINVAL;
153 if (sysfs_type(sd) != SYSFS_KOBJ_LINK)
154 goto out;
155 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
156 goto out;
157
158 result = sysfs_rename(sd, parent_sd, new);
159
160out:
161 sysfs_put(sd);
162 return result;
163}
164
126static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, 165static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
127 struct sysfs_dirent *target_sd, char *path) 166 struct sysfs_dirent *target_sd, char *path)
128{ 167{
@@ -210,10 +249,13 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
210} 249}
211 250
212const struct inode_operations sysfs_symlink_inode_operations = { 251const struct inode_operations sysfs_symlink_inode_operations = {
213 .setxattr = sysfs_setxattr, 252 .setxattr = sysfs_setxattr,
214 .readlink = generic_readlink, 253 .readlink = generic_readlink,
215 .follow_link = sysfs_follow_link, 254 .follow_link = sysfs_follow_link,
216 .put_link = sysfs_put_link, 255 .put_link = sysfs_put_link,
256 .setattr = sysfs_setattr,
257 .getattr = sysfs_getattr,
258 .permission = sysfs_permission,
217}; 259};
218 260
219 261
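sysfs_rename_link() above packages the rename-a-symlink idiom (look up the old
link, verify it really points at the expected target, then rename it in place)
so callers no longer have to remove and recreate the link. A hedged example of
the intended call, with invented kobject names:

        /* A device named "olddev" was renamed; keep the symlink in the
         * parent's directory pointing at the same target kobject. */
        error = sysfs_rename_link(parent_kobj, target_kobj,
                                  "olddev", "newdev");
        /* -ENOENT: no such link; -EINVAL: link exists but does not point
         * at target_kobj; -EEXIST: "newdev" is already taken. */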
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index af4c4e7482ac..30f5a44fb5d3 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,7 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/lockdep.h>
11#include <linux/fs.h> 12#include <linux/fs.h>
12 13
13struct sysfs_open_dirent; 14struct sysfs_open_dirent;
@@ -50,6 +51,9 @@ struct sysfs_inode_attrs {
50struct sysfs_dirent { 51struct sysfs_dirent {
51 atomic_t s_count; 52 atomic_t s_count;
52 atomic_t s_active; 53 atomic_t s_active;
54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55 struct lockdep_map dep_map;
56#endif
53 struct sysfs_dirent *s_parent; 57 struct sysfs_dirent *s_parent;
54 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
55 const char *s_name; 59 const char *s_name;
@@ -62,8 +66,8 @@ struct sysfs_dirent {
62 }; 66 };
63 67
64 unsigned int s_flags; 68 unsigned int s_flags;
69 unsigned short s_mode;
65 ino_t s_ino; 70 ino_t s_ino;
66 umode_t s_mode;
67 struct sysfs_inode_attrs *s_iattr; 71 struct sysfs_inode_attrs *s_iattr;
68}; 72};
69 73
@@ -75,6 +79,7 @@ struct sysfs_dirent {
75#define SYSFS_KOBJ_BIN_ATTR 0x0004 79#define SYSFS_KOBJ_BIN_ATTR 0x0004
76#define SYSFS_KOBJ_LINK 0x0008 80#define SYSFS_KOBJ_LINK 0x0008
77#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
82#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
78 83
79#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 84#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK
80#define SYSFS_FLAG_REMOVED 0x0200 85#define SYSFS_FLAG_REMOVED 0x0200
@@ -84,36 +89,46 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
84 return sd->s_flags & SYSFS_TYPE_MASK; 89 return sd->s_flags & SYSFS_TYPE_MASK;
85} 90}
86 91
92#ifdef CONFIG_DEBUG_LOCK_ALLOC
93#define sysfs_dirent_init_lockdep(sd) \
94do { \
95 struct attribute *attr = sd->s_attr.attr; \
96 struct lock_class_key *key = attr->key; \
97 if (!key) \
98 key = &attr->skey; \
99 \
100 lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
101} while(0)
102#else
103#define sysfs_dirent_init_lockdep(sd) do {} while(0)
104#endif
105
87/* 106/*
88 * Context structure to be used while adding/removing nodes. 107 * Context structure to be used while adding/removing nodes.
89 */ 108 */
90struct sysfs_addrm_cxt { 109struct sysfs_addrm_cxt {
91 struct sysfs_dirent *parent_sd; 110 struct sysfs_dirent *parent_sd;
92 struct inode *parent_inode;
93 struct sysfs_dirent *removed; 111 struct sysfs_dirent *removed;
94 int cnt;
95}; 112};
96 113
97/* 114/*
98 * mount.c 115 * mount.c
99 */ 116 */
100extern struct sysfs_dirent sysfs_root; 117extern struct sysfs_dirent sysfs_root;
101extern struct super_block *sysfs_sb;
102extern struct kmem_cache *sysfs_dir_cachep; 118extern struct kmem_cache *sysfs_dir_cachep;
103 119
104/* 120/*
105 * dir.c 121 * dir.c
106 */ 122 */
107extern struct mutex sysfs_mutex; 123extern struct mutex sysfs_mutex;
108extern struct mutex sysfs_rename_mutex;
109extern spinlock_t sysfs_assoc_lock; 124extern spinlock_t sysfs_assoc_lock;
110 125
111extern const struct file_operations sysfs_dir_operations; 126extern const struct file_operations sysfs_dir_operations;
112extern const struct inode_operations sysfs_dir_inode_operations; 127extern const struct inode_operations sysfs_dir_inode_operations;
113 128
114struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); 129struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
115struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); 130struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
116void sysfs_put_active_two(struct sysfs_dirent *sd); 131void sysfs_put_active(struct sysfs_dirent *sd);
117void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, 132void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
118 struct sysfs_dirent *parent_sd); 133 struct sysfs_dirent *parent_sd);
119int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); 134int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
@@ -133,6 +148,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
133 struct sysfs_dirent **p_sd); 148 struct sysfs_dirent **p_sd);
134void sysfs_remove_subdir(struct sysfs_dirent *sd); 149void sysfs_remove_subdir(struct sysfs_dirent *sd);
135 150
151int sysfs_rename(struct sysfs_dirent *sd,
152 struct sysfs_dirent *new_parent_sd, const char *new_name);
153
136static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) 154static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
137{ 155{
138 if (sd) { 156 if (sd) {
@@ -153,9 +171,12 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
153/* 171/*
154 * inode.c 172 * inode.c
155 */ 173 */
156struct inode *sysfs_get_inode(struct sysfs_dirent *sd); 174struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
157void sysfs_delete_inode(struct inode *inode); 175void sysfs_delete_inode(struct inode *inode);
176int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
177int sysfs_permission(struct inode *inode, int mask);
158int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 178int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
179int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
159int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 180int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
160 size_t size, int flags); 181 size_t size, int flags);
161int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 182int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9824743832a7..4573734d723d 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -26,6 +26,7 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/vfs.h> 28#include <linux/vfs.h>
29#include <linux/writeback.h>
29#include <linux/namei.h> 30#include <linux/namei.h>
30#include <asm/byteorder.h> 31#include <asm/byteorder.h>
31#include "sysv.h" 32#include "sysv.h"
@@ -246,7 +247,7 @@ bad_inode:
246 return ERR_PTR(-EIO); 247 return ERR_PTR(-EIO);
247} 248}
248 249
249int sysv_write_inode(struct inode *inode, int wait) 250static int __sysv_write_inode(struct inode *inode, int wait)
250{ 251{
251 struct super_block * sb = inode->i_sb; 252 struct super_block * sb = inode->i_sb;
252 struct sysv_sb_info * sbi = SYSV_SB(sb); 253 struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -296,9 +297,14 @@ int sysv_write_inode(struct inode *inode, int wait)
296 return 0; 297 return 0;
297} 298}
298 299
300int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
301{
302 return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
303}
304
299int sysv_sync_inode(struct inode *inode) 305int sysv_sync_inode(struct inode *inode)
300{ 306{
301 return sysv_write_inode(inode, 1); 307 return __sysv_write_inode(inode, 1);
302} 308}
303 309
304static void sysv_delete_inode(struct inode *inode) 310static void sysv_delete_inode(struct inode *inode)
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 53786eb5cf60..94cb9b4d76c2 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
142 142
143/* inode.c */ 143/* inode.c */
144extern struct inode *sysv_iget(struct super_block *, unsigned int); 144extern struct inode *sysv_iget(struct super_block *, unsigned int);
145extern int sysv_write_inode(struct inode *, int); 145extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
146extern int sysv_sync_inode(struct inode *); 146extern int sysv_sync_inode(struct inode *);
147extern void sysv_set_inode(struct inode *, dev_t); 147extern void sysv_set_inode(struct inode *, dev_t);
148extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); 148extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
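The sysv conversion above is the standard recipe for the ->write_inode()
signature change: keep the old int-taking body as a static helper and derive
the synchronicity from the writeback control. A generic sketch of the same
adaptation (function names hypothetical):

        #include <linux/fs.h>
        #include <linux/writeback.h>

        static int __example_write_inode(struct inode *inode, int wait)
        {
                /* original body, unchanged */
                return 0;
        }

        int example_write_inode(struct inode *inode,
                                struct writeback_control *wbc)
        {
                /* WB_SYNC_ALL means the caller wants the write on stable
                 * storage before we return, i.e. the old wait=1 case. */
                return __example_write_inode(inode,
                                wbc->sync_mode == WB_SYNC_ALL);
        }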
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b042bd7034b1..98158de91d24 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -14,6 +14,7 @@
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
17#include <linux/list.h> 18#include <linux/list.h>
18#include <linux/spinlock.h> 19#include <linux/spinlock.h>
19#include <linux/time.h> 20#include <linux/time.h>
@@ -200,7 +201,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
200 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); 201 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
201 202
202 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 203 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
203 flags & TFD_SHARED_FCNTL_FLAGS); 204 O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
204 if (ufd < 0) 205 if (ufd < 0)
205 kfree(ctx); 206 kfree(ctx);
206 207
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 4775af401167..37fa7ed062d8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -45,6 +45,7 @@
45 45
46#include <linux/freezer.h> 46#include <linux/freezer.h>
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/slab.h>
48#include "ubifs.h" 49#include "ubifs.h"
49 50
50/** 51/**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index dbc093afd946..c2a68baa782f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,6 +34,7 @@
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h> 35#include <linux/debugfs.h>
36#include <linux/math64.h> 36#include <linux/math64.h>
37#include <linux/slab.h>
37 38
38#ifdef CONFIG_UBIFS_FS_DEBUG 39#ifdef CONFIG_UBIFS_FS_DEBUG
39 40
@@ -350,13 +351,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
350 le32_to_cpu(sup->fmt_version)); 351 le32_to_cpu(sup->fmt_version));
351 printk(KERN_DEBUG "\ttime_gran %u\n", 352 printk(KERN_DEBUG "\ttime_gran %u\n",
352 le32_to_cpu(sup->time_gran)); 353 le32_to_cpu(sup->time_gran));
353 printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X" 354 printk(KERN_DEBUG "\tUUID %pUB\n",
354 "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", 355 sup->uuid);
355 sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3],
356 sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7],
357 sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11],
358 sup->uuid[12], sup->uuid[13], sup->uuid[14],
359 sup->uuid[15]);
360 break; 356 break;
361 } 357 }
362 case UBIFS_MST_NODE: 358 case UBIFS_MST_NODE:
@@ -2014,7 +2010,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
2014 inum = key_inum_flash(c, &dent->key); 2010 inum = key_inum_flash(c, &dent->key);
2015 fscki1 = read_add_inode(c, priv, inum); 2011 fscki1 = read_add_inode(c, priv, inum);
2016 if (IS_ERR(fscki1)) { 2012 if (IS_ERR(fscki1)) {
2017 err = PTR_ERR(fscki); 2013 err = PTR_ERR(fscki1);
2018 ubifs_err("error %d while processing entry node and " 2014 ubifs_err("error %d while processing entry node and "
2019 "trying to find parent inode node %lu", 2015 "trying to find parent inode node %lu",
2020 err, (unsigned long)inum); 2016 err, (unsigned long)inum);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 552fb0111fff..401e503d44a1 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1120,7 +1120,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1120 if (release) 1120 if (release)
1121 ubifs_release_budget(c, &ino_req); 1121 ubifs_release_budget(c, &ino_req);
1122 if (IS_SYNC(old_inode)) 1122 if (IS_SYNC(old_inode))
1123 err = old_inode->i_sb->s_op->write_inode(old_inode, 1); 1123 err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
1124 return err; 1124 return err;
1125 1125
1126out_cancel: 1126out_cancel:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 1009adc8d602..5692cf72b807 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -45,13 +45,14 @@
45 * 45 *
46 * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the 46 * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
47 * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> 47 * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
48 * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not 48 * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not
49 * set as well. However, UBIFS disables readahead. 49 * set as well. However, UBIFS disables readahead.
50 */ 50 */
51 51
52#include "ubifs.h" 52#include "ubifs.h"
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/slab.h>
55 56
56static int read_block(struct inode *inode, void *addr, unsigned int block, 57static int read_block(struct inode *inode, void *addr, unsigned int block,
57 struct ubifs_data_node *dn) 58 struct ubifs_data_node *dn)
@@ -1011,7 +1012,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
1011 /* Is the page fully inside @i_size? */ 1012 /* Is the page fully inside @i_size? */
1012 if (page->index < end_index) { 1013 if (page->index < end_index) {
1013 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { 1014 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
1014 err = inode->i_sb->s_op->write_inode(inode, 1); 1015 err = inode->i_sb->s_op->write_inode(inode, NULL);
1015 if (err) 1016 if (err)
1016 goto out_unlock; 1017 goto out_unlock;
1017 /* 1018 /*
@@ -1039,7 +1040,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
1039 kunmap_atomic(kaddr, KM_USER0); 1040 kunmap_atomic(kaddr, KM_USER0);
1040 1041
1041 if (i_size > synced_i_size) { 1042 if (i_size > synced_i_size) {
1042 err = inode->i_sb->s_op->write_inode(inode, 1); 1043 err = inode->i_sb->s_op->write_inode(inode, NULL);
1043 if (err) 1044 if (err)
1044 goto out_unlock; 1045 goto out_unlock;
1045 } 1046 }
@@ -1242,7 +1243,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1242 if (release) 1243 if (release)
1243 ubifs_release_budget(c, &req); 1244 ubifs_release_budget(c, &req);
1244 if (IS_SYNC(inode)) 1245 if (IS_SYNC(inode))
1245 err = inode->i_sb->s_op->write_inode(inode, 1); 1246 err = inode->i_sb->s_op->write_inode(inode, NULL);
1246 return err; 1247 return err;
1247 1248
1248out: 1249out:
@@ -1316,7 +1317,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1316 * the inode unless this is a 'datasync()' call. 1317 * the inode unless this is a 'datasync()' call.
1317 */ 1318 */
1318 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { 1319 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
1319 err = inode->i_sb->s_op->write_inode(inode, 1); 1320 err = inode->i_sb->s_op->write_inode(inode, NULL);
1320 if (err) 1321 if (err)
1321 return err; 1322 return err;
1322 } 1323 }
@@ -1389,7 +1390,6 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
1389 unsigned long nr_segs, loff_t pos) 1390 unsigned long nr_segs, loff_t pos)
1390{ 1391{
1391 int err; 1392 int err;
1392 ssize_t ret;
1393 struct inode *inode = iocb->ki_filp->f_mapping->host; 1393 struct inode *inode = iocb->ki_filp->f_mapping->host;
1394 struct ubifs_info *c = inode->i_sb->s_fs_info; 1394 struct ubifs_info *c = inode->i_sb->s_fs_info;
1395 1395
@@ -1397,17 +1397,7 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
1397 if (err) 1397 if (err)
1398 return err; 1398 return err;
1399 1399
1400 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 1400 return generic_file_aio_write(iocb, iov, nr_segs, pos);
1401 if (ret < 0)
1402 return ret;
1403
1404 if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {
1405 err = ubifs_sync_wbufs_by_inode(c, inode);
1406 if (err)
1407 return err;
1408 }
1409
1410 return ret;
1411} 1401}
1412 1402
1413static int ubifs_set_page_dirty(struct page *page) 1403static int ubifs_set_page_dirty(struct page *page)
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 618c2701d3a7..918d1582ca05 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -53,7 +53,9 @@
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
54 */ 54 */
55 55
56#include <linux/slab.h>
56#include <linux/pagemap.h> 57#include <linux/pagemap.h>
58#include <linux/list_sort.h>
57#include "ubifs.h" 59#include "ubifs.h"
58 60
59/* 61/*
@@ -108,101 +110,6 @@ static int switch_gc_head(struct ubifs_info *c)
108} 110}
109 111
110/** 112/**
111 * list_sort - sort a list.
112 * @priv: private data, passed to @cmp
113 * @head: the list to sort
114 * @cmp: the elements comparison function
115 *
116 * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
117 * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
118 * in ascending order.
119 *
120 * The comparison function @cmp is supposed to return a negative value if @a is
121 * than @b, and a positive value if @a is greater than @b. If @a and @b are
122 * equivalent, then it does not matter what this function returns.
123 */
124static void list_sort(void *priv, struct list_head *head,
125 int (*cmp)(void *priv, struct list_head *a,
126 struct list_head *b))
127{
128 struct list_head *p, *q, *e, *list, *tail, *oldhead;
129 int insize, nmerges, psize, qsize, i;
130
131 if (list_empty(head))
132 return;
133
134 list = head->next;
135 list_del(head);
136 insize = 1;
137 for (;;) {
138 p = oldhead = list;
139 list = tail = NULL;
140 nmerges = 0;
141
142 while (p) {
143 nmerges++;
144 q = p;
145 psize = 0;
146 for (i = 0; i < insize; i++) {
147 psize++;
148 q = q->next == oldhead ? NULL : q->next;
149 if (!q)
150 break;
151 }
152
153 qsize = insize;
154 while (psize > 0 || (qsize > 0 && q)) {
155 if (!psize) {
156 e = q;
157 q = q->next;
158 qsize--;
159 if (q == oldhead)
160 q = NULL;
161 } else if (!qsize || !q) {
162 e = p;
163 p = p->next;
164 psize--;
165 if (p == oldhead)
166 p = NULL;
167 } else if (cmp(priv, p, q) <= 0) {
168 e = p;
169 p = p->next;
170 psize--;
171 if (p == oldhead)
172 p = NULL;
173 } else {
174 e = q;
175 q = q->next;
176 qsize--;
177 if (q == oldhead)
178 q = NULL;
179 }
180 if (tail)
181 tail->next = e;
182 else
183 list = e;
184 e->prev = tail;
185 tail = e;
186 }
187 p = q;
188 }
189
190 tail->next = list;
191 list->prev = tail;
192
193 if (nmerges <= 1)
194 break;
195
196 insize *= 2;
197 }
198
199 head->next = list;
200 head->prev = list->prev;
201 list->prev->next = head;
202 list->prev = head;
203}
204
205/**
206 * data_nodes_cmp - compare 2 data nodes. 113 * data_nodes_cmp - compare 2 data nodes.
207 * @priv: UBIFS file-system description object 114 * @priv: UBIFS file-system description object
208 * @a: first data node 115 * @a: first data node
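With the private copy above deleted, UBIFS now uses the shared lib/list_sort.c
implementation via <linux/list_sort.h>; the calling convention is unchanged. A
minimal sketch of a caller (the element type is invented):

        #include <linux/list.h>
        #include <linux/list_sort.h>

        struct item {
                struct list_head list;
                int key;
        };

        /* Return a negative value if @a sorts before @b, positive if
         * after; ties may return anything. */
        static int item_cmp(void *priv, struct list_head *a,
                            struct list_head *b)
        {
                struct item *ia = list_entry(a, struct item, list);
                struct item *ib = list_entry(b, struct item, list);

                return ia->key - ib->key;
        }

        static void sort_items(struct list_head *items)
        {
                /* priv is passed straight through to the cmp callback;
                 * NULL is fine when the comparison needs no context. */
                list_sort(NULL, items, item_cmp);
        }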
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e589fedaf1ef..77d5cf4a7547 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -51,6 +51,7 @@
51 */ 51 */
52 52
53#include <linux/crc32.h> 53#include <linux/crc32.h>
54#include <linux/slab.h>
54#include "ubifs.h" 55#include "ubifs.h"
55 56
56/** 57/**
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index b2792e84d245..ad7f67b827ea 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -46,6 +46,7 @@
46#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h> 47#include <linux/crc16.h>
48#include <linux/math64.h> 48#include <linux/math64.h>
49#include <linux/slab.h>
49 50
50/** 51/**
51 * do_calc_lpt_geom - calculate sizes for the LPT area. 52 * do_calc_lpt_geom - calculate sizes for the LPT area.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8cbfb8248025..13cb7a4237bf 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -26,6 +26,7 @@
26 */ 26 */
27 27
28#include <linux/crc16.h> 28#include <linux/crc16.h>
29#include <linux/slab.h>
29#include "ubifs.h" 30#include "ubifs.h"
30 31
31/** 32/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index f94ddf7efba0..109c6ea03bb5 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -23,7 +23,7 @@
23/* 23/*
24 * This file implements functions needed to recover from unclean un-mounts. 24 * This file implements functions needed to recover from unclean un-mounts.
25 * When UBIFS is mounted, it checks a flag on the master node to determine if 25 * When UBIFS is mounted, it checks a flag on the master node to determine if
26 * an un-mount was completed sucessfully. If not, the process of mounting 26 * an un-mount was completed successfully. If not, the process of mounting
 27 * incorporates additional checking and fixing of on-flash data structures. 27 * incorporates additional checking and fixing of on-flash data structures.
28 * UBIFS always cleans away all remnants of an unclean un-mount, so that 28 * UBIFS always cleans away all remnants of an unclean un-mount, so that
29 * errors do not accumulate. However UBIFS defers recovery if it is mounted 29 * errors do not accumulate. However UBIFS defers recovery if it is mounted
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/** 37/**
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 57085e43320f..96cb62c8a9dd 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -27,6 +27,7 @@
27 */ 27 */
28 28
29#include "ubifs.h" 29#include "ubifs.h"
30#include <linux/slab.h>
30#include <linux/random.h> 31#include <linux/random.h>
31#include <linux/math64.h> 32#include <linux/math64.h>
32 33
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 333e181ee987..4d2f2157dd3f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode)
283/* 283/*
284 * Note, Linux write-back code calls this without 'i_mutex'. 284 * Note, Linux write-back code calls this without 'i_mutex'.
285 */ 285 */
286static int ubifs_write_inode(struct inode *inode, int wait) 286static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
287{ 287{
288 int err = 0; 288 int err = 0;
289 struct ubifs_info *c = inode->i_sb->s_fs_info; 289 struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1393,12 +1393,7 @@ static int mount_ubifs(struct ubifs_info *c)
1393 c->leb_size, c->leb_size >> 10); 1393 c->leb_size, c->leb_size >> 10);
1394 dbg_msg("data journal heads: %d", 1394 dbg_msg("data journal heads: %d",
1395 c->jhead_cnt - NONDATA_JHEADS_CNT); 1395 c->jhead_cnt - NONDATA_JHEADS_CNT);
1396 dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" 1396 dbg_msg("UUID: %pUB", c->uuid);
1397 "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X",
1398 c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3],
1399 c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
1400 c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
1401 c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
1402 dbg_msg("big_lpt %d", c->big_lpt); 1397 dbg_msg("big_lpt %d", c->big_lpt);
1403 dbg_msg("log LEBs: %d (%d - %d)", 1398 dbg_msg("log LEBs: %d (%d - %d)",
1404 c->log_lebs, UBIFS_LOG_LNUM, c->log_last); 1399 c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
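The UUID hunk above replaces an open-coded sixteen-argument hex dump with the %pU pointer extension that vsprintf gained around 2.6.33. A small illustrative wrapper (show_uuid() is made up; the format semantics are the kernel's):

static void show_uuid(const u8 uuid[16])
{
	/* %pUb/%pUB print the 16 bytes in stored (big endian) order,
	 * %pUl/%pUL byte-swap the first three fields; an upper case
	 * letter selects upper case hex digits. */
	pr_info("UUID: %pUB\n", uuid);
}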
@@ -1842,22 +1837,32 @@ const struct super_operations ubifs_super_operations = {
1842 * @name: UBI volume name 1837 * @name: UBI volume name
1843 * @mode: UBI volume open mode 1838 * @mode: UBI volume open mode
1844 * 1839 *
1845 * There are several ways to specify UBI volumes when mounting UBIFS: 1840 * The primary method of mounting UBIFS is by specifying the UBI volume
1846 * o ubiX_Y - UBI device number X, volume Y; 1841 * character device node path. However, UBIFS may also be mounted without any
1847 * o ubiY - UBI device number 0, volume Y; 1842 * character device node using one of the following methods:
1843 *
1844 * o ubiX_Y - mount UBI device number X, volume Y;
1845 * o ubiY - mount UBI device number 0, volume Y;
1848 * o ubiX:NAME - mount UBI device X, volume with name NAME; 1846 * o ubiX:NAME - mount UBI device X, volume with name NAME;
1849 * o ubi:NAME - mount UBI device 0, volume with name NAME. 1847 * o ubi:NAME - mount UBI device 0, volume with name NAME.
1850 * 1848 *
1851 * Alternative '!' separator may be used instead of ':' (because some shells 1849 * Alternative '!' separator may be used instead of ':' (because some shells
1852 * like busybox may interpret ':' as an NFS host name separator). This function 1850 * like busybox may interpret ':' as an NFS host name separator). This function
1853 * returns ubi volume object in case of success and a negative error code in 1851 * returns UBI volume description object in case of success and a negative
1854 * case of failure. 1852 * error code in case of failure.
1855 */ 1853 */
1856static struct ubi_volume_desc *open_ubi(const char *name, int mode) 1854static struct ubi_volume_desc *open_ubi(const char *name, int mode)
1857{ 1855{
1856 struct ubi_volume_desc *ubi;
1858 int dev, vol; 1857 int dev, vol;
1859 char *endptr; 1858 char *endptr;
1860 1859
1860 /* First, try to open using the device node path method */
1861 ubi = ubi_open_volume_path(name, mode);
1862 if (!IS_ERR(ubi))
1863 return ubi;
1864
1865 /* Try the "nodev" method */
1861 if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') 1866 if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
1862 return ERR_PTR(-EINVAL); 1867 return ERR_PTR(-EINVAL);
1863 1868
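The comment block above spells out the "nodev" name grammar. A standalone userspace sketch of that grammar under simplified assumptions (struct ubi_spec and parse_ubi_name() are illustrative, not kernel API; the real open_ubi() resolves the parsed values through ubi_open_volume() and ubi_open_volume_nm()):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ubi_spec {
	int dev;		/* UBI device number */
	int vol;		/* volume id, -1 if named */
	const char *name;	/* volume name, NULL if numbered */
};

static int parse_ubi_name(const char *s, struct ubi_spec *out)
{
	char *end;

	if (strncmp(s, "ubi", 3))
		return -1;			/* not a nodev spec */
	s += 3;
	out->dev = 0;
	out->vol = -1;
	out->name = NULL;

	if (*s >= '0' && *s <= '9') {
		long n = strtol(s, &end, 0);

		if (!*end) {			/* "ubiY": device 0 */
			out->vol = n;
			return 0;
		}
		out->dev = n;
		s = end;
	}
	if (*s == '_') {			/* "ubiX_Y" */
		out->vol = strtol(s + 1, &end, 0);
		return *end ? -1 : 0;
	}
	if (*s == ':' || *s == '!') {		/* "ubiX:NAME", "ubi!NAME" */
		out->name = s + 1;
		return *out->name ? 0 : -1;
	}
	return -1;
}

int main(void)
{
	struct ubi_spec spec;

	if (!parse_ubi_name("ubi0_3", &spec))
		printf("dev %d vol %d\n", spec.dev, spec.vol);
	return 0;
}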
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e5b1a7d00fa0..2194915220e5 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/* 37/*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2d976366a46..bd2542dad014 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -28,6 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/err.h> 29#include <linux/err.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/slab.h>
31#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
32#include <linux/spinlock.h> 33#include <linux/spinlock.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 195830f47569..c74400f88fe0 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
56 */ 56 */
57 57
58#include "ubifs.h" 58#include "ubifs.h"
59#include <linux/slab.h>
59#include <linux/xattr.h> 60#include <linux/xattr.h>
60#include <linux/posix_acl_xattr.h> 61#include <linux/posix_acl_xattr.h>
61 62
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 1e068535b58b..19626e2491c4 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -31,55 +31,8 @@
31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr) 31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) 32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) 33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
34#define udf_find_first_one_bit(addr, size) find_first_one_bit(addr, size)
35#define udf_find_next_one_bit(addr, size, offset) \ 34#define udf_find_next_one_bit(addr, size, offset) \
36 find_next_one_bit(addr, size, offset) 35 ext2_find_next_bit(addr, size, offset)
37
38#define leBPL_to_cpup(x) leNUM_to_cpup(BITS_PER_LONG, x)
39#define leNUM_to_cpup(x, y) xleNUM_to_cpup(x, y)
40#define xleNUM_to_cpup(x, y) (le ## x ## _to_cpup(y))
41#define uintBPL_t uint(BITS_PER_LONG)
42#define uint(x) xuint(x)
43#define xuint(x) __le ## x
44
45static inline int find_next_one_bit(void *addr, int size, int offset)
46{
47 uintBPL_t *p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG);
48 int result = offset & ~(BITS_PER_LONG - 1);
49 unsigned long tmp;
50
51 if (offset >= size)
52 return size;
53 size -= result;
54 offset &= (BITS_PER_LONG - 1);
55 if (offset) {
56 tmp = leBPL_to_cpup(p++);
57 tmp &= ~0UL << offset;
58 if (size < BITS_PER_LONG)
59 goto found_first;
60 if (tmp)
61 goto found_middle;
62 size -= BITS_PER_LONG;
63 result += BITS_PER_LONG;
64 }
65 while (size & ~(BITS_PER_LONG - 1)) {
66 tmp = leBPL_to_cpup(p++);
67 if (tmp)
68 goto found_middle;
69 result += BITS_PER_LONG;
70 size -= BITS_PER_LONG;
71 }
72 if (!size)
73 return result;
74 tmp = leBPL_to_cpup(p);
75found_first:
76 tmp &= ~0UL >> (BITS_PER_LONG - size);
77found_middle:
78 return result + ffz(~tmp);
79}
80
81#define find_first_one_bit(addr, size)\
82 find_next_one_bit((addr), (size), 0)
83 36
84static int read_block_bitmap(struct super_block *sb, 37static int read_block_bitmap(struct super_block *sb,
85 struct udf_bitmap *bitmap, unsigned int block, 38 struct udf_bitmap *bitmap, unsigned int block,
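The helper removed above duplicated functionality that ext2_find_next_bit() already provides: finding the next set bit in a little-endian on-disk bitmap. A byte-wise sketch of the same semantics, for clarity only (the real helper works a word at a time and is arch-optimised):

/* Returns the index of the next set bit at or after 'offset', or
 * 'size' if no set bit remains. */
static int next_one_bit_le(const unsigned char *addr, int size, int offset)
{
	int bit;

	for (bit = offset; bit < size; bit++)
		if (addr[bit >> 3] & (1 << (bit & 7)))
			return bit;
	return size;
}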
@@ -208,7 +161,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
208 ((char *)bh->b_data)[(bit + i) >> 3]); 161 ((char *)bh->b_data)[(bit + i) >> 3]);
209 } else { 162 } else {
210 if (inode) 163 if (inode)
211 vfs_dq_free_block(inode, 1); 164 dquot_free_block(inode, 1);
212 udf_add_free_space(sb, sbi->s_partition, 1); 165 udf_add_free_space(sb, sbi->s_partition, 1);
213 } 166 }
214 } 167 }
@@ -260,11 +213,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
260 while (bit < (sb->s_blocksize << 3) && block_count > 0) { 213 while (bit < (sb->s_blocksize << 3) && block_count > 0) {
261 if (!udf_test_bit(bit, bh->b_data)) 214 if (!udf_test_bit(bit, bh->b_data))
262 goto out; 215 goto out;
263 else if (vfs_dq_prealloc_block(inode, 1)) 216 else if (dquot_prealloc_block(inode, 1))
264 goto out; 217 goto out;
265 else if (!udf_clear_bit(bit, bh->b_data)) { 218 else if (!udf_clear_bit(bit, bh->b_data)) {
266 udf_debug("bit already cleared for block %d\n", bit); 219 udf_debug("bit already cleared for block %d\n", bit);
267 vfs_dq_free_block(inode, 1); 220 dquot_free_block(inode, 1);
268 goto out; 221 goto out;
269 } 222 }
270 block_count--; 223 block_count--;
@@ -390,10 +343,14 @@ got_block:
390 /* 343 /*
391 * Check quota for allocation of this block. 344 * Check quota for allocation of this block.
392 */ 345 */
393 if (inode && vfs_dq_alloc_block(inode, 1)) { 346 if (inode) {
394 mutex_unlock(&sbi->s_alloc_mutex); 347 int ret = dquot_alloc_block(inode, 1);
395 *err = -EDQUOT; 348
396 return 0; 349 if (ret) {
350 mutex_unlock(&sbi->s_alloc_mutex);
351 *err = ret;
352 return 0;
353 }
397 } 354 }
398 355
399 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - 356 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
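This hunk shows the calling-convention change the quota rework applies throughout fs/: vfs_dq_alloc_block() returned a boolean failure and every caller hard-coded -EDQUOT, whereas dquot_alloc_block() returns the real error code for the caller to propagate. A hedged sketch of the new shape (alloc_quota_checked() is illustrative):

#include <linux/fs.h>
#include <linux/quotaops.h>

static int alloc_quota_checked(struct inode *inode, qsize_t nr)
{
	int ret = dquot_alloc_block(inode, nr);

	if (ret)
		return ret;	/* -EDQUOT, -EIO, ... not a guess */
	/* ... proceed with the on-disk block allocation ... */
	return 0;
}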
@@ -440,7 +397,7 @@ static void udf_table_free_blocks(struct super_block *sb,
440 (bloc->logicalBlockNum + count) > 397 (bloc->logicalBlockNum + count) >
441 partmap->s_partition_len) { 398 partmap->s_partition_len) {
442 udf_debug("%d < %d || %d + %d > %d\n", 399 udf_debug("%d < %d || %d + %d > %d\n",
443 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 400 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
444 partmap->s_partition_len); 401 partmap->s_partition_len);
445 goto error_return; 402 goto error_return;
446 } 403 }
@@ -449,7 +406,7 @@ static void udf_table_free_blocks(struct super_block *sb,
449 /* We do this up front - There are some error conditions that 406 /* We do this up front - There are some error conditions that
450 could occur, but.. oh well */ 407 could occur, but.. oh well */
451 if (inode) 408 if (inode)
452 vfs_dq_free_block(inode, count); 409 dquot_free_block(inode, count);
453 udf_add_free_space(sb, sbi->s_partition, count); 410 udf_add_free_space(sb, sbi->s_partition, count);
454 411
455 start = bloc->logicalBlockNum + offset; 412 start = bloc->logicalBlockNum + offset;
@@ -547,7 +504,7 @@ static void udf_table_free_blocks(struct super_block *sb,
547 } 504 }
548 505
549 if (epos.offset + (2 * adsize) > sb->s_blocksize) { 506 if (epos.offset + (2 * adsize) > sb->s_blocksize) {
550 char *sptr, *dptr; 507 unsigned char *sptr, *dptr;
551 int loffset; 508 int loffset;
552 509
553 brelse(oepos.bh); 510 brelse(oepos.bh);
@@ -694,7 +651,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
694 epos.offset -= adsize; 651 epos.offset -= adsize;
695 652
696 alloc_count = (elen >> sb->s_blocksize_bits); 653 alloc_count = (elen >> sb->s_blocksize_bits);
697 if (inode && vfs_dq_prealloc_block(inode, 654 if (inode && dquot_prealloc_block(inode,
698 alloc_count > block_count ? block_count : alloc_count)) 655 alloc_count > block_count ? block_count : alloc_count))
699 alloc_count = 0; 656 alloc_count = 0;
700 else if (alloc_count > block_count) { 657 else if (alloc_count > block_count) {
@@ -797,12 +754,13 @@ static int udf_table_new_block(struct super_block *sb,
797 newblock = goal_eloc.logicalBlockNum; 754 newblock = goal_eloc.logicalBlockNum;
798 goal_eloc.logicalBlockNum++; 755 goal_eloc.logicalBlockNum++;
799 goal_elen -= sb->s_blocksize; 756 goal_elen -= sb->s_blocksize;
800 757 if (inode) {
801 if (inode && vfs_dq_alloc_block(inode, 1)) { 758 *err = dquot_alloc_block(inode, 1);
802 brelse(goal_epos.bh); 759 if (*err) {
803 mutex_unlock(&sbi->s_alloc_mutex); 760 brelse(goal_epos.bh);
804 *err = -EDQUOT; 761 mutex_unlock(&sbi->s_alloc_mutex);
805 return 0; 762 return 0;
763 }
806 } 764 }
807 765
808 if (goal_elen) 766 if (goal_elen)
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 61d9a76a3a69..f0f2a436251e 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -45,8 +45,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
45 int block, iblock; 45 int block, iblock;
46 loff_t nf_pos = (filp->f_pos - 1) << 2; 46 loff_t nf_pos = (filp->f_pos - 1) << 2;
47 int flen; 47 int flen;
48 char *fname = NULL; 48 unsigned char *fname = NULL;
49 char *nameptr; 49 unsigned char *nameptr;
50 uint16_t liu; 50 uint16_t liu;
51 uint8_t lfi; 51 uint8_t lfi;
52 loff_t size = udf_ext0_offset(dir) + dir->i_size; 52 loff_t size = udf_ext0_offset(dir) + dir->i_size;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index b80cbd78833c..1eb06774ed90 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,6 +34,7 @@
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h> 35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
37#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
38#include <linux/aio.h> 39#include <linux/aio.h>
39 40
@@ -196,6 +197,7 @@ static int udf_release_file(struct inode *inode, struct file *filp)
196 mutex_lock(&inode->i_mutex); 197 mutex_lock(&inode->i_mutex);
197 lock_kernel(); 198 lock_kernel();
198 udf_discard_prealloc(inode); 199 udf_discard_prealloc(inode);
200 udf_truncate_tail_extent(inode);
199 unlock_kernel(); 201 unlock_kernel();
200 mutex_unlock(&inode->i_mutex); 202 mutex_unlock(&inode->i_mutex);
201 } 203 }
@@ -206,7 +208,7 @@ const struct file_operations udf_file_operations = {
206 .read = do_sync_read, 208 .read = do_sync_read,
207 .aio_read = generic_file_aio_read, 209 .aio_read = generic_file_aio_read,
208 .ioctl = udf_ioctl, 210 .ioctl = udf_ioctl,
209 .open = generic_file_open, 211 .open = dquot_file_open,
210 .mmap = generic_file_mmap, 212 .mmap = generic_file_mmap,
211 .write = do_sync_write, 213 .write = do_sync_write,
212 .aio_write = udf_file_aio_write, 214 .aio_write = udf_file_aio_write,
@@ -216,6 +218,29 @@ const struct file_operations udf_file_operations = {
216 .llseek = generic_file_llseek, 218 .llseek = generic_file_llseek,
217}; 219};
218 220
221static int udf_setattr(struct dentry *dentry, struct iattr *iattr)
222{
223 struct inode *inode = dentry->d_inode;
224 int error;
225
226 error = inode_change_ok(inode, iattr);
227 if (error)
228 return error;
229
230 if (iattr->ia_valid & ATTR_SIZE)
231 dquot_initialize(inode);
232
233 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
234 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
235 error = dquot_transfer(inode, iattr);
236 if (error)
237 return error;
238 }
239
240 return inode_setattr(inode, iattr);
241}
242
219const struct inode_operations udf_file_inode_operations = { 243const struct inode_operations udf_file_inode_operations = {
220 .truncate = udf_truncate, 244 .truncate = udf_truncate,
245 .setattr = udf_setattr,
221}; 246};
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index c10fa39f97e2..fb68c9cd0c3e 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode)
36 * Note: we must free any quota before locking the superblock, 36 * Note: we must free any quota before locking the superblock,
37 * as writing the quota to disk may need the lock as well. 37 * as writing the quota to disk may need the lock as well.
38 */ 38 */
39 vfs_dq_free_inode(inode); 39 dquot_free_inode(inode);
40 vfs_dq_drop(inode); 40 dquot_drop(inode);
41 41
42 clear_inode(inode); 42 clear_inode(inode);
43 43
@@ -61,7 +61,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
61 struct super_block *sb = dir->i_sb; 61 struct super_block *sb = dir->i_sb;
62 struct udf_sb_info *sbi = UDF_SB(sb); 62 struct udf_sb_info *sbi = UDF_SB(sb);
63 struct inode *inode; 63 struct inode *inode;
64 int block; 64 int block, ret;
65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; 65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
66 struct udf_inode_info *iinfo; 66 struct udf_inode_info *iinfo;
67 struct udf_inode_info *dinfo = UDF_I(dir); 67 struct udf_inode_info *dinfo = UDF_I(dir);
@@ -153,12 +153,14 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
153 insert_inode_hash(inode); 153 insert_inode_hash(inode);
154 mark_inode_dirty(inode); 154 mark_inode_dirty(inode);
155 155
156 if (vfs_dq_alloc_inode(inode)) { 156 dquot_initialize(inode);
157 vfs_dq_drop(inode); 157 ret = dquot_alloc_inode(inode);
158 if (ret) {
159 dquot_drop(inode);
158 inode->i_flags |= S_NOQUOTA; 160 inode->i_flags |= S_NOQUOTA;
159 inode->i_nlink = 0; 161 inode->i_nlink = 0;
160 iput(inode); 162 iput(inode);
161 *err = -EDQUOT; 163 *err = ret;
162 return NULL; 164 return NULL;
163 } 165 }
164 166
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 6d24c2c63f93..bb863fe579ac 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,6 +36,7 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/quotaops.h>
39#include <linux/slab.h> 40#include <linux/slab.h>
40#include <linux/crc-itu-t.h> 41#include <linux/crc-itu-t.h>
41 42
@@ -70,6 +71,9 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
70 71
71void udf_delete_inode(struct inode *inode) 72void udf_delete_inode(struct inode *inode)
72{ 73{
74 if (!is_bad_inode(inode))
75 dquot_initialize(inode);
76
73 truncate_inode_pages(&inode->i_data, 0); 77 truncate_inode_pages(&inode->i_data, 0);
74 78
75 if (is_bad_inode(inode)) 79 if (is_bad_inode(inode))
@@ -97,15 +101,19 @@ no_delete:
97 */ 101 */
98void udf_clear_inode(struct inode *inode) 102void udf_clear_inode(struct inode *inode)
99{ 103{
100 struct udf_inode_info *iinfo; 104 struct udf_inode_info *iinfo = UDF_I(inode);
101 if (!(inode->i_sb->s_flags & MS_RDONLY)) { 105
102 lock_kernel(); 106 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
103 udf_truncate_tail_extent(inode); 107 inode->i_size != iinfo->i_lenExtents) {
104 unlock_kernel(); 108 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
105 write_inode_now(inode, 0); 109 "inode size %llu different from extent length %llu. "
106 invalidate_inode_buffers(inode); 110 "Filesystem need not be standards compliant.\n",
111 inode->i_sb->s_id, inode->i_ino, inode->i_mode,
112 (unsigned long long)inode->i_size,
113 (unsigned long long)iinfo->i_lenExtents);
107 } 114 }
108 iinfo = UDF_I(inode); 115
116 dquot_drop(inode);
109 kfree(iinfo->i_ext.i_data); 117 kfree(iinfo->i_ext.i_data);
110 iinfo->i_ext.i_data = NULL; 118 iinfo->i_ext.i_data = NULL;
111} 119}
@@ -198,7 +206,6 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
198 int newblock; 206 int newblock;
199 struct buffer_head *dbh = NULL; 207 struct buffer_head *dbh = NULL;
200 struct kernel_lb_addr eloc; 208 struct kernel_lb_addr eloc;
201 uint32_t elen;
202 uint8_t alloctype; 209 uint8_t alloctype;
203 struct extent_position epos; 210 struct extent_position epos;
204 211
@@ -273,12 +280,11 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
273 eloc.logicalBlockNum = *block; 280 eloc.logicalBlockNum = *block;
274 eloc.partitionReferenceNum = 281 eloc.partitionReferenceNum =
275 iinfo->i_location.partitionReferenceNum; 282 iinfo->i_location.partitionReferenceNum;
276 elen = inode->i_sb->s_blocksize; 283 iinfo->i_lenExtents = inode->i_size;
277 iinfo->i_lenExtents = elen;
278 epos.bh = NULL; 284 epos.bh = NULL;
279 epos.block = iinfo->i_location; 285 epos.block = iinfo->i_location;
280 epos.offset = udf_file_entry_alloc_offset(inode); 286 epos.offset = udf_file_entry_alloc_offset(inode);
281 udf_add_aext(inode, &epos, &eloc, elen, 0); 287 udf_add_aext(inode, &epos, &eloc, inode->i_size, 0);
282 /* UniqueID stuff */ 288 /* UniqueID stuff */
283 289
284 brelse(epos.bh); 290 brelse(epos.bh);
@@ -1373,12 +1379,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1373 return mode; 1379 return mode;
1374} 1380}
1375 1381
1376int udf_write_inode(struct inode *inode, int sync) 1382int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
1377{ 1383{
1378 int ret; 1384 int ret;
1379 1385
1380 lock_kernel(); 1386 lock_kernel();
1381 ret = udf_update_inode(inode, sync); 1387 ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1382 unlock_kernel(); 1388 unlock_kernel();
1383 1389
1384 return ret; 1390 return ret;
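The same prototype change recurs across ubifs, udf and ufs in this series: ->write_inode() now receives the writeback_control the VFS already holds, and the old 'wait'/'sync' flag is derived from it. A minimal sketch (the foofs_* names are illustrative):

#include <linux/fs.h>
#include <linux/writeback.h>

static int foofs_update_inode(struct inode *inode, int do_sync);

static int foofs_write_inode(struct inode *inode,
			     struct writeback_control *wbc)
{
	/* WB_SYNC_ALL is what the old 'wait != 0' used to mean */
	return foofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
}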
@@ -1402,20 +1408,19 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1402 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 1408 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
1403 struct udf_inode_info *iinfo = UDF_I(inode); 1409 struct udf_inode_info *iinfo = UDF_I(inode);
1404 1410
1405 bh = udf_tread(inode->i_sb, 1411 bh = udf_tgetblk(inode->i_sb,
1406 udf_get_lb_pblock(inode->i_sb, 1412 udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0));
1407 &iinfo->i_location, 0));
1408 if (!bh) { 1413 if (!bh) {
1409 udf_debug("bread failure\n"); 1414 udf_debug("getblk failure\n");
1410 return -EIO; 1415 return -ENOMEM;
1411 } 1416 }
1412 1417
1413 memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); 1418 lock_buffer(bh);
1414 1419 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1415 fe = (struct fileEntry *)bh->b_data; 1420 fe = (struct fileEntry *)bh->b_data;
1416 efe = (struct extendedFileEntry *)bh->b_data; 1421 efe = (struct extendedFileEntry *)bh->b_data;
1417 1422
1418 if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { 1423 if (iinfo->i_use) {
1419 struct unallocSpaceEntry *use = 1424 struct unallocSpaceEntry *use =
1420 (struct unallocSpaceEntry *)bh->b_data; 1425 (struct unallocSpaceEntry *)bh->b_data;
1421 1426
@@ -1423,20 +1428,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1423 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), 1428 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry),
1424 iinfo->i_ext.i_data, inode->i_sb->s_blocksize - 1429 iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
1425 sizeof(struct unallocSpaceEntry)); 1430 sizeof(struct unallocSpaceEntry));
1431 use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
1432 use->descTag.tagLocation =
1433 cpu_to_le32(iinfo->i_location.logicalBlockNum);
1426 crclen = sizeof(struct unallocSpaceEntry) + 1434 crclen = sizeof(struct unallocSpaceEntry) +
1427 iinfo->i_lenAlloc - sizeof(struct tag); 1435 iinfo->i_lenAlloc - sizeof(struct tag);
1428 use->descTag.tagLocation = cpu_to_le32(
1429 iinfo->i_location.
1430 logicalBlockNum);
1431 use->descTag.descCRCLength = cpu_to_le16(crclen); 1436 use->descTag.descCRCLength = cpu_to_le16(crclen);
1432 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + 1437 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
1433 sizeof(struct tag), 1438 sizeof(struct tag),
1434 crclen)); 1439 crclen));
1435 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); 1440 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
1436 1441
1437 mark_buffer_dirty(bh); 1442 goto out;
1438 brelse(bh);
1439 return err;
1440 } 1443 }
1441 1444
1442 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) 1445 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
@@ -1591,18 +1594,21 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1591 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); 1594 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number);
1592 fe->descTag.tagLocation = cpu_to_le32( 1595 fe->descTag.tagLocation = cpu_to_le32(
1593 iinfo->i_location.logicalBlockNum); 1596 iinfo->i_location.logicalBlockNum);
1594 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - 1597 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag);
1595 sizeof(struct tag);
1596 fe->descTag.descCRCLength = cpu_to_le16(crclen); 1598 fe->descTag.descCRCLength = cpu_to_le16(crclen);
1597 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), 1599 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
1598 crclen)); 1600 crclen));
1599 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); 1601 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
1600 1602
1603out:
1604 set_buffer_uptodate(bh);
1605 unlock_buffer(bh);
1606
1601 /* write the data blocks */ 1607 /* write the data blocks */
1602 mark_buffer_dirty(bh); 1608 mark_buffer_dirty(bh);
1603 if (do_sync) { 1609 if (do_sync) {
1604 sync_dirty_buffer(bh); 1610 sync_dirty_buffer(bh);
1605 if (buffer_req(bh) && !buffer_uptodate(bh)) { 1611 if (buffer_write_io_error(bh)) {
1606 printk(KERN_WARNING "IO error syncing udf inode " 1612 printk(KERN_WARNING "IO error syncing udf inode "
1607 "[%s:%08lx]\n", inode->i_sb->s_id, 1613 "[%s:%08lx]\n", inode->i_sb->s_id,
1608 inode->i_ino); 1614 inode->i_ino);
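The hunk above stops reading a block that is about to be rewritten wholesale: udf_tread() (a bread-style read) becomes udf_tgetblk() (a getblk-style lookup), at the price of locking the buffer and marking it uptodate by hand. The generic form of the pattern, sketched with the plain buffer-head API (fill_block() is illustrative):

#include <linux/buffer_head.h>

static void fill_block(char *data);	/* illustrative contents writer */

static int rewrite_block(struct super_block *sb, sector_t blk)
{
	struct buffer_head *bh = sb_getblk(sb, blk);	/* no read I/O */

	if (!bh)
		return -ENOMEM;
	lock_buffer(bh);
	memset(bh->b_data, 0, sb->s_blocksize);
	fill_block(bh->b_data);
	set_buffer_uptodate(bh);	/* contents are now valid */
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	brelse(bh);
	return 0;
}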
@@ -1672,7 +1678,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1672 return -1; 1678 return -1;
1673 1679
1674 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { 1680 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
1675 char *sptr, *dptr; 1681 unsigned char *sptr, *dptr;
1676 struct buffer_head *nbh; 1682 struct buffer_head *nbh;
1677 int err, loffset; 1683 int err, loffset;
1678 struct kernel_lb_addr obloc = epos->block; 1684 struct kernel_lb_addr obloc = epos->block;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 21dad8c608f9..db423ab078b1 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -34,8 +34,8 @@
34#include <linux/crc-itu-t.h> 34#include <linux/crc-itu-t.h>
35#include <linux/exportfs.h> 35#include <linux/exportfs.h>
36 36
37static inline int udf_match(int len1, const char *name1, int len2, 37static inline int udf_match(int len1, const unsigned char *name1, int len2,
38 const char *name2) 38 const unsigned char *name2)
39{ 39{
40 if (len1 != len2) 40 if (len1 != len2)
41 return 0; 41 return 0;
@@ -142,15 +142,15 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
142} 142}
143 143
144static struct fileIdentDesc *udf_find_entry(struct inode *dir, 144static struct fileIdentDesc *udf_find_entry(struct inode *dir,
145 struct qstr *child, 145 const struct qstr *child,
146 struct udf_fileident_bh *fibh, 146 struct udf_fileident_bh *fibh,
147 struct fileIdentDesc *cfi) 147 struct fileIdentDesc *cfi)
148{ 148{
149 struct fileIdentDesc *fi = NULL; 149 struct fileIdentDesc *fi = NULL;
150 loff_t f_pos; 150 loff_t f_pos;
151 int block, flen; 151 int block, flen;
152 char *fname = NULL; 152 unsigned char *fname = NULL;
153 char *nameptr; 153 unsigned char *nameptr;
154 uint8_t lfi; 154 uint8_t lfi;
155 uint16_t liu; 155 uint16_t liu;
156 loff_t size; 156 loff_t size;
@@ -308,7 +308,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
308{ 308{
309 struct super_block *sb = dir->i_sb; 309 struct super_block *sb = dir->i_sb;
310 struct fileIdentDesc *fi = NULL; 310 struct fileIdentDesc *fi = NULL;
311 char *name = NULL; 311 unsigned char *name = NULL;
312 int namelen; 312 int namelen;
313 loff_t f_pos; 313 loff_t f_pos;
314 loff_t size = udf_ext0_offset(dir) + dir->i_size; 314 loff_t size = udf_ext0_offset(dir) + dir->i_size;
@@ -408,15 +408,6 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
408 } 408 }
409 409
410add: 410add:
411 /* Is there any extent whose size we need to round up? */
412 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
413 elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
414 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
415 epos.offset -= sizeof(struct short_ad);
416 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
417 epos.offset -= sizeof(struct long_ad);
418 udf_write_aext(dir, &epos, &eloc, elen, 1);
419 }
420 f_pos += nfidlen; 411 f_pos += nfidlen;
421 412
422 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && 413 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB &&
@@ -439,6 +430,7 @@ add:
439 udf_current_aext(dir, &epos, &eloc, &elen, 1); 430 udf_current_aext(dir, &epos, &eloc, &elen, 1);
440 } 431 }
441 432
433 /* Entry fits into current block? */
442 if (sb->s_blocksize - fibh->eoffset >= nfidlen) { 434 if (sb->s_blocksize - fibh->eoffset >= nfidlen) {
443 fibh->soffset = fibh->eoffset; 435 fibh->soffset = fibh->eoffset;
444 fibh->eoffset += nfidlen; 436 fibh->eoffset += nfidlen;
@@ -462,6 +454,16 @@ add:
462 (fibh->sbh->b_data + fibh->soffset); 454 (fibh->sbh->b_data + fibh->soffset);
463 } 455 }
464 } else { 456 } else {
457 /* Round up last extent in the file */
458 elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
459 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
460 epos.offset -= sizeof(struct short_ad);
461 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
462 epos.offset -= sizeof(struct long_ad);
463 udf_write_aext(dir, &epos, &eloc, elen, 1);
464 dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize
465 - 1) & ~(sb->s_blocksize - 1);
466
465 fibh->soffset = fibh->eoffset - sb->s_blocksize; 467 fibh->soffset = fibh->eoffset - sb->s_blocksize;
466 fibh->eoffset += nfidlen - sb->s_blocksize; 468 fibh->eoffset += nfidlen - sb->s_blocksize;
467 if (fibh->sbh != fibh->ebh) { 469 if (fibh->sbh != fibh->ebh) {
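The "round up last extent" arithmetic above appears twice in this hunk. As a one-liner helper sketch, assuming power-of-two block sizes as everywhere in this code (round_up_block() is illustrative):

static inline unsigned long round_up_block(unsigned long len, unsigned bs)
{
	return (len + bs - 1) & ~(unsigned long)(bs - 1);
}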
@@ -508,6 +510,20 @@ add:
508 dir->i_size += nfidlen; 510 dir->i_size += nfidlen;
509 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 511 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
510 dinfo->i_lenAlloc += nfidlen; 512 dinfo->i_lenAlloc += nfidlen;
513 else {
514 /* Find the last extent and truncate it to proper size */
515 while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
516 (EXT_RECORDED_ALLOCATED >> 30))
517 ;
518 elen -= dinfo->i_lenExtents - dir->i_size;
519 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
520 epos.offset -= sizeof(struct short_ad);
521 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
522 epos.offset -= sizeof(struct long_ad);
523 udf_write_aext(dir, &epos, &eloc, elen, 1);
524 dinfo->i_lenExtents = dir->i_size;
525 }
526
511 mark_inode_dirty(dir); 527 mark_inode_dirty(dir);
512 goto out_ok; 528 goto out_ok;
513 } else { 529 } else {
@@ -547,6 +563,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
547 int err; 563 int err;
548 struct udf_inode_info *iinfo; 564 struct udf_inode_info *iinfo;
549 565
566 dquot_initialize(dir);
567
550 lock_kernel(); 568 lock_kernel();
551 inode = udf_new_inode(dir, mode, &err); 569 inode = udf_new_inode(dir, mode, &err);
552 if (!inode) { 570 if (!inode) {
@@ -600,6 +618,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
600 if (!old_valid_dev(rdev)) 618 if (!old_valid_dev(rdev))
601 return -EINVAL; 619 return -EINVAL;
602 620
621 dquot_initialize(dir);
622
603 lock_kernel(); 623 lock_kernel();
604 err = -EIO; 624 err = -EIO;
605 inode = udf_new_inode(dir, mode, &err); 625 inode = udf_new_inode(dir, mode, &err);
@@ -646,6 +666,8 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
646 struct udf_inode_info *dinfo = UDF_I(dir); 666 struct udf_inode_info *dinfo = UDF_I(dir);
647 struct udf_inode_info *iinfo; 667 struct udf_inode_info *iinfo;
648 668
669 dquot_initialize(dir);
670
649 lock_kernel(); 671 lock_kernel();
650 err = -EMLINK; 672 err = -EMLINK;
651 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 673 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
@@ -783,6 +805,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
783 struct fileIdentDesc *fi, cfi; 805 struct fileIdentDesc *fi, cfi;
784 struct kernel_lb_addr tloc; 806 struct kernel_lb_addr tloc;
785 807
808 dquot_initialize(dir);
809
786 retval = -ENOENT; 810 retval = -ENOENT;
787 lock_kernel(); 811 lock_kernel();
788 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 812 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -829,6 +853,8 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
829 struct fileIdentDesc cfi; 853 struct fileIdentDesc cfi;
830 struct kernel_lb_addr tloc; 854 struct kernel_lb_addr tloc;
831 855
856 dquot_initialize(dir);
857
832 retval = -ENOENT; 858 retval = -ENOENT;
833 lock_kernel(); 859 lock_kernel();
834 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 860 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -869,20 +895,22 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
869{ 895{
870 struct inode *inode; 896 struct inode *inode;
871 struct pathComponent *pc; 897 struct pathComponent *pc;
872 char *compstart; 898 const char *compstart;
873 struct udf_fileident_bh fibh; 899 struct udf_fileident_bh fibh;
874 struct extent_position epos = {}; 900 struct extent_position epos = {};
875 int eoffset, elen = 0; 901 int eoffset, elen = 0;
876 struct fileIdentDesc *fi; 902 struct fileIdentDesc *fi;
877 struct fileIdentDesc cfi; 903 struct fileIdentDesc cfi;
878 char *ea; 904 uint8_t *ea;
879 int err; 905 int err;
880 int block; 906 int block;
881 char *name = NULL; 907 unsigned char *name = NULL;
882 int namelen; 908 int namelen;
883 struct buffer_head *bh; 909 struct buffer_head *bh;
884 struct udf_inode_info *iinfo; 910 struct udf_inode_info *iinfo;
885 911
912 dquot_initialize(dir);
913
886 lock_kernel(); 914 lock_kernel();
887 inode = udf_new_inode(dir, S_IFLNK, &err); 915 inode = udf_new_inode(dir, S_IFLNK, &err);
888 if (!inode) 916 if (!inode)
@@ -922,7 +950,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
922 block = udf_get_pblock(inode->i_sb, block, 950 block = udf_get_pblock(inode->i_sb, block,
923 iinfo->i_location.partitionReferenceNum, 951 iinfo->i_location.partitionReferenceNum,
924 0); 952 0);
925 epos.bh = udf_tread(inode->i_sb, block); 953 epos.bh = udf_tgetblk(inode->i_sb, block);
926 lock_buffer(epos.bh); 954 lock_buffer(epos.bh);
927 memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); 955 memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize);
928 set_buffer_uptodate(epos.bh); 956 set_buffer_uptodate(epos.bh);
@@ -954,7 +982,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
954 982
955 pc = (struct pathComponent *)(ea + elen); 983 pc = (struct pathComponent *)(ea + elen);
956 984
957 compstart = (char *)symname; 985 compstart = symname;
958 986
959 do { 987 do {
960 symname++; 988 symname++;
@@ -999,6 +1027,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
999 inode->i_size = elen; 1027 inode->i_size = elen;
1000 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1028 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1001 iinfo->i_lenAlloc = inode->i_size; 1029 iinfo->i_lenAlloc = inode->i_size;
1030 else
1031 udf_truncate_tail_extent(inode);
1002 mark_inode_dirty(inode); 1032 mark_inode_dirty(inode);
1003 1033
1004 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1034 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -1051,6 +1081,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1051 int err; 1081 int err;
1052 struct buffer_head *bh; 1082 struct buffer_head *bh;
1053 1083
1084 dquot_initialize(dir);
1085
1054 lock_kernel(); 1086 lock_kernel();
1055 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1087 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1056 unlock_kernel(); 1088 unlock_kernel();
@@ -1113,6 +1145,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1113 struct kernel_lb_addr tloc; 1145 struct kernel_lb_addr tloc;
1114 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1146 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1115 1147
1148 dquot_initialize(old_dir);
1149 dquot_initialize(new_dir);
1150
1116 lock_kernel(); 1151 lock_kernel();
1117 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1152 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1118 if (ofi) { 1153 if (ofi) {
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4b540ee632d5..745eb209be0c 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/slab.h>
28#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
29 28
30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 9d1b8c2e6c45..1e4543cbcd27 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1078,21 +1078,39 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1078 return 0; 1078 return 0;
1079} 1079}
1080 1080
1081static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) 1081static void udf_find_vat_block(struct super_block *sb, int p_index,
1082 int type1_index, sector_t start_block)
1082{ 1083{
1083 struct udf_sb_info *sbi = UDF_SB(sb); 1084 struct udf_sb_info *sbi = UDF_SB(sb);
1084 struct udf_part_map *map = &sbi->s_partmaps[p_index]; 1085 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1086 sector_t vat_block;
1085 struct kernel_lb_addr ino; 1087 struct kernel_lb_addr ino;
1088
1089 /*
1090 * VAT file entry is in the last recorded block. Some broken disks have
1091 * it a few blocks before so try a bit harder...
1092 */
1093 ino.partitionReferenceNum = type1_index;
1094 for (vat_block = start_block;
1095 vat_block >= map->s_partition_root &&
1096 vat_block >= start_block - 3 &&
1097 !sbi->s_vat_inode; vat_block--) {
1098 ino.logicalBlockNum = vat_block - map->s_partition_root;
1099 sbi->s_vat_inode = udf_iget(sb, &ino);
1100 }
1101}
1102
1103static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1104{
1105 struct udf_sb_info *sbi = UDF_SB(sb);
1106 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1086 struct buffer_head *bh = NULL; 1107 struct buffer_head *bh = NULL;
1087 struct udf_inode_info *vati; 1108 struct udf_inode_info *vati;
1088 uint32_t pos; 1109 uint32_t pos;
1089 struct virtualAllocationTable20 *vat20; 1110 struct virtualAllocationTable20 *vat20;
1090 sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; 1111 sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
1091 1112
1092 /* VAT file entry is in the last recorded block */ 1113 udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
1093 ino.partitionReferenceNum = type1_index;
1094 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
1095 sbi->s_vat_inode = udf_iget(sb, &ino);
1096 if (!sbi->s_vat_inode && 1114 if (!sbi->s_vat_inode &&
1097 sbi->s_last_block != blocks - 1) { 1115 sbi->s_last_block != blocks - 1) {
1098 printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" 1116 printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
@@ -1100,9 +1118,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1100 "block of the device (%lu).\n", 1118 "block of the device (%lu).\n",
1101 (unsigned long)sbi->s_last_block, 1119 (unsigned long)sbi->s_last_block,
1102 (unsigned long)blocks - 1); 1120 (unsigned long)blocks - 1);
1103 ino.partitionReferenceNum = type1_index; 1121 udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
1104 ino.logicalBlockNum = blocks - 1 - map->s_partition_root;
1105 sbi->s_vat_inode = udf_iget(sb, &ino);
1106 } 1122 }
1107 if (!sbi->s_vat_inode) 1123 if (!sbi->s_vat_inode)
1108 return 1; 1124 return 1;
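udf_find_vat_block() encodes the probe order described in its comment: start at the advertised last block and walk back at most three blocks, never past the partition root. A standalone sketch of just that loop (try_block() stands in for the udf_iget() attempt):

/* Returns the block whose probe succeeded, or -1 if none did. */
static long find_vat(long start, long root, int (*try_block)(long blk))
{
	long blk;

	for (blk = start; blk >= root && blk >= start - 3; blk--)
		if (try_block(blk))
			return blk;
	return -1;
}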
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index c3265e1385d4..16064787d2b7 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -26,18 +26,17 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/slab.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
33#include "udf_i.h" 32#include "udf_i.h"
34 33
35static void udf_pc_to_char(struct super_block *sb, char *from, int fromlen, 34static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
36 char *to) 35 int fromlen, unsigned char *to)
37{ 36{
38 struct pathComponent *pc; 37 struct pathComponent *pc;
39 int elen = 0; 38 int elen = 0;
40 char *p = to; 39 unsigned char *p = to;
41 40
42 while (elen < fromlen) { 41 while (elen < fromlen) {
43 pc = (struct pathComponent *)(from + elen); 42 pc = (struct pathComponent *)(from + elen);
@@ -75,9 +74,9 @@ static int udf_symlink_filler(struct file *file, struct page *page)
75{ 74{
76 struct inode *inode = page->mapping->host; 75 struct inode *inode = page->mapping->host;
77 struct buffer_head *bh = NULL; 76 struct buffer_head *bh = NULL;
78 char *symlink; 77 unsigned char *symlink;
79 int err = -EIO; 78 int err = -EIO;
80 char *p = kmap(page); 79 unsigned char *p = kmap(page);
81 struct udf_inode_info *iinfo; 80 struct udf_inode_info *iinfo;
82 81
83 lock_kernel(); 82 lock_kernel();
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8d46f4294ee7..4223ac855da9 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -142,7 +142,7 @@ extern void udf_truncate(struct inode *);
142extern void udf_read_inode(struct inode *); 142extern void udf_read_inode(struct inode *);
143extern void udf_delete_inode(struct inode *); 143extern void udf_delete_inode(struct inode *);
144extern void udf_clear_inode(struct inode *); 144extern void udf_clear_inode(struct inode *);
145extern int udf_write_inode(struct inode *, int); 145extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
146extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
147extern int udf_extend_file(struct inode *, struct extent_position *, 147extern int udf_extend_file(struct inode *, struct extent_position *,
148 struct kernel_long_ad *, sector_t); 148 struct kernel_long_ad *, sector_t);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index cefa8c8913e6..d03a90b6ad69 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> /* for memset */ 24#include <linux/string.h> /* for memset */
25#include <linux/nls.h> 25#include <linux/nls.h>
26#include <linux/crc-itu-t.h> 26#include <linux/crc-itu-t.h>
27#include <linux/slab.h>
27 28
28#include "udf_sb.h" 29#include "udf_sb.h"
29 30
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 54c16ec95dff..5cfa4d85ccf2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 85 "bit already cleared for fragment %u", i);
86 } 86 }
87 87
88 vfs_dq_free_block(inode, count); 88 dquot_free_block(inode, count);
89 89
90 90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
@@ -195,7 +195,7 @@ do_more:
195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
197 ufs_clusteracct (sb, ucpi, blkno, 1); 197 ufs_clusteracct (sb, ucpi, blkno, 1);
198 vfs_dq_free_block(inode, uspi->s_fpb); 198 dquot_free_block(inode, uspi->s_fpb);
199 199
200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
201 uspi->cs_total.cs_nbfree++; 201 uspi->cs_total.cs_nbfree++;
@@ -511,6 +511,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
511 struct ufs_cg_private_info * ucpi; 511 struct ufs_cg_private_info * ucpi;
512 struct ufs_cylinder_group * ucg; 512 struct ufs_cylinder_group * ucg;
513 unsigned cgno, fragno, fragoff, count, fragsize, i; 513 unsigned cgno, fragno, fragoff, count, fragsize, i;
514 int ret;
514 515
515 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", 516 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
516 (unsigned long long)fragment, oldcount, newcount); 517 (unsigned long long)fragment, oldcount, newcount);
@@ -556,8 +557,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
556 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 557 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
557 for (i = oldcount; i < newcount; i++) 558 for (i = oldcount; i < newcount; i++)
558 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); 559 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
559 if (vfs_dq_alloc_block(inode, count)) { 560 ret = dquot_alloc_block(inode, count);
560 *err = -EDQUOT; 561 if (ret) {
562 *err = ret;
561 return 0; 563 return 0;
562 } 564 }
563 565
@@ -596,6 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
596 struct ufs_cylinder_group * ucg; 598 struct ufs_cylinder_group * ucg;
597 unsigned oldcg, i, j, k, allocsize; 599 unsigned oldcg, i, j, k, allocsize;
598 u64 result; 600 u64 result;
601 int ret;
599 602
600 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", 603 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
601 inode->i_ino, cgno, (unsigned long long)goal, count); 604 inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -664,7 +667,7 @@ cg_found:
664 for (i = count; i < uspi->s_fpb; i++) 667 for (i = count; i < uspi->s_fpb; i++)
665 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); 668 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
666 i = uspi->s_fpb - count; 669 i = uspi->s_fpb - count;
667 vfs_dq_free_block(inode, i); 670 dquot_free_block(inode, i);
668 671
669 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 672 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
670 uspi->cs_total.cs_nffree += i; 673 uspi->cs_total.cs_nffree += i;
@@ -676,8 +679,9 @@ cg_found:
676 result = ufs_bitmap_search (sb, ucpi, goal, allocsize); 679 result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
677 if (result == INVBLOCK) 680 if (result == INVBLOCK)
678 return 0; 681 return 0;
679 if (vfs_dq_alloc_block(inode, count)) { 682 ret = dquot_alloc_block(inode, count);
680 *err = -EDQUOT; 683 if (ret) {
684 *err = ret;
681 return 0; 685 return 0;
682 } 686 }
683 for (i = 0; i < count; i++) 687 for (i = 0; i < count; i++)
@@ -714,6 +718,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
714 struct ufs_super_block_first * usb1; 718 struct ufs_super_block_first * usb1;
715 struct ufs_cylinder_group * ucg; 719 struct ufs_cylinder_group * ucg;
716 u64 result, blkno; 720 u64 result, blkno;
721 int ret;
717 722
718 UFSD("ENTER, goal %llu\n", (unsigned long long)goal); 723 UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
719 724
@@ -747,8 +752,9 @@ gotit:
747 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
748 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
749 ufs_clusteracct (sb, ucpi, blkno, -1); 754 ufs_clusteracct (sb, ucpi, blkno, -1);
750 if (vfs_dq_alloc_block(inode, uspi->s_fpb)) { 755 ret = dquot_alloc_block(inode, uspi->s_fpb);
751 *err = -EDQUOT; 756 if (ret) {
757 *err = ret;
752 return INVBLOCK; 758 return INVBLOCK;
753 } 759 }
754 760
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 6f671f1ac271..317a0d444f6b 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -31,7 +31,7 @@
31 * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller. 31 * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller.
32 */ 32 */
33static inline int ufs_match(struct super_block *sb, int len, 33static inline int ufs_match(struct super_block *sb, int len,
34 const char * const name, struct ufs_dir_entry * de) 34 const unsigned char *name, struct ufs_dir_entry *de)
35{ 35{
36 if (len != ufs_get_de_namlen(sb, de)) 36 if (len != ufs_get_de_namlen(sb, de))
37 return 0; 37 return 0;
@@ -70,13 +70,13 @@ static inline unsigned long ufs_dir_pages(struct inode *inode)
70 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; 70 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
71} 71}
72 72
73ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry) 73ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
74{ 74{
75 ino_t res = 0; 75 ino_t res = 0;
76 struct ufs_dir_entry *de; 76 struct ufs_dir_entry *de;
77 struct page *page; 77 struct page *page;
78 78
79 de = ufs_find_entry(dir, dentry, &page); 79 de = ufs_find_entry(dir, qstr, &page);
80 if (de) { 80 if (de) {
81 res = fs32_to_cpu(dir->i_sb, de->d_ino); 81 res = fs32_to_cpu(dir->i_sb, de->d_ino);
82 ufs_put_page(page); 82 ufs_put_page(page);
@@ -249,12 +249,12 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
249 * (as a parameter - res_dir). Page is returned mapped and unlocked. 249 * (as a parameter - res_dir). Page is returned mapped and unlocked.
250 * Entry is guaranteed to be valid. 250 * Entry is guaranteed to be valid.
251 */ 251 */
252struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry, 252struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
253 struct page **res_page) 253 struct page **res_page)
254{ 254{
255 struct super_block *sb = dir->i_sb; 255 struct super_block *sb = dir->i_sb;
256 const char *name = dentry->d_name.name; 256 const unsigned char *name = qstr->name;
257 int namelen = dentry->d_name.len; 257 int namelen = qstr->len;
258 unsigned reclen = UFS_DIR_REC_LEN(namelen); 258 unsigned reclen = UFS_DIR_REC_LEN(namelen);
259 unsigned long start, n; 259 unsigned long start, n;
260 unsigned long npages = ufs_dir_pages(dir); 260 unsigned long npages = ufs_dir_pages(dir);
@@ -313,7 +313,7 @@ found:
313int ufs_add_link(struct dentry *dentry, struct inode *inode) 313int ufs_add_link(struct dentry *dentry, struct inode *inode)
314{ 314{
315 struct inode *dir = dentry->d_parent->d_inode; 315 struct inode *dir = dentry->d_parent->d_inode;
316 const char *name = dentry->d_name.name; 316 const unsigned char *name = dentry->d_name.name;
317 int namelen = dentry->d_name.len; 317 int namelen = dentry->d_name.len;
318 struct super_block *sb = dir->i_sb; 318 struct super_block *sb = dir->i_sb;
319 unsigned reclen = UFS_DIR_REC_LEN(namelen); 319 unsigned reclen = UFS_DIR_REC_LEN(namelen);
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 73655c61240a..a8962cecde5b 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,6 +24,7 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
27 28
28#include "ufs_fs.h" 29#include "ufs_fs.h"
29#include "ufs.h" 30#include "ufs.h"
@@ -40,7 +41,7 @@ const struct file_operations ufs_file_operations = {
40 .write = do_sync_write, 41 .write = do_sync_write,
41 .aio_write = generic_file_aio_write, 42 .aio_write = generic_file_aio_write,
42 .mmap = generic_file_mmap, 43 .mmap = generic_file_mmap,
43 .open = generic_file_open, 44 .open = dquot_file_open,
44 .fsync = simple_fsync, 45 .fsync = simple_fsync,
45 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
46}; 47};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3527c00fef0d..230ecf608026 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode)
95 95
96 is_directory = S_ISDIR(inode->i_mode); 96 is_directory = S_ISDIR(inode->i_mode);
97 97
98 vfs_dq_free_inode(inode); 98 dquot_free_inode(inode);
99 vfs_dq_drop(inode); 99 dquot_drop(inode);
100 100
101 clear_inode (inode); 101 clear_inode (inode);
102 102
@@ -355,9 +355,10 @@ cg_found:
355 355
356 unlock_super (sb); 356 unlock_super (sb);
357 357
358 if (vfs_dq_alloc_inode(inode)) { 358 dquot_initialize(inode);
359 vfs_dq_drop(inode); 359 err = dquot_alloc_inode(inode);
360 err = -EDQUOT; 360 if (err) {
361 dquot_drop(inode);
361 goto fail_without_unlock; 362 goto fail_without_unlock;
362 } 363 }
363 364
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7cf33379fd46..80b68c3702d1 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -36,6 +36,8 @@
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h>
40#include <linux/quotaops.h>
39 41
40#include "ufs_fs.h" 42#include "ufs_fs.h"
41#include "ufs.h" 43#include "ufs.h"
@@ -890,11 +892,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
890 return 0; 892 return 0;
891} 893}
892 894
893int ufs_write_inode (struct inode * inode, int wait) 895int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
894{ 896{
895 int ret; 897 int ret;
896 lock_kernel(); 898 lock_kernel();
897 ret = ufs_update_inode (inode, wait); 899 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
898 unlock_kernel(); 900 unlock_kernel();
899 return ret; 901 return ret;
900} 902}
@@ -908,6 +910,9 @@ void ufs_delete_inode (struct inode * inode)
908{ 910{
909 loff_t old_i_size; 911 loff_t old_i_size;
910 912
913 if (!is_bad_inode(inode))
914 dquot_initialize(inode);
915
911 truncate_inode_pages(&inode->i_data, 0); 916 truncate_inode_pages(&inode->i_data, 0);
912 if (is_bad_inode(inode)) 917 if (is_bad_inode(inode))
913 goto no_delete; 918 goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 23119fe7ad62..118556243e7a 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,6 +30,7 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
33 34
34#include "ufs_fs.h" 35#include "ufs_fs.h"
35#include "ufs.h" 36#include "ufs.h"
@@ -56,7 +57,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
56 return ERR_PTR(-ENAMETOOLONG); 57 return ERR_PTR(-ENAMETOOLONG);
57 58
58 lock_kernel(); 59 lock_kernel();
59 ino = ufs_inode_by_name(dir, dentry); 60 ino = ufs_inode_by_name(dir, &dentry->d_name);
60 if (ino) { 61 if (ino) {
61 inode = ufs_iget(dir->i_sb, ino); 62 inode = ufs_iget(dir->i_sb, ino);
62 if (IS_ERR(inode)) { 63 if (IS_ERR(inode)) {
@@ -84,6 +85,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
84 int err; 85 int err;
85 86
86 UFSD("BEGIN\n"); 87 UFSD("BEGIN\n");
88
89 dquot_initialize(dir);
90
87 inode = ufs_new_inode(dir, mode); 91 inode = ufs_new_inode(dir, mode);
88 err = PTR_ERR(inode); 92 err = PTR_ERR(inode);
89 93
@@ -107,6 +111,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
107 111
108 if (!old_valid_dev(rdev)) 112 if (!old_valid_dev(rdev))
109 return -EINVAL; 113 return -EINVAL;
114
115 dquot_initialize(dir);
116
110 inode = ufs_new_inode(dir, mode); 117 inode = ufs_new_inode(dir, mode);
111 err = PTR_ERR(inode); 118 err = PTR_ERR(inode);
112 if (!IS_ERR(inode)) { 119 if (!IS_ERR(inode)) {
@@ -131,6 +138,8 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
131 if (l > sb->s_blocksize) 138 if (l > sb->s_blocksize)
132 goto out_notlocked; 139 goto out_notlocked;
133 140
141 dquot_initialize(dir);
142
134 lock_kernel(); 143 lock_kernel();
135 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
136 err = PTR_ERR(inode); 145 err = PTR_ERR(inode);
@@ -176,6 +185,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
176 return -EMLINK; 185 return -EMLINK;
177 } 186 }
178 187
188 dquot_initialize(dir);
189
179 inode->i_ctime = CURRENT_TIME_SEC; 190 inode->i_ctime = CURRENT_TIME_SEC;
180 inode_inc_link_count(inode); 191 inode_inc_link_count(inode);
181 atomic_inc(&inode->i_count); 192 atomic_inc(&inode->i_count);
@@ -193,6 +204,8 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
193 if (dir->i_nlink >= UFS_LINK_MAX) 204 if (dir->i_nlink >= UFS_LINK_MAX)
194 goto out; 205 goto out;
195 206
207 dquot_initialize(dir);
208
196 lock_kernel(); 209 lock_kernel();
197 inode_inc_link_count(dir); 210 inode_inc_link_count(dir);
198 211
@@ -237,7 +250,9 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
237 struct page *page; 250 struct page *page;
238 int err = -ENOENT; 251 int err = -ENOENT;
239 252
240 de = ufs_find_entry(dir, dentry, &page); 253 dquot_initialize(dir);
254
255 de = ufs_find_entry(dir, &dentry->d_name, &page);
241 if (!de) 256 if (!de)
242 goto out; 257 goto out;
243 258
@@ -281,7 +296,10 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
281 struct ufs_dir_entry *old_de; 296 struct ufs_dir_entry *old_de;
282 int err = -ENOENT; 297 int err = -ENOENT;
283 298
284 old_de = ufs_find_entry(old_dir, old_dentry, &old_page); 299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
285 if (!old_de) 303 if (!old_de)
286 goto out; 304 goto out;
287 305
@@ -301,7 +319,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
301 goto out_dir; 319 goto out_dir;
302 320
303 err = -ENOENT; 321 err = -ENOENT;
304 new_de = ufs_find_entry(new_dir, new_dentry, &new_page); 322 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
305 if (!new_de) 323 if (!new_de)
306 goto out_dir; 324 goto out_dir;
307 inode_inc_link_count(old_inode); 325 inode_inc_link_count(old_inode);
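
The namei.c hunks above combine two changes: directory lookups now take a struct qstr instead of a full dentry, and every operation that can allocate or free blocks calls dquot_initialize() on the affected directory first. A minimal sketch of the resulting pattern; example_dir_op is a hypothetical name, not part of the patch, and error handling is trimmed:

/*
 * Sketch only: quota state is initialized on the directory before any
 * lookup or allocation, and the directory entry is found by qstr
 * rather than by dentry.
 */
static int example_dir_op(struct inode *dir, struct dentry *dentry)
{
	struct ufs_dir_entry *de;
	struct page *page;

	dquot_initialize(dir);		/* must precede block/inode changes */

	de = ufs_find_entry(dir, &dentry->d_name, &page);
	if (!de)
		return -ENOENT;

	/* ... modify or delete the entry; the real callers release
	 * the page via ufs_delete_entry() or ufs_set_link() ... */
	return 0;
}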
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 5faed7954d0a..14743d935a93 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -66,6 +66,7 @@
66 */ 66 */
67 67
68 68
69#include <linux/exportfs.h>
69#include <linux/module.h> 70#include <linux/module.h>
70#include <linux/bitops.h> 71#include <linux/bitops.h>
71 72
@@ -96,6 +97,56 @@
96#include "swab.h" 97#include "swab.h"
97#include "util.h" 98#include "util.h"
98 99
100static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
101{
102 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
103 struct inode *inode;
104
105 if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg)
106 return ERR_PTR(-ESTALE);
107
108 inode = ufs_iget(sb, ino);
109 if (IS_ERR(inode))
110 return ERR_CAST(inode);
111 if (generation && inode->i_generation != generation) {
112 iput(inode);
113 return ERR_PTR(-ESTALE);
114 }
115 return inode;
116}
117
118static struct dentry *ufs_fh_to_dentry(struct super_block *sb, struct fid *fid,
119 int fh_len, int fh_type)
120{
121 return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ufs_nfs_get_inode);
122}
123
124static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid,
125 int fh_len, int fh_type)
126{
127 return generic_fh_to_parent(sb, fid, fh_len, fh_type, ufs_nfs_get_inode);
128}
129
130static struct dentry *ufs_get_parent(struct dentry *child)
131{
132 struct qstr dot_dot = {
133 .name = "..",
134 .len = 2,
135 };
136 ino_t ino;
137
138 ino = ufs_inode_by_name(child->d_inode, &dot_dot);
139 if (!ino)
140 return ERR_PTR(-ENOENT);
141 return d_obtain_alias(ufs_iget(child->d_inode->i_sb, ino));
142}
143
144static const struct export_operations ufs_export_ops = {
145 .fh_to_dentry = ufs_fh_to_dentry,
146 .fh_to_parent = ufs_fh_to_parent,
147 .get_parent = ufs_get_parent,
148};
149
99#ifdef CONFIG_UFS_DEBUG 150#ifdef CONFIG_UFS_DEBUG
100/* 151/*
101 * Print contents of ufs_super_block, useful for debugging 152 * Print contents of ufs_super_block, useful for debugging
@@ -965,6 +1016,9 @@ magic_found:
965 case UFS_FSSTABLE: 1016 case UFS_FSSTABLE:
966 UFSD("fs is stable\n"); 1017 UFSD("fs is stable\n");
967 break; 1018 break;
1019 case UFS_FSLOG:
1020 UFSD("fs is logging fs\n");
1021 break;
968 case UFS_FSOSF1: 1022 case UFS_FSOSF1:
969 UFSD("fs is DEC OSF/1\n"); 1023 UFSD("fs is DEC OSF/1\n");
970 break; 1024 break;
@@ -990,6 +1044,7 @@ magic_found:
990 * Read ufs_super_block into internal data structures 1044 * Read ufs_super_block into internal data structures
991 */ 1045 */
992 sb->s_op = &ufs_super_ops; 1046 sb->s_op = &ufs_super_ops;
1047 sb->s_export_op = &ufs_export_ops;
993 sb->dq_op = NULL; /***/ 1048 sb->dq_op = NULL; /***/
994 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); 1049 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
995 1050
@@ -1380,6 +1435,11 @@ static void destroy_inodecache(void)
1380 kmem_cache_destroy(ufs_inode_cachep); 1435 kmem_cache_destroy(ufs_inode_cachep);
1381} 1436}
1382 1437
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1383#ifdef CONFIG_QUOTA 1443#ifdef CONFIG_QUOTA
1384static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t); 1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1385static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t); 1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
@@ -1390,6 +1450,7 @@ static const struct super_operations ufs_super_ops = {
1390 .destroy_inode = ufs_destroy_inode, 1450 .destroy_inode = ufs_destroy_inode,
1391 .write_inode = ufs_write_inode, 1451 .write_inode = ufs_write_inode,
1392 .delete_inode = ufs_delete_inode, 1452 .delete_inode = ufs_delete_inode,
1453 .clear_inode = ufs_clear_inode,
1393 .put_super = ufs_put_super, 1454 .put_super = ufs_put_super,
1394 .write_super = ufs_write_super, 1455 .write_super = ufs_write_super,
1395 .sync_fs = ufs_sync_fs, 1456 .sync_fs = ufs_sync_fs,
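
The export operations added above are what let knfsd hand out and decode file handles for UFS. As a hedged illustration of the decode path: generic_fh_to_dentry() unpacks the inode number and generation from the handle and calls back into ufs_nfs_get_inode(), roughly as follows (simplified from fs/exportfs; decode_fh_sketch is not part of the patch):

/* Simplified decode path, assuming a FILEID_INO32_GEN-style handle. */
static struct dentry *decode_fh_sketch(struct super_block *sb,
				       struct fid *fid)
{
	struct inode *inode;

	/* fid->i32 carries the 32-bit inode number and generation */
	inode = ufs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen);

	/* d_obtain_alias() copes with ERR_PTR() inodes directly */
	return d_obtain_alias(inode);
}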
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 41dd431ce228..d3b6270cb377 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,6 +44,7 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
47 48
48#include "ufs_fs.h" 49#include "ufs_fs.h"
49#include "ufs.h" 50#include "ufs.h"
@@ -517,9 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
517 if (error) 518 if (error)
518 return error; 519 return error;
519 520
521 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
522 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
523 error = dquot_transfer(inode, attr);
524 if (error)
525 return error;
526 }
520 if (ia_valid & ATTR_SIZE && 527 if (ia_valid & ATTR_SIZE &&
521 attr->ia_size != i_size_read(inode)) { 528 attr->ia_size != i_size_read(inode)) {
522 loff_t old_i_size = inode->i_size; 529 loff_t old_i_size = inode->i_size;
530
531 dquot_initialize(inode);
532
523 error = vmtruncate(inode, attr->ia_size); 533 error = vmtruncate(inode, attr->ia_size);
524 if (error) 534 if (error)
525 return error; 535 return error;
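
The setattr hunk follows the contract of the new dquot interface: an ownership change must transfer the inode's charged usage between quota accounts before the new uid/gid is committed, and a size change needs quotas initialized before blocks are freed. The core of that pattern in isolation (a sketch, error paths trimmed):

/* Quota steps a setattr implementation is expected to take. */
static int setattr_quota_sketch(struct inode *inode, struct iattr *attr)
{
	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		int error = dquot_transfer(inode, attr);
		if (error)
			return error;
	}
	if (attr->ia_valid & ATTR_SIZE)
		dquot_initialize(inode);	/* before freeing blocks */
	return 0;
}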
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 644e77e13599..43f9f5d5670e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -86,9 +86,9 @@ extern void ufs_put_cylinder (struct super_block *, unsigned);
86/* dir.c */ 86/* dir.c */
87extern const struct inode_operations ufs_dir_inode_operations; 87extern const struct inode_operations ufs_dir_inode_operations;
88extern int ufs_add_link (struct dentry *, struct inode *); 88extern int ufs_add_link (struct dentry *, struct inode *);
89extern ino_t ufs_inode_by_name(struct inode *, struct dentry *); 89extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *);
90extern int ufs_make_empty(struct inode *, struct inode *); 90extern int ufs_make_empty(struct inode *, struct inode *);
91extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct dentry *, struct page **); 91extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **);
92extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); 92extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
93extern int ufs_empty_dir (struct inode *); 93extern int ufs_empty_dir (struct inode *);
94extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); 94extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
@@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
106 106
107/* inode.c */ 107/* inode.c */
108extern struct inode *ufs_iget(struct super_block *, unsigned long); 108extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, int); 109extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 110extern int ufs_sync_inode (struct inode *);
111extern void ufs_delete_inode (struct inode *); 111extern void ufs_delete_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); 112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 54bde1895a80..6943ec677c0b 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -138,6 +138,7 @@ typedef __u16 __bitwise __fs16;
138 138
139#define UFS_USEEFT ((__u16)65535) 139#define UFS_USEEFT ((__u16)65535)
140 140
141/* fs_clean values */
141#define UFS_FSOK 0x7c269d38 142#define UFS_FSOK 0x7c269d38
142#define UFS_FSACTIVE ((__s8)0x00) 143#define UFS_FSACTIVE ((__s8)0x00)
143#define UFS_FSCLEAN ((__s8)0x01) 144#define UFS_FSCLEAN ((__s8)0x01)
@@ -145,6 +146,11 @@ typedef __u16 __bitwise __fs16;
145#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */ 146#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */
146#define UFS_FSBAD ((__s8)0xff) 147#define UFS_FSBAD ((__s8)0xff)
147 148
149/* Solaris-specific fs_clean values */
150#define UFS_FSSUSPEND ((__s8)0xfe) /* temporarily suspended */
151#define UFS_FSLOG ((__s8)0xfd) /* logging fs */
152#define UFS_FSFIX ((__s8)0xfc) /* being repaired while mounted */
153
148/* From here to next blank line, s_flags for ufs_sb_info */ 154/* From here to next blank line, s_flags for ufs_sb_info */
149/* directory entry encoding */ 155/* directory entry encoding */
150#define UFS_DE_MASK 0x00000010 /* mask for the following */ 156#define UFS_DE_MASK 0x00000010 /* mask for the following */
@@ -227,11 +233,16 @@ typedef __u16 __bitwise __fs16;
227 */ 233 */
228#define ufs_cbtocylno(bno) \ 234#define ufs_cbtocylno(bno) \
229 ((bno) * uspi->s_nspf / uspi->s_spc) 235 ((bno) * uspi->s_nspf / uspi->s_spc)
230#define ufs_cbtorpos(bno) \ 236#define ufs_cbtorpos(bno) \
237 ((UFS_SB(sb)->s_flags & UFS_CG_SUN) ? \
238 (((((bno) * uspi->s_nspf % uspi->s_spc) % \
239 uspi->s_nsect) * \
240 uspi->s_nrpos) / uspi->s_nsect) \
241 : \
231 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \ 242 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \
232 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \ 243 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \
233 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \ 244 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \
234 * uspi->s_nrpos) / uspi->s_npsect) 245 * uspi->s_nrpos) / uspi->s_npsect))
235 246
236/* 247/*
237 * The following macros optimize certain frequently calculated 248 * The following macros optimize certain frequently calculated
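
The new UFS_CG_SUN branch of ufs_cbtorpos() drops the interleave and track-skew terms, matching how Solaris computes rotational positions. The same arithmetic rewritten as a function for readability (a sketch; the real macro picks up sb and uspi from the enclosing scope):

static inline unsigned
ufs_cbtorpos_sketch(struct ufs_sb_private_info *uspi, unsigned s_flags,
		    unsigned bno)
{
	unsigned secno = bno * uspi->s_nspf % uspi->s_spc;

	if (s_flags & UFS_CG_SUN)	/* Sun: no interleave/skew terms */
		return (secno % uspi->s_nsect) * uspi->s_nrpos /
			uspi->s_nsect;

	return ((secno / uspi->s_nsect * uspi->s_trackskew +
		 secno % uspi->s_nsect * uspi->s_interleave) %
		uspi->s_nsect * uspi->s_nrpos) / uspi->s_npsect;
}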
diff --git a/fs/xattr.c b/fs/xattr.c
index 6d4f6d3449fb..46f87e828b48 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -615,12 +615,11 @@ ssize_t
615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) 615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
616{ 616{
617 struct xattr_handler *handler; 617 struct xattr_handler *handler;
618 struct inode *inode = dentry->d_inode;
619 618
620 handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); 619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
621 if (!handler) 620 if (!handler)
622 return -EOPNOTSUPP; 621 return -EOPNOTSUPP;
623 return handler->get(inode, name, buffer, size); 622 return handler->get(dentry, name, buffer, size, handler->flags);
624} 623}
625 624
626/* 625/*
@@ -630,18 +629,20 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
630ssize_t 629ssize_t
631generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) 630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
632{ 631{
633 struct inode *inode = dentry->d_inode; 632 struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
634 struct xattr_handler *handler, **handlers = inode->i_sb->s_xattr;
635 unsigned int size = 0; 633 unsigned int size = 0;
636 634
637 if (!buffer) { 635 if (!buffer) {
638 for_each_xattr_handler(handlers, handler) 636 for_each_xattr_handler(handlers, handler) {
639 size += handler->list(inode, NULL, 0, NULL, 0); 637 size += handler->list(dentry, NULL, 0, NULL, 0,
638 handler->flags);
639 }
640 } else { 640 } else {
641 char *buf = buffer; 641 char *buf = buffer;
642 642
643 for_each_xattr_handler(handlers, handler) { 643 for_each_xattr_handler(handlers, handler) {
644 size = handler->list(inode, buf, buffer_size, NULL, 0); 644 size = handler->list(dentry, buf, buffer_size,
645 NULL, 0, handler->flags);
645 if (size > buffer_size) 646 if (size > buffer_size)
646 return -ERANGE; 647 return -ERANGE;
647 buf += size; 648 buf += size;
@@ -659,14 +660,13 @@ int
659generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) 660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
660{ 661{
661 struct xattr_handler *handler; 662 struct xattr_handler *handler;
662 struct inode *inode = dentry->d_inode;
663 663
664 if (size == 0) 664 if (size == 0)
665 value = ""; /* empty EA, do not remove */ 665 value = ""; /* empty EA, do not remove */
666 handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); 666 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
667 if (!handler) 667 if (!handler)
668 return -EOPNOTSUPP; 668 return -EOPNOTSUPP;
669 return handler->set(inode, name, value, size, flags); 669 return handler->set(dentry, name, value, size, 0, handler->flags);
670} 670}
671 671
672/* 672/*
@@ -677,12 +677,12 @@ int
677generic_removexattr(struct dentry *dentry, const char *name) 677generic_removexattr(struct dentry *dentry, const char *name)
678{ 678{
679 struct xattr_handler *handler; 679 struct xattr_handler *handler;
680 struct inode *inode = dentry->d_inode;
681 680
682 handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); 681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
683 if (!handler) 682 if (!handler)
684 return -EOPNOTSUPP; 683 return -EOPNOTSUPP;
685 return handler->set(inode, name, NULL, 0, XATTR_REPLACE); 684 return handler->set(dentry, name, NULL, 0,
685 XATTR_REPLACE, handler->flags);
686} 686}
687 687
688EXPORT_SYMBOL(generic_getxattr); 688EXPORT_SYMBOL(generic_getxattr);
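
Under these prototypes an xattr handler no longer digs the inode out itself; it receives the dentry plus its own handler->flags, which lets one callback serve several related attributes. A hypothetical handler under the new signature (the "foo" names are invented for illustration):

static int foo_xattr_get(struct dentry *dentry, const char *name,
			 void *buffer, size_t size, int handler_flags)
{
	struct inode *inode = dentry->d_inode;

	/* handler_flags carries per-handler data, e.g. an ACL type */
	if (!inode)
		return -EINVAL;
	return foo_do_get(inode, name, buffer, size, handler_flags);
}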
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index c6ad7c7e3ee9..8d5a506c82eb 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -6,9 +6,9 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/slab.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include <linux/posix_acl_xattr.h> 10#include <linux/posix_acl_xattr.h>
11#include <linux/gfp.h>
12 12
13 13
14/* 14/*
@@ -36,7 +36,7 @@ posix_acl_from_xattr(const void *value, size_t size)
36 if (count == 0) 36 if (count == 0)
37 return NULL; 37 return NULL;
38 38
39 acl = posix_acl_alloc(count, GFP_KERNEL); 39 acl = posix_acl_alloc(count, GFP_NOFS);
40 if (!acl) 40 if (!acl)
41 return ERR_PTR(-ENOMEM); 41 return ERR_PTR(-ENOMEM);
42 acl_e = acl->a_entries; 42 acl_e = acl->a_entries;
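
The GFP_KERNEL to GFP_NOFS switch matters because posix_acl_from_xattr() can be reached with filesystem locks or a transaction held; a GFP_KERNEL allocation could recurse into filesystem writeback under memory pressure and deadlock. A sketch of the rule for such contexts (illustrative values):

/* When allocating under fs locks, forbid reclaim from re-entering
 * the filesystem: use GFP_NOFS rather than GFP_KERNEL. */
static struct posix_acl *acl_alloc_sketch(int count)
{
	return posix_acl_alloc(count, GFP_NOFS);
}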
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7a59daed1782..b4769e40e8bc 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,7 +16,7 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 -funsigned-char 19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6
20 20
21XFS_LINUX := linux-2.6 21XFS_LINUX := linux-2.6
22 22
@@ -26,6 +26,8 @@ endif
26 26
27obj-$(CONFIG_XFS_FS) += xfs.o 27obj-$(CONFIG_XFS_FS) += xfs.o
28 28
29xfs-y += linux-2.6/xfs_trace.o
30
29xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ 31xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
30 xfs_dquot.o \ 32 xfs_dquot.o \
31 xfs_dquot_item.o \ 33 xfs_dquot_item.o \
@@ -90,8 +92,7 @@ xfs-y += xfs_alloc.o \
90 xfs_rw.o \ 92 xfs_rw.o \
91 xfs_dmops.o 93 xfs_dmops.o
92 94
93xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \ 95xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o
94 xfs_dir2_trace.o
95 96
96# Objects in linux/ 97# Objects in linux/
97xfs-y += $(addprefix $(XFS_LINUX)/, \ 98xfs-y += $(addprefix $(XFS_LINUX)/, \
@@ -104,7 +105,6 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
104 xfs_globals.o \ 105 xfs_globals.o \
105 xfs_ioctl.o \ 106 xfs_ioctl.o \
106 xfs_iops.o \ 107 xfs_iops.o \
107 xfs_lrw.o \
108 xfs_super.o \ 108 xfs_super.o \
109 xfs_sync.o \ 109 xfs_sync.o \
110 xfs_xattr.o) 110 xfs_xattr.o)
@@ -113,6 +113,3 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
113xfs-y += $(addprefix support/, \ 113xfs-y += $(addprefix support/, \
114 debug.o \ 114 debug.o \
115 uuid.o) 115 uuid.o)
116
117xfs-$(CONFIG_XFS_TRACE) += support/ktrace.o
118
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 2d3f90afe5f1..666c9db48eb6 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -16,16 +16,33 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/vmalloc.h>
20#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/slab.h>
21#include <linux/swap.h> 21#include <linux/swap.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h" 24#include "time.h"
25#include "kmem.h" 25#include "kmem.h"
26 26
27#define MAX_VMALLOCS 6 27/*
28#define MAX_SLAB_SIZE 0x20000 28 * Greedy allocation. May fail and may return vmalloced memory.
29 *
30 * Must be freed using kmem_free_large.
31 */
32void *
33kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
34{
35 void *ptr;
36 size_t kmsize = maxsize;
37
38 while (!(ptr = kmem_zalloc_large(kmsize))) {
39 if ((kmsize >>= 1) <= minsize)
40 kmsize = minsize;
41 }
42 if (ptr)
43 *size = kmsize;
44 return ptr;
45}
29 46
30void * 47void *
31kmem_alloc(size_t size, unsigned int __nocast flags) 48kmem_alloc(size_t size, unsigned int __nocast flags)
@@ -34,19 +51,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
34 gfp_t lflags = kmem_flags_convert(flags); 51 gfp_t lflags = kmem_flags_convert(flags);
35 void *ptr; 52 void *ptr;
36 53
37#ifdef DEBUG
38 if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
39 printk(KERN_WARNING "Large %s attempt, size=%ld\n",
40 __func__, (long)size);
41 dump_stack();
42 }
43#endif
44
45 do { 54 do {
46 if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) 55 ptr = kmalloc(size, lflags);
47 ptr = kmalloc(size, lflags);
48 else
49 ptr = __vmalloc(size, lflags, PAGE_KERNEL);
50 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 56 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
51 return ptr; 57 return ptr;
52 if (!(++retries % 100)) 58 if (!(++retries % 100))
@@ -68,27 +74,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
68 return ptr; 74 return ptr;
69} 75}
70 76
71void *
72kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
73 unsigned int __nocast flags)
74{
75 void *ptr;
76 size_t kmsize = maxsize;
77 unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP;
78
79 while (!(ptr = kmem_zalloc(kmsize, kmflags))) {
80 if ((kmsize <= minsize) && (flags & KM_NOSLEEP))
81 break;
82 if ((kmsize >>= 1) <= minsize) {
83 kmsize = minsize;
84 kmflags = flags;
85 }
86 }
87 if (ptr)
88 *size = kmsize;
89 return ptr;
90}
91
92void 77void
93kmem_free(const void *ptr) 78kmem_free(const void *ptr)
94{ 79{
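
As rewritten above, kmem_zalloc_greedy() always sleeps and always draws from kmem_zalloc_large(), halving the request until it reaches minsize and then retrying at minsize; the loop can only exit with a successful allocation, so the trailing NULL check is defensive. An illustrative caller (sizes are examples and use_buffer() is a made-up consumer):

static void greedy_example(void)
{
	size_t size;
	void *buf;

	/* ask for up to 16 pages, accept as little as one */
	buf = kmem_zalloc_greedy(&size, PAGE_SIZE, 16 * PAGE_SIZE);

	use_buffer(buf, size);		/* size holds the length obtained */
	kmem_free_large(buf);		/* vmalloc-backed: not kmem_free() */
}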
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 179cbd630f69..f7c8f7a9ea6d 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/vmalloc.h>
24 25
25/* 26/*
26 * General memory allocation interfaces 27 * General memory allocation interfaces
@@ -30,7 +31,6 @@
30#define KM_NOSLEEP 0x0002u 31#define KM_NOSLEEP 0x0002u
31#define KM_NOFS 0x0004u 32#define KM_NOFS 0x0004u
32#define KM_MAYFAIL 0x0008u 33#define KM_MAYFAIL 0x0008u
33#define KM_LARGE 0x0010u
34 34
35/* 35/*
36 * We use a special process flag to avoid recursive callbacks into 36 * We use a special process flag to avoid recursive callbacks into
@@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
42{ 42{
43 gfp_t lflags; 43 gfp_t lflags;
44 44
45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE)); 45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
46 46
47 if (flags & KM_NOSLEEP) { 47 if (flags & KM_NOSLEEP) {
48 lflags = GFP_ATOMIC | __GFP_NOWARN; 48 lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags)
56 56
57extern void *kmem_alloc(size_t, unsigned int __nocast); 57extern void *kmem_alloc(size_t, unsigned int __nocast);
58extern void *kmem_zalloc(size_t, unsigned int __nocast); 58extern void *kmem_zalloc(size_t, unsigned int __nocast);
59extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
60extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); 59extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
61extern void kmem_free(const void *); 60extern void kmem_free(const void *);
62 61
62static inline void *kmem_zalloc_large(size_t size)
63{
64 void *ptr;
65
66 ptr = vmalloc(size);
67 if (ptr)
68 memset(ptr, 0, size);
69 return ptr;
70}
71static inline void kmem_free_large(void *ptr)
72{
73 vfree(ptr);
74}
75
76extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
77
63/* 78/*
64 * Zone interfaces 79 * Zone interfaces
65 */ 80 */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b23a54506446..a7bc925c4d60 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -21,6 +21,8 @@
21#include "xfs_bmap_btree.h" 21#include "xfs_bmap_btree.h"
22#include "xfs_inode.h" 22#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 23#include "xfs_vnodeops.h"
24#include "xfs_trace.h"
25#include <linux/slab.h>
24#include <linux/xattr.h> 26#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h> 27#include <linux/posix_acl_xattr.h>
26 28
@@ -105,7 +107,7 @@ xfs_get_acl(struct inode *inode, int type)
105 struct posix_acl *acl; 107 struct posix_acl *acl;
106 struct xfs_acl *xfs_acl; 108 struct xfs_acl *xfs_acl;
107 int len = sizeof(struct xfs_acl); 109 int len = sizeof(struct xfs_acl);
108 char *ea_name; 110 unsigned char *ea_name;
109 int error; 111 int error;
110 112
111 acl = get_cached_acl(inode, type); 113 acl = get_cached_acl(inode, type);
@@ -132,7 +134,8 @@ xfs_get_acl(struct inode *inode, int type)
132 if (!xfs_acl) 134 if (!xfs_acl)
133 return ERR_PTR(-ENOMEM); 135 return ERR_PTR(-ENOMEM);
134 136
135 error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT); 137 error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
138 &len, ATTR_ROOT);
136 if (error) { 139 if (error) {
137 /* 140 /*
138 * If the attribute doesn't exist make sure we have a negative 141 * If the attribute doesn't exist make sure we have a negative
@@ -161,7 +164,7 @@ STATIC int
161xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) 164xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
162{ 165{
163 struct xfs_inode *ip = XFS_I(inode); 166 struct xfs_inode *ip = XFS_I(inode);
164 char *ea_name; 167 unsigned char *ea_name;
165 int error; 168 int error;
166 169
167 if (S_ISLNK(inode->i_mode)) 170 if (S_ISLNK(inode->i_mode))
@@ -193,7 +196,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
193 (sizeof(struct xfs_acl_entry) * 196 (sizeof(struct xfs_acl_entry) *
194 (XFS_ACL_MAX_ENTRIES - acl->a_count)); 197 (XFS_ACL_MAX_ENTRIES - acl->a_count));
195 198
196 error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl, 199 error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
197 len, ATTR_ROOT); 200 len, ATTR_ROOT);
198 201
199 kfree(xfs_acl); 202 kfree(xfs_acl);
@@ -250,8 +253,9 @@ xfs_set_mode(struct inode *inode, mode_t mode)
250 if (mode != inode->i_mode) { 253 if (mode != inode->i_mode) {
251 struct iattr iattr; 254 struct iattr iattr;
252 255
253 iattr.ia_valid = ATTR_MODE; 256 iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
254 iattr.ia_mode = mode; 257 iattr.ia_mode = mode;
258 iattr.ia_ctime = current_fs_time(inode->i_sb);
255 259
256 error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL); 260 error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
257 } 261 }
@@ -260,7 +264,7 @@ xfs_set_mode(struct inode *inode, mode_t mode)
260} 264}
261 265
262static int 266static int
263xfs_acl_exists(struct inode *inode, char *name) 267xfs_acl_exists(struct inode *inode, unsigned char *name)
264{ 268{
265 int len = sizeof(struct xfs_acl); 269 int len = sizeof(struct xfs_acl);
266 270
@@ -353,37 +357,14 @@ xfs_acl_chmod(struct inode *inode)
353 return error; 357 return error;
354} 358}
355 359
356/*
357 * System xattr handlers.
358 *
359 * Currently Posix ACLs are the only system namespace extended attribute
360 * handlers supported by XFS, so we just implement the handlers here.
361 * If we ever support other system extended attributes this will need
362 * some refactoring.
363 */
364
365static int 360static int
366xfs_decode_acl(const char *name) 361xfs_xattr_acl_get(struct dentry *dentry, const char *name,
367{ 362 void *value, size_t size, int type)
368 if (strcmp(name, "posix_acl_access") == 0)
369 return ACL_TYPE_ACCESS;
370 else if (strcmp(name, "posix_acl_default") == 0)
371 return ACL_TYPE_DEFAULT;
372 return -EINVAL;
373}
374
375static int
376xfs_xattr_system_get(struct inode *inode, const char *name,
377 void *value, size_t size)
378{ 363{
379 struct posix_acl *acl; 364 struct posix_acl *acl;
380 int type, error; 365 int error;
381
382 type = xfs_decode_acl(name);
383 if (type < 0)
384 return type;
385 366
386 acl = xfs_get_acl(inode, type); 367 acl = xfs_get_acl(dentry->d_inode, type);
387 if (IS_ERR(acl)) 368 if (IS_ERR(acl))
388 return PTR_ERR(acl); 369 return PTR_ERR(acl);
389 if (acl == NULL) 370 if (acl == NULL)
@@ -396,15 +377,13 @@ xfs_xattr_system_get(struct inode *inode, const char *name,
396} 377}
397 378
398static int 379static int
399xfs_xattr_system_set(struct inode *inode, const char *name, 380xfs_xattr_acl_set(struct dentry *dentry, const char *name,
400 const void *value, size_t size, int flags) 381 const void *value, size_t size, int flags, int type)
401{ 382{
383 struct inode *inode = dentry->d_inode;
402 struct posix_acl *acl = NULL; 384 struct posix_acl *acl = NULL;
403 int error = 0, type; 385 int error = 0;
404 386
405 type = xfs_decode_acl(name);
406 if (type < 0)
407 return type;
408 if (flags & XATTR_CREATE) 387 if (flags & XATTR_CREATE)
409 return -EINVAL; 388 return -EINVAL;
410 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) 389 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
@@ -461,8 +440,16 @@ xfs_xattr_system_set(struct inode *inode, const char *name,
461 return error; 440 return error;
462} 441}
463 442
464struct xattr_handler xfs_xattr_system_handler = { 443struct xattr_handler xfs_xattr_acl_access_handler = {
465 .prefix = XATTR_SYSTEM_PREFIX, 444 .prefix = POSIX_ACL_XATTR_ACCESS,
466 .get = xfs_xattr_system_get, 445 .flags = ACL_TYPE_ACCESS,
467 .set = xfs_xattr_system_set, 446 .get = xfs_xattr_acl_get,
447 .set = xfs_xattr_acl_set,
448};
449
450struct xattr_handler xfs_xattr_acl_default_handler = {
451 .prefix = POSIX_ACL_XATTR_DEFAULT,
452 .flags = ACL_TYPE_DEFAULT,
453 .get = xfs_xattr_acl_get,
454 .set = xfs_xattr_acl_set,
468}; 455};
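
With the generic code forwarding handler->flags, the two handlers above share one get/set pair and the name-based xfs_decode_acl() becomes unnecessary. How the dispatch reaches them, in sketch form (simplified from generic_getxattr() in fs/xattr.c; not new code in this patch):

/* The handler is matched on its prefix and its .flags field (the ACL
 * type) arrives at xfs_xattr_acl_get() as the final argument. */
static int acl_get_dispatch_sketch(struct dentry *dentry, const char *name,
				   void *buf, size_t size)
{
	struct xattr_handler *h;

	h = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
	if (!h)
		return -EOPNOTSUPP;
	return h->get(dentry, name, buf, size, h->flags);
}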
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c2e30eea74dc..0f8b9968a803 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,6 +38,9 @@
38#include "xfs_rw.h" 38#include "xfs_rw.h"
39#include "xfs_iomap.h" 39#include "xfs_iomap.h"
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h"
42#include "xfs_bmap.h"
43#include <linux/gfp.h>
41#include <linux/mpage.h> 44#include <linux/mpage.h>
42#include <linux/pagevec.h> 45#include <linux/pagevec.h>
43#include <linux/writeback.h> 46#include <linux/writeback.h>
@@ -76,7 +79,7 @@ xfs_ioend_wake(
76 wake_up(to_ioend_wq(ip)); 79 wake_up(to_ioend_wq(ip));
77} 80}
78 81
79STATIC void 82void
80xfs_count_page_state( 83xfs_count_page_state(
81 struct page *page, 84 struct page *page,
82 int *delalloc, 85 int *delalloc,
@@ -98,48 +101,6 @@ xfs_count_page_state(
98 } while ((bh = bh->b_this_page) != head); 101 } while ((bh = bh->b_this_page) != head);
99} 102}
100 103
101#if defined(XFS_RW_TRACE)
102void
103xfs_page_trace(
104 int tag,
105 struct inode *inode,
106 struct page *page,
107 unsigned long pgoff)
108{
109 xfs_inode_t *ip;
110 loff_t isize = i_size_read(inode);
111 loff_t offset = page_offset(page);
112 int delalloc = -1, unmapped = -1, unwritten = -1;
113
114 if (page_has_buffers(page))
115 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
116
117 ip = XFS_I(inode);
118 if (!ip->i_rwtrace)
119 return;
120
121 ktrace_enter(ip->i_rwtrace,
122 (void *)((unsigned long)tag),
123 (void *)ip,
124 (void *)inode,
125 (void *)page,
126 (void *)pgoff,
127 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
128 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
129 (void *)((unsigned long)((isize >> 32) & 0xffffffff)),
130 (void *)((unsigned long)(isize & 0xffffffff)),
131 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
132 (void *)((unsigned long)(offset & 0xffffffff)),
133 (void *)((unsigned long)delalloc),
134 (void *)((unsigned long)unmapped),
135 (void *)((unsigned long)unwritten),
136 (void *)((unsigned long)current_pid()),
137 (void *)NULL);
138}
139#else
140#define xfs_page_trace(tag, inode, page, pgoff)
141#endif
142
143STATIC struct block_device * 104STATIC struct block_device *
144xfs_find_bdev_for_inode( 105xfs_find_bdev_for_inode(
145 struct xfs_inode *ip) 106 struct xfs_inode *ip)
@@ -204,14 +165,17 @@ xfs_ioend_new_eof(
204} 165}
205 166
206/* 167/*
207 * Update on-disk file size now that data has been written to disk. 168 * Update on-disk file size now that data has been written to disk. The
208 * The current in-memory file size is i_size. If a write is beyond 169 * current in-memory file size is i_size. If a write is beyond eof i_new_size
209 * eof i_new_size will be the intended file size until i_size is 170 * will be the intended file size until i_size is updated. If this write does
210 * updated. If this write does not extend all the way to the valid 171 * not extend all the way to the valid file size then restrict this update to
211 * file size then restrict this update to the end of the write. 172 * the end of the write.
173 *
174 * This function does not block as blocking on the inode lock in IO completion
175 * can lead to IO completion order dependency deadlocks. If it can't get the
176 * inode ilock it will return EAGAIN. Callers must handle this.
212 */ 177 */
213 178STATIC int
214STATIC void
215xfs_setfilesize( 179xfs_setfilesize(
216 xfs_ioend_t *ioend) 180 xfs_ioend_t *ioend)
217{ 181{
@@ -222,85 +186,19 @@ xfs_setfilesize(
222 ASSERT(ioend->io_type != IOMAP_READ); 186 ASSERT(ioend->io_type != IOMAP_READ);
223 187
224 if (unlikely(ioend->io_error)) 188 if (unlikely(ioend->io_error))
225 return; 189 return 0;
190
191 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
192 return EAGAIN;
226 193
227 xfs_ilock(ip, XFS_ILOCK_EXCL);
228 isize = xfs_ioend_new_eof(ioend); 194 isize = xfs_ioend_new_eof(ioend);
229 if (isize) { 195 if (isize) {
230 ip->i_d.di_size = isize; 196 ip->i_d.di_size = isize;
231 xfs_mark_inode_dirty_sync(ip); 197 xfs_mark_inode_dirty(ip);
232 } 198 }
233 199
234 xfs_iunlock(ip, XFS_ILOCK_EXCL); 200 xfs_iunlock(ip, XFS_ILOCK_EXCL);
235} 201 return 0;
236
237/*
238 * Buffered IO write completion for delayed allocate extents.
239 */
240STATIC void
241xfs_end_bio_delalloc(
242 struct work_struct *work)
243{
244 xfs_ioend_t *ioend =
245 container_of(work, xfs_ioend_t, io_work);
246
247 xfs_setfilesize(ioend);
248 xfs_destroy_ioend(ioend);
249}
250
251/*
252 * Buffered IO write completion for regular, written extents.
253 */
254STATIC void
255xfs_end_bio_written(
256 struct work_struct *work)
257{
258 xfs_ioend_t *ioend =
259 container_of(work, xfs_ioend_t, io_work);
260
261 xfs_setfilesize(ioend);
262 xfs_destroy_ioend(ioend);
263}
264
265/*
266 * IO write completion for unwritten extents.
267 *
268 * Issue transactions to convert a buffer range from unwritten
269 * to written extents.
270 */
271STATIC void
272xfs_end_bio_unwritten(
273 struct work_struct *work)
274{
275 xfs_ioend_t *ioend =
276 container_of(work, xfs_ioend_t, io_work);
277 struct xfs_inode *ip = XFS_I(ioend->io_inode);
278 xfs_off_t offset = ioend->io_offset;
279 size_t size = ioend->io_size;
280
281 if (likely(!ioend->io_error)) {
282 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
283 int error;
284 error = xfs_iomap_write_unwritten(ip, offset, size);
285 if (error)
286 ioend->io_error = error;
287 }
288 xfs_setfilesize(ioend);
289 }
290 xfs_destroy_ioend(ioend);
291}
292
293/*
294 * IO read completion for regular, written extents.
295 */
296STATIC void
297xfs_end_bio_read(
298 struct work_struct *work)
299{
300 xfs_ioend_t *ioend =
301 container_of(work, xfs_ioend_t, io_work);
302
303 xfs_destroy_ioend(ioend);
304} 202}
305 203
306/* 204/*
@@ -314,10 +212,10 @@ xfs_finish_ioend(
314 int wait) 212 int wait)
315{ 213{
316 if (atomic_dec_and_test(&ioend->io_remaining)) { 214 if (atomic_dec_and_test(&ioend->io_remaining)) {
317 struct workqueue_struct *wq = xfsdatad_workqueue; 215 struct workqueue_struct *wq;
318 if (ioend->io_work.func == xfs_end_bio_unwritten)
319 wq = xfsconvertd_workqueue;
320 216
217 wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
218 xfsconvertd_workqueue : xfsdatad_workqueue;
321 queue_work(wq, &ioend->io_work); 219 queue_work(wq, &ioend->io_work);
322 if (wait) 220 if (wait)
323 flush_workqueue(wq); 221 flush_workqueue(wq);
@@ -325,6 +223,53 @@ xfs_finish_ioend(
325} 223}
326 224
327/* 225/*
226 * IO write completion.
227 */
228STATIC void
229xfs_end_io(
230 struct work_struct *work)
231{
232 xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
233 struct xfs_inode *ip = XFS_I(ioend->io_inode);
234 int error = 0;
235
236 /*
237 * For unwritten extents we need to issue transactions to convert a
238 * range to normal written extents after the data I/O has finished.
239 */
240 if (ioend->io_type == IOMAP_UNWRITTEN &&
241 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
242
243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
244 ioend->io_size);
245 if (error)
246 ioend->io_error = error;
247 }
248
249 /*
250 * We might have to update the on-disk file size after extending
251 * writes.
252 */
253 if (ioend->io_type != IOMAP_READ) {
254 error = xfs_setfilesize(ioend);
255 ASSERT(!error || error == EAGAIN);
256 }
257
258 /*
259 * If we didn't complete processing of the ioend, requeue it to the
260 * tail of the workqueue for another attempt later. Otherwise destroy
261 * it.
262 */
263 if (error == EAGAIN) {
264 atomic_inc(&ioend->io_remaining);
265 xfs_finish_ioend(ioend, 0);
266 /* ensure we don't spin on blocked ioends */
267 delay(1);
268 } else
269 xfs_destroy_ioend(ioend);
270}
271
272/*
328 * Allocate and initialise an IO completion structure. 273 * Allocate and initialise an IO completion structure.
329 * We need to track unwritten extent write completion here initially. 274 * We need to track unwritten extent write completion here initially.
330 * We'll need to extend this for updating the ondisk inode size later 275 * We'll need to extend this for updating the ondisk inode size later
@@ -355,15 +300,7 @@ xfs_alloc_ioend(
355 ioend->io_offset = 0; 300 ioend->io_offset = 0;
356 ioend->io_size = 0; 301 ioend->io_size = 0;
357 302
358 if (type == IOMAP_UNWRITTEN) 303 INIT_WORK(&ioend->io_work, xfs_end_io);
359 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
360 else if (type == IOMAP_DELAY)
361 INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
362 else if (type == IOMAP_READ)
363 INIT_WORK(&ioend->io_work, xfs_end_bio_read);
364 else
365 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
366
367 return ioend; 304 return ioend;
368} 305}
369 306
@@ -380,7 +317,7 @@ xfs_map_blocks(
380 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); 317 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
381} 318}
382 319
383STATIC_INLINE int 320STATIC int
384xfs_iomap_valid( 321xfs_iomap_valid(
385 xfs_iomap_t *iomapp, 322 xfs_iomap_t *iomapp,
386 loff_t offset) 323 loff_t offset)
@@ -412,8 +349,9 @@ xfs_end_bio(
412 349
413STATIC void 350STATIC void
414xfs_submit_ioend_bio( 351xfs_submit_ioend_bio(
415 xfs_ioend_t *ioend, 352 struct writeback_control *wbc,
416 struct bio *bio) 353 xfs_ioend_t *ioend,
354 struct bio *bio)
417{ 355{
418 atomic_inc(&ioend->io_remaining); 356 atomic_inc(&ioend->io_remaining);
419 bio->bi_private = ioend; 357 bio->bi_private = ioend;
@@ -424,9 +362,10 @@ xfs_submit_ioend_bio(
424 * but don't update the inode size until I/O completion. 362 * but don't update the inode size until I/O completion.
425 */ 363 */
426 if (xfs_ioend_new_eof(ioend)) 364 if (xfs_ioend_new_eof(ioend))
427 xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); 365 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
428 366
429 submit_bio(WRITE, bio); 367 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
368 WRITE_SYNC_PLUG : WRITE, bio);
430 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); 369 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
431 bio_put(bio); 370 bio_put(bio);
432} 371}
@@ -505,6 +444,7 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
505 */ 444 */
506STATIC void 445STATIC void
507xfs_submit_ioend( 446xfs_submit_ioend(
447 struct writeback_control *wbc,
508 xfs_ioend_t *ioend) 448 xfs_ioend_t *ioend)
509{ 449{
510 xfs_ioend_t *head = ioend; 450 xfs_ioend_t *head = ioend;
@@ -533,19 +473,19 @@ xfs_submit_ioend(
533 retry: 473 retry:
534 bio = xfs_alloc_ioend_bio(bh); 474 bio = xfs_alloc_ioend_bio(bh);
535 } else if (bh->b_blocknr != lastblock + 1) { 475 } else if (bh->b_blocknr != lastblock + 1) {
536 xfs_submit_ioend_bio(ioend, bio); 476 xfs_submit_ioend_bio(wbc, ioend, bio);
537 goto retry; 477 goto retry;
538 } 478 }
539 479
540 if (bio_add_buffer(bio, bh) != bh->b_size) { 480 if (bio_add_buffer(bio, bh) != bh->b_size) {
541 xfs_submit_ioend_bio(ioend, bio); 481 xfs_submit_ioend_bio(wbc, ioend, bio);
542 goto retry; 482 goto retry;
543 } 483 }
544 484
545 lastblock = bh->b_blocknr; 485 lastblock = bh->b_blocknr;
546 } 486 }
547 if (bio) 487 if (bio)
548 xfs_submit_ioend_bio(ioend, bio); 488 xfs_submit_ioend_bio(wbc, ioend, bio);
549 xfs_finish_ioend(ioend, 0); 489 xfs_finish_ioend(ioend, 0);
550 } while ((ioend = next) != NULL); 490 } while ((ioend = next) != NULL);
551} 491}
@@ -904,16 +844,9 @@ xfs_convert_page(
904 844
905 if (startio) { 845 if (startio) {
906 if (count) { 846 if (count) {
907 struct backing_dev_info *bdi;
908
909 bdi = inode->i_mapping->backing_dev_info;
910 wbc->nr_to_write--; 847 wbc->nr_to_write--;
911 if (bdi_write_congested(bdi)) { 848 if (wbc->nr_to_write <= 0)
912 wbc->encountered_congestion = 1;
913 done = 1;
914 } else if (wbc->nr_to_write <= 0) {
915 done = 1; 849 done = 1;
916 }
917 } 850 }
918 xfs_start_page_writeback(page, !page_dirty, count); 851 xfs_start_page_writeback(page, !page_dirty, count);
919 } 852 }
@@ -962,6 +895,125 @@ xfs_cluster_write(
962 } 895 }
963} 896}
964 897
898STATIC void
899xfs_vm_invalidatepage(
900 struct page *page,
901 unsigned long offset)
902{
903 trace_xfs_invalidatepage(page->mapping->host, page, offset);
904 block_invalidatepage(page, offset);
905}
906
907/*
908 * If the page has delalloc buffers on it, we need to punch them out before we
909 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
910 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
911 * is done on that same region - the delalloc extent is returned when none is
912 * supposed to be there.
913 *
914 * We prevent this by truncating away the delalloc regions on the page before
915 * invalidating it. Because they are delalloc, we can do this without needing a
916 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
917 * truncation without a transaction as there is no space left for block
918 * reservation (typically why we see an ENOSPC in writeback).
919 *
920 * This is not a performance critical path, so for now just do the punching a
921 * buffer head at a time.
922 */
923STATIC void
924xfs_aops_discard_page(
925 struct page *page)
926{
927 struct inode *inode = page->mapping->host;
928 struct xfs_inode *ip = XFS_I(inode);
929 struct buffer_head *bh, *head;
930 loff_t offset = page_offset(page);
931 ssize_t len = 1 << inode->i_blkbits;
932
933 if (!xfs_is_delayed_page(page, IOMAP_DELAY))
934 goto out_invalidate;
935
936 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
937 goto out_invalidate;
938
939 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
940 "page discard on page %p, inode 0x%llx, offset %llu.",
941 page, ip->i_ino, offset);
942
943 xfs_ilock(ip, XFS_ILOCK_EXCL);
944 bh = head = page_buffers(page);
945 do {
946 int done;
947 xfs_fileoff_t offset_fsb;
948 xfs_bmbt_irec_t imap;
949 int nimaps = 1;
950 int error;
951 xfs_fsblock_t firstblock;
952 xfs_bmap_free_t flist;
953
954 if (!buffer_delay(bh))
955 goto next_buffer;
956
957 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
958
959 /*
960 * Map the range first and check that it is a delalloc extent
961 * before trying to unmap the range. Otherwise we will be
962 * trying to remove a real extent (which requires a
963 * transaction) or a hole, which is probably a bad idea...
964 */
965 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
966 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
967 &nimaps, NULL, NULL);
968
969 if (error) {
970 /* something screwed, just bail */
971 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
972 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
973 "page discard failed delalloc mapping lookup.");
974 }
975 break;
976 }
977 if (!nimaps) {
978 /* nothing there */
979 goto next_buffer;
980 }
981 if (imap.br_startblock != DELAYSTARTBLOCK) {
982 /* been converted, ignore */
983 goto next_buffer;
984 }
985 WARN_ON(imap.br_blockcount == 0);
986
987 /*
988 * Note: while we initialise the firstblock/flist pair, they
989 * should never be used because blocks should never be
990 * allocated or freed for a delalloc extent, and hence we don't
991 * need to cancel or finish them after the xfs_bunmapi() call.
992 */
993 xfs_bmap_init(&flist, &firstblock);
994 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
995 &flist, NULL, &done);
996
997 ASSERT(!flist.xbf_count && !flist.xbf_first);
998 if (error) {
999 /* something screwed, just bail */
1000 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1001 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
1002 "page discard unable to remove delalloc mapping.");
1003 }
1004 break;
1005 }
1006next_buffer:
1007 offset += len;
1008
1009 } while ((bh = bh->b_this_page) != head);
1010
1011 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1012out_invalidate:
1013 xfs_vm_invalidatepage(page, 0);
1014 return;
1015}
1016
965/* 1017/*
966 * Calling this without startio set means we are being asked to make a dirty 1018 * Calling this without startio set means we are being asked to make a dirty
967 * page ready for freeing its buffers. When called with startio set then 1019 * page ready for freeing its buffers. When called with startio set then
@@ -1198,7 +1250,7 @@ xfs_page_state_convert(
1198 } 1250 }
1199 1251
1200 if (iohead) 1252 if (iohead)
1201 xfs_submit_ioend(iohead); 1253 xfs_submit_ioend(wbc, iohead);
1202 1254
1203 return page_dirty; 1255 return page_dirty;
1204 1256
@@ -1213,7 +1265,7 @@ error:
1213 */ 1265 */
1214 if (err != -EAGAIN) { 1266 if (err != -EAGAIN) {
1215 if (!unmapped) 1267 if (!unmapped)
1216 block_invalidatepage(page, 0); 1268 xfs_aops_discard_page(page);
1217 ClearPageUptodate(page); 1269 ClearPageUptodate(page);
1218 } 1270 }
1219 return err; 1271 return err;
@@ -1249,7 +1301,7 @@ xfs_vm_writepage(
1249 int delalloc, unmapped, unwritten; 1301 int delalloc, unmapped, unwritten;
1250 struct inode *inode = page->mapping->host; 1302 struct inode *inode = page->mapping->host;
1251 1303
1252 xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0); 1304 trace_xfs_writepage(inode, page, 0);
1253 1305
1254 /* 1306 /*
1255 * We need a transaction if: 1307 * We need a transaction if:
@@ -1354,7 +1406,7 @@ xfs_vm_releasepage(
1354 .nr_to_write = 1, 1406 .nr_to_write = 1,
1355 }; 1407 };
1356 1408
1357 xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0); 1409 trace_xfs_releasepage(inode, page, 0);
1358 1410
1359 if (!page_has_buffers(page)) 1411 if (!page_has_buffers(page))
1360 return 0; 1412 return 0;
@@ -1535,7 +1587,7 @@ xfs_end_io_direct(
1535 * didn't map an unwritten extent so switch its completion 1587 * didn't map an unwritten extent so switch its completion
1536 * handler. 1588 * handler.
1537 */ 1589 */
1538 INIT_WORK(&ioend->io_work, xfs_end_bio_written); 1590 ioend->io_type = IOMAP_NEW;
1539 xfs_finish_ioend(ioend, 0); 1591 xfs_finish_ioend(ioend, 0);
1540 } 1592 }
1541 1593
@@ -1562,19 +1614,13 @@ xfs_vm_direct_IO(
1562 1614
1563 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1615 bdev = xfs_find_bdev_for_inode(XFS_I(inode));
1564 1616
1565 if (rw == WRITE) { 1617 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1566 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); 1618 IOMAP_UNWRITTEN : IOMAP_READ);
1567 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1619
1568 bdev, iov, offset, nr_segs, 1620 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1569 xfs_get_blocks_direct, 1621 offset, nr_segs,
1570 xfs_end_io_direct); 1622 xfs_get_blocks_direct,
1571 } else { 1623 xfs_end_io_direct);
1572 iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1573 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1574 bdev, iov, offset, nr_segs,
1575 xfs_get_blocks_direct,
1576 xfs_end_io_direct);
1577 }
1578 1624
1579 if (unlikely(ret != -EIOCBQUEUED && iocb->private)) 1625 if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1580 xfs_destroy_ioend(iocb->private); 1626 xfs_destroy_ioend(iocb->private);
@@ -1629,16 +1675,6 @@ xfs_vm_readpages(
1629 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); 1675 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1630} 1676}
1631 1677
1632STATIC void
1633xfs_vm_invalidatepage(
1634 struct page *page,
1635 unsigned long offset)
1636{
1637 xfs_page_trace(XFS_INVALIDPAGE_ENTER,
1638 page->mapping->host, page, offset);
1639 block_invalidatepage(page, offset);
1640}
1641
1642const struct address_space_operations xfs_address_space_operations = { 1678const struct address_space_operations xfs_address_space_operations = {
1643 .readpage = xfs_vm_readpage, 1679 .readpage = xfs_vm_readpage,
1644 .readpages = xfs_vm_readpages, 1680 .readpages = xfs_vm_readpages,
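
One thread running through the xfs_aops.c hunks is that bio submission is now writeback-aware: integrity writeback (WB_SYNC_ALL) submits synchronous writes so the elevator does not penalize them. The decision point in isolation (a sketch of the logic used in xfs_submit_ioend_bio() above):

static void submit_ioend_bio_sketch(struct writeback_control *wbc,
				    struct bio *bio)
{
	int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC_PLUG : WRITE;

	submit_bio(rw, bio);	/* bi_end_io/bi_private set by the caller */
	bio_put(bio);		/* submission holds its own reference */
}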
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 221b3e66ceef..4cfc6ea87df8 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -45,4 +45,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
45extern void xfs_ioend_init(void); 45extern void xfs_ioend_init(void);
46extern void xfs_ioend_wait(struct xfs_inode *); 46extern void xfs_ioend_wait(struct xfs_inode *);
47 47
48extern void xfs_count_page_state(struct page *, int *, int *, int *);
49
48#endif /* __XFS_AOPS_H__ */ 50#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 965df1227d64..44c2b0ef9a41 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/slab.h> 21#include <linux/gfp.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
@@ -33,12 +33,14 @@
33#include <linux/migrate.h> 33#include <linux/migrate.h>
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/list_sort.h>
36 37
37#include "xfs_sb.h" 38#include "xfs_sb.h"
38#include "xfs_inum.h" 39#include "xfs_inum.h"
39#include "xfs_ag.h" 40#include "xfs_ag.h"
40#include "xfs_dmapi.h" 41#include "xfs_dmapi.h"
41#include "xfs_mount.h" 42#include "xfs_mount.h"
43#include "xfs_trace.h"
42 44
43static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
44STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
@@ -53,34 +55,6 @@ static struct workqueue_struct *xfslogd_workqueue;
53struct workqueue_struct *xfsdatad_workqueue; 55struct workqueue_struct *xfsdatad_workqueue;
54struct workqueue_struct *xfsconvertd_workqueue; 56struct workqueue_struct *xfsconvertd_workqueue;
55 57
56#ifdef XFS_BUF_TRACE
57void
58xfs_buf_trace(
59 xfs_buf_t *bp,
60 char *id,
61 void *data,
62 void *ra)
63{
64 ktrace_enter(xfs_buf_trace_buf,
65 bp, id,
66 (void *)(unsigned long)bp->b_flags,
67 (void *)(unsigned long)bp->b_hold.counter,
68 (void *)(unsigned long)bp->b_sema.count,
69 (void *)current,
70 data, ra,
71 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
72 (void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
73 (void *)(unsigned long)bp->b_buffer_length,
74 NULL, NULL, NULL, NULL, NULL);
75}
76ktrace_t *xfs_buf_trace_buf;
77#define XFS_BUF_TRACE_SIZE 4096
78#define XB_TRACE(bp, id, data) \
79 xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
80#else
81#define XB_TRACE(bp, id, data) do { } while (0)
82#endif
83
84#ifdef XFS_BUF_LOCK_TRACKING 58#ifdef XFS_BUF_LOCK_TRACKING
85# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) 59# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
86# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) 60# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
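
The deleted XB_TRACE()/ktrace machinery is superseded by standard tracepoints (note the new xfs_trace.h include above). For reference, an event like the trace_xfs_buf_init() calls used later in this file is declared roughly as follows; this is a simplified sketch with an abridged field list, while the real header builds these via DECLARE_EVENT_CLASS/DEFINE_EVENT:

TRACE_EVENT(xfs_buf_init,
	TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
	TP_ARGS(bp, caller_ip),
	TP_STRUCT__entry(
		__field(void *, bp)
		__field(unsigned long, caller_ip)
	),
	TP_fast_assign(
		__entry->bp = bp;
		__entry->caller_ip = caller_ip;
	),
	TP_printk("bp %p caller 0x%lx",
		  __entry->bp, __entry->caller_ip)
);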
@@ -103,6 +77,27 @@ ktrace_t *xfs_buf_trace_buf;
103#define xfs_buf_deallocate(bp) \ 77#define xfs_buf_deallocate(bp) \
104 kmem_zone_free(xfs_buf_zone, (bp)); 78 kmem_zone_free(xfs_buf_zone, (bp));
105 79
80static inline int
81xfs_buf_is_vmapped(
82 struct xfs_buf *bp)
83{
84 /*
85 * Return true if the buffer is vmapped.
86 *
87 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
88 * code is clever enough to know it doesn't have to map a single page,
89 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
90 */
91 return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
92}
93
94static inline int
95xfs_buf_vmap_len(
96 struct xfs_buf *bp)
97{
98 return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
99}
100
106/* 101/*
107 * Page Region interfaces. 102 * Page Region interfaces.
108 * 103 *
@@ -149,7 +144,7 @@ page_region_mask(
149 return mask; 144 return mask;
150} 145}
151 146
152STATIC_INLINE void 147STATIC void
153set_page_region( 148set_page_region(
154 struct page *page, 149 struct page *page,
155 size_t offset, 150 size_t offset,
@@ -161,7 +156,7 @@ set_page_region(
161 SetPageUptodate(page); 156 SetPageUptodate(page);
162} 157}
163 158
164STATIC_INLINE int 159STATIC int
165test_page_region( 160test_page_region(
166 struct page *page, 161 struct page *page,
167 size_t offset, 162 size_t offset,
@@ -173,75 +168,6 @@ test_page_region(
173} 168}
174 169
175/* 170/*
176 * Mapping of multi-page buffers into contiguous virtual space
177 */
178
179typedef struct a_list {
180 void *vm_addr;
181 struct a_list *next;
182} a_list_t;
183
184static a_list_t *as_free_head;
185static int as_list_len;
186static DEFINE_SPINLOCK(as_lock);
187
188/*
189 * Try to batch vunmaps because they are costly.
190 */
191STATIC void
192free_address(
193 void *addr)
194{
195 a_list_t *aentry;
196
197#ifdef CONFIG_XEN
198 /*
199 * Xen needs to be able to make sure it can get an exclusive
200 * RO mapping of pages it wants to turn into a pagetable. If
201 * a newly allocated page is also still being vmap()ed by xfs,
202 * it will cause pagetable construction to fail. This is a
203 * quick workaround to always eagerly unmap pages so that Xen
204 * is happy.
205 */
206 vunmap(addr);
207 return;
208#endif
209
210 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
211 if (likely(aentry)) {
212 spin_lock(&as_lock);
213 aentry->next = as_free_head;
214 aentry->vm_addr = addr;
215 as_free_head = aentry;
216 as_list_len++;
217 spin_unlock(&as_lock);
218 } else {
219 vunmap(addr);
220 }
221}
222
223STATIC void
224purge_addresses(void)
225{
226 a_list_t *aentry, *old;
227
228 if (as_free_head == NULL)
229 return;
230
231 spin_lock(&as_lock);
232 aentry = as_free_head;
233 as_free_head = NULL;
234 as_list_len = 0;
235 spin_unlock(&as_lock);
236
237 while ((old = aentry) != NULL) {
238 vunmap(aentry->vm_addr);
239 aentry = aentry->next;
240 kfree(old);
241 }
242}
243
244/*
245 * Internal xfs_buf_t object manipulation 171 * Internal xfs_buf_t object manipulation
246 */ 172 */
247 173
@@ -279,7 +205,8 @@ _xfs_buf_initialize(
279 init_waitqueue_head(&bp->b_waiters); 205 init_waitqueue_head(&bp->b_waiters);
280 206
281 XFS_STATS_INC(xb_create); 207 XFS_STATS_INC(xb_create);
282 XB_TRACE(bp, "initialize", target); 208
209 trace_xfs_buf_init(bp, _RET_IP_);
283} 210}
284 211
285/* 212/*
@@ -318,6 +245,7 @@ _xfs_buf_free_pages(
318{ 245{
319 if (bp->b_pages != bp->b_page_array) { 246 if (bp->b_pages != bp->b_page_array) {
320 kmem_free(bp->b_pages); 247 kmem_free(bp->b_pages);
248 bp->b_pages = NULL;
321 } 249 }
322} 250}
323 251
@@ -332,15 +260,16 @@ void
332xfs_buf_free( 260xfs_buf_free(
333 xfs_buf_t *bp) 261 xfs_buf_t *bp)
334{ 262{
335 XB_TRACE(bp, "free", 0); 263 trace_xfs_buf_free(bp, _RET_IP_);
336 264
337 ASSERT(list_empty(&bp->b_hash_list)); 265 ASSERT(list_empty(&bp->b_hash_list));
338 266
339 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 267 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
340 uint i; 268 uint i;
341 269
342 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 270 if (xfs_buf_is_vmapped(bp))
343 free_address(bp->b_addr - bp->b_offset); 271 vm_unmap_ram(bp->b_addr - bp->b_offset,
272 bp->b_page_count);
344 273
345 for (i = 0; i < bp->b_page_count; i++) { 274 for (i = 0; i < bp->b_page_count; i++) {
346 struct page *page = bp->b_pages[i]; 275 struct page *page = bp->b_pages[i];
@@ -349,9 +278,8 @@ xfs_buf_free(
349 ASSERT(!PagePrivate(page)); 278 ASSERT(!PagePrivate(page));
350 page_cache_release(page); 279 page_cache_release(page);
351 } 280 }
352 _xfs_buf_free_pages(bp);
353 } 281 }
354 282 _xfs_buf_free_pages(bp);
355 xfs_buf_deallocate(bp); 283 xfs_buf_deallocate(bp);
356} 284}
357 285
@@ -445,7 +373,6 @@ _xfs_buf_lookup_pages(
445 if (page_count == bp->b_page_count) 373 if (page_count == bp->b_page_count)
446 bp->b_flags |= XBF_DONE; 374 bp->b_flags |= XBF_DONE;
447 375
448 XB_TRACE(bp, "lookup_pages", (long)page_count);
449 return error; 376 return error;
450} 377}
451 378
@@ -462,10 +389,8 @@ _xfs_buf_map_pages(
462 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 389 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
463 bp->b_flags |= XBF_MAPPED; 390 bp->b_flags |= XBF_MAPPED;
464 } else if (flags & XBF_MAPPED) { 391 } else if (flags & XBF_MAPPED) {
465 if (as_list_len > 64) 392 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
466 purge_addresses(); 393 -1, PAGE_KERNEL);
467 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
468 VM_MAP, PAGE_KERNEL);
469 if (unlikely(bp->b_addr == NULL)) 394 if (unlikely(bp->b_addr == NULL))
470 return -ENOMEM; 395 return -ENOMEM;
471 bp->b_addr += bp->b_offset; 396 bp->b_addr += bp->b_offset;
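
This switch is why the private free_address()/purge_addresses() batching earlier in the file could be deleted: vm_map_ram() and vm_unmap_ram() keep their own per-CPU block pools and defer TLB flushes internally. A sketch of the pairing (signatures as in this kernel series; the unmap side must undo the offset and pass the original page count):

static int map_buf_sketch(struct xfs_buf *bp)
{
	bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
				-1 /* any NUMA node */, PAGE_KERNEL);
	if (!bp->b_addr)
		return -ENOMEM;
	bp->b_addr += bp->b_offset;
	return 0;
}

static void unmap_buf_sketch(struct xfs_buf *bp)
{
	vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
}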
@@ -548,7 +473,6 @@ found:
548 if (down_trylock(&bp->b_sema)) { 473 if (down_trylock(&bp->b_sema)) {
549 if (!(flags & XBF_TRYLOCK)) { 474 if (!(flags & XBF_TRYLOCK)) {
550 /* wait for buffer ownership */ 475 /* wait for buffer ownership */
551 XB_TRACE(bp, "get_lock", 0);
552 xfs_buf_lock(bp); 476 xfs_buf_lock(bp);
553 XFS_STATS_INC(xb_get_locked_waited); 477 XFS_STATS_INC(xb_get_locked_waited);
554 } else { 478 } else {
@@ -571,7 +495,8 @@ found:
571 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 495 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
572 bp->b_flags &= XBF_MAPPED; 496 bp->b_flags &= XBF_MAPPED;
573 } 497 }
574 XB_TRACE(bp, "got_lock", 0); 498
499 trace_xfs_buf_find(bp, flags, _RET_IP_);
575 XFS_STATS_INC(xb_get_locked); 500 XFS_STATS_INC(xb_get_locked);
576 return bp; 501 return bp;
577} 502}
@@ -582,7 +507,7 @@ found:
582 * although backing storage may not be. 507 * although backing storage may not be.
583 */ 508 */
584xfs_buf_t * 509xfs_buf_t *
585xfs_buf_get_flags( 510xfs_buf_get(
586 xfs_buftarg_t *target,/* target for buffer */ 511 xfs_buftarg_t *target,/* target for buffer */
587 xfs_off_t ioff, /* starting offset of range */ 512 xfs_off_t ioff, /* starting offset of range */
588 size_t isize, /* length of range */ 513 size_t isize, /* length of range */
@@ -627,7 +552,7 @@ xfs_buf_get_flags(
627 bp->b_bn = ioff; 552 bp->b_bn = ioff;
628 bp->b_count_desired = bp->b_buffer_length; 553 bp->b_count_desired = bp->b_buffer_length;
629 554
630 XB_TRACE(bp, "get", (unsigned long)flags); 555 trace_xfs_buf_get(bp, flags, _RET_IP_);
631 return bp; 556 return bp;
632 557
633 no_buffer: 558 no_buffer:
@@ -644,8 +569,6 @@ _xfs_buf_read(
644{ 569{
645 int status; 570 int status;
646 571
647 XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
648
649 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); 572 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
650 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 573 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
651 574
@@ -661,7 +584,7 @@ _xfs_buf_read(
661} 584}
662 585
663xfs_buf_t * 586xfs_buf_t *
664xfs_buf_read_flags( 587xfs_buf_read(
665 xfs_buftarg_t *target, 588 xfs_buftarg_t *target,
666 xfs_off_t ioff, 589 xfs_off_t ioff,
667 size_t isize, 590 size_t isize,
@@ -671,21 +594,20 @@ xfs_buf_read_flags(
671 594
672 flags |= XBF_READ; 595 flags |= XBF_READ;
673 596
674 bp = xfs_buf_get_flags(target, ioff, isize, flags); 597 bp = xfs_buf_get(target, ioff, isize, flags);
675 if (bp) { 598 if (bp) {
599 trace_xfs_buf_read(bp, flags, _RET_IP_);
600
676 if (!XFS_BUF_ISDONE(bp)) { 601 if (!XFS_BUF_ISDONE(bp)) {
677 XB_TRACE(bp, "read", (unsigned long)flags);
678 XFS_STATS_INC(xb_get_read); 602 XFS_STATS_INC(xb_get_read);
679 _xfs_buf_read(bp, flags); 603 _xfs_buf_read(bp, flags);
680 } else if (flags & XBF_ASYNC) { 604 } else if (flags & XBF_ASYNC) {
681 XB_TRACE(bp, "read_async", (unsigned long)flags);
682 /* 605 /*
683 * Read ahead call which is already satisfied, 606 * Read ahead call which is already satisfied,
684 * drop the buffer 607 * drop the buffer
685 */ 608 */
686 goto no_buffer; 609 goto no_buffer;
687 } else { 610 } else {
688 XB_TRACE(bp, "read_done", (unsigned long)flags);
689 /* We do not want read in the flags */ 611 /* We do not want read in the flags */
690 bp->b_flags &= ~XBF_READ; 612 bp->b_flags &= ~XBF_READ;
691 } 613 }
@@ -718,7 +640,7 @@ xfs_buf_readahead(
718 return; 640 return;
719 641
720 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 642 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
721 xfs_buf_read_flags(target, ioff, isize, flags); 643 xfs_buf_read(target, ioff, isize, flags);
722} 644}
723 645
724xfs_buf_t * 646xfs_buf_t *
@@ -823,7 +745,7 @@ xfs_buf_get_noaddr(
823 745
824 xfs_buf_unlock(bp); 746 xfs_buf_unlock(bp);
825 747
826 XB_TRACE(bp, "no_daddr", len); 748 trace_xfs_buf_get_noaddr(bp, _RET_IP_);
827 return bp; 749 return bp;
828 750
829 fail_free_mem: 751 fail_free_mem:
@@ -845,8 +767,8 @@ void
845xfs_buf_hold( 767xfs_buf_hold(
846 xfs_buf_t *bp) 768 xfs_buf_t *bp)
847{ 769{
770 trace_xfs_buf_hold(bp, _RET_IP_);
848 atomic_inc(&bp->b_hold); 771 atomic_inc(&bp->b_hold);
849 XB_TRACE(bp, "hold", 0);
850} 772}
851 773
852/* 774/*
@@ -859,7 +781,7 @@ xfs_buf_rele(
859{ 781{
860 xfs_bufhash_t *hash = bp->b_hash; 782 xfs_bufhash_t *hash = bp->b_hash;
861 783
862 XB_TRACE(bp, "rele", bp->b_relse); 784 trace_xfs_buf_rele(bp, _RET_IP_);
863 785
864 if (unlikely(!hash)) { 786 if (unlikely(!hash)) {
865 ASSERT(!bp->b_relse); 787 ASSERT(!bp->b_relse);
@@ -909,21 +831,19 @@ xfs_buf_cond_lock(
909 int locked; 831 int locked;
910 832
911 locked = down_trylock(&bp->b_sema) == 0; 833 locked = down_trylock(&bp->b_sema) == 0;
912 if (locked) { 834 if (locked)
913 XB_SET_OWNER(bp); 835 XB_SET_OWNER(bp);
914 } 836
915 XB_TRACE(bp, "cond_lock", (long)locked); 837 trace_xfs_buf_cond_lock(bp, _RET_IP_);
916 return locked ? 0 : -EBUSY; 838 return locked ? 0 : -EBUSY;
917} 839}
918 840
919#if defined(DEBUG) || defined(XFS_BLI_TRACE)
920int 841int
921xfs_buf_lock_value( 842xfs_buf_lock_value(
922 xfs_buf_t *bp) 843 xfs_buf_t *bp)
923{ 844{
924 return bp->b_sema.count; 845 return bp->b_sema.count;
925} 846}
926#endif
927 847
928/* 848/*
929 * Locks a buffer object. 849 * Locks a buffer object.
@@ -935,12 +855,14 @@ void
935xfs_buf_lock( 855xfs_buf_lock(
936 xfs_buf_t *bp) 856 xfs_buf_t *bp)
937{ 857{
938 XB_TRACE(bp, "lock", 0); 858 trace_xfs_buf_lock(bp, _RET_IP_);
859
939 if (atomic_read(&bp->b_io_remaining)) 860 if (atomic_read(&bp->b_io_remaining))
940 blk_run_address_space(bp->b_target->bt_mapping); 861 blk_run_address_space(bp->b_target->bt_mapping);
941 down(&bp->b_sema); 862 down(&bp->b_sema);
942 XB_SET_OWNER(bp); 863 XB_SET_OWNER(bp);
943 XB_TRACE(bp, "locked", 0); 864
865 trace_xfs_buf_lock_done(bp, _RET_IP_);
944} 866}
945 867
946/* 868/*
@@ -962,7 +884,8 @@ xfs_buf_unlock(
962 884
963 XB_CLEAR_OWNER(bp); 885 XB_CLEAR_OWNER(bp);
964 up(&bp->b_sema); 886 up(&bp->b_sema);
965 XB_TRACE(bp, "unlock", 0); 887
888 trace_xfs_buf_unlock(bp, _RET_IP_);
966} 889}
967 890
968 891
@@ -974,17 +897,18 @@ void
974xfs_buf_pin( 897xfs_buf_pin(
975 xfs_buf_t *bp) 898 xfs_buf_t *bp)
976{ 899{
900 trace_xfs_buf_pin(bp, _RET_IP_);
977 atomic_inc(&bp->b_pin_count); 901 atomic_inc(&bp->b_pin_count);
978 XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
979} 902}
980 903
981void 904void
982xfs_buf_unpin( 905xfs_buf_unpin(
983 xfs_buf_t *bp) 906 xfs_buf_t *bp)
984{ 907{
908 trace_xfs_buf_unpin(bp, _RET_IP_);
909
985 if (atomic_dec_and_test(&bp->b_pin_count)) 910 if (atomic_dec_and_test(&bp->b_pin_count))
986 wake_up_all(&bp->b_waiters); 911 wake_up_all(&bp->b_waiters);
987 XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
988} 912}
989 913
990int 914int
@@ -1035,7 +959,7 @@ xfs_buf_iodone_work(
1035 */ 959 */
1036 if ((bp->b_error == EOPNOTSUPP) && 960 if ((bp->b_error == EOPNOTSUPP) &&
1037 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { 961 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
1038 XB_TRACE(bp, "ordered_retry", bp->b_iodone); 962 trace_xfs_buf_ordered_retry(bp, _RET_IP_);
1039 bp->b_flags &= ~XBF_ORDERED; 963 bp->b_flags &= ~XBF_ORDERED;
1040 bp->b_flags |= _XFS_BARRIER_FAILED; 964 bp->b_flags |= _XFS_BARRIER_FAILED;
1041 xfs_buf_iorequest(bp); 965 xfs_buf_iorequest(bp);
@@ -1050,12 +974,12 @@ xfs_buf_ioend(
1050 xfs_buf_t *bp, 974 xfs_buf_t *bp,
1051 int schedule) 975 int schedule)
1052{ 976{
977 trace_xfs_buf_iodone(bp, _RET_IP_);
978
1053 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); 979 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1054 if (bp->b_error == 0) 980 if (bp->b_error == 0)
1055 bp->b_flags |= XBF_DONE; 981 bp->b_flags |= XBF_DONE;
1056 982
1057 XB_TRACE(bp, "iodone", bp->b_iodone);
1058
1059 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { 983 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
1060 if (schedule) { 984 if (schedule) {
1061 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 985 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
@@ -1075,26 +999,34 @@ xfs_buf_ioerror(
1075{ 999{
1076 ASSERT(error >= 0 && error <= 0xffff); 1000 ASSERT(error >= 0 && error <= 0xffff);
1077 bp->b_error = (unsigned short)error; 1001 bp->b_error = (unsigned short)error;
1078 XB_TRACE(bp, "ioerror", (unsigned long)error); 1002 trace_xfs_buf_ioerror(bp, error, _RET_IP_);
1079} 1003}
1080 1004
1081int 1005int
1082xfs_bawrite( 1006xfs_bwrite(
1083 void *mp, 1007 struct xfs_mount *mp,
1084 struct xfs_buf *bp) 1008 struct xfs_buf *bp)
1085{ 1009{
1086 XB_TRACE(bp, "bawrite", 0); 1010 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
1011 int error = 0;
1087 1012
1088 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 1013 bp->b_strat = xfs_bdstrat_cb;
1014 bp->b_mount = mp;
1015 bp->b_flags |= XBF_WRITE;
1016 if (!iowait)
1017 bp->b_flags |= _XBF_RUN_QUEUES;
1089 1018
1090 xfs_buf_delwri_dequeue(bp); 1019 xfs_buf_delwri_dequeue(bp);
1020 xfs_buf_iostrategy(bp);
1091 1021
1092 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD); 1022 if (iowait) {
1093 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); 1023 error = xfs_buf_iowait(bp);
1024 if (error)
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1026 xfs_buf_relse(bp);
1027 }
1094 1028
1095 bp->b_mount = mp; 1029 return error;
1096 bp->b_strat = xfs_bdstrat_cb;
1097 return xfs_bdstrat_cb(bp);
1098} 1030}
1099 1031
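The reworked xfs_bwrite() above folds the old synchronous and asynchronous paths into one entry point keyed off XBF_ASYNC. A hypothetical caller, for illustration only (function name invented):

	/* Hypothetical caller.  With XBF_ASYNC clear, xfs_bwrite() waits
	 * for the I/O, forces a shutdown on error, and releases the
	 * buffer itself; with XBF_ASYNC set it queues the write and
	 * returns 0 immediately. */
	STATIC int
	example_sync_write(
		struct xfs_mount	*mp,
		struct xfs_buf		*bp)
	{
		bp->b_flags &= ~XBF_ASYNC;	/* take the iowait path */
		return xfs_bwrite(mp, bp);	/* bp is gone on return */
	}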
1100void 1032void
@@ -1102,7 +1034,7 @@ xfs_bdwrite(
1102 void *mp, 1034 void *mp,
1103 struct xfs_buf *bp) 1035 struct xfs_buf *bp)
1104{ 1036{
1105 XB_TRACE(bp, "bdwrite", 0); 1037 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1106 1038
1107 bp->b_strat = xfs_bdstrat_cb; 1039 bp->b_strat = xfs_bdstrat_cb;
1108 bp->b_mount = mp; 1040 bp->b_mount = mp;
@@ -1113,7 +1045,127 @@ xfs_bdwrite(
1113 xfs_buf_delwri_queue(bp, 1); 1045 xfs_buf_delwri_queue(bp, 1);
1114} 1046}
1115 1047
1116STATIC_INLINE void 1048/*
1049 * Called when we want to stop a buffer from getting written or read.
1050 * We attach the EIO error, muck with its flags, and call biodone
1051 * so that the proper iodone callbacks get called.
1052 */
1053STATIC int
1054xfs_bioerror(
1055 xfs_buf_t *bp)
1056{
1057#ifdef XFSERRORDEBUG
1058 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1059#endif
1060
1061 /*
 1062 * No need to wait until the buffer is unpinned; we aren't flushing it.
1063 */
1064 XFS_BUF_ERROR(bp, EIO);
1065
1066 /*
1067 * We're calling biodone, so delete XBF_DONE flag.
1068 */
1069 XFS_BUF_UNREAD(bp);
1070 XFS_BUF_UNDELAYWRITE(bp);
1071 XFS_BUF_UNDONE(bp);
1072 XFS_BUF_STALE(bp);
1073
1074 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1075 xfs_biodone(bp);
1076
1077 return EIO;
1078}
1079
1080/*
1081 * Same as xfs_bioerror, except that we are releasing the buffer
1082 * here ourselves, and avoiding the biodone call.
1083 * This is meant for userdata errors; metadata bufs come with
1084 * iodone functions attached, so that we can track down errors.
1085 */
1086STATIC int
1087xfs_bioerror_relse(
1088 struct xfs_buf *bp)
1089{
1090 int64_t fl = XFS_BUF_BFLAGS(bp);
1091 /*
1092 * No need to wait until the buffer is unpinned.
1093 * We aren't flushing it.
1094 *
1095 * chunkhold expects B_DONE to be set, whether
1096 * we actually finish the I/O or not. We don't want to
1097 * change that interface.
1098 */
1099 XFS_BUF_UNREAD(bp);
1100 XFS_BUF_UNDELAYWRITE(bp);
1101 XFS_BUF_DONE(bp);
1102 XFS_BUF_STALE(bp);
1103 XFS_BUF_CLR_IODONE_FUNC(bp);
1104 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1105 if (!(fl & XBF_ASYNC)) {
1106 /*
1107 * Mark b_error and B_ERROR _both_.
 1108 * Lots of chunkcache code assumes that.
1109 * There's no reason to mark error for
1110 * ASYNC buffers.
1111 */
1112 XFS_BUF_ERROR(bp, EIO);
1113 XFS_BUF_FINISH_IOWAIT(bp);
1114 } else {
1115 xfs_buf_relse(bp);
1116 }
1117
1118 return EIO;
1119}
1120
1121
1122/*
1123 * All xfs metadata buffers except log state machine buffers
1124 * get this attached as their b_bdstrat callback function.
1125 * This is so that we can catch a buffer
 1126 * after prematurely unpinning it to forcibly shut down the filesystem.
1127 */
1128int
1129xfs_bdstrat_cb(
1130 struct xfs_buf *bp)
1131{
1132 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
1133 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1134 /*
1135 * Metadata write that didn't get logged but
1136 * written delayed anyway. These aren't associated
1137 * with a transaction, and can be ignored.
1138 */
1139 if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1140 return xfs_bioerror_relse(bp);
1141 else
1142 return xfs_bioerror(bp);
1143 }
1144
1145 xfs_buf_iorequest(bp);
1146 return 0;
1147}
1148
1149/*
1150 * Wrapper around bdstrat so that we can stop data from going to disk in case
 1151 * we are shutting down the filesystem. Typically user data goes through this
1152 * path; one of the exceptions is the superblock.
1153 */
1154void
1155xfsbdstrat(
1156 struct xfs_mount *mp,
1157 struct xfs_buf *bp)
1158{
1159 if (XFS_FORCED_SHUTDOWN(mp)) {
1160 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1161 xfs_bioerror_relse(bp);
1162 return;
1163 }
1164
1165 xfs_buf_iorequest(bp);
1166}
1167
1168STATIC void
1117_xfs_buf_ioend( 1169_xfs_buf_ioend(
1118 xfs_buf_t *bp, 1170 xfs_buf_t *bp,
1119 int schedule) 1171 int schedule)
@@ -1135,6 +1187,9 @@ xfs_buf_bio_end_io(
1135 1187
1136 xfs_buf_ioerror(bp, -error); 1188 xfs_buf_ioerror(bp, -error);
1137 1189
1190 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1191 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1192
1138 do { 1193 do {
1139 struct page *page = bvec->bv_page; 1194 struct page *page = bvec->bv_page;
1140 1195
@@ -1177,10 +1232,14 @@ _xfs_buf_ioapply(
1177 if (bp->b_flags & XBF_ORDERED) { 1232 if (bp->b_flags & XBF_ORDERED) {
1178 ASSERT(!(bp->b_flags & XBF_READ)); 1233 ASSERT(!(bp->b_flags & XBF_READ));
1179 rw = WRITE_BARRIER; 1234 rw = WRITE_BARRIER;
1180 } else if (bp->b_flags & _XBF_RUN_QUEUES) { 1235 } else if (bp->b_flags & XBF_LOG_BUFFER) {
1181 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1236 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1182 bp->b_flags &= ~_XBF_RUN_QUEUES; 1237 bp->b_flags &= ~_XBF_RUN_QUEUES;
1183 rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; 1238 rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
1239 } else if (bp->b_flags & _XBF_RUN_QUEUES) {
1240 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1241 bp->b_flags &= ~_XBF_RUN_QUEUES;
1242 rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META;
1184 } else { 1243 } else {
1185 rw = (bp->b_flags & XBF_WRITE) ? WRITE : 1244 rw = (bp->b_flags & XBF_WRITE) ? WRITE :
1186 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1245 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
@@ -1240,6 +1299,10 @@ next_chunk:
1240 1299
1241submit_io: 1300submit_io:
1242 if (likely(bio->bi_size)) { 1301 if (likely(bio->bi_size)) {
1302 if (xfs_buf_is_vmapped(bp)) {
1303 flush_kernel_vmap_range(bp->b_addr,
1304 xfs_buf_vmap_len(bp));
1305 }
1243 submit_bio(rw, bio); 1306 submit_bio(rw, bio);
1244 if (size) 1307 if (size)
1245 goto next_chunk; 1308 goto next_chunk;
@@ -1253,7 +1316,7 @@ int
1253xfs_buf_iorequest( 1316xfs_buf_iorequest(
1254 xfs_buf_t *bp) 1317 xfs_buf_t *bp)
1255{ 1318{
1256 XB_TRACE(bp, "iorequest", 0); 1319 trace_xfs_buf_iorequest(bp, _RET_IP_);
1257 1320
1258 if (bp->b_flags & XBF_DELWRI) { 1321 if (bp->b_flags & XBF_DELWRI) {
1259 xfs_buf_delwri_queue(bp, 1); 1322 xfs_buf_delwri_queue(bp, 1);
@@ -1287,11 +1350,13 @@ int
1287xfs_buf_iowait( 1350xfs_buf_iowait(
1288 xfs_buf_t *bp) 1351 xfs_buf_t *bp)
1289{ 1352{
1290 XB_TRACE(bp, "iowait", 0); 1353 trace_xfs_buf_iowait(bp, _RET_IP_);
1354
1291 if (atomic_read(&bp->b_io_remaining)) 1355 if (atomic_read(&bp->b_io_remaining))
1292 blk_run_address_space(bp->b_target->bt_mapping); 1356 blk_run_address_space(bp->b_target->bt_mapping);
1293 wait_for_completion(&bp->b_iowait); 1357 wait_for_completion(&bp->b_iowait);
1294 XB_TRACE(bp, "iowaited", (long)bp->b_error); 1358
1359 trace_xfs_buf_iowait_done(bp, _RET_IP_);
1295 return bp->b_error; 1360 return bp->b_error;
1296} 1361}
1297 1362
@@ -1318,7 +1383,7 @@ xfs_buf_iomove(
1318 xfs_buf_t *bp, /* buffer to process */ 1383 xfs_buf_t *bp, /* buffer to process */
1319 size_t boff, /* starting buffer offset */ 1384 size_t boff, /* starting buffer offset */
1320 size_t bsize, /* length to copy */ 1385 size_t bsize, /* length to copy */
1321 caddr_t data, /* data address */ 1386 void *data, /* data address */
1322 xfs_buf_rw_t mode) /* read/write/zero flag */ 1387 xfs_buf_rw_t mode) /* read/write/zero flag */
1323{ 1388{
1324 size_t bend, cpoff, csize; 1389 size_t bend, cpoff, csize;
@@ -1400,8 +1465,8 @@ xfs_alloc_bufhash(
1400 1465
1401 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1466 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
1402 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; 1467 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1403 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * 1468 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
1404 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); 1469 sizeof(xfs_bufhash_t));
1405 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1470 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1406 spin_lock_init(&btp->bt_hash[i].bh_lock); 1471 spin_lock_init(&btp->bt_hash[i].bh_lock);
1407 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1472 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1412,7 +1477,7 @@ STATIC void
1412xfs_free_bufhash( 1477xfs_free_bufhash(
1413 xfs_buftarg_t *btp) 1478 xfs_buftarg_t *btp)
1414{ 1479{
1415 kmem_free(btp->bt_hash); 1480 kmem_free_large(btp->bt_hash);
1416 btp->bt_hash = NULL; 1481 btp->bt_hash = NULL;
1417} 1482}
1418 1483
@@ -1604,7 +1669,8 @@ xfs_buf_delwri_queue(
1604 struct list_head *dwq = &bp->b_target->bt_delwrite_queue; 1669 struct list_head *dwq = &bp->b_target->bt_delwrite_queue;
1605 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; 1670 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
1606 1671
1607 XB_TRACE(bp, "delwri_q", (long)unlock); 1672 trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1673
1608 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); 1674 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
1609 1675
1610 spin_lock(dwlk); 1676 spin_lock(dwlk);
@@ -1616,6 +1682,11 @@ xfs_buf_delwri_queue(
1616 list_del(&bp->b_list); 1682 list_del(&bp->b_list);
1617 } 1683 }
1618 1684
1685 if (list_empty(dwq)) {
1686 /* start xfsbufd as it is about to have something to do */
1687 wake_up_process(bp->b_target->bt_task);
1688 }
1689
1619 bp->b_flags |= _XBF_DELWRI_Q; 1690 bp->b_flags |= _XBF_DELWRI_Q;
1620 list_add_tail(&bp->b_list, dwq); 1691 list_add_tail(&bp->b_list, dwq);
1621 bp->b_queuetime = jiffies; 1692 bp->b_queuetime = jiffies;
@@ -1644,7 +1715,36 @@ xfs_buf_delwri_dequeue(
1644 if (dequeued) 1715 if (dequeued)
1645 xfs_buf_rele(bp); 1716 xfs_buf_rele(bp);
1646 1717
1647 XB_TRACE(bp, "delwri_dq", (long)dequeued); 1718 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
1719}
1720
1721/*
1722 * If a delwri buffer needs to be pushed before it has aged out, then promote
1723 * it to the head of the delwri queue so that it will be flushed on the next
1724 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
1725 * than the age currently needed to flush the buffer. Hence the next time the
 1726 * xfsbufd sees it, it is guaranteed to be considered old enough to flush.
1727 */
1728void
1729xfs_buf_delwri_promote(
1730 struct xfs_buf *bp)
1731{
1732 struct xfs_buftarg *btp = bp->b_target;
1733 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
1734
1735 ASSERT(bp->b_flags & XBF_DELWRI);
1736 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1737
1738 /*
1739 * Check the buffer age before locking the delayed write queue as we
1740 * don't need to promote buffers that are already past the flush age.
1741 */
1742 if (bp->b_queuetime < jiffies - age)
1743 return;
1744 bp->b_queuetime = jiffies - age;
1745 spin_lock(&btp->bt_delwrite_lock);
1746 list_move(&bp->b_list, &btp->bt_delwrite_queue);
1747 spin_unlock(&btp->bt_delwrite_lock);
1648} 1748}
1649 1749
1650STATIC void 1750STATIC void
@@ -1665,6 +1765,8 @@ xfsbufd_wakeup(
1665 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { 1765 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1666 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) 1766 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1667 continue; 1767 continue;
1768 if (list_empty(&btp->bt_delwrite_queue))
1769 continue;
1668 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); 1770 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1669 wake_up_process(btp->bt_task); 1771 wake_up_process(btp->bt_task);
1670 } 1772 }
@@ -1692,7 +1794,7 @@ xfs_buf_delwri_split(
1692 INIT_LIST_HEAD(list); 1794 INIT_LIST_HEAD(list);
1693 spin_lock(dwlk); 1795 spin_lock(dwlk);
1694 list_for_each_entry_safe(bp, n, dwq, b_list) { 1796 list_for_each_entry_safe(bp, n, dwq, b_list) {
1695 XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); 1797 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1696 ASSERT(bp->b_flags & XBF_DELWRI); 1798 ASSERT(bp->b_flags & XBF_DELWRI);
1697 1799
1698 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { 1800 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1715,20 +1817,53 @@ xfs_buf_delwri_split(
1715 1817
1716} 1818}
1717 1819
1820/*
 1821 * The compare function needs a 64-bit intermediate because block
 1822 * numbers are 64 bits wide while the return value is only 32 bits;
 1823 * truncating the difference could misorder distant blocks.
1824 */
1825static int
1826xfs_buf_cmp(
1827 void *priv,
1828 struct list_head *a,
1829 struct list_head *b)
1830{
1831 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
1832 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
1833 xfs_daddr_t diff;
1834
1835 diff = ap->b_bn - bp->b_bn;
1836 if (diff < 0)
1837 return -1;
1838 if (diff > 0)
1839 return 1;
1840 return 0;
1841}
1842
1843void
1844xfs_buf_delwri_sort(
1845 xfs_buftarg_t *target,
1846 struct list_head *list)
1847{
1848 list_sort(NULL, list, xfs_buf_cmp);
1849}
1850
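For contrast, a naive comparator that skipped the 64-bit intermediate would be subtly broken. A hypothetical counter-example:

	/*
	 * WRONG (hypothetical): casting the 64-bit difference straight
	 * to the 32-bit return value truncates it, so two buffers whose
	 * block numbers differ by a multiple of 2^32 compare equal, and
	 * differences beyond 2^31 can flip sign and invert the order.
	 */
	static int
	xfs_buf_cmp_broken(
		void			*priv,
		struct list_head	*a,
		struct list_head	*b)
	{
		struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
		struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);

		return (int)(ap->b_bn - bp->b_bn);	/* truncates! */
	}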
1718STATIC int 1851STATIC int
1719xfsbufd( 1852xfsbufd(
1720 void *data) 1853 void *data)
1721{ 1854{
1722 struct list_head tmp; 1855 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1723 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1724 int count;
1725 xfs_buf_t *bp;
1726 1856
1727 current->flags |= PF_MEMALLOC; 1857 current->flags |= PF_MEMALLOC;
1728 1858
1729 set_freezable(); 1859 set_freezable();
1730 1860
1731 do { 1861 do {
1862 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1863 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1864 int count = 0;
1865 struct list_head tmp;
1866
1732 if (unlikely(freezing(current))) { 1867 if (unlikely(freezing(current))) {
1733 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1868 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1734 refrigerator(); 1869 refrigerator();
@@ -1736,24 +1871,20 @@ xfsbufd(
1736 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1871 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1737 } 1872 }
1738 1873
1739 schedule_timeout_interruptible( 1874 /* sleep for a long time if there is nothing to do. */
1740 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1875 if (list_empty(&target->bt_delwrite_queue))
1876 tout = MAX_SCHEDULE_TIMEOUT;
1877 schedule_timeout_interruptible(tout);
1741 1878
1742 xfs_buf_delwri_split(target, &tmp, 1879 xfs_buf_delwri_split(target, &tmp, age);
1743 xfs_buf_age_centisecs * msecs_to_jiffies(10)); 1880 list_sort(NULL, &tmp, xfs_buf_cmp);
1744
1745 count = 0;
1746 while (!list_empty(&tmp)) { 1881 while (!list_empty(&tmp)) {
1747 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1882 struct xfs_buf *bp;
1748 ASSERT(target == bp->b_target); 1883 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1749
1750 list_del_init(&bp->b_list); 1884 list_del_init(&bp->b_list);
1751 xfs_buf_iostrategy(bp); 1885 xfs_buf_iostrategy(bp);
1752 count++; 1886 count++;
1753 } 1887 }
1754
1755 if (as_list_len > 0)
1756 purge_addresses();
1757 if (count) 1888 if (count)
1758 blk_run_address_space(target->bt_mapping); 1889 blk_run_address_space(target->bt_mapping);
1759 1890
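Taken together with the wake_up_process() call added to xfs_buf_delwri_queue() above, the daemon can now sleep indefinitely when idle. The handshake, distilled from the two hunks:

	/* Producer (xfs_buf_delwri_queue): wake the daemon only on the
	 * empty -> non-empty transition, before adding the buffer. */
	if (list_empty(dwq))
		wake_up_process(bp->b_target->bt_task);
	bp->b_flags |= _XBF_DELWRI_Q;
	list_add_tail(&bp->b_list, dwq);

	/* Consumer (xfsbufd): sleep forever when nothing is queued,
	 * otherwise poll at the usual timer interval. */
	long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);

	if (list_empty(&target->bt_delwrite_queue))
		tout = MAX_SCHEDULE_TIMEOUT;
	schedule_timeout_interruptible(tout);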
@@ -1772,42 +1903,45 @@ xfs_flush_buftarg(
1772 xfs_buftarg_t *target, 1903 xfs_buftarg_t *target,
1773 int wait) 1904 int wait)
1774{ 1905{
1775 struct list_head tmp; 1906 xfs_buf_t *bp;
1776 xfs_buf_t *bp, *n;
1777 int pincount = 0; 1907 int pincount = 0;
1908 LIST_HEAD(tmp_list);
1909 LIST_HEAD(wait_list);
1778 1910
1779 xfs_buf_runall_queues(xfsconvertd_workqueue); 1911 xfs_buf_runall_queues(xfsconvertd_workqueue);
1780 xfs_buf_runall_queues(xfsdatad_workqueue); 1912 xfs_buf_runall_queues(xfsdatad_workqueue);
1781 xfs_buf_runall_queues(xfslogd_workqueue); 1913 xfs_buf_runall_queues(xfslogd_workqueue);
1782 1914
1783 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1915 set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1784 pincount = xfs_buf_delwri_split(target, &tmp, 0); 1916 pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
1785 1917
1786 /* 1918 /*
1787 * Dropped the delayed write list lock, now walk the temporary list 1919 * Dropped the delayed write list lock, now walk the temporary list.
1920 * All I/O is issued async and then if we need to wait for completion
 1921 * we do that after issuing all the I/O.
1788 */ 1922 */
1789 list_for_each_entry_safe(bp, n, &tmp, b_list) { 1923 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1924 while (!list_empty(&tmp_list)) {
1925 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1790 ASSERT(target == bp->b_target); 1926 ASSERT(target == bp->b_target);
1791 if (wait) 1927 list_del_init(&bp->b_list);
1928 if (wait) {
1792 bp->b_flags &= ~XBF_ASYNC; 1929 bp->b_flags &= ~XBF_ASYNC;
1793 else 1930 list_add(&bp->b_list, &wait_list);
1794 list_del_init(&bp->b_list); 1931 }
1795
1796 xfs_buf_iostrategy(bp); 1932 xfs_buf_iostrategy(bp);
1797 } 1933 }
1798 1934
1799 if (wait) 1935 if (wait) {
1936 /* Expedite and wait for IO to complete. */
1800 blk_run_address_space(target->bt_mapping); 1937 blk_run_address_space(target->bt_mapping);
1938 while (!list_empty(&wait_list)) {
1939 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1801 1940
1802 /* 1941 list_del_init(&bp->b_list);
1803 * Remaining list items must be flushed before returning 1942 xfs_iowait(bp);
1804 */ 1943 xfs_buf_relse(bp);
1805 while (!list_empty(&tmp)) { 1944 }
1806 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1807
1808 list_del_init(&bp->b_list);
1809 xfs_iowait(bp);
1810 xfs_buf_relse(bp);
1811 } 1945 }
1812 1946
1813 return pincount; 1947 return pincount;
@@ -1816,14 +1950,10 @@ xfs_flush_buftarg(
1816int __init 1950int __init
1817xfs_buf_init(void) 1951xfs_buf_init(void)
1818{ 1952{
1819#ifdef XFS_BUF_TRACE
1820 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
1821#endif
1822
1823 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", 1953 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
1824 KM_ZONE_HWALIGN, NULL); 1954 KM_ZONE_HWALIGN, NULL);
1825 if (!xfs_buf_zone) 1955 if (!xfs_buf_zone)
1826 goto out_free_trace_buf; 1956 goto out;
1827 1957
1828 xfslogd_workqueue = create_workqueue("xfslogd"); 1958 xfslogd_workqueue = create_workqueue("xfslogd");
1829 if (!xfslogd_workqueue) 1959 if (!xfslogd_workqueue)
@@ -1846,10 +1976,7 @@ xfs_buf_init(void)
1846 destroy_workqueue(xfslogd_workqueue); 1976 destroy_workqueue(xfslogd_workqueue);
1847 out_free_buf_zone: 1977 out_free_buf_zone:
1848 kmem_zone_destroy(xfs_buf_zone); 1978 kmem_zone_destroy(xfs_buf_zone);
1849 out_free_trace_buf: 1979 out:
1850#ifdef XFS_BUF_TRACE
1851 ktrace_free(xfs_buf_trace_buf);
1852#endif
1853 return -ENOMEM; 1980 return -ENOMEM;
1854} 1981}
1855 1982
@@ -1861,9 +1988,6 @@ xfs_buf_terminate(void)
1861 destroy_workqueue(xfsdatad_workqueue); 1988 destroy_workqueue(xfsdatad_workqueue);
1862 destroy_workqueue(xfslogd_workqueue); 1989 destroy_workqueue(xfslogd_workqueue);
1863 kmem_zone_destroy(xfs_buf_zone); 1990 kmem_zone_destroy(xfs_buf_zone);
1864#ifdef XFS_BUF_TRACE
1865 ktrace_free(xfs_buf_trace_buf);
1866#endif
1867} 1991}
1868 1992
1869#ifdef CONFIG_KDB_MODULES 1993#ifdef CONFIG_KDB_MODULES
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 9b4d666ad31f..386e7361e50e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -55,6 +55,7 @@ typedef enum {
55 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ 55 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */
56 XBF_ORDERED = (1 << 11), /* use ordered writes */ 56 XBF_ORDERED = (1 << 11), /* use ordered writes */
57 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ 57 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */
58 XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log */
58 59
59 /* flags used only as arguments to access routines */ 60 /* flags used only as arguments to access routines */
60 XBF_LOCK = (1 << 14), /* lock requested */ 61 XBF_LOCK = (1 << 14), /* lock requested */
@@ -95,6 +96,28 @@ typedef enum {
95 _XFS_BARRIER_FAILED = (1 << 23), 96 _XFS_BARRIER_FAILED = (1 << 23),
96} xfs_buf_flags_t; 97} xfs_buf_flags_t;
97 98
99#define XFS_BUF_FLAGS \
100 { XBF_READ, "READ" }, \
101 { XBF_WRITE, "WRITE" }, \
102 { XBF_MAPPED, "MAPPED" }, \
103 { XBF_ASYNC, "ASYNC" }, \
104 { XBF_DONE, "DONE" }, \
105 { XBF_DELWRI, "DELWRI" }, \
106 { XBF_STALE, "STALE" }, \
107 { XBF_FS_MANAGED, "FS_MANAGED" }, \
108 { XBF_ORDERED, "ORDERED" }, \
109 { XBF_READ_AHEAD, "READ_AHEAD" }, \
110 { XBF_LOCK, "LOCK" }, /* should never be set */\
111 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
112 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
113 { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \
114 { _XBF_PAGES, "PAGES" }, \
115 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
116 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
117 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \
118 { _XFS_BARRIER_FAILED, "BARRIER_FAILED" }
119
120
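This flag table exists to feed ftrace's __print_flags() in the new buffer tracepoints. A representative sketch of an event class that would consume it; the real definitions live in xfs_trace.h (not part of this hunk) and carry more fields, so treat the field list here as an assumption:

	DECLARE_EVENT_CLASS(xfs_buf_class,
		TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
		TP_ARGS(bp, caller_ip),
		TP_STRUCT__entry(
			__field(dev_t, dev)
			__field(xfs_daddr_t, bno)
			__field(unsigned, flags)
			__field(unsigned long, caller_ip)
		),
		TP_fast_assign(
			__entry->dev = bp->b_target->bt_dev;
			__entry->bno = bp->b_bn;
			__entry->flags = bp->b_flags;
			__entry->caller_ip = caller_ip;
		),
		TP_printk("dev %d:%d bno 0x%llx flags %s caller %pf",
			  MAJOR(__entry->dev), MINOR(__entry->dev),
			  (unsigned long long)__entry->bno,
			  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
			  (void *)__entry->caller_ip)
	);
	DEFINE_EVENT(xfs_buf_class, xfs_buf_lock,
		TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
		TP_ARGS(bp, caller_ip));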
98typedef enum { 121typedef enum {
99 XBT_FORCE_SLEEP = 0, 122 XBT_FORCE_SLEEP = 0,
100 XBT_FORCE_FLUSH = 1, 123 XBT_FORCE_FLUSH = 1,
@@ -186,15 +209,10 @@ extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
186#define xfs_incore(buftarg,blkno,len,lockit) \ 209#define xfs_incore(buftarg,blkno,len,lockit) \
187 _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) 210 _xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
188 211
189extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t, 212extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
190 xfs_buf_flags_t); 213 xfs_buf_flags_t);
191#define xfs_buf_get(target, blkno, len, flags) \ 214extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
192 xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
193
194extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t,
195 xfs_buf_flags_t); 215 xfs_buf_flags_t);
196#define xfs_buf_read(target, blkno, len, flags) \
197 xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
198 216
199extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 217extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
200extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); 218extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
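Note the semantic shift here: the removed xfs_buf_read() macro ignored its flags argument and always passed XBF_LOCK | XBF_MAPPED, so converted callers must now spell those flags out themselves. A hypothetical call site (function name and parameters invented):

	STATIC int
	example_read_buf(
		struct xfs_mount	*mp,
		xfs_off_t		ioff,
		size_t			isize)
	{
		xfs_buf_t	*bp;

		/* the defaults the old macro injected are now explicit */
		bp = xfs_buf_read(mp->m_ddev_targp, ioff, isize,
				  XBF_LOCK | XBF_MAPPED);
		if (!bp)
			return XFS_ERROR(ENOMEM);
		/* ... bp->b_addr is valid because XBF_MAPPED was set ... */
		xfs_buf_relse(bp);
		return 0;
	}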
@@ -214,13 +232,17 @@ extern void xfs_buf_lock(xfs_buf_t *);
214extern void xfs_buf_unlock(xfs_buf_t *); 232extern void xfs_buf_unlock(xfs_buf_t *);
215 233
216/* Buffer Read and Write Routines */ 234/* Buffer Read and Write Routines */
217extern int xfs_bawrite(void *mp, xfs_buf_t *bp); 235extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
218extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); 236extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
237
238extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
239extern int xfs_bdstrat_cb(struct xfs_buf *);
240
219extern void xfs_buf_ioend(xfs_buf_t *, int); 241extern void xfs_buf_ioend(xfs_buf_t *, int);
220extern void xfs_buf_ioerror(xfs_buf_t *, int); 242extern void xfs_buf_ioerror(xfs_buf_t *, int);
221extern int xfs_buf_iorequest(xfs_buf_t *); 243extern int xfs_buf_iorequest(xfs_buf_t *);
222extern int xfs_buf_iowait(xfs_buf_t *); 244extern int xfs_buf_iowait(xfs_buf_t *);
223extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 245extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
224 xfs_buf_rw_t); 246 xfs_buf_rw_t);
225 247
226static inline int xfs_buf_iostrategy(xfs_buf_t *bp) 248static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
@@ -243,49 +265,29 @@ extern int xfs_buf_ispin(xfs_buf_t *);
243 265
244/* Delayed Write Buffer Routines */ 266/* Delayed Write Buffer Routines */
245extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 267extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
268extern void xfs_buf_delwri_promote(xfs_buf_t *);
246 269
247/* Buffer Daemon Setup Routines */ 270/* Buffer Daemon Setup Routines */
248extern int xfs_buf_init(void); 271extern int xfs_buf_init(void);
249extern void xfs_buf_terminate(void); 272extern void xfs_buf_terminate(void);
250 273
251#ifdef XFS_BUF_TRACE
252extern ktrace_t *xfs_buf_trace_buf;
253extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
254#else
255#define xfs_buf_trace(bp,id,ptr,ra) do { } while (0)
256#endif
257
258#define xfs_buf_target_name(target) \ 274#define xfs_buf_target_name(target) \
259 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) 275 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
260 276
261 277
262#define XFS_B_ASYNC XBF_ASYNC
263#define XFS_B_DELWRI XBF_DELWRI
264#define XFS_B_READ XBF_READ
265#define XFS_B_WRITE XBF_WRITE
266#define XFS_B_STALE XBF_STALE
267
268#define XFS_BUF_TRYLOCK XBF_TRYLOCK
269#define XFS_INCORE_TRYLOCK XBF_TRYLOCK
270#define XFS_BUF_LOCK XBF_LOCK
271#define XFS_BUF_MAPPED XBF_MAPPED
272
273#define BUF_BUSY XBF_DONT_BLOCK
274
275#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) 278#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
276#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 279#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
277 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 280 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
278 281
279#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) 282#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE)
280#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) 283#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
281#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) 284#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
282#define XFS_BUF_SUPER_STALE(bp) do { \ 285#define XFS_BUF_SUPER_STALE(bp) do { \
283 XFS_BUF_STALE(bp); \ 286 XFS_BUF_STALE(bp); \
284 xfs_buf_delwri_dequeue(bp); \ 287 xfs_buf_delwri_dequeue(bp); \
285 XFS_BUF_DONE(bp); \ 288 XFS_BUF_DONE(bp); \
286 } while (0) 289 } while (0)
287 290
288#define XFS_BUF_MANAGE XBF_FS_MANAGED
289#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) 291#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
290 292
291#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 293#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
@@ -370,39 +372,15 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
370 372
371#define xfs_bpin(bp) xfs_buf_pin(bp) 373#define xfs_bpin(bp) xfs_buf_pin(bp)
372#define xfs_bunpin(bp) xfs_buf_unpin(bp) 374#define xfs_bunpin(bp) xfs_buf_unpin(bp)
373
374#define xfs_buftrace(id, bp) \
375 xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
376
377#define xfs_biodone(bp) xfs_buf_ioend(bp, 0) 375#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
378 376
379#define xfs_biomove(bp, off, len, data, rw) \ 377#define xfs_biomove(bp, off, len, data, rw) \
380 xfs_buf_iomove((bp), (off), (len), (data), \ 378 xfs_buf_iomove((bp), (off), (len), (data), \
381 ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) 379 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
382 380
383#define xfs_biozero(bp, off, len) \ 381#define xfs_biozero(bp, off, len) \
384 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) 382 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
385 383
386
387static inline int XFS_bwrite(xfs_buf_t *bp)
388{
389 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
390 int error = 0;
391
392 if (!iowait)
393 bp->b_flags |= _XBF_RUN_QUEUES;
394
395 xfs_buf_delwri_dequeue(bp);
396 xfs_buf_iostrategy(bp);
397 if (iowait) {
398 error = xfs_buf_iowait(bp);
399 xfs_buf_relse(bp);
400 }
401 return error;
402}
403
404#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
405
406#define xfs_iowait(bp) xfs_buf_iowait(bp) 384#define xfs_iowait(bp) xfs_buf_iowait(bp)
407 385
408#define xfs_baread(target, rablkno, ralen) \ 386#define xfs_baread(target, rablkno, ralen) \
@@ -417,6 +395,7 @@ extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
417extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
418extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
419extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 397extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
398
420#ifdef CONFIG_KDB_MODULES 399#ifdef CONFIG_KDB_MODULES
421extern struct list_head *xfs_get_buftarg_list(void); 400extern struct list_head *xfs_get_buftarg_list(void);
422#endif 401#endif
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 87b8cbd23d4b..846b75aeb2ab 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,6 +29,7 @@
29#include "xfs_vnodeops.h" 29#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_inode_item.h"
32 33
33/* 34/*
34 * Note that we only accept fileids which are long enough rather than allow 35 * Note that we only accept fileids which are long enough rather than allow
@@ -215,9 +216,28 @@ xfs_fs_get_parent(
215 return d_obtain_alias(VFS_I(cip)); 216 return d_obtain_alias(VFS_I(cip));
216} 217}
217 218
219STATIC int
220xfs_fs_nfs_commit_metadata(
221 struct inode *inode)
222{
223 struct xfs_inode *ip = XFS_I(inode);
224 struct xfs_mount *mp = ip->i_mount;
225 int error = 0;
226
227 xfs_ilock(ip, XFS_ILOCK_SHARED);
228 if (xfs_ipincount(ip)) {
229 error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
230 XFS_LOG_SYNC, NULL);
231 }
232 xfs_iunlock(ip, XFS_ILOCK_SHARED);
233
234 return error;
235}
236
218const struct export_operations xfs_export_operations = { 237const struct export_operations xfs_export_operations = {
219 .encode_fh = xfs_fs_encode_fh, 238 .encode_fh = xfs_fs_encode_fh,
220 .fh_to_dentry = xfs_fs_fh_to_dentry, 239 .fh_to_dentry = xfs_fs_fh_to_dentry,
221 .fh_to_parent = xfs_fs_fh_to_parent, 240 .fh_to_parent = xfs_fs_fh_to_parent,
222 .get_parent = xfs_fs_get_parent, 241 .get_parent = xfs_fs_get_parent,
242 .commit_metadata = xfs_fs_nfs_commit_metadata,
223}; 243};
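The new commit_metadata hook lets an NFS server flush just the log state covering an inode instead of writing the whole inode back. A simplified sketch of the expected caller side (modelled on what nfsd would do; this is not the exact nfsd code):

	static int
	commit_inode_metadata(
		struct inode		*inode)
	{
		const struct export_operations *ops = inode->i_sb->s_export_op;

		if (ops && ops->commit_metadata)
			return ops->commit_metadata(inode);
		/* filesystems without the hook get a full sync writeback */
		return write_inode_now(inode, 1);
	}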
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index eff61e2732af..42dd3bcfba6b 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -16,6 +16,7 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h"
19#include "xfs_bit.h" 20#include "xfs_bit.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_inum.h" 22#include "xfs_inum.h"
@@ -34,52 +35,279 @@
34#include "xfs_dir2_sf.h" 35#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 36#include "xfs_dinode.h"
36#include "xfs_inode.h" 37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
37#include "xfs_error.h" 40#include "xfs_error.h"
38#include "xfs_rw.h" 41#include "xfs_rw.h"
39#include "xfs_vnodeops.h" 42#include "xfs_vnodeops.h"
40#include "xfs_da_btree.h" 43#include "xfs_da_btree.h"
41#include "xfs_ioctl.h" 44#include "xfs_ioctl.h"
45#include "xfs_trace.h"
42 46
43#include <linux/dcache.h> 47#include <linux/dcache.h>
44 48
45static const struct vm_operations_struct xfs_file_vm_ops; 49static const struct vm_operations_struct xfs_file_vm_ops;
46 50
47STATIC ssize_t 51/*
48xfs_file_aio_read( 52 * xfs_iozero
49 struct kiocb *iocb, 53 *
 50 const struct iovec *iov, 54 * xfs_iozero clears the specified range of the file supplied,
51 unsigned long nr_segs, 55 * and marks all the affected blocks as valid and modified. If
52 loff_t pos) 56 * an affected block is not allocated, it will be allocated. If
57 * an affected block is not completely overwritten, and is not
58 * valid before the operation, it will be read from disk before
59 * being partially zeroed.
60 */
61STATIC int
62xfs_iozero(
63 struct xfs_inode *ip, /* inode */
64 loff_t pos, /* offset in file */
65 size_t count) /* size of data to zero */
53{ 66{
54 struct file *file = iocb->ki_filp; 67 struct page *page;
55 int ioflags = IO_ISAIO; 68 struct address_space *mapping;
69 int status;
56 70
57 BUG_ON(iocb->ki_pos != pos); 71 mapping = VFS_I(ip)->i_mapping;
58 if (unlikely(file->f_flags & O_DIRECT)) 72 do {
59 ioflags |= IO_ISDIRECT; 73 unsigned offset, bytes;
60 if (file->f_mode & FMODE_NOCMTIME) 74 void *fsdata;
61 ioflags |= IO_INVIS; 75
62 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, 76 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
63 nr_segs, &iocb->ki_pos, ioflags); 77 bytes = PAGE_CACHE_SIZE - offset;
78 if (bytes > count)
79 bytes = count;
80
81 status = pagecache_write_begin(NULL, mapping, pos, bytes,
82 AOP_FLAG_UNINTERRUPTIBLE,
83 &page, &fsdata);
84 if (status)
85 break;
86
87 zero_user(page, offset, bytes);
88
89 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
90 page, fsdata);
91 WARN_ON(status <= 0); /* can't return less than zero! */
92 pos += bytes;
93 count -= bytes;
94 status = 0;
95 } while (count);
96
97 return (-status);
98}
99
100STATIC int
101xfs_file_fsync(
102 struct file *file,
103 struct dentry *dentry,
104 int datasync)
105{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode);
107 struct xfs_trans *tp;
108 int error = 0;
109 int log_flushed = 0;
110
111 xfs_itrace_entry(ip);
112
113 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
114 return -XFS_ERROR(EIO);
115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117
118 /*
119 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the
121 * log because of committed transactions that haven't hit the disk yet.
122 * Likewise, there could be unflushed non-transactional changes to the
123 * inode core that have to go to disk and this requires us to issue
124 * a synchronous transaction to capture these changes correctly.
125 *
126 * This code relies on the assumption that if the i_update_core field
127 * of the inode is clear and the inode is unpinned then it is clean
128 * and no action is required.
129 */
130 xfs_ilock(ip, XFS_ILOCK_SHARED);
131
132 /*
133 * First check if the VFS inode is marked dirty. All the dirtying
 134 * of non-transactional updates now goes through mark_inode_dirty*,
 135 * which allows us to distinguish between pure timestamp updates
 136 * and i_size updates which need to be caught for fdatasync.
 137 * After that also check for the dirty state in the XFS inode, which
 138 * might get cleared when the inode gets written out via the AIL
139 * or xfs_iflush_cluster.
140 */
141 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
142 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
143 ip->i_update_core) {
144 /*
145 * Kick off a transaction to log the inode core to get the
146 * updates. The sync transaction will also force the log.
147 */
148 xfs_iunlock(ip, XFS_ILOCK_SHARED);
149 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
150 error = xfs_trans_reserve(tp, 0,
151 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
152 if (error) {
153 xfs_trans_cancel(tp, 0);
154 return -error;
155 }
156 xfs_ilock(ip, XFS_ILOCK_EXCL);
157
158 /*
159 * Note - it's possible that we might have pushed ourselves out
160 * of the way during trans_reserve which would flush the inode.
161 * But there's no guarantee that the inode buffer has actually
162 * gone out yet (it's delwri). Plus the buffer could be pinned
163 * anyway if it's part of an inode in another recent
164 * transaction. So we play it safe and fire off the
165 * transaction anyway.
166 */
167 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
168 xfs_trans_ihold(tp, ip);
169 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
170 xfs_trans_set_sync(tp);
171 error = _xfs_trans_commit(tp, 0, &log_flushed);
172
173 xfs_iunlock(ip, XFS_ILOCK_EXCL);
174 } else {
175 /*
176 * Timestamps/size haven't changed since last inode flush or
177 * inode transaction commit. That means either nothing got
178 * written or a transaction committed which caught the updates.
179 * If the latter happened and the transaction hasn't hit the
 180 * disk yet, the inode will still be pinned. If it is,
181 * force the log.
182 */
183 if (xfs_ipincount(ip)) {
184 error = _xfs_log_force_lsn(ip->i_mount,
185 ip->i_itemp->ili_last_lsn,
186 XFS_LOG_SYNC, &log_flushed);
187 }
188 xfs_iunlock(ip, XFS_ILOCK_SHARED);
189 }
190
191 if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
192 /*
193 * If the log write didn't issue an ordered tag we need
194 * to flush the disk cache for the data device now.
195 */
196 if (!log_flushed)
197 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
198
199 /*
200 * If this inode is on the RT dev we need to flush that
201 * cache as well.
202 */
203 if (XFS_IS_REALTIME_INODE(ip))
204 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
205 }
206
207 return -error;
64} 208}
65 209
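The dirty-state test in xfs_file_fsync() above packs three conditions into one if(); restated as a hypothetical helper purely for readability:

	/* Do we need a transaction to log the inode core?  Hypothetical
	 * restatement of the test in xfs_file_fsync() above. */
	static inline int
	xfs_fsync_needs_inode_log(
		struct inode		*inode,
		struct xfs_inode	*ip,
		int			datasync)
	{
		if (!ip->i_update_core)
			return 0;	/* XFS inode core already captured */
		if (inode->i_state & I_DIRTY_DATASYNC)
			return 1;	/* i_size-type updates always matter */
		if ((inode->i_state & I_DIRTY_SYNC) && !datasync)
			return 1;	/* timestamp-only: full fsync only */
		return 0;
	}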
66STATIC ssize_t 210STATIC ssize_t
67xfs_file_aio_write( 211xfs_file_aio_read(
68 struct kiocb *iocb, 212 struct kiocb *iocb,
69 const struct iovec *iov, 213 const struct iovec *iovp,
70 unsigned long nr_segs, 214 unsigned long nr_segs,
71 loff_t pos) 215 loff_t pos)
72{ 216{
73 struct file *file = iocb->ki_filp; 217 struct file *file = iocb->ki_filp;
74 int ioflags = IO_ISAIO; 218 struct inode *inode = file->f_mapping->host;
219 struct xfs_inode *ip = XFS_I(inode);
220 struct xfs_mount *mp = ip->i_mount;
221 size_t size = 0;
222 ssize_t ret = 0;
223 int ioflags = 0;
224 xfs_fsize_t n;
225 unsigned long seg;
226
227 XFS_STATS_INC(xs_read_calls);
75 228
76 BUG_ON(iocb->ki_pos != pos); 229 BUG_ON(iocb->ki_pos != pos);
230
77 if (unlikely(file->f_flags & O_DIRECT)) 231 if (unlikely(file->f_flags & O_DIRECT))
78 ioflags |= IO_ISDIRECT; 232 ioflags |= IO_ISDIRECT;
79 if (file->f_mode & FMODE_NOCMTIME) 233 if (file->f_mode & FMODE_NOCMTIME)
80 ioflags |= IO_INVIS; 234 ioflags |= IO_INVIS;
81 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, 235
82 &iocb->ki_pos, ioflags); 236 /* START copy & waste from filemap.c */
237 for (seg = 0; seg < nr_segs; seg++) {
238 const struct iovec *iv = &iovp[seg];
239
240 /*
241 * If any segment has a negative length, or the cumulative
 242 * length ever wraps negative, then return -EINVAL.
243 */
244 size += iv->iov_len;
245 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
246 return XFS_ERROR(-EINVAL);
247 }
248 /* END copy & waste from filemap.c */
249
250 if (unlikely(ioflags & IO_ISDIRECT)) {
251 xfs_buftarg_t *target =
252 XFS_IS_REALTIME_INODE(ip) ?
253 mp->m_rtdev_targp : mp->m_ddev_targp;
254 if ((iocb->ki_pos & target->bt_smask) ||
255 (size & target->bt_smask)) {
256 if (iocb->ki_pos == ip->i_size)
257 return 0;
258 return -XFS_ERROR(EINVAL);
259 }
260 }
261
262 n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
263 if (n <= 0 || size == 0)
264 return 0;
265
266 if (n < size)
267 size = n;
268
269 if (XFS_FORCED_SHUTDOWN(mp))
270 return -EIO;
271
272 if (unlikely(ioflags & IO_ISDIRECT))
273 mutex_lock(&inode->i_mutex);
274 xfs_ilock(ip, XFS_IOLOCK_SHARED);
275
276 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
277 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
278 int iolock = XFS_IOLOCK_SHARED;
279
280 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
281 dmflags, &iolock);
282 if (ret) {
283 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
284 if (unlikely(ioflags & IO_ISDIRECT))
285 mutex_unlock(&inode->i_mutex);
286 return ret;
287 }
288 }
289
290 if (unlikely(ioflags & IO_ISDIRECT)) {
291 if (inode->i_mapping->nrpages) {
292 ret = -xfs_flushinval_pages(ip,
293 (iocb->ki_pos & PAGE_CACHE_MASK),
294 -1, FI_REMAPF_LOCKED);
295 }
296 mutex_unlock(&inode->i_mutex);
297 if (ret) {
298 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
299 return ret;
300 }
301 }
302
303 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
304
305 ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
306 if (ret > 0)
307 XFS_STATS_ADD(xs_read_bytes, ret);
308
309 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
310 return ret;
83} 311}
84 312
85STATIC ssize_t 313STATIC ssize_t
@@ -87,16 +315,44 @@ xfs_file_splice_read(
87 struct file *infilp, 315 struct file *infilp,
88 loff_t *ppos, 316 loff_t *ppos,
89 struct pipe_inode_info *pipe, 317 struct pipe_inode_info *pipe,
90 size_t len, 318 size_t count,
91 unsigned int flags) 319 unsigned int flags)
92{ 320{
321 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
322 struct xfs_mount *mp = ip->i_mount;
93 int ioflags = 0; 323 int ioflags = 0;
324 ssize_t ret;
325
326 XFS_STATS_INC(xs_read_calls);
94 327
95 if (infilp->f_mode & FMODE_NOCMTIME) 328 if (infilp->f_mode & FMODE_NOCMTIME)
96 ioflags |= IO_INVIS; 329 ioflags |= IO_INVIS;
97 330
98 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 331 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
99 infilp, ppos, pipe, len, flags, ioflags); 332 return -EIO;
333
334 xfs_ilock(ip, XFS_IOLOCK_SHARED);
335
336 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
337 int iolock = XFS_IOLOCK_SHARED;
338 int error;
339
340 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
341 FILP_DELAY_FLAG(infilp), &iolock);
342 if (error) {
343 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
344 return -error;
345 }
346 }
347
348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
349
350 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
351 if (ret > 0)
352 XFS_STATS_ADD(xs_read_bytes, ret);
353
354 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
355 return ret;
100} 356}
101 357
102STATIC ssize_t 358STATIC ssize_t
@@ -104,16 +360,538 @@ xfs_file_splice_write(
104 struct pipe_inode_info *pipe, 360 struct pipe_inode_info *pipe,
105 struct file *outfilp, 361 struct file *outfilp,
106 loff_t *ppos, 362 loff_t *ppos,
107 size_t len, 363 size_t count,
108 unsigned int flags) 364 unsigned int flags)
109{ 365{
366 struct inode *inode = outfilp->f_mapping->host;
367 struct xfs_inode *ip = XFS_I(inode);
368 struct xfs_mount *mp = ip->i_mount;
369 xfs_fsize_t isize, new_size;
110 int ioflags = 0; 370 int ioflags = 0;
371 ssize_t ret;
372
373 XFS_STATS_INC(xs_write_calls);
111 374
112 if (outfilp->f_mode & FMODE_NOCMTIME) 375 if (outfilp->f_mode & FMODE_NOCMTIME)
113 ioflags |= IO_INVIS; 376 ioflags |= IO_INVIS;
114 377
115 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 378 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
116 pipe, outfilp, ppos, len, flags, ioflags); 379 return -EIO;
380
381 xfs_ilock(ip, XFS_IOLOCK_EXCL);
382
383 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
384 int iolock = XFS_IOLOCK_EXCL;
385 int error;
386
387 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
388 FILP_DELAY_FLAG(outfilp), &iolock);
389 if (error) {
390 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
391 return -error;
392 }
393 }
394
395 new_size = *ppos + count;
396
397 xfs_ilock(ip, XFS_ILOCK_EXCL);
398 if (new_size > ip->i_size)
399 ip->i_new_size = new_size;
400 xfs_iunlock(ip, XFS_ILOCK_EXCL);
401
402 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
403
404 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
405 if (ret > 0)
406 XFS_STATS_ADD(xs_write_bytes, ret);
407
408 isize = i_size_read(inode);
409 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
410 *ppos = isize;
411
412 if (*ppos > ip->i_size) {
413 xfs_ilock(ip, XFS_ILOCK_EXCL);
414 if (*ppos > ip->i_size)
415 ip->i_size = *ppos;
416 xfs_iunlock(ip, XFS_ILOCK_EXCL);
417 }
418
419 if (ip->i_new_size) {
420 xfs_ilock(ip, XFS_ILOCK_EXCL);
421 ip->i_new_size = 0;
422 if (ip->i_d.di_size > ip->i_size)
423 ip->i_d.di_size = ip->i_size;
424 xfs_iunlock(ip, XFS_ILOCK_EXCL);
425 }
426 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
427 return ret;
428}
429
430/*
431 * This routine is called to handle zeroing any space in the last
432 * block of the file that is beyond the EOF. We do this since the
433 * size is being increased without writing anything to that block
434 * and we don't want anyone to read the garbage on the disk.
435 */
436STATIC int /* error (positive) */
437xfs_zero_last_block(
438 xfs_inode_t *ip,
439 xfs_fsize_t offset,
440 xfs_fsize_t isize)
441{
442 xfs_fileoff_t last_fsb;
443 xfs_mount_t *mp = ip->i_mount;
444 int nimaps;
445 int zero_offset;
446 int zero_len;
447 int error = 0;
448 xfs_bmbt_irec_t imap;
449
450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
451
452 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
453 if (zero_offset == 0) {
454 /*
455 * There are no extra bytes in the last block on disk to
456 * zero, so return.
457 */
458 return 0;
459 }
460
461 last_fsb = XFS_B_TO_FSBT(mp, isize);
462 nimaps = 1;
463 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
464 &nimaps, NULL, NULL);
465 if (error) {
466 return error;
467 }
468 ASSERT(nimaps > 0);
469 /*
470 * If the block underlying isize is just a hole, then there
471 * is nothing to zero.
472 */
473 if (imap.br_startblock == HOLESTARTBLOCK) {
474 return 0;
475 }
476 /*
477 * Zero the part of the last block beyond the EOF, and write it
478 * out sync. We need to drop the ilock while we do this so we
479 * don't deadlock when the buffer cache calls back to us.
480 */
481 xfs_iunlock(ip, XFS_ILOCK_EXCL);
482
483 zero_len = mp->m_sb.sb_blocksize - zero_offset;
484 if (isize + zero_len > offset)
485 zero_len = offset - isize;
486 error = xfs_iozero(ip, isize, zero_len);
487
488 xfs_ilock(ip, XFS_ILOCK_EXCL);
489 ASSERT(error >= 0);
490 return error;
491}
492
493/*
494 * Zero any on disk space between the current EOF and the new,
495 * larger EOF. This handles the normal case of zeroing the remainder
496 * of the last block in the file and the unusual case of zeroing blocks
497 * out beyond the size of the file. This second case only happens
498 * with fixed size extents and when the system crashes before the inode
 499 * size was updated but after blocks were allocated. Holes and
 500 * unwritten extents in the range are left alone; only written
 501 * blocks beyond the old size are zeroed.
502 */
503
504int /* error (positive) */
505xfs_zero_eof(
506 xfs_inode_t *ip,
507 xfs_off_t offset, /* starting I/O offset */
508 xfs_fsize_t isize) /* current inode size */
509{
510 xfs_mount_t *mp = ip->i_mount;
511 xfs_fileoff_t start_zero_fsb;
512 xfs_fileoff_t end_zero_fsb;
513 xfs_fileoff_t zero_count_fsb;
514 xfs_fileoff_t last_fsb;
515 xfs_fileoff_t zero_off;
516 xfs_fsize_t zero_len;
517 int nimaps;
518 int error = 0;
519 xfs_bmbt_irec_t imap;
520
521 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
522 ASSERT(offset > isize);
523
524 /*
525 * First handle zeroing the block on which isize resides.
526 * We only zero a part of that block so it is handled specially.
527 */
528 error = xfs_zero_last_block(ip, offset, isize);
529 if (error) {
530 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
531 return error;
532 }
533
534 /*
535 * Calculate the range between the new size and the old
536 * where blocks needing to be zeroed may exist. To get the
537 * block where the last byte in the file currently resides,
538 * we need to subtract one from the size and truncate back
539 * to a block boundary. We subtract 1 in case the size is
540 * exactly on a block boundary.
541 */
542 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
543 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
544 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
545 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
546 if (last_fsb == end_zero_fsb) {
547 /*
548 * The size was only incremented on its last block.
549 * We took care of that above, so just return.
550 */
551 return 0;
552 }
553
554 ASSERT(start_zero_fsb <= end_zero_fsb);
555 while (start_zero_fsb <= end_zero_fsb) {
556 nimaps = 1;
557 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
558 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
559 0, NULL, 0, &imap, &nimaps, NULL, NULL);
560 if (error) {
561 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
562 return error;
563 }
564 ASSERT(nimaps > 0);
565
566 if (imap.br_state == XFS_EXT_UNWRITTEN ||
567 imap.br_startblock == HOLESTARTBLOCK) {
568 /*
569 * This loop handles initializing pages that were
570 * partially initialized by the code below this
571 * loop. It basically zeroes the part of the page
572 * that sits on a hole and sets the page as P_HOLE
573 * and calls remapf if it is a mapped file.
574 */
575 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
576 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
577 continue;
578 }
579
580 /*
581 * There are blocks we need to zero.
582 * Drop the inode lock while we're doing the I/O.
583 * We'll still have the iolock to protect us.
584 */
585 xfs_iunlock(ip, XFS_ILOCK_EXCL);
586
587 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
588 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
589
590 if ((zero_off + zero_len) > offset)
591 zero_len = offset - zero_off;
592
593 error = xfs_iozero(ip, zero_off, zero_len);
594 if (error) {
595 goto out_lock;
596 }
597
598 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
599 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
600
601 xfs_ilock(ip, XFS_ILOCK_EXCL);
602 }
603
604 return 0;
605
606out_lock:
607 xfs_ilock(ip, XFS_ILOCK_EXCL);
608 ASSERT(error >= 0);
609 return error;
610}
611
612STATIC ssize_t
613xfs_file_aio_write(
614 struct kiocb *iocb,
615 const struct iovec *iovp,
616 unsigned long nr_segs,
617 loff_t pos)
618{
619 struct file *file = iocb->ki_filp;
620 struct address_space *mapping = file->f_mapping;
621 struct inode *inode = mapping->host;
622 struct xfs_inode *ip = XFS_I(inode);
623 struct xfs_mount *mp = ip->i_mount;
624 ssize_t ret = 0, error = 0;
625 int ioflags = 0;
626 xfs_fsize_t isize, new_size;
627 int iolock;
628 int eventsent = 0;
629 size_t ocount = 0, count;
630 int need_i_mutex;
631
632 XFS_STATS_INC(xs_write_calls);
633
634 BUG_ON(iocb->ki_pos != pos);
635
636 if (unlikely(file->f_flags & O_DIRECT))
637 ioflags |= IO_ISDIRECT;
638 if (file->f_mode & FMODE_NOCMTIME)
639 ioflags |= IO_INVIS;
640
641 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
642 if (error)
643 return error;
644
645 count = ocount;
646 if (count == 0)
647 return 0;
648
649 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
650
651 if (XFS_FORCED_SHUTDOWN(mp))
652 return -EIO;
653
654relock:
655 if (ioflags & IO_ISDIRECT) {
656 iolock = XFS_IOLOCK_SHARED;
657 need_i_mutex = 0;
658 } else {
659 iolock = XFS_IOLOCK_EXCL;
660 need_i_mutex = 1;
661 mutex_lock(&inode->i_mutex);
662 }
663
664 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
665
666start:
667 error = -generic_write_checks(file, &pos, &count,
668 S_ISBLK(inode->i_mode));
669 if (error) {
670 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
671 goto out_unlock_mutex;
672 }
673
674 if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
675 !(ioflags & IO_INVIS) && !eventsent)) {
676 int dmflags = FILP_DELAY_FLAG(file);
677
678 if (need_i_mutex)
679 dmflags |= DM_FLAGS_IMUX;
680
681 xfs_iunlock(ip, XFS_ILOCK_EXCL);
682 error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
683 pos, count, dmflags, &iolock);
684 if (error) {
685 goto out_unlock_internal;
686 }
687 xfs_ilock(ip, XFS_ILOCK_EXCL);
688 eventsent = 1;
689
690 /*
691 * The iolock was dropped and reacquired in XFS_SEND_DATA
692 * so we have to recheck the size when appending.
693 * We will only "goto start;" once, since having sent the
694 * event prevents another call to XFS_SEND_DATA, which is
695 * what allows the size to change in the first place.
696 */
697 if ((file->f_flags & O_APPEND) && pos != ip->i_size)
698 goto start;
699 }
700
701 if (ioflags & IO_ISDIRECT) {
702 xfs_buftarg_t *target =
703 XFS_IS_REALTIME_INODE(ip) ?
704 mp->m_rtdev_targp : mp->m_ddev_targp;
705
706 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
707 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
708 return XFS_ERROR(-EINVAL);
709 }
710
711 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
712 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
713 iolock = XFS_IOLOCK_EXCL;
714 need_i_mutex = 1;
715 mutex_lock(&inode->i_mutex);
716 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
717 goto start;
718 }
719 }
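
Aside: the bt_smask test above works because the device sector size is a power of two, so pos & mask and count & mask are nonzero exactly when pos or count is not sector aligned. A tiny illustration, assuming 512-byte sectors (XFS takes the mask from the buftarg rather than hardcoding it):

/* Illustration only: sector-alignment test with a power-of-two mask. */
static int dio_misaligned(unsigned long long pos, unsigned long long count)
{
	const unsigned long long smask = 512 - 1;	/* assumed sector size */

	return (pos & smask) != 0 || (count & smask) != 0;
}

/* dio_misaligned(1024, 4096) == 0 (aligned, direct I/O allowed),
 * dio_misaligned(1000, 4096) == 1 (rejected with EINVAL above). */
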
720
721 new_size = pos + count;
722 if (new_size > ip->i_size)
723 ip->i_new_size = new_size;
724
725 if (likely(!(ioflags & IO_INVIS)))
726 file_update_time(file);
727
 728	/*
 729	 * If the write starts beyond the current size of the file, any
 730	 * already-allocated blocks between the old EOF and the write
 731	 * offset must be zeroed so stale on-disk data is never exposed.
 732	 *
 733	 * In particular, the partial block holding the old EOF must be
 734	 * zeroed out beyond the old size.
 735	 */
736
737 if (pos > ip->i_size) {
738 error = xfs_zero_eof(ip, pos, ip->i_size);
739 if (error) {
740 xfs_iunlock(ip, XFS_ILOCK_EXCL);
741 goto out_unlock_internal;
742 }
743 }
744 xfs_iunlock(ip, XFS_ILOCK_EXCL);
745
746 /*
747 * If we're writing the file then make sure to clear the
748 * setuid and setgid bits if the process is not being run
749 * by root. This keeps people from modifying setuid and
750 * setgid binaries.
751 */
752 error = -file_remove_suid(file);
753 if (unlikely(error))
754 goto out_unlock_internal;
755
756 /* We can write back this queue in page reclaim */
757 current->backing_dev_info = mapping->backing_dev_info;
758
759 if ((ioflags & IO_ISDIRECT)) {
760 if (mapping->nrpages) {
761 WARN_ON(need_i_mutex == 0);
762 error = xfs_flushinval_pages(ip,
763 (pos & PAGE_CACHE_MASK),
764 -1, FI_REMAPF_LOCKED);
765 if (error)
766 goto out_unlock_internal;
767 }
768
769 if (need_i_mutex) {
770 /* demote the lock now the cached pages are gone */
771 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
772 mutex_unlock(&inode->i_mutex);
773
774 iolock = XFS_IOLOCK_SHARED;
775 need_i_mutex = 0;
776 }
777
778 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
779 ret = generic_file_direct_write(iocb, iovp,
780 &nr_segs, pos, &iocb->ki_pos, count, ocount);
781
782 /*
783 * direct-io write to a hole: fall through to buffered I/O
784 * for completing the rest of the request.
785 */
786 if (ret >= 0 && ret != count) {
787 XFS_STATS_ADD(xs_write_bytes, ret);
788
789 pos += ret;
790 count -= ret;
791
792 ioflags &= ~IO_ISDIRECT;
793 xfs_iunlock(ip, iolock);
794 goto relock;
795 }
796 } else {
797 int enospc = 0;
798 ssize_t ret2 = 0;
799
800write_retry:
801 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
802 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
803 pos, &iocb->ki_pos, count, ret);
804 /*
 805		 * if we just got an ENOSPC, flush the inode now that we
 806		 * aren't holding any page locks, and retry *once*
807 */
808 if (ret2 == -ENOSPC && !enospc) {
809 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
810 if (error)
811 goto out_unlock_internal;
812 enospc = 1;
813 goto write_retry;
814 }
815 ret = ret2;
816 }
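
The buffered branch just above retries exactly once on ENOSPC: flushing dirty pages can convert delayed allocations and release reserved space, so a second attempt may succeed where the first failed. The shape of the pattern in isolation (do_write and flush_dirty_pages are hypothetical stand-ins for the generic write and xfs_flush_pages calls):

#include <errno.h>
#include <sys/types.h>

struct wctx;				/* opaque write context (stand-in) */
ssize_t do_write(struct wctx *c);	/* returns -ENOSPC when out of space */
int flush_dirty_pages(struct wctx *c);	/* may release delalloc reservations */

ssize_t write_retry_once(struct wctx *c)
{
	int retried = 0;
	ssize_t ret;

retry:
	ret = do_write(c);
	if (ret == -ENOSPC && !retried) {
		if (flush_dirty_pages(c))
			return -EIO;	/* sketch: real code propagates the flush error */
		retried = 1;
		goto retry;		/* retry exactly once */
	}
	return ret;
}
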
817
818 current->backing_dev_info = NULL;
819
820 isize = i_size_read(inode);
821 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
822 iocb->ki_pos = isize;
823
824 if (iocb->ki_pos > ip->i_size) {
825 xfs_ilock(ip, XFS_ILOCK_EXCL);
826 if (iocb->ki_pos > ip->i_size)
827 ip->i_size = iocb->ki_pos;
828 xfs_iunlock(ip, XFS_ILOCK_EXCL);
829 }
830
831 if (ret == -ENOSPC &&
832 DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
833 xfs_iunlock(ip, iolock);
834 if (need_i_mutex)
835 mutex_unlock(&inode->i_mutex);
836 error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
837 DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
838 0, 0, 0); /* Delay flag intentionally unused */
839 if (need_i_mutex)
840 mutex_lock(&inode->i_mutex);
841 xfs_ilock(ip, iolock);
842 if (error)
843 goto out_unlock_internal;
844 goto start;
845 }
846
847 error = -ret;
848 if (ret <= 0)
849 goto out_unlock_internal;
850
851 XFS_STATS_ADD(xs_write_bytes, ret);
852
853 /* Handle various SYNC-type writes */
854 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
855 loff_t end = pos + ret - 1;
856 int error2;
857
858 xfs_iunlock(ip, iolock);
859 if (need_i_mutex)
860 mutex_unlock(&inode->i_mutex);
861
862 error2 = filemap_write_and_wait_range(mapping, pos, end);
863 if (!error)
864 error = error2;
865 if (need_i_mutex)
866 mutex_lock(&inode->i_mutex);
867 xfs_ilock(ip, iolock);
868
869 error2 = -xfs_file_fsync(file, file->f_path.dentry,
870 (file->f_flags & __O_SYNC) ? 0 : 1);
871 if (!error)
872 error = error2;
873 }
874
875 out_unlock_internal:
876 if (ip->i_new_size) {
877 xfs_ilock(ip, XFS_ILOCK_EXCL);
878 ip->i_new_size = 0;
879 /*
880 * If this was a direct or synchronous I/O that failed (such
881 * as ENOSPC) then part of the I/O may have been written to
 882	 * disk before the error occurred. In this case the on-disk
883 * file size may have been adjusted beyond the in-memory file
884 * size and now needs to be truncated back.
885 */
886 if (ip->i_d.di_size > ip->i_size)
887 ip->i_d.di_size = ip->i_size;
888 xfs_iunlock(ip, XFS_ILOCK_EXCL);
889 }
890 xfs_iunlock(ip, iolock);
891 out_unlock_mutex:
892 if (need_i_mutex)
893 mutex_unlock(&inode->i_mutex);
894 return -error;
117} 895}
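
A convention worth calling out in the function above: core XFS routines return positive errno values while the VFS expects negative ones, which is why the code negates at each boundary (error = -generic_write_checks(...) on the way in, return -error on the way out). In miniature:

#include <errno.h>

/* Core-style helper: positive errno on failure (XFS-internal convention). */
static int core_op(int fail)
{
	return fail ? EIO : 0;
}

/* VFS-facing wrapper: negate at the boundary so callers see -EIO. */
static int vfs_op(int fail)
{
	int error = core_op(fail);

	return -error;
}
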
118 896
119STATIC int 897STATIC int
@@ -160,28 +938,6 @@ xfs_file_release(
160 return -xfs_release(XFS_I(inode)); 938 return -xfs_release(XFS_I(inode));
161} 939}
162 940
163/*
164 * We ignore the datasync flag here because a datasync is effectively
165 * identical to an fsync. That is, datasync implies that we need to write
166 * only the metadata needed to be able to access the data that is written
167 * if we crash after the call completes. Hence if we are writing beyond
168 * EOF we have to log the inode size change as well, which makes it a
169 * full fsync. If we don't write beyond EOF, the inode core will be
170 * clean in memory and so we don't need to log the inode, just like
171 * fsync.
172 */
173STATIC int
174xfs_file_fsync(
175 struct file *file,
176 struct dentry *dentry,
177 int datasync)
178{
179 struct xfs_inode *ip = XFS_I(dentry->d_inode);
180
181 xfs_iflags_clear(ip, XFS_ITRUNCATED);
182 return -xfs_fsync(ip);
183}
184
185STATIC int 941STATIC int
186xfs_file_readdir( 942xfs_file_readdir(
187 struct file *filp, 943 struct file *filp,
@@ -203,9 +959,9 @@ xfs_file_readdir(
203 * 959 *
204 * Try to give it an estimate that's good enough, maybe at some 960 * Try to give it an estimate that's good enough, maybe at some
205 * point we can change the ->readdir prototype to include the 961 * point we can change the ->readdir prototype to include the
206 * buffer size. 962 * buffer size. For now we use the current glibc buffer size.
207 */ 963 */
208 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size); 964 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
209 965
210 error = xfs_readdir(ip, dirent, bufsize, 966 error = xfs_readdir(ip, dirent, bufsize,
211 (xfs_off_t *)&filp->f_pos, filldir); 967 (xfs_off_t *)&filp->f_pos, filldir);
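
The 32768 above is an assumption about the caller rather than a protocol constant: it matches the getdents buffer glibc used at the time, and the min_t clamp also keeps small directories from over-allocating. Schematically:

/* Schematic of the estimate: format at most min(assumed glibc buffer,
 * directory size) bytes per ->readdir call. */
static unsigned long readdir_bufsize(long long di_size)
{
	const long long glibc_buf = 32768;

	return (unsigned long)(di_size < glibc_buf ? di_size : glibc_buf);
}
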
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 08be36d7326c..b6918d76bc7b 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -19,6 +19,7 @@
19#include "xfs_vnodeops.h" 19#include "xfs_vnodeops.h"
20#include "xfs_bmap_btree.h" 20#include "xfs_bmap_btree.h"
21#include "xfs_inode.h" 21#include "xfs_inode.h"
22#include "xfs_trace.h"
22 23
23int fs_noerr(void) { return 0; } 24int fs_noerr(void) { return 0; }
24int fs_nosys(void) { return ENOSYS; } 25int fs_nosys(void) { return ENOSYS; }
@@ -51,6 +52,8 @@ xfs_flushinval_pages(
51 struct address_space *mapping = VFS_I(ip)->i_mapping; 52 struct address_space *mapping = VFS_I(ip)->i_mapping;
52 int ret = 0; 53 int ret = 0;
53 54
55 trace_xfs_pagecache_inval(ip, first, last);
56
54 if (mapping->nrpages) { 57 if (mapping->nrpages) {
55 xfs_iflags_clear(ip, XFS_ITRUNCATED); 58 xfs_iflags_clear(ip, XFS_ITRUNCATED);
56 ret = filemap_write_and_wait(mapping); 59 ret = filemap_write_and_wait(mapping);
@@ -76,7 +79,7 @@ xfs_flush_pages(
76 xfs_iflags_clear(ip, XFS_ITRUNCATED); 79 xfs_iflags_clear(ip, XFS_ITRUNCATED);
77 ret = -filemap_fdatawrite(mapping); 80 ret = -filemap_fdatawrite(mapping);
78 } 81 }
79 if (flags & XFS_B_ASYNC) 82 if (flags & XBF_ASYNC)
80 return ret; 83 return ret;
81 ret2 = xfs_wait_on_pages(ip, first, last); 84 ret2 = xfs_wait_on_pages(ip, first, last);
82 if (!ret) 85 if (!ret)
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 5bb523d7f37e..7b26cc2fd284 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -51,12 +51,14 @@
51#include "xfs_quota.h" 51#include "xfs_quota.h"
52#include "xfs_inode_item.h" 52#include "xfs_inode_item.h"
53#include "xfs_export.h" 53#include "xfs_export.h"
54#include "xfs_trace.h"
54 55
55#include <linux/capability.h> 56#include <linux/capability.h>
56#include <linux/dcache.h> 57#include <linux/dcache.h>
57#include <linux/mount.h> 58#include <linux/mount.h>
58#include <linux/namei.h> 59#include <linux/namei.h>
59#include <linux/pagemap.h> 60#include <linux/pagemap.h>
61#include <linux/slab.h>
60#include <linux/exportfs.h> 62#include <linux/exportfs.h>
61 63
62/* 64/*
@@ -446,12 +448,12 @@ xfs_attrlist_by_handle(
446int 448int
447xfs_attrmulti_attr_get( 449xfs_attrmulti_attr_get(
448 struct inode *inode, 450 struct inode *inode,
449 char *name, 451 unsigned char *name,
450 char __user *ubuf, 452 unsigned char __user *ubuf,
451 __uint32_t *len, 453 __uint32_t *len,
452 __uint32_t flags) 454 __uint32_t flags)
453{ 455{
454 char *kbuf; 456 unsigned char *kbuf;
455 int error = EFAULT; 457 int error = EFAULT;
456 458
457 if (*len > XATTR_SIZE_MAX) 459 if (*len > XATTR_SIZE_MAX)
@@ -475,12 +477,12 @@ xfs_attrmulti_attr_get(
475int 477int
476xfs_attrmulti_attr_set( 478xfs_attrmulti_attr_set(
477 struct inode *inode, 479 struct inode *inode,
478 char *name, 480 unsigned char *name,
479 const char __user *ubuf, 481 const unsigned char __user *ubuf,
480 __uint32_t len, 482 __uint32_t len,
481 __uint32_t flags) 483 __uint32_t flags)
482{ 484{
483 char *kbuf; 485 unsigned char *kbuf;
484 int error = EFAULT; 486 int error = EFAULT;
485 487
486 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 488 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -500,7 +502,7 @@ xfs_attrmulti_attr_set(
500int 502int
501xfs_attrmulti_attr_remove( 503xfs_attrmulti_attr_remove(
502 struct inode *inode, 504 struct inode *inode,
503 char *name, 505 unsigned char *name,
504 __uint32_t flags) 506 __uint32_t flags)
505{ 507{
506 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 508 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -518,7 +520,7 @@ xfs_attrmulti_by_handle(
518 xfs_fsop_attrmulti_handlereq_t am_hreq; 520 xfs_fsop_attrmulti_handlereq_t am_hreq;
519 struct dentry *dentry; 521 struct dentry *dentry;
520 unsigned int i, size; 522 unsigned int i, size;
521 char *attr_name; 523 unsigned char *attr_name;
522 524
523 if (!capable(CAP_SYS_ADMIN)) 525 if (!capable(CAP_SYS_ADMIN))
524 return -XFS_ERROR(EPERM); 526 return -XFS_ERROR(EPERM);
@@ -546,7 +548,7 @@ xfs_attrmulti_by_handle(
546 548
547 error = 0; 549 error = 0;
548 for (i = 0; i < am_hreq.opcount; i++) { 550 for (i = 0; i < am_hreq.opcount; i++) {
549 ops[i].am_error = strncpy_from_user(attr_name, 551 ops[i].am_error = strncpy_from_user((char *)attr_name,
550 ops[i].am_attrname, MAXNAMELEN); 552 ops[i].am_attrname, MAXNAMELEN);
551 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 553 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
552 error = -ERANGE; 554 error = -ERANGE;
@@ -1430,6 +1432,9 @@ xfs_file_ioctl(
1430 if (!capable(CAP_SYS_ADMIN)) 1432 if (!capable(CAP_SYS_ADMIN))
1431 return -EPERM; 1433 return -EPERM;
1432 1434
1435 if (mp->m_flags & XFS_MOUNT_RDONLY)
1436 return -XFS_ERROR(EROFS);
1437
1433 if (copy_from_user(&inout, arg, sizeof(inout))) 1438 if (copy_from_user(&inout, arg, sizeof(inout)))
1434 return -XFS_ERROR(EFAULT); 1439 return -XFS_ERROR(EFAULT);
1435 1440
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 7bd7c6afc1eb..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -45,23 +45,23 @@ xfs_readlink_by_handle(
45extern int 45extern int
46xfs_attrmulti_attr_get( 46xfs_attrmulti_attr_get(
47 struct inode *inode, 47 struct inode *inode,
48 char *name, 48 unsigned char *name,
49 char __user *ubuf, 49 unsigned char __user *ubuf,
50 __uint32_t *len, 50 __uint32_t *len,
51 __uint32_t flags); 51 __uint32_t flags);
52 52
53extern int 53extern int
54 xfs_attrmulti_attr_set( 54xfs_attrmulti_attr_set(
55 struct inode *inode, 55 struct inode *inode,
56 char *name, 56 unsigned char *name,
57 const char __user *ubuf, 57 const unsigned char __user *ubuf,
58 __uint32_t len, 58 __uint32_t len,
59 __uint32_t flags); 59 __uint32_t flags);
60 60
61extern int 61extern int
62xfs_attrmulti_attr_remove( 62xfs_attrmulti_attr_remove(
63 struct inode *inode, 63 struct inode *inode,
64 char *name, 64 unsigned char *name,
65 __uint32_t flags); 65 __uint32_t flags);
66 66
67extern struct dentry * 67extern struct dentry *
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index eafcc7c18706..593c05b4df8d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -18,6 +18,7 @@
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/ioctl.h> 19#include <linux/ioctl.h>
20#include <linux/mount.h> 20#include <linux/mount.h>
21#include <linux/slab.h>
21#include <asm/uaccess.h> 22#include <asm/uaccess.h>
22#include "xfs.h" 23#include "xfs.h"
23#include "xfs_fs.h" 24#include "xfs_fs.h"
@@ -46,6 +47,7 @@
46#include "xfs_attr.h" 47#include "xfs_attr.h"
47#include "xfs_ioctl.h" 48#include "xfs_ioctl.h"
48#include "xfs_ioctl32.h" 49#include "xfs_ioctl32.h"
50#include "xfs_trace.h"
49 51
50#define _NATIVE_IOC(cmd, type) \ 52#define _NATIVE_IOC(cmd, type) \
51 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) 53 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
@@ -410,7 +412,7 @@ xfs_compat_attrmulti_by_handle(
410 compat_xfs_fsop_attrmulti_handlereq_t am_hreq; 412 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
411 struct dentry *dentry; 413 struct dentry *dentry;
412 unsigned int i, size; 414 unsigned int i, size;
413 char *attr_name; 415 unsigned char *attr_name;
414 416
415 if (!capable(CAP_SYS_ADMIN)) 417 if (!capable(CAP_SYS_ADMIN))
416 return -XFS_ERROR(EPERM); 418 return -XFS_ERROR(EPERM);
@@ -439,7 +441,7 @@ xfs_compat_attrmulti_by_handle(
439 441
440 error = 0; 442 error = 0;
441 for (i = 0; i < am_hreq.opcount; i++) { 443 for (i = 0; i < am_hreq.opcount; i++) {
442 ops[i].am_error = strncpy_from_user(attr_name, 444 ops[i].am_error = strncpy_from_user((char *)attr_name,
443 compat_ptr(ops[i].am_attrname), 445 compat_ptr(ops[i].am_attrname),
444 MAXNAMELEN); 446 MAXNAMELEN);
445 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 447 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index cd42ef78f6b5..e65a7937f3a4 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -47,6 +47,7 @@
47#include "xfs_buf_item.h" 47#include "xfs_buf_item.h"
48#include "xfs_utils.h" 48#include "xfs_utils.h"
49#include "xfs_vnodeops.h" 49#include "xfs_vnodeops.h"
50#include "xfs_trace.h"
50 51
51#include <linux/capability.h> 52#include <linux/capability.h>
52#include <linux/xattr.h> 53#include <linux/xattr.h>
@@ -55,6 +56,7 @@
55#include <linux/security.h> 56#include <linux/security.h>
56#include <linux/falloc.h> 57#include <linux/falloc.h>
57#include <linux/fiemap.h> 58#include <linux/fiemap.h>
59#include <linux/slab.h>
58 60
59/* 61/*
60 * Bring the timestamps in the XFS inode uptodate. 62 * Bring the timestamps in the XFS inode uptodate.
@@ -90,6 +92,16 @@ xfs_mark_inode_dirty_sync(
90 mark_inode_dirty_sync(inode); 92 mark_inode_dirty_sync(inode);
91} 93}
92 94
95void
96xfs_mark_inode_dirty(
97 xfs_inode_t *ip)
98{
99 struct inode *inode = VFS_I(ip);
100
101 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
102 mark_inode_dirty(inode);
103}
104
93/* 105/*
94 * Change the requested timestamp in the given inode. 106 * Change the requested timestamp in the given inode.
95 * We don't lock across timestamp updates, and we don't log them but 107 * We don't lock across timestamp updates, and we don't log them but
@@ -139,10 +151,10 @@ xfs_init_security(
139 struct xfs_inode *ip = XFS_I(inode); 151 struct xfs_inode *ip = XFS_I(inode);
140 size_t length; 152 size_t length;
141 void *value; 153 void *value;
142 char *name; 154 unsigned char *name;
143 int error; 155 int error;
144 156
145 error = security_inode_init_security(inode, dir, &name, 157 error = security_inode_init_security(inode, dir, (char **)&name,
146 &value, &length); 158 &value, &length);
147 if (error) { 159 if (error) {
148 if (error == -EOPNOTSUPP) 160 if (error == -EOPNOTSUPP)
@@ -573,8 +585,8 @@ xfs_vn_fallocate(
573 bf.l_len = len; 585 bf.l_len = len;
574 586
575 xfs_ilock(ip, XFS_IOLOCK_EXCL); 587 xfs_ilock(ip, XFS_IOLOCK_EXCL);
576 error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 588 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
577 0, XFS_ATTR_NOLOCK); 589 0, XFS_ATTR_NOLOCK);
578 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 590 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
579 offset + len > i_size_read(inode)) 591 offset + len > i_size_read(inode))
580 new_size = offset + len; 592 new_size = offset + len;
@@ -585,7 +597,7 @@ xfs_vn_fallocate(
585 597
586 iattr.ia_valid = ATTR_SIZE; 598 iattr.ia_valid = ATTR_SIZE;
587 iattr.ia_size = new_size; 599 iattr.ia_size = new_size;
588 error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); 600 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
589 } 601 }
590 602
591 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 603 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -793,7 +805,7 @@ xfs_setup_inode(
793 struct inode *inode = &ip->i_vnode; 805 struct inode *inode = &ip->i_vnode;
794 806
795 inode->i_ino = ip->i_ino; 807 inode->i_ino = ip->i_ino;
796 inode->i_state = I_NEW|I_LOCK; 808 inode->i_state = I_NEW;
797 inode_add_to_lists(ip->i_mount->m_super, inode); 809 inode_add_to_lists(ip->i_mount->m_super, inode);
798 810
799 inode->i_mode = ip->i_d.di_mode; 811 inode->i_mode = ip->i_d.di_mode;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 6127e24062d0..facfb323a706 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -40,7 +40,6 @@
40#include <sv.h> 40#include <sv.h>
41#include <time.h> 41#include <time.h>
42 42
43#include <support/ktrace.h>
44#include <support/debug.h> 43#include <support/debug.h>
45#include <support/uuid.h> 44#include <support/uuid.h>
46 45
@@ -89,7 +88,6 @@
89#include <xfs_super.h> 88#include <xfs_super.h>
90#include <xfs_globals.h> 89#include <xfs_globals.h>
91#include <xfs_fs_subr.h> 90#include <xfs_fs_subr.h>
92#include <xfs_lrw.h>
93#include <xfs_buf.h> 91#include <xfs_buf.h>
94 92
95/* 93/*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
deleted file mode 100644
index 072050f8d346..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ /dev/null
@@ -1,922 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h"
30#include "xfs_mount.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_rtalloc.h"
42#include "xfs_error.h"
43#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h"
46#include "xfs_inode_item.h"
47#include "xfs_buf_item.h"
48#include "xfs_utils.h"
49#include "xfs_iomap.h"
50#include "xfs_vnodeops.h"
51
52#include <linux/capability.h>
53#include <linux/writeback.h>
54
55
56#if defined(XFS_RW_TRACE)
57void
58xfs_rw_enter_trace(
59 int tag,
60 xfs_inode_t *ip,
61 void *data,
62 size_t segs,
63 loff_t offset,
64 int ioflags)
65{
66 if (ip->i_rwtrace == NULL)
67 return;
68 ktrace_enter(ip->i_rwtrace,
69 (void *)(unsigned long)tag,
70 (void *)ip,
71 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
72 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
73 (void *)data,
74 (void *)((unsigned long)segs),
75 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
76 (void *)((unsigned long)(offset & 0xffffffff)),
77 (void *)((unsigned long)ioflags),
78 (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)),
79 (void *)((unsigned long)(ip->i_new_size & 0xffffffff)),
80 (void *)((unsigned long)current_pid()),
81 (void *)NULL,
82 (void *)NULL,
83 (void *)NULL,
84 (void *)NULL);
85}
86
87void
88xfs_inval_cached_trace(
89 xfs_inode_t *ip,
90 xfs_off_t offset,
91 xfs_off_t len,
92 xfs_off_t first,
93 xfs_off_t last)
94{
95
96 if (ip->i_rwtrace == NULL)
97 return;
98 ktrace_enter(ip->i_rwtrace,
99 (void *)(__psint_t)XFS_INVAL_CACHED,
100 (void *)ip,
101 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
102 (void *)((unsigned long)(offset & 0xffffffff)),
103 (void *)((unsigned long)((len >> 32) & 0xffffffff)),
104 (void *)((unsigned long)(len & 0xffffffff)),
105 (void *)((unsigned long)((first >> 32) & 0xffffffff)),
106 (void *)((unsigned long)(first & 0xffffffff)),
107 (void *)((unsigned long)((last >> 32) & 0xffffffff)),
108 (void *)((unsigned long)(last & 0xffffffff)),
109 (void *)((unsigned long)current_pid()),
110 (void *)NULL,
111 (void *)NULL,
112 (void *)NULL,
113 (void *)NULL,
114 (void *)NULL);
115}
116#endif
117
118/*
119 * xfs_iozero
120 *
 121 * xfs_iozero clears the specified range of the supplied buffer,
122 * and marks all the affected blocks as valid and modified. If
123 * an affected block is not allocated, it will be allocated. If
124 * an affected block is not completely overwritten, and is not
125 * valid before the operation, it will be read from disk before
126 * being partially zeroed.
127 */
128STATIC int
129xfs_iozero(
130 struct xfs_inode *ip, /* inode */
131 loff_t pos, /* offset in file */
132 size_t count) /* size of data to zero */
133{
134 struct page *page;
135 struct address_space *mapping;
136 int status;
137
138 mapping = VFS_I(ip)->i_mapping;
139 do {
140 unsigned offset, bytes;
141 void *fsdata;
142
143 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
144 bytes = PAGE_CACHE_SIZE - offset;
145 if (bytes > count)
146 bytes = count;
147
148 status = pagecache_write_begin(NULL, mapping, pos, bytes,
149 AOP_FLAG_UNINTERRUPTIBLE,
150 &page, &fsdata);
151 if (status)
152 break;
153
154 zero_user(page, offset, bytes);
155
156 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
157 page, fsdata);
158 WARN_ON(status <= 0); /* can't return less than zero! */
159 pos += bytes;
160 count -= bytes;
161 status = 0;
162 } while (count);
163
164 return (-status);
165}
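
xfs_iozero above goes through the page cache (pagecache_write_begin/zero_user/pagecache_write_end) instead of issuing block I/O directly, so partially covered blocks are read in and zeroed correctly. The per-iteration chunking is the usual "offset within page, bytes to page end, capped by the remainder" computation; standalone, assuming 4096-byte pages:

/* Standalone sketch of the chunking loop in xfs_iozero. */
#define PGSZ 4096u

static void walk_pages(unsigned long long pos, unsigned long long count)
{
	while (count) {
		unsigned offset = (unsigned)(pos & (PGSZ - 1));
		unsigned bytes = PGSZ - offset;	/* up to end of this page */

		if (bytes > count)
			bytes = (unsigned)count;

		/* ... zero 'bytes' bytes of the page at 'pos' here ... */

		pos += bytes;
		count -= bytes;
	}
}
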
166
167ssize_t /* bytes read, or (-) error */
168xfs_read(
169 xfs_inode_t *ip,
170 struct kiocb *iocb,
171 const struct iovec *iovp,
172 unsigned int segs,
173 loff_t *offset,
174 int ioflags)
175{
176 struct file *file = iocb->ki_filp;
177 struct inode *inode = file->f_mapping->host;
178 xfs_mount_t *mp = ip->i_mount;
179 size_t size = 0;
180 ssize_t ret = 0;
181 xfs_fsize_t n;
182 unsigned long seg;
183
184
185 XFS_STATS_INC(xs_read_calls);
186
187 /* START copy & waste from filemap.c */
188 for (seg = 0; seg < segs; seg++) {
189 const struct iovec *iv = &iovp[seg];
190
191 /*
192 * If any segment has a negative length, or the cumulative
193 * length ever wraps negative then return -EINVAL.
194 */
195 size += iv->iov_len;
196 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
197 return XFS_ERROR(-EINVAL);
198 }
199 /* END copy & waste from filemap.c */
200
201 if (unlikely(ioflags & IO_ISDIRECT)) {
202 xfs_buftarg_t *target =
203 XFS_IS_REALTIME_INODE(ip) ?
204 mp->m_rtdev_targp : mp->m_ddev_targp;
205 if ((*offset & target->bt_smask) ||
206 (size & target->bt_smask)) {
207 if (*offset == ip->i_size) {
208 return (0);
209 }
210 return -XFS_ERROR(EINVAL);
211 }
212 }
213
214 n = XFS_MAXIOFFSET(mp) - *offset;
215 if ((n <= 0) || (size == 0))
216 return 0;
217
218 if (n < size)
219 size = n;
220
221 if (XFS_FORCED_SHUTDOWN(mp))
222 return -EIO;
223
224 if (unlikely(ioflags & IO_ISDIRECT))
225 mutex_lock(&inode->i_mutex);
226 xfs_ilock(ip, XFS_IOLOCK_SHARED);
227
228 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
229 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
230 int iolock = XFS_IOLOCK_SHARED;
231
232 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
233 dmflags, &iolock);
234 if (ret) {
235 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
236 if (unlikely(ioflags & IO_ISDIRECT))
237 mutex_unlock(&inode->i_mutex);
238 return ret;
239 }
240 }
241
242 if (unlikely(ioflags & IO_ISDIRECT)) {
243 if (inode->i_mapping->nrpages)
244 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
245 -1, FI_REMAPF_LOCKED);
246 mutex_unlock(&inode->i_mutex);
247 if (ret) {
248 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
249 return ret;
250 }
251 }
252
253 xfs_rw_enter_trace(XFS_READ_ENTER, ip,
254 (void *)iovp, segs, *offset, ioflags);
255
256 iocb->ki_pos = *offset;
257 ret = generic_file_aio_read(iocb, iovp, segs, *offset);
258 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
259 ret = wait_on_sync_kiocb(iocb);
260 if (ret > 0)
261 XFS_STATS_ADD(xs_read_bytes, ret);
262
263 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
264 return ret;
265}
266
267ssize_t
268xfs_splice_read(
269 xfs_inode_t *ip,
270 struct file *infilp,
271 loff_t *ppos,
272 struct pipe_inode_info *pipe,
273 size_t count,
274 int flags,
275 int ioflags)
276{
277 xfs_mount_t *mp = ip->i_mount;
278 ssize_t ret;
279
280 XFS_STATS_INC(xs_read_calls);
281 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
282 return -EIO;
283
284 xfs_ilock(ip, XFS_IOLOCK_SHARED);
285
286 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
287 int iolock = XFS_IOLOCK_SHARED;
288 int error;
289
290 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
291 FILP_DELAY_FLAG(infilp), &iolock);
292 if (error) {
293 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
294 return -error;
295 }
296 }
297 xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, ip,
298 pipe, count, *ppos, ioflags);
299 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
300 if (ret > 0)
301 XFS_STATS_ADD(xs_read_bytes, ret);
302
303 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
304 return ret;
305}
306
307ssize_t
308xfs_splice_write(
309 xfs_inode_t *ip,
310 struct pipe_inode_info *pipe,
311 struct file *outfilp,
312 loff_t *ppos,
313 size_t count,
314 int flags,
315 int ioflags)
316{
317 xfs_mount_t *mp = ip->i_mount;
318 ssize_t ret;
319 struct inode *inode = outfilp->f_mapping->host;
320 xfs_fsize_t isize, new_size;
321
322 XFS_STATS_INC(xs_write_calls);
323 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
324 return -EIO;
325
326 xfs_ilock(ip, XFS_IOLOCK_EXCL);
327
328 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
329 int iolock = XFS_IOLOCK_EXCL;
330 int error;
331
332 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
333 FILP_DELAY_FLAG(outfilp), &iolock);
334 if (error) {
335 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
336 return -error;
337 }
338 }
339
340 new_size = *ppos + count;
341
342 xfs_ilock(ip, XFS_ILOCK_EXCL);
343 if (new_size > ip->i_size)
344 ip->i_new_size = new_size;
345 xfs_iunlock(ip, XFS_ILOCK_EXCL);
346
347 xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, ip,
348 pipe, count, *ppos, ioflags);
349 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
350 if (ret > 0)
351 XFS_STATS_ADD(xs_write_bytes, ret);
352
353 isize = i_size_read(inode);
354 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
355 *ppos = isize;
356
357 if (*ppos > ip->i_size) {
358 xfs_ilock(ip, XFS_ILOCK_EXCL);
359 if (*ppos > ip->i_size)
360 ip->i_size = *ppos;
361 xfs_iunlock(ip, XFS_ILOCK_EXCL);
362 }
363
364 if (ip->i_new_size) {
365 xfs_ilock(ip, XFS_ILOCK_EXCL);
366 ip->i_new_size = 0;
367 if (ip->i_d.di_size > ip->i_size)
368 ip->i_d.di_size = ip->i_size;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
372 return ret;
373}
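
xfs_write and xfs_splice_write share a speculative-size protocol: i_new_size is set to the expected post-write size before I/O is issued (so concurrent writeback may push the on-disk size forward), then cleared afterwards, with di_size trimmed back if the I/O ended short. Reduced to its essentials, with plain-C stand-ins for the inode fields:

/* Reduced sketch of the i_new_size protocol. */
struct sizes {
	long long i_size;	/* in-memory file size */
	long long di_size;	/* on-disk file size */
	long long i_new_size;	/* speculative size hint for writeback */
};

static void before_io(struct sizes *s, long long new_size)
{
	if (new_size > s->i_size)
		s->i_new_size = new_size;
}

static void after_io(struct sizes *s)
{
	s->i_new_size = 0;
	if (s->di_size > s->i_size)	/* I/O failed short: trim back */
		s->di_size = s->i_size;
}
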
374
375/*
376 * This routine is called to handle zeroing any space in the last
377 * block of the file that is beyond the EOF. We do this since the
378 * size is being increased without writing anything to that block
379 * and we don't want anyone to read the garbage on the disk.
380 */
381STATIC int /* error (positive) */
382xfs_zero_last_block(
383 xfs_inode_t *ip,
384 xfs_fsize_t offset,
385 xfs_fsize_t isize)
386{
387 xfs_fileoff_t last_fsb;
388 xfs_mount_t *mp = ip->i_mount;
389 int nimaps;
390 int zero_offset;
391 int zero_len;
392 int error = 0;
393 xfs_bmbt_irec_t imap;
394
395 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
396
397 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
398 if (zero_offset == 0) {
399 /*
400 * There are no extra bytes in the last block on disk to
401 * zero, so return.
402 */
403 return 0;
404 }
405
406 last_fsb = XFS_B_TO_FSBT(mp, isize);
407 nimaps = 1;
408 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
409 &nimaps, NULL, NULL);
410 if (error) {
411 return error;
412 }
413 ASSERT(nimaps > 0);
414 /*
415 * If the block underlying isize is just a hole, then there
416 * is nothing to zero.
417 */
418 if (imap.br_startblock == HOLESTARTBLOCK) {
419 return 0;
420 }
421 /*
422 * Zero the part of the last block beyond the EOF, and write it
423 * out sync. We need to drop the ilock while we do this so we
424 * don't deadlock when the buffer cache calls back to us.
425 */
426 xfs_iunlock(ip, XFS_ILOCK_EXCL);
427
428 zero_len = mp->m_sb.sb_blocksize - zero_offset;
429 if (isize + zero_len > offset)
430 zero_len = offset - isize;
431 error = xfs_iozero(ip, isize, zero_len);
432
433 xfs_ilock(ip, XFS_ILOCK_EXCL);
434 ASSERT(error >= 0);
435 return error;
436}
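
XFS_B_FSB_OFFSET above is the byte offset of isize within its filesystem block; a zero result means the old EOF lands exactly on a block boundary and there is no partial block to clear. With a power-of-two block size the computation is a simple mask, e.g. for 4096-byte blocks:

/* Illustration: offset of isize within a 4096-byte block. */
static unsigned fsb_offset(unsigned long long isize)
{
	return (unsigned)(isize & (4096 - 1));
}

/* fsb_offset(8192) == 0   -> EOF on a block boundary, nothing to zero;
 * fsb_offset(9000) == 808 -> zero the trailing 4096 - 808 bytes,
 *                            capped at 'offset' as in the code above. */
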
437
438/*
439 * Zero any on disk space between the current EOF and the new,
440 * larger EOF. This handles the normal case of zeroing the remainder
441 * of the last block in the file and the unusual case of zeroing blocks
442 * out beyond the size of the file. This second case only happens
443 * with fixed size extents and when the system crashes before the inode
444 * size was updated but after blocks were allocated. If fill is set,
445 * then any holes in the range are filled and zeroed. If not, the holes
446 * are left alone as holes.
447 */
448
449int /* error (positive) */
450xfs_zero_eof(
451 xfs_inode_t *ip,
452 xfs_off_t offset, /* starting I/O offset */
453 xfs_fsize_t isize) /* current inode size */
454{
455 xfs_mount_t *mp = ip->i_mount;
456 xfs_fileoff_t start_zero_fsb;
457 xfs_fileoff_t end_zero_fsb;
458 xfs_fileoff_t zero_count_fsb;
459 xfs_fileoff_t last_fsb;
460 xfs_fileoff_t zero_off;
461 xfs_fsize_t zero_len;
462 int nimaps;
463 int error = 0;
464 xfs_bmbt_irec_t imap;
465
466 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
467 ASSERT(offset > isize);
468
469 /*
470 * First handle zeroing the block on which isize resides.
471 * We only zero a part of that block so it is handled specially.
472 */
473 error = xfs_zero_last_block(ip, offset, isize);
474 if (error) {
475 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
476 return error;
477 }
478
479 /*
480 * Calculate the range between the new size and the old
481 * where blocks needing to be zeroed may exist. To get the
482 * block where the last byte in the file currently resides,
483 * we need to subtract one from the size and truncate back
484 * to a block boundary. We subtract 1 in case the size is
485 * exactly on a block boundary.
486 */
487 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
488 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
489 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
490 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
491 if (last_fsb == end_zero_fsb) {
492 /*
493 * The size was only incremented on its last block.
494 * We took care of that above, so just return.
495 */
496 return 0;
497 }
498
499 ASSERT(start_zero_fsb <= end_zero_fsb);
500 while (start_zero_fsb <= end_zero_fsb) {
501 nimaps = 1;
502 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
503 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
504 0, NULL, 0, &imap, &nimaps, NULL, NULL);
505 if (error) {
506 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
507 return error;
508 }
509 ASSERT(nimaps > 0);
510
511 if (imap.br_state == XFS_EXT_UNWRITTEN ||
512 imap.br_startblock == HOLESTARTBLOCK) {
 513			/*
 514			 * The extent is a hole or an unwritten
 515			 * extent, so there is nothing on disk that
 516			 * needs zeroing.  Skip past the extent and
 517			 * carry on scanning the rest of the range
 518			 * for allocated blocks to zero.
 519			 */
520 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
521 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
522 continue;
523 }
524
525 /*
526 * There are blocks we need to zero.
527 * Drop the inode lock while we're doing the I/O.
528 * We'll still have the iolock to protect us.
529 */
530 xfs_iunlock(ip, XFS_ILOCK_EXCL);
531
532 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
533 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
534
535 if ((zero_off + zero_len) > offset)
536 zero_len = offset - zero_off;
537
538 error = xfs_iozero(ip, zero_off, zero_len);
539 if (error) {
540 goto out_lock;
541 }
542
543 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
544 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
545
546 xfs_ilock(ip, XFS_ILOCK_EXCL);
547 }
548
549 return 0;
550
551out_lock:
552 xfs_ilock(ip, XFS_ILOCK_EXCL);
553 ASSERT(error >= 0);
554 return error;
555}
556
557ssize_t /* bytes written, or (-) error */
558xfs_write(
559 struct xfs_inode *xip,
560 struct kiocb *iocb,
561 const struct iovec *iovp,
562 unsigned int nsegs,
563 loff_t *offset,
564 int ioflags)
565{
566 struct file *file = iocb->ki_filp;
567 struct address_space *mapping = file->f_mapping;
568 struct inode *inode = mapping->host;
569 unsigned long segs = nsegs;
570 xfs_mount_t *mp;
571 ssize_t ret = 0, error = 0;
572 xfs_fsize_t isize, new_size;
573 int iolock;
574 int eventsent = 0;
575 size_t ocount = 0, count;
576 loff_t pos;
577 int need_i_mutex;
578
579 XFS_STATS_INC(xs_write_calls);
580
581 error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
582 if (error)
583 return error;
584
585 count = ocount;
586 pos = *offset;
587
588 if (count == 0)
589 return 0;
590
591 mp = xip->i_mount;
592
593 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
594
595 if (XFS_FORCED_SHUTDOWN(mp))
596 return -EIO;
597
598relock:
599 if (ioflags & IO_ISDIRECT) {
600 iolock = XFS_IOLOCK_SHARED;
601 need_i_mutex = 0;
602 } else {
603 iolock = XFS_IOLOCK_EXCL;
604 need_i_mutex = 1;
605 mutex_lock(&inode->i_mutex);
606 }
607
608 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
609
610start:
611 error = -generic_write_checks(file, &pos, &count,
612 S_ISBLK(inode->i_mode));
613 if (error) {
614 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
615 goto out_unlock_mutex;
616 }
617
618 if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
619 !(ioflags & IO_INVIS) && !eventsent)) {
620 int dmflags = FILP_DELAY_FLAG(file);
621
622 if (need_i_mutex)
623 dmflags |= DM_FLAGS_IMUX;
624
625 xfs_iunlock(xip, XFS_ILOCK_EXCL);
626 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
627 pos, count, dmflags, &iolock);
628 if (error) {
629 goto out_unlock_internal;
630 }
631 xfs_ilock(xip, XFS_ILOCK_EXCL);
632 eventsent = 1;
633
634 /*
635 * The iolock was dropped and reacquired in XFS_SEND_DATA
636 * so we have to recheck the size when appending.
637 * We will only "goto start;" once, since having sent the
638 * event prevents another call to XFS_SEND_DATA, which is
639 * what allows the size to change in the first place.
640 */
641 if ((file->f_flags & O_APPEND) && pos != xip->i_size)
642 goto start;
643 }
644
645 if (ioflags & IO_ISDIRECT) {
646 xfs_buftarg_t *target =
647 XFS_IS_REALTIME_INODE(xip) ?
648 mp->m_rtdev_targp : mp->m_ddev_targp;
649
650 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
651 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
652 return XFS_ERROR(-EINVAL);
653 }
654
655 if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
656 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
657 iolock = XFS_IOLOCK_EXCL;
658 need_i_mutex = 1;
659 mutex_lock(&inode->i_mutex);
660 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
661 goto start;
662 }
663 }
664
665 new_size = pos + count;
666 if (new_size > xip->i_size)
667 xip->i_new_size = new_size;
668
669 if (likely(!(ioflags & IO_INVIS)))
670 file_update_time(file);
671
 672	/*
 673	 * If the write starts beyond the current size of the file, any
 674	 * already-allocated blocks between the old EOF and the write
 675	 * offset must be zeroed so stale on-disk data is never exposed.
 676	 *
 677	 * In particular, the partial block holding the old EOF must be
 678	 * zeroed out beyond the old size.
 679	 */
680
681 if (pos > xip->i_size) {
682 error = xfs_zero_eof(xip, pos, xip->i_size);
683 if (error) {
684 xfs_iunlock(xip, XFS_ILOCK_EXCL);
685 goto out_unlock_internal;
686 }
687 }
688 xfs_iunlock(xip, XFS_ILOCK_EXCL);
689
690 /*
691 * If we're writing the file then make sure to clear the
692 * setuid and setgid bits if the process is not being run
693 * by root. This keeps people from modifying setuid and
694 * setgid binaries.
695 */
696
697 if (((xip->i_d.di_mode & S_ISUID) ||
698 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
699 (S_ISGID | S_IXGRP))) &&
700 !capable(CAP_FSETID)) {
701 error = xfs_write_clear_setuid(xip);
702 if (likely(!error))
703 error = -file_remove_suid(file);
704 if (unlikely(error)) {
705 goto out_unlock_internal;
706 }
707 }
708
709 /* We can write back this queue in page reclaim */
710 current->backing_dev_info = mapping->backing_dev_info;
711
712 if ((ioflags & IO_ISDIRECT)) {
713 if (mapping->nrpages) {
714 WARN_ON(need_i_mutex == 0);
715 xfs_inval_cached_trace(xip, pos, -1,
716 (pos & PAGE_CACHE_MASK), -1);
717 error = xfs_flushinval_pages(xip,
718 (pos & PAGE_CACHE_MASK),
719 -1, FI_REMAPF_LOCKED);
720 if (error)
721 goto out_unlock_internal;
722 }
723
724 if (need_i_mutex) {
725 /* demote the lock now the cached pages are gone */
726 xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
727 mutex_unlock(&inode->i_mutex);
728
729 iolock = XFS_IOLOCK_SHARED;
730 need_i_mutex = 0;
731 }
732
733 xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs,
734 *offset, ioflags);
735 ret = generic_file_direct_write(iocb, iovp,
736 &segs, pos, offset, count, ocount);
737
738 /*
739 * direct-io write to a hole: fall through to buffered I/O
740 * for completing the rest of the request.
741 */
742 if (ret >= 0 && ret != count) {
743 XFS_STATS_ADD(xs_write_bytes, ret);
744
745 pos += ret;
746 count -= ret;
747
748 ioflags &= ~IO_ISDIRECT;
749 xfs_iunlock(xip, iolock);
750 goto relock;
751 }
752 } else {
753 int enospc = 0;
754 ssize_t ret2 = 0;
755
756write_retry:
757 xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
758 *offset, ioflags);
759 ret2 = generic_file_buffered_write(iocb, iovp, segs,
760 pos, offset, count, ret);
761 /*
 762		 * if we just got an ENOSPC, flush the inode now that we
 763		 * aren't holding any page locks, and retry *once*
764 */
765 if (ret2 == -ENOSPC && !enospc) {
766 error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
767 if (error)
768 goto out_unlock_internal;
769 enospc = 1;
770 goto write_retry;
771 }
772 ret = ret2;
773 }
774
775 current->backing_dev_info = NULL;
776
777 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
778 ret = wait_on_sync_kiocb(iocb);
779
780 isize = i_size_read(inode);
781 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
782 *offset = isize;
783
784 if (*offset > xip->i_size) {
785 xfs_ilock(xip, XFS_ILOCK_EXCL);
786 if (*offset > xip->i_size)
787 xip->i_size = *offset;
788 xfs_iunlock(xip, XFS_ILOCK_EXCL);
789 }
790
791 if (ret == -ENOSPC &&
792 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
793 xfs_iunlock(xip, iolock);
794 if (need_i_mutex)
795 mutex_unlock(&inode->i_mutex);
796 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
797 DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
798 0, 0, 0); /* Delay flag intentionally unused */
799 if (need_i_mutex)
800 mutex_lock(&inode->i_mutex);
801 xfs_ilock(xip, iolock);
802 if (error)
803 goto out_unlock_internal;
804 goto start;
805 }
806
807 error = -ret;
808 if (ret <= 0)
809 goto out_unlock_internal;
810
811 XFS_STATS_ADD(xs_write_bytes, ret);
812
813 /* Handle various SYNC-type writes */
814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
815 loff_t end = pos + ret - 1;
816 int error2;
817
818 xfs_iunlock(xip, iolock);
819 if (need_i_mutex)
820 mutex_unlock(&inode->i_mutex);
821
822 error2 = filemap_write_and_wait_range(mapping, pos, end);
823 if (!error)
824 error = error2;
825 if (need_i_mutex)
826 mutex_lock(&inode->i_mutex);
827 xfs_ilock(xip, iolock);
828
829 error2 = xfs_fsync(xip);
830 if (!error)
831 error = error2;
832 }
833
834 out_unlock_internal:
835 if (xip->i_new_size) {
836 xfs_ilock(xip, XFS_ILOCK_EXCL);
837 xip->i_new_size = 0;
838 /*
839 * If this was a direct or synchronous I/O that failed (such
840 * as ENOSPC) then part of the I/O may have been written to
 841	 * disk before the error occurred. In this case the on-disk
842 * file size may have been adjusted beyond the in-memory file
843 * size and now needs to be truncated back.
844 */
845 if (xip->i_d.di_size > xip->i_size)
846 xip->i_d.di_size = xip->i_size;
847 xfs_iunlock(xip, XFS_ILOCK_EXCL);
848 }
849 xfs_iunlock(xip, iolock);
850 out_unlock_mutex:
851 if (need_i_mutex)
852 mutex_unlock(&inode->i_mutex);
853 return -error;
854}
855
856/*
857 * All xfs metadata buffers except log state machine buffers
858 * get this attached as their b_bdstrat callback function.
859 * This is so that we can catch a buffer
 860 * after prematurely unpinning it to forcibly shut down the filesystem.
861 */
862int
863xfs_bdstrat_cb(struct xfs_buf *bp)
864{
865 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
866 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
867 /*
 868		 * Metadata write that didn't get logged but
 869		 * was written out delayed anyway. These aren't
 870		 * associated with a transaction and can be ignored.
871 */
872 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
873 (XFS_BUF_ISREAD(bp)) == 0)
874 return (xfs_bioerror_relse(bp));
875 else
876 return (xfs_bioerror(bp));
877 }
878
879 xfs_buf_iorequest(bp);
880 return 0;
881}
882
883/*
884 * Wrapper around bdstrat so that we can stop data from going to disk in case
 885 * we are shutting down the filesystem. Typically user data goes through this
886 * path; one of the exceptions is the superblock.
887 */
888void
889xfsbdstrat(
890 struct xfs_mount *mp,
891 struct xfs_buf *bp)
892{
893 ASSERT(mp);
894 if (!XFS_FORCED_SHUTDOWN(mp)) {
895 xfs_buf_iorequest(bp);
896 return;
897 }
898
899 xfs_buftrace("XFSBDSTRAT IOERROR", bp);
900 xfs_bioerror_relse(bp);
901}
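
Both xfs_bdstrat_cb and xfsbdstrat are shutdown guards: once the filesystem is force-shut-down, buffers are failed immediately rather than sent to a device whose state can no longer be trusted. The skeleton of the pattern (struct names and helpers here are stand-ins):

struct fsdev { int shut_down; };
struct iobuf;

void fail_buf(struct iobuf *bp);	/* stand-in for xfs_bioerror_relse */
void start_io(struct iobuf *bp);	/* stand-in for xfs_buf_iorequest */

static void submit_buf(struct fsdev *fs, struct iobuf *bp)
{
	if (fs->shut_down) {
		fail_buf(bp);	/* error the buffer, never touch the disk */
		return;
	}
	start_io(bp);
}
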
902
903/*
904 * If the underlying (data/log/rt) device is readonly, there are some
905 * operations that cannot proceed.
906 */
907int
908xfs_dev_is_read_only(
909 xfs_mount_t *mp,
910 char *message)
911{
912 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
913 xfs_readonly_buftarg(mp->m_logdev_targp) ||
914 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
915 cmn_err(CE_NOTE,
916 "XFS: %s required on read-only device.", message);
917 cmn_err(CE_NOTE,
918 "XFS: write access unavailable, cannot proceed.");
919 return EROFS;
920 }
921 return 0;
922}
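
Callers are expected to use xfs_dev_is_read_only as an early bailout before operations that must write to the data, log, or realtime device. A hypothetical caller (the function name is invented for illustration):

/* Hypothetical caller sketch; note the positive-errno convention. */
int xfs_grow_something(xfs_mount_t *mp)
{
	int error = xfs_dev_is_read_only(mp, "grow");

	if (error)
		return error;	/* EROFS, positive per XFS convention */

	/* ... safe to proceed with on-disk modifications ... */
	return 0;
}
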
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
deleted file mode 100644
index e6be37dbd0e9..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ /dev/null
@@ -1,77 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_LRW_H__
19#define __XFS_LRW_H__
20
21struct xfs_mount;
22struct xfs_inode;
23struct xfs_bmbt_irec;
24struct xfs_buf;
25struct xfs_iomap;
26
27#if defined(XFS_RW_TRACE)
28/*
29 * Defines for the trace mechanisms in xfs_lrw.c.
30 */
31#define XFS_RW_KTRACE_SIZE 128
32
33#define XFS_READ_ENTER 1
34#define XFS_WRITE_ENTER 2
35#define XFS_IOMAP_READ_ENTER 3
36#define XFS_IOMAP_WRITE_ENTER 4
37#define XFS_IOMAP_READ_MAP 5
38#define XFS_IOMAP_WRITE_MAP 6
39#define XFS_IOMAP_WRITE_NOSPACE 7
40#define XFS_ITRUNC_START 8
41#define XFS_ITRUNC_FINISH1 9
42#define XFS_ITRUNC_FINISH2 10
43#define XFS_CTRUNC1 11
44#define XFS_CTRUNC2 12
45#define XFS_CTRUNC3 13
46#define XFS_CTRUNC4 14
47#define XFS_CTRUNC5 15
48#define XFS_CTRUNC6 16
49#define XFS_BUNMAP 17
50#define XFS_INVAL_CACHED 18
51#define XFS_DIORD_ENTER 19
52#define XFS_DIOWR_ENTER 20
53#define XFS_WRITEPAGE_ENTER 22
54#define XFS_RELEASEPAGE_ENTER 23
55#define XFS_INVALIDPAGE_ENTER 24
56#define XFS_IOMAP_ALLOC_ENTER 25
57#define XFS_IOMAP_ALLOC_MAP 26
58#define XFS_IOMAP_UNWRITTEN 27
59#define XFS_SPLICE_READ_ENTER 28
60#define XFS_SPLICE_WRITE_ENTER 29
61extern void xfs_rw_enter_trace(int, struct xfs_inode *,
62 void *, size_t, loff_t, int);
63extern void xfs_inval_cached_trace(struct xfs_inode *,
64 xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t);
65#else
66#define xfs_rw_enter_trace(tag, ip, data, size, offset, ioflags)
67#define xfs_inval_cached_trace(ip, offset, len, first, last)
68#endif
69
70/* errors from xfsbdstrat() must be extracted from the buffer */
71extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
72extern int xfs_bdstrat_cb(struct xfs_buf *);
73extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
74
75extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
76
77#endif /* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 3d4a0c84d634..1947514ce1ad 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -44,20 +44,6 @@ xfs_quota_type(int type)
44} 44}
45 45
46STATIC int 46STATIC int
47xfs_fs_quota_sync(
48 struct super_block *sb,
49 int type)
50{
51 struct xfs_mount *mp = XFS_M(sb);
52
53 if (sb->s_flags & MS_RDONLY)
54 return -EROFS;
55 if (!XFS_IS_QUOTA_RUNNING(mp))
56 return -ENOSYS;
57 return -xfs_sync_data(mp, 0);
58}
59
60STATIC int
61xfs_fs_get_xstate( 47xfs_fs_get_xstate(
62 struct super_block *sb, 48 struct super_block *sb,
63 struct fs_quota_stat *fqs) 49 struct fs_quota_stat *fqs)
@@ -82,8 +68,6 @@ xfs_fs_set_xstate(
82 return -EROFS; 68 return -EROFS;
83 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) 69 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
84 return -ENOSYS; 70 return -ENOSYS;
85 if (!capable(CAP_SYS_ADMIN))
86 return -EPERM;
87 71
88 if (uflags & XFS_QUOTA_UDQ_ACCT) 72 if (uflags & XFS_QUOTA_UDQ_ACCT)
89 flags |= XFS_UQUOTA_ACCT; 73 flags |= XFS_UQUOTA_ACCT;
@@ -144,14 +128,11 @@ xfs_fs_set_xquota(
144 return -ENOSYS; 128 return -ENOSYS;
145 if (!XFS_IS_QUOTA_ON(mp)) 129 if (!XFS_IS_QUOTA_ON(mp))
146 return -ESRCH; 130 return -ESRCH;
147 if (!capable(CAP_SYS_ADMIN))
148 return -EPERM;
149 131
150 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); 132 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
151} 133}
152 134
153const struct quotactl_ops xfs_quotactl_operations = { 135const struct quotactl_ops xfs_quotactl_operations = {
154 .quota_sync = xfs_fs_quota_sync,
155 .get_xstate = xfs_fs_get_xstate, 136 .get_xstate = xfs_fs_get_xstate,
156 .set_xstate = xfs_fs_set_xstate, 137 .set_xstate = xfs_fs_set_xstate,
157 .get_xquota = xfs_fs_get_xquota, 138 .get_xquota = xfs_fs_get_xquota,
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 18a4b8e11df2..52e06b487ced 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -15,6 +15,7 @@
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18
18#include "xfs.h" 19#include "xfs.h"
19#include "xfs_bit.h" 20#include "xfs_bit.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
@@ -52,14 +53,15 @@
52#include "xfs_trans_priv.h" 53#include "xfs_trans_priv.h"
53#include "xfs_filestream.h" 54#include "xfs_filestream.h"
54#include "xfs_da_btree.h" 55#include "xfs_da_btree.h"
55#include "xfs_dir2_trace.h"
56#include "xfs_extfree_item.h" 56#include "xfs_extfree_item.h"
57#include "xfs_mru_cache.h" 57#include "xfs_mru_cache.h"
58#include "xfs_inode_item.h" 58#include "xfs_inode_item.h"
59#include "xfs_sync.h" 59#include "xfs_sync.h"
60#include "xfs_trace.h"
60 61
61#include <linux/namei.h> 62#include <linux/namei.h>
62#include <linux/init.h> 63#include <linux/init.h>
64#include <linux/slab.h>
63#include <linux/mount.h> 65#include <linux/mount.h>
64#include <linux/mempool.h> 66#include <linux/mempool.h>
65#include <linux/writeback.h> 67#include <linux/writeback.h>
@@ -876,12 +878,11 @@ xfsaild(
876{ 878{
877 struct xfs_ail *ailp = data; 879 struct xfs_ail *ailp = data;
878 xfs_lsn_t last_pushed_lsn = 0; 880 xfs_lsn_t last_pushed_lsn = 0;
879 long tout = 0; 881 long tout = 0; /* milliseconds */
880 882
881 while (!kthread_should_stop()) { 883 while (!kthread_should_stop()) {
882 if (tout) 884 schedule_timeout_interruptible(tout ?
883 schedule_timeout_interruptible(msecs_to_jiffies(tout)); 885 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
884 tout = 1000;
885 886
886 /* swsusp */ 887 /* swsusp */
887 try_to_freeze(); 888 try_to_freeze();
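
The xfsaild hunk above changes the thread from a fixed one-second poll to "sleep for the timeout the last push computed, or indefinitely (MAX_SCHEDULE_TIMEOUT) when there is nothing to push", so an idle AIL thread no longer wakes up needlessly. Condensed, using the same kernel APIs shown in the hunk (do_push is a stand-in for the push routine):

/* Condensed form of the loop; kernel context assumed. */
static long do_push(void *data);	/* returns ms to next push, 0 = idle */

static int aild_loop(void *data)
{
	long tout = 0;	/* ms until the next push; 0 means fully idle */

	while (!kthread_should_stop()) {
		schedule_timeout_interruptible(tout ?
				msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
		try_to_freeze();	/* cooperate with suspend */
		tout = do_push(data);
	}
	return 0;
}
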
@@ -930,13 +931,37 @@ xfs_fs_alloc_inode(
930 */ 931 */
931STATIC void 932STATIC void
932xfs_fs_destroy_inode( 933xfs_fs_destroy_inode(
933 struct inode *inode) 934 struct inode *inode)
934{ 935{
935 xfs_inode_t *ip = XFS_I(inode); 936 struct xfs_inode *ip = XFS_I(inode);
937
938 xfs_itrace_entry(ip);
936 939
937 XFS_STATS_INC(vn_reclaim); 940 XFS_STATS_INC(vn_reclaim);
938 if (xfs_reclaim(ip)) 941
939 panic("%s: cannot reclaim 0x%p\n", __func__, inode); 942 /* bad inode, get out here ASAP */
943 if (is_bad_inode(inode))
944 goto out_reclaim;
945
946 xfs_ioend_wait(ip);
947
948 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
949
950 /*
951 * We should never get here with one of the reclaim flags already set.
952 */
953 ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
954 ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
955
956 /*
957 * We always use background reclaim here because even if the
958 * inode is clean, it still may be under IO and hence we have
959 * to take the flush lock. The background reclaim path handles
960 * this more efficiently than we can here, so simply let background
961 * reclaim tear down all inodes.
962 */
963out_reclaim:
964 xfs_inode_set_reclaim_tag(ip);
940} 965}
941 966
942/* 967/*
@@ -973,7 +998,6 @@ xfs_fs_inode_init_once(
973 998
974 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, 999 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
975 "xfsino", ip->i_ino); 1000 "xfsino", ip->i_ino);
976 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
977} 1001}
978 1002
979/* 1003/*
@@ -998,59 +1022,108 @@ xfs_fs_dirty_inode(
998 XFS_I(inode)->i_update_core = 1; 1022 XFS_I(inode)->i_update_core = 1;
999} 1023}
1000 1024
1001/* 1025STATIC int
1002 * Attempt to flush the inode, this will actually fail 1026xfs_log_inode(
1003 * if the inode is pinned, but we dirty the inode again 1027 struct xfs_inode *ip)
1004 * at the point when it is unpinned after a log write, 1028{
1005 * since this is when the inode itself becomes flushable. 1029 struct xfs_mount *mp = ip->i_mount;
1006 */ 1030 struct xfs_trans *tp;
1031 int error;
1032
1033 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1034 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1035 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
1036
1037 if (error) {
1038 xfs_trans_cancel(tp, 0);
 1039		/* we need to return with the lock held shared */
1040 xfs_ilock(ip, XFS_ILOCK_SHARED);
1041 return error;
1042 }
1043
1044 xfs_ilock(ip, XFS_ILOCK_EXCL);
1045
1046 /*
1047 * Note - it's possible that we might have pushed ourselves out of the
1048 * way during trans_reserve which would flush the inode. But there's
1049 * no guarantee that the inode buffer has actually gone out yet (it's
1050 * delwri). Plus the buffer could be pinned anyway if it's part of
1051 * an inode in another recent transaction. So we play it safe and
1052 * fire off the transaction anyway.
1053 */
1054 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1055 xfs_trans_ihold(tp, ip);
1056 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1057 xfs_trans_set_sync(tp);
1058 error = xfs_trans_commit(tp, 0);
1059 xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
1060
1061 return error;
1062}
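
Worth noting in xfs_log_inode above: the function enters and leaves with the ilock held shared, but cannot hold it across the (potentially blocking) transaction reservation, so it drops the lock, reserves, retakes it exclusive for the join and commit, then demotes back to shared. Outlined with stub lock operations standing in for xfs_ilock()/xfs_iunlock()/xfs_ilock_demote():

struct ilk;
static void unlock_shared(struct ilk *l)    { (void)l; }
static void lock_shared(struct ilk *l)      { (void)l; }
static void lock_excl(struct ilk *l)        { (void)l; }
static void demote_to_shared(struct ilk *l) { (void)l; }
static int reserve_transaction(void)        { return 0; }

static int log_inode_outline(struct ilk *l)
{
	unlock_shared(l);		/* reservation may block */
	if (reserve_transaction() != 0) {
		lock_shared(l);		/* restore caller's lock state */
		return -1;
	}
	lock_excl(l);
	/* ... join the inode and commit a synchronous transaction ... */
	demote_to_shared(l);		/* caller still sees it shared */
	return 0;
}
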
1063
1007STATIC int 1064STATIC int
1008xfs_fs_write_inode( 1065xfs_fs_write_inode(
1009 struct inode *inode, 1066 struct inode *inode,
1010 int sync) 1067 struct writeback_control *wbc)
1011{ 1068{
1012 struct xfs_inode *ip = XFS_I(inode); 1069 struct xfs_inode *ip = XFS_I(inode);
1013 struct xfs_mount *mp = ip->i_mount; 1070 struct xfs_mount *mp = ip->i_mount;
1014 int error = 0; 1071 int error = EAGAIN;
1015 1072
1016 xfs_itrace_entry(ip); 1073 xfs_itrace_entry(ip);
1017 1074
1018 if (XFS_FORCED_SHUTDOWN(mp)) 1075 if (XFS_FORCED_SHUTDOWN(mp))
1019 return XFS_ERROR(EIO); 1076 return XFS_ERROR(EIO);
1020 1077
1021 if (sync) { 1078 if (wbc->sync_mode == WB_SYNC_ALL) {
1022 error = xfs_wait_on_pages(ip, 0, -1); 1079 /*
1023 if (error) 1080 * Make sure the inode has hit stable storage. By using the
1081 * log and the fsync transactions we reduce the IOs we have
1082 * to do here from two (log and inode) to just the log.
1083 *
1084 * Note: We still need to do a delwri write of the inode after
1085 * this to flush it to the backing buffer so that bulkstat
1086 * works properly if this is the first time the inode has been
1087 * written. Because we hold the ilock atomically over the
1088 * transaction commit and the inode flush we are guaranteed
1089 * that the inode is not pinned when it returns. If the flush
1090 * lock is already held, then the inode has already been
1091 * flushed once and we don't need to flush it again. Hence
1092 * the code will only flush the inode if it isn't already
1093 * being flushed.
1094 */
1095 xfs_ilock(ip, XFS_ILOCK_SHARED);
1096 if (ip->i_update_core) {
1097 error = xfs_log_inode(ip);
1098 if (error)
1099 goto out_unlock;
1100 }
1101 } else {
1102 /*
 1104 * We make this non-blocking if the inode is contended, returning
 1105 * EAGAIN to indicate to the caller that they did not succeed.
 1106 * This prevents the flush path from blocking on inodes inside
 1107 * another operation right now; they get caught later by xfs_sync.
1107 */
1108 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1024 goto out; 1109 goto out;
1025 } 1110 }
1026 1111
1027 /* 1112 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1028 * Bypass inodes which have already been cleaned by 1113 goto out_unlock;
1029 * the inode flush clustering code inside xfs_iflush
1030 */
1031 if (xfs_inode_clean(ip))
1032 goto out;
1033 1114
1034 /* 1115 /*
1035 * We make this non-blocking if the inode is contended, return 1116 * Now we have the flush lock and the inode is not pinned, we can check
1036 * EAGAIN to indicate to the caller that they did not succeed. 1117 * if the inode is really clean as we know that there are no pending
1037 * This prevents the flush path from blocking on inodes inside 1118 * transaction completions, it is not waiting on the delayed write
1038 * another operation right now, they get caught later by xfs_sync. 1119 * queue and there is no IO in progress.
1039 */ 1120 */
1040 if (sync) { 1121 if (xfs_inode_clean(ip)) {
1041 xfs_ilock(ip, XFS_ILOCK_SHARED); 1122 xfs_ifunlock(ip);
1042 xfs_iflock(ip); 1123 error = 0;
1043 1124 goto out_unlock;
1044 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
1045 } else {
1046 error = EAGAIN;
1047 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1048 goto out;
1049 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1050 goto out_unlock;
1051
1052 error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
1053 } 1125 }
1126 error = xfs_iflush(ip, 0);
1054 1127
1055 out_unlock: 1128 out_unlock:
1056 xfs_iunlock(ip, XFS_ILOCK_SHARED); 1129 xfs_iunlock(ip, XFS_ILOCK_SHARED);
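Note how the rewritten xfs_fs_write_inode initialises error to EAGAIN and only clears it once it actually owns the inode: in the non-waiting case every lock is a trylock, so periodic writeback never parks behind an inode that another operation holds. The convention in miniature, with made-up names:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t flush_lock = PTHREAD_MUTEX_INITIALIZER;

/* Non-blocking flush: EAGAIN means "busy, try me again later". */
static int write_inode_nonblocking(void)
{
	int error = EAGAIN;

	if (pthread_mutex_trylock(&ilock) != 0)
		goto out;
	if (pthread_mutex_trylock(&flush_lock) != 0)
		goto out_unlock;

	/* ... inode is ours: flush it ... */
	error = 0;

	pthread_mutex_unlock(&flush_lock);
out_unlock:
	pthread_mutex_unlock(&ilock);
out:
	return error;
}

int main(void)
{
	printf("flush returned %d\n", write_inode_nonblocking());
	return 0;
}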
@@ -1075,6 +1148,20 @@ xfs_fs_clear_inode(
1075 XFS_STATS_INC(vn_remove); 1148 XFS_STATS_INC(vn_remove);
1076 XFS_STATS_DEC(vn_active); 1149 XFS_STATS_DEC(vn_active);
1077 1150
1151 /*
1152 * The iolock is used by the file system to coordinate reads,
1153 * writes, and block truncates. Up to this point the lock
1154 * protected concurrent accesses by users of the inode. But
1155 * from here forward we're doing some final processing of the
1156 * inode because we're done with it, and although we reuse the
1157 * iolock for protection it is really a distinct lock class
1158 * (in the lockdep sense) from before. To keep lockdep happy
1159 * (and basically indicate what we are doing), we explicitly
1160 * re-init the iolock here.
1161 */
1162 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1163 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1164
1078 xfs_inactive(ip); 1165 xfs_inactive(ip);
1079} 1166}
1080 1167
@@ -1092,8 +1179,6 @@ xfs_fs_put_super(
1092 struct super_block *sb) 1179 struct super_block *sb)
1093{ 1180{
1094 struct xfs_mount *mp = XFS_M(sb); 1181 struct xfs_mount *mp = XFS_M(sb);
1095 struct xfs_inode *rip = mp->m_rootip;
1096 int unmount_event_flags = 0;
1097 1182
1098 xfs_syncd_stop(mp); 1183 xfs_syncd_stop(mp);
1099 1184
@@ -1109,20 +1194,7 @@ xfs_fs_put_super(
1109 xfs_sync_attr(mp, 0); 1194 xfs_sync_attr(mp, 0);
1110 } 1195 }
1111 1196
1112#ifdef HAVE_DMAPI 1197 XFS_SEND_PREUNMOUNT(mp);
1113 if (mp->m_flags & XFS_MOUNT_DMAPI) {
1114 unmount_event_flags =
1115 (mp->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ?
1116 0 : DM_FLAGS_UNWANTED;
1117 /*
1118 * Ignore error from dmapi here, first unmount is not allowed
1119 * to fail anyway, and second we wouldn't want to fail a
1120 * unmount because of dmapi.
1121 */
1122 XFS_SEND_PREUNMOUNT(mp, rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
1123 NULL, NULL, 0, 0, unmount_event_flags);
1124 }
1125#endif
1126 1198
1127 /* 1199 /*
1128 * Blow away any referenced inode in the filestreams cache. 1200 * Blow away any referenced inode in the filestreams cache.
@@ -1133,10 +1205,7 @@ xfs_fs_put_super(
1133 1205
1134 XFS_bflush(mp->m_ddev_targp); 1206 XFS_bflush(mp->m_ddev_targp);
1135 1207
1136 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1208 XFS_SEND_UNMOUNT(mp);
1137 XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
1138 unmount_event_flags);
1139 }
1140 1209
1141 xfs_unmountfs(mp); 1210 xfs_unmountfs(mp);
1142 xfs_freesb(mp); 1211 xfs_freesb(mp);
@@ -1237,6 +1306,29 @@ xfs_fs_statfs(
1237 return 0; 1306 return 0;
1238} 1307}
1239 1308
1309STATIC void
1310xfs_save_resvblks(struct xfs_mount *mp)
1311{
1312 __uint64_t resblks = 0;
1313
1314 mp->m_resblks_save = mp->m_resblks;
1315 xfs_reserve_blocks(mp, &resblks, NULL);
1316}
1317
1318STATIC void
1319xfs_restore_resvblks(struct xfs_mount *mp)
1320{
1321 __uint64_t resblks;
1322
1323 if (mp->m_resblks_save) {
1324 resblks = mp->m_resblks_save;
1325 mp->m_resblks_save = 0;
1326 } else
1327 resblks = xfs_default_resblks(mp);
1328
1329 xfs_reserve_blocks(mp, &resblks, NULL);
1330}
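The two helpers just introduced form a stash-and-restore pair: emptying the pool records its size in m_resblks_save, and refilling prefers the stash over the computed default. Stripped of the mount structure, the pattern is simply (hypothetical names and default value):

#include <stdio.h>

static unsigned long long resblks;	/* current reserve pool */
static unsigned long long resblks_save;	/* stashed size, 0 = none */

static unsigned long long default_resblks(void)
{
	return 8192;	/* stand-in for the filesystem-derived default */
}

static void save_resvblks(void)
{
	resblks_save = resblks;
	resblks = 0;		/* empty the pool, remember its size */
}

static void restore_resvblks(void)
{
	if (resblks_save) {
		resblks = resblks_save;
		resblks_save = 0;
	} else {
		resblks = default_resblks();
	}
}

int main(void)
{
	resblks = 1024;
	save_resvblks();	/* e.g. freeze, or rw -> ro remount */
	restore_resvblks();	/* e.g. unfreeze, or ro -> rw remount */
	printf("pool back to %llu blocks\n", resblks);
	return 0;
}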
1331
1240STATIC int 1332STATIC int
1241xfs_fs_remount( 1333xfs_fs_remount(
1242 struct super_block *sb, 1334 struct super_block *sb,
@@ -1316,11 +1408,27 @@ xfs_fs_remount(
1316 } 1408 }
1317 mp->m_update_flags = 0; 1409 mp->m_update_flags = 0;
1318 } 1410 }
1411
1412 /*
1413 * Fill out the reserve pool if it is empty. Use the stashed
1414 * value if it is non-zero, otherwise go with the default.
1415 */
1416 xfs_restore_resvblks(mp);
1319 } 1417 }
1320 1418
1321 /* rw -> ro */ 1419 /* rw -> ro */
1322 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1420 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1421 /*
1422 * After we have synced the data but before we sync the
1423 * metadata, we need to free up the reserve block pool so that
1424 * the used block count in the superblock on disk is correct at
1425 * the end of the remount. Stash the current reserve pool size
1426 * so that if we get remounted rw, we can return it to the same
1427 * size.
1428 */
1429
1323 xfs_quiesce_data(mp); 1430 xfs_quiesce_data(mp);
1431 xfs_save_resvblks(mp);
1324 xfs_quiesce_attr(mp); 1432 xfs_quiesce_attr(mp);
1325 mp->m_flags |= XFS_MOUNT_RDONLY; 1433 mp->m_flags |= XFS_MOUNT_RDONLY;
1326 } 1434 }
@@ -1339,11 +1447,22 @@ xfs_fs_freeze(
1339{ 1447{
1340 struct xfs_mount *mp = XFS_M(sb); 1448 struct xfs_mount *mp = XFS_M(sb);
1341 1449
1450 xfs_save_resvblks(mp);
1342 xfs_quiesce_attr(mp); 1451 xfs_quiesce_attr(mp);
1343 return -xfs_fs_log_dummy(mp); 1452 return -xfs_fs_log_dummy(mp);
1344} 1453}
1345 1454
1346STATIC int 1455STATIC int
1456xfs_fs_unfreeze(
1457 struct super_block *sb)
1458{
1459 struct xfs_mount *mp = XFS_M(sb);
1460
1461 xfs_restore_resvblks(mp);
1462 return 0;
1463}
1464
1465STATIC int
1347xfs_fs_show_options( 1466xfs_fs_show_options(
1348 struct seq_file *m, 1467 struct seq_file *m,
1349 struct vfsmount *mnt) 1468 struct vfsmount *mnt)
@@ -1504,8 +1623,6 @@ xfs_fs_fill_super(
1504 goto fail_vnrele; 1623 goto fail_vnrele;
1505 1624
1506 kfree(mtpt); 1625 kfree(mtpt);
1507
1508 xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
1509 return 0; 1626 return 0;
1510 1627
1511 out_filestream_unmount: 1628 out_filestream_unmount:
@@ -1567,6 +1684,7 @@ static const struct super_operations xfs_super_operations = {
1567 .put_super = xfs_fs_put_super, 1684 .put_super = xfs_fs_put_super,
1568 .sync_fs = xfs_fs_sync_fs, 1685 .sync_fs = xfs_fs_sync_fs,
1569 .freeze_fs = xfs_fs_freeze, 1686 .freeze_fs = xfs_fs_freeze,
1687 .unfreeze_fs = xfs_fs_unfreeze,
1570 .statfs = xfs_fs_statfs, 1688 .statfs = xfs_fs_statfs,
1571 .remount_fs = xfs_fs_remount, 1689 .remount_fs = xfs_fs_remount,
1572 .show_options = xfs_fs_show_options, 1690 .show_options = xfs_fs_show_options,
@@ -1581,94 +1699,6 @@ static struct file_system_type xfs_fs_type = {
1581}; 1699};
1582 1700
1583STATIC int __init 1701STATIC int __init
1584xfs_alloc_trace_bufs(void)
1585{
1586#ifdef XFS_ALLOC_TRACE
1587 xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_MAYFAIL);
1588 if (!xfs_alloc_trace_buf)
1589 goto out;
1590#endif
1591#ifdef XFS_BMAP_TRACE
1592 xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_MAYFAIL);
1593 if (!xfs_bmap_trace_buf)
1594 goto out_free_alloc_trace;
1595#endif
1596#ifdef XFS_BTREE_TRACE
1597 xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE,
1598 KM_MAYFAIL);
1599 if (!xfs_allocbt_trace_buf)
1600 goto out_free_bmap_trace;
1601
1602 xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL);
1603 if (!xfs_inobt_trace_buf)
1604 goto out_free_allocbt_trace;
1605
1606 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
1607 if (!xfs_bmbt_trace_buf)
1608 goto out_free_inobt_trace;
1609#endif
1610#ifdef XFS_ATTR_TRACE
1611 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
1612 if (!xfs_attr_trace_buf)
1613 goto out_free_bmbt_trace;
1614#endif
1615#ifdef XFS_DIR2_TRACE
1616 xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_MAYFAIL);
1617 if (!xfs_dir2_trace_buf)
1618 goto out_free_attr_trace;
1619#endif
1620
1621 return 0;
1622
1623#ifdef XFS_DIR2_TRACE
1624 out_free_attr_trace:
1625#endif
1626#ifdef XFS_ATTR_TRACE
1627 ktrace_free(xfs_attr_trace_buf);
1628 out_free_bmbt_trace:
1629#endif
1630#ifdef XFS_BTREE_TRACE
1631 ktrace_free(xfs_bmbt_trace_buf);
1632 out_free_inobt_trace:
1633 ktrace_free(xfs_inobt_trace_buf);
1634 out_free_allocbt_trace:
1635 ktrace_free(xfs_allocbt_trace_buf);
1636 out_free_bmap_trace:
1637#endif
1638#ifdef XFS_BMAP_TRACE
1639 ktrace_free(xfs_bmap_trace_buf);
1640 out_free_alloc_trace:
1641#endif
1642#ifdef XFS_ALLOC_TRACE
1643 ktrace_free(xfs_alloc_trace_buf);
1644 out:
1645#endif
1646 return -ENOMEM;
1647}
1648
1649STATIC void
1650xfs_free_trace_bufs(void)
1651{
1652#ifdef XFS_DIR2_TRACE
1653 ktrace_free(xfs_dir2_trace_buf);
1654#endif
1655#ifdef XFS_ATTR_TRACE
1656 ktrace_free(xfs_attr_trace_buf);
1657#endif
1658#ifdef XFS_BTREE_TRACE
1659 ktrace_free(xfs_bmbt_trace_buf);
1660 ktrace_free(xfs_inobt_trace_buf);
1661 ktrace_free(xfs_allocbt_trace_buf);
1662#endif
1663#ifdef XFS_BMAP_TRACE
1664 ktrace_free(xfs_bmap_trace_buf);
1665#endif
1666#ifdef XFS_ALLOC_TRACE
1667 ktrace_free(xfs_alloc_trace_buf);
1668#endif
1669}
1670
1671STATIC int __init
1672xfs_init_zones(void) 1702xfs_init_zones(void)
1673{ 1703{
1674 1704
@@ -1809,7 +1839,6 @@ init_xfs_fs(void)
1809 printk(KERN_INFO XFS_VERSION_STRING " with " 1839 printk(KERN_INFO XFS_VERSION_STRING " with "
1810 XFS_BUILD_OPTIONS " enabled\n"); 1840 XFS_BUILD_OPTIONS " enabled\n");
1811 1841
1812 ktrace_init(64);
1813 xfs_ioend_init(); 1842 xfs_ioend_init();
1814 xfs_dir_startup(); 1843 xfs_dir_startup();
1815 1844
@@ -1817,13 +1846,9 @@ init_xfs_fs(void)
1817 if (error) 1846 if (error)
1818 goto out; 1847 goto out;
1819 1848
1820 error = xfs_alloc_trace_bufs();
1821 if (error)
1822 goto out_destroy_zones;
1823
1824 error = xfs_mru_cache_init(); 1849 error = xfs_mru_cache_init();
1825 if (error) 1850 if (error)
1826 goto out_free_trace_buffers; 1851 goto out_destroy_zones;
1827 1852
1828 error = xfs_filestream_init(); 1853 error = xfs_filestream_init();
1829 if (error) 1854 if (error)
@@ -1858,8 +1883,6 @@ init_xfs_fs(void)
1858 xfs_filestream_uninit(); 1883 xfs_filestream_uninit();
1859 out_mru_cache_uninit: 1884 out_mru_cache_uninit:
1860 xfs_mru_cache_uninit(); 1885 xfs_mru_cache_uninit();
1861 out_free_trace_buffers:
1862 xfs_free_trace_bufs();
1863 out_destroy_zones: 1886 out_destroy_zones:
1864 xfs_destroy_zones(); 1887 xfs_destroy_zones();
1865 out: 1888 out:
@@ -1876,9 +1899,7 @@ exit_xfs_fs(void)
1876 xfs_buf_terminate(); 1899 xfs_buf_terminate();
1877 xfs_filestream_uninit(); 1900 xfs_filestream_uninit();
1878 xfs_mru_cache_uninit(); 1901 xfs_mru_cache_uninit();
1879 xfs_free_trace_bufs();
1880 xfs_destroy_zones(); 1902 xfs_destroy_zones();
1881 ktrace_uninit();
1882} 1903}
1883 1904
1884module_init(init_xfs_fs); 1905module_init(init_xfs_fs);
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 18175ebd58ed..233d4b9881b1 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -56,12 +56,6 @@ extern void xfs_qm_exit(void);
56# define XFS_BIGFS_STRING 56# define XFS_BIGFS_STRING
57#endif 57#endif
58 58
59#ifdef CONFIG_XFS_TRACE
60# define XFS_TRACE_STRING "tracing, "
61#else
62# define XFS_TRACE_STRING
63#endif
64
65#ifdef CONFIG_XFS_DMAPI 59#ifdef CONFIG_XFS_DMAPI
66# define XFS_DMAPI_STRING "dmapi support, " 60# define XFS_DMAPI_STRING "dmapi support, "
67#else 61#else
@@ -78,7 +72,6 @@ extern void xfs_qm_exit(void);
78 XFS_SECURITY_STRING \ 72 XFS_SECURITY_STRING \
79 XFS_REALTIME_STRING \ 73 XFS_REALTIME_STRING \
80 XFS_BIGFS_STRING \ 74 XFS_BIGFS_STRING \
81 XFS_TRACE_STRING \
82 XFS_DMAPI_STRING \ 75 XFS_DMAPI_STRING \
83 XFS_DBG_STRING /* DBG must be last */ 76 XFS_DBG_STRING /* DBG must be last */
84 77
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 961df0a22c78..05cd85317f6f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -44,6 +44,7 @@
44#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
45#include "xfs_rw.h" 45#include "xfs_rw.h"
46#include "xfs_quota.h" 46#include "xfs_quota.h"
47#include "xfs_trace.h"
47 48
48#include <linux/kthread.h> 49#include <linux/kthread.h>
49#include <linux/freezer.h> 50#include <linux/freezer.h>
@@ -64,7 +65,6 @@ xfs_inode_ag_lookup(
64 * as the tree is sparse and a gang lookup walks to find 65 * as the tree is sparse and a gang lookup walks to find
65 * the number of objects requested. 66 * the number of objects requested.
66 */ 67 */
67 read_lock(&pag->pag_ici_lock);
68 if (tag == XFS_ICI_NO_TAG) { 68 if (tag == XFS_ICI_NO_TAG) {
69 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 69 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
70 (void **)&ip, *first_index, 1); 70 (void **)&ip, *first_index, 1);
@@ -73,7 +73,7 @@ xfs_inode_ag_lookup(
73 (void **)&ip, *first_index, 1, tag); 73 (void **)&ip, *first_index, 1, tag);
74 } 74 }
75 if (!nr_found) 75 if (!nr_found)
76 goto unlock; 76 return NULL;
77 77
78 /* 78 /*
79 * Update the index for the next lookup. Catch overflows 79 * Update the index for the next lookup. Catch overflows
@@ -83,25 +83,20 @@ xfs_inode_ag_lookup(
83 */ 83 */
84 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 84 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
85 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 85 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
86 goto unlock; 86 return NULL;
87
88 return ip; 87 return ip;
89
90unlock:
91 read_unlock(&pag->pag_ici_lock);
92 return NULL;
93} 88}
94 89
95STATIC int 90STATIC int
96xfs_inode_ag_walk( 91xfs_inode_ag_walk(
97 struct xfs_mount *mp, 92 struct xfs_mount *mp,
98 xfs_agnumber_t ag, 93 struct xfs_perag *pag,
99 int (*execute)(struct xfs_inode *ip, 94 int (*execute)(struct xfs_inode *ip,
100 struct xfs_perag *pag, int flags), 95 struct xfs_perag *pag, int flags),
101 int flags, 96 int flags,
102 int tag) 97 int tag,
98 int exclusive)
103{ 99{
104 struct xfs_perag *pag = &mp->m_perag[ag];
105 uint32_t first_index; 100 uint32_t first_index;
106 int last_error = 0; 101 int last_error = 0;
107 int skipped; 102 int skipped;
@@ -113,10 +108,20 @@ restart:
113 int error = 0; 108 int error = 0;
114 xfs_inode_t *ip; 109 xfs_inode_t *ip;
115 110
111 if (exclusive)
112 write_lock(&pag->pag_ici_lock);
113 else
114 read_lock(&pag->pag_ici_lock);
116 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); 115 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
117 if (!ip) 116 if (!ip) {
117 if (exclusive)
118 write_unlock(&pag->pag_ici_lock);
119 else
120 read_unlock(&pag->pag_ici_lock);
118 break; 121 break;
122 }
119 123
124 /* execute releases pag->pag_ici_lock */
120 error = execute(ip, pag, flags); 125 error = execute(ip, pag, flags);
121 if (error == EAGAIN) { 126 if (error == EAGAIN) {
122 skipped++; 127 skipped++;
@@ -124,9 +129,8 @@ restart:
124 } 129 }
125 if (error) 130 if (error)
126 last_error = error; 131 last_error = error;
127 /* 132
128 * bail out if the filesystem is corrupted. 133 /* bail out if the filesystem is corrupted. */
129 */
130 if (error == EFSCORRUPTED) 134 if (error == EFSCORRUPTED)
131 break; 135 break;
132 136
@@ -136,8 +140,6 @@ restart:
136 delay(1); 140 delay(1);
137 goto restart; 141 goto restart;
138 } 142 }
139
140 xfs_put_perag(mp, pag);
141 return last_error; 143 return last_error;
142} 144}
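xfs_inode_ag_walk now takes pag_ici_lock itself, read or write according to the new exclusive argument, with the execute callback responsible for dropping it. The mode selection reduces to a sketch like this (userspace, hypothetical names):

#include <pthread.h>

static pthread_rwlock_t ici_lock = PTHREAD_RWLOCK_INITIALIZER;

static void walk_lock(int exclusive)
{
	if (exclusive)
		pthread_rwlock_wrlock(&ici_lock);	/* reclaim: write */
	else
		pthread_rwlock_rdlock(&ici_lock);	/* sync: shared */
}

static void walk_unlock(void)
{
	pthread_rwlock_unlock(&ici_lock);
}

int main(void)
{
	walk_lock(1);	/* as xfs_reclaim_inodes passes exclusive = 1 */
	walk_unlock();
	walk_lock(0);	/* as xfs_sync_data / xfs_sync_attr pass 0 */
	walk_unlock();
	return 0;
}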
143 145
@@ -147,16 +149,24 @@ xfs_inode_ag_iterator(
147 int (*execute)(struct xfs_inode *ip, 149 int (*execute)(struct xfs_inode *ip,
148 struct xfs_perag *pag, int flags), 150 struct xfs_perag *pag, int flags),
149 int flags, 151 int flags,
150 int tag) 152 int tag,
153 int exclusive)
151{ 154{
152 int error = 0; 155 int error = 0;
153 int last_error = 0; 156 int last_error = 0;
154 xfs_agnumber_t ag; 157 xfs_agnumber_t ag;
155 158
156 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 159 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
157 if (!mp->m_perag[ag].pag_ici_init) 160 struct xfs_perag *pag;
161
162 pag = xfs_perag_get(mp, ag);
163 if (!pag->pag_ici_init) {
164 xfs_perag_put(pag);
158 continue; 165 continue;
159 error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); 166 }
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
168 exclusive);
169 xfs_perag_put(pag);
160 if (error) { 170 if (error) {
161 last_error = error; 171 last_error = error;
162 if (error == EFSCORRUPTED) 172 if (error == EFSCORRUPTED)
@@ -173,30 +183,31 @@ xfs_sync_inode_valid(
173 struct xfs_perag *pag) 183 struct xfs_perag *pag)
174{ 184{
175 struct inode *inode = VFS_I(ip); 185 struct inode *inode = VFS_I(ip);
186 int error = EFSCORRUPTED;
176 187
177 /* nothing to sync during shutdown */ 188 /* nothing to sync during shutdown */
178 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 189 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
179 read_unlock(&pag->pag_ici_lock); 190 goto out_unlock;
180 return EFSCORRUPTED;
181 }
182 191
183 /* 192 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
184 * If we can't get a reference on the inode, it must be in reclaim. 193 error = ENOENT;
185 * Leave it for the reclaim code to flush. Also avoid inodes that 194 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
186 * haven't been fully initialised. 195 goto out_unlock;
187 */ 196
 188 if (!igrab(inode)) { 197 /* If we can't grab the inode, it must be on its way to reclaim. */
189 read_unlock(&pag->pag_ici_lock); 198 if (!igrab(inode))
190 return ENOENT; 199 goto out_unlock;
191 }
192 read_unlock(&pag->pag_ici_lock);
193 200
194 if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) { 201 if (is_bad_inode(inode)) {
195 IRELE(ip); 202 IRELE(ip);
196 return ENOENT; 203 goto out_unlock;
197 } 204 }
198 205
199 return 0; 206 /* inode is valid */
207 error = 0;
208out_unlock:
209 read_unlock(&pag->pag_ici_lock);
210 return error;
200} 211}
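The reworked xfs_sync_inode_valid replaces three scattered read_unlock calls with one out_unlock label and a pre-loaded error value, so every early exit funnels through a single unlock. The shape of that idiom, as a generic runnable example:

#include <errno.h>
#include <pthread.h>

static pthread_rwlock_t ici_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Returns 0 if the object is usable, an errno otherwise; the lock
 * is taken and dropped exactly once whichever check fails. */
static int object_valid(int shutdown, int reclaiming)
{
	int error = EIO;	/* pessimistic default, like EFSCORRUPTED above */

	pthread_rwlock_rdlock(&ici_lock);
	if (shutdown)
		goto out_unlock;

	error = ENOENT;
	if (reclaiming)
		goto out_unlock;

	error = 0;		/* object is valid */
out_unlock:
	pthread_rwlock_unlock(&ici_lock);
	return error;
}

int main(void)
{
	return object_valid(0, 0);
}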
201 212
202STATIC int 213STATIC int
@@ -223,7 +234,7 @@ xfs_sync_inode_data(
223 } 234 }
224 235
225 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 236 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
226 0 : XFS_B_ASYNC, FI_NONE); 237 0 : XBF_ASYNC, FI_NONE);
227 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 238 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
228 239
229 out_wait: 240 out_wait:
@@ -259,8 +270,7 @@ xfs_sync_inode_attr(
259 goto out_unlock; 270 goto out_unlock;
260 } 271 }
261 272
262 error = xfs_iflush(ip, (flags & SYNC_WAIT) ? 273 error = xfs_iflush(ip, flags);
263 XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
264 274
265 out_unlock: 275 out_unlock:
266 xfs_iunlock(ip, XFS_ILOCK_SHARED); 276 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -281,14 +291,11 @@ xfs_sync_data(
281 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 291 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
282 292
283 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 293 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
284 XFS_ICI_NO_TAG); 294 XFS_ICI_NO_TAG, 0);
285 if (error) 295 if (error)
286 return XFS_ERROR(error); 296 return XFS_ERROR(error);
287 297
288 xfs_log_force(mp, 0, 298 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
289 (flags & SYNC_WAIT) ?
290 XFS_LOG_FORCE | XFS_LOG_SYNC :
291 XFS_LOG_FORCE);
292 return 0; 299 return 0;
293} 300}
294 301
@@ -303,7 +310,7 @@ xfs_sync_attr(
303 ASSERT((flags & ~SYNC_WAIT) == 0); 310 ASSERT((flags & ~SYNC_WAIT) == 0);
304 311
305 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 312 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
306 XFS_ICI_NO_TAG); 313 XFS_ICI_NO_TAG, 0);
307} 314}
308 315
309STATIC int 316STATIC int
@@ -314,10 +321,6 @@ xfs_commit_dummy_trans(
314 struct xfs_inode *ip = mp->m_rootip; 321 struct xfs_inode *ip = mp->m_rootip;
315 struct xfs_trans *tp; 322 struct xfs_trans *tp;
316 int error; 323 int error;
317 int log_flags = XFS_LOG_FORCE;
318
319 if (flags & SYNC_WAIT)
320 log_flags |= XFS_LOG_SYNC;
321 324
322 /* 325 /*
323 * Put a dummy transaction in the log to tell recovery 326 * Put a dummy transaction in the log to tell recovery
@@ -339,11 +342,11 @@ xfs_commit_dummy_trans(
339 xfs_iunlock(ip, XFS_ILOCK_EXCL); 342 xfs_iunlock(ip, XFS_ILOCK_EXCL);
340 343
341 /* the log force ensures this transaction is pushed to disk */ 344 /* the log force ensures this transaction is pushed to disk */
342 xfs_log_force(mp, 0, log_flags); 345 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
343 return error; 346 return error;
344} 347}
345 348
346int 349STATIC int
347xfs_sync_fsdata( 350xfs_sync_fsdata(
348 struct xfs_mount *mp, 351 struct xfs_mount *mp,
349 int flags) 352 int flags)
@@ -359,7 +362,7 @@ xfs_sync_fsdata(
359 if (flags & SYNC_TRYLOCK) { 362 if (flags & SYNC_TRYLOCK) {
360 ASSERT(!(flags & SYNC_WAIT)); 363 ASSERT(!(flags & SYNC_WAIT));
361 364
362 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); 365 bp = xfs_getsb(mp, XBF_TRYLOCK);
363 if (!bp) 366 if (!bp)
364 goto out; 367 goto out;
365 368
@@ -379,7 +382,7 @@ xfs_sync_fsdata(
379 * become pinned in between there and here. 382 * become pinned in between there and here.
380 */ 383 */
381 if (XFS_BUF_ISPINNED(bp)) 384 if (XFS_BUF_ISPINNED(bp))
382 xfs_log_force(mp, 0, XFS_LOG_FORCE); 385 xfs_log_force(mp, 0);
383 } 386 }
384 387
385 388
@@ -440,9 +443,6 @@ xfs_quiesce_data(
440 xfs_sync_data(mp, SYNC_WAIT); 443 xfs_sync_data(mp, SYNC_WAIT);
441 xfs_qm_sync(mp, SYNC_WAIT); 444 xfs_qm_sync(mp, SYNC_WAIT);
442 445
443 /* drop inode references pinned by filestreams */
444 xfs_filestream_flush(mp);
445
446 /* write superblock and hoover up shutdown errors */ 446 /* write superblock and hoover up shutdown errors */
447 error = xfs_sync_fsdata(mp, SYNC_WAIT); 447 error = xfs_sync_fsdata(mp, SYNC_WAIT);
448 448
@@ -459,16 +459,18 @@ xfs_quiesce_fs(
459{ 459{
460 int count = 0, pincount; 460 int count = 0, pincount;
461 461
462 xfs_reclaim_inodes(mp, 0);
462 xfs_flush_buftarg(mp->m_ddev_targp, 0); 463 xfs_flush_buftarg(mp->m_ddev_targp, 0);
463 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
464 464
465 /* 465 /*
466 * This loop must run at least twice. The first instance of the loop 466 * This loop must run at least twice. The first instance of the loop
467 * will flush most meta data but that will generate more meta data 467 * will flush most meta data but that will generate more meta data
468 * (typically directory updates). Which then must be flushed and 468 * (typically directory updates). Which then must be flushed and
 469 * logged before we can write the unmount record. 469 * logged before we can write the unmount record. We also do sync
470 * reclaim of inodes to catch any that the above delwri flush skipped.
470 */ 471 */
471 do { 472 do {
473 xfs_reclaim_inodes(mp, SYNC_WAIT);
472 xfs_sync_attr(mp, SYNC_WAIT); 474 xfs_sync_attr(mp, SYNC_WAIT);
473 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); 475 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
474 if (!pincount) { 476 if (!pincount) {
@@ -567,7 +569,7 @@ xfs_flush_inodes(
567 igrab(inode); 569 igrab(inode);
568 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); 570 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
569 wait_for_completion(&completion); 571 wait_for_completion(&completion);
570 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); 572 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
571} 573}
572 574
573/* 575/*
@@ -583,8 +585,8 @@ xfs_sync_worker(
583 int error; 585 int error;
584 586
585 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 587 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
586 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 588 xfs_log_force(mp, 0);
587 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); 589 xfs_reclaim_inodes(mp, 0);
588 /* dgc: errors ignored here */ 590 /* dgc: errors ignored here */
589 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 591 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
590 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 592 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -605,7 +607,8 @@ xfssyncd(
605 set_freezable(); 607 set_freezable();
606 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); 608 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
607 for (;;) { 609 for (;;) {
608 timeleft = schedule_timeout_interruptible(timeleft); 610 if (list_empty(&mp->m_sync_list))
611 timeleft = schedule_timeout_interruptible(timeleft);
609 /* swsusp */ 612 /* swsusp */
610 try_to_freeze(); 613 try_to_freeze();
611 if (kthread_should_stop() && list_empty(&mp->m_sync_list)) 614 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
@@ -625,8 +628,7 @@ xfssyncd(
625 list_add_tail(&mp->m_sync_work.w_list, 628 list_add_tail(&mp->m_sync_work.w_list,
626 &mp->m_sync_list); 629 &mp->m_sync_list);
627 } 630 }
628 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) 631 list_splice_init(&mp->m_sync_list, &tmp);
629 list_move(&work->w_list, &tmp);
630 spin_unlock(&mp->m_sync_lock); 632 spin_unlock(&mp->m_sync_lock);
631 633
632 list_for_each_entry_safe(work, n, &tmp, w_list) { 634 list_for_each_entry_safe(work, n, &tmp, w_list) {
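Two xfssyncd tweaks show above: the thread skips its sleep when work is already queued, and the per-item list_move loop becomes one list_splice_init, detaching the whole pending list in a single short lock hold before processing it unlocked. A minimal model of splice-then-process (plain singly-linked list instead of list_head, hypothetical names):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work {
	int		id;
	struct work	*next;
};

static struct work *pending;
static pthread_mutex_t sync_lock = PTHREAD_MUTEX_INITIALIZER;

static void queue_work(int id)
{
	struct work *w = malloc(sizeof(*w));
	w->id = id;
	pthread_mutex_lock(&sync_lock);
	w->next = pending;
	pending = w;
	pthread_mutex_unlock(&sync_lock);
}

static void run_worker(void)
{
	/* splice: detach everything in one short critical section */
	pthread_mutex_lock(&sync_lock);
	struct work *batch = pending;
	pending = NULL;
	pthread_mutex_unlock(&sync_lock);

	/* process without holding the lock */
	while (batch) {
		struct work *next = batch->next;
		printf("work %d\n", batch->id);
		free(batch);
		batch = next;
	}
}

int main(void)
{
	queue_work(1);
	queue_work(2);
	run_worker();
	return 0;
}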
@@ -663,67 +665,6 @@ xfs_syncd_stop(
663 kthread_stop(mp->m_sync_task); 665 kthread_stop(mp->m_sync_task);
664} 666}
665 667
666int
667xfs_reclaim_inode(
668 xfs_inode_t *ip,
669 int locked,
670 int sync_mode)
671{
672 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
673
674 /* The hash lock here protects a thread in xfs_iget_core from
675 * racing with us on linking the inode back with a vnode.
676 * Once we have the XFS_IRECLAIM flag set it will not touch
677 * us.
678 */
679 write_lock(&pag->pag_ici_lock);
680 spin_lock(&ip->i_flags_lock);
681 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
682 !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
683 spin_unlock(&ip->i_flags_lock);
684 write_unlock(&pag->pag_ici_lock);
685 if (locked) {
686 xfs_ifunlock(ip);
687 xfs_iunlock(ip, XFS_ILOCK_EXCL);
688 }
689 return -EAGAIN;
690 }
691 __xfs_iflags_set(ip, XFS_IRECLAIM);
692 spin_unlock(&ip->i_flags_lock);
693 write_unlock(&pag->pag_ici_lock);
694 xfs_put_perag(ip->i_mount, pag);
695
696 /*
697 * If the inode is still dirty, then flush it out. If the inode
698 * is not in the AIL, then it will be OK to flush it delwri as
699 * long as xfs_iflush() does not keep any references to the inode.
700 * We leave that decision up to xfs_iflush() since it has the
701 * knowledge of whether it's OK to simply do a delwri flush of
702 * the inode or whether we need to wait until the inode is
703 * pulled from the AIL.
704 * We get the flush lock regardless, though, just to make sure
705 * we don't free it while it is being flushed.
706 */
707 if (!locked) {
708 xfs_ilock(ip, XFS_ILOCK_EXCL);
709 xfs_iflock(ip);
710 }
711
712 /*
713 * In the case of a forced shutdown we rely on xfs_iflush() to
714 * wait for the inode to be unpinned before returning an error.
715 */
716 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
717 /* synchronize with xfs_iflush_done */
718 xfs_iflock(ip);
719 xfs_ifunlock(ip);
720 }
721
722 xfs_iunlock(ip, XFS_ILOCK_EXCL);
723 xfs_ireclaim(ip);
724 return 0;
725}
726
727void 668void
728__xfs_inode_set_reclaim_tag( 669__xfs_inode_set_reclaim_tag(
729 struct xfs_perag *pag, 670 struct xfs_perag *pag,
@@ -743,16 +684,17 @@ void
743xfs_inode_set_reclaim_tag( 684xfs_inode_set_reclaim_tag(
744 xfs_inode_t *ip) 685 xfs_inode_t *ip)
745{ 686{
746 xfs_mount_t *mp = ip->i_mount; 687 struct xfs_mount *mp = ip->i_mount;
747 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 688 struct xfs_perag *pag;
748 689
749 read_lock(&pag->pag_ici_lock); 690 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
691 write_lock(&pag->pag_ici_lock);
750 spin_lock(&ip->i_flags_lock); 692 spin_lock(&ip->i_flags_lock);
751 __xfs_inode_set_reclaim_tag(pag, ip); 693 __xfs_inode_set_reclaim_tag(pag, ip);
752 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 694 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
753 spin_unlock(&ip->i_flags_lock); 695 spin_unlock(&ip->i_flags_lock);
754 read_unlock(&pag->pag_ici_lock); 696 write_unlock(&pag->pag_ici_lock);
755 xfs_put_perag(mp, pag); 697 xfs_perag_put(pag);
756} 698}
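xfs_inode_set_reclaim_tag moves from the old xfs_get_perag/xfs_put_perag pair to the new refcounted xfs_perag_get/xfs_perag_put interface, and upgrades to the write lock a radix-tree tag update actually needs. The get/put half is ordinary reference counting; a sketch over a hypothetical perag struct:

#include <assert.h>
#include <stdio.h>

struct perag {
	int	refcount;	/* would be atomic in the kernel */
};

static struct perag perag_table[4];

static struct perag *perag_get(unsigned agno)
{
	struct perag *pag = &perag_table[agno];
	pag->refcount++;	/* pin: the AG structure can't go away */
	return pag;
}

static void perag_put(struct perag *pag)
{
	assert(pag->refcount > 0);
	pag->refcount--;	/* unpin */
}

int main(void)
{
	struct perag *pag = perag_get(0);
	/* ... take pag_ici_lock for writing, set the reclaim tag ... */
	perag_put(pag);
	printf("refcount back to %d\n", pag->refcount);
	return 0;
}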
757 699
758void 700void
@@ -765,20 +707,145 @@ __xfs_inode_clear_reclaim_tag(
765 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 707 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
766} 708}
767 709
710/*
711 * Inodes in different states need to be treated differently, and the return
712 * value of xfs_iflush is not sufficient to get this right. The following table
713 * lists the inode states and the reclaim actions necessary for non-blocking
714 * reclaim:
715 *
716 *
717 * inode state iflush ret required action
718 * --------------- ---------- ---------------
719 * bad - reclaim
720 * shutdown EIO unpin and reclaim
721 * clean, unpinned 0 reclaim
722 * stale, unpinned 0 reclaim
723 * clean, pinned(*) 0 requeue
724 * stale, pinned EAGAIN requeue
725 * dirty, delwri ok 0 requeue
726 * dirty, delwri blocked EAGAIN requeue
727 * dirty, sync flush 0 reclaim
728 *
729 * (*) dgc: I don't think the clean, pinned state is possible but it gets
730 * handled anyway given the order of checks implemented.
731 *
732 * As can be seen from the table, the return value of xfs_iflush() is not
733 * sufficient to correctly decide the reclaim action here. The checks in
734 * xfs_iflush() might look like duplicates, but they are not.
735 *
736 * Also, because we get the flush lock first, we know that any inode that has
737 * been flushed delwri has had the flush completed by the time we check that
738 * the inode is clean. The clean inode check needs to be done before flushing
739 * the inode delwri otherwise we would loop forever requeuing clean inodes as
740 * we cannot tell apart a successful delwri flush and a clean inode from the
741 * return value of xfs_iflush().
742 *
743 * Note that because the inode is flushed delayed write by background
744 * writeback, the flush lock may already be held here and waiting on it can
745 * result in very long latencies. Hence for sync reclaims, where we wait on the
746 * flush lock, the caller should push out delayed write inodes first before
747 * trying to reclaim them to minimise the amount of time spent waiting. For
 748 * background reclaim, we just requeue the inode for the next pass.
749 *
750 * Hence the order of actions after gaining the locks should be:
751 * bad => reclaim
752 * shutdown => unpin and reclaim
753 * pinned, delwri => requeue
754 * pinned, sync => unpin
755 * stale => reclaim
756 * clean => reclaim
757 * dirty, delwri => flush and requeue
758 * dirty, sync => flush, wait and reclaim
759 */
768STATIC int 760STATIC int
769xfs_reclaim_inode_now( 761xfs_reclaim_inode(
770 struct xfs_inode *ip, 762 struct xfs_inode *ip,
771 struct xfs_perag *pag, 763 struct xfs_perag *pag,
772 int flags) 764 int sync_mode)
773{ 765{
774 /* ignore if already under reclaim */ 766 int error = 0;
775 if (xfs_iflags_test(ip, XFS_IRECLAIM)) { 767
776 read_unlock(&pag->pag_ici_lock); 768 /*
769 * The radix tree lock here protects a thread in xfs_iget from racing
770 * with us starting reclaim on the inode. Once we have the
771 * XFS_IRECLAIM flag set it will not touch us.
772 */
773 spin_lock(&ip->i_flags_lock);
774 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
775 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
776 /* ignore as it is already under reclaim */
777 spin_unlock(&ip->i_flags_lock);
778 write_unlock(&pag->pag_ici_lock);
777 return 0; 779 return 0;
778 } 780 }
779 read_unlock(&pag->pag_ici_lock); 781 __xfs_iflags_set(ip, XFS_IRECLAIM);
782 spin_unlock(&ip->i_flags_lock);
783 write_unlock(&pag->pag_ici_lock);
784
785 xfs_ilock(ip, XFS_ILOCK_EXCL);
786 if (!xfs_iflock_nowait(ip)) {
787 if (!(sync_mode & SYNC_WAIT))
788 goto out;
789 xfs_iflock(ip);
790 }
791
792 if (is_bad_inode(VFS_I(ip)))
793 goto reclaim;
794 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
795 xfs_iunpin_wait(ip);
796 goto reclaim;
797 }
798 if (xfs_ipincount(ip)) {
799 if (!(sync_mode & SYNC_WAIT)) {
800 xfs_ifunlock(ip);
801 goto out;
802 }
803 xfs_iunpin_wait(ip);
804 }
805 if (xfs_iflags_test(ip, XFS_ISTALE))
806 goto reclaim;
807 if (xfs_inode_clean(ip))
808 goto reclaim;
809
810 /* Now we have an inode that needs flushing */
811 error = xfs_iflush(ip, sync_mode);
812 if (sync_mode & SYNC_WAIT) {
813 xfs_iflock(ip);
814 goto reclaim;
815 }
816
817 /*
818 * When we have to flush an inode but don't have SYNC_WAIT set, we
819 * flush the inode out using a delwri buffer and wait for the next
820 * call into reclaim to find it in a clean state instead of waiting for
821 * it now. We also don't return errors here - if the error is transient
822 * then the next reclaim pass will flush the inode, and if the error
 823 * is permanent then the next sync reclaim will reclaim the inode and
824 * pass on the error.
825 */
826 if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
827 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
828 "inode 0x%llx background reclaim flush failed with %d",
829 (long long)ip->i_ino, error);
830 }
831out:
832 xfs_iflags_clear(ip, XFS_IRECLAIM);
833 xfs_iunlock(ip, XFS_ILOCK_EXCL);
834 /*
835 * We could return EAGAIN here to make reclaim rescan the inode tree in
836 * a short while. However, this just burns CPU time scanning the tree
837 * waiting for IO to complete and xfssyncd never goes back to the idle
838 * state. Instead, return 0 to let the next scheduled background reclaim
839 * attempt to reclaim the inode again.
840 */
841 return 0;
842
843reclaim:
844 xfs_ifunlock(ip);
845 xfs_iunlock(ip, XFS_ILOCK_EXCL);
846 xfs_ireclaim(ip);
847 return error;
780 848
781 return xfs_reclaim_inode(ip, 0, flags);
782} 849}
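The new xfs_reclaim_inode follows exactly the ordering the comment block spells out: bad, shutdown, pinned, stale, clean, then dirty. Factored into a pure decision function over a hypothetical state struct, the check order reads:

#include <stdio.h>

enum action { REQUEUE, UNPIN_AND_RECLAIM, RECLAIM, FLUSH_AND_RECLAIM };

struct istate {
	int bad, shutdown, pinned, stale, clean, sync_wait;
};

/* Mirrors the check order in xfs_reclaim_inode; earlier states win. */
static enum action reclaim_action(const struct istate *s)
{
	if (s->bad)
		return RECLAIM;
	if (s->shutdown)
		return UNPIN_AND_RECLAIM;
	if (s->pinned && !s->sync_wait)
		return REQUEUE;		/* background: don't wait on unpin */
	if (s->stale || s->clean)
		return RECLAIM;		/* sync+pinned unpins, then lands here */
	return FLUSH_AND_RECLAIM;	/* dirty: flush (and wait if sync) */
}

int main(void)
{
	struct istate s = { .pinned = 1 };
	printf("pinned background inode -> %d (REQUEUE)\n",
	       reclaim_action(&s));
	return 0;
}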
783 850
784int 851int
@@ -786,6 +853,6 @@ xfs_reclaim_inodes(
786 xfs_mount_t *mp, 853 xfs_mount_t *mp,
787 int mode) 854 int mode)
788{ 855{
789 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, 856 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
790 XFS_ICI_RECLAIM_TAG); 857 XFS_ICI_RECLAIM_TAG, 1);
791} 858}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 27920eb7a820..d480c346cabb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -37,14 +37,12 @@ void xfs_syncd_stop(struct xfs_mount *mp);
37 37
38int xfs_sync_attr(struct xfs_mount *mp, int flags); 38int xfs_sync_attr(struct xfs_mount *mp, int flags);
39int xfs_sync_data(struct xfs_mount *mp, int flags); 39int xfs_sync_data(struct xfs_mount *mp, int flags);
40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
41 40
42int xfs_quiesce_data(struct xfs_mount *mp); 41int xfs_quiesce_data(struct xfs_mount *mp);
43void xfs_quiesce_attr(struct xfs_mount *mp); 42void xfs_quiesce_attr(struct xfs_mount *mp);
44 43
45void xfs_flush_inodes(struct xfs_inode *ip); 44void xfs_flush_inodes(struct xfs_inode *ip);
46 45
47int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
48int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 46int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
49 47
50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 48void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
@@ -55,6 +53,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
55int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
56int xfs_inode_ag_iterator(struct xfs_mount *mp, 54int xfs_inode_ag_iterator(struct xfs_mount *mp,
57 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
58 int flags, int tag); 56 int flags, int tag, int write_lock);
59 57
60#endif 58#endif
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index c5bc67c4e3bb..7bb5092d6ae4 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -55,170 +55,140 @@ xfs_stats_clear_proc_handler(
55 55
56static ctl_table xfs_table[] = { 56static ctl_table xfs_table[] = {
57 { 57 {
58 .ctl_name = XFS_SGID_INHERIT,
59 .procname = "irix_sgid_inherit", 58 .procname = "irix_sgid_inherit",
60 .data = &xfs_params.sgid_inherit.val, 59 .data = &xfs_params.sgid_inherit.val,
61 .maxlen = sizeof(int), 60 .maxlen = sizeof(int),
62 .mode = 0644, 61 .mode = 0644,
63 .proc_handler = &proc_dointvec_minmax, 62 .proc_handler = proc_dointvec_minmax,
64 .strategy = &sysctl_intvec,
65 .extra1 = &xfs_params.sgid_inherit.min, 63 .extra1 = &xfs_params.sgid_inherit.min,
66 .extra2 = &xfs_params.sgid_inherit.max 64 .extra2 = &xfs_params.sgid_inherit.max
67 }, 65 },
68 { 66 {
69 .ctl_name = XFS_SYMLINK_MODE,
70 .procname = "irix_symlink_mode", 67 .procname = "irix_symlink_mode",
71 .data = &xfs_params.symlink_mode.val, 68 .data = &xfs_params.symlink_mode.val,
72 .maxlen = sizeof(int), 69 .maxlen = sizeof(int),
73 .mode = 0644, 70 .mode = 0644,
74 .proc_handler = &proc_dointvec_minmax, 71 .proc_handler = proc_dointvec_minmax,
75 .strategy = &sysctl_intvec,
76 .extra1 = &xfs_params.symlink_mode.min, 72 .extra1 = &xfs_params.symlink_mode.min,
77 .extra2 = &xfs_params.symlink_mode.max 73 .extra2 = &xfs_params.symlink_mode.max
78 }, 74 },
79 { 75 {
80 .ctl_name = XFS_PANIC_MASK,
81 .procname = "panic_mask", 76 .procname = "panic_mask",
82 .data = &xfs_params.panic_mask.val, 77 .data = &xfs_params.panic_mask.val,
83 .maxlen = sizeof(int), 78 .maxlen = sizeof(int),
84 .mode = 0644, 79 .mode = 0644,
85 .proc_handler = &proc_dointvec_minmax, 80 .proc_handler = proc_dointvec_minmax,
86 .strategy = &sysctl_intvec,
87 .extra1 = &xfs_params.panic_mask.min, 81 .extra1 = &xfs_params.panic_mask.min,
88 .extra2 = &xfs_params.panic_mask.max 82 .extra2 = &xfs_params.panic_mask.max
89 }, 83 },
90 84
91 { 85 {
92 .ctl_name = XFS_ERRLEVEL,
93 .procname = "error_level", 86 .procname = "error_level",
94 .data = &xfs_params.error_level.val, 87 .data = &xfs_params.error_level.val,
95 .maxlen = sizeof(int), 88 .maxlen = sizeof(int),
96 .mode = 0644, 89 .mode = 0644,
97 .proc_handler = &proc_dointvec_minmax, 90 .proc_handler = proc_dointvec_minmax,
98 .strategy = &sysctl_intvec,
99 .extra1 = &xfs_params.error_level.min, 91 .extra1 = &xfs_params.error_level.min,
100 .extra2 = &xfs_params.error_level.max 92 .extra2 = &xfs_params.error_level.max
101 }, 93 },
102 { 94 {
103 .ctl_name = XFS_SYNCD_TIMER,
104 .procname = "xfssyncd_centisecs", 95 .procname = "xfssyncd_centisecs",
105 .data = &xfs_params.syncd_timer.val, 96 .data = &xfs_params.syncd_timer.val,
106 .maxlen = sizeof(int), 97 .maxlen = sizeof(int),
107 .mode = 0644, 98 .mode = 0644,
108 .proc_handler = &proc_dointvec_minmax, 99 .proc_handler = proc_dointvec_minmax,
109 .strategy = &sysctl_intvec,
110 .extra1 = &xfs_params.syncd_timer.min, 100 .extra1 = &xfs_params.syncd_timer.min,
111 .extra2 = &xfs_params.syncd_timer.max 101 .extra2 = &xfs_params.syncd_timer.max
112 }, 102 },
113 { 103 {
114 .ctl_name = XFS_INHERIT_SYNC,
115 .procname = "inherit_sync", 104 .procname = "inherit_sync",
116 .data = &xfs_params.inherit_sync.val, 105 .data = &xfs_params.inherit_sync.val,
117 .maxlen = sizeof(int), 106 .maxlen = sizeof(int),
118 .mode = 0644, 107 .mode = 0644,
119 .proc_handler = &proc_dointvec_minmax, 108 .proc_handler = proc_dointvec_minmax,
120 .strategy = &sysctl_intvec,
121 .extra1 = &xfs_params.inherit_sync.min, 109 .extra1 = &xfs_params.inherit_sync.min,
122 .extra2 = &xfs_params.inherit_sync.max 110 .extra2 = &xfs_params.inherit_sync.max
123 }, 111 },
124 { 112 {
125 .ctl_name = XFS_INHERIT_NODUMP,
126 .procname = "inherit_nodump", 113 .procname = "inherit_nodump",
127 .data = &xfs_params.inherit_nodump.val, 114 .data = &xfs_params.inherit_nodump.val,
128 .maxlen = sizeof(int), 115 .maxlen = sizeof(int),
129 .mode = 0644, 116 .mode = 0644,
130 .proc_handler = &proc_dointvec_minmax, 117 .proc_handler = proc_dointvec_minmax,
131 .strategy = &sysctl_intvec,
132 .extra1 = &xfs_params.inherit_nodump.min, 118 .extra1 = &xfs_params.inherit_nodump.min,
133 .extra2 = &xfs_params.inherit_nodump.max 119 .extra2 = &xfs_params.inherit_nodump.max
134 }, 120 },
135 { 121 {
136 .ctl_name = XFS_INHERIT_NOATIME,
137 .procname = "inherit_noatime", 122 .procname = "inherit_noatime",
138 .data = &xfs_params.inherit_noatim.val, 123 .data = &xfs_params.inherit_noatim.val,
139 .maxlen = sizeof(int), 124 .maxlen = sizeof(int),
140 .mode = 0644, 125 .mode = 0644,
141 .proc_handler = &proc_dointvec_minmax, 126 .proc_handler = proc_dointvec_minmax,
142 .strategy = &sysctl_intvec,
143 .extra1 = &xfs_params.inherit_noatim.min, 127 .extra1 = &xfs_params.inherit_noatim.min,
144 .extra2 = &xfs_params.inherit_noatim.max 128 .extra2 = &xfs_params.inherit_noatim.max
145 }, 129 },
146 { 130 {
147 .ctl_name = XFS_BUF_TIMER,
148 .procname = "xfsbufd_centisecs", 131 .procname = "xfsbufd_centisecs",
149 .data = &xfs_params.xfs_buf_timer.val, 132 .data = &xfs_params.xfs_buf_timer.val,
150 .maxlen = sizeof(int), 133 .maxlen = sizeof(int),
151 .mode = 0644, 134 .mode = 0644,
152 .proc_handler = &proc_dointvec_minmax, 135 .proc_handler = proc_dointvec_minmax,
153 .strategy = &sysctl_intvec,
154 .extra1 = &xfs_params.xfs_buf_timer.min, 136 .extra1 = &xfs_params.xfs_buf_timer.min,
155 .extra2 = &xfs_params.xfs_buf_timer.max 137 .extra2 = &xfs_params.xfs_buf_timer.max
156 }, 138 },
157 { 139 {
158 .ctl_name = XFS_BUF_AGE,
159 .procname = "age_buffer_centisecs", 140 .procname = "age_buffer_centisecs",
160 .data = &xfs_params.xfs_buf_age.val, 141 .data = &xfs_params.xfs_buf_age.val,
161 .maxlen = sizeof(int), 142 .maxlen = sizeof(int),
162 .mode = 0644, 143 .mode = 0644,
163 .proc_handler = &proc_dointvec_minmax, 144 .proc_handler = proc_dointvec_minmax,
164 .strategy = &sysctl_intvec,
165 .extra1 = &xfs_params.xfs_buf_age.min, 145 .extra1 = &xfs_params.xfs_buf_age.min,
166 .extra2 = &xfs_params.xfs_buf_age.max 146 .extra2 = &xfs_params.xfs_buf_age.max
167 }, 147 },
168 { 148 {
169 .ctl_name = XFS_INHERIT_NOSYM,
170 .procname = "inherit_nosymlinks", 149 .procname = "inherit_nosymlinks",
171 .data = &xfs_params.inherit_nosym.val, 150 .data = &xfs_params.inherit_nosym.val,
172 .maxlen = sizeof(int), 151 .maxlen = sizeof(int),
173 .mode = 0644, 152 .mode = 0644,
174 .proc_handler = &proc_dointvec_minmax, 153 .proc_handler = proc_dointvec_minmax,
175 .strategy = &sysctl_intvec,
176 .extra1 = &xfs_params.inherit_nosym.min, 154 .extra1 = &xfs_params.inherit_nosym.min,
177 .extra2 = &xfs_params.inherit_nosym.max 155 .extra2 = &xfs_params.inherit_nosym.max
178 }, 156 },
179 { 157 {
180 .ctl_name = XFS_ROTORSTEP,
181 .procname = "rotorstep", 158 .procname = "rotorstep",
182 .data = &xfs_params.rotorstep.val, 159 .data = &xfs_params.rotorstep.val,
183 .maxlen = sizeof(int), 160 .maxlen = sizeof(int),
184 .mode = 0644, 161 .mode = 0644,
185 .proc_handler = &proc_dointvec_minmax, 162 .proc_handler = proc_dointvec_minmax,
186 .strategy = &sysctl_intvec,
187 .extra1 = &xfs_params.rotorstep.min, 163 .extra1 = &xfs_params.rotorstep.min,
188 .extra2 = &xfs_params.rotorstep.max 164 .extra2 = &xfs_params.rotorstep.max
189 }, 165 },
190 { 166 {
191 .ctl_name = XFS_INHERIT_NODFRG,
192 .procname = "inherit_nodefrag", 167 .procname = "inherit_nodefrag",
193 .data = &xfs_params.inherit_nodfrg.val, 168 .data = &xfs_params.inherit_nodfrg.val,
194 .maxlen = sizeof(int), 169 .maxlen = sizeof(int),
195 .mode = 0644, 170 .mode = 0644,
196 .proc_handler = &proc_dointvec_minmax, 171 .proc_handler = proc_dointvec_minmax,
197 .strategy = &sysctl_intvec,
198 .extra1 = &xfs_params.inherit_nodfrg.min, 172 .extra1 = &xfs_params.inherit_nodfrg.min,
199 .extra2 = &xfs_params.inherit_nodfrg.max 173 .extra2 = &xfs_params.inherit_nodfrg.max
200 }, 174 },
201 { 175 {
202 .ctl_name = XFS_FILESTREAM_TIMER,
203 .procname = "filestream_centisecs", 176 .procname = "filestream_centisecs",
204 .data = &xfs_params.fstrm_timer.val, 177 .data = &xfs_params.fstrm_timer.val,
205 .maxlen = sizeof(int), 178 .maxlen = sizeof(int),
206 .mode = 0644, 179 .mode = 0644,
207 .proc_handler = &proc_dointvec_minmax, 180 .proc_handler = proc_dointvec_minmax,
208 .strategy = &sysctl_intvec,
209 .extra1 = &xfs_params.fstrm_timer.min, 181 .extra1 = &xfs_params.fstrm_timer.min,
210 .extra2 = &xfs_params.fstrm_timer.max, 182 .extra2 = &xfs_params.fstrm_timer.max,
211 }, 183 },
212 /* please keep this the last entry */ 184 /* please keep this the last entry */
213#ifdef CONFIG_PROC_FS 185#ifdef CONFIG_PROC_FS
214 { 186 {
215 .ctl_name = XFS_STATS_CLEAR,
216 .procname = "stats_clear", 187 .procname = "stats_clear",
217 .data = &xfs_params.stats_clear.val, 188 .data = &xfs_params.stats_clear.val,
218 .maxlen = sizeof(int), 189 .maxlen = sizeof(int),
219 .mode = 0644, 190 .mode = 0644,
220 .proc_handler = &xfs_stats_clear_proc_handler, 191 .proc_handler = xfs_stats_clear_proc_handler,
221 .strategy = &sysctl_intvec,
222 .extra1 = &xfs_params.stats_clear.min, 192 .extra1 = &xfs_params.stats_clear.min,
223 .extra2 = &xfs_params.stats_clear.max 193 .extra2 = &xfs_params.stats_clear.max
224 }, 194 },
@@ -229,7 +199,6 @@ static ctl_table xfs_table[] = {
229 199
230static ctl_table xfs_dir_table[] = { 200static ctl_table xfs_dir_table[] = {
231 { 201 {
232 .ctl_name = FS_XFS,
233 .procname = "xfs", 202 .procname = "xfs",
234 .mode = 0555, 203 .mode = 0555,
235 .child = xfs_table 204 .child = xfs_table
@@ -239,7 +208,6 @@ static ctl_table xfs_dir_table[] = {
239 208
240static ctl_table xfs_root_table[] = { 209static ctl_table xfs_root_table[] = {
241 { 210 {
242 .ctl_name = CTL_FS,
243 .procname = "fs", 211 .procname = "fs",
244 .mode = 0555, 212 .mode = 0555,
245 .child = xfs_dir_table 213 .child = xfs_dir_table
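The sysctl hunk is mechanical: every .ctl_name and .strategy initializer goes away (the binary-sysctl machinery they fed was removed), along with the stray & before handler names. What makes the slimmer entries safe is that designated initializers zero-fill unnamed members; a small userspace demonstration of that C guarantee:

#include <stdio.h>

struct ctl_entry {
	int		ctl_name;	/* legacy field */
	const char	*procname;
	int		mode;
};

/* Only the fields we still care about are named; the rest become 0. */
static struct ctl_entry entry = {
	.procname	= "irix_sgid_inherit",
	.mode		= 0644,
};

int main(void)
{
	/* ctl_name was never mentioned, so the compiler zeroed it */
	printf("%s: ctl_name=%d mode=%o\n",
	       entry.procname, entry.ctl_name, entry.mode);
	return 0;
}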
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
new file mode 100644
index 000000000000..5a107601e969
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -0,0 +1,59 @@
1/*
2 * Copyright (c) 2009, Christoph Hellwig
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h"
32#include "xfs_dir2_sf.h"
33#include "xfs_attr_sf.h"
34#include "xfs_dinode.h"
35#include "xfs_inode.h"
36#include "xfs_btree.h"
37#include "xfs_dmapi.h"
38#include "xfs_mount.h"
39#include "xfs_ialloc.h"
40#include "xfs_itable.h"
41#include "xfs_alloc.h"
42#include "xfs_bmap.h"
43#include "xfs_attr.h"
44#include "xfs_attr_sf.h"
45#include "xfs_attr_leaf.h"
46#include "xfs_log_priv.h"
47#include "xfs_buf_item.h"
48#include "xfs_quota.h"
49#include "xfs_iomap.h"
50#include "xfs_aops.h"
51#include "quota/xfs_dquot_item.h"
52#include "quota/xfs_dquot.h"
53
54/*
55 * We include this last to have the helpers above available for the trace
56 * event implementations.
57 */
58#define CREATE_TRACE_POINTS
59#include "xfs_trace.h"
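The new xfs_trace.c ends with the standard tracepoint idiom: ordinary includers of xfs_trace.h see only declarations, while this one translation unit defines CREATE_TRACE_POINTS first so the same header expands into the event definitions. The underlying trick, a header that expands differently per includer, can be illustrated standalone (hypothetical macros, not the real trace machinery):

#include <stdio.h>

/* A real events.h would say:
 *   #ifdef CREATE_POINTS
 *   #define EVENT(name) void trace_##name(void) { puts(#name); }
 *   #else
 *   #define EVENT(name) void trace_##name(void);
 *   #endif
 *   EVENT(xfs_example)
 *   #undef EVENT
 * Here both expansions are inlined to keep the sketch to one file. */

#define EVENT_DECL(name) void trace_##name(void);
#define EVENT_DEFN(name) void trace_##name(void) { puts("hit " #name); }

EVENT_DECL(xfs_example)		/* what ordinary includers see */
EVENT_DEFN(xfs_example)		/* what the CREATE_TRACE_POINTS TU sees */

int main(void)
{
	trace_xfs_example();
	return 0;
}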
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
new file mode 100644
index 000000000000..fcaa62f0799e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -0,0 +1,1503 @@
1/*
2 * Copyright (c) 2009, Christoph Hellwig
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM xfs
20
21#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ)
22#define _TRACE_XFS_H
23
24#include <linux/tracepoint.h>
25
26struct xfs_agf;
27struct xfs_alloc_arg;
28struct xfs_attr_list_context;
29struct xfs_buf_log_item;
30struct xfs_da_args;
31struct xfs_da_node_entry;
32struct xfs_dquot;
33struct xlog_ticket;
34struct log;
35
36DECLARE_EVENT_CLASS(xfs_attr_list_class,
37 TP_PROTO(struct xfs_attr_list_context *ctx),
38 TP_ARGS(ctx),
39 TP_STRUCT__entry(
40 __field(dev_t, dev)
41 __field(xfs_ino_t, ino)
42 __field(u32, hashval)
43 __field(u32, blkno)
44 __field(u32, offset)
45 __field(void *, alist)
46 __field(int, bufsize)
47 __field(int, count)
48 __field(int, firstu)
49 __field(int, dupcnt)
50 __field(int, flags)
51 ),
52 TP_fast_assign(
53 __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
54 __entry->ino = ctx->dp->i_ino;
55 __entry->hashval = ctx->cursor->hashval;
56 __entry->blkno = ctx->cursor->blkno;
57 __entry->offset = ctx->cursor->offset;
58 __entry->alist = ctx->alist;
59 __entry->bufsize = ctx->bufsize;
60 __entry->count = ctx->count;
61 __entry->firstu = ctx->firstu;
62 __entry->flags = ctx->flags;
63 ),
64 TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
65 "alist 0x%p size %u count %u firstu %u flags %d %s",
66 MAJOR(__entry->dev), MINOR(__entry->dev),
67 __entry->ino,
68 __entry->hashval,
69 __entry->blkno,
70 __entry->offset,
71 __entry->dupcnt,
72 __entry->alist,
73 __entry->bufsize,
74 __entry->count,
75 __entry->firstu,
76 __entry->flags,
77 __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS)
78 )
79)
80
81#define DEFINE_PERAG_REF_EVENT(name) \
82TRACE_EVENT(name, \
83 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
84 unsigned long caller_ip), \
85 TP_ARGS(mp, agno, refcount, caller_ip), \
86 TP_STRUCT__entry( \
87 __field(dev_t, dev) \
88 __field(xfs_agnumber_t, agno) \
89 __field(int, refcount) \
90 __field(unsigned long, caller_ip) \
91 ), \
92 TP_fast_assign( \
93 __entry->dev = mp->m_super->s_dev; \
94 __entry->agno = agno; \
95 __entry->refcount = refcount; \
96 __entry->caller_ip = caller_ip; \
97 ), \
98 TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
99 MAJOR(__entry->dev), MINOR(__entry->dev), \
100 __entry->agno, \
101 __entry->refcount, \
102 (char *)__entry->caller_ip) \
103);
104
105DEFINE_PERAG_REF_EVENT(xfs_perag_get)
106DEFINE_PERAG_REF_EVENT(xfs_perag_put)
107
108#define DEFINE_ATTR_LIST_EVENT(name) \
109DEFINE_EVENT(xfs_attr_list_class, name, \
110 TP_PROTO(struct xfs_attr_list_context *ctx), \
111 TP_ARGS(ctx))
112DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf);
113DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all);
114DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
115DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end);
116DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
117DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
118DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
119DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
120
121TRACE_EVENT(xfs_attr_list_node_descend,
122 TP_PROTO(struct xfs_attr_list_context *ctx,
123 struct xfs_da_node_entry *btree),
124 TP_ARGS(ctx, btree),
125 TP_STRUCT__entry(
126 __field(dev_t, dev)
127 __field(xfs_ino_t, ino)
128 __field(u32, hashval)
129 __field(u32, blkno)
130 __field(u32, offset)
131 __field(void *, alist)
132 __field(int, bufsize)
133 __field(int, count)
134 __field(int, firstu)
135 __field(int, dupcnt)
136 __field(int, flags)
137 __field(u32, bt_hashval)
138 __field(u32, bt_before)
139 ),
140 TP_fast_assign(
141 __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
142 __entry->ino = ctx->dp->i_ino;
143 __entry->hashval = ctx->cursor->hashval;
144 __entry->blkno = ctx->cursor->blkno;
145 __entry->offset = ctx->cursor->offset;
146 __entry->alist = ctx->alist;
147 __entry->bufsize = ctx->bufsize;
148 __entry->count = ctx->count;
149 __entry->firstu = ctx->firstu;
150 __entry->flags = ctx->flags;
151 __entry->bt_hashval = be32_to_cpu(btree->hashval);
152 __entry->bt_before = be32_to_cpu(btree->before);
153 ),
154 TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
155 "alist 0x%p size %u count %u firstu %u flags %d %s "
156 "node hashval %u, node before %u",
157 MAJOR(__entry->dev), MINOR(__entry->dev),
158 __entry->ino,
159 __entry->hashval,
160 __entry->blkno,
161 __entry->offset,
162 __entry->dupcnt,
163 __entry->alist,
164 __entry->bufsize,
165 __entry->count,
166 __entry->firstu,
167 __entry->flags,
168 __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
169 __entry->bt_hashval,
170 __entry->bt_before)
171);
172
173TRACE_EVENT(xfs_iext_insert,
174 TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx,
175 struct xfs_bmbt_irec *r, int state, unsigned long caller_ip),
176 TP_ARGS(ip, idx, r, state, caller_ip),
177 TP_STRUCT__entry(
178 __field(dev_t, dev)
179 __field(xfs_ino_t, ino)
180 __field(xfs_extnum_t, idx)
181 __field(xfs_fileoff_t, startoff)
182 __field(xfs_fsblock_t, startblock)
183 __field(xfs_filblks_t, blockcount)
184 __field(xfs_exntst_t, state)
185 __field(int, bmap_state)
186 __field(unsigned long, caller_ip)
187 ),
188 TP_fast_assign(
189 __entry->dev = VFS_I(ip)->i_sb->s_dev;
190 __entry->ino = ip->i_ino;
191 __entry->idx = idx;
192 __entry->startoff = r->br_startoff;
193 __entry->startblock = r->br_startblock;
194 __entry->blockcount = r->br_blockcount;
195 __entry->state = r->br_state;
196 __entry->bmap_state = state;
197 __entry->caller_ip = caller_ip;
198 ),
199 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
200 "offset %lld block %lld count %lld flag %d caller %pf",
201 MAJOR(__entry->dev), MINOR(__entry->dev),
202 __entry->ino,
203 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
204 (long)__entry->idx,
205 __entry->startoff,
206 (__int64_t)__entry->startblock,
207 __entry->blockcount,
208 __entry->state,
209 (char *)__entry->caller_ip)
210);
211
212DECLARE_EVENT_CLASS(xfs_bmap_class,
213 TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
214 unsigned long caller_ip),
215 TP_ARGS(ip, idx, state, caller_ip),
216 TP_STRUCT__entry(
217 __field(dev_t, dev)
218 __field(xfs_ino_t, ino)
219 __field(xfs_extnum_t, idx)
220 __field(xfs_fileoff_t, startoff)
221 __field(xfs_fsblock_t, startblock)
222 __field(xfs_filblks_t, blockcount)
223 __field(xfs_exntst_t, state)
224 __field(int, bmap_state)
225 __field(unsigned long, caller_ip)
226 ),
227 TP_fast_assign(
228 struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ?
229 ip->i_afp : &ip->i_df;
230 struct xfs_bmbt_irec r;
231
232 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
233 __entry->dev = VFS_I(ip)->i_sb->s_dev;
234 __entry->ino = ip->i_ino;
235 __entry->idx = idx;
236 __entry->startoff = r.br_startoff;
237 __entry->startblock = r.br_startblock;
238 __entry->blockcount = r.br_blockcount;
239 __entry->state = r.br_state;
240 __entry->bmap_state = state;
241 __entry->caller_ip = caller_ip;
242 ),
243 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
244 "offset %lld block %lld count %lld flag %d caller %pf",
245 MAJOR(__entry->dev), MINOR(__entry->dev),
246 __entry->ino,
247 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
248 (long)__entry->idx,
249 __entry->startoff,
250 (__int64_t)__entry->startblock,
251 __entry->blockcount,
252 __entry->state,
253 (void *)__entry->caller_ip)
254)
255
256#define DEFINE_BMAP_EVENT(name) \
257DEFINE_EVENT(xfs_bmap_class, name, \
258 TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \
259 unsigned long caller_ip), \
260 TP_ARGS(ip, idx, state, caller_ip))
261DEFINE_BMAP_EVENT(xfs_iext_remove);
262DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
263DEFINE_BMAP_EVENT(xfs_bmap_post_update);
264DEFINE_BMAP_EVENT(xfs_extlist);
265
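Each DEFINE_EVENT above expands to a tracepoint callable as trace_<name>() with the class prototype. A minimal sketch of a call site (the surrounding code is hypothetical; XFS passes _THIS_IP_ or _RET_IP_ so the event records where it fired):

	/* hypothetical call site for one of the bmap events above */
	trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);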
266DECLARE_EVENT_CLASS(xfs_buf_class,
267 TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
268 TP_ARGS(bp, caller_ip),
269 TP_STRUCT__entry(
270 __field(dev_t, dev)
271 __field(xfs_daddr_t, bno)
272 __field(size_t, buffer_length)
273 __field(int, hold)
274 __field(int, pincount)
275 __field(unsigned, lockval)
276 __field(unsigned, flags)
277 __field(unsigned long, caller_ip)
278 ),
279 TP_fast_assign(
280 __entry->dev = bp->b_target->bt_dev;
281 __entry->bno = bp->b_bn;
282 __entry->buffer_length = bp->b_buffer_length;
283 __entry->hold = atomic_read(&bp->b_hold);
284 __entry->pincount = atomic_read(&bp->b_pin_count);
285 __entry->lockval = xfs_buf_lock_value(bp);
286 __entry->flags = bp->b_flags;
287 __entry->caller_ip = caller_ip;
288 ),
289 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
290 "lock %d flags %s caller %pf",
291 MAJOR(__entry->dev), MINOR(__entry->dev),
292 (unsigned long long)__entry->bno,
293 __entry->buffer_length,
294 __entry->hold,
295 __entry->pincount,
296 __entry->lockval,
297 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
298 (void *)__entry->caller_ip)
299)
300
301#define DEFINE_BUF_EVENT(name) \
302DEFINE_EVENT(xfs_buf_class, name, \
303 TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
304 TP_ARGS(bp, caller_ip))
305DEFINE_BUF_EVENT(xfs_buf_init);
306DEFINE_BUF_EVENT(xfs_buf_free);
307DEFINE_BUF_EVENT(xfs_buf_hold);
308DEFINE_BUF_EVENT(xfs_buf_rele);
309DEFINE_BUF_EVENT(xfs_buf_pin);
310DEFINE_BUF_EVENT(xfs_buf_unpin);
311DEFINE_BUF_EVENT(xfs_buf_iodone);
312DEFINE_BUF_EVENT(xfs_buf_iorequest);
313DEFINE_BUF_EVENT(xfs_buf_bawrite);
314DEFINE_BUF_EVENT(xfs_buf_bdwrite);
315DEFINE_BUF_EVENT(xfs_buf_lock);
316DEFINE_BUF_EVENT(xfs_buf_lock_done);
317DEFINE_BUF_EVENT(xfs_buf_cond_lock);
318DEFINE_BUF_EVENT(xfs_buf_unlock);
319DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
320DEFINE_BUF_EVENT(xfs_buf_iowait);
321DEFINE_BUF_EVENT(xfs_buf_iowait_done);
322DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
323DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
324DEFINE_BUF_EVENT(xfs_buf_delwri_split);
325DEFINE_BUF_EVENT(xfs_buf_get_noaddr);
326DEFINE_BUF_EVENT(xfs_bdstrat_shut);
327DEFINE_BUF_EVENT(xfs_buf_item_relse);
328DEFINE_BUF_EVENT(xfs_buf_item_iodone);
329DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
330DEFINE_BUF_EVENT(xfs_buf_error_relse);
331DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
332DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
333
334/* not really buffer traces, but the buf provides useful information */
335DEFINE_BUF_EVENT(xfs_btree_corrupt);
336DEFINE_BUF_EVENT(xfs_da_btree_corrupt);
337DEFINE_BUF_EVENT(xfs_reset_dqcounts);
338DEFINE_BUF_EVENT(xfs_inode_item_push);
339
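As with the bmap events, each DEFINE_BUF_EVENT yields a trace_<name>(bp, caller) tracepoint; a sketch of an assumed call site in the buffer cache code:

	/* hypothetical: record a buffer reference being taken */
	trace_xfs_buf_hold(bp, _RET_IP_);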
340/* pass flags explicitly */
341DECLARE_EVENT_CLASS(xfs_buf_flags_class,
342 TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip),
343 TP_ARGS(bp, flags, caller_ip),
344 TP_STRUCT__entry(
345 __field(dev_t, dev)
346 __field(xfs_daddr_t, bno)
347 __field(size_t, buffer_length)
348 __field(int, hold)
349 __field(int, pincount)
350 __field(unsigned, lockval)
351 __field(unsigned, flags)
352 __field(unsigned long, caller_ip)
353 ),
354 TP_fast_assign(
355 __entry->dev = bp->b_target->bt_dev;
356 __entry->bno = bp->b_bn;
357 __entry->buffer_length = bp->b_buffer_length;
358 __entry->flags = flags;
359 __entry->hold = atomic_read(&bp->b_hold);
360 __entry->pincount = atomic_read(&bp->b_pin_count);
361 __entry->lockval = xfs_buf_lock_value(bp);
362 __entry->caller_ip = caller_ip;
363 ),
364 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
365 "lock %d flags %s caller %pf",
366 MAJOR(__entry->dev), MINOR(__entry->dev),
367 (unsigned long long)__entry->bno,
368 __entry->buffer_length,
369 __entry->hold,
370 __entry->pincount,
371 __entry->lockval,
372 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
373 (void *)__entry->caller_ip)
374)
375
376#define DEFINE_BUF_FLAGS_EVENT(name) \
377DEFINE_EVENT(xfs_buf_flags_class, name, \
378 TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
379 TP_ARGS(bp, flags, caller_ip))
380DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
381DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
382DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
383
384TRACE_EVENT(xfs_buf_ioerror,
385 TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip),
386 TP_ARGS(bp, error, caller_ip),
387 TP_STRUCT__entry(
388 __field(dev_t, dev)
389 __field(xfs_daddr_t, bno)
390 __field(size_t, buffer_length)
391 __field(unsigned, flags)
392 __field(int, hold)
393 __field(int, pincount)
394 __field(unsigned, lockval)
395 __field(int, error)
396 __field(unsigned long, caller_ip)
397 ),
398 TP_fast_assign(
399 __entry->dev = bp->b_target->bt_dev;
400 __entry->bno = bp->b_bn;
401 __entry->buffer_length = bp->b_buffer_length;
402 __entry->hold = atomic_read(&bp->b_hold);
403 __entry->pincount = atomic_read(&bp->b_pin_count);
404 __entry->lockval = xfs_buf_lock_value(bp);
405 __entry->error = error;
406 __entry->flags = bp->b_flags;
407 __entry->caller_ip = caller_ip;
408 ),
409 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
410 "lock %d error %d flags %s caller %pf",
411 MAJOR(__entry->dev), MINOR(__entry->dev),
412 (unsigned long long)__entry->bno,
413 __entry->buffer_length,
414 __entry->hold,
415 __entry->pincount,
416 __entry->lockval,
417 __entry->error,
418 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
419 (void *)__entry->caller_ip)
420);
421
422DECLARE_EVENT_CLASS(xfs_buf_item_class,
423 TP_PROTO(struct xfs_buf_log_item *bip),
424 TP_ARGS(bip),
425 TP_STRUCT__entry(
426 __field(dev_t, dev)
427 __field(xfs_daddr_t, buf_bno)
428 __field(size_t, buf_len)
429 __field(int, buf_hold)
430 __field(int, buf_pincount)
431 __field(int, buf_lockval)
432 __field(unsigned, buf_flags)
433 __field(unsigned, bli_recur)
434 __field(int, bli_refcount)
435 __field(unsigned, bli_flags)
436 __field(void *, li_desc)
437 __field(unsigned, li_flags)
438 ),
439 TP_fast_assign(
440 __entry->dev = bip->bli_buf->b_target->bt_dev;
441 __entry->bli_flags = bip->bli_flags;
442 __entry->bli_recur = bip->bli_recur;
443 __entry->bli_refcount = atomic_read(&bip->bli_refcount);
444 __entry->buf_bno = bip->bli_buf->b_bn;
445 __entry->buf_len = bip->bli_buf->b_buffer_length;
446 __entry->buf_flags = bip->bli_buf->b_flags;
447 __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
448 __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
449 __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf);
450 __entry->li_desc = bip->bli_item.li_desc;
451 __entry->li_flags = bip->bli_item.li_flags;
452 ),
453 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
454 "lock %d flags %s recur %d refcount %d bliflags %s "
455 "lidesc 0x%p liflags %s",
456 MAJOR(__entry->dev), MINOR(__entry->dev),
457 (unsigned long long)__entry->buf_bno,
458 __entry->buf_len,
459 __entry->buf_hold,
460 __entry->buf_pincount,
461 __entry->buf_lockval,
462 __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS),
463 __entry->bli_recur,
464 __entry->bli_refcount,
465 __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
466 __entry->li_desc,
467 __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
468)
469
470#define DEFINE_BUF_ITEM_EVENT(name) \
471DEFINE_EVENT(xfs_buf_item_class, name, \
472 TP_PROTO(struct xfs_buf_log_item *bip), \
473 TP_ARGS(bip))
474DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
475DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
476DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
477DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
478DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
479DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
480DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
481DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock);
482DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
483DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
484DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
485DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
487DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
488DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
489DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
490DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur);
491DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
492DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
493DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
494DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
495DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
496DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
497DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
498DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
499
500DECLARE_EVENT_CLASS(xfs_lock_class,
501 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
502 unsigned long caller_ip),
503 TP_ARGS(ip, lock_flags, caller_ip),
504 TP_STRUCT__entry(
505 __field(dev_t, dev)
506 __field(xfs_ino_t, ino)
507 __field(int, lock_flags)
508 __field(unsigned long, caller_ip)
509 ),
510 TP_fast_assign(
511 __entry->dev = VFS_I(ip)->i_sb->s_dev;
512 __entry->ino = ip->i_ino;
513 __entry->lock_flags = lock_flags;
514 __entry->caller_ip = caller_ip;
515 ),
516 TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf",
517 MAJOR(__entry->dev), MINOR(__entry->dev),
518 __entry->ino,
519 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
520 (void *)__entry->caller_ip)
521)
522
523#define DEFINE_LOCK_EVENT(name) \
524DEFINE_EVENT(xfs_lock_class, name, \
525 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \
526 unsigned long caller_ip), \
527 TP_ARGS(ip, lock_flags, caller_ip))
528DEFINE_LOCK_EVENT(xfs_ilock);
529DEFINE_LOCK_EVENT(xfs_ilock_nowait);
530DEFINE_LOCK_EVENT(xfs_ilock_demote);
531DEFINE_LOCK_EVENT(xfs_iunlock);
532
533DECLARE_EVENT_CLASS(xfs_iget_class,
534 TP_PROTO(struct xfs_inode *ip),
535 TP_ARGS(ip),
536 TP_STRUCT__entry(
537 __field(dev_t, dev)
538 __field(xfs_ino_t, ino)
539 ),
540 TP_fast_assign(
541 __entry->dev = VFS_I(ip)->i_sb->s_dev;
542 __entry->ino = ip->i_ino;
543 ),
544 TP_printk("dev %d:%d ino 0x%llx",
545 MAJOR(__entry->dev), MINOR(__entry->dev),
546 __entry->ino)
547)
548
549#define DEFINE_IGET_EVENT(name) \
550DEFINE_EVENT(xfs_iget_class, name, \
551 TP_PROTO(struct xfs_inode *ip), \
552 TP_ARGS(ip))
553DEFINE_IGET_EVENT(xfs_iget_skip);
554DEFINE_IGET_EVENT(xfs_iget_reclaim);
555DEFINE_IGET_EVENT(xfs_iget_found);
556DEFINE_IGET_EVENT(xfs_iget_alloc);
557
558DECLARE_EVENT_CLASS(xfs_inode_class,
559 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
560 TP_ARGS(ip, caller_ip),
561 TP_STRUCT__entry(
562 __field(dev_t, dev)
563 __field(xfs_ino_t, ino)
564 __field(int, count)
565 __field(unsigned long, caller_ip)
566 ),
567 TP_fast_assign(
568 __entry->dev = VFS_I(ip)->i_sb->s_dev;
569 __entry->ino = ip->i_ino;
570 __entry->count = atomic_read(&VFS_I(ip)->i_count);
571 __entry->caller_ip = caller_ip;
572 ),
573 TP_printk("dev %d:%d ino 0x%llx count %d caller %pf",
574 MAJOR(__entry->dev), MINOR(__entry->dev),
575 __entry->ino,
576 __entry->count,
577 (void *)__entry->caller_ip)
578)
579
580#define DEFINE_INODE_EVENT(name) \
581DEFINE_EVENT(xfs_inode_class, name, \
582 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
583 TP_ARGS(ip, caller_ip))
584DEFINE_INODE_EVENT(xfs_ihold);
585DEFINE_INODE_EVENT(xfs_irele);
586/* the old xfs_itrace_entry tracer - to be replaced by something in the VFS */
587DEFINE_INODE_EVENT(xfs_inode);
588#define xfs_itrace_entry(ip) \
589 trace_xfs_inode(ip, _THIS_IP_)
590
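The compatibility macro keeps the old call sites unchanged while routing them to the new tracepoint; an entry-point trace in an inode operation would read (illustrative):

	xfs_itrace_entry(ip);	/* expands to trace_xfs_inode(ip, _THIS_IP_) */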
591DECLARE_EVENT_CLASS(xfs_dquot_class,
592 TP_PROTO(struct xfs_dquot *dqp),
593 TP_ARGS(dqp),
594 TP_STRUCT__entry(
595 __field(dev_t, dev)
596 __field(u32, id)
597 __field(unsigned, flags)
598 __field(unsigned, nrefs)
599 __field(unsigned long long, res_bcount)
600 __field(unsigned long long, bcount)
601 __field(unsigned long long, icount)
602 __field(unsigned long long, blk_hardlimit)
603 __field(unsigned long long, blk_softlimit)
604 __field(unsigned long long, ino_hardlimit)
605 __field(unsigned long long, ino_softlimit)
606 ),
607 TP_fast_assign(
608 __entry->dev = dqp->q_mount->m_super->s_dev;
609 __entry->id = be32_to_cpu(dqp->q_core.d_id);
610 __entry->flags = dqp->dq_flags;
611 __entry->nrefs = dqp->q_nrefs;
612 __entry->res_bcount = dqp->q_res_bcount;
613 __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount);
614 __entry->icount = be64_to_cpu(dqp->q_core.d_icount);
615 __entry->blk_hardlimit =
616 be64_to_cpu(dqp->q_core.d_blk_hardlimit);
617 __entry->blk_softlimit =
618 be64_to_cpu(dqp->q_core.d_blk_softlimit);
619 __entry->ino_hardlimit =
620 be64_to_cpu(dqp->q_core.d_ino_hardlimit);
621 __entry->ino_softlimit =
622 be64_to_cpu(dqp->q_core.d_ino_softlimit);
623 ),
624 TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
625 "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
626 "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
627 MAJOR(__entry->dev), MINOR(__entry->dev),
628 __entry->id,
629 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
630 __entry->nrefs,
631 __entry->res_bcount,
632 __entry->bcount,
633 __entry->blk_hardlimit,
634 __entry->blk_softlimit,
635 __entry->icount,
636 __entry->ino_hardlimit,
637 __entry->ino_softlimit)
638)
639
640#define DEFINE_DQUOT_EVENT(name) \
641DEFINE_EVENT(xfs_dquot_class, name, \
642 TP_PROTO(struct xfs_dquot *dqp), \
643 TP_ARGS(dqp))
644DEFINE_DQUOT_EVENT(xfs_dqadjust);
645DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
646DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
647DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
648DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
649DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
650DEFINE_DQUOT_EVENT(xfs_dqattach_found);
651DEFINE_DQUOT_EVENT(xfs_dqattach_get);
652DEFINE_DQUOT_EVENT(xfs_dqinit);
653DEFINE_DQUOT_EVENT(xfs_dqreuse);
654DEFINE_DQUOT_EVENT(xfs_dqalloc);
655DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
656DEFINE_DQUOT_EVENT(xfs_dqread);
657DEFINE_DQUOT_EVENT(xfs_dqread_fail);
658DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
659DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
660DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
661DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
662DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
663DEFINE_DQUOT_EVENT(xfs_dqget_hit);
664DEFINE_DQUOT_EVENT(xfs_dqget_miss);
665DEFINE_DQUOT_EVENT(xfs_dqput);
666DEFINE_DQUOT_EVENT(xfs_dqput_wait);
667DEFINE_DQUOT_EVENT(xfs_dqput_free);
668DEFINE_DQUOT_EVENT(xfs_dqrele);
669DEFINE_DQUOT_EVENT(xfs_dqflush);
670DEFINE_DQUOT_EVENT(xfs_dqflush_force);
671DEFINE_DQUOT_EVENT(xfs_dqflush_done);
672/* not really iget events, but we re-use the format */
673DEFINE_IGET_EVENT(xfs_dquot_dqalloc);
674DEFINE_IGET_EVENT(xfs_dquot_dqdetach);
675
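Each DEFINE_DQUOT_EVENT becomes a one-argument tracepoint taking the dquot; the xfs_dquot.c hunks later in this patch convert the old string-keyed ktrace calls to exactly these, for example:

	trace_xfs_dqput(dqp);	/* replaces xfs_dqtrace_entry(dqp, "DQPUT") */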
676DECLARE_EVENT_CLASS(xfs_loggrant_class,
677 TP_PROTO(struct log *log, struct xlog_ticket *tic),
678 TP_ARGS(log, tic),
679 TP_STRUCT__entry(
680 __field(dev_t, dev)
681 __field(unsigned, trans_type)
682 __field(char, ocnt)
683 __field(char, cnt)
684 __field(int, curr_res)
685 __field(int, unit_res)
686 __field(unsigned int, flags)
687 __field(void *, reserve_headq)
688 __field(void *, write_headq)
689 __field(int, grant_reserve_cycle)
690 __field(int, grant_reserve_bytes)
691 __field(int, grant_write_cycle)
692 __field(int, grant_write_bytes)
693 __field(int, curr_cycle)
694 __field(int, curr_block)
695 __field(xfs_lsn_t, tail_lsn)
696 ),
697 TP_fast_assign(
698 __entry->dev = log->l_mp->m_super->s_dev;
699 __entry->trans_type = tic->t_trans_type;
700 __entry->ocnt = tic->t_ocnt;
701 __entry->cnt = tic->t_cnt;
702 __entry->curr_res = tic->t_curr_res;
703 __entry->unit_res = tic->t_unit_res;
704 __entry->flags = tic->t_flags;
705 __entry->reserve_headq = log->l_reserve_headq;
706 __entry->write_headq = log->l_write_headq;
707 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle;
708 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes;
709 __entry->grant_write_cycle = log->l_grant_write_cycle;
710 __entry->grant_write_bytes = log->l_grant_write_bytes;
711 __entry->curr_cycle = log->l_curr_cycle;
712 __entry->curr_block = log->l_curr_block;
713 __entry->tail_lsn = log->l_tail_lsn;
714 ),
715 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
716 "t_unit_res %u t_flags %s reserve_headq 0x%p "
717 "write_headq 0x%p grant_reserve_cycle %d "
718 "grant_reserve_bytes %d grant_write_cycle %d "
719 "grant_write_bytes %d curr_cycle %d curr_block %d "
720 "tail_cycle %d tail_block %d",
721 MAJOR(__entry->dev), MINOR(__entry->dev),
722 __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
723 __entry->ocnt,
724 __entry->cnt,
725 __entry->curr_res,
726 __entry->unit_res,
727 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
728 __entry->reserve_headq,
729 __entry->write_headq,
730 __entry->grant_reserve_cycle,
731 __entry->grant_reserve_bytes,
732 __entry->grant_write_cycle,
733 __entry->grant_write_bytes,
734 __entry->curr_cycle,
735 __entry->curr_block,
736 CYCLE_LSN(__entry->tail_lsn),
737 BLOCK_LSN(__entry->tail_lsn)
738 )
739)
740
741#define DEFINE_LOGGRANT_EVENT(name) \
742DEFINE_EVENT(xfs_loggrant_class, name, \
743 TP_PROTO(struct log *log, struct xlog_ticket *tic), \
744 TP_ARGS(log, tic))
745DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
746DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
747DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
748DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
749DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
750DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
751DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
752DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
753DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
754DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
755DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
756DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
757DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
758DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
759DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
760DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
761DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
762DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
763DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
764DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
765DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
766DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
767DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
768DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
769
770#define DEFINE_RW_EVENT(name) \
771TRACE_EVENT(name, \
772 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
773 TP_ARGS(ip, count, offset, flags), \
774 TP_STRUCT__entry( \
775 __field(dev_t, dev) \
776 __field(xfs_ino_t, ino) \
777 __field(xfs_fsize_t, size) \
778 __field(xfs_fsize_t, new_size) \
779 __field(loff_t, offset) \
780 __field(size_t, count) \
781 __field(int, flags) \
782 ), \
783 TP_fast_assign( \
784 __entry->dev = VFS_I(ip)->i_sb->s_dev; \
785 __entry->ino = ip->i_ino; \
786 __entry->size = ip->i_d.di_size; \
787 __entry->new_size = ip->i_new_size; \
788 __entry->offset = offset; \
789 __entry->count = count; \
790 __entry->flags = flags; \
791 ), \
792 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
793 "offset 0x%llx count 0x%zx ioflags %s", \
794 MAJOR(__entry->dev), MINOR(__entry->dev), \
795 __entry->ino, \
796 __entry->size, \
797 __entry->new_size, \
798 __entry->offset, \
799 __entry->count, \
800 __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
801)
802DEFINE_RW_EVENT(xfs_file_read);
803DEFINE_RW_EVENT(xfs_file_buffered_write);
804DEFINE_RW_EVENT(xfs_file_direct_write);
805DEFINE_RW_EVENT(xfs_file_splice_read);
806DEFINE_RW_EVENT(xfs_file_splice_write);
807
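The read/write events record the request size and offset next to the current and pending inode sizes; a sketch of a call site in the file I/O path (variable names assumed, not taken from this patch):

	trace_xfs_file_buffered_write(ip, count, offset, ioflags);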
808
809#define DEFINE_PAGE_EVENT(name) \
810TRACE_EVENT(name, \
811 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
812 TP_ARGS(inode, page, off), \
813 TP_STRUCT__entry( \
814 __field(dev_t, dev) \
815 __field(xfs_ino_t, ino) \
816 __field(pgoff_t, pgoff) \
817 __field(loff_t, size) \
818 __field(unsigned long, offset) \
819 __field(int, delalloc) \
820 __field(int, unmapped) \
821 __field(int, unwritten) \
822 ), \
823 TP_fast_assign( \
824 int delalloc = -1, unmapped = -1, unwritten = -1; \
825 \
826 if (page_has_buffers(page)) \
827 xfs_count_page_state(page, &delalloc, \
828 &unmapped, &unwritten); \
829 __entry->dev = inode->i_sb->s_dev; \
830 __entry->ino = XFS_I(inode)->i_ino; \
831 __entry->pgoff = page_offset(page); \
832 __entry->size = i_size_read(inode); \
833 __entry->offset = off; \
834 __entry->delalloc = delalloc; \
835 __entry->unmapped = unmapped; \
836 __entry->unwritten = unwritten; \
837 ), \
838 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \
839 "delalloc %d unmapped %d unwritten %d", \
840 MAJOR(__entry->dev), MINOR(__entry->dev), \
841 __entry->ino, \
842 __entry->pgoff, \
843 __entry->size, \
844 __entry->offset, \
845 __entry->delalloc, \
846 __entry->unmapped, \
847 __entry->unwritten) \
848)
849DEFINE_PAGE_EVENT(xfs_writepage);
850DEFINE_PAGE_EVENT(xfs_releasepage);
851DEFINE_PAGE_EVENT(xfs_invalidatepage);
852
853#define DEFINE_IOMAP_EVENT(name) \
854TRACE_EVENT(name, \
855 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
856 int flags, struct xfs_bmbt_irec *irec), \
857 TP_ARGS(ip, offset, count, flags, irec), \
858 TP_STRUCT__entry( \
859 __field(dev_t, dev) \
860 __field(xfs_ino_t, ino) \
861 __field(loff_t, size) \
862 __field(loff_t, new_size) \
863 __field(loff_t, offset) \
864 __field(size_t, count) \
865 __field(int, flags) \
866 __field(xfs_fileoff_t, startoff) \
867 __field(xfs_fsblock_t, startblock) \
868 __field(xfs_filblks_t, blockcount) \
869 ), \
870 TP_fast_assign( \
871 __entry->dev = VFS_I(ip)->i_sb->s_dev; \
872 __entry->ino = ip->i_ino; \
873 __entry->size = ip->i_d.di_size; \
874 __entry->new_size = ip->i_new_size; \
875 __entry->offset = offset; \
876 __entry->count = count; \
877 __entry->flags = flags; \
878 __entry->startoff = irec ? irec->br_startoff : 0; \
879 __entry->startblock = irec ? irec->br_startblock : 0; \
880 __entry->blockcount = irec ? irec->br_blockcount : 0; \
881 ), \
882 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
883 "offset 0x%llx count %zd flags %s " \
884 "startoff 0x%llx startblock %lld blockcount 0x%llx", \
885 MAJOR(__entry->dev), MINOR(__entry->dev), \
886 __entry->ino, \
887 __entry->size, \
888 __entry->new_size, \
889 __entry->offset, \
890 __entry->count, \
891 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
892 __entry->startoff, \
893 (__int64_t)__entry->startblock, \
894 __entry->blockcount) \
895)
896DEFINE_IOMAP_EVENT(xfs_iomap_enter);
897DEFINE_IOMAP_EVENT(xfs_iomap_found);
898DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
899
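The irec ? : 0 guards above let the iomap events accept a null mapping, so the same event works both before and after a mapping exists; an assumed call site:

	/* imap is a hypothetical struct xfs_bmbt_irec holding the result */
	trace_xfs_iomap_found(ip, offset, count, flags, &imap);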
900#define DEFINE_SIMPLE_IO_EVENT(name) \
901TRACE_EVENT(name, \
902 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
903 TP_ARGS(ip, offset, count), \
904 TP_STRUCT__entry( \
905 __field(dev_t, dev) \
906 __field(xfs_ino_t, ino) \
907 __field(loff_t, size) \
908 __field(loff_t, new_size) \
909 __field(loff_t, offset) \
910 __field(size_t, count) \
911 ), \
912 TP_fast_assign( \
913 __entry->dev = VFS_I(ip)->i_sb->s_dev; \
914 __entry->ino = ip->i_ino; \
915 __entry->size = ip->i_d.di_size; \
916 __entry->new_size = ip->i_new_size; \
917 __entry->offset = offset; \
918 __entry->count = count; \
919 ), \
920 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
921 "offset 0x%llx count %zd", \
922 MAJOR(__entry->dev), MINOR(__entry->dev), \
923 __entry->ino, \
924 __entry->size, \
925 __entry->new_size, \
926 __entry->offset, \
927 __entry->count) \
928)
929DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
930DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
931
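The simple I/O form drops the flags and mapping for paths that have neither to report; illustrative use:

	trace_xfs_unwritten_convert(ip, offset, count);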
932
933TRACE_EVENT(xfs_itruncate_start,
934 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size, int flag,
935 xfs_off_t toss_start, xfs_off_t toss_finish),
936 TP_ARGS(ip, new_size, flag, toss_start, toss_finish),
937 TP_STRUCT__entry(
938 __field(dev_t, dev)
939 __field(xfs_ino_t, ino)
940 __field(xfs_fsize_t, size)
941 __field(xfs_fsize_t, new_size)
942 __field(xfs_off_t, toss_start)
943 __field(xfs_off_t, toss_finish)
944 __field(int, flag)
945 ),
946 TP_fast_assign(
947 __entry->dev = VFS_I(ip)->i_sb->s_dev;
948 __entry->ino = ip->i_ino;
949 __entry->size = ip->i_d.di_size;
950 __entry->new_size = new_size;
951 __entry->toss_start = toss_start;
952 __entry->toss_finish = toss_finish;
953 __entry->flag = flag;
954 ),
955 TP_printk("dev %d:%d ino 0x%llx %s size 0x%llx new_size 0x%llx "
956 "toss start 0x%llx toss finish 0x%llx",
957 MAJOR(__entry->dev), MINOR(__entry->dev),
958 __entry->ino,
959 __print_flags(__entry->flag, "|", XFS_ITRUNC_FLAGS),
960 __entry->size,
961 __entry->new_size,
962 __entry->toss_start,
963 __entry->toss_finish)
964);
965
966DECLARE_EVENT_CLASS(xfs_itrunc_class,
967 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
968 TP_ARGS(ip, new_size),
969 TP_STRUCT__entry(
970 __field(dev_t, dev)
971 __field(xfs_ino_t, ino)
972 __field(xfs_fsize_t, size)
973 __field(xfs_fsize_t, new_size)
974 ),
975 TP_fast_assign(
976 __entry->dev = VFS_I(ip)->i_sb->s_dev;
977 __entry->ino = ip->i_ino;
978 __entry->size = ip->i_d.di_size;
979 __entry->new_size = new_size;
980 ),
981 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx",
982 MAJOR(__entry->dev), MINOR(__entry->dev),
983 __entry->ino,
984 __entry->size,
985 __entry->new_size)
986)
987
988#define DEFINE_ITRUNC_EVENT(name) \
989DEFINE_EVENT(xfs_itrunc_class, name, \
990 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
991 TP_ARGS(ip, new_size))
992DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start);
993DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end);
994
995TRACE_EVENT(xfs_pagecache_inval,
996 TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
997 TP_ARGS(ip, start, finish),
998 TP_STRUCT__entry(
999 __field(dev_t, dev)
1000 __field(xfs_ino_t, ino)
1001 __field(xfs_fsize_t, size)
1002 __field(xfs_off_t, start)
1003 __field(xfs_off_t, finish)
1004 ),
1005 TP_fast_assign(
1006 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1007 __entry->ino = ip->i_ino;
1008 __entry->size = ip->i_d.di_size;
1009 __entry->start = start;
1010 __entry->finish = finish;
1011 ),
1012 TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx",
1013 MAJOR(__entry->dev), MINOR(__entry->dev),
1014 __entry->ino,
1015 __entry->size,
1016 __entry->start,
1017 __entry->finish)
1018);
1019
1020TRACE_EVENT(xfs_bunmap,
1021 TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len,
1022 int flags, unsigned long caller_ip),
1023 TP_ARGS(ip, bno, len, flags, caller_ip),
1024 TP_STRUCT__entry(
1025 __field(dev_t, dev)
1026 __field(xfs_ino_t, ino)
1027 __field(xfs_fsize_t, size)
1028 __field(xfs_fileoff_t, bno)
1029 __field(xfs_filblks_t, len)
1030 __field(unsigned long, caller_ip)
1031 __field(int, flags)
1032 ),
1033 TP_fast_assign(
1034 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1035 __entry->ino = ip->i_ino;
1036 __entry->size = ip->i_d.di_size;
1037 __entry->bno = bno;
1038 __entry->len = len;
1039 __entry->caller_ip = caller_ip;
1040 __entry->flags = flags;
1041 ),
1042 TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx "
1043 "flags %s caller %pf",
1044 MAJOR(__entry->dev), MINOR(__entry->dev),
1045 __entry->ino,
1046 __entry->size,
1047 __entry->bno,
1048 __entry->len,
1049 __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS),
1050 (void *)__entry->caller_ip)
1051
1052);
1053
1054TRACE_EVENT(xfs_alloc_busy,
1055 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
1056 xfs_extlen_t len, int slot),
1057 TP_ARGS(mp, agno, agbno, len, slot),
1058 TP_STRUCT__entry(
1059 __field(dev_t, dev)
1060 __field(xfs_agnumber_t, agno)
1061 __field(xfs_agblock_t, agbno)
1062 __field(xfs_extlen_t, len)
1063 __field(int, slot)
1064 ),
1065 TP_fast_assign(
1066 __entry->dev = mp->m_super->s_dev;
1067 __entry->agno = agno;
1068 __entry->agbno = agbno;
1069 __entry->len = len;
1070 __entry->slot = slot;
1071 ),
1072 TP_printk("dev %d:%d agno %u agbno %u len %u slot %d",
1073 MAJOR(__entry->dev), MINOR(__entry->dev),
1074 __entry->agno,
1075 __entry->agbno,
1076 __entry->len,
1077 __entry->slot)
1078
1079);
1080
1081#define XFS_BUSY_STATES \
1082 { 0, "found" }, \
1083 { 1, "missing" }
1084
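__print_symbolic() resolves the integer through this table when the trace buffer is read, so the final printk argument below renders as "found" or "missing" rather than 0 or 1, e.g. (hypothetical output):

	xfs_alloc_unbusy: dev 253:0 agno 3 slot 2 missing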
1085TRACE_EVENT(xfs_alloc_unbusy,
1086 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1087 int slot, int found),
1088 TP_ARGS(mp, agno, slot, found),
1089 TP_STRUCT__entry(
1090 __field(dev_t, dev)
1091 __field(xfs_agnumber_t, agno)
1092 __field(int, slot)
1093 __field(int, found)
1094 ),
1095 TP_fast_assign(
1096 __entry->dev = mp->m_super->s_dev;
1097 __entry->agno = agno;
1098 __entry->slot = slot;
1099 __entry->found = found;
1100 ),
1101 TP_printk("dev %d:%d agno %u slot %d %s",
1102 MAJOR(__entry->dev), MINOR(__entry->dev),
1103 __entry->agno,
1104 __entry->slot,
1105 __print_symbolic(__entry->found, XFS_BUSY_STATES))
1106);
1107
1108TRACE_EVENT(xfs_alloc_busysearch,
1109 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
1110 xfs_extlen_t len, xfs_lsn_t lsn),
1111 TP_ARGS(mp, agno, agbno, len, lsn),
1112 TP_STRUCT__entry(
1113 __field(dev_t, dev)
1114 __field(xfs_agnumber_t, agno)
1115 __field(xfs_agblock_t, agbno)
1116 __field(xfs_extlen_t, len)
1117 __field(xfs_lsn_t, lsn)
1118 ),
1119 TP_fast_assign(
1120 __entry->dev = mp->m_super->s_dev;
1121 __entry->agno = agno;
1122 __entry->agbno = agbno;
1123 __entry->len = len;
1124 __entry->lsn = lsn;
1125 ),
1126 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
1127 MAJOR(__entry->dev), MINOR(__entry->dev),
1128 __entry->agno,
1129 __entry->agbno,
1130 __entry->len,
1131 __entry->lsn)
1132);
1133
1134TRACE_EVENT(xfs_agf,
1135 TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags,
1136 unsigned long caller_ip),
1137 TP_ARGS(mp, agf, flags, caller_ip),
1138 TP_STRUCT__entry(
1139 __field(dev_t, dev)
1140 __field(xfs_agnumber_t, agno)
1141 __field(int, flags)
1142 __field(__u32, length)
1143 __field(__u32, bno_root)
1144 __field(__u32, cnt_root)
1145 __field(__u32, bno_level)
1146 __field(__u32, cnt_level)
1147 __field(__u32, flfirst)
1148 __field(__u32, fllast)
1149 __field(__u32, flcount)
1150 __field(__u32, freeblks)
1151 __field(__u32, longest)
1152 __field(unsigned long, caller_ip)
1153 ),
1154 TP_fast_assign(
1155 __entry->dev = mp->m_super->s_dev;
1156 __entry->agno = be32_to_cpu(agf->agf_seqno);
1157 __entry->flags = flags;
1158 __entry->length = be32_to_cpu(agf->agf_length);
1159 __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
1160 __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
1161 __entry->bno_level =
1162 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
1163 __entry->cnt_level =
1164 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
1165 __entry->flfirst = be32_to_cpu(agf->agf_flfirst);
1166 __entry->fllast = be32_to_cpu(agf->agf_fllast);
1167 __entry->flcount = be32_to_cpu(agf->agf_flcount);
1168 __entry->freeblks = be32_to_cpu(agf->agf_freeblks);
1169 __entry->longest = be32_to_cpu(agf->agf_longest);
1170 __entry->caller_ip = caller_ip;
1171 ),
1172 TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
1173 "levels b %u c %u flfirst %u fllast %u flcount %u "
1174 "freeblks %u longest %u caller %pf",
1175 MAJOR(__entry->dev), MINOR(__entry->dev),
1176 __entry->agno,
1177 __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
1178 __entry->length,
1179 __entry->bno_root,
1180 __entry->cnt_root,
1181 __entry->bno_level,
1182 __entry->cnt_level,
1183 __entry->flfirst,
1184 __entry->fllast,
1185 __entry->flcount,
1186 __entry->freeblks,
1187 __entry->longest,
1188 (void *)__entry->caller_ip)
1189);
1190
1191TRACE_EVENT(xfs_free_extent,
1192 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
1193 xfs_extlen_t len, bool isfl, int haveleft, int haveright),
1194 TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
1195 TP_STRUCT__entry(
1196 __field(dev_t, dev)
1197 __field(xfs_agnumber_t, agno)
1198 __field(xfs_agblock_t, agbno)
1199 __field(xfs_extlen_t, len)
1200 __field(int, isfl)
1201 __field(int, haveleft)
1202 __field(int, haveright)
1203 ),
1204 TP_fast_assign(
1205 __entry->dev = mp->m_super->s_dev;
1206 __entry->agno = agno;
1207 __entry->agbno = agbno;
1208 __entry->len = len;
1209 __entry->isfl = isfl;
1210 __entry->haveleft = haveleft;
1211 __entry->haveright = haveright;
1212 ),
1213 TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
1214 MAJOR(__entry->dev), MINOR(__entry->dev),
1215 __entry->agno,
1216 __entry->agbno,
1217 __entry->len,
1218 __entry->isfl,
1219 __entry->haveleft ?
1220 (__entry->haveright ? "both" : "left") :
1221 (__entry->haveright ? "right" : "none"))
1222
1223);
1224
1225DECLARE_EVENT_CLASS(xfs_alloc_class,
1226 TP_PROTO(struct xfs_alloc_arg *args),
1227 TP_ARGS(args),
1228 TP_STRUCT__entry(
1229 __field(dev_t, dev)
1230 __field(xfs_agnumber_t, agno)
1231 __field(xfs_agblock_t, agbno)
1232 __field(xfs_extlen_t, minlen)
1233 __field(xfs_extlen_t, maxlen)
1234 __field(xfs_extlen_t, mod)
1235 __field(xfs_extlen_t, prod)
1236 __field(xfs_extlen_t, minleft)
1237 __field(xfs_extlen_t, total)
1238 __field(xfs_extlen_t, alignment)
1239 __field(xfs_extlen_t, minalignslop)
1240 __field(xfs_extlen_t, len)
1241 __field(short, type)
1242 __field(short, otype)
1243 __field(char, wasdel)
1244 __field(char, wasfromfl)
1245 __field(char, isfl)
1246 __field(char, userdata)
1247 __field(xfs_fsblock_t, firstblock)
1248 ),
1249 TP_fast_assign(
1250 __entry->dev = args->mp->m_super->s_dev;
1251 __entry->agno = args->agno;
1252 __entry->agbno = args->agbno;
1253 __entry->minlen = args->minlen;
1254 __entry->maxlen = args->maxlen;
1255 __entry->mod = args->mod;
1256 __entry->prod = args->prod;
1257 __entry->minleft = args->minleft;
1258 __entry->total = args->total;
1259 __entry->alignment = args->alignment;
1260 __entry->minalignslop = args->minalignslop;
1261 __entry->len = args->len;
1262 __entry->type = args->type;
1263 __entry->otype = args->otype;
1264 __entry->wasdel = args->wasdel;
1265 __entry->wasfromfl = args->wasfromfl;
1266 __entry->isfl = args->isfl;
1267 __entry->userdata = args->userdata;
1268 __entry->firstblock = args->firstblock;
1269 ),
1270 TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
1271 "prod %u minleft %u total %u alignment %u minalignslop %u "
1272 "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
1273 "userdata %d firstblock 0x%llx",
1274 MAJOR(__entry->dev), MINOR(__entry->dev),
1275 __entry->agno,
1276 __entry->agbno,
1277 __entry->minlen,
1278 __entry->maxlen,
1279 __entry->mod,
1280 __entry->prod,
1281 __entry->minleft,
1282 __entry->total,
1283 __entry->alignment,
1284 __entry->minalignslop,
1285 __entry->len,
1286 __print_symbolic(__entry->type, XFS_ALLOC_TYPES),
1287 __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
1288 __entry->wasdel,
1289 __entry->wasfromfl,
1290 __entry->isfl,
1291 __entry->userdata,
1292 __entry->firstblock)
1293)
1294
1295#define DEFINE_ALLOC_EVENT(name) \
1296DEFINE_EVENT(xfs_alloc_class, name, \
1297 TP_PROTO(struct xfs_alloc_arg *args), \
1298 TP_ARGS(args))
1299DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1300DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1301DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1302DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
1303DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
1304DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
1305DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
1306DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
1307DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
1308DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
1309DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
1310DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
1311DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
1312DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
1313DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
1314DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
1315DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
1316DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
1317DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
1318DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
1319DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
1320
1321DECLARE_EVENT_CLASS(xfs_dir2_class,
1322 TP_PROTO(struct xfs_da_args *args),
1323 TP_ARGS(args),
1324 TP_STRUCT__entry(
1325 __field(dev_t, dev)
1326 __field(xfs_ino_t, ino)
1327 __dynamic_array(char, name, args->namelen)
1328 __field(int, namelen)
1329 __field(xfs_dahash_t, hashval)
1330 __field(xfs_ino_t, inumber)
1331 __field(int, op_flags)
1332 ),
1333 TP_fast_assign(
1334 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1335 __entry->ino = args->dp->i_ino;
1336 if (args->namelen)
1337 memcpy(__get_str(name), args->name, args->namelen);
1338 __entry->namelen = args->namelen;
1339 __entry->hashval = args->hashval;
1340 __entry->inumber = args->inumber;
1341 __entry->op_flags = args->op_flags;
1342 ),
1343 TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
1344 "inumber 0x%llx op_flags %s",
1345 MAJOR(__entry->dev), MINOR(__entry->dev),
1346 __entry->ino,
1347 __entry->namelen,
1348 __entry->namelen ? __get_str(name) : NULL,
1349 __entry->namelen,
1350 __entry->hashval,
1351 __entry->inumber,
1352 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
1353)
1354
1355#define DEFINE_DIR2_EVENT(name) \
1356DEFINE_EVENT(xfs_dir2_class, name, \
1357 TP_PROTO(struct xfs_da_args *args), \
1358 TP_ARGS(args))
1359DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
1360DEFINE_DIR2_EVENT(xfs_dir2_sf_create);
1361DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup);
1362DEFINE_DIR2_EVENT(xfs_dir2_sf_replace);
1363DEFINE_DIR2_EVENT(xfs_dir2_sf_removename);
1364DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4);
1365DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8);
1366DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block);
1367DEFINE_DIR2_EVENT(xfs_dir2_block_addname);
1368DEFINE_DIR2_EVENT(xfs_dir2_block_lookup);
1369DEFINE_DIR2_EVENT(xfs_dir2_block_replace);
1370DEFINE_DIR2_EVENT(xfs_dir2_block_removename);
1371DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf);
1372DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf);
1373DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname);
1374DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup);
1375DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace);
1376DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename);
1377DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block);
1378DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node);
1379DEFINE_DIR2_EVENT(xfs_dir2_node_addname);
1380DEFINE_DIR2_EVENT(xfs_dir2_node_lookup);
1381DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
1382DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
1383DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
1384
1385DECLARE_EVENT_CLASS(xfs_dir2_space_class,
1386 TP_PROTO(struct xfs_da_args *args, int idx),
1387 TP_ARGS(args, idx),
1388 TP_STRUCT__entry(
1389 __field(dev_t, dev)
1390 __field(xfs_ino_t, ino)
1391 __field(int, op_flags)
1392 __field(int, idx)
1393 ),
1394 TP_fast_assign(
1395 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1396 __entry->ino = args->dp->i_ino;
1397 __entry->op_flags = args->op_flags;
1398 __entry->idx = idx;
1399 ),
1400 TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d",
1401 MAJOR(__entry->dev), MINOR(__entry->dev),
1402 __entry->ino,
1403 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
1404 __entry->idx)
1405)
1406
1407#define DEFINE_DIR2_SPACE_EVENT(name) \
1408DEFINE_EVENT(xfs_dir2_space_class, name, \
1409 TP_PROTO(struct xfs_da_args *args, int idx), \
1410 TP_ARGS(args, idx))
1411DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add);
1412DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove);
1413DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode);
1414DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode);
1415
1416TRACE_EVENT(xfs_dir2_leafn_moveents,
1417 TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count),
1418 TP_ARGS(args, src_idx, dst_idx, count),
1419 TP_STRUCT__entry(
1420 __field(dev_t, dev)
1421 __field(xfs_ino_t, ino)
1422 __field(int, op_flags)
1423 __field(int, src_idx)
1424 __field(int, dst_idx)
1425 __field(int, count)
1426 ),
1427 TP_fast_assign(
1428 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1429 __entry->ino = args->dp->i_ino;
1430 __entry->op_flags = args->op_flags;
1431 __entry->src_idx = src_idx;
1432 __entry->dst_idx = dst_idx;
1433 __entry->count = count;
1434 ),
1435 TP_printk("dev %d:%d ino 0x%llx op_flags %s "
1436 "src_idx %d dst_idx %d count %d",
1437 MAJOR(__entry->dev), MINOR(__entry->dev),
1438 __entry->ino,
1439 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
1440 __entry->src_idx,
1441 __entry->dst_idx,
1442 __entry->count)
1443);
1444
1445#define XFS_SWAPEXT_INODES \
1446 { 0, "target" }, \
1447 { 1, "temp" }
1448
1449#define XFS_INODE_FORMAT_STR \
1450 { 0, "invalid" }, \
1451 { 1, "local" }, \
1452 { 2, "extent" }, \
1453 { 3, "btree" }
1454
1455DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1456 TP_PROTO(struct xfs_inode *ip, int which),
1457 TP_ARGS(ip, which),
1458 TP_STRUCT__entry(
1459 __field(dev_t, dev)
1460 __field(int, which)
1461 __field(xfs_ino_t, ino)
1462 __field(int, format)
1463 __field(int, nex)
1464 __field(int, max_nex)
1465 __field(int, broot_size)
1466 __field(int, fork_off)
1467 ),
1468 TP_fast_assign(
1469 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1470 __entry->which = which;
1471 __entry->ino = ip->i_ino;
1472 __entry->format = ip->i_d.di_format;
1473 __entry->nex = ip->i_d.di_nextents;
1474 __entry->max_nex = ip->i_df.if_ext_max;
1475 __entry->broot_size = ip->i_df.if_broot_bytes;
1476 __entry->fork_off = XFS_IFORK_BOFF(ip);
1477 ),
1478 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
1479 "Max in-fork extents %d, broot size %d, fork offset %d",
1480 MAJOR(__entry->dev), MINOR(__entry->dev),
1481 __entry->ino,
1482 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
1483 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
1484 __entry->nex,
1485 __entry->max_nex,
1486 __entry->broot_size,
1487 __entry->fork_off)
1488)
1489
1490#define DEFINE_SWAPEXT_EVENT(name) \
1491DEFINE_EVENT(xfs_swap_extent_class, name, \
1492 TP_PROTO(struct xfs_inode *ip, int which), \
1493 TP_ARGS(ip, which))
1494
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497
1498#endif /* _TRACE_XFS_H */
1499
1500#undef TRACE_INCLUDE_PATH
1501#define TRACE_INCLUDE_PATH .
1502#define TRACE_INCLUDE_FILE xfs_trace
1503#include <trace/define_trace.h>
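This trailer follows the usual ftrace self-include pattern: every compilation unit that includes xfs_trace.h sees only the event declarations, while exactly one .c file defines CREATE_TRACE_POINTS first so that define_trace.h emits the event bodies. A minimal sketch of that one file, assuming the standard convention (contents illustrative):

	/* xfs_trace.c - instantiate the tracepoints declared in xfs_trace.h */
	#include "xfs.h"	/* assumed: pulls in the XFS types the events use */
	#define CREATE_TRACE_POINTS
	#include "xfs_trace.h"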
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index ad7fbead4c97..7c220b4227bc 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -36,10 +36,13 @@ struct attrlist_cursor_kern;
 /*
  * Flags for read/write calls - same values as IRIX
  */
-#define IO_ISAIO	0x00001	/* don't wait for completion */
 #define IO_ISDIRECT	0x00004	/* bypass page cache */
 #define IO_INVIS	0x00020	/* don't update inode timestamps */
 
+#define XFS_IO_FLAGS \
+	{ IO_ISDIRECT,		"DIRECT" }, \
+	{ IO_INVIS,		"INVIS"}
+
 /*
  * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
  */
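XFS_IO_FLAGS is the table consumed by __print_flags() in the DEFINE_RW_EVENT printk earlier, so a flags value of IO_ISDIRECT | IO_INVIS renders as "DIRECT|INVIS" in the trace output:

	/* illustrative: __print_flags(IO_ISDIRECT | IO_INVIS, "|", XFS_IO_FLAGS) -> "DIRECT|INVIS" */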
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 497c7fb75cc1..fa01b9daba6b 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -30,10 +30,10 @@
 
 
 static int
-__xfs_xattr_get(struct inode *inode, const char *name,
+xfs_xattr_get(struct dentry *dentry, const char *name,
 		void *value, size_t size, int xflags)
 {
-	struct xfs_inode *ip = XFS_I(inode);
+	struct xfs_inode *ip = XFS_I(dentry->d_inode);
 	int error, asize = size;
 
 	if (strcmp(name, "") == 0)
@@ -45,17 +45,17 @@ __xfs_xattr_get(struct inode *inode, const char *name,
 		value = NULL;
 	}
 
-	error = -xfs_attr_get(ip, name, value, &asize, xflags);
+	error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
 	if (error)
 		return error;
 	return asize;
 }
 
 static int
-__xfs_xattr_set(struct inode *inode, const char *name, const void *value,
+xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
 		size_t size, int flags, int xflags)
 {
-	struct xfs_inode *ip = XFS_I(inode);
+	struct xfs_inode *ip = XFS_I(dentry->d_inode);
 
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
@@ -67,79 +67,39 @@ __xfs_xattr_set(struct inode *inode, const char *name, const void *value,
 	xflags |= ATTR_REPLACE;
 
 	if (!value)
-		return -xfs_attr_remove(ip, name, xflags);
-	return -xfs_attr_set(ip, name, (void *)value, size, xflags);
-}
-
-static int
-xfs_xattr_user_get(struct inode *inode, const char *name,
-		void *value, size_t size)
-{
-	return __xfs_xattr_get(inode, name, value, size, 0);
-}
-
-static int
-xfs_xattr_user_set(struct inode *inode, const char *name,
-		const void *value, size_t size, int flags)
-{
-	return __xfs_xattr_set(inode, name, value, size, flags, 0);
+		return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
+	return -xfs_attr_set(ip, (unsigned char *)name,
+				(void *)value, size, xflags);
 }
 
 static struct xattr_handler xfs_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
-	.get	= xfs_xattr_user_get,
-	.set	= xfs_xattr_user_set,
+	.flags	= 0, /* no flags implies user namespace */
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
 };
 
-
-static int
-xfs_xattr_trusted_get(struct inode *inode, const char *name,
-		void *value, size_t size)
-{
-	return __xfs_xattr_get(inode, name, value, size, ATTR_ROOT);
-}
-
-static int
-xfs_xattr_trusted_set(struct inode *inode, const char *name,
-		const void *value, size_t size, int flags)
-{
-	return __xfs_xattr_set(inode, name, value, size, flags, ATTR_ROOT);
-}
-
 static struct xattr_handler xfs_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
-	.get	= xfs_xattr_trusted_get,
-	.set	= xfs_xattr_trusted_set,
+	.flags	= ATTR_ROOT,
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
 };
 
-
-static int
-xfs_xattr_secure_get(struct inode *inode, const char *name,
-		void *value, size_t size)
-{
-	return __xfs_xattr_get(inode, name, value, size, ATTR_SECURE);
-}
-
-static int
-xfs_xattr_secure_set(struct inode *inode, const char *name,
-		const void *value, size_t size, int flags)
-{
-	return __xfs_xattr_set(inode, name, value, size, flags, ATTR_SECURE);
-}
-
 static struct xattr_handler xfs_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
-	.get	= xfs_xattr_secure_get,
-	.set	= xfs_xattr_secure_set,
+	.flags	= ATTR_SECURE,
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
 };
 
-
 struct xattr_handler *xfs_xattr_handlers[] = {
 	&xfs_xattr_user_handler,
 	&xfs_xattr_trusted_handler,
 	&xfs_xattr_security_handler,
 #ifdef CONFIG_XFS_POSIX_ACL
-	&xfs_xattr_system_handler,
+	&xfs_xattr_acl_access_handler,
+	&xfs_xattr_acl_default_handler,
 #endif
 	NULL
 };
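The three per-namespace wrapper pairs collapse into one get/set pair because the VFS now hands the handler's own flags back in on each call; a simplified sketch of the dispatch this relies on (illustrative, not part of this patch):

	/* simplified from the generic xattr lookup code */
	error = handler->get(dentry, name, value, size, handler->flags);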
@@ -165,8 +125,13 @@ static const char *xfs_xattr_prefix(int flags)
 }
 
 static int
-xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
-		char *name, int namelen, int valuelen, char *value)
+xfs_xattr_put_listent(
+	struct xfs_attr_list_context *context,
+	int flags,
+	unsigned char *name,
+	int namelen,
+	int valuelen,
+	unsigned char *value)
 {
 	unsigned int prefix_len = xfs_xattr_prefix_len(flags);
 	char *offset;
@@ -189,7 +154,7 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
 	offset = (char *)context->alist + context->count;
 	strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
 	offset += prefix_len;
-	strncpy(offset, name, namelen);			/* real name */
+	strncpy(offset, (char *)name, namelen);		/* real name */
 	offset += namelen;
 	*offset = '\0';
 	context->count += prefix_len + namelen + 1;
@@ -197,8 +162,13 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
 }
 
 static int
-xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags,
-		char *name, int namelen, int valuelen, char *value)
+xfs_xattr_put_listent_sizes(
+	struct xfs_attr_list_context *context,
+	int flags,
+	unsigned char *name,
+	int namelen,
+	int valuelen,
+	unsigned char *value)
 {
 	context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
 	return 0;
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 2f3f2229eaaf..5f79dd78626b 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -47,6 +47,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_trans_priv.h"
 #include "xfs_qm.h"
+#include "xfs_trace.h"
 
 
 /*
@@ -112,10 +113,7 @@ xfs_qm_dqinit(
 		init_completion(&dqp->q_flush);
 		complete(&dqp->q_flush);
 
-#ifdef XFS_DQUOT_TRACE
-		dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS);
-		xfs_dqtrace_entry(dqp, "DQINIT");
-#endif
+		trace_xfs_dqinit(dqp);
 	} else {
 		/*
 		 * Only the q_core portion was zeroed in dqreclaim_one().
@@ -136,10 +134,7 @@ xfs_qm_dqinit(
 		dqp->q_hash = NULL;
 		ASSERT(dqp->dq_flnext == dqp->dq_flprev);
 
-#ifdef XFS_DQUOT_TRACE
-		ASSERT(dqp->q_trace);
-		xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
-#endif
+		trace_xfs_dqreuse(dqp);
 	}
 
 	/*
@@ -167,13 +162,8 @@ xfs_qm_dqdestroy(
 
 	mutex_destroy(&dqp->q_qlock);
 	sv_destroy(&dqp->q_pinwait);
-
-#ifdef XFS_DQUOT_TRACE
-	if (dqp->q_trace)
-		ktrace_free(dqp->q_trace);
-	dqp->q_trace = NULL;
-#endif
 	kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+
 	atomic_dec(&xfs_Gqm->qm_totaldquots);
 }
 
@@ -195,49 +185,6 @@ xfs_qm_dqinit_core(
 	d->dd_diskdq.d_flags = type;
 }
 
-
-#ifdef XFS_DQUOT_TRACE
-/*
- * Dquot tracing for debugging.
- */
-/* ARGSUSED */
-void
-__xfs_dqtrace_entry(
-	xfs_dquot_t	*dqp,
-	char		*func,
-	void		*retaddr,
-	xfs_inode_t	*ip)
-{
-	xfs_dquot_t	*udqp = NULL;
-	xfs_ino_t	ino = 0;
-
-	ASSERT(dqp->q_trace);
-	if (ip) {
-		ino = ip->i_ino;
-		udqp = ip->i_udquot;
-	}
-	ktrace_enter(dqp->q_trace,
-		     (void *)(__psint_t)DQUOT_KTRACE_ENTRY,
-		     (void *)func,
-		     (void *)(__psint_t)dqp->q_nrefs,
-		     (void *)(__psint_t)dqp->dq_flags,
-		     (void *)(__psint_t)dqp->q_res_bcount,
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_bcount),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_icount),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_hardlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_softlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_hardlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_softlimit),
-		     (void *)(__psint_t)be32_to_cpu(dqp->q_core.d_id),
-		     (void *)(__psint_t)current_pid(),
-		     (void *)(__psint_t)ino,
-		     (void *)(__psint_t)retaddr,
-		     (void *)(__psint_t)udqp);
-	return;
-}
-#endif
-
-
 /*
  * If default limits are in force, push them into the dquot now.
  * We overwrite the dquot limits only if they are zero and this
@@ -425,7 +372,8 @@ xfs_qm_dqalloc(
 	xfs_trans_t	*tp = *tpp;
 
 	ASSERT(tp != NULL);
-	xfs_dqtrace_entry(dqp, "DQALLOC");
+
+	trace_xfs_dqalloc(dqp);
 
 	/*
 	 * Initialize the bmap freelist prior to calling bmapi code.
@@ -612,7 +560,8 @@ xfs_qm_dqtobp(
 	 * (in which case we already have the buf).
 	 */
 	if (! newdquot) {
-		xfs_dqtrace_entry(dqp, "DQTOBP READBUF");
+		trace_xfs_dqtobp_read(dqp);
+
 		if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 					       dqp->q_blkno,
 					       XFS_QI_DQCHUNKLEN(mp),
@@ -670,11 +619,12 @@ xfs_qm_dqread(
 
 	ASSERT(tpp);
 
+	trace_xfs_dqread(dqp);
+
 	/*
 	 * get a pointer to the on-disk dquot and the buffer containing it
 	 * dqp already knows its own type (GROUP/USER).
 	 */
-	xfs_dqtrace_entry(dqp, "DQREAD");
 	if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
 		return (error);
 	}
@@ -763,7 +713,7 @@ xfs_qm_idtodq(
 	 * or if the dquot didn't exist on disk and we ask to
 	 * allocate (ENOENT).
 	 */
-	xfs_dqtrace_entry(dqp, "DQREAD FAIL");
+	trace_xfs_dqread_fail(dqp);
 	cancelflags |= XFS_TRANS_ABORT;
 	goto error0;
 	}
@@ -817,7 +767,8 @@ xfs_qm_dqlookup(
 	 * id can't be modified without the hashlock anyway.
 	 */
 	if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) {
-		xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP");
+		trace_xfs_dqlookup_found(dqp);
+
 		/*
 		 * All in core dquots must be on the dqlist of mp
 		 */
@@ -827,7 +778,7 @@ xfs_qm_dqlookup(
 		if (dqp->q_nrefs == 0) {
 			ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
 			if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
-				xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT");
+				trace_xfs_dqlookup_want(dqp);
 
 				/*
 				 * We may have raced with dqreclaim_one()
@@ -857,8 +808,7 @@ xfs_qm_dqlookup(
 			/*
 			 * take it off the freelist
 			 */
-			xfs_dqtrace_entry(dqp,
-					"DQLOOKUP: TAKEOFF FL");
+			trace_xfs_dqlookup_freelist(dqp);
 			XQM_FREELIST_REMOVE(dqp);
 			/* xfs_qm_freelist_print(&(xfs_Gqm->
 					qm_dqfreelist),
@@ -878,8 +828,7 @@ xfs_qm_dqlookup(
878 */ 828 */
879 ASSERT(mutex_is_locked(&qh->qh_lock)); 829 ASSERT(mutex_is_locked(&qh->qh_lock));
880 if (dqp->HL_PREVP != &qh->qh_next) { 830 if (dqp->HL_PREVP != &qh->qh_next) {
881 xfs_dqtrace_entry(dqp, 831 trace_xfs_dqlookup_move(dqp);
882 "DQLOOKUP: HASH MOVETOFRONT");
883 if ((d = dqp->HL_NEXT)) 832 if ((d = dqp->HL_NEXT))
884 d->HL_PREVP = dqp->HL_PREVP; 833 d->HL_PREVP = dqp->HL_PREVP;
885 *(dqp->HL_PREVP) = d; 834 *(dqp->HL_PREVP) = d;
@@ -889,7 +838,7 @@ xfs_qm_dqlookup(
889 dqp->HL_PREVP = &qh->qh_next; 838 dqp->HL_PREVP = &qh->qh_next;
890 qh->qh_next = dqp; 839 qh->qh_next = dqp;
891 } 840 }
892 xfs_dqtrace_entry(dqp, "LOOKUP END"); 841 trace_xfs_dqlookup_done(dqp);
893 *O_dqpp = dqp; 842 *O_dqpp = dqp;
894 ASSERT(mutex_is_locked(&qh->qh_lock)); 843 ASSERT(mutex_is_locked(&qh->qh_lock));
895 return (0); 844 return (0);
@@ -971,7 +920,7 @@ xfs_qm_dqget(
971 ASSERT(*O_dqpp); 920 ASSERT(*O_dqpp);
972 ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); 921 ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
973 mutex_unlock(&h->qh_lock); 922 mutex_unlock(&h->qh_lock);
974 xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)"); 923 trace_xfs_dqget_hit(*O_dqpp);
975 return (0); /* success */ 924 return (0); /* success */
976 } 925 }
977 XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); 926 XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
@@ -1104,7 +1053,7 @@ xfs_qm_dqget(
1104 mutex_unlock(&h->qh_lock); 1053 mutex_unlock(&h->qh_lock);
1105 dqret: 1054 dqret:
1106 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1055 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
1107 xfs_dqtrace_entry(dqp, "DQGET DONE"); 1056 trace_xfs_dqget_miss(dqp);
1108 *O_dqpp = dqp; 1057 *O_dqpp = dqp;
1109 return (0); 1058 return (0);
1110} 1059}
@@ -1124,7 +1073,8 @@ xfs_qm_dqput(
1124 1073
1125 ASSERT(dqp->q_nrefs > 0); 1074 ASSERT(dqp->q_nrefs > 0);
1126 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1075 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1127 xfs_dqtrace_entry(dqp, "DQPUT"); 1076
1077 trace_xfs_dqput(dqp);
1128 1078
1129 if (dqp->q_nrefs != 1) { 1079 if (dqp->q_nrefs != 1) {
1130 dqp->q_nrefs--; 1080 dqp->q_nrefs--;
@@ -1137,7 +1087,7 @@ xfs_qm_dqput(
1137 * in the right order; but try to get it out-of-order first 1087 * in the right order; but try to get it out-of-order first
1138 */ 1088 */
1139 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 1089 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
1140 xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT"); 1090 trace_xfs_dqput_wait(dqp);
1141 xfs_dqunlock(dqp); 1091 xfs_dqunlock(dqp);
1142 xfs_qm_freelist_lock(xfs_Gqm); 1092 xfs_qm_freelist_lock(xfs_Gqm);
1143 xfs_dqlock(dqp); 1093 xfs_dqlock(dqp);
@@ -1148,7 +1098,8 @@ xfs_qm_dqput(
1148 1098
1149 /* We can't depend on nrefs being == 1 here */ 1099 /* We can't depend on nrefs being == 1 here */
1150 if (--dqp->q_nrefs == 0) { 1100 if (--dqp->q_nrefs == 0) {
1151 xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST"); 1101 trace_xfs_dqput_free(dqp);
1102
1152 /* 1103 /*
1153 * insert at end of the freelist. 1104 * insert at end of the freelist.
1154 */ 1105 */
@@ -1196,7 +1147,7 @@ xfs_qm_dqrele(
1196 if (!dqp) 1147 if (!dqp)
1197 return; 1148 return;
1198 1149
1199 xfs_dqtrace_entry(dqp, "DQRELE"); 1150 trace_xfs_dqrele(dqp);
1200 1151
1201 xfs_dqlock(dqp); 1152 xfs_dqlock(dqp);
1202 /* 1153 /*
@@ -1229,14 +1180,14 @@ xfs_qm_dqflush(
1229 1180
1230 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1181 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1231 ASSERT(!completion_done(&dqp->q_flush)); 1182 ASSERT(!completion_done(&dqp->q_flush));
1232 xfs_dqtrace_entry(dqp, "DQFLUSH"); 1183 trace_xfs_dqflush(dqp);
1233 1184
1234 /* 1185 /*
1235 * If not dirty, or it's pinned and we are not supposed to 1186 * If not dirty, or it's pinned and we are not supposed to
1236 * block, nada. 1187 * block, nada.
1237 */ 1188 */
1238 if (!XFS_DQ_IS_DIRTY(dqp) || 1189 if (!XFS_DQ_IS_DIRTY(dqp) ||
1239 (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) { 1190 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
1240 xfs_dqfunlock(dqp); 1191 xfs_dqfunlock(dqp);
1241 return 0; 1192 return 0;
1242 } 1193 }
@@ -1259,7 +1210,6 @@ xfs_qm_dqflush(
1259 * the ondisk-dquot has already been allocated for. 1210 * the ondisk-dquot has already been allocated for.
1260 */ 1211 */
1261 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { 1212 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
1262 xfs_dqtrace_entry(dqp, "DQTOBP FAIL");
1263 ASSERT(error != ENOENT); 1213 ASSERT(error != ENOENT);
1264 /* 1214 /*
1265 * Quotas could have gotten turned off (ESRCH) 1215 * Quotas could have gotten turned off (ESRCH)
@@ -1297,22 +1247,21 @@ xfs_qm_dqflush(
1297 * get stuck waiting in the write for too long. 1247 * get stuck waiting in the write for too long.
1298 */ 1248 */
1299 if (XFS_BUF_ISPINNED(bp)) { 1249 if (XFS_BUF_ISPINNED(bp)) {
1300 xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE"); 1250 trace_xfs_dqflush_force(dqp);
1301 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 1251 xfs_log_force(mp, 0);
1302 } 1252 }
1303 1253
1304 if (flags & XFS_QMOPT_DELWRI) { 1254 if (flags & SYNC_WAIT)
1305 xfs_bdwrite(mp, bp);
1306 } else if (flags & XFS_QMOPT_ASYNC) {
1307 error = xfs_bawrite(mp, bp);
1308 } else {
1309 error = xfs_bwrite(mp, bp); 1255 error = xfs_bwrite(mp, bp);
1310 } 1256 else
1311 xfs_dqtrace_entry(dqp, "DQFLUSH END"); 1257 xfs_bdwrite(mp, bp);
1258
1259 trace_xfs_dqflush_done(dqp);
1260
1312 /* 1261 /*
1313 * dqp is still locked, but caller is free to unlock it now. 1262 * dqp is still locked, but caller is free to unlock it now.
1314 */ 1263 */
1315 return (error); 1264 return error;
1316 1265
1317} 1266}
1318 1267
@@ -1483,7 +1432,7 @@ xfs_qm_dqpurge(
1483 */ 1432 */
1484 if (XFS_DQ_IS_DIRTY(dqp)) { 1433 if (XFS_DQ_IS_DIRTY(dqp)) {
1485 int error; 1434 int error;
1486 xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY"); 1435
1487 /* dqflush unlocks dqflock */ 1436 /* dqflush unlocks dqflock */
1488 /* 1437 /*
1489 * Given that dqpurge is a very rare occurrence, it is OK 1438 * Given that dqpurge is a very rare occurrence, it is OK
@@ -1493,7 +1442,7 @@ xfs_qm_dqpurge(
1493 * We don't care about getting disk errors here. We need 1442 * We don't care about getting disk errors here. We need
1494 * to purge this dquot anyway, so we go ahead regardless. 1443 * to purge this dquot anyway, so we go ahead regardless.
1495 */ 1444 */
1496 error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); 1445 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
1497 if (error) 1446 if (error)
1498 xfs_fs_cmn_err(CE_WARN, mp, 1447 xfs_fs_cmn_err(CE_WARN, mp,
1499 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1448 "xfs_qm_dqpurge: dquot %p flush failed", dqp);
@@ -1577,25 +1526,17 @@ xfs_qm_dqflock_pushbuf_wait(
1577 * the flush lock when the I/O completes. 1526 * the flush lock when the I/O completes.
1578 */ 1527 */
1579 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, 1528 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
1580 XFS_QI_DQCHUNKLEN(dqp->q_mount), 1529 XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK);
1581 XFS_INCORE_TRYLOCK); 1530 if (!bp)
1582 if (bp != NULL) { 1531 goto out_lock;
1583 if (XFS_BUF_ISDELAYWRITE(bp)) { 1532
1584 int error; 1533 if (XFS_BUF_ISDELAYWRITE(bp)) {
1585 if (XFS_BUF_ISPINNED(bp)) { 1534 if (XFS_BUF_ISPINNED(bp))
1586 xfs_log_force(dqp->q_mount, 1535 xfs_log_force(dqp->q_mount, 0);
1587 (xfs_lsn_t)0, 1536 xfs_buf_delwri_promote(bp);
1588 XFS_LOG_FORCE); 1537 wake_up_process(bp->b_target->bt_task);
1589 }
1590 error = xfs_bawrite(dqp->q_mount, bp);
1591 if (error)
1592 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
1593 "xfs_qm_dqflock_pushbuf_wait: "
1594 "pushbuf error %d on dqp %p, bp %p",
1595 error, dqp, bp);
1596 } else {
1597 xfs_buf_relse(bp);
1598 }
1599 } 1538 }
1539 xfs_buf_relse(bp);
1540out_lock:
1600 xfs_dqflock(dqp); 1541 xfs_dqflock(dqp);
1601} 1542}
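
Every conversion in this file follows the same pattern: a free-form xfs_dqtrace_entry(dqp, "STRING") becomes a named, typed tracepoint such as trace_xfs_dqput(dqp). A minimal sketch of how a shared dquot event class is typically declared for this -- the real definitions live in fs/xfs/linux-2.6/xfs_trace.h, and the fields shown here are assumptions chosen to show the shape of the conversion, not the actual ones:

/*
 * Illustrative only: the real event class for these trace_xfs_dq*()
 * calls lives in xfs_trace.h; fields below are assumptions.
 */
DECLARE_EVENT_CLASS(xfs_dquot_class,
	TP_PROTO(struct xfs_dquot *dqp),
	TP_ARGS(dqp),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(u32, id)
		__field(unsigned, flags)
	),
	TP_fast_assign(
		__entry->dev = dqp->q_mount->m_super->s_dev;
		__entry->id = be32_to_cpu(dqp->q_core.d_id);
		__entry->flags = dqp->dq_flags;
	),
	TP_printk("dev %d:%d id 0x%x flags 0x%x",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->id, __entry->flags)
);

#define DEFINE_DQUOT_EVENT(name)		\
DEFINE_EVENT(xfs_dquot_class, name,		\
	TP_PROTO(struct xfs_dquot *dqp),	\
	TP_ARGS(dqp))
DEFINE_DQUOT_EVENT(xfs_dqput);
DEFINE_DQUOT_EVENT(xfs_dqflush);

Unlike the ktrace strings, these events carry structured fields that ftrace and perf can filter on, and the tracepoint call sites are patched to no-ops when tracing is disabled.
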
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 6533ead9b889..a0f7da586d1b 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -85,9 +85,6 @@ typedef struct xfs_dquot {
85 struct completion q_flush; /* flush completion queue */ 85 struct completion q_flush; /* flush completion queue */
86 atomic_t q_pincount; /* dquot pin count */ 86 atomic_t q_pincount; /* dquot pin count */
87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ 87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88#ifdef XFS_DQUOT_TRACE
89 struct ktrace *q_trace; /* trace header structure */
90#endif
91} xfs_dquot_t; 88} xfs_dquot_t;
92 89
93 90
@@ -98,7 +95,7 @@ typedef struct xfs_dquot {
98#define dq_flags q_lists.dqm_flags 95#define dq_flags q_lists.dqm_flags
99 96
100/* 97/*
101 * Lock hierachy for q_qlock: 98 * Lock hierarchy for q_qlock:
102 * XFS_QLOCK_NORMAL is the implicit default, 99 * XFS_QLOCK_NORMAL is the implicit default,
103 * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 100 * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
104 */ 101 */
@@ -144,24 +141,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
144 (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ 141 (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
145 (XFS_IS_OQUOTA_ON((d)->q_mount)))) 142 (XFS_IS_OQUOTA_ON((d)->q_mount))))
146 143
147#ifdef XFS_DQUOT_TRACE
148/*
149 * Dquot Tracing stuff.
150 */
151#define DQUOT_TRACE_SIZE 64
152#define DQUOT_KTRACE_ENTRY 1
153
154extern void __xfs_dqtrace_entry(xfs_dquot_t *dqp, char *func,
155 void *, xfs_inode_t *);
156#define xfs_dqtrace_entry_ino(a,b,ip) \
157 __xfs_dqtrace_entry((a), (b), (void*)__return_address, (ip))
158#define xfs_dqtrace_entry(a,b) \
159 __xfs_dqtrace_entry((a), (b), (void*)__return_address, NULL)
160#else
161#define xfs_dqtrace_entry(a,b)
162#define xfs_dqtrace_entry_ino(a,b,ip)
163#endif
164
165#ifdef QUOTADEBUG 144#ifdef QUOTADEBUG
166extern void xfs_qm_dqprint(xfs_dquot_t *); 145extern void xfs_qm_dqprint(xfs_dquot_t *);
167#else 146#else
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index d0d4a9a0bbd7..4e4ee9a57194 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -74,11 +74,11 @@ xfs_qm_dquot_logitem_format(
74 74
75 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; 75 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
76 logvec->i_len = sizeof(xfs_dq_logformat_t); 76 logvec->i_len = sizeof(xfs_dq_logformat_t);
77 XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_QFORMAT); 77 logvec->i_type = XLOG_REG_TYPE_QFORMAT;
78 logvec++; 78 logvec++;
79 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; 79 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
80 logvec->i_len = sizeof(xfs_disk_dquot_t); 80 logvec->i_len = sizeof(xfs_disk_dquot_t);
81 XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_DQUOT); 81 logvec->i_type = XLOG_REG_TYPE_DQUOT;
82 82
83 ASSERT(2 == logitem->qli_item.li_desc->lid_size); 83 ASSERT(2 == logitem->qli_item.li_desc->lid_size);
84 logitem->qli_format.qlf_size = 2; 84 logitem->qli_format.qlf_size = 2;
@@ -153,7 +153,7 @@ xfs_qm_dquot_logitem_push(
153 * lock without sleeping, then there must not have been 153 * lock without sleeping, then there must not have been
154 * anyone in the process of flushing the dquot. 154 * anyone in the process of flushing the dquot.
155 */ 155 */
156 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 156 error = xfs_qm_dqflush(dqp, 0);
157 if (error) 157 if (error)
158 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 158 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
159 "xfs_qm_dquot_logitem_push: push error %d on dqp %p", 159 "xfs_qm_dquot_logitem_push: push error %d on dqp %p",
@@ -190,7 +190,7 @@ xfs_qm_dqunpin_wait(
190 /* 190 /*
191 * Give the log a push so we don't wait here too long. 191 * Give the log a push so we don't wait here too long.
192 */ 192 */
193 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); 193 xfs_log_force(dqp->q_mount, 0);
194 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); 194 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
195} 195}
196 196
@@ -212,68 +212,31 @@ xfs_qm_dquot_logitem_pushbuf(
212 xfs_dquot_t *dqp; 212 xfs_dquot_t *dqp;
213 xfs_mount_t *mp; 213 xfs_mount_t *mp;
214 xfs_buf_t *bp; 214 xfs_buf_t *bp;
215 uint dopush;
216 215
217 dqp = qip->qli_dquot; 216 dqp = qip->qli_dquot;
218 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 217 ASSERT(XFS_DQ_IS_LOCKED(dqp));
219 218
220 /* 219 /*
221 * The qli_pushbuf_flag keeps others from
222 * trying to duplicate our effort.
223 */
224 ASSERT(qip->qli_pushbuf_flag != 0);
225 ASSERT(qip->qli_push_owner == current_pid());
226
227 /*
228 * If flushlock isn't locked anymore, chances are that the 220 * If flushlock isn't locked anymore, chances are that the
229 * dquot flush completed and the dquot was taken off the AIL. 221 * dquot flush completed and the dquot was taken off the AIL.
230 * So, just get out. 222 * So, just get out.
231 */ 223 */
232 if (completion_done(&dqp->q_flush) || 224 if (completion_done(&dqp->q_flush) ||
233 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { 225 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
234 qip->qli_pushbuf_flag = 0;
235 xfs_dqunlock(dqp); 226 xfs_dqunlock(dqp);
236 return; 227 return;
237 } 228 }
238 mp = dqp->q_mount; 229 mp = dqp->q_mount;
239 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 230 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
240 XFS_QI_DQCHUNKLEN(mp), 231 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK);
241 XFS_INCORE_TRYLOCK); 232 xfs_dqunlock(dqp);
242 if (bp != NULL) { 233 if (!bp)
243 if (XFS_BUF_ISDELAYWRITE(bp)) {
244 dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
245 !completion_done(&dqp->q_flush));
246 qip->qli_pushbuf_flag = 0;
247 xfs_dqunlock(dqp);
248
249 if (XFS_BUF_ISPINNED(bp)) {
250 xfs_log_force(mp, (xfs_lsn_t)0,
251 XFS_LOG_FORCE);
252 }
253 if (dopush) {
254 int error;
255#ifdef XFSRACEDEBUG
256 delay_for_intr();
257 delay(300);
258#endif
259 error = xfs_bawrite(mp, bp);
260 if (error)
261 xfs_fs_cmn_err(CE_WARN, mp,
262 "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
263 error, qip, bp);
264 } else {
265 xfs_buf_relse(bp);
266 }
267 } else {
268 qip->qli_pushbuf_flag = 0;
269 xfs_dqunlock(dqp);
270 xfs_buf_relse(bp);
271 }
272 return; 234 return;
273 } 235 if (XFS_BUF_ISDELAYWRITE(bp))
236 xfs_buf_delwri_promote(bp);
237 xfs_buf_relse(bp);
238 return;
274 239
275 qip->qli_pushbuf_flag = 0;
276 xfs_dqunlock(dqp);
277} 240}
278 241
279/* 242/*
@@ -291,50 +254,24 @@ xfs_qm_dquot_logitem_trylock(
291 xfs_dq_logitem_t *qip) 254 xfs_dq_logitem_t *qip)
292{ 255{
293 xfs_dquot_t *dqp; 256 xfs_dquot_t *dqp;
294 uint retval;
295 257
296 dqp = qip->qli_dquot; 258 dqp = qip->qli_dquot;
297 if (atomic_read(&dqp->q_pincount) > 0) 259 if (atomic_read(&dqp->q_pincount) > 0)
298 return (XFS_ITEM_PINNED); 260 return XFS_ITEM_PINNED;
299 261
300 if (! xfs_qm_dqlock_nowait(dqp)) 262 if (! xfs_qm_dqlock_nowait(dqp))
301 return (XFS_ITEM_LOCKED); 263 return XFS_ITEM_LOCKED;
302 264
303 retval = XFS_ITEM_SUCCESS;
304 if (!xfs_dqflock_nowait(dqp)) { 265 if (!xfs_dqflock_nowait(dqp)) {
305 /* 266 /*
306 * The dquot is already being flushed. It may have been 267 * dquot has already been flushed to the backing buffer,
307 * flushed delayed write, however, and we don't want to 268 * leave it locked; the pushbuf routine will unlock it.
308 * get stuck waiting for that to complete. So, we want to check
309 * to see if we can lock the dquot's buffer without sleeping.
310 * If we can and it is marked for delayed write, then we
311 * hold it and send it out from the push routine. We don't
312 * want to do that now since we might sleep in the device
313 * strategy routine. We also don't want to grab the buffer lock
314 * here because we'd like not to call into the buffer cache
315 * while holding the AIL lock.
316 * Make sure to only return PUSHBUF if we set pushbuf_flag
317 * ourselves. If someone else is doing it then we don't
318 * want to go to the push routine and duplicate their efforts.
319 */ 269 */
320 if (qip->qli_pushbuf_flag == 0) { 270 return XFS_ITEM_PUSHBUF;
321 qip->qli_pushbuf_flag = 1;
322 ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
323#ifdef DEBUG
324 qip->qli_push_owner = current_pid();
325#endif
326 /*
327 * The dquot is left locked.
328 */
329 retval = XFS_ITEM_PUSHBUF;
330 } else {
331 retval = XFS_ITEM_FLUSHING;
332 xfs_dqunlock_nonotify(dqp);
333 }
334 } 271 }
335 272
336 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); 273 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
337 return (retval); 274 return XFS_ITEM_SUCCESS;
338} 275}
339 276
340 277
@@ -467,7 +404,7 @@ xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf,
467 404
468 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); 405 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
469 log_vector->i_len = sizeof(xfs_qoff_logitem_t); 406 log_vector->i_len = sizeof(xfs_qoff_logitem_t);
470 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF); 407 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
471 qf->qql_format.qf_size = 1; 408 qf->qql_format.qf_size = 1;
472} 409}
473 410
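
The rewritten trylock path above drops the qli_pushbuf_flag handshake entirely: if the flush lock is already held, the item simply reports XFS_ITEM_PUSHBUF, and the pushbuf routine promotes the delayed-write buffer instead of issuing its own write. A compact, runnable model of the new decision ladder -- all names here are stand-ins, not kernel identifiers:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the kernel's XFS_ITEM_* return codes. */
enum push_result { ITEM_SUCCESS, ITEM_PINNED, ITEM_LOCKED, ITEM_PUSHBUF };

struct dquot_model {
	int pincount;	/* outstanding log references */
	bool locked;	/* q_qlock held by someone else */
	bool flushing;	/* flush lock held: a buffer write is in flight */
};

static enum push_result item_trylock(struct dquot_model *dqp)
{
	if (dqp->pincount > 0)
		return ITEM_PINNED;	/* must wait for a log force */
	if (dqp->locked)
		return ITEM_LOCKED;	/* contended: try again later */
	if (dqp->flushing)
		return ITEM_PUSHBUF;	/* flush in flight: just promote the buffer */
	return ITEM_SUCCESS;		/* caller may flush it now */
}

int main(void)
{
	struct dquot_model dqp = { .pincount = 0, .locked = false, .flushing = true };
	printf("result=%d\n", item_trylock(&dqp));	/* prints 3 (ITEM_PUSHBUF) */
	return 0;
}
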
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
index 5a632531f843..5acae2ada70b 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/quota/xfs_dquot_item.h
@@ -27,10 +27,6 @@ typedef struct xfs_dq_logitem {
27 xfs_log_item_t qli_item; /* common portion */ 27 xfs_log_item_t qli_item; /* common portion */
28 struct xfs_dquot *qli_dquot; /* dquot ptr */ 28 struct xfs_dquot *qli_dquot; /* dquot ptr */
29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ 29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
30 unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */
31#ifdef DEBUG
32 uint64_t qli_push_owner;
33#endif
34 xfs_dq_logformat_t qli_format; /* logged structure */ 30 xfs_dq_logformat_t qli_format; /* logged structure */
35} xfs_dq_logitem_t; 31} xfs_dq_logitem_t;
36 32
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 45b1bfef7388..417e61e3d9dd 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -47,6 +47,7 @@
47#include "xfs_trans_space.h" 47#include "xfs_trans_space.h"
48#include "xfs_utils.h" 48#include "xfs_utils.h"
49#include "xfs_qm.h" 49#include "xfs_qm.h"
50#include "xfs_trace.h"
50 51
51/* 52/*
52 * The global quota manager. There is only one of these for the entire 53 * The global quota manager. There is only one of these for the entire
@@ -117,9 +118,14 @@ xfs_Gqm_init(void)
117 */ 118 */
118 udqhash = kmem_zalloc_greedy(&hsize, 119 udqhash = kmem_zalloc_greedy(&hsize,
119 XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t), 120 XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
120 XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t), 121 XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
121 KM_SLEEP | KM_MAYFAIL | KM_LARGE); 122 if (!udqhash)
122 gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE); 123 goto out;
124
125 gdqhash = kmem_zalloc_large(hsize);
126 if (!gdqhash)
127 goto out_free_udqhash;
128
123 hsize /= sizeof(xfs_dqhash_t); 129 hsize /= sizeof(xfs_dqhash_t);
124 ndquot = hsize << 8; 130 ndquot = hsize << 8;
125 131
@@ -169,6 +175,11 @@ xfs_Gqm_init(void)
169 mutex_init(&qcheck_lock); 175 mutex_init(&qcheck_lock);
170#endif 176#endif
171 return xqm; 177 return xqm;
178
179 out_free_udqhash:
180 kmem_free_large(udqhash);
181 out:
182 return NULL;
172} 183}
173 184
174/* 185/*
@@ -188,8 +199,8 @@ xfs_qm_destroy(
188 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); 199 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
189 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); 200 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
190 } 201 }
191 kmem_free(xqm->qm_usr_dqhtable); 202 kmem_free_large(xqm->qm_usr_dqhtable);
192 kmem_free(xqm->qm_grp_dqhtable); 203 kmem_free_large(xqm->qm_grp_dqhtable);
193 xqm->qm_usr_dqhtable = NULL; 204 xqm->qm_usr_dqhtable = NULL;
194 xqm->qm_grp_dqhtable = NULL; 205 xqm->qm_grp_dqhtable = NULL;
195 xqm->qm_dqhashmask = 0; 206 xqm->qm_dqhashmask = 0;
@@ -218,8 +229,12 @@ xfs_qm_hold_quotafs_ref(
218 */ 229 */
219 mutex_lock(&xfs_Gqm_lock); 230 mutex_lock(&xfs_Gqm_lock);
220 231
221 if (xfs_Gqm == NULL) 232 if (!xfs_Gqm) {
222 xfs_Gqm = xfs_Gqm_init(); 233 xfs_Gqm = xfs_Gqm_init();
234 if (!xfs_Gqm)
235 return ENOMEM;
236 }
237
223 /* 238 /*
224 * We can keep a list of all filesystems with quotas mounted for 239 * We can keep a list of all filesystems with quotas mounted for
225 * debugging and statistical purposes, but ... 240 * debugging and statistical purposes, but ...
@@ -435,7 +450,7 @@ xfs_qm_unmount_quotas(
435STATIC int 450STATIC int
436xfs_qm_dqflush_all( 451xfs_qm_dqflush_all(
437 xfs_mount_t *mp, 452 xfs_mount_t *mp,
438 int flags) 453 int sync_mode)
439{ 454{
440 int recl; 455 int recl;
441 xfs_dquot_t *dqp; 456 xfs_dquot_t *dqp;
@@ -453,7 +468,7 @@ again:
453 xfs_dqunlock(dqp); 468 xfs_dqunlock(dqp);
454 continue; 469 continue;
455 } 470 }
456 xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY"); 471
457 /* XXX a sentinel would be better */ 472 /* XXX a sentinel would be better */
458 recl = XFS_QI_MPLRECLAIMS(mp); 473 recl = XFS_QI_MPLRECLAIMS(mp);
459 if (!xfs_dqflock_nowait(dqp)) { 474 if (!xfs_dqflock_nowait(dqp)) {
@@ -471,7 +486,7 @@ again:
471 * across a disk write. 486 * across a disk write.
472 */ 487 */
473 xfs_qm_mplist_unlock(mp); 488 xfs_qm_mplist_unlock(mp);
474 error = xfs_qm_dqflush(dqp, flags); 489 error = xfs_qm_dqflush(dqp, sync_mode);
475 xfs_dqunlock(dqp); 490 xfs_dqunlock(dqp);
476 if (error) 491 if (error)
477 return error; 492 return error;
@@ -651,7 +666,7 @@ xfs_qm_dqattach_one(
651 */ 666 */
652 dqp = *IO_idqpp; 667 dqp = *IO_idqpp;
653 if (dqp) { 668 if (dqp) {
654 xfs_dqtrace_entry(dqp, "DQATTACH: found in ip"); 669 trace_xfs_dqattach_found(dqp);
655 return 0; 670 return 0;
656 } 671 }
657 672
@@ -704,7 +719,7 @@ xfs_qm_dqattach_one(
704 if (error) 719 if (error)
705 return error; 720 return error;
706 721
707 xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget"); 722 trace_xfs_dqattach_get(dqp);
708 723
709 /* 724 /*
710 * dqget may have dropped and re-acquired the ilock, but it guarantees 725 * dqget may have dropped and re-acquired the ilock, but it guarantees
@@ -890,15 +905,15 @@ xfs_qm_dqdetach(
890 if (!(ip->i_udquot || ip->i_gdquot)) 905 if (!(ip->i_udquot || ip->i_gdquot))
891 return; 906 return;
892 907
908 trace_xfs_dquot_dqdetach(ip);
909
893 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); 910 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
894 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); 911 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
895 if (ip->i_udquot) { 912 if (ip->i_udquot) {
896 xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip);
897 xfs_qm_dqrele(ip->i_udquot); 913 xfs_qm_dqrele(ip->i_udquot);
898 ip->i_udquot = NULL; 914 ip->i_udquot = NULL;
899 } 915 }
900 if (ip->i_gdquot) { 916 if (ip->i_gdquot) {
901 xfs_dqtrace_entry_ino(ip->i_gdquot, "DQDETTACH", ip);
902 xfs_qm_dqrele(ip->i_gdquot); 917 xfs_qm_dqrele(ip->i_gdquot);
903 ip->i_gdquot = NULL; 918 ip->i_gdquot = NULL;
904 } 919 }
@@ -911,13 +926,11 @@ xfs_qm_sync(
911{ 926{
912 int recl, restarts; 927 int recl, restarts;
913 xfs_dquot_t *dqp; 928 xfs_dquot_t *dqp;
914 uint flush_flags;
915 int error; 929 int error;
916 930
917 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 931 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
918 return 0; 932 return 0;
919 933
920 flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI;
921 restarts = 0; 934 restarts = 0;
922 935
923 again: 936 again:
@@ -977,8 +990,7 @@ xfs_qm_sync(
977 * across a disk write 990 * across a disk write
978 */ 991 */
979 xfs_qm_mplist_unlock(mp); 992 xfs_qm_mplist_unlock(mp);
980 xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH"); 993 error = xfs_qm_dqflush(dqp, flags);
981 error = xfs_qm_dqflush(dqp, flush_flags);
982 xfs_dqunlock(dqp); 994 xfs_dqunlock(dqp);
983 if (error && XFS_FORCED_SHUTDOWN(mp)) 995 if (error && XFS_FORCED_SHUTDOWN(mp))
984 return 0; /* Need to prevent umount failure */ 996 return 0; /* Need to prevent umount failure */
@@ -1350,7 +1362,8 @@ xfs_qm_reset_dqcounts(
1350 xfs_disk_dquot_t *ddq; 1362 xfs_disk_dquot_t *ddq;
1351 int j; 1363 int j;
1352 1364
1353 xfs_buftrace("RESET DQUOTS", bp); 1365 trace_xfs_reset_dqcounts(bp, _RET_IP_);
1366
1354 /* 1367 /*
1355 * Reset all counters and timers. They'll be 1368 * Reset all counters and timers. They'll be
1356 * started afresh by xfs_qm_quotacheck. 1369 * started afresh by xfs_qm_quotacheck.
@@ -1543,7 +1556,9 @@ xfs_qm_quotacheck_dqadjust(
1543 xfs_qcnt_t rtblks) 1556 xfs_qcnt_t rtblks)
1544{ 1557{
1545 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1558 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1546 xfs_dqtrace_entry(dqp, "QCHECK DQADJUST"); 1559
1560 trace_xfs_dqadjust(dqp);
1561
1547 /* 1562 /*
1548 * Adjust the inode count and the block count to reflect this inode's 1563 * Adjust the inode count and the block count to reflect this inode's
1549 * resource usage. 1564 * resource usage.
@@ -1779,7 +1794,7 @@ xfs_qm_quotacheck(
1779 * successfully. 1794 * successfully.
1780 */ 1795 */
1781 if (!error) 1796 if (!error)
1782 error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); 1797 error = xfs_qm_dqflush_all(mp, 0);
1783 1798
1784 /* 1799 /*
1785 * We can get this error if we couldn't do a dquot allocation inside 1800 * We can get this error if we couldn't do a dquot allocation inside
@@ -1994,12 +2009,14 @@ xfs_qm_shake_freelist(
1994 */ 2009 */
1995 if (XFS_DQ_IS_DIRTY(dqp)) { 2010 if (XFS_DQ_IS_DIRTY(dqp)) {
1996 int error; 2011 int error;
1997 xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY"); 2012
2013 trace_xfs_dqshake_dirty(dqp);
2014
1998 /* 2015 /*
1999 * We flush it delayed write, so don't bother 2016 * We flush it delayed write, so don't bother
2000 * releasing the mplock. 2017 * releasing the mplock.
2001 */ 2018 */
2002 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 2019 error = xfs_qm_dqflush(dqp, 0);
2003 if (error) { 2020 if (error) {
2004 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2021 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2005 "xfs_qm_dqflush_all: dquot %p flush failed", dqp); 2022 "xfs_qm_dqflush_all: dquot %p flush failed", dqp);
@@ -2038,7 +2055,9 @@ xfs_qm_shake_freelist(
2038 return nreclaimed; 2055 return nreclaimed;
2039 goto tryagain; 2056 goto tryagain;
2040 } 2057 }
2041 xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING"); 2058
2059 trace_xfs_dqshake_unlink(dqp);
2060
2042#ifdef QUOTADEBUG 2061#ifdef QUOTADEBUG
2043 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n", 2062 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
2044 dqp, be32_to_cpu(dqp->q_core.d_id)); 2063 dqp, be32_to_cpu(dqp->q_core.d_id));
@@ -2125,7 +2144,9 @@ xfs_qm_dqreclaim_one(void)
2125 */ 2144 */
2126 if (dqp->dq_flags & XFS_DQ_WANT) { 2145 if (dqp->dq_flags & XFS_DQ_WANT) {
2127 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 2146 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
2128 xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT"); 2147
2148 trace_xfs_dqreclaim_want(dqp);
2149
2129 xfs_dqunlock(dqp); 2150 xfs_dqunlock(dqp);
2130 xfs_qm_freelist_unlock(xfs_Gqm); 2151 xfs_qm_freelist_unlock(xfs_Gqm);
2131 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2152 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
@@ -2171,12 +2192,14 @@ xfs_qm_dqreclaim_one(void)
2171 */ 2192 */
2172 if (XFS_DQ_IS_DIRTY(dqp)) { 2193 if (XFS_DQ_IS_DIRTY(dqp)) {
2173 int error; 2194 int error;
2174 xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY"); 2195
2196 trace_xfs_dqreclaim_dirty(dqp);
2197
2175 /* 2198 /*
2176 * We flush it delayed write, so don't bother 2199 * We flush it delayed write, so don't bother
2177 * releasing the freelist lock. 2200 * releasing the freelist lock.
2178 */ 2201 */
2179 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 2202 error = xfs_qm_dqflush(dqp, 0);
2180 if (error) { 2203 if (error) {
2181 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2204 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2182 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 2205 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
@@ -2194,8 +2217,9 @@ xfs_qm_dqreclaim_one(void)
2194 if (!mutex_trylock(&dqp->q_hash->qh_lock)) 2217 if (!mutex_trylock(&dqp->q_hash->qh_lock))
2195 goto mplistunlock; 2218 goto mplistunlock;
2196 2219
2220 trace_xfs_dqreclaim_unlink(dqp);
2221
2197 ASSERT(dqp->q_nrefs == 0); 2222 ASSERT(dqp->q_nrefs == 0);
2198 xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING");
2199 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); 2223 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2200 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); 2224 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2201 XQM_FREELIST_REMOVE(dqp); 2225 XQM_FREELIST_REMOVE(dqp);
@@ -2430,7 +2454,7 @@ xfs_qm_vop_dqalloc(
2430 } 2454 }
2431 } 2455 }
2432 if (uq) 2456 if (uq)
2433 xfs_dqtrace_entry_ino(uq, "DQALLOC", ip); 2457 trace_xfs_dquot_dqalloc(ip);
2434 2458
2435 xfs_iunlock(ip, lockflags); 2459 xfs_iunlock(ip, lockflags);
2436 if (O_udqpp) 2460 if (O_udqpp)
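
The xfs_Gqm_init() hunk above replaces the old sleep-and-hope allocation flags with explicit failure handling, unwinding via goto labels in reverse allocation order -- the standard kernel idiom. A self-contained userspace model of that pattern, with malloc/free standing in for kmem_zalloc_large/kmem_free_large and a simplified struct standing in for the quota manager:

#include <stdlib.h>

struct qm { void *udqhash; void *gdqhash; };

static struct qm *qm_init(size_t hsize)
{
	struct qm *xqm = calloc(1, sizeof(*xqm));
	if (!xqm)
		goto out;
	xqm->udqhash = malloc(hsize);
	if (!xqm->udqhash)
		goto out_free_xqm;
	xqm->gdqhash = malloc(hsize);
	if (!xqm->gdqhash)
		goto out_free_udqhash;
	return xqm;	/* both hash tables allocated */

out_free_udqhash:	/* unwind strictly in reverse order */
	free(xqm->udqhash);
out_free_xqm:
	free(xqm);
out:
	return NULL;
}

int main(void)
{
	struct qm *xqm = qm_init(4096);
	return xqm ? 0 : 1;	/* demo only; teardown omitted */
}
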
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index a5346630dfae..97b410c12794 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot(
59 be64_to_cpu(dp->d_blk_hardlimit); 59 be64_to_cpu(dp->d_blk_hardlimit);
60 if (limit && statp->f_blocks > limit) { 60 if (limit && statp->f_blocks > limit) {
61 statp->f_blocks = limit; 61 statp->f_blocks = limit;
62 statp->f_bfree = 62 statp->f_bfree = statp->f_bavail =
63 (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? 63 (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
64 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; 64 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
65 } 65 }
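
The one-line xfs_fill_statvfs_from_dquot() fix above matters more than it looks: previously only f_bfree was clamped to the quota limit, so df reported "Avail" from the whole filesystem while writes failed at the quota boundary. A runnable model of the corrected clamp (the struct and names are illustrative, mirroring statvfs fields):

#include <stdio.h>

struct vfsstat { unsigned long long f_blocks, f_bfree, f_bavail; };

static void clamp_to_quota(struct vfsstat *st,
			   unsigned long long limit,	/* block hard limit */
			   unsigned long long used)	/* blocks charged */
{
	if (limit && st->f_blocks > limit) {
		st->f_blocks = limit;
		/* both "free" fields must reflect the quota */
		st->f_bfree = st->f_bavail =
			(st->f_blocks > used) ? st->f_blocks - used : 0;
	}
}

int main(void)
{
	struct vfsstat st = { .f_blocks = 1000, .f_bfree = 900, .f_bavail = 900 };
	clamp_to_quota(&st, 100, 40);
	printf("%llu %llu %llu\n", st.f_blocks, st.f_bfree, st.f_bavail); /* 100 60 60 */
	return 0;
}
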
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 5d1a3b98a6e6..5d0ee8d492db 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -49,6 +49,7 @@
49#include "xfs_buf_item.h" 49#include "xfs_buf_item.h"
50#include "xfs_utils.h" 50#include "xfs_utils.h"
51#include "xfs_qm.h" 51#include "xfs_qm.h"
52#include "xfs_trace.h"
52 53
53#ifdef DEBUG 54#ifdef DEBUG
54# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args) 55# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
@@ -496,7 +497,6 @@ xfs_qm_scall_setqlim(
496 ASSERT(error != ENOENT); 497 ASSERT(error != ENOENT);
497 return (error); 498 return (error);
498 } 499 }
499 xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET");
500 xfs_trans_dqjoin(tp, dqp); 500 xfs_trans_dqjoin(tp, dqp);
501 ddq = &dqp->q_core; 501 ddq = &dqp->q_core;
502 502
@@ -602,7 +602,6 @@ xfs_qm_scall_setqlim(
602 dqp->dq_flags |= XFS_DQ_DIRTY; 602 dqp->dq_flags |= XFS_DQ_DIRTY;
603 xfs_trans_log_dquot(tp, dqp); 603 xfs_trans_log_dquot(tp, dqp);
604 604
605 xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
606 error = xfs_trans_commit(tp, 0); 605 error = xfs_trans_commit(tp, 0);
607 xfs_qm_dqprint(dqp); 606 xfs_qm_dqprint(dqp);
608 xfs_qm_dqrele(dqp); 607 xfs_qm_dqrele(dqp);
@@ -630,7 +629,6 @@ xfs_qm_scall_getquota(
630 return (error); 629 return (error);
631 } 630 }
632 631
633 xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS");
634 /* 632 /*
635 * If everything's NULL, this dquot doesn't quite exist as far as 633 * If everything's NULL, this dquot doesn't quite exist as far as
636 * our utility programs are concerned. 634 * our utility programs are concerned.
@@ -893,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
893 uint flags) 891 uint flags)
894{ 892{
895 ASSERT(mp->m_quotainfo); 893 ASSERT(mp->m_quotainfo);
896 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG); 894 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
897} 895}
898 896
899/*------------------------------------------------------------------------*/ 897/*------------------------------------------------------------------------*/
@@ -1194,9 +1192,9 @@ xfs_qm_internalqcheck(
1194 if (! XFS_IS_QUOTA_ON(mp)) 1192 if (! XFS_IS_QUOTA_ON(mp))
1195 return XFS_ERROR(ESRCH); 1193 return XFS_ERROR(ESRCH);
1196 1194
1197 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1195 xfs_log_force(mp, XFS_LOG_SYNC);
1198 XFS_bflush(mp->m_ddev_targp); 1196 XFS_bflush(mp->m_ddev_targp);
1199 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1197 xfs_log_force(mp, XFS_LOG_SYNC);
1200 XFS_bflush(mp->m_ddev_targp); 1198 XFS_bflush(mp->m_ddev_targp);
1201 1199
1202 mutex_lock(&qcheck_lock); 1200 mutex_lock(&qcheck_lock);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 97ac9640be98..c3ab75cb1d9a 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -589,12 +589,18 @@ xfs_trans_unreserve_and_mod_dquots(
589 } 589 }
590} 590}
591 591
592STATIC int 592STATIC void
593xfs_quota_error(uint flags) 593xfs_quota_warn(
594 struct xfs_mount *mp,
595 struct xfs_dquot *dqp,
596 int type)
594{ 597{
595 if (flags & XFS_QMOPT_ENOSPC) 598 /* no warnings for project quotas - we just return ENOSPC later */
596 return ENOSPC; 599 if (dqp->dq_flags & XFS_DQ_PROJ)
597 return EDQUOT; 600 return;
601 quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA,
602 be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev,
603 type);
598} 604}
599 605
600/* 606/*
@@ -612,7 +618,6 @@ xfs_trans_dqresv(
612 long ninos, 618 long ninos,
613 uint flags) 619 uint flags)
614{ 620{
615 int error;
616 xfs_qcnt_t hardlimit; 621 xfs_qcnt_t hardlimit;
617 xfs_qcnt_t softlimit; 622 xfs_qcnt_t softlimit;
618 time_t timer; 623 time_t timer;
@@ -649,7 +654,6 @@ xfs_trans_dqresv(
649 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount);
650 resbcountp = &dqp->q_res_rtbcount; 655 resbcountp = &dqp->q_res_rtbcount;
651 } 656 }
652 error = 0;
653 657
654 if ((flags & XFS_QMOPT_FORCE_RES) == 0 && 658 if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
655 dqp->q_core.d_id && 659 dqp->q_core.d_id &&
@@ -667,18 +671,20 @@ xfs_trans_dqresv(
667 * nblks. 671 * nblks.
668 */ 672 */
669 if (hardlimit > 0ULL && 673 if (hardlimit > 0ULL &&
670 (hardlimit <= nblks + *resbcountp)) { 674 hardlimit <= nblks + *resbcountp) {
671 error = xfs_quota_error(flags); 675 xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
672 goto error_return; 676 goto error_return;
673 } 677 }
674
675 if (softlimit > 0ULL && 678 if (softlimit > 0ULL &&
676 (softlimit <= nblks + *resbcountp)) { 679 softlimit <= nblks + *resbcountp) {
677 if ((timer != 0 && get_seconds() > timer) || 680 if ((timer != 0 && get_seconds() > timer) ||
678 (warns != 0 && warns >= warnlimit)) { 681 (warns != 0 && warns >= warnlimit)) {
679 error = xfs_quota_error(flags); 682 xfs_quota_warn(mp, dqp,
683 QUOTA_NL_BSOFTLONGWARN);
680 goto error_return; 684 goto error_return;
681 } 685 }
686
687 xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
682 } 688 }
683 } 689 }
684 if (ninos > 0) { 690 if (ninos > 0) {
@@ -692,15 +698,19 @@ xfs_trans_dqresv(
692 softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); 698 softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
693 if (!softlimit) 699 if (!softlimit)
694 softlimit = q->qi_isoftlimit; 700 softlimit = q->qi_isoftlimit;
701
695 if (hardlimit > 0ULL && count >= hardlimit) { 702 if (hardlimit > 0ULL && count >= hardlimit) {
696 error = xfs_quota_error(flags); 703 xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
697 goto error_return; 704 goto error_return;
698 } else if (softlimit > 0ULL && count >= softlimit) { 705 }
699 if ((timer != 0 && get_seconds() > timer) || 706 if (softlimit > 0ULL && count >= softlimit) {
707 if ((timer != 0 && get_seconds() > timer) ||
700 (warns != 0 && warns >= warnlimit)) { 708 (warns != 0 && warns >= warnlimit)) {
701 error = xfs_quota_error(flags); 709 xfs_quota_warn(mp, dqp,
710 QUOTA_NL_ISOFTLONGWARN);
702 goto error_return; 711 goto error_return;
703 } 712 }
713 xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
704 } 714 }
705 } 715 }
706 } 716 }
@@ -736,9 +746,14 @@ xfs_trans_dqresv(
736 ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); 746 ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
737 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); 747 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
738 748
749 xfs_dqunlock(dqp);
750 return 0;
751
739error_return: 752error_return:
740 xfs_dqunlock(dqp); 753 xfs_dqunlock(dqp);
741 return error; 754 if (flags & XFS_QMOPT_ENOSPC)
755 return ENOSPC;
756 return EDQUOT;
742} 757}
743 758
744 759
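
The xfs_trans_dqresv() rework above replaces the private xfs_quota_error() helper with quota_send_warning() notifications at each limit breach, and computes the ENOSPC-versus-EDQUOT error code once at the shared error_return label. A self-contained model of the reworked block-limit check -- the warning enum and every name here are illustrative stand-ins:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

enum warn_type { WARN_BHARD, WARN_BSOFT_LONG, WARN_BSOFT };

static void quota_warn(uint32_t id, enum warn_type w)
{
	/* stands in for quota_send_warning() */
	fprintf(stderr, "quota warning %d for id %u\n", (int)w, (unsigned)id);
}

/* returns 0 on success, -1 when the reservation must be rejected */
static int dqresv_blocks(uint32_t id, uint64_t hard, uint64_t soft,
			 uint64_t resv, uint64_t nblks,
			 time_t timer, int warns, int warnlimit)
{
	if (hard && hard <= nblks + resv) {
		quota_warn(id, WARN_BHARD);
		return -1;
	}
	if (soft && soft <= nblks + resv) {
		if ((timer && time(NULL) > timer) ||
		    (warns && warns >= warnlimit)) {
			quota_warn(id, WARN_BSOFT_LONG);
			return -1;
		}
		quota_warn(id, WARN_BSOFT);	/* soft breach: warn but allow */
	}
	return 0;
}

int main(void)
{
	/* over the soft limit but under the hard limit: warns, returns 0 */
	printf("%d\n", dqresv_blocks(500, 100, 50, 40, 20, 0, 0, 5));
	return 0;
}
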
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index 6f4fd37c67af..d2d20462fd4f 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -41,10 +41,6 @@ extern void assfail(char *expr, char *f, int l);
41# define STATIC static noinline 41# define STATIC static noinline
42#endif 42#endif
43 43
44#ifndef STATIC_INLINE
45# define STATIC_INLINE static inline
46#endif
47
48#else /* DEBUG */ 44#else /* DEBUG */
49 45
50#define ASSERT(expr) \ 46#define ASSERT(expr) \
@@ -54,19 +50,5 @@ extern void assfail(char *expr, char *f, int l);
54# define STATIC noinline 50# define STATIC noinline
55#endif 51#endif
56 52
57/*
58 * We stop inlining of inline functions in debug mode.
59 * Unfortunately, this means static inline in header files
60 * get multiple definitions, so they need to remain static.
61 * This then gives tonnes of warnings about unused but defined
62 * functions, so we need to add the unused attribute to prevent
63 * these spurious warnings.
64 */
65#ifndef STATIC_INLINE
66# define STATIC_INLINE static __attribute__ ((unused)) noinline
67#endif
68
69#endif /* DEBUG */ 53#endif /* DEBUG */
70
71
72#endif /* __XFS_SUPPORT_DEBUG_H__ */ 54#endif /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
deleted file mode 100644
index 2d494c26717f..000000000000
--- a/fs/xfs/support/ktrace.c
+++ /dev/null
@@ -1,323 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <xfs.h>
19
20static kmem_zone_t *ktrace_hdr_zone;
21static kmem_zone_t *ktrace_ent_zone;
22static int ktrace_zentries;
23
24void __init
25ktrace_init(int zentries)
26{
27 ktrace_zentries = roundup_pow_of_two(zentries);
28
29 ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
30 "ktrace_hdr");
31 ASSERT(ktrace_hdr_zone);
32
33 ktrace_ent_zone = kmem_zone_init(ktrace_zentries
34 * sizeof(ktrace_entry_t),
35 "ktrace_ent");
36 ASSERT(ktrace_ent_zone);
37}
38
39void __exit
40ktrace_uninit(void)
41{
42 kmem_zone_destroy(ktrace_hdr_zone);
43 kmem_zone_destroy(ktrace_ent_zone);
44}
45
46/*
47 * ktrace_alloc()
48 *
49 * Allocate a ktrace header and enough buffering for the given
50 * number of entries. Round the number of entries up to a
51 * power of 2 so we can do fast masking to get the index from
52 * the atomic index counter.
53 */
54ktrace_t *
55ktrace_alloc(int nentries, unsigned int __nocast sleep)
56{
57 ktrace_t *ktp;
58 ktrace_entry_t *ktep;
59 int entries;
60
61 ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
62
63 if (ktp == (ktrace_t*)NULL) {
64 /*
65 * KM_SLEEP callers don't expect failure.
66 */
67 if (sleep & KM_SLEEP)
68 panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
69
70 return NULL;
71 }
72
73 /*
74 * Special treatment for buffers with the ktrace_zentries entries
75 */
76 entries = roundup_pow_of_two(nentries);
77 if (entries == ktrace_zentries) {
78 ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
79 sleep);
80 } else {
81 ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)),
82 sleep | KM_LARGE);
83 }
84
85 if (ktep == NULL) {
86 /*
87 * KM_SLEEP callers don't expect failure.
88 */
89 if (sleep & KM_SLEEP)
90 panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
91
92 kmem_free(ktp);
93
94 return NULL;
95 }
96
97 ktp->kt_entries = ktep;
98 ktp->kt_nentries = entries;
99 ASSERT(is_power_of_2(entries));
100 ktp->kt_index_mask = entries - 1;
101 atomic_set(&ktp->kt_index, 0);
102 ktp->kt_rollover = 0;
103 return ktp;
104}
105
106
107/*
108 * ktrace_free()
109 *
110 * Free up the ktrace header and buffer. It is up to the caller
111 * to ensure that no-one is referencing it.
112 */
113void
114ktrace_free(ktrace_t *ktp)
115{
116 if (ktp == (ktrace_t *)NULL)
117 return;
118
119 /*
120 * Special treatment for the Vnode trace buffer.
121 */
122 if (ktp->kt_nentries == ktrace_zentries)
123 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
124 else
125 kmem_free(ktp->kt_entries);
126
127 kmem_zone_free(ktrace_hdr_zone, ktp);
128}
129
130
131/*
132 * Enter the given values into the "next" entry in the trace buffer.
133 * kt_index is always the index of the next entry to be filled.
134 */
135void
136ktrace_enter(
137 ktrace_t *ktp,
138 void *val0,
139 void *val1,
140 void *val2,
141 void *val3,
142 void *val4,
143 void *val5,
144 void *val6,
145 void *val7,
146 void *val8,
147 void *val9,
148 void *val10,
149 void *val11,
150 void *val12,
151 void *val13,
152 void *val14,
153 void *val15)
154{
155 int index;
156 ktrace_entry_t *ktep;
157
158 ASSERT(ktp != NULL);
159
160 /*
161 * Grab an entry by pushing the index up to the next one.
162 */
163 index = atomic_add_return(1, &ktp->kt_index);
164 index = (index - 1) & ktp->kt_index_mask;
165 if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
166 ktp->kt_rollover = 1;
167
168 ASSERT((index >= 0) && (index < ktp->kt_nentries));
169
170 ktep = &(ktp->kt_entries[index]);
171
172 ktep->val[0] = val0;
173 ktep->val[1] = val1;
174 ktep->val[2] = val2;
175 ktep->val[3] = val3;
176 ktep->val[4] = val4;
177 ktep->val[5] = val5;
178 ktep->val[6] = val6;
179 ktep->val[7] = val7;
180 ktep->val[8] = val8;
181 ktep->val[9] = val9;
182 ktep->val[10] = val10;
183 ktep->val[11] = val11;
184 ktep->val[12] = val12;
185 ktep->val[13] = val13;
186 ktep->val[14] = val14;
187 ktep->val[15] = val15;
188}
189
190/*
191 * Return the number of entries in the trace buffer.
192 */
193int
194ktrace_nentries(
195 ktrace_t *ktp)
196{
197 int index;
198 if (ktp == NULL)
199 return 0;
200
201 index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
202 return (ktp->kt_rollover ? ktp->kt_nentries : index);
203}
204
205/*
206 * ktrace_first()
207 *
208 * This is used to find the start of the trace buffer.
209 * In conjunction with ktrace_next() it can be used to
210 * iterate through the entire trace buffer. This code does
211 * not do any locking because it is assumed that it is called
212 * from the debugger.
213 *
214 * The caller must pass in a pointer to a ktrace_snap
215 * structure in which we will keep some state used to
216 * iterate through the buffer. This state must not be touched
217 * by any code outside of this module.
218 */
219ktrace_entry_t *
220ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp)
221{
222 ktrace_entry_t *ktep;
223 int index;
224 int nentries;
225
226 if (ktp->kt_rollover)
227 index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
228 else
229 index = 0;
230
231 ktsp->ks_start = index;
232 ktep = &(ktp->kt_entries[index]);
233
234 nentries = ktrace_nentries(ktp);
235 index++;
236 if (index < nentries) {
237 ktsp->ks_index = index;
238 } else {
239 ktsp->ks_index = 0;
240 if (index > nentries)
241 ktep = NULL;
242 }
243 return ktep;
244}
245
246/*
247 * ktrace_next()
248 *
249 * This is used to iterate through the entries of the given
250 * trace buffer. The caller must pass in the ktrace_snap_t
251 * structure initialized by ktrace_first(). The return value
252 * will be either a pointer to the next ktrace_entry or NULL
253 * if all of the entries have been traversed.
254 */
255ktrace_entry_t *
256ktrace_next(
257 ktrace_t *ktp,
258 ktrace_snap_t *ktsp)
259{
260 int index;
261 ktrace_entry_t *ktep;
262
263 index = ktsp->ks_index;
264 if (index == ktsp->ks_start) {
265 ktep = NULL;
266 } else {
267 ktep = &ktp->kt_entries[index];
268 }
269
270 index++;
271 if (index == ktrace_nentries(ktp)) {
272 ktsp->ks_index = 0;
273 } else {
274 ktsp->ks_index = index;
275 }
276
277 return ktep;
278}
279
280/*
281 * ktrace_skip()
282 *
283 * Skip the next "count" entries and return the entry after that.
284 * Return NULL if this causes us to iterate past the beginning again.
285 */
286ktrace_entry_t *
287ktrace_skip(
288 ktrace_t *ktp,
289 int count,
290 ktrace_snap_t *ktsp)
291{
292 int index;
293 int new_index;
294 ktrace_entry_t *ktep;
295 int nentries = ktrace_nentries(ktp);
296
297 index = ktsp->ks_index;
298 new_index = index + count;
299 while (new_index >= nentries) {
300 new_index -= nentries;
301 }
302 if (index == ktsp->ks_start) {
303 /*
304 * We've iterated around to the start, so we're done.
305 */
306 ktep = NULL;
307 } else if ((new_index < index) && (index < ktsp->ks_index)) {
308 /*
309 * We've skipped past the start again, so we're done.
310 */
311 ktep = NULL;
312 ktsp->ks_index = ktsp->ks_start;
313 } else {
314 ktep = &(ktp->kt_entries[new_index]);
315 new_index++;
316 if (new_index == nentries) {
317 ktsp->ks_index = 0;
318 } else {
319 ktsp->ks_index = new_index;
320 }
321 }
322 return ktep;
323}
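
The deleted ktrace_enter() above shows the one trick worth remembering from this code: the entry index is claimed with an atomic increment and reduced with a power-of-two mask, so concurrent tracers never take a lock. A minimal runnable sketch of that ring-buffer discipline using C11 atomics (names are mine, not the kernel's):

#include <stdatomic.h>
#include <stdio.h>

#define RING_ENTRIES 64			/* must stay a power of two */
#define RING_MASK (RING_ENTRIES - 1)

struct ring {
	atomic_int index;		/* monotonically increasing */
	void *slots[RING_ENTRIES];
};

static void ring_enter(struct ring *r, void *val)
{
	/* claim a slot; wraparound is handled entirely by the mask */
	int i = atomic_fetch_add(&r->index, 1) & RING_MASK;
	r->slots[i] = val;
}

int main(void)
{
	static struct ring r;
	for (int i = 0; i < 100; i++)
		ring_enter(&r, &r);	/* 100 entries wrap into 64 slots */
	printf("next index %d\n", atomic_load(&r.index) & RING_MASK); /* 36 */
	return 0;
}
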
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
deleted file mode 100644
index 741d6947ca60..000000000000
--- a/fs/xfs/support/ktrace.h
+++ /dev/null
@@ -1,85 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_KTRACE_H__
19#define __XFS_SUPPORT_KTRACE_H__
20
21/*
22 * Trace buffer entry structure.
23 */
24typedef struct ktrace_entry {
25 void *val[16];
26} ktrace_entry_t;
27
28/*
29 * Trace buffer header structure.
30 */
31typedef struct ktrace {
32 int kt_nentries; /* number of entries in trace buf */
33 atomic_t kt_index; /* current index in entries */
34 unsigned int kt_index_mask;
35 int kt_rollover;
36 ktrace_entry_t *kt_entries; /* buffer of entries */
37} ktrace_t;
38
39/*
40 * Trace buffer snapshot structure.
41 */
42typedef struct ktrace_snap {
43 int ks_start; /* kt_index at time of snap */
44 int ks_index; /* current index */
45} ktrace_snap_t;
46
47
48#ifdef CONFIG_XFS_TRACE
49
50extern void ktrace_init(int zentries);
51extern void ktrace_uninit(void);
52
53extern ktrace_t *ktrace_alloc(int, unsigned int __nocast);
54extern void ktrace_free(ktrace_t *);
55
56extern void ktrace_enter(
57 ktrace_t *,
58 void *,
59 void *,
60 void *,
61 void *,
62 void *,
63 void *,
64 void *,
65 void *,
66 void *,
67 void *,
68 void *,
69 void *,
70 void *,
71 void *,
72 void *,
73 void *);
74
75extern ktrace_entry_t *ktrace_first(ktrace_t *, ktrace_snap_t *);
76extern int ktrace_nentries(ktrace_t *);
77extern ktrace_entry_t *ktrace_next(ktrace_t *, ktrace_snap_t *);
78extern ktrace_entry_t *ktrace_skip(ktrace_t *, int, ktrace_snap_t *);
79
80#else
81#define ktrace_init(x) do { } while (0)
82#define ktrace_uninit() do { } while (0)
83#endif /* CONFIG_XFS_TRACE */
84
85#endif /* __XFS_SUPPORT_KTRACE_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 17254b529c54..5ad8ad3a1dcd 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -25,21 +25,5 @@
25/* #define QUOTADEBUG 1 */ 25/* #define QUOTADEBUG 1 */
26#endif 26#endif
27 27
28#ifdef CONFIG_XFS_TRACE
29#define XFS_ALLOC_TRACE 1
30#define XFS_ATTR_TRACE 1
31#define XFS_BLI_TRACE 1
32#define XFS_BMAP_TRACE 1
33#define XFS_BTREE_TRACE 1
34#define XFS_DIR2_TRACE 1
35#define XFS_DQUOT_TRACE 1
36#define XFS_ILOCK_TRACE 1
37#define XFS_LOG_TRACE 1
38#define XFS_RW_TRACE 1
39#define XFS_BUF_TRACE 1
40#define XFS_INODE_TRACE 1
41#define XFS_FILESTREAMS_TRACE 1
42#endif
43
44#include <linux-2.6/xfs_linux.h> 28#include <linux-2.6/xfs_linux.h>
45#endif /* __XFS_H__ */ 29#endif /* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 947b150df8ed..d13eeba2c8f8 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -36,8 +36,8 @@ struct xfs_acl {
36}; 36};
37 37
38/* On-disk XFS extended attribute names */ 38/* On-disk XFS extended attribute names */
39#define SGI_ACL_FILE "SGI_ACL_FILE" 39#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
40#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" 40#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
41#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) 41#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) 42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
43 43
@@ -49,7 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
49extern int posix_acl_access_exists(struct inode *inode); 49extern int posix_acl_access_exists(struct inode *inode);
50extern int posix_acl_default_exists(struct inode *inode); 50extern int posix_acl_default_exists(struct inode *inode);
51 51
52extern struct xattr_handler xfs_xattr_system_handler; 52extern struct xattr_handler xfs_xattr_acl_access_handler;
53extern struct xattr_handler xfs_xattr_acl_default_handler;
53#else 54#else
54# define xfs_check_acl NULL 55# define xfs_check_acl NULL
55# define xfs_get_acl(inode, type) NULL 56# define xfs_get_acl(inode, type) NULL
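
One C subtlety worth flagging in the xfs_acl.h change above: once a string literal is wrapped in a pointer cast, sizeof yields the size of the pointer, not of the array, so a macro like sizeof(SGI_ACL_FILE)-1 no longer computes the string length. Whether that bites depends on where the *_SIZE macros are used after this change; the runnable demo below only shows the language rule:

#include <stdio.h>

#define NAME_PLAIN "SGI_ACL_FILE"
#define NAME_CAST  (unsigned char *)"SGI_ACL_FILE"

int main(void)
{
	printf("%zu\n", sizeof(NAME_PLAIN) - 1);  /* 12: the string length */
	printf("%zu\n", sizeof(NAME_CAST) - 1);   /* pointer size - 1, e.g. 7 */
	return 0;
}
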
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index a5d54bf4931b..b1a5a1ff88ea 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -86,6 +86,20 @@ typedef struct xfs_agf {
86#define XFS_AGF_NUM_BITS 12 86#define XFS_AGF_NUM_BITS 12
87#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) 87#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
88 88
89#define XFS_AGF_FLAGS \
90 { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
91 { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
92 { XFS_AGF_SEQNO, "SEQNO" }, \
93 { XFS_AGF_LENGTH, "LENGTH" }, \
94 { XFS_AGF_ROOTS, "ROOTS" }, \
95 { XFS_AGF_LEVELS, "LEVELS" }, \
96 { XFS_AGF_FLFIRST, "FLFIRST" }, \
97 { XFS_AGF_FLLAST, "FLLAST" }, \
98 { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
99 { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
100 { XFS_AGF_LONGEST, "LONGEST" }, \
101 { XFS_AGF_BTREEBLKS, "BTREEBLKS" }
102
89/* disk block (xfs_daddr_t) in the AG */ 103/* disk block (xfs_daddr_t) in the AG */
90#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) 104#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
91#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) 105#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
@@ -173,17 +187,13 @@ typedef struct xfs_perag_busy {
173/* 187/*
174 * Per-ag incore structure, copies of information in agf and agi, 188 * Per-ag incore structure, copies of information in agf and agi,
175 * to improve the performance of allocation group selection. 189 * to improve the performance of allocation group selection.
176 *
177 * pick sizes which fit in allocation buckets well
178 */ 190 */
179#if (BITS_PER_LONG == 32)
180#define XFS_PAGB_NUM_SLOTS 84
181#elif (BITS_PER_LONG == 64)
182#define XFS_PAGB_NUM_SLOTS 128 191#define XFS_PAGB_NUM_SLOTS 128
183#endif
184 192
185typedef struct xfs_perag 193typedef struct xfs_perag {
186{ 194 struct xfs_mount *pag_mount; /* owner filesystem */
195 xfs_agnumber_t pag_agno; /* AG this structure belongs to */
196 atomic_t pag_ref; /* perag reference count */
187 char pagf_init; /* this agf's entry is initialized */ 197 char pagf_init; /* this agf's entry is initialized */
188 char pagi_init; /* this agi's entry is initialized */ 198 char pagi_init; /* this agi's entry is initialized */
189 char pagf_metadata; /* the agf is preferred to be metadata */ 199 char pagf_metadata; /* the agf is preferred to be metadata */
@@ -196,8 +206,6 @@ typedef struct xfs_perag
196 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ 206 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
197 xfs_agino_t pagi_freecount; /* number of free inodes */ 207 xfs_agino_t pagi_freecount; /* number of free inodes */
198 xfs_agino_t pagi_count; /* number of allocated inodes */ 208 xfs_agino_t pagi_count; /* number of allocated inodes */
199 int pagb_count; /* pagb slots in use */
200 xfs_perag_busy_t *pagb_list; /* unstable blocks */
201 209
202 /* 210 /*
203 * Inode allocation search lookup optimisation. 211 * Inode allocation search lookup optimisation.
@@ -216,6 +224,8 @@ typedef struct xfs_perag
216 rwlock_t pag_ici_lock; /* incore inode lock */ 224 rwlock_t pag_ici_lock; /* incore inode lock */
217 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 225 struct radix_tree_root pag_ici_root; /* incore inode cache root */
218#endif 226#endif
227 int pagb_count; /* pagb slots in use */
228 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
219} xfs_perag_t; 229} xfs_perag_t;
220 230
221/* 231/*
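
The xfs_ag.h hunk above does two things: it adds back-references to the perag (pag_mount, pag_agno, pag_ref), and it converts the busy-extent list from a separately allocated, word-size-tuned array into fixed storage embedded in the structure itself, so each AG costs one allocation fewer. A type-level sketch of that layout change, compiled with simplified stand-ins for the kernel types:

#include <stdio.h>

#define NUM_SLOTS 128

struct busy { int bno, len; };

struct perag_old {
	int count;
	struct busy *list;		/* allocated and freed separately */
};

struct perag_new {
	int count;
	struct busy list[NUM_SLOTS];	/* lives and dies with the perag */
};

int main(void)
{
	printf("old: %zu bytes plus a side array; new: %zu bytes, one allocation\n",
	       sizeof(struct perag_old), sizeof(struct perag_new));
	return 0;
}
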
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 2cf944eb796d..94cddbfb2560 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -38,6 +38,7 @@
38#include "xfs_ialloc.h" 38#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 39#include "xfs_alloc.h"
40#include "xfs_error.h" 40#include "xfs_error.h"
41#include "xfs_trace.h"
41 42
42 43
43#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) 44#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
@@ -51,30 +52,6 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
51 xfs_agblock_t bno, 52 xfs_agblock_t bno,
52 xfs_extlen_t len); 53 xfs_extlen_t len);
53 54
54#if defined(XFS_ALLOC_TRACE)
55ktrace_t *xfs_alloc_trace_buf;
56
57#define TRACE_ALLOC(s,a) \
58 xfs_alloc_trace_alloc(__func__, s, a, __LINE__)
59#define TRACE_FREE(s,a,b,x,f) \
60 xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__)
61#define TRACE_MODAGF(s,a,f) \
62 xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__)
63#define TRACE_BUSY(__func__,s,ag,agb,l,sl,tp) \
64 xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
65#define TRACE_UNBUSY(__func__,s,ag,sl,tp) \
66 xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
67#define TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp) \
68 xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
69#else
70#define TRACE_ALLOC(s,a)
71#define TRACE_FREE(s,a,b,x,f)
72#define TRACE_MODAGF(s,a,f)
73#define TRACE_BUSY(s,a,ag,agb,l,sl,tp)
74#define TRACE_UNBUSY(fname,s,ag,sl,tp)
75#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp)
76#endif /* XFS_ALLOC_TRACE */
77
78/* 55/*
79 * Prototypes for per-ag allocation routines 56 * Prototypes for per-ag allocation routines
80 */ 57 */
@@ -498,124 +475,6 @@ xfs_alloc_read_agfl(
 	return 0;
 }
 
-#if defined(XFS_ALLOC_TRACE)
-/*
- * Add an allocation trace entry for an alloc call.
- */
-STATIC void
-xfs_alloc_trace_alloc(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_alloc_arg_t	*args,		/* allocation argument structure */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)args->mp,
-		(void *)(__psunsigned_t)args->agno,
-		(void *)(__psunsigned_t)args->agbno,
-		(void *)(__psunsigned_t)args->minlen,
-		(void *)(__psunsigned_t)args->maxlen,
-		(void *)(__psunsigned_t)args->mod,
-		(void *)(__psunsigned_t)args->prod,
-		(void *)(__psunsigned_t)args->minleft,
-		(void *)(__psunsigned_t)args->total,
-		(void *)(__psunsigned_t)args->alignment,
-		(void *)(__psunsigned_t)args->len,
-		(void *)((((__psint_t)args->type) << 16) |
-			 (__psint_t)args->otype),
-		(void *)(__psint_t)((args->wasdel << 3) |
-				    (args->wasfromfl << 2) |
-				    (args->isfl << 1) |
-				    (args->userdata << 0)));
-}
-
-/*
- * Add an allocation trace entry for a free call.
- */
-STATIC void
-xfs_alloc_trace_free(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* a.g. relative block number */
-	xfs_extlen_t	len,		/* length of extent */
-	int		isfl,		/* set if is freelist allocation/free */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psunsigned_t)agno,
-		(void *)(__psunsigned_t)agbno,
-		(void *)(__psunsigned_t)len,
-		(void *)(__psint_t)isfl,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add an allocation trace entry for modifying an agf.
- */
-STATIC void
-xfs_alloc_trace_modagf(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agf_t	*agf,		/* new agf value */
-	int		flags,		/* logging flags for agf */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psint_t)flags,
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_seqno),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_length),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_flfirst),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_fllast),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_flcount),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_freeblks),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_longest));
-}
-
-STATIC void
-xfs_alloc_trace_busy(
-	const char	*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* a.g. relative block number */
-	xfs_extlen_t	len,		/* length of extent */
-	int		slot,		/* perag Busy slot */
-	xfs_trans_t	*tp,
-	int		trtype,		/* type: add, delete, search */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(trtype | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psunsigned_t)agno,
-		(void *)(__psunsigned_t)agbno,
-		(void *)(__psunsigned_t)len,
-		(void *)(__psint_t)slot,
-		(void *)tp,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
-#endif	/* XFS_ALLOC_TRACE */
-
 /*
  * Allocation group level functions.
  */
@@ -665,9 +524,6 @@ xfs_alloc_ag_vextent(
 	 */
 	if (args->agbno != NULLAGBLOCK) {
 		xfs_agf_t	*agf;	/* allocation group freelist header */
-#ifdef XFS_ALLOC_TRACE
-		xfs_mount_t	*mp = args->mp;
-#endif
 		long		slen = (long)args->len;
 
 		ASSERT(args->len >= args->minlen && args->len <= args->maxlen);
@@ -682,7 +538,6 @@ xfs_alloc_ag_vextent(
 			args->pag->pagf_freeblks -= args->len;
 			ASSERT(be32_to_cpu(agf->agf_freeblks) <=
 				be32_to_cpu(agf->agf_length));
-			TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
 			xfs_alloc_log_agf(args->tp, args->agbp,
 						XFS_AGF_FREEBLKS);
 			/* search the busylist for these blocks */
@@ -792,13 +647,14 @@ xfs_alloc_ag_vextent_exact(
 	}
 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-	TRACE_ALLOC("normal", args);
+
+	trace_xfs_alloc_exact_done(args);
 	args->wasfromfl = 0;
 	return 0;
 
 error0:
 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_exact_error(args);
 	return error;
 }
 
@@ -958,7 +814,7 @@ xfs_alloc_ag_vextent_near(
 			args->len = blen;
 			if (!xfs_alloc_fix_minleft(args)) {
 				xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-				TRACE_ALLOC("nominleft", args);
+				trace_xfs_alloc_near_nominleft(args);
 				return 0;
 			}
 			blen = args->len;
@@ -981,7 +837,8 @@ xfs_alloc_ag_vextent_near(
 			goto error0;
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-		TRACE_ALLOC("first", args);
+
+		trace_xfs_alloc_near_first(args);
 		return 0;
 	}
 	/*
@@ -1272,7 +1129,7 @@ xfs_alloc_ag_vextent_near(
 	 * If we couldn't get anything, give up.
 	 */
 	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
-		TRACE_ALLOC("neither", args);
+		trace_xfs_alloc_size_neither(args);
 		args->agbno = NULLAGBLOCK;
 		return 0;
 	}
@@ -1299,7 +1156,7 @@ xfs_alloc_ag_vextent_near(
 		args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 		xfs_alloc_fix_len(args);
 		if (!xfs_alloc_fix_minleft(args)) {
-			TRACE_ALLOC("nominleft", args);
+			trace_xfs_alloc_near_nominleft(args);
 			xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 			return 0;
@@ -1314,13 +1171,18 @@ xfs_alloc_ag_vextent_near(
 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
 			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
 		goto error0;
-	TRACE_ALLOC(j ? "gt" : "lt", args);
+
+	if (j)
+		trace_xfs_alloc_near_greater(args);
+	else
+		trace_xfs_alloc_near_lesser(args);
+
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 	xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
 	return 0;
 
  error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_near_error(args);
 	if (cnt_cur != NULL)
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
 	if (bno_cur_lt != NULL)
@@ -1371,7 +1233,7 @@ xfs_alloc_ag_vextent_size(
 		goto error0;
 	if (i == 0 || flen == 0) {
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		TRACE_ALLOC("noentry", args);
+		trace_xfs_alloc_size_noentry(args);
 		return 0;
 	}
 	ASSERT(i == 1);
@@ -1448,7 +1310,7 @@ xfs_alloc_ag_vextent_size(
 	xfs_alloc_fix_len(args);
 	if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		TRACE_ALLOC("nominleft", args);
+		trace_xfs_alloc_size_nominleft(args);
 		args->agbno = NULLAGBLOCK;
 		return 0;
 	}
@@ -1471,11 +1333,11 @@ xfs_alloc_ag_vextent_size(
 		args->agbno + args->len <=
 			be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
 		error0);
-	TRACE_ALLOC("normal", args);
+	trace_xfs_alloc_size_done(args);
 	return 0;
 
 error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_size_error(args);
 	if (cnt_cur)
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
 	if (bno_cur)
@@ -1534,7 +1396,7 @@ xfs_alloc_ag_vextent_small(
 			be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
 			error0);
 		args->wasfromfl = 1;
-		TRACE_ALLOC("freelist", args);
+		trace_xfs_alloc_small_freelist(args);
 		*stat = 0;
 		return 0;
 	}
@@ -1556,17 +1418,17 @@ xfs_alloc_ag_vextent_small(
 	 */
 	if (flen < args->minlen) {
 		args->agbno = NULLAGBLOCK;
-		TRACE_ALLOC("notenough", args);
+		trace_xfs_alloc_small_notenough(args);
 		flen = 0;
 	}
 	*fbnop = fbno;
 	*flenp = flen;
 	*stat = 1;
-	TRACE_ALLOC("normal", args);
+	trace_xfs_alloc_small_done(args);
 	return 0;
 
 error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_small_error(args);
 	return error;
 }
 
@@ -1800,26 +1662,25 @@ xfs_free_ag_extent(
 		xfs_agf_t	*agf;
 		xfs_perag_t	*pag;	/* per allocation group data */
 
+		pag = xfs_perag_get(mp, agno);
+		pag->pagf_freeblks += len;
+		xfs_perag_put(pag);
+
 		agf = XFS_BUF_TO_AGF(agbp);
-		pag = &mp->m_perag[agno];
 		be32_add_cpu(&agf->agf_freeblks, len);
 		xfs_trans_agblocks_delta(tp, len);
-		pag->pagf_freeblks += len;
 		XFS_WANT_CORRUPTED_GOTO(
 			be32_to_cpu(agf->agf_freeblks) <=
 			be32_to_cpu(agf->agf_length),
 			error0);
-		TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
 		xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
 		if (!isfl)
 			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
 		XFS_STATS_INC(xs_freex);
 		XFS_STATS_ADD(xs_freeb, len);
 	}
-	TRACE_FREE(haveleft ?
-		(haveright ? "both" : "left") :
-		(haveright ? "right" : "none"),
-		agno, bno, len, isfl);
+
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
 
 	/*
 	 * Since blocks move to the free list without the coordination
@@ -1836,7 +1697,7 @@ xfs_free_ag_extent(
 	return 0;
 
  error0:
-	TRACE_FREE("error", agno, bno, len, isfl);
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
 	if (bno_cur)
 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
 	if (cnt_cur)
@@ -2110,10 +1971,12 @@ xfs_alloc_get_freelist(
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
 		agf->agf_flfirst = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
 	be32_add_cpu(&agf->agf_flcount, -1);
 	xfs_trans_agflist_delta(tp, -1);
 	pag->pagf_flcount--;
+	xfs_perag_put(pag);
 
 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
 	if (btreeblk) {
@@ -2122,7 +1985,6 @@ xfs_alloc_get_freelist(
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
 
-	TRACE_MODAGF(NULL, agf, logflags);
 	xfs_alloc_log_agf(tp, agbp, logflags);
 	*bnop = bno;
 
@@ -2165,6 +2027,8 @@ xfs_alloc_log_agf(
 		sizeof(xfs_agf_t)
 	};
 
+	trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+
 	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
 }
@@ -2218,7 +2082,8 @@ xfs_alloc_put_freelist(
 	be32_add_cpu(&agf->agf_fllast, 1);
 	if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
 		agf->agf_fllast = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
 	be32_add_cpu(&agf->agf_flcount, 1);
 	xfs_trans_agflist_delta(tp, 1);
 	pag->pagf_flcount++;
@@ -2229,14 +2094,13 @@ xfs_alloc_put_freelist(
 		pag->pagf_btreeblks--;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
+	xfs_perag_put(pag);
 
-	TRACE_MODAGF(NULL, agf, logflags);
 	xfs_alloc_log_agf(tp, agbp, logflags);
 
 	ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
 	blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
 	*blockp = cpu_to_be32(bno);
-	TRACE_MODAGF(NULL, agf, logflags);
 	xfs_alloc_log_agf(tp, agbp, logflags);
 	xfs_trans_log_buf(tp, agflbp,
 		(int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
@@ -2294,7 +2158,6 @@ xfs_read_agf(
 		xfs_trans_brelse(tp, *bpp);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
-
 	XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
 	return 0;
 }
@@ -2317,7 +2180,7 @@ xfs_alloc_read_agf(
 	ASSERT(agno != NULLAGNUMBER);
 
 	error = xfs_read_agf(mp, tp, agno,
-			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
+			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
 			bpp);
 	if (error)
 		return error;
@@ -2326,7 +2189,7 @@ xfs_alloc_read_agf(
 	ASSERT(!XFS_BUF_GETERROR(*bpp));
 
 	agf = XFS_BUF_TO_AGF(*bpp);
-	pag = &mp->m_perag[agno];
+	pag = xfs_perag_get(mp, agno);
 	if (!pag->pagf_init) {
 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -2337,8 +2200,8 @@ xfs_alloc_read_agf(
 		pag->pagf_levels[XFS_BTNUM_CNTi] =
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
 		spin_lock_init(&pag->pagb_lock);
-		pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
-					sizeof(xfs_perag_busy_t), KM_SLEEP);
+		pag->pagb_count = 0;
+		memset(pag->pagb_list, 0, sizeof(pag->pagb_list));
 		pag->pagf_init = 1;
 	}
 #ifdef DEBUG
@@ -2353,6 +2216,7 @@ xfs_alloc_read_agf(
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
 	}
 #endif
+	xfs_perag_put(pag);
 	return 0;
 }
 
@@ -2399,7 +2263,7 @@ xfs_alloc_vextent(
 	    args->minlen > args->maxlen || args->minlen > agsize ||
 	    args->mod >= args->prod) {
 		args->fsbno = NULLFSBLOCK;
-		TRACE_ALLOC("badargs", args);
+		trace_xfs_alloc_vextent_badargs(args);
 		return 0;
 	}
 	minleft = args->minleft;
@@ -2412,24 +2276,21 @@ xfs_alloc_vextent(
 		 * These three force us into a single a.g.
 		 */
 		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-		down_read(&mp->m_peraglock);
-		args->pag = &mp->m_perag[args->agno];
+		args->pag = xfs_perag_get(mp, args->agno);
 		args->minleft = 0;
 		error = xfs_alloc_fix_freelist(args, 0);
 		args->minleft = minleft;
 		if (error) {
-			TRACE_ALLOC("nofix", args);
+			trace_xfs_alloc_vextent_nofix(args);
 			goto error0;
 		}
 		if (!args->agbp) {
-			up_read(&mp->m_peraglock);
-			TRACE_ALLOC("noagbp", args);
+			trace_xfs_alloc_vextent_noagbp(args);
 			break;
 		}
 		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
 		if ((error = xfs_alloc_ag_vextent(args)))
 			goto error0;
-		up_read(&mp->m_peraglock);
 		break;
 	case XFS_ALLOCTYPE_START_BNO:
 		/*
@@ -2481,14 +2342,13 @@ xfs_alloc_vextent(
 		 * Loop over allocation groups twice; first time with
 		 * trylock set, second time without.
 		 */
-		down_read(&mp->m_peraglock);
 		for (;;) {
-			args->pag = &mp->m_perag[args->agno];
+			args->pag = xfs_perag_get(mp, args->agno);
 			if (no_min) args->minleft = 0;
 			error = xfs_alloc_fix_freelist(args, flags);
 			args->minleft = minleft;
 			if (error) {
-				TRACE_ALLOC("nofix", args);
+				trace_xfs_alloc_vextent_nofix(args);
 				goto error0;
 			}
 			/*
@@ -2499,7 +2359,9 @@ xfs_alloc_vextent(
 					goto error0;
 				break;
 			}
-			TRACE_ALLOC("loopfailed", args);
+
+			trace_xfs_alloc_vextent_loopfailed(args);
+
 			/*
 			 * Didn't work, figure out the next iteration.
 			 */
@@ -2526,7 +2388,7 @@ xfs_alloc_vextent(
 			if (args->agno == sagno) {
 				if (no_min == 1) {
 					args->agbno = NULLAGBLOCK;
-					TRACE_ALLOC("allfailed", args);
+					trace_xfs_alloc_vextent_allfailed(args);
 					break;
 				}
 				if (flags == 0) {
@@ -2540,8 +2402,8 @@ xfs_alloc_vextent(
 				}
 			}
 		}
+		xfs_perag_put(args->pag);
 	}
-	up_read(&mp->m_peraglock);
 	if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
 		if (args->agno == sagno)
 			mp->m_agfrotor = (mp->m_agfrotor + 1) %
@@ -2567,9 +2429,10 @@ xfs_alloc_vextent(
 			args->len);
 #endif
 	}
+	xfs_perag_put(args->pag);
 	return 0;
 error0:
-	up_read(&mp->m_peraglock);
+	xfs_perag_put(args->pag);
 	return error;
 }
 
2575 2438
@@ -2594,8 +2457,7 @@ xfs_free_extent(
 	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
 	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
 	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
-	down_read(&args.mp->m_peraglock);
-	args.pag = &args.mp->m_perag[args.agno];
+	args.pag = xfs_perag_get(args.mp, args.agno);
 	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
 		goto error0;
 #ifdef DEBUG
@@ -2605,7 +2467,7 @@ xfs_free_extent(
 #endif
 	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
 error0:
-	up_read(&args.mp->m_peraglock);
+	xfs_perag_put(args.pag);
 	return error;
 }
 
@@ -2626,15 +2488,15 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 	xfs_agblock_t	bno,
 	xfs_extlen_t	len)
 {
-	xfs_mount_t	*mp;
 	xfs_perag_busy_t *bsy;
+	struct xfs_perag *pag;
 	int		n;
 
-	mp = tp->t_mountp;
-	spin_lock(&mp->m_perag[agno].pagb_lock);
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
 
 	/* search pagb_list for an open slot */
-	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
+	for (bsy = pag->pagb_list, n = 0;
 	     n < XFS_PAGB_NUM_SLOTS;
 	     bsy++, n++) {
 		if (bsy->busy_tp == NULL) {
@@ -2642,16 +2504,16 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 		}
 	}
 
+	trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n);
+
 	if (n < XFS_PAGB_NUM_SLOTS) {
-		bsy = &mp->m_perag[agno].pagb_list[n];
-		mp->m_perag[agno].pagb_count++;
-		TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp);
+		bsy = &pag->pagb_list[n];
+		pag->pagb_count++;
 		bsy->busy_start = bno;
 		bsy->busy_length = len;
 		bsy->busy_tp = tp;
 		xfs_trans_add_busy(tp, agno, n);
 	} else {
-		TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp);
 		/*
 		 * The busy list is full!  Since it is now not possible to
 		 * track the free block, make this a synchronous transaction
@@ -2661,7 +2523,8 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 		xfs_trans_set_sync(tp);
 	}
 
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
 }
 
 void
@@ -2669,24 +2532,23 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
 	xfs_agnumber_t	agno,
 	int		idx)
 {
-	xfs_mount_t	*mp;
+	struct xfs_perag *pag;
 	xfs_perag_busy_t *list;
 
-	mp = tp->t_mountp;
+	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
+	list = pag->pagb_list;
 
-	spin_lock(&mp->m_perag[agno].pagb_lock);
-	list = mp->m_perag[agno].pagb_list;
+	trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp);
 
-	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
 	if (list[idx].busy_tp == tp) {
-		TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp);
 		list[idx].busy_tp = NULL;
-		mp->m_perag[agno].pagb_count--;
-	} else {
-		TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp);
+		pag->pagb_count--;
 	}
 
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
 }
 
 
@@ -2700,48 +2562,44 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	xfs_agblock_t	bno,
 	xfs_extlen_t	len)
 {
-	xfs_mount_t	*mp;
+	struct xfs_perag *pag;
 	xfs_perag_busy_t *bsy;
 	xfs_agblock_t	uend, bend;
-	xfs_lsn_t	lsn;
+	xfs_lsn_t	lsn = 0;
 	int		cnt;
 
-	mp = tp->t_mountp;
-
-	spin_lock(&mp->m_perag[agno].pagb_lock);
-	cnt = mp->m_perag[agno].pagb_count;
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
+	cnt = pag->pagb_count;
 
+	/*
+	 * search pagb_list for this slot, skipping open slots. We have to
+	 * search the entire array as there may be multiple overlaps and
+	 * we have to get the most recent LSN for the log force to push out
+	 * all the transactions that span the range.
+	 */
 	uend = bno + len - 1;
-
-	/* search pagb_list for this slot, skipping open slots */
-	for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
-
-		/*
-		 * (start1,length1) within (start2, length2)
-		 */
-		if (bsy->busy_tp != NULL) {
-			bend = bsy->busy_start + bsy->busy_length - 1;
-			if ((bno > bend) || (uend < bsy->busy_start)) {
-				cnt--;
-			} else {
-				TRACE_BUSYSEARCH("xfs_alloc_search_busy",
-						 "found1", agno, bno, len, tp);
-				break;
-			}
-		}
+	for (cnt = 0; cnt < pag->pagb_count; cnt++) {
+		bsy = &pag->pagb_list[cnt];
+		if (!bsy->busy_tp)
+			continue;
+
+		bend = bsy->busy_start + bsy->busy_length - 1;
+		if (bno > bend || uend < bsy->busy_start)
+			continue;
+
+		/* (start1,length1) within (start2, length2) */
+		if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
+			lsn = bsy->busy_tp->t_commit_lsn;
 	}
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
+	trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
 
 	/*
 	 * If a block was found, force the log through the LSN of the
 	 * transaction that freed the block
 	 */
-	if (cnt) {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
-		lsn = bsy->busy_tp->t_commit_lsn;
-		spin_unlock(&mp->m_perag[agno].pagb_lock);
-		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
-	} else {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
-		spin_unlock(&mp->m_perag[agno].pagb_lock);
-	}
+	if (lsn)
+		xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
 }
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index e704caee10df..599bffa39784 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -37,6 +37,15 @@ typedef enum xfs_alloctype
 	XFS_ALLOCTYPE_THIS_BNO	/* at exactly this block */
 } xfs_alloctype_t;
 
+#define XFS_ALLOC_TYPES \
+	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" }, \
+	{ XFS_ALLOCTYPE_FIRST_AG,	"FIRST_AG" }, \
+	{ XFS_ALLOCTYPE_START_AG,	"START_AG" }, \
+	{ XFS_ALLOCTYPE_THIS_AG,	"THIS_AG" }, \
+	{ XFS_ALLOCTYPE_START_BNO,	"START_BNO" }, \
+	{ XFS_ALLOCTYPE_NEAR_BNO,	"NEAR_BNO" }, \
+	{ XFS_ALLOCTYPE_THIS_BNO,	"THIS_BNO" }
+
 /*
  * Flags for xfs_alloc_fix_freelist.
  */
@@ -109,24 +118,6 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 
 #ifdef __KERNEL__
 
-#if defined(XFS_ALLOC_TRACE)
-/*
- * Allocation tracing buffer size.
- */
-#define	XFS_ALLOC_TRACE_SIZE	4096
-extern ktrace_t *xfs_alloc_trace_buf;
-
-/*
- * Types for alloc tracing.
- */
-#define	XFS_ALLOC_KTRACE_ALLOC	1
-#define	XFS_ALLOC_KTRACE_FREE	2
-#define	XFS_ALLOC_KTRACE_MODAGF	3
-#define	XFS_ALLOC_KTRACE_BUSY	4
-#define	XFS_ALLOC_KTRACE_UNBUSY	5
-#define	XFS_ALLOC_KTRACE_BUSYSEARCH	6
-#endif
-
 void
 xfs_alloc_mark_busy(xfs_trans_t *tp,
 	xfs_agnumber_t	agno,
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index c10c3a292d30..b726e10d2c1c 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -39,6 +39,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 
 STATIC struct xfs_btree_cur *
@@ -60,12 +61,14 @@ xfs_allocbt_set_root(
 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
 	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
 	int			btnum = cur->bc_btnum;
+	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
 
 	ASSERT(ptr->s != 0);
 
 	agf->agf_roots[btnum] = ptr->s;
 	be32_add_cpu(&agf->agf_levels[btnum], inc);
-	cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
+	pag->pagf_levels[btnum] += inc;
+	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 }
@@ -149,6 +152,7 @@ xfs_allocbt_update_lastrec(
 {
 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
 	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	struct xfs_perag	*pag;
 	__be32			len;
 	int			numrecs;
 
@@ -192,7 +196,9 @@ xfs_allocbt_update_lastrec(
 	}
 
 	agf->agf_longest = len;
-	cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
+	pag = xfs_perag_get(cur->bc_mp, seqno);
+	pag->pagf_longest = be32_to_cpu(len);
+	xfs_perag_put(pag);
 	xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
 }
 
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 4ece1906bd41..b9c196a53c42 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -47,6 +47,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_rw.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 /*
  * xfs_attr.c
@@ -89,19 +90,15 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
 
 #define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
 
-#if defined(XFS_ATTR_TRACE)
-ktrace_t *xfs_attr_trace_buf;
-#endif
-
 STATIC int
 xfs_attr_name_to_xname(
 	struct xfs_name	*xname,
-	const char	*aname)
+	const unsigned char *aname)
 {
 	if (!aname)
 		return EINVAL;
 	xname->name = aname;
-	xname->len = strlen(aname);
+	xname->len = strlen((char *)aname);
 	if (xname->len >= MAXNAMELEN)
 		return EFAULT;		/* match IRIX behaviour */
 
@@ -123,9 +120,13 @@ xfs_inode_hasattr(
  * Overall external interface routines.
  *========================================================================*/
 
-int
-xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name,
-		char *value, int *valuelenp, int flags)
+STATIC int
+xfs_attr_get_int(
+	struct xfs_inode	*ip,
+	struct xfs_name		*name,
+	unsigned char		*value,
+	int			*valuelenp,
+	int			flags)
 {
 	xfs_da_args_t	args;
 	int		error;
@@ -170,8 +171,8 @@ xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name,
 int
 xfs_attr_get(
 	xfs_inode_t	*ip,
-	const char	*name,
-	char		*value,
+	const unsigned char *name,
+	unsigned char	*value,
 	int		*valuelenp,
 	int		flags)
 {
@@ -188,7 +189,7 @@ xfs_attr_get(
 		return error;
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	error = xfs_attr_fetch(ip, &xname, value, valuelenp, flags);
+	error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	return(error);
 }
@@ -196,7 +197,7 @@ xfs_attr_get(
 /*
  * Calculate how many blocks we need for the new attribute,
  */
-int
+STATIC int
 xfs_attr_calc_size(
 	struct xfs_inode	*ip,
 	int			namelen,
@@ -234,8 +235,12 @@ xfs_attr_calc_size(
 }
 
 STATIC int
-xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
-		char *value, int valuelen, int flags)
+xfs_attr_set_int(
+	struct xfs_inode	*dp,
+	struct xfs_name		*name,
+	unsigned char		*value,
+	int			valuelen,
+	int			flags)
 {
 	xfs_da_args_t	args;
 	xfs_fsblock_t	firstblock;
@@ -451,8 +456,8 @@ out:
 int
 xfs_attr_set(
 	xfs_inode_t	*dp,
-	const char	*name,
-	char		*value,
+	const unsigned char *name,
+	unsigned char	*value,
 	int		valuelen,
 	int		flags)
 {
@@ -599,7 +604,7 @@ out:
 int
 xfs_attr_remove(
 	xfs_inode_t	*dp,
-	const char	*name,
+	const unsigned char *name,
 	int		flags)
 {
 	int		error;
@@ -636,7 +641,6 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
 		return EIO;
 
 	xfs_ilock(dp, XFS_ILOCK_SHARED);
-	xfs_attr_trace_l_c("syscall start", context);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -652,7 +656,6 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
 	}
 
 	xfs_iunlock(dp, XFS_ILOCK_SHARED);
-	xfs_attr_trace_l_c("syscall end", context);
 
 	return error;
 }
@@ -670,9 +673,13 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
  */
 /*ARGSUSED*/
 STATIC int
-xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
-		     char *name, int namelen,
-		     int valuelen, char *value)
+xfs_attr_put_listent(
+	xfs_attr_list_context_t *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
 {
 	struct attrlist *alist = (struct attrlist *)context->alist;
 	attrlist_ent_t *aep;
@@ -698,7 +705,7 @@ xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
 			context->count * sizeof(alist->al_offset[0]);
 	context->firstu -= ATTR_ENTSIZE(namelen);
 	if (context->firstu < arraytop) {
-		xfs_attr_trace_l_c("buffer full", context);
+		trace_xfs_attr_list_full(context);
 		alist->al_more = 1;
 		context->seen_enough = 1;
 		return 1;
@@ -710,7 +717,7 @@ xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
 	aep->a_name[namelen] = 0;
 	alist->al_offset[context->count++] = context->firstu;
 	alist->al_count = context->count;
-	xfs_attr_trace_l_c("add", context);
+	trace_xfs_attr_list_add(context);
 	return 0;
 }
 
@@ -1849,7 +1856,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	node = bp->data;
 	switch (be16_to_cpu(node->hdr.info.magic)) {
 	case XFS_DA_NODE_MAGIC:
-		xfs_attr_trace_l_cn("wrong blk", context, node);
+		trace_xfs_attr_list_wrong_blk(context);
 		xfs_da_brelse(NULL, bp);
 		bp = NULL;
 		break;
@@ -1857,20 +1864,18 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 		leaf = bp->data;
 		if (cursor->hashval > be32_to_cpu(leaf->entries[
 		    be16_to_cpu(leaf->hdr.count)-1].hashval)) {
-			xfs_attr_trace_l_cl("wrong blk",
-					    context, leaf);
+			trace_xfs_attr_list_wrong_blk(context);
 			xfs_da_brelse(NULL, bp);
 			bp = NULL;
 		} else if (cursor->hashval <=
 			   be32_to_cpu(leaf->entries[0].hashval)) {
-			xfs_attr_trace_l_cl("maybe wrong blk",
-					    context, leaf);
+			trace_xfs_attr_list_wrong_blk(context);
 			xfs_da_brelse(NULL, bp);
 			bp = NULL;
 		}
 		break;
 	default:
-		xfs_attr_trace_l_c("wrong blk - ??", context);
+		trace_xfs_attr_list_wrong_blk(context);
 		xfs_da_brelse(NULL, bp);
 		bp = NULL;
 	}
@@ -1915,8 +1920,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 			if (cursor->hashval
 					<= be32_to_cpu(btree->hashval)) {
 				cursor->blkno = be32_to_cpu(btree->before);
-				xfs_attr_trace_l_cb("descending",
-						    context, btree);
+				trace_xfs_attr_list_node_descend(context,
+								 btree);
 				break;
 			}
 		}
@@ -1983,7 +1988,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 	xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
 	xfs_mount_t *mp;
 	xfs_daddr_t dblkno;
-	xfs_caddr_t dst;
+	void *dst;
 	xfs_buf_t *bp;
 	int nmap, error, tmp, valuelen, blkcnt, i;
 	xfs_dablk_t lblkno;
@@ -2010,15 +2015,14 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 			blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 			error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
-					     blkcnt,
-					     XFS_BUF_LOCK | XBF_DONT_BLOCK,
+					     blkcnt, XBF_LOCK | XBF_DONT_BLOCK,
 					     &bp);
 			if (error)
 				return(error);
 
 			tmp = (valuelen < XFS_BUF_SIZE(bp))
 				? valuelen : XFS_BUF_SIZE(bp);
-			xfs_biomove(bp, 0, tmp, dst, XFS_B_READ);
+			xfs_biomove(bp, 0, tmp, dst, XBF_READ);
 			xfs_buf_relse(bp);
 			dst += tmp;
 			valuelen -= tmp;
@@ -2042,7 +2046,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 	xfs_inode_t *dp;
 	xfs_bmbt_irec_t map;
 	xfs_daddr_t dblkno;
-	xfs_caddr_t src;
+	void *src;
 	xfs_buf_t *bp;
 	xfs_dablk_t lblkno;
 	int blkcnt, valuelen, nmap, error, tmp, committed;
@@ -2143,14 +2147,14 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
 		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
 
-		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt,
-				       XFS_BUF_LOCK | XBF_DONT_BLOCK);
+		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
+				 XBF_LOCK | XBF_DONT_BLOCK);
 		ASSERT(bp);
 		ASSERT(!XFS_BUF_GETERROR(bp));
 
 		tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
 							XFS_BUF_SIZE(bp);
-		xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE);
+		xfs_biomove(bp, 0, tmp, src, XBF_WRITE);
 		if (tmp < XFS_BUF_SIZE(bp))
 			xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
 		if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
@@ -2211,8 +2215,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 	/*
 	 * If the "remote" value is in the cache, remove it.
 	 */
-	bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt,
-			XFS_INCORE_TRYLOCK);
+	bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
 	if (bp) {
 		XFS_BUF_STALE(bp);
 		XFS_BUF_UNDELAYWRITE(bp);
@@ -2266,85 +2269,3 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 	}
 	return(0);
 }
-
-#if defined(XFS_ATTR_TRACE)
-/*
- * Add a trace buffer entry for an attr_list context structure.
- */
-void
-xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, context,
-		(__psunsigned_t)NULL,
-		(__psunsigned_t)NULL,
-		(__psunsigned_t)NULL);
-}
-
-/*
- * Add a trace buffer entry for a context structure and a Btree node.
- */
-void
-xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
-			 struct xfs_da_intnode *node)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, context,
-		(__psunsigned_t)be16_to_cpu(node->hdr.count),
-		(__psunsigned_t)be32_to_cpu(node->btree[0].hashval),
-		(__psunsigned_t)be32_to_cpu(node->btree[
-			be16_to_cpu(node->hdr.count)-1].hashval));
-}
-
-/*
- * Add a trace buffer entry for a context structure and a Btree element.
- */
-void
-xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
-			  struct xfs_da_node_entry *btree)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, context,
-		(__psunsigned_t)be32_to_cpu(btree->hashval),
-		(__psunsigned_t)be32_to_cpu(btree->before),
-		(__psunsigned_t)NULL);
-}
-
-/*
- * Add a trace buffer entry for a context structure and a leaf block.
- */
-void
-xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
-			      struct xfs_attr_leafblock *leaf)
-{
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, context,
-		(__psunsigned_t)be16_to_cpu(leaf->hdr.count),
-		(__psunsigned_t)be32_to_cpu(leaf->entries[0].hashval),
-		(__psunsigned_t)be32_to_cpu(leaf->entries[
-				be16_to_cpu(leaf->hdr.count)-1].hashval));
-}
-
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-void
-xfs_attr_trace_enter(int type, char *where,
-		struct xfs_attr_list_context *context,
-		__psunsigned_t a13, __psunsigned_t a14,
-		__psunsigned_t a15)
-{
-	ASSERT(xfs_attr_trace_buf);
-	ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type),
-		(void *)((__psunsigned_t)where),
-		(void *)((__psunsigned_t)context->dp),
-		(void *)((__psunsigned_t)context->cursor->hashval),
-		(void *)((__psunsigned_t)context->cursor->blkno),
-		(void *)((__psunsigned_t)context->cursor->offset),
-		(void *)((__psunsigned_t)context->alist),
-		(void *)((__psunsigned_t)context->bufsize),
-		(void *)((__psunsigned_t)context->count),
-		(void *)((__psunsigned_t)context->firstu),
-		NULL,
-		(void *)((__psunsigned_t)context->dupcnt),
-		(void *)((__psunsigned_t)context->flags),
-		(void *)a13, (void *)a14, (void *)a15);
-}
-#endif	/* XFS_ATTR_TRACE */
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index fb3b2a68b9b9..e920d68ef509 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -48,6 +48,16 @@ struct xfs_attr_list_context;
 #define ATTR_KERNOTIME	0x1000	/* [kernel] don't update inode timestamps */
 #define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
 
+#define XFS_ATTR_FLAGS \
+	{ ATTR_DONTFOLLOW,	"DONTFOLLOW" }, \
+	{ ATTR_ROOT,		"ROOT" }, \
+	{ ATTR_TRUST,		"TRUST" }, \
+	{ ATTR_SECURE,		"SECURE" }, \
+	{ ATTR_CREATE,		"CREATE" }, \
+	{ ATTR_REPLACE,		"REPLACE" }, \
+	{ ATTR_KERNOTIME,	"KERNOTIME" }, \
+	{ ATTR_KERNOVAL,	"KERNOVAL" }
+
 /*
  * The maximum size (into the kernel or returned from the kernel) of an
  * attribute value or the buffer used for an attr_list() call.  Larger
@@ -103,7 +113,7 @@ typedef struct attrlist_cursor_kern {
 
 
 typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
-			      char *, int, int, char *);
+			      unsigned char *, int, int, unsigned char *);
 
 typedef struct xfs_attr_list_context {
 	struct xfs_inode		*dp;		/* inode */
@@ -129,9 +139,7 @@ typedef struct xfs_attr_list_context {
 /*
  * Overall external interface routines.
  */
-int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
 int xfs_attr_inactive(struct xfs_inode *dp);
-int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int);
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
 int xfs_attr_list_int(struct xfs_attr_list_context *);
 
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index afdc8911637d..a90ce74fc256 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -42,6 +42,7 @@
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 /*
  * xfs_attr_leaf.c
@@ -98,7 +99,7 @@ STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
  * If namespace bits don't match return 0.
  * If all match then return 1.
  */
-STATIC_INLINE int
+STATIC int
 xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
 {
 	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
@@ -520,11 +521,11 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 
 	sfe = &sf->list[0];
 	for (i = 0; i < sf->hdr.count; i++) {
-		nargs.name = (char *)sfe->nameval;
+		nargs.name = sfe->nameval;
 		nargs.namelen = sfe->namelen;
-		nargs.value = (char *)&sfe->nameval[nargs.namelen];
+		nargs.value = &sfe->nameval[nargs.namelen];
 		nargs.valuelen = sfe->valuelen;
-		nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
+		nargs.hashval = xfs_da_hashname(sfe->nameval,
 						sfe->namelen);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
 		error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
@@ -594,7 +595,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	cursor = context->cursor;
 	ASSERT(cursor != NULL);
 
-	xfs_attr_trace_l_c("sf start", context);
+	trace_xfs_attr_list_sf(context);
 
 	/*
 	 * If the buffer is large enough and the cursor is at the start,
@@ -611,10 +612,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 		for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
 			error = context->put_listent(context,
 					   sfe->flags,
-					   (char *)sfe->nameval,
+					   sfe->nameval,
 					   (int)sfe->namelen,
 					   (int)sfe->valuelen,
-					   (char*)&sfe->nameval[sfe->namelen]);
+					   &sfe->nameval[sfe->namelen]);
 
 			/*
 			 * Either search callback finished early or
@@ -627,7 +628,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 				return error;
 			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 		}
-		xfs_attr_trace_l_c("sf big-gulp", context);
+		trace_xfs_attr_list_sf_all(context);
 		return(0);
 	}
 
@@ -653,14 +654,13 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 			XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
 					     XFS_ERRLEVEL_LOW,
 					     context->dp->i_mount, sfe);
-			xfs_attr_trace_l_c("sf corrupted", context);
 			kmem_free(sbuf);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
 
 		sbp->entno = i;
-		sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen);
-		sbp->name = (char *)sfe->nameval;
+		sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+		sbp->name = sfe->nameval;
 		sbp->namelen = sfe->namelen;
 		/* These are bytes, and both on-disk, don't endian-flip */
 		sbp->valuelen = sfe->valuelen;
@@ -693,7 +693,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	}
 	if (i == nsbuf) {
 		kmem_free(sbuf);
-		xfs_attr_trace_l_c("blk end", context);
 		return(0);
 	}
 
@@ -719,7 +718,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	}
 
 	kmem_free(sbuf);
-	xfs_attr_trace_l_c("sf E-O-F", context);
 	return(0);
 }
 
@@ -820,9 +818,9 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
 			continue;
 		ASSERT(entry->flags & XFS_ATTR_LOCAL);
 		name_loc = xfs_attr_leaf_name_local(leaf, i);
-		nargs.name = (char *)name_loc->nameval;
+		nargs.name = name_loc->nameval;
 		nargs.namelen = name_loc->namelen;
-		nargs.value = (char *)&name_loc->nameval[nargs.namelen];
+		nargs.value = &name_loc->nameval[nargs.namelen];
 		nargs.valuelen = be16_to_cpu(name_loc->valuelen);
 		nargs.hashval = be32_to_cpu(entry->hashval);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
@@ -2323,7 +2321,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 	cursor = context->cursor;
 	cursor->initted = 1;
 
-	xfs_attr_trace_l_cl("blk start", context, leaf);
+	trace_xfs_attr_list_leaf(context);
 
 	/*
 	 * Re-find our place in the leaf block if this is a new syscall.
@@ -2344,7 +2342,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 			}
 		}
 		if (i == be16_to_cpu(leaf->hdr.count)) {
-			xfs_attr_trace_l_c("not found", context);
+			trace_xfs_attr_list_notfound(context);
 			return(0);
 		}
 	} else {
@@ -2372,10 +2370,10 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 
 			retval = context->put_listent(context,
 						entry->flags,
-						(char *)name_loc->nameval,
+						name_loc->nameval,
 						(int)name_loc->namelen,
 						be16_to_cpu(name_loc->valuelen),
-						(char *)&name_loc->nameval[name_loc->namelen]);
+						&name_loc->nameval[name_loc->namelen]);
 			if (retval)
 				return retval;
 		} else {
@@ -2399,15 +2397,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2399 return retval; 2397 return retval;
2400 retval = context->put_listent(context, 2398 retval = context->put_listent(context,
2401 entry->flags, 2399 entry->flags,
2402 (char *)name_rmt->name, 2400 name_rmt->name,
2403 (int)name_rmt->namelen, 2401 (int)name_rmt->namelen,
2404 valuelen, 2402 valuelen,
2405 (char*)args.value); 2403 args.value);
2406 kmem_free(args.value); 2404 kmem_free(args.value);
2407 } else { 2405 } else {
2408 retval = context->put_listent(context, 2406 retval = context->put_listent(context,
2409 entry->flags, 2407 entry->flags,
2410 (char *)name_rmt->name, 2408 name_rmt->name,
2411 (int)name_rmt->namelen, 2409 (int)name_rmt->namelen,
2412 valuelen, 2410 valuelen,
2413 NULL); 2411 NULL);
@@ -2419,7 +2417,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2419 break; 2417 break;
2420 cursor->offset++; 2418 cursor->offset++;
2421 } 2419 }
2422 xfs_attr_trace_l_cl("blk end", context, leaf); 2420 trace_xfs_attr_list_leaf_end(context);
2423 return(retval); 2421 return(retval);
2424} 2422}
2425 2423
@@ -2952,7 +2950,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2952 map.br_blockcount); 2950 map.br_blockcount);
2953 bp = xfs_trans_get_buf(*trans, 2951 bp = xfs_trans_get_buf(*trans,
2954 dp->i_mount->m_ddev_targp, 2952 dp->i_mount->m_ddev_targp,
2955 dblkno, dblkcnt, XFS_BUF_LOCK); 2953 dblkno, dblkcnt, XBF_LOCK);
2956 xfs_trans_binval(*trans, bp); 2954 xfs_trans_binval(*trans, bp);
2957 /* 2955 /*
2958 * Roll to next transaction. 2956 * Roll to next transaction.
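The hunks above convert xfs_attr_leaf.c from the old compile-time attribute-list tracing (xfs_attr_trace_l_c() and friends, which logged a free-form "where" string into a private ktrace ring buffer) to standard Linux tracepoints that can be enabled at runtime through debugfs. The event definitions themselves live in fs/xfs/xfs_trace.h, which is not part of this section of the diff; as a hedged sketch of the usual pattern only (the field list here is illustrative, not a copy of the real event class):

/*
 * Sketch only: one event class shared by the attr-list events, then one
 * DEFINE_EVENT per name used in the hunks above.  The real class in
 * fs/xfs/xfs_trace.h records more context fields than this.
 */
DECLARE_EVENT_CLASS(xfs_attr_list_class,
	TP_PROTO(struct xfs_attr_list_context *ctx),
	TP_ARGS(ctx),
	TP_STRUCT__entry(
		__field(xfs_ino_t, ino)
		__field(int, bufsize)
		__field(int, count)
	),
	TP_fast_assign(
		__entry->ino = ctx->dp->i_ino;
		__entry->bufsize = ctx->bufsize;
		__entry->count = ctx->count;
	),
	TP_printk("ino 0x%llx bufsize %d count %d",
		  __entry->ino, __entry->bufsize, __entry->count)
);

#define DEFINE_ATTR_LIST_EVENT(name) \
DEFINE_EVENT(xfs_attr_list_class, name, \
	TP_PROTO(struct xfs_attr_list_context *ctx), \
	TP_ARGS(ctx))
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end);
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);

Each DEFINE_EVENT generates the trace_xfs_attr_list_*() inline called by the new code, so the call sites need no #ifdef scaffolding and cost one predicted-false branch when tracing is off.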
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
index ea22839caed2..919756e3ba53 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/xfs_attr_sf.h
@@ -25,8 +25,6 @@
  * to fit into the literal area of the inode.
  */
 
-struct xfs_inode;
-
 /*
  * Entries are packed toward the top as tight as possible.
  */
@@ -54,7 +52,7 @@ typedef struct xfs_attr_sf_sort {
 	__uint8_t	valuelen;	/* length of value */
 	__uint8_t	flags;		/* flags bits (see xfs_attr_leaf.h) */
 	xfs_dahash_t	hash;		/* this entry's hash value */
-	char		*name;		/* name value, pointer into buffer */
+	unsigned char	*name;		/* name value, pointer into buffer */
 } xfs_attr_sf_sort_t;
 
 #define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	/* space name/value uses */ \
@@ -69,42 +67,4 @@ typedef struct xfs_attr_sf_sort {
 	(be16_to_cpu(((xfs_attr_shortform_t *)	\
 		((dp)->i_afp->if_u1.if_data))->hdr.totsize))
 
-#if defined(XFS_ATTR_TRACE)
-/*
- * Kernel tracing support for attribute lists
- */
-struct xfs_attr_list_context;
-struct xfs_da_intnode;
-struct xfs_da_node_entry;
-struct xfs_attr_leafblock;
-
-#define	XFS_ATTR_TRACE_SIZE	4096	/* size of global trace buffer */
-extern ktrace_t *xfs_attr_trace_buf;
-
-/*
- * Trace record types.
- */
-#define	XFS_ATTR_KTRACE_L_C	1	/* context */
-#define	XFS_ATTR_KTRACE_L_CN	2	/* context, node */
-#define	XFS_ATTR_KTRACE_L_CB	3	/* context, btree */
-#define	XFS_ATTR_KTRACE_L_CL	4	/* context, leaf */
-
-void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context);
-void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
-			 struct xfs_da_intnode *node);
-void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
-			 struct xfs_da_node_entry *btree);
-void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
-			 struct xfs_attr_leafblock *leaf);
-void xfs_attr_trace_enter(int type, char *where,
-			  struct xfs_attr_list_context *context,
-			  __psunsigned_t a13, __psunsigned_t a14,
-			  __psunsigned_t a15);
-#else
-#define	xfs_attr_trace_l_c(w,c)
-#define	xfs_attr_trace_l_cn(w,c,n)
-#define	xfs_attr_trace_l_cb(w,c,b)
-#define	xfs_attr_trace_l_cl(w,c,l)
-#endif /* XFS_ATTR_TRACE */
-
 #endif /* __XFS_ATTR_SF_H__ */
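Two independent cleanups land in xfs_attr_sf.h: the now-unused struct xfs_inode forward declaration and the whole XFS_ATTR_TRACE/ktrace block go away with the old tracing scheme, and the sort entry's name pointer becomes unsigned char * to match the rest of the attr code. Attribute names are raw byte strings, and with plain char (signed on most targets) any byte >= 0x80 sign-extends when widened, which is exactly the surprise a hash over the bytes wants to avoid. A toy, self-contained model of the hazard (this is not xfs_da_hashname(), which lives in xfs_da_btree.c):

#include <stdio.h>

/* same rotate-XOR skeleton, differing only in the pointer's signedness */
static unsigned int toy_hash(const char *name, int len)
{
	unsigned int hash = 0;
	for (; len > 0; len--, name++)
		hash = *name ^ ((hash << 7) | (hash >> 25)); /* *name may sign-extend */
	return hash;
}

static unsigned int toy_hash_u(const unsigned char *name, int len)
{
	unsigned int hash = 0;
	for (; len > 0; len--, name++)
		hash = *name ^ ((hash << 7) | (hash >> 25)); /* always 0..255 */
	return hash;
}

int main(void)
{
	unsigned char name[2] = { 0xc3, 0xa9 };	/* UTF-8 bytes of "e-acute" */

	printf("signed char:   %08x\n", toy_hash((const char *)name, 2));
	printf("unsigned char: %08x\n", toy_hash_u(name, 2));
	return 0;
}

On a signed-char target the two lines print different values; keeping every pointer in the name path unsigned removes the ambiguity, along with the (char *) casts deleted in the xfs_attr_leaf.c hunks above.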
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8971fb09d387..5c11e4d17010 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -54,6 +54,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 
 #ifdef DEBUG
@@ -272,71 +273,6 @@ xfs_bmap_isaeof(
 	int		whichfork,	/* data or attribute fork */
 	char		*aeof);		/* return value */
 
-#ifdef XFS_BMAP_TRACE
-/*
- * Add bmap trace entry prior to a call to xfs_iext_remove.
- */
-STATIC void
-xfs_bmap_trace_delete(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry(entries) deleted */
-	xfs_extnum_t	cnt,		/* count of entries deleted, 1 or 2 */
-	int		whichfork);	/* data or attr fork */
-
-/*
- * Add bmap trace entry prior to a call to xfs_iext_insert, or
- * reading in the extents list from the disk (in the btree).
- */
-STATIC void
-xfs_bmap_trace_insert(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry(entries) inserted */
-	xfs_extnum_t	cnt,		/* count of entries inserted, 1 or 2 */
-	xfs_bmbt_irec_t	*r1,		/* inserted record 1 */
-	xfs_bmbt_irec_t	*r2,		/* inserted record 2 or null */
-	int		whichfork);	/* data or attr fork */
-
-/*
- * Add bmap trace entry after updating an extent record in place.
- */
-STATIC void
-xfs_bmap_trace_post_update(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry updated */
-	int		whichfork);	/* data or attr fork */
-
-/*
- * Add bmap trace entry prior to updating an extent record in place.
- */
-STATIC void
-xfs_bmap_trace_pre_update(
-	const char	*fname,		/* function name */
-	char		*desc,		/* operation description */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index of entry to be updated */
-	int		whichfork);	/* data or attr fork */
-
-#define	XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)	\
-	xfs_bmap_trace_delete(__func__,d,ip,i,c,w)
-#define	XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)	\
-	xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w)
-#define	XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w)	\
-	xfs_bmap_trace_post_update(__func__,d,ip,i,w)
-#define	XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w)	\
-	xfs_bmap_trace_pre_update(__func__,d,ip,i,w)
-#else
-#define	XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)
-#define	XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)
-#define	XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w)
-#define	XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w)
-#endif	/* XFS_BMAP_TRACE */
-
 /*
  * Compute the worst-case number of indirect blocks that will be used
  * for ip's delayed extent of length "len".
@@ -363,18 +299,6 @@ xfs_bmap_validate_ret(
 #define	xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
 #endif /* DEBUG */
 
-#if defined(XFS_RW_TRACE)
-STATIC void
-xfs_bunmap_trace(
-	xfs_inode_t	*ip,
-	xfs_fileoff_t	bno,
-	xfs_filblks_t	len,
-	int		flags,
-	inst_t		*ra);
-#else
-#define	xfs_bunmap_trace(ip, bno, len, flags, ra)
-#endif	/* XFS_RW_TRACE */
-
 STATIC int
 xfs_bmap_count_tree(
 	xfs_mount_t	*mp,
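These two removals delete the last of xfs_bmap.c's compile-time trace plumbing: the XFS_BMAP_TRACE prototypes and macros, and the XFS_RW_TRACE xfs_bunmap_trace() declaration. With tracepoints (pulled in by the new #include "xfs_trace.h"), a disabled event costs a single predicted-false branch at the call site, so there is no longer a build-configuration reason to stub the calls out. A self-contained userspace model of that idea, not kernel code:

#include <stdio.h>
#include <stdbool.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

static bool trace_enabled;	/* the kernel flips its equivalent via debugfs */

/* stands in for a generated trace_xfs_bmap_pre_update() stub */
static inline void trace_bmap_pre_update(int idx, int state)
{
	if (unlikely(trace_enabled))
		fprintf(stderr, "bmap pre update: idx %d state 0x%x\n", idx, state);
}

int main(void)
{
	trace_bmap_pre_update(0, 0);	/* tracing off: just a compare-and-skip */
	trace_enabled = true;
	trace_bmap_pre_update(1, 0x5);	/* tracing on: the event fires */
	return 0;
}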
@@ -590,9 +514,9 @@ xfs_bmap_add_extent(
 	 * already extents in the list.
 	 */
 	if (nextents == 0) {
-		XFS_BMAP_TRACE_INSERT("insert empty", ip, 0, 1, new, NULL,
-			whichfork);
-		xfs_iext_insert(ifp, 0, 1, new);
+		xfs_iext_insert(ip, 0, 1, new,
+				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+
 		ASSERT(cur == NULL);
 		ifp->if_lastex = 0;
 		if (!isnullstartblock(new->br_startblock)) {
@@ -759,26 +683,10 @@ xfs_bmap_add_extent_delay_real(
 	xfs_filblks_t		temp=0;	/* value for dnew calculations */
 	xfs_filblks_t		temp2=0;/* value for dnew calculations */
 	int			tmp_rval;	/* partial logging flags */
-	enum {				/* bit number definitions for state */
-		LEFT_CONTIG,	RIGHT_CONTIG,
-		LEFT_FILLING,	RIGHT_FILLING,
-		LEFT_DELAY,	RIGHT_DELAY,
-		LEFT_VALID,	RIGHT_VALID
-	};
 
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
 #define	PREV		r[2]
-#define	MASK(b)		(1 << (b))
-#define	MASK2(a,b)	(MASK(a) | MASK(b))
-#define	MASK3(a,b,c)	(MASK2(a,b) | MASK(c))
-#define	MASK4(a,b,c,d)	(MASK3(a,b,c) | MASK(d))
-#define	STATE_SET(b,v)	((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define	STATE_TEST(b)	(state & MASK(b))
-#define	STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
-				       ((state &= ~MASK(b)), 0))
-#define	SWITCH_STATE		\
-	(state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
 
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
@@ -790,69 +698,80 @@ xfs_bmap_add_extent_delay_real(
 	new_endoff = new->br_startoff + new->br_blockcount;
 	ASSERT(PREV.br_startoff <= new->br_startoff);
 	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
 	/*
 	 * Set flags determining what part of the previous delayed allocation
 	 * extent is being replaced by a real allocation.
 	 */
-	STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
-	STATE_SET(RIGHT_FILLING,
-		PREV.br_startoff + PREV.br_blockcount == new_endoff);
+	if (PREV.br_startoff == new->br_startoff)
+		state |= BMAP_LEFT_FILLING;
+	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+		state |= BMAP_RIGHT_FILLING;
+
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+	if (idx > 0) {
+		state |= BMAP_LEFT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-		STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
+
+		if (isnullstartblock(LEFT.br_startblock))
+			state |= BMAP_LEFT_DELAY;
 	}
-	STATE_SET(LEFT_CONTIG,
-		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
-		LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
-		LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
-		LEFT.br_state == new->br_state &&
-		LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+	    LEFT.br_state == new->br_state &&
+	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
 	/*
 	 * Check and set flags if this segment has a right neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
	 * Also check for all-three-contiguous being too large.
 	 */
-	if (STATE_SET_TEST(RIGHT_VALID,
-			idx <
-			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
+	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-		STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
+
+		if (isnullstartblock(RIGHT.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
 	}
-	STATE_SET(RIGHT_CONTIG,
-		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
-		new_endoff == RIGHT.br_startoff &&
-		new->br_startblock + new->br_blockcount ==
-		    RIGHT.br_startblock &&
-		new->br_state == RIGHT.br_state &&
-		new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
-		((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
-		 MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
-		 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-		     <= MAXEXTLEN));
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new_endoff == RIGHT.br_startoff &&
+	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+	    new->br_state == RIGHT.br_state &&
+	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING)) !=
+		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING) ||
+	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
			<= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
 	error = 0;
 	/*
 	 * Switch out based on the FILLING and CONTIG state bits.
 	 */
-	switch (SWITCH_STATE) {
-
-	case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 2);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, idx, 2, state);
 		ip->i_df.if_lastex = idx - 1;
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
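From here on the patch's second big change shows up: each extent-merge function used to declare its own anonymous enum of bit numbers plus MASK()/STATE_SET()/SWITCH_STATE macros, and all of that is replaced by one set of BMAP_* flag bits shared across the file. The definitions are not in this part of the diff (they live in a shared XFS header); the values below are illustrative only, but the shape of the converted logic is exactly what the hunks show, and because the same word is handed to trace_xfs_bmap_pre_update() as the state argument, a trace consumer can decode which merge case ran:

#include <stdio.h>

/* illustrative values; the real BMAP_* flags live in an XFS header */
#define BMAP_LEFT_CONTIG	(1 << 0)
#define BMAP_RIGHT_CONTIG	(1 << 1)
#define BMAP_LEFT_FILLING	(1 << 2)
#define BMAP_RIGHT_FILLING	(1 << 3)

int main(void)
{
	int state = 0;

	/* same pattern as the converted code above */
	state |= BMAP_LEFT_FILLING;
	state |= BMAP_LEFT_CONTIG;

	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
		printf("filling first part, left neighbor contiguous\n");
		break;
	default:
		printf("some other case: 0x%x\n", state);
		break;
	}
	return 0;
}

Open-coding the |= and & tests is slightly more verbose than STATE_SET()/STATE_TEST(), but it is grep-able, and the case labels become plain constant expressions instead of MASK2/MASK3/MASK4 spellings.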
@@ -885,20 +804,18 @@ xfs_bmap_add_extent_delay_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 1);
+		xfs_iext_remove(ip, idx, 1, state);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -921,19 +838,19 @@ xfs_bmap_add_extent_delay_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx;
-		XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx + 1, 1);
+		xfs_iext_remove(ip, idx + 1, 1, state);
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
 		else {
@@ -956,15 +873,16 @@ xfs_bmap_add_extent_delay_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, RIGHT_FILLING):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -987,19 +905,20 @@ xfs_bmap_add_extent_delay_real(
 		temp2 = new->br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		temp = PREV.br_blockcount - new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		ip->i_df.if_lastex = idx - 1;
 		if (cur == NULL)
@@ -1021,7 +940,7 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock));
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: The boundary between two in-core extents moved. */
 		temp = LEFT.br_startoff;
@@ -1029,18 +948,16 @@ xfs_bmap_add_extent_delay_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK(LEFT_FILLING):
+	case BMAP_LEFT_FILLING:
 		/*
 		 * Filling in the first part of a previous delayed allocation.
 		 * The left neighbor is not contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		temp = PREV.br_blockcount - new->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
-		XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx, 1, new);
+		xfs_iext_insert(ip, idx, 1, new, state);
 		ip->i_df.if_lastex = idx;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1071,27 +988,27 @@ xfs_bmap_add_extent_delay_real(
 			(cur ? cur->bc_private.b.allocated : 0));
 		ep = xfs_iext_get_ext(ifp, idx + 1);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: One in-core extent is split in two. */
 		temp = PREV.br_startoff;
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in the last part of a previous delayed allocation.
 		 * The right neighbor is contiguous with the new allocation.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount,
 			RIGHT.br_state);
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
 		ip->i_df.if_lastex = idx + 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1112,7 +1029,7 @@ xfs_bmap_add_extent_delay_real(
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 			startblockval(PREV.br_startblock));
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: The boundary between two in-core extents moved. */
 		temp = PREV.br_startoff;
@@ -1120,17 +1037,15 @@ xfs_bmap_add_extent_delay_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK(RIGHT_FILLING):
+	case BMAP_RIGHT_FILLING:
 		/*
 		 * Filling in the last part of a previous delayed allocation.
 		 * The right neighbor is not contiguous.
 		 */
 		temp = PREV.br_blockcount - new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
-		XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 1, new);
+		xfs_iext_insert(ip, idx + 1, 1, new, state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1161,7 +1076,7 @@ xfs_bmap_add_extent_delay_real(
 			(cur ? cur->bc_private.b.allocated : 0));
 		ep = xfs_iext_get_ext(ifp, idx);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
 		/* DELTA: One in-core extent is split in two. */
 		temp = PREV.br_startoff;
@@ -1175,7 +1090,7 @@ xfs_bmap_add_extent_delay_real(
 		 * This case is avoided almost all the time.
 		 */
 		temp = new->br_startoff - PREV.br_startoff;
-		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		r[0] = *new;
 		r[1].br_state = PREV.br_state;
@@ -1183,9 +1098,7 @@ xfs_bmap_add_extent_delay_real(
 		r[1].br_startoff = new_endoff;
 		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
 		r[1].br_blockcount = temp2;
-		XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1],
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 2, &r[0]);
+		xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1242,24 +1155,24 @@ xfs_bmap_add_extent_delay_real(
 		}
 		ep = xfs_iext_get_ext(ifp, idx);
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
 			nullstartblock((int)temp2));
-		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
 		*dnew = temp + temp2;
 		/* DELTA: One in-core extent is split in three. */
 		temp = PREV.br_startoff;
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK2(LEFT_FILLING, RIGHT_CONTIG):
-	case MASK2(RIGHT_FILLING, LEFT_CONTIG):
-	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK(LEFT_CONTIG):
-	case MASK(RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_CONTIG:
+	case BMAP_RIGHT_CONTIG:
 		/*
 		 * These cases are all impossible.
 		 */
@@ -1279,14 +1192,6 @@ done:
 #undef	LEFT
 #undef	RIGHT
 #undef	PREV
-#undef	MASK
-#undef	MASK2
-#undef	MASK3
-#undef	MASK4
-#undef	STATE_SET
-#undef	STATE_TEST
-#undef	STATE_SET_TEST
-#undef	SWITCH_STATE
 }
 
 /*
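Note also the quieter signature change threaded through every case above: xfs_iext_remove()/xfs_iext_insert() now take the xfs_inode and the state word instead of the raw fork pointer. That lets the helper pick the fork itself (via the BMAP_ATTRFORK bit seen earlier in xfs_bmap_add_extent()) and emit its own trace event, so every caller sheds a trace macro. A hypothetical reconstruction of the new helper's shape, for illustration only; the real definition lives elsewhere in the XFS tree, outside this section:

/*
 * Sketch, not the actual implementation.
 */
void
xfs_iext_remove(
	struct xfs_inode	*ip,		/* was: xfs_ifork_t *ifp */
	xfs_extnum_t		idx,		/* first extent to remove */
	int			ext_diff,	/* number of extents removed */
	int			state)		/* BMAP_* flags from the caller */
{
	struct xfs_ifork	*ifp = (state & BMAP_ATTRFORK) ?
						ip->i_afp : &ip->i_df;

	trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
	/* ...the pre-existing extent-array shuffling, unchanged... */
}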
@@ -1316,27 +1221,10 @@ xfs_bmap_add_extent_unwritten_real(
 	int		state = 0;/* state bits, accessed thru macros */
 	xfs_filblks_t	temp=0;
 	xfs_filblks_t	temp2=0;
-	enum {				/* bit number definitions for state */
-		LEFT_CONTIG,	RIGHT_CONTIG,
-		LEFT_FILLING,	RIGHT_FILLING,
-		LEFT_DELAY,	RIGHT_DELAY,
-		LEFT_VALID,	RIGHT_VALID
-	};
 
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
 #define	PREV		r[2]
-#define	MASK(b)		(1 << (b))
-#define	MASK2(a,b)	(MASK(a) | MASK(b))
-#define	MASK3(a,b,c)	(MASK2(a,b) | MASK(c))
-#define	MASK4(a,b,c,d)	(MASK3(a,b,c) | MASK(d))
-#define	STATE_SET(b,v)	((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define	STATE_TEST(b)	(state & MASK(b))
-#define	STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
-				       ((state &= ~MASK(b)), 0))
-#define	SWITCH_STATE		\
-	(state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
-
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
 	 */
@@ -1352,68 +1240,78 @@ xfs_bmap_add_extent_unwritten_real(
 	new_endoff = new->br_startoff + new->br_blockcount;
 	ASSERT(PREV.br_startoff <= new->br_startoff);
 	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
 	/*
 	 * Set flags determining what part of the previous oldext allocation
 	 * extent is being replaced by a newext allocation.
 	 */
-	STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
-	STATE_SET(RIGHT_FILLING,
-		PREV.br_startoff + PREV.br_blockcount == new_endoff);
+	if (PREV.br_startoff == new->br_startoff)
+		state |= BMAP_LEFT_FILLING;
+	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+		state |= BMAP_RIGHT_FILLING;
+
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 */
-	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+	if (idx > 0) {
+		state |= BMAP_LEFT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-		STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
+
+		if (isnullstartblock(LEFT.br_startblock))
+			state |= BMAP_LEFT_DELAY;
 	}
-	STATE_SET(LEFT_CONTIG,
-		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
-		LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
-		LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
-		LEFT.br_state == newext &&
-		LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+	    LEFT.br_state == newext &&
+	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
 	/*
 	 * Check and set flags if this segment has a right neighbor.
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (STATE_SET_TEST(RIGHT_VALID,
-			idx <
-			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
+	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-		STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
+		if (isnullstartblock(RIGHT.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
 	}
-	STATE_SET(RIGHT_CONTIG,
-		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
-		new_endoff == RIGHT.br_startoff &&
-		new->br_startblock + new->br_blockcount ==
-		    RIGHT.br_startblock &&
-		newext == RIGHT.br_state &&
-		new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
-		((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
-		 MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
-		 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-		     <= MAXEXTLEN));
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new_endoff == RIGHT.br_startoff &&
+	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+	    newext == RIGHT.br_state &&
+	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING)) !=
+		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
		       BMAP_RIGHT_FILLING) ||
+	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
			<= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
 	/*
 	 * Switch out based on the FILLING and CONTIG state bits.
 	 */
-	switch (SWITCH_STATE) {
-
-	case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * The left and right neighbors are both contiguous with new.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount +
 			RIGHT.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 2);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, idx, 2, state);
 		ip->i_df.if_lastex = idx - 1;
 		ip->i_d.di_nextents -= 2;
 		if (cur == NULL)
@@ -1450,20 +1348,18 @@ xfs_bmap_add_extent_unwritten_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * The left neighbor is contiguous, the right is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + PREV.br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
-		XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 1);
+		xfs_iext_remove(ip, idx, 1, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1492,21 +1388,18 @@ xfs_bmap_add_extent_unwritten_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * The right neighbor is contiguous, the left is not.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount + RIGHT.br_blockcount);
 		xfs_bmbt_set_state(ep, newext);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		ip->i_df.if_lastex = idx;
-		XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx + 1, 1);
+		xfs_iext_remove(ip, idx + 1, 1, state);
 		ip->i_d.di_nextents--;
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1535,17 +1428,16 @@ xfs_bmap_add_extent_unwritten_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, RIGHT_FILLING):
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
 		/*
 		 * Setting all of a previous oldext extent to newext.
 		 * Neither the left nor right neighbors are contiguous with
 		 * the new one.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_state(ep, newext);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1566,27 +1458,25 @@ xfs_bmap_add_extent_unwritten_real(
 		temp2 = new->br_blockcount;
 		break;
 
-	case MASK2(LEFT_FILLING, LEFT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
 		/*
 		 * Setting the first part of a previous oldext extent to newext.
 		 * The left neighbor is contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
 			LEFT.br_blockcount + new->br_blockcount);
 		xfs_bmbt_set_startoff(ep,
 			PREV.br_startoff + new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep,
 			new->br_startblock + new->br_blockcount);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1617,22 +1507,21 @@ xfs_bmap_add_extent_unwritten_real(
 			PREV.br_blockcount;
 		break;
 
-	case MASK(LEFT_FILLING):
+	case BMAP_LEFT_FILLING:
 		/*
 		 * Setting the first part of a previous oldext extent to newext.
 		 * The left neighbor is not contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
 		xfs_bmbt_set_startoff(ep, new_endoff);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
 		xfs_bmbt_set_startblock(ep,
 			new->br_startblock + new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx, 1, new);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
+		xfs_iext_insert(ip, idx, 1, new, state);
 		ip->i_df.if_lastex = idx;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1660,24 +1549,21 @@ xfs_bmap_add_extent_unwritten_real(
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Setting the last part of a previous oldext extent to newext.
 		 * The right neighbor is contiguous with the new allocation.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + RIGHT.br_blockcount, newext);
-		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx + 1;
 		if (cur == NULL)
 			rval = XFS_ILOG_DEXT;
@@ -1707,18 +1593,17 @@ xfs_bmap_add_extent_unwritten_real(
 			RIGHT.br_blockcount;
 		break;
 
-	case MASK(RIGHT_FILLING):
+	case BMAP_RIGHT_FILLING:
 		/*
 		 * Setting the last part of a previous oldext extent to newext.
 		 * The right neighbor is not contiguous.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			PREV.br_blockcount - new->br_blockcount);
-		XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
-		XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL,
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 1, new);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
+		xfs_iext_insert(ip, idx + 1, 1, new, state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
@@ -1756,19 +1641,18 @@ xfs_bmap_add_extent_unwritten_real(
 		 * newext.  Contiguity is impossible here.
 		 * One extent becomes three extents.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep,
 			new->br_startoff - PREV.br_startoff);
-		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+
 		r[0] = *new;
 		r[1].br_startoff = new_endoff;
 		r[1].br_blockcount =
 			PREV.br_startoff + PREV.br_blockcount - new_endoff;
 		r[1].br_startblock = new->br_startblock + new->br_blockcount;
 		r[1].br_state = oldext;
-		XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1],
-			XFS_DATA_FORK);
-		xfs_iext_insert(ifp, idx + 1, 2, &r[0]);
+		xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents += 2;
 		if (cur == NULL)
@@ -1813,13 +1697,13 @@ xfs_bmap_add_extent_unwritten_real(
 		temp2 = PREV.br_blockcount;
 		break;
 
-	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK2(LEFT_FILLING, RIGHT_CONTIG):
-	case MASK2(RIGHT_FILLING, LEFT_CONTIG):
-	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
-	case MASK(LEFT_CONTIG):
-	case MASK(RIGHT_CONTIG):
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_CONTIG:
+	case BMAP_RIGHT_CONTIG:
 		/*
 		 * These cases are all impossible.
 		 */
@@ -1839,14 +1723,6 @@ done:
 #undef	LEFT
 #undef	RIGHT
 #undef	PREV
-#undef	MASK
-#undef	MASK2
-#undef	MASK3
-#undef	MASK4
-#undef	STATE_SET
-#undef	STATE_TEST
-#undef	STATE_SET_TEST
-#undef	SWITCH_STATE
 }
 
 /*
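One last detail worth calling out before the hole-delay conversion below: the old macros carried strings like "LF|RF|LC" purely to identify which call site fired; the tracepoints pass _THIS_IP_ instead, the address of the call site itself, which tooling can resolve back to a function+offset through kallsyms. _THIS_IP_ is a stock kernel macro built on GCC's labels-as-values extension; a self-contained model of the trick:

#include <stdio.h>

/* same trick as the kernel's _THIS_IP_ in include/linux/kernel.h (GNU C) */
#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })

static void trace_event(const char *what, unsigned long caller_ip)
{
	printf("%-11s at ip 0x%lx\n", what, caller_ip);
}

int main(void)
{
	trace_event("pre update", _THIS_IP_);	/* two call sites give */
	trace_event("post update", _THIS_IP_);	/* two distinct addresses */
	return 0;
}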
@@ -1872,62 +1748,57 @@ xfs_bmap_add_extent_hole_delay(
 	int		state;  /* state bits, accessed thru macros */
 	xfs_filblks_t	temp=0;	/* temp for indirect calculations */
 	xfs_filblks_t	temp2=0;
-	enum {				/* bit number definitions for state */
-		LEFT_CONTIG,	RIGHT_CONTIG,
-		LEFT_DELAY,	RIGHT_DELAY,
-		LEFT_VALID,	RIGHT_VALID
-	};
-
-#define	MASK(b)			(1 << (b))
-#define	MASK2(a,b)		(MASK(a) | MASK(b))
-#define	STATE_SET(b,v)		((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define	STATE_TEST(b)		(state & MASK(b))
-#define	STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
-				       ((state &= ~MASK(b)), 0))
-#define	SWITCH_STATE		(state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
 
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	ep = xfs_iext_get_ext(ifp, idx);
 	state = 0;
 	ASSERT(isnullstartblock(new->br_startblock));
+
 	/*
 	 * Check and set flags if this segment has a left neighbor
 	 */
-	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+	if (idx > 0) {
+		state |= BMAP_LEFT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
-		STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
+
+		if (isnullstartblock(left.br_startblock))
+			state |= BMAP_LEFT_DELAY;
 	}
+
 	/*
 	 * Check and set flags if the current (right) segment exists.
 	 * If it doesn't exist, we're converting the hole at end-of-file.
 	 */
-	if (STATE_SET_TEST(RIGHT_VALID,
-			   idx <
-			   ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+	if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(ep, &right);
-		STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
+
+		if (isnullstartblock(right.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
 	}
+
 	/*
 	 * Set contiguity flags on the left and right neighbors.
 	 * Don't let extents get too large, even if the pieces are contiguous.
 	 */
-	STATE_SET(LEFT_CONTIG,
-		STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) &&
-		left.br_startoff + left.br_blockcount == new->br_startoff &&
-		left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
-	STATE_SET(RIGHT_CONTIG,
-		STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) &&
-		new->br_startoff + new->br_blockcount == right.br_startoff &&
-		new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
-		(!STATE_TEST(LEFT_CONTIG) ||
-		 (left.br_blockcount + new->br_blockcount +
-		     right.br_blockcount <= MAXEXTLEN)));
+	if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+	    left.br_startoff + left.br_blockcount == new->br_startoff &&
+	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+	    new->br_startoff + new->br_blockcount == right.br_startoff &&
+	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+	    (!(state & BMAP_LEFT_CONTIG) ||
+	     (left.br_blockcount + new->br_blockcount +
	      right.br_blockcount <= MAXEXTLEN)))
+		state |= BMAP_RIGHT_CONTIG;
+
 	/*
 	 * Switch out based on the contiguity flags.
 	 */
-	switch (SWITCH_STATE) {
-
-	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
 		/*
 		 * New allocation is contiguous with delayed allocations
 		 * on the left and on the right.
@@ -1935,8 +1806,8 @@ xfs_bmap_add_extent_hole_delay(
 		 */
 		temp = left.br_blockcount + new->br_blockcount +
 			right.br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
+
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock) +
@@ -1944,53 +1815,52 @@ xfs_bmap_add_extent_hole_delay(
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
 			nullstartblock((int)newlen));
-		XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1,
-			XFS_DATA_FORK);
-		XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK);
-		xfs_iext_remove(ifp, idx, 1);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, idx, 1, state);
 		ip->i_df.if_lastex = idx - 1;
 		/* DELTA: Two in-core extents were replaced by one. */
 		temp2 = temp;
 		temp = left.br_startoff;
 		break;
 
-	case MASK(LEFT_CONTIG):
+	case BMAP_LEFT_CONTIG:
 		/*
 		 * New allocation is contiguous with a delayed allocation
 		 * on the left.
 		 * Merge the new allocation with the left neighbor.
 		 */
 		temp = left.br_blockcount + new->br_blockcount;
-		XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
 			nullstartblock((int)newlen));
-		XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1,
-			XFS_DATA_FORK);
+		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+
 		ip->i_df.if_lastex = idx - 1;
 		/* DELTA: One in-core extent grew into a hole. */
 		temp2 = temp;
 		temp = left.br_startoff;
 		break;
 
-	case MASK(RIGHT_CONTIG):
+	case BMAP_RIGHT_CONTIG:
 		/*
 		 * New allocation is contiguous with a delayed allocation
 		 * on the right.
 		 * Merge the new allocation with the right neighbor.
 		 */
-		XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK);
+		trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
1987 temp = new->br_blockcount + right.br_blockcount; 1856 temp = new->br_blockcount + right.br_blockcount;
1988 oldlen = startblockval(new->br_startblock) + 1857 oldlen = startblockval(new->br_startblock) +
1989 startblockval(right.br_startblock); 1858 startblockval(right.br_startblock);
1990 newlen = xfs_bmap_worst_indlen(ip, temp); 1859 newlen = xfs_bmap_worst_indlen(ip, temp);
1991 xfs_bmbt_set_allf(ep, new->br_startoff, 1860 xfs_bmbt_set_allf(ep, new->br_startoff,
1992 nullstartblock((int)newlen), temp, right.br_state); 1861 nullstartblock((int)newlen), temp, right.br_state);
1993 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK); 1862 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
1863
1994 ip->i_df.if_lastex = idx; 1864 ip->i_df.if_lastex = idx;
1995 /* DELTA: One in-core extent grew into a hole. */ 1865 /* DELTA: One in-core extent grew into a hole. */
1996 temp2 = temp; 1866 temp2 = temp;
@@ -2004,9 +1874,7 @@ xfs_bmap_add_extent_hole_delay(
2004 * Insert a new entry. 1874 * Insert a new entry.
2005 */ 1875 */
2006 oldlen = newlen = 0; 1876 oldlen = newlen = 0;
2007 XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, 1877 xfs_iext_insert(ip, idx, 1, new, state);
2008 XFS_DATA_FORK);
2009 xfs_iext_insert(ifp, idx, 1, new);
2010 ip->i_df.if_lastex = idx; 1878 ip->i_df.if_lastex = idx;
2011 /* DELTA: A new in-core extent was added in a hole. */ 1879 /* DELTA: A new in-core extent was added in a hole. */
2012 temp2 = new->br_blockcount; 1880 temp2 = new->br_blockcount;
@@ -2030,12 +1898,6 @@ xfs_bmap_add_extent_hole_delay(
2030 } 1898 }
2031 *logflagsp = 0; 1899 *logflagsp = 0;
2032 return 0; 1900 return 0;
2033#undef MASK
2034#undef MASK2
2035#undef STATE_SET
2036#undef STATE_TEST
2037#undef STATE_SET_TEST
2038#undef SWITCH_STATE
2039} 1901}
2040 1902
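
The hunk above replaces the per-function STATE_SET()/STATE_TEST()/SWITCH_STATE macro machinery with plain BMAP_* bit flags shared by all of the xfs_bmap_add_extent_*() helpers. A minimal standalone sketch of the resulting idiom follows; the helper function and its parameters are illustrative stand-ins, and only the BMAP_* flag values come from the patch itself.

	#include <stdio.h>

	/* Flag values as defined by the patch in fs/xfs/xfs_bmap.h. */
	#define BMAP_LEFT_CONTIG	(1 << 0)
	#define BMAP_RIGHT_CONTIG	(1 << 1)
	#define BMAP_LEFT_DELAY		(1 << 4)
	#define BMAP_RIGHT_DELAY	(1 << 5)
	#define BMAP_LEFT_VALID		(1 << 6)
	#define BMAP_RIGHT_VALID	(1 << 7)

	/* Hypothetical helper: compute neighbour state around extent slot idx. */
	static int bmap_neighbor_state(int idx, int nextents,
				       int left_is_delalloc, int right_is_delalloc)
	{
		int state = 0;

		if (idx > 0) {				/* a left neighbour exists */
			state |= BMAP_LEFT_VALID;
			if (left_is_delalloc)
				state |= BMAP_LEFT_DELAY;
		}
		if (idx < nextents) {			/* a right neighbour exists */
			state |= BMAP_RIGHT_VALID;
			if (right_is_delalloc)
				state |= BMAP_RIGHT_DELAY;
		}
		/* contiguity bits would be or'ed in here after the range checks */
		return state;
	}

	int main(void)
	{
		/* Dispatch exactly as the patched switch statements do. */
		switch (bmap_neighbor_state(1, 2, 1, 0) &
			(BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
		case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
			puts("merge with both neighbours");
			break;
		case BMAP_LEFT_CONTIG:
			puts("merge left");
			break;
		case BMAP_RIGHT_CONTIG:
			puts("merge right");
			break;
		default:
			puts("insert a new record");
		}
		return 0;
	}

One practical payoff of keeping the state in a plain int: the same word can be handed directly to the new tracepoints, which the string-tagged macro scheme could not do.
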
2041/* 1903/*
@@ -2062,83 +1924,75 @@ xfs_bmap_add_extent_hole_real(
 2062 int state; /* state bits, accessed thru macros */ 1924 int state; /* state bits */
2063 xfs_filblks_t temp=0; 1925 xfs_filblks_t temp=0;
2064 xfs_filblks_t temp2=0; 1926 xfs_filblks_t temp2=0;
2065 enum { /* bit number definitions for state */
2066 LEFT_CONTIG, RIGHT_CONTIG,
2067 LEFT_DELAY, RIGHT_DELAY,
2068 LEFT_VALID, RIGHT_VALID
2069 };
2070
2071#define MASK(b) (1 << (b))
2072#define MASK2(a,b) (MASK(a) | MASK(b))
2073#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
2074#define STATE_TEST(b) (state & MASK(b))
2075#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
2076 ((state &= ~MASK(b)), 0))
2077#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
2078 1927
2079 ifp = XFS_IFORK_PTR(ip, whichfork); 1928 ifp = XFS_IFORK_PTR(ip, whichfork);
2080 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 1929 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
2081 ep = xfs_iext_get_ext(ifp, idx); 1930 ep = xfs_iext_get_ext(ifp, idx);
2082 state = 0; 1931 state = 0;
1932
1933 if (whichfork == XFS_ATTR_FORK)
1934 state |= BMAP_ATTRFORK;
1935
2083 /* 1936 /*
2084 * Check and set flags if this segment has a left neighbor. 1937 * Check and set flags if this segment has a left neighbor.
2085 */ 1938 */
2086 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { 1939 if (idx > 0) {
1940 state |= BMAP_LEFT_VALID;
2087 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1941 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
2088 STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock)); 1942 if (isnullstartblock(left.br_startblock))
1943 state |= BMAP_LEFT_DELAY;
2089 } 1944 }
1945
2090 /* 1946 /*
2091 * Check and set flags if this segment has a current value. 1947 * Check and set flags if this segment has a current value.
2092 * Not true if we're inserting into the "hole" at eof. 1948 * Not true if we're inserting into the "hole" at eof.
2093 */ 1949 */
2094 if (STATE_SET_TEST(RIGHT_VALID, 1950 if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
2095 idx < 1951 state |= BMAP_RIGHT_VALID;
2096 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
2097 xfs_bmbt_get_all(ep, &right); 1952 xfs_bmbt_get_all(ep, &right);
2098 STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock)); 1953 if (isnullstartblock(right.br_startblock))
1954 state |= BMAP_RIGHT_DELAY;
2099 } 1955 }
1956
2100 /* 1957 /*
2101 * We're inserting a real allocation between "left" and "right". 1958 * We're inserting a real allocation between "left" and "right".
2102 * Set the contiguity flags. Don't let extents get too large. 1959 * Set the contiguity flags. Don't let extents get too large.
2103 */ 1960 */
2104 STATE_SET(LEFT_CONTIG, 1961 if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
2105 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && 1962 left.br_startoff + left.br_blockcount == new->br_startoff &&
2106 left.br_startoff + left.br_blockcount == new->br_startoff && 1963 left.br_startblock + left.br_blockcount == new->br_startblock &&
2107 left.br_startblock + left.br_blockcount == new->br_startblock && 1964 left.br_state == new->br_state &&
2108 left.br_state == new->br_state && 1965 left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
2109 left.br_blockcount + new->br_blockcount <= MAXEXTLEN); 1966 state |= BMAP_LEFT_CONTIG;
2110 STATE_SET(RIGHT_CONTIG, 1967
2111 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && 1968 if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
2112 new->br_startoff + new->br_blockcount == right.br_startoff && 1969 new->br_startoff + new->br_blockcount == right.br_startoff &&
2113 new->br_startblock + new->br_blockcount == 1970 new->br_startblock + new->br_blockcount == right.br_startblock &&
2114 right.br_startblock && 1971 new->br_state == right.br_state &&
2115 new->br_state == right.br_state && 1972 new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
2116 new->br_blockcount + right.br_blockcount <= MAXEXTLEN && 1973 (!(state & BMAP_LEFT_CONTIG) ||
2117 (!STATE_TEST(LEFT_CONTIG) || 1974 left.br_blockcount + new->br_blockcount +
2118 left.br_blockcount + new->br_blockcount + 1975 right.br_blockcount <= MAXEXTLEN))
2119 right.br_blockcount <= MAXEXTLEN)); 1976 state |= BMAP_RIGHT_CONTIG;
2120 1977
2121 error = 0; 1978 error = 0;
2122 /* 1979 /*
2123 * Select which case we're in here, and implement it. 1980 * Select which case we're in here, and implement it.
2124 */ 1981 */
2125 switch (SWITCH_STATE) { 1982 switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
2126 1983 case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
2127 case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
2128 /* 1984 /*
2129 * New allocation is contiguous with real allocations on the 1985 * New allocation is contiguous with real allocations on the
2130 * left and on the right. 1986 * left and on the right.
2131 * Merge all three into a single extent record. 1987 * Merge all three into a single extent record.
2132 */ 1988 */
2133 XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, 1989 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
2134 whichfork);
2135 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1990 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
2136 left.br_blockcount + new->br_blockcount + 1991 left.br_blockcount + new->br_blockcount +
2137 right.br_blockcount); 1992 right.br_blockcount);
2138 XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, 1993 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
2139 whichfork); 1994
2140 XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, whichfork); 1995 xfs_iext_remove(ip, idx, 1, state);
2141 xfs_iext_remove(ifp, idx, 1);
2142 ifp->if_lastex = idx - 1; 1996 ifp->if_lastex = idx - 1;
2143 XFS_IFORK_NEXT_SET(ip, whichfork, 1997 XFS_IFORK_NEXT_SET(ip, whichfork,
2144 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 1998 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
@@ -2173,16 +2027,17 @@ xfs_bmap_add_extent_hole_real(
2173 right.br_blockcount; 2027 right.br_blockcount;
2174 break; 2028 break;
2175 2029
2176 case MASK(LEFT_CONTIG): 2030 case BMAP_LEFT_CONTIG:
2177 /* 2031 /*
2178 * New allocation is contiguous with a real allocation 2032 * New allocation is contiguous with a real allocation
2179 * on the left. 2033 * on the left.
2180 * Merge the new allocation with the left neighbor. 2034 * Merge the new allocation with the left neighbor.
2181 */ 2035 */
2182 XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, whichfork); 2036 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
2183 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 2037 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
2184 left.br_blockcount + new->br_blockcount); 2038 left.br_blockcount + new->br_blockcount);
2185 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork); 2039 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
2040
2186 ifp->if_lastex = idx - 1; 2041 ifp->if_lastex = idx - 1;
2187 if (cur == NULL) { 2042 if (cur == NULL) {
2188 rval = xfs_ilog_fext(whichfork); 2043 rval = xfs_ilog_fext(whichfork);
@@ -2207,17 +2062,18 @@ xfs_bmap_add_extent_hole_real(
2207 new->br_blockcount; 2062 new->br_blockcount;
2208 break; 2063 break;
2209 2064
2210 case MASK(RIGHT_CONTIG): 2065 case BMAP_RIGHT_CONTIG:
2211 /* 2066 /*
2212 * New allocation is contiguous with a real allocation 2067 * New allocation is contiguous with a real allocation
2213 * on the right. 2068 * on the right.
2214 * Merge the new allocation with the right neighbor. 2069 * Merge the new allocation with the right neighbor.
2215 */ 2070 */
2216 XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, whichfork); 2071 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
2217 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, 2072 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
2218 new->br_blockcount + right.br_blockcount, 2073 new->br_blockcount + right.br_blockcount,
2219 right.br_state); 2074 right.br_state);
2220 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork); 2075 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
2076
2221 ifp->if_lastex = idx; 2077 ifp->if_lastex = idx;
2222 if (cur == NULL) { 2078 if (cur == NULL) {
2223 rval = xfs_ilog_fext(whichfork); 2079 rval = xfs_ilog_fext(whichfork);
@@ -2248,8 +2104,7 @@ xfs_bmap_add_extent_hole_real(
2248 * real allocation. 2104 * real allocation.
2249 * Insert a new entry. 2105 * Insert a new entry.
2250 */ 2106 */
2251 XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, whichfork); 2107 xfs_iext_insert(ip, idx, 1, new, state);
2252 xfs_iext_insert(ifp, idx, 1, new);
2253 ifp->if_lastex = idx; 2108 ifp->if_lastex = idx;
2254 XFS_IFORK_NEXT_SET(ip, whichfork, 2109 XFS_IFORK_NEXT_SET(ip, whichfork,
2255 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 2110 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
@@ -2283,12 +2138,6 @@ xfs_bmap_add_extent_hole_real(
2283done: 2138done:
2284 *logflagsp = rval; 2139 *logflagsp = rval;
2285 return error; 2140 return error;
2286#undef MASK
2287#undef MASK2
2288#undef STATE_SET
2289#undef STATE_TEST
2290#undef STATE_SET_TEST
2291#undef SWITCH_STATE
2292} 2141}
2293 2142
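
Both hole-conversion functions also swap the string-tagged XFS_BMAP_TRACE_{PRE,POST}_UPDATE() ktrace calls for trace_xfs_bmap_pre_update()/trace_xfs_bmap_post_update() tracepoints, passing the state word and a caller address instead of a description string. A hedged sketch of what such an event definition looks like; the real definition lives in fs/xfs/linux-2.6/xfs_trace.h and its exact field layout may differ.

	/* Illustrative only -- not the verbatim xfs_trace.h definition. */
	TRACE_EVENT(xfs_bmap_pre_update,
		TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
			 unsigned long caller_ip),
		TP_ARGS(ip, idx, state, caller_ip),
		TP_STRUCT__entry(
			__field(xfs_ino_t, ino)
			__field(xfs_extnum_t, idx)
			__field(int, state)
			__field(unsigned long, caller_ip)
		),
		TP_fast_assign(
			__entry->ino = ip->i_ino;
			__entry->idx = idx;
			__entry->state = state;
			__entry->caller_ip = caller_ip;
		),
		/* XFS_BMAP_EXT_FLAGS is the flag/name table added in xfs_bmap.h */
		TP_printk("ino 0x%llx idx %ld state %s caller %pf",
			  __entry->ino, (long)__entry->idx,
			  __print_flags(__entry->state, "|", XFS_BMAP_EXT_FLAGS),
			  (void *)__entry->caller_ip)
	);

Unlike the always-compiled ktrace buffers, such events cost almost nothing when disabled and can be switched on per event at runtime through debugfs.
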
2294/* 2143/*
@@ -2701,22 +2550,134 @@ xfs_bmap_rtalloc(
2701} 2550}
2702 2551
2703STATIC int 2552STATIC int
2553xfs_bmap_btalloc_nullfb(
2554 struct xfs_bmalloca *ap,
2555 struct xfs_alloc_arg *args,
2556 xfs_extlen_t *blen)
2557{
2558 struct xfs_mount *mp = ap->ip->i_mount;
2559 struct xfs_perag *pag;
2560 xfs_agnumber_t ag, startag;
2561 int notinit = 0;
2562 int error;
2563
2564 if (ap->userdata && xfs_inode_is_filestream(ap->ip))
2565 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2566 else
2567 args->type = XFS_ALLOCTYPE_START_BNO;
2568 args->total = ap->total;
2569
2570 /*
2571 * Search for an allocation group with a single extent large enough
2572 * for the request. If one isn't found, then adjust the minimum
2573 * allocation size to the largest space found.
2574 */
2575 startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
2576 if (startag == NULLAGNUMBER)
2577 startag = ag = 0;
2578
2579 pag = xfs_perag_get(mp, ag);
2580 while (*blen < ap->alen) {
2581 if (!pag->pagf_init) {
2582 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2583 XFS_ALLOC_FLAG_TRYLOCK);
2584 if (error) {
2585 xfs_perag_put(pag);
2586 return error;
2587 }
2588 }
2589
2590 /*
2591 * See xfs_alloc_fix_freelist...
2592 */
2593 if (pag->pagf_init) {
2594 xfs_extlen_t longest;
2595 longest = xfs_alloc_longest_free_extent(mp, pag);
2596 if (*blen < longest)
2597 *blen = longest;
2598 } else
2599 notinit = 1;
2600
2601 if (xfs_inode_is_filestream(ap->ip)) {
2602 if (*blen >= ap->alen)
2603 break;
2604
2605 if (ap->userdata) {
2606 /*
2607 * If startag is an invalid AG, we've
2608 * come here once before and
2609 * xfs_filestream_new_ag picked the
2610 * best currently available.
2611 *
2612 * Don't continue looping, since we
2613 * could loop forever.
2614 */
2615 if (startag == NULLAGNUMBER)
2616 break;
2617
2618 error = xfs_filestream_new_ag(ap, &ag);
2619 xfs_perag_put(pag);
2620 if (error)
2621 return error;
2622
 2623 /* loop again to set 'blen' */
2624 startag = NULLAGNUMBER;
2625 pag = xfs_perag_get(mp, ag);
2626 continue;
2627 }
2628 }
2629 if (++ag == mp->m_sb.sb_agcount)
2630 ag = 0;
2631 if (ag == startag)
2632 break;
2633 xfs_perag_put(pag);
2634 pag = xfs_perag_get(mp, ag);
2635 }
2636 xfs_perag_put(pag);
2637
2638 /*
 2639 * Since the above loop did a BUF_TRYLOCK, some AGs may not have
 2640 * been examined, so there may still be space for this request.
2641 */
2642 if (notinit || *blen < ap->minlen)
2643 args->minlen = ap->minlen;
2644 /*
2645 * If the best seen length is less than the request
2646 * length, use the best as the minimum.
2647 */
2648 else if (*blen < ap->alen)
2649 args->minlen = *blen;
2650 /*
2651 * Otherwise we've seen an extent as big as alen,
2652 * use that as the minimum.
2653 */
2654 else
2655 args->minlen = ap->alen;
2656
2657 /*
2658 * set the failure fallback case to look in the selected
2659 * AG as the stream may have moved.
2660 */
2661 if (xfs_inode_is_filestream(ap->ip))
2662 ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2663
2664 return 0;
2665}
2666
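
Besides being split out of xfs_bmap_btalloc(), the AG walk above drops the old global down_read(&mp->m_peraglock) in favour of per-AG references taken with xfs_perag_get() and released with xfs_perag_put(). The skeleton of that walk, with the XFS types and helpers reduced to stand-ins:

	/*
	 * Skeleton of the reference-counted AG walk; struct mount, perag_get()
	 * and perag_put() are stand-ins for the real xfs_mount /
	 * xfs_perag_get() / xfs_perag_put() interfaces.
	 */
	struct mount;
	struct perag;

	struct perag *perag_get(struct mount *mp, unsigned int agno);
	void perag_put(struct perag *pag);

	static int walk_ags(struct mount *mp, unsigned int startag,
			    unsigned int agcount, int (*body)(struct perag *))
	{
		unsigned int ag = startag;
		struct perag *pag = perag_get(mp, ag);
		int error = 0;

		for (;;) {
			error = body(pag);	/* may block; only this AG is pinned */
			if (error)
				break;
			if (++ag == agcount)
				ag = 0;
			if (ag == startag)
				break;
			perag_put(pag);		/* drop the old AG before the next */
			pag = perag_get(mp, ag);
		}
		perag_put(pag);
		return error;
	}

Holding a reference only on the AG currently under examination lets the loop body block (xfs_alloc_pagf_init() may read the AGF from disk) without keeping a mount-wide rwsem held across I/O.
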
2667STATIC int
2704xfs_bmap_btalloc( 2668xfs_bmap_btalloc(
2705 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 2669 xfs_bmalloca_t *ap) /* bmap alloc argument struct */
2706{ 2670{
2707 xfs_mount_t *mp; /* mount point structure */ 2671 xfs_mount_t *mp; /* mount point structure */
2708 xfs_alloctype_t atype = 0; /* type for allocation routines */ 2672 xfs_alloctype_t atype = 0; /* type for allocation routines */
2709 xfs_extlen_t align; /* minimum allocation alignment */ 2673 xfs_extlen_t align; /* minimum allocation alignment */
2710 xfs_agnumber_t ag;
2711 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ 2674 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
2712 xfs_agnumber_t startag; 2675 xfs_agnumber_t ag;
2713 xfs_alloc_arg_t args; 2676 xfs_alloc_arg_t args;
2714 xfs_extlen_t blen; 2677 xfs_extlen_t blen;
2715 xfs_extlen_t nextminlen = 0; 2678 xfs_extlen_t nextminlen = 0;
2716 xfs_perag_t *pag;
2717 int nullfb; /* true if ap->firstblock isn't set */ 2679 int nullfb; /* true if ap->firstblock isn't set */
2718 int isaligned; 2680 int isaligned;
2719 int notinit;
2720 int tryagain; 2681 int tryagain;
2721 int error; 2682 int error;
2722 2683
@@ -2763,102 +2724,9 @@ xfs_bmap_btalloc(
2763 args.firstblock = ap->firstblock; 2724 args.firstblock = ap->firstblock;
2764 blen = 0; 2725 blen = 0;
2765 if (nullfb) { 2726 if (nullfb) {
2766 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) 2727 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
2767 args.type = XFS_ALLOCTYPE_NEAR_BNO; 2728 if (error)
2768 else 2729 return error;
2769 args.type = XFS_ALLOCTYPE_START_BNO;
2770 args.total = ap->total;
2771
2772 /*
2773 * Search for an allocation group with a single extent
2774 * large enough for the request.
2775 *
2776 * If one isn't found, then adjust the minimum allocation
2777 * size to the largest space found.
2778 */
2779 startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
2780 if (startag == NULLAGNUMBER)
2781 startag = ag = 0;
2782 notinit = 0;
2783 down_read(&mp->m_peraglock);
2784 while (blen < ap->alen) {
2785 pag = &mp->m_perag[ag];
2786 if (!pag->pagf_init &&
2787 (error = xfs_alloc_pagf_init(mp, args.tp,
2788 ag, XFS_ALLOC_FLAG_TRYLOCK))) {
2789 up_read(&mp->m_peraglock);
2790 return error;
2791 }
2792 /*
2793 * See xfs_alloc_fix_freelist...
2794 */
2795 if (pag->pagf_init) {
2796 xfs_extlen_t longest;
2797 longest = xfs_alloc_longest_free_extent(mp, pag);
2798 if (blen < longest)
2799 blen = longest;
2800 } else
2801 notinit = 1;
2802
2803 if (xfs_inode_is_filestream(ap->ip)) {
2804 if (blen >= ap->alen)
2805 break;
2806
2807 if (ap->userdata) {
2808 /*
2809 * If startag is an invalid AG, we've
2810 * come here once before and
2811 * xfs_filestream_new_ag picked the
2812 * best currently available.
2813 *
2814 * Don't continue looping, since we
2815 * could loop forever.
2816 */
2817 if (startag == NULLAGNUMBER)
2818 break;
2819
2820 error = xfs_filestream_new_ag(ap, &ag);
2821 if (error) {
2822 up_read(&mp->m_peraglock);
2823 return error;
2824 }
2825
2826 /* loop again to set 'blen'*/
2827 startag = NULLAGNUMBER;
2828 continue;
2829 }
2830 }
2831 if (++ag == mp->m_sb.sb_agcount)
2832 ag = 0;
2833 if (ag == startag)
2834 break;
2835 }
2836 up_read(&mp->m_peraglock);
2837 /*
2838 * Since the above loop did a BUF_TRYLOCK, it is
2839 * possible that there is space for this request.
2840 */
2841 if (notinit || blen < ap->minlen)
2842 args.minlen = ap->minlen;
2843 /*
2844 * If the best seen length is less than the request
2845 * length, use the best as the minimum.
2846 */
2847 else if (blen < ap->alen)
2848 args.minlen = blen;
2849 /*
2850 * Otherwise we've seen an extent as big as alen,
2851 * use that as the minimum.
2852 */
2853 else
2854 args.minlen = ap->alen;
2855
2856 /*
2857 * set the failure fallback case to look in the selected
2858 * AG as the stream may have moved.
2859 */
2860 if (xfs_inode_is_filestream(ap->ip))
2861 ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2862 } else if (ap->low) { 2730 } else if (ap->low) {
2863 if (xfs_inode_is_filestream(ap->ip)) 2731 if (xfs_inode_is_filestream(ap->ip))
2864 args.type = XFS_ALLOCTYPE_FIRST_AG; 2732 args.type = XFS_ALLOCTYPE_FIRST_AG;
@@ -3115,8 +2983,13 @@ xfs_bmap_del_extent(
3115 uint qfield; /* quota field to update */ 2983 uint qfield; /* quota field to update */
3116 xfs_filblks_t temp; /* for indirect length calculations */ 2984 xfs_filblks_t temp; /* for indirect length calculations */
3117 xfs_filblks_t temp2; /* for indirect length calculations */ 2985 xfs_filblks_t temp2; /* for indirect length calculations */
2986 int state = 0;
3118 2987
3119 XFS_STATS_INC(xs_del_exlist); 2988 XFS_STATS_INC(xs_del_exlist);
2989
2990 if (whichfork == XFS_ATTR_FORK)
2991 state |= BMAP_ATTRFORK;
2992
3120 mp = ip->i_mount; 2993 mp = ip->i_mount;
3121 ifp = XFS_IFORK_PTR(ip, whichfork); 2994 ifp = XFS_IFORK_PTR(ip, whichfork);
3122 ASSERT((idx >= 0) && (idx < ifp->if_bytes / 2995 ASSERT((idx >= 0) && (idx < ifp->if_bytes /
@@ -3196,8 +3069,8 @@ xfs_bmap_del_extent(
3196 /* 3069 /*
3197 * Matches the whole extent. Delete the entry. 3070 * Matches the whole extent. Delete the entry.
3198 */ 3071 */
3199 XFS_BMAP_TRACE_DELETE("3", ip, idx, 1, whichfork); 3072 xfs_iext_remove(ip, idx, 1,
3200 xfs_iext_remove(ifp, idx, 1); 3073 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
3201 ifp->if_lastex = idx; 3074 ifp->if_lastex = idx;
3202 if (delay) 3075 if (delay)
3203 break; 3076 break;
@@ -3217,7 +3090,7 @@ xfs_bmap_del_extent(
3217 /* 3090 /*
3218 * Deleting the first part of the extent. 3091 * Deleting the first part of the extent.
3219 */ 3092 */
3220 XFS_BMAP_TRACE_PRE_UPDATE("2", ip, idx, whichfork); 3093 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
3221 xfs_bmbt_set_startoff(ep, del_endoff); 3094 xfs_bmbt_set_startoff(ep, del_endoff);
3222 temp = got.br_blockcount - del->br_blockcount; 3095 temp = got.br_blockcount - del->br_blockcount;
3223 xfs_bmbt_set_blockcount(ep, temp); 3096 xfs_bmbt_set_blockcount(ep, temp);
@@ -3226,13 +3099,12 @@ xfs_bmap_del_extent(
3226 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 3099 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3227 da_old); 3100 da_old);
3228 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 3101 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3229 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, 3102 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
3230 whichfork);
3231 da_new = temp; 3103 da_new = temp;
3232 break; 3104 break;
3233 } 3105 }
3234 xfs_bmbt_set_startblock(ep, del_endblock); 3106 xfs_bmbt_set_startblock(ep, del_endblock);
3235 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); 3107 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
3236 if (!cur) { 3108 if (!cur) {
3237 flags |= xfs_ilog_fext(whichfork); 3109 flags |= xfs_ilog_fext(whichfork);
3238 break; 3110 break;
@@ -3248,19 +3120,18 @@ xfs_bmap_del_extent(
3248 * Deleting the last part of the extent. 3120 * Deleting the last part of the extent.
3249 */ 3121 */
3250 temp = got.br_blockcount - del->br_blockcount; 3122 temp = got.br_blockcount - del->br_blockcount;
3251 XFS_BMAP_TRACE_PRE_UPDATE("1", ip, idx, whichfork); 3123 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
3252 xfs_bmbt_set_blockcount(ep, temp); 3124 xfs_bmbt_set_blockcount(ep, temp);
3253 ifp->if_lastex = idx; 3125 ifp->if_lastex = idx;
3254 if (delay) { 3126 if (delay) {
3255 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 3127 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3256 da_old); 3128 da_old);
3257 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 3129 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3258 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, 3130 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
3259 whichfork);
3260 da_new = temp; 3131 da_new = temp;
3261 break; 3132 break;
3262 } 3133 }
3263 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); 3134 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
3264 if (!cur) { 3135 if (!cur) {
3265 flags |= xfs_ilog_fext(whichfork); 3136 flags |= xfs_ilog_fext(whichfork);
3266 break; 3137 break;
@@ -3277,7 +3148,7 @@ xfs_bmap_del_extent(
3277 * Deleting the middle of the extent. 3148 * Deleting the middle of the extent.
3278 */ 3149 */
3279 temp = del->br_startoff - got.br_startoff; 3150 temp = del->br_startoff - got.br_startoff;
3280 XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, whichfork); 3151 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
3281 xfs_bmbt_set_blockcount(ep, temp); 3152 xfs_bmbt_set_blockcount(ep, temp);
3282 new.br_startoff = del_endoff; 3153 new.br_startoff = del_endoff;
3283 temp2 = got_endoff - del_endoff; 3154 temp2 = got_endoff - del_endoff;
@@ -3364,10 +3235,8 @@ xfs_bmap_del_extent(
3364 } 3235 }
3365 } 3236 }
3366 } 3237 }
3367 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, whichfork); 3238 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
3368 XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 1, &new, NULL, 3239 xfs_iext_insert(ip, idx + 1, 1, &new, state);
3369 whichfork);
3370 xfs_iext_insert(ifp, idx + 1, 1, &new);
3371 ifp->if_lastex = idx + 1; 3240 ifp->if_lastex = idx + 1;
3372 break; 3241 break;
3373 } 3242 }
@@ -3687,7 +3556,9 @@ xfs_bmap_local_to_extents(
3687 xfs_iext_add(ifp, 0, 1); 3556 xfs_iext_add(ifp, 0, 1);
3688 ep = xfs_iext_get_ext(ifp, 0); 3557 ep = xfs_iext_get_ext(ifp, 0);
3689 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); 3558 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
3690 XFS_BMAP_TRACE_POST_UPDATE("new", ip, 0, whichfork); 3559 trace_xfs_bmap_post_update(ip, 0,
3560 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
3561 _THIS_IP_);
3691 XFS_IFORK_NEXT_SET(ip, whichfork, 1); 3562 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
3692 ip->i_d.di_nblocks = 1; 3563 ip->i_d.di_nblocks = 1;
3693 xfs_trans_mod_dquot_byino(tp, ip, 3564 xfs_trans_mod_dquot_byino(tp, ip,
@@ -3800,158 +3671,6 @@ xfs_bmap_search_extents(
3800 return ep; 3671 return ep;
3801} 3672}
3802 3673
3803
3804#ifdef XFS_BMAP_TRACE
3805ktrace_t *xfs_bmap_trace_buf;
3806
3807/*
3808 * Add a bmap trace buffer entry. Base routine for the others.
3809 */
3810STATIC void
3811xfs_bmap_trace_addentry(
3812 int opcode, /* operation */
3813 const char *fname, /* function name */
3814 char *desc, /* operation description */
3815 xfs_inode_t *ip, /* incore inode pointer */
3816 xfs_extnum_t idx, /* index of entry(ies) */
3817 xfs_extnum_t cnt, /* count of entries, 1 or 2 */
3818 xfs_bmbt_rec_host_t *r1, /* first record */
3819 xfs_bmbt_rec_host_t *r2, /* second record or null */
3820 int whichfork) /* data or attr fork */
3821{
3822 xfs_bmbt_rec_host_t tr2;
3823
3824 ASSERT(cnt == 1 || cnt == 2);
3825 ASSERT(r1 != NULL);
3826 if (cnt == 1) {
3827 ASSERT(r2 == NULL);
3828 r2 = &tr2;
3829 memset(&tr2, 0, sizeof(tr2));
3830 } else
3831 ASSERT(r2 != NULL);
3832 ktrace_enter(xfs_bmap_trace_buf,
3833 (void *)(__psint_t)(opcode | (whichfork << 16)),
3834 (void *)fname, (void *)desc, (void *)ip,
3835 (void *)(__psint_t)idx,
3836 (void *)(__psint_t)cnt,
3837 (void *)(__psunsigned_t)(ip->i_ino >> 32),
3838 (void *)(__psunsigned_t)(unsigned)ip->i_ino,
3839 (void *)(__psunsigned_t)(r1->l0 >> 32),
3840 (void *)(__psunsigned_t)(unsigned)(r1->l0),
3841 (void *)(__psunsigned_t)(r1->l1 >> 32),
3842 (void *)(__psunsigned_t)(unsigned)(r1->l1),
3843 (void *)(__psunsigned_t)(r2->l0 >> 32),
3844 (void *)(__psunsigned_t)(unsigned)(r2->l0),
3845 (void *)(__psunsigned_t)(r2->l1 >> 32),
3846 (void *)(__psunsigned_t)(unsigned)(r2->l1)
3847 );
3848 ASSERT(ip->i_xtrace);
3849 ktrace_enter(ip->i_xtrace,
3850 (void *)(__psint_t)(opcode | (whichfork << 16)),
3851 (void *)fname, (void *)desc, (void *)ip,
3852 (void *)(__psint_t)idx,
3853 (void *)(__psint_t)cnt,
3854 (void *)(__psunsigned_t)(ip->i_ino >> 32),
3855 (void *)(__psunsigned_t)(unsigned)ip->i_ino,
3856 (void *)(__psunsigned_t)(r1->l0 >> 32),
3857 (void *)(__psunsigned_t)(unsigned)(r1->l0),
3858 (void *)(__psunsigned_t)(r1->l1 >> 32),
3859 (void *)(__psunsigned_t)(unsigned)(r1->l1),
3860 (void *)(__psunsigned_t)(r2->l0 >> 32),
3861 (void *)(__psunsigned_t)(unsigned)(r2->l0),
3862 (void *)(__psunsigned_t)(r2->l1 >> 32),
3863 (void *)(__psunsigned_t)(unsigned)(r2->l1)
3864 );
3865}
3866
3867/*
3868 * Add bmap trace entry prior to a call to xfs_iext_remove.
3869 */
3870STATIC void
3871xfs_bmap_trace_delete(
3872 const char *fname, /* function name */
3873 char *desc, /* operation description */
3874 xfs_inode_t *ip, /* incore inode pointer */
3875 xfs_extnum_t idx, /* index of entry(entries) deleted */
3876 xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */
3877 int whichfork) /* data or attr fork */
3878{
3879 xfs_ifork_t *ifp; /* inode fork pointer */
3880
3881 ifp = XFS_IFORK_PTR(ip, whichfork);
3882 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx,
3883 cnt, xfs_iext_get_ext(ifp, idx),
3884 cnt == 2 ? xfs_iext_get_ext(ifp, idx + 1) : NULL,
3885 whichfork);
3886}
3887
3888/*
3889 * Add bmap trace entry prior to a call to xfs_iext_insert, or
3890 * reading in the extents list from the disk (in the btree).
3891 */
3892STATIC void
3893xfs_bmap_trace_insert(
3894 const char *fname, /* function name */
3895 char *desc, /* operation description */
3896 xfs_inode_t *ip, /* incore inode pointer */
3897 xfs_extnum_t idx, /* index of entry(entries) inserted */
3898 xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */
3899 xfs_bmbt_irec_t *r1, /* inserted record 1 */
3900 xfs_bmbt_irec_t *r2, /* inserted record 2 or null */
3901 int whichfork) /* data or attr fork */
3902{
3903 xfs_bmbt_rec_host_t tr1; /* compressed record 1 */
3904 xfs_bmbt_rec_host_t tr2; /* compressed record 2 if needed */
3905
3906 xfs_bmbt_set_all(&tr1, r1);
3907 if (cnt == 2) {
3908 ASSERT(r2 != NULL);
3909 xfs_bmbt_set_all(&tr2, r2);
3910 } else {
3911 ASSERT(cnt == 1);
3912 ASSERT(r2 == NULL);
3913 }
3914 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx,
3915 cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork);
3916}
3917
3918/*
3919 * Add bmap trace entry after updating an extent record in place.
3920 */
3921STATIC void
3922xfs_bmap_trace_post_update(
3923 const char *fname, /* function name */
3924 char *desc, /* operation description */
3925 xfs_inode_t *ip, /* incore inode pointer */
3926 xfs_extnum_t idx, /* index of entry updated */
3927 int whichfork) /* data or attr fork */
3928{
3929 xfs_ifork_t *ifp; /* inode fork pointer */
3930
3931 ifp = XFS_IFORK_PTR(ip, whichfork);
3932 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx,
3933 1, xfs_iext_get_ext(ifp, idx), NULL, whichfork);
3934}
3935
3936/*
3937 * Add bmap trace entry prior to updating an extent record in place.
3938 */
3939STATIC void
3940xfs_bmap_trace_pre_update(
3941 const char *fname, /* function name */
3942 char *desc, /* operation description */
3943 xfs_inode_t *ip, /* incore inode pointer */
3944 xfs_extnum_t idx, /* index of entry to be updated */
3945 int whichfork) /* data or attr fork */
3946{
3947 xfs_ifork_t *ifp; /* inode fork pointer */
3948
3949 ifp = XFS_IFORK_PTR(ip, whichfork);
3950 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1,
3951 xfs_iext_get_ext(ifp, idx), NULL, whichfork);
3952}
3953#endif /* XFS_BMAP_TRACE */
3954
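
The block deleted above is the old ktrace scheme: every event hand-packed sixteen void * slots into a fixed-size ring, once into a global buffer and once into a per-inode one. A tiny self-contained ring in that style, written in plain C rather than against the kernel's ktrace API, shows how little structure those entries carried:

	#include <string.h>

	#define TRACE_SLOTS	32	/* XFS_BMAP_KTRACE_SIZE was 32 per inode */
	#define TRACE_ARGS	16	/* ktrace_enter() took 16 untyped slots  */

	struct trace_entry {
		void *val[TRACE_ARGS];
	};

	static struct trace_entry trace_buf[TRACE_SLOTS];
	static unsigned int trace_idx;

	/* Log one event; the oldest entry is overwritten once the ring wraps. */
	static void trace_enter(void *args[TRACE_ARGS])
	{
		memcpy(trace_buf[trace_idx++ % TRACE_SLOTS].val, args,
		       sizeof(trace_buf[0].val));
	}

Decoding such entries required a debugger and knowledge of the packing order, which is a large part of why this series moves to typed tracepoints.
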
3955/* 3674/*
3956 * Compute the worst-case number of indirect blocks that will be used 3675 * Compute the worst-case number of indirect blocks that will be used
3957 * for ip's delayed extent of length "len". 3676 * for ip's delayed extent of length "len".
@@ -3983,37 +3702,6 @@ xfs_bmap_worst_indlen(
3983 return rval; 3702 return rval;
3984} 3703}
3985 3704
3986#if defined(XFS_RW_TRACE)
3987STATIC void
3988xfs_bunmap_trace(
3989 xfs_inode_t *ip,
3990 xfs_fileoff_t bno,
3991 xfs_filblks_t len,
3992 int flags,
3993 inst_t *ra)
3994{
3995 if (ip->i_rwtrace == NULL)
3996 return;
3997 ktrace_enter(ip->i_rwtrace,
3998 (void *)(__psint_t)XFS_BUNMAP,
3999 (void *)ip,
4000 (void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff),
4001 (void *)(__psint_t)(ip->i_d.di_size & 0xffffffff),
4002 (void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff),
4003 (void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff),
4004 (void *)(__psint_t)len,
4005 (void *)(__psint_t)flags,
4006 (void *)(unsigned long)current_cpu(),
4007 (void *)ra,
4008 (void *)0,
4009 (void *)0,
4010 (void *)0,
4011 (void *)0,
4012 (void *)0,
4013 (void *)0);
4014}
4015#endif
4016
4017/* 3705/*
4018 * Convert inode from non-attributed to attributed. 3706 * Convert inode from non-attributed to attributed.
4019 * Must not be in a transaction, ip must not be locked. 3707 * Must not be in a transaction, ip must not be locked.
@@ -4702,34 +4390,30 @@ error0:
4702 return XFS_ERROR(EFSCORRUPTED); 4390 return XFS_ERROR(EFSCORRUPTED);
4703} 4391}
4704 4392
4705#ifdef XFS_BMAP_TRACE 4393#ifdef DEBUG
4706/* 4394/*
4707 * Add bmap trace insert entries for all the contents of the extent records. 4395 * Add bmap trace insert entries for all the contents of the extent records.
4708 */ 4396 */
4709void 4397void
4710xfs_bmap_trace_exlist( 4398xfs_bmap_trace_exlist(
4711 const char *fname, /* function name */
4712 xfs_inode_t *ip, /* incore inode pointer */ 4399 xfs_inode_t *ip, /* incore inode pointer */
4713 xfs_extnum_t cnt, /* count of entries in the list */ 4400 xfs_extnum_t cnt, /* count of entries in the list */
4714 int whichfork) /* data or attr fork */ 4401 int whichfork, /* data or attr fork */
4402 unsigned long caller_ip)
4715{ 4403{
4716 xfs_bmbt_rec_host_t *ep; /* current extent record */
4717 xfs_extnum_t idx; /* extent record index */ 4404 xfs_extnum_t idx; /* extent record index */
4718 xfs_ifork_t *ifp; /* inode fork pointer */ 4405 xfs_ifork_t *ifp; /* inode fork pointer */
4719 xfs_bmbt_irec_t s; /* file extent record */ 4406 int state = 0;
4407
4408 if (whichfork == XFS_ATTR_FORK)
4409 state |= BMAP_ATTRFORK;
4720 4410
4721 ifp = XFS_IFORK_PTR(ip, whichfork); 4411 ifp = XFS_IFORK_PTR(ip, whichfork);
4722 ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); 4412 ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
4723 for (idx = 0; idx < cnt; idx++) { 4413 for (idx = 0; idx < cnt; idx++)
4724 ep = xfs_iext_get_ext(ifp, idx); 4414 trace_xfs_extlist(ip, idx, whichfork, caller_ip);
4725 xfs_bmbt_get_all(ep, &s);
4726 XFS_BMAP_TRACE_INSERT("exlist", ip, idx, 1, &s, NULL,
4727 whichfork);
4728 }
4729} 4415}
4730#endif
4731 4416
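
xfs_bmap_trace_exlist() now threads a caller_ip through to the tracepoint so each event records where the dump was requested from; the XFS_BMAP_TRACE_EXLIST() wrapper supplies _THIS_IP_, and xfs_bunmapi() above passes _RET_IP_. For reference, these two macros come from include/linux/kernel.h and rely on GCC extensions:

	/* From include/linux/kernel.h: */
	#define _RET_IP_	(unsigned long)__builtin_return_address(0)
	#define _THIS_IP_	({ __label__ __here; __here: (unsigned long)&&__here; })

	/* Hypothetical use: tag a trace record with an instruction address. */
	static void record_caller(unsigned long ip);

	void traced_op(void)
	{
		record_caller(_THIS_IP_);	/* an address inside traced_op() */
	}

	void traced_wrapper(void)
	{
		record_caller(_RET_IP_);	/* the address traced_wrapper() returns to */
	}
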
4732#ifdef DEBUG
4733/* 4417/*
4734 * Validate that the bmbt_irecs being returned from bmapi are valid 4418 * Validate that the bmbt_irecs being returned from bmapi are valid
4735 * given the callers original parameters. Specifically check the 4419 * given the callers original parameters. Specifically check the
@@ -4805,7 +4489,7 @@ xfs_bmapi(
4805 xfs_fsblock_t abno; /* allocated block number */ 4489 xfs_fsblock_t abno; /* allocated block number */
4806 xfs_extlen_t alen; /* allocated extent length */ 4490 xfs_extlen_t alen; /* allocated extent length */
4807 xfs_fileoff_t aoff; /* allocated file offset */ 4491 xfs_fileoff_t aoff; /* allocated file offset */
4808 xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */ 4492 xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */
4809 xfs_btree_cur_t *cur; /* bmap btree cursor */ 4493 xfs_btree_cur_t *cur; /* bmap btree cursor */
4810 xfs_fileoff_t end; /* end of mapped file region */ 4494 xfs_fileoff_t end; /* end of mapped file region */
4811 int eof; /* we've hit the end of extents */ 4495 int eof; /* we've hit the end of extents */
@@ -5478,7 +5162,8 @@ xfs_bunmapi(
5478 int rsvd; /* OK to allocate reserved blocks */ 5162 int rsvd; /* OK to allocate reserved blocks */
5479 xfs_fsblock_t sum; 5163 xfs_fsblock_t sum;
5480 5164
5481 xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address); 5165 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
5166
5482 whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 5167 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
5483 XFS_ATTR_FORK : XFS_DATA_FORK; 5168 XFS_ATTR_FORK : XFS_DATA_FORK;
5484 ifp = XFS_IFORK_PTR(ip, whichfork); 5169 ifp = XFS_IFORK_PTR(ip, whichfork);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 56f62d2edc35..419dafb9d87d 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -95,6 +95,21 @@ typedef struct xfs_bmap_free
95 /* need write cache flushing and no */ 95 /* need write cache flushing and no */
96 /* additional allocation alignments */ 96 /* additional allocation alignments */
97 97
98#define XFS_BMAPI_FLAGS \
99 { XFS_BMAPI_WRITE, "WRITE" }, \
100 { XFS_BMAPI_DELAY, "DELAY" }, \
101 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
102 { XFS_BMAPI_METADATA, "METADATA" }, \
103 { XFS_BMAPI_EXACT, "EXACT" }, \
104 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
105 { XFS_BMAPI_ASYNC, "ASYNC" }, \
106 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
107 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
108 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
109 { XFS_BMAPI_CONTIG, "CONTIG" }, \
110 { XFS_BMAPI_CONVERT, "CONVERT" }
111
112
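
The { flag, "NAME" } pairs above exist so tracepoints can print XFS_BMAPI_* masks symbolically via the tracing core's __print_flags() helper, which consumes exactly this table shape. A standalone userspace analogue of what that helper does (the bit values here are placeholders, not the real XFS_BMAPI_* flags):

	#include <stdio.h>

	struct flag_name {
		unsigned long	mask;
		const char	*name;
	};

	/* Userspace analogue of the tracing core's __print_flags(). */
	static void print_flags(unsigned long flags,
				const struct flag_name *tbl, size_t n)
	{
		const char *sep = "";

		for (size_t i = 0; i < n; i++) {
			if (flags & tbl[i].mask) {
				printf("%s%s", sep, tbl[i].name);
				sep = "|";
			}
		}
		putchar('\n');
	}

	int main(void)
	{
		/* Placeholder bit values; see xfs_bmap.h for the real flags. */
		const struct flag_name bmapi[] = {
			{ 1 << 0, "WRITE" }, { 1 << 1, "DELAY" }, { 1 << 2, "ENTIRE" },
		};

		print_flags((1 << 0) | (1 << 1), bmapi, 3);	/* -> WRITE|DELAY */
		return 0;
	}
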
98static inline int xfs_bmapi_aflag(int w) 113static inline int xfs_bmapi_aflag(int w)
99{ 114{
100 return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); 115 return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
@@ -135,36 +150,43 @@ typedef struct xfs_bmalloca {
135 char conv; /* overwriting unwritten extents */ 150 char conv; /* overwriting unwritten extents */
136} xfs_bmalloca_t; 151} xfs_bmalloca_t;
137 152
138#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE)
139/* 153/*
140 * Trace operations for bmap extent tracing 154 * Flags for xfs_bmap_add_extent*.
141 */ 155 */
142#define XFS_BMAP_KTRACE_DELETE 1 156#define BMAP_LEFT_CONTIG (1 << 0)
143#define XFS_BMAP_KTRACE_INSERT 2 157#define BMAP_RIGHT_CONTIG (1 << 1)
144#define XFS_BMAP_KTRACE_PRE_UP 3 158#define BMAP_LEFT_FILLING (1 << 2)
145#define XFS_BMAP_KTRACE_POST_UP 4 159#define BMAP_RIGHT_FILLING (1 << 3)
146 160#define BMAP_LEFT_DELAY (1 << 4)
147#define XFS_BMAP_TRACE_SIZE 4096 /* size of global trace buffer */ 161#define BMAP_RIGHT_DELAY (1 << 5)
148#define XFS_BMAP_KTRACE_SIZE 32 /* size of per-inode trace buffer */ 162#define BMAP_LEFT_VALID (1 << 6)
149extern ktrace_t *xfs_bmap_trace_buf; 163#define BMAP_RIGHT_VALID (1 << 7)
164#define BMAP_ATTRFORK (1 << 8)
165
166#define XFS_BMAP_EXT_FLAGS \
167 { BMAP_LEFT_CONTIG, "LC" }, \
168 { BMAP_RIGHT_CONTIG, "RC" }, \
169 { BMAP_LEFT_FILLING, "LF" }, \
170 { BMAP_RIGHT_FILLING, "RF" }, \
171 { BMAP_ATTRFORK, "ATTR" }
150 172
151/* 173/*
152 * Add bmap trace insert entries for all the contents of the extent list. 174 * Add bmap trace insert entries for all the contents of the extent list.
175 *
176 * Quite excessive tracing. Only do this for debug builds.
153 */ 177 */
178#if defined(__KERNEL) && defined(DEBUG)
154void 179void
155xfs_bmap_trace_exlist( 180xfs_bmap_trace_exlist(
156 const char *fname, /* function name */
157 struct xfs_inode *ip, /* incore inode pointer */ 181 struct xfs_inode *ip, /* incore inode pointer */
158 xfs_extnum_t cnt, /* count of entries in list */ 182 xfs_extnum_t cnt, /* count of entries in list */
 159 int whichfork); /* data or attr fork */ 183 int whichfork, /* data or attr fork */
 184 unsigned long caller_ip);
160#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ 185#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
161 xfs_bmap_trace_exlist(__func__,ip,c,w) 186 xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
162 187#else
163#else /* __KERNEL__ && XFS_BMAP_TRACE */
164
165#define XFS_BMAP_TRACE_EXLIST(ip,c,w) 188#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
166 189#endif
167#endif /* __KERNEL__ && XFS_BMAP_TRACE */
168 190
169/* 191/*
170 * Convert inode from non-attributed to attributed. 192 * Convert inode from non-attributed to attributed.
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index eb7b702d0690..416e47e54b83 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -98,8 +98,7 @@ xfs_bmdr_to_bmbt(
98 * This code must be in sync with the routines xfs_bmbt_get_startoff, 98 * This code must be in sync with the routines xfs_bmbt_get_startoff,
99 * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. 99 * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
100 */ 100 */
101 101STATIC void
102STATIC_INLINE void
103__xfs_bmbt_get_all( 102__xfs_bmbt_get_all(
104 __uint64_t l0, 103 __uint64_t l0,
105 __uint64_t l1, 104 __uint64_t l1,
@@ -335,7 +334,7 @@ xfs_bmbt_disk_set_allf(
335/* 334/*
336 * Set all the fields in a bmap extent record from the uncompressed form. 335 * Set all the fields in a bmap extent record from the uncompressed form.
337 */ 336 */
338void 337STATIC void
339xfs_bmbt_disk_set_all( 338xfs_bmbt_disk_set_all(
340 xfs_bmbt_rec_t *r, 339 xfs_bmbt_rec_t *r,
341 xfs_bmbt_irec_t *s) 340 xfs_bmbt_irec_t *s)
@@ -769,12 +768,6 @@ xfs_bmbt_trace_enter(
769 (void *)a0, (void *)a1, (void *)a2, (void *)a3, 768 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
770 (void *)a4, (void *)a5, (void *)a6, (void *)a7, 769 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
771 (void *)a8, (void *)a9, (void *)a10); 770 (void *)a8, (void *)a9, (void *)a10);
772 ktrace_enter(ip->i_btrace,
773 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
774 (void *)func, (void *)s, (void *)ip, (void *)cur,
775 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
776 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
777 (void *)a8, (void *)a9, (void *)a10);
778} 771}
779 772
780STATIC void 773STATIC void
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 5549d495947f..0e66c4ea0f85 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -46,20 +46,12 @@ typedef struct xfs_bmdr_block {
46#define BMBT_STARTBLOCK_BITLEN 52 46#define BMBT_STARTBLOCK_BITLEN 52
47#define BMBT_BLOCKCOUNT_BITLEN 21 47#define BMBT_BLOCKCOUNT_BITLEN 21
48 48
49 49typedef struct xfs_bmbt_rec {
50#define BMBT_USE_64 1
51
52typedef struct xfs_bmbt_rec_32
53{
54 __uint32_t l0, l1, l2, l3;
55} xfs_bmbt_rec_32_t;
56typedef struct xfs_bmbt_rec_64
57{
58 __be64 l0, l1; 50 __be64 l0, l1;
59} xfs_bmbt_rec_64_t; 51} xfs_bmbt_rec_t;
60 52
61typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ 53typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
62typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t; 54typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
63 55
64typedef struct xfs_bmbt_rec_host { 56typedef struct xfs_bmbt_rec_host {
65 __uint64_t l0, l1; 57 __uint64_t l0, l1;
@@ -231,7 +223,6 @@ extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
231extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); 223extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
232extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); 224extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
233 225
234extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
235extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, 226extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
236 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); 227 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
237 228
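
With the unused 32-bit variant gone, xfs_bmbt_rec is just two big-endian 64-bit words packing the fields whose widths are declared above (1 extent-flag bit + 54-bit startoff + 52-bit startblock + 21-bit blockcount). A sketch of how two of those fields unpack from a host-endian copy; the bit positions follow the BMBT_*_BITLEN constants, while the struct and helper names are invented for illustration:

	#include <stdint.h>

	#define BMBT_EXNTFLAG_BITLEN	1
	#define BMBT_STARTOFF_BITLEN	54
	#define BMBT_BLOCKCOUNT_BITLEN	21

	/* Host-endian view of one record, as in struct xfs_bmbt_rec_host. */
	struct bmbt_rec_host {
		uint64_t l0, l1;
	};

	/* blockcount occupies the low 21 bits of l1 */
	static uint64_t rec_blockcount(const struct bmbt_rec_host *r)
	{
		return r->l1 & ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1);
	}

	/* startoff sits in l0 below the extent flag, above 9 startblock bits */
	static uint64_t rec_startoff(const struct bmbt_rec_host *r)
	{
		return (r->l0 >> 9) & ((1ULL << BMBT_STARTOFF_BITLEN) - 1);
	}
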
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 52b5f14d0c32..96be4b0f2496 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -39,6 +39,7 @@
39#include "xfs_btree_trace.h" 39#include "xfs_btree_trace.h"
40#include "xfs_ialloc.h" 40#include "xfs_ialloc.h"
41#include "xfs_error.h" 41#include "xfs_error.h"
42#include "xfs_trace.h"
42 43
43/* 44/*
44 * Cursor allocation zone. 45 * Cursor allocation zone.
@@ -81,7 +82,7 @@ xfs_btree_check_lblock(
81 XFS_ERRTAG_BTREE_CHECK_LBLOCK, 82 XFS_ERRTAG_BTREE_CHECK_LBLOCK,
82 XFS_RANDOM_BTREE_CHECK_LBLOCK))) { 83 XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
83 if (bp) 84 if (bp)
84 xfs_buftrace("LBTREE ERROR", bp); 85 trace_xfs_btree_corrupt(bp, _RET_IP_);
85 XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, 86 XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW,
86 mp); 87 mp);
87 return XFS_ERROR(EFSCORRUPTED); 88 return XFS_ERROR(EFSCORRUPTED);
@@ -119,7 +120,7 @@ xfs_btree_check_sblock(
119 XFS_ERRTAG_BTREE_CHECK_SBLOCK, 120 XFS_ERRTAG_BTREE_CHECK_SBLOCK,
120 XFS_RANDOM_BTREE_CHECK_SBLOCK))) { 121 XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
121 if (bp) 122 if (bp)
122 xfs_buftrace("SBTREE ERROR", bp); 123 trace_xfs_btree_corrupt(bp, _RET_IP_);
123 XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", 124 XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
124 XFS_ERRLEVEL_LOW, cur->bc_mp, block); 125 XFS_ERRLEVEL_LOW, cur->bc_mp, block);
125 return XFS_ERROR(EFSCORRUPTED); 126 return XFS_ERROR(EFSCORRUPTED);
@@ -976,7 +977,7 @@ xfs_btree_get_buf_block(
976 xfs_daddr_t d; 977 xfs_daddr_t d;
977 978
978 /* need to sort out how callers deal with failures first */ 979 /* need to sort out how callers deal with failures first */
979 ASSERT(!(flags & XFS_BUF_TRYLOCK)); 980 ASSERT(!(flags & XBF_TRYLOCK));
980 981
981 d = xfs_btree_ptr_to_daddr(cur, ptr); 982 d = xfs_btree_ptr_to_daddr(cur, ptr);
982 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, 983 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
@@ -1007,7 +1008,7 @@ xfs_btree_read_buf_block(
1007 int error; 1008 int error;
1008 1009
1009 /* need to sort out how callers deal with failures first */ 1010 /* need to sort out how callers deal with failures first */
1010 ASSERT(!(flags & XFS_BUF_TRYLOCK)); 1011 ASSERT(!(flags & XBF_TRYLOCK));
1011 1012
1012 d = xfs_btree_ptr_to_daddr(cur, ptr); 1013 d = xfs_btree_ptr_to_daddr(cur, ptr);
1013 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1014 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h
index b3f5eb3c3c6c..2d8a309873ea 100644
--- a/fs/xfs/xfs_btree_trace.h
+++ b/fs/xfs/xfs_btree_trace.h
@@ -58,8 +58,6 @@ void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *,
58 struct xfs_buf *, int, int); 58 struct xfs_buf *, int, int);
59void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *, 59void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *,
60 struct xfs_buf *, int, int, int); 60 struct xfs_buf *, int, int, int);
61void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *,
62 xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int);
63void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int); 61void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int);
64void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int, 62void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int,
65 union xfs_btree_ptr, union xfs_btree_key *, int); 63 union xfs_btree_ptr, union xfs_btree_key *, int);
@@ -71,24 +69,10 @@ void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *,
71 union xfs_btree_rec *, int); 69 union xfs_btree_rec *, int);
72void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int); 70void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int);
73 71
74
75#define XFS_ALLOCBT_TRACE_SIZE 4096 /* size of global trace buffer */
76extern ktrace_t *xfs_allocbt_trace_buf;
77
78#define XFS_INOBT_TRACE_SIZE 4096 /* size of global trace buffer */
79extern ktrace_t *xfs_inobt_trace_buf;
80
81#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
82#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
83extern ktrace_t *xfs_bmbt_trace_buf;
84
85
86#define XFS_BTREE_TRACE_ARGBI(c, b, i) \ 72#define XFS_BTREE_TRACE_ARGBI(c, b, i) \
87 xfs_btree_trace_argbi(__func__, c, b, i, __LINE__) 73 xfs_btree_trace_argbi(__func__, c, b, i, __LINE__)
88#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \ 74#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \
89 xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__) 75 xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__)
90#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j) \
91 xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
92#define XFS_BTREE_TRACE_ARGI(c, i) \ 76#define XFS_BTREE_TRACE_ARGI(c, i) \
93 xfs_btree_trace_argi(__func__, c, i, __LINE__) 77 xfs_btree_trace_argi(__func__, c, i, __LINE__)
94#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \ 78#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \
@@ -104,7 +88,6 @@ extern ktrace_t *xfs_bmbt_trace_buf;
104#else 88#else
105#define XFS_BTREE_TRACE_ARGBI(c, b, i) 89#define XFS_BTREE_TRACE_ARGBI(c, b, i)
106#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) 90#define XFS_BTREE_TRACE_ARGBII(c, b, i, j)
107#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)
108#define XFS_BTREE_TRACE_ARGI(c, i) 91#define XFS_BTREE_TRACE_ARGI(c, i)
109#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) 92#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
110#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) 93#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 92af4098c7e8..f3c49e69eab9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -29,6 +29,7 @@
29#include "xfs_buf_item.h" 29#include "xfs_buf_item.h"
30#include "xfs_trans_priv.h" 30#include "xfs_trans_priv.h"
31#include "xfs_error.h" 31#include "xfs_error.h"
32#include "xfs_trace.h"
32 33
33 34
34kmem_zone_t *xfs_buf_item_zone; 35kmem_zone_t *xfs_buf_item_zone;
@@ -164,7 +165,7 @@ xfs_buf_item_size(
164 * is the buf log format structure with the 165 * is the buf log format structure with the
165 * cancel flag in it. 166 * cancel flag in it.
166 */ 167 */
167 xfs_buf_item_trace("SIZE STALE", bip); 168 trace_xfs_buf_item_size_stale(bip);
168 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
169 return 1; 170 return 1;
170 } 171 }
@@ -206,7 +207,7 @@ xfs_buf_item_size(
206 } 207 }
207 } 208 }
208 209
209 xfs_buf_item_trace("SIZE NORM", bip); 210 trace_xfs_buf_item_size(bip);
210 return nvecs; 211 return nvecs;
211} 212}
212 213
@@ -249,7 +250,7 @@ xfs_buf_item_format(
249 ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); 250 ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
250 vecp->i_addr = (xfs_caddr_t)&bip->bli_format; 251 vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
251 vecp->i_len = base_size; 252 vecp->i_len = base_size;
252 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT); 253 vecp->i_type = XLOG_REG_TYPE_BFORMAT;
253 vecp++; 254 vecp++;
254 nvecs = 1; 255 nvecs = 1;
255 256
@@ -259,7 +260,7 @@ xfs_buf_item_format(
259 * is the buf log format structure with the 260 * is the buf log format structure with the
260 * cancel flag in it. 261 * cancel flag in it.
261 */ 262 */
262 xfs_buf_item_trace("FORMAT STALE", bip); 263 trace_xfs_buf_item_format_stale(bip);
263 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
264 bip->bli_format.blf_size = nvecs; 265 bip->bli_format.blf_size = nvecs;
265 return; 266 return;
@@ -296,14 +297,14 @@ xfs_buf_item_format(
296 buffer_offset = first_bit * XFS_BLI_CHUNK; 297 buffer_offset = first_bit * XFS_BLI_CHUNK;
297 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
298 vecp->i_len = nbits * XFS_BLI_CHUNK; 299 vecp->i_len = nbits * XFS_BLI_CHUNK;
299 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 300 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
300 nvecs++; 301 nvecs++;
301 break; 302 break;
302 } else if (next_bit != last_bit + 1) { 303 } else if (next_bit != last_bit + 1) {
303 buffer_offset = first_bit * XFS_BLI_CHUNK; 304 buffer_offset = first_bit * XFS_BLI_CHUNK;
304 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
305 vecp->i_len = nbits * XFS_BLI_CHUNK; 306 vecp->i_len = nbits * XFS_BLI_CHUNK;
306 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 307 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
307 nvecs++; 308 nvecs++;
308 vecp++; 309 vecp++;
309 first_bit = next_bit; 310 first_bit = next_bit;
@@ -315,7 +316,7 @@ xfs_buf_item_format(
315 buffer_offset = first_bit * XFS_BLI_CHUNK; 316 buffer_offset = first_bit * XFS_BLI_CHUNK;
316 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
317 vecp->i_len = nbits * XFS_BLI_CHUNK; 318 vecp->i_len = nbits * XFS_BLI_CHUNK;
318 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 319 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 319/* You would think we need to bump the nvecs here too, but we do not; 320/* You would think we need to bump the nvecs here too, but we do not;
320 * this number is used by recovery, and it gets confused by the boundary 321 * this number is used by recovery, and it gets confused by the boundary
321 * split here 322 * split here
@@ -335,7 +336,7 @@ xfs_buf_item_format(
335 /* 336 /*
336 * Check to make sure everything is consistent. 337 * Check to make sure everything is consistent.
337 */ 338 */
338 xfs_buf_item_trace("FORMAT NORM", bip); 339 trace_xfs_buf_item_format(bip);
339 xfs_buf_item_log_check(bip); 340 xfs_buf_item_log_check(bip);
340} 341}
341 342
@@ -355,8 +356,7 @@ xfs_buf_item_pin(
355 ASSERT(atomic_read(&bip->bli_refcount) > 0); 356 ASSERT(atomic_read(&bip->bli_refcount) > 0);
356 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
357 (bip->bli_flags & XFS_BLI_STALE)); 358 (bip->bli_flags & XFS_BLI_STALE));
358 xfs_buf_item_trace("PIN", bip); 359 trace_xfs_buf_item_pin(bip);
359 xfs_buftrace("XFS_PIN", bp);
360 xfs_bpin(bp); 360 xfs_bpin(bp);
361} 361}
362 362
@@ -383,8 +383,7 @@ xfs_buf_item_unpin(
383 ASSERT(bp != NULL); 383 ASSERT(bp != NULL);
384 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); 384 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
385 ASSERT(atomic_read(&bip->bli_refcount) > 0); 385 ASSERT(atomic_read(&bip->bli_refcount) > 0);
386 xfs_buf_item_trace("UNPIN", bip); 386 trace_xfs_buf_item_unpin(bip);
387 xfs_buftrace("XFS_UNPIN", bp);
388 387
389 freed = atomic_dec_and_test(&bip->bli_refcount); 388 freed = atomic_dec_and_test(&bip->bli_refcount);
390 ailp = bip->bli_item.li_ailp; 389 ailp = bip->bli_item.li_ailp;
@@ -395,8 +394,8 @@ xfs_buf_item_unpin(
395 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
396 ASSERT(XFS_BUF_ISSTALE(bp)); 395 ASSERT(XFS_BUF_ISSTALE(bp));
397 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
398 xfs_buf_item_trace("UNPIN STALE", bip); 397 trace_xfs_buf_item_unpin_stale(bip);
399 xfs_buftrace("XFS_UNPIN STALE", bp); 398
400 /* 399 /*
401 * If we get called here because of an IO error, we may 400 * If we get called here because of an IO error, we may
402 * or may not have the item on the AIL. xfs_trans_ail_delete() 401 * or may not have the item on the AIL. xfs_trans_ail_delete()
@@ -440,8 +439,8 @@ xfs_buf_item_unpin_remove(
440 if ((atomic_read(&bip->bli_refcount) == 1) && 439 if ((atomic_read(&bip->bli_refcount) == 1) &&
441 (bip->bli_flags & XFS_BLI_STALE)) { 440 (bip->bli_flags & XFS_BLI_STALE)) {
442 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); 441 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
443 xfs_buf_item_trace("UNPIN REMOVE", bip); 442 trace_xfs_buf_item_unpin_stale(bip);
444 xfs_buftrace("XFS_UNPIN_REMOVE", bp); 443
445 /* 444 /*
446 * yes -- clear the xaction descriptor in-use flag 445 * yes -- clear the xaction descriptor in-use flag
447 * and free the chunk if required. We can safely 446 * and free the chunk if required. We can safely
@@ -468,8 +467,10 @@ xfs_buf_item_unpin_remove(
468/* 467/*
469 * This is called to attempt to lock the buffer associated with this 468 * This is called to attempt to lock the buffer associated with this
470 * buf log item. Don't sleep on the buffer lock. If we can't get 469 * buf log item. Don't sleep on the buffer lock. If we can't get
471 * the lock right away, return 0. If we can get the lock, pull the 470 * the lock right away, return 0. If we can get the lock, take a
472 * buffer from the free list, mark it busy, and return 1. 471 * reference to the buffer. If this is a delayed write buffer that
472 * needs AIL help to be written back, invoke the pushbuf routine
473 * rather than the normal success path.
473 */ 474 */
474STATIC uint 475STATIC uint
475xfs_buf_item_trylock( 476xfs_buf_item_trylock(
@@ -478,24 +479,18 @@ xfs_buf_item_trylock(
478 xfs_buf_t *bp; 479 xfs_buf_t *bp;
479 480
480 bp = bip->bli_buf; 481 bp = bip->bli_buf;
481 482 if (XFS_BUF_ISPINNED(bp))
482 if (XFS_BUF_ISPINNED(bp)) {
483 return XFS_ITEM_PINNED; 483 return XFS_ITEM_PINNED;
484 } 484 if (!XFS_BUF_CPSEMA(bp))
485
486 if (!XFS_BUF_CPSEMA(bp)) {
487 return XFS_ITEM_LOCKED; 485 return XFS_ITEM_LOCKED;
488 }
489 486
490 /* 487 /* take a reference to the buffer. */
491 * Remove the buffer from the free list. Only do this
492 * if it's on the free list. Private buffers like the
493 * superblock buffer are not.
494 */
495 XFS_BUF_HOLD(bp); 488 XFS_BUF_HOLD(bp);
496 489
497 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 490 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
498 xfs_buf_item_trace("TRYLOCK SUCCESS", bip); 491 trace_xfs_buf_item_trylock(bip);
492 if (XFS_BUF_ISDELAYWRITE(bp))
493 return XFS_ITEM_PUSHBUF;
499 return XFS_ITEM_SUCCESS; 494 return XFS_ITEM_SUCCESS;
500} 495}
501 496
@@ -524,7 +519,6 @@ xfs_buf_item_unlock(
524 uint hold; 519 uint hold;
525 520
526 bp = bip->bli_buf; 521 bp = bip->bli_buf;
527 xfs_buftrace("XFS_UNLOCK", bp);
528 522
529 /* 523 /*
530 * Clear the buffer's association with this transaction. 524 * Clear the buffer's association with this transaction.
@@ -547,7 +541,7 @@ xfs_buf_item_unlock(
547 */ 541 */
548 if (bip->bli_flags & XFS_BLI_STALE) { 542 if (bip->bli_flags & XFS_BLI_STALE) {
549 bip->bli_flags &= ~XFS_BLI_LOGGED; 543 bip->bli_flags &= ~XFS_BLI_LOGGED;
550 xfs_buf_item_trace("UNLOCK STALE", bip); 544 trace_xfs_buf_item_unlock_stale(bip);
551 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 545 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
552 if (!aborted) 546 if (!aborted)
553 return; 547 return;
@@ -574,7 +568,7 @@ xfs_buf_item_unlock(
574 * release the buffer at the end of this routine. 568 * release the buffer at the end of this routine.
575 */ 569 */
576 hold = bip->bli_flags & XFS_BLI_HOLD; 570 hold = bip->bli_flags & XFS_BLI_HOLD;
577 xfs_buf_item_trace("UNLOCK", bip); 571 trace_xfs_buf_item_unlock(bip);
578 572
579 /* 573 /*
580 * If the buf item isn't tracking any data, free it. 574 * If the buf item isn't tracking any data, free it.
@@ -618,7 +612,8 @@ xfs_buf_item_committed(
618 xfs_buf_log_item_t *bip, 612 xfs_buf_log_item_t *bip,
619 xfs_lsn_t lsn) 613 xfs_lsn_t lsn)
620{ 614{
621 xfs_buf_item_trace("COMMITTED", bip); 615 trace_xfs_buf_item_committed(bip);
616
622 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 617 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
623 (bip->bli_item.li_lsn != 0)) { 618 (bip->bli_item.li_lsn != 0)) {
624 return bip->bli_item.li_lsn; 619 return bip->bli_item.li_lsn;
@@ -627,11 +622,9 @@ xfs_buf_item_committed(
627} 622}
628 623
629/* 624/*
630 * This is called to asynchronously write the buffer associated with this 625 * The buffer is locked, but is not a delayed write buffer. This happens
631 * buf log item out to disk. The buffer will already have been locked by 626 * if we race with IO completion and hence we don't want to try to write it
632 * a successful call to xfs_buf_item_trylock(). If the buffer still has 627 * again. Just release the buffer.
633 * B_DELWRI set, then get it going out to disk with a call to bawrite().
634 * If not, then just release the buffer.
635 */ 628 */
636STATIC void 629STATIC void
637xfs_buf_item_push( 630xfs_buf_item_push(
@@ -640,20 +633,32 @@ xfs_buf_item_push(
640 xfs_buf_t *bp; 633 xfs_buf_t *bp;
641 634
642 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 635 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
643 xfs_buf_item_trace("PUSH", bip); 636 trace_xfs_buf_item_push(bip);
644 637
645 bp = bip->bli_buf; 638 bp = bip->bli_buf;
639 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
640 xfs_buf_relse(bp);
641}
646 642
647 if (XFS_BUF_ISDELAYWRITE(bp)) { 643/*
648 int error; 644 * The buffer is locked and is a delayed write buffer. Promote the buffer
649 error = xfs_bawrite(bip->bli_item.li_mountp, bp); 645 * in the delayed write queue as the caller knows that they must invoke
650 if (error) 646 * the xfsbufd to get this buffer written. We have to unlock the buffer
651 xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, 647 * to allow the xfsbufd to write it, too.
652 "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", 648 */
653 error, bip, bp); 649STATIC void
654 } else { 650xfs_buf_item_pushbuf(
655 xfs_buf_relse(bp); 651 xfs_buf_log_item_t *bip)
656 } 652{
653 xfs_buf_t *bp;
654
655 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
656 trace_xfs_buf_item_pushbuf(bip);
657
658 bp = bip->bli_buf;
659 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
660 xfs_buf_delwri_promote(bp);
661 xfs_buf_relse(bp);
657} 662}
658 663
659/* ARGSUSED */ 664/* ARGSUSED */
@@ -678,7 +683,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
678 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 683 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
679 xfs_buf_item_committed, 684 xfs_buf_item_committed,
680 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, 685 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
681 .iop_pushbuf = NULL, 686 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
682 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 687 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
683 xfs_buf_item_committing 688 xfs_buf_item_committing
684}; 689};
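With .iop_pushbuf now wired into the ops table, the AIL pusher gains a third productive outcome from trylock. A hedged sketch of the caller-side dispatch (the real xfsaild loop carries more bookkeeping; IOP_TRYLOCK/IOP_PUSH/IOP_PUSHBUF are assumed to be the usual indirection macros over these ops):

	switch (IOP_TRYLOCK(lip)) {
	case XFS_ITEM_SUCCESS:
		IOP_PUSH(lip);		/* locked and clean: write it back */
		break;
	case XFS_ITEM_PUSHBUF:
		IOP_PUSHBUF(lip);	/* locked delwri buffer: promote it */
		break;
	case XFS_ITEM_PINNED:
		/* pinned by the log: force the log and retry later */
		break;
	case XFS_ITEM_LOCKED:
		/* lock held elsewhere: skip this item for now */
		break;
	}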
@@ -738,9 +743,6 @@ xfs_buf_item_init(
738 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); 743 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
739 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); 744 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
740 bip->bli_format.blf_map_size = map_size; 745 bip->bli_format.blf_map_size = map_size;
741#ifdef XFS_BLI_TRACE
742 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS);
743#endif
744 746
745#ifdef XFS_TRANS_DEBUG 747#ifdef XFS_TRANS_DEBUG
746 /* 748 /*
@@ -878,9 +880,6 @@ xfs_buf_item_free(
878 kmem_free(bip->bli_logged); 880 kmem_free(bip->bli_logged);
879#endif /* XFS_TRANS_DEBUG */ 881#endif /* XFS_TRANS_DEBUG */
880 882
881#ifdef XFS_BLI_TRACE
882 ktrace_free(bip->bli_trace);
883#endif
884 kmem_zone_free(xfs_buf_item_zone, bip); 883 kmem_zone_free(xfs_buf_item_zone, bip);
885} 884}
886 885
@@ -897,7 +896,8 @@ xfs_buf_item_relse(
897{ 896{
898 xfs_buf_log_item_t *bip; 897 xfs_buf_log_item_t *bip;
899 898
900 xfs_buftrace("XFS_RELSE", bp); 899 trace_xfs_buf_item_relse(bp, _RET_IP_);
900
901 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); 901 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
902 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); 902 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
903 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && 903 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
@@ -994,7 +994,7 @@ xfs_buf_iodone_callbacks(
994 if (XFS_FORCED_SHUTDOWN(mp)) { 994 if (XFS_FORCED_SHUTDOWN(mp)) {
995 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 995 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
996 XFS_BUF_SUPER_STALE(bp); 996 XFS_BUF_SUPER_STALE(bp);
997 xfs_buftrace("BUF_IODONE_CB", bp); 997 trace_xfs_buf_item_iodone(bp, _RET_IP_);
998 xfs_buf_do_callbacks(bp, lip); 998 xfs_buf_do_callbacks(bp, lip);
999 XFS_BUF_SET_FSPRIVATE(bp, NULL); 999 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1000 XFS_BUF_CLR_IODONE_FUNC(bp); 1000 XFS_BUF_CLR_IODONE_FUNC(bp);
@@ -1030,7 +1030,7 @@ xfs_buf_iodone_callbacks(
1030 XFS_BUF_SET_START(bp); 1030 XFS_BUF_SET_START(bp);
1031 } 1031 }
1032 ASSERT(XFS_BUF_IODONE_FUNC(bp)); 1032 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1033 xfs_buftrace("BUF_IODONE ASYNC", bp); 1033 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1034 xfs_buf_relse(bp); 1034 xfs_buf_relse(bp);
1035 } else { 1035 } else {
1036 /* 1036 /*
@@ -1053,9 +1053,7 @@ xfs_buf_iodone_callbacks(
1053 } 1053 }
1054 return; 1054 return;
1055 } 1055 }
1056#ifdef XFSERRORDEBUG 1056
1057 xfs_buftrace("XFS BUFCB NOERR", bp);
1058#endif
1059 xfs_buf_do_callbacks(bp, lip); 1057 xfs_buf_do_callbacks(bp, lip);
1060 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1058 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1061 XFS_BUF_CLR_IODONE_FUNC(bp); 1059 XFS_BUF_CLR_IODONE_FUNC(bp);
@@ -1081,7 +1079,9 @@ xfs_buf_error_relse(
1081 XFS_BUF_DONE(bp); 1079 XFS_BUF_DONE(bp);
1082 XFS_BUF_UNDELAYWRITE(bp); 1080 XFS_BUF_UNDELAYWRITE(bp);
1083 XFS_BUF_ERROR(bp,0); 1081 XFS_BUF_ERROR(bp,0);
1084 xfs_buftrace("BUF_ERROR_RELSE", bp); 1082
1083 trace_xfs_buf_error_relse(bp, _RET_IP_);
1084
1085 if (! XFS_FORCED_SHUTDOWN(mp)) 1085 if (! XFS_FORCED_SHUTDOWN(mp))
1086 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1086 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1087 /* 1087 /*
@@ -1128,34 +1128,3 @@ xfs_buf_iodone(
1128 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); 1128 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
1129 xfs_buf_item_free(bip); 1129 xfs_buf_item_free(bip);
1130} 1130}
1131
1132#if defined(XFS_BLI_TRACE)
1133void
1134xfs_buf_item_trace(
1135 char *id,
1136 xfs_buf_log_item_t *bip)
1137{
1138 xfs_buf_t *bp;
1139 ASSERT(bip->bli_trace != NULL);
1140
1141 bp = bip->bli_buf;
1142 ktrace_enter(bip->bli_trace,
1143 (void *)id,
1144 (void *)bip->bli_buf,
1145 (void *)((unsigned long)bip->bli_flags),
1146 (void *)((unsigned long)bip->bli_recur),
1147 (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
1148 (void *)((unsigned long)
1149 (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)),
1150 (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))),
1151 (void *)((unsigned long)XFS_BUF_COUNT(bp)),
1152 (void *)((unsigned long)XFS_BUF_BFLAGS(bp)),
1153 XFS_BUF_FSPRIVATE(bp, void *),
1154 XFS_BUF_FSPRIVATE2(bp, void *),
1155 (void *)(unsigned long)XFS_BUF_ISPINNED(bp),
1156 (void *)XFS_BUF_IODONE_FUNC(bp),
1157 (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
1158 (void *)bip->bli_item.li_desc,
1159 (void *)((unsigned long)bip->bli_item.li_flags));
1160}
1161#endif /* XFS_BLI_TRACE */
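The ktrace machinery deleted above is superseded by static tracepoints declared in the new fs/xfs/xfs_trace.h. A hedged sketch of what such a declaration can look like -- the real header shares one event class across the xfs_buf_item_* events and its exact field list may differ; the fields and the XFS_BLI_FLAGS print table used here are the ones visible elsewhere in this diff:

	DECLARE_EVENT_CLASS(xfs_buf_item_class,
		TP_PROTO(struct xfs_buf_log_item *bip),
		TP_ARGS(bip),
		TP_STRUCT__entry(
			__field(unsigned, bli_flags)
			__field(unsigned, bli_recur)
			__field(int, bli_refcount)
		),
		TP_fast_assign(
			__entry->bli_flags = bip->bli_flags;
			__entry->bli_recur = bip->bli_recur;
			__entry->bli_refcount = atomic_read(&bip->bli_refcount);
		),
		TP_printk("flags %s recur %u refcount %d",
			  __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
			  __entry->bli_recur, __entry->bli_refcount)
	);
	DEFINE_EVENT(xfs_buf_item_class, xfs_buf_item_pin,
		TP_PROTO(struct xfs_buf_log_item *bip),
		TP_ARGS(bip));

Unlike the always-on ktrace ring buffers, these compile down to no-ops until enabled at runtime through debugfs (e.g. echo 1 > /sys/kernel/debug/tracing/events/xfs/xfs_buf_item_pin/enable).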
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 5a41c348bb1c..217f34af00cb 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -70,22 +70,21 @@ typedef struct xfs_buf_log_format_t {
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72 72
73#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \
75 { XFS_BLI_DIRTY, "DIRTY" }, \
76 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" }
80
73 81
74#ifdef __KERNEL__ 82#ifdef __KERNEL__
75 83
76struct xfs_buf; 84struct xfs_buf;
77struct ktrace;
78struct xfs_mount; 85struct xfs_mount;
79struct xfs_buf_log_item; 86struct xfs_buf_log_item;
80 87
81#if defined(XFS_BLI_TRACE)
82#define XFS_BLI_TRACE_SIZE 32
83
84void xfs_buf_item_trace(char *, struct xfs_buf_log_item *);
85#else
86#define xfs_buf_item_trace(id, bip)
87#endif
88
89/* 88/*
90 * This is the in core log item structure used to track information 89 * This is the in core log item structure used to track information
91 * needed to log buffers. It tracks how many times the lock has been 90 * needed to log buffers. It tracks how many times the lock has been
@@ -97,9 +96,6 @@ typedef struct xfs_buf_log_item {
97 unsigned int bli_flags; /* misc flags */ 96 unsigned int bli_flags; /* misc flags */
98 unsigned int bli_recur; /* lock recursion count */ 97 unsigned int bli_recur; /* lock recursion count */
99 atomic_t bli_refcount; /* cnt of tp refs */ 98 atomic_t bli_refcount; /* cnt of tp refs */
100#ifdef XFS_BLI_TRACE
101 struct ktrace *bli_trace; /* event trace buf */
102#endif
103#ifdef XFS_TRANS_DEBUG 99#ifdef XFS_TRANS_DEBUG
104 char *bli_orig; /* original buffer copy */ 100 char *bli_orig; /* original buffer copy */
105 char *bli_logged; /* bytes logged (bitmap) */ 101 char *bli_logged; /* bytes logged (bitmap) */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 2847bbc1c534..0ca556b4bf31 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -46,6 +46,7 @@
46#include "xfs_dir2_block.h" 46#include "xfs_dir2_block.h"
47#include "xfs_dir2_node.h" 47#include "xfs_dir2_node.h"
48#include "xfs_error.h" 48#include "xfs_error.h"
49#include "xfs_trace.h"
49 50
50/* 51/*
51 * xfs_da_btree.c 52 * xfs_da_btree.c
@@ -1533,8 +1534,8 @@ xfs_da_hashname(const __uint8_t *name, int namelen)
1533enum xfs_dacmp 1534enum xfs_dacmp
1534xfs_da_compname( 1535xfs_da_compname(
1535 struct xfs_da_args *args, 1536 struct xfs_da_args *args,
1536 const char *name, 1537 const unsigned char *name,
1537 int len) 1538 int len)
1538{ 1539{
1539 return (args->namelen == len && memcmp(args->name, name, len) == 0) ? 1540 return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
1540 XFS_CMP_EXACT : XFS_CMP_DIFFERENT; 1541 XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
@@ -2107,7 +2108,7 @@ xfs_da_do_buf(
2107 (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC), 2108 (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC),
2108 mp, XFS_ERRTAG_DA_READ_BUF, 2109 mp, XFS_ERRTAG_DA_READ_BUF,
2109 XFS_RANDOM_DA_READ_BUF))) { 2110 XFS_RANDOM_DA_READ_BUF))) {
2110 xfs_buftrace("DA READ ERROR", rbp->bps[0]); 2111 trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_);
2111 XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", 2112 XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
2112 XFS_ERRLEVEL_LOW, mp, info); 2113 XFS_ERRLEVEL_LOW, mp, info);
2113 error = XFS_ERROR(EFSCORRUPTED); 2114 error = XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 8c536167bf75..fe9f5a8c1d2a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -125,6 +125,13 @@ typedef struct xfs_da_args {
125#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ 125#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
126#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ 126#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
127 127
128#define XFS_DA_OP_FLAGS \
129 { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
130 { XFS_DA_OP_RENAME, "RENAME" }, \
131 { XFS_DA_OP_ADDNAME, "ADDNAME" }, \
132 { XFS_DA_OP_OKNOENT, "OKNOENT" }, \
133 { XFS_DA_OP_CILOOKUP, "CILOOKUP" }
134
128/* 135/*
129 * Structure to describe buffer(s) for a block. 136 * Structure to describe buffer(s) for a block.
130 * This is needed in the directory version 2 format case, when 137 * This is needed in the directory version 2 format case, when
@@ -202,7 +209,8 @@ typedef struct xfs_da_state {
202 */ 209 */
203struct xfs_nameops { 210struct xfs_nameops {
204 xfs_dahash_t (*hashname)(struct xfs_name *); 211 xfs_dahash_t (*hashname)(struct xfs_name *);
205 enum xfs_dacmp (*compname)(struct xfs_da_args *, const char *, int); 212 enum xfs_dacmp (*compname)(struct xfs_da_args *,
213 const unsigned char *, int);
206}; 214};
207 215
208 216
@@ -253,7 +261,7 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
253 261
254uint xfs_da_hashname(const __uint8_t *name_string, int name_length); 262uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
255enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, 263enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
256 const char *name, int len); 264 const unsigned char *name, int len);
257 265
258 266
259xfs_da_state_t *xfs_da_state_alloc(void); 267xfs_da_state_t *xfs_da_state_alloc(void);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index ab89a7e94a0f..cd27c9d6c71f 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -43,16 +43,23 @@
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_vnodeops.h" 45#include "xfs_vnodeops.h"
46#include "xfs_trace.h"
47
48
49static int xfs_swap_extents(
50 xfs_inode_t *ip, /* target inode */
51 xfs_inode_t *tip, /* tmp inode */
52 xfs_swapext_t *sxp);
46 53
47/* 54/*
48 * Syssgi interface for swapext 55 * ioctl interface for swapext
49 */ 56 */
50int 57int
51xfs_swapext( 58xfs_swapext(
52 xfs_swapext_t *sxp) 59 xfs_swapext_t *sxp)
53{ 60{
54 xfs_inode_t *ip, *tip; 61 xfs_inode_t *ip, *tip;
55 struct file *file, *target_file; 62 struct file *file, *tmp_file;
56 int error = 0; 63 int error = 0;
57 64
58 /* Pull information for the target fd */ 65 /* Pull information for the target fd */
@@ -67,56 +74,128 @@ xfs_swapext(
67 goto out_put_file; 74 goto out_put_file;
68 } 75 }
69 76
70 target_file = fget((int)sxp->sx_fdtmp); 77 tmp_file = fget((int)sxp->sx_fdtmp);
71 if (!target_file) { 78 if (!tmp_file) {
72 error = XFS_ERROR(EINVAL); 79 error = XFS_ERROR(EINVAL);
73 goto out_put_file; 80 goto out_put_file;
74 } 81 }
75 82
76 if (!(target_file->f_mode & FMODE_WRITE) || 83 if (!(tmp_file->f_mode & FMODE_WRITE) ||
77 (target_file->f_flags & O_APPEND)) { 84 (tmp_file->f_flags & O_APPEND)) {
78 error = XFS_ERROR(EBADF); 85 error = XFS_ERROR(EBADF);
79 goto out_put_target_file; 86 goto out_put_tmp_file;
80 } 87 }
81 88
82 if (IS_SWAPFILE(file->f_path.dentry->d_inode) || 89 if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
83 IS_SWAPFILE(target_file->f_path.dentry->d_inode)) { 90 IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) {
84 error = XFS_ERROR(EINVAL); 91 error = XFS_ERROR(EINVAL);
85 goto out_put_target_file; 92 goto out_put_tmp_file;
86 } 93 }
87 94
88 ip = XFS_I(file->f_path.dentry->d_inode); 95 ip = XFS_I(file->f_path.dentry->d_inode);
89 tip = XFS_I(target_file->f_path.dentry->d_inode); 96 tip = XFS_I(tmp_file->f_path.dentry->d_inode);
90 97
91 if (ip->i_mount != tip->i_mount) { 98 if (ip->i_mount != tip->i_mount) {
92 error = XFS_ERROR(EINVAL); 99 error = XFS_ERROR(EINVAL);
93 goto out_put_target_file; 100 goto out_put_tmp_file;
94 } 101 }
95 102
96 if (ip->i_ino == tip->i_ino) { 103 if (ip->i_ino == tip->i_ino) {
97 error = XFS_ERROR(EINVAL); 104 error = XFS_ERROR(EINVAL);
98 goto out_put_target_file; 105 goto out_put_tmp_file;
99 } 106 }
100 107
101 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 108 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
102 error = XFS_ERROR(EIO); 109 error = XFS_ERROR(EIO);
103 goto out_put_target_file; 110 goto out_put_tmp_file;
104 } 111 }
105 112
106 error = xfs_swap_extents(ip, tip, sxp); 113 error = xfs_swap_extents(ip, tip, sxp);
107 114
108 out_put_target_file: 115 out_put_tmp_file:
109 fput(target_file); 116 fput(tmp_file);
110 out_put_file: 117 out_put_file:
111 fput(file); 118 fput(file);
112 out: 119 out:
113 return error; 120 return error;
114} 121}
115 122
116int 123/*
124 * We need to check that the format of the data fork in the temporary inode is
125 * valid for the target inode before doing the swap. This is not a problem with
126 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
127 * data fork depending on the space the attribute fork is taking, so we can get
128 * invalid formats on the target inode.
129 *
130 * E.g. target has space for 7 extents in extent format, temp inode only has
131 * space for 6. If we defragment down to 7 extents, then the tmp format is a
132 * btree, but when swapped it needs to be in extent format. Hence we can't just
133 * blindly swap data forks on attr2 filesystems.
134 *
135 * Note that we check the swap in both directions so that we don't end up with
136 * a corrupt temporary inode, either.
137 *
138 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
139 * inode will prevent this situation from occurring, so all we do here is
140 * reject and log the attempt. Basically we are putting the responsibility on
141 * userspace to get this right.
142 */
143static int
144xfs_swap_extents_check_format(
145 xfs_inode_t *ip, /* target inode */
146 xfs_inode_t *tip) /* tmp inode */
147{
148
149 /* Should never get a local format */
150 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
151 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
152 return EINVAL;
153
154 /*
155 * if the target inode has fewer extents than the temporary inode then
156 * why did userspace call us?
157 */
158 if (ip->i_d.di_nextents < tip->i_d.di_nextents)
159 return EINVAL;
160
161 /*
162 * if the target inode is in extent form and the temp inode is in btree
163 * form then we will end up with the target inode in the wrong format
164 * as we already know there are fewer extents in the temp inode.
165 */
166 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
167 tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
168 return EINVAL;
169
170 /* Check temp in extent form to max in target */
171 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
172 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
173 return EINVAL;
174
175 /* Check target in extent form to max in temp */
176 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
177 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
178 return EINVAL;
179
180 /* Check root block of temp in btree form to max in target */
181 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
182 XFS_IFORK_BOFF(ip) &&
183 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
184 return EINVAL;
185
186 /* Check root block of target in btree form to max in temp */
187 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
188 XFS_IFORK_BOFF(tip) &&
189 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
190 return EINVAL;
191
192 return 0;
193}
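A worked instance of the asymmetry the block comment above describes (numbers illustrative, assuming the usual 16-byte xfs_bmbt_rec_t): a target whose data fork literal area is 112 bytes has if_ext_max = 112 / 16 = 7 in-core extent records, while a temp inode carrying a larger attribute fork may only have 96 / 16 = 6. Defragmenting a 7-extent file therefore leaves the temp inode's data fork in btree format, and a blind swap would hand the extent-format target a btree fork it cannot hold -- precisely the case the extent-vs-btree check above rejects.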
194
195static int
117xfs_swap_extents( 196xfs_swap_extents(
118 xfs_inode_t *ip, 197 xfs_inode_t *ip, /* target inode */
119 xfs_inode_t *tip, 198 xfs_inode_t *tip, /* tmp inode */
120 xfs_swapext_t *sxp) 199 xfs_swapext_t *sxp)
121{ 200{
122 xfs_mount_t *mp; 201 xfs_mount_t *mp;
@@ -160,15 +239,7 @@ xfs_swap_extents(
160 goto out_unlock; 239 goto out_unlock;
161 } 240 }
162 241
163 /* Should never get a local format */
164 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
165 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
166 error = XFS_ERROR(EINVAL);
167 goto out_unlock;
168 }
169
170 if (VN_CACHED(VFS_I(tip)) != 0) { 242 if (VN_CACHED(VFS_I(tip)) != 0) {
171 xfs_inval_cached_trace(tip, 0, -1, 0, -1);
172 error = xfs_flushinval_pages(tip, 0, -1, 243 error = xfs_flushinval_pages(tip, 0, -1,
173 FI_REMAPF_LOCKED); 244 FI_REMAPF_LOCKED);
174 if (error) 245 if (error)
@@ -189,13 +260,15 @@ xfs_swap_extents(
189 goto out_unlock; 260 goto out_unlock;
190 } 261 }
191 262
192 /* 263 trace_xfs_swap_extent_before(ip, 0);
193 * If the target has extended attributes, the tmp file 264 trace_xfs_swap_extent_before(tip, 1);
194 * must also in order to ensure the correct data fork 265
195 * format. 266 /* check inode formats now that data is flushed */
196 */ 267 error = xfs_swap_extents_check_format(ip, tip);
197 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { 268 if (error) {
198 error = XFS_ERROR(EINVAL); 269 xfs_fs_cmn_err(CE_NOTE, mp,
270 "%s: inode 0x%llx format is incompatible for exchanging.",
271 __FILE__, ip->i_ino);
199 goto out_unlock; 272 goto out_unlock;
200 } 273 }
201 274
@@ -276,6 +349,16 @@ xfs_swap_extents(
276 *tifp = *tempifp; /* struct copy */ 349 *tifp = *tempifp; /* struct copy */
277 350
278 /* 351 /*
352 * Fix the in-memory data fork values that are dependent on the fork
353 * offset in the inode. We can't assume they remain the same as attr2
354 * has dynamic fork offsets.
355 */
356 ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
357 (uint)sizeof(xfs_bmbt_rec_t);
358 tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
359 (uint)sizeof(xfs_bmbt_rec_t);
360
361 /*
279 * Fix the on-disk inode values 362 * Fix the on-disk inode values
280 */ 363 */
281 tmp = (__uint64_t)ip->i_d.di_nblocks; 364 tmp = (__uint64_t)ip->i_d.di_nblocks;
@@ -347,6 +430,8 @@ xfs_swap_extents(
347 430
348 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); 431 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
349 432
433 trace_xfs_swap_extent_after(ip, 0);
434 trace_xfs_swap_extent_after(tip, 1);
350out: 435out:
351 kmem_free(tempifp); 436 kmem_free(tempifp);
352 return error; 437 return error;
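Ordering note on the xfs_swap_extents() changes above: the format check runs only after both inodes' cached pages have been flushed, because writing back delayed allocations can convert extents and change di_nextents/di_format. A condensed sketch of the sequence as it now stands:

	/* flush dirty data so extent counts and formats are stable */
	error = xfs_flushinval_pages(tip, 0, -1, FI_REMAPF_LOCKED);
	if (error)
		goto out_unlock;

	/* only now is the compatibility check meaningful */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error)
		goto out_unlock;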
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index 4f55a6306558..20bdd935c121 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -48,9 +48,6 @@ typedef struct xfs_swapext
48 */ 48 */
49int xfs_swapext(struct xfs_swapext *sx); 49int xfs_swapext(struct xfs_swapext *sx);
50 50
51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
52 struct xfs_swapext *sxp);
53
54#endif /* __KERNEL__ */ 51#endif /* __KERNEL__ */
55 52
56#endif /* __XFS_DFRAG_H__ */ 53#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index bb1d58eb3982..42520f041265 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -40,11 +40,11 @@
40#include "xfs_dir2_leaf.h" 40#include "xfs_dir2_leaf.h"
41#include "xfs_dir2_block.h" 41#include "xfs_dir2_block.h"
42#include "xfs_dir2_node.h" 42#include "xfs_dir2_node.h"
43#include "xfs_dir2_trace.h"
44#include "xfs_error.h" 43#include "xfs_error.h"
45#include "xfs_vnodeops.h" 44#include "xfs_vnodeops.h"
45#include "xfs_trace.h"
46 46
47struct xfs_name xfs_name_dotdot = {"..", 2}; 47struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2};
48 48
49/* 49/*
50 * ASCII case-insensitive (ie. A-Z) support for directories that was 50 * ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -66,8 +66,8 @@ xfs_ascii_ci_hashname(
66STATIC enum xfs_dacmp 66STATIC enum xfs_dacmp
67xfs_ascii_ci_compname( 67xfs_ascii_ci_compname(
68 struct xfs_da_args *args, 68 struct xfs_da_args *args,
69 const char *name, 69 const unsigned char *name,
70 int len) 70 int len)
71{ 71{
72 enum xfs_dacmp result; 72 enum xfs_dacmp result;
73 int i; 73 int i;
@@ -247,7 +247,7 @@ xfs_dir_createname(
247int 247int
248xfs_dir_cilookup_result( 248xfs_dir_cilookup_result(
249 struct xfs_da_args *args, 249 struct xfs_da_args *args,
250 const char *name, 250 const unsigned char *name,
251 int len) 251 int len)
252{ 252{
253 if (args->cmpresult == XFS_CMP_DIFFERENT) 253 if (args->cmpresult == XFS_CMP_DIFFERENT)
@@ -525,7 +525,8 @@ xfs_dir2_grow_inode(
525 xfs_trans_t *tp; 525 xfs_trans_t *tp;
526 xfs_drfsbno_t nblks; 526 xfs_drfsbno_t nblks;
527 527
528 xfs_dir2_trace_args_s("grow_inode", args, space); 528 trace_xfs_dir2_grow_inode(args, space);
529
529 dp = args->dp; 530 dp = args->dp;
530 tp = args->trans; 531 tp = args->trans;
531 mp = dp->i_mount; 532 mp = dp->i_mount;
@@ -703,7 +704,8 @@ xfs_dir2_shrink_inode(
703 xfs_mount_t *mp; 704 xfs_mount_t *mp;
704 xfs_trans_t *tp; 705 xfs_trans_t *tp;
705 706
706 xfs_dir2_trace_args_db("shrink_inode", args, db, bp); 707 trace_xfs_dir2_shrink_inode(args, db);
708
707 dp = args->dp; 709 dp = args->dp;
708 mp = dp->i_mount; 710 mp = dp->i_mount;
709 tp = args->trans; 711 tp = args->trans;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 1d9ef96f33aa..74a3b1057685 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -100,7 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp,
100extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, 100extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
101 struct xfs_dabuf *bp); 101 struct xfs_dabuf *bp);
102 102
103extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name, 103extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
104 int len); 104 const unsigned char *name, int len);
105 105
106#endif /* __XFS_DIR2_H__ */ 106#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index ab52e9e1c1ee..779a267b0a84 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -36,8 +36,8 @@
36#include "xfs_dir2_data.h" 36#include "xfs_dir2_data.h"
37#include "xfs_dir2_leaf.h" 37#include "xfs_dir2_leaf.h"
38#include "xfs_dir2_block.h" 38#include "xfs_dir2_block.h"
39#include "xfs_dir2_trace.h"
40#include "xfs_error.h" 39#include "xfs_error.h"
40#include "xfs_trace.h"
41 41
42/* 42/*
43 * Local function prototypes. 43 * Local function prototypes.
@@ -57,8 +57,8 @@ static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
57void 57void
58xfs_dir_startup(void) 58xfs_dir_startup(void)
59{ 59{
60 xfs_dir_hash_dot = xfs_da_hashname(".", 1); 60 xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
61 xfs_dir_hash_dotdot = xfs_da_hashname("..", 2); 61 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
62} 62}
63 63
64/* 64/*
@@ -94,7 +94,8 @@ xfs_dir2_block_addname(
94 __be16 *tagp; /* pointer to tag value */ 94 __be16 *tagp; /* pointer to tag value */
95 xfs_trans_t *tp; /* transaction structure */ 95 xfs_trans_t *tp; /* transaction structure */
96 96
97 xfs_dir2_trace_args("block_addname", args); 97 trace_xfs_dir2_block_addname(args);
98
98 dp = args->dp; 99 dp = args->dp;
99 tp = args->trans; 100 tp = args->trans;
100 mp = dp->i_mount; 101 mp = dp->i_mount;
@@ -512,8 +513,9 @@ xfs_dir2_block_getdents(
512 /* 513 /*
513 * If it didn't fit, set the final offset to here & return. 514 * If it didn't fit, set the final offset to here & return.
514 */ 515 */
515 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, 516 if (filldir(dirent, (char *)dep->name, dep->namelen,
516 be64_to_cpu(dep->inumber), DT_UNKNOWN)) { 517 cook & 0x7fffffff, be64_to_cpu(dep->inumber),
518 DT_UNKNOWN)) {
517 *offset = cook & 0x7fffffff; 519 *offset = cook & 0x7fffffff;
518 xfs_da_brelse(NULL, bp); 520 xfs_da_brelse(NULL, bp);
519 return 0; 521 return 0;
@@ -590,7 +592,8 @@ xfs_dir2_block_lookup(
590 int error; /* error return value */ 592 int error; /* error return value */
591 xfs_mount_t *mp; /* filesystem mount point */ 593 xfs_mount_t *mp; /* filesystem mount point */
592 594
593 xfs_dir2_trace_args("block_lookup", args); 595 trace_xfs_dir2_block_lookup(args);
596
594 /* 597 /*
595 * Get the buffer, look up the entry. 598 * Get the buffer, look up the entry.
596 * If not found (ENOENT) then return, have no buffer. 599 * If not found (ENOENT) then return, have no buffer.
@@ -747,7 +750,8 @@ xfs_dir2_block_removename(
747 int size; /* shortform size */ 750 int size; /* shortform size */
748 xfs_trans_t *tp; /* transaction pointer */ 751 xfs_trans_t *tp; /* transaction pointer */
749 752
750 xfs_dir2_trace_args("block_removename", args); 753 trace_xfs_dir2_block_removename(args);
754
751 /* 755 /*
752 * Look up the entry in the block. Gets the buffer and entry index. 756 * Look up the entry in the block. Gets the buffer and entry index.
753 * It will always be there, the vnodeops level does a lookup first. 757 * It will always be there, the vnodeops level does a lookup first.
@@ -823,7 +827,8 @@ xfs_dir2_block_replace(
823 int error; /* error return value */ 827 int error; /* error return value */
824 xfs_mount_t *mp; /* filesystem mount point */ 828 xfs_mount_t *mp; /* filesystem mount point */
825 829
826 xfs_dir2_trace_args("block_replace", args); 830 trace_xfs_dir2_block_replace(args);
831
827 /* 832 /*
828 * Lookup the entry in the directory. Get buffer and entry index. 833 * Lookup the entry in the directory. Get buffer and entry index.
829 * This will always succeed since the caller has already done a lookup. 834 * This will always succeed since the caller has already done a lookup.
@@ -897,7 +902,8 @@ xfs_dir2_leaf_to_block(
897 int to; /* block/leaf to index */ 902 int to; /* block/leaf to index */
898 xfs_trans_t *tp; /* transaction pointer */ 903 xfs_trans_t *tp; /* transaction pointer */
899 904
900 xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp); 905 trace_xfs_dir2_leaf_to_block(args);
906
901 dp = args->dp; 907 dp = args->dp;
902 tp = args->trans; 908 tp = args->trans;
903 mp = dp->i_mount; 909 mp = dp->i_mount;
@@ -1044,7 +1050,8 @@ xfs_dir2_sf_to_block(
1044 xfs_trans_t *tp; /* transaction pointer */ 1050 xfs_trans_t *tp; /* transaction pointer */
1045 struct xfs_name name; 1051 struct xfs_name name;
1046 1052
1047 xfs_dir2_trace_args("sf_to_block", args); 1053 trace_xfs_dir2_sf_to_block(args);
1054
1048 dp = args->dp; 1055 dp = args->dp;
1049 tp = args->trans; 1056 tp = args->trans;
1050 mp = dp->i_mount; 1057 mp = dp->i_mount;
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 41ad537c49e9..e2d89854ec9e 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -38,8 +38,8 @@
38#include "xfs_dir2_leaf.h" 38#include "xfs_dir2_leaf.h"
39#include "xfs_dir2_block.h" 39#include "xfs_dir2_block.h"
40#include "xfs_dir2_node.h" 40#include "xfs_dir2_node.h"
41#include "xfs_dir2_trace.h"
42#include "xfs_error.h" 41#include "xfs_error.h"
42#include "xfs_trace.h"
43 43
44/* 44/*
45 * Local function declarations. 45 * Local function declarations.
@@ -80,7 +80,8 @@ xfs_dir2_block_to_leaf(
80 int needscan; /* need to rescan bestfree */ 80 int needscan; /* need to rescan bestfree */
81 xfs_trans_t *tp; /* transaction pointer */ 81 xfs_trans_t *tp; /* transaction pointer */
82 82
83 xfs_dir2_trace_args_b("block_to_leaf", args, dbp); 83 trace_xfs_dir2_block_to_leaf(args);
84
84 dp = args->dp; 85 dp = args->dp;
85 mp = dp->i_mount; 86 mp = dp->i_mount;
86 tp = args->trans; 87 tp = args->trans;
@@ -188,7 +189,8 @@ xfs_dir2_leaf_addname(
188 xfs_trans_t *tp; /* transaction pointer */ 189 xfs_trans_t *tp; /* transaction pointer */
189 xfs_dir2_db_t use_block; /* data block number */ 190 xfs_dir2_db_t use_block; /* data block number */
190 191
191 xfs_dir2_trace_args("leaf_addname", args); 192 trace_xfs_dir2_leaf_addname(args);
193
192 dp = args->dp; 194 dp = args->dp;
193 tp = args->trans; 195 tp = args->trans;
194 mp = dp->i_mount; 196 mp = dp->i_mount;
@@ -1079,7 +1081,7 @@ xfs_dir2_leaf_getdents(
1079 dep = (xfs_dir2_data_entry_t *)ptr; 1081 dep = (xfs_dir2_data_entry_t *)ptr;
1080 length = xfs_dir2_data_entsize(dep->namelen); 1082 length = xfs_dir2_data_entsize(dep->namelen);
1081 1083
1082 if (filldir(dirent, dep->name, dep->namelen, 1084 if (filldir(dirent, (char *)dep->name, dep->namelen,
1083 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, 1085 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
1084 be64_to_cpu(dep->inumber), DT_UNKNOWN)) 1086 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1085 break; 1087 break;
@@ -1266,7 +1268,8 @@ xfs_dir2_leaf_lookup(
1266 xfs_dir2_leaf_entry_t *lep; /* leaf entry */ 1268 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1267 xfs_trans_t *tp; /* transaction pointer */ 1269 xfs_trans_t *tp; /* transaction pointer */
1268 1270
1269 xfs_dir2_trace_args("leaf_lookup", args); 1271 trace_xfs_dir2_leaf_lookup(args);
1272
1270 /* 1273 /*
1271 * Look up name in the leaf block, returning both buffers and index. 1274 * Look up name in the leaf block, returning both buffers and index.
1272 */ 1275 */
@@ -1454,7 +1457,8 @@ xfs_dir2_leaf_removename(
1454 xfs_dir2_data_off_t oldbest; /* old value of best free */ 1457 xfs_dir2_data_off_t oldbest; /* old value of best free */
1455 xfs_trans_t *tp; /* transaction pointer */ 1458 xfs_trans_t *tp; /* transaction pointer */
1456 1459
1457 xfs_dir2_trace_args("leaf_removename", args); 1460 trace_xfs_dir2_leaf_removename(args);
1461
1458 /* 1462 /*
1459 * Lookup the leaf entry, get the leaf and data blocks read in. 1463 * Lookup the leaf entry, get the leaf and data blocks read in.
1460 */ 1464 */
@@ -1586,7 +1590,8 @@ xfs_dir2_leaf_replace(
1586 xfs_dir2_leaf_entry_t *lep; /* leaf entry */ 1590 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1587 xfs_trans_t *tp; /* transaction pointer */ 1591 xfs_trans_t *tp; /* transaction pointer */
1588 1592
1589 xfs_dir2_trace_args("leaf_replace", args); 1593 trace_xfs_dir2_leaf_replace(args);
1594
1590 /* 1595 /*
1591 * Look up the entry. 1596 * Look up the entry.
1592 */ 1597 */
@@ -1766,7 +1771,9 @@ xfs_dir2_node_to_leaf(
1766 if (state->path.active > 1) 1771 if (state->path.active > 1)
1767 return 0; 1772 return 0;
1768 args = state->args; 1773 args = state->args;
1769 xfs_dir2_trace_args("node_to_leaf", args); 1774
1775 trace_xfs_dir2_node_to_leaf(args);
1776
1770 mp = state->mp; 1777 mp = state->mp;
1771 dp = args->dp; 1778 dp = args->dp;
1772 tp = args->trans; 1779 tp = args->trans;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 5a81ccd1045b..78fc4d9ae756 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -37,8 +37,8 @@
37#include "xfs_dir2_leaf.h" 37#include "xfs_dir2_leaf.h"
38#include "xfs_dir2_block.h" 38#include "xfs_dir2_block.h"
39#include "xfs_dir2_node.h" 39#include "xfs_dir2_node.h"
40#include "xfs_dir2_trace.h"
41#include "xfs_error.h" 40#include "xfs_error.h"
41#include "xfs_trace.h"
42 42
43/* 43/*
44 * Function declarations. 44 * Function declarations.
@@ -65,7 +65,7 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
65/* 65/*
66 * Log entries from a freespace block. 66 * Log entries from a freespace block.
67 */ 67 */
68void 68STATIC void
69xfs_dir2_free_log_bests( 69xfs_dir2_free_log_bests(
70 xfs_trans_t *tp, /* transaction pointer */ 70 xfs_trans_t *tp, /* transaction pointer */
71 xfs_dabuf_t *bp, /* freespace buffer */ 71 xfs_dabuf_t *bp, /* freespace buffer */
@@ -123,7 +123,8 @@ xfs_dir2_leaf_to_node(
123 __be16 *to; /* pointer to freespace entry */ 123 __be16 *to; /* pointer to freespace entry */
124 xfs_trans_t *tp; /* transaction pointer */ 124 xfs_trans_t *tp; /* transaction pointer */
125 125
126 xfs_dir2_trace_args_b("leaf_to_node", args, lbp); 126 trace_xfs_dir2_leaf_to_node(args);
127
127 dp = args->dp; 128 dp = args->dp;
128 mp = dp->i_mount; 129 mp = dp->i_mount;
129 tp = args->trans; 130 tp = args->trans;
@@ -196,7 +197,8 @@ xfs_dir2_leafn_add(
196 xfs_mount_t *mp; /* filesystem mount point */ 197 xfs_mount_t *mp; /* filesystem mount point */
197 xfs_trans_t *tp; /* transaction pointer */ 198 xfs_trans_t *tp; /* transaction pointer */
198 199
199 xfs_dir2_trace_args_sb("leafn_add", args, index, bp); 200 trace_xfs_dir2_leafn_add(args, index);
201
200 dp = args->dp; 202 dp = args->dp;
201 mp = dp->i_mount; 203 mp = dp->i_mount;
202 tp = args->trans; 204 tp = args->trans;
@@ -711,8 +713,8 @@ xfs_dir2_leafn_moveents(
711 int stale; /* count stale leaves copied */ 713 int stale; /* count stale leaves copied */
712 xfs_trans_t *tp; /* transaction pointer */ 714 xfs_trans_t *tp; /* transaction pointer */
713 715
714 xfs_dir2_trace_args_bibii("leafn_moveents", args, bp_s, start_s, bp_d, 716 trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
715 start_d, count); 717
716 /* 718 /*
717 * Silently return if nothing to do. 719 * Silently return if nothing to do.
718 */ 720 */
@@ -933,7 +935,8 @@ xfs_dir2_leafn_remove(
933 int needscan; /* need to rescan data frees */ 935 int needscan; /* need to rescan data frees */
934 xfs_trans_t *tp; /* transaction pointer */ 936 xfs_trans_t *tp; /* transaction pointer */
935 937
936 xfs_dir2_trace_args_sb("leafn_remove", args, index, bp); 938 trace_xfs_dir2_leafn_remove(args, index);
939
937 dp = args->dp; 940 dp = args->dp;
938 tp = args->trans; 941 tp = args->trans;
939 mp = dp->i_mount; 942 mp = dp->i_mount;
@@ -1363,7 +1366,8 @@ xfs_dir2_node_addname(
1363 int rval; /* sub-return value */ 1366 int rval; /* sub-return value */
1364 xfs_da_state_t *state; /* btree cursor */ 1367 xfs_da_state_t *state; /* btree cursor */
1365 1368
1366 xfs_dir2_trace_args("node_addname", args); 1369 trace_xfs_dir2_node_addname(args);
1370
1367 /* 1371 /*
1368 * Allocate and initialize the state (btree cursor). 1372 * Allocate and initialize the state (btree cursor).
1369 */ 1373 */
@@ -1822,7 +1826,8 @@ xfs_dir2_node_lookup(
1822 int rval; /* operation return value */ 1826 int rval; /* operation return value */
1823 xfs_da_state_t *state; /* btree cursor */ 1827 xfs_da_state_t *state; /* btree cursor */
1824 1828
1825 xfs_dir2_trace_args("node_lookup", args); 1829 trace_xfs_dir2_node_lookup(args);
1830
1826 /* 1831 /*
1827 * Allocate and initialize the btree cursor. 1832 * Allocate and initialize the btree cursor.
1828 */ 1833 */
@@ -1875,7 +1880,8 @@ xfs_dir2_node_removename(
1875 int rval; /* operation return value */ 1880 int rval; /* operation return value */
1876 xfs_da_state_t *state; /* btree cursor */ 1881 xfs_da_state_t *state; /* btree cursor */
1877 1882
1878 xfs_dir2_trace_args("node_removename", args); 1883 trace_xfs_dir2_node_removename(args);
1884
1879 /* 1885 /*
1880 * Allocate and initialize the btree cursor. 1886 * Allocate and initialize the btree cursor.
1881 */ 1887 */
@@ -1944,7 +1950,8 @@ xfs_dir2_node_replace(
1944 int rval; /* internal return value */ 1950 int rval; /* internal return value */
1945 xfs_da_state_t *state; /* btree cursor */ 1951 xfs_da_state_t *state; /* btree cursor */
1946 1952
1947 xfs_dir2_trace_args("node_replace", args); 1953 trace_xfs_dir2_node_replace(args);
1954
1948 /* 1955 /*
1949 * Allocate and initialize the btree cursor. 1956 * Allocate and initialize the btree cursor.
1950 */ 1957 */
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
index dde72db3d695..82dfe7147195 100644
--- a/fs/xfs/xfs_dir2_node.h
+++ b/fs/xfs/xfs_dir2_node.h
@@ -75,8 +75,6 @@ xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
75 return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp)); 75 return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp));
76} 76}
77 77
78extern void xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
79 int first, int last);
80extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, 78extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
81 struct xfs_dabuf *lbp); 79 struct xfs_dabuf *lbp);
82extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); 80extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index e89734e84646..c1a5945d463a 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -37,7 +37,7 @@
37#include "xfs_dir2_data.h" 37#include "xfs_dir2_data.h"
38#include "xfs_dir2_leaf.h" 38#include "xfs_dir2_leaf.h"
39#include "xfs_dir2_block.h" 39#include "xfs_dir2_block.h"
40#include "xfs_dir2_trace.h" 40#include "xfs_trace.h"
41 41
42/* 42/*
43 * Prototypes for internal functions. 43 * Prototypes for internal functions.
@@ -169,7 +169,8 @@ xfs_dir2_block_to_sf(
169 xfs_dir2_sf_t *sfp; /* shortform structure */ 169 xfs_dir2_sf_t *sfp; /* shortform structure */
170 xfs_ino_t temp; 170 xfs_ino_t temp;
171 171
172 xfs_dir2_trace_args_sb("block_to_sf", args, size, bp); 172 trace_xfs_dir2_block_to_sf(args);
173
173 dp = args->dp; 174 dp = args->dp;
174 mp = dp->i_mount; 175 mp = dp->i_mount;
175 176
@@ -281,7 +282,8 @@ xfs_dir2_sf_addname(
281 xfs_dir2_sf_t *sfp; /* shortform structure */ 282 xfs_dir2_sf_t *sfp; /* shortform structure */
282 xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ 283 xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
283 284
284 xfs_dir2_trace_args("sf_addname", args); 285 trace_xfs_dir2_sf_addname(args);
286
285 ASSERT(xfs_dir2_sf_lookup(args) == ENOENT); 287 ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
286 dp = args->dp; 288 dp = args->dp;
287 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 289 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -654,7 +656,8 @@ xfs_dir2_sf_create(
654 xfs_dir2_sf_t *sfp; /* shortform structure */ 656 xfs_dir2_sf_t *sfp; /* shortform structure */
655 int size; /* directory size */ 657 int size; /* directory size */
656 658
657 xfs_dir2_trace_args_i("sf_create", args, pino); 659 trace_xfs_dir2_sf_create(args);
660
658 dp = args->dp; 661 dp = args->dp;
659 662
660 ASSERT(dp != NULL); 663 ASSERT(dp != NULL);
@@ -779,7 +782,7 @@ xfs_dir2_sf_getdents(
779 } 782 }
780 783
781 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); 784 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
782 if (filldir(dirent, sfep->name, sfep->namelen, 785 if (filldir(dirent, (char *)sfep->name, sfep->namelen,
783 off & 0x7fffffff, ino, DT_UNKNOWN)) { 786 off & 0x7fffffff, ino, DT_UNKNOWN)) {
784 *offset = off & 0x7fffffff; 787 *offset = off & 0x7fffffff;
785 return 0; 788 return 0;
@@ -808,7 +811,8 @@ xfs_dir2_sf_lookup(
808 enum xfs_dacmp cmp; /* comparison result */ 811 enum xfs_dacmp cmp; /* comparison result */
809 xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ 812 xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */
810 813
811 xfs_dir2_trace_args("sf_lookup", args); 814 trace_xfs_dir2_sf_lookup(args);
815
812 xfs_dir2_sf_check(args); 816 xfs_dir2_sf_check(args);
813 dp = args->dp; 817 dp = args->dp;
814 818
@@ -891,7 +895,8 @@ xfs_dir2_sf_removename(
891 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ 895 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
892 xfs_dir2_sf_t *sfp; /* shortform structure */ 896 xfs_dir2_sf_t *sfp; /* shortform structure */
893 897
894 xfs_dir2_trace_args("sf_removename", args); 898 trace_xfs_dir2_sf_removename(args);
899
895 dp = args->dp; 900 dp = args->dp;
896 901
897 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 902 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -982,7 +987,8 @@ xfs_dir2_sf_replace(
982 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ 987 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
983 xfs_dir2_sf_t *sfp; /* shortform structure */ 988 xfs_dir2_sf_t *sfp; /* shortform structure */
984 989
985 xfs_dir2_trace_args("sf_replace", args); 990 trace_xfs_dir2_sf_replace(args);
991
986 dp = args->dp; 992 dp = args->dp;
987 993
988 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 994 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -1125,7 +1131,8 @@ xfs_dir2_sf_toino4(
1125 xfs_dir2_sf_entry_t *sfep; /* new sf entry */ 1131 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1126 xfs_dir2_sf_t *sfp; /* new sf directory */ 1132 xfs_dir2_sf_t *sfp; /* new sf directory */
1127 1133
1128 xfs_dir2_trace_args("sf_toino4", args); 1134 trace_xfs_dir2_sf_toino4(args);
1135
1129 dp = args->dp; 1136 dp = args->dp;
1130 1137
1131 /* 1138 /*
@@ -1202,7 +1209,8 @@ xfs_dir2_sf_toino8(
1202 xfs_dir2_sf_entry_t *sfep; /* new sf entry */ 1209 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1203 xfs_dir2_sf_t *sfp; /* new sf directory */ 1210 xfs_dir2_sf_t *sfp; /* new sf directory */
1204 1211
1205 xfs_dir2_trace_args("sf_toino8", args); 1212 trace_xfs_dir2_sf_toino8(args);
1213
1206 dp = args->dp; 1214 dp = args->dp;
1207 1215
1208 /* 1216 /*
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
deleted file mode 100644
index 6cc7c0c681ac..000000000000
--- a/fs/xfs/xfs_dir2_trace.c
+++ /dev/null
@@ -1,216 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_inum.h"
22#include "xfs_dir2.h"
23#include "xfs_da_btree.h"
24#include "xfs_bmap_btree.h"
25#include "xfs_dir2_sf.h"
26#include "xfs_attr_sf.h"
27#include "xfs_dinode.h"
28#include "xfs_inode.h"
29#include "xfs_dir2_trace.h"
30
31#ifdef XFS_DIR2_TRACE
32ktrace_t *xfs_dir2_trace_buf;
33
34/*
35 * Enter something in the trace buffers.
36 */
37static void
38xfs_dir2_trace_enter(
39 xfs_inode_t *dp,
40 int type,
41 char *where,
42 char *name,
43 int namelen,
44 void *a0,
45 void *a1,
46 void *a2,
47 void *a3,
48 void *a4,
49 void *a5,
50 void *a6,
51 void *a7)
52{
53 void *n[5];
54
55 ASSERT(xfs_dir2_trace_buf);
56 ASSERT(dp->i_dir_trace);
57 if (name)
58 memcpy(n, name, min((int)sizeof(n), namelen));
59 else
60 memset((char *)n, 0, sizeof(n));
61 ktrace_enter(xfs_dir2_trace_buf,
62 (void *)(long)type, (void *)where,
63 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
64 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
65 (void *)(long)namelen,
66 (void *)n[0], (void *)n[1], (void *)n[2],
67 (void *)n[3], (void *)n[4]);
68 ktrace_enter(dp->i_dir_trace,
69 (void *)(long)type, (void *)where,
70 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
71 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
72 (void *)(long)namelen,
73 (void *)n[0], (void *)n[1], (void *)n[2],
74 (void *)n[3], (void *)n[4]);
75}
76
77void
78xfs_dir2_trace_args(
79 char *where,
80 xfs_da_args_t *args)
81{
82 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS, where,
83 (char *)args->name, (int)args->namelen,
84 (void *)(unsigned long)args->hashval,
85 (void *)((unsigned long)(args->inumber >> 32)),
86 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
87 (void *)args->dp, (void *)args->trans,
88 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
89 NULL, NULL);
90}
91
92void
93xfs_dir2_trace_args_b(
94 char *where,
95 xfs_da_args_t *args,
96 xfs_dabuf_t *bp)
97{
98 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_B, where,
99 (char *)args->name, (int)args->namelen,
100 (void *)(unsigned long)args->hashval,
101 (void *)((unsigned long)(args->inumber >> 32)),
102 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
103 (void *)args->dp, (void *)args->trans,
104 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
105 (void *)(bp ? bp->bps[0] : NULL), NULL);
106}
107
108void
109xfs_dir2_trace_args_bb(
110 char *where,
111 xfs_da_args_t *args,
112 xfs_dabuf_t *lbp,
113 xfs_dabuf_t *dbp)
114{
115 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BB, where,
116 (char *)args->name, (int)args->namelen,
117 (void *)(unsigned long)args->hashval,
118 (void *)((unsigned long)(args->inumber >> 32)),
119 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
120 (void *)args->dp, (void *)args->trans,
121 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
122 (void *)(lbp ? lbp->bps[0] : NULL),
123 (void *)(dbp ? dbp->bps[0] : NULL));
124}
125
126void
127xfs_dir2_trace_args_bibii(
128 char *where,
129 xfs_da_args_t *args,
130 xfs_dabuf_t *bs,
131 int ss,
132 xfs_dabuf_t *bd,
133 int sd,
134 int c)
135{
136 xfs_buf_t *bpbs = bs ? bs->bps[0] : NULL;
137 xfs_buf_t *bpbd = bd ? bd->bps[0] : NULL;
138
139 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BIBII, where,
140 (char *)args->name, (int)args->namelen,
141 (void *)args->dp, (void *)args->trans,
142 (void *)bpbs, (void *)(long)ss, (void *)bpbd, (void *)(long)sd,
143 (void *)(long)c, NULL);
144}
145
146void
147xfs_dir2_trace_args_db(
148 char *where,
149 xfs_da_args_t *args,
150 xfs_dir2_db_t db,
151 xfs_dabuf_t *bp)
152{
153 xfs_buf_t *dbp = bp ? bp->bps[0] : NULL;
154
155 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_DB, where,
156 (char *)args->name, (int)args->namelen,
157 (void *)(unsigned long)args->hashval,
158 (void *)((unsigned long)(args->inumber >> 32)),
159 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
160 (void *)args->dp, (void *)args->trans,
161 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
162 (void *)(long)db, (void *)dbp);
163}
164
165void
166xfs_dir2_trace_args_i(
167 char *where,
168 xfs_da_args_t *args,
169 xfs_ino_t i)
170{
171 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_I, where,
172 (char *)args->name, (int)args->namelen,
173 (void *)(unsigned long)args->hashval,
174 (void *)((unsigned long)(args->inumber >> 32)),
175 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
176 (void *)args->dp, (void *)args->trans,
177 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
178 (void *)((unsigned long)(i >> 32)),
179 (void *)((unsigned long)(i & 0xFFFFFFFF)));
180}
181
182void
183xfs_dir2_trace_args_s(
184 char *where,
185 xfs_da_args_t *args,
186 int s)
187{
188 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_S, where,
189 (char *)args->name, (int)args->namelen,
190 (void *)(unsigned long)args->hashval,
191 (void *)((unsigned long)(args->inumber >> 32)),
192 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
193 (void *)args->dp, (void *)args->trans,
194 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
195 (void *)(long)s, NULL);
196}
197
198void
199xfs_dir2_trace_args_sb(
200 char *where,
201 xfs_da_args_t *args,
202 int s,
203 xfs_dabuf_t *bp)
204{
205 xfs_buf_t *dbp = bp ? bp->bps[0] : NULL;
206
207 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_SB, where,
208 (char *)args->name, (int)args->namelen,
209 (void *)(unsigned long)args->hashval,
210 (void *)((unsigned long)(args->inumber >> 32)),
211 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
212 (void *)args->dp, (void *)args->trans,
213 (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
214 (void *)(long)s, (void *)dbp);
215}
216#endif /* XFS_DIR2_TRACE */
diff --git a/fs/xfs/xfs_dir2_trace.h b/fs/xfs/xfs_dir2_trace.h
deleted file mode 100644
index ca3c754f4822..000000000000
--- a/fs/xfs/xfs_dir2_trace.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DIR2_TRACE_H__
19#define __XFS_DIR2_TRACE_H__
20
21/*
22 * Tracing for xfs v2 directories.
23 */
24
25#if defined(XFS_DIR2_TRACE)
26
27struct ktrace;
28struct xfs_dabuf;
29struct xfs_da_args;
30
31#define XFS_DIR2_GTRACE_SIZE 4096 /* global buffer */
32#define XFS_DIR2_KTRACE_SIZE 32 /* per-inode buffer */
33extern struct ktrace *xfs_dir2_trace_buf;
34
35#define XFS_DIR2_KTRACE_ARGS 1 /* args only */
36#define XFS_DIR2_KTRACE_ARGS_B 2 /* args + buffer */
37#define XFS_DIR2_KTRACE_ARGS_BB 3 /* args + 2 buffers */
38#define XFS_DIR2_KTRACE_ARGS_DB 4 /* args, db, buffer */
39#define XFS_DIR2_KTRACE_ARGS_I 5 /* args, inum */
40#define XFS_DIR2_KTRACE_ARGS_S 6 /* args, int */
41#define XFS_DIR2_KTRACE_ARGS_SB 7 /* args, int, buffer */
42#define XFS_DIR2_KTRACE_ARGS_BIBII 8 /* args, buf/int/buf/int/int */
43
44void xfs_dir2_trace_args(char *where, struct xfs_da_args *args);
45void xfs_dir2_trace_args_b(char *where, struct xfs_da_args *args,
46 struct xfs_dabuf *bp);
47void xfs_dir2_trace_args_bb(char *where, struct xfs_da_args *args,
48 struct xfs_dabuf *lbp, struct xfs_dabuf *dbp);
49void xfs_dir2_trace_args_bibii(char *where, struct xfs_da_args *args,
50 struct xfs_dabuf *bs, int ss,
51 struct xfs_dabuf *bd, int sd, int c);
52void xfs_dir2_trace_args_db(char *where, struct xfs_da_args *args,
53 xfs_dir2_db_t db, struct xfs_dabuf *bp);
54void xfs_dir2_trace_args_i(char *where, struct xfs_da_args *args, xfs_ino_t i);
55void xfs_dir2_trace_args_s(char *where, struct xfs_da_args *args, int s);
56void xfs_dir2_trace_args_sb(char *where, struct xfs_da_args *args, int s,
57 struct xfs_dabuf *bp);
58
59#else /* XFS_DIR2_TRACE */
60
61#define xfs_dir2_trace_args(where, args)
62#define xfs_dir2_trace_args_b(where, args, bp)
63#define xfs_dir2_trace_args_bb(where, args, lbp, dbp)
64#define xfs_dir2_trace_args_bibii(where, args, bs, ss, bd, sd, c)
65#define xfs_dir2_trace_args_db(where, args, db, bp)
66#define xfs_dir2_trace_args_i(where, args, i)
67#define xfs_dir2_trace_args_s(where, args, s)
68#define xfs_dir2_trace_args_sb(where, args, s, bp)
69
70#endif /* XFS_DIR2_TRACE */
71
72#endif /* __XFS_DIR2_TRACE_H__ */
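Each of the eight deleted xfs_dir2_trace_args* variants maps onto a per-operation tracepoint (trace_xfs_dir2_block_addname() and friends, seen earlier in this diff). A hedged sketch of the shared event class they plausibly hang off in xfs_trace.h, reusing the XFS_DA_OP_FLAGS print table added to xfs_da_btree.h above; the real declaration may carry extra fields:

	DECLARE_EVENT_CLASS(xfs_da_class,
		TP_PROTO(struct xfs_da_args *args),
		TP_ARGS(args),
		TP_STRUCT__entry(
			__dynamic_array(char, name, args->namelen)
			__field(int, namelen)
			__field(xfs_dahash_t, hashval)
			__field(xfs_ino_t, inumber)
			__field(int, op_flags)
		),
		TP_fast_assign(
			if (args->namelen)
				memcpy(__get_str(name), args->name, args->namelen);
			__entry->namelen = args->namelen;
			__entry->hashval = args->hashval;
			__entry->inumber = args->inumber;
			__entry->op_flags = args->op_flags;
		),
		TP_printk("name %.*s namelen %d hashval 0x%x inumber 0x%llx op_flags %s",
			  __entry->namelen, __get_str(name), __entry->namelen,
			  __entry->hashval, __entry->inumber,
			  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
	);
	DEFINE_EVENT(xfs_da_class, xfs_dir2_block_addname,
		TP_PROTO(struct xfs_da_args *args),
		TP_ARGS(args));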
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 05a4bdd4be39..6f35ed1b39b9 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -82,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
82 82
83 log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format); 83 log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
84 log_vector->i_len = size; 84 log_vector->i_len = size;
85 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT); 85 log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
86 ASSERT(size >= sizeof(xfs_efi_log_format_t)); 86 ASSERT(size >= sizeof(xfs_efi_log_format_t));
87} 87}
88 88
@@ -406,7 +406,7 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp,
406 406
407 log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format); 407 log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
408 log_vector->i_len = size; 408 log_vector->i_len = size;
409 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT); 409 log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
410 ASSERT(size >= sizeof(xfs_efd_log_format_t)); 410 ASSERT(size >= sizeof(xfs_efd_log_format_t));
411} 411}
412 412
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index edf8bdf4141f..390850ee6603 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -34,6 +34,7 @@
34#include "xfs_utils.h" 34#include "xfs_utils.h"
35#include "xfs_mru_cache.h" 35#include "xfs_mru_cache.h"
36#include "xfs_filestream.h" 36#include "xfs_filestream.h"
37#include "xfs_trace.h"
37 38
38#ifdef XFS_FILESTREAMS_TRACE 39#ifdef XFS_FILESTREAMS_TRACE
39 40
@@ -139,6 +140,7 @@ _xfs_filestream_pick_ag(
139 int flags, 140 int flags,
140 xfs_extlen_t minlen) 141 xfs_extlen_t minlen)
141{ 142{
143 int streams, max_streams;
142 int err, trylock, nscan; 144 int err, trylock, nscan;
143 xfs_extlen_t longest, free, minfree, maxfree = 0; 145 xfs_extlen_t longest, free, minfree, maxfree = 0;
144 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 146 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
@@ -154,15 +156,15 @@ _xfs_filestream_pick_ag(
154 trylock = XFS_ALLOC_FLAG_TRYLOCK; 156 trylock = XFS_ALLOC_FLAG_TRYLOCK;
155 157
156 for (nscan = 0; 1; nscan++) { 158 for (nscan = 0; 1; nscan++) {
157 159 pag = xfs_perag_get(mp, ag);
158 TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag)); 160 TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
159
160 pag = mp->m_perag + ag;
161 161
162 if (!pag->pagf_init) { 162 if (!pag->pagf_init) {
163 err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); 163 err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
164 if (err && !trylock) 164 if (err && !trylock) {
165 xfs_perag_put(pag);
165 return err; 166 return err;
167 }
166 } 168 }
167 169
168 /* Might fail sometimes during the 1st pass with trylock set. */ 170 /* Might fail sometimes during the 1st pass with trylock set. */
@@ -172,6 +174,7 @@ _xfs_filestream_pick_ag(
172 /* Keep track of the AG with the most free blocks. */ 174 /* Keep track of the AG with the most free blocks. */
173 if (pag->pagf_freeblks > maxfree) { 175 if (pag->pagf_freeblks > maxfree) {
174 maxfree = pag->pagf_freeblks; 176 maxfree = pag->pagf_freeblks;
177 max_streams = atomic_read(&pag->pagf_fstrms);
175 max_ag = ag; 178 max_ag = ag;
176 } 179 }
177 180
@@ -194,6 +197,8 @@ _xfs_filestream_pick_ag(
194 197
195 /* Break out, retaining the reference on the AG. */ 198 /* Break out, retaining the reference on the AG. */
196 free = pag->pagf_freeblks; 199 free = pag->pagf_freeblks;
200 streams = atomic_read(&pag->pagf_fstrms);
201 xfs_perag_put(pag);
197 *agp = ag; 202 *agp = ag;
198 break; 203 break;
199 } 204 }
@@ -201,6 +206,7 @@ _xfs_filestream_pick_ag(
201 /* Drop the reference on this AG, it's not usable. */ 206 /* Drop the reference on this AG, it's not usable. */
202 xfs_filestream_put_ag(mp, ag); 207 xfs_filestream_put_ag(mp, ag);
203next_ag: 208next_ag:
209 xfs_perag_put(pag);
204 /* Move to the next AG, wrapping to AG 0 if necessary. */ 210 /* Move to the next AG, wrapping to AG 0 if necessary. */
205 if (++ag >= mp->m_sb.sb_agcount) 211 if (++ag >= mp->m_sb.sb_agcount)
206 ag = 0; 212 ag = 0;
@@ -228,6 +234,7 @@ next_ag:
228 if (max_ag != NULLAGNUMBER) { 234 if (max_ag != NULLAGNUMBER) {
229 xfs_filestream_get_ag(mp, max_ag); 235 xfs_filestream_get_ag(mp, max_ag);
230 TRACE_AG_PICK1(mp, max_ag, maxfree); 236 TRACE_AG_PICK1(mp, max_ag, maxfree);
237 streams = max_streams;
231 free = maxfree; 238 free = maxfree;
232 *agp = max_ag; 239 *agp = max_ag;
233 break; 240 break;
@@ -239,16 +246,14 @@ next_ag:
239 return 0; 246 return 0;
240 } 247 }
241 248
242 TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp), 249 TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
243 free, nscan, flags);
244 250
245 return 0; 251 return 0;
246} 252}
247 253
248/* 254/*
249 * Set the allocation group number for a file or a directory, updating inode 255 * Set the allocation group number for a file or a directory, updating inode
250 * references and per-AG references as appropriate. Must be called with the 256 * references and per-AG references as appropriate.
251 * m_peraglock held in read mode.
252 */ 257 */
253static int 258static int
254_xfs_filestream_update_ag( 259_xfs_filestream_update_ag(
@@ -394,9 +399,7 @@ xfs_filestream_init(void)
394 item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); 399 item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
395 if (!item_zone) 400 if (!item_zone)
396 return -ENOMEM; 401 return -ENOMEM;
397#ifdef XFS_FILESTREAMS_TRACE 402
398 xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS);
399#endif
400 return 0; 403 return 0;
401} 404}
402 405
@@ -407,9 +410,6 @@ xfs_filestream_init(void)
407void 410void
408xfs_filestream_uninit(void) 411xfs_filestream_uninit(void)
409{ 412{
410#ifdef XFS_FILESTREAMS_TRACE
411 ktrace_free(xfs_filestreams_trace_buf);
412#endif
413 kmem_zone_destroy(item_zone); 413 kmem_zone_destroy(item_zone);
414} 414}
415 415
@@ -455,20 +455,6 @@ xfs_filestream_unmount(
455} 455}
456 456
457/* 457/*
458 * If the mount point's m_perag array is going to be reallocated, all
459 * outstanding cache entries must be flushed to avoid accessing reference count
460 * addresses that have been freed. The call to xfs_filestream_flush() must be
461 * made inside the block that holds the m_peraglock in write mode to do the
462 * reallocation.
463 */
464void
465xfs_filestream_flush(
466 xfs_mount_t *mp)
467{
468 xfs_mru_cache_flush(mp->m_filestream);
469}
470
471/*
472 * Return the AG of the filestream the file or directory belongs to, or 458 * Return the AG of the filestream the file or directory belongs to, or
473 * NULLAGNUMBER otherwise. 459 * NULLAGNUMBER otherwise.
474 */ 460 */
@@ -530,7 +516,6 @@ xfs_filestream_associate(
530 516
531 mp = pip->i_mount; 517 mp = pip->i_mount;
532 cache = mp->m_filestream; 518 cache = mp->m_filestream;
533 down_read(&mp->m_peraglock);
534 519
535 /* 520 /*
536 * We have a problem, Houston. 521 * We have a problem, Houston.
@@ -547,10 +532,8 @@ xfs_filestream_associate(
547 * 532 *
548 * So, if we can't get the iolock without sleeping then just give up 533 * So, if we can't get the iolock without sleeping then just give up
549 */ 534 */
550 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { 535 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
551 up_read(&mp->m_peraglock);
552 return 1; 536 return 1;
553 }
554 537
555 /* If the parent directory is already in the cache, use its AG. */ 538 /* If the parent directory is already in the cache, use its AG. */
556 item = xfs_mru_cache_lookup(cache, pip->i_ino); 539 item = xfs_mru_cache_lookup(cache, pip->i_ino);
@@ -605,7 +588,6 @@ exit_did_pick:
605 588
606exit: 589exit:
607 xfs_iunlock(pip, XFS_IOLOCK_EXCL); 590 xfs_iunlock(pip, XFS_IOLOCK_EXCL);
608 up_read(&mp->m_peraglock);
609 return -err; 591 return -err;
610} 592}
611 593
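
The _xfs_filestream_pick_ag() hunks above keep the function's two-pass shape while adding the perag references: a first sweep over the AGs with trylock semantics to avoid blocking, then a blocking second sweep if nothing suitable was found. A stripped-down sketch of that control flow; all names here are hypothetical stand-ins, and the real function also weighs free space and stream counts:

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_AG 8

    /* Stand-in predicate: with trylock set, contended AGs look unusable. */
    static bool ag_usable(int ag, bool trylock)
    {
        return !trylock && (ag % 3 == 0);
    }

    static int pick_ag(int startag)
    {
        bool trylock = true;   /* first pass must not block */
        int ag = startag;

        for (;;) {
            if (ag_usable(ag, trylock))
                return ag;
            if (++ag >= NR_AG)
                ag = 0;                 /* wrap to AG 0, as in the diff */
            if (ag == startag) {
                if (!trylock)
                    return -1;          /* both passes failed */
                trylock = false;        /* retry, allowed to block */
            }
        }
    }

    int main(void)
    {
        printf("picked AG %d\n", pick_ag(5));
        return 0;
    }
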
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index f655f7dc334c..260f757bbc5d 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -79,28 +79,49 @@ extern ktrace_t *xfs_filestreams_trace_buf;
79 * the cache that reference per-ag array elements that have since been 79 * the cache that reference per-ag array elements that have since been
80 * reallocated. 80 * reallocated.
81 */ 81 */
82STATIC_INLINE int 82/*
83 * xfs_filestream_peek_ag is only used in tracing code
84 */
85static inline int
83xfs_filestream_peek_ag( 86xfs_filestream_peek_ag(
84 xfs_mount_t *mp, 87 xfs_mount_t *mp,
85 xfs_agnumber_t agno) 88 xfs_agnumber_t agno)
86{ 89{
87 return atomic_read(&mp->m_perag[agno].pagf_fstrms); 90 struct xfs_perag *pag;
91 int ret;
92
93 pag = xfs_perag_get(mp, agno);
94 ret = atomic_read(&pag->pagf_fstrms);
95 xfs_perag_put(pag);
96 return ret;
88} 97}
89 98
90STATIC_INLINE int 99static inline int
91xfs_filestream_get_ag( 100xfs_filestream_get_ag(
92 xfs_mount_t *mp, 101 xfs_mount_t *mp,
93 xfs_agnumber_t agno) 102 xfs_agnumber_t agno)
94{ 103{
95 return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); 104 struct xfs_perag *pag;
105 int ret;
106
107 pag = xfs_perag_get(mp, agno);
108 ret = atomic_inc_return(&pag->pagf_fstrms);
109 xfs_perag_put(pag);
110 return ret;
96} 111}
97 112
98STATIC_INLINE int 113static inline int
99xfs_filestream_put_ag( 114xfs_filestream_put_ag(
100 xfs_mount_t *mp, 115 xfs_mount_t *mp,
101 xfs_agnumber_t agno) 116 xfs_agnumber_t agno)
102{ 117{
103 return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms); 118 struct xfs_perag *pag;
119 int ret;
120
121 pag = xfs_perag_get(mp, agno);
122 ret = atomic_dec_return(&pag->pagf_fstrms);
123 xfs_perag_put(pag);
124 return ret;
104} 125}
105 126
106/* allocation selection flags */ 127/* allocation selection flags */
@@ -114,7 +135,6 @@ int xfs_filestream_init(void);
114void xfs_filestream_uninit(void); 135void xfs_filestream_uninit(void);
115int xfs_filestream_mount(struct xfs_mount *mp); 136int xfs_filestream_mount(struct xfs_mount *mp);
116void xfs_filestream_unmount(struct xfs_mount *mp); 137void xfs_filestream_unmount(struct xfs_mount *mp);
117void xfs_filestream_flush(struct xfs_mount *mp);
118xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); 138xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
119int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); 139int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
120void xfs_filestream_deassociate(struct xfs_inode *ip); 140void xfs_filestream_deassociate(struct xfs_inode *ip);
@@ -122,7 +142,7 @@ int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
122 142
123 143
124/* filestreams for the inode? */ 144/* filestreams for the inode? */
125STATIC_INLINE int 145static inline int
126xfs_inode_is_filestream( 146xfs_inode_is_filestream(
127 struct xfs_inode *ip) 147 struct xfs_inode *ip)
128{ 148{
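
The inline helpers above capture the conversion this patch makes throughout XFS: instead of taking the global m_peraglock and indexing m_perag[] directly, callers look up a reference-counted per-AG structure and drop it when done. A user-space sketch of the get/put pattern; the bodies are illustrative only, since the kernel implementation looks the structure up in a radix tree:

    #include <stdatomic.h>
    #include <assert.h>

    struct perag {
        atomic_int ref;           /* active references */
        atomic_int pagf_fstrms;   /* filestream count, as in the diff */
    };

    struct mount {
        struct perag *perags;     /* indexed by AG number */
        int agcount;
    };

    static struct perag *perag_get(struct mount *mp, int agno)
    {
        assert(agno >= 0 && agno < mp->agcount);
        atomic_fetch_add(&mp->perags[agno].ref, 1);
        return &mp->perags[agno];
    }

    static void perag_put(struct perag *pag)
    {
        int old = atomic_fetch_sub(&pag->ref, 1);
        assert(old > 0);   /* a put without a matching get is a bug */
    }

    /* Usage mirroring xfs_filestream_peek_ag() after the conversion. */
    static int peek_ag_streams(struct mount *mp, int agno)
    {
        struct perag *pag = perag_get(mp, agno);
        int ret = atomic_load(&pag->pagf_fstrms);

        perag_put(pag);
        return ret;
    }

    int main(void)
    {
        struct perag ags[4] = { 0 };
        struct mount mp = { .perags = ags, .agcount = 4 };

        atomic_store(&ags[2].pagf_fstrms, 3);
        return peek_ag_streams(&mp, 2) == 3 ? 0 : 1;
    }
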
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index f52ac276277e..7cf7220e7d5f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -292,7 +292,8 @@ typedef struct xfs_bstat {
292 __s32 bs_extents; /* number of extents */ 292 __s32 bs_extents; /* number of extents */
293 __u32 bs_gen; /* generation count */ 293 __u32 bs_gen; /* generation count */
294 __u16 bs_projid; /* project id */ 294 __u16 bs_projid; /* project id */
295 unsigned char bs_pad[14]; /* pad space, unused */ 295 __u16 bs_forkoff; /* inode fork offset in bytes */
296 unsigned char bs_pad[12]; /* pad space, unused */
296 __u32 bs_dmevmask; /* DMIG event mask */ 297 __u32 bs_dmevmask; /* DMIG event mask */
297 __u16 bs_dmstate; /* DMIG state info */ 298 __u16 bs_dmstate; /* DMIG state info */
298 __u16 bs_aextents; /* attribute number of extents */ 299 __u16 bs_aextents; /* attribute number of extents */
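
The xfs_bstat change above carves the new bs_forkoff field out of the existing padding (bs_pad shrinks from 14 to 12 bytes), so field offsets and the structure size are unchanged for existing user-space binaries. A reduced sketch with compile-time checks; the structs here are a cut-down illustration, not the real xfs_bstat:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct bstat_old {
        uint16_t      bs_projid;
        unsigned char bs_pad[14];   /* pad space, unused */
        uint32_t      bs_dmevmask;
    };

    struct bstat_new {
        uint16_t      bs_projid;
        uint16_t      bs_forkoff;   /* carved out of the old pad */
        unsigned char bs_pad[12];
        uint32_t      bs_dmevmask;
    };

    /* The ABI contract: same size, and later fields do not move. */
    static_assert(sizeof(struct bstat_old) == sizeof(struct bstat_new),
                  "size must not change");
    static_assert(offsetof(struct bstat_old, bs_dmevmask) ==
                  offsetof(struct bstat_new, bs_dmevmask),
                  "bs_dmevmask must not move");
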
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 2d0b3e1da9e6..37a6f62c57b6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -45,6 +45,7 @@
45#include "xfs_rtalloc.h" 45#include "xfs_rtalloc.h"
46#include "xfs_rw.h" 46#include "xfs_rw.h"
47#include "xfs_filestream.h" 47#include "xfs_filestream.h"
48#include "xfs_trace.h"
48 49
49/* 50/*
50 * File system operations 51 * File system operations
@@ -166,27 +167,14 @@ xfs_growfs_data_private(
166 } 167 }
167 new = nb - mp->m_sb.sb_dblocks; 168 new = nb - mp->m_sb.sb_dblocks;
168 oagcount = mp->m_sb.sb_agcount; 169 oagcount = mp->m_sb.sb_agcount;
169 if (nagcount > oagcount) {
170 void *new_perag, *old_perag;
171
172 xfs_filestream_flush(mp);
173
174 new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
175 KM_MAYFAIL);
176 if (!new_perag)
177 return XFS_ERROR(ENOMEM);
178
179 down_write(&mp->m_peraglock);
180 memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
181 old_perag = mp->m_perag;
182 mp->m_perag = new_perag;
183
184 mp->m_flags |= XFS_MOUNT_32BITINODES;
185 nagimax = xfs_initialize_perag(mp, nagcount);
186 up_write(&mp->m_peraglock);
187 170
188 kmem_free(old_perag); 171 /* allocate the new per-ag structures */
172 if (nagcount > oagcount) {
173 error = xfs_initialize_perag(mp, nagcount, &nagimax);
174 if (error)
175 return error;
189 } 176 }
177
190 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); 178 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
191 tp->t_flags |= XFS_TRANS_RESERVE; 179 tp->t_flags |= XFS_TRANS_RESERVE;
192 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), 180 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
@@ -195,14 +183,19 @@ xfs_growfs_data_private(
195 return error; 183 return error;
196 } 184 }
197 185
186 /*
187 * Write new AG headers to disk. Non-transactional, but written
188 * synchronously so they are completed prior to the growfs transaction
189 * being logged.
190 */
198 nfree = 0; 191 nfree = 0;
199 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { 192 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
200 /* 193 /*
201 * AG freelist header block 194 * AG freelist header block
202 */ 195 */
203 bp = xfs_buf_get(mp->m_ddev_targp, 196 bp = xfs_buf_get(mp->m_ddev_targp,
204 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 197 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
205 XFS_FSS_TO_BB(mp, 1), 0); 198 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
206 agf = XFS_BUF_TO_AGF(bp); 199 agf = XFS_BUF_TO_AGF(bp);
207 memset(agf, 0, mp->m_sb.sb_sectsize); 200 memset(agf, 0, mp->m_sb.sb_sectsize);
208 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); 201 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -233,8 +226,8 @@ xfs_growfs_data_private(
233 * AG inode header block 226 * AG inode header block
234 */ 227 */
235 bp = xfs_buf_get(mp->m_ddev_targp, 228 bp = xfs_buf_get(mp->m_ddev_targp,
236 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 229 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
237 XFS_FSS_TO_BB(mp, 1), 0); 230 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
238 agi = XFS_BUF_TO_AGI(bp); 231 agi = XFS_BUF_TO_AGI(bp);
239 memset(agi, 0, mp->m_sb.sb_sectsize); 232 memset(agi, 0, mp->m_sb.sb_sectsize);
240 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); 233 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -257,8 +250,9 @@ xfs_growfs_data_private(
257 * BNO btree root block 250 * BNO btree root block
258 */ 251 */
259 bp = xfs_buf_get(mp->m_ddev_targp, 252 bp = xfs_buf_get(mp->m_ddev_targp,
260 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), 253 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
261 BTOBB(mp->m_sb.sb_blocksize), 0); 254 BTOBB(mp->m_sb.sb_blocksize),
255 XBF_LOCK | XBF_MAPPED);
262 block = XFS_BUF_TO_BLOCK(bp); 256 block = XFS_BUF_TO_BLOCK(bp);
263 memset(block, 0, mp->m_sb.sb_blocksize); 257 memset(block, 0, mp->m_sb.sb_blocksize);
264 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); 258 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
@@ -278,8 +272,9 @@ xfs_growfs_data_private(
278 * CNT btree root block 272 * CNT btree root block
279 */ 273 */
280 bp = xfs_buf_get(mp->m_ddev_targp, 274 bp = xfs_buf_get(mp->m_ddev_targp,
281 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), 275 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
282 BTOBB(mp->m_sb.sb_blocksize), 0); 276 BTOBB(mp->m_sb.sb_blocksize),
277 XBF_LOCK | XBF_MAPPED);
283 block = XFS_BUF_TO_BLOCK(bp); 278 block = XFS_BUF_TO_BLOCK(bp);
284 memset(block, 0, mp->m_sb.sb_blocksize); 279 memset(block, 0, mp->m_sb.sb_blocksize);
285 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); 280 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
@@ -300,8 +295,9 @@ xfs_growfs_data_private(
300 * INO btree root block 295 * INO btree root block
301 */ 296 */
302 bp = xfs_buf_get(mp->m_ddev_targp, 297 bp = xfs_buf_get(mp->m_ddev_targp,
303 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), 298 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
304 BTOBB(mp->m_sb.sb_blocksize), 0); 299 BTOBB(mp->m_sb.sb_blocksize),
300 XBF_LOCK | XBF_MAPPED);
305 block = XFS_BUF_TO_BLOCK(bp); 301 block = XFS_BUF_TO_BLOCK(bp);
306 memset(block, 0, mp->m_sb.sb_blocksize); 302 memset(block, 0, mp->m_sb.sb_blocksize);
307 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); 303 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
@@ -344,6 +340,7 @@ xfs_growfs_data_private(
344 be32_add_cpu(&agf->agf_length, new); 340 be32_add_cpu(&agf->agf_length, new);
345 ASSERT(be32_to_cpu(agf->agf_length) == 341 ASSERT(be32_to_cpu(agf->agf_length) ==
346 be32_to_cpu(agi->agi_length)); 342 be32_to_cpu(agi->agi_length));
343
347 xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); 344 xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
348 /* 345 /*
349 * Free the new space. 346 * Free the new space.
@@ -354,6 +351,12 @@ xfs_growfs_data_private(
354 goto error0; 351 goto error0;
355 } 352 }
356 } 353 }
354
355 /*
356 * Update changed superblock fields transactionally. These are not
357 * seen by the rest of the world until the transaction commit applies
358 * them atomically to the superblock.
359 */
357 if (nagcount > oagcount) 360 if (nagcount > oagcount)
358 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); 361 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
359 if (nb > mp->m_sb.sb_dblocks) 362 if (nb > mp->m_sb.sb_dblocks)
@@ -364,9 +367,9 @@ xfs_growfs_data_private(
364 if (dpct) 367 if (dpct)
365 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); 368 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
366 error = xfs_trans_commit(tp, 0); 369 error = xfs_trans_commit(tp, 0);
367 if (error) { 370 if (error)
368 return error; 371 return error;
369 } 372
370 /* New allocation groups fully initialized, so update mount struct */ 373 /* New allocation groups fully initialized, so update mount struct */
371 if (nagimax) 374 if (nagimax)
372 mp->m_maxagi = nagimax; 375 mp->m_maxagi = nagimax;
@@ -376,6 +379,8 @@ xfs_growfs_data_private(
376 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 379 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
377 } else 380 } else
378 mp->m_maxicount = 0; 381 mp->m_maxicount = 0;
382
383 /* update secondary superblocks. */
379 for (agno = 1; agno < nagcount; agno++) { 384 for (agno = 1; agno < nagcount; agno++) {
380 error = xfs_read_buf(mp, mp->m_ddev_targp, 385 error = xfs_read_buf(mp, mp->m_ddev_targp,
381 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 386 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
@@ -611,7 +616,7 @@ xfs_fs_log_dummy(
611 xfs_inode_t *ip; 616 xfs_inode_t *ip;
612 int error; 617 int error;
613 618
614 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); 619 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
615 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); 620 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
616 if (error) { 621 if (error) {
617 xfs_trans_cancel(tp, 0); 622 xfs_trans_cancel(tp, 0);
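
The reworked xfs_growfs_data_private() above follows the publish-after-prepare ordering the new comments spell out: headers for the new AGs are written synchronously first, and only then do the superblock counters change inside the transaction, so the larger filesystem becomes visible atomically at commit. An abstract sketch of that ordering; all names are hypothetical and this is not the XFS transaction API:

    #include <stdbool.h>
    #include <stdio.h>

    struct sb { int agcount; };

    /* Step 1: durable but invisible -- nothing references the new AGs yet. */
    static bool write_new_ag_headers(int first, int last)
    {
        for (int ag = first; ag < last; ag++)
            printf("AG %d headers written synchronously\n", ag);
        return true;
    }

    /* Step 2: publish; a single store stands in for the atomic commit. */
    static void commit_growfs(struct sb *sb, int new_agcount)
    {
        sb->agcount = new_agcount;
    }

    int main(void)
    {
        struct sb sb = { .agcount = 4 };

        if (!write_new_ag_headers(sb.agcount, 6))
            return 1;
        commit_growfs(&sb, 6);   /* readers see 4 AGs or 6, never 5 */
        printf("agcount=%d\n", sb.agcount);
        return 0;
    }
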
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 0785797db828..9d884c127bb9 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -205,7 +205,7 @@ xfs_ialloc_inode_init(
205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); 205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
207 mp->m_bsize * blks_per_cluster, 207 mp->m_bsize * blks_per_cluster,
208 XFS_BUF_LOCK); 208 XBF_LOCK);
209 ASSERT(fbuf); 209 ASSERT(fbuf);
210 ASSERT(!XFS_BUF_GETERROR(fbuf)); 210 ASSERT(!XFS_BUF_GETERROR(fbuf));
211 211
@@ -253,6 +253,7 @@ xfs_ialloc_ag_alloc(
253 xfs_agino_t thisino; /* current inode number, for loop */ 253 xfs_agino_t thisino; /* current inode number, for loop */
254 int isaligned = 0; /* inode allocation at stripe unit */ 254 int isaligned = 0; /* inode allocation at stripe unit */
255 /* boundary */ 255 /* boundary */
256 struct xfs_perag *pag;
256 257
257 args.tp = tp; 258 args.tp = tp;
258 args.mp = tp->t_mountp; 259 args.mp = tp->t_mountp;
@@ -382,9 +383,9 @@ xfs_ialloc_ag_alloc(
382 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); 383 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
383 be32_add_cpu(&agi->agi_count, newlen); 384 be32_add_cpu(&agi->agi_count, newlen);
384 be32_add_cpu(&agi->agi_freecount, newlen); 385 be32_add_cpu(&agi->agi_freecount, newlen);
385 down_read(&args.mp->m_peraglock); 386 pag = xfs_perag_get(args.mp, agno);
386 args.mp->m_perag[agno].pagi_freecount += newlen; 387 pag->pagi_freecount += newlen;
387 up_read(&args.mp->m_peraglock); 388 xfs_perag_put(pag);
388 agi->agi_newino = cpu_to_be32(newino); 389 agi->agi_newino = cpu_to_be32(newino);
389 390
390 /* 391 /*
@@ -425,7 +426,7 @@ xfs_ialloc_ag_alloc(
425 return 0; 426 return 0;
426} 427}
427 428
428STATIC_INLINE xfs_agnumber_t 429STATIC xfs_agnumber_t
429xfs_ialloc_next_ag( 430xfs_ialloc_next_ag(
430 xfs_mount_t *mp) 431 xfs_mount_t *mp)
431{ 432{
@@ -486,9 +487,8 @@ xfs_ialloc_ag_select(
486 */ 487 */
487 agno = pagno; 488 agno = pagno;
488 flags = XFS_ALLOC_FLAG_TRYLOCK; 489 flags = XFS_ALLOC_FLAG_TRYLOCK;
489 down_read(&mp->m_peraglock);
490 for (;;) { 490 for (;;) {
491 pag = &mp->m_perag[agno]; 491 pag = xfs_perag_get(mp, agno);
492 if (!pag->pagi_init) { 492 if (!pag->pagi_init) {
493 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { 493 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
494 agbp = NULL; 494 agbp = NULL;
@@ -527,7 +527,7 @@ xfs_ialloc_ag_select(
527 agbp = NULL; 527 agbp = NULL;
528 goto nextag; 528 goto nextag;
529 } 529 }
530 up_read(&mp->m_peraglock); 530 xfs_perag_put(pag);
531 return agbp; 531 return agbp;
532 } 532 }
533 } 533 }
@@ -535,22 +535,19 @@ unlock_nextag:
535 if (agbp) 535 if (agbp)
536 xfs_trans_brelse(tp, agbp); 536 xfs_trans_brelse(tp, agbp);
537nextag: 537nextag:
538 xfs_perag_put(pag);
538 /* 539 /*
539 * No point in iterating over the rest, if we're shutting 540 * No point in iterating over the rest, if we're shutting
540 * down. 541 * down.
541 */ 542 */
542 if (XFS_FORCED_SHUTDOWN(mp)) { 543 if (XFS_FORCED_SHUTDOWN(mp))
543 up_read(&mp->m_peraglock);
544 return NULL; 544 return NULL;
545 }
546 agno++; 545 agno++;
547 if (agno >= agcount) 546 if (agno >= agcount)
548 agno = 0; 547 agno = 0;
549 if (agno == pagno) { 548 if (agno == pagno) {
550 if (flags == 0) { 549 if (flags == 0)
551 up_read(&mp->m_peraglock);
552 return NULL; 550 return NULL;
553 }
554 flags = 0; 551 flags = 0;
555 } 552 }
556 } 553 }
@@ -672,6 +669,7 @@ xfs_dialloc(
672 xfs_agnumber_t tagno; /* testing allocation group number */ 669 xfs_agnumber_t tagno; /* testing allocation group number */
673 xfs_btree_cur_t *tcur; /* temp cursor */ 670 xfs_btree_cur_t *tcur; /* temp cursor */
674 xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ 671 xfs_inobt_rec_incore_t trec; /* temp inode allocation record */
672 struct xfs_perag *pag;
675 673
676 674
677 if (*IO_agbp == NULL) { 675 if (*IO_agbp == NULL) {
@@ -771,13 +769,13 @@ nextag:
771 *inop = NULLFSINO; 769 *inop = NULLFSINO;
772 return noroom ? ENOSPC : 0; 770 return noroom ? ENOSPC : 0;
773 } 771 }
774 down_read(&mp->m_peraglock); 772 pag = xfs_perag_get(mp, tagno);
775 if (mp->m_perag[tagno].pagi_inodeok == 0) { 773 if (pag->pagi_inodeok == 0) {
776 up_read(&mp->m_peraglock); 774 xfs_perag_put(pag);
777 goto nextag; 775 goto nextag;
778 } 776 }
779 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); 777 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
780 up_read(&mp->m_peraglock); 778 xfs_perag_put(pag);
781 if (error) 779 if (error)
782 goto nextag; 780 goto nextag;
783 agi = XFS_BUF_TO_AGI(agbp); 781 agi = XFS_BUF_TO_AGI(agbp);
@@ -790,6 +788,7 @@ nextag:
790 */ 788 */
791 agno = tagno; 789 agno = tagno;
792 *IO_agbp = NULL; 790 *IO_agbp = NULL;
791 pag = xfs_perag_get(mp, agno);
793 792
794 restart_pagno: 793 restart_pagno:
795 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); 794 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
@@ -808,7 +807,6 @@ nextag:
808 * If in the same AG as the parent, try to get near the parent. 807 * If in the same AG as the parent, try to get near the parent.
809 */ 808 */
810 if (pagno == agno) { 809 if (pagno == agno) {
811 xfs_perag_t *pag = &mp->m_perag[agno];
812 int doneleft; /* done, to the left */ 810 int doneleft; /* done, to the left */
813 int doneright; /* done, to the right */ 811 int doneright; /* done, to the right */
814 int searchdistance = 10; 812 int searchdistance = 10;
@@ -1006,9 +1004,7 @@ alloc_inode:
1006 goto error0; 1004 goto error0;
1007 be32_add_cpu(&agi->agi_freecount, -1); 1005 be32_add_cpu(&agi->agi_freecount, -1);
1008 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1006 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1009 down_read(&mp->m_peraglock); 1007 pag->pagi_freecount--;
1010 mp->m_perag[tagno].pagi_freecount--;
1011 up_read(&mp->m_peraglock);
1012 1008
1013 error = xfs_check_agi_freecount(cur, agi); 1009 error = xfs_check_agi_freecount(cur, agi);
1014 if (error) 1010 if (error)
@@ -1016,12 +1012,14 @@ alloc_inode:
1016 1012
1017 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1013 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1018 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1014 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1015 xfs_perag_put(pag);
1019 *inop = ino; 1016 *inop = ino;
1020 return 0; 1017 return 0;
1021error1: 1018error1:
1022 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 1019 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1023error0: 1020error0:
1024 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 1021 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1022 xfs_perag_put(pag);
1025 return error; 1023 return error;
1026} 1024}
1027 1025
@@ -1052,6 +1050,7 @@ xfs_difree(
1052 xfs_mount_t *mp; /* mount structure for filesystem */ 1050 xfs_mount_t *mp; /* mount structure for filesystem */
1053 int off; /* offset of inode in inode chunk */ 1051 int off; /* offset of inode in inode chunk */
1054 xfs_inobt_rec_incore_t rec; /* btree record */ 1052 xfs_inobt_rec_incore_t rec; /* btree record */
1053 struct xfs_perag *pag;
1055 1054
1056 mp = tp->t_mountp; 1055 mp = tp->t_mountp;
1057 1056
@@ -1088,9 +1087,7 @@ xfs_difree(
1088 /* 1087 /*
1089 * Get the allocation group header. 1088 * Get the allocation group header.
1090 */ 1089 */
1091 down_read(&mp->m_peraglock);
1092 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1090 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1093 up_read(&mp->m_peraglock);
1094 if (error) { 1091 if (error) {
1095 cmn_err(CE_WARN, 1092 cmn_err(CE_WARN,
1096 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1093 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.",
@@ -1157,9 +1154,9 @@ xfs_difree(
1157 be32_add_cpu(&agi->agi_count, -ilen); 1154 be32_add_cpu(&agi->agi_count, -ilen);
1158 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1155 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1159 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1156 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1160 down_read(&mp->m_peraglock); 1157 pag = xfs_perag_get(mp, agno);
1161 mp->m_perag[agno].pagi_freecount -= ilen - 1; 1158 pag->pagi_freecount -= ilen - 1;
1162 up_read(&mp->m_peraglock); 1159 xfs_perag_put(pag);
1163 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); 1160 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1164 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1161 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1165 1162
@@ -1188,9 +1185,9 @@ xfs_difree(
1188 */ 1185 */
1189 be32_add_cpu(&agi->agi_freecount, 1); 1186 be32_add_cpu(&agi->agi_freecount, 1);
1190 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1187 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1191 down_read(&mp->m_peraglock); 1188 pag = xfs_perag_get(mp, agno);
1192 mp->m_perag[agno].pagi_freecount++; 1189 pag->pagi_freecount++;
1193 up_read(&mp->m_peraglock); 1190 xfs_perag_put(pag);
1194 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 1191 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1195 } 1192 }
1196 1193
@@ -1312,9 +1309,7 @@ xfs_imap(
1312 xfs_buf_t *agbp; /* agi buffer */ 1309 xfs_buf_t *agbp; /* agi buffer */
1313 int i; /* temp state */ 1310 int i; /* temp state */
1314 1311
1315 down_read(&mp->m_peraglock);
1316 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1312 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1317 up_read(&mp->m_peraglock);
1318 if (error) { 1313 if (error) {
1319 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1314 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1320 "xfs_ialloc_read_agi() returned " 1315 "xfs_ialloc_read_agi() returned "
@@ -1379,7 +1374,6 @@ xfs_imap(
1379 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1374 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1380 return XFS_ERROR(EINVAL); 1375 return XFS_ERROR(EINVAL);
1381 } 1376 }
1382
1383 return 0; 1377 return 0;
1384} 1378}
1385 1379
@@ -1523,8 +1517,7 @@ xfs_ialloc_read_agi(
1523 return error; 1517 return error;
1524 1518
1525 agi = XFS_BUF_TO_AGI(*bpp); 1519 agi = XFS_BUF_TO_AGI(*bpp);
1526 pag = &mp->m_perag[agno]; 1520 pag = xfs_perag_get(mp, agno);
1527
1528 if (!pag->pagi_init) { 1521 if (!pag->pagi_init) {
1529 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 1522 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1530 pag->pagi_count = be32_to_cpu(agi->agi_count); 1523 pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -1537,6 +1530,7 @@ xfs_ialloc_read_agi(
1537 */ 1530 */
1538 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || 1531 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1539 XFS_FORCED_SHUTDOWN(mp)); 1532 XFS_FORCED_SHUTDOWN(mp));
1533 xfs_perag_put(pag);
1540 return 0; 1534 return 0;
1541} 1535}
1542 1536
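
A recurring detail in the xfs_ialloc.c hunks: the on-disk AGI counters are big-endian (hence be32_add_cpu()/be32_to_cpu()), while the per-AG structure keeps a host-order mirror (pagi_freecount) that is updated in lock step and asserted against the disk value. A stand-alone sketch of that pattern, using htonl()/ntohl() in place of the kernel helpers:

    #include <arpa/inet.h>
    #include <assert.h>
    #include <stdint.h>

    struct agi_disk { uint32_t agi_freecount; };   /* big-endian on disk */
    struct perag_mem { int32_t pagi_freecount; };  /* host-order mirror */

    static void agi_mod_freecount(struct agi_disk *agi, struct perag_mem *pag,
                                  int32_t delta)
    {
        /* The kernel writes this as be32_add_cpu(&agi->agi_freecount, delta). */
        agi->agi_freecount = htonl(ntohl(agi->agi_freecount) + delta);
        pag->pagi_freecount += delta;

        /* Mirrors the ASSERT that pagi_freecount matches agi_freecount. */
        assert((int32_t)ntohl(agi->agi_freecount) == pag->pagi_freecount);
    }

    int main(void)
    {
        struct agi_disk agi = { .agi_freecount = htonl(64) };
        struct perag_mem pag = { .pagi_freecount = 64 };

        agi_mod_freecount(&agi, &pag, -1);   /* one inode allocated */
        return pag.pagi_freecount == 63 ? 0 : 1;
    }
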
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 80e526489be5..6845db90818f 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,7 +43,7 @@
43#include "xfs_inode_item.h" 43#include "xfs_inode_item.h"
44#include "xfs_bmap.h" 44#include "xfs_bmap.h"
45#include "xfs_btree_trace.h" 45#include "xfs_btree_trace.h"
46#include "xfs_dir2_trace.h" 46#include "xfs_trace.h"
47 47
48 48
49/* 49/*
@@ -74,6 +74,8 @@ xfs_inode_alloc(
74 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 74 ASSERT(!spin_is_locked(&ip->i_flags_lock));
75 ASSERT(completion_done(&ip->i_flush)); 75 ASSERT(completion_done(&ip->i_flush));
76 76
77 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
78
77 /* initialise the xfs inode */ 79 /* initialise the xfs inode */
78 ip->i_ino = ino; 80 ip->i_ino = ino;
79 ip->i_mount = mp; 81 ip->i_mount = mp;
@@ -87,30 +89,8 @@ xfs_inode_alloc(
87 ip->i_size = 0; 89 ip->i_size = 0;
88 ip->i_new_size = 0; 90 ip->i_new_size = 0;
89 91
90 /*
91 * Initialize inode's trace buffers.
92 */
93#ifdef XFS_INODE_TRACE
94 ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
95#endif
96#ifdef XFS_BMAP_TRACE
97 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
98#endif
99#ifdef XFS_BTREE_TRACE
100 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
101#endif
102#ifdef XFS_RW_TRACE
103 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
104#endif
105#ifdef XFS_ILOCK_TRACE
106 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
107#endif
108#ifdef XFS_DIR2_TRACE
109 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
110#endif
111
112 /* prevent anyone from using this yet */ 92 /* prevent anyone from using this yet */
113 VFS_I(ip)->i_state = I_NEW|I_LOCK; 93 VFS_I(ip)->i_state = I_NEW;
114 94
115 return ip; 95 return ip;
116} 96}
@@ -130,25 +110,6 @@ xfs_inode_free(
130 if (ip->i_afp) 110 if (ip->i_afp)
131 xfs_idestroy_fork(ip, XFS_ATTR_FORK); 111 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
132 112
133#ifdef XFS_INODE_TRACE
134 ktrace_free(ip->i_trace);
135#endif
136#ifdef XFS_BMAP_TRACE
137 ktrace_free(ip->i_xtrace);
138#endif
139#ifdef XFS_BTREE_TRACE
140 ktrace_free(ip->i_btrace);
141#endif
142#ifdef XFS_RW_TRACE
143 ktrace_free(ip->i_rwtrace);
144#endif
145#ifdef XFS_ILOCK_TRACE
146 ktrace_free(ip->i_lock_trace);
147#endif
148#ifdef XFS_DIR2_TRACE
149 ktrace_free(ip->i_dir_trace);
150#endif
151
152 if (ip->i_itemp) { 113 if (ip->i_itemp) {
153 /* 114 /*
154 * Only if we are shutting down the fs will we see an 115 * Only if we are shutting down the fs will we see an
@@ -207,6 +168,7 @@ xfs_iget_cache_hit(
207 * instead of polling for it. 168 * instead of polling for it.
208 */ 169 */
209 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { 170 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
171 trace_xfs_iget_skip(ip);
210 XFS_STATS_INC(xs_ig_frecycle); 172 XFS_STATS_INC(xs_ig_frecycle);
211 error = EAGAIN; 173 error = EAGAIN;
212 goto out_error; 174 goto out_error;
@@ -225,16 +187,15 @@ xfs_iget_cache_hit(
225 * Need to carefully get it back into usable state. 187 * Need to carefully get it back into usable state.
226 */ 188 */
227 if (ip->i_flags & XFS_IRECLAIMABLE) { 189 if (ip->i_flags & XFS_IRECLAIMABLE) {
228 xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); 190 trace_xfs_iget_reclaim(ip);
229 191
230 /* 192 /*
231 * We need to set XFS_INEW atomically with clearing the 193 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
232 * reclaimable tag so that we do have an indicator of the 194 * from stomping over us while we recycle the inode. We can't
233 * inode still being initialized. 195 * clear the radix tree reclaimable tag yet as it requires
196 * pag_ici_lock to be held exclusive.
234 */ 197 */
235 ip->i_flags |= XFS_INEW; 198 ip->i_flags |= XFS_IRECLAIM;
236 ip->i_flags &= ~XFS_IRECLAIMABLE;
237 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
238 199
239 spin_unlock(&ip->i_flags_lock); 200 spin_unlock(&ip->i_flags_lock);
240 read_unlock(&pag->pag_ici_lock); 201 read_unlock(&pag->pag_ici_lock);
@@ -251,9 +212,18 @@ xfs_iget_cache_hit(
251 ip->i_flags &= ~XFS_INEW; 212 ip->i_flags &= ~XFS_INEW;
252 ip->i_flags |= XFS_IRECLAIMABLE; 213 ip->i_flags |= XFS_IRECLAIMABLE;
253 __xfs_inode_set_reclaim_tag(pag, ip); 214 __xfs_inode_set_reclaim_tag(pag, ip);
215 trace_xfs_iget_reclaim(ip);
254 goto out_error; 216 goto out_error;
255 } 217 }
256 inode->i_state = I_LOCK|I_NEW; 218
219 write_lock(&pag->pag_ici_lock);
220 spin_lock(&ip->i_flags_lock);
221 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
222 ip->i_flags |= XFS_INEW;
223 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
224 inode->i_state = I_NEW;
225 spin_unlock(&ip->i_flags_lock);
226 write_unlock(&pag->pag_ici_lock);
257 } else { 227 } else {
258 /* If the VFS inode is being torn down, pause and try again. */ 228 /* If the VFS inode is being torn down, pause and try again. */
259 if (!igrab(inode)) { 229 if (!igrab(inode)) {
@@ -270,8 +240,9 @@ xfs_iget_cache_hit(
270 xfs_ilock(ip, lock_flags); 240 xfs_ilock(ip, lock_flags);
271 241
272 xfs_iflags_clear(ip, XFS_ISTALE); 242 xfs_iflags_clear(ip, XFS_ISTALE);
273 xfs_itrace_exit_tag(ip, "xfs_iget.found");
274 XFS_STATS_INC(xs_ig_found); 243 XFS_STATS_INC(xs_ig_found);
244
245 trace_xfs_iget_found(ip);
275 return 0; 246 return 0;
276 247
277out_error: 248out_error:
@@ -290,7 +261,7 @@ xfs_iget_cache_miss(
290 struct xfs_inode **ipp, 261 struct xfs_inode **ipp,
291 xfs_daddr_t bno, 262 xfs_daddr_t bno,
292 int flags, 263 int flags,
293 int lock_flags) __releases(pag->pag_ici_lock) 264 int lock_flags)
294{ 265{
295 struct xfs_inode *ip; 266 struct xfs_inode *ip;
296 int error; 267 int error;
@@ -305,7 +276,7 @@ xfs_iget_cache_miss(
305 if (error) 276 if (error)
306 goto out_destroy; 277 goto out_destroy;
307 278
308 xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); 279 xfs_itrace_entry(ip);
309 280
310 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 281 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
311 error = ENOENT; 282 error = ENOENT;
@@ -350,6 +321,8 @@ xfs_iget_cache_miss(
350 321
351 write_unlock(&pag->pag_ici_lock); 322 write_unlock(&pag->pag_ici_lock);
352 radix_tree_preload_end(); 323 radix_tree_preload_end();
324
325 trace_xfs_iget_alloc(ip);
353 *ipp = ip; 326 *ipp = ip;
354 return 0; 327 return 0;
355 328
@@ -408,7 +381,7 @@ xfs_iget(
408 return EINVAL; 381 return EINVAL;
409 382
410 /* get the perag structure and ensure that it's inode capable */ 383 /* get the perag structure and ensure that it's inode capable */
411 pag = xfs_get_perag(mp, ino); 384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
412 if (!pag->pagi_inodeok) 385 if (!pag->pagi_inodeok)
413 return EINVAL; 386 return EINVAL;
414 ASSERT(pag->pag_ici_init); 387 ASSERT(pag->pag_ici_init);
@@ -432,7 +405,7 @@ again:
432 if (error) 405 if (error)
433 goto out_error_or_again; 406 goto out_error_or_again;
434 } 407 }
435 xfs_put_perag(mp, pag); 408 xfs_perag_put(pag);
436 409
437 *ipp = ip; 410 *ipp = ip;
438 411
@@ -451,7 +424,7 @@ out_error_or_again:
451 delay(1); 424 delay(1);
452 goto again; 425 goto again;
453 } 426 }
454 xfs_put_perag(mp, pag); 427 xfs_perag_put(pag);
455 return error; 428 return error;
456} 429}
457 430
@@ -511,19 +484,23 @@ xfs_ireclaim(
511{ 484{
512 struct xfs_mount *mp = ip->i_mount; 485 struct xfs_mount *mp = ip->i_mount;
513 struct xfs_perag *pag; 486 struct xfs_perag *pag;
487 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
514 488
515 XFS_STATS_INC(xs_ig_reclaims); 489 XFS_STATS_INC(xs_ig_reclaims);
516 490
517 /* 491 /*
518 * Remove the inode from the per-AG radix tree. It doesn't matter 492 * Remove the inode from the per-AG radix tree.
519 * if it was never added to it because radix_tree_delete can deal 493 *
520 * with that case just fine. 494 * Because radix_tree_delete won't complain even if the item was never
495 * added to the tree, assert that it's been there before to catch
496 * problems with the inode lifetime early on.
521 */ 497 */
522 pag = xfs_get_perag(mp, ip->i_ino); 498 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
523 write_lock(&pag->pag_ici_lock); 499 write_lock(&pag->pag_ici_lock);
524 radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino)); 500 if (!radix_tree_delete(&pag->pag_ici_root, agino))
501 ASSERT(0);
525 write_unlock(&pag->pag_ici_lock); 502 write_unlock(&pag->pag_ici_lock);
526 xfs_put_perag(mp, pag); 503 xfs_perag_put(pag);
527 504
528 /* 505 /*
529 * Here we do an (almost) spurious inode lock in order to coordinate 506 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -636,7 +613,7 @@ xfs_ilock(
636 else if (lock_flags & XFS_ILOCK_SHARED) 613 else if (lock_flags & XFS_ILOCK_SHARED)
637 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); 614 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
638 615
639 xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address); 616 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
640} 617}
641 618
642/* 619/*
@@ -681,7 +658,7 @@ xfs_ilock_nowait(
681 if (!mrtryaccess(&ip->i_lock)) 658 if (!mrtryaccess(&ip->i_lock))
682 goto out_undo_iolock; 659 goto out_undo_iolock;
683 } 660 }
684 xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address); 661 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
685 return 1; 662 return 1;
686 663
687 out_undo_iolock: 664 out_undo_iolock:
@@ -743,7 +720,7 @@ xfs_iunlock(
743 xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp, 720 xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
744 (xfs_log_item_t*)(ip->i_itemp)); 721 (xfs_log_item_t*)(ip->i_itemp));
745 } 722 }
746 xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address); 723 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
747} 724}
748 725
749/* 726/*
@@ -762,6 +739,8 @@ xfs_ilock_demote(
762 mrdemote(&ip->i_lock); 739 mrdemote(&ip->i_lock);
763 if (lock_flags & XFS_IOLOCK_EXCL) 740 if (lock_flags & XFS_IOLOCK_EXCL)
764 mrdemote(&ip->i_iolock); 741 mrdemote(&ip->i_iolock);
742
743 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
765} 744}
766 745
767#ifdef DEBUG 746#ifdef DEBUG
@@ -792,52 +771,3 @@ xfs_isilocked(
792 return 1; 771 return 1;
793} 772}
794#endif 773#endif
795
796#ifdef XFS_INODE_TRACE
797
798#define KTRACE_ENTER(ip, vk, s, line, ra) \
799 ktrace_enter((ip)->i_trace, \
800/* 0 */ (void *)(__psint_t)(vk), \
801/* 1 */ (void *)(s), \
802/* 2 */ (void *)(__psint_t) line, \
803/* 3 */ (void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \
804/* 4 */ (void *)(ra), \
805/* 5 */ NULL, \
806/* 6 */ (void *)(__psint_t)current_cpu(), \
807/* 7 */ (void *)(__psint_t)current_pid(), \
808/* 8 */ (void *)__return_address, \
809/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
810
811/*
812 * Vnode tracing code.
813 */
814void
815_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
816{
817 KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
818}
819
820void
821_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
822{
823 KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
824}
825
826void
827xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
828{
829 KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
830}
831
832void
833_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
834{
835 KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
836}
837
838void
839xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
840{
841 KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
842}
843#endif /* XFS_INODE_TRACE */
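
The xfs_iget_cache_hit() hunks above reorder the reclaim handshake: XFS_IRECLAIM is set first, under i_flags_lock, to stop xfs_reclaim_inode() touching the inode while it is re-initialised, and only afterwards, with the per-AG lock held exclusively, are the reclaim flags cleared, XFS_INEW set, and the radix-tree tag dropped. A condensed user-space sketch of that ordering; pthread locks stand in for i_flags_lock and pag_ici_lock, and only the flag names come from the diff:

    #include <pthread.h>

    #define XFS_INEW         0x1u
    #define XFS_IRECLAIMABLE 0x2u
    #define XFS_IRECLAIM     0x4u

    struct inode_sketch {
        pthread_spinlock_t flags_lock;   /* stands in for i_flags_lock */
        unsigned int       flags;
    };

    static pthread_rwlock_t pag_ici_lock = PTHREAD_RWLOCK_INITIALIZER;

    static void recycle(struct inode_sketch *ip)
    {
        /* Step 1: fence off the reclaimer before dropping the locks. */
        pthread_spin_lock(&ip->flags_lock);
        ip->flags |= XFS_IRECLAIM;
        pthread_spin_unlock(&ip->flags_lock);

        /* ... re-initialise the VFS inode here with no locks held ... */

        /* Step 2: clearing reclaimable state needs the AG lock exclusive. */
        pthread_rwlock_wrlock(&pag_ici_lock);
        pthread_spin_lock(&ip->flags_lock);
        ip->flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
        ip->flags |= XFS_INEW;
        pthread_spin_unlock(&ip->flags_lock);
        pthread_rwlock_unlock(&pag_ici_lock);
    }

    int main(void)
    {
        struct inode_sketch ip = { .flags = XFS_IRECLAIMABLE };

        pthread_spin_init(&ip.flags_lock, PTHREAD_PROCESS_PRIVATE);
        recycle(&ip);
        pthread_spin_destroy(&ip.flags_lock);
        return ip.flags == XFS_INEW ? 0 : 1;   /* link with -lpthread */
    }
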
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b92a4fa2a0a1..0ffd56447045 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -47,10 +47,10 @@
47#include "xfs_rw.h" 47#include "xfs_rw.h"
48#include "xfs_error.h" 48#include "xfs_error.h"
49#include "xfs_utils.h" 49#include "xfs_utils.h"
50#include "xfs_dir2_trace.h"
51#include "xfs_quota.h" 50#include "xfs_quota.h"
52#include "xfs_filestream.h" 51#include "xfs_filestream.h"
53#include "xfs_vnodeops.h" 52#include "xfs_vnodeops.h"
53#include "xfs_trace.h"
54 54
55kmem_zone_t *xfs_ifork_zone; 55kmem_zone_t *xfs_ifork_zone;
56kmem_zone_t *xfs_inode_zone; 56kmem_zone_t *xfs_inode_zone;
@@ -151,7 +151,7 @@ xfs_imap_to_bp(
151 "an error %d on %s. Returning error.", 151 "an error %d on %s. Returning error.",
152 error, mp->m_fsname); 152 error, mp->m_fsname);
153 } else { 153 } else {
154 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 154 ASSERT(buf_flags & XBF_TRYLOCK);
155 } 155 }
156 return error; 156 return error;
157 } 157 }
@@ -239,7 +239,7 @@ xfs_inotobp(
239 if (error) 239 if (error)
240 return error; 240 return error;
241 241
242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); 242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
243 if (error) 243 if (error)
244 return error; 244 return error;
245 245
@@ -285,7 +285,7 @@ xfs_itobp(
285 return error; 285 return error;
286 286
287 if (!bp) { 287 if (!bp) {
288 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 288 ASSERT(buf_flags & XBF_TRYLOCK);
289 ASSERT(tp == NULL); 289 ASSERT(tp == NULL);
290 *bpp = NULL; 290 *bpp = NULL;
291 return EAGAIN; 291 return EAGAIN;
@@ -807,7 +807,7 @@ xfs_iread(
807 * Get pointers to the on-disk inode and the buffer containing it. 807 * Get pointers to the on-disk inode and the buffer containing it.
808 */ 808 */
809 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 809 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
810 XFS_BUF_LOCK, iget_flags); 810 XBF_LOCK, iget_flags);
811 if (error) 811 if (error)
812 return error; 812 return error;
813 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 813 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -1291,42 +1291,6 @@ xfs_file_last_byte(
1291 return last_byte; 1291 return last_byte;
1292} 1292}
1293 1293
1294#if defined(XFS_RW_TRACE)
1295STATIC void
1296xfs_itrunc_trace(
1297 int tag,
1298 xfs_inode_t *ip,
1299 int flag,
1300 xfs_fsize_t new_size,
1301 xfs_off_t toss_start,
1302 xfs_off_t toss_finish)
1303{
1304 if (ip->i_rwtrace == NULL) {
1305 return;
1306 }
1307
1308 ktrace_enter(ip->i_rwtrace,
1309 (void*)((long)tag),
1310 (void*)ip,
1311 (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
1312 (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
1313 (void*)((long)flag),
1314 (void*)(unsigned long)((new_size >> 32) & 0xffffffff),
1315 (void*)(unsigned long)(new_size & 0xffffffff),
1316 (void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
1317 (void*)(unsigned long)(toss_start & 0xffffffff),
1318 (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
1319 (void*)(unsigned long)(toss_finish & 0xffffffff),
1320 (void*)(unsigned long)current_cpu(),
1321 (void*)(unsigned long)current_pid(),
1322 (void*)NULL,
1323 (void*)NULL,
1324 (void*)NULL);
1325}
1326#else
1327#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
1328#endif
1329
1330/* 1294/*
1331 * Start the truncation of the file to new_size. The new size 1295 * Start the truncation of the file to new_size. The new size
1332 * must be smaller than the current size. This routine will 1296 * must be smaller than the current size. This routine will
@@ -1409,8 +1373,7 @@ xfs_itruncate_start(
1409 return 0; 1373 return 0;
1410 } 1374 }
1411 last_byte = xfs_file_last_byte(ip); 1375 last_byte = xfs_file_last_byte(ip);
1412 xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start, 1376 trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte);
1413 last_byte);
1414 if (last_byte > toss_start) { 1377 if (last_byte > toss_start) {
1415 if (flags & XFS_ITRUNC_DEFINITE) { 1378 if (flags & XFS_ITRUNC_DEFINITE) {
1416 xfs_tosspages(ip, toss_start, 1379 xfs_tosspages(ip, toss_start,
@@ -1514,7 +1477,8 @@ xfs_itruncate_finish(
1514 new_size = 0LL; 1477 new_size = 0LL;
1515 } 1478 }
1516 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); 1479 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1517 xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0); 1480 trace_xfs_itruncate_finish_start(ip, new_size);
1481
1518 /* 1482 /*
1519 * The first thing we do is set the size to new_size permanently 1483 * The first thing we do is set the size to new_size permanently
1520 * on disk. This way we don't have to worry about anyone ever 1484 * on disk. This way we don't have to worry about anyone ever
@@ -1731,7 +1695,7 @@ xfs_itruncate_finish(
1731 ASSERT((new_size != 0) || 1695 ASSERT((new_size != 0) ||
1732 (fork == XFS_ATTR_FORK) || 1696 (fork == XFS_ATTR_FORK) ||
1733 (ip->i_d.di_nextents == 0)); 1697 (ip->i_d.di_nextents == 0));
1734 xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0); 1698 trace_xfs_itruncate_finish_end(ip, new_size);
1735 return 0; 1699 return 0;
1736} 1700}
1737 1701
@@ -1787,7 +1751,7 @@ xfs_iunlink(
1787 * Here we put the head pointer into our next pointer, 1751 * Here we put the head pointer into our next pointer,
1788 * and then we fall through to point the head at us. 1752 * and then we fall through to point the head at us.
1789 */ 1753 */
1790 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1754 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1791 if (error) 1755 if (error)
1792 return error; 1756 return error;
1793 1757
@@ -1869,7 +1833,7 @@ xfs_iunlink_remove(
1869 * of dealing with the buffer when there is no need to 1833 * of dealing with the buffer when there is no need to
1870 * change it. 1834 * change it.
1871 */ 1835 */
1872 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1836 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1873 if (error) { 1837 if (error) {
1874 cmn_err(CE_WARN, 1838 cmn_err(CE_WARN,
1875 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1839 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1931,7 +1895,7 @@ xfs_iunlink_remove(
1931 * Now last_ibp points to the buffer previous to us on 1895 * Now last_ibp points to the buffer previous to us on
1932 * the unlinked list. Pull us from the list. 1896 * the unlinked list. Pull us from the list.
1933 */ 1897 */
1934 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1898 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1935 if (error) { 1899 if (error) {
1936 cmn_err(CE_WARN, 1900 cmn_err(CE_WARN,
1937 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1901 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1982,8 +1946,9 @@ xfs_ifree_cluster(
1982 xfs_inode_t *ip, **ip_found; 1946 xfs_inode_t *ip, **ip_found;
1983 xfs_inode_log_item_t *iip; 1947 xfs_inode_log_item_t *iip;
1984 xfs_log_item_t *lip; 1948 xfs_log_item_t *lip;
1985 xfs_perag_t *pag = xfs_get_perag(mp, inum); 1949 struct xfs_perag *pag;
1986 1950
1951 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
1987 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 1952 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
1988 blks_per_cluster = 1; 1953 blks_per_cluster = 1;
1989 ninodes = mp->m_sb.sb_inopblock; 1954 ninodes = mp->m_sb.sb_inopblock;
@@ -2075,7 +2040,7 @@ xfs_ifree_cluster(
2075 2040
2076 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2077 mp->m_bsize * blks_per_cluster, 2042 mp->m_bsize * blks_per_cluster,
2078 XFS_BUF_LOCK); 2043 XBF_LOCK);
2079 2044
2080 pre_flushed = 0; 2045 pre_flushed = 0;
2081 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -2124,7 +2089,7 @@ xfs_ifree_cluster(
2124 } 2089 }
2125 2090
2126 kmem_free(ip_found); 2091 kmem_free(ip_found);
2127 xfs_put_perag(mp, pag); 2092 xfs_perag_put(pag);
2128} 2093}
2129 2094
2130/* 2095/*
@@ -2186,7 +2151,7 @@ xfs_ifree(
2186 2151
2187 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2152 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2188 2153
2189 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 2154 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
2190 if (error) 2155 if (error)
2191 return error; 2156 return error;
2192 2157
@@ -2474,72 +2439,31 @@ xfs_idestroy_fork(
2474} 2439}
2475 2440
2476/* 2441/*
2477 * Increment the pin count of the given buffer. 2442 * This is called to unpin an inode. The caller must have the inode locked
2478 * This value is protected by ipinlock spinlock in the mount structure. 2443 * in at least shared mode so that the buffer cannot be subsequently pinned
2479 */ 2444 * once someone is waiting for it to be unpinned.
2480void
2481xfs_ipin(
2482 xfs_inode_t *ip)
2483{
2484 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2485
2486 atomic_inc(&ip->i_pincount);
2487}
2488
2489/*
2490 * Decrement the pin count of the given inode, and wake up
2491 * anyone in xfs_iwait_unpin() if the count goes to 0. The
2492 * inode must have been previously pinned with a call to xfs_ipin().
2493 */ 2445 */
2494void 2446static void
2495xfs_iunpin( 2447xfs_iunpin_nowait(
2496 xfs_inode_t *ip) 2448 struct xfs_inode *ip)
2497{
2498 ASSERT(atomic_read(&ip->i_pincount) > 0);
2499
2500 if (atomic_dec_and_test(&ip->i_pincount))
2501 wake_up(&ip->i_ipin_wait);
2502}
2503
2504/*
2505 * This is called to unpin an inode. It can be directed to wait or to return
2506 * immediately without waiting for the inode to be unpinned. The caller must
2507 * have the inode locked in at least shared mode so that the buffer cannot be
2508 * subsequently pinned once someone is waiting for it to be unpinned.
2509 */
2510STATIC void
2511__xfs_iunpin_wait(
2512 xfs_inode_t *ip,
2513 int wait)
2514{ 2449{
2515 xfs_inode_log_item_t *iip = ip->i_itemp;
2516
2517 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2518 if (atomic_read(&ip->i_pincount) == 0)
2519 return;
2520 2451
2521 /* Give the log a push to start the unpinning I/O */ 2452 /* Give the log a push to start the unpinning I/O */
2522 xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? 2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2523 iip->ili_last_lsn : 0, XFS_LOG_FORCE);
2524 if (wait)
2525 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2526}
2527 2454
2528static inline void
2529xfs_iunpin_wait(
2530 xfs_inode_t *ip)
2531{
2532 __xfs_iunpin_wait(ip, 1);
2533} 2455}
2534 2456
2535static inline void 2457void
2536xfs_iunpin_nowait( 2458xfs_iunpin_wait(
2537 xfs_inode_t *ip) 2459 struct xfs_inode *ip)
2538{ 2460{
2539 __xfs_iunpin_wait(ip, 0); 2461 if (xfs_ipincount(ip)) {
2462 xfs_iunpin_nowait(ip);
2463 wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
2464 }
2540} 2465}
2541 2466
2542
2543/* 2467/*
2544 * xfs_iextents_copy() 2468 * xfs_iextents_copy()
2545 * 2469 *
@@ -2711,7 +2635,7 @@ xfs_iflush_cluster(
2711 xfs_buf_t *bp) 2635 xfs_buf_t *bp)
2712{ 2636{
2713 xfs_mount_t *mp = ip->i_mount; 2637 xfs_mount_t *mp = ip->i_mount;
2714 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 2638 struct xfs_perag *pag;
2715 unsigned long first_index, mask; 2639 unsigned long first_index, mask;
2716 unsigned long inodes_per_cluster; 2640 unsigned long inodes_per_cluster;
2717 int ilist_size; 2641 int ilist_size;
@@ -2722,6 +2646,7 @@ xfs_iflush_cluster(
2722 int bufwasdelwri; 2646 int bufwasdelwri;
2723 int i; 2647 int i;
2724 2648
2649 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2725 ASSERT(pag->pagi_inodeok); 2650 ASSERT(pag->pagi_inodeok);
2726 ASSERT(pag->pag_ici_init); 2651 ASSERT(pag->pag_ici_init);
2727 2652
@@ -2729,7 +2654,7 @@ xfs_iflush_cluster(
2729 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2654 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2730 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2655 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2731 if (!ilist) 2656 if (!ilist)
2732 return 0; 2657 goto out_put;
2733 2658
2734 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2659 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2735 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2660 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
@@ -2798,6 +2723,8 @@ xfs_iflush_cluster(
2798out_free: 2723out_free:
2799 read_unlock(&pag->pag_ici_lock); 2724 read_unlock(&pag->pag_ici_lock);
2800 kmem_free(ilist); 2725 kmem_free(ilist);
2726out_put:
2727 xfs_perag_put(pag);
2801 return 0; 2728 return 0;
2802 2729
2803 2730
@@ -2841,6 +2768,7 @@ cluster_corrupt_out:
2841 */ 2768 */
2842 xfs_iflush_abort(iq); 2769 xfs_iflush_abort(iq);
2843 kmem_free(ilist); 2770 kmem_free(ilist);
2771 xfs_perag_put(pag);
2844 return XFS_ERROR(EFSCORRUPTED); 2772 return XFS_ERROR(EFSCORRUPTED);
2845} 2773}
2846 2774
@@ -2863,8 +2791,6 @@ xfs_iflush(
2863 xfs_dinode_t *dip; 2791 xfs_dinode_t *dip;
2864 xfs_mount_t *mp; 2792 xfs_mount_t *mp;
2865 int error; 2793 int error;
2866 int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
2867 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
2868 2794
2869 XFS_STATS_INC(xs_iflush_count); 2795 XFS_STATS_INC(xs_iflush_count);
2870 2796
@@ -2877,15 +2803,6 @@ xfs_iflush(
2877 mp = ip->i_mount; 2803 mp = ip->i_mount;
2878 2804
2879 /* 2805 /*
2880 * If the inode isn't dirty, then just release the inode
2881 * flush lock and do nothing.
2882 */
2883 if (xfs_inode_clean(ip)) {
2884 xfs_ifunlock(ip);
2885 return 0;
2886 }
2887
2888 /*
2889 * We can't flush the inode until it is unpinned, so wait for it if we 2806 * We can't flush the inode until it is unpinned, so wait for it if we
2890 * are allowed to block. We know no one new can pin it, because we are 2807 * are allowed to block. We know no one new can pin it, because we are
2891 * holding the inode lock shared and you need to hold it exclusively to 2808 * holding the inode lock shared and you need to hold it exclusively to
@@ -2896,7 +2813,7 @@ xfs_iflush(
2896 * in the same cluster are dirty, they will probably write the inode 2813 * in the same cluster are dirty, they will probably write the inode
2897 * out for us if they occur after the log force completes. 2814 * out for us if they occur after the log force completes.
2898 */ 2815 */
2899 if (noblock && xfs_ipincount(ip)) { 2816 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2900 xfs_iunpin_nowait(ip); 2817 xfs_iunpin_nowait(ip);
2901 xfs_ifunlock(ip); 2818 xfs_ifunlock(ip);
2902 return EAGAIN; 2819 return EAGAIN;
@@ -2904,6 +2821,19 @@ xfs_iflush(
2904 xfs_iunpin_wait(ip); 2821 xfs_iunpin_wait(ip);
2905 2822
2906 /* 2823 /*
2824 * For stale inodes we cannot rely on the backing buffer remaining
2825 * stale in cache for the remaining life of the stale inode and so
2826 * xfs_itobp() below may give us a buffer that no longer contains
2827 * inodes. We have to check this after ensuring the inode is
2828 * unpinned so that it is safe to reclaim the stale inode after the
2829 * flush call.
2830 */
2831 if (xfs_iflags_test(ip, XFS_ISTALE)) {
2832 xfs_ifunlock(ip);
2833 return 0;
2834 }
2835
2836 /*
2907 * This may have been unpinned because the filesystem is shutting 2837 * This may have been unpinned because the filesystem is shutting
2908 * down forcibly. If that's the case we must not write this inode 2838 * down forcibly. If that's the case we must not write this inode
2909 * to disk, because the log record didn't make it to disk! 2839 * to disk, because the log record didn't make it to disk!
@@ -2917,60 +2847,10 @@ xfs_iflush(
2917 } 2847 }
2918 2848
2919 /* 2849 /*
2920 * Decide how buffer will be flushed out. This is done before
2921 * the call to xfs_iflush_int because this field is zeroed by it.
2922 */
2923 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
2924 /*
2925 * Flush out the inode buffer according to the directions
2926 * of the caller. In the cases where the caller has given
2927 * us a choice choose the non-delwri case. This is because
2928 * the inode is in the AIL and we need to get it out soon.
2929 */
2930 switch (flags) {
2931 case XFS_IFLUSH_SYNC:
2932 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2933 flags = 0;
2934 break;
2935 case XFS_IFLUSH_ASYNC_NOBLOCK:
2936 case XFS_IFLUSH_ASYNC:
2937 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2938 flags = INT_ASYNC;
2939 break;
2940 case XFS_IFLUSH_DELWRI:
2941 flags = INT_DELWRI;
2942 break;
2943 default:
2944 ASSERT(0);
2945 flags = 0;
2946 break;
2947 }
2948 } else {
2949 switch (flags) {
2950 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2951 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2952 case XFS_IFLUSH_DELWRI:
2953 flags = INT_DELWRI;
2954 break;
2955 case XFS_IFLUSH_ASYNC_NOBLOCK:
2956 case XFS_IFLUSH_ASYNC:
2957 flags = INT_ASYNC;
2958 break;
2959 case XFS_IFLUSH_SYNC:
2960 flags = 0;
2961 break;
2962 default:
2963 ASSERT(0);
2964 flags = 0;
2965 break;
2966 }
2967 }
2968
2969 /*
2970 * Get the buffer containing the on-disk inode. 2850 * Get the buffer containing the on-disk inode.
2971 */ 2851 */
2972 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2852 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2973 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); 2853 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
2974 if (error || !bp) { 2854 if (error || !bp) {
2975 xfs_ifunlock(ip); 2855 xfs_ifunlock(ip);
2976 return error; 2856 return error;
@@ -2988,7 +2868,7 @@ xfs_iflush(
2988 * get stuck waiting in the write for too long. 2868 * get stuck waiting in the write for too long.
2989 */ 2869 */
2990 if (XFS_BUF_ISPINNED(bp)) 2870 if (XFS_BUF_ISPINNED(bp))
2991 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 2871 xfs_log_force(mp, 0);
2992 2872
2993 /* 2873 /*
2994 * inode clustering: 2874 * inode clustering:
@@ -2998,13 +2878,10 @@ xfs_iflush(
2998 if (error) 2878 if (error)
2999 goto cluster_corrupt_out; 2879 goto cluster_corrupt_out;
3000 2880
3001 if (flags & INT_DELWRI) { 2881 if (flags & SYNC_WAIT)
3002 xfs_bdwrite(mp, bp);
3003 } else if (flags & INT_ASYNC) {
3004 error = xfs_bawrite(mp, bp);
3005 } else {
3006 error = xfs_bwrite(mp, bp); 2882 error = xfs_bwrite(mp, bp);
3007 } 2883 else
2884 xfs_bdwrite(mp, bp);
3008 return error; 2885 return error;
3009 2886
3010corrupt_out: 2887corrupt_out:
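[Annotation] Together with the removal of the XFS_IFLUSH_* constants from xfs_inode.h further down, this hunk reduces the xfs_iflush() interface to the generic sync flags: 0 means a delayed write via xfs_bdwrite(), SYNC_WAIT means a blocking xfs_bwrite(). Hypothetical call sites under the new convention (not lines from this patch):

	/* blocking flush: wait for unpin, lock the buffer, write synchronously */
	error = xfs_iflush(ip, SYNC_WAIT);

	/* non-blocking flush: trylock the buffer, queue a delayed write */
	error = xfs_iflush(ip, 0);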
@@ -3039,16 +2916,6 @@ xfs_iflush_int(
3039 iip = ip->i_itemp; 2916 iip = ip->i_itemp;
3040 mp = ip->i_mount; 2917 mp = ip->i_mount;
3041 2918
3042
3043 /*
3044 * If the inode isn't dirty, then just release the inode
3045 * flush lock and do nothing.
3046 */
3047 if (xfs_inode_clean(ip)) {
3048 xfs_ifunlock(ip);
3049 return 0;
3050 }
3051
3052 /* set *dip = inode's place in the buffer */ 2919 /* set *dip = inode's place in the buffer */
3053 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2920 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3054 2921
@@ -3252,23 +3119,6 @@ corrupt_out:
3252 return XFS_ERROR(EFSCORRUPTED); 3119 return XFS_ERROR(EFSCORRUPTED);
3253} 3120}
3254 3121
3255
3256
3257#ifdef XFS_ILOCK_TRACE
3258void
3259xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
3260{
3261 ktrace_enter(ip->i_lock_trace,
3262 (void *)ip,
3263 (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */
3264 (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */
3265 (void *)ra, /* caller of ilock */
3266 (void *)(unsigned long)current_cpu(),
3267 (void *)(unsigned long)current_pid(),
3268 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
3269}
3270#endif
3271
3272/* 3122/*
3273 * Return a pointer to the extent record at file index idx. 3123 * Return a pointer to the extent record at file index idx.
3274 */ 3124 */
@@ -3300,13 +3150,17 @@ xfs_iext_get_ext(
3300 */ 3150 */
3301void 3151void
3302xfs_iext_insert( 3152xfs_iext_insert(
3303 xfs_ifork_t *ifp, /* inode fork pointer */ 3153 xfs_inode_t *ip, /* incore inode pointer */
3304 xfs_extnum_t idx, /* starting index of new items */ 3154 xfs_extnum_t idx, /* starting index of new items */
3305 xfs_extnum_t count, /* number of inserted items */ 3155 xfs_extnum_t count, /* number of inserted items */
3306 xfs_bmbt_irec_t *new) /* items to insert */ 3156 xfs_bmbt_irec_t *new, /* items to insert */
3157 int state) /* type of extent conversion */
3307{ 3158{
3159 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3308 xfs_extnum_t i; /* extent record index */ 3160 xfs_extnum_t i; /* extent record index */
3309 3161
3162 trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
3163
3310 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3164 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3311 xfs_iext_add(ifp, idx, count); 3165 xfs_iext_add(ifp, idx, count);
3312 for (i = idx; i < idx + count; i++, new++) 3166 for (i = idx; i < idx + count; i++, new++)
@@ -3549,13 +3403,17 @@ xfs_iext_add_indirect_multi(
3549 */ 3403 */
3550void 3404void
3551xfs_iext_remove( 3405xfs_iext_remove(
3552 xfs_ifork_t *ifp, /* inode fork pointer */ 3406 xfs_inode_t *ip, /* incore inode pointer */
3553 xfs_extnum_t idx, /* index to begin removing exts */ 3407 xfs_extnum_t idx, /* index to begin removing exts */
3554 int ext_diff) /* number of extents to remove */ 3408 int ext_diff, /* number of extents to remove */
3409 int state) /* type of extent conversion */
3555{ 3410{
3411 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3556 xfs_extnum_t nextents; /* number of extents in file */ 3412 xfs_extnum_t nextents; /* number of extents in file */
3557 int new_size; /* size of extents after removal */ 3413 int new_size; /* size of extents after removal */
3558 3414
3415 trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
3416
3559 ASSERT(ext_diff > 0); 3417 ASSERT(ext_diff > 0);
3560 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3418 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3561 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); 3419 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
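[Annotation] Both extent helpers now take the inode instead of a fork pointer, derive the fork from the BMAP_ATTRFORK bit of the new state argument, and emit a tracepoint on every update. A hypothetical caller under the new signatures (the extent record setup is assumed):

	xfs_bmbt_irec_t	new;	/* assume br_startoff etc. are filled in */

	/* insert one record into the attribute fork: the helper resolves
	 * ip->i_afp from BMAP_ATTRFORK and fires trace_xfs_iext_insert() */
	xfs_iext_insert(ip, idx, 1, &new, BMAP_ATTRFORK);

	/* remove one record from the data fork (state 0 selects ip->i_df) */
	xfs_iext_remove(ip, idx, 1, 0);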
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 41555de1d1db..9965e40a4615 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -213,7 +213,6 @@ typedef struct xfs_icdinode {
213 213
214struct bhv_desc; 214struct bhv_desc;
215struct cred; 215struct cred;
216struct ktrace;
217struct xfs_buf; 216struct xfs_buf;
218struct xfs_bmap_free; 217struct xfs_bmap_free;
219struct xfs_bmbt_irec; 218struct xfs_bmbt_irec;
@@ -222,13 +221,6 @@ struct xfs_mount;
222struct xfs_trans; 221struct xfs_trans;
223struct xfs_dquot; 222struct xfs_dquot;
224 223
225#if defined(XFS_ILOCK_TRACE)
226#define XFS_ILOCK_KTRACE_SIZE 32
227extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
228#else
229#define xfs_ilock_trace(i,n,f,ra)
230#endif
231
232typedef struct dm_attrs_s { 224typedef struct dm_attrs_s {
233 __uint32_t da_dmevmask; /* DMIG event mask */ 225 __uint32_t da_dmevmask; /* DMIG event mask */
234 __uint16_t da_dmstate; /* DMIG state info */ 226 __uint16_t da_dmstate; /* DMIG state info */
@@ -271,26 +263,6 @@ typedef struct xfs_inode {
271 263
272 /* VFS inode */ 264 /* VFS inode */
273 struct inode i_vnode; /* embedded VFS inode */ 265 struct inode i_vnode; /* embedded VFS inode */
274
275 /* Trace buffers per inode. */
276#ifdef XFS_INODE_TRACE
277 struct ktrace *i_trace; /* general inode trace */
278#endif
279#ifdef XFS_BMAP_TRACE
280 struct ktrace *i_xtrace; /* inode extent list trace */
281#endif
282#ifdef XFS_BTREE_TRACE
283 struct ktrace *i_btrace; /* inode bmap btree trace */
284#endif
285#ifdef XFS_RW_TRACE
286 struct ktrace *i_rwtrace; /* inode read/write trace */
287#endif
288#ifdef XFS_ILOCK_TRACE
289 struct ktrace *i_lock_trace; /* inode lock/unlock trace */
290#endif
291#ifdef XFS_DIR2_TRACE
292 struct ktrace *i_dir_trace; /* inode directory trace */
293#endif
294} xfs_inode_t; 266} xfs_inode_t;
295 267
296#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ 268#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
@@ -406,6 +378,14 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
406#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ 378#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
407 | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) 379 | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
408 380
381#define XFS_LOCK_FLAGS \
382 { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \
383 { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \
384 { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \
385 { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \
386 { XFS_IUNLOCK_NONOTIFY, "IUNLOCK_NONOTIFY" }
387
388
409/* 389/*
410 * Flags for lockdep annotations. 390 * Flags for lockdep annotations.
411 * 391 *
@@ -440,21 +420,15 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
440#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
441 421
442/* 422/*
443 * Flags for xfs_iflush()
444 */
445#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1
446#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2
447#define XFS_IFLUSH_SYNC 3
448#define XFS_IFLUSH_ASYNC 4
449#define XFS_IFLUSH_DELWRI 5
450#define XFS_IFLUSH_ASYNC_NOBLOCK 6
451
452/*
453 * Flags for xfs_itruncate_start(). 423 * Flags for xfs_itruncate_start().
454 */ 424 */
455#define XFS_ITRUNC_DEFINITE 0x1 425#define XFS_ITRUNC_DEFINITE 0x1
456#define XFS_ITRUNC_MAYBE 0x2 426#define XFS_ITRUNC_MAYBE 0x2
457 427
428#define XFS_ITRUNC_FLAGS \
429 { XFS_ITRUNC_DEFINITE, "DEFINITE" }, \
430 { XFS_ITRUNC_MAYBE, "MAYBE" }
431
458/* 432/*
459 * For multiple groups support: if S_ISGID bit is set in the parent 433 * For multiple groups support: if S_ISGID bit is set in the parent
460 * directory, group of new file is set to that of the parent, and 434 * directory, group of new file is set to that of the parent, and
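[Annotation] The new XFS_LOCK_FLAGS and XFS_ITRUNC_FLAGS tables are { mask, "name" } pairs in the format consumed by the tracing macro __print_flags(). A fragment showing how a tracepoint's TP_printk() can decode lock flags with such a table (illustrative; the real event definitions live in the xfs_trace.h added by this series):

	TP_printk("dev %d:%d ino 0x%llx flags %s",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino,
		  __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS))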
@@ -497,58 +471,26 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
497int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 471int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
498 472
499void xfs_iext_realloc(xfs_inode_t *, int, int); 473void xfs_iext_realloc(xfs_inode_t *, int, int);
500void xfs_ipin(xfs_inode_t *); 474void xfs_iunpin_wait(xfs_inode_t *);
501void xfs_iunpin(xfs_inode_t *);
502int xfs_iflush(xfs_inode_t *, uint); 475int xfs_iflush(xfs_inode_t *, uint);
503void xfs_ichgtime(xfs_inode_t *, int); 476void xfs_ichgtime(xfs_inode_t *, int);
504void xfs_lock_inodes(xfs_inode_t **, int, uint); 477void xfs_lock_inodes(xfs_inode_t **, int, uint);
505void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 478void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
506 479
507void xfs_synchronize_times(xfs_inode_t *); 480void xfs_synchronize_times(xfs_inode_t *);
481void xfs_mark_inode_dirty(xfs_inode_t *);
508void xfs_mark_inode_dirty_sync(xfs_inode_t *); 482void xfs_mark_inode_dirty_sync(xfs_inode_t *);
509 483
510#if defined(XFS_INODE_TRACE)
511
512#define INODE_TRACE_SIZE 16 /* number of trace entries */
513#define INODE_KTRACE_ENTRY 1
514#define INODE_KTRACE_EXIT 2
515#define INODE_KTRACE_HOLD 3
516#define INODE_KTRACE_REF 4
517#define INODE_KTRACE_RELE 5
518
519extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
520extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
521extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
522extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
523extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
524#define xfs_itrace_entry(ip) \
525 _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
526#define xfs_itrace_exit(ip) \
527 _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
528#define xfs_itrace_exit_tag(ip, tag) \
529 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
530#define xfs_itrace_ref(ip) \
531 _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
532
533#else
534#define xfs_itrace_entry(a)
535#define xfs_itrace_exit(a)
536#define xfs_itrace_exit_tag(a, b)
537#define xfs_itrace_hold(a, b, c, d)
538#define xfs_itrace_ref(a)
539#define xfs_itrace_rele(a, b, c, d)
540#endif
541
542#define IHOLD(ip) \ 484#define IHOLD(ip) \
543do { \ 485do { \
544 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 486 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
545 atomic_inc(&(VFS_I(ip)->i_count)); \ 487 atomic_inc(&(VFS_I(ip)->i_count)); \
546 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ 488 trace_xfs_ihold(ip, _THIS_IP_); \
547} while (0) 489} while (0)
548 490
549#define IRELE(ip) \ 491#define IRELE(ip) \
550do { \ 492do { \
551 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ 493 trace_xfs_irele(ip, _THIS_IP_); \
552 iput(VFS_I(ip)); \ 494 iput(VFS_I(ip)); \
553} while (0) 495} while (0)
554 496
@@ -577,11 +519,11 @@ int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
577int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int); 519int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
578 520
579xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); 521xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
580void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t, 522void xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t,
581 xfs_bmbt_irec_t *); 523 xfs_bmbt_irec_t *, int);
582void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int); 524void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int);
583void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int); 525void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int);
584void xfs_iext_remove(xfs_ifork_t *, xfs_extnum_t, int); 526void xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int);
585void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); 527void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
586void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); 528void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
587void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); 529void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 9794b876d6ff..7bfea8540159 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -41,6 +41,7 @@
41#include "xfs_ialloc.h" 41#include "xfs_ialloc.h"
42#include "xfs_rw.h" 42#include "xfs_rw.h"
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_trace.h"
44 45
45 46
46kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 47kmem_zone_t *xfs_ili_zone; /* inode log item zone */
@@ -227,7 +228,7 @@ xfs_inode_item_format(
227 228
228 vecp->i_addr = (xfs_caddr_t)&iip->ili_format; 229 vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
229 vecp->i_len = sizeof(xfs_inode_log_format_t); 230 vecp->i_len = sizeof(xfs_inode_log_format_t);
230 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT); 231 vecp->i_type = XLOG_REG_TYPE_IFORMAT;
231 vecp++; 232 vecp++;
232 nvecs = 1; 233 nvecs = 1;
233 234
@@ -278,7 +279,7 @@ xfs_inode_item_format(
278 279
279 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 280 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
280 vecp->i_len = sizeof(struct xfs_icdinode); 281 vecp->i_len = sizeof(struct xfs_icdinode);
281 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); 282 vecp->i_type = XLOG_REG_TYPE_ICORE;
282 vecp++; 283 vecp++;
283 nvecs++; 284 nvecs++;
284 iip->ili_format.ilf_fields |= XFS_ILOG_CORE; 285 iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
@@ -335,7 +336,7 @@ xfs_inode_item_format(
335 vecp->i_addr = 336 vecp->i_addr =
336 (char *)(ip->i_df.if_u1.if_extents); 337 (char *)(ip->i_df.if_u1.if_extents);
337 vecp->i_len = ip->i_df.if_bytes; 338 vecp->i_len = ip->i_df.if_bytes;
338 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 339 vecp->i_type = XLOG_REG_TYPE_IEXT;
339 } else 340 } else
340#endif 341#endif
341 { 342 {
@@ -354,7 +355,7 @@ xfs_inode_item_format(
354 vecp->i_addr = (xfs_caddr_t)ext_buffer; 355 vecp->i_addr = (xfs_caddr_t)ext_buffer;
355 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 356 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
356 XFS_DATA_FORK); 357 XFS_DATA_FORK);
357 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 358 vecp->i_type = XLOG_REG_TYPE_IEXT;
358 } 359 }
359 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 360 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
360 iip->ili_format.ilf_dsize = vecp->i_len; 361 iip->ili_format.ilf_dsize = vecp->i_len;
@@ -372,7 +373,7 @@ xfs_inode_item_format(
372 ASSERT(ip->i_df.if_broot != NULL); 373 ASSERT(ip->i_df.if_broot != NULL);
373 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; 374 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
374 vecp->i_len = ip->i_df.if_broot_bytes; 375 vecp->i_len = ip->i_df.if_broot_bytes;
375 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT); 376 vecp->i_type = XLOG_REG_TYPE_IBROOT;
376 vecp++; 377 vecp++;
377 nvecs++; 378 nvecs++;
378 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; 379 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
@@ -398,7 +399,7 @@ xfs_inode_item_format(
398 ASSERT((ip->i_df.if_real_bytes == 0) || 399 ASSERT((ip->i_df.if_real_bytes == 0) ||
399 (ip->i_df.if_real_bytes == data_bytes)); 400 (ip->i_df.if_real_bytes == data_bytes));
400 vecp->i_len = (int)data_bytes; 401 vecp->i_len = (int)data_bytes;
401 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL); 402 vecp->i_type = XLOG_REG_TYPE_ILOCAL;
402 vecp++; 403 vecp++;
403 nvecs++; 404 nvecs++;
404 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 405 iip->ili_format.ilf_dsize = (unsigned)data_bytes;
@@ -476,7 +477,7 @@ xfs_inode_item_format(
476 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 477 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
477 XFS_ATTR_FORK); 478 XFS_ATTR_FORK);
478#endif 479#endif
479 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT); 480 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
480 iip->ili_format.ilf_asize = vecp->i_len; 481 iip->ili_format.ilf_asize = vecp->i_len;
481 vecp++; 482 vecp++;
482 nvecs++; 483 nvecs++;
@@ -491,7 +492,7 @@ xfs_inode_item_format(
491 ASSERT(ip->i_afp->if_broot != NULL); 492 ASSERT(ip->i_afp->if_broot != NULL);
492 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; 493 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
493 vecp->i_len = ip->i_afp->if_broot_bytes; 494 vecp->i_len = ip->i_afp->if_broot_bytes;
494 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT); 495 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
495 vecp++; 496 vecp++;
496 nvecs++; 497 nvecs++;
497 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; 498 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
@@ -515,7 +516,7 @@ xfs_inode_item_format(
515 ASSERT((ip->i_afp->if_real_bytes == 0) || 516 ASSERT((ip->i_afp->if_real_bytes == 0) ||
516 (ip->i_afp->if_real_bytes == data_bytes)); 517 (ip->i_afp->if_real_bytes == data_bytes));
517 vecp->i_len = (int)data_bytes; 518 vecp->i_len = (int)data_bytes;
518 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL); 519 vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL;
519 vecp++; 520 vecp++;
520 nvecs++; 521 nvecs++;
521 iip->ili_format.ilf_asize = (unsigned)data_bytes; 522 iip->ili_format.ilf_asize = (unsigned)data_bytes;
@@ -534,23 +535,23 @@ xfs_inode_item_format(
534 535
535/* 536/*
536 * This is called to pin the inode associated with the inode log 537 * This is called to pin the inode associated with the inode log
537 * item in memory so it cannot be written out. Do this by calling 538 * item in memory so it cannot be written out.
538 * xfs_ipin() to bump the pin count in the inode while holding the
539 * inode pin lock.
540 */ 539 */
541STATIC void 540STATIC void
542xfs_inode_item_pin( 541xfs_inode_item_pin(
543 xfs_inode_log_item_t *iip) 542 xfs_inode_log_item_t *iip)
544{ 543{
545 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
546 xfs_ipin(iip->ili_inode); 545
546 atomic_inc(&iip->ili_inode->i_pincount);
547} 547}
548 548
549 549
550/* 550/*
551 * This is called to unpin the inode associated with the inode log 551 * This is called to unpin the inode associated with the inode log
552 * item which was previously pinned with a call to xfs_inode_item_pin(). 552 * item which was previously pinned with a call to xfs_inode_item_pin().
553 * Just call xfs_iunpin() on the inode to do this. 553 *
554 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
554 */ 555 */
555/* ARGSUSED */ 556/* ARGSUSED */
556STATIC void 557STATIC void
@@ -558,7 +559,11 @@ xfs_inode_item_unpin(
558 xfs_inode_log_item_t *iip, 559 xfs_inode_log_item_t *iip,
559 int stale) 560 int stale)
560{ 561{
561 xfs_iunpin(iip->ili_inode); 562 struct xfs_inode *ip = iip->ili_inode;
563
564 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait);
562} 567}
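[Annotation] With xfs_ipin()/xfs_iunpin() gone, pinning is a bare atomic counter paired with a waitqueue; the wake_up() above services waiters blocked roughly like this (a sketch of the pattern only, since the reworked xfs_iunpin_wait() is not part of this hunk):

	void
	example_iunpin_wait(
		xfs_inode_t	*ip)
	{
		/* sleep until the final unpin drops i_pincount to zero */
		if (atomic_read(&ip->i_pincount))
			wait_event(ip->i_ipin_wait,
				   atomic_read(&ip->i_pincount) == 0);
	}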
563 568
564/* ARGSUSED */ 569/* ARGSUSED */
@@ -567,7 +572,7 @@ xfs_inode_item_unpin_remove(
567 xfs_inode_log_item_t *iip, 572 xfs_inode_log_item_t *iip,
568 xfs_trans_t *tp) 573 xfs_trans_t *tp)
569{ 574{
570 xfs_iunpin(iip->ili_inode); 575 xfs_inode_item_unpin(iip, 0);
571} 576}
572 577
573/* 578/*
@@ -601,33 +606,20 @@ xfs_inode_item_trylock(
601 606
602 if (!xfs_iflock_nowait(ip)) { 607 if (!xfs_iflock_nowait(ip)) {
603 /* 608 /*
604 * If someone else isn't already trying to push the inode 609 * The inode has already been flushed to the backing buffer;
605 * buffer, we get to do it. 610 * leave it locked in shared mode and the pushbuf routine will
611 * unlock it.
606 */ 612 */
607 if (iip->ili_pushbuf_flag == 0) { 613 return XFS_ITEM_PUSHBUF;
608 iip->ili_pushbuf_flag = 1;
609#ifdef DEBUG
610 iip->ili_push_owner = current_pid();
611#endif
612 /*
613 * Inode is left locked in shared mode.
614 * Pushbuf routine gets to unlock it.
615 */
616 return XFS_ITEM_PUSHBUF;
617 } else {
618 /*
619 * We hold the AIL lock, so we must specify the
620 * NONOTIFY flag so that we won't double trip.
621 */
622 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
623 return XFS_ITEM_FLUSHING;
624 }
625 /* NOTREACHED */
626 } 614 }
627 615
628 /* Stale items should force out the iclog */ 616 /* Stale items should force out the iclog */
629 if (ip->i_flags & XFS_ISTALE) { 617 if (ip->i_flags & XFS_ISTALE) {
630 xfs_ifunlock(ip); 618 xfs_ifunlock(ip);
619 /*
620 * We hold the AIL lock - notify the unlock routine of this
621 * so it doesn't try to get the lock again.
622 */
631 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); 623 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
632 return XFS_ITEM_PINNED; 624 return XFS_ITEM_PINNED;
633 } 625 }
@@ -745,11 +737,8 @@ xfs_inode_item_committed(
745 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK 737 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
746 * failed to get the inode flush lock but did get the inode locked SHARED. 738 * failed to get the inode flush lock but did get the inode locked SHARED.
747 * Here we're trying to see if the inode buffer is incore, and if so whether it's 739 * Here we're trying to see if the inode buffer is incore, and if so whether it's
748 * marked delayed write. If that's the case, we'll initiate a bawrite on that 740 * marked delayed write. If that's the case, we'll promote it and that will
749 * buffer to expedite the process. 741 * allow the caller to write the buffer by triggering the xfsbufd to run.
750 *
751 * We aren't holding the AIL lock (or the flush lock) when this gets called,
752 * so it is inherently race-y.
753 */ 742 */
754STATIC void 743STATIC void
755xfs_inode_item_pushbuf( 744xfs_inode_item_pushbuf(
@@ -758,80 +747,30 @@ xfs_inode_item_pushbuf(
758 xfs_inode_t *ip; 747 xfs_inode_t *ip;
759 xfs_mount_t *mp; 748 xfs_mount_t *mp;
760 xfs_buf_t *bp; 749 xfs_buf_t *bp;
761 uint dopush;
762 750
763 ip = iip->ili_inode; 751 ip = iip->ili_inode;
764
765 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 752 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
766 753
767 /* 754 /*
768 * The ili_pushbuf_flag keeps others from
769 * trying to duplicate our effort.
770 */
771 ASSERT(iip->ili_pushbuf_flag != 0);
772 ASSERT(iip->ili_push_owner == current_pid());
773
774 /*
775 * If a flush is not in progress anymore, chances are that the 755 * If a flush is not in progress anymore, chances are that the
776 * inode was taken off the AIL. So, just get out. 756 * inode was taken off the AIL. So, just get out.
777 */ 757 */
778 if (completion_done(&ip->i_flush) || 758 if (completion_done(&ip->i_flush) ||
779 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { 759 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
780 iip->ili_pushbuf_flag = 0;
781 xfs_iunlock(ip, XFS_ILOCK_SHARED); 760 xfs_iunlock(ip, XFS_ILOCK_SHARED);
782 return; 761 return;
783 } 762 }
784 763
785 mp = ip->i_mount; 764 mp = ip->i_mount;
786 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, 765 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
787 iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK); 766 iip->ili_format.ilf_len, XBF_TRYLOCK);
788 767
789 if (bp != NULL) {
790 if (XFS_BUF_ISDELAYWRITE(bp)) {
791 /*
792 * We were racing with iflush because we don't hold
793 * the AIL lock or the flush lock. However, at this point,
794 * we have the buffer, and we know that it's dirty.
795 * So, it's possible that iflush raced with us, and
796 * this item is already taken off the AIL.
797 * If not, we can flush it async.
798 */
799 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
800 !completion_done(&ip->i_flush));
801 iip->ili_pushbuf_flag = 0;
802 xfs_iunlock(ip, XFS_ILOCK_SHARED);
803 xfs_buftrace("INODE ITEM PUSH", bp);
804 if (XFS_BUF_ISPINNED(bp)) {
805 xfs_log_force(mp, (xfs_lsn_t)0,
806 XFS_LOG_FORCE);
807 }
808 if (dopush) {
809 int error;
810 error = xfs_bawrite(mp, bp);
811 if (error)
812 xfs_fs_cmn_err(CE_WARN, mp,
813 "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
814 error, iip, bp);
815 } else {
816 xfs_buf_relse(bp);
817 }
818 } else {
819 iip->ili_pushbuf_flag = 0;
820 xfs_iunlock(ip, XFS_ILOCK_SHARED);
821 xfs_buf_relse(bp);
822 }
823 return;
824 }
825 /*
826 * We have to be careful about resetting pushbuf flag too early (above).
827 * Even though in theory we can do it as soon as we have the buflock,
828 * we don't want others to be doing work needlessly. They'll come to
829 * this function thinking that pushing the buffer is their
830 * responsibility only to find that the buffer is still locked by
831 * another doing the same thing
832 */
833 iip->ili_pushbuf_flag = 0;
834 xfs_iunlock(ip, XFS_ILOCK_SHARED); 768 xfs_iunlock(ip, XFS_ILOCK_SHARED);
769 if (!bp)
770 return;
771 if (XFS_BUF_ISDELAYWRITE(bp))
772 xfs_buf_delwri_promote(bp);
773 xfs_buf_relse(bp);
835 return; 774 return;
836} 775}
837 776
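[Annotation] The ili_pushbuf_flag/ili_push_owner serialisation disappears because the AIL push no longer issues I/O itself: it only promotes an already-queued delayed-write buffer so that xfsbufd writes it on its next pass. The heart of the new path, with the invariant made explicit in comments:

	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
			iip->ili_format.ilf_len, XBF_TRYLOCK);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	if (!bp)
		return;			/* buffer not incore: nothing to push */
	if (XFS_BUF_ISDELAYWRITE(bp))
		xfs_buf_delwri_promote(bp);	/* reorder only, never write */
	xfs_buf_relse(bp);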
@@ -864,10 +803,14 @@ xfs_inode_item_push(
864 iip->ili_format.ilf_fields != 0); 803 iip->ili_format.ilf_fields != 0);
865 804
866 /* 805 /*
867 * Write out the inode. The completion routine ('iflush_done') will 806 * Push the inode to its backing buffer. This will not remove the
868 * pull it from the AIL, mark it clean, unlock the flush lock. 807 * inode from the AIL - a further push will be required to trigger a
 808 * buffer push. However, this allows all the dirty inodes to be pushed
 809 * to the buffer before it is pushed to disk. The buffer IO completion
 810 * will pull the inode from the AIL, mark it clean and unlock the flush
811 * lock.
869 */ 812 */
870 (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); 813 (void) xfs_iflush(ip, 0);
871 xfs_iunlock(ip, XFS_ILOCK_SHARED); 814 xfs_iunlock(ip, XFS_ILOCK_SHARED);
872 815
873 return; 816 return;
@@ -931,7 +874,6 @@ xfs_inode_item_init(
931 /* 874 /*
932 We have zeroed memory. No need ... 875 We have zeroed memory. No need ...
933 iip->ili_extents_buf = NULL; 876 iip->ili_extents_buf = NULL;
934 iip->ili_pushbuf_flag = 0;
935 */ 877 */
936 878
937 iip->ili_format.ilf_type = XFS_LI_INODE; 879 iip->ili_format.ilf_type = XFS_LI_INODE;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 65bae4c9b8bf..9a467958ecdd 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -127,7 +127,7 @@ static inline int xfs_ilog_fdata(int w)
127#ifdef __KERNEL__ 127#ifdef __KERNEL__
128 128
129struct xfs_buf; 129struct xfs_buf;
130struct xfs_bmbt_rec_64; 130struct xfs_bmbt_rec;
131struct xfs_inode; 131struct xfs_inode;
132struct xfs_mount; 132struct xfs_mount;
133 133
@@ -140,16 +140,10 @@ typedef struct xfs_inode_log_item {
140 unsigned short ili_flags; /* misc flags */ 140 unsigned short ili_flags; /* misc flags */
141 unsigned short ili_logged; /* flushed logged data */ 141 unsigned short ili_logged; /* flushed logged data */
142 unsigned int ili_last_fields; /* fields when flushed */ 142 unsigned int ili_last_fields; /* fields when flushed */
143 struct xfs_bmbt_rec_64 *ili_extents_buf; /* array of logged 143 struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
144 data exts */ 144 data exts */
145 struct xfs_bmbt_rec_64 *ili_aextents_buf; /* array of logged 145 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
146 attr exts */ 146 attr exts */
147 unsigned int ili_pushbuf_flag; /* one bit used in push_ail */
148
149#ifdef DEBUG
150 uint64_t ili_push_owner; /* one who sets pushbuf_flag
151 above gets to push the buf */
152#endif
153#ifdef XFS_TRANS_DEBUG 147#ifdef XFS_TRANS_DEBUG
154 int ili_root_size; 148 int ili_root_size;
155 char *ili_orig_root; 149 char *ili_orig_root;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 67ae5555a30a..0b65039951a0 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,72 +47,8 @@
47#include "xfs_trans_space.h" 47#include "xfs_trans_space.h"
48#include "xfs_utils.h" 48#include "xfs_utils.h"
49#include "xfs_iomap.h" 49#include "xfs_iomap.h"
50#include "xfs_trace.h"
50 51
51#if defined(XFS_RW_TRACE)
52void
53xfs_iomap_enter_trace(
54 int tag,
55 xfs_inode_t *ip,
56 xfs_off_t offset,
57 ssize_t count)
58{
59 if (!ip->i_rwtrace)
60 return;
61
62 ktrace_enter(ip->i_rwtrace,
63 (void *)((unsigned long)tag),
64 (void *)ip,
65 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
66 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
67 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
68 (void *)((unsigned long)(offset & 0xffffffff)),
69 (void *)((unsigned long)count),
70 (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)),
71 (void *)((unsigned long)(ip->i_new_size & 0xffffffff)),
72 (void *)((unsigned long)current_pid()),
73 (void *)NULL,
74 (void *)NULL,
75 (void *)NULL,
76 (void *)NULL,
77 (void *)NULL,
78 (void *)NULL);
79}
80
81void
82xfs_iomap_map_trace(
83 int tag,
84 xfs_inode_t *ip,
85 xfs_off_t offset,
86 ssize_t count,
87 xfs_iomap_t *iomapp,
88 xfs_bmbt_irec_t *imapp,
89 int flags)
90{
91 if (!ip->i_rwtrace)
92 return;
93
94 ktrace_enter(ip->i_rwtrace,
95 (void *)((unsigned long)tag),
96 (void *)ip,
97 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
98 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
99 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
100 (void *)((unsigned long)(offset & 0xffffffff)),
101 (void *)((unsigned long)count),
102 (void *)((unsigned long)flags),
103 (void *)((unsigned long)((iomapp->iomap_offset >> 32) & 0xffffffff)),
104 (void *)((unsigned long)(iomapp->iomap_offset & 0xffffffff)),
105 (void *)((unsigned long)(iomapp->iomap_delta)),
106 (void *)((unsigned long)(iomapp->iomap_bsize)),
107 (void *)((unsigned long)(iomapp->iomap_bn)),
108 (void *)(__psint_t)(imapp->br_startoff),
109 (void *)((unsigned long)(imapp->br_blockcount)),
110 (void *)(__psint_t)(imapp->br_startblock));
111}
112#else
113#define xfs_iomap_enter_trace(tag, io, offset, count)
114#define xfs_iomap_map_trace(tag, io, offset, count, iomapp, imapp, flags)
115#endif
116 52
117#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 53#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
118 << mp->m_writeio_log) 54 << mp->m_writeio_log)
@@ -187,21 +123,20 @@ xfs_iomap(
187 if (XFS_FORCED_SHUTDOWN(mp)) 123 if (XFS_FORCED_SHUTDOWN(mp))
188 return XFS_ERROR(EIO); 124 return XFS_ERROR(EIO);
189 125
126 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
127
190 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) { 128 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
191 case BMAPI_READ: 129 case BMAPI_READ:
192 xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, ip, offset, count);
193 lockmode = xfs_ilock_map_shared(ip); 130 lockmode = xfs_ilock_map_shared(ip);
194 bmapi_flags = XFS_BMAPI_ENTIRE; 131 bmapi_flags = XFS_BMAPI_ENTIRE;
195 break; 132 break;
196 case BMAPI_WRITE: 133 case BMAPI_WRITE:
197 xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, ip, offset, count);
198 lockmode = XFS_ILOCK_EXCL; 134 lockmode = XFS_ILOCK_EXCL;
199 if (flags & BMAPI_IGNSTATE) 135 if (flags & BMAPI_IGNSTATE)
200 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; 136 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
201 xfs_ilock(ip, lockmode); 137 xfs_ilock(ip, lockmode);
202 break; 138 break;
203 case BMAPI_ALLOCATE: 139 case BMAPI_ALLOCATE:
204 xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, ip, offset, count);
205 lockmode = XFS_ILOCK_SHARED; 140 lockmode = XFS_ILOCK_SHARED;
206 bmapi_flags = XFS_BMAPI_ENTIRE; 141 bmapi_flags = XFS_BMAPI_ENTIRE;
207 142
@@ -237,8 +172,7 @@ xfs_iomap(
237 if (nimaps && 172 if (nimaps &&
238 (imap.br_startblock != HOLESTARTBLOCK) && 173 (imap.br_startblock != HOLESTARTBLOCK) &&
239 (imap.br_startblock != DELAYSTARTBLOCK)) { 174 (imap.br_startblock != DELAYSTARTBLOCK)) {
240 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, 175 trace_xfs_iomap_found(ip, offset, count, flags, &imap);
241 offset, count, iomapp, &imap, flags);
242 break; 176 break;
243 } 177 }
244 178
@@ -250,8 +184,7 @@ xfs_iomap(
250 &imap, &nimaps); 184 &imap, &nimaps);
251 } 185 }
252 if (!error) { 186 if (!error) {
253 xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, ip, 187 trace_xfs_iomap_alloc(ip, offset, count, flags, &imap);
254 offset, count, iomapp, &imap, flags);
255 } 188 }
256 iomap_flags = IOMAP_NEW; 189 iomap_flags = IOMAP_NEW;
257 break; 190 break;
@@ -261,8 +194,7 @@ xfs_iomap(
261 lockmode = 0; 194 lockmode = 0;
262 195
263 if (nimaps && !isnullstartblock(imap.br_startblock)) { 196 if (nimaps && !isnullstartblock(imap.br_startblock)) {
264 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, 197 trace_xfs_iomap_found(ip, offset, count, flags, &imap);
265 offset, count, iomapp, &imap, flags);
266 break; 198 break;
267 } 199 }
268 200
@@ -623,8 +555,7 @@ retry:
623 * delalloc blocks and retry without EOF preallocation. 555 * delalloc blocks and retry without EOF preallocation.
624 */ 556 */
625 if (nimaps == 0) { 557 if (nimaps == 0) {
626 xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, 558 trace_xfs_delalloc_enospc(ip, offset, count);
627 ip, offset, count);
628 if (flushed) 559 if (flushed)
629 return XFS_ERROR(ENOSPC); 560 return XFS_ERROR(ENOSPC);
630 561
@@ -837,7 +768,7 @@ xfs_iomap_write_unwritten(
837 int committed; 768 int committed;
838 int error; 769 int error;
839 770
840 xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, ip, offset, count); 771 trace_xfs_unwritten_convert(ip, offset, count);
841 772
842 offset_fsb = XFS_B_TO_FSBT(mp, offset); 773 offset_fsb = XFS_B_TO_FSBT(mp, offset);
843 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); 774 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
@@ -860,8 +791,15 @@ xfs_iomap_write_unwritten(
860 * set up a transaction to convert the range of extents 791 * set up a transaction to convert the range of extents
861 * from unwritten to real. Do allocations in a loop until 792 * from unwritten to real. Do allocations in a loop until
862 * we have covered the range passed in. 793 * we have covered the range passed in.
794 *
795 * Note that we open code the transaction allocation here
796 * to pass KM_NOFS--we can't risk to recursing back into
797 * the filesystem here as we might be asked to write out
798 * the same inode that we complete here and might deadlock
799 * on the iolock.
863 */ 800 */
864 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); 801 xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
802 tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
865 tp->t_flags |= XFS_TRANS_RESERVE; 803 tp->t_flags |= XFS_TRANS_RESERVE;
866 error = xfs_trans_reserve(tp, resblks, 804 error = xfs_trans_reserve(tp, resblks,
867 XFS_WRITE_LOG_RES(mp), 0, 805 XFS_WRITE_LOG_RES(mp), 0,
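[Annotation] The open coding matters because xfs_trans_alloc() allocates with KM_SLEEP, which can recurse into the filesystem under memory pressure; since this path may be completing I/O on the very inode it would then be asked to write back, that recursion can deadlock on the iolock. Roughly what the bypassed wrapper does, assuming the conventional helper split (a sketch, not quoted from this patch):

	/* xfs_trans_alloc(), approximately: */
	xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);	/* block while frozen */
	return _xfs_trans_alloc(mp, type, KM_SLEEP);	/* may recurse into FS */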
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index fdcf7b82747f..174f29990991 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -43,6 +43,14 @@ typedef enum {
43 BMAPI_TRYLOCK = (1 << 7), /* non-blocking request */ 43 BMAPI_TRYLOCK = (1 << 7), /* non-blocking request */
44} bmapi_flags_t; 44} bmapi_flags_t;
45 45
46#define BMAPI_FLAGS \
47 { BMAPI_READ, "READ" }, \
48 { BMAPI_WRITE, "WRITE" }, \
49 { BMAPI_ALLOCATE, "ALLOCATE" }, \
50 { BMAPI_IGNSTATE, "IGNSTATE" }, \
51 { BMAPI_DIRECT, "DIRECT" }, \
52 { BMAPI_MMAP, "MMAP" }, \
53 { BMAPI_TRYLOCK, "TRYLOCK" }
46 54
47/* 55/*
48 * xfs_iomap_t: File system I/O map 56 * xfs_iomap_t: File system I/O map
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 62efab2f3839..b1b801e4a28e 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -106,6 +106,7 @@ xfs_bulkstat_one_iget(
106 buf->bs_dmevmask = dic->di_dmevmask; 106 buf->bs_dmevmask = dic->di_dmevmask;
107 buf->bs_dmstate = dic->di_dmstate; 107 buf->bs_dmstate = dic->di_dmstate;
108 buf->bs_aextents = dic->di_anextents; 108 buf->bs_aextents = dic->di_anextents;
109 buf->bs_forkoff = XFS_IFORK_BOFF(ip);
109 110
110 switch (dic->di_format) { 111 switch (dic->di_format) {
111 case XFS_DINODE_FMT_DEV: 112 case XFS_DINODE_FMT_DEV:
@@ -176,6 +177,7 @@ xfs_bulkstat_one_dinode(
176 buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask); 177 buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask);
177 buf->bs_dmstate = be16_to_cpu(dic->di_dmstate); 178 buf->bs_dmstate = be16_to_cpu(dic->di_dmstate);
178 buf->bs_aextents = be16_to_cpu(dic->di_anextents); 179 buf->bs_aextents = be16_to_cpu(dic->di_anextents);
180 buf->bs_forkoff = XFS_DFORK_BOFF(dic);
179 181
180 switch (dic->di_format) { 182 switch (dic->di_format) {
181 case XFS_DINODE_FMT_DEV: 183 case XFS_DINODE_FMT_DEV:
@@ -408,8 +410,10 @@ xfs_bulkstat(
408 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); 410 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
409 nimask = ~(nicluster - 1); 411 nimask = ~(nicluster - 1);
410 nbcluster = nicluster >> mp->m_sb.sb_inopblog; 412 nbcluster = nicluster >> mp->m_sb.sb_inopblog;
411 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4, 413 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
412 KM_SLEEP | KM_MAYFAIL | KM_LARGE); 414 if (!irbuf)
415 return ENOMEM;
416
413 nirbuf = irbsize / sizeof(*irbuf); 417 nirbuf = irbsize / sizeof(*irbuf);
414 418
415 /* 419 /*
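[Annotation] kmem_zalloc_greedy() loses its KM_* flags argument and may now fail outright, returning NULL; its memory must be released with kmem_free_large() (see the matching hunk below) rather than kmem_free(), presumably because the allocation is now vmalloc-backed. The new contract as a usage sketch:

	/* try for up to maxsize bytes, backing off toward minsize */
	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
	if (!irbuf)
		return ENOMEM;		/* allocation failure is now possible */
	/* ... use irbuf[0 .. irbsize / sizeof(*irbuf) - 1] ... */
	kmem_free_large(irbuf);		/* not kmem_free(): different allocator */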
@@ -420,9 +424,7 @@ xfs_bulkstat(
420 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { 424 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
421 cond_resched(); 425 cond_resched();
422 bp = NULL; 426 bp = NULL;
423 down_read(&mp->m_peraglock);
424 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 427 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
425 up_read(&mp->m_peraglock);
426 if (error) { 428 if (error) {
427 /* 429 /*
428 * Skip this allocation group and go to the next one. 430 * Skip this allocation group and go to the next one.
@@ -729,7 +731,7 @@ xfs_bulkstat(
729 /* 731 /*
730 * Done, we're either out of filesystem or space to put the data. 732 * Done, we're either out of filesystem or space to put the data.
731 */ 733 */
732 kmem_free(irbuf); 734 kmem_free_large(irbuf);
733 *ubcountp = ubelem; 735 *ubcountp = ubelem;
734 /* 736 /*
735 * Found some inodes, return them now and return the error next time. 737 * Found some inodes, return them now and return the error next time.
@@ -849,9 +851,7 @@ xfs_inumbers(
849 agbp = NULL; 851 agbp = NULL;
850 while (left > 0 && agno < mp->m_sb.sb_agcount) { 852 while (left > 0 && agno < mp->m_sb.sb_agcount) {
851 if (agbp == NULL) { 853 if (agbp == NULL) {
852 down_read(&mp->m_peraglock);
853 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 854 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
854 up_read(&mp->m_peraglock);
855 if (error) { 855 if (error) {
856 /* 856 /*
857 * If we can't read the AGI of this ag, 857 * If we can't read the AGI of this ag,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 9dbdff3ea484..e8fba92d7cd9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -40,6 +40,7 @@
40#include "xfs_dinode.h" 40#include "xfs_dinode.h"
41#include "xfs_inode.h" 41#include "xfs_inode.h"
42#include "xfs_rw.h" 42#include "xfs_rw.h"
43#include "xfs_trace.h"
43 44
44kmem_zone_t *xfs_log_ticket_zone; 45kmem_zone_t *xfs_log_ticket_zone;
45 46
@@ -49,7 +50,6 @@ kmem_zone_t *xfs_log_ticket_zone;
49 (off) += (bytes);} 50 (off) += (bytes);}
50 51
51/* Local miscellaneous function prototypes */ 52/* Local miscellaneous function prototypes */
52STATIC int xlog_bdstrat_cb(struct xfs_buf *);
53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, 53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
54 xlog_in_core_t **, xfs_lsn_t *); 54 xlog_in_core_t **, xfs_lsn_t *);
55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, 55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
@@ -60,7 +60,7 @@ STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
61STATIC void xlog_dealloc_log(xlog_t *log); 61STATIC void xlog_dealloc_log(xlog_t *log);
62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], 62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
63 int nentries, xfs_log_ticket_t tic, 63 int nentries, struct xlog_ticket *tic,
64 xfs_lsn_t *start_lsn, 64 xfs_lsn_t *start_lsn,
65 xlog_in_core_t **commit_iclog, 65 xlog_in_core_t **commit_iclog,
66 uint flags); 66 uint flags);
@@ -79,11 +79,6 @@ STATIC int xlog_state_release_iclog(xlog_t *log,
79STATIC void xlog_state_switch_iclogs(xlog_t *log, 79STATIC void xlog_state_switch_iclogs(xlog_t *log,
80 xlog_in_core_t *iclog, 80 xlog_in_core_t *iclog,
81 int eventual_size); 81 int eventual_size);
82STATIC int xlog_state_sync(xlog_t *log,
83 xfs_lsn_t lsn,
84 uint flags,
85 int *log_flushed);
86STATIC int xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed);
87STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); 82STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
88 83
89/* local functions to manipulate grant head */ 84/* local functions to manipulate grant head */
@@ -122,85 +117,6 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
122 117
123STATIC int xlog_iclogs_empty(xlog_t *log); 118STATIC int xlog_iclogs_empty(xlog_t *log);
124 119
125#if defined(XFS_LOG_TRACE)
126
127#define XLOG_TRACE_LOGGRANT_SIZE 2048
128#define XLOG_TRACE_ICLOG_SIZE 256
129
130void
131xlog_trace_loggrant_alloc(xlog_t *log)
132{
133 log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS);
134}
135
136void
137xlog_trace_loggrant_dealloc(xlog_t *log)
138{
139 ktrace_free(log->l_grant_trace);
140}
141
142void
143xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
144{
145 unsigned long cnts;
146
147 /* ticket counts are 1 byte each */
148 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
149
150 ktrace_enter(log->l_grant_trace,
151 (void *)tic,
152 (void *)log->l_reserve_headq,
153 (void *)log->l_write_headq,
154 (void *)((unsigned long)log->l_grant_reserve_cycle),
155 (void *)((unsigned long)log->l_grant_reserve_bytes),
156 (void *)((unsigned long)log->l_grant_write_cycle),
157 (void *)((unsigned long)log->l_grant_write_bytes),
158 (void *)((unsigned long)log->l_curr_cycle),
159 (void *)((unsigned long)log->l_curr_block),
160 (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)),
161 (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)),
162 (void *)string,
163 (void *)((unsigned long)tic->t_trans_type),
164 (void *)cnts,
165 (void *)((unsigned long)tic->t_curr_res),
166 (void *)((unsigned long)tic->t_unit_res));
167}
168
169void
170xlog_trace_iclog_alloc(xlog_in_core_t *iclog)
171{
172 iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS);
173}
174
175void
176xlog_trace_iclog_dealloc(xlog_in_core_t *iclog)
177{
178 ktrace_free(iclog->ic_trace);
179}
180
181void
182xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
183{
184 ktrace_enter(iclog->ic_trace,
185 (void *)((unsigned long)state),
186 (void *)((unsigned long)current_pid()),
187 (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
188 (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
189 (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
190 (void *)NULL, (void *)NULL);
191}
192#else
193
194#define xlog_trace_loggrant_alloc(log)
195#define xlog_trace_loggrant_dealloc(log)
196#define xlog_trace_loggrant(log,tic,string)
197
198#define xlog_trace_iclog_alloc(iclog)
199#define xlog_trace_iclog_dealloc(iclog)
200#define xlog_trace_iclog(iclog,state)
201
202#endif /* XFS_LOG_TRACE */
203
204 120
205static void 121static void
206xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 122xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
@@ -327,14 +243,14 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
327 * out when the next write occurs. 243 * out when the next write occurs.
328 */ 244 */
329xfs_lsn_t 245xfs_lsn_t
330xfs_log_done(xfs_mount_t *mp, 246xfs_log_done(
331 xfs_log_ticket_t xtic, 247 struct xfs_mount *mp,
332 void **iclog, 248 struct xlog_ticket *ticket,
333 uint flags) 249 struct xlog_in_core **iclog,
250 uint flags)
334{ 251{
335 xlog_t *log = mp->m_log; 252 struct log *log = mp->m_log;
336 xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic; 253 xfs_lsn_t lsn = 0;
337 xfs_lsn_t lsn = 0;
338 254
339 if (XLOG_FORCED_SHUTDOWN(log) || 255 if (XLOG_FORCED_SHUTDOWN(log) ||
340 /* 256 /*
@@ -342,8 +258,7 @@ xfs_log_done(xfs_mount_t *mp,
342 * If we get an error, just continue and give back the log ticket. 258 * If we get an error, just continue and give back the log ticket.
343 */ 259 */
344 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
345 (xlog_commit_record(mp, ticket, 261 (xlog_commit_record(mp, ticket, iclog, &lsn)))) {
346 (xlog_in_core_t **)iclog, &lsn)))) {
347 lsn = (xfs_lsn_t) -1; 262 lsn = (xfs_lsn_t) -1;
348 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 263 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
349 flags |= XFS_LOG_REL_PERM_RESERV; 264 flags |= XFS_LOG_REL_PERM_RESERV;
@@ -353,15 +268,17 @@ xfs_log_done(xfs_mount_t *mp,
353 268
354 if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || 269 if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
355 (flags & XFS_LOG_REL_PERM_RESERV)) { 270 (flags & XFS_LOG_REL_PERM_RESERV)) {
271 trace_xfs_log_done_nonperm(log, ticket);
272
356 /* 273 /*
357 * Release ticket if not permanent reservation or a specific 274 * Release ticket if not permanent reservation or a specific
358 * request has been made to release a permanent reservation. 275 * request has been made to release a permanent reservation.
359 */ 276 */
360 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
361 xlog_ungrant_log_space(log, ticket); 277 xlog_ungrant_log_space(log, ticket);
362 xfs_log_ticket_put(ticket); 278 xfs_log_ticket_put(ticket);
363 } else { 279 } else {
364 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); 280 trace_xfs_log_done_perm(log, ticket);
281
365 xlog_regrant_reserve_log_space(log, ticket); 282 xlog_regrant_reserve_log_space(log, ticket);
366 /* If this ticket was a permanent reservation and we aren't 283 /* If this ticket was a permanent reservation and we aren't
367 * trying to release it, reset the inited flags; so next time 284 * trying to release it, reset the inited flags; so next time
@@ -371,67 +288,8 @@ xfs_log_done(xfs_mount_t *mp,
371 } 288 }
372 289
373 return lsn; 290 return lsn;
374} /* xfs_log_done */
375
376
377/*
378 * Force the in-core log to disk. If flags == XFS_LOG_SYNC,
379 * the force is done synchronously.
380 *
381 * Asynchronous forces are implemented by setting the WANT_SYNC
382 * bit in the appropriate in-core log and then returning.
383 *
384 * Synchronous forces are implemented with a signal variable. All callers
385 * to force a given lsn to disk will wait on a the sv attached to the
386 * specific in-core log. When given in-core log finally completes its
387 * write to disk, that thread will wake up all threads waiting on the
388 * sv.
389 */
390int
391_xfs_log_force(
392 xfs_mount_t *mp,
393 xfs_lsn_t lsn,
394 uint flags,
395 int *log_flushed)
396{
397 xlog_t *log = mp->m_log;
398 int dummy;
399
400 if (!log_flushed)
401 log_flushed = &dummy;
402
403 ASSERT(flags & XFS_LOG_FORCE);
404
405 XFS_STATS_INC(xs_log_force);
406
407 if (log->l_flags & XLOG_IO_ERROR)
408 return XFS_ERROR(EIO);
409 if (lsn == 0)
410 return xlog_state_sync_all(log, flags, log_flushed);
411 else
412 return xlog_state_sync(log, lsn, flags, log_flushed);
413} /* _xfs_log_force */
414
415/*
416 * Wrapper for _xfs_log_force(), to be used when caller doesn't care
417 * about errors or whether the log was flushed or not. This is the normal
418 * interface to use when trying to unpin items or move the log forward.
419 */
420void
421xfs_log_force(
422 xfs_mount_t *mp,
423 xfs_lsn_t lsn,
424 uint flags)
425{
426 int error;
427 error = _xfs_log_force(mp, lsn, flags, NULL);
428 if (error) {
429 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
430 "error %d returned.", error);
431 }
432} 291}
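[Annotation] The removed pair is replaced by simpler entry points keyed only on flags (0 or XFS_LOG_SYNC), with separate *_lsn variants for forcing up to a target LSN. Reconstructed prototypes matching the call sites visible in this patch; hedged, since the header hunk is not quoted here:

	void xfs_log_force(struct xfs_mount *mp, uint flags);
	int  _xfs_log_force(struct xfs_mount *mp, uint flags, int *log_flushed);

	/* e.g. the pinned-buffer case in xfs_iflush() above now does: */
	if (XFS_BUF_ISPINNED(bp))
		xfs_log_force(mp, 0);		/* asynchronous force of all */

	/* while unmount forces everything synchronously: */
	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);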
433 292
434
435/* 293/*
436 * Attaches a new iclog I/O completion callback routine during 294 * Attaches a new iclog I/O completion callback routine during
437 * transaction commit. If the log is in error state, a non-zero 295 * transaction commit. If the log is in error state, a non-zero
@@ -439,11 +297,11 @@ xfs_log_force(
439 * executing the callback at an appropriate time. 297 * executing the callback at an appropriate time.
440 */ 298 */
441int 299int
442xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ 300xfs_log_notify(
443 void *iclog_hndl, /* iclog to hang callback off */ 301 struct xfs_mount *mp,
444 xfs_log_callback_t *cb) 302 struct xlog_in_core *iclog,
303 xfs_log_callback_t *cb)
445{ 304{
446 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
447 int abortflg; 305 int abortflg;
448 306
449 spin_lock(&iclog->ic_callback_lock); 307 spin_lock(&iclog->ic_callback_lock);
@@ -457,16 +315,14 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
457 } 315 }
458 spin_unlock(&iclog->ic_callback_lock); 316 spin_unlock(&iclog->ic_callback_lock);
459 return abortflg; 317 return abortflg;
460} /* xfs_log_notify */ 318}
461 319
462int 320int
463xfs_log_release_iclog(xfs_mount_t *mp, 321xfs_log_release_iclog(
464 void *iclog_hndl) 322 struct xfs_mount *mp,
323 struct xlog_in_core *iclog)
465{ 324{
466 xlog_t *log = mp->m_log; 325 if (xlog_state_release_iclog(mp->m_log, iclog)) {
467 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
468
469 if (xlog_state_release_iclog(log, iclog)) {
470 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 326 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
471 return EIO; 327 return EIO;
472 } 328 }
@@ -485,17 +341,18 @@ xfs_log_release_iclog(xfs_mount_t *mp,
485 * reservation, we prevent over allocation problems. 341 * reservation, we prevent over allocation problems.
486 */ 342 */
487int 343int
488xfs_log_reserve(xfs_mount_t *mp, 344xfs_log_reserve(
489 int unit_bytes, 345 struct xfs_mount *mp,
490 int cnt, 346 int unit_bytes,
491 xfs_log_ticket_t *ticket, 347 int cnt,
492 __uint8_t client, 348 struct xlog_ticket **ticket,
493 uint flags, 349 __uint8_t client,
494 uint t_type) 350 uint flags,
351 uint t_type)
495{ 352{
496 xlog_t *log = mp->m_log; 353 struct log *log = mp->m_log;
497 xlog_ticket_t *internal_ticket; 354 struct xlog_ticket *internal_ticket;
498 int retval = 0; 355 int retval = 0;
499 356
500 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); 357 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
501 ASSERT((flags & XFS_LOG_NOSLEEP) == 0); 358 ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
@@ -505,10 +362,13 @@ xfs_log_reserve(xfs_mount_t *mp,
505 362
506 XFS_STATS_INC(xs_try_logspace); 363 XFS_STATS_INC(xs_try_logspace);
507 364
365
508 if (*ticket != NULL) { 366 if (*ticket != NULL) {
509 ASSERT(flags & XFS_LOG_PERM_RESERV); 367 ASSERT(flags & XFS_LOG_PERM_RESERV);
510 internal_ticket = (xlog_ticket_t *)*ticket; 368 internal_ticket = *ticket;
511 xlog_trace_loggrant(log, internal_ticket, "xfs_log_reserve: existing ticket (permanent trans)"); 369
370 trace_xfs_log_reserve(log, internal_ticket);
371
512 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 372 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
513 retval = xlog_regrant_write_log_space(log, internal_ticket); 373 retval = xlog_regrant_write_log_space(log, internal_ticket);
514 } else { 374 } else {
@@ -519,10 +379,9 @@ xfs_log_reserve(xfs_mount_t *mp,
519 return XFS_ERROR(ENOMEM); 379 return XFS_ERROR(ENOMEM);
520 internal_ticket->t_trans_type = t_type; 380 internal_ticket->t_trans_type = t_type;
521 *ticket = internal_ticket; 381 *ticket = internal_ticket;
522 xlog_trace_loggrant(log, internal_ticket, 382
523 (internal_ticket->t_flags & XLOG_TIC_PERM_RESERV) ? 383 trace_xfs_log_reserve(log, internal_ticket);
524 "xfs_log_reserve: create new ticket (permanent trans)" : 384
525 "xfs_log_reserve: create new ticket");
526 xlog_grant_push_ail(mp, 385 xlog_grant_push_ail(mp,
527 (internal_ticket->t_unit_res * 386 (internal_ticket->t_unit_res *
528 internal_ticket->t_cnt)); 387 internal_ticket->t_cnt));
@@ -658,7 +517,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
658 xlog_in_core_t *first_iclog; 517 xlog_in_core_t *first_iclog;
659#endif 518#endif
660 xfs_log_iovec_t reg[1]; 519 xfs_log_iovec_t reg[1];
661 xfs_log_ticket_t tic = NULL; 520 xlog_ticket_t *tic = NULL;
662 xfs_lsn_t lsn; 521 xfs_lsn_t lsn;
663 int error; 522 int error;
664 523
@@ -676,7 +535,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
676 if (mp->m_flags & XFS_MOUNT_RDONLY) 535 if (mp->m_flags & XFS_MOUNT_RDONLY)
677 return 0; 536 return 0;
678 537
679 error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL); 538 error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
680 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); 539 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
681 540
682#ifdef DEBUG 541#ifdef DEBUG
@@ -692,7 +551,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
692 if (! (XLOG_FORCED_SHUTDOWN(log))) { 551 if (! (XLOG_FORCED_SHUTDOWN(log))) {
693 reg[0].i_addr = (void*)&magic; 552 reg[0].i_addr = (void*)&magic;
694 reg[0].i_len = sizeof(magic); 553 reg[0].i_len = sizeof(magic);
695 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT); 554 reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
696 555
697 error = xfs_log_reserve(mp, 600, 1, &tic, 556 error = xfs_log_reserve(mp, 600, 1, &tic,
698 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); 557 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
@@ -734,7 +593,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
734 spin_unlock(&log->l_icloglock); 593 spin_unlock(&log->l_icloglock);
735 } 594 }
736 if (tic) { 595 if (tic) {
737 xlog_trace_loggrant(log, tic, "unmount rec"); 596 trace_xfs_log_umount_write(log, tic);
738 xlog_ungrant_log_space(log, tic); 597 xlog_ungrant_log_space(log, tic);
739 xfs_log_ticket_put(tic); 598 xfs_log_ticket_put(tic);
740 } 599 }
@@ -795,24 +654,24 @@ xfs_log_unmount(xfs_mount_t *mp)
795 * transaction occur with one call to xfs_log_write(). 654 * transaction occur with one call to xfs_log_write().
796 */ 655 */
797int 656int
798xfs_log_write(xfs_mount_t * mp, 657xfs_log_write(
799 xfs_log_iovec_t reg[], 658 struct xfs_mount *mp,
800 int nentries, 659 struct xfs_log_iovec reg[],
801 xfs_log_ticket_t tic, 660 int nentries,
802 xfs_lsn_t *start_lsn) 661 struct xlog_ticket *tic,
662 xfs_lsn_t *start_lsn)
803{ 663{
804 int error; 664 struct log *log = mp->m_log;
805 xlog_t *log = mp->m_log; 665 int error;
806 666
807 if (XLOG_FORCED_SHUTDOWN(log)) 667 if (XLOG_FORCED_SHUTDOWN(log))
808 return XFS_ERROR(EIO); 668 return XFS_ERROR(EIO);
809 669
810 if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) { 670 error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0);
671 if (error)
811 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 672 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
812 }
813 return error; 673 return error;
814} /* xfs_log_write */ 674}
815
816 675
817void 676void
818xfs_log_move_tail(xfs_mount_t *mp, 677xfs_log_move_tail(xfs_mount_t *mp,
@@ -1030,7 +889,6 @@ xlog_iodone(xfs_buf_t *bp)
1030 xfs_fs_cmn_err(CE_WARN, l->l_mp, 889 xfs_fs_cmn_err(CE_WARN, l->l_mp,
1031 "xlog_iodone: Barriers are no longer supported" 890 "xlog_iodone: Barriers are no longer supported"
1032 " by device. Disabling barriers\n"); 891 " by device. Disabling barriers\n");
1033 xfs_buftrace("XLOG_IODONE BARRIERS OFF", bp);
1034 } 892 }
1035 893
1036 /* 894 /*
@@ -1063,38 +921,6 @@ xlog_iodone(xfs_buf_t *bp)
1063} /* xlog_iodone */ 921} /* xlog_iodone */
1064 922
1065/* 923/*
1066 * The bdstrat callback function for log bufs. This gives us a central
1067 * place to trap bufs in case we get hit by a log I/O error and need to
1068 * shutdown. Actually, in practice, even when we didn't get a log error,
1069 * we transition the iclogs to IOERROR state *after* flushing all existing
1070 * iclogs to disk. This is because we don't want any more new transactions to be
1071 * started or completed afterwards.
1072 */
1073STATIC int
1074xlog_bdstrat_cb(struct xfs_buf *bp)
1075{
1076 xlog_in_core_t *iclog;
1077
1078 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
1079
1080 if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
1081 /* note for irix bstrat will need struct bdevsw passed
1082 * Fix the following macro if the code ever is merged
1083 */
1084 XFS_bdstrat(bp);
1085 return 0;
1086 }
1087
1088 xfs_buftrace("XLOG__BDSTRAT IOERROR", bp);
1089 XFS_BUF_ERROR(bp, EIO);
1090 XFS_BUF_STALE(bp);
1091 xfs_biodone(bp);
1092 return XFS_ERROR(EIO);
1093
1094
1095}
1096
1097/*
1098 * Return size of each in-core log record buffer. 924 * Return size of each in-core log record buffer.
1099 * 925 *
1100 * All machines get 8 x 32kB buffers by default, unless tuned otherwise. 926 * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
@@ -1236,7 +1062,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1236 if (!bp) 1062 if (!bp)
1237 goto out_free_log; 1063 goto out_free_log;
1238 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); 1064 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1239 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1240 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1065 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1241 ASSERT(XFS_BUF_ISBUSY(bp)); 1066 ASSERT(XFS_BUF_ISBUSY(bp));
1242 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 1067 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
@@ -1246,7 +1071,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1246 spin_lock_init(&log->l_grant_lock); 1071 spin_lock_init(&log->l_grant_lock);
1247 sv_init(&log->l_flush_wait, 0, "flush_wait"); 1072 sv_init(&log->l_flush_wait, 0, "flush_wait");
1248 1073
1249 xlog_trace_loggrant_alloc(log);
1250 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1074 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1251 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1075 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
1252 1076
@@ -1275,7 +1099,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1275 if (!XFS_BUF_CPSEMA(bp)) 1099 if (!XFS_BUF_CPSEMA(bp))
1276 ASSERT(0); 1100 ASSERT(0);
1277 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); 1101 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1278 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1279 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1102 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1280 iclog->ic_bp = bp; 1103 iclog->ic_bp = bp;
1281 iclog->ic_data = bp->b_addr; 1104 iclog->ic_data = bp->b_addr;
@@ -1305,8 +1128,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1305 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1128 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
1306 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1129 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
1307 1130
1308 xlog_trace_iclog_alloc(iclog);
1309
1310 iclogp = &iclog->ic_next; 1131 iclogp = &iclog->ic_next;
1311 } 1132 }
1312 *iclogp = log->l_iclog; /* complete ring */ 1133 *iclogp = log->l_iclog; /* complete ring */
@@ -1321,13 +1142,11 @@ out_free_iclog:
1321 sv_destroy(&iclog->ic_force_wait); 1142 sv_destroy(&iclog->ic_force_wait);
1322 sv_destroy(&iclog->ic_write_wait); 1143 sv_destroy(&iclog->ic_write_wait);
1323 xfs_buf_free(iclog->ic_bp); 1144 xfs_buf_free(iclog->ic_bp);
1324 xlog_trace_iclog_dealloc(iclog);
1325 } 1145 }
1326 kmem_free(iclog); 1146 kmem_free(iclog);
1327 } 1147 }
1328 spinlock_destroy(&log->l_icloglock); 1148 spinlock_destroy(&log->l_icloglock);
1329 spinlock_destroy(&log->l_grant_lock); 1149 spinlock_destroy(&log->l_grant_lock);
1330 xlog_trace_loggrant_dealloc(log);
1331 xfs_buf_free(log->l_xbuf); 1150 xfs_buf_free(log->l_xbuf);
1332out_free_log: 1151out_free_log:
1333 kmem_free(log); 1152 kmem_free(log);
@@ -1351,7 +1170,7 @@ xlog_commit_record(xfs_mount_t *mp,
1351 1170
1352 reg[0].i_addr = NULL; 1171 reg[0].i_addr = NULL;
1353 reg[0].i_len = 0; 1172 reg[0].i_len = 0;
1354 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_COMMIT); 1173 reg[0].i_type = XLOG_REG_TYPE_COMMIT;
1355 1174
1356 ASSERT_ALWAYS(iclog); 1175 ASSERT_ALWAYS(iclog);
1357 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, 1176 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
@@ -1426,6 +1245,37 @@ xlog_grant_push_ail(xfs_mount_t *mp,
1426 xfs_trans_ail_push(log->l_ailp, threshold_lsn); 1245 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1427} /* xlog_grant_push_ail */ 1246} /* xlog_grant_push_ail */
1428 1247
1248/*
1249 * The bdstrat callback function for log bufs. This gives us a central
1250 * place to trap bufs in case we get hit by a log I/O error and need to
1251 * shutdown. Actually, in practice, even when we didn't get a log error,
1252 * we transition the iclogs to IOERROR state *after* flushing all existing
 1253 * iclogs to disk. This is because we don't want any more new transactions to be
1254 * started or completed afterwards.
1255 */
1256STATIC int
1257xlog_bdstrat(
1258 struct xfs_buf *bp)
1259{
1260 struct xlog_in_core *iclog;
1261
1262 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
1263 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1264 XFS_BUF_ERROR(bp, EIO);
1265 XFS_BUF_STALE(bp);
1266 xfs_biodone(bp);
1267 /*
1268 * It would seem logical to return EIO here, but we rely on
1269 * the log state machine to propagate I/O errors instead of
1270 * doing it here.
1271 */
1272 return 0;
1273 }
1274
1275 bp->b_flags |= _XBF_RUN_QUEUES;
1276 xfs_buf_iorequest(bp);
1277 return 0;
1278}
1429 1279
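The reintroduced xlog_bdstrat() returns 0 even after failing the buffer: as its comment says, the error is propagated by completing the buffer with EIO and letting the log state machine react, not through the return value. A self-contained userspace analogue of that gate pattern (all names hypothetical):

#include <errno.h>
#include <stdbool.h>

struct buf { int error; bool done; };

static int submit(struct buf *bp, bool shutdown)
{
	if (shutdown) {
		bp->error = EIO;	/* fail the buffer ... */
		bp->done = true;	/* ... and complete it immediately */
		return 0;		/* the completion state carries the error */
	}
	/* queue the real I/O here */
	return 0;
}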
1430/* 1280/*
1431 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous 1281 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
@@ -1524,6 +1374,7 @@ xlog_sync(xlog_t *log,
1524 XFS_BUF_ZEROFLAGS(bp); 1374 XFS_BUF_ZEROFLAGS(bp);
1525 XFS_BUF_BUSY(bp); 1375 XFS_BUF_BUSY(bp);
1526 XFS_BUF_ASYNC(bp); 1376 XFS_BUF_ASYNC(bp);
1377 bp->b_flags |= XBF_LOG_BUFFER;
1527 /* 1378 /*
1528 * Do an ordered write for the log block. 1379 * Do an ordered write for the log block.
1529 * It's unnecessary to flush the first split block in the log wrap case. 1380 * It's unnecessary to flush the first split block in the log wrap case.
@@ -1544,7 +1395,7 @@ xlog_sync(xlog_t *log,
1544 */ 1395 */
1545 XFS_BUF_WRITE(bp); 1396 XFS_BUF_WRITE(bp);
1546 1397
1547 if ((error = XFS_bwrite(bp))) { 1398 if ((error = xlog_bdstrat(bp))) {
1548 xfs_ioerror_alert("xlog_sync", log->l_mp, bp, 1399 xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
1549 XFS_BUF_ADDR(bp)); 1400 XFS_BUF_ADDR(bp));
1550 return error; 1401 return error;
@@ -1561,6 +1412,7 @@ xlog_sync(xlog_t *log,
1561 XFS_BUF_ZEROFLAGS(bp); 1412 XFS_BUF_ZEROFLAGS(bp);
1562 XFS_BUF_BUSY(bp); 1413 XFS_BUF_BUSY(bp);
1563 XFS_BUF_ASYNC(bp); 1414 XFS_BUF_ASYNC(bp);
1415 bp->b_flags |= XBF_LOG_BUFFER;
1564 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1416 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1565 XFS_BUF_ORDERED(bp); 1417 XFS_BUF_ORDERED(bp);
1566 dptr = XFS_BUF_PTR(bp); 1418 dptr = XFS_BUF_PTR(bp);
@@ -1583,7 +1435,7 @@ xlog_sync(xlog_t *log,
1583 /* account for internal log which doesn't start at block #0 */ 1435 /* account for internal log which doesn't start at block #0 */
1584 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); 1436 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1585 XFS_BUF_WRITE(bp); 1437 XFS_BUF_WRITE(bp);
1586 if ((error = XFS_bwrite(bp))) { 1438 if ((error = xlog_bdstrat(bp))) {
1587 xfs_ioerror_alert("xlog_sync (split)", log->l_mp, 1439 xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
1588 bp, XFS_BUF_ADDR(bp)); 1440 bp, XFS_BUF_ADDR(bp));
1589 return error; 1441 return error;
@@ -1607,7 +1459,6 @@ xlog_dealloc_log(xlog_t *log)
1607 sv_destroy(&iclog->ic_force_wait); 1459 sv_destroy(&iclog->ic_force_wait);
1608 sv_destroy(&iclog->ic_write_wait); 1460 sv_destroy(&iclog->ic_write_wait);
1609 xfs_buf_free(iclog->ic_bp); 1461 xfs_buf_free(iclog->ic_bp);
1610 xlog_trace_iclog_dealloc(iclog);
1611 next_iclog = iclog->ic_next; 1462 next_iclog = iclog->ic_next;
1612 kmem_free(iclog); 1463 kmem_free(iclog);
1613 iclog = next_iclog; 1464 iclog = next_iclog;
@@ -1616,7 +1467,6 @@ xlog_dealloc_log(xlog_t *log)
1616 spinlock_destroy(&log->l_grant_lock); 1467 spinlock_destroy(&log->l_grant_lock);
1617 1468
1618 xfs_buf_free(log->l_xbuf); 1469 xfs_buf_free(log->l_xbuf);
1619 xlog_trace_loggrant_dealloc(log);
1620 log->l_mp->m_log = NULL; 1470 log->l_mp->m_log = NULL;
1621 kmem_free(log); 1471 kmem_free(log);
1622} /* xlog_dealloc_log */ 1472} /* xlog_dealloc_log */
@@ -1790,16 +1640,16 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1790 * bytes have been written out. 1640 * bytes have been written out.
1791 */ 1641 */
1792STATIC int 1642STATIC int
1793xlog_write(xfs_mount_t * mp, 1643xlog_write(
1794 xfs_log_iovec_t reg[], 1644 struct xfs_mount *mp,
1795 int nentries, 1645 struct xfs_log_iovec reg[],
1796 xfs_log_ticket_t tic, 1646 int nentries,
1797 xfs_lsn_t *start_lsn, 1647 struct xlog_ticket *ticket,
1798 xlog_in_core_t **commit_iclog, 1648 xfs_lsn_t *start_lsn,
1799 uint flags) 1649 struct xlog_in_core **commit_iclog,
1650 uint flags)
1800{ 1651{
1801 xlog_t *log = mp->m_log; 1652 xlog_t *log = mp->m_log;
1802 xlog_ticket_t *ticket = (xlog_ticket_t *)tic;
1803 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ 1653 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */
1804 xlog_op_header_t *logop_head; /* ptr to log operation header */ 1654 xlog_op_header_t *logop_head; /* ptr to log operation header */
1805 __psint_t ptr; /* copy address into data region */ 1655 __psint_t ptr; /* copy address into data region */
@@ -1913,7 +1763,7 @@ xlog_write(xfs_mount_t * mp,
1913 default: 1763 default:
1914 xfs_fs_cmn_err(CE_WARN, mp, 1764 xfs_fs_cmn_err(CE_WARN, mp,
1915 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1765 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1916 logop_head->oh_clientid, tic); 1766 logop_head->oh_clientid, ticket);
1917 return XFS_ERROR(EIO); 1767 return XFS_ERROR(EIO);
1918 } 1768 }
1919 1769
@@ -2414,7 +2264,6 @@ restart:
2414 2264
2415 iclog = log->l_iclog; 2265 iclog = log->l_iclog;
2416 if (iclog->ic_state != XLOG_STATE_ACTIVE) { 2266 if (iclog->ic_state != XLOG_STATE_ACTIVE) {
2417 xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH);
2418 XFS_STATS_INC(xs_log_noiclogs); 2267 XFS_STATS_INC(xs_log_noiclogs);
2419 2268
2420 /* Wait for log writes to have flushed */ 2269 /* Wait for log writes to have flushed */
@@ -2520,13 +2369,15 @@ xlog_grant_log_space(xlog_t *log,
2520 2369
2521 /* Is there space or do we need to sleep? */ 2370 /* Is there space or do we need to sleep? */
2522 spin_lock(&log->l_grant_lock); 2371 spin_lock(&log->l_grant_lock);
2523 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter"); 2372
2373 trace_xfs_log_grant_enter(log, tic);
2524 2374
2525 /* something is already sleeping; insert new transaction at end */ 2375 /* something is already sleeping; insert new transaction at end */
2526 if (log->l_reserve_headq) { 2376 if (log->l_reserve_headq) {
2527 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2377 xlog_ins_ticketq(&log->l_reserve_headq, tic);
2528 xlog_trace_loggrant(log, tic, 2378
2529 "xlog_grant_log_space: sleep 1"); 2379 trace_xfs_log_grant_sleep1(log, tic);
2380
2530 /* 2381 /*
2531 * Gotta check this before going to sleep, while we're 2382 * Gotta check this before going to sleep, while we're
2532 * holding the grant lock. 2383 * holding the grant lock.
@@ -2540,8 +2391,7 @@ xlog_grant_log_space(xlog_t *log,
2540 * If we got an error, and the filesystem is shutting down, 2391 * If we got an error, and the filesystem is shutting down,
2541 * we'll catch it down below. So just continue... 2392 * we'll catch it down below. So just continue...
2542 */ 2393 */
2543 xlog_trace_loggrant(log, tic, 2394 trace_xfs_log_grant_wake1(log, tic);
2544 "xlog_grant_log_space: wake 1");
2545 spin_lock(&log->l_grant_lock); 2395 spin_lock(&log->l_grant_lock);
2546 } 2396 }
2547 if (tic->t_flags & XFS_LOG_PERM_RESERV) 2397 if (tic->t_flags & XFS_LOG_PERM_RESERV)
@@ -2558,8 +2408,9 @@ redo:
2558 if (free_bytes < need_bytes) { 2408 if (free_bytes < need_bytes) {
2559 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2409 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2560 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2410 xlog_ins_ticketq(&log->l_reserve_headq, tic);
2561 xlog_trace_loggrant(log, tic, 2411
2562 "xlog_grant_log_space: sleep 2"); 2412 trace_xfs_log_grant_sleep2(log, tic);
2413
2563 spin_unlock(&log->l_grant_lock); 2414 spin_unlock(&log->l_grant_lock);
2564 xlog_grant_push_ail(log->l_mp, need_bytes); 2415 xlog_grant_push_ail(log->l_mp, need_bytes);
2565 spin_lock(&log->l_grant_lock); 2416 spin_lock(&log->l_grant_lock);
@@ -2571,8 +2422,8 @@ redo:
2571 if (XLOG_FORCED_SHUTDOWN(log)) 2422 if (XLOG_FORCED_SHUTDOWN(log))
2572 goto error_return; 2423 goto error_return;
2573 2424
2574 xlog_trace_loggrant(log, tic, 2425 trace_xfs_log_grant_wake2(log, tic);
2575 "xlog_grant_log_space: wake 2"); 2426
2576 goto redo; 2427 goto redo;
2577 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2428 } else if (tic->t_flags & XLOG_TIC_IN_Q)
2578 xlog_del_ticketq(&log->l_reserve_headq, tic); 2429 xlog_del_ticketq(&log->l_reserve_headq, tic);
@@ -2592,7 +2443,7 @@ redo:
2592 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); 2443 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2593 } 2444 }
2594#endif 2445#endif
2595 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit"); 2446 trace_xfs_log_grant_exit(log, tic);
2596 xlog_verify_grant_head(log, 1); 2447 xlog_verify_grant_head(log, 1);
2597 spin_unlock(&log->l_grant_lock); 2448 spin_unlock(&log->l_grant_lock);
2598 return 0; 2449 return 0;
@@ -2600,7 +2451,9 @@ redo:
2600 error_return: 2451 error_return:
2601 if (tic->t_flags & XLOG_TIC_IN_Q) 2452 if (tic->t_flags & XLOG_TIC_IN_Q)
2602 xlog_del_ticketq(&log->l_reserve_headq, tic); 2453 xlog_del_ticketq(&log->l_reserve_headq, tic);
2603 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret"); 2454
2455 trace_xfs_log_grant_error(log, tic);
2456
2604 /* 2457 /*
2605 * If we are failing, make sure the ticket doesn't have any 2458 * If we are failing, make sure the ticket doesn't have any
2606 * current reservations. We don't want to add this back when 2459 * current reservations. We don't want to add this back when
@@ -2640,7 +2493,8 @@ xlog_regrant_write_log_space(xlog_t *log,
2640#endif 2493#endif
2641 2494
2642 spin_lock(&log->l_grant_lock); 2495 spin_lock(&log->l_grant_lock);
2643 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter"); 2496
2497 trace_xfs_log_regrant_write_enter(log, tic);
2644 2498
2645 if (XLOG_FORCED_SHUTDOWN(log)) 2499 if (XLOG_FORCED_SHUTDOWN(log))
2646 goto error_return; 2500 goto error_return;
@@ -2669,8 +2523,8 @@ xlog_regrant_write_log_space(xlog_t *log,
2669 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2523 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2670 xlog_ins_ticketq(&log->l_write_headq, tic); 2524 xlog_ins_ticketq(&log->l_write_headq, tic);
2671 2525
2672 xlog_trace_loggrant(log, tic, 2526 trace_xfs_log_regrant_write_sleep1(log, tic);
2673 "xlog_regrant_write_log_space: sleep 1"); 2527
2674 spin_unlock(&log->l_grant_lock); 2528 spin_unlock(&log->l_grant_lock);
2675 xlog_grant_push_ail(log->l_mp, need_bytes); 2529 xlog_grant_push_ail(log->l_mp, need_bytes);
2676 spin_lock(&log->l_grant_lock); 2530 spin_lock(&log->l_grant_lock);
@@ -2685,8 +2539,7 @@ xlog_regrant_write_log_space(xlog_t *log,
2685 if (XLOG_FORCED_SHUTDOWN(log)) 2539 if (XLOG_FORCED_SHUTDOWN(log))
2686 goto error_return; 2540 goto error_return;
2687 2541
2688 xlog_trace_loggrant(log, tic, 2542 trace_xfs_log_regrant_write_wake1(log, tic);
2689 "xlog_regrant_write_log_space: wake 1");
2690 } 2543 }
2691 } 2544 }
2692 2545
@@ -2704,6 +2557,8 @@ redo:
2704 spin_lock(&log->l_grant_lock); 2557 spin_lock(&log->l_grant_lock);
2705 2558
2706 XFS_STATS_INC(xs_sleep_logspace); 2559 XFS_STATS_INC(xs_sleep_logspace);
2560 trace_xfs_log_regrant_write_sleep2(log, tic);
2561
2707 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2562 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2708 2563
2709 /* If we're shutting down, this tic is already off the queue */ 2564 /* If we're shutting down, this tic is already off the queue */
@@ -2711,8 +2566,7 @@ redo:
2711 if (XLOG_FORCED_SHUTDOWN(log)) 2566 if (XLOG_FORCED_SHUTDOWN(log))
2712 goto error_return; 2567 goto error_return;
2713 2568
2714 xlog_trace_loggrant(log, tic, 2569 trace_xfs_log_regrant_write_wake2(log, tic);
2715 "xlog_regrant_write_log_space: wake 2");
2716 goto redo; 2570 goto redo;
2717 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2571 } else if (tic->t_flags & XLOG_TIC_IN_Q)
2718 xlog_del_ticketq(&log->l_write_headq, tic); 2572 xlog_del_ticketq(&log->l_write_headq, tic);
@@ -2727,7 +2581,8 @@ redo:
2727 } 2581 }
2728#endif 2582#endif
2729 2583
2730 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit"); 2584 trace_xfs_log_regrant_write_exit(log, tic);
2585
2731 xlog_verify_grant_head(log, 1); 2586 xlog_verify_grant_head(log, 1);
2732 spin_unlock(&log->l_grant_lock); 2587 spin_unlock(&log->l_grant_lock);
2733 return 0; 2588 return 0;
@@ -2736,7 +2591,9 @@ redo:
2736 error_return: 2591 error_return:
2737 if (tic->t_flags & XLOG_TIC_IN_Q) 2592 if (tic->t_flags & XLOG_TIC_IN_Q)
2738 xlog_del_ticketq(&log->l_reserve_headq, tic); 2593 xlog_del_ticketq(&log->l_reserve_headq, tic);
2739 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret"); 2594
2595 trace_xfs_log_regrant_write_error(log, tic);
2596
2740 /* 2597 /*
2741 * If we are failing, make sure the ticket doesn't have any 2598 * If we are failing, make sure the ticket doesn't have any
2742 * current reservations. We don't want to add this back when 2599 * current reservations. We don't want to add this back when
@@ -2760,8 +2617,8 @@ STATIC void
2760xlog_regrant_reserve_log_space(xlog_t *log, 2617xlog_regrant_reserve_log_space(xlog_t *log,
2761 xlog_ticket_t *ticket) 2618 xlog_ticket_t *ticket)
2762{ 2619{
2763 xlog_trace_loggrant(log, ticket, 2620 trace_xfs_log_regrant_reserve_enter(log, ticket);
2764 "xlog_regrant_reserve_log_space: enter"); 2621
2765 if (ticket->t_cnt > 0) 2622 if (ticket->t_cnt > 0)
2766 ticket->t_cnt--; 2623 ticket->t_cnt--;
2767 2624
@@ -2769,8 +2626,9 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2769 xlog_grant_sub_space(log, ticket->t_curr_res); 2626 xlog_grant_sub_space(log, ticket->t_curr_res);
2770 ticket->t_curr_res = ticket->t_unit_res; 2627 ticket->t_curr_res = ticket->t_unit_res;
2771 xlog_tic_reset_res(ticket); 2628 xlog_tic_reset_res(ticket);
2772 xlog_trace_loggrant(log, ticket, 2629
2773 "xlog_regrant_reserve_log_space: sub current res"); 2630 trace_xfs_log_regrant_reserve_sub(log, ticket);
2631
2774 xlog_verify_grant_head(log, 1); 2632 xlog_verify_grant_head(log, 1);
2775 2633
2776 /* just return if we still have some of the pre-reserved space */ 2634 /* just return if we still have some of the pre-reserved space */
@@ -2780,8 +2638,9 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2780 } 2638 }
2781 2639
2782 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2640 xlog_grant_add_space_reserve(log, ticket->t_unit_res);
2783 xlog_trace_loggrant(log, ticket, 2641
2784 "xlog_regrant_reserve_log_space: exit"); 2642 trace_xfs_log_regrant_reserve_exit(log, ticket);
2643
2785 xlog_verify_grant_head(log, 0); 2644 xlog_verify_grant_head(log, 0);
2786 spin_unlock(&log->l_grant_lock); 2645 spin_unlock(&log->l_grant_lock);
2787 ticket->t_curr_res = ticket->t_unit_res; 2646 ticket->t_curr_res = ticket->t_unit_res;
@@ -2811,11 +2670,11 @@ xlog_ungrant_log_space(xlog_t *log,
2811 ticket->t_cnt--; 2670 ticket->t_cnt--;
2812 2671
2813 spin_lock(&log->l_grant_lock); 2672 spin_lock(&log->l_grant_lock);
2814 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter"); 2673 trace_xfs_log_ungrant_enter(log, ticket);
2815 2674
2816 xlog_grant_sub_space(log, ticket->t_curr_res); 2675 xlog_grant_sub_space(log, ticket->t_curr_res);
2817 2676
2818 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current"); 2677 trace_xfs_log_ungrant_sub(log, ticket);
2819 2678
2820 /* If this is a permanent reservation ticket, we may be able to free 2679 /* If this is a permanent reservation ticket, we may be able to free
2821 * up more space based on the remaining count. 2680 * up more space based on the remaining count.
@@ -2825,7 +2684,8 @@ xlog_ungrant_log_space(xlog_t *log,
2825 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2684 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt);
2826 } 2685 }
2827 2686
2828 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit"); 2687 trace_xfs_log_ungrant_exit(log, ticket);
2688
2829 xlog_verify_grant_head(log, 1); 2689 xlog_verify_grant_head(log, 1);
2830 spin_unlock(&log->l_grant_lock); 2690 spin_unlock(&log->l_grant_lock);
2831 xfs_log_move_tail(log->l_mp, 1); 2691 xfs_log_move_tail(log->l_mp, 1);
@@ -2927,7 +2787,6 @@ xlog_state_switch_iclogs(xlog_t *log,
2927 log->l_iclog = iclog->ic_next; 2787 log->l_iclog = iclog->ic_next;
2928} /* xlog_state_switch_iclogs */ 2788} /* xlog_state_switch_iclogs */
2929 2789
2930
2931/* 2790/*
2932 * Write out all data in the in-core log as of this exact moment in time. 2791 * Write out all data in the in-core log as of this exact moment in time.
2933 * 2792 *
@@ -2955,11 +2814,17 @@ xlog_state_switch_iclogs(xlog_t *log,
2955 * b) when we return from flushing out this iclog, it is still 2814 * b) when we return from flushing out this iclog, it is still
2956 * not in the active nor dirty state. 2815 * not in the active nor dirty state.
2957 */ 2816 */
2958STATIC int 2817int
2959xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) 2818_xfs_log_force(
2819 struct xfs_mount *mp,
2820 uint flags,
2821 int *log_flushed)
2960{ 2822{
2961 xlog_in_core_t *iclog; 2823 struct log *log = mp->m_log;
2962 xfs_lsn_t lsn; 2824 struct xlog_in_core *iclog;
2825 xfs_lsn_t lsn;
2826
2827 XFS_STATS_INC(xs_log_force);
2963 2828
2964 spin_lock(&log->l_icloglock); 2829 spin_lock(&log->l_icloglock);
2965 2830
@@ -3005,7 +2870,9 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
3005 2870
3006 if (xlog_state_release_iclog(log, iclog)) 2871 if (xlog_state_release_iclog(log, iclog))
3007 return XFS_ERROR(EIO); 2872 return XFS_ERROR(EIO);
3008 *log_flushed = 1; 2873
2874 if (log_flushed)
2875 *log_flushed = 1;
3009 spin_lock(&log->l_icloglock); 2876 spin_lock(&log->l_icloglock);
3010 if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && 2877 if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
3011 iclog->ic_state != XLOG_STATE_DIRTY) 2878 iclog->ic_state != XLOG_STATE_DIRTY)
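The new "if (log_flushed)" guards make the flush-state pointer optional: callers that do not care can pass NULL instead of a dummy variable. The convention in isolation (illustrative names):

static int force(int *log_flushed)
{
	/* ... do the work ... */
	if (log_flushed)
		*log_flushed = 1;	/* report back only when asked */
	return 0;
}

A caller that ignores the flush state simply writes force(NULL), which is exactly what the new wrappers below do with _xfs_log_force() and _xfs_log_force_lsn().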
@@ -3049,19 +2916,37 @@ maybe_sleep:
3049 */ 2916 */
3050 if (iclog->ic_state & XLOG_STATE_IOERROR) 2917 if (iclog->ic_state & XLOG_STATE_IOERROR)
3051 return XFS_ERROR(EIO); 2918 return XFS_ERROR(EIO);
3052 *log_flushed = 1; 2919 if (log_flushed)
3053 2920 *log_flushed = 1;
3054 } else { 2921 } else {
3055 2922
3056no_sleep: 2923no_sleep:
3057 spin_unlock(&log->l_icloglock); 2924 spin_unlock(&log->l_icloglock);
3058 } 2925 }
3059 return 0; 2926 return 0;
3060} /* xlog_state_sync_all */ 2927}
3061 2928
2929/*
2930 * Wrapper for _xfs_log_force(), to be used when caller doesn't care
2931 * about errors or whether the log was flushed or not. This is the normal
2932 * interface to use when trying to unpin items or move the log forward.
2933 */
2934void
2935xfs_log_force(
2936 xfs_mount_t *mp,
2937 uint flags)
2938{
2939 int error;
2940
2941 error = _xfs_log_force(mp, flags, NULL);
2942 if (error) {
2943 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
2944 "error %d returned.", error);
2945 }
2946}
3062 2947
3063/* 2948/*
3064 * Used by code which implements synchronous log forces. 2949 * Force the in-core log to disk for a specific LSN.
3065 * 2950 *
3066 * Find in-core log with lsn. 2951 * Find in-core log with lsn.
3067 * If it is in the DIRTY state, just return. 2952 * If it is in the DIRTY state, just return.
@@ -3069,109 +2954,142 @@ no_sleep:
3069 * state and go to sleep or return. 2954 * state and go to sleep or return.
3070 * If it is in any other state, go to sleep or return. 2955 * If it is in any other state, go to sleep or return.
3071 * 2956 *
3072 * If filesystem activity goes to zero, the iclog will get flushed only by 2957 * Synchronous forces are implemented with a signal variable. All callers
3073 * bdflush(). 2958 * to force a given lsn to disk will wait on the sv attached to the
 2959 * specific in-core log. When the given in-core log finally completes its
2960 * write to disk, that thread will wake up all threads waiting on the
2961 * sv.
3074 */ 2962 */
3075STATIC int 2963int
3076xlog_state_sync(xlog_t *log, 2964_xfs_log_force_lsn(
3077 xfs_lsn_t lsn, 2965 struct xfs_mount *mp,
3078 uint flags, 2966 xfs_lsn_t lsn,
3079 int *log_flushed) 2967 uint flags,
2968 int *log_flushed)
3080{ 2969{
3081 xlog_in_core_t *iclog; 2970 struct log *log = mp->m_log;
3082 int already_slept = 0; 2971 struct xlog_in_core *iclog;
2972 int already_slept = 0;
3083 2973
3084try_again: 2974 ASSERT(lsn != 0);
3085 spin_lock(&log->l_icloglock);
3086 iclog = log->l_iclog;
3087 2975
3088 if (iclog->ic_state & XLOG_STATE_IOERROR) { 2976 XFS_STATS_INC(xs_log_force);
3089 spin_unlock(&log->l_icloglock);
3090 return XFS_ERROR(EIO);
3091 }
3092
3093 do {
3094 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3095 iclog = iclog->ic_next;
3096 continue;
3097 }
3098 2977
3099 if (iclog->ic_state == XLOG_STATE_DIRTY) { 2978try_again:
2979 spin_lock(&log->l_icloglock);
2980 iclog = log->l_iclog;
2981 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3100 spin_unlock(&log->l_icloglock); 2982 spin_unlock(&log->l_icloglock);
3101 return 0; 2983 return XFS_ERROR(EIO);
3102 } 2984 }
3103 2985
3104 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 2986 do {
3105 /* 2987 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3106 * We sleep here if we haven't already slept (e.g. 2988 iclog = iclog->ic_next;
3107 * this is the first time we've looked at the correct 2989 continue;
3108 * iclog buf) and the buffer before us is going to 2990 }
3109 * be sync'ed. The reason for this is that if we 2991
3110 * are doing sync transactions here, by waiting for 2992 if (iclog->ic_state == XLOG_STATE_DIRTY) {
3111 * the previous I/O to complete, we can allow a few 2993 spin_unlock(&log->l_icloglock);
3112 * more transactions into this iclog before we close 2994 return 0;
3113 * it down. 2995 }
3114 * 2996
3115 * Otherwise, we mark the buffer WANT_SYNC, and bump 2997 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3116 * up the refcnt so we can release the log (which drops 2998 /*
3117 * the ref count). The state switch keeps new transaction 2999 * We sleep here if we haven't already slept (e.g.
3118 * commits from using this buffer. When the current commits 3000 * this is the first time we've looked at the correct
3119 * finish writing into the buffer, the refcount will drop to 3001 * iclog buf) and the buffer before us is going to
3120 * zero and the buffer will go out then. 3002 * be sync'ed. The reason for this is that if we
3121 */ 3003 * are doing sync transactions here, by waiting for
3122 if (!already_slept && 3004 * the previous I/O to complete, we can allow a few
3123 (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC | 3005 * more transactions into this iclog before we close
3124 XLOG_STATE_SYNCING))) { 3006 * it down.
3125 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); 3007 *
3126 XFS_STATS_INC(xs_log_force_sleep); 3008 * Otherwise, we mark the buffer WANT_SYNC, and bump
3127 sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, 3009 * up the refcnt so we can release the log (which
3128 &log->l_icloglock, s); 3010 * drops the ref count). The state switch keeps new
3129 *log_flushed = 1; 3011 * transaction commits from using this buffer. When
3130 already_slept = 1; 3012 * the current commits finish writing into the buffer,
3131 goto try_again; 3013 * the refcount will drop to zero and the buffer will
3132 } else { 3014 * go out then.
3015 */
3016 if (!already_slept &&
3017 (iclog->ic_prev->ic_state &
3018 (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
3019 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
3020
3021 XFS_STATS_INC(xs_log_force_sleep);
3022
3023 sv_wait(&iclog->ic_prev->ic_write_wait,
3024 PSWP, &log->l_icloglock, s);
3025 if (log_flushed)
3026 *log_flushed = 1;
3027 already_slept = 1;
3028 goto try_again;
3029 }
3133 atomic_inc(&iclog->ic_refcnt); 3030 atomic_inc(&iclog->ic_refcnt);
3134 xlog_state_switch_iclogs(log, iclog, 0); 3031 xlog_state_switch_iclogs(log, iclog, 0);
3135 spin_unlock(&log->l_icloglock); 3032 spin_unlock(&log->l_icloglock);
3136 if (xlog_state_release_iclog(log, iclog)) 3033 if (xlog_state_release_iclog(log, iclog))
3137 return XFS_ERROR(EIO); 3034 return XFS_ERROR(EIO);
3138 *log_flushed = 1; 3035 if (log_flushed)
3036 *log_flushed = 1;
3139 spin_lock(&log->l_icloglock); 3037 spin_lock(&log->l_icloglock);
3140 } 3038 }
3141 }
3142 3039
3143 if ((flags & XFS_LOG_SYNC) && /* sleep */ 3040 if ((flags & XFS_LOG_SYNC) && /* sleep */
3144 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { 3041 !(iclog->ic_state &
3042 (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
3043 /*
3044 * Don't wait on completion if we know that we've
3045 * gotten a log write error.
3046 */
3047 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3048 spin_unlock(&log->l_icloglock);
3049 return XFS_ERROR(EIO);
3050 }
3051 XFS_STATS_INC(xs_log_force_sleep);
3052 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3053 /*
3054 * No need to grab the log lock here since we're
3055 * only deciding whether or not to return EIO
3056 * and the memory read should be atomic.
3057 */
3058 if (iclog->ic_state & XLOG_STATE_IOERROR)
3059 return XFS_ERROR(EIO);
3145 3060
3146 /* 3061 if (log_flushed)
3147 * Don't wait on completion if we know that we've 3062 *log_flushed = 1;
3148 * gotten a log write error. 3063 } else { /* just return */
3149 */
3150 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3151 spin_unlock(&log->l_icloglock); 3064 spin_unlock(&log->l_icloglock);
3152 return XFS_ERROR(EIO);
3153 } 3065 }
3154 XFS_STATS_INC(xs_log_force_sleep);
3155 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3156 /*
3157 * No need to grab the log lock here since we're
3158 * only deciding whether or not to return EIO
3159 * and the memory read should be atomic.
3160 */
3161 if (iclog->ic_state & XLOG_STATE_IOERROR)
3162 return XFS_ERROR(EIO);
3163 *log_flushed = 1;
3164 } else { /* just return */
3165 spin_unlock(&log->l_icloglock);
3166 }
3167 return 0;
3168 3066
3169 } while (iclog != log->l_iclog); 3067 return 0;
3068 } while (iclog != log->l_iclog);
3069
3070 spin_unlock(&log->l_icloglock);
3071 return 0;
3072}
3170 3073
3171 spin_unlock(&log->l_icloglock); 3074/*
3172 return 0; 3075 * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care
3173} /* xlog_state_sync */ 3076 * about errors or whether the log was flushed or not. This is the normal
3077 * interface to use when trying to unpin items or move the log forward.
3078 */
3079void
3080xfs_log_force_lsn(
3081 xfs_mount_t *mp,
3082 xfs_lsn_t lsn,
3083 uint flags)
3084{
3085 int error;
3174 3086
3087 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3088 if (error) {
 3089 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force_lsn: "
3090 "error %d returned.", error);
3091 }
3092}
3175 3093
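The "signal variable" scheme described in the comment above maps onto the familiar condition-variable pattern. A self-contained pthread analogue of the sv_wait()/wake-everyone behaviour (hypothetical names; the kernel code uses sv_t, not pthreads):

#include <pthread.h>
#include <stdbool.h>

struct iclog_like {
	pthread_mutex_t lock;
	pthread_cond_t  write_done;
	bool            synced;
};

static void wait_for_sync(struct iclog_like *ic)
{
	pthread_mutex_lock(&ic->lock);
	while (!ic->synced)	/* re-check: wakeups may be spurious */
		pthread_cond_wait(&ic->write_done, &ic->lock);
	pthread_mutex_unlock(&ic->lock);
}

static void io_complete(struct iclog_like *ic)
{
	pthread_mutex_lock(&ic->lock);
	ic->synced = true;
	pthread_cond_broadcast(&ic->write_done);	/* wake all waiters */
	pthread_mutex_unlock(&ic->lock);
}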
3176/* 3094/*
3177 * Called when we want to mark the current iclog as being ready to sync to 3095 * Called when we want to mark the current iclog as being ready to sync to
@@ -3536,7 +3454,6 @@ xfs_log_force_umount(
3536 xlog_ticket_t *tic; 3454 xlog_ticket_t *tic;
3537 xlog_t *log; 3455 xlog_t *log;
3538 int retval; 3456 int retval;
3539 int dummy;
3540 3457
3541 log = mp->m_log; 3458 log = mp->m_log;
3542 3459
@@ -3610,13 +3527,14 @@ xfs_log_force_umount(
3610 } 3527 }
3611 spin_unlock(&log->l_grant_lock); 3528 spin_unlock(&log->l_grant_lock);
3612 3529
3613 if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3530 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3614 ASSERT(!logerror); 3531 ASSERT(!logerror);
3615 /* 3532 /*
3616 * Force the incore logs to disk before shutting the 3533 * Force the incore logs to disk before shutting the
3617 * log down completely. 3534 * log down completely.
3618 */ 3535 */
3619 xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); 3536 _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
3537
3620 spin_lock(&log->l_icloglock); 3538 spin_lock(&log->l_icloglock);
3621 retval = xlog_state_ioerror(log); 3539 retval = xlog_state_ioerror(log);
3622 spin_unlock(&log->l_icloglock); 3540 spin_unlock(&log->l_icloglock);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d0c9baa50b1a..97a24c7795a4 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -70,14 +70,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
70 * Flags to xfs_log_force() 70 * Flags to xfs_log_force()
71 * 71 *
72 * XFS_LOG_SYNC: Synchronous force in-core log to disk 72 * XFS_LOG_SYNC: Synchronous force in-core log to disk
73 * XFS_LOG_FORCE: Start in-core log write now.
74 * XFS_LOG_URGE: Start write within some window of time.
75 *
76 * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set.
77 */ 73 */
78#define XFS_LOG_SYNC 0x1 74#define XFS_LOG_SYNC 0x1
79#define XFS_LOG_FORCE 0x2
80#define XFS_LOG_URGE 0x4
81 75
82#endif /* __KERNEL__ */ 76#endif /* __KERNEL__ */
83 77
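With XFS_LOG_FORCE and XFS_LOG_URGE gone, forcing is implied by calling a force function at all, and XFS_LOG_SYNC is the only remaining modifier. A typical call-site conversion, taken from the xfs_log_unmount_write() hunk earlier in this diff:

	/* before: an LSN argument plus an explicit force flag */
	error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL);

	/* after: the call itself means "force" */
	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);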
@@ -110,16 +104,12 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
110#define XLOG_REG_TYPE_TRANSHDR 19 104#define XLOG_REG_TYPE_TRANSHDR 19
111#define XLOG_REG_TYPE_MAX 19 105#define XLOG_REG_TYPE_MAX 19
112 106
113#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t))
114
115typedef struct xfs_log_iovec { 107typedef struct xfs_log_iovec {
116 xfs_caddr_t i_addr; /* beginning address of region */ 108 xfs_caddr_t i_addr; /* beginning address of region */
117 int i_len; /* length in bytes of region */ 109 int i_len; /* length in bytes of region */
118 uint i_type; /* type of region */ 110 uint i_type; /* type of region */
119} xfs_log_iovec_t; 111} xfs_log_iovec_t;
120 112
121typedef void* xfs_log_ticket_t;
122
123/* 113/*
124 * Structure used to pass callback function and the function's argument 114 * Structure used to pass callback function and the function's argument
125 * to the log manager. 115 * to the log manager.
@@ -134,18 +124,25 @@ typedef struct xfs_log_callback {
134#ifdef __KERNEL__ 124#ifdef __KERNEL__
135/* Log manager interfaces */ 125/* Log manager interfaces */
136struct xfs_mount; 126struct xfs_mount;
127struct xlog_in_core;
137struct xlog_ticket; 128struct xlog_ticket;
129
138xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 130xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
139 xfs_log_ticket_t ticket, 131 struct xlog_ticket *ticket,
140 void **iclog, 132 struct xlog_in_core **iclog,
141 uint flags); 133 uint flags);
142int _xfs_log_force(struct xfs_mount *mp, 134int _xfs_log_force(struct xfs_mount *mp,
143 xfs_lsn_t lsn,
144 uint flags, 135 uint flags,
145 int *log_forced); 136 int *log_forced);
146void xfs_log_force(struct xfs_mount *mp, 137void xfs_log_force(struct xfs_mount *mp,
147 xfs_lsn_t lsn,
148 uint flags); 138 uint flags);
139int _xfs_log_force_lsn(struct xfs_mount *mp,
140 xfs_lsn_t lsn,
141 uint flags,
142 int *log_forced);
143void xfs_log_force_lsn(struct xfs_mount *mp,
144 xfs_lsn_t lsn,
145 uint flags);
149int xfs_log_mount(struct xfs_mount *mp, 146int xfs_log_mount(struct xfs_mount *mp,
150 struct xfs_buftarg *log_target, 147 struct xfs_buftarg *log_target,
151 xfs_daddr_t start_block, 148 xfs_daddr_t start_block,
@@ -154,21 +151,21 @@ int xfs_log_mount_finish(struct xfs_mount *mp);
154void xfs_log_move_tail(struct xfs_mount *mp, 151void xfs_log_move_tail(struct xfs_mount *mp,
155 xfs_lsn_t tail_lsn); 152 xfs_lsn_t tail_lsn);
156int xfs_log_notify(struct xfs_mount *mp, 153int xfs_log_notify(struct xfs_mount *mp,
157 void *iclog, 154 struct xlog_in_core *iclog,
158 xfs_log_callback_t *callback_entry); 155 xfs_log_callback_t *callback_entry);
159int xfs_log_release_iclog(struct xfs_mount *mp, 156int xfs_log_release_iclog(struct xfs_mount *mp,
160 void *iclog_hndl); 157 struct xlog_in_core *iclog);
161int xfs_log_reserve(struct xfs_mount *mp, 158int xfs_log_reserve(struct xfs_mount *mp,
162 int length, 159 int length,
163 int count, 160 int count,
164 xfs_log_ticket_t *ticket, 161 struct xlog_ticket **ticket,
165 __uint8_t clientid, 162 __uint8_t clientid,
166 uint flags, 163 uint flags,
167 uint t_type); 164 uint t_type);
168int xfs_log_write(struct xfs_mount *mp, 165int xfs_log_write(struct xfs_mount *mp,
169 xfs_log_iovec_t region[], 166 xfs_log_iovec_t region[],
170 int nentries, 167 int nentries,
171 xfs_log_ticket_t ticket, 168 struct xlog_ticket *ticket,
172 xfs_lsn_t *start_lsn); 169 xfs_lsn_t *start_lsn);
173int xfs_log_unmount_write(struct xfs_mount *mp); 170int xfs_log_unmount_write(struct xfs_mount *mp);
174void xfs_log_unmount(struct xfs_mount *mp); 171void xfs_log_unmount(struct xfs_mount *mp);
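Dropping the opaque xfs_log_ticket_t (a void * typedef) in favour of struct xlog_ticket * costs nothing at runtime but restores type checking: a forward declaration keeps the layout private while letting the compiler reject a mismatched pointer. Sketch of the difference (hypothetical prototypes):

	struct xlog_ticket;				/* opaque: layout stays private */

	void old_api(void *ticket);			/* silently accepts any pointer */
	void new_api(struct xlog_ticket *ticket);	/* wrong type is a compile error */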
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 679c7c4926a2..fd02a18facd5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -19,7 +19,6 @@
19#define __XFS_LOG_PRIV_H__ 19#define __XFS_LOG_PRIV_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct ktrace;
23struct log; 22struct log;
24struct xlog_ticket; 23struct xlog_ticket;
25struct xfs_buf_cancel; 24struct xfs_buf_cancel;
@@ -135,6 +134,12 @@ static inline uint xlog_get_client_id(__be32 i)
135#define XLOG_TIC_INITED 0x1 /* has been initialized */ 134#define XLOG_TIC_INITED 0x1 /* has been initialized */
136#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
137#define XLOG_TIC_IN_Q 0x4 136#define XLOG_TIC_IN_Q 0x4
137
138#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142
138#endif /* __KERNEL__ */ 143#endif /* __KERNEL__ */
139 144
140#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ 145#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
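The XLOG_TIC_FLAGS table added above serves the tracing code: each { value, "name" } pair has the shape of struct trace_print_flags, so a tracepoint can render t_flags symbolically. A fragment of how such a table is typically consumed (illustrative; the actual consumer would sit in fs/xfs/xfs_trace.h):

	TP_printk("tic flags %s",
		  __print_flags(__entry->t_flags, "|", XLOG_TIC_FLAGS))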
@@ -361,9 +366,6 @@ typedef struct xlog_in_core {
361 int ic_bwritecnt; 366 int ic_bwritecnt;
362 unsigned short ic_state; 367 unsigned short ic_state;
363 char *ic_datap; /* pointer to iclog data */ 368 char *ic_datap; /* pointer to iclog data */
364#ifdef XFS_LOG_TRACE
365 struct ktrace *ic_trace;
366#endif
367 369
368 /* Callback structures need their own cacheline */ 370 /* Callback structures need their own cacheline */
369 spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; 371 spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
@@ -429,10 +431,6 @@ typedef struct log {
429 int l_grant_write_cycle; 431 int l_grant_write_cycle;
430 int l_grant_write_bytes; 432 int l_grant_write_bytes;
431 433
432#ifdef XFS_LOG_TRACE
433 struct ktrace *l_grant_trace;
434#endif
435
436 /* The following field are used for debugging; need to hold icloglock */ 434 /* The following field are used for debugging; need to hold icloglock */
437#ifdef DEBUG 435#ifdef DEBUG
438 char *l_iclog_bak[XLOG_MAX_ICLOGS]; 436 char *l_iclog_bak[XLOG_MAX_ICLOGS];
@@ -445,23 +443,12 @@ typedef struct log {
445 443
446/* common routines */ 444/* common routines */
447extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
448extern int xlog_find_tail(xlog_t *log,
449 xfs_daddr_t *head_blk,
450 xfs_daddr_t *tail_blk);
451extern int xlog_recover(xlog_t *log); 446extern int xlog_recover(xlog_t *log);
452extern int xlog_recover_finish(xlog_t *log); 447extern int xlog_recover_finish(xlog_t *log);
453extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 448extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
454extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
455extern void xlog_put_bp(struct xfs_buf *);
456 449
457extern kmem_zone_t *xfs_log_ticket_zone; 450extern kmem_zone_t *xfs_log_ticket_zone;
458 451
459/* iclog tracing */
460#define XLOG_TRACE_GRAB_FLUSH 1
461#define XLOG_TRACE_REL_FLUSH 2
462#define XLOG_TRACE_SLEEP_FLUSH 3
463#define XLOG_TRACE_WAKE_FLUSH 4
464
465/* 452/*
466 * Unmount record type is used as a pseudo transaction type for the ticket. 453 * Unmount record type is used as a pseudo transaction type for the ticket.
467 * Its value must be outside the range of XFS_TRANS_* values. 454 * Its value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index fb17f8226b09..22e6efdc17ea 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -46,11 +46,10 @@
46#include "xfs_quota.h" 46#include "xfs_quota.h"
47#include "xfs_rw.h" 47#include "xfs_rw.h"
48#include "xfs_utils.h" 48#include "xfs_utils.h"
49#include "xfs_trace.h"
49 50
50STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); 51STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
51STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); 52STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
52STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
53 xlog_recover_item_t *item);
54#if defined(DEBUG) 53#if defined(DEBUG)
55STATIC void xlog_recover_check_summary(xlog_t *); 54STATIC void xlog_recover_check_summary(xlog_t *);
56#else 55#else
@@ -67,7 +66,7 @@ STATIC void xlog_recover_check_summary(xlog_t *);
67 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
68#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask)
69 68
70xfs_buf_t * 69STATIC xfs_buf_t *
71xlog_get_bp( 70xlog_get_bp(
72 xlog_t *log, 71 xlog_t *log,
73 int nbblks) 72 int nbblks)
@@ -87,7 +86,7 @@ xlog_get_bp(
87 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
88} 87}
89 88
90void 89STATIC void
91xlog_put_bp( 90xlog_put_bp(
92 xfs_buf_t *bp) 91 xfs_buf_t *bp)
93{ 92{
@@ -225,16 +224,10 @@ xlog_header_check_dump(
225 xfs_mount_t *mp, 224 xfs_mount_t *mp,
226 xlog_rec_header_t *head) 225 xlog_rec_header_t *head)
227{ 226{
228 int b; 227 cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n",
229 228 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
230 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); 229 cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n",
231 for (b = 0; b < 16; b++) 230 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
232 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]);
233 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
234 cmn_err(CE_DEBUG, " log : uuid = ");
235 for (b = 0; b < 16; b++)
236 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]);
237 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
238} 231}
239#else 232#else
240#define xlog_header_check_dump(mp, head) 233#define xlog_header_check_dump(mp, head)
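The rewritten dump leans on the kernel's %pU printf extension, which dereferences its argument as a 16-byte UUID and prints the canonical 8-4-4-4-12 form, replacing the removed 16-iteration byte loops. Minimal usage sketch (values hypothetical):

	u8 uuid[16] = { 0 };

	pr_debug("uuid = %pU\n", uuid);	/* e.g. 00000000-0000-0000-0000-000000000000 */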
@@ -810,7 +803,7 @@ xlog_find_head(
810 * We could speed up search by using current head_blk buffer, but it is not 803 * We could speed up search by using current head_blk buffer, but it is not
811 * available. 804 * available.
812 */ 805 */
813int 806STATIC int
814xlog_find_tail( 807xlog_find_tail(
815 xlog_t *log, 808 xlog_t *log,
816 xfs_daddr_t *head_blk, 809 xfs_daddr_t *head_blk,
@@ -1372,36 +1365,45 @@ xlog_clear_stale_blocks(
1372 1365
1373STATIC xlog_recover_t * 1366STATIC xlog_recover_t *
1374xlog_recover_find_tid( 1367xlog_recover_find_tid(
1375 xlog_recover_t *q, 1368 struct hlist_head *head,
1376 xlog_tid_t tid) 1369 xlog_tid_t tid)
1377{ 1370{
1378 xlog_recover_t *p = q; 1371 xlog_recover_t *trans;
1372 struct hlist_node *n;
1379 1373
1380 while (p != NULL) { 1374 hlist_for_each_entry(trans, n, head, r_list) {
1381 if (p->r_log_tid == tid) 1375 if (trans->r_log_tid == tid)
1382 break; 1376 return trans;
1383 p = p->r_next;
1384 } 1377 }
1385 return p; 1378 return NULL;
1386} 1379}
1387 1380
1388STATIC void 1381STATIC void
1389xlog_recover_put_hashq( 1382xlog_recover_new_tid(
1390 xlog_recover_t **q, 1383 struct hlist_head *head,
1391 xlog_recover_t *trans) 1384 xlog_tid_t tid,
1385 xfs_lsn_t lsn)
1392{ 1386{
1393 trans->r_next = *q; 1387 xlog_recover_t *trans;
1394 *q = trans; 1388
1389 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1390 trans->r_log_tid = tid;
1391 trans->r_lsn = lsn;
1392 INIT_LIST_HEAD(&trans->r_itemq);
1393
1394 INIT_HLIST_NODE(&trans->r_list);
1395 hlist_add_head(&trans->r_list, head);
1395} 1396}
1396 1397
1397STATIC void 1398STATIC void
1398xlog_recover_add_item( 1399xlog_recover_add_item(
1399 xlog_recover_item_t **itemq) 1400 struct list_head *head)
1400{ 1401{
1401 xlog_recover_item_t *item; 1402 xlog_recover_item_t *item;
1402 1403
1403 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 1404 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1404 xlog_recover_insert_item_backq(itemq, item); 1405 INIT_LIST_HEAD(&item->ri_list);
1406 list_add_tail(&item->ri_list, head);
1405} 1407}
1406 1408
1407STATIC int 1409STATIC int
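The recovery transaction lookup now uses an open hash of hlist_heads instead of a hand-rolled singly linked queue. The core pattern, reduced to a sketch (names hypothetical; note the four-argument hlist_for_each_entry() of this kernel era, which carries an explicit node cursor):

#include <linux/list.h>

#define NR_BUCKETS 16
static struct hlist_head buckets[NR_BUCKETS];

struct entry {
	struct hlist_node link;
	unsigned int tid;
};

static struct entry *find(unsigned int tid)
{
	struct entry *e;
	struct hlist_node *n;

	hlist_for_each_entry(e, n, &buckets[tid % NR_BUCKETS], link) {
		if (e->tid == tid)
			return e;
	}
	return NULL;
}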
@@ -1414,8 +1416,7 @@ xlog_recover_add_to_cont_trans(
1414 xfs_caddr_t ptr, old_ptr; 1416 xfs_caddr_t ptr, old_ptr;
1415 int old_len; 1417 int old_len;
1416 1418
1417 item = trans->r_itemq; 1419 if (list_empty(&trans->r_itemq)) {
1418 if (item == NULL) {
1419 /* finish copying rest of trans header */ 1420 /* finish copying rest of trans header */
1420 xlog_recover_add_item(&trans->r_itemq); 1421 xlog_recover_add_item(&trans->r_itemq);
1421 ptr = (xfs_caddr_t) &trans->r_theader + 1422 ptr = (xfs_caddr_t) &trans->r_theader +
@@ -1423,7 +1424,8 @@ xlog_recover_add_to_cont_trans(
1423 memcpy(ptr, dp, len); /* d, s, l */ 1424 memcpy(ptr, dp, len); /* d, s, l */
1424 return 0; 1425 return 0;
1425 } 1426 }
1426 item = item->ri_prev; 1427 /* take the tail entry */
1428 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1427 1429
1428 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 1430 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1429 old_len = item->ri_buf[item->ri_cnt-1].i_len; 1431 old_len = item->ri_buf[item->ri_cnt-1].i_len;
@@ -1460,8 +1462,7 @@ xlog_recover_add_to_trans(
1460 1462
1461 if (!len) 1463 if (!len)
1462 return 0; 1464 return 0;
1463 item = trans->r_itemq; 1465 if (list_empty(&trans->r_itemq)) {
1464 if (item == NULL) {
1465 /* we need to catch log corruptions here */ 1466 /* we need to catch log corruptions here */
1466 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1467 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1467 xlog_warn("XFS: xlog_recover_add_to_trans: " 1468 xlog_warn("XFS: xlog_recover_add_to_trans: "
@@ -1479,12 +1480,15 @@ xlog_recover_add_to_trans(
1479 memcpy(ptr, dp, len); 1480 memcpy(ptr, dp, len);
1480 in_f = (xfs_inode_log_format_t *)ptr; 1481 in_f = (xfs_inode_log_format_t *)ptr;
1481 1482
1482 if (item->ri_prev->ri_total != 0 && 1483 /* take the tail entry */
1483 item->ri_prev->ri_total == item->ri_prev->ri_cnt) { 1484 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1485 if (item->ri_total != 0 &&
1486 item->ri_total == item->ri_cnt) {
1487 /* tail item is in use, get a new one */
1484 xlog_recover_add_item(&trans->r_itemq); 1488 xlog_recover_add_item(&trans->r_itemq);
1489 item = list_entry(trans->r_itemq.prev,
1490 xlog_recover_item_t, ri_list);
1485 } 1491 }
1486 item = trans->r_itemq;
1487 item = item->ri_prev;
1488 1492
1489 if (item->ri_total == 0) { /* first region to be added */ 1493 if (item->ri_total == 0) { /* first region to be added */
1490 if (in_f->ilf_size == 0 || 1494 if (in_f->ilf_size == 0 ||
@@ -1509,96 +1513,29 @@ xlog_recover_add_to_trans(
1509 return 0; 1513 return 0;
1510} 1514}
1511 1515
1512STATIC void 1516/*
1513xlog_recover_new_tid( 1517 * Sort the log items in the transaction. Cancelled buffers need
1514 xlog_recover_t **q, 1518 * to be put first so they are processed before any items that might
1515 xlog_tid_t tid, 1519 * modify the buffers. If they are cancelled, then the modifications
1516 xfs_lsn_t lsn) 1520 * don't need to be replayed.
1517{ 1521 */
1518 xlog_recover_t *trans;
1519
1520 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1521 trans->r_log_tid = tid;
1522 trans->r_lsn = lsn;
1523 xlog_recover_put_hashq(q, trans);
1524}
1525
1526STATIC int
1527xlog_recover_unlink_tid(
1528 xlog_recover_t **q,
1529 xlog_recover_t *trans)
1530{
1531 xlog_recover_t *tp;
1532 int found = 0;
1533
1534 ASSERT(trans != NULL);
1535 if (trans == *q) {
1536 *q = (*q)->r_next;
1537 } else {
1538 tp = *q;
1539 while (tp) {
1540 if (tp->r_next == trans) {
1541 found = 1;
1542 break;
1543 }
1544 tp = tp->r_next;
1545 }
1546 if (!found) {
1547 xlog_warn(
1548 "XFS: xlog_recover_unlink_tid: trans not found");
1549 ASSERT(0);
1550 return XFS_ERROR(EIO);
1551 }
1552 tp->r_next = tp->r_next->r_next;
1553 }
1554 return 0;
1555}
1556
1557STATIC void
1558xlog_recover_insert_item_backq(
1559 xlog_recover_item_t **q,
1560 xlog_recover_item_t *item)
1561{
1562 if (*q == NULL) {
1563 item->ri_prev = item->ri_next = item;
1564 *q = item;
1565 } else {
1566 item->ri_next = *q;
1567 item->ri_prev = (*q)->ri_prev;
1568 (*q)->ri_prev = item;
1569 item->ri_prev->ri_next = item;
1570 }
1571}
1572
1573STATIC void
1574xlog_recover_insert_item_frontq(
1575 xlog_recover_item_t **q,
1576 xlog_recover_item_t *item)
1577{
1578 xlog_recover_insert_item_backq(q, item);
1579 *q = item;
1580}
1581
1582STATIC int 1522STATIC int
1583xlog_recover_reorder_trans( 1523xlog_recover_reorder_trans(
1584 xlog_recover_t *trans) 1524 xlog_recover_t *trans)
1585{ 1525{
1586 xlog_recover_item_t *first_item, *itemq, *itemq_next; 1526 xlog_recover_item_t *item, *n;
1587 xfs_buf_log_format_t *buf_f; 1527 LIST_HEAD(sort_list);
1588 ushort flags = 0;
1589 1528
1590 first_item = itemq = trans->r_itemq; 1529 list_splice_init(&trans->r_itemq, &sort_list);
1591 trans->r_itemq = NULL; 1530 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1592 do { 1531 xfs_buf_log_format_t *buf_f;
1593 itemq_next = itemq->ri_next;
1594 buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1595 1532
1596 switch (ITEM_TYPE(itemq)) { 1533 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
1534
1535 switch (ITEM_TYPE(item)) {
1597 case XFS_LI_BUF: 1536 case XFS_LI_BUF:
1598 flags = buf_f->blf_flags; 1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
1599 if (!(flags & XFS_BLI_CANCEL)) { 1538 list_move(&item->ri_list, &trans->r_itemq);
1600 xlog_recover_insert_item_frontq(&trans->r_itemq,
1601 itemq);
1602 break; 1539 break;
1603 } 1540 }
1604 case XFS_LI_INODE: 1541 case XFS_LI_INODE:
@@ -1606,7 +1543,7 @@ xlog_recover_reorder_trans(
1606 case XFS_LI_QUOTAOFF: 1543 case XFS_LI_QUOTAOFF:
1607 case XFS_LI_EFD: 1544 case XFS_LI_EFD:
1608 case XFS_LI_EFI: 1545 case XFS_LI_EFI:
1609 xlog_recover_insert_item_backq(&trans->r_itemq, itemq); 1546 list_move_tail(&item->ri_list, &trans->r_itemq);
1610 break; 1547 break;
1611 default: 1548 default:
1612 xlog_warn( 1549 xlog_warn(
@@ -1614,8 +1551,8 @@ xlog_recover_reorder_trans(
1614 ASSERT(0); 1551 ASSERT(0);
1615 return XFS_ERROR(EIO); 1552 return XFS_ERROR(EIO);
1616 } 1553 }
1617 itemq = itemq_next; 1554 }
1618 } while (first_item != itemq); 1555 ASSERT(list_empty(&sort_list));
1619 return 0; 1556 return 0;
1620} 1557}
1621 1558
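The rewritten reorder is the standard splice-and-repartition idiom for list_head lists: splice the whole queue onto a local list, then move every node back to either end of the original. Its skeleton, with a hypothetical front_item() predicate standing in for the ITEM_TYPE() switch above:

static void reorder(struct list_head *itemq)
{
	xlog_recover_item_t *item, *n;
	LIST_HEAD(sort_list);

	list_splice_init(itemq, &sort_list);
	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
		if (front_item(item))
			list_move(&item->ri_list, itemq);	/* to the head */
		else
			list_move_tail(&item->ri_list, itemq);	/* to the tail */
	}
}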
@@ -2206,6 +2143,7 @@ xlog_recover_do_buffer_trans(
2206 xfs_daddr_t blkno; 2143 xfs_daddr_t blkno;
2207 int len; 2144 int len;
2208 ushort flags; 2145 ushort flags;
2146 uint buf_flags;
2209 2147
2210 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; 2148 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2211 2149
@@ -2246,12 +2184,11 @@ xlog_recover_do_buffer_trans(
2246 } 2184 }
2247 2185
2248 mp = log->l_mp; 2186 mp = log->l_mp;
2249 if (flags & XFS_BLI_INODE_BUF) { 2187 buf_flags = XBF_LOCK;
2250 bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len, 2188 if (!(flags & XFS_BLI_INODE_BUF))
2251 XFS_BUF_LOCK); 2189 buf_flags |= XBF_MAPPED;
2252 } else { 2190
2253 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0); 2191 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
2254 }
2255 if (XFS_BUF_ISERROR(bp)) { 2192 if (XFS_BUF_ISERROR(bp)) {
2256 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2193 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2257 bp, blkno); 2194 bp, blkno);
@@ -2350,8 +2287,8 @@ xlog_recover_do_inode_trans(
2350 goto error; 2287 goto error;
2351 } 2288 }
2352 2289
2353 bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno, 2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2354 in_f->ilf_len, XFS_BUF_LOCK); 2291 XBF_LOCK);
2355 if (XFS_BUF_ISERROR(bp)) { 2292 if (XFS_BUF_ISERROR(bp)) {
2356 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2293 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2357 bp, in_f->ilf_blkno); 2294 bp, in_f->ilf_blkno);
@@ -2819,14 +2756,13 @@ xlog_recover_do_trans(
2819 int pass) 2756 int pass)
2820{ 2757{
2821 int error = 0; 2758 int error = 0;
2822 xlog_recover_item_t *item, *first_item; 2759 xlog_recover_item_t *item;
2823 2760
2824 error = xlog_recover_reorder_trans(trans); 2761 error = xlog_recover_reorder_trans(trans);
2825 if (error) 2762 if (error)
2826 return error; 2763 return error;
2827 2764
2828 first_item = item = trans->r_itemq; 2765 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2829 do {
2830 switch (ITEM_TYPE(item)) { 2766 switch (ITEM_TYPE(item)) {
2831 case XFS_LI_BUF: 2767 case XFS_LI_BUF:
2832 error = xlog_recover_do_buffer_trans(log, item, pass); 2768 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2859,8 +2795,7 @@ xlog_recover_do_trans(
2859 2795
2860 if (error) 2796 if (error)
2861 return error; 2797 return error;
2862 item = item->ri_next; 2798 }
2863 } while (first_item != item);
2864 2799
2865 return 0; 2800 return 0;
2866} 2801}
@@ -2874,21 +2809,18 @@ STATIC void
2874xlog_recover_free_trans( 2809xlog_recover_free_trans(
2875 xlog_recover_t *trans) 2810 xlog_recover_t *trans)
2876{ 2811{
2877 xlog_recover_item_t *first_item, *item, *free_item; 2812 xlog_recover_item_t *item, *n;
2878 int i; 2813 int i;
2879 2814
2880 item = first_item = trans->r_itemq; 2815 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2881 do { 2816 /* Free the regions in the item. */
2882 free_item = item; 2817 list_del(&item->ri_list);
2883 item = item->ri_next; 2818 for (i = 0; i < item->ri_cnt; i++)
2884 /* Free the regions in the item. */ 2819 kmem_free(item->ri_buf[i].i_addr);
2885 for (i = 0; i < free_item->ri_cnt; i++) {
2886 kmem_free(free_item->ri_buf[i].i_addr);
2887 }
2888 /* Free the item itself */ 2820 /* Free the item itself */
2889 kmem_free(free_item->ri_buf); 2821 kmem_free(item->ri_buf);
2890 kmem_free(free_item); 2822 kmem_free(item);
2891 } while (first_item != item); 2823 }
2892 /* Free the transaction recover structure */ 2824 /* Free the transaction recover structure */
2893 kmem_free(trans); 2825 kmem_free(trans);
2894} 2826}
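The _safe iterator is what makes this free loop legal: it samples the next pointer before the body runs, so kmem_free() of the current node cannot break the walk. The idiom in miniature (item and n are the two cursors, as in the code above):

	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
		list_del(&item->ri_list);
		kmem_free(item);	/* safe: 'n' was read before the free */
	}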
@@ -2896,14 +2828,12 @@ xlog_recover_free_trans(
2896STATIC int 2828STATIC int
2897xlog_recover_commit_trans( 2829xlog_recover_commit_trans(
2898 xlog_t *log, 2830 xlog_t *log,
2899 xlog_recover_t **q,
2900 xlog_recover_t *trans, 2831 xlog_recover_t *trans,
2901 int pass) 2832 int pass)
2902{ 2833{
2903 int error; 2834 int error;
2904 2835
2905 if ((error = xlog_recover_unlink_tid(q, trans))) 2836 hlist_del(&trans->r_list);
2906 return error;
2907 if ((error = xlog_recover_do_trans(log, trans, pass))) 2837 if ((error = xlog_recover_do_trans(log, trans, pass)))
2908 return error; 2838 return error;
2909 xlog_recover_free_trans(trans); /* no error */ 2839 xlog_recover_free_trans(trans); /* no error */
@@ -2931,7 +2861,7 @@ xlog_recover_unmount_trans(
2931STATIC int 2861STATIC int
2932xlog_recover_process_data( 2862xlog_recover_process_data(
2933 xlog_t *log, 2863 xlog_t *log,
2934 xlog_recover_t *rhash[], 2864 struct hlist_head rhash[],
2935 xlog_rec_header_t *rhead, 2865 xlog_rec_header_t *rhead,
2936 xfs_caddr_t dp, 2866 xfs_caddr_t dp,
2937 int pass) 2867 int pass)
@@ -2965,7 +2895,7 @@ xlog_recover_process_data(
2965 } 2895 }
2966 tid = be32_to_cpu(ohead->oh_tid); 2896 tid = be32_to_cpu(ohead->oh_tid);
2967 hash = XLOG_RHASH(tid); 2897 hash = XLOG_RHASH(tid);
2968 trans = xlog_recover_find_tid(rhash[hash], tid); 2898 trans = xlog_recover_find_tid(&rhash[hash], tid);
2969 if (trans == NULL) { /* not found; add new tid */ 2899 if (trans == NULL) { /* not found; add new tid */
2970 if (ohead->oh_flags & XLOG_START_TRANS) 2900 if (ohead->oh_flags & XLOG_START_TRANS)
2971 xlog_recover_new_tid(&rhash[hash], tid, 2901 xlog_recover_new_tid(&rhash[hash], tid,
@@ -2983,7 +2913,7 @@ xlog_recover_process_data(
2983 switch (flags) { 2913 switch (flags) {
2984 case XLOG_COMMIT_TRANS: 2914 case XLOG_COMMIT_TRANS:
2985 error = xlog_recover_commit_trans(log, 2915 error = xlog_recover_commit_trans(log,
2986 &rhash[hash], trans, pass); 2916 trans, pass);
2987 break; 2917 break;
2988 case XLOG_UNMOUNT_TRANS: 2918 case XLOG_UNMOUNT_TRANS:
2989 error = xlog_recover_unmount_trans(trans); 2919 error = xlog_recover_unmount_trans(trans);
@@ -3216,7 +3146,7 @@ xlog_recover_process_one_iunlink(
3216 /* 3146 /*
3217 * Get the on disk inode to find the next inode in the bucket. 3147 * Get the on disk inode to find the next inode in the bucket.
3218 */ 3148 */
3219 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK); 3149 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
3220 if (error) 3150 if (error)
3221 goto fail_iput; 3151 goto fail_iput;
3222 3152
@@ -3517,12 +3447,12 @@ xlog_do_recovery_pass(
3517{ 3447{
3518 xlog_rec_header_t *rhead; 3448 xlog_rec_header_t *rhead;
3519 xfs_daddr_t blk_no; 3449 xfs_daddr_t blk_no;
3520 xfs_caddr_t bufaddr, offset; 3450 xfs_caddr_t offset;
3521 xfs_buf_t *hbp, *dbp; 3451 xfs_buf_t *hbp, *dbp;
3522 int error = 0, h_size; 3452 int error = 0, h_size;
3523 int bblks, split_bblks; 3453 int bblks, split_bblks;
3524 int hblks, split_hblks, wrapped_hblks; 3454 int hblks, split_hblks, wrapped_hblks;
3525 xlog_recover_t *rhash[XLOG_RHASH_SIZE]; 3455 struct hlist_head rhash[XLOG_RHASH_SIZE];
3526 3456
3527 ASSERT(head_blk != tail_blk); 3457 ASSERT(head_blk != tail_blk);
3528 3458
@@ -3610,7 +3540,7 @@ xlog_do_recovery_pass(
3610 /* 3540 /*
3611 * Check for header wrapping around physical end-of-log 3541 * Check for header wrapping around physical end-of-log
3612 */ 3542 */
3613 offset = NULL; 3543 offset = XFS_BUF_PTR(hbp);
3614 split_hblks = 0; 3544 split_hblks = 0;
3615 wrapped_hblks = 0; 3545 wrapped_hblks = 0;
3616 if (blk_no + hblks <= log->l_logBBsize) { 3546 if (blk_no + hblks <= log->l_logBBsize) {
@@ -3646,9 +3576,8 @@ xlog_do_recovery_pass(
3646 * - order is important. 3576 * - order is important.
3647 */ 3577 */
3648 wrapped_hblks = hblks - split_hblks; 3578 wrapped_hblks = hblks - split_hblks;
3649 bufaddr = XFS_BUF_PTR(hbp);
3650 error = XFS_BUF_SET_PTR(hbp, 3579 error = XFS_BUF_SET_PTR(hbp,
3651 bufaddr + BBTOB(split_hblks), 3580 offset + BBTOB(split_hblks),
3652 BBTOB(hblks - split_hblks)); 3581 BBTOB(hblks - split_hblks));
3653 if (error) 3582 if (error)
3654 goto bread_err2; 3583 goto bread_err2;
@@ -3658,14 +3587,10 @@ xlog_do_recovery_pass(
3658 if (error) 3587 if (error)
3659 goto bread_err2; 3588 goto bread_err2;
3660 3589
3661 error = XFS_BUF_SET_PTR(hbp, bufaddr, 3590 error = XFS_BUF_SET_PTR(hbp, offset,
3662 BBTOB(hblks)); 3591 BBTOB(hblks));
3663 if (error) 3592 if (error)
3664 goto bread_err2; 3593 goto bread_err2;
3665
3666 if (!offset)
3667 offset = xlog_align(log, 0,
3668 wrapped_hblks, hbp);
3669 } 3594 }
3670 rhead = (xlog_rec_header_t *)offset; 3595 rhead = (xlog_rec_header_t *)offset;
3671 error = xlog_valid_rec_header(log, rhead, 3596 error = xlog_valid_rec_header(log, rhead,
@@ -3685,7 +3610,7 @@ xlog_do_recovery_pass(
3685 } else { 3610 } else {
3686 /* This log record is split across the 3611 /* This log record is split across the
3687 * physical end of log */ 3612 * physical end of log */
3688 offset = NULL; 3613 offset = XFS_BUF_PTR(dbp);
3689 split_bblks = 0; 3614 split_bblks = 0;
3690 if (blk_no != log->l_logBBsize) { 3615 if (blk_no != log->l_logBBsize) {
3691 /* some data is before the physical 3616 /* some data is before the physical
@@ -3714,9 +3639,8 @@ xlog_do_recovery_pass(
3714 * _first_, then the log start (LR header end) 3639 * _first_, then the log start (LR header end)
3715 * - order is important. 3640 * - order is important.
3716 */ 3641 */
3717 bufaddr = XFS_BUF_PTR(dbp);
3718 error = XFS_BUF_SET_PTR(dbp, 3642 error = XFS_BUF_SET_PTR(dbp,
3719 bufaddr + BBTOB(split_bblks), 3643 offset + BBTOB(split_bblks),
3720 BBTOB(bblks - split_bblks)); 3644 BBTOB(bblks - split_bblks));
3721 if (error) 3645 if (error)
3722 goto bread_err2; 3646 goto bread_err2;
@@ -3727,13 +3651,9 @@ xlog_do_recovery_pass(
3727 if (error) 3651 if (error)
3728 goto bread_err2; 3652 goto bread_err2;
3729 3653
3730 error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size); 3654 error = XFS_BUF_SET_PTR(dbp, offset, h_size);
3731 if (error) 3655 if (error)
3732 goto bread_err2; 3656 goto bread_err2;
3733
3734 if (!offset)
3735 offset = xlog_align(log, wrapped_hblks,
3736 bblks - split_bblks, dbp);
3737 } 3657 }
3738 xlog_unpack_data(rhead, offset, log); 3658 xlog_unpack_data(rhead, offset, log);
3739 if ((error = xlog_recover_process_data(log, rhash, 3659 if ((error = xlog_recover_process_data(log, rhash,
@@ -3993,8 +3913,7 @@ xlog_recover_finish(
3993 * case the unlink transactions would have problems 3913 * case the unlink transactions would have problems
3994 * pushing the EFIs out of the way. 3914 * pushing the EFIs out of the way.
3995 */ 3915 */
3996 xfs_log_force(log->l_mp, (xfs_lsn_t)0, 3916 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3997 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3998 3917
3999 xlog_recover_process_iunlinks(log); 3918 xlog_recover_process_iunlinks(log);
4000 3919
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index b22545555301..75d749207258 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -35,22 +35,21 @@
  * item headers are in ri_buf[0].  Additional buffers follow.
  */
 typedef struct xlog_recover_item {
-	struct xlog_recover_item *ri_next;
-	struct xlog_recover_item *ri_prev;
-	int			 ri_type;
-	int			 ri_cnt;	/* count of regions found */
-	int			 ri_total;	/* total regions */
-	xfs_log_iovec_t		 *ri_buf;	/* ptr to regions buffer */
+	struct list_head	ri_list;
+	int			ri_type;
+	int			ri_cnt;	/* count of regions found */
+	int			ri_total;	/* total regions */
+	xfs_log_iovec_t		*ri_buf;	/* ptr to regions buffer */
 } xlog_recover_item_t;
 
 struct xlog_tid;
 typedef struct xlog_recover {
-	struct xlog_recover *r_next;
+	struct hlist_node	r_list;
 	xlog_tid_t		r_log_tid;	/* log's transaction id */
 	xfs_trans_header_t	r_theader;	/* trans header for partial */
 	int			r_state;	/* not needed */
 	xfs_lsn_t		r_lsn;		/* xact lsn */
-	xlog_recover_item_t *r_itemq;	/* q for items */
+	struct list_head	r_itemq;	/* q for items */
 } xlog_recover_t;
 
 #define ITEM_TYPE(i)	(*(ushort *)(i)->ri_buf[0].i_addr)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 8b6c9e807efb..e79b56b4bca6 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -44,6 +44,8 @@
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
+#include "xfs_trace.h"
+
 
 STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
 
@@ -199,6 +201,38 @@ xfs_uuid_unmount(
 
 
 /*
+ * Reference counting access wrappers to the perag structures.
+ */
+struct xfs_perag *
+xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+	struct xfs_perag	*pag;
+	int			ref = 0;
+
+	spin_lock(&mp->m_perag_lock);
+	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	if (pag) {
+		ASSERT(atomic_read(&pag->pag_ref) >= 0);
+		/* catch leaks in the positive direction during testing */
+		ASSERT(atomic_read(&pag->pag_ref) < 1000);
+		ref = atomic_inc_return(&pag->pag_ref);
+	}
+	spin_unlock(&mp->m_perag_lock);
+	trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+	return pag;
+}
+
+void
+xfs_perag_put(struct xfs_perag *pag)
+{
+	int	ref;
+
+	ASSERT(atomic_read(&pag->pag_ref) > 0);
+	ref = atomic_dec_return(&pag->pag_ref);
+	trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
  * Free up the resources associated with a mount structure.  Assume that
  * the structure was initially zeroed, so we can tell which fields got
  * initialized.
@@ -207,13 +241,16 @@ STATIC void
 xfs_free_perag(
 	xfs_mount_t	*mp)
 {
-	if (mp->m_perag) {
-		int	agno;
+	xfs_agnumber_t	agno;
+	struct xfs_perag *pag;
 
-		for (agno = 0; agno < mp->m_maxagi; agno++)
-			if (mp->m_perag[agno].pagb_list)
-				kmem_free(mp->m_perag[agno].pagb_list);
-		kmem_free(mp->m_perag);
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		spin_lock(&mp->m_perag_lock);
+		pag = radix_tree_delete(&mp->m_perag_tree, agno);
+		ASSERT(pag);
+		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		spin_unlock(&mp->m_perag_lock);
+		kmem_free(pag);
 	}
 }
 
@@ -387,22 +424,57 @@ xfs_initialize_perag_icache(
 	}
 }
 
-xfs_agnumber_t
+int
 xfs_initialize_perag(
 	xfs_mount_t	*mp,
-	xfs_agnumber_t	agcount)
+	xfs_agnumber_t	agcount,
+	xfs_agnumber_t	*maxagi)
 {
 	xfs_agnumber_t	index, max_metadata;
+	xfs_agnumber_t	first_initialised = 0;
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
 	xfs_ino_t	ino;
 	xfs_sb_t	*sbp = &mp->m_sb;
 	xfs_ino_t	max_inum = XFS_MAXINUMBER_32;
+	int		error = -ENOMEM;
 
 	/* Check to see if the filesystem can overflow 32 bit inodes */
 	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
 	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
 
+	/*
+	 * Walk the current per-ag tree so we don't try to initialise AGs
+	 * that already exist (growfs case). Allocate and insert all the
+	 * AGs we don't find ready for initialisation.
+	 */
+	for (index = 0; index < agcount; index++) {
+		pag = xfs_perag_get(mp, index);
+		if (pag) {
+			xfs_perag_put(pag);
+			continue;
+		}
+		if (!first_initialised)
+			first_initialised = index;
+		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+		if (!pag)
+			goto out_unwind;
+		if (radix_tree_preload(GFP_NOFS))
+			goto out_unwind;
+		spin_lock(&mp->m_perag_lock);
+		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+			BUG();
+			spin_unlock(&mp->m_perag_lock);
+			radix_tree_preload_end();
+			error = -EEXIST;
+			goto out_unwind;
+		}
+		pag->pag_agno = index;
+		pag->pag_mount = mp;
+		spin_unlock(&mp->m_perag_lock);
+		radix_tree_preload_end();
+	}
+
 	/* Clear the mount flag if no inode can overflow 32 bits
 	 * on this filesystem, or if specifically requested..
 	 */
@@ -436,21 +508,33 @@ xfs_initialize_perag(
 			}
 
 			/* This ag is preferred for inodes */
-			pag = &mp->m_perag[index];
+			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
 			if (index < max_metadata)
 				pag->pagf_metadata = 1;
 			xfs_initialize_perag_icache(pag);
+			xfs_perag_put(pag);
 		}
 	} else {
 		/* Setup default behavior for smaller filesystems */
 		for (index = 0; index < agcount; index++) {
-			pag = &mp->m_perag[index];
+			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
 			xfs_initialize_perag_icache(pag);
+			xfs_perag_put(pag);
 		}
 	}
-	return index;
+	if (maxagi)
+		*maxagi = index;
+	return 0;
+
+out_unwind:
+	kmem_free(pag);
+	for (; index > first_initialised; index--) {
+		pag = radix_tree_delete(&mp->m_perag_tree, index);
+		kmem_free(pag);
+	}
+	return error;
 }
 
 void
@@ -581,10 +665,10 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	 * access to the superblock.
 	 */
 	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-	extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
+	extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
 
-	bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
-				BTOBB(sector_size), extra_flags);
+	bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
+			  extra_flags);
 	if (!bp || XFS_BUF_ISERROR(bp)) {
 		xfs_fs_mount_cmn_err(flags, "SB read failed");
 		error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
@@ -624,8 +708,8 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 		XFS_BUF_UNMANAGE(bp);
 		xfs_buf_relse(bp);
 		sector_size = mp->m_sb.sb_sectsize;
-		bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
-					BTOBB(sector_size), extra_flags);
+		bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR,
+				  BTOBB(sector_size), extra_flags);
 		if (!bp || XFS_BUF_ISERROR(bp)) {
 			xfs_fs_mount_cmn_err(flags, "SB re-read failed");
 			error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
@@ -729,12 +813,13 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
 		error = xfs_ialloc_pagi_init(mp, NULL, index);
 		if (error)
 			return error;
-		pag = &mp->m_perag[index];
+		pag = xfs_perag_get(mp, index);
 		ifree += pag->pagi_freecount;
 		ialloc += pag->pagi_count;
 		bfree += pag->pagf_freeblks;
 		bfreelst += pag->pagf_flcount;
 		btree += pag->pagf_btreeblks;
+		xfs_perag_put(pag);
 	}
 	/*
 	 * Overwrite incore superblock counters with just-read data
@@ -1006,6 +1091,24 @@ xfs_mount_reset_sbqflags(
 	return xfs_trans_commit(tp, 0);
 }
 
+__uint64_t
+xfs_default_resblks(xfs_mount_t *mp)
+{
+	__uint64_t resblks;
+
+	/*
+	 * We default to 5% or 8192 fsbs of space reserved, whichever is
+	 * smaller.  This is intended to cover concurrent allocation
+	 * transactions when we initially hit enospc. These each require a 4
+	 * block reservation. Hence by default we cover roughly 2000 concurrent
+	 * allocation reservations.
+	 */
+	resblks = mp->m_sb.sb_dblocks;
+	do_div(resblks, 20);
+	resblks = min_t(__uint64_t, resblks, 8192);
+	return resblks;
+}
+
 /*
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
@@ -1150,13 +1253,13 @@ xfs_mountfs(
 	/*
 	 * Allocate and initialize the per-ag data.
 	 */
-	init_rwsem(&mp->m_peraglock);
-	mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
-				  KM_MAYFAIL);
-	if (!mp->m_perag)
+	spin_lock_init(&mp->m_perag_lock);
+	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS);
+	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+	if (error) {
+		cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
 		goto out_remove_uuid;
-
-	mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
+	}
 
 	if (!sbp->sb_logblocks) {
 		cmn_err(CE_WARN, "XFS: no log defined");
@@ -1317,17 +1420,16 @@ xfs_mountfs(
 	 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
 	 * are not allowed to use this reserved space.
 	 *
-	 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
 	 * This may drive us straight to ENOSPC on mount, but that implies
 	 * we were already there on the last unmount. Warn if this occurs.
 	 */
-	resblks = mp->m_sb.sb_dblocks;
-	do_div(resblks, 20);
-	resblks = min_t(__uint64_t, resblks, 1024);
-	error = xfs_reserve_blocks(mp, &resblks, NULL);
-	if (error)
-		cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. "
-				"Continuing without a reserve pool.");
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		resblks = xfs_default_resblks(mp);
+		error = xfs_reserve_blocks(mp, &resblks, NULL);
+		if (error)
+			cmn_err(CE_WARN, "XFS: Unable to allocate reserve "
+				"blocks. Continuing without a reserve pool.");
+	}
 
 	return 0;
 
@@ -1370,8 +1472,19 @@ xfs_unmountfs(
 	 * push out the iclog we will never get that unlocked. hence we
 	 * need to force the log first.
 	 */
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * Do a delwri reclaim pass first so that as many dirty inodes are
+	 * queued up for IO as possible. Then flush the buffers before making
+	 * a synchronous path to catch all the remaining inodes are reclaimed.
+	 * This makes the reclaim process as quick as possible by avoiding
+	 * synchronous writeout and blocking on inodes already in the delwri
+	 * state as much as possible.
+	 */
+	xfs_reclaim_inodes(mp, 0);
+	XFS_bflush(mp->m_ddev_targp);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
 	xfs_qm_unmount(mp);
 
@@ -1380,7 +1493,7 @@
 	 * that nothing is pinned.  This is important because bflush()
 	 * will skip pinned buffers.
 	 */
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 
 	xfs_binval(mp->m_ddev_targp);
 	if (mp->m_rtdev_targp) {
@@ -1471,7 +1584,7 @@ xfs_log_sbcount(
 	if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
 		return 0;
 
-	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
+	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
 	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
 					XFS_DEFAULT_LOG_COUNT);
 	if (error) {
@@ -1546,15 +1659,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
 	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
 
 	/* find modified range */
+	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+	last = xfs_sb_info[f + 1].offset - 1;
 
 	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
 	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
 	first = xfs_sb_info[f].offset;
 
-	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
-	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-	last = xfs_sb_info[f + 1].offset - 1;
-
 	xfs_trans_log_buf(tp, bp, first, last);
 }
 
@@ -1618,26 +1730,30 @@ xfs_mod_incore_sb_unlocked(
 				lcounter += rem;
 			}
 		} else {				/* Taking blocks away */
-
 			lcounter += delta;
+			if (lcounter >= 0) {
+				mp->m_sb.sb_fdblocks = lcounter +
+							XFS_ALLOC_SET_ASIDE(mp);
+				return 0;
+			}
 
 			/*
-			 * If were out of blocks, use any available reserved blocks if
-			 * were allowed to.
+			 * We are out of blocks, use any available reserved
+			 * blocks if were allowed to.
 			 */
+			if (!rsvd)
+				return XFS_ERROR(ENOSPC);
 
-			if (lcounter < 0) {
-				if (rsvd) {
-					lcounter = (long long)mp->m_resblks_avail + delta;
-					if (lcounter < 0) {
-						return XFS_ERROR(ENOSPC);
-					}
-					mp->m_resblks_avail = lcounter;
-					return 0;
-				} else {	/* not reserved */
-					return XFS_ERROR(ENOSPC);
-				}
+			lcounter = (long long)mp->m_resblks_avail + delta;
+			if (lcounter >= 0) {
+				mp->m_resblks_avail = lcounter;
+				return 0;
 			}
+			printk_once(KERN_WARNING
+				"Filesystem \"%s\": reserve blocks depleted! "
+				"Consider increasing reserve pool size.",
+				mp->m_fsname);
+			return XFS_ERROR(ENOSPC);
 		}
 
 		mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -1885,7 +2001,7 @@ xfs_getsb(
 
 	ASSERT(mp->m_sb_bp != NULL);
 	bp = mp->m_sb_bp;
-	if (flags & XFS_BUF_TRYLOCK) {
+	if (flags & XBF_TRYLOCK) {
 		if (!XFS_BUF_CPSEMA(bp)) {
 			return NULL;
 		}
@@ -1945,6 +2061,26 @@ xfs_mount_log_sb(
 	return error;
 }
 
+/*
+ * If the underlying (data/log/rt) device is readonly, there are some
+ * operations that cannot proceed.
+ */
+int
+xfs_dev_is_read_only(
+	struct xfs_mount	*mp,
+	char			*message)
+{
+	if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
+	    xfs_readonly_buftarg(mp->m_logdev_targp) ||
+	    (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
+		cmn_err(CE_NOTE,
+			"XFS: %s required on read-only device.", message);
+		cmn_err(CE_NOTE,
+			"XFS: write access unavailable, cannot proceed.");
+		return EROFS;
+	}
+	return 0;
+}
 
 #ifdef HAVE_PERCPU_SB
 /*
@@ -2123,7 +2259,7 @@ xfs_icsb_destroy_counters(
 	mutex_destroy(&mp->m_icsb_mutex);
 }
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_lock_cntr(
 	xfs_icsb_cnts_t	*icsbp)
 {
@@ -2132,7 +2268,7 @@ xfs_icsb_lock_cntr(
 	}
 }
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_unlock_cntr(
 	xfs_icsb_cnts_t	*icsbp)
 {
@@ -2140,7 +2276,7 @@ xfs_icsb_unlock_cntr(
 }
 
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_lock_all_counters(
 	xfs_mount_t	*mp)
 {
@@ -2153,7 +2289,7 @@ xfs_icsb_lock_all_counters(
 	}
 }
 
-STATIC_INLINE void
+STATIC void
 xfs_icsb_unlock_all_counters(
 	xfs_mount_t	*mp)
 {
@@ -2389,12 +2525,12 @@ xfs_icsb_modify_counters(
 {
 	xfs_icsb_cnts_t	*icsbp;
 	long long	lcounter;	/* long counter for 64 bit fields */
-	int		cpu, ret = 0;
+	int		ret = 0;
 
 	might_sleep();
 again:
-	cpu = get_cpu();
-	icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
+	preempt_disable();
+	icsbp = this_cpu_ptr(mp->m_sb_cnts);
 
 	/*
 	 * if the counter is disabled, go to slow path
@@ -2438,11 +2574,11 @@ again:
 		break;
 	}
 	xfs_icsb_unlock_cntr(icsbp);
-	put_cpu();
+	preempt_enable();
 	return 0;
 
 slow_path:
-	put_cpu();
+	preempt_enable();
 
 	/*
 	 * serialise with a mutex so we don't burn lots of cpu on
@@ -2490,7 +2626,7 @@ slow_path:
 
 balance_counter:
 	xfs_icsb_unlock_cntr(icsbp);
-	put_cpu();
+	preempt_enable();
 
 	/*
 	 * We may have multiple threads here if multiple per-cpu
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a6c023bc0fb2..4fa0bc7b983e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -78,7 +78,8 @@ typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
 			struct xfs_inode *, dm_right_t,
 			struct xfs_inode *, dm_right_t,
-			const char *, const char *, mode_t, int, int);
+			const unsigned char *, const unsigned char *,
+			mode_t, int, int);
 typedef int	(*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
 			char *, char *);
 typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
@@ -93,6 +94,9 @@ typedef struct xfs_dmops {
 	xfs_send_unmount_t	xfs_send_unmount;
 } xfs_dmops_t;
 
+#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \
+	(((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED)
+
 #define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
 	(*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
 #define XFS_SEND_MMAP(mp, vma,fl) \
@@ -101,12 +105,24 @@
 	(*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
 #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
 	(*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
-#define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
-	(*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl)
 #define XFS_SEND_MOUNT(mp,right,path,name) \
 	(*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
-#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \
-	(*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
+#define XFS_SEND_PREUNMOUNT(mp) \
+do { \
+	if (mp->m_flags & XFS_MOUNT_DMAPI) { \
+		(*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \
+			(mp)->m_rootip, DM_RIGHT_NULL, \
+			(mp)->m_rootip, DM_RIGHT_NULL, \
+			NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
+	} \
+} while (0)
+#define XFS_SEND_UNMOUNT(mp) \
+do { \
+	if (mp->m_flags & XFS_MOUNT_DMAPI) { \
+		(*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \
+			DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
+	} \
+} while (0)
 
 
 #ifdef HAVE_PERCPU_SB
@@ -192,8 +208,8 @@ typedef struct xfs_mount {
 	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
 	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
 	uint			m_in_maxlevels;	/* max inobt btree levels. */
-	struct xfs_perag	*m_perag;	/* per-ag accounting info */
-	struct rw_semaphore	m_peraglock;	/* lock for m_perag (pointer) */
+	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
+	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
 	struct mutex		m_growlock;	/* growfs mutex */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
@@ -209,6 +225,7 @@ typedef struct xfs_mount {
 	__uint64_t		m_maxioffset;	/* maximum inode offset */
 	__uint64_t		m_resblks;	/* total reserved blocks */
 	__uint64_t		m_resblks_avail;/* available reserved blocks */
+	__uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
 	int			m_dalign;	/* stripe unit */
 	int			m_swidth;	/* stripe width */
 	int			m_sinoalign;	/* stripe unit inode alignment */
@@ -228,7 +245,7 @@ typedef struct xfs_mount {
 	struct xfs_qmops	*m_qm_ops;	/* vector of XQM ops */
 	atomic_t		m_active_trans;	/* number trans frozen */
 #ifdef HAVE_PERCPU_SB
-	xfs_icsb_cnts_t		*m_sb_cnts;	/* per-cpu superblock counters */
+	xfs_icsb_cnts_t	__percpu *m_sb_cnts;	/* per-cpu superblock counters */
 	unsigned long		m_icsb_counters; /* disabled per-cpu counters */
 	struct notifier_block	m_icsb_notifier; /* hotplug cpu notifier */
 	struct mutex		m_icsb_mutex;	/* balancer sync lock */
@@ -369,31 +386,22 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 }
 
 /*
- * perag get/put wrappers for eventual ref counting
+ * perag get/put wrappers for ref counting
  */
-static inline xfs_perag_t *
-xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino)
-{
-	return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)];
-}
-
-static inline void
-xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
-{
-	/* nothing to see here, move along */
-}
+struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+void	xfs_perag_put(struct xfs_perag *pag);
 
 /*
  * Per-cpu superblock locking functions
  */
 #ifdef HAVE_PERCPU_SB
-STATIC_INLINE void
+static inline void
 xfs_icsb_lock(xfs_mount_t *mp)
 {
 	mutex_lock(&mp->m_icsb_mutex);
 }
 
-STATIC_INLINE void
+static inline void
 xfs_icsb_unlock(xfs_mount_t *mp)
 {
 	mutex_unlock(&mp->m_icsb_mutex);
@@ -413,6 +421,7 @@ typedef struct xfs_mod_sb {
 } xfs_mod_sb_t;
 
 extern int	xfs_log_sbcount(xfs_mount_t *, uint);
+extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
 extern int	xfs_mountfs(xfs_mount_t *mp);
 
 extern void	xfs_unmountfs(xfs_mount_t *);
@@ -427,6 +436,8 @@ extern void xfs_freesb(xfs_mount_t *);
 extern int	xfs_fs_writable(xfs_mount_t *);
 extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
+extern int	xfs_dev_is_read_only(struct xfs_mount *, char *);
+
 extern int	xfs_dmops_get(struct xfs_mount *);
 extern void	xfs_dmops_put(struct xfs_mount *);
 
@@ -435,7 +446,8 @@ extern struct xfs_dmops xfs_dmcore_xfs;
 #endif	/* __KERNEL__ */
 
 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
-extern xfs_agnumber_t	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
+extern int	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
+					xfs_agnumber_t *);
 extern void	xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
 extern void	xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
 
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 4b0613d99faa..45ce15dc5b2b 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -398,7 +398,7 @@ exit:
  * guaranteed that all the free functions for all the elements have finished
  * executing and the reaper is not running.
  */
-void
+static void
 xfs_mru_cache_flush(
 	xfs_mru_cache_t		*mru)
 {
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 5d439f34b0c9..36dd3ec8b4eb 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -42,7 +42,6 @@ void xfs_mru_cache_uninit(void);
 int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
 			     unsigned int grp_count,
 			     xfs_mru_cache_free_func_t free_func);
-void xfs_mru_cache_flush(xfs_mru_cache_t *mru);
 void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
 int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 				void *value);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 3ec91ac74c2a..fdcab3f81dde 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -92,6 +92,14 @@ typedef struct xfs_dqblk {
 
 #define XFS_DQ_ALLTYPES		(XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
 
+#define XFS_DQ_FLAGS \
+	{ XFS_DQ_USER,		"USER" }, \
+	{ XFS_DQ_PROJ,		"PROJ" }, \
+	{ XFS_DQ_GROUP,		"GROUP" }, \
+	{ XFS_DQ_DIRTY,		"DIRTY" }, \
+	{ XFS_DQ_WANT,		"WANT" }, \
+	{ XFS_DQ_INACTIVE,	"INACTIVE" }
+
 /*
  * In the worst case, when both user and group quotas are on,
  * we can have a max of three dquots changing in a single transaction.
@@ -215,16 +223,9 @@ typedef struct xfs_qoff_logformat {
 #define XFS_QMOPT_RES_INOS	0x0800000
 
 /*
- * flags for dqflush and dqflush_all.
- */
-#define XFS_QMOPT_SYNC		0x1000000
-#define XFS_QMOPT_ASYNC		0x2000000
-#define XFS_QMOPT_DELWRI	0x4000000
-
-/*
  * flags for dqalloc.
  */
-#define XFS_QMOPT_INHERIT	0x8000000
+#define XFS_QMOPT_INHERIT	0x1000000
 
 /*
  * flags to xfs_trans_mod_dquot.
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index b81deea0ce19..fc1cda23b817 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -39,6 +39,7 @@
 #include "xfs_utils.h"
 #include "xfs_trans_space.h"
 #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
 
 
 /*
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 385f6dceba5d..6be05f756d59 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -45,6 +45,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_trans_space.h"
 #include "xfs_utils.h"
+#include "xfs_trace.h"
 
 
 /*
@@ -1516,6 +1517,8 @@ xfs_rtfree_range(
 	 */
 	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
 		&postblock);
+	if (error)
+		return error;
 	/*
 	 * If there are blocks not being freed at the front of the
 	 * old extent, add summary data for them to be allocated.
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 3f816ad7ff19..e336742a58a4 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -44,48 +44,7 @@
 #include "xfs_error.h"
 #include "xfs_buf_item.h"
 #include "xfs_rw.h"
-
-/*
- * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
- * which clears the setuid and setgid bits when a file is written.
- */
-int
-xfs_write_clear_setuid(
-	xfs_inode_t	*ip)
-{
-	xfs_mount_t	*mp;
-	xfs_trans_t	*tp;
-	int		error;
-
-	mp = ip->i_mount;
-	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
-	if ((error = xfs_trans_reserve(tp, 0,
-				      XFS_WRITEID_LOG_RES(mp),
-				      0, 0, 0))) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
-	ip->i_d.di_mode &= ~S_ISUID;
-
-	/*
-	 * Note that we don't have to worry about mandatory
-	 * file locking being disabled here because we only
-	 * clear the S_ISGID bit if the Group execute bit is
-	 * on, but if it was on then mandatory locking wouldn't
-	 * have been enabled.
-	 */
-	if (ip->i_d.di_mode & S_IXGRP) {
-		ip->i_d.di_mode &= ~S_ISGID;
-	}
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return 0;
-}
+#include "xfs_trace.h"
 
 /*
  * Force a shutdown of the filesystem instantly while keeping
@@ -152,90 +111,6 @@ xfs_do_force_shutdown(
 	}
 }
 
-
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call biodone
- * so that the proper iodone callbacks get called.
- */
-int
-xfs_bioerror(
-	xfs_buf_t *bp)
-{
-
-#ifdef XFSERRORDEBUG
-	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 */
-	xfs_buftrace("XFS IOERROR", bp);
-	XFS_BUF_ERROR(bp, EIO);
-	/*
-	 * We're calling biodone, so delete B_DONE flag. Either way
-	 * we have to call the iodone callback, and calling biodone
-	 * probably is the best way since it takes care of
-	 * GRIO as well.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_UNDONE(bp);
-	XFS_BUF_STALE(bp);
-
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	xfs_biodone(bp);
-
-	return (EIO);
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the biodone call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
-	xfs_buf_t *bp)
-{
-	int64_t fl;
-
-	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
-	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
-
-	xfs_buftrace("XFS IOERRELSE", bp);
-	fl = XFS_BUF_BFLAGS(bp);
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 *
-	 * chunkhold expects B_DONE to be set, whether
-	 * we actually finish the I/O or not. We don't want to
-	 * change that interface.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_DONE(bp);
-	XFS_BUF_STALE(bp);
-	XFS_BUF_CLR_IODONE_FUNC(bp);
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	if (!(fl & XFS_B_ASYNC)) {
-		/*
-		 * Mark b_error and B_ERROR _both_.
-		 * Lot's of chunkcache code assumes that.
-		 * There's no reason to mark error for
-		 * ASYNC buffers.
-		 */
-		XFS_BUF_ERROR(bp, EIO);
-		XFS_BUF_FINISH_IOWAIT(bp);
-	} else {
-		xfs_buf_relse(bp);
-	}
-	return (EIO);
-}
-
 /*
  * Prints out an ALERT message about I/O error.
  */
@@ -277,10 +152,10 @@ xfs_read_buf(
 	xfs_buf_t	 *bp;
 	int		 error;
 
-	if (flags)
-		bp = xfs_buf_read_flags(target, blkno, len, flags);
-	else
-		bp = xfs_buf_read(target, blkno, len, flags);
+	if (!flags)
+		flags = XBF_LOCK | XBF_MAPPED;
+
+	bp = xfs_buf_read(target, blkno, len, flags);
 	if (!bp)
 		return XFS_ERROR(EIO);
 	error = XFS_BUF_GETERROR(bp);
@@ -307,32 +182,23 @@ xfs_read_buf(
 }
 
 /*
- * Wrapper around bwrite() so that we can trap
- * write errors, and act accordingly.
+ * helper function to extract extent size hint from inode
  */
-int
-xfs_bwrite(
-	struct xfs_mount *mp,
-	struct xfs_buf	 *bp)
+xfs_extlen_t
+xfs_get_extsz_hint(
+	struct xfs_inode	*ip)
 {
-	int		error;
+	xfs_extlen_t		extsz;
 
-	/*
-	 * XXXsup how does this work for quotas.
-	 */
-	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
-	bp->b_mount = mp;
-	XFS_BUF_WRITE(bp);
-
-	if ((error = XFS_bwrite(bp))) {
-		ASSERT(mp);
-		/*
-		 * Cannot put a buftrace here since if the buffer is not
-		 * B_HOLD then we will brelse() the buffer before returning
-		 * from bwrite and we could be tracing a buffer that has
-		 * been reused.
-		 */
-		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+	if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
+		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
+				? ip->i_d.di_extsize
+				: ip->i_mount->m_sb.sb_rextsize;
+		ASSERT(extsz);
+	} else {
+		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
+				? ip->i_d.di_extsize : 0;
 	}
-	return (error);
+
+	return extsz;
 }
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f5e4874c37d8..11c41ec6ed75 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -37,44 +37,13 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 }
 
 /*
- * Flags for xfs_free_eofblocks
- */
-#define XFS_FREE_EOF_LOCK	(1<<0)
-#define XFS_FREE_EOF_NOLOCK	(1<<1)
-
-
-/*
- * helper function to extract extent size hint from inode
- */
-STATIC_INLINE xfs_extlen_t
-xfs_get_extsz_hint(
-	xfs_inode_t	*ip)
-{
-	xfs_extlen_t	extsz;
-
-	if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
-		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
-				? ip->i_d.di_extsize
-				: ip->i_mount->m_sb.sb_rextsize;
-		ASSERT(extsz);
-	} else {
-		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
-				? ip->i_d.di_extsize : 0;
-	}
-	return extsz;
-}
-
-/*
  * Prototypes for functions in xfs_rw.c.
  */
-extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
-extern int xfs_bioerror(struct xfs_buf *bp);
-extern int xfs_bioerror_relse(struct xfs_buf *bp);
 extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 			xfs_daddr_t blkno, int len, uint flags,
 			struct xfs_buf **bpp);
 extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 			xfs_buf_t *bp, xfs_daddr_t blkno);
+extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
 
 #endif	/* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 66b849358e62..f73e358bae8d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -236,19 +236,20 @@ xfs_trans_alloc(
 	uint		type)
 {
 	xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
-	return _xfs_trans_alloc(mp, type);
+	return _xfs_trans_alloc(mp, type, KM_SLEEP);
 }
 
 xfs_trans_t *
 _xfs_trans_alloc(
 	xfs_mount_t	*mp,
-	uint		type)
+	uint		type,
+	uint		memflags)
 {
 	xfs_trans_t	*tp;
 
 	atomic_inc(&mp->m_active_trans);
 
-	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+	tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
 	tp->t_magic = XFS_TRANS_MAGIC;
 	tp->t_type = type;
 	tp->t_mountp = mp;
@@ -795,7 +796,7 @@ _xfs_trans_commit(
 	int			sync;
 #define	XFS_TRANS_LOGVEC_COUNT	16
 	xfs_log_iovec_t		log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
-	void			*commit_iclog;
+	struct xlog_in_core	*commit_iclog;
 	int			shutdown;
 
 	commit_lsn = -1;
@@ -980,9 +981,8 @@ shut_us_down:
 	 */
 	if (sync) {
 		if (!error) {
-			error = _xfs_log_force(mp, commit_lsn,
-				      XFS_LOG_FORCE | XFS_LOG_SYNC,
-				      log_flushed);
+			error = _xfs_log_force_lsn(mp, commit_lsn,
+				      XFS_LOG_SYNC, log_flushed);
 		}
 		XFS_STATS_INC(xs_trans_sync);
 	} else {
@@ -1120,7 +1120,7 @@ xfs_trans_fill_vecs(
 	tp->t_header.th_num_items = nitems;
 	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
 	log_vector->i_len = sizeof(xfs_trans_header_t);
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR);
+	log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
 }
 
 
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ed47fc77759c..79c8bab9dfff 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -100,6 +100,49 @@ typedef struct xfs_trans_header {
 #define	XFS_TRANS_TYPE_MAX		41
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
+#define XFS_TRANS_TYPES \
+	{ XFS_TRANS_SETATTR_NOT_SIZE,	"SETATTR_NOT_SIZE" }, \
+	{ XFS_TRANS_SETATTR_SIZE,	"SETATTR_SIZE" }, \
+	{ XFS_TRANS_INACTIVE,		"INACTIVE" }, \
+	{ XFS_TRANS_CREATE,		"CREATE" }, \
+	{ XFS_TRANS_CREATE_TRUNC,	"CREATE_TRUNC" }, \
+	{ XFS_TRANS_TRUNCATE_FILE,	"TRUNCATE_FILE" }, \
+	{ XFS_TRANS_REMOVE,		"REMOVE" }, \
+	{ XFS_TRANS_LINK,		"LINK" }, \
+	{ XFS_TRANS_RENAME,		"RENAME" }, \
+	{ XFS_TRANS_MKDIR,		"MKDIR" }, \
+	{ XFS_TRANS_RMDIR,		"RMDIR" }, \
+	{ XFS_TRANS_SYMLINK,		"SYMLINK" }, \
+	{ XFS_TRANS_SET_DMATTRS,	"SET_DMATTRS" }, \
+	{ XFS_TRANS_GROWFS,		"GROWFS" }, \
+	{ XFS_TRANS_STRAT_WRITE,	"STRAT_WRITE" }, \
+	{ XFS_TRANS_DIOSTRAT,		"DIOSTRAT" }, \
+	{ XFS_TRANS_WRITEID,		"WRITEID" }, \
+	{ XFS_TRANS_ADDAFORK,		"ADDAFORK" }, \
+	{ XFS_TRANS_ATTRINVAL,		"ATTRINVAL" }, \
+	{ XFS_TRANS_ATRUNCATE,		"ATRUNCATE" }, \
+	{ XFS_TRANS_ATTR_SET,		"ATTR_SET" }, \
+	{ XFS_TRANS_ATTR_RM,		"ATTR_RM" }, \
+	{ XFS_TRANS_ATTR_FLAG,		"ATTR_FLAG" }, \
+	{ XFS_TRANS_CLEAR_AGI_BUCKET,	"CLEAR_AGI_BUCKET" }, \
+	{ XFS_TRANS_QM_SBCHANGE,	"QM_SBCHANGE" }, \
+	{ XFS_TRANS_QM_QUOTAOFF,	"QM_QUOTAOFF" }, \
+	{ XFS_TRANS_QM_DQALLOC,		"QM_DQALLOC" }, \
+	{ XFS_TRANS_QM_SETQLIM,		"QM_SETQLIM" }, \
+	{ XFS_TRANS_QM_DQCLUSTER,	"QM_DQCLUSTER" }, \
+	{ XFS_TRANS_QM_QINOCREATE,	"QM_QINOCREATE" }, \
+	{ XFS_TRANS_QM_QUOTAOFF_END,	"QM_QOFF_END" }, \
+	{ XFS_TRANS_SB_UNIT,		"SB_UNIT" }, \
+	{ XFS_TRANS_FSYNC_TS,		"FSYNC_TS" }, \
+	{ XFS_TRANS_GROWFSRT_ALLOC,	"GROWFSRT_ALLOC" }, \
+	{ XFS_TRANS_GROWFSRT_ZERO,	"GROWFSRT_ZERO" }, \
+	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
+	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
+	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
+	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
+	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
+	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
+
 /*
  * This structure is used to track log items associated with
  * a transaction.  It points to the log item and keeps some
@@ -782,6 +825,10 @@ typedef struct xfs_log_item {
 #define	XFS_LI_IN_AIL	0x1
 #define XFS_LI_ABORTED	0x2
 
+#define XFS_LI_FLAGS \
+	{ XFS_LI_IN_AIL,	"IN_AIL" }, \
+	{ XFS_LI_ABORTED,	"ABORTED" }
+
 typedef struct xfs_item_ops {
 	uint (*iop_size)(xfs_log_item_t *);
 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
@@ -814,8 +861,7 @@ typedef struct xfs_item_ops {
 #define XFS_ITEM_SUCCESS	0
 #define XFS_ITEM_PINNED		1
 #define XFS_ITEM_LOCKED		2
-#define XFS_ITEM_FLUSHING	3
-#define XFS_ITEM_PUSHBUF	4
+#define XFS_ITEM_PUSHBUF	3
 
 /*
  * This structure is used to maintain a list of block ranges that have been
@@ -864,7 +910,7 @@ typedef struct xfs_trans {
 	unsigned int		t_blk_res_used;	/* # of resvd blocks used */
 	unsigned int		t_rtx_res;	/* # of rt extents resvd */
 	unsigned int		t_rtx_res_used;	/* # of resvd rt extents used */
-	xfs_log_ticket_t	t_ticket;	/* log mgr ticket */
+	struct xlog_ticket	*t_ticket;	/* log mgr ticket */
 	xfs_lsn_t		t_lsn;		/* log seq num of start of
 						 * transaction. */
 	xfs_lsn_t		t_commit_lsn;	/* log seq num of end of
@@ -924,7 +970,7 @@ typedef struct xfs_trans {
  * XFS transaction mechanism exported interfaces.
  */
 xfs_trans_t	*xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint, uint);
 xfs_trans_t	*xfs_trans_dup(xfs_trans_t *);
 int		xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
 				  uint, uint);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2ffc570679be..e799824f7245 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -237,14 +237,15 @@ out:
237} 237}
238 238
239/* 239/*
240 * Function that does the work of pushing on the AIL 240 * xfsaild_push does the work of pushing on the AIL. Returning a timeout of
241 * zero indicates that the caller should sleep until woken.
241 */ 242 */
242long 243long
243xfsaild_push( 244xfsaild_push(
244 struct xfs_ail *ailp, 245 struct xfs_ail *ailp,
245 xfs_lsn_t *last_lsn) 246 xfs_lsn_t *last_lsn)
246{ 247{
247 long tout = 1000; /* milliseconds */ 248 long tout = 0;
248 xfs_lsn_t last_pushed_lsn = *last_lsn; 249 xfs_lsn_t last_pushed_lsn = *last_lsn;
249 xfs_lsn_t target = ailp->xa_target; 250 xfs_lsn_t target = ailp->xa_target;
250 xfs_lsn_t lsn; 251 xfs_lsn_t lsn;
@@ -252,6 +253,7 @@ xfsaild_push(
252 int flush_log, count, stuck; 253 int flush_log, count, stuck;
253 xfs_mount_t *mp = ailp->xa_mount; 254 xfs_mount_t *mp = ailp->xa_mount;
254 struct xfs_ail_cursor *cur = &ailp->xa_cursors; 255 struct xfs_ail_cursor *cur = &ailp->xa_cursors;
256 int push_xfsbufd = 0;
255 257
256 spin_lock(&ailp->xa_lock); 258 spin_lock(&ailp->xa_lock);
257 xfs_trans_ail_cursor_init(ailp, cur); 259 xfs_trans_ail_cursor_init(ailp, cur);
@@ -262,7 +264,7 @@ xfsaild_push(
262 */ 264 */
263 xfs_trans_ail_cursor_done(ailp, cur); 265 xfs_trans_ail_cursor_done(ailp, cur);
264 spin_unlock(&ailp->xa_lock); 266 spin_unlock(&ailp->xa_lock);
265 last_pushed_lsn = 0; 267 *last_lsn = 0;
266 return tout; 268 return tout;
267 } 269 }
268 270
@@ -279,7 +281,6 @@ xfsaild_push(
279 * prevents us from spinning when we can't do anything or there is 281 * prevents us from spinning when we can't do anything or there is
280 * lots of contention on the AIL lists. 282 * lots of contention on the AIL lists.
281 */ 283 */
282 tout = 10;
283 lsn = lip->li_lsn; 284 lsn = lip->li_lsn;
284 flush_log = stuck = count = 0; 285 flush_log = stuck = count = 0;
285 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { 286 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
@@ -308,6 +309,7 @@ xfsaild_push(
308 XFS_STATS_INC(xs_push_ail_pushbuf); 309 XFS_STATS_INC(xs_push_ail_pushbuf);
309 IOP_PUSHBUF(lip); 310 IOP_PUSHBUF(lip);
310 last_pushed_lsn = lsn; 311 last_pushed_lsn = lsn;
312 push_xfsbufd = 1;
311 break; 313 break;
312 314
313 case XFS_ITEM_PINNED: 315 case XFS_ITEM_PINNED:
@@ -322,12 +324,6 @@ xfsaild_push(
322 stuck++; 324 stuck++;
323 break; 325 break;
324 326
325 case XFS_ITEM_FLUSHING:
326 XFS_STATS_INC(xs_push_ail_flushing);
327 last_pushed_lsn = lsn;
328 stuck++;
329 break;
330
331 default: 327 default:
332 ASSERT(0); 328 ASSERT(0);
333 break; 329 break;
@@ -371,19 +367,24 @@ xfsaild_push(
371 * move forward in the AIL. 367 * move forward in the AIL.
372 */ 368 */
373 XFS_STATS_INC(xs_push_ail_flush); 369 XFS_STATS_INC(xs_push_ail_flush);
374 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 370 xfs_log_force(mp, 0);
371 }
372
373 if (push_xfsbufd) {
374 /* we've got delayed write buffers to flush */
375 wake_up_process(mp->m_ddev_targp->bt_task);
375 } 376 }
376 377
377 if (!count) { 378 if (!count) {
378 /* We're past our target or empty, so idle */ 379 /* We're past our target or empty, so idle */
379 tout = 1000; 380 last_pushed_lsn = 0;
380 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 381 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
381 /* 382 /*
382 * We reached the target so wait a bit longer for I/O to 383 * We reached the target so wait a bit longer for I/O to
383 * complete and remove pushed items from the AIL before we 384 * complete and remove pushed items from the AIL before we
384 * start the next scan from the start of the AIL. 385 * start the next scan from the start of the AIL.
385 */ 386 */
386 tout += 20; 387 tout = 50;
387 last_pushed_lsn = 0; 388 last_pushed_lsn = 0;
388 } else if ((stuck * 100) / count > 90) { 389 } else if ((stuck * 100) / count > 90) {
389 /* 390 /*
@@ -395,11 +396,14 @@ xfsaild_push(
395 * Backoff a bit more to allow some I/O to complete before 396 * Backoff a bit more to allow some I/O to complete before
396 * continuing from where we were. 397 * continuing from where we were.
397 */ 398 */
398 tout += 10; 399 tout = 20;
400 } else {
401 /* more to do, but wait a short while before continuing */
402 tout = 10;
399 } 403 }
400 *last_lsn = last_pushed_lsn; 404 *last_lsn = last_pushed_lsn;
401 return tout; 405 return tout;
402} /* xfsaild_push */ 406}
403 407
404 408
405/* 409/*
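
Taken together, the xfsaild_push() changes above replace the fixed one-second wakeup with a small timeout ladder: 0 to sleep until woken, 50ms after reaching the target, 20ms when more than 90% of the scanned items were stuck, and 10ms otherwise. A standalone sketch of that ladder, with pick_timeout() as a hypothetical helper name:

#include <stdio.h>

/*
 * Sketch of the timeout selection xfsaild_push() ends up with after
 * this patch. pick_timeout() is an illustrative name only.
 */
static long pick_timeout(int count, int reached_target, int stuck)
{
	if (!count)
		return 0;	/* AIL empty or past target: idle until woken */
	if (reached_target)
		return 50;	/* wait for I/O to complete, then rescan */
	if ((stuck * 100) / count > 90)
		return 20;	/* mostly stuck: back off a little longer */
	return 10;		/* more to do: short pause, then continue */
}

int main(void)
{
	printf("%ld %ld %ld %ld\n",
	       pick_timeout(0, 0, 0),	/* 0  */
	       pick_timeout(8, 1, 0),	/* 50 */
	       pick_timeout(10, 0, 10),	/* 20 */
	       pick_timeout(10, 0, 1));	/* 10 */
	return 0;
}
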
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 218829e6a152..fb586360d1c9 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -38,6 +38,7 @@
38#include "xfs_trans_priv.h" 38#include "xfs_trans_priv.h"
39#include "xfs_error.h" 39#include "xfs_error.h"
40#include "xfs_rw.h" 40#include "xfs_rw.h"
41#include "xfs_trace.h"
41 42
42 43
43STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, 44STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
@@ -45,6 +46,65 @@ STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
45STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, 46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
46 xfs_daddr_t, int); 47 xfs_daddr_t, int);
47 48
49/*
50 * Add the locked buffer to the transaction.
51 *
52 * The buffer must be locked, and it cannot be associated with any
53 * transaction.
54 *
55 * If the buffer does not yet have a buf log item associated with it,
56 * then allocate one for it. Then add the buf item to the transaction.
57 */
58STATIC void
59_xfs_trans_bjoin(
60 struct xfs_trans *tp,
61 struct xfs_buf *bp,
62 int reset_recur)
63{
64 struct xfs_buf_log_item *bip;
65
66 ASSERT(XFS_BUF_ISBUSY(bp));
67 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
68
69 /*
70 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
71 * it doesn't have one yet, then allocate one and initialize it.
72 * The checks to see if one is there are in xfs_buf_item_init().
73 */
74 xfs_buf_item_init(bp, tp->t_mountp);
75 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
76 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
77 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
78 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
79 if (reset_recur)
80 bip->bli_recur = 0;
81
82 /*
83 * Take a reference for this transaction on the buf item.
84 */
85 atomic_inc(&bip->bli_refcount);
86
87 /*
88 * Get a log_item_desc to point at the new item.
89 */
90 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
91
92 /*
93 * Initialize b_fsprivate2 so we can find it with incore_match()
94 * in xfs_trans_get_buf() and friends above.
95 */
96 XFS_BUF_SET_FSPRIVATE2(bp, tp);
97
98}
99
100void
101xfs_trans_bjoin(
102 struct xfs_trans *tp,
103 struct xfs_buf *bp)
104{
105 _xfs_trans_bjoin(tp, bp, 0);
106 trace_xfs_trans_bjoin(bp->b_fspriv);
107}
48 108
49/* 109/*
50 * Get and lock the buffer for the caller if it is not already 110 * Get and lock the buffer for the caller if it is not already
@@ -74,16 +134,14 @@ xfs_trans_get_buf(xfs_trans_t *tp,
74 xfs_buf_log_item_t *bip; 134 xfs_buf_log_item_t *bip;
75 135
76 if (flags == 0) 136 if (flags == 0)
77 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; 137 flags = XBF_LOCK | XBF_MAPPED;
78 138
79 /* 139 /*
80 * Default to a normal get_buf() call if the tp is NULL. 140 * Default to a normal get_buf() call if the tp is NULL.
81 */ 141 */
82 if (tp == NULL) { 142 if (tp == NULL)
83 bp = xfs_buf_get_flags(target_dev, blkno, len, 143 return xfs_buf_get(target_dev, blkno, len,
84 flags | BUF_BUSY); 144 flags | XBF_DONT_BLOCK);
85 return(bp);
86 }
87 145
88 /* 146 /*
89 * If we find the buffer in the cache with this transaction 147 * If we find the buffer in the cache with this transaction
@@ -98,79 +156,43 @@ xfs_trans_get_buf(xfs_trans_t *tp,
98 } 156 }
99 if (bp != NULL) { 157 if (bp != NULL) {
100 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 158 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
101 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { 159 if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
102 xfs_buftrace("TRANS GET RECUR SHUT", bp);
103 XFS_BUF_SUPER_STALE(bp); 160 XFS_BUF_SUPER_STALE(bp);
104 } 161
105 /* 162 /*
106 * If the buffer is stale then it was binval'ed 163 * If the buffer is stale then it was binval'ed
107 * since last read. This doesn't matter since the 164 * since last read. This doesn't matter since the
108 * caller isn't allowed to use the data anyway. 165 * caller isn't allowed to use the data anyway.
109 */ 166 */
110 else if (XFS_BUF_ISSTALE(bp)) { 167 else if (XFS_BUF_ISSTALE(bp))
111 xfs_buftrace("TRANS GET RECUR STALE", bp);
112 ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); 168 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
113 } 169
114 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 170 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
115 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 171 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
116 ASSERT(bip != NULL); 172 ASSERT(bip != NULL);
117 ASSERT(atomic_read(&bip->bli_refcount) > 0); 173 ASSERT(atomic_read(&bip->bli_refcount) > 0);
118 bip->bli_recur++; 174 bip->bli_recur++;
119 xfs_buftrace("TRANS GET RECUR", bp); 175 trace_xfs_trans_get_buf_recur(bip);
120 xfs_buf_item_trace("GET RECUR", bip);
121 return (bp); 176 return (bp);
122 } 177 }
123 178
124 /* 179 /*
125 * We always specify the BUF_BUSY flag within a transaction so 180 * We always specify the XBF_DONT_BLOCK flag within a transaction
126 * that get_buf does not try to push out a delayed write buffer 181 * so that get_buf does not try to push out a delayed write buffer
127 * which might cause another transaction to take place (if the 182 * which might cause another transaction to take place (if the
128 * buffer was delayed alloc). Such recursive transactions can 183 * buffer was delayed alloc). Such recursive transactions can
129 * easily deadlock with our current transaction as well as cause 184 * easily deadlock with our current transaction as well as cause
130 * us to run out of stack space. 185 * us to run out of stack space.
131 */ 186 */
132 bp = xfs_buf_get_flags(target_dev, blkno, len, flags | BUF_BUSY); 187 bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK);
133 if (bp == NULL) { 188 if (bp == NULL) {
134 return NULL; 189 return NULL;
135 } 190 }
136 191
137 ASSERT(!XFS_BUF_GETERROR(bp)); 192 ASSERT(!XFS_BUF_GETERROR(bp));
138 193
139 /* 194 _xfs_trans_bjoin(tp, bp, 1);
140 * The xfs_buf_log_item pointer is stored in b_fsprivate. If 195 trace_xfs_trans_get_buf(bp->b_fspriv);
141 * it doesn't have one yet, then allocate one and initialize it.
142 * The checks to see if one is there are in xfs_buf_item_init().
143 */
144 xfs_buf_item_init(bp, tp->t_mountp);
145
146 /*
147 * Set the recursion count for the buffer within this transaction
148 * to 0.
149 */
150 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
151 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
152 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
153 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
154 bip->bli_recur = 0;
155
156 /*
157 * Take a reference for this transaction on the buf item.
158 */
159 atomic_inc(&bip->bli_refcount);
160
161 /*
162 * Get a log_item_desc to point at the new item.
163 */
164 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
165
166 /*
167 * Initialize b_fsprivate2 so we can find it with incore_match()
168 * above.
169 */
170 XFS_BUF_SET_FSPRIVATE2(bp, tp);
171
172 xfs_buftrace("TRANS GET", bp);
173 xfs_buf_item_trace("GET", bip);
174 return (bp); 196 return (bp);
175} 197}
176 198
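
Two flag rules are visible in the rewritten xfs_trans_get_buf() above: a zero flags argument defaults to XBF_LOCK | XBF_MAPPED, and XBF_DONT_BLOCK is always OR'ed in under a transaction so get_buf never pushes out a delayed-write buffer (which could start a recursive transaction and deadlock). A hedged sketch of just that flag handling; the flag values and trans_buf_flags() are stand-ins, not the kernel definitions:

#include <stdio.h>

#define XBF_LOCK	(1 << 0)
#define XBF_MAPPED	(1 << 1)
#define XBF_DONT_BLOCK	(1 << 2)

/* Mirrors the defaulting and OR-in seen in the hunk above. */
static unsigned int trans_buf_flags(unsigned int flags, int in_transaction)
{
	if (flags == 0)
		flags = XBF_LOCK | XBF_MAPPED;
	if (in_transaction)
		flags |= XBF_DONT_BLOCK;
	return flags;
}

int main(void)
{
	printf("0x%x\n", trans_buf_flags(0, 1));	/* 0x7 */
	printf("0x%x\n", trans_buf_flags(XBF_LOCK, 0));	/* 0x1 */
	return 0;
}
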
@@ -210,49 +232,16 @@ xfs_trans_getsb(xfs_trans_t *tp,
210 ASSERT(bip != NULL); 232 ASSERT(bip != NULL);
211 ASSERT(atomic_read(&bip->bli_refcount) > 0); 233 ASSERT(atomic_read(&bip->bli_refcount) > 0);
212 bip->bli_recur++; 234 bip->bli_recur++;
213 xfs_buf_item_trace("GETSB RECUR", bip); 235 trace_xfs_trans_getsb_recur(bip);
214 return (bp); 236 return (bp);
215 } 237 }
216 238
217 bp = xfs_getsb(mp, flags); 239 bp = xfs_getsb(mp, flags);
218 if (bp == NULL) { 240 if (bp == NULL)
219 return NULL; 241 return NULL;
220 }
221
222 /*
223 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
224 * it doesn't have one yet, then allocate one and initialize it.
225 * The checks to see if one is there are in xfs_buf_item_init().
226 */
227 xfs_buf_item_init(bp, mp);
228
229 /*
230 * Set the recursion count for the buffer within this transaction
231 * to 0.
232 */
233 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
234 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
235 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
236 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
237 bip->bli_recur = 0;
238 242
239 /* 243 _xfs_trans_bjoin(tp, bp, 1);
240 * Take a reference for this transaction on the buf item. 244 trace_xfs_trans_getsb(bp->b_fspriv);
241 */
242 atomic_inc(&bip->bli_refcount);
243
244 /*
245 * Get a log_item_desc to point at the new item.
246 */
247 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
248
249 /*
250 * Initialize b_fsprivate2 so we can find it with incore_match()
251 * above.
252 */
253 XFS_BUF_SET_FSPRIVATE2(bp, tp);
254
255 xfs_buf_item_trace("GETSB", bip);
256 return (bp); 245 return (bp);
257} 246}
258 247
@@ -296,15 +285,15 @@ xfs_trans_read_buf(
296 int error; 285 int error;
297 286
298 if (flags == 0) 287 if (flags == 0)
299 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; 288 flags = XBF_LOCK | XBF_MAPPED;
300 289
301 /* 290 /*
302 * Default to a normal get_buf() call if the tp is NULL. 291 * Default to a normal get_buf() call if the tp is NULL.
303 */ 292 */
304 if (tp == NULL) { 293 if (tp == NULL) {
305 bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); 294 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
306 if (!bp) 295 if (!bp)
307 return (flags & XFS_BUF_TRYLOCK) ? 296 return (flags & XBF_TRYLOCK) ?
308 EAGAIN : XFS_ERROR(ENOMEM); 297 EAGAIN : XFS_ERROR(ENOMEM);
309 298
310 if (XFS_BUF_GETERROR(bp) != 0) { 299 if (XFS_BUF_GETERROR(bp) != 0) {
@@ -350,7 +339,7 @@ xfs_trans_read_buf(
350 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 339 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
351 ASSERT((XFS_BUF_ISERROR(bp)) == 0); 340 ASSERT((XFS_BUF_ISERROR(bp)) == 0);
352 if (!(XFS_BUF_ISDONE(bp))) { 341 if (!(XFS_BUF_ISDONE(bp))) {
353 xfs_buftrace("READ_BUF_INCORE !DONE", bp); 342 trace_xfs_trans_read_buf_io(bp, _RET_IP_);
354 ASSERT(!XFS_BUF_ISASYNC(bp)); 343 ASSERT(!XFS_BUF_ISASYNC(bp));
355 XFS_BUF_READ(bp); 344 XFS_BUF_READ(bp);
356 xfsbdstrat(tp->t_mountp, bp); 345 xfsbdstrat(tp->t_mountp, bp);
@@ -375,7 +364,7 @@ xfs_trans_read_buf(
375 * brelse it either. Just get out. 364 * brelse it either. Just get out.
376 */ 365 */
377 if (XFS_FORCED_SHUTDOWN(mp)) { 366 if (XFS_FORCED_SHUTDOWN(mp)) {
378 xfs_buftrace("READ_BUF_INCORE XFSSHUTDN", bp); 367 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
379 *bpp = NULL; 368 *bpp = NULL;
380 return XFS_ERROR(EIO); 369 return XFS_ERROR(EIO);
381 } 370 }
@@ -385,27 +374,26 @@ xfs_trans_read_buf(
385 bip->bli_recur++; 374 bip->bli_recur++;
386 375
387 ASSERT(atomic_read(&bip->bli_refcount) > 0); 376 ASSERT(atomic_read(&bip->bli_refcount) > 0);
388 xfs_buf_item_trace("READ RECUR", bip); 377 trace_xfs_trans_read_buf_recur(bip);
389 *bpp = bp; 378 *bpp = bp;
390 return 0; 379 return 0;
391 } 380 }
392 381
393 /* 382 /*
394 * We always specify the BUF_BUSY flag within a transaction so 383 * We always specify the XBF_DONT_BLOCK flag within a transaction
395 * that get_buf does not try to push out a delayed write buffer 384 * so that get_buf does not try to push out a delayed write buffer
396 * which might cause another transaction to take place (if the 385 * which might cause another transaction to take place (if the
397 * buffer was delayed alloc). Such recursive transactions can 386 * buffer was delayed alloc). Such recursive transactions can
398 * easily deadlock with our current transaction as well as cause 387 * easily deadlock with our current transaction as well as cause
399 * us to run out of stack space. 388 * us to run out of stack space.
400 */ 389 */
401 bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); 390 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
402 if (bp == NULL) { 391 if (bp == NULL) {
403 *bpp = NULL; 392 *bpp = NULL;
404 return 0; 393 return 0;
405 } 394 }
406 if (XFS_BUF_GETERROR(bp) != 0) { 395 if (XFS_BUF_GETERROR(bp) != 0) {
407 XFS_BUF_SUPER_STALE(bp); 396 XFS_BUF_SUPER_STALE(bp);
408 xfs_buftrace("READ ERROR", bp);
409 error = XFS_BUF_GETERROR(bp); 397 error = XFS_BUF_GETERROR(bp);
410 398
411 xfs_ioerror_alert("xfs_trans_read_buf", mp, 399 xfs_ioerror_alert("xfs_trans_read_buf", mp,
@@ -431,41 +419,9 @@ xfs_trans_read_buf(
431 if (XFS_FORCED_SHUTDOWN(mp)) 419 if (XFS_FORCED_SHUTDOWN(mp))
432 goto shutdown_abort; 420 goto shutdown_abort;
433 421
434 /* 422 _xfs_trans_bjoin(tp, bp, 1);
435 * The xfs_buf_log_item pointer is stored in b_fsprivate. If 423 trace_xfs_trans_read_buf(bp->b_fspriv);
436 * it doesn't have one yet, then allocate one and initialize it.
437 * The checks to see if one is there are in xfs_buf_item_init().
438 */
439 xfs_buf_item_init(bp, tp->t_mountp);
440
441 /*
442 * Set the recursion count for the buffer within this transaction
443 * to 0.
444 */
445 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
446 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
447 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
448 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
449 bip->bli_recur = 0;
450
451 /*
452 * Take a reference for this transaction on the buf item.
453 */
454 atomic_inc(&bip->bli_refcount);
455
456 /*
457 * Get a log_item_desc to point at the new item.
458 */
459 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
460 424
461 /*
462 * Initialize b_fsprivate2 so we can find it with incore_match()
463 * above.
464 */
465 XFS_BUF_SET_FSPRIVATE2(bp, tp);
466
467 xfs_buftrace("TRANS READ", bp);
468 xfs_buf_item_trace("READ", bip);
469 *bpp = bp; 425 *bpp = bp;
470 return 0; 426 return 0;
471 427
@@ -480,10 +436,10 @@ shutdown_abort:
480 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 436 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
481 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); 437 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp);
482#endif 438#endif
483 ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) != 439 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
484 (XFS_B_STALE|XFS_B_DELWRI)); 440 (XBF_STALE|XBF_DELWRI));
485 441
486 xfs_buftrace("READ_BUF XFSSHUTDN", bp); 442 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
487 xfs_buf_relse(bp); 443 xfs_buf_relse(bp);
488 *bpp = NULL; 444 *bpp = NULL;
489 return XFS_ERROR(EIO); 445 return XFS_ERROR(EIO);
@@ -549,13 +505,14 @@ xfs_trans_brelse(xfs_trans_t *tp,
549 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); 505 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
550 ASSERT(lidp != NULL); 506 ASSERT(lidp != NULL);
551 507
508 trace_xfs_trans_brelse(bip);
509
552 /* 510 /*
553 * If the release is just for a recursive lock, 511 * If the release is just for a recursive lock,
554 * then decrement the count and return. 512 * then decrement the count and return.
555 */ 513 */
556 if (bip->bli_recur > 0) { 514 if (bip->bli_recur > 0) {
557 bip->bli_recur--; 515 bip->bli_recur--;
558 xfs_buf_item_trace("RELSE RECUR", bip);
559 return; 516 return;
560 } 517 }
561 518
@@ -563,10 +520,8 @@ xfs_trans_brelse(xfs_trans_t *tp,
563 * If the buffer is dirty within this transaction, we can't 520 * If the buffer is dirty within this transaction, we can't
564 * release it until we commit. 521 * release it until we commit.
565 */ 522 */
566 if (lidp->lid_flags & XFS_LID_DIRTY) { 523 if (lidp->lid_flags & XFS_LID_DIRTY)
567 xfs_buf_item_trace("RELSE DIRTY", bip);
568 return; 524 return;
569 }
570 525
571 /* 526 /*
572 * If the buffer has been invalidated, then we can't release 527 * If the buffer has been invalidated, then we can't release
@@ -574,13 +529,10 @@ xfs_trans_brelse(xfs_trans_t *tp,
574 * as part of this transaction. This prevents us from pulling 529 * as part of this transaction. This prevents us from pulling
575 * the item from the AIL before we should. 530 * the item from the AIL before we should.
576 */ 531 */
577 if (bip->bli_flags & XFS_BLI_STALE) { 532 if (bip->bli_flags & XFS_BLI_STALE)
578 xfs_buf_item_trace("RELSE STALE", bip);
579 return; 533 return;
580 }
581 534
582 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); 535 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
583 xfs_buf_item_trace("RELSE", bip);
584 536
585 /* 537 /*
586 * Free up the log item descriptor tracking the released item. 538 * Free up the log item descriptor tracking the released item.
@@ -634,53 +586,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
634} 586}
635 587
636/* 588/*
637 * Add the locked buffer to the transaction.
638 * The buffer must be locked, and it cannot be associated with any
639 * transaction.
640 *
641 * If the buffer does not yet have a buf log item associated with it,
642 * then allocate one for it. Then add the buf item to the transaction.
643 */
644void
645xfs_trans_bjoin(xfs_trans_t *tp,
646 xfs_buf_t *bp)
647{
648 xfs_buf_log_item_t *bip;
649
650 ASSERT(XFS_BUF_ISBUSY(bp));
651 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
652
653 /*
654 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
655 * it doesn't have one yet, then allocate one and initialize it.
656 * The checks to see if one is there are in xfs_buf_item_init().
657 */
658 xfs_buf_item_init(bp, tp->t_mountp);
659 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
660 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
661 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
662 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
663
664 /*
665 * Take a reference for this transaction on the buf item.
666 */
667 atomic_inc(&bip->bli_refcount);
668
669 /*
670 * Get a log_item_desc to point at the new item.
671 */
672 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
673
674 /*
675 * Initialize b_fsprivate2 so we can find it with incore_match()
676 * in xfs_trans_get_buf() and friends above.
677 */
678 XFS_BUF_SET_FSPRIVATE2(bp, tp);
679
680 xfs_buf_item_trace("BJOIN", bip);
681}
682
683/*
684 * Mark the buffer as not needing to be unlocked when the buf item's 589 * Mark the buffer as not needing to be unlocked when the buf item's
685 * IOP_UNLOCK() routine is called. The buffer must already be locked 590 * IOP_UNLOCK() routine is called. The buffer must already be locked
686 * and associated with the given transaction. 591 * and associated with the given transaction.
@@ -701,7 +606,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
701 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 606 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
702 ASSERT(atomic_read(&bip->bli_refcount) > 0); 607 ASSERT(atomic_read(&bip->bli_refcount) > 0);
703 bip->bli_flags |= XFS_BLI_HOLD; 608 bip->bli_flags |= XFS_BLI_HOLD;
704 xfs_buf_item_trace("BHOLD", bip); 609 trace_xfs_trans_bhold(bip);
705} 610}
706 611
707/* 612/*
@@ -724,7 +629,8 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
724 ASSERT(atomic_read(&bip->bli_refcount) > 0); 629 ASSERT(atomic_read(&bip->bli_refcount) > 0);
725 ASSERT(bip->bli_flags & XFS_BLI_HOLD); 630 ASSERT(bip->bli_flags & XFS_BLI_HOLD);
726 bip->bli_flags &= ~XFS_BLI_HOLD; 631 bip->bli_flags &= ~XFS_BLI_HOLD;
727 xfs_buf_item_trace("BHOLD RELEASE", bip); 632
633 trace_xfs_trans_bhold_release(bip);
728} 634}
729 635
730/* 636/*
@@ -770,6 +676,8 @@ xfs_trans_log_buf(xfs_trans_t *tp,
770 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 676 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
771 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone; 677 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone;
772 678
679 trace_xfs_trans_log_buf(bip);
680
773 /* 681 /*
774 * If we invalidated the buffer within this transaction, then 682 * If we invalidated the buffer within this transaction, then
775 * cancel the invalidation now that we're dirtying the buffer 683 * cancel the invalidation now that we're dirtying the buffer
@@ -777,7 +685,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
777 * because we have a reference to the buffer this entire time. 685 * because we have a reference to the buffer this entire time.
778 */ 686 */
779 if (bip->bli_flags & XFS_BLI_STALE) { 687 if (bip->bli_flags & XFS_BLI_STALE) {
780 xfs_buf_item_trace("BLOG UNSTALE", bip);
781 bip->bli_flags &= ~XFS_BLI_STALE; 688 bip->bli_flags &= ~XFS_BLI_STALE;
782 ASSERT(XFS_BUF_ISSTALE(bp)); 689 ASSERT(XFS_BUF_ISSTALE(bp));
783 XFS_BUF_UNSTALE(bp); 690 XFS_BUF_UNSTALE(bp);
@@ -792,7 +699,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
792 lidp->lid_flags &= ~XFS_LID_BUF_STALE; 699 lidp->lid_flags &= ~XFS_LID_BUF_STALE;
793 bip->bli_flags |= XFS_BLI_LOGGED; 700 bip->bli_flags |= XFS_BLI_LOGGED;
794 xfs_buf_item_log(bip, first, last); 701 xfs_buf_item_log(bip, first, last);
795 xfs_buf_item_trace("BLOG", bip);
796} 702}
797 703
798 704
@@ -831,6 +737,8 @@ xfs_trans_binval(
831 ASSERT(lidp != NULL); 737 ASSERT(lidp != NULL);
832 ASSERT(atomic_read(&bip->bli_refcount) > 0); 738 ASSERT(atomic_read(&bip->bli_refcount) > 0);
833 739
740 trace_xfs_trans_binval(bip);
741
834 if (bip->bli_flags & XFS_BLI_STALE) { 742 if (bip->bli_flags & XFS_BLI_STALE) {
835 /* 743 /*
836 * If the buffer is already invalidated, then 744 * If the buffer is already invalidated, then
@@ -843,8 +751,6 @@ xfs_trans_binval(
843 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 751 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
844 ASSERT(lidp->lid_flags & XFS_LID_DIRTY); 752 ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
845 ASSERT(tp->t_flags & XFS_TRANS_DIRTY); 753 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
846 xfs_buftrace("XFS_BINVAL RECUR", bp);
847 xfs_buf_item_trace("BINVAL RECUR", bip);
848 return; 754 return;
849 } 755 }
850 756
@@ -878,8 +784,6 @@ xfs_trans_binval(
878 (bip->bli_format.blf_map_size * sizeof(uint))); 784 (bip->bli_format.blf_map_size * sizeof(uint)));
879 lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE; 785 lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
880 tp->t_flags |= XFS_TRANS_DIRTY; 786 tp->t_flags |= XFS_TRANS_DIRTY;
881 xfs_buftrace("XFS_BINVAL", bp);
882 xfs_buf_item_trace("BINVAL", bip);
883} 787}
884 788
885/* 789/*
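
The overall shape of the xfs_trans_buf.c refactoring: the join logic that was open-coded in xfs_trans_get_buf(), xfs_trans_getsb() and xfs_trans_read_buf() collapses into _xfs_trans_bjoin(), and each caller keeps only its own tracepoint. A simplified standalone sketch of that pattern; struct buf and the trace strings are stand-ins for the kernel types:

#include <stdio.h>

struct buf { int recur; int joined; };

/* Common join path; reset_recur distinguishes fresh gets from rejoins. */
static void join_common(struct buf *bp, int reset_recur)
{
	if (reset_recur)
		bp->recur = 0;	/* fresh join: reset recursion count */
	bp->joined = 1;		/* take the transaction reference */
}

static void trans_bjoin(struct buf *bp)
{
	join_common(bp, 0);	/* external join keeps the recursion count */
	puts("trace: bjoin");
}

static struct buf *trans_get_buf(struct buf *bp)
{
	join_common(bp, 1);	/* get paths reset the recursion count */
	puts("trace: get_buf");
	return bp;
}

int main(void)
{
	struct buf b = { .recur = 3 };

	trans_bjoin(&b);	/* recur stays 3 */
	trans_get_buf(&b);	/* recur reset to 0 */
	printf("recur=%d joined=%d\n", b.recur, b.joined);
	return 0;
}
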
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index d725428c9df6..b09904555d07 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -151,8 +151,8 @@ typedef enum {
151} xfs_btnum_t; 151} xfs_btnum_t;
152 152
153struct xfs_name { 153struct xfs_name {
154 const char *name; 154 const unsigned char *name;
155 int len; 155 int len;
156}; 156};
157 157
158#endif /* __XFS_TYPES_H__ */ 158#endif /* __XFS_TYPES_H__ */
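
One plausible reading of the const unsigned char switch in struct xfs_name: name bytes feed byte-wise hash functions, and a plain char (signed on x86) sign-extends bytes >= 0x80 before the arithmetic. A toy demonstration, assuming a signed-char ABI; the toy_hash_*() functions are illustrative, not the real xfs_da_hashname():

#include <stdio.h>

static unsigned int toy_hash_signed(const char *p, int len)
{
	unsigned int h = 0;
	while (len--)
		h = h * 31 + *p++;	/* *p sign-extends on signed-char ABIs */
	return h;
}

static unsigned int toy_hash_unsigned(const unsigned char *p, int len)
{
	unsigned int h = 0;
	while (len--)
		h = h * 31 + *p++;	/* always 0..255 */
	return h;
}

int main(void)
{
	const char name[] = "\xc3\xa9";	/* "é" in UTF-8 */

	printf("signed:   %u\n", toy_hash_signed(name, 2));
	printf("unsigned: %u\n",
	       toy_hash_unsigned((const unsigned char *)name, 2));
	return 0;
}
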
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index b572f7e840e0..9d376be0ea38 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -53,6 +53,7 @@
53#include "xfs_log_priv.h" 53#include "xfs_log_priv.h"
54#include "xfs_filestream.h" 54#include "xfs_filestream.h"
55#include "xfs_vnodeops.h" 55#include "xfs_vnodeops.h"
56#include "xfs_trace.h"
56 57
57int 58int
58xfs_setattr( 59xfs_setattr(
@@ -69,7 +70,6 @@ xfs_setattr(
69 uint commit_flags=0; 70 uint commit_flags=0;
70 uid_t uid=0, iuid=0; 71 uid_t uid=0, iuid=0;
71 gid_t gid=0, igid=0; 72 gid_t gid=0, igid=0;
72 int timeflags = 0;
73 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; 73 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
74 int need_iolock = 1; 74 int need_iolock = 1;
75 75
@@ -134,16 +134,13 @@ xfs_setattr(
134 if (flags & XFS_ATTR_NOLOCK) 134 if (flags & XFS_ATTR_NOLOCK)
135 need_iolock = 0; 135 need_iolock = 0;
136 if (!(mask & ATTR_SIZE)) { 136 if (!(mask & ATTR_SIZE)) {
137 if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) || 137 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
138 (mp->m_flags & XFS_MOUNT_WSYNC)) { 138 commit_flags = 0;
139 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); 139 code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp),
140 commit_flags = 0; 140 0, 0, 0);
141 if ((code = xfs_trans_reserve(tp, 0, 141 if (code) {
142 XFS_ICHANGE_LOG_RES(mp), 0, 142 lock_flags = 0;
143 0, 0))) { 143 goto error_return;
144 lock_flags = 0;
145 goto error_return;
146 }
147 } 144 }
148 } else { 145 } else {
149 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && 146 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
@@ -259,7 +256,7 @@ xfs_setattr(
259 iattr->ia_size > ip->i_d.di_size) { 256 iattr->ia_size > ip->i_d.di_size) {
260 code = xfs_flush_pages(ip, 257 code = xfs_flush_pages(ip,
261 ip->i_d.di_size, iattr->ia_size, 258 ip->i_d.di_size, iattr->ia_size,
262 XFS_B_ASYNC, FI_NONE); 259 XBF_ASYNC, FI_NONE);
263 } 260 }
264 261
265 /* wait for all I/O to complete */ 262 /* wait for all I/O to complete */
@@ -294,15 +291,23 @@ xfs_setattr(
294 * or we are explicitly asked to change it. This handles 291 * or we are explicitly asked to change it. This handles
295 * the semantic difference between truncate() and ftruncate() 292 * the semantic difference between truncate() and ftruncate()
296 * as implemented in the VFS. 293 * as implemented in the VFS.
294 *
295 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME
296 * is a special case where we need to update the times despite
297 * not having these flags set. For all other operations the
298 * VFS set these flags explicitly if it wants a timestamp
299 * update.
297 */ 300 */
298 if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME)) 301 if (iattr->ia_size != ip->i_size &&
299 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 302 (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
303 iattr->ia_ctime = iattr->ia_mtime =
304 current_fs_time(inode->i_sb);
305 mask |= ATTR_CTIME | ATTR_MTIME;
306 }
300 307
301 if (iattr->ia_size > ip->i_size) { 308 if (iattr->ia_size > ip->i_size) {
302 ip->i_d.di_size = iattr->ia_size; 309 ip->i_d.di_size = iattr->ia_size;
303 ip->i_size = iattr->ia_size; 310 ip->i_size = iattr->ia_size;
304 if (!(flags & XFS_ATTR_DMI))
305 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
306 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 311 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
307 } else if (iattr->ia_size <= ip->i_size || 312 } else if (iattr->ia_size <= ip->i_size ||
308 (iattr->ia_size == 0 && ip->i_d.di_nextents)) { 313 (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
@@ -373,9 +378,6 @@ xfs_setattr(
373 ip->i_d.di_gid = gid; 378 ip->i_d.di_gid = gid;
374 inode->i_gid = gid; 379 inode->i_gid = gid;
375 } 380 }
376
377 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
378 timeflags |= XFS_ICHGTIME_CHG;
379 } 381 }
380 382
381 /* 383 /*
@@ -392,51 +394,37 @@ xfs_setattr(
392 394
393 inode->i_mode &= S_IFMT; 395 inode->i_mode &= S_IFMT;
394 inode->i_mode |= mode & ~S_IFMT; 396 inode->i_mode |= mode & ~S_IFMT;
395
396 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
397 timeflags |= XFS_ICHGTIME_CHG;
398 } 397 }
399 398
400 /* 399 /*
401 * Change file access or modified times. 400 * Change file access or modified times.
402 */ 401 */
403 if (mask & (ATTR_ATIME|ATTR_MTIME)) { 402 if (mask & ATTR_ATIME) {
404 if (mask & ATTR_ATIME) { 403 inode->i_atime = iattr->ia_atime;
405 inode->i_atime = iattr->ia_atime; 404 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
406 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; 405 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
407 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; 406 ip->i_update_core = 1;
408 ip->i_update_core = 1;
409 }
410 if (mask & ATTR_MTIME) {
411 inode->i_mtime = iattr->ia_mtime;
412 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
413 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
414 timeflags &= ~XFS_ICHGTIME_MOD;
415 timeflags |= XFS_ICHGTIME_CHG;
416 }
417 if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)))
418 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
419 } 407 }
420 408 if (mask & ATTR_CTIME) {
421 /*
422 * Change file inode change time only if ATTR_CTIME set
423 * AND we have been called by a DMI function.
424 */
425
426 if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) {
427 inode->i_ctime = iattr->ia_ctime; 409 inode->i_ctime = iattr->ia_ctime;
428 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; 410 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
429 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; 411 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
430 ip->i_update_core = 1; 412 ip->i_update_core = 1;
431 timeflags &= ~XFS_ICHGTIME_CHG; 413 }
414 if (mask & ATTR_MTIME) {
415 inode->i_mtime = iattr->ia_mtime;
416 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
417 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
418 ip->i_update_core = 1;
432 } 419 }
433 420
434 /* 421 /*
435 * Send out timestamp changes that need to be set to the 422 * And finally, log the inode core if any attribute in it
436 * current time. Not done when called by a DMI function. 423 * has been changed.
437 */ 424 */
438 if (timeflags && !(flags & XFS_ATTR_DMI)) 425 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE|
439 xfs_ichgtime(ip, timeflags); 426 ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
427 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
440 428
441 XFS_STATS_INC(xs_ig_attrchg); 429 XFS_STATS_INC(xs_ig_attrchg);
442 430
@@ -451,12 +439,10 @@ xfs_setattr(
451 * mix so this probably isn't worth the trouble to optimize. 439 * mix so this probably isn't worth the trouble to optimize.
452 */ 440 */
453 code = 0; 441 code = 0;
454 if (tp) { 442 if (mp->m_flags & XFS_MOUNT_WSYNC)
455 if (mp->m_flags & XFS_MOUNT_WSYNC) 443 xfs_trans_set_sync(tp);
456 xfs_trans_set_sync(tp);
457 444
458 code = xfs_trans_commit(tp, commit_flags); 445 code = xfs_trans_commit(tp, commit_flags);
459 }
460 446
461 xfs_iunlock(ip, lock_flags); 447 xfs_iunlock(ip, lock_flags);
462 448
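
The timestamp rework in xfs_setattr() above boils down to one rule: a size change without ATTR_CTIME/ATTR_MTIME (the plain truncate() case) stamps both times itself, while every other caller relies on the VFS passing the flags explicitly, with a single ILOG_CORE at the end. A standalone sketch of that rule; fixup_time_mask() is a hypothetical name and time(NULL) stands in for current_fs_time():

#include <stdio.h>
#include <time.h>

#define ATTR_SIZE	(1 << 0)
#define ATTR_CTIME	(1 << 1)
#define ATTR_MTIME	(1 << 2)

static unsigned int fixup_time_mask(unsigned int mask, int size_changed,
				    time_t *ctime, time_t *mtime)
{
	/* truncate() special case: stamp ctime/mtime ourselves */
	if (size_changed && !(mask & (ATTR_CTIME | ATTR_MTIME))) {
		*ctime = *mtime = time(NULL);
		mask |= ATTR_CTIME | ATTR_MTIME;
	}
	return mask;
}

int main(void)
{
	time_t c = 0, m = 0;
	unsigned int mask = fixup_time_mask(ATTR_SIZE, 1, &c, &m);

	printf("mask=0x%x ctime=%ld\n", mask, (long)c);
	return 0;
}
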
@@ -538,9 +524,8 @@ xfs_readlink_bmap(
538 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); 524 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
539 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); 525 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
540 526
541 bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt), 527 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
542 XBF_LOCK | XBF_MAPPED | 528 XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
543 XBF_DONT_BLOCK);
544 error = XFS_BUF_GETERROR(bp); 529 error = XFS_BUF_GETERROR(bp);
545 if (error) { 530 if (error) {
546 xfs_ioerror_alert("xfs_readlink", 531 xfs_ioerror_alert("xfs_readlink",
@@ -599,114 +584,9 @@ xfs_readlink(
599} 584}
600 585
601/* 586/*
602 * xfs_fsync 587 * Flags for xfs_free_eofblocks
603 *
604 * This is called to sync the inode and its data out to disk. We need to hold
605 * the I/O lock while flushing the data, and the inode lock while flushing the
606 * inode. The inode lock CANNOT be held while flushing the data, so acquire
607 * after we're done with that.
608 */ 588 */
609int 589#define XFS_FREE_EOF_TRYLOCK (1<<0)
610xfs_fsync(
611 xfs_inode_t *ip)
612{
613 xfs_trans_t *tp;
614 int error = 0;
615 int log_flushed = 0, changed = 1;
616
617 xfs_itrace_entry(ip);
618
619 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
620 return XFS_ERROR(EIO);
621
622 /*
623 * We always need to make sure that the required inode state is safe on
624 * disk. The inode might be clean but we still might need to force the
625 * log because of committed transactions that haven't hit the disk yet.
626 * Likewise, there could be unflushed non-transactional changes to the
627 * inode core that have to go to disk and this requires us to issue
628 * a synchronous transaction to capture these changes correctly.
629 *
630 * This code relies on the assumption that if the update_* fields
631 * of the inode are clear and the inode is unpinned then it is clean
632 * and no action is required.
633 */
634 xfs_ilock(ip, XFS_ILOCK_SHARED);
635
636 if (!ip->i_update_core) {
637 /*
638 * Timestamps/size haven't changed since last inode flush or
639 * inode transaction commit. That means either nothing got
640 * written or a transaction committed which caught the updates.
641 * If the latter happened and the transaction hasn't hit the
642 * disk yet, the inode will still be pinned. If it is,
643 * force the log.
644 */
645
646 xfs_iunlock(ip, XFS_ILOCK_SHARED);
647
648 if (xfs_ipincount(ip)) {
649 error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
650 XFS_LOG_FORCE | XFS_LOG_SYNC,
651 &log_flushed);
652 } else {
653 /*
654 * If the inode is not pinned and nothing has changed
655 * we don't need to flush the cache.
656 */
657 changed = 0;
658 }
659 } else {
660 /*
661 * Kick off a transaction to log the inode core to get the
662 * updates. The sync transaction will also force the log.
663 */
664 xfs_iunlock(ip, XFS_ILOCK_SHARED);
665 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
666 error = xfs_trans_reserve(tp, 0,
667 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
668 if (error) {
669 xfs_trans_cancel(tp, 0);
670 return error;
671 }
672 xfs_ilock(ip, XFS_ILOCK_EXCL);
673
674 /*
675 * Note - it's possible that we might have pushed ourselves out
676 * of the way during trans_reserve which would flush the inode.
677 * But there's no guarantee that the inode buffer has actually
678 * gone out yet (it's delwri). Plus the buffer could be pinned
679 * anyway if it's part of an inode in another recent
680 * transaction. So we play it safe and fire off the
681 * transaction anyway.
682 */
683 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
684 xfs_trans_ihold(tp, ip);
685 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
686 xfs_trans_set_sync(tp);
687 error = _xfs_trans_commit(tp, 0, &log_flushed);
688
689 xfs_iunlock(ip, XFS_ILOCK_EXCL);
690 }
691
692 if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
693 /*
694 * If the log write didn't issue an ordered tag we need
695 * to flush the disk cache for the data device now.
696 */
697 if (!log_flushed)
698 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
699
700 /*
701 * If this inode is on the RT dev we need to flush that
702 * cache as well.
703 */
704 if (XFS_IS_REALTIME_INODE(ip))
705 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
706 }
707
708 return error;
709}
710 590
711/* 591/*
712 * This is called by xfs_inactive to free any blocks beyond eof 592 * This is called by xfs_inactive to free any blocks beyond eof
@@ -726,7 +606,6 @@ xfs_free_eofblocks(
726 xfs_filblks_t map_len; 606 xfs_filblks_t map_len;
727 int nimaps; 607 int nimaps;
728 xfs_bmbt_irec_t imap; 608 xfs_bmbt_irec_t imap;
729 int use_iolock = (flags & XFS_FREE_EOF_LOCK);
730 609
731 /* 610 /*
732 * Figure out if there are any blocks beyond the end 611 * Figure out if there are any blocks beyond the end
@@ -768,14 +647,19 @@ xfs_free_eofblocks(
768 * cache and we can't 647 * cache and we can't
769 * do that within a transaction. 648 * do that within a transaction.
770 */ 649 */
771 if (use_iolock) 650 if (flags & XFS_FREE_EOF_TRYLOCK) {
651 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
652 xfs_trans_cancel(tp, 0);
653 return 0;
654 }
655 } else {
772 xfs_ilock(ip, XFS_IOLOCK_EXCL); 656 xfs_ilock(ip, XFS_IOLOCK_EXCL);
657 }
773 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 658 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
774 ip->i_size); 659 ip->i_size);
775 if (error) { 660 if (error) {
776 xfs_trans_cancel(tp, 0); 661 xfs_trans_cancel(tp, 0);
777 if (use_iolock) 662 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
778 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
779 return error; 663 return error;
780 } 664 }
781 665
@@ -812,8 +696,7 @@ xfs_free_eofblocks(
812 error = xfs_trans_commit(tp, 696 error = xfs_trans_commit(tp,
813 XFS_TRANS_RELEASE_LOG_RES); 697 XFS_TRANS_RELEASE_LOG_RES);
814 } 698 }
815 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL) 699 xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
816 : XFS_ILOCK_EXCL));
817 } 700 }
818 return error; 701 return error;
819} 702}
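
The new XFS_FREE_EOF_TRYLOCK path above bails out rather than blocking on the iolock, since blocking there from xfs_release() can deadlock against mmap_sem; xfs_inactive() still takes the blocking path because it holds the last reference. A pthreads sketch of the same shape, with illustrative names:

#include <pthread.h>
#include <stdio.h>

static int free_eofblocks(pthread_mutex_t *iolock, int trylock)
{
	if (trylock) {
		if (pthread_mutex_trylock(iolock) != 0)
			return 0;	/* can't get it now: skip, retry later */
	} else {
		pthread_mutex_lock(iolock);	/* last reference: may block */
	}

	/* ... truncate blocks past EOF here ... */

	pthread_mutex_unlock(iolock);
	return 0;
}

int main(void)
{
	pthread_mutex_t iolock = PTHREAD_MUTEX_INITIALIZER;

	free_eofblocks(&iolock, 1);	/* xfs_release(): opportunistic */
	free_eofblocks(&iolock, 0);	/* xfs_inactive(): must succeed */
	return 0;
}
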
@@ -1103,7 +986,7 @@ xfs_release(
1103 */ 986 */
1104 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 987 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1105 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) 988 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
1106 xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); 989 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
1107 } 990 }
1108 991
1109 if (ip->i_d.di_nlink != 0) { 992 if (ip->i_d.di_nlink != 0) {
@@ -1113,7 +996,17 @@ xfs_release(
1113 (ip->i_df.if_flags & XFS_IFEXTENTS)) && 996 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
1114 (!(ip->i_d.di_flags & 997 (!(ip->i_d.di_flags &
1115 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { 998 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1116 error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); 999
1000 /*
1001 * If we can't get the iolock just skip truncating
1002 * the blocks past EOF because we could deadlock
1003 * with the mmap_sem otherwise. We'll get another
1004 * chance to drop them once the last reference to
1005 * the inode is dropped, so we'll never leak blocks
1006 * permanently.
1007 */
1008 error = xfs_free_eofblocks(mp, ip,
1009 XFS_FREE_EOF_TRYLOCK);
1117 if (error) 1010 if (error)
1118 return error; 1011 return error;
1119 } 1012 }
@@ -1184,7 +1077,7 @@ xfs_inactive(
1184 (!(ip->i_d.di_flags & 1077 (!(ip->i_d.di_flags &
1185 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || 1078 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1186 (ip->i_delayed_blks != 0)))) { 1079 (ip->i_delayed_blks != 0)))) {
1187 error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); 1080 error = xfs_free_eofblocks(mp, ip, 0);
1188 if (error) 1081 if (error)
1189 return VN_INACTIVE_CACHE; 1082 return VN_INACTIVE_CACHE;
1190 } 1083 }
@@ -1380,7 +1273,6 @@ xfs_lookup(
1380 if (error) 1273 if (error)
1381 goto out_free_name; 1274 goto out_free_name;
1382 1275
1383 xfs_itrace_ref(*ipp);
1384 return 0; 1276 return 0;
1385 1277
1386out_free_name: 1278out_free_name:
@@ -1526,7 +1418,6 @@ xfs_create(
1526 * At this point, we've gotten a newly allocated inode. 1418 * At this point, we've gotten a newly allocated inode.
1527 * It is locked (and joined to the transaction). 1419 * It is locked (and joined to the transaction).
1528 */ 1420 */
1529 xfs_itrace_ref(ip);
1530 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1421 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1531 1422
1532 /* 1423 /*
@@ -1986,9 +1877,6 @@ xfs_remove(
1986 if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) 1877 if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
1987 xfs_filestream_deassociate(ip); 1878 xfs_filestream_deassociate(ip);
1988 1879
1989 xfs_itrace_exit(ip);
1990 xfs_itrace_exit(dp);
1991
1992 std_return: 1880 std_return:
1993 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { 1881 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
1994 XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL, 1882 XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
@@ -2201,7 +2089,8 @@ xfs_symlink(
2201 if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { 2089 if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
2202 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp, 2090 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
2203 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, 2091 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2204 link_name->name, target_path, 0, 0, 0); 2092 link_name->name,
2093 (unsigned char *)target_path, 0, 0, 0);
2205 if (error) 2094 if (error)
2206 return error; 2095 return error;
2207 } 2096 }
@@ -2285,7 +2174,6 @@ xfs_symlink(
2285 goto error_return; 2174 goto error_return;
2286 goto error1; 2175 goto error1;
2287 } 2176 }
2288 xfs_itrace_ref(ip);
2289 2177
2290 /* 2178 /*
2291 * An error after we've joined dp to the transaction will result in the 2179 * An error after we've joined dp to the transaction will result in the
@@ -2398,7 +2286,8 @@ std_return:
2398 dp, DM_RIGHT_NULL, 2286 dp, DM_RIGHT_NULL,
2399 error ? NULL : ip, 2287 error ? NULL : ip,
2400 DM_RIGHT_NULL, link_name->name, 2288 DM_RIGHT_NULL, link_name->name,
2401 target_path, 0, error, 0); 2289 (unsigned char *)target_path,
2290 0, error, 0);
2402 } 2291 }
2403 2292
2404 if (!error) 2293 if (!error)
@@ -2456,46 +2345,6 @@ xfs_set_dmattrs(
2456 return error; 2345 return error;
2457} 2346}
2458 2347
2459int
2460xfs_reclaim(
2461 xfs_inode_t *ip)
2462{
2463
2464 xfs_itrace_entry(ip);
2465
2466 ASSERT(!VN_MAPPED(VFS_I(ip)));
2467
2468 /* bad inode, get out here ASAP */
2469 if (is_bad_inode(VFS_I(ip))) {
2470 xfs_ireclaim(ip);
2471 return 0;
2472 }
2473
2474 xfs_ioend_wait(ip);
2475
2476 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
2477
2478 /*
2479 * If we have nothing to flush with this inode then complete the
2480 * teardown now, otherwise break the link between the xfs inode and the
2481 * linux inode and clean up the xfs inode later. This avoids flushing
2482 * the inode to disk during the delete operation itself.
2483 *
2484 * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
2485 * first to ensure that xfs_iunpin() will never see an xfs inode
2486 * that has a linux inode being reclaimed. Synchronisation is provided
2487 * by the i_flags_lock.
2488 */
2489 if (!ip->i_update_core && (ip->i_itemp == NULL)) {
2490 xfs_ilock(ip, XFS_ILOCK_EXCL);
2491 xfs_iflock(ip);
2492 xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2493 return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
2494 }
2495 xfs_inode_set_reclaim_tag(ip);
2496 return 0;
2497}
2498
2499/* 2348/*
2500 * xfs_alloc_file_space() 2349 * xfs_alloc_file_space()
2501 * This routine allocates disk space for the given file. 2350 * This routine allocates disk space for the given file.
@@ -2868,7 +2717,6 @@ xfs_free_file_space(
2868 ioffset = offset & ~(rounding - 1); 2717 ioffset = offset & ~(rounding - 1);
2869 2718
2870 if (VN_CACHED(VFS_I(ip)) != 0) { 2719 if (VN_CACHED(VFS_I(ip)) != 0) {
2871 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
2872 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); 2720 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
2873 if (error) 2721 if (error)
2874 goto out_unlock_iolock; 2722 goto out_unlock_iolock;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index a9e102de71a1..d8dfa8d0dadd 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -21,7 +21,6 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
21#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ 21#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
22 22
23int xfs_readlink(struct xfs_inode *ip, char *link); 23int xfs_readlink(struct xfs_inode *ip, char *link);
24int xfs_fsync(struct xfs_inode *ip);
25int xfs_release(struct xfs_inode *ip); 24int xfs_release(struct xfs_inode *ip);
26int xfs_inactive(struct xfs_inode *ip); 25int xfs_inactive(struct xfs_inode *ip);
27int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, 26int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
@@ -38,31 +37,18 @@ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
38 const char *target_path, mode_t mode, struct xfs_inode **ipp, 37 const char *target_path, mode_t mode, struct xfs_inode **ipp,
39 cred_t *credp); 38 cred_t *credp);
40int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 39int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
41int xfs_reclaim(struct xfs_inode *ip);
42int xfs_change_file_space(struct xfs_inode *ip, int cmd, 40int xfs_change_file_space(struct xfs_inode *ip, int cmd,
43 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); 41 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
44int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, 42int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
45 struct xfs_inode *src_ip, struct xfs_inode *target_dp, 43 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
46 struct xfs_name *target_name, struct xfs_inode *target_ip); 44 struct xfs_name *target_name, struct xfs_inode *target_ip);
47int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, 45int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
48 int *valuelenp, int flags); 46 unsigned char *value, int *valuelenp, int flags);
49int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, 47int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
50 int valuelen, int flags); 48 unsigned char *value, int valuelen, int flags);
51int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); 49int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
52int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 50int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
53 int flags, struct attrlist_cursor_kern *cursor); 51 int flags, struct attrlist_cursor_kern *cursor);
54ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
55 const struct iovec *iovp, unsigned int segs,
56 loff_t *offset, int ioflags);
57ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp,
58 loff_t *ppos, struct pipe_inode_info *pipe, size_t count,
59 int flags, int ioflags);
60ssize_t xfs_splice_write(struct xfs_inode *ip,
61 struct pipe_inode_info *pipe, struct file *outfilp,
62 loff_t *ppos, size_t count, int flags, int ioflags);
63ssize_t xfs_write(struct xfs_inode *xip, struct kiocb *iocb,
64 const struct iovec *iovp, unsigned int nsegs,
65 loff_t *offset, int ioflags);
66int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 52int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
67 int flags, struct xfs_iomap *iomapp, int *niomaps); 53 int flags, struct xfs_iomap *iomapp, int *niomaps);
68void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, 54void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
@@ -73,4 +59,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
73 xfs_off_t last, uint64_t flags, int fiopt); 59 xfs_off_t last, uint64_t flags, int fiopt);
74int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); 60int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
75 61
62int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
63
76#endif /* _XFS_VNODEOPS_H */ 64#endif /* _XFS_VNODEOPS_H */