Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/cache.c | 1
-rw-r--r--  fs/9p/fid.c | 13
-rw-r--r--  fs/9p/v9fs.c | 32
-rw-r--r--  fs/9p/v9fs.h | 3
-rw-r--r--  fs/9p/v9fs_vfs.h | 2
-rw-r--r--  fs/9p/vfs_dentry.c | 1
-rw-r--r--  fs/9p/vfs_dir.c | 11
-rw-r--r--  fs/9p/vfs_file.c | 17
-rw-r--r--  fs/9p/vfs_inode.c | 121
-rw-r--r--  fs/9p/vfs_super.c | 60
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/adfs/dir.c | 2
-rw-r--r--  fs/adfs/file.c | 2
-rw-r--r--  fs/adfs/inode.c | 3
-rw-r--r--  fs/adfs/super.c | 1
-rw-r--r--  fs/affs/affs.h | 2
-rw-r--r--  fs/affs/bitmap.c | 1
-rw-r--r--  fs/affs/file.c | 4
-rw-r--r--  fs/affs/inode.c | 1
-rw-r--r--  fs/affs/namei.c | 2
-rw-r--r--  fs/affs/super.c | 1
-rw-r--r--  fs/afs/cache.c | 1
-rw-r--r--  fs/afs/cmservice.c | 1
-rw-r--r--  fs/afs/dir.c | 7
-rw-r--r--  fs/afs/file.c | 66
-rw-r--r--  fs/afs/fsclient.c | 1
-rw-r--r--  fs/afs/inode.c | 1
-rw-r--r--  fs/afs/internal.h | 5
-rw-r--r--  fs/afs/mntpt.c | 32
-rw-r--r--  fs/afs/rxrpc.c | 1
-rw-r--r--  fs/afs/security.c | 5
-rw-r--r--  fs/afs/super.c | 1
-rw-r--r--  fs/afs/vlclient.c | 1
-rw-r--r--  fs/afs/vlocation.c | 1
-rw-r--r--  fs/afs/vnode.c | 1
-rw-r--r--  fs/afs/volume.c | 7
-rw-r--r--  fs/afs/write.c | 3
-rw-r--r--  fs/aio.c | 71
-rw-r--r--  fs/anon_inodes.c | 1
-rw-r--r--  fs/attr.c | 50
-rw-r--r--  fs/autofs/root.c | 2
-rw-r--r--  fs/autofs4/dev-ioctl.c | 19
-rw-r--r--  fs/autofs4/root.c | 28
-rw-r--r--  fs/bad_inode.c | 3
-rw-r--r--  fs/befs/datastream.c | 1
-rw-r--r--  fs/bfs/dir.c | 6
-rw-r--r--  fs/binfmt_aout.c | 16
-rw-r--r--  fs/binfmt_elf_fdpic.c | 9
-rw-r--r--  fs/binfmt_em86.c | 1
-rw-r--r--  fs/binfmt_flat.c | 2
-rw-r--r--  fs/binfmt_script.c | 1
-rw-r--r--  fs/bio-integrity.c | 1
-rw-r--r--  fs/bio.c | 4
-rw-r--r--  fs/block_dev.c | 366
-rw-r--r--  fs/btrfs/acl.c | 5
-rw-r--r--  fs/btrfs/async-thread.c | 2
-rw-r--r--  fs/btrfs/btrfs_inode.h | 3
-rw-r--r--  fs/btrfs/compression.c | 23
-rw-r--r--  fs/btrfs/ctree.c | 114
-rw-r--r--  fs/btrfs/ctree.h | 167
-rw-r--r--  fs/btrfs/delayed-ref.c | 102
-rw-r--r--  fs/btrfs/delayed-ref.h | 3
-rw-r--r--  fs/btrfs/disk-io.c | 186
-rw-r--r--  fs/btrfs/disk-io.h | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 2263
-rw-r--r--  fs/btrfs/extent_io.c | 101
-rw-r--r--  fs/btrfs/extent_io.h | 14
-rw-r--r--  fs/btrfs/extent_map.c | 1
-rw-r--r--  fs/btrfs/file-item.c | 29
-rw-r--r--  fs/btrfs/file.c | 170
-rw-r--r--  fs/btrfs/free-space-cache.c | 1
-rw-r--r--  fs/btrfs/inode-item.c | 27
-rw-r--r--  fs/btrfs/inode.c | 1780
-rw-r--r--  fs/btrfs/ioctl.c | 219
-rw-r--r--  fs/btrfs/locking.c | 1
-rw-r--r--  fs/btrfs/ordered-data.c | 87
-rw-r--r--  fs/btrfs/ordered-data.h | 9
-rw-r--r--  fs/btrfs/ref-cache.c | 1
-rw-r--r--  fs/btrfs/relocation.c | 1972
-rw-r--r--  fs/btrfs/root-tree.c | 23
-rw-r--r--  fs/btrfs/super.c | 59
-rw-r--r--  fs/btrfs/transaction.c | 315
-rw-r--r--  fs/btrfs/transaction.h | 24
-rw-r--r--  fs/btrfs/tree-defrag.c | 7
-rw-r--r--  fs/btrfs/tree-log.c | 242
-rw-r--r--  fs/btrfs/tree-log.h | 2
-rw-r--r--  fs/btrfs/volumes.c | 34
-rw-r--r--  fs/btrfs/xattr.c | 14
-rw-r--r--  fs/btrfs/xattr.h | 6
-rw-r--r--  fs/buffer.c | 149
-rw-r--r--  fs/cachefiles/interface.c | 1
-rw-r--r--  fs/cachefiles/internal.h | 1
-rw-r--r--  fs/cachefiles/namei.c | 99
-rw-r--r--  fs/cachefiles/rdwr.c | 1
-rw-r--r--  fs/cachefiles/security.c | 4
-rw-r--r--  fs/cachefiles/xattr.c | 1
-rw-r--r--  fs/ceph/addr.c | 92
-rw-r--r--  fs/ceph/auth.c | 8
-rw-r--r--  fs/ceph/auth.h | 8
-rw-r--r--  fs/ceph/auth_none.c | 10
-rw-r--r--  fs/ceph/auth_none.h | 2
-rw-r--r--  fs/ceph/auth_x.c | 117
-rw-r--r--  fs/ceph/buffer.c | 3
-rw-r--r--  fs/ceph/caps.c | 165
-rw-r--r--  fs/ceph/ceph_fs.h | 83
-rw-r--r--  fs/ceph/ceph_strings.c | 16
-rw-r--r--  fs/ceph/crypto.c | 1
-rw-r--r--  fs/ceph/debugfs.c | 14
-rw-r--r--  fs/ceph/dir.c | 69
-rw-r--r--  fs/ceph/export.c | 17
-rw-r--r--  fs/ceph/file.c | 25
-rw-r--r--  fs/ceph/inode.c | 131
-rw-r--r--  fs/ceph/ioctl.c | 2
-rw-r--r--  fs/ceph/mds_client.c | 470
-rw-r--r--  fs/ceph/mds_client.h | 6
-rw-r--r--  fs/ceph/messenger.c | 165
-rw-r--r--  fs/ceph/messenger.h | 13
-rw-r--r--  fs/ceph/mon_client.c | 263
-rw-r--r--  fs/ceph/mon_client.h | 27
-rw-r--r--  fs/ceph/msgpool.c | 180
-rw-r--r--  fs/ceph/msgpool.h | 12
-rw-r--r--  fs/ceph/msgr.h | 21
-rw-r--r--  fs/ceph/osd_client.c | 160
-rw-r--r--  fs/ceph/osd_client.h | 5
-rw-r--r--  fs/ceph/osdmap.c | 232
-rw-r--r--  fs/ceph/osdmap.h | 3
-rw-r--r--  fs/ceph/pagelist.c | 3
-rw-r--r--  fs/ceph/rados.h | 30
-rw-r--r--  fs/ceph/snap.c | 59
-rw-r--r--  fs/ceph/super.c | 171
-rw-r--r--  fs/ceph/super.h | 36
-rw-r--r--  fs/ceph/xattr.c | 36
-rw-r--r--  fs/cifs/asn1.c | 103
-rw-r--r--  fs/cifs/cifs_debug.c | 48
-rw-r--r--  fs/cifs/cifs_debug.h | 42
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 35
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 3
-rw-r--r--  fs/cifs/cifs_spnego.c | 7
-rw-r--r--  fs/cifs/cifs_unicode.c | 6
-rw-r--r--  fs/cifs/cifsacl.c | 77
-rw-r--r--  fs/cifs/cifsencrypt.c | 11
-rw-r--r--  fs/cifs/cifsfs.c | 174
-rw-r--r--  fs/cifs/cifsfs.h | 4
-rw-r--r--  fs/cifs/cifsglob.h | 11
-rw-r--r--  fs/cifs/cifsproto.h | 30
-rw-r--r--  fs/cifs/cifssmb.c | 472
-rw-r--r--  fs/cifs/connect.c | 640
-rw-r--r--  fs/cifs/dir.c | 91
-rw-r--r--  fs/cifs/dns_resolve.c | 17
-rw-r--r--  fs/cifs/export.c | 2
-rw-r--r--  fs/cifs/file.c | 249
-rw-r--r--  fs/cifs/inode.c | 145
-rw-r--r--  fs/cifs/ioctl.c | 10
-rw-r--r--  fs/cifs/link.c | 11
-rw-r--r--  fs/cifs/misc.c | 81
-rw-r--r--  fs/cifs/netmisc.c | 16
-rw-r--r--  fs/cifs/readdir.c | 86
-rw-r--r--  fs/cifs/sess.c | 82
-rw-r--r--  fs/cifs/smbencrypt.c | 1
-rw-r--r--  fs/cifs/transport.c | 93
-rw-r--r--  fs/cifs/xattr.c | 41
-rw-r--r--  fs/coda/coda_int.h | 3
-rw-r--r--  fs/coda/dir.c | 1
-rw-r--r--  fs/coda/file.c | 7
-rw-r--r--  fs/coda/inode.c | 9
-rw-r--r--  fs/coda/pioctl.c | 76
-rw-r--r--  fs/coda/psdev.c | 5
-rw-r--r--  fs/coda/upcall.c | 1
-rw-r--r--  fs/compat.c | 135
-rw-r--r--  fs/compat_ioctl.c | 5
-rw-r--r--  fs/configfs/dir.c | 4
-rw-r--r--  fs/configfs/inode.c | 10
-rw-r--r--  fs/configfs/mount.c | 1
-rw-r--r--  fs/configfs/symlink.c | 1
-rw-r--r--  fs/dcache.c | 20
-rw-r--r--  fs/debugfs/file.c | 21
-rw-r--r--  fs/debugfs/inode.c | 1
-rw-r--r--  fs/devpts/inode.c | 10
-rw-r--r--  fs/direct-io.c | 123
-rw-r--r--  fs/dlm/config.c | 1
-rw-r--r--  fs/dlm/debug_fs.c | 1
-rw-r--r--  fs/dlm/lock.c | 6
-rw-r--r--  fs/dlm/lowcomms.c | 1
-rw-r--r--  fs/dlm/netlink.c | 1
-rw-r--r--  fs/dlm/plock.c | 1
-rw-r--r--  fs/dlm/user.c | 89
-rw-r--r--  fs/drop_caches.c | 24
-rw-r--r--  fs/ecryptfs/crypto.c | 38
-rw-r--r--  fs/ecryptfs/dentry.c | 1
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 20
-rw-r--r--  fs/ecryptfs/file.c | 7
-rw-r--r--  fs/ecryptfs/inode.c | 182
-rw-r--r--  fs/ecryptfs/keystore.c | 1
-rw-r--r--  fs/ecryptfs/kthread.c | 1
-rw-r--r--  fs/ecryptfs/main.c | 159
-rw-r--r--  fs/ecryptfs/messaging.c | 1
-rw-r--r--  fs/ecryptfs/miscdev.c | 1
-rw-r--r--  fs/ecryptfs/mmap.c | 58
-rw-r--r--  fs/ecryptfs/read_write.c | 13
-rw-r--r--  fs/ecryptfs/super.c | 23
-rw-r--r--  fs/eventfd.c | 1
-rw-r--r--  fs/eventpoll.c | 3
-rw-r--r--  fs/exec.c | 204
-rw-r--r--  fs/exofs/dir.c | 2
-rw-r--r--  fs/exofs/exofs.h | 2
-rw-r--r--  fs/exofs/file.c | 7
-rw-r--r--  fs/exofs/inode.c | 42
-rw-r--r--  fs/exofs/ios.c | 1
-rw-r--r--  fs/exofs/super.c | 9
-rw-r--r--  fs/ext2/acl.c | 4
-rw-r--r--  fs/ext2/balloc.c | 7
-rw-r--r--  fs/ext2/ext2.h | 3
-rw-r--r--  fs/ext2/file.c | 7
-rw-r--r--  fs/ext2/ialloc.c | 21
-rw-r--r--  fs/ext2/inode.c | 160
-rw-r--r--  fs/ext2/super.c | 117
-rw-r--r--  fs/ext2/symlink.c | 2
-rw-r--r--  fs/ext2/xattr.c | 12
-rw-r--r--  fs/ext2/xattr.h | 12
-rw-r--r--  fs/ext2/xattr_security.c | 3
-rw-r--r--  fs/ext2/xattr_trusted.c | 2
-rw-r--r--  fs/ext2/xattr_user.c | 2
-rw-r--r--  fs/ext3/acl.c | 4
-rw-r--r--  fs/ext3/balloc.c | 7
-rw-r--r--  fs/ext3/dir.c | 2
-rw-r--r--  fs/ext3/fsync.c | 27
-rw-r--r--  fs/ext3/ialloc.c | 17
-rw-r--r--  fs/ext3/inode.c | 4
-rw-r--r--  fs/ext3/super.c | 115
-rw-r--r--  fs/ext3/symlink.c | 2
-rw-r--r--  fs/ext3/xattr.c | 10
-rw-r--r--  fs/ext3/xattr.h | 12
-rw-r--r--  fs/ext3/xattr_security.c | 3
-rw-r--r--  fs/ext3/xattr_trusted.c | 2
-rw-r--r--  fs/ext3/xattr_user.c | 2
-rw-r--r--  fs/ext4/acl.c | 4
-rw-r--r--  fs/ext4/balloc.c | 5
-rw-r--r--  fs/ext4/block_validity.c | 5
-rw-r--r--  fs/ext4/dir.c | 26
-rw-r--r--  fs/ext4/ext4.h | 169
-rw-r--r--  fs/ext4/ext4_jbd2.h | 8
-rw-r--r--  fs/ext4/extents.c | 418
-rw-r--r--  fs/ext4/file.c | 2
-rw-r--r--  fs/ext4/fsync.c | 47
-rw-r--r--  fs/ext4/ialloc.c | 103
-rw-r--r--  fs/ext4/inode.c | 733
-rw-r--r--  fs/ext4/ioctl.c | 27
-rw-r--r--  fs/ext4/mballoc.c | 140
-rw-r--r--  fs/ext4/migrate.c | 3
-rw-r--r--  fs/ext4/move_extent.c | 14
-rw-r--r--  fs/ext4/namei.c | 61
-rw-r--r--  fs/ext4/resize.c | 3
-rw-r--r--  fs/ext4/super.c | 146
-rw-r--r--  fs/ext4/symlink.c | 2
-rw-r--r--  fs/ext4/xattr.c | 49
-rw-r--r--  fs/ext4/xattr.h | 12
-rw-r--r--  fs/ext4/xattr_security.c | 3
-rw-r--r--  fs/ext4/xattr_trusted.c | 2
-rw-r--r--  fs/ext4/xattr_user.c | 2
-rw-r--r--  fs/fat/cache.c | 14
-rw-r--r--  fs/fat/dir.c | 28
-rw-r--r--  fs/fat/fat.h | 22
-rw-r--r--  fs/fat/file.c | 59
-rw-r--r--  fs/fat/inode.c | 43
-rw-r--r--  fs/fat/misc.c | 22
-rw-r--r--  fs/fat/namei_vfat.c | 6
-rw-r--r--  fs/fcntl.c | 71
-rw-r--r--  fs/fifo.c | 1
-rw-r--r--  fs/file_table.c | 21
-rw-r--r--  fs/filesystems.c | 2
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 2
-rw-r--r--  fs/freevxfs/vxfs_subr.c | 1
-rw-r--r--  fs/fs-writeback.c | 242
-rw-r--r--  fs/fscache/object-list.c | 3
-rw-r--r--  fs/fscache/object.c | 6
-rw-r--r--  fs/fscache/operation.c | 5
-rw-r--r--  fs/fscache/page.c | 2
-rw-r--r--  fs/fscache/stats.c | 4
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/fuse/dev.c | 528
-rw-r--r--  fs/fuse/dir.c | 5
-rw-r--r--  fs/fuse/file.c | 48
-rw-r--r--  fs/fuse/fuse_i.h | 6
-rw-r--r--  fs/generic_acl.c | 5
-rw-r--r--  fs/gfs2/acl.c | 6
-rw-r--r--  fs/gfs2/acl.h | 2
-rw-r--r--  fs/gfs2/aops.c | 16
-rw-r--r--  fs/gfs2/bmap.c | 18
-rw-r--r--  fs/gfs2/dentry.c | 1
-rw-r--r--  fs/gfs2/dir.c | 2
-rw-r--r--  fs/gfs2/export.c | 3
-rw-r--r--  fs/gfs2/file.c | 11
-rw-r--r--  fs/gfs2/glock.c | 3
-rw-r--r--  fs/gfs2/glops.c | 1
-rw-r--r--  fs/gfs2/incore.h | 11
-rw-r--r--  fs/gfs2/inode.c | 103
-rw-r--r--  fs/gfs2/inode.h | 4
-rw-r--r--  fs/gfs2/lock_dlm.c | 1
-rw-r--r--  fs/gfs2/log.c | 160
-rw-r--r--  fs/gfs2/log.h | 30
-rw-r--r--  fs/gfs2/lops.c | 2
-rw-r--r--  fs/gfs2/main.c | 2
-rw-r--r--  fs/gfs2/meta_io.c | 5
-rw-r--r--  fs/gfs2/ops_fstype.c | 19
-rw-r--r--  fs/gfs2/ops_inode.c | 5
-rw-r--r--  fs/gfs2/quota.c | 114
-rw-r--r--  fs/gfs2/rgrp.c | 81
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/super.c | 11
-rw-r--r--  fs/gfs2/super.h | 2
-rw-r--r--  fs/gfs2/sys.c | 7
-rw-r--r--  fs/gfs2/trans.c | 18
-rw-r--r--  fs/gfs2/util.c | 1
-rw-r--r--  fs/gfs2/xattr.c | 6
-rw-r--r--  fs/hfs/bnode.c | 1
-rw-r--r--  fs/hfs/btree.c | 1
-rw-r--r--  fs/hfs/mdb.c | 1
-rw-r--r--  fs/hfs/super.c | 1
-rw-r--r--  fs/hfsplus/dir.c | 2
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 3
-rw-r--r--  fs/hfsplus/inode.c | 2
-rw-r--r--  fs/hfsplus/ioctl.c | 12
-rw-r--r--  fs/hfsplus/options.c | 1
-rw-r--r--  fs/hostfs/hostfs_kern.c | 5
-rw-r--r--  fs/hpfs/buffer.c | 1
-rw-r--r--  fs/hpfs/dir.c | 1
-rw-r--r--  fs/hpfs/file.c | 4
-rw-r--r--  fs/hpfs/hpfs_fn.h | 2
-rw-r--r--  fs/hpfs/inode.c | 1
-rw-r--r--  fs/hpfs/super.c | 1
-rw-r--r--  fs/hppfs/hppfs.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 2
-rw-r--r--  fs/inode.c | 28
-rw-r--r--  fs/internal.h | 2
-rw-r--r--  fs/ioctl.c | 107
-rw-r--r--  fs/ioprio.c | 1
-rw-r--r--  fs/isofs/dir.c | 2
-rw-r--r--  fs/isofs/namei.c | 1
-rw-r--r--  fs/jbd/commit.c | 9
-rw-r--r--  fs/jbd/journal.c | 33
-rw-r--r--  fs/jbd/recovery.c | 1
-rw-r--r--  fs/jbd2/checkpoint.c | 3
-rw-r--r--  fs/jbd2/commit.c | 6
-rw-r--r--  fs/jbd2/journal.c | 2
-rw-r--r--  fs/jbd2/recovery.c | 1
-rw-r--r--  fs/jbd2/transaction.c | 5
-rw-r--r--  fs/jffs2/acl.c | 4
-rw-r--r--  fs/jffs2/acl.h | 4
-rw-r--r--  fs/jffs2/background.c | 3
-rw-r--r--  fs/jffs2/compr_lzo.c | 1
-rw-r--r--  fs/jffs2/compr_zlib.c | 1
-rw-r--r--  fs/jffs2/debug.c | 1
-rw-r--r--  fs/jffs2/erase.c | 12
-rw-r--r--  fs/jffs2/file.c | 5
-rw-r--r--  fs/jffs2/fs.c | 14
-rw-r--r--  fs/jffs2/gc.c | 17
-rw-r--r--  fs/jffs2/nodelist.c | 1
-rw-r--r--  fs/jffs2/nodelist.h | 10
-rw-r--r--  fs/jffs2/nodemgmt.c | 29
-rw-r--r--  fs/jffs2/os-linux.h | 5
-rw-r--r--  fs/jffs2/scan.c | 4
-rw-r--r--  fs/jffs2/security.c | 2
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/jffs2/symlink.c | 1
-rw-r--r--  fs/jffs2/wbuf.c | 8
-rw-r--r--  fs/jffs2/write.c | 1
-rw-r--r--  fs/jffs2/xattr.c | 8
-rw-r--r--  fs/jffs2/xattr.h | 8
-rw-r--r--  fs/jffs2/xattr_trusted.c | 2
-rw-r--r--  fs/jffs2/xattr_user.c | 2
-rw-r--r--  fs/jfs/acl.c | 1
-rw-r--r--  fs/jfs/file.c | 6
-rw-r--r--  fs/jfs/inode.c | 2
-rw-r--r--  fs/jfs/jfs_dmap.c | 19
-rw-r--r--  fs/jfs/jfs_dmap.h | 6
-rw-r--r--  fs/jfs/jfs_dtree.c | 1
-rw-r--r--  fs/jfs/jfs_imap.c | 1
-rw-r--r--  fs/jfs/jfs_inode.c | 12
-rw-r--r--  fs/jfs/jfs_inode.h | 3
-rw-r--r--  fs/jfs/jfs_logmgr.c | 1
-rw-r--r--  fs/jfs/jfs_metapage.c | 1
-rw-r--r--  fs/jfs/jfs_unicode.h | 1
-rw-r--r--  fs/jfs/namei.c | 4
-rw-r--r--  fs/jfs/resize.c | 6
-rw-r--r--  fs/jfs/super.c | 30
-rw-r--r--  fs/jfs/symlink.c | 14
-rw-r--r--  fs/jfs/xattr.c | 1
-rw-r--r--  fs/libfs.c | 144
-rw-r--r--  fs/lockd/clntlock.c | 1
-rw-r--r--  fs/lockd/clntproc.c | 1
-rw-r--r--  fs/lockd/mon.c | 1
-rw-r--r--  fs/lockd/svc.c | 1
-rw-r--r--  fs/lockd/svc4proc.c | 1
-rw-r--r--  fs/lockd/svclock.c | 1
-rw-r--r--  fs/lockd/svcproc.c | 1
-rw-r--r--  fs/lockd/svcsubs.c | 1
-rw-r--r--  fs/logfs/dev_bdev.c | 16
-rw-r--r--  fs/logfs/dev_mtd.c | 26
-rw-r--r--  fs/logfs/dir.c | 8
-rw-r--r--  fs/logfs/file.c | 20
-rw-r--r--  fs/logfs/gc.c | 58
-rw-r--r--  fs/logfs/inode.c | 16
-rw-r--r--  fs/logfs/journal.c | 44
-rw-r--r--  fs/logfs/logfs.h | 33
-rw-r--r--  fs/logfs/logfs_abi.h | 10
-rw-r--r--  fs/logfs/readwrite.c | 106
-rw-r--r--  fs/logfs/segment.c | 70
-rw-r--r--  fs/logfs/super.c | 45
-rw-r--r--  fs/minix/bitmap.c | 5
-rw-r--r--  fs/minix/dir.c | 7
-rw-r--r--  fs/minix/file.c | 2
-rw-r--r--  fs/minix/itree_v1.c | 1
-rw-r--r--  fs/minix/itree_v2.c | 27
-rw-r--r--  fs/minix/minix.h | 2
-rw-r--r--  fs/minix/namei.c | 11
-rw-r--r--  fs/mpage.c | 1
-rw-r--r--  fs/namei.c | 50
-rw-r--r--  fs/namespace.c | 19
-rw-r--r--  fs/ncpfs/dir.c | 4
-rw-r--r--  fs/ncpfs/file.c | 5
-rw-r--r--  fs/ncpfs/inode.c | 8
-rw-r--r--  fs/ncpfs/ioctl.c | 28
-rw-r--r--  fs/ncpfs/mmap.c | 2
-rw-r--r--  fs/ncpfs/sock.c | 1
-rw-r--r--  fs/ncpfs/symlink.c | 1
-rw-r--r--  fs/nfs/cache_lib.c | 1
-rw-r--r--  fs/nfs/callback_proc.c | 1
-rw-r--r--  fs/nfs/callback_xdr.c | 1
-rw-r--r--  fs/nfs/client.c | 61
-rw-r--r--  fs/nfs/delegation.c | 89
-rw-r--r--  fs/nfs/dir.c | 154
-rw-r--r--  fs/nfs/direct.c | 1
-rw-r--r--  fs/nfs/dns_resolve.c | 1
-rw-r--r--  fs/nfs/file.c | 25
-rw-r--r--  fs/nfs/fscache.c | 4
-rw-r--r--  fs/nfs/getroot.c | 191
-rw-r--r--  fs/nfs/inode.c | 67
-rw-r--r--  fs/nfs/internal.h | 4
-rw-r--r--  fs/nfs/iostat.h | 6
-rw-r--r--  fs/nfs/namespace.c | 21
-rw-r--r--  fs/nfs/nfs2xdr.c | 1
-rw-r--r--  fs/nfs/nfs3acl.c | 24
-rw-r--r--  fs/nfs/nfs3proc.c | 129
-rw-r--r--  fs/nfs/nfs3xdr.c | 3
-rw-r--r--  fs/nfs/nfs4_fs.h | 8
-rw-r--r--  fs/nfs/nfs4namespace.c | 13
-rw-r--r--  fs/nfs/nfs4proc.c | 185
-rw-r--r--  fs/nfs/nfs4state.c | 36
-rw-r--r--  fs/nfs/nfs4xdr.c | 25
-rw-r--r--  fs/nfs/nfsroot.c | 14
-rw-r--r--  fs/nfs/pagelist.c | 14
-rw-r--r--  fs/nfs/proc.c | 145
-rw-r--r--  fs/nfs/read.c | 4
-rw-r--r--  fs/nfs/super.c | 155
-rw-r--r--  fs/nfs/symlink.c | 1
-rw-r--r--  fs/nfs/unlink.c | 4
-rw-r--r--  fs/nfs/write.c | 107
-rw-r--r--  fs/nfs_common/nfsacl.c | 1
-rw-r--r--  fs/nfsd/export.c | 45
-rw-r--r--  fs/nfsd/nfs2acl.c | 1
-rw-r--r--  fs/nfsd/nfs3acl.c | 1
-rw-r--r--  fs/nfsd/nfs4acl.c | 1
-rw-r--r--  fs/nfsd/nfs4callback.c | 141
-rw-r--r--  fs/nfsd/nfs4idmap.c | 1
-rw-r--r--  fs/nfsd/nfs4proc.c | 51
-rw-r--r--  fs/nfsd/nfs4recover.c | 88
-rw-r--r--  fs/nfsd/nfs4state.c | 377
-rw-r--r--  fs/nfsd/nfs4xdr.c | 36
-rw-r--r--  fs/nfsd/nfscache.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 69
-rw-r--r--  fs/nfsd/nfsd.h | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 2
-rw-r--r--  fs/nfsd/state.h | 47
-rw-r--r--  fs/nfsd/vfs.c | 14
-rw-r--r--  fs/nfsd/vfs.h | 1
-rw-r--r--  fs/nfsd/xdr4.h | 11
-rw-r--r--  fs/nilfs2/alloc.c | 157
-rw-r--r--  fs/nilfs2/alloc.h | 7
-rw-r--r--  fs/nilfs2/btnode.c | 1
-rw-r--r--  fs/nilfs2/btree.c | 93
-rw-r--r--  fs/nilfs2/btree.h | 23
-rw-r--r--  fs/nilfs2/file.c | 4
-rw-r--r--  fs/nilfs2/gcinode.c | 1
-rw-r--r--  fs/nilfs2/inode.c | 16
-rw-r--r--  fs/nilfs2/ioctl.c | 3
-rw-r--r--  fs/nilfs2/mdt.c | 1
-rw-r--r--  fs/nilfs2/nilfs.h | 2
-rw-r--r--  fs/nilfs2/page.c | 1
-rw-r--r--  fs/nilfs2/recovery.c | 3
-rw-r--r--  fs/nilfs2/segbuf.c | 79
-rw-r--r--  fs/nilfs2/segbuf.h | 10
-rw-r--r--  fs/nilfs2/segment.c | 171
-rw-r--r--  fs/nilfs2/segment.h | 6
-rw-r--r--  fs/nilfs2/super.c | 219
-rw-r--r--  fs/nilfs2/the_nilfs.c | 14
-rw-r--r--  fs/nilfs2/the_nilfs.h | 1
-rw-r--r--  fs/notify/fsnotify.c | 1
-rw-r--r--  fs/notify/inode_mark.c | 1
-rw-r--r--  fs/notify/inotify/Kconfig | 1
-rw-r--r--  fs/notify/inotify/inotify.c | 88
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 2
-rw-r--r--  fs/notify/inotify/inotify_user.c | 16
-rw-r--r--  fs/ntfs/aops.c | 1
-rw-r--r--  fs/ntfs/attrib.c | 1
-rw-r--r--  fs/ntfs/compress.c | 1
-rw-r--r--  fs/ntfs/dir.c | 6
-rw-r--r--  fs/ntfs/file.c | 38
-rw-r--r--  fs/ntfs/index.c | 2
-rw-r--r--  fs/ntfs/mft.c | 1
-rw-r--r--  fs/ntfs/namei.c | 1
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/acl.c | 82
-rw-r--r--  fs/ocfs2/alloc.c | 908
-rw-r--r--  fs/ocfs2/alloc.h | 12
-rw-r--r--  fs/ocfs2/aops.c | 3
-rw-r--r--  fs/ocfs2/blockcheck.c | 4
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 1
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 1
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 1
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 3
-rw-r--r--  fs/ocfs2/dir.c | 75
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 14
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 4
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 5
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 28
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 34
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 27
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 17
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 4
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 14
-rw-r--r--  fs/ocfs2/dlmglue.c | 3
-rw-r--r--  fs/ocfs2/extent_map.c | 1
-rw-r--r--  fs/ocfs2/file.c | 283
-rw-r--r--  fs/ocfs2/heartbeat.c | 1
-rw-r--r--  fs/ocfs2/inode.c | 129
-rw-r--r--  fs/ocfs2/inode.h | 4
-rw-r--r--  fs/ocfs2/journal.c | 26
-rw-r--r--  fs/ocfs2/journal.h | 15
-rw-r--r--  fs/ocfs2/localalloc.c | 285
-rw-r--r--  fs/ocfs2/localalloc.h | 3
-rw-r--r--  fs/ocfs2/locks.c | 2
-rw-r--r--  fs/ocfs2/mmap.c | 49
-rw-r--r--  fs/ocfs2/namei.c | 184
-rw-r--r--  fs/ocfs2/ocfs2.h | 36
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 144
-rw-r--r--  fs/ocfs2/quota.h | 12
-rw-r--r--  fs/ocfs2/quota_global.c | 352
-rw-r--r--  fs/ocfs2/quota_local.c | 184
-rw-r--r--  fs/ocfs2/refcounttree.c | 79
-rw-r--r--  fs/ocfs2/refcounttree.h | 4
-rw-r--r--  fs/ocfs2/reservations.c | 847
-rw-r--r--  fs/ocfs2/reservations.h | 159
-rw-r--r--  fs/ocfs2/resize.c | 19
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 1
-rw-r--r--  fs/ocfs2/stack_user.c | 1
-rw-r--r--  fs/ocfs2/suballoc.c | 817
-rw-r--r--  fs/ocfs2/suballoc.h | 26
-rw-r--r--  fs/ocfs2/super.c | 142
-rw-r--r--  fs/ocfs2/super.h | 7
-rw-r--r--  fs/ocfs2/sysfile.c | 1
-rw-r--r--  fs/ocfs2/xattr.c | 127
-rw-r--r--  fs/ocfs2/xattr.h | 12
-rw-r--r--  fs/omfs/file.c | 2
-rw-r--r--  fs/omfs/inode.c | 6
-rw-r--r--  fs/open.c | 168
-rw-r--r--  fs/partitions/acorn.c | 68
-rw-r--r--  fs/partitions/acorn.h | 10
-rw-r--r--  fs/partitions/amiga.c | 13
-rw-r--r--  fs/partitions/amiga.h | 2
-rw-r--r--  fs/partitions/atari.c | 8
-rw-r--r--  fs/partitions/atari.h | 2
-rw-r--r--  fs/partitions/check.c | 85
-rw-r--r--  fs/partitions/check.h | 12
-rw-r--r--  fs/partitions/efi.c | 94
-rw-r--r--  fs/partitions/efi.h | 2
-rw-r--r--  fs/partitions/ibm.c | 21
-rw-r--r--  fs/partitions/ibm.h | 2
-rw-r--r--  fs/partitions/karma.c | 4
-rw-r--r--  fs/partitions/karma.h | 2
-rw-r--r--  fs/partitions/ldm.c | 107
-rw-r--r--  fs/partitions/ldm.h | 2
-rw-r--r--  fs/partitions/mac.c | 13
-rw-r--r--  fs/partitions/mac.h | 2
-rw-r--r--  fs/partitions/msdos.c | 146
-rw-r--r--  fs/partitions/msdos.h | 2
-rw-r--r--  fs/partitions/osf.c | 4
-rw-r--r--  fs/partitions/osf.h | 2
-rw-r--r--  fs/partitions/sgi.c | 6
-rw-r--r--  fs/partitions/sgi.h | 2
-rw-r--r--  fs/partitions/sun.c | 6
-rw-r--r--  fs/partitions/sun.h | 2
-rw-r--r--  fs/partitions/sysv68.c | 6
-rw-r--r--  fs/partitions/sysv68.h | 2
-rw-r--r--  fs/partitions/ultrix.c | 4
-rw-r--r--  fs/partitions/ultrix.h | 2
-rw-r--r--  fs/pipe.c | 133
-rw-r--r--  fs/proc/array.c | 8
-rw-r--r--  fs/proc/base.c | 34
-rw-r--r--  fs/proc/generic.c | 16
-rw-r--r--  fs/proc/inode.c | 5
-rw-r--r--  fs/proc/kcore.c | 6
-rw-r--r--  fs/proc/kmsg.c | 1
-rw-r--r--  fs/proc/nommu.c | 1
-rw-r--r--  fs/proc/proc_devtree.c | 1
-rw-r--r--  fs/proc/proc_net.c | 1
-rw-r--r--  fs/proc/root.c | 1
-rw-r--r--  fs/proc/stat.c | 1
-rw-r--r--  fs/proc/task_mmu.c | 138
-rw-r--r--  fs/proc/task_nommu.c | 1
-rw-r--r--  fs/proc/vmcore.c | 2
-rw-r--r--  fs/qnx4/dir.c | 3
-rw-r--r--  fs/quota/Kconfig | 8
-rw-r--r--  fs/quota/dquot.c | 436
-rw-r--r--  fs/quota/netlink.c | 1
-rw-r--r--  fs/quota/quota.c | 99
-rw-r--r--  fs/quota/quota_tree.c | 50
-rw-r--r--  fs/quota/quota_tree.h | 6
-rw-r--r--  fs/quota/quota_v1.c | 4
-rw-r--r--  fs/quota/quota_v2.c | 6
-rw-r--r--  fs/ramfs/file-mmu.c | 3
-rw-r--r--  fs/ramfs/file-nommu.c | 10
-rw-r--r--  fs/ramfs/inode.c | 23
-rw-r--r--  fs/read_write.c | 19
-rw-r--r--  fs/reiserfs/dir.c | 12
-rw-r--r--  fs/reiserfs/file.c | 8
-rw-r--r--  fs/reiserfs/fix_node.c | 1
-rw-r--r--  fs/reiserfs/inode.c | 4
-rw-r--r--  fs/reiserfs/journal.c | 16
-rw-r--r--  fs/reiserfs/namei.c | 19
-rw-r--r--  fs/reiserfs/super.c | 59
-rw-r--r--  fs/reiserfs/xattr.c | 36
-rw-r--r--  fs/reiserfs/xattr_acl.c | 5
-rw-r--r--  fs/reiserfs/xattr_security.c | 5
-rw-r--r--  fs/reiserfs/xattr_trusted.c | 2
-rw-r--r--  fs/reiserfs/xattr_user.c | 2
-rw-r--r--  fs/signalfd.c | 1
-rw-r--r--  fs/smbfs/dir.c | 3
-rw-r--r--  fs/smbfs/file.c | 6
-rw-r--r--  fs/smbfs/inode.c | 10
-rw-r--r--  fs/smbfs/ioctl.c | 10
-rw-r--r--  fs/smbfs/proto.h | 2
-rw-r--r--  fs/smbfs/smbiod.c | 1
-rw-r--r--  fs/splice.c | 152
-rw-r--r--  fs/squashfs/Kconfig | 11
-rw-r--r--  fs/squashfs/Makefile | 2
-rw-r--r--  fs/squashfs/block.c | 5
-rw-r--r--  fs/squashfs/inode.c | 92
-rw-r--r--  fs/squashfs/namei.c | 6
-rw-r--r--  fs/squashfs/squashfs.h | 12
-rw-r--r--  fs/squashfs/squashfs_fs.h | 76
-rw-r--r--  fs/squashfs/squashfs_fs_i.h | 3
-rw-r--r--  fs/squashfs/squashfs_fs_sb.h | 3
-rw-r--r--  fs/squashfs/super.c | 34
-rw-r--r--  fs/squashfs/symlink.c | 12
-rw-r--r--  fs/squashfs/xattr.c | 323
-rw-r--r--  fs/squashfs/xattr.h | 46
-rw-r--r--  fs/squashfs/xattr_id.c | 100
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 4
-rw-r--r--  fs/statfs.c | 196
-rw-r--r--  fs/super.c | 343
-rw-r--r--  fs/sync.c | 98
-rw-r--r--  fs/sysfs/bin.c | 26
-rw-r--r--  fs/sysfs/dir.c | 114
-rw-r--r--  fs/sysfs/file.c | 17
-rw-r--r--  fs/sysfs/group.c | 6
-rw-r--r--  fs/sysfs/inode.c | 15
-rw-r--r--  fs/sysfs/mount.c | 96
-rw-r--r--  fs/sysfs/symlink.c | 37
-rw-r--r--  fs/sysfs/sysfs.h | 34
-rw-r--r--  fs/sysv/dir.c | 4
-rw-r--r--  fs/sysv/file.c | 2
-rw-r--r--  fs/sysv/ialloc.c | 11
-rw-r--r--  fs/sysv/inode.c | 1
-rw-r--r--  fs/timerfd.c | 26
-rw-r--r--  fs/ubifs/commit.c | 1
-rw-r--r--  fs/ubifs/debug.c | 1
-rw-r--r--  fs/ubifs/dir.c | 9
-rw-r--r--  fs/ubifs/file.c | 18
-rw-r--r--  fs/ubifs/gc.c | 1
-rw-r--r--  fs/ubifs/io.c | 2
-rw-r--r--  fs/ubifs/lpt.c | 1
-rw-r--r--  fs/ubifs/lpt_commit.c | 1
-rw-r--r--  fs/ubifs/recovery.c | 1
-rw-r--r--  fs/ubifs/sb.c | 1
-rw-r--r--  fs/ubifs/tnc.c | 1
-rw-r--r--  fs/ubifs/ubifs.h | 5
-rw-r--r--  fs/ubifs/xattr.c | 1
-rw-r--r--  fs/udf/balloc.c | 53
-rw-r--r--  fs/udf/dir.c | 5
-rw-r--r--  fs/udf/file.c | 71
-rw-r--r--  fs/udf/ialloc.c | 32
-rw-r--r--  fs/udf/inode.c | 7
-rw-r--r--  fs/udf/namei.c | 35
-rw-r--r--  fs/udf/partition.c | 1
-rw-r--r--  fs/udf/super.c | 13
-rw-r--r--  fs/udf/symlink.c | 1
-rw-r--r--  fs/udf/udfdecl.h | 5
-rw-r--r--  fs/udf/unicode.c | 1
-rw-r--r--  fs/ufs/balloc.c | 24
-rw-r--r--  fs/ufs/dir.c | 2
-rw-r--r--  fs/ufs/file.c | 5
-rw-r--r--  fs/ufs/ialloc.c | 23
-rw-r--r--  fs/ufs/inode.c | 6
-rw-r--r--  fs/ufs/namei.c | 18
-rw-r--r--  fs/ufs/super.c | 112
-rw-r--r--  fs/ufs/symlink.c | 8
-rw-r--r--  fs/ufs/truncate.c | 24
-rw-r--r--  fs/ufs/ufs.h | 2
-rw-r--r--  fs/ufs/ufs_fs.h | 1
-rw-r--r--  fs/xattr.c | 14
-rw-r--r--  fs/xattr_acl.c | 2
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/kmem.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 232
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 38
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 9
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 31
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 207
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 233
-rw-r--r--  fs/xfs/linux-2.6/xfs_xattr.c | 8
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 199
-rw-r--r--  fs/xfs/quota/xfs_dquot.h | 35
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c | 30
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 609
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 23
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.c | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 165
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h | 102
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c | 29
-rw-r--r--  fs/xfs/xfs_acl.h | 4
-rw-r--r--  fs/xfs/xfs_ag.h | 25
-rw-r--r--  fs/xfs/xfs_alloc.c | 357
-rw-r--r--  fs/xfs/xfs_alloc.h | 7
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 221
-rw-r--r--  fs/xfs/xfs_buf_item.h | 20
-rw-r--r--  fs/xfs/xfs_dfrag.c | 22
-rw-r--r--  fs/xfs/xfs_error.c | 32
-rw-r--r--  fs/xfs/xfs_error.h | 9
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 18
-rw-r--r--  fs/xfs/xfs_inode.c | 2
-rw-r--r--  fs/xfs/xfs_inode_item.c | 21
-rw-r--r--  fs/xfs/xfs_iomap.c | 123
-rw-r--r--  fs/xfs/xfs_iomap.h | 47
-rw-r--r--  fs/xfs/xfs_log.c | 834
-rw-r--r--  fs/xfs/xfs_log.h | 27
-rw-r--r--  fs/xfs/xfs_log_cil.c | 725
-rw-r--r--  fs/xfs/xfs_log_priv.h | 130
-rw-r--r--  fs/xfs/xfs_log_recover.c | 355
-rw-r--r--  fs/xfs/xfs_log_recover.h | 2
-rw-r--r--  fs/xfs/xfs_mount.c | 7
-rw-r--r--  fs/xfs/xfs_mount.h | 2
-rw-r--r--  fs/xfs/xfs_quota.h | 3
-rw-r--r--  fs/xfs/xfs_trans.c | 810
-rw-r--r--  fs/xfs/xfs_trans.h | 58
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 233
-rw-r--r--  fs/xfs/xfs_trans_item.c | 114
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 15
-rw-r--r--  fs/xfs/xfs_types.h | 2
773 files changed, 24707 insertions, 15985 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index e777961939f3..0dbe0d139ac2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -22,6 +22,7 @@
 
 #include <linux/jiffies.h>
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 08b2eb157048..7317b39b2815 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -110,7 +111,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
 	int i, n, l, clone, any, access;
 	u32 uid;
-	struct p9_fid *fid;
+	struct p9_fid *fid, *old_fid = NULL;
 	struct dentry *d, *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
@@ -183,10 +184,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 		l = min(n - i, P9_MAXWELEM);
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
+			if (old_fid) {
+				/*
+				 * If we fail, clunk fid which are mapping
+				 * to path component and not the last component
+				 * of the path.
+				 */
+				p9_client_clunk(old_fid);
+			}
 			kfree(wnames);
 			return fid;
 		}
-
+		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6c7f6a251115..f8b86e92cd66 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
@@ -237,11 +238,18 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+	if (rc) {
+		__putname(v9ses->aname);
+		__putname(v9ses->uname);
+		return ERR_PTR(rc);
+	}
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
+	v9ses->flags = V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
@@ -262,8 +270,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto error;
 	}
 
-	if (!p9_is_proto_dotu(v9ses->clnt))
-		v9ses->flags &= ~V9FS_PROTO_2000U;
+	if (p9_is_proto_dotl(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000L;
+	else if (p9_is_proto_dotu(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000U;
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
@@ -298,6 +308,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	return fid;
 
 error:
+	bdi_destroy(&v9ses->bdi);
 	return ERR_PTR(retval);
 }
 
@@ -323,6 +334,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
 
+	bdi_destroy(&v9ses->bdi);
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_del(&v9ses->slist);
 	spin_unlock(&v9fs_sessionlist_lock);
@@ -340,6 +353,19 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 	p9_client_disconnect(v9ses->clnt);
 }
 
+/**
+ * v9fs_session_begin_cancel - Begin terminate of a session
+ * @v9ses: session to terminate
+ *
+ * After this call we don't allow any request other than clunk.
+ */
+
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
+{
+	P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+	p9_client_begin_disconnect(v9ses->clnt);
+}
+
 extern int v9fs_error_init(void);
 
 static struct kobject *v9fs_kobj;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 6b801d1ddf4b..bec4d0bcb458 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,6 +20,7 @@
  * Boston, MA 02111-1301 USA
  *
  */
+#include <linux/backing-dev.h>
 
 /**
  * enum p9_session_flags - option flags for each 9P session
@@ -102,12 +103,14 @@ struct v9fs_session_info {
 	u32 uid;		/* if ACCESS_SINGLE, the uid that has access */
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
+	struct backing_dev_info bdi;
 };
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 								char *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
 
 #define V9FS_MAGIC 0x01021997
 
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed835836e0dc..32ef4009d030 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
+extern const struct file_operations v9fs_file_operations_dotl;
 extern const struct file_operations v9fs_dir_operations;
+extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d74325295b1e..cbf4e50f3933 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index d8a3afe4ff72..d61e3b28ce37 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,6 +32,7 @@
 #include <linux/sched.h>
 #include <linux/inet.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -130,6 +131,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
 	while (err == 0) {
 		if (rdir->tail == rdir->head) {
 			err = v9fs_file_readn(filp, rdir->buf, NULL,
@@ -200,3 +203,11 @@ const struct file_operations v9fs_dir_operations = {
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
+
+const struct file_operations v9fs_dir_operations_dotl = {
+	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
+	.readdir = v9fs_dir_readdir,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index df52d488d2a6..2bedc6c94fc2 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -257,15 +257,13 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	return total;
 }
 
-static int v9fs_file_fsync(struct file *filp, struct dentry *dentry,
-				int datasync)
+static int v9fs_file_fsync(struct file *filp, int datasync)
 {
 	struct p9_fid *fid;
 	struct p9_wstat wstat;
 	int retval;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp,
-		   dentry, datasync);
+	P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 	v9fs_blank_wstat(&wstat);
@@ -296,3 +294,14 @@ const struct file_operations v9fs_file_operations = {
 	.mmap = generic_file_readonly_mmap,
 	.fsync = v9fs_file_fsync,
 };
+
+const struct file_operations v9fs_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_file_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5fe45d692c9f..4331b3b5ee1c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -43,9 +44,12 @@
 #include "cache.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
-static const struct inode_operations v9fs_dir_inode_operations_ext;
+static const struct inode_operations v9fs_dir_inode_operations_dotu;
+static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
+static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
+static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
@@ -252,9 +256,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
 	inode->i_rdev = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -274,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 		break;
 	case S_IFREG:
-		inode->i_op = &v9fs_file_inode_operations;
-		inode->i_fop = &v9fs_file_operations;
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
+		} else {
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+		}
+
 		break;
+
 	case S_IFLNK:
-		if (!v9fs_proto_dotu(v9ses)) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "extended modes used w/o 9P2000.u\n");
+		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
+						"legacy protocol.\n");
 			err = -EINVAL;
 			goto error;
 		}
-		inode->i_op = &v9fs_symlink_inode_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_symlink_inode_operations_dotl;
+		else
+			inode->i_op = &v9fs_symlink_inode_operations;
+
 		break;
 	case S_IFDIR:
 		inc_nlink(inode);
-		if (v9fs_proto_dotu(v9ses))
-			inode->i_op = &v9fs_dir_inode_operations_ext;
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotl;
+		else if (v9fs_proto_dotu(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotu;
 		else
 			inode->i_op = &v9fs_dir_inode_operations;
-		inode->i_fop = &v9fs_dir_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_fop = &v9fs_dir_operations_dotl;
+		else
+			inode->i_fop = &v9fs_dir_operations;
+
 		break;
 	default:
 		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -431,20 +452,22 @@ error:
 
 static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
+	int retval;
 	struct inode *file_inode;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *v9fid;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
 		rmdir);
 
 	file_inode = file->d_inode;
-	v9ses = v9fs_inode2v9ses(file_inode);
 	v9fid = v9fs_fid_clone(file);
 	if (IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
 
-	return p9_client_remove(v9fid);
+	retval = p9_client_remove(v9fid);
+	if (!retval)
+		drop_nlink(file_inode);
+	return retval;
 }
 
 static int
@@ -479,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	ofid = NULL;
 	fid = NULL;
 	name = (char *) dentry->d_name.name;
-	dfid = v9fs_fid_clone(dentry->d_parent);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err);
-		dfid = NULL;
-		goto error;
+		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		return ERR_PTR(err);
 	}
 
 	/* clone a fid to use for creation */
@@ -492,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		ofid = NULL;
-		goto error;
+		return ERR_PTR(err);
 	}
 
 	err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -503,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	}
 
 	/* now walk from the parent so we can get unopened fid */
-	fid = p9_client_walk(dfid, 1, &name, 0);
+	fid = p9_client_walk(dfid, 1, &name, 1);
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		fid = NULL;
 		goto error;
-	} else
-		dfid = NULL;
+	}
 
 	/* instantiate inode and assign the unopened fid to the dentry */
 	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -533,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	return ofid;
 
 error:
-	if (dfid)
-		p9_client_clunk(dfid);
-
 	if (ofid)
 		p9_client_clunk(ofid);
 
@@ -656,6 +673,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
 		dir, dentry->d_name.name, dentry, nameidata);
 
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
 	dfid = v9fs_fid_lookup(dentry->d_parent);
@@ -667,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(fid)) {
 		result = PTR_ERR(fid);
 		if (result == -ENOENT) {
-			d_add(dentry, NULL);
-			return NULL;
+			inode = NULL;
+			goto inst_out;
 		}
 
 		return ERR_PTR(result);
@@ -685,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (result < 0)
 		goto error;
 
-	if ((fid->qid.version) && (v9ses->cache))
+inst_out:
+	if (v9ses->cache)
 		dentry->d_op = &v9fs_cached_dentry_operations;
 	else
 		dentry->d_op = &v9fs_dentry_operations;
@@ -764,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto clunk_olddir;
 	}
 
+	if (v9fs_proto_dotl(v9ses)) {
+		retval = p9_client_rename(oldfid, newdirfid,
+					(char *) new_dentry->d_name.name);
+		if (retval != -ENOSYS)
+			goto clunk_newdir;
+	}
+
 	/* 9P can only handle file rename in the same directory */
 	if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
@@ -1189,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 		sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
 	else if (S_ISFIFO(mode))
 		*name = 0;
+	else if (S_ISSOCK(mode))
+		*name = 0;
 	else {
 		__putname(name);
 		return -EINVAL;
@@ -1200,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 		return retval;
 }
 
-static const struct inode_operations v9fs_dir_inode_operations_ext = {
+static const struct inode_operations v9fs_dir_inode_operations_dotu = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.symlink = v9fs_vfs_symlink,
+	.link = v9fs_vfs_link,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static const struct inode_operations v9fs_dir_inode_operations_dotl = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
 	.symlink = v9fs_vfs_symlink,
@@ -1231,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
+static const struct inode_operations v9fs_file_inode_operations_dotl = {
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
 static const struct inode_operations v9fs_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = v9fs_vfs_follow_link,
@@ -1238,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 	.getattr = v9fs_vfs_getattr,
 	.setattr = v9fs_vfs_setattr,
 };
+
+static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
+	.readlink = generic_readlink,
+	.follow_link = v9fs_vfs_follow_link,
+	.put_link = v9fs_vfs_put_link,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 69357c0d9899..be74d020436e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,8 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -44,7 +46,7 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 
-static const struct super_operations v9fs_super_ops;
+static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 
 /**
  * v9fs_set_super - set the superblock
@@ -75,7 +77,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
 	sb->s_magic = V9FS_MAGIC;
-	sb->s_op = &v9fs_super_ops;
+	if (v9fs_proto_dotl(v9ses))
+		sb->s_op = &v9fs_super_ops_dotl;
+	else
+		sb->s_op = &v9fs_super_ops;
+	sb->s_bdi = &v9ses->bdi;
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 		      MS_NOATIME;
@@ -193,6 +199,7 @@ static void v9fs_kill_super(struct super_block *s)
 
 	kill_anon_super(s);
 
+	v9fs_session_cancel(v9ses);
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
 	s->s_fs_info = NULL;
@@ -205,7 +212,43 @@
 	struct v9fs_session_info *v9ses;
 
 	v9ses = sb->s_fs_info;
-	v9fs_session_cancel(v9ses);
+	v9fs_session_begin_cancel(v9ses);
+}
+
+static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_rstatfs rs;
+	int res;
+
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid)) {
+		res = PTR_ERR(fid);
+		goto done;
+	}
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	if (v9fs_proto_dotl(v9ses)) {
+		res = p9_client_statfs(fid, &rs);
+		if (res == 0) {
+			buf->f_type = rs.type;
+			buf->f_bsize = rs.bsize;
+			buf->f_blocks = rs.blocks;
+			buf->f_bfree = rs.bfree;
+			buf->f_bavail = rs.bavail;
+			buf->f_files = rs.files;
+			buf->f_ffree = rs.ffree;
+			buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
+			buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
+			buf->f_namelen = rs.namelen;
+		}
+		if (res != -ENOSYS)
+			goto done;
+	}
+	res = simple_statfs(dentry, buf);
+done:
+	return res;
 }
 
 static const struct super_operations v9fs_super_ops = {
@@ -219,6 +262,17 @@ static const struct super_operations v9fs_super_ops = {
 	.umount_begin = v9fs_umount_begin,
 };
 
+static const struct super_operations v9fs_super_ops_dotl = {
+#ifdef CONFIG_9P_FSCACHE
+	.alloc_inode = v9fs_alloc_inode,
+	.destroy_inode = v9fs_destroy_inode,
+#endif
+	.statfs = v9fs_statfs,
+	.clear_inode = v9fs_clear_inode,
+	.show_options = generic_show_options,
+	.umount_begin = v9fs_umount_begin,
+};
+
 struct file_system_type v9fs_fs_type = {
 	.name = "9p",
 	.get_sb = v9fs_get_sb,
diff --git a/fs/Makefile b/fs/Makefile
index 97f340f14ba2..e6ec1d309b1d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 	attr.o bad_inode.o file.o filesystems.o namespace.o \
 	seq_file.o xattr.o libfs.o fs-writeback.o \
 	pnode.o drop_caches.o splice.o sync.o utimes.o \
-	stack.o fs_struct.o
+	stack.o fs_struct.o statfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f548a0..f4287e4de744 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
 	.readdir = adfs_readdir,
-	.fsync = simple_fsync,
+	.fsync = generic_file_fsync,
 };
 
 static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34d1758..a36da5382b40 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = {
 	.read = do_sync_read,
 	.aio_read = generic_file_aio_read,
 	.mmap = generic_file_mmap,
-	.fsync = simple_fsync,
+	.fsync = generic_file_fsync,
 	.write = do_sync_write,
 	.aio_write = generic_file_aio_write,
 	.splice_read = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 0f5e30978135..6f850b06ab62 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -322,8 +322,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		goto out;
 
+	/* XXX: this is missing some actual on-disk truncation.. */
 	if (ia_valid & ATTR_SIZE)
-		error = vmtruncate(inode, attr->ia_size);
+		error = simple_setsize(inode, attr->ia_size);
 
 	if (error)
 		goto out;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 6910a98bd73c..4a3af7075c1d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/parser.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include "adfs.h"
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 861dae68ac12..f05b6155ccc8 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -183,7 +183,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
 
 void affs_free_prealloc(struct inode *inode);
 extern void affs_truncate(struct inode *);
-int affs_file_fsync(struct file *, struct dentry *, int);
+int affs_file_fsync(struct file *, int);
 
 /* dir.c */
 
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index 8306d53307ed..3e262711ae06 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -7,6 +7,7 @@
  * block allocation, deallocation, calculation of free space.
  */
 
+#include <linux/slab.h>
 #include "affs.h"
 
 /* This is, of course, shamelessly stolen from fs/minix */
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c1c9ba..322710c3eedf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -916,9 +916,9 @@ affs_truncate(struct inode *inode)
 	affs_free_prealloc(inode);
 }
 
-int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int affs_file_fsync(struct file *filp, int datasync)
 {
-	struct inode * inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int ret, err;
 
 	ret = write_inode_now(inode, 0);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index c9744d771d98..f4b2a4ee4f91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
  * (C) 1991 Linus Torvalds - minix filesystem
  */
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		affs_brelse(bh);
 		inode = affs_iget(sb, ino);
 		if (IS_ERR(inode))
-			return ERR_PTR(PTR_ERR(inode));
+			return ERR_CAST(inode);
 	}
 	dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
 	d_add(dentry, inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d41e9673cd97..16a3e4765f68 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index e2b1d3f16519..0fb315dd4d2a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -9,7 +9,6 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
13#include <linux/sched.h> 12#include <linux/sched.h>
14#include "internal.h" 13#include "internal.h"
15 14
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index eb765489164f..a3bcec75c54a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/slab.h>
14#include <linux/sched.h> 15#include <linux/sched.h>
15#include <linux/ip.h> 16#include <linux/ip.h>
16#include "internal.h" 17#include "internal.h"
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 88067f36e5e7..b42d5cc1d6d2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/pagemap.h> 16#include <linux/pagemap.h>
18#include <linux/ctype.h> 17#include <linux/ctype.h>
@@ -190,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
190 struct key *key) 189 struct key *key)
191{ 190{
192 struct page *page; 191 struct page *page;
193 struct file file = {
194 .private_data = key,
195 };
196
197 _enter("{%lu},%lu", dir->i_ino, index); 192 _enter("{%lu},%lu", dir->i_ino, index);
198 193
199 page = read_mapping_page(dir->i_mapping, index, &file); 194 page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
200 if (!IS_ERR(page)) { 195 if (!IS_ERR(page)) {
201 kmap(page); 196 kmap(page);
202 if (!PageChecked(page)) 197 if (!PageChecked(page))
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 39b301662f22..14d89fa58fee 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -12,10 +12,10 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/pagemap.h> 16#include <linux/pagemap.h>
18#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/gfp.h>
19#include "internal.h" 19#include "internal.h"
20 20
21static int afs_readpage(struct file *file, struct page *page); 21static int afs_readpage(struct file *file, struct page *page);
@@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page,
121#endif 121#endif
122 122
123/* 123/*
124 * AFS read page from file, directory or symlink 124 * read page from file, directory or symlink, given a key to use
125 */ 125 */
126static int afs_readpage(struct file *file, struct page *page) 126int afs_page_filler(void *data, struct page *page)
127{ 127{
128 struct afs_vnode *vnode; 128 struct inode *inode = page->mapping->host;
129 struct inode *inode; 129 struct afs_vnode *vnode = AFS_FS_I(inode);
130 struct key *key; 130 struct key *key = data;
131 size_t len; 131 size_t len;
132 off_t offset; 132 off_t offset;
133 int ret; 133 int ret;
134 134
135 inode = page->mapping->host;
136
137 if (file) {
138 key = file->private_data;
139 ASSERT(key != NULL);
140 } else {
141 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
142 if (IS_ERR(key)) {
143 ret = PTR_ERR(key);
144 goto error_nokey;
145 }
146 }
147
148 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); 135 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
149 136
150 vnode = AFS_FS_I(inode);
151
152 BUG_ON(!PageLocked(page)); 137 BUG_ON(!PageLocked(page));
153 138
154 ret = -ESTALE; 139 ret = -ESTALE;
@@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page)
214 unlock_page(page); 199 unlock_page(page);
215 } 200 }
216 201
217 if (!file)
218 key_put(key);
219 _leave(" = 0"); 202 _leave(" = 0");
220 return 0; 203 return 0;
221 204
222error: 205error:
223 SetPageError(page); 206 SetPageError(page);
224 unlock_page(page); 207 unlock_page(page);
225 if (!file)
226 key_put(key);
227error_nokey:
228 _leave(" = %d", ret); 208 _leave(" = %d", ret);
229 return ret; 209 return ret;
230} 210}
231 211
232/* 212/*
213 * read page from file, directory or symlink, given a file to nominate the key
214 * to be used
215 */
216static int afs_readpage(struct file *file, struct page *page)
217{
218 struct key *key;
219 int ret;
220
221 if (file) {
222 key = file->private_data;
223 ASSERT(key != NULL);
224 ret = afs_page_filler(key, page);
225 } else {
226 struct inode *inode = page->mapping->host;
227 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
228 if (IS_ERR(key)) {
229 ret = PTR_ERR(key);
230 } else {
231 ret = afs_page_filler(key, page);
232 key_put(key);
233 }
234 }
235 return ret;
236}
237
238/*
233 * read a set of pages 239 * read a set of pages
234 */ 240 */
235static int afs_readpages(struct file *file, struct address_space *mapping, 241static int afs_readpages(struct file *file, struct address_space *mapping,
236 struct list_head *pages, unsigned nr_pages) 242 struct list_head *pages, unsigned nr_pages)
237{ 243{
244 struct key *key = file->private_data;
238 struct afs_vnode *vnode; 245 struct afs_vnode *vnode;
239 int ret = 0; 246 int ret = 0;
240 247
241 _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages); 248 _enter("{%d},{%lu},,%d",
249 key_serial(key), mapping->host->i_ino, nr_pages);
250
251 ASSERT(key != NULL);
242 252
243 vnode = AFS_FS_I(mapping->host); 253 vnode = AFS_FS_I(mapping->host);
244 if (vnode->flags & AFS_VNODE_DELETED) { 254 if (vnode->flags & AFS_VNODE_DELETED) {
@@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
279 } 289 }
280 290
281 /* load the missing pages from the network */ 291 /* load the missing pages from the network */
282 ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file); 292 ret = read_cache_pages(mapping, pages, afs_page_filler, key);
283 293
284 _leave(" = %d [netting]", ret); 294 _leave(" = %d [netting]", ret);
285 return ret; 295 return ret;
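[With afs_page_filler() exported, callers hand the key straight to the page-cache helpers instead of wrapping it in a fake struct file. A condensed sketch of the resulting call pattern; afs_get_page() is a hypothetical name and the page validation done by the real afs_dir_get_page() is trimmed:]

static struct page *afs_get_page(struct inode *inode, pgoff_t index,
				 struct key *key)
{
	struct page *page;

	/* read_cache_page() passes @key as the opaque first argument of
	 * afs_page_filler() for any page that is not already cached */
	page = read_cache_page(inode->i_mapping, index,
			       afs_page_filler, key);
	if (IS_ERR(page))
		return page;
	kmap(page);		/* callers expect a mapped page */
	return page;
}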
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 023b95b0d9d7..4bd0218473a9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/slab.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/circ_buf.h> 15#include <linux/circ_buf.h>
15#include "internal.h" 16#include "internal.h"
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c048f0658751..d00b312e3110 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -16,7 +16,6 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/slab.h>
20#include <linux/fs.h> 19#include <linux/fs.h>
21#include <linux/pagemap.h> 20#include <linux/pagemap.h>
22#include <linux/sched.h> 21#include <linux/sched.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index c54dad4e6063..5f679b77ce24 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -19,6 +19,7 @@
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/fscache.h> 21#include <linux/fscache.h>
22#include <linux/backing-dev.h>
22 23
23#include "afs.h" 24#include "afs.h"
24#include "afs_vl.h" 25#include "afs_vl.h"
@@ -313,6 +314,7 @@ struct afs_volume {
313 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ 314 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
314 struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ 315 struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
315 struct rw_semaphore server_sem; /* lock for accessing current server */ 316 struct rw_semaphore server_sem; /* lock for accessing current server */
317 struct backing_dev_info bdi;
316}; 318};
317 319
318/* 320/*
@@ -492,6 +494,7 @@ extern const struct file_operations afs_file_operations;
492 494
493extern int afs_open(struct inode *, struct file *); 495extern int afs_open(struct inode *, struct file *);
494extern int afs_release(struct inode *, struct file *); 496extern int afs_release(struct inode *, struct file *);
497extern int afs_page_filler(void *, struct page *);
495 498
496/* 499/*
497 * flock.c 500 * flock.c
@@ -737,7 +740,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
737extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, 740extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
738 unsigned long, loff_t); 741 unsigned long, loff_t);
739extern int afs_writeback_all(struct afs_vnode *); 742extern int afs_writeback_all(struct afs_vnode *);
740extern int afs_fsync(struct file *, struct dentry *, int); 743extern int afs_fsync(struct file *, int);
741 744
742 745
743/*****************************************************************************/ 746/*****************************************************************************/
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5ffb570cd3a8..a9e23039ea34 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -12,11 +12,11 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/pagemap.h> 16#include <linux/pagemap.h>
18#include <linux/mount.h> 17#include <linux/mount.h>
19#include <linux/namei.h> 18#include <linux/namei.h>
19#include <linux/gfp.h>
20#include "internal.h" 20#include "internal.h"
21 21
22 22
@@ -49,9 +49,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
49 */ 49 */
50int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) 50int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
51{ 51{
52 struct file file = {
53 .private_data = key,
54 };
55 struct page *page; 52 struct page *page;
56 size_t size; 53 size_t size;
57 char *buf; 54 char *buf;
@@ -61,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
61 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); 58 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
62 59
63 /* read the contents of the symlink into the pagecache */ 60 /* read the contents of the symlink into the pagecache */
64 page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file); 61 page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
62 afs_page_filler, key);
65 if (IS_ERR(page)) { 63 if (IS_ERR(page)) {
66 ret = PTR_ERR(page); 64 ret = PTR_ERR(page);
67 goto out; 65 goto out;
@@ -138,9 +136,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
138{ 136{
139 struct afs_super_info *super; 137 struct afs_super_info *super;
140 struct vfsmount *mnt; 138 struct vfsmount *mnt;
141 struct page *page = NULL; 139 struct page *page;
142 size_t size; 140 size_t size;
143 char *buf, *devname = NULL, *options = NULL; 141 char *buf, *devname, *options;
144 int ret; 142 int ret;
145 143
146 _enter("{%s}", mntpt->d_name.name); 144 _enter("{%s}", mntpt->d_name.name);
@@ -150,22 +148,22 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
150 ret = -EINVAL; 148 ret = -EINVAL;
151 size = mntpt->d_inode->i_size; 149 size = mntpt->d_inode->i_size;
152 if (size > PAGE_SIZE - 1) 150 if (size > PAGE_SIZE - 1)
153 goto error; 151 goto error_no_devname;
154 152
155 ret = -ENOMEM; 153 ret = -ENOMEM;
156 devname = (char *) get_zeroed_page(GFP_KERNEL); 154 devname = (char *) get_zeroed_page(GFP_KERNEL);
157 if (!devname) 155 if (!devname)
158 goto error; 156 goto error_no_devname;
159 157
160 options = (char *) get_zeroed_page(GFP_KERNEL); 158 options = (char *) get_zeroed_page(GFP_KERNEL);
161 if (!options) 159 if (!options)
162 goto error; 160 goto error_no_options;
163 161
164 /* read the contents of the AFS special symlink */ 162 /* read the contents of the AFS special symlink */
165 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); 163 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
166 if (IS_ERR(page)) { 164 if (IS_ERR(page)) {
167 ret = PTR_ERR(page); 165 ret = PTR_ERR(page);
168 goto error; 166 goto error_no_page;
169 } 167 }
170 168
171 ret = -EIO; 169 ret = -EIO;
@@ -196,12 +194,12 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
196 return mnt; 194 return mnt;
197 195
198error: 196error:
199 if (page) 197 page_cache_release(page);
200 page_cache_release(page); 198error_no_page:
201 if (devname) 199 free_page((unsigned long) options);
202 free_page((unsigned long) devname); 200error_no_options:
203 if (options) 201 free_page((unsigned long) devname);
204 free_page((unsigned long) options); 202error_no_devname:
205 _leave(" = %d", ret); 203 _leave(" = %d", ret);
206 return ERR_PTR(ret); 204 return ERR_PTR(ret);
207} 205}
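[The rewritten failure path replaces NULL-initialised pointers and conditional cleanup with one label per acquired resource, unwinding in reverse order of acquisition; nothing needs to be pre-initialised and each label frees exactly what was already obtained. The idiom distilled from the hunk above:]

	ret = -ENOMEM;
	devname = (char *) get_zeroed_page(GFP_KERNEL);
	if (!devname)
		goto error_no_devname;		/* nothing acquired yet */
	options = (char *) get_zeroed_page(GFP_KERNEL);
	if (!options)
		goto error_no_options;		/* only devname to free */
	/* ... */
error_no_options:
	free_page((unsigned long) devname);
error_no_devname:
	return ERR_PTR(ret);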
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bde3f19c0995..67cf810e0fd6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -9,6 +9,7 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <net/sock.h> 13#include <net/sock.h>
13#include <net/af_rxrpc.h> 14#include <net/af_rxrpc.h>
14#include <rxrpc/packet.h> 15#include <rxrpc/packet.h>
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
189 if (!permits) 189 if (!permits)
190 goto out_unlock; 190 goto out_unlock;
191 191
192 memcpy(permits->permits, xpermits->permits, 192 if (xpermits)
193 count * sizeof(struct afs_permit)); 193 memcpy(permits->permits, xpermits->permits,
194 count * sizeof(struct afs_permit));
194 195
195 _debug("key %x access %x", 196 _debug("key %x access %x",
196 key_serial(key), vnode->status.caller_access); 197 key_serial(key), vnode->status.caller_access);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 14f6431598ad..e932e5a3a0c1 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -311,6 +311,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
311 sb->s_magic = AFS_FS_MAGIC; 311 sb->s_magic = AFS_FS_MAGIC;
312 sb->s_op = &afs_super_ops; 312 sb->s_op = &afs_super_ops;
313 sb->s_fs_info = as; 313 sb->s_fs_info = as;
314 sb->s_bdi = &as->volume->bdi;
314 315
315 /* allocate the root inode and dentry */ 316 /* allocate the root inode and dentry */
316 fid.vid = as->volume->vid; 317 fid.vid = as->volume->vid;
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 36c1306e09e0..340afd0cd182 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -9,6 +9,7 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/gfp.h>
12#include <linux/init.h> 13#include <linux/init.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include "internal.h" 15#include "internal.h"
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6e689208def2..9ac260d1361d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/sched.h> 16#include <linux/sched.h>
16#include "internal.h" 17#include "internal.h"
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 2f05c4fc2a70..25cf4c3f4ff7 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/sched.h> 16#include <linux/sched.h>
18#include "internal.h" 17#include "internal.h"
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index a353e69e2391..401eeb21869f 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,6 +106,10 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
106 volume->cell = params->cell; 106 volume->cell = params->cell;
107 volume->vid = vlocation->vldb.vid[params->type]; 107 volume->vid = vlocation->vldb.vid[params->type];
108 108
109 ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
110 if (ret)
111 goto error_bdi;
112
109 init_rwsem(&volume->server_sem); 113 init_rwsem(&volume->server_sem);
110 114
111 /* look up all the applicable server records */ 115 /* look up all the applicable server records */
@@ -151,6 +155,8 @@ error:
151 return ERR_PTR(ret); 155 return ERR_PTR(ret);
152 156
153error_discard: 157error_discard:
158 bdi_destroy(&volume->bdi);
159error_bdi:
154 up_write(&params->cell->vl_sem); 160 up_write(&params->cell->vl_sem);
155 161
156 for (loop = volume->nservers - 1; loop >= 0; loop--) 162 for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -200,6 +206,7 @@ void afs_put_volume(struct afs_volume *volume)
200 for (loop = volume->nservers - 1; loop >= 0; loop--) 206 for (loop = volume->nservers - 1; loop >= 0; loop--)
201 afs_put_server(volume->servers[loop]); 207 afs_put_server(volume->servers[loop]);
202 208
209 bdi_destroy(&volume->bdi);
203 kfree(volume); 210 kfree(volume);
204 211
205 _leave(" [destroyed]"); 212 _leave(" [destroyed]");
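[Each volume now embeds its own backing_dev_info, registered at lookup and destroyed on both the error path and the final put; afs_fill_super() points sb->s_bdi at it so writeback is accounted per volume. A minimal sketch of the same pairing for any structure embedding a BDI -- the my_volume names are hypothetical:]

struct my_volume {
	struct backing_dev_info bdi;
	/* ... */
};

static int my_volume_init(struct my_volume *v)
{
	/* registers a dynamically numbered BDI named "afs-N" */
	return bdi_setup_and_register(&v->bdi, "afs", BDI_CAP_MAP_COPY);
}

static void my_volume_free(struct my_volume *v)
{
	bdi_destroy(&v->bdi);	/* must balance every successful setup */
	kfree(v);
}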
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3bed54a294d4..3dab9e9948d0 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -701,8 +701,9 @@ int afs_writeback_all(struct afs_vnode *vnode)
701 * - the return status from this call provides a reliable indication of 701 * - the return status from this call provides a reliable indication of
702 * whether any write errors occurred for this process. 702 * whether any write errors occurred for this process.
703 */ 703 */
704int afs_fsync(struct file *file, struct dentry *dentry, int datasync) 704int afs_fsync(struct file *file, int datasync)
705{ 705{
706 struct dentry *dentry = file->f_path.dentry;
706 struct afs_writeback *wb, *xwb; 707 struct afs_writeback *wb, *xwb;
707 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 708 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
708 int ret; 709 int ret;
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..1ccf25cef1f0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/mempool.h> 37#include <linux/mempool.h>
38#include <linux/hash.h> 38#include <linux/hash.h>
39#include <linux/compat.h>
39 40
40#include <asm/kmap_types.h> 41#include <asm/kmap_types.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data)
526 527
527 /* Complete the fput(s) */ 528 /* Complete the fput(s) */
528 if (req->ki_filp != NULL) 529 if (req->ki_filp != NULL)
529 __fput(req->ki_filp); 530 fput(req->ki_filp);
530 531
531 /* Link the iocb into the context's free list */ 532 /* Link the iocb into the context's free list */
532 spin_lock_irq(&ctx->ctx_lock); 533 spin_lock_irq(&ctx->ctx_lock);
@@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
559 560
560 /* 561 /*
561 * Try to optimize the aio and eventfd file* puts, by avoiding to 562 * Try to optimize the aio and eventfd file* puts, by avoiding to
562 * schedule work in case it is not __fput() time. In normal cases, 563 * schedule work in case it is not final fput() time. In normal cases,
563 * we would not be holding the last reference to the file*, so 564 * we would not be holding the last reference to the file*, so
564 * this function will be executed w/out any aio kthread wakeup. 565 * this function will be executed w/out any aio kthread wakeup.
565 */ 566 */
566 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { 567 if (unlikely(!fput_atomic(req->ki_filp))) {
567 get_ioctx(ctx); 568 get_ioctx(ctx);
568 spin_lock(&fput_lock); 569 spin_lock(&fput_lock);
569 list_add(&req->ki_list, &fput_head); 570 list_add(&req->ki_list, &fput_head);
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
1384 return ret; 1385 return ret;
1385} 1386}
1386 1387
1387static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb) 1388static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
1388{ 1389{
1389 ssize_t ret; 1390 ssize_t ret;
1390 1391
1391 ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, 1392#ifdef CONFIG_COMPAT
1392 kiocb->ki_nbytes, 1, 1393 if (compat)
1393 &kiocb->ki_inline_vec, &kiocb->ki_iovec); 1394 ret = compat_rw_copy_check_uvector(type,
1395 (struct compat_iovec __user *)kiocb->ki_buf,
1396 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1397 &kiocb->ki_iovec);
1398 else
1399#endif
1400 ret = rw_copy_check_uvector(type,
1401 (struct iovec __user *)kiocb->ki_buf,
1402 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1403 &kiocb->ki_iovec);
1394 if (ret < 0) 1404 if (ret < 0)
1395 goto out; 1405 goto out;
1396 1406
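[The split is needed because a 32-bit caller hands over iovecs whose pointer and length fields are 32 bits wide, so they cannot be copied as native struct iovec. For reference, the compat layout the new branch converts from, as defined in include/linux/compat.h:]

struct compat_iovec {
	compat_uptr_t	iov_base;	/* 32-bit user pointer */
	compat_size_t	iov_len;
};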
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
1420 * Performs the initial checks and aio retry method 1430 * Performs the initial checks and aio retry method
1421 * setup for the kiocb at the time of io submission. 1431 * setup for the kiocb at the time of io submission.
1422 */ 1432 */
1423static ssize_t aio_setup_iocb(struct kiocb *kiocb) 1433static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1424{ 1434{
1425 struct file *file = kiocb->ki_filp; 1435 struct file *file = kiocb->ki_filp;
1426 ssize_t ret = 0; 1436 ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1469 ret = security_file_permission(file, MAY_READ); 1479 ret = security_file_permission(file, MAY_READ);
1470 if (unlikely(ret)) 1480 if (unlikely(ret))
1471 break; 1481 break;
1472 ret = aio_setup_vectored_rw(READ, kiocb); 1482 ret = aio_setup_vectored_rw(READ, kiocb, compat);
1473 if (ret) 1483 if (ret)
1474 break; 1484 break;
1475 ret = -EINVAL; 1485 ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1483 ret = security_file_permission(file, MAY_WRITE); 1493 ret = security_file_permission(file, MAY_WRITE);
1484 if (unlikely(ret)) 1494 if (unlikely(ret))
1485 break; 1495 break;
1486 ret = aio_setup_vectored_rw(WRITE, kiocb); 1496 ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
1487 if (ret) 1497 if (ret)
1488 break; 1498 break;
1489 ret = -EINVAL; 1499 ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
1548} 1558}
1549 1559
1550static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1560static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1551 struct iocb *iocb, struct hlist_head *batch_hash) 1561 struct iocb *iocb, struct hlist_head *batch_hash,
1562 bool compat)
1552{ 1563{
1553 struct kiocb *req; 1564 struct kiocb *req;
1554 struct file *file; 1565 struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1609 req->ki_left = req->ki_nbytes = iocb->aio_nbytes; 1620 req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
1610 req->ki_opcode = iocb->aio_lio_opcode; 1621 req->ki_opcode = iocb->aio_lio_opcode;
1611 1622
1612 ret = aio_setup_iocb(req); 1623 ret = aio_setup_iocb(req, compat);
1613 1624
1614 if (ret) 1625 if (ret)
1615 goto out_put_req; 1626 goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
1637 return ret; 1648 return ret;
1638} 1649}
1639 1650
1640/* sys_io_submit: 1651long do_io_submit(aio_context_t ctx_id, long nr,
1641 * Queue the nr iocbs pointed to by iocbpp for processing. Returns 1652 struct iocb __user *__user *iocbpp, bool compat)
1642 * the number of iocbs queued. May return -EINVAL if the aio_context
1643 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1644 * *iocbpp[0] is not properly initialized, if the operation specified
1645 * is invalid for the file descriptor in the iocb. May fail with
1646 * -EFAULT if any of the data structures point to invalid data. May
1647 * fail with -EBADF if the file descriptor specified in the first
1648 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1649 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1650 * fail with -ENOSYS if not implemented.
1651 */
1652SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1653 struct iocb __user * __user *, iocbpp)
1654{ 1653{
1655 struct kioctx *ctx; 1654 struct kioctx *ctx;
1656 long ret = 0; 1655 long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1687 break; 1686 break;
1688 } 1687 }
1689 1688
1690 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); 1689 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
1691 if (ret) 1690 if (ret)
1692 break; 1691 break;
1693 } 1692 }
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1697 return i ? i : ret; 1696 return i ? i : ret;
1698} 1697}
1699 1698
1699/* sys_io_submit:
1700 * Queue the nr iocbs pointed to by iocbpp for processing. Returns
1701 * the number of iocbs queued. May return -EINVAL if the aio_context
1702 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1703 * *iocbpp[0] is not properly initialized, if the operation specified
1704 * is invalid for the file descriptor in the iocb. May fail with
1705 * -EFAULT if any of the data structures point to invalid data. May
1706 * fail with -EBADF if the file descriptor specified in the first
1707 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1708 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1709 * fail with -ENOSYS if not implemented.
1710 */
1711SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1712 struct iocb __user * __user *, iocbpp)
1713{
1714 return do_io_submit(ctx_id, nr, iocbpp, 0);
1715}
1716
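[The compat entry point that passes compat=true is not part of this file; in this series it lives in fs/compat.c. Reconstructed here as a sketch -- the MAX_AIO_IOCBS clamp and the copy_iocb() pointer-widening helper are assumptions, not shown in this excerpt:]

asmlinkage long compat_sys_io_submit(aio_context_t ctx_id, int nr,
				     u32 __user *iocb)
{
	struct iocb __user * __user *iocb64;
	long ret;

	if (unlikely(nr < 0))
		return -EINVAL;
	if (nr > MAX_AIO_IOCBS)
		nr = MAX_AIO_IOCBS;

	/* widen the 32-bit iocb pointers, then submit with compat=1 */
	iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
	ret = copy_iocb(nr, iocb, iocb64);
	if (!ret)
		ret = do_io_submit(ctx_id, nr, iocb64, 1);
	return ret;
}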
1700/* lookup_kiocb 1717/* lookup_kiocb
1701 * Finds a given iocb for cancellation. 1718 * Finds a given iocb for cancellation.
1702 */ 1719 */
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 2de009565d8e..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -12,7 +12,6 @@
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/poll.h> 13#include <linux/poll.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/slab.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/fs.h> 16#include <linux/fs.h>
18#include <linux/mount.h> 17#include <linux/mount.h>
diff --git a/fs/attr.c b/fs/attr.c
index 0815e93bb487..b4fa3b0aa596 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok);
67 * @offset: the new size to assign to the inode 67 * @offset: the new size to assign to the inode
68 * @Returns: 0 on success, -ve errno on failure 68 * @Returns: 0 on success, -ve errno on failure
69 * 69 *
70 * inode_newsize_ok must be called with i_mutex held.
71 *
70 * inode_newsize_ok will check filesystem limits and ulimits to check that the 72 * inode_newsize_ok will check filesystem limits and ulimits to check that the
71 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ 73 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
72 * when necessary. Caller must not proceed with inode size change if failure is 74 * when necessary. Caller must not proceed with inode size change if failure is
73 * returned. @inode must be a file (not directory), with appropriate 75 * returned. @inode must be a file (not directory), with appropriate
74 * permissions to allow truncate (inode_newsize_ok does NOT check these 76 * permissions to allow truncate (inode_newsize_ok does NOT check these
75 * conditions). 77 * conditions).
76 *
77 * inode_newsize_ok must be called with i_mutex held.
78 */ 78 */
79int inode_newsize_ok(const struct inode *inode, loff_t offset) 79int inode_newsize_ok(const struct inode *inode, loff_t offset)
80{ 80{
@@ -104,17 +104,25 @@ out_big:
104} 104}
105EXPORT_SYMBOL(inode_newsize_ok); 105EXPORT_SYMBOL(inode_newsize_ok);
106 106
107int inode_setattr(struct inode * inode, struct iattr * attr) 107/**
108 * generic_setattr - copy simple metadata updates into the generic inode
109 * @inode: the inode to be updated
110 * @attr: the new attributes
111 *
112 * generic_setattr must be called with i_mutex held.
113 *
114 * generic_setattr updates the inode's metadata with that specified
 115 * in attr. Noticeably missing is inode size update, which is more complex
116 * as it requires pagecache updates. See simple_setsize.
117 *
118 * The inode is not marked as dirty after this operation. The rationale is
119 * that for "simple" filesystems, the struct inode is the inode storage.
120 * The caller is free to mark the inode dirty afterwards if needed.
121 */
122void generic_setattr(struct inode *inode, const struct iattr *attr)
108{ 123{
109 unsigned int ia_valid = attr->ia_valid; 124 unsigned int ia_valid = attr->ia_valid;
110 125
111 if (ia_valid & ATTR_SIZE &&
112 attr->ia_size != i_size_read(inode)) {
113 int error = vmtruncate(inode, attr->ia_size);
114 if (error)
115 return error;
116 }
117
118 if (ia_valid & ATTR_UID) 126 if (ia_valid & ATTR_UID)
119 inode->i_uid = attr->ia_uid; 127 inode->i_uid = attr->ia_uid;
120 if (ia_valid & ATTR_GID) 128 if (ia_valid & ATTR_GID)
@@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
135 mode &= ~S_ISGID; 143 mode &= ~S_ISGID;
136 inode->i_mode = mode; 144 inode->i_mode = mode;
137 } 145 }
146}
147EXPORT_SYMBOL(generic_setattr);
148
149/*
 150 * Note: this function is deprecated; the new truncate sequence should be
 151 * used instead -- see e.g. simple_setsize, generic_setattr.
152 */
153int inode_setattr(struct inode *inode, const struct iattr *attr)
154{
155 unsigned int ia_valid = attr->ia_valid;
156
157 if (ia_valid & ATTR_SIZE &&
158 attr->ia_size != i_size_read(inode)) {
159 int error;
160
161 error = vmtruncate(inode, attr->ia_size);
162 if (error)
163 return error;
164 }
165
166 generic_setattr(inode, attr);
167
138 mark_inode_dirty(inode); 168 mark_inode_dirty(inode);
139 169
140 return 0; 170 return 0;
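[A sketch of how a simple filesystem's ->setattr would use the new helper under the new truncate sequence. myfs_setattr is a hypothetical name; simple_setsize() comes from the same patch series and is assumed available:]

static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	/* size changes go through the pagecache-aware path */
	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = simple_setsize(inode, attr->ia_size);
		if (error)
			return error;
	}

	generic_setattr(inode, attr);
	mark_inode_dirty(inode);	/* generic_setattr leaves this to us */
	return 0;
}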
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401cea0a1..9a0520b50663 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -13,6 +13,7 @@
13#include <linux/capability.h> 13#include <linux/capability.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/slab.h>
16#include <linux/param.h> 17#include <linux/param.h>
17#include <linux/time.h> 18#include <linux/time.h>
18#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
@@ -27,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
27static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 28static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
28 29
29const struct file_operations autofs_root_operations = { 30const struct file_operations autofs_root_operations = {
31 .llseek = generic_file_llseek,
30 .read = generic_read_dir, 32 .read = generic_read_dir,
31 .readdir = autofs_root_readdir, 33 .readdir = autofs_root_readdir,
32 .ioctl = autofs_root_ioctl, 34 .ioctl = autofs_root_ioctl,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c8a80dffb455..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -22,6 +22,7 @@
22#include <linux/magic.h> 22#include <linux/magic.h>
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/slab.h>
25 26
26#include "autofs_i.h" 27#include "autofs_i.h"
27 28
@@ -94,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
94 */ 95 */
95static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) 96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
96{ 97{
97 struct autofs_dev_ioctl tmp, *ads; 98 struct autofs_dev_ioctl tmp;
98 99
99 if (copy_from_user(&tmp, in, sizeof(tmp))) 100 if (copy_from_user(&tmp, in, sizeof(tmp)))
100 return ERR_PTR(-EFAULT); 101 return ERR_PTR(-EFAULT);
@@ -102,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
102 if (tmp.size < sizeof(tmp)) 103 if (tmp.size < sizeof(tmp))
103 return ERR_PTR(-EINVAL); 104 return ERR_PTR(-EINVAL);
104 105
105 ads = kmalloc(tmp.size, GFP_KERNEL); 106 return memdup_user(in, tmp.size);
106 if (!ads)
107 return ERR_PTR(-ENOMEM);
108
109 if (copy_from_user(ads, in, tmp.size)) {
110 kfree(ads);
111 return ERR_PTR(-EFAULT);
112 }
113
114 return ads;
115} 107}
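[memdup_user() folds the allocate/copy/free-on-fault dance into a single call that returns either the buffer or an ERR_PTR, which is why the kmalloc() and second copy_from_user() above collapse to one line. The general pattern:]

	void *buf;

	buf = memdup_user(uptr, size);	/* kmalloc + copy_from_user */
	if (IS_ERR(buf))
		return PTR_ERR(buf);
	/* ... use buf ... */
	kfree(buf);			/* caller still owns the free */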
116 108
117static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) 109static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
@@ -735,11 +727,14 @@ static const struct file_operations _dev_ioctl_fops = {
735}; 727};
736 728
737static struct miscdevice _autofs_dev_ioctl_misc = { 729static struct miscdevice _autofs_dev_ioctl_misc = {
738 .minor = MISC_DYNAMIC_MINOR, 730 .minor = AUTOFS_MINOR,
739 .name = AUTOFS_DEVICE_NAME, 731 .name = AUTOFS_DEVICE_NAME,
740 .fops = &_dev_ioctl_fops 732 .fops = &_dev_ioctl_fops
741}; 733};
742 734
735MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
736MODULE_ALIAS("devname:autofs");
737
743/* Register/deregister misc character device */ 738/* Register/deregister misc character device */
744int autofs_dev_ioctl_init(void) 739int autofs_dev_ioctl_init(void)
745{ 740{
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a015b49891df..db4117ed7803 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -15,15 +15,17 @@
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/stat.h> 17#include <linux/stat.h>
18#include <linux/slab.h>
18#include <linux/param.h> 19#include <linux/param.h>
19#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/smp_lock.h>
20#include "autofs_i.h" 22#include "autofs_i.h"
21 23
22static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 24static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
23static int autofs4_dir_unlink(struct inode *,struct dentry *); 25static int autofs4_dir_unlink(struct inode *,struct dentry *);
24static int autofs4_dir_rmdir(struct inode *,struct dentry *); 26static int autofs4_dir_rmdir(struct inode *,struct dentry *);
25static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); 27static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
26static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 28static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
27static int autofs4_dir_open(struct inode *inode, struct file *file); 29static int autofs4_dir_open(struct inode *inode, struct file *file);
28static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 30static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
29static void *autofs4_follow_link(struct dentry *, struct nameidata *); 31static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -37,7 +39,7 @@ const struct file_operations autofs4_root_operations = {
37 .read = generic_read_dir, 39 .read = generic_read_dir,
38 .readdir = dcache_readdir, 40 .readdir = dcache_readdir,
39 .llseek = dcache_dir_lseek, 41 .llseek = dcache_dir_lseek,
40 .ioctl = autofs4_root_ioctl, 42 .unlocked_ioctl = autofs4_root_ioctl,
41}; 43};
42 44
43const struct file_operations autofs4_dir_operations = { 45const struct file_operations autofs4_dir_operations = {
@@ -176,8 +178,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
176 } 178 }
177 /* Trigger mount for path component or follow link */ 179 /* Trigger mount for path component or follow link */
178 } else if (ino->flags & AUTOFS_INF_PENDING || 180 } else if (ino->flags & AUTOFS_INF_PENDING ||
179 autofs4_need_mount(flags) || 181 autofs4_need_mount(flags)) {
180 current->link_count) {
181 DPRINTK("waiting for mount name=%.*s", 182 DPRINTK("waiting for mount name=%.*s",
182 dentry->d_name.len, dentry->d_name.name); 183 dentry->d_name.len, dentry->d_name.name);
183 184
@@ -261,7 +262,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
261 spin_unlock(&dcache_lock); 262 spin_unlock(&dcache_lock);
262 spin_unlock(&sbi->fs_lock); 263 spin_unlock(&sbi->fs_lock);
263 264
264 status = try_to_fill_dentry(dentry, 0); 265 status = try_to_fill_dentry(dentry, nd->flags);
265 if (status) 266 if (status)
266 goto out_error; 267 goto out_error;
267 268
@@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry)
902 * ioctl()'s on the root directory is the chief method for the daemon to 903 * ioctl()'s on the root directory is the chief method for the daemon to
903 * generate kernel reactions 904 * generate kernel reactions
904 */ 905 */
905static int autofs4_root_ioctl(struct inode *inode, struct file *filp, 906static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
906 unsigned int cmd, unsigned long arg) 907 unsigned int cmd, unsigned long arg)
907{ 908{
908 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); 909 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
909 void __user *p = (void __user *)arg; 910 void __user *p = (void __user *)arg;
@@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
947 return -ENOSYS; 948 return -ENOSYS;
948 } 949 }
949} 950}
951
952static long autofs4_root_ioctl(struct file *filp,
953 unsigned int cmd, unsigned long arg)
954{
955 long ret;
956 struct inode *inode = filp->f_dentry->d_inode;
957
958 lock_kernel();
959 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
960 unlock_kernel();
961
962 return ret;
963}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index a05287a23f62..52e59bf4aa5f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp)
93 return -EIO; 93 return -EIO;
94} 94}
95 95
96static int bad_file_fsync(struct file *file, struct dentry *dentry, 96static int bad_file_fsync(struct file *file, int datasync)
97 int datasync)
98{ 97{
99 return -EIO; 98 return -EIO;
100} 99}
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index e3287d0d1a58..59096b5e0fc7 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -11,7 +11,6 @@
11 */ 11 */
12 12
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
15#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
16#include <linux/string.h> 15#include <linux/string.h>
17 16
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 1e41aadb1068..d967e052b779 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
78const struct file_operations bfs_dir_operations = { 78const struct file_operations bfs_dir_operations = {
79 .read = generic_read_dir, 79 .read = generic_read_dir,
80 .readdir = bfs_readdir, 80 .readdir = bfs_readdir,
81 .fsync = simple_fsync, 81 .fsync = generic_file_fsync,
82 .llseek = generic_file_llseek, 82 .llseek = generic_file_llseek,
83}; 83};
84 84
@@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
105 } 105 }
106 set_bit(ino, info->si_imap); 106 set_bit(ino, info->si_imap);
107 info->si_freei--; 107 info->si_freei--;
108 inode->i_uid = current_fsuid(); 108 inode_init_owner(inode, dir, mode);
109 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
110 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 109 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
111 inode->i_blocks = 0; 110 inode->i_blocks = 0;
112 inode->i_op = &bfs_file_inops; 111 inode->i_op = &bfs_file_inops;
113 inode->i_fop = &bfs_file_operations; 112 inode->i_fop = &bfs_file_operations;
114 inode->i_mapping->a_ops = &bfs_aops; 113 inode->i_mapping->a_ops = &bfs_aops;
115 inode->i_mode = mode;
116 inode->i_ino = ino; 114 inode->i_ino = ino;
117 BFS_I(inode)->i_dsk_ino = ino; 115 BFS_I(inode)->i_dsk_ino = ino;
118 BFS_I(inode)->i_sblock = 0; 116 BFS_I(inode)->i_sblock = 0;
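[inode_init_owner() centralises the owner/mode setup that bfs, like many filesystems in this series, open-coded, including the setgid-directory rules. Roughly what the helper does, paraphrased from fs/inode.c, so treat the details as a sketch:]

void inode_init_owner(struct inode *inode, const struct inode *dir,
		      mode_t mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;	/* inherit the group */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* propagate setgid on dirs */
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}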
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 15d80bb35d6f..f96eff04e11a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -20,11 +20,11 @@
20#include <linux/fcntl.h> 20#include <linux/fcntl.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/user.h> 22#include <linux/user.h>
23#include <linux/slab.h>
24#include <linux/binfmts.h> 23#include <linux/binfmts.h>
25#include <linux/personality.h> 24#include <linux/personality.h>
26#include <linux/init.h> 25#include <linux/init.h>
27#include <linux/coredump.h> 26#include <linux/coredump.h>
27#include <linux/slab.h>
28 28
29#include <asm/system.h> 29#include <asm/system.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
@@ -75,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
75 struct file *file = cprm->file; 75 struct file *file = cprm->file;
76 mm_segment_t fs; 76 mm_segment_t fs;
77 int has_dumped = 0; 77 int has_dumped = 0;
78 unsigned long dump_start, dump_size; 78 void __user *dump_start;
79 int dump_size;
79 struct user dump; 80 struct user dump;
80#ifdef __alpha__ 81#ifdef __alpha__
81# define START_DATA(u) (u.start_data) 82# define START_DATA(u) ((void __user *)u.start_data)
82#else 83#else
83# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code) 84# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
85 u.start_code))
84#endif 86#endif
85# define START_STACK(u) (u.start_stack) 87# define START_STACK(u) ((void __user *)u.start_stack)
86 88
87 fs = get_fs(); 89 fs = get_fs();
88 set_fs(KERNEL_DS); 90 set_fs(KERNEL_DS);
@@ -104,9 +106,9 @@ static int aout_core_dump(struct coredump_params *cprm)
104 106
105/* make sure we actually have a data and stack area to dump */ 107/* make sure we actually have a data and stack area to dump */
106 set_fs(USER_DS); 108 set_fs(USER_DS);
107 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 109 if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
108 dump.u_dsize = 0; 110 dump.u_dsize = 0;
109 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 111 if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
110 dump.u_ssize = 0; 112 dump.u_ssize = 0;
111 113
112 set_fs(KERNEL_DS); 114 set_fs(KERNEL_DS);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c32d00a6690..2c5f9a0e5d72 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1005,15 +1005,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
1005 } 1005 }
1006 } else if (!mm->start_data) { 1006 } else if (!mm->start_data) {
1007 mm->start_data = seg->addr; 1007 mm->start_data = seg->addr;
1008#ifndef CONFIG_MMU
1009 mm->end_data = seg->addr + phdr->p_memsz; 1008 mm->end_data = seg->addr + phdr->p_memsz;
1010#endif
1011 } 1009 }
1012
1013#ifdef CONFIG_MMU
1014 if (seg->addr + phdr->p_memsz > mm->end_data)
1015 mm->end_data = seg->addr + phdr->p_memsz;
1016#endif
1017 } 1010 }
1018 1011
1019 seg++; 1012 seg++;
@@ -1590,7 +1583,7 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
1590 struct vm_area_struct *vma; 1583 struct vm_area_struct *vma;
1591 size_t size = 0; 1584 size_t size = 0;
1592 1585
1593 for (vma = current->mm->mmap; vma; vma->vm_next) 1586 for (vma = current->mm->mmap; vma; vma = vma->vm_next)
1594 if (maydump(vma, mm_flags)) 1587 if (maydump(vma, mm_flags))
1595 size += vma->vm_end - vma->vm_start; 1588 size += vma->vm_end - vma->vm_start;
1596 return size; 1589 return size;
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 32fb00b52cd0..b8e8b0acf9bd 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/stat.h> 13#include <linux/stat.h>
14#include <linux/slab.h>
15#include <linux/binfmts.h> 14#include <linux/binfmts.h>
16#include <linux/elf.h> 15#include <linux/elf.h>
17#include <linux/init.h> 16#include <linux/init.h>
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e0e769bdca59..49566c1687d8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -355,7 +355,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
355 355
356 if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { 356 if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
357 printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", 357 printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
358 (int) r,(int)(start_brk-start_code),(int)text_len); 358 (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
359 goto failed; 359 goto failed;
360 } 360 }
361 361
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 08343505e184..aca9d55afb22 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -8,7 +8,6 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/string.h> 9#include <linux/string.h>
10#include <linux/stat.h> 10#include <linux/stat.h>
11#include <linux/slab.h>
12#include <linux/binfmts.h> 11#include <linux/binfmts.h>
13#include <linux/init.h> 12#include <linux/init.h>
14#include <linux/file.h> 13#include <linux/file.h>
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index a16f29e888cd..612a5c38d3c1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -24,6 +24,7 @@
24#include <linux/mempool.h> 24#include <linux/mempool.h>
25#include <linux/bio.h> 25#include <linux/bio.h>
26#include <linux/workqueue.h> 26#include <linux/workqueue.h>
27#include <linux/slab.h>
27 28
28struct integrity_slab { 29struct integrity_slab {
29 struct kmem_cache *slab; 30 struct kmem_cache *slab;
diff --git a/fs/bio.c b/fs/bio.c
index e1f922184b45..e7bf6ca64dcf 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -554,7 +554,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
554 .bi_rw = bio->bi_rw, 554 .bi_rw = bio->bi_rw,
555 }; 555 };
556 556
557 if (q->merge_bvec_fn(q, &bvm, prev) < len) { 557 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
558 prev->bv_len -= len; 558 prev->bv_len -= len;
559 return 0; 559 return 0;
560 } 560 }
@@ -607,7 +607,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
607 * merge_bvec_fn() returns number of bytes it can accept 607 * merge_bvec_fn() returns number of bytes it can accept
608 * at this offset 608 * at this offset
609 */ 609 */
610 if (q->merge_bvec_fn(q, &bvm, bvec) < len) { 610 if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
611 bvec->bv_page = NULL; 611 bvec->bv_page = NULL;
612 bvec->bv_len = 0; 612 bvec->bv_len = 0;
613 bvec->bv_offset = 0; 613 bvec->bv_offset = 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d11d0289f3d2..7346c96308a5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,8 +172,9 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
172 struct file *file = iocb->ki_filp; 172 struct file *file = iocb->ki_filp;
173 struct inode *inode = file->f_mapping->host; 173 struct inode *inode = file->f_mapping->host;
174 174
175 return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), 175 return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
176 iov, offset, nr_segs, blkdev_get_blocks, NULL); 176 I_BDEV(inode), iov, offset, nr_segs,
177 blkdev_get_blocks, NULL);
177} 178}
178 179
179int __sync_blockdev(struct block_device *bdev, int wait) 180int __sync_blockdev(struct block_device *bdev, int wait)
@@ -245,37 +246,14 @@ struct super_block *freeze_bdev(struct block_device *bdev)
245 sb = get_active_super(bdev); 246 sb = get_active_super(bdev);
246 if (!sb) 247 if (!sb)
247 goto out; 248 goto out;
248 if (sb->s_flags & MS_RDONLY) { 249 error = freeze_super(sb);
249 sb->s_frozen = SB_FREEZE_TRANS; 250 if (error) {
250 up_write(&sb->s_umount); 251 deactivate_super(sb);
252 bdev->bd_fsfreeze_count--;
251 mutex_unlock(&bdev->bd_fsfreeze_mutex); 253 mutex_unlock(&bdev->bd_fsfreeze_mutex);
252 return sb; 254 return ERR_PTR(error);
253 }
254
255 sb->s_frozen = SB_FREEZE_WRITE;
256 smp_wmb();
257
258 sync_filesystem(sb);
259
260 sb->s_frozen = SB_FREEZE_TRANS;
261 smp_wmb();
262
263 sync_blockdev(sb->s_bdev);
264
265 if (sb->s_op->freeze_fs) {
266 error = sb->s_op->freeze_fs(sb);
267 if (error) {
268 printk(KERN_ERR
269 "VFS:Filesystem freeze failed\n");
270 sb->s_frozen = SB_UNFROZEN;
271 deactivate_locked_super(sb);
272 bdev->bd_fsfreeze_count--;
273 mutex_unlock(&bdev->bd_fsfreeze_mutex);
274 return ERR_PTR(error);
275 }
276 } 255 }
277 up_write(&sb->s_umount); 256 deactivate_super(sb);
278
279 out: 257 out:
280 sync_blockdev(bdev); 258 sync_blockdev(bdev);
281 mutex_unlock(&bdev->bd_fsfreeze_mutex); 259 mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -296,40 +274,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
296 274
297 mutex_lock(&bdev->bd_fsfreeze_mutex); 275 mutex_lock(&bdev->bd_fsfreeze_mutex);
298 if (!bdev->bd_fsfreeze_count) 276 if (!bdev->bd_fsfreeze_count)
299 goto out_unlock; 277 goto out;
300 278
301 error = 0; 279 error = 0;
302 if (--bdev->bd_fsfreeze_count > 0) 280 if (--bdev->bd_fsfreeze_count > 0)
303 goto out_unlock; 281 goto out;
304 282
305 if (!sb) 283 if (!sb)
306 goto out_unlock; 284 goto out;
307
308 BUG_ON(sb->s_bdev != bdev);
309 down_write(&sb->s_umount);
310 if (sb->s_flags & MS_RDONLY)
311 goto out_unfrozen;
312
313 if (sb->s_op->unfreeze_fs) {
314 error = sb->s_op->unfreeze_fs(sb);
315 if (error) {
316 printk(KERN_ERR
317 "VFS:Filesystem thaw failed\n");
318 sb->s_frozen = SB_FREEZE_TRANS;
319 bdev->bd_fsfreeze_count++;
320 mutex_unlock(&bdev->bd_fsfreeze_mutex);
321 return error;
322 }
323 }
324
325out_unfrozen:
326 sb->s_frozen = SB_UNFROZEN;
327 smp_wmb();
328 wake_up(&sb->s_wait_unfrozen);
329 285
330 if (sb) 286 error = thaw_super(sb);
331 deactivate_locked_super(sb); 287 if (error) {
332out_unlock: 288 bdev->bd_fsfreeze_count++;
289 mutex_unlock(&bdev->bd_fsfreeze_mutex);
290 return error;
291 }
292out:
333 mutex_unlock(&bdev->bd_fsfreeze_mutex); 293 mutex_unlock(&bdev->bd_fsfreeze_mutex);
334 return 0; 294 return 0;
335} 295}
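[freeze_bdev()/thaw_bdev() now keep only the per-device nesting count and delegate the filesystem state transitions to freeze_super()/thaw_super(). Usage from a hypothetical snapshotting caller -- take_snapshot() is an assumed driver-side hook:]

	struct super_block *sb;

	sb = freeze_bdev(bdev);		/* NULL if nothing is mounted */
	if (IS_ERR(sb))
		return PTR_ERR(sb);
	take_snapshot(bdev);		/* device is quiesced here */
	thaw_bdev(bdev, sb);		/* balances the freeze */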
@@ -350,8 +310,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
350 struct page **pagep, void **fsdata) 310 struct page **pagep, void **fsdata)
351{ 311{
352 *pagep = NULL; 312 *pagep = NULL;
353 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 313 return block_write_begin_newtrunc(file, mapping, pos, len, flags,
354 blkdev_get_block); 314 pagep, fsdata, blkdev_get_block);
355} 315}
356 316
357static int blkdev_write_end(struct file *file, struct address_space *mapping, 317static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -399,25 +359,28 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
399 return retval; 359 return retval;
400} 360}
401 361
402/* 362int blkdev_fsync(struct file *filp, int datasync)
403 * Filp is never NULL; the only case when ->fsync() is called with
404 * NULL first argument is nfsd_sync_dir() and that's not a directory.
405 */
406
407static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
408{ 363{
409 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 364 struct inode *bd_inode = filp->f_mapping->host;
365 struct block_device *bdev = I_BDEV(bd_inode);
410 int error; 366 int error;
411 367
412 error = sync_blockdev(bdev); 368 /*
413 if (error) 369 * There is no need to serialise calls to blkdev_issue_flush with
414 return error; 370 * i_mutex and doing so causes performance issues with concurrent
415 371 * O_SYNC writers to a block device.
416 error = blkdev_issue_flush(bdev, NULL); 372 */
373 mutex_unlock(&bd_inode->i_mutex);
374
375 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
417 if (error == -EOPNOTSUPP) 376 if (error == -EOPNOTSUPP)
418 error = 0; 377 error = 0;
378
379 mutex_lock(&bd_inode->i_mutex);
380
419 return error; 381 return error;
420} 382}
383EXPORT_SYMBOL(blkdev_fsync);
421 384
422/* 385/*
423 * pseudo-fs 386 * pseudo-fs
@@ -660,41 +623,209 @@ void bd_forget(struct inode *inode)
660 iput(bdev->bd_inode); 623 iput(bdev->bd_inode);
661} 624}
662 625
663int bd_claim(struct block_device *bdev, void *holder) 626/**
627 * bd_may_claim - test whether a block device can be claimed
628 * @bdev: block device of interest
629 * @whole: whole block device containing @bdev, may equal @bdev
630 * @holder: holder trying to claim @bdev
631 *
 632 * Test whether @bdev can be claimed by @holder.
633 *
634 * CONTEXT:
635 * spin_lock(&bdev_lock).
636 *
637 * RETURNS:
638 * %true if @bdev can be claimed, %false otherwise.
639 */
640static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
641 void *holder)
664{ 642{
665 int res;
666 spin_lock(&bdev_lock);
667
668 /* first decide result */
669 if (bdev->bd_holder == holder) 643 if (bdev->bd_holder == holder)
670 res = 0; /* already a holder */ 644 return true; /* already a holder */
671 else if (bdev->bd_holder != NULL) 645 else if (bdev->bd_holder != NULL)
672 res = -EBUSY; /* held by someone else */ 646 return false; /* held by someone else */
673 else if (bdev->bd_contains == bdev) 647 else if (bdev->bd_contains == bdev)
674 res = 0; /* is a whole device which isn't held */ 648 return true; /* is a whole device which isn't held */
675 649
676 else if (bdev->bd_contains->bd_holder == bd_claim) 650 else if (whole->bd_holder == bd_claim)
677 res = 0; /* is a partition of a device that is being partitioned */ 651 return true; /* is a partition of a device that is being partitioned */
678 else if (bdev->bd_contains->bd_holder != NULL) 652 else if (whole->bd_holder != NULL)
679 res = -EBUSY; /* is a partition of a held device */ 653 return false; /* is a partition of a held device */
680 else 654 else
681 res = 0; /* is a partition of an un-held device */ 655 return true; /* is a partition of an un-held device */
656}
657
658/**
659 * bd_prepare_to_claim - prepare to claim a block device
660 * @bdev: block device of interest
661 * @whole: the whole device containing @bdev, may equal @bdev
662 * @holder: holder trying to claim @bdev
663 *
664 * Prepare to claim @bdev. This function fails if @bdev is already
665 * claimed by another holder and waits if another claiming is in
666 * progress. This function doesn't actually claim. On successful
667 * return, the caller has ownership of bd_claiming and bd_holder[s].
668 *
669 * CONTEXT:
670 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
671 * it multiple times.
672 *
673 * RETURNS:
674 * 0 if @bdev can be claimed, -EBUSY otherwise.
675 */
676static int bd_prepare_to_claim(struct block_device *bdev,
677 struct block_device *whole, void *holder)
678{
679retry:
680 /* if someone else claimed, fail */
681 if (!bd_may_claim(bdev, whole, holder))
682 return -EBUSY;
683
684 /* if someone else is claiming, wait for it to finish */
685 if (whole->bd_claiming && whole->bd_claiming != holder) {
686 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
687 DEFINE_WAIT(wait);
688
689 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
690 spin_unlock(&bdev_lock);
691 schedule();
692 finish_wait(wq, &wait);
693 spin_lock(&bdev_lock);
694 goto retry;
695 }
696
697 /* yay, all mine */
698 return 0;
699}
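[Because bd_claiming is a pointer rather than a flag word, the code cannot use wait_on_bit() directly; it open-codes the sleep on the bit waitqueue derived from the pointer's address. The matching wake-up side, from __bd_abort_claiming() below, is the pairing that ends the claiming block:]

	/* owner side, under bdev_lock: end the claiming block */
	whole->bd_claiming = NULL;
	wake_up_bit(&whole->bd_claiming, 0);	/* wakes the bit_waitqueue(..., 0) sleepers */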
700
701/**
702 * bd_start_claiming - start claiming a block device
703 * @bdev: block device of interest
704 * @holder: holder trying to claim @bdev
705 *
706 * @bdev is about to be opened exclusively. Check @bdev can be opened
707 * exclusively and mark that an exclusive open is in progress. Each
708 * successful call to this function must be matched with a call to
709 * either bd_claim() or bd_abort_claiming(). If this function
710 * succeeds, the matching bd_claim() is guaranteed to succeed.
711 *
712 * CONTEXT:
713 * Might sleep.
714 *
715 * RETURNS:
716 * Pointer to the block device containing @bdev on success, ERR_PTR()
717 * value on failure.
718 */
719static struct block_device *bd_start_claiming(struct block_device *bdev,
720 void *holder)
721{
722 struct gendisk *disk;
723 struct block_device *whole;
724 int partno, err;
725
726 might_sleep();
727
728 /*
729 * @bdev might not have been initialized properly yet, look up
730 * and grab the outer block device the hard way.
731 */
732 disk = get_gendisk(bdev->bd_dev, &partno);
733 if (!disk)
734 return ERR_PTR(-ENXIO);
735
736 whole = bdget_disk(disk, 0);
737 put_disk(disk);
738 if (!whole)
739 return ERR_PTR(-ENOMEM);
740
741 /* prepare to claim, if successful, mark claiming in progress */
742 spin_lock(&bdev_lock);
743
744 err = bd_prepare_to_claim(bdev, whole, holder);
745 if (err == 0) {
746 whole->bd_claiming = holder;
747 spin_unlock(&bdev_lock);
748 return whole;
749 } else {
750 spin_unlock(&bdev_lock);
751 bdput(whole);
752 return ERR_PTR(err);
753 }
754}
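
bd_start_claiming(), bd_claim() and bd_abort_claiming() give exclusive-open paths a two-phase shape, which the converted callers below follow. A condensed sketch of that shape (helper name hypothetical, error handling trimmed):

static struct block_device *open_excl_sketch(struct block_device *bdev,
					     fmode_t mode, void *holder)
{
	struct block_device *whole;
	int err;

	whole = bd_start_claiming(bdev, holder);  /* phase 1: pin the claim */
	if (IS_ERR(whole))
		return whole;

	err = blkdev_get(bdev, mode);		  /* open with claim pinned */
	if (err) {
		bd_abort_claiming(whole, holder); /* undo the pin */
		return ERR_PTR(err);
	}

	BUG_ON(bd_claim(bdev, holder) != 0);	  /* phase 2: cannot fail */
	return bdev;
}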
682 755
683 /* now impose change */ 756/* releases bdev_lock */
684 if (res==0) { 757static void __bd_abort_claiming(struct block_device *whole, void *holder)
758{
759 BUG_ON(whole->bd_claiming != holder);
760 whole->bd_claiming = NULL;
761 wake_up_bit(&whole->bd_claiming, 0);
762
763 spin_unlock(&bdev_lock);
764 bdput(whole);
765}
766
767/**
768 * bd_abort_claiming - abort claiming a block device
769 * @whole: whole block device returned by bd_start_claiming()
770 * @holder: holder trying to claim @bdev
771 *
772 * Abort the claiming block started by bd_start_claiming(). Note that
773 * @whole is not the block device to be claimed but the whole device
774 * returned by bd_start_claiming().
775 *
776 * CONTEXT:
777 * Grabs and releases bdev_lock.
778 */
779static void bd_abort_claiming(struct block_device *whole, void *holder)
780{
781 spin_lock(&bdev_lock);
782 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
783}
784
785/**
786 * bd_claim - claim a block device
787 * @bdev: block device to claim
788 * @holder: holder trying to claim @bdev
789 *
790 * Try to claim @bdev which must have been opened successfully. This
791 * function may be called with or without preceding
792 * blk_start_claiming(). In the former case, this function is always
793 * successful and terminates the claiming block.
794 *
795 * CONTEXT:
796 * Might sleep.
797 *
798 * RETURNS:
799 * 0 if successful, -EBUSY if @bdev is already claimed.
800 */
801int bd_claim(struct block_device *bdev, void *holder)
802{
803 struct block_device *whole = bdev->bd_contains;
804 int res;
805
806 might_sleep();
807
808 spin_lock(&bdev_lock);
809
810 res = bd_prepare_to_claim(bdev, whole, holder);
811 if (res == 0) {
685 /* note that for a whole device bd_holders 812 /* note that for a whole device bd_holders
686 * will be incremented twice, and bd_holder will 813 * will be incremented twice, and bd_holder will
687 * be set to bd_claim before being set to holder 814 * be set to bd_claim before being set to holder
688 */ 815 */
689 bdev->bd_contains->bd_holders ++; 816 whole->bd_holders++;
690 bdev->bd_contains->bd_holder = bd_claim; 817 whole->bd_holder = bd_claim;
691 bdev->bd_holders++; 818 bdev->bd_holders++;
692 bdev->bd_holder = holder; 819 bdev->bd_holder = holder;
693 } 820 }
694 spin_unlock(&bdev_lock); 821
822 if (whole->bd_claiming)
823 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
824 else
825 spin_unlock(&bdev_lock);
826
695 return res; 827 return res;
696} 828}
697
698EXPORT_SYMBOL(bd_claim); 829EXPORT_SYMBOL(bd_claim);
699 830
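
When no bd_start_claiming() precedes it, bd_claim() is simply paired with bd_release(); a minimal sketch (caller name hypothetical):

static int with_exclusive_access(struct block_device *bdev, void *holder)
{
	int err;

	err = bd_claim(bdev, holder);	/* may sleep; -EBUSY if held */
	if (err)
		return err;

	/* ... exclusive access to the device ... */

	bd_release(bdev);		/* drop the claim */
	return 0;
}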
700void bd_release(struct block_device *bdev) 831void bd_release(struct block_device *bdev)
@@ -1308,6 +1439,7 @@ EXPORT_SYMBOL(blkdev_get);
1308 1439
1309static int blkdev_open(struct inode * inode, struct file * filp) 1440static int blkdev_open(struct inode * inode, struct file * filp)
1310{ 1441{
1442 struct block_device *whole = NULL;
1311 struct block_device *bdev; 1443 struct block_device *bdev;
1312 int res; 1444 int res;
1313 1445
@@ -1330,22 +1462,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1330 if (bdev == NULL) 1462 if (bdev == NULL)
1331 return -ENOMEM; 1463 return -ENOMEM;
1332 1464
1465 if (filp->f_mode & FMODE_EXCL) {
1466 whole = bd_start_claiming(bdev, filp);
1467 if (IS_ERR(whole)) {
1468 bdput(bdev);
1469 return PTR_ERR(whole);
1470 }
1471 }
1472
1333 filp->f_mapping = bdev->bd_inode->i_mapping; 1473 filp->f_mapping = bdev->bd_inode->i_mapping;
1334 1474
1335 res = blkdev_get(bdev, filp->f_mode); 1475 res = blkdev_get(bdev, filp->f_mode);
1336 if (res)
1337 return res;
1338 1476
1339 if (filp->f_mode & FMODE_EXCL) { 1477 if (whole) {
1340 res = bd_claim(bdev, filp); 1478 if (res == 0)
1341 if (res) 1479 BUG_ON(bd_claim(bdev, filp) != 0);
1342 goto out_blkdev_put; 1480 else
1481 bd_abort_claiming(whole, filp);
1343 } 1482 }
1344 1483
1345 return 0;
1346
1347 out_blkdev_put:
1348 blkdev_put(bdev, filp->f_mode);
1349 return res; 1484 return res;
1350} 1485}
1351 1486
@@ -1481,7 +1616,7 @@ const struct file_operations def_blk_fops = {
1481 .aio_read = generic_file_aio_read, 1616 .aio_read = generic_file_aio_read,
1482 .aio_write = blkdev_aio_write, 1617 .aio_write = blkdev_aio_write,
1483 .mmap = generic_file_mmap, 1618 .mmap = generic_file_mmap,
1484 .fsync = block_fsync, 1619 .fsync = blkdev_fsync,
1485 .unlocked_ioctl = block_ioctl, 1620 .unlocked_ioctl = block_ioctl,
1486#ifdef CONFIG_COMPAT 1621#ifdef CONFIG_COMPAT
1487 .compat_ioctl = compat_blkdev_ioctl, 1622 .compat_ioctl = compat_blkdev_ioctl,
@@ -1556,27 +1691,34 @@ EXPORT_SYMBOL(lookup_bdev);
1556 */ 1691 */
1557struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) 1692struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1558{ 1693{
1559 struct block_device *bdev; 1694 struct block_device *bdev, *whole;
1560 int error = 0; 1695 int error;
1561 1696
1562 bdev = lookup_bdev(path); 1697 bdev = lookup_bdev(path);
1563 if (IS_ERR(bdev)) 1698 if (IS_ERR(bdev))
1564 return bdev; 1699 return bdev;
1565 1700
1701 whole = bd_start_claiming(bdev, holder);
1702 if (IS_ERR(whole)) {
1703 bdput(bdev);
1704 return whole;
1705 }
1706
1566 error = blkdev_get(bdev, mode); 1707 error = blkdev_get(bdev, mode);
1567 if (error) 1708 if (error)
1568 return ERR_PTR(error); 1709 goto out_abort_claiming;
1710
1569 error = -EACCES; 1711 error = -EACCES;
1570 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) 1712 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1571 goto blkdev_put; 1713 goto out_blkdev_put;
1572 error = bd_claim(bdev, holder);
1573 if (error)
1574 goto blkdev_put;
1575 1714
1715 BUG_ON(bd_claim(bdev, holder) != 0);
1576 return bdev; 1716 return bdev;
1577 1717
1578blkdev_put: 1718out_blkdev_put:
1579 blkdev_put(bdev, mode); 1719 blkdev_put(bdev, mode);
1720out_abort_claiming:
1721 bd_abort_claiming(whole, holder);
1580 return ERR_PTR(error); 1722 return ERR_PTR(error);
1581} 1723}
1582 1724
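
A typical consumer of open_bdev_exclusive() is a filesystem pinning an extra device for the lifetime of a mount. A hedged sketch, assuming the usual close_bdev_exclusive() counterpart and hypothetical my_fs names:

struct my_fs_info {
	struct block_device *journal_bdev;
};

static int my_fs_attach_journal(struct my_fs_info *fs, const char *path)
{
	struct block_device *bdev;

	bdev = open_bdev_exclusive(path, FMODE_READ | FMODE_WRITE, fs);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);	/* e.g. -EBUSY, -EACCES, -ENOENT */

	fs->journal_bdev = bdev;
	return 0;
}

static void my_fs_detach_journal(struct my_fs_info *fs)
{
	/* mode must match the one passed to open_bdev_exclusive() */
	close_bdev_exclusive(fs->journal_bdev, FMODE_READ | FMODE_WRITE);
}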
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6df6d6ed74fd..8d432cd9d580 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
22#include <linux/posix_acl_xattr.h> 22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h> 23#include <linux/posix_acl.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25 26
26#include "ctree.h" 27#include "ctree.h"
27#include "btrfs_inode.h" 28#include "btrfs_inode.h"
@@ -281,14 +282,14 @@ int btrfs_acl_chmod(struct inode *inode)
281 return ret; 282 return ret;
282} 283}
283 284
284struct xattr_handler btrfs_xattr_acl_default_handler = { 285const struct xattr_handler btrfs_xattr_acl_default_handler = {
285 .prefix = POSIX_ACL_XATTR_DEFAULT, 286 .prefix = POSIX_ACL_XATTR_DEFAULT,
286 .flags = ACL_TYPE_DEFAULT, 287 .flags = ACL_TYPE_DEFAULT,
287 .get = btrfs_xattr_acl_get, 288 .get = btrfs_xattr_acl_get,
288 .set = btrfs_xattr_acl_set, 289 .set = btrfs_xattr_acl_set,
289}; 290};
290 291
291struct xattr_handler btrfs_xattr_acl_access_handler = { 292const struct xattr_handler btrfs_xattr_acl_access_handler = {
292 .prefix = POSIX_ACL_XATTR_ACCESS, 293 .prefix = POSIX_ACL_XATTR_ACCESS,
293 .flags = ACL_TYPE_ACCESS, 294 .flags = ACL_TYPE_ACCESS,
294 .get = btrfs_xattr_acl_get, 295 .get = btrfs_xattr_acl_get,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/kthread.h> 19#include <linux/kthread.h>
20#include <linux/slab.h>
20#include <linux/list.h> 21#include <linux/list.h>
21#include <linux/spinlock.h> 22#include <linux/spinlock.h>
22#include <linux/freezer.h> 23#include <linux/freezer.h>
@@ -376,6 +377,7 @@ again:
376 if (!list_empty(&worker->pending) || 377 if (!list_empty(&worker->pending) ||
377 !list_empty(&worker->prio_pending)) { 378 !list_empty(&worker->prio_pending)) {
378 spin_unlock_irq(&worker->lock); 379 spin_unlock_irq(&worker->lock);
380 set_current_state(TASK_RUNNING);
379 goto again; 381 goto again;
380 } 382 }
381 383
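
The added set_current_state(TASK_RUNNING) matters because the worker has already marked itself TASK_INTERRUPTIBLE before re-checking its queues; without flipping back, it could be scheduled away mid-work. A generic sketch of the sleep/recheck idiom (hypothetical helper, not the btrfs worker itself):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static void handle_pending(struct list_head *pending);	/* hypothetical */

static void idle_or_work(spinlock_t *lock, struct list_head *pending)
{
	set_current_state(TASK_INTERRUPTIBLE);	/* announce intent to sleep */
	spin_lock_irq(lock);
	if (!list_empty(pending)) {
		spin_unlock_irq(lock);
		/* back out of the sleep before doing more work */
		set_current_state(TASK_RUNNING);
		handle_pending(pending);
		return;
	}
	spin_unlock_irq(lock);
	schedule();				/* really go to sleep */
}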
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock; 139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents;
140 int reserved_extents; 141 int reserved_extents;
141 int outstanding_extents;
142 142
143 /* 143 /*
144 * ordered_data_close is set by truncate when a file that used 144 * ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
151 * of these. 151 * of these.
152 */ 152 */
153 unsigned ordered_data_close:1; 153 unsigned ordered_data_close:1;
154 unsigned orphan_meta_reserved:1;
154 unsigned dummy_inode:1; 155 unsigned dummy_inode:1;
155 156
156 /* 157 /*
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 28b92a7218ab..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,7 @@
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/writeback.h> 32#include <linux/writeback.h>
33#include <linux/bit_spinlock.h> 33#include <linux/bit_spinlock.h>
34#include <linux/pagevec.h> 34#include <linux/slab.h>
35#include "compat.h" 35#include "compat.h"
36#include "ctree.h" 36#include "ctree.h"
37#include "disk-io.h" 37#include "disk-io.h"
@@ -445,7 +445,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
445 unsigned long nr_pages = 0; 445 unsigned long nr_pages = 0;
446 struct extent_map *em; 446 struct extent_map *em;
447 struct address_space *mapping = inode->i_mapping; 447 struct address_space *mapping = inode->i_mapping;
448 struct pagevec pvec;
449 struct extent_map_tree *em_tree; 448 struct extent_map_tree *em_tree;
450 struct extent_io_tree *tree; 449 struct extent_io_tree *tree;
451 u64 end; 450 u64 end;
@@ -461,7 +460,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
461 460
462 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 461 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
463 462
464 pagevec_init(&pvec, 0);
465 while (last_offset < compressed_end) { 463 while (last_offset < compressed_end) {
466 page_index = last_offset >> PAGE_CACHE_SHIFT; 464 page_index = last_offset >> PAGE_CACHE_SHIFT;
467 465
@@ -478,26 +476,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
478 goto next; 476 goto next;
479 } 477 }
480 478
481 page = alloc_page(mapping_gfp_mask(mapping) & ~__GFP_FS); 479 page = __page_cache_alloc(mapping_gfp_mask(mapping) &
480 ~__GFP_FS);
482 if (!page) 481 if (!page)
483 break; 482 break;
484 483
485 page->index = page_index; 484 if (add_to_page_cache_lru(page, mapping, page_index,
486 /* 485 GFP_NOFS)) {
487 * what we want to do here is call add_to_page_cache_lru,
488 * but that isn't exported, so we reproduce it here
489 */
490 if (add_to_page_cache(page, mapping,
491 page->index, GFP_NOFS)) {
492 page_cache_release(page); 486 page_cache_release(page);
493 goto next; 487 goto next;
494 } 488 }
495 489
496 /* open coding of lru_cache_add, also not exported */
497 page_cache_get(page);
498 if (!pagevec_add(&pvec, page))
499 __pagevec_lru_add_file(&pvec);
500
501 end = last_offset + PAGE_CACHE_SIZE - 1; 490 end = last_offset + PAGE_CACHE_SIZE - 1;
502 /* 491 /*
503 * at this point, we have a locked page in the page cache 492 * at this point, we have a locked page in the page cache
@@ -551,8 +540,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
551next: 540next:
552 last_offset += PAGE_CACHE_SIZE; 541 last_offset += PAGE_CACHE_SIZE;
553 } 542 }
554 if (pagevec_count(&pvec))
555 __pagevec_lru_add_file(&pvec);
556 return 0; 543 return 0;
557} 544}
558 545
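
With add_to_page_cache_lru() available here, the readahead-style insertion above collapses to one call. A minimal sketch of the resulting pattern:

#include <linux/pagemap.h>

static struct page *add_ra_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	/* mask out __GFP_FS so allocation cannot recurse into the fs */
	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (!page)
		return NULL;

	/* inserts into the page cache and the LRU in one step */
	if (add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
		page_cache_release(page);	/* already cached; drop ours */
		return NULL;
	}
	return page;	/* locked page, already on the LRU */
}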
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..0d1d966b0fe4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "disk-io.h" 22#include "disk-io.h"
22#include "transaction.h" 23#include "transaction.h"
@@ -279,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
279static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
280 struct btrfs_root *root, 281 struct btrfs_root *root,
281 struct extent_buffer *buf, 282 struct extent_buffer *buf,
282 struct extent_buffer *cow) 283 struct extent_buffer *cow,
284 int *last_ref)
283{ 285{
284 u64 refs; 286 u64 refs;
285 u64 owner; 287 u64 owner;
@@ -365,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
365 BUG_ON(ret); 367 BUG_ON(ret);
366 } 368 }
367 clean_tree_block(trans, root, buf); 369 clean_tree_block(trans, root, buf);
370 *last_ref = 1;
368 } 371 }
369 return 0; 372 return 0;
370} 373}
@@ -391,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
391 struct btrfs_disk_key disk_key; 394 struct btrfs_disk_key disk_key;
392 struct extent_buffer *cow; 395 struct extent_buffer *cow;
393 int level; 396 int level;
397 int last_ref = 0;
394 int unlock_orig = 0; 398 int unlock_orig = 0;
395 u64 parent_start; 399 u64 parent_start;
396 400
@@ -441,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
441 (unsigned long)btrfs_header_fsid(cow), 445 (unsigned long)btrfs_header_fsid(cow),
442 BTRFS_FSID_SIZE); 446 BTRFS_FSID_SIZE);
443 447
444 update_ref_for_cow(trans, root, buf, cow); 448 update_ref_for_cow(trans, root, buf, cow, &last_ref);
449
450 if (root->ref_cows)
451 btrfs_reloc_cow_block(trans, root, buf, cow);
445 452
446 if (buf == root->node) { 453 if (buf == root->node) {
447 WARN_ON(parent && parent != buf); 454 WARN_ON(parent && parent != buf);
@@ -456,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
456 extent_buffer_get(cow); 463 extent_buffer_get(cow);
457 spin_unlock(&root->node_lock); 464 spin_unlock(&root->node_lock);
458 465
459 btrfs_free_tree_block(trans, root, buf->start, buf->len, 466 btrfs_free_tree_block(trans, root, buf, parent_start,
460 parent_start, root->root_key.objectid, level); 467 last_ref);
461 free_extent_buffer(buf); 468 free_extent_buffer(buf);
462 add_root_to_dirty_list(root); 469 add_root_to_dirty_list(root);
463 } else { 470 } else {
@@ -472,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
472 btrfs_set_node_ptr_generation(parent, parent_slot, 479 btrfs_set_node_ptr_generation(parent, parent_slot,
473 trans->transid); 480 trans->transid);
474 btrfs_mark_buffer_dirty(parent); 481 btrfs_mark_buffer_dirty(parent);
475 btrfs_free_tree_block(trans, root, buf->start, buf->len, 482 btrfs_free_tree_block(trans, root, buf, parent_start,
476 parent_start, root->root_key.objectid, level); 483 last_ref);
477 } 484 }
478 if (unlock_orig) 485 if (unlock_orig)
479 btrfs_tree_unlock(buf); 486 btrfs_tree_unlock(buf);
@@ -948,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
948 return bin_search(eb, key, level, slot); 955 return bin_search(eb, key, level, slot);
949} 956}
950 957
958static void root_add_used(struct btrfs_root *root, u32 size)
959{
960 spin_lock(&root->accounting_lock);
961 btrfs_set_root_used(&root->root_item,
962 btrfs_root_used(&root->root_item) + size);
963 spin_unlock(&root->accounting_lock);
964}
965
966static void root_sub_used(struct btrfs_root *root, u32 size)
967{
968 spin_lock(&root->accounting_lock);
969 btrfs_set_root_used(&root->root_item,
970 btrfs_root_used(&root->root_item) - size);
971 spin_unlock(&root->accounting_lock);
972}
973
951/* given a node and slot number, this reads the blocks it points to. The 974/* given a node and slot number, this reads the blocks it points to. The
952 * extent buffer is returned with a reference taken (but unlocked). 975 * extent buffer is returned with a reference taken (but unlocked).
953 * NULL is returned on error. 976 * NULL is returned on error.
@@ -1018,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1018 btrfs_tree_lock(child); 1041 btrfs_tree_lock(child);
1019 btrfs_set_lock_blocking(child); 1042 btrfs_set_lock_blocking(child);
1020 ret = btrfs_cow_block(trans, root, child, mid, 0, &child); 1043 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1021 BUG_ON(ret); 1044 if (ret) {
1045 btrfs_tree_unlock(child);
1046 free_extent_buffer(child);
1047 goto enospc;
1048 }
1022 1049
1023 spin_lock(&root->node_lock); 1050 spin_lock(&root->node_lock);
1024 root->node = child; 1051 root->node = child;
@@ -1033,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1033 btrfs_tree_unlock(mid); 1060 btrfs_tree_unlock(mid);
1034 /* once for the path */ 1061 /* once for the path */
1035 free_extent_buffer(mid); 1062 free_extent_buffer(mid);
1036 ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, 1063
1037 0, root->root_key.objectid, level); 1064 root_sub_used(root, mid->len);
1065 btrfs_free_tree_block(trans, root, mid, 0, 1);
1038 /* once for the root ptr */ 1066 /* once for the root ptr */
1039 free_extent_buffer(mid); 1067 free_extent_buffer(mid);
1040 return ret; 1068 return 0;
1041 } 1069 }
1042 if (btrfs_header_nritems(mid) > 1070 if (btrfs_header_nritems(mid) >
1043 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1071 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1087,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1087 if (wret < 0 && wret != -ENOSPC) 1115 if (wret < 0 && wret != -ENOSPC)
1088 ret = wret; 1116 ret = wret;
1089 if (btrfs_header_nritems(right) == 0) { 1117 if (btrfs_header_nritems(right) == 0) {
1090 u64 bytenr = right->start;
1091 u32 blocksize = right->len;
1092
1093 clean_tree_block(trans, root, right); 1118 clean_tree_block(trans, root, right);
1094 btrfs_tree_unlock(right); 1119 btrfs_tree_unlock(right);
1095 free_extent_buffer(right);
1096 right = NULL;
1097 wret = del_ptr(trans, root, path, level + 1, pslot + 1120 wret = del_ptr(trans, root, path, level + 1, pslot +
1098 1); 1121 1);
1099 if (wret) 1122 if (wret)
1100 ret = wret; 1123 ret = wret;
1101 wret = btrfs_free_tree_block(trans, root, 1124 root_sub_used(root, right->len);
1102 bytenr, blocksize, 0, 1125 btrfs_free_tree_block(trans, root, right, 0, 1);
1103 root->root_key.objectid, 1126 free_extent_buffer(right);
1104 level); 1127 right = NULL;
1105 if (wret)
1106 ret = wret;
1107 } else { 1128 } else {
1108 struct btrfs_disk_key right_key; 1129 struct btrfs_disk_key right_key;
1109 btrfs_node_key(right, &right_key, 0); 1130 btrfs_node_key(right, &right_key, 0);
@@ -1135,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1135 BUG_ON(wret == 1); 1156 BUG_ON(wret == 1);
1136 } 1157 }
1137 if (btrfs_header_nritems(mid) == 0) { 1158 if (btrfs_header_nritems(mid) == 0) {
1138 /* we've managed to empty the middle node, drop it */
1139 u64 bytenr = mid->start;
1140 u32 blocksize = mid->len;
1141
1142 clean_tree_block(trans, root, mid); 1159 clean_tree_block(trans, root, mid);
1143 btrfs_tree_unlock(mid); 1160 btrfs_tree_unlock(mid);
1144 free_extent_buffer(mid);
1145 mid = NULL;
1146 wret = del_ptr(trans, root, path, level + 1, pslot); 1161 wret = del_ptr(trans, root, path, level + 1, pslot);
1147 if (wret) 1162 if (wret)
1148 ret = wret; 1163 ret = wret;
1149 wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, 1164 root_sub_used(root, mid->len);
1150 0, root->root_key.objectid, level); 1165 btrfs_free_tree_block(trans, root, mid, 0, 1);
1151 if (wret) 1166 free_extent_buffer(mid);
1152 ret = wret; 1167 mid = NULL;
1153 } else { 1168 } else {
1154 /* update the parent key to reflect our changes */ 1169 /* update the parent key to reflect our changes */
1155 struct btrfs_disk_key mid_key; 1170 struct btrfs_disk_key mid_key;
@@ -1589,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1589 btrfs_release_path(NULL, p); 1604 btrfs_release_path(NULL, p);
1590 1605
1591 ret = -EAGAIN; 1606 ret = -EAGAIN;
1592 tmp = read_tree_block(root, blocknr, blocksize, gen); 1607 tmp = read_tree_block(root, blocknr, blocksize, 0);
1593 if (tmp) { 1608 if (tmp) {
1594 /* 1609 /*
1595 * If the read above didn't mark this buffer up to date, 1610 * If the read above didn't mark this buffer up to date,
@@ -1739,7 +1754,6 @@ again:
1739 p->nodes[level + 1], 1754 p->nodes[level + 1],
1740 p->slots[level + 1], &b); 1755 p->slots[level + 1], &b);
1741 if (err) { 1756 if (err) {
1742 free_extent_buffer(b);
1743 ret = err; 1757 ret = err;
1744 goto done; 1758 goto done;
1745 } 1759 }
@@ -2075,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2075 if (IS_ERR(c)) 2089 if (IS_ERR(c))
2076 return PTR_ERR(c); 2090 return PTR_ERR(c);
2077 2091
2092 root_add_used(root, root->nodesize);
2093
2078 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 2094 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2079 btrfs_set_header_nritems(c, 1); 2095 btrfs_set_header_nritems(c, 1);
2080 btrfs_set_header_level(c, level); 2096 btrfs_set_header_level(c, level);
@@ -2133,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2133 int nritems; 2149 int nritems;
2134 2150
2135 BUG_ON(!path->nodes[level]); 2151 BUG_ON(!path->nodes[level]);
2152 btrfs_assert_tree_locked(path->nodes[level]);
2136 lower = path->nodes[level]; 2153 lower = path->nodes[level];
2137 nritems = btrfs_header_nritems(lower); 2154 nritems = btrfs_header_nritems(lower);
2138 BUG_ON(slot > nritems); 2155 BUG_ON(slot > nritems);
@@ -2201,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2201 if (IS_ERR(split)) 2218 if (IS_ERR(split))
2202 return PTR_ERR(split); 2219 return PTR_ERR(split);
2203 2220
2221 root_add_used(root, root->nodesize);
2222
2204 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 2223 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2205 btrfs_set_header_level(split, btrfs_header_level(c)); 2224 btrfs_set_header_level(split, btrfs_header_level(c));
2206 btrfs_set_header_bytenr(split, split->start); 2225 btrfs_set_header_bytenr(split, split->start);
@@ -2414,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2414 2433
2415 if (left_nritems) 2434 if (left_nritems)
2416 btrfs_mark_buffer_dirty(left); 2435 btrfs_mark_buffer_dirty(left);
2436 else
2437 clean_tree_block(trans, root, left);
2438
2417 btrfs_mark_buffer_dirty(right); 2439 btrfs_mark_buffer_dirty(right);
2418 2440
2419 btrfs_item_key(right, &disk_key, 0); 2441 btrfs_item_key(right, &disk_key, 0);
@@ -2659,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2659 btrfs_mark_buffer_dirty(left); 2681 btrfs_mark_buffer_dirty(left);
2660 if (right_nritems) 2682 if (right_nritems)
2661 btrfs_mark_buffer_dirty(right); 2683 btrfs_mark_buffer_dirty(right);
2684 else
2685 clean_tree_block(trans, root, right);
2662 2686
2663 btrfs_item_key(right, &disk_key, 0); 2687 btrfs_item_key(right, &disk_key, 0);
2664 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2688 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2668,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2668 /* then fixup the leaf pointer in the path */ 2692 /* then fixup the leaf pointer in the path */
2669 if (path->slots[0] < push_items) { 2693 if (path->slots[0] < push_items) {
2670 path->slots[0] += old_left_nritems; 2694 path->slots[0] += old_left_nritems;
2671 if (btrfs_header_nritems(path->nodes[0]) == 0)
2672 clean_tree_block(trans, root, path->nodes[0]);
2673 btrfs_tree_unlock(path->nodes[0]); 2695 btrfs_tree_unlock(path->nodes[0]);
2674 free_extent_buffer(path->nodes[0]); 2696 free_extent_buffer(path->nodes[0]);
2675 path->nodes[0] = left; 2697 path->nodes[0] = left;
@@ -2931,10 +2953,10 @@ again:
2931 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 2953 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2932 root->root_key.objectid, 2954 root->root_key.objectid,
2933 &disk_key, 0, l->start, 0); 2955 &disk_key, 0, l->start, 0);
2934 if (IS_ERR(right)) { 2956 if (IS_ERR(right))
2935 BUG_ON(1);
2936 return PTR_ERR(right); 2957 return PTR_ERR(right);
2937 } 2958
2959 root_add_used(root, root->leafsize);
2938 2960
2939 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 2961 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2940 btrfs_set_header_bytenr(right, right->start); 2962 btrfs_set_header_bytenr(right, right->start);
@@ -3040,6 +3062,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3040 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) 3062 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3041 goto err; 3063 goto err;
3042 3064
3065 /* the leaf has changed, it now has room. return now */
3066 if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3067 goto err;
3068
3043 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3069 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3044 fi = btrfs_item_ptr(leaf, path->slots[0], 3070 fi = btrfs_item_ptr(leaf, path->slots[0],
3045 struct btrfs_file_extent_item); 3071 struct btrfs_file_extent_item);
@@ -3049,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3049 3075
3050 btrfs_set_path_blocking(path); 3076 btrfs_set_path_blocking(path);
3051 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3077 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3052 BUG_ON(ret); 3078 if (ret)
3079 goto err;
3053 3080
3054 path->keep_locks = 0; 3081 path->keep_locks = 0;
3055 btrfs_unlock_up_safe(path, 1); 3082 btrfs_unlock_up_safe(path, 1);
@@ -3791,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3791 */ 3818 */
3792 btrfs_unlock_up_safe(path, 0); 3819 btrfs_unlock_up_safe(path, 0);
3793 3820
3794 ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, 3821 root_sub_used(root, leaf->len);
3795 0, root->root_key.objectid, 0); 3822
3796 return ret; 3823 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3824 return 0;
3797} 3825}
3798/* 3826/*
3799 * delete the item at the leaf level in path. If that empties 3827 * delete the item at the leaf level in path. If that empties
@@ -3860,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3860 if (leaf == root->node) { 3888 if (leaf == root->node) {
3861 btrfs_set_header_level(leaf, 0); 3889 btrfs_set_header_level(leaf, 0);
3862 } else { 3890 } else {
3891 btrfs_set_path_blocking(path);
3892 clean_tree_block(trans, root, leaf);
3863 ret = btrfs_del_leaf(trans, root, path, leaf); 3893 ret = btrfs_del_leaf(trans, root, path, leaf);
3864 BUG_ON(ret); 3894 BUG_ON(ret);
3865 } 3895 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0af2e3868573..29c20092847e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
26#include <linux/completion.h> 26#include <linux/completion.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h>
29#include <asm/kmap_types.h> 30#include <asm/kmap_types.h>
30#include "extent_io.h" 31#include "extent_io.h"
31#include "extent_map.h" 32#include "extent_map.h"
@@ -33,6 +34,7 @@
33 34
34struct btrfs_trans_handle; 35struct btrfs_trans_handle;
35struct btrfs_transaction; 36struct btrfs_transaction;
37struct btrfs_pending_snapshot;
36extern struct kmem_cache *btrfs_trans_handle_cachep; 38extern struct kmem_cache *btrfs_trans_handle_cachep;
37extern struct kmem_cache *btrfs_transaction_cachep; 39extern struct kmem_cache *btrfs_transaction_cachep;
38extern struct kmem_cache *btrfs_bit_radix_cachep; 40extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -662,6 +664,7 @@ struct btrfs_csum_item {
662#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 664#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
663#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 665#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
664#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 666#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
667#define BTRFS_NR_RAID_TYPES 5
665 668
666struct btrfs_block_group_item { 669struct btrfs_block_group_item {
667 __le64 used; 670 __le64 used;
@@ -673,42 +676,46 @@ struct btrfs_space_info {
673 u64 flags; 676 u64 flags;
674 677
675 u64 total_bytes; /* total bytes in the space */ 678 u64 total_bytes; /* total bytes in the space */
676 u64 bytes_used; /* total bytes used on disk */ 679 u64 bytes_used; /* total bytes used,
680 this doesn't take mirrors into account */
677 u64 bytes_pinned; /* total bytes pinned, will be freed when the 681 u64 bytes_pinned; /* total bytes pinned, will be freed when the
678 transaction finishes */ 682 transaction finishes */
679 u64 bytes_reserved; /* total bytes the allocator has reserved for 683 u64 bytes_reserved; /* total bytes the allocator has reserved for
680 current allocations */ 684 current allocations */
681 u64 bytes_readonly; /* total bytes that are read only */ 685 u64 bytes_readonly; /* total bytes that are read only */
682 u64 bytes_super; /* total bytes reserved for the super blocks */ 686
683 u64 bytes_root; /* the number of bytes needed to commit a
684 transaction */
685 u64 bytes_may_use; /* number of bytes that may be used for 687 u64 bytes_may_use; /* number of bytes that may be used for
686 delalloc/allocations */ 688 delalloc/allocations */
687 u64 bytes_delalloc; /* number of bytes currently reserved for 689 u64 disk_used; /* total bytes used on disk */
688 delayed allocation */
689 690
690 int full; /* indicates that we cannot allocate any more 691 int full; /* indicates that we cannot allocate any more
691 chunks for this space */ 692 chunks for this space */
692 int force_alloc; /* set if we need to force a chunk alloc for 693 int force_alloc; /* set if we need to force a chunk alloc for
693 this space */ 694 this space */
694 int force_delalloc; /* make people start doing filemap_flush until
695 we're under a threshold */
696 695
697 struct list_head list; 696 struct list_head list;
698 697
699 /* for controlling how we free up space for allocations */
700 wait_queue_head_t allocate_wait;
701 wait_queue_head_t flush_wait;
702 int allocating_chunk;
703 int flushing;
704
705 /* for block groups in our same type */ 698 /* for block groups in our same type */
706 struct list_head block_groups; 699 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
707 spinlock_t lock; 700 spinlock_t lock;
708 struct rw_semaphore groups_sem; 701 struct rw_semaphore groups_sem;
709 atomic_t caching_threads; 702 atomic_t caching_threads;
710}; 703};
711 704
705struct btrfs_block_rsv {
706 u64 size;
707 u64 reserved;
708 u64 freed[2];
709 struct btrfs_space_info *space_info;
710 struct list_head list;
711 spinlock_t lock;
712 atomic_t usage;
713 unsigned int priority:8;
714 unsigned int durable:1;
715 unsigned int refill_used:1;
716 unsigned int full:1;
717};
718
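
The entry points for this struct are declared further down in this header (btrfs_block_rsv_add() and friends). A sketch of the intended reserve/consume/release flow, with semantics inferred from the field names above rather than stated by the patch:

static int do_metadata_op(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_block_rsv *rsv, u64 num_bytes)
{
	int retries = 0;
	int ret;

	/* grow rsv->reserved by num_bytes, retrying as needed */
	ret = btrfs_block_rsv_add(trans, root, rsv, num_bytes, &retries);
	if (ret)
		return ret;	/* reservation failed, e.g. -ENOSPC */

	/* ... perform the operation, consuming up to num_bytes ... */

	btrfs_block_rsv_release(root, rsv, num_bytes);	/* return the space */
	return 0;
}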
712/* 719/*
713 * free clusters are used to claim free space in relatively large chunks, 720 * free clusters are used to claim free space in relatively large chunks,
714 * allowing us to do less seeky writes. They are used for all metadata 721 * allowing us to do less seeky writes. They are used for all metadata
@@ -759,6 +766,7 @@ struct btrfs_block_group_cache {
759 spinlock_t lock; 766 spinlock_t lock;
760 u64 pinned; 767 u64 pinned;
761 u64 reserved; 768 u64 reserved;
769 u64 reserved_pinned;
762 u64 bytes_super; 770 u64 bytes_super;
763 u64 flags; 771 u64 flags;
764 u64 sectorsize; 772 u64 sectorsize;
@@ -824,6 +832,22 @@ struct btrfs_fs_info {
824 /* logical->physical extent mapping */ 832 /* logical->physical extent mapping */
825 struct btrfs_mapping_tree mapping_tree; 833 struct btrfs_mapping_tree mapping_tree;
826 834
835 /* block reservation for extent, checksum and root tree */
836 struct btrfs_block_rsv global_block_rsv;
837 /* block reservation for delay allocation */
838 struct btrfs_block_rsv delalloc_block_rsv;
839 /* block reservation for metadata operations */
840 struct btrfs_block_rsv trans_block_rsv;
841 /* block reservation for chunk tree */
842 struct btrfs_block_rsv chunk_block_rsv;
843
844 struct btrfs_block_rsv empty_block_rsv;
845
846 /* list of block reservations that cross multiple transactions */
847 struct list_head durable_block_rsv_list;
848
849 struct mutex durable_block_rsv_mutex;
850
827 u64 generation; 851 u64 generation;
828 u64 last_trans_committed; 852 u64 last_trans_committed;
829 853
@@ -834,7 +858,6 @@ struct btrfs_fs_info {
834 u64 last_trans_log_full_commit; 858 u64 last_trans_log_full_commit;
835 u64 open_ioctl_trans; 859 u64 open_ioctl_trans;
836 unsigned long mount_opt; 860 unsigned long mount_opt;
837 u64 max_extent;
838 u64 max_inline; 861 u64 max_inline;
839 u64 alloc_start; 862 u64 alloc_start;
840 struct btrfs_transaction *running_transaction; 863 struct btrfs_transaction *running_transaction;
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
927 struct btrfs_workers endio_meta_write_workers; 950 struct btrfs_workers endio_meta_write_workers;
928 struct btrfs_workers endio_write_workers; 951 struct btrfs_workers endio_write_workers;
929 struct btrfs_workers submit_workers; 952 struct btrfs_workers submit_workers;
930 struct btrfs_workers enospc_workers;
931 /* 953 /*
932 * fixup workers take dirty pages that didn't properly go through 954 * fixup workers take dirty pages that didn't properly go through
933 * the cow mechanism and make them safe to write. It happens 955 * the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
943 int do_barriers; 965 int do_barriers;
944 int closing; 966 int closing;
945 int log_root_recovering; 967 int log_root_recovering;
968 int enospc_unlink;
946 969
947 u64 total_pinned; 970 u64 total_pinned;
948 971
@@ -1012,6 +1035,9 @@ struct btrfs_root {
1012 struct completion kobj_unregister; 1035 struct completion kobj_unregister;
1013 struct mutex objectid_mutex; 1036 struct mutex objectid_mutex;
1014 1037
1038 spinlock_t accounting_lock;
1039 struct btrfs_block_rsv *block_rsv;
1040
1015 struct mutex log_mutex; 1041 struct mutex log_mutex;
1016 wait_queue_head_t log_writer_wait; 1042 wait_queue_head_t log_writer_wait;
1017 wait_queue_head_t log_commit_wait[2]; 1043 wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
1043 int ref_cows; 1069 int ref_cows;
1044 int track_dirty; 1070 int track_dirty;
1045 int in_radix; 1071 int in_radix;
1046 int clean_orphans;
1047 1072
1048 u64 defrag_trans_start; 1073 u64 defrag_trans_start;
1049 struct btrfs_key defrag_progress; 1074 struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
1057 1082
1058 struct list_head root_list; 1083 struct list_head root_list;
1059 1084
1060 spinlock_t list_lock; 1085 spinlock_t orphan_lock;
1061 struct list_head orphan_list; 1086 struct list_head orphan_list;
1087 struct btrfs_block_rsv *orphan_block_rsv;
1088 int orphan_item_inserted;
1089 int orphan_cleanup_state;
1062 1090
1063 spinlock_t inode_lock; 1091 spinlock_t inode_lock;
1064 /* red-black tree that keeps track of in-memory inodes */ 1092 /* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1965int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1993int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1966 struct btrfs_root *root, unsigned long count); 1994 struct btrfs_root *root, unsigned long count);
1967int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1995int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1996int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, u64 bytenr,
1998 u64 num_bytes, u64 *refs, u64 *flags);
1968int btrfs_pin_extent(struct btrfs_root *root, 1999int btrfs_pin_extent(struct btrfs_root *root,
1969 u64 bytenr, u64 num, int reserved); 2000 u64 bytenr, u64 num, int reserved);
1970int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 2001int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1984 u64 parent, u64 root_objectid, 2015 u64 parent, u64 root_objectid,
1985 struct btrfs_disk_key *key, int level, 2016 struct btrfs_disk_key *key, int level,
1986 u64 hint, u64 empty_size); 2017 u64 hint, u64 empty_size);
1987int btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2018void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
1988 struct btrfs_root *root, 2019 struct btrfs_root *root,
1989 u64 bytenr, u32 blocksize, 2020 struct extent_buffer *buf,
1990 u64 parent, u64 root_objectid, int level); 2021 u64 parent, int last_ref);
1991struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2022struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1992 struct btrfs_root *root, 2023 struct btrfs_root *root,
1993 u64 bytenr, u32 blocksize, 2024 u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2041 u64 size); 2072 u64 size);
2042int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2073int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2043 struct btrfs_root *root, u64 group_start); 2074 struct btrfs_root *root, u64 group_start);
2044int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
2045 struct btrfs_block_group_cache *group);
2046
2047u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2075u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2048void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode); 2076void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode);
2049void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2077void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2050 2078int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2051int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); 2079void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2052int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); 2080int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2053int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2081 struct btrfs_root *root,
2054 struct inode *inode, int num_items); 2082 int num_items, int *retries);
2055int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 2083void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2056 struct inode *inode, int num_items); 2084 struct btrfs_root *root);
2057int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2085int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
2058 u64 bytes); 2086 struct inode *inode);
2059void btrfs_free_reserved_data_space(struct btrfs_root *root, 2087void btrfs_orphan_release_metadata(struct inode *inode);
2060 struct inode *inode, u64 bytes); 2088int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
2061void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, 2089 struct btrfs_pending_snapshot *pending);
2062 u64 bytes); 2090int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2063void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2091void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2064 u64 bytes); 2092int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2093void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2094void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2095struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2096void btrfs_free_block_rsv(struct btrfs_root *root,
2097 struct btrfs_block_rsv *rsv);
2098void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2099 struct btrfs_block_rsv *rsv);
2100int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2101 struct btrfs_root *root,
2102 struct btrfs_block_rsv *block_rsv,
2103 u64 num_bytes, int *retries);
2104int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root,
2106 struct btrfs_block_rsv *block_rsv,
2107 u64 min_reserved, int min_factor);
2108int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2109 struct btrfs_block_rsv *dst_rsv,
2110 u64 num_bytes);
2111void btrfs_block_rsv_release(struct btrfs_root *root,
2112 struct btrfs_block_rsv *block_rsv,
2113 u64 num_bytes);
2114int btrfs_set_block_group_ro(struct btrfs_root *root,
2115 struct btrfs_block_group_cache *cache);
2116int btrfs_set_block_group_rw(struct btrfs_root *root,
2117 struct btrfs_block_group_cache *cache);
2065/* ctree.c */ 2118/* ctree.c */
2066int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2119int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2067 int level, int *slot); 2120 int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2152int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2205int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2153int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2206int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2154int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2207int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2155int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); 2208int btrfs_drop_snapshot(struct btrfs_root *root,
2209 struct btrfs_block_rsv *block_rsv, int update_ref);
2156int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2210int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2157 struct btrfs_root *root, 2211 struct btrfs_root *root,
2158 struct extent_buffer *node, 2212 struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
2245 struct btrfs_root *root, 2299 struct btrfs_root *root,
2246 const char *name, int name_len, 2300 const char *name, int name_len,
2247 u64 inode_objectid, u64 ref_objectid, u64 *index); 2301 u64 inode_objectid, u64 ref_objectid, u64 *index);
2302struct btrfs_inode_ref *
2303btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
2304 struct btrfs_root *root,
2305 struct btrfs_path *path,
2306 const char *name, int name_len,
2307 u64 inode_objectid, u64 ref_objectid, int mod);
2248int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 2308int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root, 2309 struct btrfs_root *root,
2250 struct btrfs_path *path, u64 objectid); 2310 struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
2257 struct btrfs_root *root, u64 bytenr, u64 len); 2317 struct btrfs_root *root, u64 bytenr, u64 len);
2258int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 2318int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
2259 struct bio *bio, u32 *dst); 2319 struct bio *bio, u32 *dst);
2320int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
2321 struct bio *bio, u64 logical_offset, u32 *dst);
2260int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 2322int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
2261 struct btrfs_root *root, 2323 struct btrfs_root *root,
2262 u64 objectid, u64 pos, 2324 u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2311 u32 min_type); 2373 u32 min_type);
2312 2374
2313int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2375int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2376int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
2314int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2377int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2315 struct extent_state **cached_state); 2378 struct extent_state **cached_state);
2316int btrfs_writepages(struct address_space *mapping, 2379int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2349int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2412int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2350int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2413int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2351void btrfs_orphan_cleanup(struct btrfs_root *root); 2414void btrfs_orphan_cleanup(struct btrfs_root *root);
2415void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2416 struct btrfs_pending_snapshot *pending,
2417 u64 *bytes_to_reserve);
2418void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2419 struct btrfs_pending_snapshot *pending);
2420void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root);
2352int btrfs_cont_expand(struct inode *inode, loff_t size); 2422int btrfs_cont_expand(struct inode *inode, loff_t size);
2353int btrfs_invalidate_inodes(struct btrfs_root *root); 2423int btrfs_invalidate_inodes(struct btrfs_root *root);
2354void btrfs_add_delayed_iput(struct inode *inode); 2424void btrfs_add_delayed_iput(struct inode *inode);
2355void btrfs_run_delayed_iputs(struct btrfs_root *root); 2425void btrfs_run_delayed_iputs(struct btrfs_root *root);
2426int btrfs_prealloc_file_range(struct inode *inode, int mode,
2427 u64 start, u64 num_bytes, u64 min_size,
2428 loff_t actual_len, u64 *alloc_hint);
2356extern const struct dentry_operations btrfs_dentry_operations; 2429extern const struct dentry_operations btrfs_dentry_operations;
2357 2430
2358/* ioctl.c */ 2431/* ioctl.c */
@@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
2361void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 2434void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2362 2435
2363/* file.c */ 2436/* file.c */
2364int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); 2437int btrfs_sync_file(struct file *file, int datasync);
2365int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2438int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2366 int skip_pinned); 2439 int skip_pinned);
2367int btrfs_check_file(struct btrfs_root *root, struct inode *inode); 2440int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
2409 struct btrfs_root *root); 2482 struct btrfs_root *root);
2410int btrfs_recover_relocation(struct btrfs_root *root); 2483int btrfs_recover_relocation(struct btrfs_root *root);
2411int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 2484int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
2485void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
2486 struct btrfs_root *root, struct extent_buffer *buf,
2487 struct extent_buffer *cow);
2488void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
2489 struct btrfs_pending_snapshot *pending,
2490 u64 *bytes_to_reserve);
2491void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
2492 struct btrfs_pending_snapshot *pending);
2412#endif 2493#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "delayed-ref.h" 23#include "delayed-ref.h"
@@ -318,107 +319,6 @@ out:
318} 319}
319 320
320/* 321/*
321 * helper function to lookup reference count and flags of extent.
322 *
323 * the head node for delayed ref is used to store the sum of all the
324 * reference count modifications queued up in the rbtree. the head
325 * node may also store the extent flags to set. This way you can check
326 * to see what the reference count and extent flags would be if all of
327 * the delayed refs are not processed.
328 */
329int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
330 struct btrfs_root *root, u64 bytenr,
331 u64 num_bytes, u64 *refs, u64 *flags)
332{
333 struct btrfs_delayed_ref_node *ref;
334 struct btrfs_delayed_ref_head *head;
335 struct btrfs_delayed_ref_root *delayed_refs;
336 struct btrfs_path *path;
337 struct btrfs_extent_item *ei;
338 struct extent_buffer *leaf;
339 struct btrfs_key key;
340 u32 item_size;
341 u64 num_refs;
342 u64 extent_flags;
343 int ret;
344
345 path = btrfs_alloc_path();
346 if (!path)
347 return -ENOMEM;
348
349 key.objectid = bytenr;
350 key.type = BTRFS_EXTENT_ITEM_KEY;
351 key.offset = num_bytes;
352 delayed_refs = &trans->transaction->delayed_refs;
353again:
354 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
355 &key, path, 0, 0);
356 if (ret < 0)
357 goto out;
358
359 if (ret == 0) {
360 leaf = path->nodes[0];
361 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
362 if (item_size >= sizeof(*ei)) {
363 ei = btrfs_item_ptr(leaf, path->slots[0],
364 struct btrfs_extent_item);
365 num_refs = btrfs_extent_refs(leaf, ei);
366 extent_flags = btrfs_extent_flags(leaf, ei);
367 } else {
368#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
369 struct btrfs_extent_item_v0 *ei0;
370 BUG_ON(item_size != sizeof(*ei0));
371 ei0 = btrfs_item_ptr(leaf, path->slots[0],
372 struct btrfs_extent_item_v0);
373 num_refs = btrfs_extent_refs_v0(leaf, ei0);
374 /* FIXME: this isn't correct for data */
375 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
376#else
377 BUG();
378#endif
379 }
380 BUG_ON(num_refs == 0);
381 } else {
382 num_refs = 0;
383 extent_flags = 0;
384 ret = 0;
385 }
386
387 spin_lock(&delayed_refs->lock);
388 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
389 if (ref) {
390 head = btrfs_delayed_node_to_head(ref);
391 if (!mutex_trylock(&head->mutex)) {
392 atomic_inc(&ref->refs);
393 spin_unlock(&delayed_refs->lock);
394
395 btrfs_release_path(root->fs_info->extent_root, path);
396
397 mutex_lock(&head->mutex);
398 mutex_unlock(&head->mutex);
399 btrfs_put_delayed_ref(ref);
400 goto again;
401 }
402 if (head->extent_op && head->extent_op->update_flags)
403 extent_flags |= head->extent_op->flags_to_set;
404 else
405 BUG_ON(num_refs == 0);
406
407 num_refs += ref->ref_mod;
408 mutex_unlock(&head->mutex);
409 }
410 WARN_ON(num_refs == 0);
411 if (refs)
412 *refs = num_refs;
413 if (flags)
414 *flags = extent_flags;
415out:
416 spin_unlock(&delayed_refs->lock);
417 btrfs_free_path(path);
418 return ret;
419}
420
421/*
422 * helper function to update an extent delayed ref in the 322 * helper function to update an extent delayed ref in the
423 * rbtree. existing and update must both have the same 323 * rbtree. existing and update must both have the same
424 * bytenr and parent 324 * bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
167struct btrfs_delayed_ref_head * 167struct btrfs_delayed_ref_head *
168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); 169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
170int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
171 struct btrfs_root *root, u64 bytenr,
172 u64 num_bytes, u64 *refs, u64 *flags);
173int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, 170int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
174 u64 bytenr, u64 num_bytes, u64 orig_parent, 171 u64 bytenr, u64 num_bytes, u64 orig_parent,
175 u64 parent, u64 orig_ref_root, u64 ref_root, 172 u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 11d0ad30e203..f3b287c22caf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h>
30#include "compat.h" 31#include "compat.h"
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
@@ -43,8 +44,6 @@ static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 44static void end_workqueue_fn(struct btrfs_work *work);
44static void free_fs_root(struct btrfs_root *root); 45static void free_fs_root(struct btrfs_root *root);
45 46
46static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
47
48/* 47/*
49 * end_io_wq structs are used to do processing in task context when an IO is 48 * end_io_wq structs are used to do processing in task context when an IO is
50 * complete. This is used during reads to verify checksums, and it is used 49 * complete. This is used during reads to verify checksums, and it is used
@@ -75,6 +74,11 @@ struct async_submit_bio {
75 int rw; 74 int rw;
76 int mirror_num; 75 int mirror_num;
77 unsigned long bio_flags; 76 unsigned long bio_flags;
77 /*
78 * bio_offset is optional, can be used if the pages in the bio
79 * can't tell us where in the file the bio should go
80 */
81 u64 bio_offset;
78 struct btrfs_work work; 82 struct btrfs_work work;
79}; 83};
80 84
@@ -535,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
 	async = container_of(work, struct async_submit_bio, work);
 	fs_info = BTRFS_I(async->inode)->root->fs_info;
 	async->submit_bio_start(async->inode, async->rw, async->bio,
-			       async->mirror_num, async->bio_flags);
+			       async->mirror_num, async->bio_flags,
+			       async->bio_offset);
 }
 
 static void run_one_async_done(struct btrfs_work *work)
@@ -557,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
 	wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_done(async->inode, async->rw, async->bio,
-			       async->mirror_num, async->bio_flags);
+			       async->mirror_num, async->bio_flags,
+			       async->bio_offset);
 }
 
 static void run_one_async_free(struct btrfs_work *work)
@@ -571,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
			int rw, struct bio *bio, int mirror_num,
			unsigned long bio_flags,
+			u64 bio_offset,
			extent_submit_bio_hook_t *submit_bio_start,
			extent_submit_bio_hook_t *submit_bio_done)
 {
@@ -593,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
 	async->work.flags = 0;
 	async->bio_flags = bio_flags;
+	async->bio_offset = bio_offset;
 
 	atomic_inc(&fs_info->nr_async_submits);
 
@@ -628,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
 
 static int __btree_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
-				    unsigned long bio_flags)
+				    unsigned long bio_flags,
+				    u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -639,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
 }
 
 static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-				   int mirror_num, unsigned long bio_flags)
+				   int mirror_num, unsigned long bio_flags,
+				   u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -649,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num, unsigned long bio_flags)
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
 {
 	int ret;
 
@@ -672,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	 */
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num, 0,
+				   bio_offset,
				   __btree_submit_bio_start,
				   __btree_submit_bio_done);
 }
@@ -895,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->ref_cows = 0;
 	root->track_dirty = 0;
 	root->in_radix = 0;
-	root->clean_orphans = 0;
+	root->orphan_item_inserted = 0;
+	root->orphan_cleanup_state = 0;
 
 	root->fs_info = fs_info;
 	root->objectid = objectid;
@@ -904,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->name = NULL;
 	root->in_sysfs = 0;
 	root->inode_tree = RB_ROOT;
+	root->block_rsv = NULL;
+	root->orphan_block_rsv = NULL;
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
 	INIT_LIST_HEAD(&root->root_list);
 	spin_lock_init(&root->node_lock);
-	spin_lock_init(&root->list_lock);
+	spin_lock_init(&root->orphan_lock);
 	spin_lock_init(&root->inode_lock);
+	spin_lock_init(&root->accounting_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
@@ -969,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	return 0;
 }
 
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info)
-{
-	struct extent_buffer *eb;
-	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
-	u64 start = 0;
-	u64 end = 0;
-	int ret;
-
-	if (!log_root_tree)
-		return 0;
-
-	while (1) {
-		ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
-				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
-		if (ret)
-			break;
-
-		clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
-				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
-	}
-	eb = fs_info->log_root_tree->node;
-
-	WARN_ON(btrfs_header_level(eb) != 0);
-	WARN_ON(btrfs_header_nritems(eb) != 0);
-
-	ret = btrfs_free_reserved_extent(fs_info->tree_root,
-					 eb->start, eb->len);
-	BUG_ON(ret);
-
-	free_extent_buffer(eb);
-	kfree(fs_info->log_root_tree);
-	fs_info->log_root_tree = NULL;
-	return 0;
-}
-
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
 {
@@ -1192,19 +1172,23 @@ again:
 	if (root)
 		return root;
 
-	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
-	if (ret == 0)
-		ret = -ENOENT;
-	if (ret < 0)
-		return ERR_PTR(ret);
-
 	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
 
-	WARN_ON(btrfs_root_refs(&root->root_item) == 0);
 	set_anon_super(&root->anon_super, NULL);
 
+	if (btrfs_root_refs(&root->root_item) == 0) {
+		ret = -ENOENT;
+		goto fail;
+	}
+
+	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+	if (ret < 0)
+		goto fail;
+	if (ret == 0)
+		root->orphan_item_inserted = 1;
+
 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 	if (ret)
 		goto fail;
@@ -1213,10 +1197,9 @@ again:
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				(unsigned long)root->root_key.objectid,
				root);
-	if (ret == 0) {
+	if (ret == 0)
 		root->in_radix = 1;
-		root->clean_orphans = 1;
-	}
+
 	spin_unlock(&fs_info->fs_roots_radix_lock);
 	radix_tree_preload_end();
 	if (ret) {
@@ -1374,19 +1357,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
-	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
-	err = bdi_init(bdi);
+	err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
 	if (err)
 		return err;
 
-	err = bdi_register(bdi, NULL, "btrfs-%d",
-			   atomic_inc_return(&btrfs_bdi_num));
-	if (err) {
-		bdi_destroy(bdi);
-		return err;
-	}
-
 	bdi->ra_pages = default_backing_dev_info.ra_pages;
 	bdi->unplug_io_fn = btrfs_unplug_io_fn;
 	bdi->unplug_io_data = info;
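For context: the hunk above folds a three-step dance (bdi_init, bdi_register with a privately numbered name, bdi_destroy on failure) into one call. bdi_setup_and_register() owns both the initialization and the unique numbering, which is why the file-local btrfs_bdi_num counter could be deleted earlier in this patch. A userspace model of why the counter moves into the helper; the "name-N" output format is an assumption about the helper's naming, and all names are illustrative:

#include <stdio.h>
#include <stdatomic.h>

/* the sequence number lives inside the helper, so no caller needs a
 * private counter of its own */
static atomic_int bdi_seq;

static int bdi_setup_and_register_model(const char *name)
{
	/* init + register in one step; a failure would unwind internally */
	printf("registered %s-%d\n", name, atomic_fetch_add(&bdi_seq, 1));
	return 0;
}

int main(void)
{
	bdi_setup_and_register_model("btrfs");   /* btrfs-0 */
	bdi_setup_and_register_model("btrfs");   /* btrfs-1 */
	return 0;
}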
@@ -1470,10 +1445,6 @@ static int cleaner_kthread(void *arg)
 	struct btrfs_root *root = arg;
 
 	do {
-		smp_mb();
-		if (root->fs_info->closing)
-			break;
-
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 
 		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1486,11 +1457,9 @@ static int cleaner_kthread(void *arg)
 		if (freezing(current)) {
 			refrigerator();
 		} else {
-			smp_mb();
-			if (root->fs_info->closing)
-				break;
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
+			if (!kthread_should_stop())
+				schedule();
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
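The replacement sleep is the canonical race-free kthread idiom: mark the task TASK_INTERRUPTIBLE first, test kthread_should_stop() second, and only then schedule(), so a kthread_stop() arriving between the test and the sleep still leaves the task runnable instead of blocked forever. A userspace model of the same lost-wakeup argument, using a mutex/condvar pair in place of the task state (a sketch, not kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool should_stop;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

static void *cleaner(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	/* re-testing the stop flag under the lock plays the role of
	 * checking kthread_should_stop() after set_current_state():
	 * a stop request cannot slip in between the test and the sleep */
	while (!atomic_load(&should_stop))
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	puts("cleaner: stop observed, exiting");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, cleaner, NULL);
	sleep(1);
	pthread_mutex_lock(&lock);               /* kthread_stop() analogue */
	atomic_store(&should_stop, 1);
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}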
@@ -1502,36 +1471,40 @@ static int transaction_kthread(void *arg)
 	struct btrfs_root *root = arg;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_transaction *cur;
+	u64 transid;
 	unsigned long now;
 	unsigned long delay;
 	int ret;
 
 	do {
-		smp_mb();
-		if (root->fs_info->closing)
-			break;
-
 		delay = HZ * 30;
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-		mutex_lock(&root->fs_info->trans_mutex);
+		spin_lock(&root->fs_info->new_trans_lock);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
-			mutex_unlock(&root->fs_info->trans_mutex);
+			spin_unlock(&root->fs_info->new_trans_lock);
 			goto sleep;
 		}
 
 		now = get_seconds();
-		if (now < cur->start_time || now - cur->start_time < 30) {
-			mutex_unlock(&root->fs_info->trans_mutex);
+		if (!cur->blocked &&
+		    (now < cur->start_time || now - cur->start_time < 30)) {
+			spin_unlock(&root->fs_info->new_trans_lock);
 			delay = HZ * 5;
 			goto sleep;
 		}
-		mutex_unlock(&root->fs_info->trans_mutex);
-		trans = btrfs_start_transaction(root, 1);
-		ret = btrfs_commit_transaction(trans, root);
+		transid = cur->transid;
+		spin_unlock(&root->fs_info->new_trans_lock);
 
+		trans = btrfs_join_transaction(root, 1);
+		if (transid == trans->transid) {
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+		} else {
+			btrfs_end_transaction(trans, root);
+		}
 sleep:
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1539,10 +1512,10 @@ sleep:
 		if (freezing(current)) {
 			refrigerator();
 		} else {
-			if (root->fs_info->closing)
-				break;
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(delay);
+			if (!kthread_should_stop() &&
+			    !btrfs_transaction_blocked(root->fs_info))
+				schedule_timeout(delay);
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
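The rewritten loop samples the running transaction's transid under new_trans_lock, drops the lock, and only then joins a transaction; it commits only when the handle it got back still belongs to the transaction it sampled, since otherwise another committer already retired it. A compact userspace model of that sample-join-compare shape (names illustrative, not the btrfs API):

#include <stdint.h>
#include <stdio.h>

struct txn { uint64_t transid; };

static struct txn running = { .transid = 42 };

static struct txn *join_transaction(void) { return &running; }

static void kick(uint64_t sampled)
{
	struct txn *t = join_transaction();

	if (t->transid == sampled)
		printf("committing transaction %llu\n",
		       (unsigned long long)t->transid);
	else
		printf("transaction %llu already committed, dropping handle\n",
		       (unsigned long long)sampled);
}

int main(void)
{
	uint64_t sampled = running.transid;  /* read under the lock */

	kick(sampled);                       /* same transid: commit */
	running.transid = 43;                /* someone else committed */
	kick(sampled);                       /* stale: just release */
	return 0;
}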
@@ -1629,12 +1602,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
+	btrfs_init_block_rsv(&fs_info->global_block_rsv);
+	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
+	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
+	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
+	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
+	mutex_init(&fs_info->durable_block_rsv_mutex);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	fs_info->sb = sb;
-	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
 
@@ -1769,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
			   min_t(u64, fs_devices->num_devices,
			   fs_info->thread_pool_size),
			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->enospc_workers, "enospc",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
 
 	/* a higher idle thresh on the submit workers makes it much more
	 * likely that bios will be send down in a sane order to the
@@ -1819,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
 	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
-	btrfs_start_workers(&fs_info->enospc_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1922,17 +1897,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	csum_root->track_dirty = 1;
 
-	btrfs_read_block_groups(extent_root);
-
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
 	fs_info->data_alloc_profile = (u64)-1;
 	fs_info->metadata_alloc_profile = (u64)-1;
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
+	ret = btrfs_read_block_groups(extent_root);
+	if (ret) {
+		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
+		goto fail_block_groups;
+	}
+
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
-		goto fail_csum_root;
+		goto fail_block_groups;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
						   tree_root,
@@ -1983,6 +1963,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	BUG_ON(ret);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
+		ret = btrfs_cleanup_fs_roots(fs_info);
+		BUG_ON(ret);
+
 		ret = btrfs_recover_relocation(tree_root);
 		if (ret < 0) {
 			printk(KERN_WARNING
@@ -2020,7 +2003,8 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
-fail_csum_root:
+fail_block_groups:
+	btrfs_free_block_groups(fs_info);
 	free_extent_buffer(csum_root->node);
 	free_extent_buffer(csum_root->commit_root);
 fail_dev_root:
@@ -2045,7 +2029,6 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->enospc_workers);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2410,11 +2393,11 @@ int btrfs_commit_super(struct btrfs_root *root)
 	down_write(&root->fs_info->cleanup_work_sem);
 	up_write(&root->fs_info->cleanup_work_sem);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
 	/* run commit again to drop the original snapshot */
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	btrfs_commit_transaction(trans, root);
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
@@ -2431,15 +2414,15 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
-	kthread_stop(root->fs_info->transaction_kthread);
-	kthread_stop(root->fs_info->cleaner_kthread);
-
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
+	kthread_stop(root->fs_info->transaction_kthread);
+	kthread_stop(root->fs_info->cleaner_kthread);
+
 	fs_info->closing = 2;
 	smp_mb();
 
@@ -2478,7 +2461,6 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->enospc_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
			int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
			int rw, struct bio *bio, int mirror_num,
-			unsigned long bio_flags,
+			unsigned long bio_flags, u64 bio_offset,
			extent_submit_bio_hook_t *submit_bio_start,
			extent_submit_bio_hook_t *submit_bio_done);
 
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1727b26fb194..b9080d71991a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -34,10 +35,9 @@
 
 static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
-			      u64 bytenr, u64 num_bytes, int alloc,
-			      int mark_free);
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
-				   u64 num_bytes, int reserve);
+			      u64 bytenr, u64 num_bytes, int alloc);
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				 u64 num_bytes, int reserve, int sinfo);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       u64 bytenr, u64 num_bytes, u64 parent,
@@ -60,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 alloc_bytes,
			  u64 flags, int force);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct btrfs_path *path,
-			  u64 bytenr, u64 num_bytes,
-			  int is_data, int reserved,
-			  struct extent_buffer **must_clean);
 static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -90,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
-	if (atomic_dec_and_test(&cache->count))
+	if (atomic_dec_and_test(&cache->count)) {
+		WARN_ON(cache->pinned > 0);
+		WARN_ON(cache->reserved > 0);
+		WARN_ON(cache->reserved_pinned > 0);
 		kfree(cache);
+	}
 }
 
 /*
@@ -318,7 +316,7 @@ static int caching_kthread(void *data)
 
 	exclude_super_stripes(extent_root, block_group);
 	spin_lock(&block_group->space_info->lock);
-	block_group->space_info->bytes_super += block_group->bytes_super;
+	block_group->space_info->bytes_readonly += block_group->bytes_super;
 	spin_unlock(&block_group->space_info->lock);
 
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -506,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 	struct list_head *head = &info->space_info;
 	struct btrfs_space_info *found;
 
+	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+		 BTRFS_BLOCK_GROUP_METADATA;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
 		if (found->flags == flags) {
@@ -609,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 }
 
 /*
+ * helper function to lookup reference count and flags of extent.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 *refs, u64 *flags)
+{
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	u32 item_size;
+	u64 num_refs;
+	u64 extent_flags;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+	if (!trans) {
+		path->skip_locking = 1;
+		path->search_commit_root = 1;
+	}
+again:
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+				&key, path, 0, 0);
+	if (ret < 0)
+		goto out_free;
+
+	if (ret == 0) {
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		if (item_size >= sizeof(*ei)) {
+			ei = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_extent_item);
+			num_refs = btrfs_extent_refs(leaf, ei);
+			extent_flags = btrfs_extent_flags(leaf, ei);
+		} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			struct btrfs_extent_item_v0 *ei0;
+			BUG_ON(item_size != sizeof(*ei0));
+			ei0 = btrfs_item_ptr(leaf, path->slots[0],
+					     struct btrfs_extent_item_v0);
+			num_refs = btrfs_extent_refs_v0(leaf, ei0);
+			/* FIXME: this isn't correct for data */
+			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+			BUG();
+#endif
+		}
+		BUG_ON(num_refs == 0);
+	} else {
+		num_refs = 0;
+		extent_flags = 0;
+		ret = 0;
+	}
+
+	if (!trans)
+		goto out;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (head) {
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&head->node.refs);
+			spin_unlock(&delayed_refs->lock);
+
+			btrfs_release_path(root->fs_info->extent_root, path);
+
+			mutex_lock(&head->mutex);
+			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(&head->node);
+			goto again;
+		}
+		if (head->extent_op && head->extent_op->update_flags)
+			extent_flags |= head->extent_op->flags_to_set;
+		else
+			BUG_ON(num_refs == 0);
+
+		num_refs += head->node.ref_mod;
+		mutex_unlock(&head->mutex);
+	}
+	spin_unlock(&delayed_refs->lock);
+out:
+	WARN_ON(num_refs == 0);
+	if (refs)
+		*refs = num_refs;
+	if (flags)
+		*flags = extent_flags;
+out_free:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
  * Back reference rules. Back refs have three main goals:
  *
  * 1) differentiate between all holders of references to an extent so that
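One detail of the function added above deserves emphasis: when mutex_trylock() on the delayed-ref head fails, it pins the head with a reference, drops the spinlock and the search path, waits by doing a blocking lock/unlock on the head's mutex, and restarts the whole lookup at the again: label. A userspace sketch of that trylock-or-wait-and-retry shape (illustrative, not the btrfs locking rules themselves):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t head_mutex = PTHREAD_MUTEX_INITIALIZER;

static int lookup(int attempt)
{
	if (pthread_mutex_trylock(&head_mutex) != 0) {
		/* contended: wait for the current holder to finish,
		 * then redo the search from scratch, as "goto again" does */
		pthread_mutex_lock(&head_mutex);
		pthread_mutex_unlock(&head_mutex);
		printf("attempt %d: contended, retrying\n", attempt);
		return lookup(attempt + 1);
	}
	printf("attempt %d: merging queued ref mods into the result\n",
	       attempt);
	pthread_mutex_unlock(&head_mutex);
	return 0;
}

int main(void)
{
	return lookup(1);
}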
@@ -1588,7 +1696,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
				u64 start, u64 len)
 {
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			     DISCARD_FL_BARRIER);
+			     BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1870,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-
 /* helper function to actually process a single delayed ref entry */
 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
@@ -1890,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 		BUG_ON(extent_op);
 		head = btrfs_delayed_node_to_head(node);
 		if (insert_reserved) {
-			int mark_free = 0;
-			struct extent_buffer *must_clean = NULL;
-
-			ret = pin_down_bytes(trans, root, NULL,
-					     node->bytenr, node->num_bytes,
-					     head->is_data, 1, &must_clean);
-			if (ret > 0)
-				mark_free = 1;
-
-			if (must_clean) {
-				clean_tree_block(NULL, root, must_clean);
-				btrfs_tree_unlock(must_clean);
-				free_extent_buffer(must_clean);
-			}
+			btrfs_pin_extent(root, node->bytenr,
+					 node->num_bytes, 1);
 			if (head->is_data) {
 				ret = btrfs_del_csums(trans, root,
						      node->bytenr,
						      node->num_bytes);
				BUG_ON(ret);
 			}
-			if (mark_free) {
-				ret = btrfs_free_reserved_extent(root,
-						node->bytenr,
-						node->num_bytes);
-				BUG_ON(ret);
-			}
 		}
 		mutex_unlock(&head->mutex);
 		return 0;
@@ -2346,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 	ret = 0;
 out:
 	btrfs_free_path(path);
+	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+		WARN_ON(ret > 0);
 	return ret;
 }
 
@@ -2659,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
			     struct btrfs_space_info **space_info)
 {
 	struct btrfs_space_info *found;
+	int i;
+	int factor;
+
+	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+		     BTRFS_BLOCK_GROUP_RAID10))
+		factor = 2;
+	else
+		factor = 1;
 
 	found = __find_space_info(info, flags);
 	if (found) {
 		spin_lock(&found->lock);
 		found->total_bytes += total_bytes;
 		found->bytes_used += bytes_used;
+		found->disk_used += bytes_used * factor;
 		found->full = 0;
 		spin_unlock(&found->lock);
 		*space_info = found;
@@ -2674,16 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (!found)
 		return -ENOMEM;
 
-	INIT_LIST_HEAD(&found->block_groups);
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+		INIT_LIST_HEAD(&found->block_groups[i]);
 	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
-	found->flags = flags;
+	found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+				BTRFS_BLOCK_GROUP_SYSTEM |
+				BTRFS_BLOCK_GROUP_METADATA);
 	found->total_bytes = total_bytes;
 	found->bytes_used = bytes_used;
+	found->disk_used = bytes_used * factor;
 	found->bytes_pinned = 0;
 	found->bytes_reserved = 0;
 	found->bytes_readonly = 0;
-	found->bytes_delalloc = 0;
+	found->bytes_may_use = 0;
 	found->full = 0;
 	found->force_alloc = 0;
 	*space_info = found;
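The factor introduced above captures on-disk amplification: DUP, RAID1 and RAID10 keep two copies of every byte, so the new disk_used counter advances at twice the rate of bytes_used, while single and RAID0 profiles use factor 1. A standalone restatement with made-up flag values:

#include <stdint.h>
#include <stdio.h>

#define BG_DUP    (1u << 0)   /* illustrative bits, not btrfs's values */
#define BG_RAID1  (1u << 1)
#define BG_RAID10 (1u << 2)

/* mirrored profiles write every byte twice */
static uint64_t disk_used(uint64_t bytes_used, unsigned flags)
{
	int factor = (flags & (BG_DUP | BG_RAID1 | BG_RAID10)) ? 2 : 1;

	return bytes_used * factor;
}

int main(void)
{
	printf("1 GiB on RAID1  -> %llu bytes on disk\n",
	       (unsigned long long)disk_used(1ULL << 30, BG_RAID1));
	printf("1 GiB on single -> %llu bytes on disk\n",
	       (unsigned long long)disk_used(1ULL << 30, 0));
	return 0;
}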
@@ -2708,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	}
 }
 
-static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
-{
-	spin_lock(&cache->space_info->lock);
-	spin_lock(&cache->lock);
-	if (!cache->ro) {
-		cache->space_info->bytes_readonly += cache->key.offset -
-					btrfs_block_group_used(&cache->item);
-		cache->ro = 1;
-	}
-	spin_unlock(&cache->lock);
-	spin_unlock(&cache->space_info->lock);
-}
-
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2749,492 +2840,49 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	struct btrfs_fs_info *info = root->fs_info;
-	u64 alloc_profile;
-
-	if (data) {
-		alloc_profile = info->avail_data_alloc_bits &
-				info->data_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-	} else if (root == root->fs_info->chunk_root) {
-		alloc_profile = info->avail_system_alloc_bits &
-				info->system_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-	} else {
-		alloc_profile = info->avail_metadata_alloc_bits &
-				info->metadata_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-	}
-
-	return btrfs_reduce_alloc_profile(root, data);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		flags |= root->fs_info->avail_data_alloc_bits &
+			 root->fs_info->data_alloc_profile;
+	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		flags |= root->fs_info->avail_system_alloc_bits &
+			 root->fs_info->system_alloc_profile;
+	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		flags |= root->fs_info->avail_metadata_alloc_bits &
+			 root->fs_info->metadata_alloc_profile;
+	return btrfs_reduce_alloc_profile(root, flags);
 }
 
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
-	u64 alloc_target;
-
-	alloc_target = btrfs_get_alloc_profile(root, 1);
-	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-						       alloc_target);
-}
-
-static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
-{
-	u64 num_bytes;
-	int level;
-
-	level = BTRFS_MAX_LEVEL - 2;
-	/*
-	 * NOTE: these calculations are absolutely the worst possible case.
-	 * This assumes that _every_ item we insert will require a new leaf, and
-	 * that the tree has grown to its maximum level size.
-	 */
-
-	/*
-	 * for every item we insert we could insert both an extent item and a
-	 * extent ref item. Then for ever item we insert, we will need to cow
-	 * both the original leaf, plus the leaf to the left and right of it.
-	 *
-	 * Unless we are talking about the extent root, then we just want the
-	 * number of items * 2, since we just need the extent item plus its ref.
-	 */
-	if (root == root->fs_info->extent_root)
-		num_bytes = num_items * 2;
-	else
-		num_bytes = (num_items + (2 * num_items)) * 3;
-
-	/*
-	 * num_bytes is total number of leaves we could need times the leaf
-	 * size, and then for every leaf we could end up cow'ing 2 nodes per
-	 * level, down to the leaf level.
-	 */
-	num_bytes = (num_bytes * root->leafsize) +
-		(num_bytes * (level * 2)) * root->nodesize;
-
-	return num_bytes;
-}
-
-/*
- * Unreserve metadata space for delalloc. If we have less reserved credits than
- * we have extents, this function does nothing.
- */
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-					  struct inode *inode, int num_items)
-{
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 alloc_target;
-	bool bug = false;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-					   num_items);
-
-	spin_lock(&meta_sinfo->lock);
-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	if (BTRFS_I(inode)->reserved_extents <=
-	    BTRFS_I(inode)->outstanding_extents) {
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		spin_unlock(&meta_sinfo->lock);
-		return 0;
-	}
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
-	BTRFS_I(inode)->reserved_extents--;
-	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
-
-	if (meta_sinfo->bytes_delalloc < num_bytes) {
-		bug = true;
-		meta_sinfo->bytes_delalloc = 0;
-	} else {
-		meta_sinfo->bytes_delalloc -= num_bytes;
-	}
-	spin_unlock(&meta_sinfo->lock);
-
-	BUG_ON(bug);
-
-	return 0;
-}
-
-static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
-{
-	u64 thresh;
-
-	thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use;
+	u64 flags;
 
-	thresh = meta_sinfo->total_bytes - thresh;
-	thresh *= 80;
-	do_div(thresh, 100);
-	if (thresh <= meta_sinfo->bytes_delalloc)
-		meta_sinfo->force_delalloc = 1;
+	if (data)
+		flags = BTRFS_BLOCK_GROUP_DATA;
+	else if (root == root->fs_info->chunk_root)
+		flags = BTRFS_BLOCK_GROUP_SYSTEM;
 	else
-		meta_sinfo->force_delalloc = 0;
-}
-
-struct async_flush {
-	struct btrfs_root *root;
-	struct btrfs_space_info *info;
-	struct btrfs_work work;
-};
-
-static noinline void flush_delalloc_async(struct btrfs_work *work)
-{
-	struct async_flush *async;
-	struct btrfs_root *root;
-	struct btrfs_space_info *info;
-
-	async = container_of(work, struct async_flush, work);
-	root = async->root;
-	info = async->info;
-
-	btrfs_start_delalloc_inodes(root, 0);
-	wake_up(&info->flush_wait);
-	btrfs_wait_ordered_extents(root, 0, 0);
-
-	spin_lock(&info->lock);
-	info->flushing = 0;
-	spin_unlock(&info->lock);
-	wake_up(&info->flush_wait);
-
-	kfree(async);
-}
-
-static void wait_on_flush(struct btrfs_space_info *info)
-{
-	DEFINE_WAIT(wait);
-	u64 used;
-
-	while (1) {
-		prepare_to_wait(&info->flush_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		spin_lock(&info->lock);
-		if (!info->flushing) {
-			spin_unlock(&info->lock);
-			break;
-		}
-
-		used = info->bytes_used + info->bytes_reserved +
-			info->bytes_pinned + info->bytes_readonly +
-			info->bytes_super + info->bytes_root +
-			info->bytes_may_use + info->bytes_delalloc;
-		if (used < info->total_bytes) {
-			spin_unlock(&info->lock);
-			break;
-		}
-		spin_unlock(&info->lock);
-		schedule();
-	}
-	finish_wait(&info->flush_wait, &wait);
-}
-
-static void flush_delalloc(struct btrfs_root *root,
-			   struct btrfs_space_info *info)
-{
-	struct async_flush *async;
-	bool wait = false;
-
-	spin_lock(&info->lock);
-
-	if (!info->flushing) {
-		info->flushing = 1;
-		init_waitqueue_head(&info->flush_wait);
-	} else {
-		wait = true;
-	}
-
-	spin_unlock(&info->lock);
-
-	if (wait) {
-		wait_on_flush(info);
-		return;
-	}
-
-	async = kzalloc(sizeof(*async), GFP_NOFS);
-	if (!async)
-		goto flush;
-
-	async->root = root;
-	async->info = info;
-	async->work.func = flush_delalloc_async;
-
-	btrfs_queue_worker(&root->fs_info->enospc_workers,
-			   &async->work);
-	wait_on_flush(info);
-	return;
-
-flush:
-	btrfs_start_delalloc_inodes(root, 0);
-	btrfs_wait_ordered_extents(root, 0, 0);
-
-	spin_lock(&info->lock);
-	info->flushing = 0;
-	spin_unlock(&info->lock);
-	wake_up(&info->flush_wait);
-}
-
-static int maybe_allocate_chunk(struct btrfs_root *root,
-				struct btrfs_space_info *info)
-{
-	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
-	struct btrfs_trans_handle *trans;
-	bool wait = false;
-	int ret = 0;
-	u64 min_metadata;
-	u64 free_space;
-
-	free_space = btrfs_super_total_bytes(disk_super);
-	/*
-	 * we allow the metadata to grow to a max of either 10gb or 5% of the
-	 * space in the volume.
-	 */
-	min_metadata = min((u64)10 * 1024 * 1024 * 1024,
-			   div64_u64(free_space * 5, 100));
-	if (info->total_bytes >= min_metadata) {
-		spin_unlock(&info->lock);
-		return 0;
-	}
-
-	if (info->full) {
-		spin_unlock(&info->lock);
-		return 0;
-	}
-
-	if (!info->allocating_chunk) {
-		info->force_alloc = 1;
-		info->allocating_chunk = 1;
-		init_waitqueue_head(&info->allocate_wait);
-	} else {
-		wait = true;
-	}
-
-	spin_unlock(&info->lock);
-
-	if (wait) {
-		wait_event(info->allocate_wait,
-			   !info->allocating_chunk);
-		return 1;
-	}
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-			     4096 + 2 * 1024 * 1024,
-			     info->flags, 0);
-	btrfs_end_transaction(trans, root);
-	if (ret)
-		goto out;
-out:
-	spin_lock(&info->lock);
-	info->allocating_chunk = 0;
-	spin_unlock(&info->lock);
-	wake_up(&info->allocate_wait);
-
-	if (ret)
-		return 0;
-	return 1;
-}
-
-/*
- * Reserve metadata space for delalloc.
- */
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-					struct inode *inode, int num_items)
-{
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 used;
-	u64 alloc_target;
-	int flushed = 0;
-	int force_delalloc;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-					   num_items);
-again:
-	spin_lock(&meta_sinfo->lock);
-
-	force_delalloc = meta_sinfo->force_delalloc;
-
-	if (unlikely(!meta_sinfo->bytes_root))
-		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
-	if (!flushed)
-		meta_sinfo->bytes_delalloc += num_bytes;
-
-	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
-	if (used > meta_sinfo->total_bytes) {
-		flushed++;
-
-		if (flushed == 1) {
-			if (maybe_allocate_chunk(root, meta_sinfo))
-				goto again;
-			flushed++;
-		} else {
-			spin_unlock(&meta_sinfo->lock);
-		}
-
-		if (flushed == 2) {
-			filemap_flush(inode->i_mapping);
-			goto again;
-		} else if (flushed == 3) {
-			flush_delalloc(root, meta_sinfo);
-			goto again;
-		}
-		spin_lock(&meta_sinfo->lock);
-		meta_sinfo->bytes_delalloc -= num_bytes;
-		spin_unlock(&meta_sinfo->lock);
-		printk(KERN_ERR "enospc, has %d, reserved %d\n",
-		       BTRFS_I(inode)->outstanding_extents,
-		       BTRFS_I(inode)->reserved_extents);
-		dump_space_info(meta_sinfo, 0, 0);
-		return -ENOSPC;
-	}
+		flags = BTRFS_BLOCK_GROUP_METADATA;
 
-	BTRFS_I(inode)->reserved_extents++;
-	check_force_delalloc(meta_sinfo);
-	spin_unlock(&meta_sinfo->lock);
-
-	if (!flushed && force_delalloc)
-		filemap_flush(inode->i_mapping);
-
-	return 0;
+	return get_alloc_profile(root, flags);
 }
 
-/*
- * unreserve num_items number of items worth of metadata space. This needs to
- * be paired with btrfs_reserve_metadata_space.
- *
- * NOTE: if you have the option, run this _AFTER_ you do a
- * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
- * oprations which will result in more used metadata, so we want to make sure we
- * can do that without issue.
- */
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
-{
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 alloc_target;
-	bool bug = false;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root, num_items);
-
-	spin_lock(&meta_sinfo->lock);
-	if (meta_sinfo->bytes_may_use < num_bytes) {
-		bug = true;
-		meta_sinfo->bytes_may_use = 0;
-	} else {
-		meta_sinfo->bytes_may_use -= num_bytes;
-	}
-	spin_unlock(&meta_sinfo->lock);
-
-	BUG_ON(bug);
-
-	return 0;
-}
-
-/*
- * Reserve some metadata space for use. We'll calculate the worste case number
- * of bytes that would be needed to modify num_items number of items. If we
- * have space, fantastic, if not, you get -ENOSPC. Please call
- * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
- * items you reserved, since whatever metadata you needed should have already
- * been allocated.
- *
- * This will commit the transaction to make more space if we don't have enough
- * metadata space. THe only time we don't do this is if we're reserving space
- * inside of a transaction, then we will just return -ENOSPC and it is the
- * callers responsibility to handle it properly.
- */
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
 {
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 used;
-	u64 alloc_target;
-	int retries = 0;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root, num_items);
-again:
-	spin_lock(&meta_sinfo->lock);
-
-	if (unlikely(!meta_sinfo->bytes_root))
-		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
-	if (!retries)
-		meta_sinfo->bytes_may_use += num_bytes;
-
-	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
-	if (used > meta_sinfo->total_bytes) {
-		retries++;
-		if (retries == 1) {
-			if (maybe_allocate_chunk(root, meta_sinfo))
-				goto again;
-			retries++;
-		} else {
-			spin_unlock(&meta_sinfo->lock);
-		}
-
-		if (retries == 2) {
-			flush_delalloc(root, meta_sinfo);
-			goto again;
-		}
-		spin_lock(&meta_sinfo->lock);
-		meta_sinfo->bytes_may_use -= num_bytes;
-		spin_unlock(&meta_sinfo->lock);
-
-		dump_space_info(meta_sinfo, 0, 0);
-		return -ENOSPC;
-	}
-
-	check_force_delalloc(meta_sinfo);
-	spin_unlock(&meta_sinfo->lock);
-
-	return 0;
+	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+						       BTRFS_BLOCK_GROUP_DATA);
 }
 
 /*
  * This will check the space that the inode allocates from to make sure we have
  * enough space for bytes.
  */
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
-				u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 {
 	struct btrfs_space_info *data_sinfo;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 used;
 	int ret = 0, committed = 0;
 
 	/* make sure bytes are sectorsize aligned */
@@ -3247,10 +2895,11 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 again:
 	/* make sure we have enough space to handle the data first */
 	spin_lock(&data_sinfo->lock);
-	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
-	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
-	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
-	    data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
+	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
+		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
+		data_sinfo->bytes_may_use;
+
+	if (used + bytes > data_sinfo->total_bytes) {
 		struct btrfs_trans_handle *trans;
 
 		/*
@@ -3264,15 +2913,15 @@ again:
 		spin_unlock(&data_sinfo->lock);
 alloc:
 		alloc_target = btrfs_get_alloc_profile(root, 1);
-		trans = btrfs_start_transaction(root, 1);
-		if (!trans)
-			return -ENOMEM;
+		trans = btrfs_join_transaction(root, 1);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
 
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
				     bytes + 2 * 1024 * 1024,
				     alloc_target, 0);
 		btrfs_end_transaction(trans, root);
-		if (ret)
+		if (ret < 0)
 			return ret;
 
 		if (!data_sinfo) {
@@ -3287,25 +2936,26 @@ alloc:
 		if (!committed && !root->fs_info->open_ioctl_trans) {
 			committed = 1;
 			trans = btrfs_join_transaction(root, 1);
-			if (!trans)
-				return -ENOMEM;
+			if (IS_ERR(trans))
+				return PTR_ERR(trans);
 			ret = btrfs_commit_transaction(trans, root);
 			if (ret)
 				return ret;
 			goto again;
 		}
 
-		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
-		       ", %llu bytes_used, %llu bytes_reserved, "
-		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
-		       "%llu total\n", (unsigned long long)bytes,
-		       (unsigned long long)data_sinfo->bytes_delalloc,
+#if 0 /* I hope we never need this code again, just in case */
+		printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
+		       "%llu bytes_reserved, " "%llu bytes_pinned, "
+		       "%llu bytes_readonly, %llu may use %llu total\n",
+		       (unsigned long long)bytes,
		       (unsigned long long)data_sinfo->bytes_used,
		       (unsigned long long)data_sinfo->bytes_reserved,
		       (unsigned long long)data_sinfo->bytes_pinned,
		       (unsigned long long)data_sinfo->bytes_readonly,
		       (unsigned long long)data_sinfo->bytes_may_use,
		       (unsigned long long)data_sinfo->total_bytes);
+#endif
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
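Taken together, the hunks above preserve btrfs_check_data_free_space()'s escalation ladder while moving it onto join_transaction: on shortage it first tries to grow the pool with a new data chunk, then commits the running transaction once to release pinned space, and only after both fail returns -ENOSPC. A stripped-down control-flow model of that ladder (the have_room() stub and its behavior are assumptions for illustration):

#include <stdio.h>

static int have_room(int step) { return step >= 2; } /* space frees up on commit */

static int check_data_free_space(void)
{
	int committed = 0;
	int step = 0;

again:
	if (!have_room(step)) {
		if (step == 0) {       /* alloc: try a new data chunk */
			step++;
			goto again;
		}
		if (!committed) {      /* commit once to reclaim pinned bytes */
			committed = 1;
			step++;
			goto again;
		}
		return -1;             /* -ENOSPC */
	}
	puts("reservation granted");
	return 0;
}

int main(void)
{
	return check_data_free_space() ? 1 : 0;
}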
@@ -3316,12 +2966,13 @@ alloc:
 }
 
 /*
- * if there was an error for whatever reason after calling
- * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ * called when we are clearing an delalloc extent from the
+ * inode's io_tree or there was an error for whatever reason
+ * after calling btrfs_check_data_free_space
  */
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
-				    struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_space_info *data_sinfo;
 
 	/* make sure bytes are sectorsize aligned */
@@ -3334,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
 	spin_unlock(&data_sinfo->lock);
 }
 
-/* called when we are adding a delalloc extent to the inode's io_tree */
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
-				  u64 bytes)
-{
-	struct btrfs_space_info *data_sinfo;
-
-	/* get the space info for where this inode will be storing its data */
-	data_sinfo = BTRFS_I(inode)->space_info;
-
-	/* make sure we have enough space to handle the data first */
-	spin_lock(&data_sinfo->lock);
-	data_sinfo->bytes_delalloc += bytes;
-
-	/*
-	 * we are adding a delalloc extent without calling
-	 * btrfs_check_data_free_space first. This happens on a weird
-	 * writepage condition, but shouldn't hurt our accounting
-	 */
-	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
-		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
-		BTRFS_I(inode)->reserved_bytes = 0;
-	} else {
-		data_sinfo->bytes_may_use -= bytes;
-		BTRFS_I(inode)->reserved_bytes -= bytes;
-	}
-
-	spin_unlock(&data_sinfo->lock);
-}
-
-/* called when we are clearing an delalloc extent from the inode's io_tree */
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
-			       u64 bytes)
-{
-	struct btrfs_space_info *info;
-
-	info = BTRFS_I(inode)->space_info;
-
-	spin_lock(&info->lock);
-	info->bytes_delalloc -= bytes;
-	spin_unlock(&info->lock);
-}
-
 static void force_metadata_allocation(struct btrfs_fs_info *info)
 {
 	struct list_head *head = &info->space_info;
@@ -3389,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3389 rcu_read_unlock(); 2998 rcu_read_unlock();
3390} 2999}
3391 3000
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3002 u64 alloc_bytes)
3003{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3005
3006 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0;
3009
3010 if (sinfo->bytes_used + sinfo->bytes_reserved +
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0;
3013
3014 return 1;
3015}
3016
3392static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3017static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3393 struct btrfs_root *extent_root, u64 alloc_bytes, 3018 struct btrfs_root *extent_root, u64 alloc_bytes,
3394 u64 flags, int force) 3019 u64 flags, int force)
3395{ 3020{
3396 struct btrfs_space_info *space_info; 3021 struct btrfs_space_info *space_info;
3397 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3022 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3398 u64 thresh;
3399 int ret = 0; 3023 int ret = 0;
3400 3024
3401 mutex_lock(&fs_info->chunk_mutex); 3025 mutex_lock(&fs_info->chunk_mutex);
@@ -3418,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3418 goto out; 3042 goto out;
3419 } 3043 }
3420 3044
3421 thresh = space_info->total_bytes - space_info->bytes_readonly; 3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3422 thresh = div_factor(thresh, 8);
3423 if (!force &&
3424 (space_info->bytes_used + space_info->bytes_pinned +
3425 space_info->bytes_reserved + alloc_bytes) < thresh) {
3426 spin_unlock(&space_info->lock); 3046 spin_unlock(&space_info->lock);
3427 goto out; 3047 goto out;
3428 } 3048 }
@@ -3444,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3444 spin_lock(&space_info->lock); 3064 spin_lock(&space_info->lock);
3445 if (ret) 3065 if (ret)
3446 space_info->full = 1; 3066 space_info->full = 1;
3067 else
3068 ret = 1;
3447 space_info->force_alloc = 0; 3069 space_info->force_alloc = 0;
3448 spin_unlock(&space_info->lock); 3070 spin_unlock(&space_info->lock);
3449out: 3071out:
@@ -3451,13 +3073,713 @@ out:
3451 return ret; 3073 return ret;
3452} 3074}
3453 3075
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/*
3109 * shrink metadata reservation for delalloc
3110 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim)
3113{
3114 struct btrfs_block_rsv *block_rsv;
3115 u64 reserved;
3116 u64 max_reclaim;
3117 u64 reclaimed = 0;
3118 int pause = 1;
3119 int ret;
3120
3121 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock);
3123 reserved = block_rsv->reserved;
3124 spin_unlock(&block_rsv->lock);
3125
3126 if (reserved == 0)
3127 return 0;
3128
3129 max_reclaim = min(reserved, to_reclaim);
3130
3131 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3133 if (!ret) {
3134 __set_current_state(TASK_INTERRUPTIBLE);
3135 schedule_timeout(pause);
3136 pause <<= 1;
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142
3143 spin_lock(&block_rsv->lock);
3144 if (reserved > block_rsv->reserved)
3145 reclaimed = reserved - block_rsv->reserved;
3146 reserved = block_rsv->reserved;
3147 spin_unlock(&block_rsv->lock);
3148
3149 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break;
3151
3152 if (trans && trans->transaction->blocked)
3153 return -EAGAIN;
3154 }
3155 return reclaimed >= to_reclaim;
3156}
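
The reclaim loop polls delalloc flushing with exponential backoff: a pass that flushes nothing doubles the sleep, capped at HZ/10 (about 100ms), and any progress resets it to one tick. The schedule in isolation, with HZ standing in for the kernel tick rate:

#include <stdio.h>

#define HZ 1000                         /* illustrative tick rate */

int main(void)
{
        int pause = 1;

        for (int i = 0; i < 10; i++) {
                int made_progress = (i == 6);   /* pretend pass 6 flushed an inode */

                if (!made_progress) {
                        printf("sleep %d ticks\n", pause);
                        pause <<= 1;            /* back off ... */
                        if (pause > HZ / 10)
                                pause = HZ / 10;        /* ... capped at ~100ms */
                } else {
                        pause = 1;              /* progress: restart the backoff */
                }
        }
        return 0;
}
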
3157
3158static int should_retry_reserve(struct btrfs_trans_handle *trans,
3159 struct btrfs_root *root,
3160 struct btrfs_block_rsv *block_rsv,
3161 u64 num_bytes, int *retries)
3162{
3163 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret;
3165
3166 if ((*retries) > 2)
3167 return -ENOSPC;
3168
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3170 if (ret)
3171 return 1;
3172
3173 if (trans && trans->transaction->in_commit)
3174 return -ENOSPC;
3175
3176 ret = shrink_delalloc(trans, root, num_bytes);
3177 if (ret)
3178 return ret;
3179
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188
3189 if (trans)
3190 return -EAGAIN;
3191
3192 trans = btrfs_join_transaction(root, 1);
3193 BUG_ON(IS_ERR(trans));
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196
3197 return 1;
3198}
3199
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3201 u64 num_bytes)
3202{
3203 struct btrfs_space_info *space_info = block_rsv->space_info;
3204 u64 unused;
3205 int ret = -ENOSPC;
3206
3207 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved +
3209 space_info->bytes_pinned + space_info->bytes_readonly;
3210
3211 if (unused < space_info->total_bytes)
3212 unused = space_info->total_bytes - unused;
3213 else
3214 unused = 0;
3215
3216 if (unused >= num_bytes) {
3217 if (block_rsv->priority >= 10) {
3218 space_info->bytes_reserved += num_bytes;
3219 ret = 0;
3220 } else {
3221 if ((unused + block_rsv->reserved) *
3222 block_rsv->priority >=
3223 (num_bytes + block_rsv->reserved) * 10) {
3224 space_info->bytes_reserved += num_bytes;
3225 ret = 0;
3226 }
3227 }
3228 }
3229 spin_unlock(&space_info->lock);
3230
3231 return ret;
3232}
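
The low-priority branch is a proportionality test: a reservation at priority p (out of 10) succeeds only while the bytes it would end up holding stay within p/10 of the bytes available to it. Two worked cases in plain C (priority 6 is the default assigned by btrfs_init_block_rsv below):

#include <stdint.h>
#include <stdio.h>

static int may_reserve(uint64_t unused, uint64_t held,
                       uint64_t request, int priority)
{
        if (unused < request)
                return 0;
        if (priority >= 10)
                return 1;               /* high priority always wins */
        return (unused + held) * priority >= (request + held) * 10;
}

int main(void)
{
        printf("%d\n", may_reserve(100, 0, 50, 6));     /* 600 >= 500 -> 1 */
        printf("%d\n", may_reserve(100, 0, 70, 6));     /* 600 <  700 -> 0 */
        return 0;
}
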
3233
3234static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3235 struct btrfs_root *root)
3236{
3237 struct btrfs_block_rsv *block_rsv;
3238 if (root->ref_cows)
3239 block_rsv = trans->block_rsv;
3240 else
3241 block_rsv = root->block_rsv;
3242
3243 if (!block_rsv)
3244 block_rsv = &root->fs_info->empty_block_rsv;
3245
3246 return block_rsv;
3247}
3248
3249static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3250 u64 num_bytes)
3251{
3252 int ret = -ENOSPC;
3253 spin_lock(&block_rsv->lock);
3254 if (block_rsv->reserved >= num_bytes) {
3255 block_rsv->reserved -= num_bytes;
3256 if (block_rsv->reserved < block_rsv->size)
3257 block_rsv->full = 0;
3258 ret = 0;
3259 }
3260 spin_unlock(&block_rsv->lock);
3261 return ret;
3262}
3263
3264static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3265 u64 num_bytes, int update_size)
3266{
3267 spin_lock(&block_rsv->lock);
3268 block_rsv->reserved += num_bytes;
3269 if (update_size)
3270 block_rsv->size += num_bytes;
3271 else if (block_rsv->reserved >= block_rsv->size)
3272 block_rsv->full = 1;
3273 spin_unlock(&block_rsv->lock);
3274}
3275
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{
3279 struct btrfs_space_info *space_info = block_rsv->space_info;
3280
3281 spin_lock(&block_rsv->lock);
3282 if (num_bytes == (u64)-1)
3283 num_bytes = block_rsv->size;
3284 block_rsv->size -= num_bytes;
3285 if (block_rsv->reserved >= block_rsv->size) {
3286 num_bytes = block_rsv->reserved - block_rsv->size;
3287 block_rsv->reserved = block_rsv->size;
3288 block_rsv->full = 1;
3289 } else {
3290 num_bytes = 0;
3291 }
3292 spin_unlock(&block_rsv->lock);
3293
3294 if (num_bytes > 0) {
3295 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0);
3297 } else {
3298 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes;
3300 spin_unlock(&space_info->lock);
3301 }
3302 }
3303}
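
Note that release is driven by size, not by the byte count passed in: shrinking size spills whatever reserved now exceeds the new target, and the spill either tops up a destination rsv or returns to the space_info. A lock-free userspace model of just the spill calculation:

#include <stdint.h>
#include <stdio.h>

struct rsv { uint64_t size, reserved; };

/* returns the bytes spilled out of 'r' after shrinking it by num_bytes */
static uint64_t release(struct rsv *r, uint64_t num_bytes)
{
        uint64_t excess = 0;

        r->size -= num_bytes;
        if (r->reserved >= r->size) {
                excess = r->reserved - r->size;
                r->reserved = r->size;
        }
        return excess;
}

int main(void)
{
        struct rsv r = { .size = 100, .reserved = 80 };

        printf("%llu\n", (unsigned long long)release(&r, 50)); /* spills 30 */
        return 0;
}
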
3304
3305static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3306 struct btrfs_block_rsv *dst, u64 num_bytes)
3307{
3308 int ret;
3309
3310 ret = block_rsv_use_bytes(src, num_bytes);
3311 if (ret)
3312 return ret;
3313
3314 block_rsv_add_bytes(dst, num_bytes, 1);
3315 return 0;
3316}
3317
3318void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3319{
3320 memset(rsv, 0, sizeof(*rsv));
3321 spin_lock_init(&rsv->lock);
3322 atomic_set(&rsv->usage, 1);
3323 rsv->priority = 6;
3324 INIT_LIST_HEAD(&rsv->list);
3325}
3326
3327struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{
3329 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv)
3335 return NULL;
3336
3337 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv;
3344}
3345
3346void btrfs_free_block_rsv(struct btrfs_root *root,
3347 struct btrfs_block_rsv *rsv)
3348{
3349 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3350 btrfs_block_rsv_release(root, rsv, (u64)-1);
3351 if (!rsv->durable)
3352 kfree(rsv);
3353 }
3354}
3355
3356/*
3357 * make the block_rsv struct able to capture freed space.
3358 * the captured space will be re-added to the block_rsv struct
3359 * after transaction commit
3360 */
3361void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3362 struct btrfs_block_rsv *block_rsv)
3363{
3364 block_rsv->durable = 1;
3365 mutex_lock(&fs_info->durable_block_rsv_mutex);
3366 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3367 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3368}
3369
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries)
3374{
3375 int ret;
3376
3377 if (num_bytes == 0)
3378 return 0;
3379again:
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3381 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0;
3384 }
3385
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret;
3391}
3392
3393int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3394 struct btrfs_root *root,
3395 struct btrfs_block_rsv *block_rsv,
3396 u64 min_reserved, int min_factor)
3397{
3398 u64 num_bytes = 0;
3399 int commit_trans = 0;
3400 int ret = -ENOSPC;
3401
3402 if (!block_rsv)
3403 return 0;
3404
3405 spin_lock(&block_rsv->lock);
3406 if (min_factor > 0)
3407 num_bytes = div_factor(block_rsv->size, min_factor);
3408 if (min_reserved > num_bytes)
3409 num_bytes = min_reserved;
3410
3411 if (block_rsv->reserved >= num_bytes) {
3412 ret = 0;
3413 } else {
3414 num_bytes -= block_rsv->reserved;
3415 if (block_rsv->durable &&
3416 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3417 commit_trans = 1;
3418 }
3419 spin_unlock(&block_rsv->lock);
3420 if (!ret)
3421 return 0;
3422
3423 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3425 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0;
3428 }
3429 }
3430
3431 if (commit_trans) {
3432 if (trans)
3433 return -EAGAIN;
3434
3435 trans = btrfs_join_transaction(root, 1);
3436 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root);
3438 return 0;
3439 }
3440
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC;
3447}
3448
3449int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3450 struct btrfs_block_rsv *dst_rsv,
3451 u64 num_bytes)
3452{
3453 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3454}
3455
3456void btrfs_block_rsv_release(struct btrfs_root *root,
3457 struct btrfs_block_rsv *block_rsv,
3458 u64 num_bytes)
3459{
3460 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3461 if (global_rsv->full || global_rsv == block_rsv ||
3462 block_rsv->space_info != global_rsv->space_info)
3463 global_rsv = NULL;
3464 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3465}
3466
3467/*
3468 * helper to calculate size of global block reservation.
3469 * the desired value is sum of space used by extent tree,
3470 * checksum tree and root tree
3471 */
3472static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3473{
3474 struct btrfs_space_info *sinfo;
3475 u64 num_bytes;
3476 u64 meta_used;
3477 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per-tree used space accounting can be inaccurate, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used;
3499 spin_unlock(&sinfo->lock);
3500
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock);
3503 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock);
3505
3506 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3507 csum_size * 2;
3508 num_bytes += div64_u64(data_used + meta_used, 50);
3509
3510 if (num_bytes * 3 > meta_used)
3511 num_bytes = div64_u64(meta_used, 3);
3512
3513 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3514}
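
Concretely, the live formula reserves room for two copies of the checksum items covering every data block, adds 2% of combined data and metadata usage, and caps the result at a third of the metadata currently in use. A worked example, assuming 4KB blocks and the 4-byte crc32c checksum (the final ALIGN() rounding is omitted):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t data_used = 100ULL << 30;      /* 100GB of data */
        uint64_t meta_used = 4ULL << 30;        /* 4GB of metadata */
        int blocksize_bits = 12;                /* 4KB blocks */
        int csum_size = 4;                      /* crc32c */

        uint64_t num_bytes = (data_used >> blocksize_bits) * csum_size * 2;
        num_bytes += (data_used + meta_used) / 50;      /* +2% */
        if (num_bytes * 3 > meta_used)
                num_bytes = meta_used / 3;              /* cap at 1/3 */

        printf("global rsv target: %llu MB\n",          /* 1365 MB here */
               (unsigned long long)(num_bytes >> 20));
        return 0;
}
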
3515
3516static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3517{
3518 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3519 struct btrfs_space_info *sinfo = block_rsv->space_info;
3520 u64 num_bytes;
3521
3522 num_bytes = calc_global_metadata_size(fs_info);
3523
3524 spin_lock(&block_rsv->lock);
3525 spin_lock(&sinfo->lock);
3526
3527 block_rsv->size = num_bytes;
3528
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly;
3531
3532 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes;
3534 block_rsv->reserved += num_bytes;
3535 sinfo->bytes_reserved += num_bytes;
3536 }
3537
3538 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes;
3541 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1;
3543 }
3544#if 0
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock);
3550}
3551
3552static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3553{
3554 struct btrfs_space_info *space_info;
3555
3556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3557 fs_info->chunk_block_rsv.space_info = space_info;
3558 fs_info->chunk_block_rsv.priority = 10;
3559
3560 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3561 fs_info->global_block_rsv.space_info = space_info;
3562 fs_info->global_block_rsv.priority = 10;
3563 fs_info->global_block_rsv.refill_used = 1;
3564 fs_info->delalloc_block_rsv.space_info = space_info;
3565 fs_info->trans_block_rsv.space_info = space_info;
3566 fs_info->empty_block_rsv.space_info = space_info;
3567 fs_info->empty_block_rsv.priority = 10;
3568
3569 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3570 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3571 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3572 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3573 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3574
3575 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3576
3577 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3578
3579 update_global_block_rsv(fs_info);
3580}
3581
3582static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3583{
3584 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3585 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3586 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3587 WARN_ON(fs_info->trans_block_rsv.size > 0);
3588 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3589 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591}
3592
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3594{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3596 3 * num_items;
3597}
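
This is a worst-case estimate: one item may force a CoW of a full path (a leaf plus a node at every remaining level) in each of up to three trees. With 4KB leaves and nodes and BTRFS_MAX_LEVEL of 8, that comes to 96KB per item:

#include <stdio.h>

#define BTRFS_MAX_LEVEL 8

int main(void)
{
        unsigned long leafsize = 4096, nodesize = 4096; /* illustrative */
        unsigned long per_item =
                (leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) * 3;

        printf("%lu KB per item\n", per_item >> 10);    /* 96 KB */
        return 0;
}
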
3598
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root,
3601 int num_items, int *retries)
3602{
3603 u64 num_bytes;
3604 int ret;
3605
3606 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0;
3608
3609 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries);
3612 if (!ret) {
3613 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv;
3615 }
3616 return ret;
3617}
3618
3619void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3620 struct btrfs_root *root)
3621{
3622 if (!trans->bytes_reserved)
3623 return;
3624
3625 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3626 btrfs_block_rsv_release(root, trans->block_rsv,
3627 trans->bytes_reserved);
3628 trans->bytes_reserved = 0;
3629}
3630
3631int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3632 struct inode *inode)
3633{
3634 struct btrfs_root *root = BTRFS_I(inode)->root;
3635 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637
3638 /*
3639 * one for deleting the orphan item, one for updating the inode and
3640 * two for calling btrfs_truncate_inode_items.
3641 *
3642 * btrfs_truncate_inode_items is a delete operation; it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit the
3646 * transaction and use the space it freed.
3647 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650}
3651
3652void btrfs_orphan_release_metadata(struct inode *inode)
3653{
3654 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657}
3658
3659int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3660 struct btrfs_pending_snapshot *pending)
3661{
3662 struct btrfs_root *root = pending->root;
3663 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3664 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3665 /*
3666 * two for root back/forward refs, two for directory entries
3667 * and one for root of the snapshot.
3668 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672}
3673
3674static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3675{
3676 return num_bytes >>= 3;
3677}
3678
3679int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3680{
3681 struct btrfs_root *root = BTRFS_I(inode)->root;
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve;
3684 int nr_extents;
3685 int retries = 0;
3686 int ret;
3687
3688 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1);
3690
3691 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again:
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3696 nr_extents -= BTRFS_I(inode)->reserved_extents;
3697 to_reserve = calc_trans_metadata_size(root, nr_extents);
3698 } else {
3699 nr_extents = 0;
3700 to_reserve = 0;
3701 }
3702
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve);
3705 if (ret) {
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret;
3712 }
3713
3714 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719
3720 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve);
3722
3723 return 0;
3724}
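
So the reservation taken here is the per-item worst case for each newly outstanding extent, plus roughly an eighth of the dirtied bytes for checksums (calc_csum_metadata_size above). A back-of-envelope check using the 96KB per-item figure from the earlier sketch:

#include <stdio.h>

int main(void)
{
        unsigned long num_bytes = 1UL << 20;    /* dirtying 1MB */
        int nr_extents = 1;                     /* one new extent */
        unsigned long per_item = 96UL << 10;    /* see earlier sketch */
        unsigned long to_reserve = nr_extents * per_item + (num_bytes >> 3);

        printf("%lu KB\n", to_reserve >> 10);   /* 96 + 128 = 224 KB */
        return 0;
}
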
3725
3726void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3727{
3728 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free;
3730 int nr_extents;
3731
3732 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3734
3735 spin_lock(&BTRFS_I(inode)->accounting_lock);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) {
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
3739 BTRFS_I(inode)->reserved_extents -= nr_extents;
3740 } else {
3741 nr_extents = 0;
3742 }
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3744
3745 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents);
3748
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free);
3751}
3752
3753int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3754{
3755 int ret;
3756
3757 ret = btrfs_check_data_free_space(inode, num_bytes);
3758 if (ret)
3759 return ret;
3760
3761 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3762 if (ret) {
3763 btrfs_free_reserved_data_space(inode, num_bytes);
3764 return ret;
3765 }
3766
3767 return 0;
3768}
3769
3770void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3771{
3772 btrfs_delalloc_release_metadata(inode, num_bytes);
3773 btrfs_free_reserved_data_space(inode, num_bytes);
3774}
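
The combined helpers are meant to bracket a buffered write: reserve data and metadata before touching pages, release both if the write fails. A hedged sketch of the expected calling pattern; write_range() and do_write() are hypothetical stand-ins, and only the reserve/release pairing mirrors the code above:

/* hypothetical caller; do_write() stands in for the page-dirtying work */
static int write_range(struct inode *inode, u64 pos, u64 len)
{
        int ret;

        ret = btrfs_delalloc_reserve_space(inode, len);
        if (ret)
                return ret;     /* e.g. -ENOSPC before any pages are dirtied */

        ret = do_write(inode, pos, len);        /* hypothetical */
        if (ret)
                btrfs_delalloc_release_space(inode, len);
        return ret;
}
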
3775
3454static int update_block_group(struct btrfs_trans_handle *trans, 3776static int update_block_group(struct btrfs_trans_handle *trans,
3455 struct btrfs_root *root, 3777 struct btrfs_root *root,
3456 u64 bytenr, u64 num_bytes, int alloc, 3778 u64 bytenr, u64 num_bytes, int alloc)
3457 int mark_free)
3458{ 3779{
3459 struct btrfs_block_group_cache *cache; 3780 struct btrfs_block_group_cache *cache;
3460 struct btrfs_fs_info *info = root->fs_info; 3781 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3461 u64 total = num_bytes; 3783 u64 total = num_bytes;
3462 u64 old_val; 3784 u64 old_val;
3463 u64 byte_in_group; 3785 u64 byte_in_group;
@@ -3476,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3476 cache = btrfs_lookup_block_group(info, bytenr); 3798 cache = btrfs_lookup_block_group(info, bytenr);
3477 if (!cache) 3799 if (!cache)
3478 return -1; 3800 return -1;
3801 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3802 BTRFS_BLOCK_GROUP_RAID1 |
3803 BTRFS_BLOCK_GROUP_RAID10))
3804 factor = 2;
3805 else
3806 factor = 1;
3479 byte_in_group = bytenr - cache->key.objectid; 3807 byte_in_group = bytenr - cache->key.objectid;
3480 WARN_ON(byte_in_group > cache->key.offset); 3808 WARN_ON(byte_in_group > cache->key.offset);
3481 3809
@@ -3488,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3488 old_val += num_bytes; 3816 old_val += num_bytes;
3489 btrfs_set_block_group_used(&cache->item, old_val); 3817 btrfs_set_block_group_used(&cache->item, old_val);
3490 cache->reserved -= num_bytes; 3818 cache->reserved -= num_bytes;
3491 cache->space_info->bytes_used += num_bytes;
3492 cache->space_info->bytes_reserved -= num_bytes; 3819 cache->space_info->bytes_reserved -= num_bytes;
3493 if (cache->ro) 3820 cache->space_info->bytes_used += num_bytes;
3494 cache->space_info->bytes_readonly -= num_bytes; 3821 cache->space_info->disk_used += num_bytes * factor;
3495 spin_unlock(&cache->lock); 3822 spin_unlock(&cache->lock);
3496 spin_unlock(&cache->space_info->lock); 3823 spin_unlock(&cache->space_info->lock);
3497 } else { 3824 } else {
3498 old_val -= num_bytes; 3825 old_val -= num_bytes;
3499 cache->space_info->bytes_used -= num_bytes;
3500 if (cache->ro)
3501 cache->space_info->bytes_readonly += num_bytes;
3502 btrfs_set_block_group_used(&cache->item, old_val); 3826 btrfs_set_block_group_used(&cache->item, old_val);
3827 cache->pinned += num_bytes;
3828 cache->space_info->bytes_pinned += num_bytes;
3829 cache->space_info->bytes_used -= num_bytes;
3830 cache->space_info->disk_used -= num_bytes * factor;
3503 spin_unlock(&cache->lock); 3831 spin_unlock(&cache->lock);
3504 spin_unlock(&cache->space_info->lock); 3832 spin_unlock(&cache->space_info->lock);
3505 if (mark_free) {
3506 int ret;
3507
3508 ret = btrfs_discard_extent(root, bytenr,
3509 num_bytes);
3510 WARN_ON(ret);
3511 3833
3512 ret = btrfs_add_free_space(cache, bytenr, 3834 set_extent_dirty(info->pinned_extents,
3513 num_bytes); 3835 bytenr, bytenr + num_bytes - 1,
3514 WARN_ON(ret); 3836 GFP_NOFS | __GFP_NOFAIL);
3515 }
3516 } 3837 }
3517 btrfs_put_block_group(cache); 3838 btrfs_put_block_group(cache);
3518 total -= num_bytes; 3839 total -= num_bytes;
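
The new disk_used counter tracks raw bytes rather than logical ones, which is why the factor computed at the top of the function doubles the accounting for mirrored profiles (DUP, RAID1, RAID10). In miniature:

#include <stdint.h>
#include <stdio.h>

struct sinfo { uint64_t bytes_used, disk_used; };

static void account_alloc(struct sinfo *s, uint64_t num_bytes, int factor)
{
        s->bytes_used += num_bytes;             /* logical bytes */
        s->disk_used += num_bytes * factor;     /* raw bytes, x2 if mirrored */
}

int main(void)
{
        struct sinfo s = { 0, 0 };

        account_alloc(&s, 1 << 20, 2);          /* 1MB extent, RAID1 group */
        printf("%llu %llu\n",                   /* 1048576 2097152 */
               (unsigned long long)s.bytes_used,
               (unsigned long long)s.disk_used);
        return 0;
}
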
@@ -3536,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3536 return bytenr; 3857 return bytenr;
3537} 3858}
3538 3859
3539/* 3860static int pin_down_extent(struct btrfs_root *root,
3540 * this function must be called within transaction 3861 struct btrfs_block_group_cache *cache,
3541 */ 3862 u64 bytenr, u64 num_bytes, int reserved)
3542int btrfs_pin_extent(struct btrfs_root *root,
3543 u64 bytenr, u64 num_bytes, int reserved)
3544{ 3863{
3545 struct btrfs_fs_info *fs_info = root->fs_info;
3546 struct btrfs_block_group_cache *cache;
3547
3548 cache = btrfs_lookup_block_group(fs_info, bytenr);
3549 BUG_ON(!cache);
3550
3551 spin_lock(&cache->space_info->lock); 3864 spin_lock(&cache->space_info->lock);
3552 spin_lock(&cache->lock); 3865 spin_lock(&cache->lock);
3553 cache->pinned += num_bytes; 3866 cache->pinned += num_bytes;
@@ -3559,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3559 spin_unlock(&cache->lock); 3872 spin_unlock(&cache->lock);
3560 spin_unlock(&cache->space_info->lock); 3873 spin_unlock(&cache->space_info->lock);
3561 3874
3562 btrfs_put_block_group(cache); 3875 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3876 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3877 return 0;
3878}
3563 3879
3564 set_extent_dirty(fs_info->pinned_extents, 3880/*
3565 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 3881 * this function must be called within transaction
3882 */
3883int btrfs_pin_extent(struct btrfs_root *root,
3884 u64 bytenr, u64 num_bytes, int reserved)
3885{
3886 struct btrfs_block_group_cache *cache;
3887
3888 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3889 BUG_ON(!cache);
3890
3891 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3892
3893 btrfs_put_block_group(cache);
3566 return 0; 3894 return 0;
3567} 3895}
3568 3896
3569static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3897/*
3570 u64 num_bytes, int reserve) 3898 * update size of reserved extents. this function may return -EAGAIN
3899 * if 'reserve' is true or 'sinfo' is false.
3900 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo)
3571{ 3903{
3572 spin_lock(&cache->space_info->lock); 3904 int ret = 0;
3573 spin_lock(&cache->lock); 3905 if (sinfo) {
3574 if (reserve) { 3906 struct btrfs_space_info *space_info = cache->space_info;
3575 cache->reserved += num_bytes; 3907 spin_lock(&space_info->lock);
3576 cache->space_info->bytes_reserved += num_bytes; 3908 spin_lock(&cache->lock);
3909 if (reserve) {
3910 if (cache->ro) {
3911 ret = -EAGAIN;
3912 } else {
3913 cache->reserved += num_bytes;
3914 space_info->bytes_reserved += num_bytes;
3915 }
3916 } else {
3917 if (cache->ro)
3918 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes;
3921 }
3922 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock);
3577 } else { 3924 } else {
3578 cache->reserved -= num_bytes; 3925 spin_lock(&cache->lock);
3579 cache->space_info->bytes_reserved -= num_bytes; 3926 if (cache->ro) {
3927 ret = -EAGAIN;
3928 } else {
3929 if (reserve)
3930 cache->reserved += num_bytes;
3931 else
3932 cache->reserved -= num_bytes;
3933 }
3934 spin_unlock(&cache->lock);
3580 } 3935 }
3581 spin_unlock(&cache->lock); 3936 return ret;
3582 spin_unlock(&cache->space_info->lock);
3583 return 0;
3584} 3937}
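
The -EAGAIN return exists because a block group can flip read-only (e.g. during relocation) between the allocator picking it and the reservation landing; the caller is expected to redo the accounting against the space_info and move on. A userspace model of just that failure path:

#include <errno.h>
#include <stdio.h>

struct group { int ro; unsigned long reserved; };

static int reserve(struct group *g, unsigned long bytes)
{
        if (g->ro)
                return -EAGAIN; /* group went read-only under us */
        g->reserved += bytes;
        return 0;
}

int main(void)
{
        struct group g = { .ro = 1 };

        printf("%d\n", reserve(&g, 4096));      /* -EAGAIN (-11 on Linux) */
        return 0;
}
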
3585 3938
3586int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3939int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3611,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3611 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3964 fs_info->pinned_extents = &fs_info->freed_extents[0];
3612 3965
3613 up_write(&fs_info->extent_commit_sem); 3966 up_write(&fs_info->extent_commit_sem);
3967
3968 update_global_block_rsv(fs_info);
3614 return 0; 3969 return 0;
3615} 3970}
3616 3971
@@ -3637,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3637 btrfs_add_free_space(cache, start, len); 3992 btrfs_add_free_space(cache, start, len);
3638 } 3993 }
3639 3994
3995 start += len;
3996
3640 spin_lock(&cache->space_info->lock); 3997 spin_lock(&cache->space_info->lock);
3641 spin_lock(&cache->lock); 3998 spin_lock(&cache->lock);
3642 cache->pinned -= len; 3999 cache->pinned -= len;
3643 cache->space_info->bytes_pinned -= len; 4000 cache->space_info->bytes_pinned -= len;
4001 if (cache->ro) {
4002 cache->space_info->bytes_readonly += len;
4003 } else if (cache->reserved_pinned > 0) {
4004 len = min(len, cache->reserved_pinned);
4005 cache->reserved_pinned -= len;
4006 cache->space_info->bytes_reserved += len;
4007 }
3644 spin_unlock(&cache->lock); 4008 spin_unlock(&cache->lock);
3645 spin_unlock(&cache->space_info->lock); 4009 spin_unlock(&cache->space_info->lock);
3646
3647 start += len;
3648 } 4010 }
3649 4011
3650 if (cache) 4012 if (cache)
@@ -3657,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3657{ 4019{
3658 struct btrfs_fs_info *fs_info = root->fs_info; 4020 struct btrfs_fs_info *fs_info = root->fs_info;
3659 struct extent_io_tree *unpin; 4021 struct extent_io_tree *unpin;
4022 struct btrfs_block_rsv *block_rsv;
4023 struct btrfs_block_rsv *next_rsv;
3660 u64 start; 4024 u64 start;
3661 u64 end; 4025 u64 end;
4026 int idx;
3662 int ret; 4027 int ret;
3663 4028
3664 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4029 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3679,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3679 cond_resched(); 4044 cond_resched();
3680 } 4045 }
3681 4046
3682 return ret; 4047 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683} 4048 list_for_each_entry_safe(block_rsv, next_rsv,
3684 4049 &fs_info->durable_block_rsv_list, list) {
3685static int pin_down_bytes(struct btrfs_trans_handle *trans,
3686 struct btrfs_root *root,
3687 struct btrfs_path *path,
3688 u64 bytenr, u64 num_bytes,
3689 int is_data, int reserved,
3690 struct extent_buffer **must_clean)
3691{
3692 int err = 0;
3693 struct extent_buffer *buf;
3694
3695 if (is_data)
3696 goto pinit;
3697
3698 /*
3699 * discard is sloooow, and so triggering discards on
3700 * individual btree blocks isn't a good plan. Just
3701 * pin everything in discard mode.
3702 */
3703 if (btrfs_test_opt(root, DISCARD))
3704 goto pinit;
3705 4050
3706 buf = btrfs_find_tree_block(root, bytenr, num_bytes); 4051 idx = trans->transid & 0x1;
3707 if (!buf) 4052 if (block_rsv->freed[idx] > 0) {
3708 goto pinit; 4053 block_rsv_add_bytes(block_rsv,
4054 block_rsv->freed[idx], 0);
4055 block_rsv->freed[idx] = 0;
4056 }
4057 if (atomic_read(&block_rsv->usage) == 0) {
4058 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3709 4059
3710 /* we can reuse a block if it hasn't been written 4060 if (block_rsv->freed[0] == 0 &&
3711 * and it is from this transaction. We can't 4061 block_rsv->freed[1] == 0) {
3712 * reuse anything from the tree log root because 4062 list_del_init(&block_rsv->list);
3713 * it has tiny sub-transactions. 4063 kfree(block_rsv);
3714 */ 4064 }
3715 if (btrfs_buffer_uptodate(buf, 0) && 4065 } else {
3716 btrfs_try_tree_lock(buf)) { 4066 btrfs_block_rsv_release(root, block_rsv, 0);
3717 u64 header_owner = btrfs_header_owner(buf);
3718 u64 header_transid = btrfs_header_generation(buf);
3719 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3720 header_transid == trans->transid &&
3721 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3722 *must_clean = buf;
3723 return 1;
3724 } 4067 }
3725 btrfs_tree_unlock(buf);
3726 } 4068 }
3727 free_extent_buffer(buf); 4069 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3728pinit:
3729 if (path)
3730 btrfs_set_path_blocking(path);
3731 /* unlocks the pinned mutex */
3732 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3733 4070
3734 BUG_ON(err < 0);
3735 return 0; 4071 return 0;
3736} 4072}
3737 4073
@@ -3892,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3892 BUG_ON(ret); 4228 BUG_ON(ret);
3893 } 4229 }
3894 } else { 4230 } else {
3895 int mark_free = 0;
3896 struct extent_buffer *must_clean = NULL;
3897
3898 if (found_extent) { 4231 if (found_extent) {
3899 BUG_ON(is_data && refs_to_drop != 4232 BUG_ON(is_data && refs_to_drop !=
3900 extent_data_ref_count(root, path, iref)); 4233 extent_data_ref_count(root, path, iref));
@@ -3907,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3907 } 4240 }
3908 } 4241 }
3909 4242
3910 ret = pin_down_bytes(trans, root, path, bytenr,
3911 num_bytes, is_data, 0, &must_clean);
3912 if (ret > 0)
3913 mark_free = 1;
3914 BUG_ON(ret < 0);
3915 /*
3916 * it is going to be very rare for someone to be waiting
3917 * on the block we're freeing. del_items might need to
3918 * schedule, so rather than get fancy, just force it
3919 * to blocking here
3920 */
3921 if (must_clean)
3922 btrfs_set_lock_blocking(must_clean);
3923
3924 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3925 num_to_del); 4244 num_to_del);
3926 BUG_ON(ret); 4245 BUG_ON(ret);
3927 btrfs_release_path(extent_root, path); 4246 btrfs_release_path(extent_root, path);
3928 4247
3929 if (must_clean) {
3930 clean_tree_block(NULL, root, must_clean);
3931 btrfs_tree_unlock(must_clean);
3932 free_extent_buffer(must_clean);
3933 }
3934
3935 if (is_data) { 4248 if (is_data) {
3936 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3937 BUG_ON(ret); 4250 BUG_ON(ret);
@@ -3941,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3941 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4254 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3942 } 4255 }
3943 4256
3944 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4257 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3945 mark_free);
3946 BUG_ON(ret); 4258 BUG_ON(ret);
3947 } 4259 }
3948 btrfs_free_path(path); 4260 btrfs_free_path(path);
@@ -3950,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3950} 4262}
3951 4263
3952/* 4264/*
3953 * when we free an extent, it is possible (and likely) that we free the last 4265 * when we free a block, it is possible (and likely) that we free the last
3954 * delayed ref for that extent as well. This searches the delayed ref tree for 4266 * delayed ref for that extent as well. This searches the delayed ref tree for
3955 * a given extent, and if there are no other delayed refs to be processed, it 4267 * a given extent, and if there are no other delayed refs to be processed, it
3956 * removes it from the tree. 4268 * removes it from the tree.
@@ -3962,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3962 struct btrfs_delayed_ref_root *delayed_refs; 4274 struct btrfs_delayed_ref_root *delayed_refs;
3963 struct btrfs_delayed_ref_node *ref; 4275 struct btrfs_delayed_ref_node *ref;
3964 struct rb_node *node; 4276 struct rb_node *node;
3965 int ret; 4277 int ret = 0;
3966 4278
3967 delayed_refs = &trans->transaction->delayed_refs; 4279 delayed_refs = &trans->transaction->delayed_refs;
3968 spin_lock(&delayed_refs->lock); 4280 spin_lock(&delayed_refs->lock);
@@ -4014,17 +4326,99 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4014 list_del_init(&head->cluster); 4326 list_del_init(&head->cluster);
4015 spin_unlock(&delayed_refs->lock); 4327 spin_unlock(&delayed_refs->lock);
4016 4328
4017 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4329 BUG_ON(head->extent_op);
4018 &head->node, head->extent_op, 4330 if (head->must_insert_reserved)
4019 head->must_insert_reserved); 4331 ret = 1;
4020 BUG_ON(ret); 4332
4333 mutex_unlock(&head->mutex);
4021 btrfs_put_delayed_ref(&head->node); 4334 btrfs_put_delayed_ref(&head->node);
4022 return 0; 4335 return ret;
4023out: 4336out:
4024 spin_unlock(&delayed_refs->lock); 4337 spin_unlock(&delayed_refs->lock);
4025 return 0; 4338 return 0;
4026} 4339}
4027 4340
4341void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4342 struct btrfs_root *root,
4343 struct extent_buffer *buf,
4344 u64 parent, int last_ref)
4345{
4346 struct btrfs_block_rsv *block_rsv;
4347 struct btrfs_block_group_cache *cache = NULL;
4348 int ret;
4349
4350 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4351 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4352 parent, root->root_key.objectid,
4353 btrfs_header_level(buf),
4354 BTRFS_DROP_DELAYED_REF, NULL);
4355 BUG_ON(ret);
4356 }
4357
4358 if (!last_ref)
4359 return;
4360
4361 block_rsv = get_block_rsv(trans, root);
4362 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4363 BUG_ON(block_rsv->space_info != cache->space_info);
4364
4365 if (btrfs_header_generation(buf) == trans->transid) {
4366 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4367 ret = check_ref_cleanup(trans, root, buf->start);
4368 if (!ret)
4369 goto pin;
4370 }
4371
4372 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4373 pin_down_extent(root, cache, buf->start, buf->len, 1);
4374 goto pin;
4375 }
4376
4377 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4378
4379 btrfs_add_free_space(cache, buf->start, buf->len);
4380 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4381 if (ret == -EAGAIN) {
4382 /* block group became read-only */
4383 update_reserved_bytes(cache, buf->len, 0, 1);
4384 goto out;
4385 }
4386
4387 ret = 1;
4388 spin_lock(&block_rsv->lock);
4389 if (block_rsv->reserved < block_rsv->size) {
4390 block_rsv->reserved += buf->len;
4391 ret = 0;
4392 }
4393 spin_unlock(&block_rsv->lock);
4394
4395 if (ret) {
4396 spin_lock(&cache->space_info->lock);
4397 cache->space_info->bytes_reserved -= buf->len;
4398 spin_unlock(&cache->space_info->lock);
4399 }
4400 goto out;
4401 }
4402pin:
4403 if (block_rsv->durable && !cache->ro) {
4404 ret = 0;
4405 spin_lock(&cache->lock);
4406 if (!cache->ro) {
4407 cache->reserved_pinned += buf->len;
4408 ret = 1;
4409 }
4410 spin_unlock(&cache->lock);
4411
4412 if (ret) {
4413 spin_lock(&block_rsv->lock);
4414 block_rsv->freed[trans->transid & 0x1] += buf->len;
4415 spin_unlock(&block_rsv->lock);
4416 }
4417 }
4418out:
4419 btrfs_put_block_group(cache);
4420}
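
The freed[] array indexed by trans->transid & 0x1 double-buffers the bytes freed in each transaction: they are parked under that transaction's parity bit and only folded back into the rsv at commit (see the btrfs_finish_extent_commit hunk above). A plain-C model of the lifecycle:

#include <stdint.h>
#include <stdio.h>

struct rsv { uint64_t reserved; uint64_t freed[2]; };

static void free_block(struct rsv *r, uint64_t transid, uint64_t len)
{
        r->freed[transid & 0x1] += len;         /* park under this transaction */
}

static void commit(struct rsv *r, uint64_t transid)
{
        int idx = transid & 0x1;

        r->reserved += r->freed[idx];           /* fold back at commit */
        r->freed[idx] = 0;
}

int main(void)
{
        struct rsv r = { 0, { 0, 0 } };

        free_block(&r, 7, 4096);
        commit(&r, 7);
        printf("%llu\n", (unsigned long long)r.reserved);       /* 4096 */
        return 0;
}
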
4421
4028int btrfs_free_extent(struct btrfs_trans_handle *trans, 4422int btrfs_free_extent(struct btrfs_trans_handle *trans,
4029 struct btrfs_root *root, 4423 struct btrfs_root *root,
4030 u64 bytenr, u64 num_bytes, u64 parent, 4424 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4046,8 +4440,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4046 parent, root_objectid, (int)owner, 4440 parent, root_objectid, (int)owner,
4047 BTRFS_DROP_DELAYED_REF, NULL); 4441 BTRFS_DROP_DELAYED_REF, NULL);
4048 BUG_ON(ret); 4442 BUG_ON(ret);
4049 ret = check_ref_cleanup(trans, root, bytenr);
4050 BUG_ON(ret);
4051 } else { 4443 } else {
4052 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4444 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4053 parent, root_objectid, owner, 4445 parent, root_objectid, owner,
@@ -4057,21 +4449,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4057 return ret; 4449 return ret;
4058} 4450}
4059 4451
4060int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4061 struct btrfs_root *root,
4062 u64 bytenr, u32 blocksize,
4063 u64 parent, u64 root_objectid, int level)
4064{
4065 u64 used;
4066 spin_lock(&root->node_lock);
4067 used = btrfs_root_used(&root->root_item) - blocksize;
4068 btrfs_set_root_used(&root->root_item, used);
4069 spin_unlock(&root->node_lock);
4070
4071 return btrfs_free_extent(trans, root, bytenr, blocksize,
4072 parent, root_objectid, level, 0);
4073}
4074
4075static u64 stripe_align(struct btrfs_root *root, u64 val) 4452static u64 stripe_align(struct btrfs_root *root, u64 val)
4076{ 4453{
4077 u64 mask = ((u64)root->stripesize - 1); 4454 u64 mask = ((u64)root->stripesize - 1);
@@ -4124,6 +4501,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4124 return 0; 4501 return 0;
4125} 4502}
4126 4503
4504static int get_block_group_index(struct btrfs_block_group_cache *cache)
4505{
4506 int index;
4507 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4508 index = 0;
4509 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4510 index = 1;
4511 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4512 index = 2;
4513 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4514 index = 3;
4515 else
4516 index = 4;
4517 return index;
4518}
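
The index orders the per-RAID block group lists from most to least redundant, so the allocator can fall back bucket by bucket (the ++index retry added to find_free_extent below). A userspace mirror of the mapping; the flag bit positions are restated here for the sketch and should be taken as illustrative:

#include <stdio.h>

#define BG_RAID0  (1UL << 3)    /* bit positions restated for the sketch */
#define BG_RAID1  (1UL << 4)
#define BG_DUP    (1UL << 5)
#define BG_RAID10 (1UL << 6)

static int get_index(unsigned long flags)
{
        if (flags & BG_RAID10)
                return 0;
        if (flags & BG_RAID1)
                return 1;
        if (flags & BG_DUP)
                return 2;
        if (flags & BG_RAID0)
                return 3;
        return 4;               /* single */
}

int main(void)
{
        printf("%d %d\n", get_index(BG_RAID1), get_index(0));   /* 1 4 */
        return 0;
}
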
4519
4127enum btrfs_loop_type { 4520enum btrfs_loop_type {
4128 LOOP_FIND_IDEAL = 0, 4521 LOOP_FIND_IDEAL = 0,
4129 LOOP_CACHING_NOWAIT = 1, 4522 LOOP_CACHING_NOWAIT = 1,
@@ -4145,7 +4538,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4145 u64 num_bytes, u64 empty_size, 4538 u64 num_bytes, u64 empty_size,
4146 u64 search_start, u64 search_end, 4539 u64 search_start, u64 search_end,
4147 u64 hint_byte, struct btrfs_key *ins, 4540 u64 hint_byte, struct btrfs_key *ins,
4148 u64 exclude_start, u64 exclude_nr,
4149 int data) 4541 int data)
4150{ 4542{
4151 int ret = 0; 4543 int ret = 0;
@@ -4158,6 +4550,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4158 struct btrfs_space_info *space_info; 4550 struct btrfs_space_info *space_info;
4159 int last_ptr_loop = 0; 4551 int last_ptr_loop = 0;
4160 int loop = 0; 4552 int loop = 0;
4553 int index = 0;
4161 bool found_uncached_bg = false; 4554 bool found_uncached_bg = false;
4162 bool failed_cluster_refill = false; 4555 bool failed_cluster_refill = false;
4163 bool failed_alloc = false; 4556 bool failed_alloc = false;
@@ -4170,6 +4563,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4170 ins->offset = 0; 4563 ins->offset = 0;
4171 4564
4172 space_info = __find_space_info(root->fs_info, data); 4565 space_info = __find_space_info(root->fs_info, data);
4566 if (!space_info) {
4567 printk(KERN_ERR "No space info for %d\n", data);
4568 return -ENOSPC;
4569 }
4173 4570
4174 if (orig_root->ref_cows || empty_size) 4571 if (orig_root->ref_cows || empty_size)
4175 allowed_chunk_alloc = 1; 4572 allowed_chunk_alloc = 1;
@@ -4223,6 +4620,7 @@ ideal_cache:
4223 btrfs_put_block_group(block_group); 4620 btrfs_put_block_group(block_group);
4224 up_read(&space_info->groups_sem); 4621 up_read(&space_info->groups_sem);
4225 } else { 4622 } else {
4623 index = get_block_group_index(block_group);
4226 goto have_block_group; 4624 goto have_block_group;
4227 } 4625 }
4228 } else if (block_group) { 4626 } else if (block_group) {
@@ -4231,7 +4629,8 @@ ideal_cache:
4231 } 4629 }
4232search: 4630search:
4233 down_read(&space_info->groups_sem); 4631 down_read(&space_info->groups_sem);
4234 list_for_each_entry(block_group, &space_info->block_groups, list) { 4632 list_for_each_entry(block_group, &space_info->block_groups[index],
4633 list) {
4235 u64 offset; 4634 u64 offset;
4236 int cached; 4635 int cached;
4237 4636
@@ -4422,23 +4821,22 @@ checks:
4422 goto loop; 4821 goto loop;
4423 } 4822 }
4424 4823
4425 if (exclude_nr > 0 && 4824 ins->objectid = search_start;
4426 (search_start + num_bytes > exclude_start && 4825 ins->offset = num_bytes;
4427 search_start < exclude_start + exclude_nr)) { 4826
4428 search_start = exclude_start + exclude_nr; 4827 if (offset < search_start)
4828 btrfs_add_free_space(block_group, offset,
4829 search_start - offset);
4830 BUG_ON(offset > search_start);
4429 4831
4832 ret = update_reserved_bytes(block_group, num_bytes, 1,
4833 (data & BTRFS_BLOCK_GROUP_DATA));
4834 if (ret == -EAGAIN) {
4430 btrfs_add_free_space(block_group, offset, num_bytes); 4835 btrfs_add_free_space(block_group, offset, num_bytes);
4431 /*
4432 * if search_start is still in this block group
4433 * then we just re-search this block group
4434 */
4435 if (search_start >= block_group->key.objectid &&
4436 search_start < (block_group->key.objectid +
4437 block_group->key.offset))
4438 goto have_block_group;
4439 goto loop; 4836 goto loop;
4440 } 4837 }
4441 4838
4839 /* we are all good, lets return */
4442 ins->objectid = search_start; 4840 ins->objectid = search_start;
4443 ins->offset = num_bytes; 4841 ins->offset = num_bytes;
4444 4842
@@ -4446,18 +4844,18 @@ checks:
4446 btrfs_add_free_space(block_group, offset, 4844 btrfs_add_free_space(block_group, offset,
4447 search_start - offset); 4845 search_start - offset);
4448 BUG_ON(offset > search_start); 4846 BUG_ON(offset > search_start);
4449
4450 update_reserved_extents(block_group, num_bytes, 1);
4451
4452 /* we are all good, lets return */
4453 break; 4847 break;
4454loop: 4848loop:
4455 failed_cluster_refill = false; 4849 failed_cluster_refill = false;
4456 failed_alloc = false; 4850 failed_alloc = false;
4851 BUG_ON(index != get_block_group_index(block_group));
4457 btrfs_put_block_group(block_group); 4852 btrfs_put_block_group(block_group);
4458 } 4853 }
4459 up_read(&space_info->groups_sem); 4854 up_read(&space_info->groups_sem);
4460 4855
4856 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4857 goto search;
4858
4461 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4859 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4462 * for them to make caching progress. Also 4860 * for them to make caching progress. Also
4463 * determine the best possible bg to cache 4861 * determine the best possible bg to cache
@@ -4471,6 +4869,7 @@ loop:
4471 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4869 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4472 (found_uncached_bg || empty_size || empty_cluster || 4870 (found_uncached_bg || empty_size || empty_cluster ||
4473 allowed_chunk_alloc)) { 4871 allowed_chunk_alloc)) {
4872 index = 0;
4474 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4873 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4475 found_uncached_bg = false; 4874 found_uncached_bg = false;
4476 loop++; 4875 loop++;
@@ -4553,31 +4952,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4553 int dump_block_groups) 4952 int dump_block_groups)
4554{ 4953{
4555 struct btrfs_block_group_cache *cache; 4954 struct btrfs_block_group_cache *cache;
4955 int index = 0;
4556 4956
4557 spin_lock(&info->lock); 4957 spin_lock(&info->lock);
4558 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4958 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4559 (unsigned long long)(info->total_bytes - info->bytes_used - 4959 (unsigned long long)(info->total_bytes - info->bytes_used -
4560 info->bytes_pinned - info->bytes_reserved - 4960 info->bytes_pinned - info->bytes_reserved -
4561 info->bytes_super), 4961 info->bytes_readonly),
4562 (info->full) ? "" : "not "); 4962 (info->full) ? "" : "not ");
4563 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4963 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4564 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4964 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4565 "\n",
4566 (unsigned long long)info->total_bytes, 4965 (unsigned long long)info->total_bytes,
4966 (unsigned long long)info->bytes_used,
4567 (unsigned long long)info->bytes_pinned, 4967 (unsigned long long)info->bytes_pinned,
4568 (unsigned long long)info->bytes_delalloc, 4968 (unsigned long long)info->bytes_reserved,
4569 (unsigned long long)info->bytes_may_use, 4969 (unsigned long long)info->bytes_may_use,
4570 (unsigned long long)info->bytes_used, 4970 (unsigned long long)info->bytes_readonly);
4571 (unsigned long long)info->bytes_root,
4572 (unsigned long long)info->bytes_super,
4573 (unsigned long long)info->bytes_reserved);
4574 spin_unlock(&info->lock); 4971 spin_unlock(&info->lock);
4575 4972
4576 if (!dump_block_groups) 4973 if (!dump_block_groups)
4577 return; 4974 return;
4578 4975
4579 down_read(&info->groups_sem); 4976 down_read(&info->groups_sem);
4580 list_for_each_entry(cache, &info->block_groups, list) { 4977again:
4978 list_for_each_entry(cache, &info->block_groups[index], list) {
4581 spin_lock(&cache->lock); 4979 spin_lock(&cache->lock);
4582 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4980 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4583 "%llu pinned %llu reserved\n", 4981 "%llu pinned %llu reserved\n",
@@ -4589,6 +4987,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4589 btrfs_dump_free_space(cache, bytes); 4987 btrfs_dump_free_space(cache, bytes);
4590 spin_unlock(&cache->lock); 4988 spin_unlock(&cache->lock);
4591 } 4989 }
4990 if (++index < BTRFS_NR_RAID_TYPES)
4991 goto again;
4592 up_read(&info->groups_sem); 4992 up_read(&info->groups_sem);
4593} 4993}
4594 4994
@@ -4614,9 +5014,8 @@ again:
4614 5014
4615 WARN_ON(num_bytes < root->sectorsize); 5015 WARN_ON(num_bytes < root->sectorsize);
4616 ret = find_free_extent(trans, root, num_bytes, empty_size, 5016 ret = find_free_extent(trans, root, num_bytes, empty_size,
4617 search_start, search_end, hint_byte, ins, 5017 search_start, search_end, hint_byte,
4618 trans->alloc_exclude_start, 5018 ins, data);
4619 trans->alloc_exclude_nr, data);
4620 5019
4621 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5020 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4622 num_bytes = num_bytes >> 1; 5021 num_bytes = num_bytes >> 1;
@@ -4654,7 +5053,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4654 ret = btrfs_discard_extent(root, start, len); 5053 ret = btrfs_discard_extent(root, start, len);
4655 5054
4656 btrfs_add_free_space(cache, start, len); 5055 btrfs_add_free_space(cache, start, len);
4657 update_reserved_extents(cache, len, 0); 5056 update_reserved_bytes(cache, len, 0, 1);
4658 btrfs_put_block_group(cache); 5057 btrfs_put_block_group(cache);
4659 5058
4660 return ret; 5059 return ret;
@@ -4717,8 +5116,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4717 btrfs_mark_buffer_dirty(path->nodes[0]); 5116 btrfs_mark_buffer_dirty(path->nodes[0]);
4718 btrfs_free_path(path); 5117 btrfs_free_path(path);
4719 5118
4720 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5119 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4721 1, 0);
4722 if (ret) { 5120 if (ret) {
4723 printk(KERN_ERR "btrfs update block group failed for %llu " 5121 printk(KERN_ERR "btrfs update block group failed for %llu "
4724 "%llu\n", (unsigned long long)ins->objectid, 5122 "%llu\n", (unsigned long long)ins->objectid,
@@ -4778,8 +5176,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4778 btrfs_mark_buffer_dirty(leaf); 5176 btrfs_mark_buffer_dirty(leaf);
4779 btrfs_free_path(path); 5177 btrfs_free_path(path);
4780 5178
4781 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5179 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4782 1, 0);
4783 if (ret) { 5180 if (ret) {
4784 printk(KERN_ERR "btrfs update block group failed for %llu " 5181 printk(KERN_ERR "btrfs update block group failed for %llu "
4785 "%llu\n", (unsigned long long)ins->objectid, 5182 "%llu\n", (unsigned long long)ins->objectid,
@@ -4855,73 +5252,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4855 put_caching_control(caching_ctl); 5252 put_caching_control(caching_ctl);
4856 } 5253 }
4857 5254
4858 update_reserved_extents(block_group, ins->offset, 1); 5255 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5256 BUG_ON(ret);
4859 btrfs_put_block_group(block_group); 5257 btrfs_put_block_group(block_group);
4860 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5258 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4861 0, owner, offset, ins, 1); 5259 0, owner, offset, ins, 1);
4862 return ret; 5260 return ret;
4863} 5261}
4864 5262
4865/*
4866 * finds a free extent and does all the dirty work required for allocation
4867 * returns the key for the extent through ins, and a tree buffer for
4868 * the first block of the extent through buf.
4869 *
4870 * returns 0 if everything worked, non-zero otherwise.
4871 */
4872static int alloc_tree_block(struct btrfs_trans_handle *trans,
4873 struct btrfs_root *root,
4874 u64 num_bytes, u64 parent, u64 root_objectid,
4875 struct btrfs_disk_key *key, int level,
4876 u64 empty_size, u64 hint_byte, u64 search_end,
4877 struct btrfs_key *ins)
4878{
4879 int ret;
4880 u64 flags = 0;
4881
4882 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4883 empty_size, hint_byte, search_end,
4884 ins, 0);
4885 if (ret)
4886 return ret;
4887
4888 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4889 if (parent == 0)
4890 parent = ins->objectid;
4891 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4892 } else
4893 BUG_ON(parent > 0);
4894
4895 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4896 struct btrfs_delayed_extent_op *extent_op;
4897 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4898 BUG_ON(!extent_op);
4899 if (key)
4900 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4901 else
4902 memset(&extent_op->key, 0, sizeof(extent_op->key));
4903 extent_op->flags_to_set = flags;
4904 extent_op->update_key = 1;
4905 extent_op->update_flags = 1;
4906 extent_op->is_data = 0;
4907
4908 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4909 ins->offset, parent, root_objectid,
4910 level, BTRFS_ADD_DELAYED_EXTENT,
4911 extent_op);
4912 BUG_ON(ret);
4913 }
4914
4915 if (root_objectid == root->root_key.objectid) {
4916 u64 used;
4917 spin_lock(&root->node_lock);
4918 used = btrfs_root_used(&root->root_item) + num_bytes;
4919 btrfs_set_root_used(&root->root_item, used);
4920 spin_unlock(&root->node_lock);
4921 }
4922 return ret;
4923}
4924
4925struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5263struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4926 struct btrfs_root *root, 5264 struct btrfs_root *root,
4927 u64 bytenr, u32 blocksize, 5265 u64 bytenr, u32 blocksize,
@@ -4960,8 +5298,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4960 return buf; 5298 return buf;
4961} 5299}
4962 5300
5301static struct btrfs_block_rsv *
5302use_block_rsv(struct btrfs_trans_handle *trans,
5303 struct btrfs_root *root, u32 blocksize)
5304{
5305 struct btrfs_block_rsv *block_rsv;
5306 int ret;
5307
5308 block_rsv = get_block_rsv(trans, root);
5309
5310 if (block_rsv->size == 0) {
5311 ret = reserve_metadata_bytes(block_rsv, blocksize);
5312 if (ret)
5313 return ERR_PTR(ret);
5314 return block_rsv;
5315 }
5316
5317 ret = block_rsv_use_bytes(block_rsv, blocksize);
5318 if (!ret)
5319 return block_rsv;
5320
5321 WARN_ON(1);
5322 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
5323 block_rsv->size, block_rsv->reserved,
5324 block_rsv->freed[0], block_rsv->freed[1]);
5325
5326 return ERR_PTR(-ENOSPC);
5327}
5328
5329static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5330{
5331 block_rsv_add_bytes(block_rsv, blocksize, 0);
5332 block_rsv_release_bytes(block_rsv, NULL, 0);
5333}
5334
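
use_block_rsv() above consumes a tree-block allocation out of an already-filled reservation and only falls back to reserving fresh metadata bytes when the rsv is still empty; unuse_block_rsv() hands the bytes back when the allocation later fails. A minimal userspace model of that reserve/consume/return accounting, with a simplified struct standing in for btrfs_block_rsv:

    #include <stdio.h>
    #include <stdint.h>
    #include <errno.h>

    /* toy stand-in for struct btrfs_block_rsv; only the fields the
     * sketch needs, not the kernel layout */
    struct rsv {
            uint64_t size;          /* bytes this rsv is sized for */
            uint64_t reserved;      /* bytes currently backing it */
    };

    /* consume n bytes from the reservation, like block_rsv_use_bytes() */
    static int rsv_use_bytes(struct rsv *r, uint64_t n)
    {
            if (r->reserved < n)
                    return -ENOSPC;
            r->reserved -= n;
            return 0;
    }

    /* give the bytes back on failure, mirroring unuse_block_rsv() */
    static void rsv_unuse_bytes(struct rsv *r, uint64_t n)
    {
            r->reserved += n;
    }

    int main(void)
    {
            struct rsv r = { .size = 16384, .reserved = 8192 };

            if (rsv_use_bytes(&r, 4096) == 0)
                    printf("block came out of the rsv, %llu left\n",
                           (unsigned long long)r.reserved);
            rsv_unuse_bytes(&r, 4096);      /* e.g. the tree insert failed */
            printf("after unuse: %llu reserved\n",
                   (unsigned long long)r.reserved);
            return 0;
    }

The kernel version additionally handles the rsv->size == 0 case by reserving on demand, and dumps the rsv counters before returning -ENOSPC.
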
4963/* 5335/*
4964 * helper function to allocate a block for a given tree 5336 * finds a free extent and does all the dirty work required for allocation
5337 * returns the key for the extent through ins, and a tree buffer for
5338 * the first block of the extent through buf.
5339 *
4965 * returns the tree buffer or NULL. 5340 * returns the tree buffer or NULL.
4966 */ 5341 */
4967struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5342struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4971,18 +5346,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4971 u64 hint, u64 empty_size) 5346 u64 hint, u64 empty_size)
4972{ 5347{
4973 struct btrfs_key ins; 5348 struct btrfs_key ins;
4974 int ret; 5349 struct btrfs_block_rsv *block_rsv;
4975 struct extent_buffer *buf; 5350 struct extent_buffer *buf;
5351 u64 flags = 0;
5352 int ret;
4976 5353
4977 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5354
4978 key, level, empty_size, hint, (u64)-1, &ins); 5355 block_rsv = use_block_rsv(trans, root, blocksize);
5356 if (IS_ERR(block_rsv))
5357 return ERR_CAST(block_rsv);
5358
5359 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5360 empty_size, hint, (u64)-1, &ins, 0);
4979 if (ret) { 5361 if (ret) {
4980 BUG_ON(ret > 0); 5362 unuse_block_rsv(block_rsv, blocksize);
4981 return ERR_PTR(ret); 5363 return ERR_PTR(ret);
4982 } 5364 }
4983 5365
4984 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5366 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4985 blocksize, level); 5367 blocksize, level);
5368 BUG_ON(IS_ERR(buf));
5369
5370 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5371 if (parent == 0)
5372 parent = ins.objectid;
5373 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5374 } else
5375 BUG_ON(parent > 0);
5376
5377 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5378 struct btrfs_delayed_extent_op *extent_op;
5379 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5380 BUG_ON(!extent_op);
5381 if (key)
5382 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5383 else
5384 memset(&extent_op->key, 0, sizeof(extent_op->key));
5385 extent_op->flags_to_set = flags;
5386 extent_op->update_key = 1;
5387 extent_op->update_flags = 1;
5388 extent_op->is_data = 0;
5389
5390 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5391 ins.offset, parent, root_objectid,
5392 level, BTRFS_ADD_DELAYED_EXTENT,
5393 extent_op);
5394 BUG_ON(ret);
5395 }
4986 return buf; 5396 return buf;
4987} 5397}
4988 5398
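
The removed alloc_tree_block() did not disappear; its body now sits inline in btrfs_alloc_free_block(): reserve the extent, initialize the buffer, then queue a delayed tree ref whose btrfs_delayed_extent_op records the key and flags to apply when the ref is run. A sketch of just the extent_op setup, using a simplified struct rather than the kernel one:

    #include <string.h>
    #include <stdlib.h>

    struct disk_key {
            unsigned long long objectid;
            unsigned char type;
            unsigned long long offset;
    };

    /* simplified mirror of the fields the hunk above fills in */
    struct extent_op {
            struct disk_key key;
            unsigned long long flags_to_set;
            int update_key;
            int update_flags;
            int is_data;
    };

    static struct extent_op *make_tree_extent_op(const struct disk_key *key,
                                                 unsigned long long flags)
    {
            struct extent_op *op = malloc(sizeof(*op));

            if (!op)
                    return NULL;
            if (key)
                    memcpy(&op->key, key, sizeof(op->key));
            else
                    memset(&op->key, 0, sizeof(op->key));
            op->flags_to_set = flags;   /* e.g. FULL_BACKREF for reloc roots */
            op->update_key = 1;         /* apply both key and flags later */
            op->update_flags = 1;
            op->is_data = 0;            /* tree block, not a data extent */
            return op;
    }

    int main(void)
    {
            struct extent_op *op = make_tree_extent_op(NULL, 0);

            free(op);
            return 0;
    }

In the kernel the op is handed to btrfs_add_delayed_tree_ref() so the actual backref insertion is deferred and batched; the sketch stops at building the op.
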
@@ -5205,6 +5615,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5205 next = btrfs_find_tree_block(root, bytenr, blocksize); 5615 next = btrfs_find_tree_block(root, bytenr, blocksize);
5206 if (!next) { 5616 if (!next) {
5207 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 5617 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5618 if (!next)
5619 return -ENOMEM;
5208 reada = 1; 5620 reada = 1;
5209 } 5621 }
5210 btrfs_tree_lock(next); 5622 btrfs_tree_lock(next);
@@ -5305,7 +5717,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5305 struct btrfs_path *path, 5717 struct btrfs_path *path,
5306 struct walk_control *wc) 5718 struct walk_control *wc)
5307{ 5719{
5308 int ret = 0; 5720 int ret;
5309 int level = wc->level; 5721 int level = wc->level;
5310 struct extent_buffer *eb = path->nodes[level]; 5722 struct extent_buffer *eb = path->nodes[level];
5311 u64 parent = 0; 5723 u64 parent = 0;
@@ -5383,13 +5795,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5383 btrfs_header_owner(path->nodes[level + 1])); 5795 btrfs_header_owner(path->nodes[level + 1]));
5384 } 5796 }
5385 5797
5386 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5798 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5387 root->root_key.objectid, level, 0);
5388 BUG_ON(ret);
5389out: 5799out:
5390 wc->refs[level] = 0; 5800 wc->refs[level] = 0;
5391 wc->flags[level] = 0; 5801 wc->flags[level] = 0;
5392 return ret; 5802 return 0;
5393} 5803}
5394 5804
5395static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5805static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5417,7 +5827,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5417 if (ret > 0) { 5827 if (ret > 0) {
5418 path->slots[level]++; 5828 path->slots[level]++;
5419 continue; 5829 continue;
5420 } 5830 } else if (ret < 0)
5831 return ret;
5421 level = wc->level; 5832 level = wc->level;
5422 } 5833 }
5423 return 0; 5834 return 0;
@@ -5466,7 +5877,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5466 * also make sure backrefs for the shared block and all lower level 5877 * also make sure backrefs for the shared block and all lower level
5467 * blocks are properly updated. 5878 * blocks are properly updated.
5468 */ 5879 */
5469int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5880int btrfs_drop_snapshot(struct btrfs_root *root,
5881 struct btrfs_block_rsv *block_rsv, int update_ref)
5470{ 5882{
5471 struct btrfs_path *path; 5883 struct btrfs_path *path;
5472 struct btrfs_trans_handle *trans; 5884 struct btrfs_trans_handle *trans;
@@ -5484,7 +5896,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5484 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5896 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5485 BUG_ON(!wc); 5897 BUG_ON(!wc);
5486 5898
5487 trans = btrfs_start_transaction(tree_root, 1); 5899 trans = btrfs_start_transaction(tree_root, 0);
5900 if (block_rsv)
5901 trans->block_rsv = block_rsv;
5488 5902
5489 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5903 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5490 level = btrfs_header_level(root->node); 5904 level = btrfs_header_level(root->node);
@@ -5572,22 +5986,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5572 } 5986 }
5573 5987
5574 BUG_ON(wc->level == 0); 5988 BUG_ON(wc->level == 0);
5575 if (trans->transaction->in_commit || 5989 if (btrfs_should_end_transaction(trans, tree_root)) {
5576 trans->transaction->delayed_refs.flushing) {
5577 ret = btrfs_update_root(trans, tree_root, 5990 ret = btrfs_update_root(trans, tree_root,
5578 &root->root_key, 5991 &root->root_key,
5579 root_item); 5992 root_item);
5580 BUG_ON(ret); 5993 BUG_ON(ret);
5581 5994
5582 btrfs_end_transaction(trans, tree_root); 5995 btrfs_end_transaction_throttle(trans, tree_root);
5583 trans = btrfs_start_transaction(tree_root, 1); 5996 trans = btrfs_start_transaction(tree_root, 0);
5584 } else { 5997 if (block_rsv)
5585 unsigned long update; 5998 trans->block_rsv = block_rsv;
5586 update = trans->delayed_ref_updates;
5587 trans->delayed_ref_updates = 0;
5588 if (update)
5589 btrfs_run_delayed_refs(trans, tree_root,
5590 update);
5591 } 5999 }
5592 } 6000 }
5593 btrfs_release_path(root, path); 6001 btrfs_release_path(root, path);
@@ -5615,7 +6023,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5615 kfree(root); 6023 kfree(root);
5616 } 6024 }
5617out: 6025out:
5618 btrfs_end_transaction(trans, tree_root); 6026 btrfs_end_transaction_throttle(trans, tree_root);
5619 kfree(wc); 6027 kfree(wc);
5620 btrfs_free_path(path); 6028 btrfs_free_path(path);
5621 return err; 6029 return err;
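
btrfs_drop_snapshot() now takes an optional block_rsv and lets btrfs_should_end_transaction() decide when to checkpoint, instead of inspecting in_commit and delayed_refs.flushing directly; each checkpoint saves drop_progress, ends the handle with the throttled variant, and starts a fresh handle with the rsv re-attached. The long-running-work shape of that loop, as a self-contained toy:

    #include <stdio.h>

    struct trans { int age; };              /* toy transaction handle */

    static struct trans *start_trans(void)
    {
            static struct trans t;
            t.age = 0;                      /* fresh handle each time */
            return &t;
    }

    static void end_trans(struct trans *t)  { (void)t; }
    static int should_end(struct trans *t)  { return t->age >= 8; } /* stand-in policy */

    int main(void)
    {
            struct trans *t = start_trans();
            int unit;

            for (unit = 0; unit < 20; unit++) {     /* 20 units of tree walk */
                    t->age++;
                    if (should_end(t)) {
                            printf("checkpoint at unit %d, restarting\n", unit);
                            end_trans(t);           /* throttled end in btrfs */
                            t = start_trans();      /* rsv re-attached here */
                    }
            }
            end_trans(t);
            return 0;
    }

Re-attaching the rsv after every restart mirrors the diff: a fresh handle does not inherit the previous one's block_rsv.
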
@@ -7211,48 +7619,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7211 return flags; 7619 return flags;
7212} 7620}
7213 7621
7214static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7622static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7215 struct btrfs_block_group_cache *shrink_block_group,
7216 int force)
7217{ 7623{
7218 struct btrfs_trans_handle *trans; 7624 struct btrfs_space_info *sinfo = cache->space_info;
7219 u64 new_alloc_flags; 7625 u64 num_bytes;
7220 u64 calc; 7626 int ret = -ENOSPC;
7221 7627
7222 spin_lock(&shrink_block_group->lock); 7628 if (cache->ro)
7223 if (btrfs_block_group_used(&shrink_block_group->item) + 7629 return 0;
7224 shrink_block_group->reserved > 0) {
7225 spin_unlock(&shrink_block_group->lock);
7226 7630
7227 trans = btrfs_start_transaction(root, 1); 7631 spin_lock(&sinfo->lock);
7228 spin_lock(&shrink_block_group->lock); 7632 spin_lock(&cache->lock);
7633 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7634 cache->bytes_super - btrfs_block_group_used(&cache->item);
7635
7636 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7637 sinfo->bytes_may_use + sinfo->bytes_readonly +
7638 cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7639 sinfo->bytes_readonly += num_bytes;
7640 sinfo->bytes_reserved += cache->reserved_pinned;
7641 cache->reserved_pinned = 0;
7642 cache->ro = 1;
7643 ret = 0;
7644 }
7645 spin_unlock(&cache->lock);
7646 spin_unlock(&sinfo->lock);
7647 return ret;
7648}
7229 7649
7230 new_alloc_flags = update_block_group_flags(root, 7650int btrfs_set_block_group_ro(struct btrfs_root *root,
7231 shrink_block_group->flags); 7651 struct btrfs_block_group_cache *cache)
7232 if (new_alloc_flags != shrink_block_group->flags) {
7233 calc =
7234 btrfs_block_group_used(&shrink_block_group->item);
7235 } else {
7236 calc = shrink_block_group->key.offset;
7237 }
7238 spin_unlock(&shrink_block_group->lock);
7239 7652
7240 do_chunk_alloc(trans, root->fs_info->extent_root, 7653{
7241 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7654 struct btrfs_trans_handle *trans;
7655 u64 alloc_flags;
7656 int ret;
7242 7657
7243 btrfs_end_transaction(trans, root); 7658 BUG_ON(cache->ro);
7244 } else
7245 spin_unlock(&shrink_block_group->lock);
7246 return 0;
7247}
7248 7659
7660 trans = btrfs_join_transaction(root, 1);
7661 BUG_ON(IS_ERR(trans));
7249 7662
7250int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7663 alloc_flags = update_block_group_flags(root, cache->flags);
7251 struct btrfs_block_group_cache *group) 7664 if (alloc_flags != cache->flags)
7665 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7666
7667 ret = set_block_group_ro(cache);
7668 if (!ret)
7669 goto out;
7670 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7671 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7672 if (ret < 0)
7673 goto out;
7674 ret = set_block_group_ro(cache);
7675out:
7676 btrfs_end_transaction(trans, root);
7677 return ret;
7678}
7252 7679
7680int btrfs_set_block_group_rw(struct btrfs_root *root,
7681 struct btrfs_block_group_cache *cache)
7253{ 7682{
7254 __alloc_chunk_for_shrink(root, group, 1); 7683 struct btrfs_space_info *sinfo = cache->space_info;
7255 set_block_group_readonly(group); 7684 u64 num_bytes;
7685
7686 BUG_ON(!cache->ro);
7687
7688 spin_lock(&sinfo->lock);
7689 spin_lock(&cache->lock);
7690 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7691 cache->bytes_super - btrfs_block_group_used(&cache->item);
7692 sinfo->bytes_readonly -= num_bytes;
7693 cache->ro = 0;
7694 spin_unlock(&cache->lock);
7695 spin_unlock(&sinfo->lock);
7256 return 0; 7696 return 0;
7257} 7697}
7258 7698
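
set_block_group_ro() flips a group read-only only when the rest of the space_info can absorb it: the group's free tail (its length minus reserved, pinned, super and used bytes) is added to the space_info totals, and the group goes ro only if everything still fits under total_bytes; btrfs_set_block_group_rw() undoes exactly that bytes_readonly accounting. The check restated as compilable arithmetic, with illustrative numbers:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* per-group numbers (bytes), picked for illustration */
            uint64_t group_len = 1 << 30, reserved = 4 << 20, pinned = 2 << 20;
            uint64_t bytes_super = 1 << 20, used = 700ULL << 20;
            uint64_t reserved_pinned = 0;

            /* space_info-wide totals */
            uint64_t si_used = 5ULL << 30, si_reserved = 64ULL << 20;
            uint64_t si_pinned = 32ULL << 20, si_may_use = 128ULL << 20;
            uint64_t si_readonly = 0, si_total = 8ULL << 30;

            /* free tail of this group that would become read-only */
            uint64_t num_bytes = group_len - reserved - pinned -
                                 bytes_super - used;

            int can_go_ro = si_used + si_reserved + si_pinned + si_may_use +
                            si_readonly + reserved_pinned + num_bytes <
                            si_total;

            printf("free tail %llu bytes, can set RO: %s\n",
                   (unsigned long long)num_bytes, can_go_ro ? "yes" : "no");
            return 0;
    }
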
@@ -7369,7 +7809,6 @@ static int find_first_block_group(struct btrfs_root *root,
7369 } 7809 }
7370 path->slots[0]++; 7810 path->slots[0]++;
7371 } 7811 }
7372 ret = -ENOENT;
7373out: 7812out:
7374 return ret; 7813 return ret;
7375} 7814}
@@ -7420,17 +7859,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7420 */ 7859 */
7421 synchronize_rcu(); 7860 synchronize_rcu();
7422 7861
7862 release_global_block_rsv(info);
7863
7423 while(!list_empty(&info->space_info)) { 7864 while(!list_empty(&info->space_info)) {
7424 space_info = list_entry(info->space_info.next, 7865 space_info = list_entry(info->space_info.next,
7425 struct btrfs_space_info, 7866 struct btrfs_space_info,
7426 list); 7867 list);
7427 7868 if (space_info->bytes_pinned > 0 ||
7869 space_info->bytes_reserved > 0) {
7870 WARN_ON(1);
7871 dump_space_info(space_info, 0, 0);
7872 }
7428 list_del(&space_info->list); 7873 list_del(&space_info->list);
7429 kfree(space_info); 7874 kfree(space_info);
7430 } 7875 }
7431 return 0; 7876 return 0;
7432} 7877}
7433 7878
7879static void __link_block_group(struct btrfs_space_info *space_info,
7880 struct btrfs_block_group_cache *cache)
7881{
7882 int index = get_block_group_index(cache);
7883
7884 down_write(&space_info->groups_sem);
7885 list_add_tail(&cache->list, &space_info->block_groups[index]);
7886 up_write(&space_info->groups_sem);
7887}
7888
7434int btrfs_read_block_groups(struct btrfs_root *root) 7889int btrfs_read_block_groups(struct btrfs_root *root)
7435{ 7890{
7436 struct btrfs_path *path; 7891 struct btrfs_path *path;
@@ -7452,10 +7907,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7452 7907
7453 while (1) { 7908 while (1) {
7454 ret = find_first_block_group(root, path, &key); 7909 ret = find_first_block_group(root, path, &key);
7455 if (ret > 0) { 7910 if (ret > 0)
7456 ret = 0; 7911 break;
7457 goto error;
7458 }
7459 if (ret != 0) 7912 if (ret != 0)
7460 goto error; 7913 goto error;
7461 7914
@@ -7464,7 +7917,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7464 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7917 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7465 if (!cache) { 7918 if (!cache) {
7466 ret = -ENOMEM; 7919 ret = -ENOMEM;
7467 break; 7920 goto error;
7468 } 7921 }
7469 7922
7470 atomic_set(&cache->count, 1); 7923 atomic_set(&cache->count, 1);
@@ -7521,20 +7974,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7521 BUG_ON(ret); 7974 BUG_ON(ret);
7522 cache->space_info = space_info; 7975 cache->space_info = space_info;
7523 spin_lock(&cache->space_info->lock); 7976 spin_lock(&cache->space_info->lock);
7524 cache->space_info->bytes_super += cache->bytes_super; 7977 cache->space_info->bytes_readonly += cache->bytes_super;
7525 spin_unlock(&cache->space_info->lock); 7978 spin_unlock(&cache->space_info->lock);
7526 7979
7527 down_write(&space_info->groups_sem); 7980 __link_block_group(space_info, cache);
7528 list_add_tail(&cache->list, &space_info->block_groups);
7529 up_write(&space_info->groups_sem);
7530 7981
7531 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7982 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7532 BUG_ON(ret); 7983 BUG_ON(ret);
7533 7984
7534 set_avail_alloc_bits(root->fs_info, cache->flags); 7985 set_avail_alloc_bits(root->fs_info, cache->flags);
7535 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7986 if (btrfs_chunk_readonly(root, cache->key.objectid))
7536 set_block_group_readonly(cache); 7987 set_block_group_ro(cache);
7537 } 7988 }
7989
7990 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7991 if (!(get_alloc_profile(root, space_info->flags) &
7992 (BTRFS_BLOCK_GROUP_RAID10 |
7993 BTRFS_BLOCK_GROUP_RAID1 |
7994 BTRFS_BLOCK_GROUP_DUP)))
7995 continue;
7996 /*
7997 * avoid allocating from un-mirrored block group if there are
7998 * mirrored block groups.
7999 */
8000 list_for_each_entry(cache, &space_info->block_groups[3], list)
8001 set_block_group_ro(cache);
8002 list_for_each_entry(cache, &space_info->block_groups[4], list)
8003 set_block_group_ro(cache);
8004 }
8005
8006 init_global_block_rsv(info);
7538 ret = 0; 8007 ret = 0;
7539error: 8008error:
7540 btrfs_free_path(path); 8009 btrfs_free_path(path);
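
block_groups is now an array of per-profile lists, filled through __link_block_group()/get_block_group_index(), and after loading all groups the code write-protects the lists at indexes 3 and 4 whenever the space_info also carries a mirrored profile (RAID10, RAID1 or DUP). Assuming the ordering raid10, raid1, dup, raid0, single for this series, those two indexes are the un-mirrored profiles; a sketch of that assumed mapping, with illustrative flag bits rather than the kernel values:

    #include <stdio.h>

    #define BG_RAID0  (1 << 3)      /* illustrative bits, not the real ones */
    #define BG_RAID1  (1 << 4)
    #define BG_DUP    (1 << 5)
    #define BG_RAID10 (1 << 6)

    /* assumed to match get_block_group_index(): mirrored profiles first */
    static int bg_index(unsigned flags)
    {
            if (flags & BG_RAID10) return 0;
            if (flags & BG_RAID1)  return 1;
            if (flags & BG_DUP)    return 2;
            if (flags & BG_RAID0)  return 3;
            return 4;               /* single */
    }

    int main(void)
    {
            printf("raid0 -> list %d, single -> list %d\n",
                   bg_index(BG_RAID0), bg_index(0));
            return 0;
    }
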
@@ -7595,12 +8064,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7595 BUG_ON(ret); 8064 BUG_ON(ret);
7596 8065
7597 spin_lock(&cache->space_info->lock); 8066 spin_lock(&cache->space_info->lock);
7598 cache->space_info->bytes_super += cache->bytes_super; 8067 cache->space_info->bytes_readonly += cache->bytes_super;
7599 spin_unlock(&cache->space_info->lock); 8068 spin_unlock(&cache->space_info->lock);
7600 8069
7601 down_write(&cache->space_info->groups_sem); 8070 __link_block_group(cache->space_info, cache);
7602 list_add_tail(&cache->list, &cache->space_info->block_groups);
7603 up_write(&cache->space_info->groups_sem);
7604 8071
7605 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8072 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7606 BUG_ON(ret); 8073 BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c99121ac5d6b..a4080c21ec55 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/bio.h> 3#include <linux/bio.h>
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h> 5#include <linux/pagemap.h>
7#include <linux/page-flags.h> 6#include <linux/page-flags.h>
8#include <linux/module.h> 7#include <linux/module.h>
@@ -136,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
136 return state; 135 return state;
137} 136}
138 137
139static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
140{ 139{
141 if (!state) 140 if (!state)
142 return; 141 return;
@@ -336,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
336} 335}
337 336
338static int set_state_cb(struct extent_io_tree *tree, 337static int set_state_cb(struct extent_io_tree *tree,
339 struct extent_state *state, 338 struct extent_state *state, int *bits)
340 unsigned long bits)
341{ 339{
342 if (tree->ops && tree->ops->set_bit_hook) { 340 if (tree->ops && tree->ops->set_bit_hook) {
343 return tree->ops->set_bit_hook(tree->mapping->host, 341 return tree->ops->set_bit_hook(tree->mapping->host,
344 state->start, state->end, 342 state, bits);
345 state->state, bits);
346 } 343 }
347 344
348 return 0; 345 return 0;
349} 346}
350 347
351static void clear_state_cb(struct extent_io_tree *tree, 348static void clear_state_cb(struct extent_io_tree *tree,
352 struct extent_state *state, 349 struct extent_state *state, int *bits)
353 unsigned long bits)
354{ 350{
355 if (tree->ops && tree->ops->clear_bit_hook) 351 if (tree->ops && tree->ops->clear_bit_hook)
356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 352 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -368,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
368 */ 364 */
369static int insert_state(struct extent_io_tree *tree, 365static int insert_state(struct extent_io_tree *tree,
370 struct extent_state *state, u64 start, u64 end, 366 struct extent_state *state, u64 start, u64 end,
371 int bits) 367 int *bits)
372{ 368{
373 struct rb_node *node; 369 struct rb_node *node;
370 int bits_to_set = *bits & ~EXTENT_CTLBITS;
374 int ret; 371 int ret;
375 372
376 if (end < start) { 373 if (end < start) {
@@ -385,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
385 if (ret) 382 if (ret)
386 return ret; 383 return ret;
387 384
388 if (bits & EXTENT_DIRTY) 385 if (bits_to_set & EXTENT_DIRTY)
389 tree->dirty_bytes += end - start + 1; 386 tree->dirty_bytes += end - start + 1;
390 state->state |= bits; 387 state->state |= bits_to_set;
391 node = tree_insert(&tree->state, end, &state->rb_node); 388 node = tree_insert(&tree->state, end, &state->rb_node);
392 if (node) { 389 if (node) {
393 struct extent_state *found; 390 struct extent_state *found;
@@ -457,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
457 * struct is freed and removed from the tree 454 * struct is freed and removed from the tree
458 */ 455 */
459static int clear_state_bit(struct extent_io_tree *tree, 456static int clear_state_bit(struct extent_io_tree *tree,
460 struct extent_state *state, int bits, int wake, 457 struct extent_state *state,
461 int delete) 458 int *bits, int wake)
462{ 459{
463 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 460 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
464 int ret = state->state & bits_to_clear; 461 int ret = state->state & bits_to_clear;
465 462
466 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 463 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
467 u64 range = state->end - state->start + 1; 464 u64 range = state->end - state->start + 1;
468 WARN_ON(range > tree->dirty_bytes); 465 WARN_ON(range > tree->dirty_bytes);
469 tree->dirty_bytes -= range; 466 tree->dirty_bytes -= range;
@@ -472,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
472 state->state &= ~bits_to_clear; 469 state->state &= ~bits_to_clear;
473 if (wake) 470 if (wake)
474 wake_up(&state->wq); 471 wake_up(&state->wq);
475 if (delete || state->state == 0) { 472 if (state->state == 0) {
476 if (state->tree) { 473 if (state->tree) {
477 clear_state_cb(tree, state, state->state);
478 rb_erase(&state->rb_node, &tree->state); 474 rb_erase(&state->rb_node, &tree->state);
479 state->tree = NULL; 475 state->tree = NULL;
480 free_extent_state(state); 476 free_extent_state(state);
@@ -515,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
515 int set = 0; 511 int set = 0;
516 int clear = 0; 512 int clear = 0;
517 513
514 if (delete)
515 bits |= ~EXTENT_CTLBITS;
516 bits |= EXTENT_FIRST_DELALLOC;
517
518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
519 clear = 1; 519 clear = 1;
520again: 520again:
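
The separate delete argument to clear_state_bit() is gone because delete can be encoded in the mask itself: `bits |= ~EXTENT_CTLBITS` asks for every real bit to be cleared, while EXTENT_DO_ACCOUNTING and the new EXTENT_FIRST_DELALLOC are control bits that set_state_bits()/clear_state_bit() strip before touching state->state (they only steer the hooks). A small demonstration of the masking, using the flag values from the extent_io.h hunk below:

    #include <stdio.h>

    #define EXTENT_DO_ACCOUNTING  (1 << 11)
    #define EXTENT_FIRST_DELALLOC (1 << 12)
    #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)

    int main(void)
    {
            int bits = 0;

            bits |= ~EXTENT_CTLBITS;        /* the "delete" case */
            bits |= EXTENT_FIRST_DELALLOC;  /* always passed to the hooks */

            /* what actually hits state->state */
            int bits_to_clear = bits & ~EXTENT_CTLBITS;

            printf("cleared mask: %#x (ctl bits filtered out)\n",
                   (unsigned)bits_to_clear);
            return 0;
    }
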
@@ -581,8 +581,7 @@ hit_next:
581 if (err) 581 if (err)
582 goto out; 582 goto out;
583 if (state->end <= end) { 583 if (state->end <= end) {
584 set |= clear_state_bit(tree, state, bits, wake, 584 set |= clear_state_bit(tree, state, &bits, wake);
585 delete);
586 if (last_end == (u64)-1) 585 if (last_end == (u64)-1)
587 goto out; 586 goto out;
588 start = last_end + 1; 587 start = last_end + 1;
@@ -603,7 +602,7 @@ hit_next:
603 if (wake) 602 if (wake)
604 wake_up(&state->wq); 603 wake_up(&state->wq);
605 604
606 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 605 set |= clear_state_bit(tree, prealloc, &bits, wake);
607 606
608 prealloc = NULL; 607 prealloc = NULL;
609 goto out; 608 goto out;
@@ -614,7 +613,7 @@ hit_next:
614 else 613 else
615 next_node = NULL; 614 next_node = NULL;
616 615
617 set |= clear_state_bit(tree, state, bits, wake, delete); 616 set |= clear_state_bit(tree, state, &bits, wake);
618 if (last_end == (u64)-1) 617 if (last_end == (u64)-1)
619 goto out; 618 goto out;
620 start = last_end + 1; 619 start = last_end + 1;
@@ -707,19 +706,19 @@ out:
707 706
708static int set_state_bits(struct extent_io_tree *tree, 707static int set_state_bits(struct extent_io_tree *tree,
709 struct extent_state *state, 708 struct extent_state *state,
710 int bits) 709 int *bits)
711{ 710{
712 int ret; 711 int ret;
712 int bits_to_set = *bits & ~EXTENT_CTLBITS;
713 713
714 ret = set_state_cb(tree, state, bits); 714 ret = set_state_cb(tree, state, bits);
715 if (ret) 715 if (ret)
716 return ret; 716 return ret;
717 717 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
718 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
719 u64 range = state->end - state->start + 1; 718 u64 range = state->end - state->start + 1;
720 tree->dirty_bytes += range; 719 tree->dirty_bytes += range;
721 } 720 }
722 state->state |= bits; 721 state->state |= bits_to_set;
723 722
724 return 0; 723 return 0;
725} 724}
@@ -746,10 +745,9 @@ static void cache_state(struct extent_state *state,
746 * [start, end] is inclusive This takes the tree lock. 745 * [start, end] is inclusive This takes the tree lock.
747 */ 746 */
748 747
749static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
750 int bits, int exclusive_bits, u64 *failed_start, 749 int bits, int exclusive_bits, u64 *failed_start,
751 struct extent_state **cached_state, 750 struct extent_state **cached_state, gfp_t mask)
752 gfp_t mask)
753{ 751{
754 struct extent_state *state; 752 struct extent_state *state;
755 struct extent_state *prealloc = NULL; 753 struct extent_state *prealloc = NULL;
@@ -758,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
758 u64 last_start; 756 u64 last_start;
759 u64 last_end; 757 u64 last_end;
760 758
759 bits |= EXTENT_FIRST_DELALLOC;
761again: 760again:
762 if (!prealloc && (mask & __GFP_WAIT)) { 761 if (!prealloc && (mask & __GFP_WAIT)) {
763 prealloc = alloc_extent_state(mask); 762 prealloc = alloc_extent_state(mask);
@@ -779,7 +778,7 @@ again:
779 */ 778 */
780 node = tree_search(tree, start); 779 node = tree_search(tree, start);
781 if (!node) { 780 if (!node) {
782 err = insert_state(tree, prealloc, start, end, bits); 781 err = insert_state(tree, prealloc, start, end, &bits);
783 prealloc = NULL; 782 prealloc = NULL;
784 BUG_ON(err == -EEXIST); 783 BUG_ON(err == -EEXIST);
785 goto out; 784 goto out;
@@ -803,7 +802,7 @@ hit_next:
803 goto out; 802 goto out;
804 } 803 }
805 804
806 err = set_state_bits(tree, state, bits); 805 err = set_state_bits(tree, state, &bits);
807 if (err) 806 if (err)
808 goto out; 807 goto out;
809 808
@@ -853,7 +852,7 @@ hit_next:
853 if (err) 852 if (err)
854 goto out; 853 goto out;
855 if (state->end <= end) { 854 if (state->end <= end) {
856 err = set_state_bits(tree, state, bits); 855 err = set_state_bits(tree, state, &bits);
857 if (err) 856 if (err)
858 goto out; 857 goto out;
859 cache_state(state, cached_state); 858 cache_state(state, cached_state);
@@ -878,7 +877,7 @@ hit_next:
878 else 877 else
879 this_end = last_start - 1; 878 this_end = last_start - 1;
880 err = insert_state(tree, prealloc, start, this_end, 879 err = insert_state(tree, prealloc, start, this_end,
881 bits); 880 &bits);
882 BUG_ON(err == -EEXIST); 881 BUG_ON(err == -EEXIST);
883 if (err) { 882 if (err) {
884 prealloc = NULL; 883 prealloc = NULL;
@@ -904,7 +903,7 @@ hit_next:
904 err = split_state(tree, state, prealloc, end + 1); 903 err = split_state(tree, state, prealloc, end + 1);
905 BUG_ON(err == -EEXIST); 904 BUG_ON(err == -EEXIST);
906 905
907 err = set_state_bits(tree, prealloc, bits); 906 err = set_state_bits(tree, prealloc, &bits);
908 if (err) { 907 if (err) {
909 prealloc = NULL; 908 prealloc = NULL;
910 goto out; 909 goto out;
@@ -967,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
967{ 966{
968 return clear_extent_bit(tree, start, end, 967 return clear_extent_bit(tree, start, end,
969 EXTENT_DIRTY | EXTENT_DELALLOC | 968 EXTENT_DIRTY | EXTENT_DELALLOC |
970 EXTENT_DO_ACCOUNTING, 0, 0, 969 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
971 NULL, mask);
972} 970}
973 971
974int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 972int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1436,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1436 if (op & EXTENT_CLEAR_DELALLOC) 1434 if (op & EXTENT_CLEAR_DELALLOC)
1437 clear_bits |= EXTENT_DELALLOC; 1435 clear_bits |= EXTENT_DELALLOC;
1438 1436
1439 if (op & EXTENT_CLEAR_ACCOUNTING)
1440 clear_bits |= EXTENT_DO_ACCOUNTING;
1441
1442 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1437 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1443 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1438 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1439 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1917,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1917 1912
1918 if (tree->ops && tree->ops->submit_bio_hook) 1913 if (tree->ops && tree->ops->submit_bio_hook)
1919 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1920 mirror_num, bio_flags); 1915 mirror_num, bio_flags, start);
1921 else 1916 else
1922 submit_bio(rw, bio); 1917 submit_bio(rw, bio);
1923 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1918 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2021,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2021 sector_t sector; 2016 sector_t sector;
2022 struct extent_map *em; 2017 struct extent_map *em;
2023 struct block_device *bdev; 2018 struct block_device *bdev;
2019 struct btrfs_ordered_extent *ordered;
2024 int ret; 2020 int ret;
2025 int nr = 0; 2021 int nr = 0;
2026 size_t page_offset = 0; 2022 size_t page_offset = 0;
@@ -2032,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2032 set_page_extent_mapped(page); 2028 set_page_extent_mapped(page);
2033 2029
2034 end = page_end; 2030 end = page_end;
2035 lock_extent(tree, start, end, GFP_NOFS); 2031 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS);
2033 ordered = btrfs_lookup_ordered_extent(inode, start);
2034 if (!ordered)
2035 break;
2036 unlock_extent(tree, start, end, GFP_NOFS);
2037 btrfs_start_ordered_extent(inode, ordered, 1);
2038 btrfs_put_ordered_extent(ordered);
2039 }
2036 2040
2037 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2041 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2038 char *userpage; 2042 char *userpage;
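
The plain lock_extent() at the top of __extent_read_full_page() turns into a lock, probe for a pending ordered extent, unlock, wait, retry loop, so a readpage never proceeds while ordered I/O is still in flight over the range. The shape of that retry loop as a self-contained sketch, with stubs where the kernel takes the extent lock and waits:

    #include <stdio.h>

    static int pending_ordered = 2;     /* pretend two ordered extents drain */

    static void lock_range(void)    { /* lock_extent() */ }
    static void unlock_range(void)  { /* unlock_extent() */ }
    static int  lookup_ordered(void) { return pending_ordered; }
    static void wait_ordered(void)   { pending_ordered--; } /* blocks in btrfs */

    int main(void)
    {
            for (;;) {
                    lock_range();
                    if (!lookup_ordered())
                            break;      /* lock held, range is quiescent */
                    unlock_range();
                    wait_ordered();     /* btrfs_start_ordered_extent(..., 1) */
                    printf("retrying after ordered extent completed\n");
            }
            /* ... read the page under the lock ... */
            unlock_range();
            return 0;
    }
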
@@ -2679,33 +2683,20 @@ int extent_readpages(struct extent_io_tree *tree,
2679{ 2683{
2680 struct bio *bio = NULL; 2684 struct bio *bio = NULL;
2681 unsigned page_idx; 2685 unsigned page_idx;
2682 struct pagevec pvec;
2683 unsigned long bio_flags = 0; 2686 unsigned long bio_flags = 0;
2684 2687
2685 pagevec_init(&pvec, 0);
2686 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2688 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2687 struct page *page = list_entry(pages->prev, struct page, lru); 2689 struct page *page = list_entry(pages->prev, struct page, lru);
2688 2690
2689 prefetchw(&page->flags); 2691 prefetchw(&page->flags);
2690 list_del(&page->lru); 2692 list_del(&page->lru);
2691 /* 2693 if (!add_to_page_cache_lru(page, mapping,
2692 * what we want to do here is call add_to_page_cache_lru,
2693 * but that isn't exported, so we reproduce it here
2694 */
2695 if (!add_to_page_cache(page, mapping,
2696 page->index, GFP_KERNEL)) { 2694 page->index, GFP_KERNEL)) {
2697
2698 /* open coding of lru_cache_add, also not exported */
2699 page_cache_get(page);
2700 if (!pagevec_add(&pvec, page))
2701 __pagevec_lru_add_file(&pvec);
2702 __extent_read_full_page(tree, page, get_extent, 2695 __extent_read_full_page(tree, page, get_extent,
2703 &bio, 0, &bio_flags); 2696 &bio, 0, &bio_flags);
2704 } 2697 }
2705 page_cache_release(page); 2698 page_cache_release(page);
2706 } 2699 }
2707 if (pagevec_count(&pvec))
2708 __pagevec_lru_add_file(&pvec);
2709 BUG_ON(!list_empty(pages)); 2700 BUG_ON(!list_empty(pages));
2710 if (bio) 2701 if (bio)
2711 submit_one_bio(READ, bio, 0, bio_flags); 2702 submit_one_bio(READ, bio, 0, bio_flags);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
20 22
21/* flags for bio submission */ 23/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1 24#define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;
47 49
48typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 50typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
49 struct bio *bio, int mirror_num, 51 struct bio *bio, int mirror_num,
50 unsigned long bio_flags); 52 unsigned long bio_flags, u64 bio_offset);
51struct extent_io_ops { 53struct extent_io_ops {
52 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 54 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
53 u64 start, u64 end, int *page_started, 55 u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
69 struct extent_state *state); 71 struct extent_state *state);
70 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 72 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
71 struct extent_state *state, int uptodate); 73 struct extent_state *state, int uptodate);
72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
73 unsigned long old, unsigned long bits); 75 int *bits);
74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 76 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
75 unsigned long bits); 77 int *bits);
76 int (*merge_extent_hook)(struct inode *inode, 78 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new, 79 struct extent_state *new,
78 struct extent_state *other); 80 struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
176 u64 *start, u64 search_end, 178 u64 *start, u64 search_end,
177 u64 max_bytes, unsigned long bits); 179 u64 max_bytes, unsigned long bits);
178 180
181void free_extent_state(struct extent_state *state);
179int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
180 int bits, int filled, struct extent_state *cached_state); 183 int bits, int filled, struct extent_state *cached_state);
181int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 184int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
185 gfp_t mask); 188 gfp_t mask);
186int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 189int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
187 int bits, gfp_t mask); 190 int bits, gfp_t mask);
191int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask);
188int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
189 gfp_t mask); 195 gfp_t mask);
190int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 28d87ba60ce8..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
1#include <linux/err.h> 1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h> 2#include <linux/slab.h>
4#include <linux/module.h> 3#include <linux/module.h>
5#include <linux/spinlock.h> 4#include <linux/spinlock.h>
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include "ctree.h" 23#include "ctree.h"
@@ -148,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
148} 149}
149 150
150 151
151int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 152static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
152 struct bio *bio, u32 *dst) 153 struct inode *inode, struct bio *bio,
154 u64 logical_offset, u32 *dst, int dio)
153{ 155{
154 u32 sum; 156 u32 sum;
155 struct bio_vec *bvec = bio->bi_io_vec; 157 struct bio_vec *bvec = bio->bi_io_vec;
156 int bio_index = 0; 158 int bio_index = 0;
157 u64 offset; 159 u64 offset = 0;
158 u64 item_start_offset = 0; 160 u64 item_start_offset = 0;
159 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
160 u64 disk_bytenr; 162 u64 disk_bytenr;
@@ -173,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
173 WARN_ON(bio->bi_vcnt <= 0); 175 WARN_ON(bio->bi_vcnt <= 0);
174 176
175 disk_bytenr = (u64)bio->bi_sector << 9; 177 disk_bytenr = (u64)bio->bi_sector << 9;
178 if (dio)
179 offset = logical_offset;
176 while (bio_index < bio->bi_vcnt) { 180 while (bio_index < bio->bi_vcnt) {
177 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 181 if (!dio)
182 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
178 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); 183 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
179 if (ret == 0) 184 if (ret == 0)
180 goto found; 185 goto found;
@@ -237,6 +242,7 @@ found:
237 else 242 else
238 set_state_private(io_tree, offset, sum); 243 set_state_private(io_tree, offset, sum);
239 disk_bytenr += bvec->bv_len; 244 disk_bytenr += bvec->bv_len;
245 offset += bvec->bv_len;
240 bio_index++; 246 bio_index++;
241 bvec++; 247 bvec++;
242 } 248 }
@@ -244,6 +250,18 @@ found:
244 return 0; 250 return 0;
245} 251}
246 252
253int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
254 struct bio *bio, u32 *dst)
255{
256 return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
257}
258
259int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
260 struct bio *bio, u64 offset, u32 *dst)
261{
262 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
263}
264
247int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
248 struct list_head *list) 266 struct list_head *list)
249{ 267{
@@ -656,6 +674,9 @@ again:
656 goto found; 674 goto found;
657 } 675 }
658 ret = PTR_ERR(item); 676 ret = PTR_ERR(item);
677 if (ret != -EFBIG && ret != -ENOENT)
678 goto fail_unlock;
679
659 if (ret == -EFBIG) { 680 if (ret == -EFBIG) {
660 u32 item_size; 681 u32 item_size;
661 /* we found one, but it isn't big enough yet */ 682 /* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ee3323c7fc1c..787b50a16a14 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/statfs.h> 29#include <linux/statfs.h>
30#include <linux/compat.h> 30#include <linux/compat.h>
31#include <linux/slab.h>
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
33#include "transaction.h" 34#include "transaction.h"
@@ -45,32 +46,42 @@
45static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
46 int write_bytes, 47 int write_bytes,
47 struct page **prepared_pages, 48 struct page **prepared_pages,
48 const char __user *buf) 49 struct iov_iter *i)
49{ 50{
50 long page_fault = 0; 51 size_t copied;
51 int i; 52 int pg = 0;
52 int offset = pos & (PAGE_CACHE_SIZE - 1); 53 int offset = pos & (PAGE_CACHE_SIZE - 1);
53 54
54 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { 55 while (write_bytes > 0) {
55 size_t count = min_t(size_t, 56 size_t count = min_t(size_t,
56 PAGE_CACHE_SIZE - offset, write_bytes); 57 PAGE_CACHE_SIZE - offset, write_bytes);
57 struct page *page = prepared_pages[i]; 58 struct page *page = prepared_pages[pg];
58 fault_in_pages_readable(buf, count); 59again:
60 if (unlikely(iov_iter_fault_in_readable(i, count)))
61 return -EFAULT;
59 62
60 /* Copy data from userspace to the current page */ 63 /* Copy data from userspace to the current page */
61 kmap(page); 64 copied = iov_iter_copy_from_user(page, i, offset, count);
62 page_fault = __copy_from_user(page_address(page) + offset, 65
63 buf, count);
64 /* Flush processor's dcache for this page */ 66 /* Flush processor's dcache for this page */
65 flush_dcache_page(page); 67 flush_dcache_page(page);
66 kunmap(page); 68 iov_iter_advance(i, copied);
67 buf += count; 69 write_bytes -= copied;
68 write_bytes -= count;
69 70
70 if (page_fault) 71 if (unlikely(copied == 0)) {
71 break; 72 count = min_t(size_t, PAGE_CACHE_SIZE - offset,
73 iov_iter_single_seg_count(i));
74 goto again;
75 }
76
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
78 offset += copied;
79 } else {
80 pg++;
81 offset = 0;
82 }
72 } 83 }
73 return page_fault ? -EFAULT : 0; 84 return 0;
74} 85}
75 86
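
btrfs_copy_from_user() now drives an iov_iter: fault the source pages in, copy what it can, advance the iterator by the bytes actually copied, and stay on the same destination page at an adjusted offset after a short copy (on a zero-byte copy the kernel retries with just the current segment's count). A userspace model of the short-copy bookkeeping, with a fake copier that comes up short once:

    #include <stdio.h>
    #include <stddef.h>

    /* fake copy that makes partial progress once, like a faulting copy */
    static size_t copy_some(size_t want, int *hiccup)
    {
            if (*hiccup) {
                    *hiccup = 0;
                    return want / 2;    /* short copy */
            }
            return want;
    }

    int main(void)
    {
            size_t write_bytes = 10000, offset = 300, page_size = 4096;
            size_t count, copied;
            int pg = 0, hiccup = 1;

            while (write_bytes > 0) {
                    count = page_size - offset;
                    if (count > write_bytes)
                            count = write_bytes;

                    copied = copy_some(count, &hiccup);
                    write_bytes -= copied;

                    if (copied < page_size - offset) {
                            offset += copied;   /* resume mid-page */
                    } else {
                            pg++;               /* page filled, move on */
                            offset = 0;
                    }
                    printf("copied %zu, now page %d offset %zu\n",
                           copied, pg, offset);
            }
            return 0;
    }
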
76/* 87/*
@@ -125,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
125 end_of_last_block = start_pos + num_bytes - 1; 136 end_of_last_block = start_pos + num_bytes - 1;
126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 137 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
127 NULL); 138 NULL);
128 if (err) 139 BUG_ON(err);
129 return err;
130 140
131 for (i = 0; i < num_pages; i++) { 141 for (i = 0; i < num_pages; i++) {
132 struct page *p = pages[i]; 142 struct page *p = pages[i];
@@ -141,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
141 * at this time. 151 * at this time.
142 */ 152 */
143 } 153 }
144 return err; 154 return 0;
145} 155}
146 156
147/* 157/*
@@ -822,45 +832,46 @@ again:
822 return 0; 832 return 0;
823} 833}
824 834
825static ssize_t btrfs_file_write(struct file *file, const char __user *buf, 835static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
826 size_t count, loff_t *ppos) 836 const struct iovec *iov,
837 unsigned long nr_segs, loff_t pos)
827{ 838{
828 loff_t pos; 839 struct file *file = iocb->ki_filp;
840 struct inode *inode = fdentry(file)->d_inode;
841 struct btrfs_root *root = BTRFS_I(inode)->root;
842 struct page *pinned[2];
843 struct page **pages = NULL;
844 struct iov_iter i;
845 loff_t *ppos = &iocb->ki_pos;
829 loff_t start_pos; 846 loff_t start_pos;
830 ssize_t num_written = 0; 847 ssize_t num_written = 0;
831 ssize_t err = 0; 848 ssize_t err = 0;
849 size_t count;
850 size_t ocount;
832 int ret = 0; 851 int ret = 0;
833 struct inode *inode = fdentry(file)->d_inode;
834 struct btrfs_root *root = BTRFS_I(inode)->root;
835 struct page **pages = NULL;
836 int nrptrs; 852 int nrptrs;
837 struct page *pinned[2];
838 unsigned long first_index; 853 unsigned long first_index;
839 unsigned long last_index; 854 unsigned long last_index;
840 int will_write; 855 int will_write;
856 int buffered = 0;
841 857
842 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
843 (file->f_flags & O_DIRECT)); 859 (file->f_flags & O_DIRECT));
844 860
845 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
846 PAGE_CACHE_SIZE / (sizeof(struct page *)));
847 pinned[0] = NULL; 861 pinned[0] = NULL;
848 pinned[1] = NULL; 862 pinned[1] = NULL;
849 863
850 pos = *ppos;
851 start_pos = pos; 864 start_pos = pos;
852 865
853 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 866 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
854 867
855 /* do the reserve before the mutex lock in case we have to do some
856 * flushing. We wouldn't deadlock, but this is more polite.
857 */
858 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
859 if (err)
860 goto out_nolock;
861
862 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
863 869
870 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
871 if (err)
872 goto out;
873 count = ocount;
874
864 current->backing_dev_info = inode->i_mapping->backing_dev_info; 875 current->backing_dev_info = inode->i_mapping->backing_dev_info;
865 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 876 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
866 if (err) 877 if (err)
@@ -874,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
874 goto out; 885 goto out;
875 886
876 file_update_time(file); 887 file_update_time(file);
888 BTRFS_I(inode)->sequence++;
889
890 if (unlikely(file->f_flags & O_DIRECT)) {
891 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
892 pos, ppos, count,
893 ocount);
894 /*
895 * the generic O_DIRECT will update in-memory i_size after the
896 * DIOs are done. But our endio handlers that update the on
897 * disk i_size never update past the in memory i_size. So we
898 * need one more update here to catch any additions to the
899 * file
900 */
901 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
902 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
903 mark_inode_dirty(inode);
904 }
877 905
906 if (num_written < 0) {
907 ret = num_written;
908 num_written = 0;
909 goto out;
910 } else if (num_written == count) {
911 /* pick up pos changes done by the generic code */
912 pos = *ppos;
913 goto out;
914 }
915 /*
916 * We are going to do buffered for the rest of the range, so we
917 * need to make sure to invalidate the buffered pages when we're
918 * done.
919 */
920 buffered = 1;
921 pos += num_written;
922 }
923
924 iov_iter_init(&i, iov, nr_segs, count, num_written);
925 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
926 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
927 (sizeof(struct page *)));
878 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 928 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
879 929
880 /* generic_write_checks can change our pos */ 930 /* generic_write_checks can change our pos */
881 start_pos = pos; 931 start_pos = pos;
882 932
883 BTRFS_I(inode)->sequence++;
884 first_index = pos >> PAGE_CACHE_SHIFT; 933 first_index = pos >> PAGE_CACHE_SHIFT;
885 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 934 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
886 935
887 /* 936 /*
888 * there are lots of better ways to do this, but this code 937 * there are lots of better ways to do this, but this code
@@ -899,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
899 unlock_page(pinned[0]); 948 unlock_page(pinned[0]);
900 } 949 }
901 } 950 }
902 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { 951 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
903 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 952 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
904 if (!PageUptodate(pinned[1])) { 953 if (!PageUptodate(pinned[1])) {
905 ret = btrfs_readpage(NULL, pinned[1]); 954 ret = btrfs_readpage(NULL, pinned[1]);
@@ -910,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
910 } 959 }
911 } 960 }
912 961
913 while (count > 0) { 962 while (iov_iter_count(&i) > 0) {
914 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 963 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
915 size_t write_bytes = min(count, nrptrs * 964 size_t write_bytes = min(iov_iter_count(&i),
916 (size_t)PAGE_CACHE_SIZE - 965 nrptrs * (size_t)PAGE_CACHE_SIZE -
917 offset); 966 offset);
918 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 967 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
919 PAGE_CACHE_SHIFT; 968 PAGE_CACHE_SHIFT;
@@ -921,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
921 WARN_ON(num_pages > nrptrs); 970 WARN_ON(num_pages > nrptrs);
922 memset(pages, 0, sizeof(struct page *) * nrptrs); 971 memset(pages, 0, sizeof(struct page *) * nrptrs);
923 972
924 ret = btrfs_check_data_free_space(root, inode, write_bytes); 973 ret = btrfs_delalloc_reserve_space(inode, write_bytes);
925 if (ret) 974 if (ret)
926 goto out; 975 goto out;
927 976
@@ -929,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
929 pos, first_index, last_index, 978 pos, first_index, last_index,
930 write_bytes); 979 write_bytes);
931 if (ret) { 980 if (ret) {
932 btrfs_free_reserved_data_space(root, inode, 981 btrfs_delalloc_release_space(inode, write_bytes);
933 write_bytes);
934 goto out; 982 goto out;
935 } 983 }
936 984
937 ret = btrfs_copy_from_user(pos, num_pages, 985 ret = btrfs_copy_from_user(pos, num_pages,
938 write_bytes, pages, buf); 986 write_bytes, pages, &i);
939 if (ret) { 987 if (ret == 0) {
940 btrfs_free_reserved_data_space(root, inode, 988 dirty_and_release_pages(NULL, root, file, pages,
941 write_bytes); 989 num_pages, pos, write_bytes);
942 btrfs_drop_pages(pages, num_pages);
943 goto out;
944 } 990 }
945 991
946 ret = dirty_and_release_pages(NULL, root, file, pages,
947 num_pages, pos, write_bytes);
948 btrfs_drop_pages(pages, num_pages); 992 btrfs_drop_pages(pages, num_pages);
949 if (ret) { 993 if (ret) {
950 btrfs_free_reserved_data_space(root, inode, 994 btrfs_delalloc_release_space(inode, write_bytes);
951 write_bytes);
952 goto out; 995 goto out;
953 } 996 }
954 997
@@ -964,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
964 btrfs_throttle(root); 1007 btrfs_throttle(root);
965 } 1008 }
966 1009
967 buf += write_bytes;
968 count -= write_bytes;
969 pos += write_bytes; 1010 pos += write_bytes;
970 num_written += write_bytes; 1011 num_written += write_bytes;
971 1012
@@ -975,9 +1016,7 @@ out:
975 mutex_unlock(&inode->i_mutex); 1016 mutex_unlock(&inode->i_mutex);
976 if (ret) 1017 if (ret)
977 err = ret; 1018 err = ret;
978 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
979 1019
980out_nolock:
981 kfree(pages); 1020 kfree(pages);
982 if (pinned[0]) 1021 if (pinned[0])
983 page_cache_release(pinned[0]); 1022 page_cache_release(pinned[0]);
@@ -1007,7 +1046,7 @@ out_nolock:
1007 num_written = err; 1046 num_written = err;
1008 1047
1009 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1010 trans = btrfs_start_transaction(root, 1); 1049 trans = btrfs_start_transaction(root, 0);
1011 ret = btrfs_log_dentry_safe(trans, root, 1050 ret = btrfs_log_dentry_safe(trans, root,
1012 file->f_dentry); 1051 file->f_dentry);
1013 if (ret == 0) { 1052 if (ret == 0) {
@@ -1022,7 +1061,7 @@ out_nolock:
1022 btrfs_end_transaction(trans, root); 1061 btrfs_end_transaction(trans, root);
1023 } 1062 }
1024 } 1063 }
1025 if (file->f_flags & O_DIRECT) { 1064 if (file->f_flags & O_DIRECT && buffered) {
1026 invalidate_mapping_pages(inode->i_mapping, 1065 invalidate_mapping_pages(inode->i_mapping,
1027 start_pos >> PAGE_CACHE_SHIFT, 1066 start_pos >> PAGE_CACHE_SHIFT,
1028 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
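
In the new btrfs_file_aio_write(), O_DIRECT goes through generic_file_direct_write() first; only when the DIO comes up short does the function fall back to buffered writes for the remainder and set `buffered`, which is why the invalidate_mapping_pages() call above now runs only for that mixed case (a pure DIO never dirtied the page cache). The control flow condensed to its branches, with a stub DIO that writes all but 512 bytes:

    #include <stdio.h>

    static long direct_write(long count)
    {
            return count - 512;         /* pretend the DIO came up short */
    }

    int main(void)
    {
            long count = 8192, written, buffered = 0;

            written = direct_write(count);
            if (written < 0) {
                    return 1;                   /* error from the DIO path */
            } else if (written < count) {
                    buffered = 1;               /* finish the tail buffered */
                    written += count - written; /* buffered write of tail */
            }
            if (buffered)
                    printf("invalidate cached pages over the buffered tail\n");
            printf("wrote %ld bytes\n", written);
            return 0;
    }
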
@@ -1062,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1062 * important optimization for directories because holding the mutex prevents 1101 * important optimization for directories because holding the mutex prevents
1063 * new operations on the dir while we write to disk. 1102 * new operations on the dir while we write to disk.
1064 */ 1103 */
1065int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 1104int btrfs_sync_file(struct file *file, int datasync)
1066{ 1105{
1106 struct dentry *dentry = file->f_path.dentry;
1067 struct inode *inode = dentry->d_inode; 1107 struct inode *inode = dentry->d_inode;
1068 struct btrfs_root *root = BTRFS_I(inode)->root; 1108 struct btrfs_root *root = BTRFS_I(inode)->root;
1069 int ret = 0; 1109 int ret = 0;
@@ -1103,9 +1143,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1103 if (file && file->private_data) 1143 if (file && file->private_data)
1104 btrfs_ioctl_trans_end(file); 1144 btrfs_ioctl_trans_end(file);
1105 1145
1106 trans = btrfs_start_transaction(root, 1); 1146 trans = btrfs_start_transaction(root, 0);
1107 if (!trans) { 1147 if (IS_ERR(trans)) {
1108 ret = -ENOMEM; 1148 ret = PTR_ERR(trans);
1109 goto out; 1149 goto out;
1110 } 1150 }
1111 1151
@@ -1160,7 +1200,7 @@ const struct file_operations btrfs_file_operations = {
1160 .read = do_sync_read, 1200 .read = do_sync_read,
1161 .aio_read = generic_file_aio_read, 1201 .aio_read = generic_file_aio_read,
1162 .splice_read = generic_file_splice_read, 1202 .splice_read = generic_file_splice_read,
1163 .write = btrfs_file_write, 1203 .aio_write = btrfs_file_aio_write,
1164 .mmap = btrfs_file_mmap, 1204 .mmap = btrfs_file_mmap,
1165 .open = generic_file_open, 1205 .open = generic_file_open,
1166 .release = btrfs_release_file, 1206 .release = btrfs_release_file,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dd831ed31eea..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/math64.h> 22#include <linux/math64.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "free-space-cache.h" 24#include "free-space-cache.h"
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
49 return 0; 49 return 0;
50} 50}
51 51
52struct btrfs_inode_ref *
53btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path,
56 const char *name, int name_len,
57 u64 inode_objectid, u64 ref_objectid, int mod)
58{
59 struct btrfs_key key;
60 struct btrfs_inode_ref *ref;
61 int ins_len = mod < 0 ? -1 : 0;
62 int cow = mod != 0;
63 int ret;
64
65 key.objectid = inode_objectid;
66 key.type = BTRFS_INODE_REF_KEY;
67 key.offset = ref_objectid;
68
69 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
70 if (ret < 0)
71 return ERR_PTR(ret);
72 if (ret > 0)
73 return NULL;
74 if (!find_name_in_backref(path, name, name_len, &ref))
75 return NULL;
76 return ref;
77}
78
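
btrfs_lookup_inode_ref() uses a three-way return convention: an ERR_PTR-encoded errno when the tree search itself fails, NULL when the search succeeds but no matching back reference exists, and a pointer into the leaf on success, so callers must test IS_ERR() before testing for NULL. A userspace model of consuming that convention, with ERR_PTR/IS_ERR defined locally since they are kernel macros:

    #include <stdio.h>
    #include <stdint.h>
    #include <errno.h>

    /* local stand-ins for the kernel's ERR_PTR machinery */
    #define ERR_PTR(err)  ((void *)(intptr_t)(err))
    #define PTR_ERR(p)    ((int)(intptr_t)(p))
    #define IS_ERR(p)     ((uintptr_t)(p) >= (uintptr_t)-4095)

    static int the_ref = 42;

    static int *lookup_ref(int which)
    {
            if (which < 0)
                    return ERR_PTR(-EIO);   /* the search itself failed */
            if (which == 0)
                    return NULL;            /* searched fine, no such ref */
            return &the_ref;                /* found */
    }

    int main(void)
    {
            int *ref = lookup_ref(1);

            if (IS_ERR(ref))
                    printf("error %d\n", PTR_ERR(ref));
            else if (!ref)
                    printf("no ref\n");
            else
                    printf("ref = %d\n", *ref);
            return 0;
    }
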
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 79int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root, 80 struct btrfs_root *root,
54 const char *name, int name_len, 81 const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 02bb099845fd..fa6ccc1bfe2a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/xattr.h> 36#include <linux/xattr.h>
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h>
39#include "compat.h" 40#include "compat.h"
40#include "ctree.h" 41#include "ctree.h"
41#include "disk-io.h" 42#include "disk-io.h"
@@ -251,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
251 inline_len, compressed_size, 252 inline_len, compressed_size,
252 compressed_pages); 253 compressed_pages);
253 BUG_ON(ret); 254 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start);
254 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
255 return 0; 257 return 0;
256} 258}
@@ -413,6 +415,7 @@ again:
413 trans = btrfs_join_transaction(root, 1); 415 trans = btrfs_join_transaction(root, 1);
414 BUG_ON(!trans); 416 BUG_ON(!trans);
415 btrfs_set_trans_block_group(trans, inode); 417 btrfs_set_trans_block_group(trans, inode);
418 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
416 419
417 /* lets try to make an inline extent */ 420 /* lets try to make an inline extent */
418 if (ret || total_in < (actual_end - start)) { 421 if (ret || total_in < (actual_end - start)) {
@@ -438,7 +441,6 @@ again:
438 start, end, NULL, 441 start, end, NULL,
439 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 442 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
440 EXTENT_CLEAR_DELALLOC | 443 EXTENT_CLEAR_DELALLOC |
441 EXTENT_CLEAR_ACCOUNTING |
442 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
443 445
444 btrfs_end_transaction(trans, root); 446 btrfs_end_transaction(trans, root);
@@ -696,6 +698,38 @@ retry:
696 return 0; 698 return 0;
697} 699}
698 700
701static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
702 u64 num_bytes)
703{
704 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
705 struct extent_map *em;
706 u64 alloc_hint = 0;
707
708 read_lock(&em_tree->lock);
709 em = search_extent_mapping(em_tree, start, num_bytes);
710 if (em) {
711 /*
712 * if block start isn't an actual block number then find the
713 * first block in this inode and use that as a hint. If that
714 * block is also bogus then just don't worry about it.
715 */
716 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
717 free_extent_map(em);
718 em = search_extent_mapping(em_tree, 0, 0);
719 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
720 alloc_hint = em->block_start;
721 if (em)
722 free_extent_map(em);
723 } else {
724 alloc_hint = em->block_start;
725 free_extent_map(em);
726 }
727 }
728 read_unlock(&em_tree->lock);
729
730 return alloc_hint;
731}
732
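get_extent_allocation_hint() factors the hint logic out of cow_file_range() so the direct-IO path added further down can reuse it: prefer the block number of the mapping that covers the start offset; if that is one of the sentinel values encoded above EXTENT_MAP_LAST_BYTE (hole, inline, delalloc), fall back to the inode's first real mapping; otherwise report no hint. A self-contained model of just the decision, assuming the kernel's (u64)-4 sentinel boundary:

    #include <stdint.h>
    #include <stdio.h>

    #define EXTENT_MAP_LAST_BYTE ((uint64_t)-4) /* sentinels live above this */

    /* covering: block_start of the mapping over `start`; first: block_start
     * of the inode's first mapping; 0 means no usable hint */
    static uint64_t pick_hint(uint64_t covering, uint64_t first)
    {
        if (covering < EXTENT_MAP_LAST_BYTE)
            return covering;    /* real block number: best hint */
        if (first < EXTENT_MAP_LAST_BYTE)
            return first;       /* covering mapping was bogus, use first */
        return 0;               /* both bogus: just don't worry about it */
    }

    int main(void)
    {
        printf("%llu\n", (unsigned long long)pick_hint(40960, 4096));
        printf("%llu\n",
               (unsigned long long)pick_hint(EXTENT_MAP_LAST_BYTE, 4096));
        return 0;
    }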
699/* 733/*
700 * when extent_io.c finds a delayed allocation range in the file, 734 * when extent_io.c finds a delayed allocation range in the file,
701 * the callbacks end up in this code. The basic idea is to 735 * the callbacks end up in this code. The basic idea is to
@@ -733,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
733 trans = btrfs_join_transaction(root, 1); 767 trans = btrfs_join_transaction(root, 1);
734 BUG_ON(!trans); 768 BUG_ON(!trans);
735 btrfs_set_trans_block_group(trans, inode); 769 btrfs_set_trans_block_group(trans, inode);
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
736 771
737 actual_end = min_t(u64, isize, end + 1); 772 actual_end = min_t(u64, isize, end + 1);
738 773
@@ -752,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
752 EXTENT_CLEAR_UNLOCK_PAGE | 787 EXTENT_CLEAR_UNLOCK_PAGE |
753 EXTENT_CLEAR_UNLOCK | 788 EXTENT_CLEAR_UNLOCK |
754 EXTENT_CLEAR_DELALLOC | 789 EXTENT_CLEAR_DELALLOC |
755 EXTENT_CLEAR_ACCOUNTING |
756 EXTENT_CLEAR_DIRTY | 790 EXTENT_CLEAR_DIRTY |
757 EXTENT_SET_WRITEBACK | 791 EXTENT_SET_WRITEBACK |
758 EXTENT_END_WRITEBACK); 792 EXTENT_END_WRITEBACK);
@@ -768,35 +802,13 @@ static noinline int cow_file_range(struct inode *inode,
768 BUG_ON(disk_num_bytes > 802 BUG_ON(disk_num_bytes >
769 btrfs_super_total_bytes(&root->fs_info->super_copy)); 803 btrfs_super_total_bytes(&root->fs_info->super_copy));
770 804
771 805 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
772 read_lock(&BTRFS_I(inode)->extent_tree.lock);
773 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
774 start, num_bytes);
775 if (em) {
776 /*
777 * if block start isn't an actual block number then find the
778 * first block in this inode and use that as a hint. If that
779 * block is also bogus then just don't worry about it.
780 */
781 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
782 free_extent_map(em);
783 em = search_extent_mapping(em_tree, 0, 0);
784 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
785 alloc_hint = em->block_start;
786 if (em)
787 free_extent_map(em);
788 } else {
789 alloc_hint = em->block_start;
790 free_extent_map(em);
791 }
792 }
793 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
794 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 806 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
795 807
796 while (disk_num_bytes > 0) { 808 while (disk_num_bytes > 0) {
797 unsigned long op; 809 unsigned long op;
798 810
799 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 811 cur_alloc_size = disk_num_bytes;
800 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 812 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
801 root->sectorsize, 0, alloc_hint, 813 root->sectorsize, 0, alloc_hint,
802 (u64)-1, &ins, 1); 814 (u64)-1, &ins, 1);
@@ -1173,6 +1185,13 @@ out_check:
1173 num_bytes, num_bytes, type); 1185 num_bytes, num_bytes, type);
1174 BUG_ON(ret); 1186 BUG_ON(ret);
1175 1187
1188 if (root->root_key.objectid ==
1189 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1190 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1191 num_bytes);
1192 BUG_ON(ret);
1193 }
1194
1176 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1195 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1177 cur_offset, cur_offset + num_bytes - 1, 1196 cur_offset, cur_offset + num_bytes - 1,
1178 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1197 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1225,36 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1225} 1244}
1226 1245
1227static int btrfs_split_extent_hook(struct inode *inode, 1246static int btrfs_split_extent_hook(struct inode *inode,
1228 struct extent_state *orig, u64 split) 1247 struct extent_state *orig, u64 split)
1229{ 1248{
1230 struct btrfs_root *root = BTRFS_I(inode)->root; 1249 /* not delalloc, ignore it */
1231 u64 size;
1232
1233 if (!(orig->state & EXTENT_DELALLOC)) 1250 if (!(orig->state & EXTENT_DELALLOC))
1234 return 0; 1251 return 0;
1235 1252
1236 size = orig->end - orig->start + 1; 1253 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1237 if (size > root->fs_info->max_extent) {
1238 u64 num_extents;
1239 u64 new_size;
1240
1241 new_size = orig->end - split + 1;
1242 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1243 root->fs_info->max_extent);
1244
1245 /*
1246 * if we break a large extent up then leave outstanding_extents
1247 * be, since we've already accounted for the large extent.
1248 */
1249 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1250 root->fs_info->max_extent) < num_extents)
1251 return 0;
1252 }
1253
1254 spin_lock(&BTRFS_I(inode)->accounting_lock);
1255 BTRFS_I(inode)->outstanding_extents++;
1256 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1257
1258 return 0; 1254 return 0;
1259} 1255}
1260 1256
@@ -1268,42 +1264,11 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1268 struct extent_state *new, 1264 struct extent_state *new,
1269 struct extent_state *other) 1265 struct extent_state *other)
1270{ 1266{
1271 struct btrfs_root *root = BTRFS_I(inode)->root;
1272 u64 new_size, old_size;
1273 u64 num_extents;
1274
1275 /* not delalloc, ignore it */ 1267 /* not delalloc, ignore it */
1276 if (!(other->state & EXTENT_DELALLOC)) 1268 if (!(other->state & EXTENT_DELALLOC))
1277 return 0; 1269 return 0;
1278 1270
1279 old_size = other->end - other->start + 1; 1271 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1280 if (new->start < other->start)
1281 new_size = other->end - new->start + 1;
1282 else
1283 new_size = new->end - other->start + 1;
1284
1285 /* we're not bigger than the max, unreserve the space and go */
1286 if (new_size <= root->fs_info->max_extent) {
1287 spin_lock(&BTRFS_I(inode)->accounting_lock);
1288 BTRFS_I(inode)->outstanding_extents--;
1289 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1290 return 0;
1291 }
1292
1293 /*
1294 * If we grew by another max_extent, just return, we want to keep that
1295 * reserved amount.
1296 */
1297 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1298 root->fs_info->max_extent);
1299 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1300 root->fs_info->max_extent) > num_extents)
1301 return 0;
1302
1303 spin_lock(&BTRFS_I(inode)->accounting_lock);
1304 BTRFS_I(inode)->outstanding_extents--;
1305 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1306
1307 return 0; 1272 return 0;
1308} 1273}
1309 1274
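With fs_info->max_extent gone, one delalloc extent state accounts for exactly one outstanding extent, so the split and merge hooks reduce to a bare increment and decrement on an atomic counter and accounting_lock is no longer needed here. A compilable analogue of the invariant, with C11 atomics standing in for the kernel's atomic_t:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long outstanding_extents;

    /* one delalloc extent state == one outstanding extent */
    static void split_hook(void) { atomic_fetch_add(&outstanding_extents, 1); }
    static void merge_hook(void) { atomic_fetch_sub(&outstanding_extents, 1); }

    int main(void)
    {
        split_hook();   /* [0,8K) splits into [0,4K) + [4K,8K) */
        merge_hook();   /* the two halves merge back into one extent */
        printf("outstanding = %ld\n", atomic_load(&outstanding_extents));
        return 0;
    }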
@@ -1312,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1312 * bytes in this file, and to maintain the list of inodes that 1277 * bytes in this file, and to maintain the list of inodes that
1313 * have pending delalloc work to be done. 1278 * have pending delalloc work to be done.
1314 */ 1279 */
1315static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1280static int btrfs_set_bit_hook(struct inode *inode,
1316 unsigned long old, unsigned long bits) 1281 struct extent_state *state, int *bits)
1317{ 1282{
1318 1283
1319 /* 1284 /*
@@ -1321,16 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1321 * but in this case, we are only testing for the DELALLOC 1286 * but in this case, we are only testing for the DELALLOC
1322 * bit, which is only set or cleared with irqs on 1287 * bit, which is only set or cleared with irqs on
1323 */ 1288 */
1324 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1325 struct btrfs_root *root = BTRFS_I(inode)->root; 1290 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start;
1292
1293 if (*bits & EXTENT_FIRST_DELALLOC)
1294 *bits &= ~EXTENT_FIRST_DELALLOC;
1295 else
1296 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1326 1297
1327 spin_lock(&BTRFS_I(inode)->accounting_lock);
1328 BTRFS_I(inode)->outstanding_extents++;
1329 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1330 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1331 spin_lock(&root->fs_info->delalloc_lock); 1298 spin_lock(&root->fs_info->delalloc_lock);
1332 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1299 BTRFS_I(inode)->delalloc_bytes += len;
1333 root->fs_info->delalloc_bytes += end - start + 1; 1300 root->fs_info->delalloc_bytes += len;
1334 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1335 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1336 &root->fs_info->delalloc_inodes); 1303 &root->fs_info->delalloc_inodes);
@@ -1344,44 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1344 * extent_io.c clear_bit_hook, see set_bit_hook for why 1311 * extent_io.c clear_bit_hook, see set_bit_hook for why
1345 */ 1312 */
1346static int btrfs_clear_bit_hook(struct inode *inode, 1313static int btrfs_clear_bit_hook(struct inode *inode,
1347 struct extent_state *state, unsigned long bits) 1314 struct extent_state *state, int *bits)
1348{ 1315{
1349 /* 1316 /*
1350 * set_bit and clear_bit hooks normally require _irqsave/restore 1317 * set_bit and clear_bit hooks normally require _irqsave/restore
1351 * but in this case, we are only testing for the DELALLOC 1318 * but in this case, we are only testing for the DELALLOC
1352 * bit, which is only set or cleared with irqs on 1319 * bit, which is only set or cleared with irqs on
1353 */ 1320 */
1354 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1355 struct btrfs_root *root = BTRFS_I(inode)->root; 1322 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start;
1356 1324
1357 if (bits & EXTENT_DO_ACCOUNTING) { 1325 if (*bits & EXTENT_FIRST_DELALLOC)
1358 spin_lock(&BTRFS_I(inode)->accounting_lock); 1326 *bits &= ~EXTENT_FIRST_DELALLOC;
1359 BTRFS_I(inode)->outstanding_extents--; 1327 else if (!(*bits & EXTENT_DO_ACCOUNTING))
1360 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1328 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1361 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1329
1362 } 1330 if (*bits & EXTENT_DO_ACCOUNTING)
1331 btrfs_delalloc_release_metadata(inode, len);
1332
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
1334 btrfs_free_reserved_data_space(inode, len);
1363 1335
1364 spin_lock(&root->fs_info->delalloc_lock); 1336 spin_lock(&root->fs_info->delalloc_lock);
1365 if (state->end - state->start + 1 > 1337 root->fs_info->delalloc_bytes -= len;
1366 root->fs_info->delalloc_bytes) { 1338 BTRFS_I(inode)->delalloc_bytes -= len;
1367 printk(KERN_INFO "btrfs warning: delalloc account " 1339
1368 "%llu %llu\n",
1369 (unsigned long long)
1370 state->end - state->start + 1,
1371 (unsigned long long)
1372 root->fs_info->delalloc_bytes);
1373 btrfs_delalloc_free_space(root, inode, (u64)-1);
1374 root->fs_info->delalloc_bytes = 0;
1375 BTRFS_I(inode)->delalloc_bytes = 0;
1376 } else {
1377 btrfs_delalloc_free_space(root, inode,
1378 state->end -
1379 state->start + 1);
1380 root->fs_info->delalloc_bytes -= state->end -
1381 state->start + 1;
1382 BTRFS_I(inode)->delalloc_bytes -= state->end -
1383 state->start + 1;
1384 }
1385 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1340 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1386 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1387 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
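EXTENT_FIRST_DELALLOC lets the caller that just reserved space tell the hooks the extent is already counted: the first set over a range strips the bit instead of bumping the counter, and a clear only drops the counter when EXTENT_DO_ACCOUNTING is absent (when it is present, the metadata reservation is released instead). A toy model of that decision; the bit values below are illustrative, not the kernel's:

    #include <stdio.h>

    #define EXTENT_FIRST_DELALLOC (1 << 0)  /* illustrative values only */
    #define EXTENT_DO_ACCOUNTING  (1 << 1)

    static long outstanding;

    static void set_hook(int *bits)
    {
        if (*bits & EXTENT_FIRST_DELALLOC)
            *bits &= ~EXTENT_FIRST_DELALLOC;    /* counted at reserve time */
        else
            outstanding++;                      /* a split made a new extent */
    }

    static void clear_hook(int *bits)
    {
        if (*bits & EXTENT_FIRST_DELALLOC)
            *bits &= ~EXTENT_FIRST_DELALLOC;
        else if (!(*bits & EXTENT_DO_ACCOUNTING))
            outstanding--;      /* the accounting path releases it instead */
    }

    int main(void)
    {
        int bits = EXTENT_FIRST_DELALLOC;
        set_hook(&bits);                /* first set: no increment */
        bits = EXTENT_DO_ACCOUNTING;
        clear_hook(&bits);              /* DO_ACCOUNTING: no decrement */
        printf("outstanding = %ld\n", outstanding);     /* still 0 */
        return 0;
    }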
@@ -1430,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1430 */ 1385 */
1431static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1386static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1432 struct bio *bio, int mirror_num, 1387 struct bio *bio, int mirror_num,
1433 unsigned long bio_flags) 1388 unsigned long bio_flags,
1389 u64 bio_offset)
1434{ 1390{
1435 struct btrfs_root *root = BTRFS_I(inode)->root; 1391 struct btrfs_root *root = BTRFS_I(inode)->root;
1436 int ret = 0; 1392 int ret = 0;
@@ -1449,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1449 * are inserted into the btree 1405 * are inserted into the btree
1450 */ 1406 */
1451static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1407static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1452 int mirror_num, unsigned long bio_flags) 1408 int mirror_num, unsigned long bio_flags,
1409 u64 bio_offset)
1453{ 1410{
1454 struct btrfs_root *root = BTRFS_I(inode)->root; 1411 struct btrfs_root *root = BTRFS_I(inode)->root;
1455 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1412 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1460,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1460 * on write, or reading the csums from the tree before a read 1417 * on write, or reading the csums from the tree before a read
1461 */ 1418 */
1462static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1419static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1463 int mirror_num, unsigned long bio_flags) 1420 int mirror_num, unsigned long bio_flags,
1421 u64 bio_offset)
1464{ 1422{
1465 struct btrfs_root *root = BTRFS_I(inode)->root; 1423 struct btrfs_root *root = BTRFS_I(inode)->root;
1466 int ret = 0; 1424 int ret = 0;
@@ -1485,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1485 /* we're doing a write, do the async checksumming */ 1443 /* we're doing a write, do the async checksumming */
1486 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1444 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1487 inode, rw, bio, mirror_num, 1445 inode, rw, bio, mirror_num,
1488 bio_flags, __btrfs_submit_bio_start, 1446 bio_flags, bio_offset,
1447 __btrfs_submit_bio_start,
1489 __btrfs_submit_bio_done); 1448 __btrfs_submit_bio_done);
1490 } 1449 }
1491 1450
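All three submit hooks grow a u64 bio_offset argument, the logical file offset the bio starts at, so the async checksumming path no longer has to re-derive it from the bio's pages. A minimal sketch of threading such a value through a callback type; the names and signatures are illustrative, not the btrfs ones:

    #include <stdint.h>
    #include <stdio.h>

    typedef int (*submit_hook_t)(void *bio, unsigned long flags,
                                 uint64_t bio_offset);

    static int submit_bio_start(void *bio, unsigned long flags,
                                uint64_t bio_offset)
    {
        (void)bio;
        (void)flags;
        /* checksum work is keyed by the file offset, not recomputed */
        printf("csum work starting at offset %llu\n",
               (unsigned long long)bio_offset);
        return 0;
    }

    int main(void)
    {
        submit_hook_t hook = submit_bio_start;

        return hook(NULL, 0, 4096);
    }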
@@ -1566,6 +1525,7 @@ again:
1566 goto again; 1525 goto again;
1567 } 1526 }
1568 1527
1528 BUG();
1569 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1529 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1570 ClearPageChecked(page); 1530 ClearPageChecked(page);
1571out: 1531out:
@@ -1696,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1696static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1656static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1697{ 1657{
1698 struct btrfs_root *root = BTRFS_I(inode)->root; 1658 struct btrfs_root *root = BTRFS_I(inode)->root;
1699 struct btrfs_trans_handle *trans; 1659 struct btrfs_trans_handle *trans = NULL;
1700 struct btrfs_ordered_extent *ordered_extent = NULL; 1660 struct btrfs_ordered_extent *ordered_extent = NULL;
1701 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1661 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1702 struct extent_state *cached_state = NULL; 1662 struct extent_state *cached_state = NULL;
@@ -1714,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1714 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1715 if (!ret) { 1675 if (!ret) {
1716 trans = btrfs_join_transaction(root, 1); 1676 trans = btrfs_join_transaction(root, 1);
1677 btrfs_set_trans_block_group(trans, inode);
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1717 ret = btrfs_update_inode(trans, root, inode); 1679 ret = btrfs_update_inode(trans, root, inode);
1718 BUG_ON(ret); 1680 BUG_ON(ret);
1719 btrfs_end_transaction(trans, root);
1720 } 1681 }
1721 goto out; 1682 goto out;
1722 } 1683 }
@@ -1726,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1726 0, &cached_state, GFP_NOFS); 1687 0, &cached_state, GFP_NOFS);
1727 1688
1728 trans = btrfs_join_transaction(root, 1); 1689 trans = btrfs_join_transaction(root, 1);
1690 btrfs_set_trans_block_group(trans, inode);
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1729 1692
1730 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1693 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1731 compressed = 1; 1694 compressed = 1;
@@ -1757,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1757 add_pending_csums(trans, inode, ordered_extent->file_offset, 1720 add_pending_csums(trans, inode, ordered_extent->file_offset,
1758 &ordered_extent->list); 1721 &ordered_extent->list);
1759 1722
1760 /* this also removes the ordered extent from the tree */
1761 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1723 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1762 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
1763 BUG_ON(ret); 1725 BUG_ON(ret);
1764 btrfs_end_transaction(trans, root);
1765out: 1726out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1728 if (trans)
1729 btrfs_end_transaction(trans, root);
1766 /* once for us */ 1730 /* once for us */
1767 btrfs_put_ordered_extent(ordered_extent); 1731 btrfs_put_ordered_extent(ordered_extent);
1768 /* once for the tree */ 1732 /* once for the tree */
@@ -1884,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1884 1848
1885 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1849 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1886 failrec->last_mirror, 1850 failrec->last_mirror,
1887 failrec->bio_flags); 1851 failrec->bio_flags, 0);
1888 return 0; 1852 return 0;
1889} 1853}
1890 1854
@@ -2039,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2039} 2003}
2040 2004
2041/* 2005/*
2006 * calculate extra metadata reservation when snapshotting a subvolume
2007 * that contains orphan files.
2008 */
2009void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2010 struct btrfs_pending_snapshot *pending,
2011 u64 *bytes_to_reserve)
2012{
2013 struct btrfs_root *root;
2014 struct btrfs_block_rsv *block_rsv;
2015 u64 num_bytes;
2016 int index;
2017
2018 root = pending->root;
2019 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2020 return;
2021
2022 block_rsv = root->orphan_block_rsv;
2023
2024 /* orphan block reservation for the snapshot */
2025 num_bytes = block_rsv->size;
2026
2027 /*
2028 * after the snapshot is created, COWing tree blocks may use more
2029 * space than they free, so we should make sure there is enough
2030 * reserved space.
2031 */
2032 index = trans->transid & 0x1;
2033 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2034 num_bytes += block_rsv->size -
2035 (block_rsv->reserved + block_rsv->freed[index]);
2036 }
2037
2038 *bytes_to_reserve += num_bytes;
2039}
2040
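The freed[] array in the reservation has two buckets indexed by the low bit of the transaction id, so space freed during the current transaction is tracked separately from the previous transaction's, and each bucket is recycled every other transaction. The index computation on its own:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long transid;

        /* the low bit of the transid alternates buckets per transaction */
        for (transid = 100; transid < 104; transid++)
            printf("transid %llu -> freed[%llu]\n",
                   transid, transid & 0x1);
        return 0;
    }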
2041void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2042 struct btrfs_pending_snapshot *pending)
2043{
2044 struct btrfs_root *root = pending->root;
2045 struct btrfs_root *snap = pending->snap;
2046 struct btrfs_block_rsv *block_rsv;
2047 u64 num_bytes;
2048 int index;
2049 int ret;
2050
2051 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2052 return;
2053
2054 /* refill source subvolume's orphan block reservation */
2055 block_rsv = root->orphan_block_rsv;
2056 index = trans->transid & 0x1;
2057 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2058 num_bytes = block_rsv->size -
2059 (block_rsv->reserved + block_rsv->freed[index]);
2060 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2061 root->orphan_block_rsv,
2062 num_bytes);
2063 BUG_ON(ret);
2064 }
2065
2066 /* setup orphan block reservation for the snapshot */
2067 block_rsv = btrfs_alloc_block_rsv(snap);
2068 BUG_ON(!block_rsv);
2069
2070 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2071 snap->orphan_block_rsv = block_rsv;
2072
2073 num_bytes = root->orphan_block_rsv->size;
2074 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2075 block_rsv, num_bytes);
2076 BUG_ON(ret);
2077
2078#if 0
2079 /* insert orphan item for the snapshot */
2080 WARN_ON(!root->orphan_item_inserted);
2081 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2082 snap->root_key.objectid);
2083 BUG_ON(ret);
2084 snap->orphan_item_inserted = 1;
2085#endif
2086}
2087
2088enum btrfs_orphan_cleanup_state {
2089 ORPHAN_CLEANUP_STARTED = 1,
2090 ORPHAN_CLEANUP_DONE = 2,
2091};
2092
2093/*
2094 * This is called at transaction commit time. If there are no orphan
2095 * files in the subvolume, it removes the orphan item and frees the block_rsv
2096 * structure.
2097 */
2098void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2099 struct btrfs_root *root)
2100{
2101 int ret;
2102
2103 if (!list_empty(&root->orphan_list) ||
2104 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2105 return;
2106
2107 if (root->orphan_item_inserted &&
2108 btrfs_root_refs(&root->root_item) > 0) {
2109 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2110 root->root_key.objectid);
2111 BUG_ON(ret);
2112 root->orphan_item_inserted = 0;
2113 }
2114
2115 if (root->orphan_block_rsv) {
2116 WARN_ON(root->orphan_block_rsv->size > 0);
2117 btrfs_free_block_rsv(root, root->orphan_block_rsv);
2118 root->orphan_block_rsv = NULL;
2119 }
2120}
2121
2122/*
2042 * This creates an orphan entry for the given inode in case something goes 2123 * This creates an orphan entry for the given inode in case something goes
2043 * wrong in the middle of an unlink/truncate. 2124 * wrong in the middle of an unlink/truncate.
2125 *
2126 * NOTE: the caller of this function should reserve 5 units of metadata
2127 * before calling it.
2044 */ 2128 */
2045int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2129int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2046{ 2130{
2047 struct btrfs_root *root = BTRFS_I(inode)->root; 2131 struct btrfs_root *root = BTRFS_I(inode)->root;
2048 int ret = 0; 2132 struct btrfs_block_rsv *block_rsv = NULL;
2133 int reserve = 0;
2134 int insert = 0;
2135 int ret;
2049 2136
2050 spin_lock(&root->list_lock); 2137 if (!root->orphan_block_rsv) {
2138 block_rsv = btrfs_alloc_block_rsv(root);
2139 BUG_ON(!block_rsv);
2140 }
2051 2141
2052 /* already on the orphan list, we're good */ 2142 spin_lock(&root->orphan_lock);
2053 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2143 if (!root->orphan_block_rsv) {
2054 spin_unlock(&root->list_lock); 2144 root->orphan_block_rsv = block_rsv;
2055 return 0; 2145 } else if (block_rsv) {
2146 btrfs_free_block_rsv(root, block_rsv);
2147 block_rsv = NULL;
2056 } 2148 }
2057 2149
2058 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2150 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2151 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2152#if 0
2153 /*
2154 * For proper ENOSPC handling, we should do orphan
2155 * cleanup when mounting. But this introduces a backward
2156 * compatibility issue.
2157 */
2158 if (!xchg(&root->orphan_item_inserted, 1))
2159 insert = 2;
2160 else
2161 insert = 1;
2162#endif
2163 insert = 1;
2164 } else {
2165 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2166 }
2059 2167
2060 spin_unlock(&root->list_lock); 2168 if (!BTRFS_I(inode)->orphan_meta_reserved) {
2169 BTRFS_I(inode)->orphan_meta_reserved = 1;
2170 reserve = 1;
2171 }
2172 spin_unlock(&root->orphan_lock);
2061 2173
2062 /* 2174 if (block_rsv)
2063 * insert an orphan item to track this unlinked/truncated file 2175 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2064 */
2065 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2066 2176
2067 return ret; 2177 /* grab metadata reservation from transaction handle */
2178 if (reserve) {
2179 ret = btrfs_orphan_reserve_metadata(trans, inode);
2180 BUG_ON(ret);
2181 }
2182
2183 /* insert an orphan item to track this unlinked/truncated file */
2184 if (insert >= 1) {
2185 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2186 BUG_ON(ret);
2187 }
2188
2189 /* insert an orphan item to record that the subvolume contains orphan files */
2190 if (insert >= 2) {
2191 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2192 root->root_key.objectid);
2193 BUG_ON(ret);
2194 }
2195 return 0;
2068} 2196}
2069 2197
2070/* 2198/*
@@ -2074,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2074int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2202int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2075{ 2203{
2076 struct btrfs_root *root = BTRFS_I(inode)->root; 2204 struct btrfs_root *root = BTRFS_I(inode)->root;
2205 int delete_item = 0;
2206 int release_rsv = 0;
2077 int ret = 0; 2207 int ret = 0;
2078 2208
2079 spin_lock(&root->list_lock); 2209 spin_lock(&root->orphan_lock);
2080 2210 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2081 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2211 list_del_init(&BTRFS_I(inode)->i_orphan);
2082 spin_unlock(&root->list_lock); 2212 delete_item = 1;
2083 return 0;
2084 } 2213 }
2085 2214
2086 list_del_init(&BTRFS_I(inode)->i_orphan); 2215 if (BTRFS_I(inode)->orphan_meta_reserved) {
2087 if (!trans) { 2216 BTRFS_I(inode)->orphan_meta_reserved = 0;
2088 spin_unlock(&root->list_lock); 2217 release_rsv = 1;
2089 return 0;
2090 } 2218 }
2219 spin_unlock(&root->orphan_lock);
2091 2220
2092 spin_unlock(&root->list_lock); 2221 if (trans && delete_item) {
2222 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
2223 BUG_ON(ret);
2224 }
2093 2225
2094 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2226 if (release_rsv)
2227 btrfs_orphan_release_metadata(inode);
2095 2228
2096 return ret; 2229 return 0;
2097} 2230}
2098 2231
2099/* 2232/*
@@ -2110,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2110 struct inode *inode; 2243 struct inode *inode;
2111 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2244 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2112 2245
2113 if (!xchg(&root->clean_orphans, 0)) 2246 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2114 return; 2247 return;
2115 2248
2116 path = btrfs_alloc_path(); 2249 path = btrfs_alloc_path();
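cmpxchg() returns the previous value, so only the caller that actually swaps orphan_cleanup_state from 0 to ORPHAN_CLEANUP_STARTED sees zero and proceeds; every later caller returns immediately. The same single-runner guard written with portable C11 atomics:

    #include <stdatomic.h>
    #include <stdio.h>

    enum { ORPHAN_CLEANUP_STARTED = 1 };
    static atomic_int cleanup_state;    /* starts at 0 */

    static int try_start_cleanup(void)
    {
        int expected = 0;

        /* succeeds (returns nonzero) only for the first caller */
        return atomic_compare_exchange_strong(&cleanup_state, &expected,
                                              ORPHAN_CLEANUP_STARTED);
    }

    int main(void)
    {
        printf("first caller runs cleanup: %d\n", try_start_cleanup());
        printf("second caller skips:       %d\n", !try_start_cleanup());
        return 0;
    }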
@@ -2163,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2163 found_key.type = BTRFS_INODE_ITEM_KEY; 2296 found_key.type = BTRFS_INODE_ITEM_KEY;
2164 found_key.offset = 0; 2297 found_key.offset = 0;
2165 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2166 if (IS_ERR(inode)) 2299 BUG_ON(IS_ERR(inode));
2167 break;
2168 2300
2169 /* 2301 /*
2170 * add this inode to the orphan list so btrfs_orphan_del does 2302 * add this inode to the orphan list so btrfs_orphan_del does
2171 * the proper thing when we hit it 2303 * the proper thing when we hit it
2172 */ 2304 */
2173 spin_lock(&root->list_lock); 2305 spin_lock(&root->orphan_lock);
2174 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2306 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2175 spin_unlock(&root->list_lock); 2307 spin_unlock(&root->orphan_lock);
2176 2308
2177 /* 2309 /*
2178 * if this is a bad inode, it means we actually succeeded in 2310 * if this is a bad inode, it means we actually succeeded in
@@ -2181,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2181 * do a destroy_inode 2313 * do a destroy_inode
2182 */ 2314 */
2183 if (is_bad_inode(inode)) { 2315 if (is_bad_inode(inode)) {
2184 trans = btrfs_start_transaction(root, 1); 2316 trans = btrfs_start_transaction(root, 0);
2185 btrfs_orphan_del(trans, inode); 2317 btrfs_orphan_del(trans, inode);
2186 btrfs_end_transaction(trans, root); 2318 btrfs_end_transaction(trans, root);
2187 iput(inode); 2319 iput(inode);
@@ -2199,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2199 /* this will do delete_inode and everything for us */ 2331 /* this will do delete_inode and everything for us */
2200 iput(inode); 2332 iput(inode);
2201 } 2333 }
2334 btrfs_free_path(path);
2335
2336 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2337
2338 if (root->orphan_block_rsv)
2339 btrfs_block_rsv_release(root, root->orphan_block_rsv,
2340 (u64)-1);
2341
2342 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2343 trans = btrfs_join_transaction(root, 1);
2344 btrfs_end_transaction(trans, root);
2345 }
2202 2346
2203 if (nr_unlink) 2347 if (nr_unlink)
2204 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2348 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2205 if (nr_truncate) 2349 if (nr_truncate)
2206 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2350 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2207
2208 btrfs_free_path(path);
2209} 2351}
2210 2352
2211/* 2353/*
@@ -2524,44 +2666,217 @@ out:
2524 return ret; 2666 return ret;
2525} 2667}
2526 2668
2527static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2669/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path)
2672{
2673 struct extent_buffer *eb;
2674 int level;
2675 int ret;
2676 u64 refs;
2677
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level])
2680 break;
2681 eb = path->nodes[level];
2682 if (!btrfs_block_can_be_shared(root, eb))
2683 continue;
2684 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2685 &refs, NULL);
2686 if (refs > 1)
2687 return 1;
2688 }
2689 return 0;
2690}
2691
2692/*
2693 * helper to start transaction for unlink and rmdir.
2694 *
2695 * unlink and rmdir are special in btrfs: they do not always free space,
2696 * so in the ENOSPC case we should make sure they will free space before
2697 * allowing them to use the global metadata reservation.
2698 */
2699static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2700 struct dentry *dentry)
2528{ 2701{
2529 struct btrfs_root *root;
2530 struct btrfs_trans_handle *trans; 2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(dir)->root;
2704 struct btrfs_path *path;
2705 struct btrfs_inode_ref *ref;
2706 struct btrfs_dir_item *di;
2531 struct inode *inode = dentry->d_inode; 2707 struct inode *inode = dentry->d_inode;
2708 u64 index;
2709 int check_link = 1;
2710 int err = -ENOSPC;
2532 int ret; 2711 int ret;
2533 unsigned long nr = 0;
2534 2712
2535 root = BTRFS_I(dir)->root; 2713 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans;
2536 2716
2537 /* 2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2538 * 5 items for unlink inode 2718 return ERR_PTR(-ENOSPC);
2539 * 1 for orphan 2719
2540 */ 2720 /* check if someone else holds a reference */
2541 ret = btrfs_reserve_metadata_space(root, 6); 2721 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2542 if (ret) 2722 return ERR_PTR(-ENOSPC);
2543 return ret;
2544 2723
2545 trans = btrfs_start_transaction(root, 1); 2724 if (atomic_read(&inode->i_count) > 2)
2725 return ERR_PTR(-ENOSPC);
2726
2727 if (xchg(&root->fs_info->enospc_unlink, 1))
2728 return ERR_PTR(-ENOSPC);
2729
2730 path = btrfs_alloc_path();
2731 if (!path) {
2732 root->fs_info->enospc_unlink = 0;
2733 return ERR_PTR(-ENOMEM);
2734 }
2735
2736 trans = btrfs_start_transaction(root, 0);
2546 if (IS_ERR(trans)) { 2737 if (IS_ERR(trans)) {
2547 btrfs_unreserve_metadata_space(root, 6); 2738 btrfs_free_path(path);
2548 return PTR_ERR(trans); 2739 root->fs_info->enospc_unlink = 0;
2740 return trans;
2549 } 2741 }
2550 2742
2743 path->skip_locking = 1;
2744 path->search_commit_root = 1;
2745
2746 ret = btrfs_lookup_inode(trans, root, path,
2747 &BTRFS_I(dir)->location, 0);
2748 if (ret < 0) {
2749 err = ret;
2750 goto out;
2751 }
2752 if (ret == 0) {
2753 if (check_path_shared(root, path))
2754 goto out;
2755 } else {
2756 check_link = 0;
2757 }
2758 btrfs_release_path(root, path);
2759
2760 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0);
2762 if (ret < 0) {
2763 err = ret;
2764 goto out;
2765 }
2766 if (ret == 0) {
2767 if (check_path_shared(root, path))
2768 goto out;
2769 } else {
2770 check_link = 0;
2771 }
2772 btrfs_release_path(root, path);
2773
2774 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0);
2777 if (ret < 0) {
2778 err = ret;
2779 goto out;
2780 }
2781 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path))
2783 goto out;
2784 btrfs_release_path(root, path);
2785 }
2786
2787 if (!check_link) {
2788 err = 0;
2789 goto out;
2790 }
2791
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) {
2795 err = PTR_ERR(di);
2796 goto out;
2797 }
2798 if (di) {
2799 if (check_path_shared(root, path))
2800 goto out;
2801 } else {
2802 err = 0;
2803 goto out;
2804 }
2805 btrfs_release_path(root, path);
2806
2807 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0);
2810 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref);
2812 goto out;
2813 }
2814 BUG_ON(!ref);
2815 if (check_path_shared(root, path))
2816 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path);
2819
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) {
2823 err = PTR_ERR(di);
2824 goto out;
2825 }
2826 BUG_ON(ret == -ENOENT);
2827 if (check_path_shared(root, path))
2828 goto out;
2829
2830 err = 0;
2831out:
2832 btrfs_free_path(path);
2833 if (err) {
2834 btrfs_end_transaction(trans, root);
2835 root->fs_info->enospc_unlink = 0;
2836 return ERR_PTR(err);
2837 }
2838
2839 trans->block_rsv = &root->fs_info->global_block_rsv;
2840 return trans;
2841}
2842
2843static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2844 struct btrfs_root *root)
2845{
2846 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2847 BUG_ON(!root->fs_info->enospc_unlink);
2848 root->fs_info->enospc_unlink = 0;
2849 }
2850 btrfs_end_transaction_throttle(trans, root);
2851}
2852
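__unlink_start_trans() encodes a two-step policy: try a normal transaction with a 10-unit reservation first, and only fall back to the global reservation, serialized by enospc_unlink, after the check_path_shared() walks prove the unlink will really free space rather than COW shared blocks. The shape of that policy as a self-contained sketch; start_trans() and will_free_space() are stand-ins, not btrfs functions:

    #include <errno.h>
    #include <stdio.h>

    static int space_left = 5;

    static int start_trans(int units)
    {
        return units > space_left ? -ENOSPC : 0;
    }

    static int will_free_space(void)
    {
        return 1;   /* stand-in for the check_path_shared() walks */
    }

    static int unlink_start(void)
    {
        int ret = start_trans(10);      /* normal reservation first */

        if (ret != -ENOSPC)
            return ret;
        if (!will_free_space())
            return -ENOSPC;             /* refuse: might not free space */
        return start_trans(0);          /* global reservation fallback */
    }

    int main(void)
    {
        printf("unlink_start() = %d\n", unlink_start());
        return 0;
    }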
2853static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2854{
2855 struct btrfs_root *root = BTRFS_I(dir)->root;
2856 struct btrfs_trans_handle *trans;
2857 struct inode *inode = dentry->d_inode;
2858 int ret;
2859 unsigned long nr = 0;
2860
2861 trans = __unlink_start_trans(dir, dentry);
2862 if (IS_ERR(trans))
2863 return PTR_ERR(trans);
2864
2551 btrfs_set_trans_block_group(trans, dir); 2865 btrfs_set_trans_block_group(trans, dir);
2552 2866
2553 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 2867 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2554 2868
2555 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2556 dentry->d_name.name, dentry->d_name.len); 2870 dentry->d_name.name, dentry->d_name.len);
2871 BUG_ON(ret);
2557 2872
2558 if (inode->i_nlink == 0) 2873 if (inode->i_nlink == 0) {
2559 ret = btrfs_orphan_add(trans, inode); 2874 ret = btrfs_orphan_add(trans, inode);
2875 BUG_ON(ret);
2876 }
2560 2877
2561 nr = trans->blocks_used; 2878 nr = trans->blocks_used;
2562 2879 __unlink_end_trans(trans, root);
2563 btrfs_end_transaction_throttle(trans, root);
2564 btrfs_unreserve_metadata_space(root, 6);
2565 btrfs_btree_balance_dirty(root, nr); 2880 btrfs_btree_balance_dirty(root, nr);
2566 return ret; 2881 return ret;
2567} 2882}
@@ -2633,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2633{ 2948{
2634 struct inode *inode = dentry->d_inode; 2949 struct inode *inode = dentry->d_inode;
2635 int err = 0; 2950 int err = 0;
2636 int ret;
2637 struct btrfs_root *root = BTRFS_I(dir)->root; 2951 struct btrfs_root *root = BTRFS_I(dir)->root;
2638 struct btrfs_trans_handle *trans; 2952 struct btrfs_trans_handle *trans;
2639 unsigned long nr = 0; 2953 unsigned long nr = 0;
@@ -2642,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2642 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2956 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2643 return -ENOTEMPTY; 2957 return -ENOTEMPTY;
2644 2958
2645 ret = btrfs_reserve_metadata_space(root, 5); 2959 trans = __unlink_start_trans(dir, dentry);
2646 if (ret) 2960 if (IS_ERR(trans))
2647 return ret;
2648
2649 trans = btrfs_start_transaction(root, 1);
2650 if (IS_ERR(trans)) {
2651 btrfs_unreserve_metadata_space(root, 5);
2652 return PTR_ERR(trans); 2961 return PTR_ERR(trans);
2653 }
2654 2962
2655 btrfs_set_trans_block_group(trans, dir); 2963 btrfs_set_trans_block_group(trans, dir);
2656 2964
@@ -2673,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2673 btrfs_i_size_write(inode, 0); 2981 btrfs_i_size_write(inode, 0);
2674out: 2982out:
2675 nr = trans->blocks_used; 2983 nr = trans->blocks_used;
2676 ret = btrfs_end_transaction_throttle(trans, root); 2984 __unlink_end_trans(trans, root);
2677 btrfs_unreserve_metadata_space(root, 5);
2678 btrfs_btree_balance_dirty(root, nr); 2985 btrfs_btree_balance_dirty(root, nr);
2679 2986
2680 if (ret && !err)
2681 err = ret;
2682 return err; 2987 return err;
2683} 2988}
2684 2989
@@ -3075,6 +3380,7 @@ out:
3075 if (pending_del_nr) { 3380 if (pending_del_nr) {
3076 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3381 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3077 pending_del_nr); 3382 pending_del_nr);
3383 BUG_ON(ret);
3078 } 3384 }
3079 btrfs_free_path(path); 3385 btrfs_free_path(path);
3080 return err; 3386 return err;
@@ -3102,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3102 3408
3103 if ((offset & (blocksize - 1)) == 0) 3409 if ((offset & (blocksize - 1)) == 0)
3104 goto out; 3410 goto out;
3105 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3411 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3106 if (ret)
3107 goto out;
3108
3109 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3110 if (ret) 3412 if (ret)
3111 goto out; 3413 goto out;
3112 3414
@@ -3114,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3114again: 3416again:
3115 page = grab_cache_page(mapping, index); 3417 page = grab_cache_page(mapping, index);
3116 if (!page) { 3418 if (!page) {
3117 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3419 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3118 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3119 goto out; 3420 goto out;
3120 } 3421 }
3121 3422
@@ -3178,8 +3479,7 @@ again:
3178 3479
3179out_unlock: 3480out_unlock:
3180 if (ret) 3481 if (ret)
3181 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3482 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3182 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3183 unlock_page(page); 3483 unlock_page(page);
3184 page_cache_release(page); 3484 page_cache_release(page);
3185out: 3485out:
@@ -3191,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3191 struct btrfs_trans_handle *trans; 3491 struct btrfs_trans_handle *trans;
3192 struct btrfs_root *root = BTRFS_I(inode)->root; 3492 struct btrfs_root *root = BTRFS_I(inode)->root;
3193 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3493 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3194 struct extent_map *em; 3494 struct extent_map *em = NULL;
3195 struct extent_state *cached_state = NULL; 3495 struct extent_state *cached_state = NULL;
3196 u64 mask = root->sectorsize - 1; 3496 u64 mask = root->sectorsize - 1;
3197 u64 hole_start = (inode->i_size + mask) & ~mask; 3497 u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3229,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3229 u64 hint_byte = 0; 3529 u64 hint_byte = 0;
3230 hole_size = last_byte - cur_offset; 3530 hole_size = last_byte - cur_offset;
3231 3531
3232 err = btrfs_reserve_metadata_space(root, 2); 3532 trans = btrfs_start_transaction(root, 2);
3233 if (err) 3533 if (IS_ERR(trans)) {
3534 err = PTR_ERR(trans);
3234 break; 3535 break;
3235 3536 }
3236 trans = btrfs_start_transaction(root, 1);
3237 btrfs_set_trans_block_group(trans, inode); 3537 btrfs_set_trans_block_group(trans, inode);
3238 3538
3239 err = btrfs_drop_extents(trans, inode, cur_offset, 3539 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3251,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3251 last_byte - 1, 0); 3551 last_byte - 1, 0);
3252 3552
3253 btrfs_end_transaction(trans, root); 3553 btrfs_end_transaction(trans, root);
3254 btrfs_unreserve_metadata_space(root, 2);
3255 } 3554 }
3256 free_extent_map(em); 3555 free_extent_map(em);
3556 em = NULL;
3257 cur_offset = last_byte; 3557 cur_offset = last_byte;
3258 if (cur_offset >= block_end) 3558 if (cur_offset >= block_end)
3259 break; 3559 break;
3260 } 3560 }
3261 3561
3562 free_extent_map(em);
3262 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3563 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3263 GFP_NOFS); 3564 GFP_NOFS);
3264 return err; 3565 return err;
@@ -3285,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3285 } 3586 }
3286 } 3587 }
3287 3588
3288 ret = btrfs_reserve_metadata_space(root, 1); 3589 trans = btrfs_start_transaction(root, 5);
3289 if (ret) 3590 if (IS_ERR(trans))
3290 return ret; 3591 return PTR_ERR(trans);
3291 3592
3292 trans = btrfs_start_transaction(root, 1);
3293 btrfs_set_trans_block_group(trans, inode); 3593 btrfs_set_trans_block_group(trans, inode);
3294 3594
3295 ret = btrfs_orphan_add(trans, inode); 3595 ret = btrfs_orphan_add(trans, inode);
@@ -3297,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3297 3597
3298 nr = trans->blocks_used; 3598 nr = trans->blocks_used;
3299 btrfs_end_transaction(trans, root); 3599 btrfs_end_transaction(trans, root);
3300 btrfs_unreserve_metadata_space(root, 1);
3301 btrfs_btree_balance_dirty(root, nr); 3600 btrfs_btree_balance_dirty(root, nr);
3302 3601
3303 if (attr->ia_size > inode->i_size) { 3602 if (attr->ia_size > inode->i_size) {
@@ -3310,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3310 i_size_write(inode, attr->ia_size); 3609 i_size_write(inode, attr->ia_size);
3311 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3610 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3312 3611
3313 trans = btrfs_start_transaction(root, 1); 3612 trans = btrfs_start_transaction(root, 0);
3613 BUG_ON(IS_ERR(trans));
3314 btrfs_set_trans_block_group(trans, inode); 3614 btrfs_set_trans_block_group(trans, inode);
3615 trans->block_rsv = root->orphan_block_rsv;
3616 BUG_ON(!trans->block_rsv);
3315 3617
3316 ret = btrfs_update_inode(trans, root, inode); 3618 ret = btrfs_update_inode(trans, root, inode);
3317 BUG_ON(ret); 3619 BUG_ON(ret);
@@ -3391,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
3391 btrfs_i_size_write(inode, 0); 3693 btrfs_i_size_write(inode, 0);
3392 3694
3393 while (1) { 3695 while (1) {
3394 trans = btrfs_start_transaction(root, 1); 3696 trans = btrfs_start_transaction(root, 0);
3697 BUG_ON(IS_ERR(trans));
3395 btrfs_set_trans_block_group(trans, inode); 3698 btrfs_set_trans_block_group(trans, inode);
3396 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3699 trans->block_rsv = root->orphan_block_rsv;
3397 3700
3701 ret = btrfs_block_rsv_check(trans, root,
3702 root->orphan_block_rsv, 0, 5);
3703 if (ret) {
3704 BUG_ON(ret != -EAGAIN);
3705 ret = btrfs_commit_transaction(trans, root);
3706 BUG_ON(ret);
3707 continue;
3708 }
3709
3710 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3398 if (ret != -EAGAIN) 3711 if (ret != -EAGAIN)
3399 break; 3712 break;
3400 3713
@@ -3402,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
3402 btrfs_end_transaction(trans, root); 3715 btrfs_end_transaction(trans, root);
3403 trans = NULL; 3716 trans = NULL;
3404 btrfs_btree_balance_dirty(root, nr); 3717 btrfs_btree_balance_dirty(root, nr);
3718
3405 } 3719 }
3406 3720
3407 if (ret == 0) { 3721 if (ret == 0) {
@@ -3642,40 +3956,10 @@ again:
3642 return 0; 3956 return 0;
3643} 3957}
3644 3958
3645static noinline void init_btrfs_i(struct inode *inode)
3646{
3647 struct btrfs_inode *bi = BTRFS_I(inode);
3648
3649 bi->generation = 0;
3650 bi->sequence = 0;
3651 bi->last_trans = 0;
3652 bi->last_sub_trans = 0;
3653 bi->logged_trans = 0;
3654 bi->delalloc_bytes = 0;
3655 bi->reserved_bytes = 0;
3656 bi->disk_i_size = 0;
3657 bi->flags = 0;
3658 bi->index_cnt = (u64)-1;
3659 bi->last_unlink_trans = 0;
3660 bi->ordered_data_close = 0;
3661 bi->force_compress = 0;
3662 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3663 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3664 inode->i_mapping, GFP_NOFS);
3665 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3666 inode->i_mapping, GFP_NOFS);
3667 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3668 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3669 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3670 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3671 mutex_init(&BTRFS_I(inode)->log_mutex);
3672}
3673
3674static int btrfs_init_locked_inode(struct inode *inode, void *p) 3959static int btrfs_init_locked_inode(struct inode *inode, void *p)
3675{ 3960{
3676 struct btrfs_iget_args *args = p; 3961 struct btrfs_iget_args *args = p;
3677 inode->i_ino = args->ino; 3962 inode->i_ino = args->ino;
3678 init_btrfs_i(inode);
3679 BTRFS_I(inode)->root = args->root; 3963 BTRFS_I(inode)->root = args->root;
3680 btrfs_set_inode_space_info(args->root, inode); 3964 btrfs_set_inode_space_info(args->root, inode);
3681 return 0; 3965 return 0;
@@ -3738,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3738 if (!inode) 4022 if (!inode)
3739 return ERR_PTR(-ENOMEM); 4023 return ERR_PTR(-ENOMEM);
3740 4024
3741 init_btrfs_i(inode);
3742
3743 BTRFS_I(inode)->root = root; 4025 BTRFS_I(inode)->root = root;
3744 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4026 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3745 BTRFS_I(inode)->dummy_inode = 1; 4027 BTRFS_I(inode)->dummy_inode = 1;
@@ -3996,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3996 struct btrfs_trans_handle *trans; 4278 struct btrfs_trans_handle *trans;
3997 int ret = 0; 4279 int ret = 0;
3998 4280
3999 if (root->fs_info->btree_inode == inode) 4281 if (BTRFS_I(inode)->dummy_inode)
4000 return 0; 4282 return 0;
4001 4283
4002 if (wbc->sync_mode == WB_SYNC_ALL) { 4284 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4017,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
4017{ 4299{
4018 struct btrfs_root *root = BTRFS_I(inode)->root; 4300 struct btrfs_root *root = BTRFS_I(inode)->root;
4019 struct btrfs_trans_handle *trans; 4301 struct btrfs_trans_handle *trans;
4302 int ret;
4303
4304 if (BTRFS_I(inode)->dummy_inode)
4305 return;
4020 4306
4021 trans = btrfs_join_transaction(root, 1); 4307 trans = btrfs_join_transaction(root, 1);
4022 btrfs_set_trans_block_group(trans, inode); 4308 btrfs_set_trans_block_group(trans, inode);
4023 btrfs_update_inode(trans, root, inode); 4309
4310 ret = btrfs_update_inode(trans, root, inode);
4311 if (ret && ret == -ENOSPC) {
4312 /* whoops, let's try again with the full transaction */
4313 btrfs_end_transaction(trans, root);
4314 trans = btrfs_start_transaction(root, 1);
4315 if (IS_ERR(trans)) {
4316 if (printk_ratelimit()) {
4317 printk(KERN_ERR "btrfs: fail to "
4318 "dirty inode %lu error %ld\n",
4319 inode->i_ino, PTR_ERR(trans));
4320 }
4321 return;
4322 }
4323 btrfs_set_trans_block_group(trans, inode);
4324
4325 ret = btrfs_update_inode(trans, root, inode);
4326 if (ret) {
4327 if (printk_ratelimit()) {
4328 printk(KERN_ERR "btrfs: failed to "
4329 "dirty inode %lu error %d\n",
4330 inode->i_ino, ret);
4331 }
4332 }
4333 }
4024 btrfs_end_transaction(trans, root); 4334 btrfs_end_transaction(trans, root);
4025} 4335}
4026 4336
@@ -4138,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4138 * btrfs_get_inode_index_count has an explanation for the magic 4448 * btrfs_get_inode_index_count has an explanation for the magic
4139 * number 4449 * number
4140 */ 4450 */
4141 init_btrfs_i(inode);
4142 BTRFS_I(inode)->index_cnt = 2; 4451 BTRFS_I(inode)->index_cnt = 2;
4143 BTRFS_I(inode)->root = root; 4452 BTRFS_I(inode)->root = root;
4144 BTRFS_I(inode)->generation = trans->transid; 4453 BTRFS_I(inode)->generation = trans->transid;
@@ -4167,16 +4476,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4167 if (ret != 0) 4476 if (ret != 0)
4168 goto fail; 4477 goto fail;
4169 4478
4170 inode->i_uid = current_fsuid(); 4479 inode_init_owner(inode, dir, mode);
4171
4172 if (dir && (dir->i_mode & S_ISGID)) {
4173 inode->i_gid = dir->i_gid;
4174 if (S_ISDIR(mode))
4175 mode |= S_ISGID;
4176 } else
4177 inode->i_gid = current_fsgid();
4178
4179 inode->i_mode = mode;
4180 inode->i_ino = objectid; 4480 inode->i_ino = objectid;
4181 inode_set_bytes(inode, 0); 4481 inode_set_bytes(inode, 0);
4182 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4482 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
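The open-coded owner setup is replaced by inode_init_owner(), the generic VFS helper implementing the same rule: the uid comes from the creating task, while a setgid parent directory donates its gid and keeps the setgid bit on new subdirectories. A user-space model of the rule the helper centralizes (toy types, not the VFS ones):

    #include <stdio.h>

    #define S_ISGID 02000
    #define S_IFDIR 040000

    struct toy_inode { int uid, gid, mode; };

    static void init_owner(struct toy_inode *inode, const struct toy_inode *dir,
                           int mode, int fsuid, int fsgid)
    {
        inode->uid = fsuid;
        if (dir && (dir->mode & S_ISGID)) {
            inode->gid = dir->gid;              /* inherit the group */
            if ((mode & S_IFDIR) == S_IFDIR)
                mode |= S_ISGID;                /* subdirs stay setgid */
        } else {
            inode->gid = fsgid;
        }
        inode->mode = mode;
    }

    int main(void)
    {
        struct toy_inode dir = { 0, 100, S_IFDIR | S_ISGID };
        struct toy_inode child;

        init_owner(&child, &dir, S_IFDIR | 0755, 1000, 1000);
        printf("gid=%d setgid=%d\n", child.gid, !!(child.mode & S_ISGID));
        return 0;
    }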
@@ -4302,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4302 if (!new_valid_dev(rdev)) 4602 if (!new_valid_dev(rdev))
4303 return -EINVAL; 4603 return -EINVAL;
4304 4604
4605 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4606 if (err)
4607 return err;
4608
4305 /* 4609 /*
4306 * 2 for inode item and ref 4610 * 2 for inode item and ref
4307 * 2 for dir items 4611 * 2 for dir items
4308 * 1 for xattr if selinux is on 4612 * 1 for xattr if selinux is on
4309 */ 4613 */
4310 err = btrfs_reserve_metadata_space(root, 5); 4614 trans = btrfs_start_transaction(root, 5);
4311 if (err) 4615 if (IS_ERR(trans))
4312 return err; 4616 return PTR_ERR(trans);
4313 4617
4314 trans = btrfs_start_transaction(root, 1);
4315 if (!trans)
4316 goto fail;
4317 btrfs_set_trans_block_group(trans, dir); 4618 btrfs_set_trans_block_group(trans, dir);
4318 4619
4319 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4320 if (err) {
4321 err = -ENOSPC;
4322 goto out_unlock;
4323 }
4324
4325 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4620 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4326 dentry->d_name.len, 4621 dentry->d_name.len,
4327 dentry->d_parent->d_inode->i_ino, objectid, 4622 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4350,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4350out_unlock: 4645out_unlock:
4351 nr = trans->blocks_used; 4646 nr = trans->blocks_used;
4352 btrfs_end_transaction_throttle(trans, root); 4647 btrfs_end_transaction_throttle(trans, root);
4353fail: 4648 btrfs_btree_balance_dirty(root, nr);
4354 btrfs_unreserve_metadata_space(root, 5);
4355 if (drop_inode) { 4649 if (drop_inode) {
4356 inode_dec_link_count(inode); 4650 inode_dec_link_count(inode);
4357 iput(inode); 4651 iput(inode);
4358 } 4652 }
4359 btrfs_btree_balance_dirty(root, nr);
4360 return err; 4653 return err;
4361} 4654}
4362 4655
@@ -4366,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4366 struct btrfs_trans_handle *trans; 4659 struct btrfs_trans_handle *trans;
4367 struct btrfs_root *root = BTRFS_I(dir)->root; 4660 struct btrfs_root *root = BTRFS_I(dir)->root;
4368 struct inode *inode = NULL; 4661 struct inode *inode = NULL;
4369 int err;
4370 int drop_inode = 0; 4662 int drop_inode = 0;
4663 int err;
4371 unsigned long nr = 0; 4664 unsigned long nr = 0;
4372 u64 objectid; 4665 u64 objectid;
4373 u64 index = 0; 4666 u64 index = 0;
4374 4667
4668 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4669 if (err)
4670 return err;
4375 /* 4671 /*
4376 * 2 for inode item and ref 4672 * 2 for inode item and ref
4377 * 2 for dir items 4673 * 2 for dir items
4378 * 1 for xattr if selinux is on 4674 * 1 for xattr if selinux is on
4379 */ 4675 */
4380 err = btrfs_reserve_metadata_space(root, 5); 4676 trans = btrfs_start_transaction(root, 5);
4381 if (err) 4677 if (IS_ERR(trans))
4382 return err; 4678 return PTR_ERR(trans);
4383 4679
4384 trans = btrfs_start_transaction(root, 1);
4385 if (!trans)
4386 goto fail;
4387 btrfs_set_trans_block_group(trans, dir); 4680 btrfs_set_trans_block_group(trans, dir);
4388 4681
4389 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4390 if (err) {
4391 err = -ENOSPC;
4392 goto out_unlock;
4393 }
4394
4395 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4682 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4396 dentry->d_name.len, 4683 dentry->d_name.len,
4397 dentry->d_parent->d_inode->i_ino, 4684 dentry->d_parent->d_inode->i_ino,
@@ -4423,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4423out_unlock: 4710out_unlock:
4424 nr = trans->blocks_used; 4711 nr = trans->blocks_used;
4425 btrfs_end_transaction_throttle(trans, root); 4712 btrfs_end_transaction_throttle(trans, root);
4426fail:
4427 btrfs_unreserve_metadata_space(root, 5);
4428 if (drop_inode) { 4713 if (drop_inode) {
4429 inode_dec_link_count(inode); 4714 inode_dec_link_count(inode);
4430 iput(inode); 4715 iput(inode);
@@ -4451,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4451 if (root->objectid != BTRFS_I(inode)->root->objectid) 4736 if (root->objectid != BTRFS_I(inode)->root->objectid)
4452 return -EPERM; 4737 return -EPERM;
4453 4738
4454 /*
4455 * 1 item for inode ref
4456 * 2 items for dir items
4457 */
4458 err = btrfs_reserve_metadata_space(root, 3);
4459 if (err)
4460 return err;
4461
4462 btrfs_inc_nlink(inode); 4739 btrfs_inc_nlink(inode);
4463 4740
4464 err = btrfs_set_inode_index(dir, &index); 4741 err = btrfs_set_inode_index(dir, &index);
4465 if (err) 4742 if (err)
4466 goto fail; 4743 goto fail;
4467 4744
4468 trans = btrfs_start_transaction(root, 1); 4745 /*
4746 * 1 item for inode ref
4747 * 2 items for dir items
4748 */
4749 trans = btrfs_start_transaction(root, 3);
4750 if (IS_ERR(trans)) {
4751 err = PTR_ERR(trans);
4752 goto fail;
4753 }
4469 4754
4470 btrfs_set_trans_block_group(trans, dir); 4755 btrfs_set_trans_block_group(trans, dir);
4471 atomic_inc(&inode->i_count); 4756 atomic_inc(&inode->i_count);
@@ -4484,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4484 nr = trans->blocks_used; 4769 nr = trans->blocks_used;
4485 btrfs_end_transaction_throttle(trans, root); 4770 btrfs_end_transaction_throttle(trans, root);
4486fail: 4771fail:
4487 btrfs_unreserve_metadata_space(root, 3);
4488 if (drop_inode) { 4772 if (drop_inode) {
4489 inode_dec_link_count(inode); 4773 inode_dec_link_count(inode);
4490 iput(inode); 4774 iput(inode);
@@ -4504,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 index = 0;
 	unsigned long nr = 1;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
+
 	/*
 	 * 2 items for inode and ref
 	 * 2 items for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		err = -ENOMEM;
-		goto out_unlock;
-	}
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_fail;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -4565,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_fail:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-
-out_unlock:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
@@ -4825,6 +5098,7 @@ again:
 		}
 		flush_dcache_page(page);
 	} else if (create && PageUptodate(page)) {
+		WARN_ON(1);
 		if (!trans) {
 			kunmap(page);
 			free_extent_map(em);
@@ -4921,11 +5195,651 @@ out:
 	return em;
 }
 
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+						  u64 start, u64 len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key ins;
+	u64 alloc_hint;
+	int ret;
+
+	btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+
+	trans = btrfs_join_transaction(root, 0);
+	if (!trans)
+		return ERR_PTR(-ENOMEM);
+
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	alloc_hint = get_extent_allocation_hint(inode, start, len);
+	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+				   alloc_hint, (u64)-1, &ins, 1);
+	if (ret) {
+		em = ERR_PTR(ret);
+		goto out;
+	}
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		em = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	em->start = start;
+	em->orig_start = em->start;
+	em->len = ins.offset;
+
+	em->block_start = ins.objectid;
+	em->block_len = ins.offset;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	while (1) {
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		if (ret != -EEXIST)
+			break;
+		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+	}
+
+	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+					   ins.offset, ins.offset, 0);
+	if (ret) {
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+		em = ERR_PTR(ret);
+	}
+out:
+	btrfs_end_transaction(trans, root);
+	return em;
+}
+
+/*
+ * returns 1 when the nocow is safe, < 1 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+				      struct inode *inode, u64 offset, u64 len)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct extent_buffer *leaf;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 disk_bytenr;
+	u64 backref_offset;
+	u64 extent_end;
+	u64 num_bytes;
+	int slot;
+	int found_type;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+				       offset, 0);
+	if (ret < 0)
+		goto out;
+
+	slot = path->slots[0];
+	if (ret == 1) {
+		if (slot == 0) {
+			/* can't find the item, must cow */
+			ret = 0;
+			goto out;
+		}
+		slot--;
+	}
+	ret = 0;
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != inode->i_ino ||
+	    key.type != BTRFS_EXTENT_DATA_KEY) {
+		/* not our file or wrong item type, must cow */
+		goto out;
+	}
+
+	if (key.offset > offset) {
+		/* Wrong offset, must cow */
+		goto out;
+	}
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	found_type = btrfs_file_extent_type(leaf, fi);
+	if (found_type != BTRFS_FILE_EXTENT_REG &&
+	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+		/* not a regular extent, must cow */
+		goto out;
+	}
+	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	if (extent_end < offset + len) {
+		/* extent doesn't include our full range, must cow */
+		goto out;
+	}
+
+	if (btrfs_extent_readonly(root, disk_bytenr))
+		goto out;
+
+	/*
+	 * look for other files referencing this extent, if we
+	 * find any we must cow
+	 */
+	if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+				  key.offset - backref_offset, disk_bytenr))
+		goto out;
+
+	/*
+	 * adjust disk_bytenr and num_bytes to cover just the bytes
+	 * in this extent we are about to write.  If there
+	 * are any csums in that range we have to cow in order
+	 * to keep the csums correct
+	 */
+	disk_bytenr += backref_offset;
+	disk_bytenr += offset - key.offset;
+	num_bytes = min(offset + len, extent_end) - offset;
+	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+		goto out;
+	/*
+	 * all of the above have passed, it is safe to overwrite this extent
+	 * without cow
+	 */
+	ret = 1;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start = iblock << inode->i_blkbits;
+	u64 len = bh_result->b_size;
+	struct btrfs_trans_handle *trans;
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	/*
+	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+	 * io.  INLINE is special, and we could probably kludge it in here, but
+	 * it's still buffered so for safety lets just fall back to the generic
+	 * buffered path.
+	 *
+	 * For COMPRESSED we _have_ to read the entire extent in so we can
+	 * decompress it, so there will be buffering required no matter what we
+	 * do, so go ahead and fallback to buffered.
+	 *
+	 * We return -ENOTBLK because thats what makes DIO go ahead and go back
+	 * to buffered IO.  Don't blame me, this is the price we pay for using
+	 * the generic code.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	    em->block_start == EXTENT_MAP_INLINE) {
+		free_extent_map(em);
+		return -ENOTBLK;
+	}
+
+	/* Just a good old fashioned hole, return */
+	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+		free_extent_map(em);
+		/* DIO will do one hole at a time, so just unlock a sector */
+		unlock_extent(&BTRFS_I(inode)->io_tree, start,
+			      start + root->sectorsize - 1, GFP_NOFS);
+		return 0;
+	}
+
+	/*
+	 * We don't allocate a new extent in the following cases
+	 *
+	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
+	 * existing extent.
+	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
+	 * just use the extent.
+	 *
+	 */
+	if (!create) {
+		len = em->len - (start - em->start);
+		goto map;
+	}
+
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+	     em->block_start != EXTENT_MAP_HOLE)) {
+		int type;
+		int ret;
+		u64 block_start;
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			type = BTRFS_ORDERED_PREALLOC;
+		else
+			type = BTRFS_ORDERED_NOCOW;
+		len = min(len, em->len - (start - em->start));
+		block_start = em->block_start + (start - em->start);
+
+		/*
+		 * we're not going to log anything, but we do need
+		 * to make sure the current transaction stays open
+		 * while we look for nocow cross refs
+		 */
+		trans = btrfs_join_transaction(root, 0);
+		if (!trans)
+			goto must_cow;
+
+		if (can_nocow_odirect(trans, inode, start, len) == 1) {
+			ret = btrfs_add_ordered_extent_dio(inode, start,
+					   block_start, len, len, type);
+			btrfs_end_transaction(trans, root);
+			if (ret) {
+				free_extent_map(em);
+				return ret;
+			}
+			goto unlock;
+		}
+		btrfs_end_transaction(trans, root);
+	}
+must_cow:
+	/*
+	 * this will cow the extent, reset the len in case we changed
+	 * it above
+	 */
+	len = bh_result->b_size;
+	free_extent_map(em);
+	em = btrfs_new_extent_direct(inode, start, len);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+	len = min(len, em->len - (start - em->start));
+unlock:
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+			 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+			 0, NULL, GFP_NOFS);
+map:
+	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+		inode->i_blkbits;
+	bh_result->b_size = len;
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		set_buffer_new(bh_result);
+
+	free_extent_map(em);
+
+	return 0;
+}
+
+struct btrfs_dio_private {
+	struct inode *inode;
+	u64 logical_offset;
+	u64 disk_bytenr;
+	u64 bytes;
+	u32 *csums;
+	void *private;
+};
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start;
+	u32 *private = dip->csums;
+
+	start = dip->logical_offset;
+	do {
+		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+			struct page *page = bvec->bv_page;
+			char *kaddr;
+			u32 csum = ~(u32)0;
+			unsigned long flags;
+
+			local_irq_save(flags);
+			kaddr = kmap_atomic(page, KM_IRQ0);
+			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+					       csum, bvec->bv_len);
+			btrfs_csum_final(csum, (char *)&csum);
+			kunmap_atomic(kaddr, KM_IRQ0);
+			local_irq_restore(flags);
+
+			flush_dcache_page(bvec->bv_page);
+			if (csum != *private) {
+				printk(KERN_ERR "btrfs csum failed ino %lu off"
+				       " %llu csum %u private %u\n",
+				       inode->i_ino, (unsigned long long)start,
+				       csum, *private);
+				err = -EIO;
+			}
+		}
+
+		start += bvec->bv_len;
+		private++;
+		bvec++;
+	} while (bvec <= bvec_end);
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+		      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_ordered_extent *ordered = NULL;
+	struct extent_state *cached_state = NULL;
+	int ret;
+
+	if (err)
+		goto out_done;
+
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+					     dip->logical_offset, dip->bytes);
+	if (!ret)
+		goto out_done;
+
+	BUG_ON(!ordered);
+
+	trans = btrfs_join_transaction(root, 1);
+	if (!trans) {
+		err = -ENOMEM;
+		goto out;
+	}
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+		if (!ret)
+			ret = btrfs_update_inode(trans, root, inode);
+		err = ret;
+		goto out;
+	}
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			 ordered->file_offset + ordered->len - 1, 0,
+			 &cached_state, GFP_NOFS);
+
+	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+		ret = btrfs_mark_extent_written(trans, inode,
+						ordered->file_offset,
+						ordered->file_offset +
+						ordered->len);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
+	} else {
+		ret = insert_reserved_file_extent(trans, inode,
+						  ordered->file_offset,
+						  ordered->start,
+						  ordered->disk_len,
+						  ordered->len,
+						  ordered->len,
+						  0, 0, 0,
+						  BTRFS_FILE_EXTENT_REG);
+		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+				   ordered->file_offset, ordered->len);
+		if (ret) {
+			err = ret;
+			WARN_ON(1);
+			goto out_unlock;
+		}
+	}
+
+	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+	btrfs_ordered_update_i_size(inode, 0, ordered);
+	btrfs_update_inode(trans, root, inode);
+out_unlock:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			     ordered->file_offset + ordered->len - 1,
+			     &cached_state, GFP_NOFS);
+out:
+	btrfs_delalloc_release_metadata(inode, ordered->len);
+	btrfs_end_transaction(trans, root);
+	btrfs_put_ordered_extent(ordered);
+	btrfs_put_ordered_extent(ordered);
+out_done:
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+					      struct bio *bio, int mirror_num,
+					      unsigned long bio_flags, u64 offset)
+{
+	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+	BUG_ON(ret);
+	return 0;
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+				loff_t file_offset)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_dio_private *dip;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	u64 start;
+	int skip_sum;
+	int write = rw & (1 << BIO_RW);
+	int ret = 0;
+
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	dip = kmalloc(sizeof(*dip), GFP_NOFS);
+	if (!dip) {
+		ret = -ENOMEM;
+		goto free_ordered;
+	}
+	dip->csums = NULL;
+
+	if (!skip_sum) {
+		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+		if (!dip->csums) {
+			ret = -ENOMEM;
+			goto free_ordered;
+		}
+	}
+
+	dip->private = bio->bi_private;
+	dip->inode = inode;
+	dip->logical_offset = file_offset;
+
+	start = dip->logical_offset;
+	dip->bytes = 0;
+	do {
+		dip->bytes += bvec->bv_len;
+		bvec++;
+	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+
+	dip->disk_bytenr = (u64)bio->bi_sector << 9;
+	bio->bi_private = dip;
+
+	if (write)
+		bio->bi_end_io = btrfs_endio_direct_write;
+	else
+		bio->bi_end_io = btrfs_endio_direct_read;
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	if (ret)
+		goto out_err;
+
+	if (write && !skip_sum) {
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+				   inode, rw, bio, 0, 0,
+				   dip->logical_offset,
+				   __btrfs_submit_bio_start_direct_io,
+				   __btrfs_submit_bio_done);
+		if (ret)
+			goto out_err;
+		return;
+	} else if (!skip_sum)
+		btrfs_lookup_bio_sums_dio(root, inode, bio,
+					  dip->logical_offset, dip->csums);
+
+	ret = btrfs_map_bio(root, rw, bio, 0, 1);
+	if (ret)
+		goto out_err;
+	return;
+out_err:
+	kfree(dip->csums);
+	kfree(dip);
+free_ordered:
+	/*
+	 * If this is a write, we need to clean up the reserved space and kill
+	 * the ordered extent.
+	 */
+	if (write) {
+		struct btrfs_ordered_extent *ordered;
+		ordered = btrfs_lookup_ordered_extent(inode,
+						      dip->logical_offset);
+		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+			btrfs_free_reserved_extent(root, ordered->start,
+						   ordered->disk_len);
+		btrfs_put_ordered_extent(ordered);
+		btrfs_put_ordered_extent(ordered);
+	}
+	bio_endio(bio, ret);
+}
+
+static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+			const struct iovec *iov, loff_t offset,
+			unsigned long nr_segs)
+{
+	int seg;
+	size_t size;
+	unsigned long addr;
+	unsigned blocksize_mask = root->sectorsize - 1;
+	ssize_t retval = -EINVAL;
+	loff_t end = offset;
+
+	if (offset & blocksize_mask)
+		goto out;
+
+	/* Check the memory alignment.  Blocks cannot straddle pages */
+	for (seg = 0; seg < nr_segs; seg++) {
+		addr = (unsigned long)iov[seg].iov_base;
+		size = iov[seg].iov_len;
+		end += size;
+		if ((addr & blocksize_mask) || (size & blocksize_mask))
+			goto out;
+	}
+	retval = 0;
+out:
+	return retval;
+}
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
-	return -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
+	u64 lockstart, lockend;
+	ssize_t ret;
+	int writing = rw & WRITE;
+	int write_bits = 0;
+	size_t count = iov_length(iov, nr_segs);
+
+	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+			    offset, nr_segs)) {
+		return 0;
+	}
+
+	lockstart = offset;
+	lockend = offset + count - 1;
+
+	if (writing) {
+		ret = btrfs_delalloc_reserve_space(inode, count);
+		if (ret)
+			goto out;
+	}
+
+	while (1) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, &cached_state, GFP_NOFS);
+		/*
+		 * We're concerned with the entire range that we're going to be
+		 * doing DIO to, so we need to make sure theres no ordered
+		 * extents in this range.
+		 */
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+		if (!ordered)
+			break;
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     &cached_state, GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		cond_resched();
+	}
+
+	/*
+	 * we don't use btrfs_set_extent_delalloc because we don't want
+	 * the dirty or uptodate bits
+	 */
+	if (writing) {
+		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     EXTENT_DELALLOC, 0, NULL, &cached_state,
+				     GFP_NOFS);
+		if (ret) {
+			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+					 lockend, EXTENT_LOCKED | write_bits,
+					 1, 0, &cached_state, GFP_NOFS);
+			goto out;
+		}
+	}
+
+	free_extent_state(cached_state);
+	cached_state = NULL;
+
+	ret = __blockdev_direct_IO(rw, iocb, inode,
+		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+		   btrfs_submit_direct, 0);
+
+	if (ret < 0 && ret != -EIOCBQUEUED) {
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+			      offset + iov_length(iov, nr_segs) - 1,
+			      EXTENT_LOCKED | write_bits, 1, 0,
+			      &cached_state, GFP_NOFS);
+	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+		/*
+		 * We're falling back to buffered, unlock the section we didn't
+		 * do IO on.
+		 */
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+			      offset + iov_length(iov, nr_segs) - 1,
+			      EXTENT_LOCKED | write_bits, 1, 0,
+			      &cached_state, GFP_NOFS);
+	}
+out:
+	free_extent_state(cached_state);
+	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
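
Taken together, the direct I/O additions above form one pipeline: check_direct_IO() rejects requests that are not sector aligned, btrfs_direct_IO() locks the extent range and flushes any ordered extents inside it, and __blockdev_direct_IO() then calls back into btrfs_get_blocks_direct() per block range and btrfs_submit_direct() per bio. Note that when the alignment check fails, btrfs_direct_IO() returns 0 rather than an error, which is what makes the generic code fall back to buffered I/O. A condensed sketch of the alignment invariant being enforced (assuming sectorsize is a power of two):

	unsigned blocksize_mask = root->sectorsize - 1;

	/* the file offset and every iovec base/length must be
	 * multiples of the sector size */
	if (offset & blocksize_mask)
		return -EINVAL;
	for (seg = 0; seg < nr_segs; seg++) {
		unsigned long addr = (unsigned long)iov[seg].iov_base;
		if ((addr | iov[seg].iov_len) & blocksize_mask)
			return -EINVAL;
	}
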
@@ -5089,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_start;
 	u64 page_end;
 
-	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret) {
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
@@ -5098,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out;
 	}
 
-	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-	if (ret) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
 	lock_page(page);
@@ -5114,7 +6021,6 @@ again:
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start >= size)) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
@@ -5155,7 +6061,6 @@ again:
 		unlock_extent_cached(io_tree, page_start, page_end,
 				     &cached_state, GFP_NOFS);
 		ret = VM_FAULT_SIGBUS;
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		goto out_unlock;
 	}
 	ret = 0;
@@ -5182,10 +6087,10 @@ again:
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
 out_unlock:
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	if (!ret)
 		return VM_FAULT_LOCKED;
 	unlock_page(page);
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
 	return ret;
 }
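
The page_mkwrite hunks above collapse the old split between data and metadata reservations into a single pair of calls, which also simplifies the error paths: one release undoes one reserve. The pairing, sketched with the names from this diff:

	/* reserve data and delalloc metadata for one dirtied page */
	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
	if (ret)
		return ret;	/* nothing to unwind */

	/* ... dirty the page; on any failure, hand the space back ... */
	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
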
@@ -5210,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = root->orphan_block_rsv;
 
 	/*
 	 * setattr is responsible for setting the ordered_data_close flag,
@@ -5234,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_add_ordered_operation(trans, root, inode);
 
 	while (1) {
+		if (!trans) {
+			trans = btrfs_start_transaction(root, 0);
+			BUG_ON(IS_ERR(trans));
+			btrfs_set_trans_block_group(trans, inode);
+			trans->block_rsv = root->orphan_block_rsv;
+		}
+
+		ret = btrfs_block_rsv_check(trans, root,
+					    root->orphan_block_rsv, 0, 5);
+		if (ret) {
+			BUG_ON(ret != -EAGAIN);
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+			trans = NULL;
+			continue;
+		}
+
 		ret = btrfs_truncate_inode_items(trans, root, inode,
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
@@ -5245,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
 
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
+		trans = NULL;
 		btrfs_btree_balance_dirty(root, nr);
-
-		trans = btrfs_start_transaction(root, 1);
-		btrfs_set_trans_block_group(trans, inode);
 	}
 
 	if (ret == 0 && inode->i_nlink > 0) {
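
The truncate loop above is restructured around the orphan block reserve: each pass runs inside a zero-item transaction that borrows root->orphan_block_rsv, and btrfs_block_rsv_check() verifies that roughly five items' worth of space remains. When it returns -EAGAIN the loop commits to refill the reserve and starts a fresh handle. A stripped-down sketch of that retry shape (error handling reduced to the BUG_ONs the diff itself uses):

	while (1) {
		if (!trans) {
			trans = btrfs_start_transaction(root, 0);
			trans->block_rsv = root->orphan_block_rsv;
		}
		if (btrfs_block_rsv_check(trans, root,
					  root->orphan_block_rsv, 0, 5)) {
			/* reserve ran low: commit to refill, then retry */
			btrfs_commit_transaction(trans, root);
			trans = NULL;
			continue;
		}
		/* ... drop a batch of items, end the handle, loop ... */
	}
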
@@ -5309,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 struct inode *btrfs_alloc_inode(struct super_block *sb)
 {
 	struct btrfs_inode *ei;
+	struct inode *inode;
 
 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
+	ei->root = NULL;
+	ei->space_info = NULL;
+	ei->generation = 0;
+	ei->sequence = 0;
 	ei->last_trans = 0;
 	ei->last_sub_trans = 0;
 	ei->logged_trans = 0;
-	ei->outstanding_extents = 0;
-	ei->reserved_extents = 0;
-	ei->root = NULL;
+	ei->delalloc_bytes = 0;
+	ei->reserved_bytes = 0;
+	ei->disk_i_size = 0;
+	ei->flags = 0;
+	ei->index_cnt = (u64)-1;
+	ei->last_unlink_trans = 0;
+
 	spin_lock_init(&ei->accounting_lock);
+	atomic_set(&ei->outstanding_extents, 0);
+	ei->reserved_extents = 0;
+
+	ei->ordered_data_close = 0;
+	ei->orphan_meta_reserved = 0;
+	ei->dummy_inode = 0;
+	ei->force_compress = 0;
+
+	inode = &ei->vfs_inode;
+	extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+	extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+	mutex_init(&ei->log_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->i_orphan);
+	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->ordered_operations);
-	return &ei->vfs_inode;
+	RB_CLEAR_NODE(&ei->rb_node);
+
+	return inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
@@ -5333,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
 
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
+	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+	WARN_ON(BTRFS_I(inode)->reserved_extents);
 
 	/*
 	 * This can happen where we create an inode, but somebody else also
@@ -5353,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
 		spin_unlock(&root->fs_info->ordered_extent_lock);
 	}
 
-	spin_lock(&root->list_lock);
+	spin_lock(&root->orphan_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
 		       inode->i_ino);
 		list_del_init(&BTRFS_I(inode)->i_orphan);
 	}
-	spin_unlock(&root->list_lock);
+	spin_unlock(&root->orphan_lock);
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5384,7 +6334,6 @@ free:
 void btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-
 	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
 		generic_delete_inode(inode);
 	else
@@ -5481,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
-
-	/*
-	 * We want to reserve the absolute worst case amount of items.  So if
-	 * both inodes are subvols and we need to unlink them then that would
-	 * require 4 item modifications, but if they are both normal inodes it
-	 * would require 5 item modifications, so we'll assume their normal
-	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
-	 * should cover the worst case number of items we'll modify.
-	 */
-	ret = btrfs_reserve_metadata_space(root, 11);
-	if (ret)
-		return ret;
-
 	/*
 	 * we're using rename to replace one file with another.
 	 * and the replacement file is large.  Start IO on it now so
@@ -5506,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/* close the racy window with snapshot create/destroy ioctl */
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		down_read(&root->fs_info->subvol_sem);
+	/*
+	 * We want to reserve the absolute worst case amount of items.  So if
+	 * both inodes are subvols and we need to unlink them then that would
+	 * require 4 item modifications, but if they are both normal inodes it
+	 * would require 5 item modifications, so we'll assume their normal
+	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+	 * should cover the worst case number of items we'll modify.
+	 */
+	trans = btrfs_start_transaction(root, 20);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, new_dir);
 
 	if (dest != root)
@@ -5606,7 +6552,6 @@ out_fail:
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		up_read(&root->fs_info->subvol_sem);
 
-	btrfs_unreserve_metadata_space(root, 11);
 	return ret;
 }
 
@@ -5658,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	return 0;
 }
 
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+{
+	struct btrfs_inode *binode;
+	struct inode *inode = NULL;
+
+	spin_lock(&root->fs_info->delalloc_lock);
+	while (!list_empty(&root->fs_info->delalloc_inodes)) {
+		binode = list_entry(root->fs_info->delalloc_inodes.next,
+				    struct btrfs_inode, delalloc_inodes);
+		inode = igrab(&binode->vfs_inode);
+		if (inode) {
+			list_move_tail(&binode->delalloc_inodes,
+				       &root->fs_info->delalloc_inodes);
+			break;
+		}
+
+		list_del_init(&binode->delalloc_inodes);
+		cond_resched_lock(&root->fs_info->delalloc_lock);
+	}
+	spin_unlock(&root->fs_info->delalloc_lock);
+
+	if (inode) {
+		write_inode_now(inode, 0);
+		if (delay_iput)
+			btrfs_add_delayed_iput(inode);
+		else
+			iput(inode);
+		return 1;
+	}
+	return 0;
+}
+
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 			 const char *symname)
 {
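
btrfs_start_one_delalloc_inode() above uses the usual safe list walk: take delalloc_lock, pin the inode with igrab() so it cannot be evicted once the lock is dropped, rotate it to the tail so repeated calls make progress, and only then issue the writeback outside the lock. A hypothetical caller that drains the whole list one inode at a time could look like:

	/* flush delalloc inodes while there is work to do;
	 * the return value is 1 for each inode flushed */
	while (btrfs_start_one_delalloc_inode(root, 0))
		cond_resched();
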
@@ -5681,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
 	/*
 	 * 2 items for inode item and ref
 	 * 2 items for dir items
 	 * 1 item for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto out_fail;
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -5772,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-out_fail:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -5782,36 +6751,28 @@ out_fail:
 	return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
-			       u64 alloc_hint, int mode, loff_t actual_len)
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
-	u64 alloc_size;
 	u64 cur_offset = start;
-	u64 num_bytes = end - start;
 	int ret = 0;
-	u64 i_size;
 
 	while (num_bytes > 0) {
-		alloc_size = min(num_bytes, root->fs_info->max_extent);
-
-		trans = btrfs_start_transaction(root, 1);
-
-		ret = btrfs_reserve_extent(trans, root, alloc_size,
-					   root->sectorsize, 0, alloc_hint,
-					   (u64)-1, &ins, 1);
-		if (ret) {
-			WARN_ON(1);
-			goto stop_trans;
+		trans = btrfs_start_transaction(root, 3);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
 		}
 
-		ret = btrfs_reserve_metadata_space(root, 3);
+		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+					   0, *alloc_hint, (u64)-1, &ins, 1);
 		if (ret) {
-			btrfs_free_reserved_extent(root, ins.objectid,
-						   ins.offset);
-			goto stop_trans;
+			btrfs_end_transaction(trans, root);
+			break;
 		}
 
 		ret = insert_reserved_file_extent(trans, inode,
@@ -5825,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
-		alloc_hint = ins.objectid + ins.offset;
+		*alloc_hint = ins.objectid + ins.offset;
 
 		inode->i_ctime = CURRENT_TIME;
 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    (actual_len > inode->i_size) &&
 		    (cur_offset > inode->i_size)) {
-
 			if (cur_offset > actual_len)
-				i_size = actual_len;
+				i_size_write(inode, actual_len);
 			else
-				i_size = cur_offset;
-			i_size_write(inode, i_size);
-			btrfs_ordered_update_i_size(inode, i_size, NULL);
+				i_size_write(inode, cur_offset);
+			i_size_write(inode, cur_offset);
+			btrfs_ordered_update_i_size(inode, cur_offset, NULL);
 		}
 
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
 
 		btrfs_end_transaction(trans, root);
-		btrfs_unreserve_metadata_space(root, 3);
 	}
 	return ret;
-
-stop_trans:
-	btrfs_end_transaction(trans, root);
-	return ret;
-
 }
 
 static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5885,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		goto out;
 	}
 
-	ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
-					  alloc_end - alloc_start);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 	if (ret)
 		goto out;
 
@@ -5931,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = prealloc_file_range(inode,
-						  cur_offset, last_byte,
-						  alloc_hint, mode, offset+len);
+			ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+							last_byte - cur_offset,
+							1 << inode->i_blkbits,
+							offset + len,
+							&alloc_hint);
 			if (ret < 0) {
 				free_extent_map(em);
 				break;
 			}
 		}
-		if (em->block_start <= EXTENT_MAP_LAST_BYTE)
-			alloc_hint = em->block_start;
 		free_extent_map(em);
 
 		cur_offset = last_byte;
@@ -5952,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_NOFS);
 
-	btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
-				       alloc_end - alloc_start);
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2845c6ceecd2..4cdb98cf26de 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
 #include <linux/security.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -48,7 +49,6 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "locking.h"
-#include "ctree.h"
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 	u64 index = 0;
 
+	ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
+				       0, &objectid);
+	if (ret)
+		return ret;
 	/*
 	 * 1 - inode item
 	 * 2 - refs
 	 * 1 - root item
 	 * 2 - dir items
 	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		return ret;
-
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-
-	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
-				       0, &objectid);
-	if (ret)
-		goto fail;
+	trans = btrfs_start_transaction(root, 6);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
 				      0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
 	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
-
-	btrfs_unreserve_metadata_space(root, 6);
 	return ret;
 }
 
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
-			   char *name, int namelen)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
 {
 	struct inode *inode;
 	struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	/*
-	 * 1 - inode item
-	 * 2 - refs
-	 * 1 - root item
-	 * 2 - dir items
-	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		goto fail;
-
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-	if (!pending_snapshot) {
-		ret = -ENOMEM;
-		btrfs_unreserve_metadata_space(root, 6);
-		goto fail;
-	}
-	pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
-	if (!pending_snapshot->name) {
-		ret = -ENOMEM;
-		kfree(pending_snapshot);
-		btrfs_unreserve_metadata_space(root, 6);
-		goto fail;
-	}
-	memcpy(pending_snapshot->name, name, namelen);
-	pending_snapshot->name[namelen] = '\0';
+	if (!pending_snapshot)
+		return -ENOMEM;
+
+	btrfs_init_block_rsv(&pending_snapshot->block_rsv);
 	pending_snapshot->dentry = dentry;
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
 	pending_snapshot->root = root;
+
+	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto fail;
+	}
+
+	ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
+	BUG_ON(ret);
+
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
-	ret = btrfs_commit_transaction(trans, root);
+	ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
 	BUG_ON(ret);
-	btrfs_unreserve_metadata_space(root, 6);
+
+	ret = pending_snapshot->error;
+	if (ret)
+		goto fail;
+
+	btrfs_orphan_cleanup(pending_snapshot->snap);
 
 	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
 	if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	d_instantiate(dentry, inode);
 	ret = 0;
 fail:
+	kfree(pending_snapshot);
 	return ret;
 }
 
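
create_snapshot() above no longer builds the snapshot itself: it queues a btrfs_pending_snapshot, whose block_rsv is filled by btrfs_snap_reserve_metadata(), on the transaction and lets the commit do the actual work, reporting back through pending_snapshot->error. The flow, compressed into a sketch with error handling omitted:

	pending = kzalloc(sizeof(*pending), GFP_NOFS);
	btrfs_init_block_rsv(&pending->block_rsv);
	pending->dentry = dentry;
	pending->root = root;

	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
	btrfs_snap_reserve_metadata(trans, pending);	/* fill the rsv */

	list_add(&pending->list, &trans->transaction->pending_snapshots);
	btrfs_commit_transaction(trans, root->fs_info->extent_root);

	ret = pending->error;	/* set while the commit ran */
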
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
 		goto out_up_read;
 
 	if (snap_src) {
-		error = create_snapshot(snap_src, dentry,
-					name, namelen);
+		error = create_snapshot(snap_src, dentry);
 	} else {
 		error = create_subvol(BTRFS_I(dir)->root, dentry,
 				      name, namelen);
@@ -511,7 +497,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 		unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
 
-		if (!em)
+		if (IS_ERR(em))
 			return 0;
 	}
 
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
 			BTRFS_I(inode)->force_compress = 1;
 
-		ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
-		if (ret) {
-			ret = -ENOSPC;
-			break;
-		}
-
-		ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       PAGE_CACHE_SIZE);
-			ret = -ENOSPC;
-			break;
-		}
+		ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+		if (ret)
+			goto err_unlock;
 again:
 		if (inode->i_size == 0 ||
 		    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
 		}
 
 		page = grab_cache_page(inode->i_mapping, i);
-		if (!page)
+		if (!page) {
+			ret = -ENOMEM;
 			goto err_reservations;
+		}
 
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
+				ret = -EIO;
 				goto err_reservations;
 			}
 		}
@@ -644,8 +623,7 @@ again:
 		wait_on_page_writeback(page);
 
 		if (PageDirty(page)) {
-			btrfs_free_reserved_data_space(root, inode,
-						       PAGE_CACHE_SIZE);
+			btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 			goto loop_unlock;
 		}
 
@@ -683,7 +661,6 @@ loop_unlock:
 		page_cache_release(page);
 		mutex_unlock(&inode->i_mutex);
 
-		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
 		i++;
 	}
@@ -713,9 +690,9 @@ loop_unlock:
 	return 0;
 
 err_reservations:
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+err_unlock:
 	mutex_unlock(&inode->i_mutex);
-	btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	return ret;
 }
 
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		       device->name, (unsigned long long)new_size);
 
 	if (new_size > old_size) {
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 0);
 		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
 	} else {
@@ -1212,6 +1189,9 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 		return -EPERM;
 
 	args = kmalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
 	if (copy_from_user(args, argp, sizeof(*args))) {
 		kfree(args);
 		return -EFAULT;
@@ -1297,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	if (err)
 		goto out_up_write;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+	trans->block_rsv = &root->fs_info->global_block_rsv;
+
 	ret = btrfs_unlink_subvol(trans, root, dir,
 				  dest->root_key.objectid,
 				  dentry->d_name.name,
@@ -1311,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	dest->root_item.drop_level = 0;
 	btrfs_set_root_refs(&dest->root_item, 0);
 
-	ret = btrfs_insert_orphan_item(trans,
-				       root->fs_info->tree_root,
-				       dest->root_key.objectid);
-	BUG_ON(ret);
+	if (!xchg(&dest->orphan_item_inserted, 1)) {
+		ret = btrfs_insert_orphan_item(trans,
+					       root->fs_info->tree_root,
+					       dest->root_key.objectid);
+		BUG_ON(ret);
+	}
 
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
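
The xchg() above is a one-shot latch: it atomically stores 1 and hands back the previous value, so exactly one path inserts the orphan item for a given root even if deletion races with another caller. The generic shape of the idiom (flag and do_once() are placeholders, not btrfs symbols):

	/* only the caller that observes the old value 0 proceeds */
	if (!xchg(&flag, 1))
		do_once();	/* runs at most once across all racers */
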
@@ -1355,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 			ret = -EPERM;
 			goto out;
 		}
-		btrfs_defrag_root(root, 0);
-		btrfs_defrag_root(root->fs_info->extent_root, 0);
+		ret = btrfs_defrag_root(root, 0);
+		if (ret)
+			goto out;
+		ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
 		break;
 	case S_IFREG:
 		if (!(file->f_mode & FMODE_WRITE)) {
@@ -1375,6 +1365,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 					   sizeof(*range))) {
 				ret = -EFAULT;
 				kfree(range);
+				goto out;
 			}
 			/* compression requires us to start the IO */
 			if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
@@ -1385,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 			/* the rest are all set to zero by kzalloc */
 			range->len = (u64)-1;
 		}
-		btrfs_defrag_file(file, range);
+		ret = btrfs_defrag_file(file, range);
 		kfree(range);
 		break;
+	default:
+		ret = -EINVAL;
 	}
 out:
 	mnt_drop_write(file->f_path.mnt);
@@ -1477,12 +1470,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		ret = -EBADF;
 		goto out_drop_write;
 	}
+
 	src = src_file->f_dentry->d_inode;
 
 	ret = -EINVAL;
 	if (src == inode)
 		goto out_fput;
 
+	/* the src must be open for reading */
+	if (!(src_file->f_mode & FMODE_READ))
+		goto out_fput;
+
 	ret = -EISDIR;
 	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
 		goto out_fput;
@@ -1541,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		btrfs_wait_ordered_range(src, off, off+len);
 	}
 
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-
-	/* punch hole in destination first */
-	btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
-
 	/* clone data */
 	key.objectid = src->i_ino;
 	key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1557,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		 * note the key will change type as we walk through the
 		 * tree.
 		 */
-		ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
 
@@ -1620,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			new_key.objectid = inode->i_ino;
 			new_key.offset = key.offset + destoff - off;
 
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+
 			if (type == BTRFS_FILE_EXTENT_REG ||
 			    type == BTRFS_FILE_EXTENT_PREALLOC) {
+				if (off > key.offset) {
+					datao += off - key.offset;
+					datal -= off - key.offset;
+				}
+
+				if (key.offset + datal > off + len)
+					datal = off + len - key.offset;
+
+				ret = btrfs_drop_extents(trans, inode,
+							 new_key.offset,
+							 new_key.offset + datal,
+							 &hint_byte, 1);
+				BUG_ON(ret);
+
 				ret = btrfs_insert_empty_item(trans, root, path,
 							      &new_key, size);
-				if (ret)
-					goto out;
+				BUG_ON(ret);
 
 				leaf = path->nodes[0];
 				slot = path->slots[0];
@@ -1636,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
 
-				if (off > key.offset) {
-					datao += off - key.offset;
-					datal -= off - key.offset;
-				}
-
-				if (key.offset + datal > off + len)
-					datal = off + len - key.offset;
-
 				/* disko == 0 means it's a hole */
 				if (!disko)
 					datao = 0;
@@ -1674,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1674 1677
1675 if (comp && (skip || trim)) { 1678 if (comp && (skip || trim)) {
1676 ret = -EINVAL; 1679 ret = -EINVAL;
1680 btrfs_end_transaction(trans, root);
1677 goto out; 1681 goto out;
1678 } 1682 }
1679 size -= skip + trim; 1683 size -= skip + trim;
1680 datal -= skip + trim; 1684 datal -= skip + trim;
1685
1686 ret = btrfs_drop_extents(trans, inode,
1687 new_key.offset,
1688 new_key.offset + datal,
1689 &hint_byte, 1);
1690 BUG_ON(ret);
1691
1681 ret = btrfs_insert_empty_item(trans, root, path, 1692 ret = btrfs_insert_empty_item(trans, root, path,
1682 &new_key, size); 1693 &new_key, size);
1683 if (ret) 1694 BUG_ON(ret);
1684 goto out;
1685 1695
1686 if (skip) { 1696 if (skip) {
1687 u32 start = 1697 u32 start =
@@ -1699,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1699 } 1709 }
1700 1710
1701 btrfs_mark_buffer_dirty(leaf); 1711 btrfs_mark_buffer_dirty(leaf);
1702 } 1712 btrfs_release_path(root, path);
1703 1713
1714 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1715 if (new_key.offset + datal > inode->i_size)
1716 btrfs_i_size_write(inode,
1717 new_key.offset + datal);
1718 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1719 ret = btrfs_update_inode(trans, root, inode);
1720 BUG_ON(ret);
1721 btrfs_end_transaction(trans, root);
1722 }
1704next: 1723next:
1705 btrfs_release_path(root, path); 1724 btrfs_release_path(root, path);
1706 key.offset++; 1725 key.offset++;
@@ -1708,17 +1727,7 @@ next:
1708 ret = 0; 1727 ret = 0;
1709out: 1728out:
1710 btrfs_release_path(root, path); 1729 btrfs_release_path(root, path);
1711 if (ret == 0) {
1712 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1713 if (destoff + olen > inode->i_size)
1714 btrfs_i_size_write(inode, destoff + olen);
1715 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1716 ret = btrfs_update_inode(trans, root, inode);
1717 }
1718 btrfs_end_transaction(trans, root);
1719 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1730 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1720 if (ret)
1721 vmtruncate(inode, 0);
1722out_unlock: 1731out_unlock:
1723 mutex_unlock(&src->i_mutex); 1732 mutex_unlock(&src->i_mutex);
1724 mutex_unlock(&inode->i_mutex); 1733 mutex_unlock(&inode->i_mutex);
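Note on the clone ioctl hunks above: the loop now takes a short-lived transaction per source item. btrfs_start_transaction() moves inside the walk, btrfs_drop_extents() punches only the range each item will cover, the tree search runs outside any transaction (trans replaced by NULL), and i_size, ctime/mtime and the inode item are updated and the transaction ended before advancing to the next key; the new FMODE_READ check also closes a hole where a write-only fd could be used as the clone source. The datao/datal arithmetic clamps each source extent to the requested window before re-insertion at destoff. A minimal userspace sketch of just that clamp, with hypothetical values (not btrfs code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Clamp an extent item starting at key_offset, of length *datal bytes,
 * to the cloned window [off, off + len); *datao is the offset into the
 * on-disk extent. Mirrors the datao/datal adjustment in the hunks above. */
static void clamp_extent(uint64_t off, uint64_t len, uint64_t key_offset,
			 uint64_t *datao, uint64_t *datal)
{
	if (off > key_offset) {
		*datao += off - key_offset;	/* skip bytes before the window */
		*datal -= off - key_offset;
	}
	if (key_offset + *datal > off + len)	/* trim bytes past the window */
		*datal = off + len - key_offset;
}

int main(void)
{
	uint64_t datao = 0, datal = 8192;

	/* extent at file offset 0, clone window [4096, 12288) */
	clamp_extent(4096, 8192, 0, &datao, &datal);
	assert(datao == 4096 && datal == 4096);
	printf("datao=%llu datal=%llu\n",
	       (unsigned long long)datao, (unsigned long long)datal);
	return 0;
}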
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/spinlock.h> 20#include <linux/spinlock.h>
22#include <linux/page-flags.h> 21#include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a8ffecd0b491..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/gfp.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -125,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
125 return 1; 124 return 1;
126} 125}
127 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
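range_overlaps() treats both ranges as half-open intervals: [file_offset, file_offset + len) and the entry's [file_offset, file_offset + entry->len) overlap exactly when neither ends at or before the other starts. A self-contained restatement of the predicate with a couple of edge cases (illustrative names, not the kernel types):

#include <assert.h>
#include <stdint.h>

/* Half-open interval overlap: [a, a+alen) vs [b, b+blen). */
static int ranges_overlap(uint64_t a, uint64_t alen, uint64_t b, uint64_t blen)
{
	if (a + alen <= b || b + blen <= a)
		return 0;
	return 1;
}

int main(void)
{
	assert(ranges_overlap(0, 4096, 4095, 1));   /* last byte shared: overlap */
	assert(!ranges_overlap(0, 4096, 4096, 1));  /* abut exactly: no overlap */
	assert(ranges_overlap(100, 50, 0, 200));    /* containment: overlap */
	return 0;
}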
128/* 136/*
 129 * find the first ordered struct that has this offset, otherwise 137
130 * the first one less than this offset 138 * the first one less than this offset
@@ -162,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
162 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
163 * inserted. 171 * inserted.
164 */ 172 */
165int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
166 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
167{ 176{
168 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
169 struct rb_node *node; 178 struct rb_node *node;
@@ -183,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
183 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
184 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
185 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
186 /* one ref for the tree */ 198 /* one ref for the tree */
187 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
188 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
@@ -204,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
204 return 0; 216 return 0;
205} 217}
206 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
207/* 233/*
208 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
209 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -303,6 +329,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
303 struct btrfs_ordered_extent *entry) 329 struct btrfs_ordered_extent *entry)
304{ 330{
305 struct btrfs_ordered_inode_tree *tree; 331 struct btrfs_ordered_inode_tree *tree;
332 struct btrfs_root *root = BTRFS_I(inode)->root;
306 struct rb_node *node; 333 struct rb_node *node;
307 334
308 tree = &BTRFS_I(inode)->ordered_tree; 335 tree = &BTRFS_I(inode)->ordered_tree;
@@ -311,13 +338,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
311 tree->last = NULL; 338 tree->last = NULL;
312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 339 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
313 340
314 spin_lock(&BTRFS_I(inode)->accounting_lock); 341 spin_lock(&root->fs_info->ordered_extent_lock);
315 BTRFS_I(inode)->outstanding_extents--;
316 spin_unlock(&BTRFS_I(inode)->accounting_lock);
317 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
318 inode, 1);
319
320 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
321 list_del_init(&entry->root_extent_list); 342 list_del_init(&entry->root_extent_list);
322 343
323 /* 344 /*
@@ -329,7 +350,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
329 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 350 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
330 list_del_init(&BTRFS_I(inode)->ordered_operations); 351 list_del_init(&BTRFS_I(inode)->ordered_operations);
331 } 352 }
332 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 353 spin_unlock(&root->fs_info->ordered_extent_lock);
333 354
334 return 0; 355 return 0;
335} 356}
@@ -490,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
490 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
491 * for pdflush to find them 512 * for pdflush to find them
492 */ 513 */
493 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
494 if (wait) { 516 if (wait) {
495 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
496 &entry->flags)); 518 &entry->flags));
@@ -587,6 +609,47 @@ out:
587 return entry; 609 return entry;
588} 610}
589 611
612/* Since the DIO code tries to lock a wide area we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
650 return entry;
651}
652
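btrfs_lookup_ordered_range() searches at file_offset first, falls back to file_offset + len if nothing sits at or after the start, then walks forward until an entry overlaps or begins past the end of the range. A sketch of the same forward scan over a sorted array standing in for the rb-tree (toy types, not the kernel API):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct entry { uint64_t off, len; };

/* Find the first entry overlapping [off, off + len) in an array sorted
 * by ->off; returns NULL if none does. Models the rb-tree walk above. */
static const struct entry *lookup_range(const struct entry *e, size_t n,
					uint64_t off, uint64_t len)
{
	for (size_t i = 0; i < n; i++) {
		if (e[i].off >= off + len)	/* starts past the range: stop */
			return NULL;
		if (e[i].off + e[i].len > off)	/* overlaps the range */
			return &e[i];
	}
	return NULL;
}

int main(void)
{
	const struct entry tree[] = { {0, 10}, {20, 10}, {40, 10} };

	assert(lookup_range(tree, 3, 25, 5) == &tree[1]);
	assert(lookup_range(tree, 3, 12, 6) == NULL);	/* falls in a gap */
	return 0;
}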
590/* 653/*
591 * lookup and return any extent before 'file_offset'. NULL is returned 654 * lookup and return any extent before 'file_offset'. NULL is returned
592 * if none is found 655 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
140 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
143 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
144int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
145 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
146 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
151int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
152struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
153btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
154int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
155 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
156int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "ref-cache.h" 23#include "ref-cache.h"
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0b23942cbc0d..05d41e569236 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "disk-io.h" 26#include "disk-io.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -43,8 +44,12 @@ struct tree_entry {
43struct backref_node { 44struct backref_node {
44 struct rb_node rb_node; 45 struct rb_node rb_node;
45 u64 bytenr; 46 u64 bytenr;
46 /* objectid tree block owner */ 47
48 u64 new_bytenr;
49 /* objectid of tree block owner, can be not uptodate */
47 u64 owner; 50 u64 owner;
51 /* link to pending, changed or detached list */
52 struct list_head list;
48 /* list of upper level blocks reference this block */ 53 /* list of upper level blocks reference this block */
49 struct list_head upper; 54 struct list_head upper;
50 /* list of child blocks in the cache */ 55 /* list of child blocks in the cache */
@@ -55,9 +60,9 @@ struct backref_node {
55 struct extent_buffer *eb; 60 struct extent_buffer *eb;
56 /* level of tree block */ 61 /* level of tree block */
57 unsigned int level:8; 62 unsigned int level:8;
58 /* 1 if the block is root of old snapshot */ 63 /* is the block in non-reference counted tree */
59 unsigned int old_root:1; 64 unsigned int cowonly:1;
60 /* 1 if no child blocks in the cache */ 65 /* 1 if no child node in the cache */
61 unsigned int lowest:1; 66 unsigned int lowest:1;
62 /* is the extent buffer locked */ 67 /* is the extent buffer locked */
63 unsigned int locked:1; 68 unsigned int locked:1;
@@ -65,6 +70,16 @@ struct backref_node {
65 unsigned int processed:1; 70 unsigned int processed:1;
66 /* have backrefs of this block been checked */ 71 /* have backrefs of this block been checked */
67 unsigned int checked:1; 72 unsigned int checked:1;
73 /*
74 * 1 if corresponding block has been cowed but some upper
75 * level block pointers may not point to the new location
76 */
77 unsigned int pending:1;
78 /*
79 * 1 if the backref node isn't connected to any other
80 * backref node.
81 */
82 unsigned int detached:1;
68}; 83};
69 84
70/* 85/*
@@ -73,7 +88,6 @@ struct backref_node {
73struct backref_edge { 88struct backref_edge {
74 struct list_head list[2]; 89 struct list_head list[2];
75 struct backref_node *node[2]; 90 struct backref_node *node[2];
76 u64 blockptr;
77}; 91};
78 92
79#define LOWER 0 93#define LOWER 0
@@ -82,9 +96,25 @@ struct backref_edge {
82struct backref_cache { 96struct backref_cache {
83 /* red black tree of all backref nodes in the cache */ 97 /* red black tree of all backref nodes in the cache */
84 struct rb_root rb_root; 98 struct rb_root rb_root;
85 /* list of backref nodes with no child block in the cache */ 99 /* for passing backref nodes to btrfs_reloc_cow_block */
100 struct backref_node *path[BTRFS_MAX_LEVEL];
101 /*
102 * list of blocks that have been cowed but some block
103 * pointers in upper level blocks may not reflect the
104 * new location
105 */
86 struct list_head pending[BTRFS_MAX_LEVEL]; 106 struct list_head pending[BTRFS_MAX_LEVEL];
87 spinlock_t lock; 107 /* list of backref nodes with no child node */
108 struct list_head leaves;
109 /* list of blocks that have been cowed in current transaction */
110 struct list_head changed;
111 /* list of detached backref node. */
112 struct list_head detached;
113
114 u64 last_trans;
115
116 int nr_nodes;
117 int nr_edges;
88}; 118};
89 119
90/* 120/*
@@ -112,15 +142,6 @@ struct tree_block {
112 unsigned int key_ready:1; 142 unsigned int key_ready:1;
113}; 143};
114 144
115/* inode vector */
116#define INODEVEC_SIZE 16
117
118struct inodevec {
119 struct list_head list;
120 struct inode *inode[INODEVEC_SIZE];
121 int nr;
122};
123
124#define MAX_EXTENTS 128 145#define MAX_EXTENTS 128
125 146
126struct file_extent_cluster { 147struct file_extent_cluster {
@@ -137,36 +158,43 @@ struct reloc_control {
137 struct btrfs_root *extent_root; 158 struct btrfs_root *extent_root;
138 /* inode for moving data */ 159 /* inode for moving data */
139 struct inode *data_inode; 160 struct inode *data_inode;
140 struct btrfs_workers workers; 161
162 struct btrfs_block_rsv *block_rsv;
163
164 struct backref_cache backref_cache;
165
166 struct file_extent_cluster cluster;
141 /* tree blocks have been processed */ 167 /* tree blocks have been processed */
142 struct extent_io_tree processed_blocks; 168 struct extent_io_tree processed_blocks;
143 /* map start of tree root to corresponding reloc tree */ 169 /* map start of tree root to corresponding reloc tree */
144 struct mapping_tree reloc_root_tree; 170 struct mapping_tree reloc_root_tree;
145 /* list of reloc trees */ 171 /* list of reloc trees */
146 struct list_head reloc_roots; 172 struct list_head reloc_roots;
173 /* size of metadata reservation for merging reloc trees */
174 u64 merging_rsv_size;
175 /* size of relocated tree nodes */
176 u64 nodes_relocated;
177
147 u64 search_start; 178 u64 search_start;
148 u64 extents_found; 179 u64 extents_found;
149 u64 extents_skipped; 180
150 int stage; 181 int block_rsv_retries;
151 int create_reloc_root; 182
183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1;
152 unsigned int found_file_extent:1; 186 unsigned int found_file_extent:1;
153 unsigned int found_old_snapshot:1; 187 unsigned int commit_transaction:1;
154}; 188};
155 189
156/* stages of data relocation */ 190/* stages of data relocation */
157#define MOVE_DATA_EXTENTS 0 191#define MOVE_DATA_EXTENTS 0
158#define UPDATE_DATA_PTRS 1 192#define UPDATE_DATA_PTRS 1
159 193
160/* 194static void remove_backref_node(struct backref_cache *cache,
161 * merge reloc tree to corresponding fs tree in worker threads 195 struct backref_node *node);
162 */ 196static void __mark_block_processed(struct reloc_control *rc,
163struct async_merge { 197 struct backref_node *node);
164 struct btrfs_work work;
165 struct reloc_control *rc;
166 struct btrfs_root *root;
167 struct completion *done;
168 atomic_t *num_pending;
169};
170 198
171static void mapping_tree_init(struct mapping_tree *tree) 199static void mapping_tree_init(struct mapping_tree *tree)
172{ 200{
@@ -180,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
180 cache->rb_root = RB_ROOT; 208 cache->rb_root = RB_ROOT;
181 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 209 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182 INIT_LIST_HEAD(&cache->pending[i]); 210 INIT_LIST_HEAD(&cache->pending[i]);
183 spin_lock_init(&cache->lock); 211 INIT_LIST_HEAD(&cache->changed);
212 INIT_LIST_HEAD(&cache->detached);
213 INIT_LIST_HEAD(&cache->leaves);
214}
215
216static void backref_cache_cleanup(struct backref_cache *cache)
217{
218 struct backref_node *node;
219 int i;
220
221 while (!list_empty(&cache->detached)) {
222 node = list_entry(cache->detached.next,
223 struct backref_node, list);
224 remove_backref_node(cache, node);
225 }
226
227 while (!list_empty(&cache->leaves)) {
228 node = list_entry(cache->leaves.next,
229 struct backref_node, lower);
230 remove_backref_node(cache, node);
231 }
232
233 cache->last_trans = 0;
234
235 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236 BUG_ON(!list_empty(&cache->pending[i]));
237 BUG_ON(!list_empty(&cache->changed));
238 BUG_ON(!list_empty(&cache->detached));
239 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240 BUG_ON(cache->nr_nodes);
241 BUG_ON(cache->nr_edges);
242}
243
244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245{
246 struct backref_node *node;
247
248 node = kzalloc(sizeof(*node), GFP_NOFS);
249 if (node) {
250 INIT_LIST_HEAD(&node->list);
251 INIT_LIST_HEAD(&node->upper);
252 INIT_LIST_HEAD(&node->lower);
253 RB_CLEAR_NODE(&node->rb_node);
254 cache->nr_nodes++;
255 }
256 return node;
257}
258
259static void free_backref_node(struct backref_cache *cache,
260 struct backref_node *node)
261{
262 if (node) {
263 cache->nr_nodes--;
264 kfree(node);
265 }
266}
267
268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269{
270 struct backref_edge *edge;
271
272 edge = kzalloc(sizeof(*edge), GFP_NOFS);
273 if (edge)
274 cache->nr_edges++;
275 return edge;
184} 276}
185 277
186static void backref_node_init(struct backref_node *node) 278static void free_backref_edge(struct backref_cache *cache,
279 struct backref_edge *edge)
187{ 280{
188 memset(node, 0, sizeof(*node)); 281 if (edge) {
189 INIT_LIST_HEAD(&node->upper); 282 cache->nr_edges--;
190 INIT_LIST_HEAD(&node->lower); 283 kfree(edge);
191 RB_CLEAR_NODE(&node->rb_node); 284 }
192} 285}
193 286
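Allocation of backref nodes and edges is now funnelled through helpers that maintain the nr_nodes/nr_edges counters, which is what lets backref_cache_cleanup() BUG_ON() any leak at teardown. A minimal sketch of the counted-allocator pairing (assumed shape, not the kernel code):

#include <assert.h>
#include <stdlib.h>

struct cache { int nr_nodes; };
struct node  { int dummy; };

static struct node *alloc_node(struct cache *c)
{
	struct node *n = calloc(1, sizeof(*n));

	if (n)
		c->nr_nodes++;	/* count only successful allocations */
	return n;
}

static void free_node(struct cache *c, struct node *n)
{
	if (n) {
		c->nr_nodes--;
		free(n);
	}
}

int main(void)
{
	struct cache c = { 0 };
	struct node *n = alloc_node(&c);

	free_node(&c, n);
	assert(c.nr_nodes == 0);	/* cleanup-time leak check */
	return 0;
}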
194static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -249,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
249 edges[idx++] = edge; 342 edges[idx++] = edge;
250 node = edge->node[UPPER]; 343 node = edge->node[UPPER];
251 } 344 }
345 BUG_ON(node->detached);
252 *index = idx; 346 *index = idx;
253 return node; 347 return node;
254} 348}
@@ -280,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
280 return NULL; 374 return NULL;
281} 375}
282 376
377static void unlock_node_buffer(struct backref_node *node)
378{
379 if (node->locked) {
380 btrfs_tree_unlock(node->eb);
381 node->locked = 0;
382 }
383}
384
283static void drop_node_buffer(struct backref_node *node) 385static void drop_node_buffer(struct backref_node *node)
284{ 386{
285 if (node->eb) { 387 if (node->eb) {
286 if (node->locked) { 388 unlock_node_buffer(node);
287 btrfs_tree_unlock(node->eb);
288 node->locked = 0;
289 }
290 free_extent_buffer(node->eb); 389 free_extent_buffer(node->eb);
291 node->eb = NULL; 390 node->eb = NULL;
292 } 391 }
@@ -295,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
295static void drop_backref_node(struct backref_cache *tree, 394static void drop_backref_node(struct backref_cache *tree,
296 struct backref_node *node) 395 struct backref_node *node)
297{ 396{
298 BUG_ON(!node->lowest);
299 BUG_ON(!list_empty(&node->upper)); 397 BUG_ON(!list_empty(&node->upper));
300 398
301 drop_node_buffer(node); 399 drop_node_buffer(node);
400 list_del(&node->list);
302 list_del(&node->lower); 401 list_del(&node->lower);
303 402 if (!RB_EMPTY_NODE(&node->rb_node))
304 rb_erase(&node->rb_node, &tree->rb_root); 403 rb_erase(&node->rb_node, &tree->rb_root);
305 kfree(node); 404 free_backref_node(tree, node);
306} 405}
307 406
308/* 407/*
@@ -317,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
317 if (!node) 416 if (!node)
318 return; 417 return;
319 418
320 BUG_ON(!node->lowest); 419 BUG_ON(!node->lowest && !node->detached);
321 while (!list_empty(&node->upper)) { 420 while (!list_empty(&node->upper)) {
322 edge = list_entry(node->upper.next, struct backref_edge, 421 edge = list_entry(node->upper.next, struct backref_edge,
323 list[LOWER]); 422 list[LOWER]);
324 upper = edge->node[UPPER]; 423 upper = edge->node[UPPER];
325 list_del(&edge->list[LOWER]); 424 list_del(&edge->list[LOWER]);
326 list_del(&edge->list[UPPER]); 425 list_del(&edge->list[UPPER]);
327 kfree(edge); 426 free_backref_edge(cache, edge);
427
428 if (RB_EMPTY_NODE(&upper->rb_node)) {
429 BUG_ON(!list_empty(&node->upper));
430 drop_backref_node(cache, node);
431 node = upper;
432 node->lowest = 1;
433 continue;
434 }
328 /* 435 /*
329 * add the node to pending list if no other 436 * add the node to leaf node list if no other
330 * child block cached. 437 * child block cached.
331 */ 438 */
332 if (list_empty(&upper->lower)) { 439 if (list_empty(&upper->lower)) {
333 list_add_tail(&upper->lower, 440 list_add_tail(&upper->lower, &cache->leaves);
334 &cache->pending[upper->level]);
335 upper->lowest = 1; 441 upper->lowest = 1;
336 } 442 }
337 } 443 }
444
338 drop_backref_node(cache, node); 445 drop_backref_node(cache, node);
339} 446}
340 447
448static void update_backref_node(struct backref_cache *cache,
449 struct backref_node *node, u64 bytenr)
450{
451 struct rb_node *rb_node;
452 rb_erase(&node->rb_node, &cache->rb_root);
453 node->bytenr = bytenr;
454 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
455 BUG_ON(rb_node);
456}
457
458/*
459 * update backref cache after a transaction commit
460 */
461static int update_backref_cache(struct btrfs_trans_handle *trans,
462 struct backref_cache *cache)
463{
464 struct backref_node *node;
465 int level = 0;
466
467 if (cache->last_trans == 0) {
468 cache->last_trans = trans->transid;
469 return 0;
470 }
471
472 if (cache->last_trans == trans->transid)
473 return 0;
474
475 /*
476 * detached nodes are used to avoid unnecessary backref
477 * lookup. transaction commit changes the extent tree.
478 * so the detached nodes are no longer useful.
479 */
480 while (!list_empty(&cache->detached)) {
481 node = list_entry(cache->detached.next,
482 struct backref_node, list);
483 remove_backref_node(cache, node);
484 }
485
486 while (!list_empty(&cache->changed)) {
487 node = list_entry(cache->changed.next,
488 struct backref_node, list);
489 list_del_init(&node->list);
490 BUG_ON(node->pending);
491 update_backref_node(cache, node, node->new_bytenr);
492 }
493
494 /*
495 * some nodes can be left in the pending list if there were
496 * errors during processing the pending nodes.
497 */
498 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
499 list_for_each_entry(node, &cache->pending[level], list) {
500 BUG_ON(!node->pending);
501 if (node->bytenr == node->new_bytenr)
502 continue;
503 update_backref_node(cache, node, node->new_bytenr);
504 }
505 }
506
507 cache->last_trans = 0;
508 return 1;
509}
510
511static int should_ignore_root(struct btrfs_root *root)
512{
513 struct btrfs_root *reloc_root;
514
515 if (!root->ref_cows)
516 return 0;
517
518 reloc_root = root->reloc_root;
519 if (!reloc_root)
520 return 0;
521
522 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
523 root->fs_info->running_transaction->transid - 1)
524 return 0;
525 /*
526 * if there is reloc tree and it was created in previous
527 * transaction backref lookup can find the reloc tree,
528 * so backref node for the fs tree root is useless for
529 * relocation.
530 */
531 return 1;
532}
533
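update_backref_cache() brings the rb-tree back in sync after a commit: detached nodes are dropped, and nodes on the changed/pending lists are re-keyed from their pre-COW bytenr to new_bytenr. Re-keying is done by erase, update, re-insert, never by mutating the key of a live tree entry; a toy sorted set makes the idiom concrete (illustrative stand-in for the rb-tree):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define MAX 8

struct set { uint64_t key[MAX]; int n; };

static void set_del(struct set *s, int i)
{
	memmove(&s->key[i], &s->key[i + 1], (--s->n - i) * sizeof(uint64_t));
}

static void set_add(struct set *s, uint64_t k)
{
	int i = 0;

	while (i < s->n && s->key[i] < k)
		i++;
	memmove(&s->key[i + 1], &s->key[i], (s->n++ - i) * sizeof(uint64_t));
	s->key[i] = k;
}

static void rekey(struct set *s, int i, uint64_t new_key)
{
	set_del(s, i);		/* rb_erase() in the kernel version */
	set_add(s, new_key);	/* tree_insert() with the new bytenr */
}

int main(void)
{
	struct set s = { { 10, 20, 30 }, 3 };

	rekey(&s, 0, 25);	/* block 10 was cowed to bytenr 25 */
	assert(s.key[0] == 20 && s.key[1] == 25 && s.key[2] == 30);
	return 0;
}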
341/* 534/*
342 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
343 */ 536 */
@@ -452,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
452 * for all upper level blocks that directly/indirectly reference the 645 * for all upper level blocks that directly/indirectly reference the
453 * block are also cached. 646 * block are also cached.
454 */ 647 */
455static struct backref_node *build_backref_tree(struct reloc_control *rc, 648static noinline_for_stack
456 struct backref_cache *cache, 649struct backref_node *build_backref_tree(struct reloc_control *rc,
457 struct btrfs_key *node_key, 650 struct btrfs_key *node_key,
458 int level, u64 bytenr) 651 int level, u64 bytenr)
459{ 652{
653 struct backref_cache *cache = &rc->backref_cache;
460 struct btrfs_path *path1; 654 struct btrfs_path *path1;
461 struct btrfs_path *path2; 655 struct btrfs_path *path2;
462 struct extent_buffer *eb; 656 struct extent_buffer *eb;
@@ -472,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
472 unsigned long end; 666 unsigned long end;
473 unsigned long ptr; 667 unsigned long ptr;
474 LIST_HEAD(list); 668 LIST_HEAD(list);
669 LIST_HEAD(useless);
670 int cowonly;
475 int ret; 671 int ret;
476 int err = 0; 672 int err = 0;
477 673
@@ -482,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
482 goto out; 678 goto out;
483 } 679 }
484 680
485 node = kmalloc(sizeof(*node), GFP_NOFS); 681 node = alloc_backref_node(cache);
486 if (!node) { 682 if (!node) {
487 err = -ENOMEM; 683 err = -ENOMEM;
488 goto out; 684 goto out;
489 } 685 }
490 686
491 backref_node_init(node);
492 node->bytenr = bytenr; 687 node->bytenr = bytenr;
493 node->owner = 0;
494 node->level = level; 688 node->level = level;
495 node->lowest = 1; 689 node->lowest = 1;
496 cur = node; 690 cur = node;
@@ -586,17 +780,20 @@ again:
586#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 780#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
587 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 781 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
588 key.type == BTRFS_EXTENT_REF_V0_KEY) { 782 key.type == BTRFS_EXTENT_REF_V0_KEY) {
589 if (key.objectid == key.offset && 783 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 struct btrfs_extent_ref_v0 *ref0; 784 struct btrfs_extent_ref_v0 *ref0;
592 ref0 = btrfs_item_ptr(eb, path1->slots[0], 785 ref0 = btrfs_item_ptr(eb, path1->slots[0],
593 struct btrfs_extent_ref_v0); 786 struct btrfs_extent_ref_v0);
594 root = find_tree_root(rc, eb, ref0); 787 root = find_tree_root(rc, eb, ref0);
595 if (root) 788 if (!root->ref_cows)
596 cur->root = root; 789 cur->cowonly = 1;
597 else 790 if (key.objectid == key.offset) {
598 cur->old_root = 1; 791 if (root && !should_ignore_root(root))
599 break; 792 cur->root = root;
793 else
794 list_add(&cur->list, &useless);
795 break;
796 }
600 } 797 }
601#else 798#else
602 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 799 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -613,22 +810,20 @@ again:
613 break; 810 break;
614 } 811 }
615 812
616 edge = kzalloc(sizeof(*edge), GFP_NOFS); 813 edge = alloc_backref_edge(cache);
617 if (!edge) { 814 if (!edge) {
618 err = -ENOMEM; 815 err = -ENOMEM;
619 goto out; 816 goto out;
620 } 817 }
621 rb_node = tree_search(&cache->rb_root, key.offset); 818 rb_node = tree_search(&cache->rb_root, key.offset);
622 if (!rb_node) { 819 if (!rb_node) {
623 upper = kmalloc(sizeof(*upper), GFP_NOFS); 820 upper = alloc_backref_node(cache);
624 if (!upper) { 821 if (!upper) {
625 kfree(edge); 822 free_backref_edge(cache, edge);
626 err = -ENOMEM; 823 err = -ENOMEM;
627 goto out; 824 goto out;
628 } 825 }
629 backref_node_init(upper);
630 upper->bytenr = key.offset; 826 upper->bytenr = key.offset;
631 upper->owner = 0;
632 upper->level = cur->level + 1; 827 upper->level = cur->level + 1;
633 /* 828 /*
634 * backrefs for the upper level block isn't 829 * backrefs for the upper level block isn't
@@ -638,11 +833,12 @@ again:
638 } else { 833 } else {
639 upper = rb_entry(rb_node, struct backref_node, 834 upper = rb_entry(rb_node, struct backref_node,
640 rb_node); 835 rb_node);
836 BUG_ON(!upper->checked);
641 INIT_LIST_HEAD(&edge->list[UPPER]); 837 INIT_LIST_HEAD(&edge->list[UPPER]);
642 } 838 }
643 list_add(&edge->list[LOWER], &cur->upper); 839 list_add_tail(&edge->list[LOWER], &cur->upper);
644 edge->node[UPPER] = upper;
645 edge->node[LOWER] = cur; 840 edge->node[LOWER] = cur;
841 edge->node[UPPER] = upper;
646 842
647 goto next; 843 goto next;
648 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 844 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -656,11 +852,17 @@ again:
656 goto out; 852 goto out;
657 } 853 }
658 854
855 if (!root->ref_cows)
856 cur->cowonly = 1;
857
659 if (btrfs_root_level(&root->root_item) == cur->level) { 858 if (btrfs_root_level(&root->root_item) == cur->level) {
660 /* tree root */ 859 /* tree root */
661 BUG_ON(btrfs_root_bytenr(&root->root_item) != 860 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
662 cur->bytenr); 861 cur->bytenr);
663 cur->root = root; 862 if (should_ignore_root(root))
863 list_add(&cur->list, &useless);
864 else
865 cur->root = root;
664 break; 866 break;
665 } 867 }
666 868
@@ -691,11 +893,14 @@ again:
691 if (!path2->nodes[level]) { 893 if (!path2->nodes[level]) {
692 BUG_ON(btrfs_root_bytenr(&root->root_item) != 894 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
693 lower->bytenr); 895 lower->bytenr);
694 lower->root = root; 896 if (should_ignore_root(root))
897 list_add(&lower->list, &useless);
898 else
899 lower->root = root;
695 break; 900 break;
696 } 901 }
697 902
698 edge = kzalloc(sizeof(*edge), GFP_NOFS); 903 edge = alloc_backref_edge(cache);
699 if (!edge) { 904 if (!edge) {
700 err = -ENOMEM; 905 err = -ENOMEM;
701 goto out; 906 goto out;
@@ -704,16 +909,17 @@ again:
704 eb = path2->nodes[level]; 909 eb = path2->nodes[level];
705 rb_node = tree_search(&cache->rb_root, eb->start); 910 rb_node = tree_search(&cache->rb_root, eb->start);
706 if (!rb_node) { 911 if (!rb_node) {
707 upper = kmalloc(sizeof(*upper), GFP_NOFS); 912 upper = alloc_backref_node(cache);
708 if (!upper) { 913 if (!upper) {
709 kfree(edge); 914 free_backref_edge(cache, edge);
710 err = -ENOMEM; 915 err = -ENOMEM;
711 goto out; 916 goto out;
712 } 917 }
713 backref_node_init(upper);
714 upper->bytenr = eb->start; 918 upper->bytenr = eb->start;
715 upper->owner = btrfs_header_owner(eb); 919 upper->owner = btrfs_header_owner(eb);
716 upper->level = lower->level + 1; 920 upper->level = lower->level + 1;
921 if (!root->ref_cows)
922 upper->cowonly = 1;
717 923
718 /* 924 /*
719 * if we know the block isn't shared 925 * if we know the block isn't shared
@@ -743,10 +949,12 @@ again:
743 rb_node); 949 rb_node);
744 BUG_ON(!upper->checked); 950 BUG_ON(!upper->checked);
745 INIT_LIST_HEAD(&edge->list[UPPER]); 951 INIT_LIST_HEAD(&edge->list[UPPER]);
952 if (!upper->owner)
953 upper->owner = btrfs_header_owner(eb);
746 } 954 }
747 list_add_tail(&edge->list[LOWER], &lower->upper); 955 list_add_tail(&edge->list[LOWER], &lower->upper);
748 edge->node[UPPER] = upper;
749 edge->node[LOWER] = lower; 956 edge->node[LOWER] = lower;
957 edge->node[UPPER] = upper;
750 958
751 if (rb_node) 959 if (rb_node)
752 break; 960 break;
@@ -784,8 +992,13 @@ next:
784 * into the cache. 992 * into the cache.
785 */ 993 */
786 BUG_ON(!node->checked); 994 BUG_ON(!node->checked);
787 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 995 cowonly = node->cowonly;
788 BUG_ON(rb_node); 996 if (!cowonly) {
997 rb_node = tree_insert(&cache->rb_root, node->bytenr,
998 &node->rb_node);
999 BUG_ON(rb_node);
1000 list_add_tail(&node->lower, &cache->leaves);
1001 }
789 1002
790 list_for_each_entry(edge, &node->upper, list[LOWER]) 1003 list_for_each_entry(edge, &node->upper, list[LOWER])
791 list_add_tail(&edge->list[UPPER], &list); 1004 list_add_tail(&edge->list[UPPER], &list);
@@ -794,6 +1007,14 @@ next:
794 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1007 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
795 list_del_init(&edge->list[UPPER]); 1008 list_del_init(&edge->list[UPPER]);
796 upper = edge->node[UPPER]; 1009 upper = edge->node[UPPER];
1010 if (upper->detached) {
1011 list_del(&edge->list[LOWER]);
1012 lower = edge->node[LOWER];
1013 free_backref_edge(cache, edge);
1014 if (list_empty(&lower->upper))
1015 list_add(&lower->list, &useless);
1016 continue;
1017 }
797 1018
798 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1019 if (!RB_EMPTY_NODE(&upper->rb_node)) {
799 if (upper->lowest) { 1020 if (upper->lowest) {
@@ -806,25 +1027,69 @@ next:
806 } 1027 }
807 1028
808 BUG_ON(!upper->checked); 1029 BUG_ON(!upper->checked);
809 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1030 BUG_ON(cowonly != upper->cowonly);
810 &upper->rb_node); 1031 if (!cowonly) {
811 BUG_ON(rb_node); 1032 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1033 &upper->rb_node);
1034 BUG_ON(rb_node);
1035 }
812 1036
813 list_add_tail(&edge->list[UPPER], &upper->lower); 1037 list_add_tail(&edge->list[UPPER], &upper->lower);
814 1038
815 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1039 list_for_each_entry(edge, &upper->upper, list[LOWER])
816 list_add_tail(&edge->list[UPPER], &list); 1040 list_add_tail(&edge->list[UPPER], &list);
817 } 1041 }
1042 /*
1043 * process useless backref nodes. backref nodes for tree leaves
1044 * are deleted from the cache. backref nodes for upper level
1045 * tree blocks are left in the cache to avoid unnecessary backref
1046 * lookup.
1047 */
1048 while (!list_empty(&useless)) {
1049 upper = list_entry(useless.next, struct backref_node, list);
1050 list_del_init(&upper->list);
1051 BUG_ON(!list_empty(&upper->upper));
1052 if (upper == node)
1053 node = NULL;
1054 if (upper->lowest) {
1055 list_del_init(&upper->lower);
1056 upper->lowest = 0;
1057 }
1058 while (!list_empty(&upper->lower)) {
1059 edge = list_entry(upper->lower.next,
1060 struct backref_edge, list[UPPER]);
1061 list_del(&edge->list[UPPER]);
1062 list_del(&edge->list[LOWER]);
1063 lower = edge->node[LOWER];
1064 free_backref_edge(cache, edge);
1065
1066 if (list_empty(&lower->upper))
1067 list_add(&lower->list, &useless);
1068 }
1069 __mark_block_processed(rc, upper);
1070 if (upper->level > 0) {
1071 list_add(&upper->list, &cache->detached);
1072 upper->detached = 1;
1073 } else {
1074 rb_erase(&upper->rb_node, &cache->rb_root);
1075 free_backref_node(cache, upper);
1076 }
1077 }
818out: 1078out:
819 btrfs_free_path(path1); 1079 btrfs_free_path(path1);
820 btrfs_free_path(path2); 1080 btrfs_free_path(path2);
821 if (err) { 1081 if (err) {
822 INIT_LIST_HEAD(&list); 1082 while (!list_empty(&useless)) {
1083 lower = list_entry(useless.next,
1084 struct backref_node, upper);
1085 list_del_init(&lower->upper);
1086 }
823 upper = node; 1087 upper = node;
1088 INIT_LIST_HEAD(&list);
824 while (upper) { 1089 while (upper) {
825 if (RB_EMPTY_NODE(&upper->rb_node)) { 1090 if (RB_EMPTY_NODE(&upper->rb_node)) {
826 list_splice_tail(&upper->upper, &list); 1091 list_splice_tail(&upper->upper, &list);
827 kfree(upper); 1092 free_backref_node(cache, upper);
828 } 1093 }
829 1094
830 if (list_empty(&list)) 1095 if (list_empty(&list))
@@ -832,15 +1097,104 @@ out:
832 1097
833 edge = list_entry(list.next, struct backref_edge, 1098 edge = list_entry(list.next, struct backref_edge,
834 list[LOWER]); 1099 list[LOWER]);
1100 list_del(&edge->list[LOWER]);
835 upper = edge->node[UPPER]; 1101 upper = edge->node[UPPER];
836 kfree(edge); 1102 free_backref_edge(cache, edge);
837 } 1103 }
838 return ERR_PTR(err); 1104 return ERR_PTR(err);
839 } 1105 }
1106 BUG_ON(node && node->detached);
840 return node; 1107 return node;
841} 1108}
842 1109
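The new `useless` handling in build_backref_tree() defers cleanup until after the main loop: each useless upper node has its lower edges stripped, any lower node left with no remaining uppers joins the same list, and then leaves are freed outright while interior nodes are parked on cache->detached for later reuse. The list can grow while it is being drained; a toy worklist shows that shape (hypothetical toy graph, not the backref types):

#include <stdio.h>

#define NITEMS 6

/* Drain a worklist that can grow while draining: dropping one node may
 * orphan its children, which then join the same list -- the shape of
 * the 'useless' loop above. */
int main(void)
{
	int stack[NITEMS], top = 0;
	int parent_of[NITEMS] = { -1, 0, 0, 1, -1, -1 }; /* toy edges */

	stack[top++] = 0;	/* seed with one useless node */
	while (top > 0) {
		int n = stack[--top];

		printf("drop node %d\n", n);
		for (int i = 0; i < NITEMS; i++)
			if (parent_of[i] == n) {	/* now orphaned */
				parent_of[i] = -1;
				stack[top++] = i;
			}
	}
	return 0;
}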
843/* 1110/*
1111 * helper to add backref node for the newly created snapshot.
1112 * the backref node is created by cloning backref node that
1113 * corresponds to root of source tree
1114 */
1115static int clone_backref_node(struct btrfs_trans_handle *trans,
1116 struct reloc_control *rc,
1117 struct btrfs_root *src,
1118 struct btrfs_root *dest)
1119{
1120 struct btrfs_root *reloc_root = src->reloc_root;
1121 struct backref_cache *cache = &rc->backref_cache;
1122 struct backref_node *node = NULL;
1123 struct backref_node *new_node;
1124 struct backref_edge *edge;
1125 struct backref_edge *new_edge;
1126 struct rb_node *rb_node;
1127
1128 if (cache->last_trans > 0)
1129 update_backref_cache(trans, cache);
1130
1131 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1132 if (rb_node) {
1133 node = rb_entry(rb_node, struct backref_node, rb_node);
1134 if (node->detached)
1135 node = NULL;
1136 else
1137 BUG_ON(node->new_bytenr != reloc_root->node->start);
1138 }
1139
1140 if (!node) {
1141 rb_node = tree_search(&cache->rb_root,
1142 reloc_root->commit_root->start);
1143 if (rb_node) {
1144 node = rb_entry(rb_node, struct backref_node,
1145 rb_node);
1146 BUG_ON(node->detached);
1147 }
1148 }
1149
1150 if (!node)
1151 return 0;
1152
1153 new_node = alloc_backref_node(cache);
1154 if (!new_node)
1155 return -ENOMEM;
1156
1157 new_node->bytenr = dest->node->start;
1158 new_node->level = node->level;
1159 new_node->lowest = node->lowest;
1160 new_node->root = dest;
1161
1162 if (!node->lowest) {
1163 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1164 new_edge = alloc_backref_edge(cache);
1165 if (!new_edge)
1166 goto fail;
1167
1168 new_edge->node[UPPER] = new_node;
1169 new_edge->node[LOWER] = edge->node[LOWER];
1170 list_add_tail(&new_edge->list[UPPER],
1171 &new_node->lower);
1172 }
1173 }
1174
1175 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1176 &new_node->rb_node);
1177 BUG_ON(rb_node);
1178
1179 if (!new_node->lowest) {
1180 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1181 list_add_tail(&new_edge->list[LOWER],
1182 &new_edge->node[LOWER]->upper);
1183 }
1184 }
1185 return 0;
1186fail:
1187 while (!list_empty(&new_node->lower)) {
1188 new_edge = list_entry(new_node->lower.next,
1189 struct backref_edge, list[UPPER]);
1190 list_del(&new_edge->list[UPPER]);
1191 free_backref_edge(cache, new_edge);
1192 }
1193 free_backref_node(cache, new_node);
1194 return -ENOMEM;
1195}
1196
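clone_backref_node() builds the copy privately first: the duplicated edges are chained only onto new_node->lower, and they are linked into the shared lower-node upper lists only after the rb-tree insert succeeds, so the fail path frees nothing but the half-built clone. A sketch of that build-then-publish rollback pattern (illustrative list types):

#include <stdlib.h>

struct edge { struct edge *next; };
struct node { struct edge *lower; };

/* Build a private copy first; publish nothing until it is complete. */
static struct node *clone_node(const struct node *src)
{
	struct node *clone = calloc(1, sizeof(*clone));
	const struct edge *e;

	if (!clone)
		return NULL;
	for (e = src->lower; e; e = e->next) {
		struct edge *ne = calloc(1, sizeof(*ne));

		if (!ne)
			goto fail;	/* only private memory to undo */
		ne->next = clone->lower;
		clone->lower = ne;
	}
	return clone;		/* caller links it into shared lists */
fail:
	while (clone->lower) {
		struct edge *ne = clone->lower;

		clone->lower = ne->next;
		free(ne);
	}
	free(clone);
	return NULL;
}

int main(void)
{
	struct edge e2 = { NULL }, e1 = { &e2 };
	struct node src = { &e1 };
	struct node *c = clone_node(&src);

	while (c && c->lower) {
		struct edge *ne = c->lower;

		c->lower = ne->next;
		free(ne);
	}
	free(c);
	return 0;
}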
1197/*
844 * helper to add 'address of tree root -> reloc tree' mapping 1198 * helper to add 'address of tree root -> reloc tree' mapping
845 */ 1199 */
846static int __add_reloc_root(struct btrfs_root *root) 1200static int __add_reloc_root(struct btrfs_root *root)
@@ -900,12 +1254,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
900 return 0; 1254 return 0;
901} 1255}
902 1256
903/* 1257static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
904 * create reloc tree for a given fs tree. reloc tree is just a 1258 struct btrfs_root *root, u64 objectid)
905 * snapshot of the fs tree with special root objectid.
906 */
907int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
908 struct btrfs_root *root)
909{ 1259{
910 struct btrfs_root *reloc_root; 1260 struct btrfs_root *reloc_root;
911 struct extent_buffer *eb; 1261 struct extent_buffer *eb;
@@ -913,36 +1263,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
913 struct btrfs_key root_key; 1263 struct btrfs_key root_key;
914 int ret; 1264 int ret;
915 1265
916 if (root->reloc_root) {
917 reloc_root = root->reloc_root;
918 reloc_root->last_trans = trans->transid;
919 return 0;
920 }
921
922 if (!root->fs_info->reloc_ctl ||
923 !root->fs_info->reloc_ctl->create_reloc_root ||
924 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
925 return 0;
926
927 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1266 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
928 BUG_ON(!root_item); 1267 BUG_ON(!root_item);
929 1268
930 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1269 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
931 root_key.type = BTRFS_ROOT_ITEM_KEY; 1270 root_key.type = BTRFS_ROOT_ITEM_KEY;
932 root_key.offset = root->root_key.objectid; 1271 root_key.offset = objectid;
933 1272
934 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1273 if (root->root_key.objectid == objectid) {
935 BTRFS_TREE_RELOC_OBJECTID); 1274 /* called by btrfs_init_reloc_root */
936 BUG_ON(ret); 1275 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1276 BTRFS_TREE_RELOC_OBJECTID);
1277 BUG_ON(ret);
1278
1279 btrfs_set_root_last_snapshot(&root->root_item,
1280 trans->transid - 1);
1281 } else {
1282 /*
1283 * called by btrfs_reloc_post_snapshot_hook.
1284 * the source tree is a reloc tree, all tree blocks
1285 * modified after it was created have RELOC flag
1286 * set in their headers. so it's OK to not update
1287 * the 'last_snapshot'.
1288 */
1289 ret = btrfs_copy_root(trans, root, root->node, &eb,
1290 BTRFS_TREE_RELOC_OBJECTID);
1291 BUG_ON(ret);
1292 }
937 1293
938 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
939 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1294 memcpy(root_item, &root->root_item, sizeof(*root_item));
940 btrfs_set_root_refs(root_item, 1);
941 btrfs_set_root_bytenr(root_item, eb->start); 1295 btrfs_set_root_bytenr(root_item, eb->start);
942 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1296 btrfs_set_root_level(root_item, btrfs_header_level(eb));
943 btrfs_set_root_generation(root_item, trans->transid); 1297 btrfs_set_root_generation(root_item, trans->transid);
944 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1298
945 root_item->drop_level = 0; 1299 if (root->root_key.objectid == objectid) {
1300 btrfs_set_root_refs(root_item, 0);
1301 memset(&root_item->drop_progress, 0,
1302 sizeof(struct btrfs_disk_key));
1303 root_item->drop_level = 0;
1304 }
946 1305
947 btrfs_tree_unlock(eb); 1306 btrfs_tree_unlock(eb);
948 free_extent_buffer(eb); 1307 free_extent_buffer(eb);
@@ -956,6 +1315,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
956 &root_key); 1315 &root_key);
957 BUG_ON(IS_ERR(reloc_root)); 1316 BUG_ON(IS_ERR(reloc_root));
958 reloc_root->last_trans = trans->transid; 1317 reloc_root->last_trans = trans->transid;
1318 return reloc_root;
1319}
1320
1321/*
1322 * create reloc tree for a given fs tree. reloc tree is just a
1323 * snapshot of the fs tree with special root objectid.
1324 */
1325int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1326 struct btrfs_root *root)
1327{
1328 struct btrfs_root *reloc_root;
1329 struct reloc_control *rc = root->fs_info->reloc_ctl;
1330 int clear_rsv = 0;
1331
1332 if (root->reloc_root) {
1333 reloc_root = root->reloc_root;
1334 reloc_root->last_trans = trans->transid;
1335 return 0;
1336 }
1337
1338 if (!rc || !rc->create_reloc_tree ||
1339 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1340 return 0;
1341
1342 if (!trans->block_rsv) {
1343 trans->block_rsv = rc->block_rsv;
1344 clear_rsv = 1;
1345 }
1346 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1347 if (clear_rsv)
1348 trans->block_rsv = NULL;
959 1349
960 __add_reloc_root(reloc_root); 1350 __add_reloc_root(reloc_root);
961 root->reloc_root = reloc_root; 1351 root->reloc_root = reloc_root;
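btrfs_init_reloc_root() borrows rc->block_rsv for the duration of create_reloc_root() only when the handle arrived without a reservation, and clear_rsv restores NULL afterwards, so the reloc-tree copy is charged to relocation's reserve without disturbing callers that brought their own. The borrow-and-restore idiom in miniature (hypothetical types):

#include <assert.h>
#include <stddef.h>

struct rsv { int used; };
struct handle { struct rsv *block_rsv; };

static void do_work(struct handle *t)
{
	t->block_rsv->used++;	/* charged to whichever rsv is installed */
}

static void work_with_default(struct handle *t, struct rsv *fallback)
{
	int borrowed = 0;

	if (!t->block_rsv) {	/* borrow only if caller brought none */
		t->block_rsv = fallback;
		borrowed = 1;
	}
	do_work(t);
	if (borrowed)
		t->block_rsv = NULL;
}

int main(void)
{
	struct rsv reloc = { 0 };
	struct handle t = { NULL };

	work_with_default(&t, &reloc);
	assert(reloc.used == 1 && t.block_rsv == NULL);
	return 0;
}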
@@ -979,7 +1369,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
979 reloc_root = root->reloc_root; 1369 reloc_root = root->reloc_root;
980 root_item = &reloc_root->root_item; 1370 root_item = &reloc_root->root_item;
981 1371
982 if (btrfs_root_refs(root_item) == 0) { 1372 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1373 btrfs_root_refs(root_item) == 0) {
983 root->reloc_root = NULL; 1374 root->reloc_root = NULL;
984 del = 1; 1375 del = 1;
985 } 1376 }
@@ -1101,8 +1492,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1101 goto out; 1492 goto out;
1102 } 1493 }
1103 1494
1104 if (new_bytenr) 1495 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1105 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 ret = 0; 1496 ret = 0;
1107out: 1497out:
1108 btrfs_free_path(path); 1498 btrfs_free_path(path);
@@ -1113,19 +1503,18 @@ out:
1113 * update file extent items in the tree leaf to point to 1503 * update file extent items in the tree leaf to point to
1114 * the new locations. 1504 * the new locations.
1115 */ 1505 */
1116static int replace_file_extents(struct btrfs_trans_handle *trans, 1506static noinline_for_stack
1117 struct reloc_control *rc, 1507int replace_file_extents(struct btrfs_trans_handle *trans,
1118 struct btrfs_root *root, 1508 struct reloc_control *rc,
1119 struct extent_buffer *leaf, 1509 struct btrfs_root *root,
1120 struct list_head *inode_list) 1510 struct extent_buffer *leaf)
1121{ 1511{
1122 struct btrfs_key key; 1512 struct btrfs_key key;
1123 struct btrfs_file_extent_item *fi; 1513 struct btrfs_file_extent_item *fi;
1124 struct inode *inode = NULL; 1514 struct inode *inode = NULL;
1125 struct inodevec *ivec = NULL;
1126 u64 parent; 1515 u64 parent;
1127 u64 bytenr; 1516 u64 bytenr;
1128 u64 new_bytenr; 1517 u64 new_bytenr = 0;
1129 u64 num_bytes; 1518 u64 num_bytes;
1130 u64 end; 1519 u64 end;
1131 u32 nritems; 1520 u32 nritems;
@@ -1165,21 +1554,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1165 * to complete and drop the extent cache 1554 * to complete and drop the extent cache
1166 */ 1555 */
1167 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1556 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1168 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1169 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1170 BUG_ON(!ivec);
1171 ivec->nr = 0;
1172 list_add_tail(&ivec->list, inode_list);
1173 }
1174 if (first) { 1557 if (first) {
1175 inode = find_next_inode(root, key.objectid); 1558 inode = find_next_inode(root, key.objectid);
1176 if (inode)
1177 ivec->inode[ivec->nr++] = inode;
1178 first = 0; 1559 first = 0;
1179 } else if (inode && inode->i_ino < key.objectid) { 1560 } else if (inode && inode->i_ino < key.objectid) {
1561 btrfs_add_delayed_iput(inode);
1180 inode = find_next_inode(root, key.objectid); 1562 inode = find_next_inode(root, key.objectid);
1181 if (inode)
1182 ivec->inode[ivec->nr++] = inode;
1183 } 1563 }
1184 if (inode && inode->i_ino == key.objectid) { 1564 if (inode && inode->i_ino == key.objectid) {
1185 end = key.offset + 1565 end = key.offset +
@@ -1203,8 +1583,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1203 1583
1204 ret = get_new_location(rc->data_inode, &new_bytenr, 1584 ret = get_new_location(rc->data_inode, &new_bytenr,
1205 bytenr, num_bytes); 1585 bytenr, num_bytes);
1206 if (ret > 0) 1586 if (ret > 0) {
1587 WARN_ON(1);
1207 continue; 1588 continue;
1589 }
1208 BUG_ON(ret < 0); 1590 BUG_ON(ret < 0);
1209 1591
1210 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1592 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1224,6 +1606,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1224 } 1606 }
1225 if (dirty) 1607 if (dirty)
1226 btrfs_mark_buffer_dirty(leaf); 1608 btrfs_mark_buffer_dirty(leaf);
1609 if (inode)
1610 btrfs_add_delayed_iput(inode);
1227 return 0; 1611 return 0;
1228} 1612}
1229 1613
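With the inodevec batching gone, replace_file_extents() hands each finished inode to btrfs_add_delayed_iput(), deferring the final iput() to a context where it cannot deadlock against the running transaction (the put_inodes() helper removed further down existed for the same reason). The general shape, pushing an expensive release out of a critical section onto a list drained later, in toy form (not the kernel helper):

#include <stdio.h>
#include <stdlib.h>

struct obj { struct obj *next; int id; };

static struct obj *deferred;	/* drained outside the critical section */

static void delayed_put(struct obj *o)
{
	o->next = deferred;	/* cheap: just queue it */
	deferred = o;
}

static void drain_deferred(void)
{
	while (deferred) {
		struct obj *o = deferred;

		deferred = o->next;
		printf("released %d\n", o->id);	/* the expensive part */
		free(o);
	}
}

int main(void)
{
	struct obj *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));

	a->id = 1; b->id = 2;
	/* ... inside a section where release could deadlock ... */
	delayed_put(a);
	delayed_put(b);
	/* ... critical section over ... */
	drain_deferred();
	return 0;
}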
@@ -1247,11 +1631,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1247 * if no block got replaced, 0 is returned. if there are other 1631 * if no block got replaced, 0 is returned. if there are other
1248 * errors, a negative error number is returned. 1632 * errors, a negative error number is returned.
1249 */ 1633 */
1250static int replace_path(struct btrfs_trans_handle *trans, 1634static noinline_for_stack
1251 struct btrfs_root *dest, struct btrfs_root *src, 1635int replace_path(struct btrfs_trans_handle *trans,
1252 struct btrfs_path *path, struct btrfs_key *next_key, 1636 struct btrfs_root *dest, struct btrfs_root *src,
1253 struct extent_buffer **leaf, 1637 struct btrfs_path *path, struct btrfs_key *next_key,
1254 int lowest_level, int max_level) 1638 int lowest_level, int max_level)
1255{ 1639{
1256 struct extent_buffer *eb; 1640 struct extent_buffer *eb;
1257 struct extent_buffer *parent; 1641 struct extent_buffer *parent;
@@ -1262,16 +1646,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
1262 u64 new_ptr_gen; 1646 u64 new_ptr_gen;
1263 u64 last_snapshot; 1647 u64 last_snapshot;
1264 u32 blocksize; 1648 u32 blocksize;
1649 int cow = 0;
1265 int level; 1650 int level;
1266 int ret; 1651 int ret;
1267 int slot; 1652 int slot;
1268 1653
1269 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1654 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1270 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1655 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1271 BUG_ON(lowest_level > 1 && leaf);
1272 1656
1273 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1657 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1274 1658again:
1275 slot = path->slots[lowest_level]; 1659 slot = path->slots[lowest_level];
1276 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1660 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1277 1661
@@ -1285,8 +1669,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
1285 return 0; 1669 return 0;
1286 } 1670 }
1287 1671
1288 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1672 if (cow) {
1289 BUG_ON(ret); 1673 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1674 BUG_ON(ret);
1675 }
1290 btrfs_set_lock_blocking(eb); 1676 btrfs_set_lock_blocking(eb);
1291 1677
1292 if (next_key) { 1678 if (next_key) {
@@ -1330,7 +1716,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
1330 1716
1331 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1717 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1332 memcmp_node_keys(parent, slot, path, level)) { 1718 memcmp_node_keys(parent, slot, path, level)) {
1333 if (level <= lowest_level && !leaf) { 1719 if (level <= lowest_level) {
1334 ret = 0; 1720 ret = 0;
1335 break; 1721 break;
1336 } 1722 }
@@ -1338,16 +1724,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
1338 eb = read_tree_block(dest, old_bytenr, blocksize, 1724 eb = read_tree_block(dest, old_bytenr, blocksize,
1339 old_ptr_gen); 1725 old_ptr_gen);
1340 btrfs_tree_lock(eb); 1726 btrfs_tree_lock(eb);
1341 ret = btrfs_cow_block(trans, dest, eb, parent, 1727 if (cow) {
1342 slot, &eb); 1728 ret = btrfs_cow_block(trans, dest, eb, parent,
1343 BUG_ON(ret); 1729 slot, &eb);
1344 btrfs_set_lock_blocking(eb); 1730 BUG_ON(ret);
1345
1346 if (level <= lowest_level) {
1347 *leaf = eb;
1348 ret = 0;
1349 break;
1350 } 1731 }
1732 btrfs_set_lock_blocking(eb);
1351 1733
1352 btrfs_tree_unlock(parent); 1734 btrfs_tree_unlock(parent);
1353 free_extent_buffer(parent); 1735 free_extent_buffer(parent);
@@ -1356,6 +1738,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
1356 continue; 1738 continue;
1357 } 1739 }
1358 1740
1741 if (!cow) {
1742 btrfs_tree_unlock(parent);
1743 free_extent_buffer(parent);
1744 cow = 1;
1745 goto again;
1746 }
1747
1359 btrfs_node_key_to_cpu(path->nodes[level], &key, 1748 btrfs_node_key_to_cpu(path->nodes[level], &key,
1360 path->slots[level]); 1749 path->slots[level]);
1361 btrfs_release_path(src, path); 1750 btrfs_release_path(src, path);
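replace_path() now makes an optimistic first pass with cow == 0: it walks read-only, and only on finding a pointer that actually needs replacing does it back out, set cow = 1, and `goto again` to redo the walk with COW enabled, so subtrees that end up untouched are never cowed. The two-pass shape in toy form (an assumed simplification of the real tree walk):

#include <assert.h>
#include <stdio.h>

/* Two-pass update: probe read-only first, redo with writes enabled
 * only if the probe found work -- the cow/goto-again shape above. */
static int update_if_needed(int *vals, int n, int from, int to)
{
	int cow = 0, i, changed = 0;

again:
	for (i = 0; i < n; i++) {
		if (vals[i] != from)
			continue;
		if (!cow) {		/* found work on the cheap pass */
			cow = 1;
			goto again;	/* restart with writes enabled */
		}
		vals[i] = to;
		changed++;
	}
	return changed;
}

int main(void)
{
	int v[] = { 1, 2, 1 };

	assert(update_if_needed(v, 3, 1, 9) == 2);
	assert(update_if_needed(v, 3, 7, 9) == 0);	/* no write pass */
	printf("%d %d %d\n", v[0], v[1], v[2]);
	return 0;
}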
@@ -1561,20 +1950,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1561 return 0; 1950 return 0;
1562} 1951}
1563 1952
1564static void put_inodes(struct list_head *list)
1565{
1566 struct inodevec *ivec;
1567 while (!list_empty(list)) {
1568 ivec = list_entry(list->next, struct inodevec, list);
1569 list_del(&ivec->list);
1570 while (ivec->nr > 0) {
1571 ivec->nr--;
1572 iput(ivec->inode[ivec->nr]);
1573 }
1574 kfree(ivec);
1575 }
1576}
1577
1578static int find_next_key(struct btrfs_path *path, int level, 1953static int find_next_key(struct btrfs_path *path, int level,
1579 struct btrfs_key *key) 1954 struct btrfs_key *key)
1580 1955
@@ -1607,13 +1982,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1607 struct btrfs_root *reloc_root; 1982 struct btrfs_root *reloc_root;
1608 struct btrfs_root_item *root_item; 1983 struct btrfs_root_item *root_item;
1609 struct btrfs_path *path; 1984 struct btrfs_path *path;
1610 struct extent_buffer *leaf = NULL; 1985 struct extent_buffer *leaf;
1611 unsigned long nr; 1986 unsigned long nr;
1612 int level; 1987 int level;
1613 int max_level; 1988 int max_level;
1614 int replaced = 0; 1989 int replaced = 0;
1615 int ret; 1990 int ret;
1616 int err = 0; 1991 int err = 0;
1992 u32 min_reserved;
1617 1993
1618 path = btrfs_alloc_path(); 1994 path = btrfs_alloc_path();
1619 if (!path) 1995 if (!path)
@@ -1647,34 +2023,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1647 btrfs_unlock_up_safe(path, 0); 2023 btrfs_unlock_up_safe(path, 0);
1648 } 2024 }
1649 2025
1650 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { 2026 min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1651 trans = btrfs_start_transaction(root, 1); 2027 memset(&next_key, 0, sizeof(next_key));
1652 2028
1653 leaf = path->nodes[0]; 2029 while (1) {
1654 btrfs_item_key_to_cpu(leaf, &key, 0); 2030 trans = btrfs_start_transaction(root, 0);
1655 btrfs_release_path(reloc_root, path); 2031 trans->block_rsv = rc->block_rsv;
1656 2032
1657 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2033 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
1658 if (ret < 0) { 2034 min_reserved, 0);
1659 err = ret; 2035 if (ret) {
1660 goto out; 2036 BUG_ON(ret != -EAGAIN);
2037 ret = btrfs_commit_transaction(trans, root);
2038 BUG_ON(ret);
2039 continue;
1661 } 2040 }
1662 2041
1663 leaf = path->nodes[0];
1664 btrfs_unlock_up_safe(path, 1);
1665 ret = replace_file_extents(trans, rc, root, leaf,
1666 &inode_list);
1667 if (ret < 0)
1668 err = ret;
1669 goto out;
1670 }
1671
1672 memset(&next_key, 0, sizeof(next_key));
1673
1674 while (1) {
1675 leaf = NULL;
1676 replaced = 0; 2042 replaced = 0;
1677 trans = btrfs_start_transaction(root, 1);
1678 max_level = level; 2043 max_level = level;
1679 2044
1680 ret = walk_down_reloc_tree(reloc_root, path, &level); 2045 ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1688,14 +2053,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1688 if (!find_next_key(path, level, &key) && 2053 if (!find_next_key(path, level, &key) &&
1689 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2054 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1690 ret = 0; 2055 ret = 0;
1691 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1692 ret = replace_path(trans, root, reloc_root,
1693 path, &next_key, &leaf,
1694 level, max_level);
1695 } else { 2056 } else {
1696 ret = replace_path(trans, root, reloc_root, 2057 ret = replace_path(trans, root, reloc_root, path,
1697 path, &next_key, NULL, 2058 &next_key, level, max_level);
1698 level, max_level);
1699 } 2059 }
1700 if (ret < 0) { 2060 if (ret < 0) {
1701 err = ret; 2061 err = ret;
@@ -1707,16 +2067,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1707 btrfs_node_key_to_cpu(path->nodes[level], &key, 2067 btrfs_node_key_to_cpu(path->nodes[level], &key,
1708 path->slots[level]); 2068 path->slots[level]);
1709 replaced = 1; 2069 replaced = 1;
1710 } else if (leaf) {
1711 /*
1712 * no block got replaced, try replacing file extents
1713 */
1714 btrfs_item_key_to_cpu(leaf, &key, 0);
1715 ret = replace_file_extents(trans, rc, root, leaf,
1716 &inode_list);
1717 btrfs_tree_unlock(leaf);
1718 free_extent_buffer(leaf);
1719 BUG_ON(ret < 0);
1720 } 2070 }
1721 2071
1722 ret = walk_up_reloc_tree(reloc_root, path, &level); 2072 ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1733,15 +2083,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1733 root_item->drop_level = level; 2083 root_item->drop_level = level;
1734 2084
1735 nr = trans->blocks_used; 2085 nr = trans->blocks_used;
1736 btrfs_end_transaction(trans, root); 2086 btrfs_end_transaction_throttle(trans, root);
1737 2087
1738 btrfs_btree_balance_dirty(root, nr); 2088 btrfs_btree_balance_dirty(root, nr);
1739 2089
1740 /*
1741 * put inodes outside transaction, otherwise we may deadlock.
1742 */
1743 put_inodes(&inode_list);
1744
1745 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2090 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1746 invalidate_extent_cache(root, &key, &next_key); 2091 invalidate_extent_cache(root, &key, &next_key);
1747 } 2092 }
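Instead of opening a fresh transaction per pass, merge_reloc_root() now checks that the block reservation holds at least min_reserved = nodesize * (BTRFS_MAX_LEVEL - 1) * 2 bytes, enough for the worst case of COWing one full path in each of two trees, and commits the transaction to refill the reservation when btrfs_block_rsv_check() returns -EAGAIN. Back-of-the-envelope numbers, assuming a 4K nodesize (the real value is set at mkfs time):

    #include <stdio.h>

    #define BTRFS_MAX_LEVEL 8               /* matches ctree.h */

    int main(void)
    {
            unsigned int nodesize = 4096;   /* assumed, mkfs-dependent */
            unsigned int min_reserved = nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
            printf("min_reserved = %u bytes\n", min_reserved);  /* 57344 */
            return 0;
    }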
@@ -1764,87 +2109,125 @@ out:
1764 sizeof(root_item->drop_progress)); 2109 sizeof(root_item->drop_progress));
1765 root_item->drop_level = 0; 2110 root_item->drop_level = 0;
1766 btrfs_set_root_refs(root_item, 0); 2111 btrfs_set_root_refs(root_item, 0);
2112 btrfs_update_reloc_root(trans, root);
1767 } 2113 }
1768 2114
1769 nr = trans->blocks_used; 2115 nr = trans->blocks_used;
1770 btrfs_end_transaction(trans, root); 2116 btrfs_end_transaction_throttle(trans, root);
1771 2117
1772 btrfs_btree_balance_dirty(root, nr); 2118 btrfs_btree_balance_dirty(root, nr);
1773 2119
1774 put_inodes(&inode_list);
1775
1776 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2120 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1777 invalidate_extent_cache(root, &key, &next_key); 2121 invalidate_extent_cache(root, &key, &next_key);
1778 2122
1779 return err; 2123 return err;
1780} 2124}
1781 2125
1782/* 2126static noinline_for_stack
1783 * callback for the work threads. 2127int prepare_to_merge(struct reloc_control *rc, int err)
1784 * this function merges reloc tree with corresponding fs tree,
1785 * and then drops the reloc tree.
1786 */
1787static void merge_func(struct btrfs_work *work)
1788{ 2128{
1789 struct btrfs_trans_handle *trans; 2129 struct btrfs_root *root = rc->extent_root;
1790 struct btrfs_root *root;
1791 struct btrfs_root *reloc_root; 2130 struct btrfs_root *reloc_root;
1792 struct async_merge *async; 2131 struct btrfs_trans_handle *trans;
2132 LIST_HEAD(reloc_roots);
2133 u64 num_bytes = 0;
2134 int ret;
2135 int retries = 0;
2136
2137 mutex_lock(&root->fs_info->trans_mutex);
2138 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2139 rc->merging_rsv_size += rc->nodes_relocated * 2;
2140 mutex_unlock(&root->fs_info->trans_mutex);
2141again:
2142 if (!err) {
2143 num_bytes = rc->merging_rsv_size;
2144 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2145 num_bytes, &retries);
2146 if (ret)
2147 err = ret;
2148 }
2149
2150 trans = btrfs_join_transaction(rc->extent_root, 1);
2151
2152 if (!err) {
2153 if (num_bytes != rc->merging_rsv_size) {
2154 btrfs_end_transaction(trans, rc->extent_root);
2155 btrfs_block_rsv_release(rc->extent_root,
2156 rc->block_rsv, num_bytes);
2157 retries = 0;
2158 goto again;
2159 }
2160 }
1793 2161
1794 async = container_of(work, struct async_merge, work); 2162 rc->merge_reloc_tree = 1;
1795 reloc_root = async->root; 2163
2164 while (!list_empty(&rc->reloc_roots)) {
2165 reloc_root = list_entry(rc->reloc_roots.next,
2166 struct btrfs_root, root_list);
2167 list_del_init(&reloc_root->root_list);
1796 2168
1797 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1798 root = read_fs_root(reloc_root->fs_info, 2169 root = read_fs_root(reloc_root->fs_info,
1799 reloc_root->root_key.offset); 2170 reloc_root->root_key.offset);
1800 BUG_ON(IS_ERR(root)); 2171 BUG_ON(IS_ERR(root));
1801 BUG_ON(root->reloc_root != reloc_root); 2172 BUG_ON(root->reloc_root != reloc_root);
1802 2173
1803 merge_reloc_root(async->rc, root); 2174 /*
1804 2175 * set reference count to 1, so btrfs_recover_relocation
1805 trans = btrfs_start_transaction(root, 1); 2176 * knows it should resume merging
2177 */
2178 if (!err)
2179 btrfs_set_root_refs(&reloc_root->root_item, 1);
1806 btrfs_update_reloc_root(trans, root); 2180 btrfs_update_reloc_root(trans, root);
1807 btrfs_end_transaction(trans, root);
1808 }
1809 2181
1810 btrfs_drop_snapshot(reloc_root, 0); 2182 list_add(&reloc_root->root_list, &reloc_roots);
2183 }
1811 2184
1812 if (atomic_dec_and_test(async->num_pending)) 2185 list_splice(&reloc_roots, &rc->reloc_roots);
1813 complete(async->done);
1814 2186
1815 kfree(async); 2187 if (!err)
2188 btrfs_commit_transaction(trans, rc->extent_root);
2189 else
2190 btrfs_end_transaction(trans, rc->extent_root);
2191 return err;
1816} 2192}
1817 2193
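prepare_to_merge() reserves merging_rsv_size bytes while outside a transaction, joins one, and then re-reads the size: relocation running in parallel may have grown the requirement, in which case the stale reservation is released and the whole step is retried. A hedged sketch of that reserve/join/revalidate loop; the names are simplified stand-ins, not btrfs primitives:

    #include <stdio.h>

    static unsigned long long required = 16 << 20;  /* may grow concurrently */

    static int reserve(unsigned long long n)  { (void)n; return 0; }
    static void release(unsigned long long n) { (void)n; }

    static int prepare(void)
    {
            unsigned long long num_bytes;
    again:
            num_bytes = required;
            if (reserve(num_bytes))
                    return -1;              /* propagate the error */
            /* "join transaction" happens here; required can change */
            if (num_bytes != required) {
                    release(num_bytes);     /* drop the stale reservation */
                    goto again;             /* re-reserve the new size */
            }
            printf("reserved %llu bytes\n", num_bytes);
            return 0;
    }

    int main(void) { return prepare(); }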
1818static int merge_reloc_roots(struct reloc_control *rc) 2194static noinline_for_stack
2195int merge_reloc_roots(struct reloc_control *rc)
1819{ 2196{
1820 struct async_merge *async;
1821 struct btrfs_root *root; 2197 struct btrfs_root *root;
1822 struct completion done; 2198 struct btrfs_root *reloc_root;
1823 atomic_t num_pending; 2199 LIST_HEAD(reloc_roots);
2200 int found = 0;
2201 int ret;
2202again:
2203 root = rc->extent_root;
2204 mutex_lock(&root->fs_info->trans_mutex);
2205 list_splice_init(&rc->reloc_roots, &reloc_roots);
2206 mutex_unlock(&root->fs_info->trans_mutex);
1824 2207
1825 init_completion(&done); 2208 while (!list_empty(&reloc_roots)) {
1826 atomic_set(&num_pending, 1); 2209 found = 1;
2210 reloc_root = list_entry(reloc_roots.next,
2211 struct btrfs_root, root_list);
1827 2212
1828 while (!list_empty(&rc->reloc_roots)) { 2213 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1829 root = list_entry(rc->reloc_roots.next, 2214 root = read_fs_root(reloc_root->fs_info,
1830 struct btrfs_root, root_list); 2215 reloc_root->root_key.offset);
1831 list_del_init(&root->root_list); 2216 BUG_ON(IS_ERR(root));
2217 BUG_ON(root->reloc_root != reloc_root);
1832 2218
1833 async = kmalloc(sizeof(*async), GFP_NOFS); 2219 ret = merge_reloc_root(rc, root);
1834 BUG_ON(!async); 2220 BUG_ON(ret);
1835 async->work.func = merge_func; 2221 } else {
1836 async->work.flags = 0; 2222 list_del_init(&reloc_root->root_list);
1837 async->rc = rc; 2223 }
1838 async->root = root; 2224 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
1839 async->done = &done;
1840 async->num_pending = &num_pending;
1841 atomic_inc(&num_pending);
1842 btrfs_queue_worker(&rc->workers, &async->work);
1843 } 2225 }
1844 2226
1845 if (!atomic_dec_and_test(&num_pending)) 2227 if (found) {
1846 wait_for_completion(&done); 2228 found = 0;
1847 2229 goto again;
2230 }
1848 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2231 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1849 return 0; 2232 return 0;
1850} 2233}
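merge_reloc_roots() drains rc->reloc_roots by splicing the whole list onto a private head under trans_mutex and then walking it with the lock dropped; any roots queued while merging ran are caught by the goto-again pass. The same drain-outside-the-lock idiom in miniature, with a plain singly linked list standing in for the kernel list (illustrative only):

    #include <stdio.h>
    #include <stddef.h>

    struct node { int id; struct node *next; };

    /* detach the whole shared list in one step; the kernel does this
     * under trans_mutex so producers see an empty head afterwards */
    static struct node *splice_init(struct node **shared)
    {
            struct node *batch = *shared;
            *shared = NULL;
            return batch;
    }

    int main(void)
    {
            struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
            struct node *shared = &a;
            int found;

            do {
                    found = 0;
                    for (struct node *n = splice_init(&shared); n; n = n->next) {
                            found = 1;
                            printf("merging reloc root %d\n", n->id);
                    }
            } while (found);    /* new roots may have arrived meanwhile */
            return 0;
    }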
@@ -1875,119 +2258,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1875 return btrfs_record_root_in_trans(trans, root); 2258 return btrfs_record_root_in_trans(trans, root);
1876} 2259}
1877 2260
1878/* 2261static noinline_for_stack
1879 * select one tree from the trees that reference the block. 2262 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1880 * for blocks in reference counted trees, we prefer the reloc tree. 2263 struct reloc_control *rc,
1881 * if no reloc tree is found and reloc_only is true, NULL is returned. 2264 struct backref_node *node,
1882 */ 2265 struct backref_edge *edges[], int *nr)
1883static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1884 struct backref_node *node,
1885 struct backref_edge *edges[],
1886 int *nr, int reloc_only)
1887{ 2266{
1888 struct backref_node *next; 2267 struct backref_node *next;
1889 struct btrfs_root *root; 2268 struct btrfs_root *root;
1890 int index; 2269 int index = 0;
1891 int loop = 0; 2270
1892again:
1893 index = 0;
1894 next = node; 2271 next = node;
1895 while (1) { 2272 while (1) {
1896 cond_resched(); 2273 cond_resched();
1897 next = walk_up_backref(next, edges, &index); 2274 next = walk_up_backref(next, edges, &index);
1898 root = next->root; 2275 root = next->root;
1899 if (!root) { 2276 BUG_ON(!root);
1900 BUG_ON(!node->old_root); 2277 BUG_ON(!root->ref_cows);
1901 goto skip;
1902 }
1903
1904 /* no other choice for non-reference counted tree */
1905 if (!root->ref_cows) {
1906 BUG_ON(reloc_only);
1907 break;
1908 }
1909 2278
1910 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2279 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1911 record_reloc_root_in_trans(trans, root); 2280 record_reloc_root_in_trans(trans, root);
1912 break; 2281 break;
1913 } 2282 }
1914 2283
1915 if (loop) { 2284 btrfs_record_root_in_trans(trans, root);
1916 btrfs_record_root_in_trans(trans, root); 2285 root = root->reloc_root;
2286
2287 if (next->new_bytenr != root->node->start) {
2288 BUG_ON(next->new_bytenr);
2289 BUG_ON(!list_empty(&next->list));
2290 next->new_bytenr = root->node->start;
2291 next->root = root;
2292 list_add_tail(&next->list,
2293 &rc->backref_cache.changed);
2294 __mark_block_processed(rc, next);
1917 break; 2295 break;
1918 } 2296 }
1919 2297
1920 if (reloc_only || next != node) { 2298 WARN_ON(1);
1921 if (!root->reloc_root)
1922 btrfs_record_root_in_trans(trans, root);
1923 root = root->reloc_root;
1924 /*
1925 * if the reloc tree was created in the current
1926 * transaction, there is no node in the backref tree
1927 * that corresponds to the root of the reloc tree.
1928 */
1929 if (btrfs_root_last_snapshot(&root->root_item) ==
1930 trans->transid - 1)
1931 break;
1932 }
1933skip:
1934 root = NULL; 2299 root = NULL;
1935 next = walk_down_backref(edges, &index); 2300 next = walk_down_backref(edges, &index);
1936 if (!next || next->level <= node->level) 2301 if (!next || next->level <= node->level)
1937 break; 2302 break;
1938 } 2303 }
2304 if (!root)
2305 return NULL;
1939 2306
1940 if (!root && !loop && !reloc_only) { 2307 *nr = index;
1941 loop = 1; 2308 next = node;
1942 goto again; 2309 /* setup backref node path for btrfs_reloc_cow_block */
2310 while (1) {
2311 rc->backref_cache.path[next->level] = next;
2312 if (--index < 0)
2313 break;
2314 next = edges[index]->node[UPPER];
1943 } 2315 }
1944
1945 if (root)
1946 *nr = index;
1947 else
1948 *nr = 0;
1949
1950 return root; 2316 return root;
1951} 2317}
1952 2318
2319/*
2320 * select a tree root for relocation. return NULL if the block
2321 * is reference counted. we should use do_relocation() in this
2322 * case. return a tree root pointer if the block isn't reference
2323 * counted. return -ENOENT if the block is root of reloc tree.
2324 */
1953static noinline_for_stack 2325static noinline_for_stack
1954struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, 2326struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1955 struct backref_node *node) 2327 struct backref_node *node)
1956{ 2328{
2329 struct backref_node *next;
2330 struct btrfs_root *root;
2331 struct btrfs_root *fs_root = NULL;
1957 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2332 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1958 int nr; 2333 int index = 0;
1959 return __select_one_root(trans, node, edges, &nr, 0); 2334
2335 next = node;
2336 while (1) {
2337 cond_resched();
2338 next = walk_up_backref(next, edges, &index);
2339 root = next->root;
2340 BUG_ON(!root);
2341
2342 /* no other choice for non-reference counted tree */
2343 if (!root->ref_cows)
2344 return root;
2345
2346 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2347 fs_root = root;
2348
2349 if (next != node)
2350 return NULL;
2351
2352 next = walk_down_backref(edges, &index);
2353 if (!next || next->level <= node->level)
2354 break;
2355 }
2356
2357 if (!fs_root)
2358 return ERR_PTR(-ENOENT);
2359 return fs_root;
1960} 2360}
1961 2361
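Per the comment above it, the rewritten select_one_root() has three outcomes: a concrete root pointer for a non-reference-counted tree (and, reading the loop, apparently also when the walk ends at an fs tree root), NULL when the block sits in a reference-counted tree and must go through do_relocation(), and ERR_PTR(-ENOENT) when the block is the root of a reloc tree. A condensed sketch of how a caller such as relocate_tree_block() dispatches on that; the helper below is a toy ERR_PTR, not the kernel macro:

    #include <stdio.h>
    #include <errno.h>

    struct root { int ref_cows; };

    static inline void *ERR_PTR(long err) { return (void *)err; }  /* toy */

    static void dispatch(struct root *root)
    {
            if (root == ERR_PTR(-ENOENT)) {
                    /* root of a reloc tree: nothing to move, only
                     * mark the blocks processed */
                    printf("update_processed_blocks()\n");
            } else if (!root) {
                    /* reference counted tree: relocate via the
                     * backref cache */
                    printf("do_relocation()\n");
            } else {
                    /* got a concrete tree root: handled directly, in
                     * place when !ref_cows, via its reloc root otherwise */
                    printf("handle root, ref_cows=%d\n", root->ref_cows);
            }
    }

    int main(void)
    {
            struct root r = { 0 };
            dispatch(&r);
            dispatch(NULL);
            dispatch(ERR_PTR(-ENOENT));
            return 0;
    }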
1962static noinline_for_stack 2362static noinline_for_stack
1963struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2363u64 calcu_metadata_size(struct reloc_control *rc,
1964 struct backref_node *node, 2364 struct backref_node *node, int reserve)
1965 struct backref_edge *edges[], int *nr)
1966{ 2365{
1967 return __select_one_root(trans, node, edges, nr, 1); 2366 struct backref_node *next = node;
2367 struct backref_edge *edge;
2368 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2369 u64 num_bytes = 0;
2370 int index = 0;
2371
2372 BUG_ON(reserve && node->processed);
2373
2374 while (next) {
2375 cond_resched();
2376 while (1) {
2377 if (next->processed && (reserve || next != node))
2378 break;
2379
2380 num_bytes += btrfs_level_size(rc->extent_root,
2381 next->level);
2382
2383 if (list_empty(&next->upper))
2384 break;
2385
2386 edge = list_entry(next->upper.next,
2387 struct backref_edge, list[LOWER]);
2388 edges[index++] = edge;
2389 next = edge->node[UPPER];
2390 }
2391 next = walk_down_backref(edges, &index);
2392 }
2393 return num_bytes;
1968} 2394}
1969 2395
1970static void grab_path_buffers(struct btrfs_path *path, 2396static int reserve_metadata_space(struct btrfs_trans_handle *trans,
1971 struct backref_node *node, 2397 struct reloc_control *rc,
1972 struct backref_edge *edges[], int nr) 2398 struct backref_node *node)
1973{ 2399{
1974 int i = 0; 2400 struct btrfs_root *root = rc->extent_root;
1975 while (1) { 2401 u64 num_bytes;
1976 drop_node_buffer(node); 2402 int ret;
1977 node->eb = path->nodes[node->level]; 2403
1978 BUG_ON(!node->eb); 2404 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
1979 if (path->locks[node->level])
1980 node->locked = 1;
1981 path->nodes[node->level] = NULL;
1982 path->locks[node->level] = 0;
1983
1984 if (i >= nr)
1985 break;
1986 2405
1987 edges[i]->blockptr = node->eb->start; 2406 trans->block_rsv = rc->block_rsv;
1988 node = edges[i]->node[UPPER]; 2407 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
1989 i++; 2408 &rc->block_rsv_retries);
2409 if (ret) {
2410 if (ret == -EAGAIN)
2411 rc->commit_transaction = 1;
2412 return ret;
1990 } 2413 }
2414
2415 rc->block_rsv_retries = 0;
2416 return 0;
2417}
2418
2419static void release_metadata_space(struct reloc_control *rc,
2420 struct backref_node *node)
2421{
2422 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2423 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
1991} 2424}
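calcu_metadata_size() walks from the node up through every backref edge and sums btrfs_level_size() for each block that may still be COWed; reserve_metadata_space() then doubles the result, apparently budgeting for both the fs tree and the reloc tree copies. Worked arithmetic with assumed block sizes (illustrative numbers only):

    #include <stdio.h>

    int main(void)
    {
            unsigned int leafsize = 4096;   /* assumed level-0 block size */
            unsigned int nodesize = 4096;   /* assumed upper-level size */
            /* say the upward walk visits one leaf and three upper nodes */
            unsigned long long walked = leafsize + 3ULL * nodesize;
            unsigned long long reserve = walked * 2;    /* two tree copies */
            printf("reserve %llu bytes for this node\n", reserve); /* 32768 */
            return 0;
    }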
1992 2425
1993/* 2426/*
@@ -1998,6 +2431,7 @@ static void grab_path_buffers(struct btrfs_path *path,
1998 * in that case this function just updates pointers. 2431 * in that case this function just updates pointers.
1999 */ 2432 */
2000static int do_relocation(struct btrfs_trans_handle *trans, 2433static int do_relocation(struct btrfs_trans_handle *trans,
2434 struct reloc_control *rc,
2001 struct backref_node *node, 2435 struct backref_node *node,
2002 struct btrfs_key *key, 2436 struct btrfs_key *key,
2003 struct btrfs_path *path, int lowest) 2437 struct btrfs_path *path, int lowest)
@@ -2018,18 +2452,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2018 BUG_ON(lowest && node->eb); 2452 BUG_ON(lowest && node->eb);
2019 2453
2020 path->lowest_level = node->level + 1; 2454 path->lowest_level = node->level + 1;
2455 rc->backref_cache.path[node->level] = node;
2021 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2456 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2022 cond_resched(); 2457 cond_resched();
2023 if (node->eb && node->eb->start == edge->blockptr)
2024 continue;
2025 2458
2026 upper = edge->node[UPPER]; 2459 upper = edge->node[UPPER];
2027 root = select_reloc_root(trans, upper, edges, &nr); 2460 root = select_reloc_root(trans, rc, upper, edges, &nr);
2028 if (!root) 2461 BUG_ON(!root);
2029 continue; 2462
2030 2463 if (upper->eb && !upper->locked) {
2031 if (upper->eb && !upper->locked) 2464 if (!lowest) {
2465 ret = btrfs_bin_search(upper->eb, key,
2466 upper->level, &slot);
2467 BUG_ON(ret);
2468 bytenr = btrfs_node_blockptr(upper->eb, slot);
2469 if (node->eb->start == bytenr)
2470 goto next;
2471 }
2032 drop_node_buffer(upper); 2472 drop_node_buffer(upper);
2473 }
2033 2474
2034 if (!upper->eb) { 2475 if (!upper->eb) {
2035 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2476 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2039,11 +2480,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2039 } 2480 }
2040 BUG_ON(ret > 0); 2481 BUG_ON(ret > 0);
2041 2482
2042 slot = path->slots[upper->level]; 2483 if (!upper->eb) {
2484 upper->eb = path->nodes[upper->level];
2485 path->nodes[upper->level] = NULL;
2486 } else {
2487 BUG_ON(upper->eb != path->nodes[upper->level]);
2488 }
2043 2489
2044 btrfs_unlock_up_safe(path, upper->level + 1); 2490 upper->locked = 1;
2045 grab_path_buffers(path, upper, edges, nr); 2491 path->locks[upper->level] = 0;
2046 2492
2493 slot = path->slots[upper->level];
2047 btrfs_release_path(NULL, path); 2494 btrfs_release_path(NULL, path);
2048 } else { 2495 } else {
2049 ret = btrfs_bin_search(upper->eb, key, upper->level, 2496 ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2052,14 +2499,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2052 } 2499 }
2053 2500
2054 bytenr = btrfs_node_blockptr(upper->eb, slot); 2501 bytenr = btrfs_node_blockptr(upper->eb, slot);
2055 if (!lowest) { 2502 if (lowest) {
2056 if (node->eb->start == bytenr) { 2503 BUG_ON(bytenr != node->bytenr);
2057 btrfs_tree_unlock(upper->eb);
2058 upper->locked = 0;
2059 continue;
2060 }
2061 } else { 2504 } else {
2062 BUG_ON(node->bytenr != bytenr); 2505 if (node->eb->start == bytenr)
2506 goto next;
2063 } 2507 }
2064 2508
2065 blocksize = btrfs_level_size(root, node->level); 2509 blocksize = btrfs_level_size(root, node->level);
@@ -2071,13 +2515,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2071 if (!node->eb) { 2515 if (!node->eb) {
2072 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2516 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2073 slot, &eb); 2517 slot, &eb);
2518 btrfs_tree_unlock(eb);
2519 free_extent_buffer(eb);
2074 if (ret < 0) { 2520 if (ret < 0) {
2075 err = ret; 2521 err = ret;
2076 break; 2522 goto next;
2077 } 2523 }
2078 btrfs_set_lock_blocking(eb); 2524 BUG_ON(node->eb != eb);
2079 node->eb = eb;
2080 node->locked = 1;
2081 } else { 2525 } else {
2082 btrfs_set_node_blockptr(upper->eb, slot, 2526 btrfs_set_node_blockptr(upper->eb, slot,
2083 node->eb->start); 2527 node->eb->start);
@@ -2095,67 +2539,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2095 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2539 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2096 BUG_ON(ret); 2540 BUG_ON(ret);
2097 } 2541 }
2098 if (!lowest) { 2542next:
2099 btrfs_tree_unlock(upper->eb); 2543 if (!upper->pending)
2100 upper->locked = 0; 2544 drop_node_buffer(upper);
2101 } 2545 else
2546 unlock_node_buffer(upper);
2547 if (err)
2548 break;
2102 } 2549 }
2550
2551 if (!err && node->pending) {
2552 drop_node_buffer(node);
2553 list_move_tail(&node->list, &rc->backref_cache.changed);
2554 node->pending = 0;
2555 }
2556
2103 path->lowest_level = 0; 2557 path->lowest_level = 0;
2558 BUG_ON(err == -ENOSPC);
2104 return err; 2559 return err;
2105} 2560}
2106 2561
2107static int link_to_upper(struct btrfs_trans_handle *trans, 2562static int link_to_upper(struct btrfs_trans_handle *trans,
2563 struct reloc_control *rc,
2108 struct backref_node *node, 2564 struct backref_node *node,
2109 struct btrfs_path *path) 2565 struct btrfs_path *path)
2110{ 2566{
2111 struct btrfs_key key; 2567 struct btrfs_key key;
2112 if (!node->eb || list_empty(&node->upper))
2113 return 0;
2114 2568
2115 btrfs_node_key_to_cpu(node->eb, &key, 0); 2569 btrfs_node_key_to_cpu(node->eb, &key, 0);
2116 return do_relocation(trans, node, &key, path, 0); 2570 return do_relocation(trans, rc, node, &key, path, 0);
2117} 2571}
2118 2572
2119static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2573static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2120 struct backref_cache *cache, 2574 struct reloc_control *rc,
2121 struct btrfs_path *path) 2575 struct btrfs_path *path, int err)
2122{ 2576{
2577 LIST_HEAD(list);
2578 struct backref_cache *cache = &rc->backref_cache;
2123 struct backref_node *node; 2579 struct backref_node *node;
2124 int level; 2580 int level;
2125 int ret; 2581 int ret;
2126 int err = 0;
2127 2582
2128 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2583 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2129 while (!list_empty(&cache->pending[level])) { 2584 while (!list_empty(&cache->pending[level])) {
2130 node = list_entry(cache->pending[level].next, 2585 node = list_entry(cache->pending[level].next,
2131 struct backref_node, lower); 2586 struct backref_node, list);
2132 BUG_ON(node->level != level); 2587 list_move_tail(&node->list, &list);
2588 BUG_ON(!node->pending);
2133 2589
2134 ret = link_to_upper(trans, node, path); 2590 if (!err) {
2135 if (ret < 0) 2591 ret = link_to_upper(trans, rc, node, path);
2136 err = ret; 2592 if (ret < 0)
2137 /* 2593 err = ret;
2138 * this remove the node from the pending list and 2594 }
2139 * may add some other nodes to the level + 1
2140 * pending list
2141 */
2142 remove_backref_node(cache, node);
2143 } 2595 }
2596 list_splice_init(&list, &cache->pending[level]);
2144 } 2597 }
2145 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2146 return err; 2598 return err;
2147} 2599}
2148 2600
2149static void mark_block_processed(struct reloc_control *rc, 2601static void mark_block_processed(struct reloc_control *rc,
2150 struct backref_node *node) 2602 u64 bytenr, u32 blocksize)
2603{
2604 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2605 EXTENT_DIRTY, GFP_NOFS);
2606}
2607
2608static void __mark_block_processed(struct reloc_control *rc,
2609 struct backref_node *node)
2151{ 2610{
2152 u32 blocksize; 2611 u32 blocksize;
2153 if (node->level == 0 || 2612 if (node->level == 0 ||
2154 in_block_group(node->bytenr, rc->block_group)) { 2613 in_block_group(node->bytenr, rc->block_group)) {
2155 blocksize = btrfs_level_size(rc->extent_root, node->level); 2614 blocksize = btrfs_level_size(rc->extent_root, node->level);
2156 set_extent_bits(&rc->processed_blocks, node->bytenr, 2615 mark_block_processed(rc, node->bytenr, blocksize);
2157 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2158 GFP_NOFS);
2159 } 2616 }
2160 node->processed = 1; 2617 node->processed = 1;
2161} 2618}
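The split above leaves mark_block_processed() as a thin wrapper that sets EXTENT_DIRTY over [bytenr, bytenr + blocksize - 1] in rc->processed_blocks, with tree_block_processed() testing the same bits; the extent io tree works as a sparse bitmap keyed by disk byte ranges. A small interval-set model of that bookkeeping (just the idea, not the extent_io API):

    #include <stdio.h>

    #define MAX_RANGES 16

    static unsigned long long starts[MAX_RANGES], ends[MAX_RANGES];
    static int nranges;

    /* models set_extent_bits(..., EXTENT_DIRTY, ...) */
    static void mark_processed(unsigned long long bytenr, unsigned int blocksize)
    {
            if (nranges < MAX_RANGES) {
                    starts[nranges] = bytenr;
                    ends[nranges] = bytenr + blocksize - 1;
                    nranges++;
            }
    }

    /* models the bit test done by tree_block_processed() */
    static int is_processed(unsigned long long bytenr, unsigned int blocksize)
    {
            for (int i = 0; i < nranges; i++)
                    if (bytenr >= starts[i] && bytenr + blocksize - 1 <= ends[i])
                            return 1;
            return 0;
    }

    int main(void)
    {
            mark_processed(1 << 20, 4096);
            printf("%d %d\n", is_processed(1 << 20, 4096),
                   is_processed(2 << 20, 4096));        /* prints "1 0" */
            return 0;
    }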
@@ -2178,7 +2635,7 @@ static void update_processed_blocks(struct reloc_control *rc,
2178 if (next->processed) 2635 if (next->processed)
2179 break; 2636 break;
2180 2637
2181 mark_block_processed(rc, next); 2638 __mark_block_processed(rc, next);
2182 2639
2183 if (list_empty(&next->upper)) 2640 if (list_empty(&next->upper))
2184 break; 2641 break;
@@ -2201,138 +2658,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2201 return 0; 2658 return 0;
2202} 2659}
2203 2660
2204/*
2205 * check if any file extent pointers in the leaf point to
2206 * data that requires processing
2207 */
2208static int check_file_extents(struct reloc_control *rc,
2209 u64 bytenr, u32 blocksize, u64 ptr_gen)
2210{
2211 struct btrfs_key found_key;
2212 struct btrfs_file_extent_item *fi;
2213 struct extent_buffer *leaf;
2214 u32 nritems;
2215 int i;
2216 int ret = 0;
2217
2218 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2219
2220 nritems = btrfs_header_nritems(leaf);
2221 for (i = 0; i < nritems; i++) {
2222 cond_resched();
2223 btrfs_item_key_to_cpu(leaf, &found_key, i);
2224 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2225 continue;
2226 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2227 if (btrfs_file_extent_type(leaf, fi) ==
2228 BTRFS_FILE_EXTENT_INLINE)
2229 continue;
2230 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2231 if (bytenr == 0)
2232 continue;
2233 if (in_block_group(bytenr, rc->block_group)) {
2234 ret = 1;
2235 break;
2236 }
2237 }
2238 free_extent_buffer(leaf);
2239 return ret;
2240}
2241
2242/*
2243 * scan child blocks of a given block to find blocks that require processing
2244 */
2245static int add_child_blocks(struct btrfs_trans_handle *trans,
2246 struct reloc_control *rc,
2247 struct backref_node *node,
2248 struct rb_root *blocks)
2249{
2250 struct tree_block *block;
2251 struct rb_node *rb_node;
2252 u64 bytenr;
2253 u64 ptr_gen;
2254 u32 blocksize;
2255 u32 nritems;
2256 int i;
2257 int err = 0;
2258
2259 nritems = btrfs_header_nritems(node->eb);
2260 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2261 for (i = 0; i < nritems; i++) {
2262 cond_resched();
2263 bytenr = btrfs_node_blockptr(node->eb, i);
2264 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2265 if (ptr_gen == trans->transid)
2266 continue;
2267 if (!in_block_group(bytenr, rc->block_group) &&
2268 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2269 continue;
2270 if (tree_block_processed(bytenr, blocksize, rc))
2271 continue;
2272
2273 readahead_tree_block(rc->extent_root,
2274 bytenr, blocksize, ptr_gen);
2275 }
2276
2277 for (i = 0; i < nritems; i++) {
2278 cond_resched();
2279 bytenr = btrfs_node_blockptr(node->eb, i);
2280 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2281 if (ptr_gen == trans->transid)
2282 continue;
2283 if (!in_block_group(bytenr, rc->block_group) &&
2284 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2285 continue;
2286 if (tree_block_processed(bytenr, blocksize, rc))
2287 continue;
2288 if (!in_block_group(bytenr, rc->block_group) &&
2289 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2290 continue;
2291
2292 block = kmalloc(sizeof(*block), GFP_NOFS);
2293 if (!block) {
2294 err = -ENOMEM;
2295 break;
2296 }
2297 block->bytenr = bytenr;
2298 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2299 block->level = node->level - 1;
2300 block->key_ready = 1;
2301 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2302 BUG_ON(rb_node);
2303 }
2304 if (err)
2305 free_block_list(blocks);
2306 return err;
2307}
2308
2309/*
2310 * find adjacent blocks that require processing
2311 */
2312static noinline_for_stack
2313int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2314 struct reloc_control *rc,
2315 struct backref_cache *cache,
2316 struct rb_root *blocks, int level,
2317 struct backref_node **upper)
2318{
2319 struct backref_node *node;
2320 int ret = 0;
2321
2322 WARN_ON(!list_empty(&cache->pending[level]));
2323
2324 if (list_empty(&cache->pending[level + 1]))
2325 return 1;
2326
2327 node = list_entry(cache->pending[level + 1].next,
2328 struct backref_node, lower);
2329 if (node->eb)
2330 ret = add_child_blocks(trans, rc, node, blocks);
2331
2332 *upper = node;
2333 return ret;
2334}
2335
2336static int get_tree_block_key(struct reloc_control *rc, 2661static int get_tree_block_key(struct reloc_control *rc,
2337 struct tree_block *block) 2662 struct tree_block *block)
2338{ 2663{
@@ -2370,40 +2695,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2370 struct btrfs_path *path) 2695 struct btrfs_path *path)
2371{ 2696{
2372 struct btrfs_root *root; 2697 struct btrfs_root *root;
2373 int ret; 2698 int release = 0;
2699 int ret = 0;
2374 2700
2701 if (!node)
2702 return 0;
2703
2704 BUG_ON(node->processed);
2375 root = select_one_root(trans, node); 2705 root = select_one_root(trans, node);
2376 if (unlikely(!root)) { 2706 if (root == ERR_PTR(-ENOENT)) {
2377 rc->found_old_snapshot = 1;
2378 update_processed_blocks(rc, node); 2707 update_processed_blocks(rc, node);
2379 return 0; 2708 goto out;
2380 } 2709 }
2381 2710
2382 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2711 if (!root || root->ref_cows) {
2383 ret = do_relocation(trans, node, key, path, 1); 2712 ret = reserve_metadata_space(trans, rc, node);
2384 if (ret < 0) 2713 if (ret)
2385 goto out;
2386 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2387 ret = replace_file_extents(trans, rc, root,
2388 node->eb, NULL);
2389 if (ret < 0)
2390 goto out;
2391 }
2392 drop_node_buffer(node);
2393 } else if (!root->ref_cows) {
2394 path->lowest_level = node->level;
2395 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2396 btrfs_release_path(root, path);
2397 if (ret < 0)
2398 goto out; 2714 goto out;
2399 } else if (root != node->root) { 2715 release = 1;
2400 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2401 } 2716 }
2402 2717
2403 update_processed_blocks(rc, node); 2718 if (root) {
2404 ret = 0; 2719 if (root->ref_cows) {
2720 BUG_ON(node->new_bytenr);
2721 BUG_ON(!list_empty(&node->list));
2722 btrfs_record_root_in_trans(trans, root);
2723 root = root->reloc_root;
2724 node->new_bytenr = root->node->start;
2725 node->root = root;
2726 list_add_tail(&node->list, &rc->backref_cache.changed);
2727 } else {
2728 path->lowest_level = node->level;
2729 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2730 btrfs_release_path(root, path);
2731 if (ret > 0)
2732 ret = 0;
2733 }
2734 if (!ret)
2735 update_processed_blocks(rc, node);
2736 } else {
2737 ret = do_relocation(trans, rc, node, key, path, 1);
2738 }
2405out: 2739out:
2406 drop_node_buffer(node); 2740 if (ret || node->level == 0 || node->cowonly) {
2741 if (release)
2742 release_metadata_space(rc, node);
2743 remove_backref_node(&rc->backref_cache, node);
2744 }
2407 return ret; 2745 return ret;
2408} 2746}
2409 2747
@@ -2414,12 +2752,10 @@ static noinline_for_stack
2414int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2752int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2415 struct reloc_control *rc, struct rb_root *blocks) 2753 struct reloc_control *rc, struct rb_root *blocks)
2416{ 2754{
2417 struct backref_cache *cache;
2418 struct backref_node *node; 2755 struct backref_node *node;
2419 struct btrfs_path *path; 2756 struct btrfs_path *path;
2420 struct tree_block *block; 2757 struct tree_block *block;
2421 struct rb_node *rb_node; 2758 struct rb_node *rb_node;
2422 int level = -1;
2423 int ret; 2759 int ret;
2424 int err = 0; 2760 int err = 0;
2425 2761
@@ -2427,21 +2763,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2427 if (!path) 2763 if (!path)
2428 return -ENOMEM; 2764 return -ENOMEM;
2429 2765
2430 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2431 if (!cache) {
2432 btrfs_free_path(path);
2433 return -ENOMEM;
2434 }
2435
2436 backref_cache_init(cache);
2437
2438 rb_node = rb_first(blocks); 2766 rb_node = rb_first(blocks);
2439 while (rb_node) { 2767 while (rb_node) {
2440 block = rb_entry(rb_node, struct tree_block, rb_node); 2768 block = rb_entry(rb_node, struct tree_block, rb_node);
2441 if (level == -1)
2442 level = block->level;
2443 else
2444 BUG_ON(level != block->level);
2445 if (!block->key_ready) 2769 if (!block->key_ready)
2446 reada_tree_block(rc, block); 2770 reada_tree_block(rc, block);
2447 rb_node = rb_next(rb_node); 2771 rb_node = rb_next(rb_node);
@@ -2459,7 +2783,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2459 while (rb_node) { 2783 while (rb_node) {
2460 block = rb_entry(rb_node, struct tree_block, rb_node); 2784 block = rb_entry(rb_node, struct tree_block, rb_node);
2461 2785
2462 node = build_backref_tree(rc, cache, &block->key, 2786 node = build_backref_tree(rc, &block->key,
2463 block->level, block->bytenr); 2787 block->level, block->bytenr);
2464 if (IS_ERR(node)) { 2788 if (IS_ERR(node)) {
2465 err = PTR_ERR(node); 2789 err = PTR_ERR(node);
@@ -2469,79 +2793,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2469 ret = relocate_tree_block(trans, rc, node, &block->key, 2793 ret = relocate_tree_block(trans, rc, node, &block->key,
2470 path); 2794 path);
2471 if (ret < 0) { 2795 if (ret < 0) {
2472 err = ret; 2796 if (ret != -EAGAIN || rb_node == rb_first(blocks))
2797 err = ret;
2473 goto out; 2798 goto out;
2474 } 2799 }
2475 remove_backref_node(cache, node);
2476 rb_node = rb_next(rb_node); 2800 rb_node = rb_next(rb_node);
2477 } 2801 }
2478 2802out:
2479 if (level > 0)
2480 goto out;
2481
2482 free_block_list(blocks); 2803 free_block_list(blocks);
2804 err = finish_pending_nodes(trans, rc, path, err);
2483 2805
2484 /* 2806 btrfs_free_path(path);
2485 * now backrefs of some upper level tree blocks have been cached, 2807 return err;
2486 * try relocating blocks referenced by these upper level blocks. 2808}
2487 */
2488 while (1) {
2489 struct backref_node *upper = NULL;
2490 if (trans->transaction->in_commit ||
2491 trans->transaction->delayed_refs.flushing)
2492 break;
2493 2809
2494 ret = add_adjacent_blocks(trans, rc, cache, blocks, level, 2810static noinline_for_stack
2495 &upper); 2811int prealloc_file_extent_cluster(struct inode *inode,
2496 if (ret < 0) 2812 struct file_extent_cluster *cluster)
2497 err = ret; 2813{
2498 if (ret != 0) 2814 u64 alloc_hint = 0;
2499 break; 2815 u64 start;
2816 u64 end;
2817 u64 offset = BTRFS_I(inode)->index_cnt;
2818 u64 num_bytes;
2819 int nr = 0;
2820 int ret = 0;
2500 2821
2501 rb_node = rb_first(blocks); 2822 BUG_ON(cluster->start != cluster->boundary[0]);
2502 while (rb_node) { 2823 mutex_lock(&inode->i_mutex);
2503 block = rb_entry(rb_node, struct tree_block, rb_node);
2504 if (trans->transaction->in_commit ||
2505 trans->transaction->delayed_refs.flushing)
2506 goto out;
2507 BUG_ON(!block->key_ready);
2508 node = build_backref_tree(rc, cache, &block->key,
2509 level, block->bytenr);
2510 if (IS_ERR(node)) {
2511 err = PTR_ERR(node);
2512 goto out;
2513 }
2514 2824
2515 ret = relocate_tree_block(trans, rc, node, 2825 ret = btrfs_check_data_free_space(inode, cluster->end +
2516 &block->key, path); 2826 1 - cluster->start);
2517 if (ret < 0) { 2827 if (ret)
2518 err = ret; 2828 goto out;
2519 goto out;
2520 }
2521 remove_backref_node(cache, node);
2522 rb_node = rb_next(rb_node);
2523 }
2524 free_block_list(blocks);
2525 2829
2526 if (upper) { 2830 while (nr < cluster->nr) {
2527 ret = link_to_upper(trans, upper, path); 2831 start = cluster->boundary[nr] - offset;
2528 if (ret < 0) { 2832 if (nr + 1 < cluster->nr)
2529 err = ret; 2833 end = cluster->boundary[nr + 1] - 1 - offset;
2530 break; 2834 else
2531 } 2835 end = cluster->end - offset;
2532 remove_backref_node(cache, upper); 2836
2533 } 2837 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2838 num_bytes = end + 1 - start;
2839 ret = btrfs_prealloc_file_range(inode, 0, start,
2840 num_bytes, num_bytes,
2841 end + 1, &alloc_hint);
2842 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2843 if (ret)
2844 break;
2845 nr++;
2534 } 2846 }
2847 btrfs_free_reserved_data_space(inode, cluster->end +
2848 1 - cluster->start);
2535out: 2849out:
2536 free_block_list(blocks); 2850 mutex_unlock(&inode->i_mutex);
2537 2851 return ret;
2538 ret = finish_pending_nodes(trans, cache, path);
2539 if (ret < 0)
2540 err = ret;
2541
2542 kfree(cache);
2543 btrfs_free_path(path);
2544 return err;
2545} 2852}
2546 2853
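prealloc_file_extent_cluster() translates cluster boundaries from disk bytenr space into file offsets by subtracting offset (BTRFS_I(inode)->index_cnt): extent nr spans [boundary[nr] - offset, boundary[nr + 1] - 1 - offset], and the last extent ends at cluster->end - offset. The offset arithmetic with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
            /* illustrative cluster holding two extents */
            unsigned long long offset = 1048576;            /* index_cnt */
            unsigned long long boundary[] = { 1048576, 1310720 };
            unsigned long long cluster_end = 1572863;       /* inclusive */
            int nr_extents = 2;

            for (int nr = 0; nr < nr_extents; nr++) {
                    unsigned long long start = boundary[nr] - offset;
                    unsigned long long end = (nr + 1 < nr_extents)
                            ? boundary[nr + 1] - 1 - offset
                            : cluster_end - offset;
                    printf("prealloc file range [%llu, %llu], %llu bytes\n",
                           start, end, end + 1 - start);
            }
            return 0;
    }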
2547static noinline_for_stack 2854static noinline_for_stack
@@ -2587,7 +2894,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
2587 u64 offset = BTRFS_I(inode)->index_cnt; 2894 u64 offset = BTRFS_I(inode)->index_cnt;
2588 unsigned long index; 2895 unsigned long index;
2589 unsigned long last_index; 2896 unsigned long last_index;
2590 unsigned int dirty_page = 0;
2591 struct page *page; 2897 struct page *page;
2592 struct file_ra_state *ra; 2898 struct file_ra_state *ra;
2593 int nr = 0; 2899 int nr = 0;
@@ -2600,21 +2906,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
2600 if (!ra) 2906 if (!ra)
2601 return -ENOMEM; 2907 return -ENOMEM;
2602 2908
2603 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2909 ret = prealloc_file_extent_cluster(inode, cluster);
2604 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2910 if (ret)
2911 goto out;
2605 2912
2606 mutex_lock(&inode->i_mutex); 2913 file_ra_state_init(ra, inode->i_mapping);
2607 2914
2608 i_size_write(inode, cluster->end + 1 - offset);
2609 ret = setup_extent_mapping(inode, cluster->start - offset, 2915 ret = setup_extent_mapping(inode, cluster->start - offset,
2610 cluster->end - offset, cluster->start); 2916 cluster->end - offset, cluster->start);
2611 if (ret) 2917 if (ret)
2612 goto out_unlock; 2918 goto out;
2613
2614 file_ra_state_init(ra, inode->i_mapping);
2615 2919
2616 WARN_ON(cluster->start != cluster->boundary[0]); 2920 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2921 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2617 while (index <= last_index) { 2922 while (index <= last_index) {
2923 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2924 if (ret)
2925 goto out;
2926
2618 page = find_lock_page(inode->i_mapping, index); 2927 page = find_lock_page(inode->i_mapping, index);
2619 if (!page) { 2928 if (!page) {
2620 page_cache_sync_readahead(inode->i_mapping, 2929 page_cache_sync_readahead(inode->i_mapping,
@@ -2622,8 +2931,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2622 last_index + 1 - index); 2931 last_index + 1 - index);
2623 page = grab_cache_page(inode->i_mapping, index); 2932 page = grab_cache_page(inode->i_mapping, index);
2624 if (!page) { 2933 if (!page) {
2934 btrfs_delalloc_release_metadata(inode,
2935 PAGE_CACHE_SIZE);
2625 ret = -ENOMEM; 2936 ret = -ENOMEM;
2626 goto out_unlock; 2937 goto out;
2627 } 2938 }
2628 } 2939 }
2629 2940
@@ -2639,8 +2950,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2639 if (!PageUptodate(page)) { 2950 if (!PageUptodate(page)) {
2640 unlock_page(page); 2951 unlock_page(page);
2641 page_cache_release(page); 2952 page_cache_release(page);
2953 btrfs_delalloc_release_metadata(inode,
2954 PAGE_CACHE_SIZE);
2642 ret = -EIO; 2955 ret = -EIO;
2643 goto out_unlock; 2956 goto out;
2644 } 2957 }
2645 } 2958 }
2646 2959
@@ -2659,10 +2972,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2659 EXTENT_BOUNDARY, GFP_NOFS); 2972 EXTENT_BOUNDARY, GFP_NOFS);
2660 nr++; 2973 nr++;
2661 } 2974 }
2662 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2663 2975
2976 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664 set_page_dirty(page); 2977 set_page_dirty(page);
2665 dirty_page++;
2666 2978
2667 unlock_extent(&BTRFS_I(inode)->io_tree, 2979 unlock_extent(&BTRFS_I(inode)->io_tree,
2668 page_start, page_end, GFP_NOFS); 2980 page_start, page_end, GFP_NOFS);
@@ -2670,20 +2982,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
2670 page_cache_release(page); 2982 page_cache_release(page);
2671 2983
2672 index++; 2984 index++;
2673 if (nr < cluster->nr && 2985 balance_dirty_pages_ratelimited(inode->i_mapping);
2674 page_end + 1 + offset == cluster->boundary[nr]) { 2986 btrfs_throttle(BTRFS_I(inode)->root);
2675 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2676 dirty_page);
2677 dirty_page = 0;
2678 }
2679 }
2680 if (dirty_page) {
2681 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2682 dirty_page);
2683 } 2987 }
2684 WARN_ON(nr != cluster->nr); 2988 WARN_ON(nr != cluster->nr);
2685out_unlock: 2989out:
2686 mutex_unlock(&inode->i_mutex);
2687 kfree(ra); 2990 kfree(ra);
2688 return ret; 2991 return ret;
2689} 2992}
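relocate_file_extent_cluster() now reserves delalloc metadata for a single page before grabbing it, and every early exit that abandons the page gives exactly that reservation back, so the accounting stays balanced whether the loop finishes or bails. A toy model of the reserve-per-iteration, release-on-failure shape (none of these helpers are the btrfs calls):

    #include <stdio.h>

    #define PAGE_SIZE 4096

    static long outstanding;            /* bytes currently reserved */

    static int reserve_page(void)  { outstanding += PAGE_SIZE; return 0; }
    static void release_page(void) { outstanding -= PAGE_SIZE; }
    static int grab_page(int idx)  { return idx != 3; }  /* page 3 "fails" */

    int main(void)
    {
            for (int idx = 0; idx < 5; idx++) {
                    if (reserve_page())
                            return 1;
                    if (!grab_page(idx)) {
                            release_page();     /* undo before bailing */
                            fprintf(stderr, "page %d failed, %ld still held "
                                    "by dirtied pages\n", idx, outstanding);
                            return 1;
                    }
                    /* the page is dirtied; its reservation is consumed
                     * later when the delalloc range is written back */
            }
            printf("all pages dirtied, %ld bytes reserved\n", outstanding);
            return 0;
    }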
@@ -2869,9 +3172,6 @@ out:
2869static int block_use_full_backref(struct reloc_control *rc, 3172static int block_use_full_backref(struct reloc_control *rc,
2870 struct extent_buffer *eb) 3173 struct extent_buffer *eb)
2871{ 3174{
2872 struct btrfs_path *path;
2873 struct btrfs_extent_item *ei;
2874 struct btrfs_key key;
2875 u64 flags; 3175 u64 flags;
2876 int ret; 3176 int ret;
2877 3177
@@ -2879,28 +3179,14 @@ static int block_use_full_backref(struct reloc_control *rc,
2879 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3179 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2880 return 1; 3180 return 1;
2881 3181
2882 path = btrfs_alloc_path(); 3182 ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
2883 BUG_ON(!path); 3183 eb->start, eb->len, NULL, &flags);
2884
2885 key.objectid = eb->start;
2886 key.type = BTRFS_EXTENT_ITEM_KEY;
2887 key.offset = eb->len;
2888
2889 path->search_commit_root = 1;
2890 path->skip_locking = 1;
2891 ret = btrfs_search_slot(NULL, rc->extent_root,
2892 &key, path, 0, 0);
2893 BUG_ON(ret); 3184 BUG_ON(ret);
2894 3185
2895 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2896 struct btrfs_extent_item);
2897 flags = btrfs_extent_flags(path->nodes[0], ei);
2898 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2899 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3186 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2900 ret = 1; 3187 ret = 1;
2901 else 3188 else
2902 ret = 0; 3189 ret = 0;
2903 btrfs_free_path(path);
2904 return ret; 3190 return ret;
2905} 3191}
2906 3192
@@ -3073,22 +3359,10 @@ int add_data_references(struct reloc_control *rc,
3073 struct btrfs_extent_inline_ref *iref; 3359 struct btrfs_extent_inline_ref *iref;
3074 unsigned long ptr; 3360 unsigned long ptr;
3075 unsigned long end; 3361 unsigned long end;
3076 u32 blocksize; 3362 u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3077 int ret; 3363 int ret;
3078 int err = 0; 3364 int err = 0;
3079 3365
3080 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3081 extent_key->offset);
3082 BUG_ON(ret < 0);
3083 if (ret > 0) {
3084 /* the relocated data is fragmented */
3085 rc->extents_skipped++;
3086 btrfs_release_path(rc->extent_root, path);
3087 return 0;
3088 }
3089
3090 blocksize = btrfs_level_size(rc->extent_root, 0);
3091
3092 eb = path->nodes[0]; 3366 eb = path->nodes[0];
3093 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3367 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3094 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3368 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3169,7 +3443,8 @@ int add_data_references(struct reloc_control *rc,
3169 */ 3443 */
3170static noinline_for_stack 3444static noinline_for_stack
3171int find_next_extent(struct btrfs_trans_handle *trans, 3445int find_next_extent(struct btrfs_trans_handle *trans,
3172 struct reloc_control *rc, struct btrfs_path *path) 3446 struct reloc_control *rc, struct btrfs_path *path,
3447 struct btrfs_key *extent_key)
3173{ 3448{
3174 struct btrfs_key key; 3449 struct btrfs_key key;
3175 struct extent_buffer *leaf; 3450 struct extent_buffer *leaf;
@@ -3224,6 +3499,7 @@ next:
3224 rc->search_start = end + 1; 3499 rc->search_start = end + 1;
3225 } else { 3500 } else {
3226 rc->search_start = key.objectid + key.offset; 3501 rc->search_start = key.objectid + key.offset;
3502 memcpy(extent_key, &key, sizeof(key));
3227 return 0; 3503 return 0;
3228 } 3504 }
3229 } 3505 }
@@ -3261,12 +3537,49 @@ static int check_extent_flags(u64 flags)
3261 return 0; 3537 return 0;
3262} 3538}
3263 3539
3540static noinline_for_stack
3541int prepare_to_relocate(struct reloc_control *rc)
3542{
3543 struct btrfs_trans_handle *trans;
3544 int ret;
3545
3546 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
3547 if (!rc->block_rsv)
3548 return -ENOMEM;
3549
3550 /*
3551 * reserve some space for creating reloc trees.
3552 * btrfs_init_reloc_root will use this reservation when
3553 * there is no reservation in the transaction handle.
3554 */
3555 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3556 rc->extent_root->nodesize * 256,
3557 &rc->block_rsv_retries);
3558 if (ret)
3559 return ret;
3560
3561 rc->block_rsv->refill_used = 1;
3562 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3563
3564 memset(&rc->cluster, 0, sizeof(rc->cluster));
3565 rc->search_start = rc->block_group->key.objectid;
3566 rc->extents_found = 0;
3567 rc->nodes_relocated = 0;
3568 rc->merging_rsv_size = 0;
3569 rc->block_rsv_retries = 0;
3570
3571 rc->create_reloc_tree = 1;
3572 set_reloc_control(rc);
3573
3574 trans = btrfs_join_transaction(rc->extent_root, 1);
3575 btrfs_commit_transaction(trans, rc->extent_root);
3576 return 0;
3577}
3264 3578
3265static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3579static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3266{ 3580{
3267 struct rb_root blocks = RB_ROOT; 3581 struct rb_root blocks = RB_ROOT;
3268 struct btrfs_key key; 3582 struct btrfs_key key;
3269 struct file_extent_cluster *cluster;
3270 struct btrfs_trans_handle *trans = NULL; 3583 struct btrfs_trans_handle *trans = NULL;
3271 struct btrfs_path *path; 3584 struct btrfs_path *path;
3272 struct btrfs_extent_item *ei; 3585 struct btrfs_extent_item *ei;
@@ -3276,33 +3589,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3276 int ret; 3589 int ret;
3277 int err = 0; 3590 int err = 0;
3278 3591
3279 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3280 if (!cluster)
3281 return -ENOMEM;
3282
3283 path = btrfs_alloc_path(); 3592 path = btrfs_alloc_path();
3284 if (!path) { 3593 if (!path)
3285 kfree(cluster);
3286 return -ENOMEM; 3594 return -ENOMEM;
3287 }
3288
3289 rc->extents_found = 0;
3290 rc->extents_skipped = 0;
3291
3292 rc->search_start = rc->block_group->key.objectid;
3293 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3294 GFP_NOFS);
3295
3296 rc->create_reloc_root = 1;
3297 set_reloc_control(rc);
3298 3595
3299 trans = btrfs_start_transaction(rc->extent_root, 1); 3596 ret = prepare_to_relocate(rc);
3300 btrfs_commit_transaction(trans, rc->extent_root); 3597 if (ret) {
3598 err = ret;
3599 goto out_free;
3600 }
3301 3601
3302 while (1) { 3602 while (1) {
3303 trans = btrfs_start_transaction(rc->extent_root, 1); 3603 trans = btrfs_start_transaction(rc->extent_root, 0);
3604
3605 if (update_backref_cache(trans, &rc->backref_cache)) {
3606 btrfs_end_transaction(trans, rc->extent_root);
3607 continue;
3608 }
3304 3609
3305 ret = find_next_extent(trans, rc, path); 3610 ret = find_next_extent(trans, rc, path, &key);
3306 if (ret < 0) 3611 if (ret < 0)
3307 err = ret; 3612 err = ret;
3308 if (ret != 0) 3613 if (ret != 0)
@@ -3312,9 +3617,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3312 3617
3313 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3618 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3314 struct btrfs_extent_item); 3619 struct btrfs_extent_item);
3315 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3620 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3316 item_size = btrfs_item_size_nr(path->nodes[0],
3317 path->slots[0]);
3318 if (item_size >= sizeof(*ei)) { 3621 if (item_size >= sizeof(*ei)) {
3319 flags = btrfs_extent_flags(path->nodes[0], ei); 3622 flags = btrfs_extent_flags(path->nodes[0], ei);
3320 ret = check_extent_flags(flags); 3623 ret = check_extent_flags(flags);
@@ -3355,73 +3658,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3355 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3658 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3356 ret = add_tree_block(rc, &key, path, &blocks); 3659 ret = add_tree_block(rc, &key, path, &blocks);
3357 } else if (rc->stage == UPDATE_DATA_PTRS && 3660 } else if (rc->stage == UPDATE_DATA_PTRS &&
3358 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3661 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3359 ret = add_data_references(rc, &key, path, &blocks); 3662 ret = add_data_references(rc, &key, path, &blocks);
3360 } else { 3663 } else {
3361 btrfs_release_path(rc->extent_root, path); 3664 btrfs_release_path(rc->extent_root, path);
3362 ret = 0; 3665 ret = 0;
3363 } 3666 }
3364 if (ret < 0) { 3667 if (ret < 0) {
3365 err = 0; 3668 err = ret;
3366 break; 3669 break;
3367 } 3670 }
3368 3671
3369 if (!RB_EMPTY_ROOT(&blocks)) { 3672 if (!RB_EMPTY_ROOT(&blocks)) {
3370 ret = relocate_tree_blocks(trans, rc, &blocks); 3673 ret = relocate_tree_blocks(trans, rc, &blocks);
3371 if (ret < 0) { 3674 if (ret < 0) {
3675 if (ret != -EAGAIN) {
3676 err = ret;
3677 break;
3678 }
3679 rc->extents_found--;
3680 rc->search_start = key.objectid;
3681 }
3682 }
3683
3684 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3685 rc->block_rsv, 0, 5);
3686 if (ret < 0) {
3687 if (ret != -EAGAIN) {
3372 err = ret; 3688 err = ret;
3689 WARN_ON(1);
3373 break; 3690 break;
3374 } 3691 }
3692 rc->commit_transaction = 1;
3375 } 3693 }
3376 3694
3377 nr = trans->blocks_used; 3695 if (rc->commit_transaction) {
3378 btrfs_end_transaction(trans, rc->extent_root); 3696 rc->commit_transaction = 0;
3697 ret = btrfs_commit_transaction(trans, rc->extent_root);
3698 BUG_ON(ret);
3699 } else {
3700 nr = trans->blocks_used;
3701 btrfs_end_transaction_throttle(trans, rc->extent_root);
3702 btrfs_btree_balance_dirty(rc->extent_root, nr);
3703 }
3379 trans = NULL; 3704 trans = NULL;
3380 btrfs_btree_balance_dirty(rc->extent_root, nr);
3381 3705
3382 if (rc->stage == MOVE_DATA_EXTENTS && 3706 if (rc->stage == MOVE_DATA_EXTENTS &&
3383 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3707 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3384 rc->found_file_extent = 1; 3708 rc->found_file_extent = 1;
3385 ret = relocate_data_extent(rc->data_inode, 3709 ret = relocate_data_extent(rc->data_inode,
3386 &key, cluster); 3710 &key, &rc->cluster);
3387 if (ret < 0) { 3711 if (ret < 0) {
3388 err = ret; 3712 err = ret;
3389 break; 3713 break;
3390 } 3714 }
3391 } 3715 }
3392 } 3716 }
3393 btrfs_free_path(path); 3717
3718 btrfs_release_path(rc->extent_root, path);
3719 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3720 GFP_NOFS);
3394 3721
3395 if (trans) { 3722 if (trans) {
3396 nr = trans->blocks_used; 3723 nr = trans->blocks_used;
3397 btrfs_end_transaction(trans, rc->extent_root); 3724 btrfs_end_transaction_throttle(trans, rc->extent_root);
3398 btrfs_btree_balance_dirty(rc->extent_root, nr); 3725 btrfs_btree_balance_dirty(rc->extent_root, nr);
3399 } 3726 }
3400 3727
3401 if (!err) { 3728 if (!err) {
3402 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3729 ret = relocate_file_extent_cluster(rc->data_inode,
3730 &rc->cluster);
3403 if (ret < 0) 3731 if (ret < 0)
3404 err = ret; 3732 err = ret;
3405 } 3733 }
3406 3734
3407 kfree(cluster); 3735 rc->create_reloc_tree = 0;
3736 set_reloc_control(rc);
3408 3737
3409 rc->create_reloc_root = 0; 3738 backref_cache_cleanup(&rc->backref_cache);
3410 smp_mb(); 3739 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3411 3740
3412 if (rc->extents_found > 0) { 3741 err = prepare_to_merge(rc, err);
3413 trans = btrfs_start_transaction(rc->extent_root, 1);
3414 btrfs_commit_transaction(trans, rc->extent_root);
3415 }
3416 3742
3417 merge_reloc_roots(rc); 3743 merge_reloc_roots(rc);
3418 3744
3745 rc->merge_reloc_tree = 0;
3419 unset_reloc_control(rc); 3746 unset_reloc_control(rc);
3747 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3420 3748
3421 /* get rid of pinned extents */ 3749 /* get rid of pinned extents */
3422 trans = btrfs_start_transaction(rc->extent_root, 1); 3750 trans = btrfs_join_transaction(rc->extent_root, 1);
3423 btrfs_commit_transaction(trans, rc->extent_root); 3751 btrfs_commit_transaction(trans, rc->extent_root);
3424 3752out_free:
3753 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3754 btrfs_free_path(path);
3425 return err; 3755 return err;
3426} 3756}
3427 3757
@@ -3447,7 +3777,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3447 btrfs_set_inode_generation(leaf, item, 1); 3777 btrfs_set_inode_generation(leaf, item, 1);
3448 btrfs_set_inode_size(leaf, item, 0); 3778 btrfs_set_inode_size(leaf, item, 0);
3449 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3779 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3450 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3780 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3781 BTRFS_INODE_PREALLOC);
3451 btrfs_mark_buffer_dirty(leaf); 3782 btrfs_mark_buffer_dirty(leaf);
3452 btrfs_release_path(root, path); 3783 btrfs_release_path(root, path);
3453out: 3784out:
@@ -3459,8 +3790,9 @@ out:
3459 * helper to create inode for data relocation. 3790 * helper to create inode for data relocation.
3460 * the inode is in data relocation tree and its link count is 0 3791 * the inode is in data relocation tree and its link count is 0
3461 */ 3792 */
3462static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3793static noinline_for_stack
3463 struct btrfs_block_group_cache *group) 3794struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3795 struct btrfs_block_group_cache *group)
3464{ 3796{
3465 struct inode *inode = NULL; 3797 struct inode *inode = NULL;
3466 struct btrfs_trans_handle *trans; 3798 struct btrfs_trans_handle *trans;
@@ -3474,8 +3806,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3474 if (IS_ERR(root)) 3806 if (IS_ERR(root))
3475 return ERR_CAST(root); 3807 return ERR_CAST(root);
3476 3808
3477 trans = btrfs_start_transaction(root, 1); 3809 trans = btrfs_start_transaction(root, 6);
3478 BUG_ON(!trans); 3810 if (IS_ERR(trans))
3811 return ERR_CAST(trans);
3479 3812
3480 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3813 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3481 if (err) 3814 if (err)
@@ -3495,7 +3828,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3495out: 3828out:
3496 nr = trans->blocks_used; 3829 nr = trans->blocks_used;
3497 btrfs_end_transaction(trans, root); 3830 btrfs_end_transaction(trans, root);
3498
3499 btrfs_btree_balance_dirty(root, nr); 3831 btrfs_btree_balance_dirty(root, nr);
3500 if (err) { 3832 if (err) {
3501 if (inode) 3833 if (inode)
@@ -3505,6 +3837,21 @@ out:
3505 return inode; 3837 return inode;
3506} 3838}
3507 3839
3840static struct reloc_control *alloc_reloc_control(void)
3841{
3842 struct reloc_control *rc;
3843
3844 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3845 if (!rc)
3846 return NULL;
3847
3848 INIT_LIST_HEAD(&rc->reloc_roots);
3849 backref_cache_init(&rc->backref_cache);
3850 mapping_tree_init(&rc->reloc_root_tree);
3851 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3852 return rc;
3853}
3854
3508/* 3855/*
3509 * function to relocate all extents in a block group. 3856 * function to relocate all extents in a block group.
3510 */ 3857 */
@@ -3513,24 +3860,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3513 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3860 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3514 struct reloc_control *rc; 3861 struct reloc_control *rc;
3515 int ret; 3862 int ret;
3863 int rw = 0;
3516 int err = 0; 3864 int err = 0;
3517 3865
3518 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3866 rc = alloc_reloc_control();
3519 if (!rc) 3867 if (!rc)
3520 return -ENOMEM; 3868 return -ENOMEM;
3521 3869
3522 mapping_tree_init(&rc->reloc_root_tree); 3870 rc->extent_root = extent_root;
3523 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3524 INIT_LIST_HEAD(&rc->reloc_roots);
3525 3871
3526 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3872 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3527 BUG_ON(!rc->block_group); 3873 BUG_ON(!rc->block_group);
3528 3874
3529 btrfs_init_workers(&rc->workers, "relocate", 3875 if (!rc->block_group->ro) {
3530 fs_info->thread_pool_size, NULL); 3876 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3531 3877 if (ret) {
3532 rc->extent_root = extent_root; 3878 err = ret;
3533 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3879 goto out;
3880 }
3881 rw = 1;
3882 }
3534 3883
3535 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3884 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3536 if (IS_ERR(rc->data_inode)) { 3885 if (IS_ERR(rc->data_inode)) {
@@ -3547,9 +3896,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3547 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3896 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3548 3897
3549 while (1) { 3898 while (1) {
3550 rc->extents_found = 0;
3551 rc->extents_skipped = 0;
3552
3553 mutex_lock(&fs_info->cleaner_mutex); 3899 mutex_lock(&fs_info->cleaner_mutex);
3554 3900
3555 btrfs_clean_old_snapshots(fs_info->tree_root); 3901 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3558,7 +3904,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3558 mutex_unlock(&fs_info->cleaner_mutex); 3904 mutex_unlock(&fs_info->cleaner_mutex);
3559 if (ret < 0) { 3905 if (ret < 0) {
3560 err = ret; 3906 err = ret;
3561 break; 3907 goto out;
3562 } 3908 }
3563 3909
3564 if (rc->extents_found == 0) 3910 if (rc->extents_found == 0)
@@ -3572,18 +3918,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3572 invalidate_mapping_pages(rc->data_inode->i_mapping, 3918 invalidate_mapping_pages(rc->data_inode->i_mapping,
3573 0, -1); 3919 0, -1);
3574 rc->stage = UPDATE_DATA_PTRS; 3920 rc->stage = UPDATE_DATA_PTRS;
3575 } else if (rc->stage == UPDATE_DATA_PTRS &&
3576 rc->extents_skipped >= rc->extents_found) {
3577 iput(rc->data_inode);
3578 rc->data_inode = create_reloc_inode(fs_info,
3579 rc->block_group);
3580 if (IS_ERR(rc->data_inode)) {
3581 err = PTR_ERR(rc->data_inode);
3582 rc->data_inode = NULL;
3583 break;
3584 }
3585 rc->stage = MOVE_DATA_EXTENTS;
3586 rc->found_file_extent = 0;
3587 } 3921 }
3588 } 3922 }
3589 3923
@@ -3596,8 +3930,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3596 WARN_ON(rc->block_group->reserved > 0); 3930 WARN_ON(rc->block_group->reserved > 0);
3597 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 3931 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3598out: 3932out:
3933 if (err && rw)
3934 btrfs_set_block_group_rw(extent_root, rc->block_group);
3599 iput(rc->data_inode); 3935 iput(rc->data_inode);
3600 btrfs_stop_workers(&rc->workers);
3601 btrfs_put_block_group(rc->block_group); 3936 btrfs_put_block_group(rc->block_group);
3602 kfree(rc); 3937 kfree(rc);
3603 return err; 3938 return err;
@@ -3608,7 +3943,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3608 struct btrfs_trans_handle *trans; 3943 struct btrfs_trans_handle *trans;
3609 int ret; 3944 int ret;
3610 3945
3611 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 3946 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3612 3947
3613 memset(&root->root_item.drop_progress, 0, 3948 memset(&root->root_item.drop_progress, 0,
3614 sizeof(root->root_item.drop_progress)); 3949 sizeof(root->root_item.drop_progress));
@@ -3701,20 +4036,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3701 if (list_empty(&reloc_roots)) 4036 if (list_empty(&reloc_roots))
3702 goto out; 4037 goto out;
3703 4038
3704 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4039 rc = alloc_reloc_control();
3705 if (!rc) { 4040 if (!rc) {
3706 err = -ENOMEM; 4041 err = -ENOMEM;
3707 goto out; 4042 goto out;
3708 } 4043 }
3709 4044
3710 mapping_tree_init(&rc->reloc_root_tree);
3711 INIT_LIST_HEAD(&rc->reloc_roots);
3712 btrfs_init_workers(&rc->workers, "relocate",
3713 root->fs_info->thread_pool_size, NULL);
3714 rc->extent_root = root->fs_info->extent_root; 4045 rc->extent_root = root->fs_info->extent_root;
3715 4046
3716 set_reloc_control(rc); 4047 set_reloc_control(rc);
3717 4048
4049 trans = btrfs_join_transaction(rc->extent_root, 1);
4050
4051 rc->merge_reloc_tree = 1;
4052
3718 while (!list_empty(&reloc_roots)) { 4053 while (!list_empty(&reloc_roots)) {
3719 reloc_root = list_entry(reloc_roots.next, 4054 reloc_root = list_entry(reloc_roots.next,
3720 struct btrfs_root, root_list); 4055 struct btrfs_root, root_list);
@@ -3734,20 +4069,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3734 fs_root->reloc_root = reloc_root; 4069 fs_root->reloc_root = reloc_root;
3735 } 4070 }
3736 4071
3737 trans = btrfs_start_transaction(rc->extent_root, 1);
3738 btrfs_commit_transaction(trans, rc->extent_root); 4072 btrfs_commit_transaction(trans, rc->extent_root);
3739 4073
3740 merge_reloc_roots(rc); 4074 merge_reloc_roots(rc);
3741 4075
3742 unset_reloc_control(rc); 4076 unset_reloc_control(rc);
3743 4077
3744 trans = btrfs_start_transaction(rc->extent_root, 1); 4078 trans = btrfs_join_transaction(rc->extent_root, 1);
3745 btrfs_commit_transaction(trans, rc->extent_root); 4079 btrfs_commit_transaction(trans, rc->extent_root);
3746out: 4080out:
3747 if (rc) { 4081 kfree(rc);
3748 btrfs_stop_workers(&rc->workers);
3749 kfree(rc);
3750 }
3751 while (!list_empty(&reloc_roots)) { 4082 while (!list_empty(&reloc_roots)) {
3752 reloc_root = list_entry(reloc_roots.next, 4083 reloc_root = list_entry(reloc_roots.next,
3753 struct btrfs_root, root_list); 4084 struct btrfs_root, root_list);
@@ -3813,3 +4144,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3813 btrfs_put_ordered_extent(ordered); 4144 btrfs_put_ordered_extent(ordered);
3814 return 0; 4145 return 0;
3815} 4146}
4147
4148void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4149 struct btrfs_root *root, struct extent_buffer *buf,
4150 struct extent_buffer *cow)
4151{
4152 struct reloc_control *rc;
4153 struct backref_node *node;
4154 int first_cow = 0;
4155 int level;
4156 int ret;
4157
4158 rc = root->fs_info->reloc_ctl;
4159 if (!rc)
4160 return;
4161
4162 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4163 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4164
4165 level = btrfs_header_level(buf);
4166 if (btrfs_header_generation(buf) <=
4167 btrfs_root_last_snapshot(&root->root_item))
4168 first_cow = 1;
4169
4170 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4171 rc->create_reloc_tree) {
4172 WARN_ON(!first_cow && level == 0);
4173
4174 node = rc->backref_cache.path[level];
4175 BUG_ON(node->bytenr != buf->start &&
4176 node->new_bytenr != buf->start);
4177
4178 drop_node_buffer(node);
4179 extent_buffer_get(cow);
4180 node->eb = cow;
4181 node->new_bytenr = cow->start;
4182
4183 if (!node->pending) {
4184 list_move_tail(&node->list,
4185 &rc->backref_cache.pending[level]);
4186 node->pending = 1;
4187 }
4188
4189 if (first_cow)
4190 __mark_block_processed(rc, node);
4191
4192 if (first_cow && level > 0)
4193 rc->nodes_relocated += buf->len;
4194 }
4195
4196 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4197 ret = replace_file_extents(trans, rc, root, cow);
4198 BUG_ON(ret);
4199 }
4200}
4201
4202/*
4203 * called before creating a snapshot. it calculates the metadata
4204 * reservation required for relocating tree blocks in the snapshot
4205 */
4206void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4207 struct btrfs_pending_snapshot *pending,
4208 u64 *bytes_to_reserve)
4209{
4210 struct btrfs_root *root;
4211 struct reloc_control *rc;
4212
4213 root = pending->root;
4214 if (!root->reloc_root)
4215 return;
4216
4217 rc = root->fs_info->reloc_ctl;
4218 if (!rc->merge_reloc_tree)
4219 return;
4220
4221 root = root->reloc_root;
4222 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4223 /*
4224 * relocation is in the stage of merging trees. the space
4225 * used by merging a reloc tree is twice the size of
4226 * relocated tree nodes in the worst case. half for cowing
4227 * the reloc tree, half for cowing the fs tree. the space
4228 * used by cowing the reloc tree will be freed after the
4229 * tree is dropped. if we create a snapshot, cowing the fs
4230 * tree may use more space than it frees, so we need to
4231 * reserve extra space.
4232 */
4233 *bytes_to_reserve += rc->nodes_relocated;
4234}
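
A back-of-envelope model of the rule above (standalone C; the 32 MiB figure is invented for illustration): cowing the fs tree while a reloc tree is being merged may consume up to nodes_relocated bytes without freeing a matching amount, so each snapshot created mid-merge adds that much to the reservation.

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* hypothetical amount of tree metadata moved so far */
		uint64_t nodes_relocated = 32ULL << 20;	/* 32 MiB */
		uint64_t bytes_to_reserve = 0;

		/* mirrors btrfs_reloc_pre_snapshot(): one extra helping
		 * of nodes_relocated for the pending snapshot */
		bytes_to_reserve += nodes_relocated;

		printf("extra reservation: %llu MiB\n",
		       (unsigned long long)(bytes_to_reserve >> 20));
		return 0;
	}
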
4235
4236/*
4237 * called after snapshot is created. migrate block reservation
4238 * and create reloc root for the newly created snapshot
4239 */
4240void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4241 struct btrfs_pending_snapshot *pending)
4242{
4243 struct btrfs_root *root = pending->root;
4244 struct btrfs_root *reloc_root;
4245 struct btrfs_root *new_root;
4246 struct reloc_control *rc;
4247 int ret;
4248
4249 if (!root->reloc_root)
4250 return;
4251
4252 rc = root->fs_info->reloc_ctl;
4253 rc->merging_rsv_size += rc->nodes_relocated;
4254
4255 if (rc->merge_reloc_tree) {
4256 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4257 rc->block_rsv,
4258 rc->nodes_relocated);
4259 BUG_ON(ret);
4260 }
4261
4262 new_root = pending->snap;
4263 reloc_root = create_reloc_root(trans, root->reloc_root,
4264 new_root->root_key.objectid);
4265
4266 __add_reloc_root(reloc_root);
4267 new_root->reloc_root = reloc_root;
4268
4269 if (rc->create_reloc_tree) {
4270 ret = clone_backref_node(trans, rc, root, reloc_root);
4271 BUG_ON(ret);
4272 }
4273}
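
The rewritten btrfs_relocate_block_group() above pivots on a small idiom: flip the block group read-only before moving extents, remember in rw whether this call did the flipping, and flip it back only on error, so a successful relocation leaves the group read-only for the caller. A minimal standalone sketch of that error-path discipline (stand-in types and helpers, not the kernel API):

	#include <stdbool.h>

	struct block_group { bool ro; };

	/* stand-ins for btrfs_set_block_group_ro()/_rw() */
	static int set_ro(struct block_group *bg) { bg->ro = true; return 0; }
	static void set_rw(struct block_group *bg) { bg->ro = false; }

	static int relocate(struct block_group *bg)
	{
		bool rw = false;	/* did *we* make it read-only? */
		int err = 0;

		if (!bg->ro) {
			err = set_ro(bg);
			if (err)
				goto out;
			rw = true;
		}

		/* ... move extents; failures set err and jump to out ... */
	out:
		/* undo only our own flip, and only on failure */
		if (err && rw)
			set_rw(bg);
		return err;
	}

	int main(void)
	{
		struct block_group bg = { .ro = false };
		return relocate(&bg);
	}
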
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..b91ccd972644 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 259 struct extent_buffer *leaf;
260 struct btrfs_path *path; 260 struct btrfs_path *path;
261 struct btrfs_key key; 261 struct btrfs_key key;
262 struct btrfs_key root_key;
263 struct btrfs_root *root;
262 int err = 0; 264 int err = 0;
263 int ret; 265 int ret;
264 266
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 272 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 273 key.offset = 0;
272 274
275 root_key.type = BTRFS_ROOT_ITEM_KEY;
276 root_key.offset = (u64)-1;
277
273 while (1) { 278 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 279 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 280 if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 299 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 300 break;
296 301
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 302 root_key.objectid = key.offset;
298 if (ret) { 303 key.offset++;
304
305 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
306 &root_key);
307 if (!IS_ERR(root))
308 continue;
309
310 ret = PTR_ERR(root);
311 if (ret != -ENOENT) {
299 err = ret; 312 err = ret;
300 break; 313 break;
301 } 314 }
302 315
303 key.offset++; 316 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
317 if (ret) {
318 err = ret;
319 break;
320 }
304 } 321 }
305 322
306 btrfs_free_path(path); 323 btrfs_free_path(path);
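
The reworked loop in btrfs_find_orphan_roots() first tries to read the root behind each orphan item and only treats it as dead when the lookup fails with -ENOENT; any other error aborts the scan. A standalone model of that decision, with invented stand-ins for the btrfs calls:

	#include <stdio.h>
	#include <errno.h>

	/* stand-in: 0 = root still exists, -ENOENT = gone, other = hard error */
	static int read_fs_root(int objectid)
	{
		return objectid == 2 ? -ENOENT : 0;	/* invented behavior */
	}

	static int find_dead_roots(int objectid)
	{
		printf("cleaning dead root %d\n", objectid);
		return 0;
	}

	int main(void)
	{
		int err = 0;

		for (int objectid = 1; objectid <= 3; objectid++) {
			int ret = read_fs_root(objectid);
			if (!ret)
				continue;	/* root alive, skip it */
			if (ret != -ENOENT) {
				err = ret;	/* real error: stop scanning */
				break;
			}
			err = find_dead_roots(objectid);
			if (err)
				break;
		}
		return err ? 1 : 0;
	}
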
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9ac612e6ca60..d34b2dfc9628 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -64,10 +65,9 @@ static void btrfs_put_super(struct super_block *sb)
64 65
65enum { 66enum {
66 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 67 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
67 Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, 68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
68 Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, 69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
69 Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, 70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
70 Opt_flushoncommit,
71 Opt_discard, Opt_err, 71 Opt_discard, Opt_err,
72}; 72};
73 73
@@ -79,7 +79,6 @@ static match_table_t tokens = {
79 {Opt_nodatasum, "nodatasum"}, 79 {Opt_nodatasum, "nodatasum"},
80 {Opt_nodatacow, "nodatacow"}, 80 {Opt_nodatacow, "nodatacow"},
81 {Opt_nobarrier, "nobarrier"}, 81 {Opt_nobarrier, "nobarrier"},
82 {Opt_max_extent, "max_extent=%s"},
83 {Opt_max_inline, "max_inline=%s"}, 82 {Opt_max_inline, "max_inline=%s"},
84 {Opt_alloc_start, "alloc_start=%s"}, 83 {Opt_alloc_start, "alloc_start=%s"},
85 {Opt_thread_pool, "thread_pool=%d"}, 84 {Opt_thread_pool, "thread_pool=%d"},
@@ -188,18 +187,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
188 info->thread_pool_size); 187 info->thread_pool_size);
189 } 188 }
190 break; 189 break;
191 case Opt_max_extent:
192 num = match_strdup(&args[0]);
193 if (num) {
194 info->max_extent = memparse(num, NULL);
195 kfree(num);
196
197 info->max_extent = max_t(u64,
198 info->max_extent, root->sectorsize);
199 printk(KERN_INFO "btrfs: max_extent at %llu\n",
200 (unsigned long long)info->max_extent);
201 }
202 break;
203 case Opt_max_inline: 190 case Opt_max_inline:
204 num = match_strdup(&args[0]); 191 num = match_strdup(&args[0]);
205 if (num) { 192 if (num) {
@@ -511,7 +498,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
511 btrfs_start_delalloc_inodes(root, 0); 498 btrfs_start_delalloc_inodes(root, 0);
512 btrfs_wait_ordered_extents(root, 0, 0); 499 btrfs_wait_ordered_extents(root, 0, 0);
513 500
514 trans = btrfs_start_transaction(root, 1); 501 trans = btrfs_start_transaction(root, 0);
515 ret = btrfs_commit_transaction(trans, root); 502 ret = btrfs_commit_transaction(trans, root);
516 return ret; 503 return ret;
517} 504}
@@ -529,9 +516,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
529 seq_puts(seq, ",nodatacow"); 516 seq_puts(seq, ",nodatacow");
530 if (btrfs_test_opt(root, NOBARRIER)) 517 if (btrfs_test_opt(root, NOBARRIER))
531 seq_puts(seq, ",nobarrier"); 518 seq_puts(seq, ",nobarrier");
532 if (info->max_extent != (u64)-1)
533 seq_printf(seq, ",max_extent=%llu",
534 (unsigned long long)info->max_extent);
535 if (info->max_inline != 8192 * 1024) 519 if (info->max_inline != 8192 * 1024)
536 seq_printf(seq, ",max_inline=%llu", 520 seq_printf(seq, ",max_inline=%llu",
537 (unsigned long long)info->max_inline); 521 (unsigned long long)info->max_inline);
@@ -710,11 +694,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
710 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 694 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
711 return -EINVAL; 695 return -EINVAL;
712 696
713 /* recover relocation */ 697 ret = btrfs_cleanup_fs_roots(root->fs_info);
714 ret = btrfs_recover_relocation(root);
715 WARN_ON(ret); 698 WARN_ON(ret);
716 699
717 ret = btrfs_cleanup_fs_roots(root->fs_info); 700 /* recover relocation */
701 ret = btrfs_recover_relocation(root);
718 WARN_ON(ret); 702 WARN_ON(ret);
719 703
720 sb->s_flags &= ~MS_RDONLY; 704 sb->s_flags &= ~MS_RDONLY;
@@ -730,34 +714,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
730 struct list_head *head = &root->fs_info->space_info; 714 struct list_head *head = &root->fs_info->space_info;
731 struct btrfs_space_info *found; 715 struct btrfs_space_info *found;
732 u64 total_used = 0; 716 u64 total_used = 0;
733 u64 data_used = 0;
734 int bits = dentry->d_sb->s_blocksize_bits; 717 int bits = dentry->d_sb->s_blocksize_bits;
735 __be32 *fsid = (__be32 *)root->fs_info->fsid; 718 __be32 *fsid = (__be32 *)root->fs_info->fsid;
736 719
737 rcu_read_lock(); 720 rcu_read_lock();
738 list_for_each_entry_rcu(found, head, list) { 721 list_for_each_entry_rcu(found, head, list)
739 if (found->flags & (BTRFS_BLOCK_GROUP_DUP| 722 total_used += found->disk_used;
740 BTRFS_BLOCK_GROUP_RAID10|
741 BTRFS_BLOCK_GROUP_RAID1)) {
742 total_used += found->bytes_used;
743 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
744 data_used += found->bytes_used;
745 else
746 data_used += found->total_bytes;
747 }
748
749 total_used += found->bytes_used;
750 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
751 data_used += found->bytes_used;
752 else
753 data_used += found->total_bytes;
754 }
755 rcu_read_unlock(); 723 rcu_read_unlock();
756 724
757 buf->f_namelen = BTRFS_NAME_LEN; 725 buf->f_namelen = BTRFS_NAME_LEN;
758 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 726 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
759 buf->f_bfree = buf->f_blocks - (total_used >> bits); 727 buf->f_bfree = buf->f_blocks - (total_used >> bits);
760 buf->f_bavail = buf->f_blocks - (data_used >> bits); 728 buf->f_bavail = buf->f_bfree;
761 buf->f_bsize = dentry->d_sb->s_blocksize; 729 buf->f_bsize = dentry->d_sb->s_blocksize;
762 buf->f_type = BTRFS_SUPER_MAGIC; 730 buf->f_type = BTRFS_SUPER_MAGIC;
763 731
@@ -848,11 +816,14 @@ static const struct file_operations btrfs_ctl_fops = {
848}; 816};
849 817
850static struct miscdevice btrfs_misc = { 818static struct miscdevice btrfs_misc = {
851 .minor = MISC_DYNAMIC_MINOR, 819 .minor = BTRFS_MINOR,
852 .name = "btrfs-control", 820 .name = "btrfs-control",
853 .fops = &btrfs_ctl_fops 821 .fops = &btrfs_ctl_fops
854}; 822};
855 823
824MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
825MODULE_ALIAS("devname:btrfs-control");
826
856static int btrfs_interface_init(void) 827static int btrfs_interface_init(void)
857{ 828{
858 return misc_register(&btrfs_misc); 829 return misc_register(&btrfs_misc);
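
btrfs_statfs() above now sums the per-space_info disk_used figures instead of special-casing DUP/RAID1/RAID10 profiles, and drops the separate data estimate by setting f_bavail = f_bfree. The arithmetic, as a runnable model with invented sizes:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t total_bytes = 100ULL << 30;	/* hypothetical 100 GiB fs */
		uint64_t disk_used[] = { 30ULL << 30, 1ULL << 30 }; /* per space_info */
		int bits = 12;				/* 4 KiB blocksize */
		uint64_t total_used = 0;

		for (unsigned i = 0; i < sizeof(disk_used) / sizeof(disk_used[0]); i++)
			total_used += disk_used[i];

		uint64_t f_blocks = total_bytes >> bits;
		uint64_t f_bfree  = f_blocks - (total_used >> bits);
		uint64_t f_bavail = f_bfree;	/* no separate data estimate */

		printf("blocks=%llu free=%llu avail=%llu\n",
		       (unsigned long long)f_blocks,
		       (unsigned long long)f_bfree,
		       (unsigned long long)f_bavail);
		return 0;
	}
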
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2d654c1c794d..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
@@ -147,18 +148,13 @@ static void wait_current_trans(struct btrfs_root *root)
147 while (1) { 148 while (1) {
148 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 149 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
149 TASK_UNINTERRUPTIBLE); 150 TASK_UNINTERRUPTIBLE);
150 if (cur_trans->blocked) { 151 if (!cur_trans->blocked)
151 mutex_unlock(&root->fs_info->trans_mutex);
152 schedule();
153 mutex_lock(&root->fs_info->trans_mutex);
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 } else {
157 finish_wait(&root->fs_info->transaction_wait,
158 &wait);
159 break; 152 break;
160 } 153 mutex_unlock(&root->fs_info->trans_mutex);
154 schedule();
155 mutex_lock(&root->fs_info->trans_mutex);
161 } 156 }
157 finish_wait(&root->fs_info->transaction_wait, &wait);
162 put_transaction(cur_trans); 158 put_transaction(cur_trans);
163 } 159 }
164} 160}
@@ -169,54 +165,89 @@ enum btrfs_trans_type {
169 TRANS_USERSPACE, 165 TRANS_USERSPACE,
170}; 166};
171 167
168static int may_wait_transaction(struct btrfs_root *root, int type)
169{
170 if (!root->fs_info->log_root_recovering &&
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
172 type == TRANS_USERSPACE))
173 return 1;
174 return 0;
175}
176
172static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 177static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
173 int num_blocks, int type) 178 u64 num_items, int type)
174{ 179{
175 struct btrfs_trans_handle *h = 180 struct btrfs_trans_handle *h;
176 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
177 int ret; 183 int ret;
184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h)
187 return ERR_PTR(-ENOMEM);
178 188
179 mutex_lock(&root->fs_info->trans_mutex); 189 mutex_lock(&root->fs_info->trans_mutex);
180 if (!root->fs_info->log_root_recovering && 190 if (may_wait_transaction(root, type))
181 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
182 type == TRANS_USERSPACE))
183 wait_current_trans(root); 191 wait_current_trans(root);
192
184 ret = join_transaction(root); 193 ret = join_transaction(root);
185 BUG_ON(ret); 194 BUG_ON(ret);
186 195
187 h->transid = root->fs_info->running_transaction->transid; 196 cur_trans = root->fs_info->running_transaction;
188 h->transaction = root->fs_info->running_transaction; 197 cur_trans->use_count++;
189 h->blocks_reserved = num_blocks; 198 mutex_unlock(&root->fs_info->trans_mutex);
199
200 h->transid = cur_trans->transid;
201 h->transaction = cur_trans;
190 h->blocks_used = 0; 202 h->blocks_used = 0;
191 h->block_group = 0; 203 h->block_group = 0;
192 h->alloc_exclude_nr = 0; 204 h->bytes_reserved = 0;
193 h->alloc_exclude_start = 0;
194 h->delayed_ref_updates = 0; 205 h->delayed_ref_updates = 0;
206 h->block_rsv = NULL;
195 207
196 if (!current->journal_info && type != TRANS_USERSPACE) 208 smp_mb();
197 current->journal_info = h; 209 if (cur_trans->blocked && may_wait_transaction(root, type)) {
210 btrfs_commit_transaction(h, root);
211 goto again;
212 }
213
214 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items,
216 &retries);
217 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root);
219 goto again;
220 }
221 if (ret < 0) {
222 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret);
224 }
225 }
198 226
199 root->fs_info->running_transaction->use_count++; 227 mutex_lock(&root->fs_info->trans_mutex);
200 record_root_in_trans(h, root); 228 record_root_in_trans(h, root);
201 mutex_unlock(&root->fs_info->trans_mutex); 229 mutex_unlock(&root->fs_info->trans_mutex);
230
231 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h;
202 return h; 233 return h;
203} 234}
204 235
205struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 236struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
206 int num_blocks) 237 int num_items)
207{ 238{
208 return start_transaction(root, num_blocks, TRANS_START); 239 return start_transaction(root, num_items, TRANS_START);
209} 240}
210struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
211 int num_blocks) 242 int num_blocks)
212{ 243{
213 return start_transaction(root, num_blocks, TRANS_JOIN); 244 return start_transaction(root, 0, TRANS_JOIN);
214} 245}
215 246
216struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
217 int num_blocks) 248 int num_blocks)
218{ 249{
219 return start_transaction(r, num_blocks, TRANS_USERSPACE); 250 return start_transaction(r, 0, TRANS_USERSPACE);
220} 251}
221 252
222/* wait for a transaction commit to be fully complete */ 253/* wait for a transaction commit to be fully complete */
@@ -290,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
290 mutex_unlock(&root->fs_info->trans_mutex); 321 mutex_unlock(&root->fs_info->trans_mutex);
291} 322}
292 323
324static int should_end_transaction(struct btrfs_trans_handle *trans,
325 struct btrfs_root *root)
326{
327 int ret;
328 ret = btrfs_block_rsv_check(trans, root,
329 &root->fs_info->global_block_rsv, 0, 5);
330 return ret ? 1 : 0;
331}
332
333int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
334 struct btrfs_root *root)
335{
336 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates;
338
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1;
341
342 updates = trans->delayed_ref_updates;
343 trans->delayed_ref_updates = 0;
344 if (updates)
345 btrfs_run_delayed_refs(trans, root, updates);
346
347 return should_end_transaction(trans, root);
348}
349
293static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
294 struct btrfs_root *root, int throttle) 351 struct btrfs_root *root, int throttle)
295{ 352{
296 struct btrfs_transaction *cur_trans; 353 struct btrfs_transaction *cur_trans = trans->transaction;
297 struct btrfs_fs_info *info = root->fs_info; 354 struct btrfs_fs_info *info = root->fs_info;
298 int count = 0; 355 int count = 0;
299 356
@@ -317,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
317 count++; 374 count++;
318 } 375 }
319 376
377 btrfs_trans_release_metadata(trans, root);
378
379 if (!root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1;
382
383 if (cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle)
385 return btrfs_commit_transaction(trans, root);
386 else
387 wake_up_process(info->transaction_kthread);
388 }
389
320 mutex_lock(&info->trans_mutex); 390 mutex_lock(&info->trans_mutex);
321 cur_trans = info->running_transaction; 391 WARN_ON(cur_trans != info->running_transaction);
322 WARN_ON(cur_trans != trans->transaction);
323 WARN_ON(cur_trans->num_writers < 1); 392 WARN_ON(cur_trans->num_writers < 1);
324 cur_trans->num_writers--; 393 cur_trans->num_writers--;
325 394
@@ -607,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
607 676
608 btrfs_free_log(trans, root); 677 btrfs_free_log(trans, root);
609 btrfs_update_reloc_root(trans, root); 678 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root);
610 680
611 if (root->commit_root != root->node) { 681 if (root->commit_root != root->node) {
612 switch_commit_root(root); 682 switch_commit_root(root);
@@ -631,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
631int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 701int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
632{ 702{
633 struct btrfs_fs_info *info = root->fs_info; 703 struct btrfs_fs_info *info = root->fs_info;
634 int ret;
635 struct btrfs_trans_handle *trans; 704 struct btrfs_trans_handle *trans;
705 int ret;
636 unsigned long nr; 706 unsigned long nr;
637 707
638 smp_mb(); 708 if (xchg(&root->defrag_running, 1))
639 if (root->defrag_running)
640 return 0; 709 return 0;
641 trans = btrfs_start_transaction(root, 1); 710
642 while (1) { 711 while (1) {
643 root->defrag_running = 1; 712 trans = btrfs_start_transaction(root, 0);
713 if (IS_ERR(trans))
714 return PTR_ERR(trans);
715
644 ret = btrfs_defrag_leaves(trans, root, cacheonly); 716 ret = btrfs_defrag_leaves(trans, root, cacheonly);
717
645 nr = trans->blocks_used; 718 nr = trans->blocks_used;
646 btrfs_end_transaction(trans, root); 719 btrfs_end_transaction(trans, root);
647 btrfs_btree_balance_dirty(info->tree_root, nr); 720 btrfs_btree_balance_dirty(info->tree_root, nr);
648 cond_resched(); 721 cond_resched();
649 722
650 trans = btrfs_start_transaction(root, 1);
651 if (root->fs_info->closing || ret != -EAGAIN) 723 if (root->fs_info->closing || ret != -EAGAIN)
652 break; 724 break;
653 } 725 }
654 root->defrag_running = 0; 726 root->defrag_running = 0;
655 smp_mb(); 727 return ret;
656 btrfs_end_transaction(trans, root);
657 return 0;
658} 728}
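
The xchg() in the btrfs_defrag_root() hunk above turns the old smp_mb()/test sequence into an atomic test-and-set: exactly one caller sees 0 and proceeds, everyone else backs off, and the flag is cleared when the work ends. Roughly the same guard in portable C11 (a model, not the kernel primitive):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int defrag_running;

	static int defrag_root(void)
	{
		/* mirrors: if (xchg(&root->defrag_running, 1)) return 0; */
		if (atomic_exchange(&defrag_running, 1))
			return 0;	/* another caller is already defragging */

		/* ... defrag work would happen here ... */

		atomic_store(&defrag_running, 0);
		return 1;
	}

	int main(void)
	{
		printf("did work: %d\n", defrag_root());
		return 0;
	}
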
659 729
660#if 0 730#if 0
@@ -760,29 +830,72 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
760 struct btrfs_root_item *new_root_item; 830 struct btrfs_root_item *new_root_item;
761 struct btrfs_root *tree_root = fs_info->tree_root; 831 struct btrfs_root *tree_root = fs_info->tree_root;
762 struct btrfs_root *root = pending->root; 832 struct btrfs_root *root = pending->root;
833 struct btrfs_root *parent_root;
834 struct inode *parent_inode;
835 struct dentry *dentry;
763 struct extent_buffer *tmp; 836 struct extent_buffer *tmp;
764 struct extent_buffer *old; 837 struct extent_buffer *old;
765 int ret; 838 int ret;
839 int retries = 0;
840 u64 to_reserve = 0;
841 u64 index = 0;
766 u64 objectid; 842 u64 objectid;
767 843
768 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
769 if (!new_root_item) { 845 if (!new_root_item) {
770 ret = -ENOMEM; 846 pending->error = -ENOMEM;
771 goto fail; 847 goto fail;
772 } 848 }
849
773 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
774 if (ret) 851 if (ret) {
852 pending->error = ret;
775 goto fail; 853 goto fail;
854 }
855
856 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
857 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
858
859 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries);
862 if (ret) {
863 pending->error = ret;
864 goto fail;
865 }
866 }
867
868 key.objectid = objectid;
869 key.offset = (u64)-1;
870 key.type = BTRFS_ROOT_ITEM_KEY;
871
872 trans->block_rsv = &pending->block_rsv;
873
874 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root;
877 record_root_in_trans(trans, parent_root);
878
879 /*
880 * insert the directory item
881 */
882 ret = btrfs_set_inode_index(parent_inode, &index);
883 BUG_ON(ret);
884 ret = btrfs_insert_dir_item(trans, parent_root,
885 dentry->d_name.name, dentry->d_name.len,
886 parent_inode->i_ino, &key,
887 BTRFS_FT_DIR, index);
888 BUG_ON(ret);
889
890 btrfs_i_size_write(parent_inode, parent_inode->i_size +
891 dentry->d_name.len * 2);
892 ret = btrfs_update_inode(trans, parent_root, parent_inode);
893 BUG_ON(ret);
776 894
777 record_root_in_trans(trans, root); 895 record_root_in_trans(trans, root);
778 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 896 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
779 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 897 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
780 898
781 key.objectid = objectid;
782 /* record when the snapshot was created in key.offset */
783 key.offset = trans->transid;
784 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
785
786 old = btrfs_lock_root_node(root); 899 old = btrfs_lock_root_node(root);
787 btrfs_cow_block(trans, root, old, NULL, 0, &old); 900 btrfs_cow_block(trans, root, old, NULL, 0, &old);
788 btrfs_set_lock_blocking(old); 901 btrfs_set_lock_blocking(old);
@@ -792,62 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
792 free_extent_buffer(old); 905 free_extent_buffer(old);
793 906
794 btrfs_set_root_node(new_root_item, tmp); 907 btrfs_set_root_node(new_root_item, tmp);
795 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 908 /* record when the snapshot was created in key.offset */
796 new_root_item); 909 key.offset = trans->transid;
910 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
797 btrfs_tree_unlock(tmp); 911 btrfs_tree_unlock(tmp);
798 free_extent_buffer(tmp); 912 free_extent_buffer(tmp);
799 if (ret) 913 BUG_ON(ret);
800 goto fail;
801
802 key.offset = (u64)-1;
803 memcpy(&pending->root_key, &key, sizeof(key));
804fail:
805 kfree(new_root_item);
806 return ret;
807}
808
809static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
810 struct btrfs_pending_snapshot *pending)
811{
812 int ret;
813 int namelen;
814 u64 index = 0;
815 struct btrfs_trans_handle *trans;
816 struct inode *parent_inode;
817 struct btrfs_root *parent_root;
818
819 parent_inode = pending->dentry->d_parent->d_inode;
820 parent_root = BTRFS_I(parent_inode)->root;
821 trans = btrfs_join_transaction(parent_root, 1);
822 914
823 /* 915 /*
824 * insert the directory item 916 * insert root back/forward references
825 */ 917 */
826 namelen = strlen(pending->name); 918 ret = btrfs_add_root_ref(trans, tree_root, objectid,
827 ret = btrfs_set_inode_index(parent_inode, &index);
828 ret = btrfs_insert_dir_item(trans, parent_root,
829 pending->name, namelen,
830 parent_inode->i_ino,
831 &pending->root_key, BTRFS_FT_DIR, index);
832
833 if (ret)
834 goto fail;
835
836 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
837 ret = btrfs_update_inode(trans, parent_root, parent_inode);
838 BUG_ON(ret);
839
840 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
841 pending->root_key.objectid,
842 parent_root->root_key.objectid, 919 parent_root->root_key.objectid,
843 parent_inode->i_ino, index, pending->name, 920 parent_inode->i_ino, index,
844 namelen); 921 dentry->d_name.name, dentry->d_name.len);
845
846 BUG_ON(ret); 922 BUG_ON(ret);
847 923
924 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
926 BUG_ON(IS_ERR(pending->snap));
927
928 btrfs_reloc_post_snapshot(trans, pending);
929 btrfs_orphan_post_snapshot(trans, pending);
848fail: 930fail:
849 btrfs_end_transaction(trans, fs_info->fs_root); 931 kfree(new_root_item);
850 return ret; 932 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
933 return 0;
851} 934}
852 935
853/* 936/*
@@ -867,25 +950,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
867 return 0; 950 return 0;
868} 951}
869 952
870static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
871 struct btrfs_fs_info *fs_info)
872{
873 struct btrfs_pending_snapshot *pending;
874 struct list_head *head = &trans->transaction->pending_snapshots;
875 int ret;
876
877 while (!list_empty(head)) {
878 pending = list_entry(head->next,
879 struct btrfs_pending_snapshot, list);
880 ret = finish_pending_snapshot(fs_info, pending);
881 BUG_ON(ret);
882 list_del(&pending->list);
883 kfree(pending->name);
884 kfree(pending);
885 }
886 return 0;
887}
888
889static void update_super_roots(struct btrfs_root *root) 953static void update_super_roots(struct btrfs_root *root)
890{ 954{
891 struct btrfs_root_item *root_item; 955 struct btrfs_root_item *root_item;
@@ -914,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
914 return ret; 978 return ret;
915} 979}
916 980
981int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{
983 int ret = 0;
984 spin_lock(&info->new_trans_lock);
985 if (info->running_transaction)
986 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock);
988 return ret;
989}
990
917int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 991int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
918 struct btrfs_root *root) 992 struct btrfs_root *root)
919{ 993{
@@ -935,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
935 ret = btrfs_run_delayed_refs(trans, root, 0); 1009 ret = btrfs_run_delayed_refs(trans, root, 0);
936 BUG_ON(ret); 1010 BUG_ON(ret);
937 1011
1012 btrfs_trans_release_metadata(trans, root);
1013
938 cur_trans = trans->transaction; 1014 cur_trans = trans->transaction;
939 /* 1015 /*
940 * set the flushing flag so procs in this transaction have to 1016 * set the flushing flag so procs in this transaction have to
@@ -987,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
987 snap_pending = 1; 1063 snap_pending = 1;
988 1064
989 WARN_ON(cur_trans != trans->transaction); 1065 WARN_ON(cur_trans != trans->transaction);
990 prepare_to_wait(&cur_trans->writer_wait, &wait,
991 TASK_UNINTERRUPTIBLE);
992
993 if (cur_trans->num_writers > 1) 1066 if (cur_trans->num_writers > 1)
994 timeout = MAX_SCHEDULE_TIMEOUT; 1067 timeout = MAX_SCHEDULE_TIMEOUT;
995 else if (should_grow) 1068 else if (should_grow)
@@ -1012,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1012 */ 1085 */
1013 btrfs_run_ordered_operations(root, 1); 1086 btrfs_run_ordered_operations(root, 1);
1014 1087
1088 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090
1015 smp_mb(); 1091 smp_mb();
1016 if (cur_trans->num_writers > 1 || should_grow) 1092 if (cur_trans->num_writers > 1 || should_grow)
1017 schedule_timeout(timeout); 1093 schedule_timeout(timeout);
@@ -1097,9 +1173,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1097 1173
1098 btrfs_finish_extent_commit(trans, root); 1174 btrfs_finish_extent_commit(trans, root);
1099 1175
1100 /* do the directory inserts of any pending snapshot creations */
1101 finish_pending_snapshots(trans, root->fs_info);
1102
1103 mutex_lock(&root->fs_info->trans_mutex); 1176 mutex_lock(&root->fs_info->trans_mutex);
1104 1177
1105 cur_trans->commit_done = 1; 1178 cur_trans->commit_done = 1;
@@ -1142,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1142 1215
1143 if (btrfs_header_backref_rev(root->node) < 1216 if (btrfs_header_backref_rev(root->node) <
1144 BTRFS_MIXED_BACKREF_REV) 1217 BTRFS_MIXED_BACKREF_REV)
1145 btrfs_drop_snapshot(root, 0); 1218 btrfs_drop_snapshot(root, NULL, 0);
1146 else 1219 else
1147 btrfs_drop_snapshot(root, 1); 1220 btrfs_drop_snapshot(root, NULL, 1);
1148 } 1221 }
1149 return 0; 1222 return 0;
1150} 1223}
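
With the rework above, start_transaction() can fail (handle allocation or metadata reservation), so it hands back ERR_PTR(-E...) instead of a possibly-NULL handle, and callers throughout this series switch from BUG_ON(!trans) to IS_ERR()/PTR_ERR(). A simplified userspace rendering of that convention (the real macros live in linux/err.h; these are cut-down copies):

	#include <stdio.h>
	#include <errno.h>

	#define MAX_ERRNO 4095

	static inline void *ERR_PTR(long error) { return (void *)error; }
	static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
	static inline int IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	struct trans { int items; };

	static struct trans *start_transaction(int num_items)
	{
		static struct trans t;
		if (num_items > 8)	/* pretend the reservation failed */
			return ERR_PTR(-ENOSPC);
		t.items = num_items;
		return &t;
	}

	int main(void)
	{
		struct trans *trans = start_transaction(6);
		if (IS_ERR(trans)) {
			fprintf(stderr, "start failed: %ld\n", PTR_ERR(trans));
			return 1;
		}
		printf("reserved for %d items\n", trans->items);
		return 0;
	}
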
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reservation for relocation */
64 int error;
62 struct list_head list; 65 struct list_head list;
63}; 66};
64 67
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 89 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 91 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 93 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 95 int num_blocks);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 97 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 106 struct btrfs_root *root);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 108 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 111void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 112int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 113 struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 117 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 118int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 119 struct extent_io_tree *dirty_pages, int mark);
120int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 121int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 122#endif
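
The new btrfs_should_end_transaction() declared above lets a long-running operation ask, mid-loop, whether a commit is pending or the global reservation is getting tight, and restart with a fresh handle instead of pinning one transaction open. A standalone caricature of that batching loop (the budget numbers are invented):

	#include <stdio.h>
	#include <stdbool.h>

	/* stand-in for btrfs_should_end_transaction(): says "stop" once
	 * the invented budget for this handle is used up */
	static bool should_end(int used, int budget)
	{
		return used >= budget;
	}

	int main(void)
	{
		int items = 10, done = 0, handles = 0;

		while (done < items) {
			handles++;		/* "start transaction" */
			int used = 0;
			while (done < items && !should_end(used, 4)) {
				done++;		/* one unit of work */
				used++;
			}
			/* "end transaction", releasing the reservation */
		}
		printf("%d items across %d handles\n", done, handles);
		return 0;
	}
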
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 117 path->nodes[1], 0,
118 cache_only, &last_ret, 118 cache_only, &last_ret,
119 &root->defrag_progress); 119 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 120 if (ret) {
121 WARN_ON(ret == -EAGAIN);
122 goto out;
123 }
121 if (next_key_ret == 0) { 124 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 125 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 126 ret = -EAGAIN;
124 } 127 }
125
126 btrfs_release_path(root, path);
127out: 128out:
128 if (path) 129 if (path)
129 btrfs_free_path(path); 130 btrfs_free_path(path);
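
The btrfs_defrag_leaves() change above tightens the same theme: instead of warning and pressing on, any nonzero return from the realloc pass (other than the expected -EAGAIN restart) now bails out through the existing out: label. A standalone model of that early-exit shape, with an invented work step:

	#include <stdio.h>
	#include <errno.h>

	static int realloc_node(int step)
	{
		return step == 2 ? -EIO : 0;	/* invented failure */
	}

	static int defrag_leaves(void)
	{
		int ret = 0;

		for (int step = 0; step < 4; step++) {
			ret = realloc_node(step);
			if (ret)
				goto out;	/* propagate, don't continue */
		}
	out:
		return ret;
	}

	int main(void)
	{
		printf("defrag_leaves() = %d\n", defrag_leaves());
		return 0;
	}
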
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1255fcc8ade5..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "disk-io.h" 23#include "disk-io.h"
@@ -134,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
134 struct btrfs_root *root) 135 struct btrfs_root *root)
135{ 136{
136 int ret; 137 int ret;
138 int err = 0;
137 139
138 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
139 if (root->log_root) { 141 if (root->log_root) {
@@ -154,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
154 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
155 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
156 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
157 BUG_ON(ret); 159 if (ret)
160 err = ret;
158 } 161 }
159 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
160 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
161 BUG_ON(ret); 164 if (ret)
165 err = ret;
162 } 166 }
163 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
164 root->log_batch++; 168 root->log_batch++;
165 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
166 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
167 return 0; 171 return err;
168} 172}
169 173
170/* 174/*
@@ -375,7 +379,7 @@ insert:
375 BUG_ON(ret); 379 BUG_ON(ret);
376 } 380 }
377 } else if (ret) { 381 } else if (ret) {
378 BUG(); 382 return ret;
379 } 383 }
380 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
381 path->slots[0]); 385 path->slots[0]);
@@ -1698,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1698 1702
1699 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1700 1704
1701 wc->process_func(root, next, wc, ptr_gen);
1702
1703 if (*level == 1) { 1705 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen);
1707
1704 path->slots[*level]++; 1708 path->slots[*level]++;
1705 if (wc->free) { 1709 if (wc->free) {
1706 btrfs_read_buffer(next, ptr_gen); 1710 btrfs_read_buffer(next, ptr_gen);
@@ -1733,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1733 WARN_ON(*level < 0); 1737 WARN_ON(*level < 0);
1734 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1738 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1735 1739
1736 if (path->nodes[*level] == root->node) 1740 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1737 parent = path->nodes[*level];
1738 else
1739 parent = path->nodes[*level + 1];
1740
1741 bytenr = path->nodes[*level]->start;
1742
1743 blocksize = btrfs_level_size(root, *level);
1744 root_owner = btrfs_header_owner(parent);
1745 root_gen = btrfs_header_generation(parent);
1746
1747 wc->process_func(root, path->nodes[*level], wc,
1748 btrfs_header_generation(path->nodes[*level]));
1749
1750 if (wc->free) {
1751 next = path->nodes[*level];
1752 btrfs_tree_lock(next);
1753 clean_tree_block(trans, root, next);
1754 btrfs_set_lock_blocking(next);
1755 btrfs_wait_tree_block_writeback(next);
1756 btrfs_tree_unlock(next);
1757
1758 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1759 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1760 BUG_ON(ret);
1761 }
1762 free_extent_buffer(path->nodes[*level]);
1763 path->nodes[*level] = NULL;
1764 *level += 1;
1765 1741
1766 cond_resched(); 1742 cond_resched();
1767 return 0; 1743 return 0;
@@ -1780,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1780 1756
1781 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1782 slot = path->slots[i]; 1758 slot = path->slots[i];
1783 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1784 struct extent_buffer *node; 1760 struct extent_buffer *node;
1785 node = path->nodes[i]; 1761 node = path->nodes[i];
1786 path->slots[i]++; 1762 path->slots[i]++;
@@ -2046,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2046 mutex_unlock(&log_root_tree->log_mutex); 2022 mutex_unlock(&log_root_tree->log_mutex);
2047 2023
2048 ret = update_log_root(trans, log); 2024 ret = update_log_root(trans, log);
2049 BUG_ON(ret);
2050 2025
2051 mutex_lock(&log_root_tree->log_mutex); 2026 mutex_lock(&log_root_tree->log_mutex);
2052 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2027 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2055,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2055 wake_up(&log_root_tree->log_writer_wait); 2030 wake_up(&log_root_tree->log_writer_wait);
2056 } 2031 }
2057 2032
2033 if (ret) {
2034 BUG_ON(ret != -ENOSPC);
2035 root->fs_info->last_trans_log_full_commit = trans->transid;
2036 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2037 mutex_unlock(&log_root_tree->log_mutex);
2038 ret = -EAGAIN;
2039 goto out;
2040 }
2041
2058 index2 = log_root_tree->log_transid % 2; 2042 index2 = log_root_tree->log_transid % 2;
2059 if (atomic_read(&log_root_tree->log_commit[index2])) { 2043 if (atomic_read(&log_root_tree->log_commit[index2])) {
2060 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2044 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2128,15 +2112,10 @@ out:
2128 return 0; 2112 return 0;
2129} 2113}
2130 2114
2131/* 2115static void free_log_tree(struct btrfs_trans_handle *trans,
2132 * free all the extents used by the tree log. This should be called 2116 struct btrfs_root *log)
2133 * at commit time of the full transaction
2134 */
2135int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2136{ 2117{
2137 int ret; 2118 int ret;
2138 struct btrfs_root *log;
2139 struct key;
2140 u64 start; 2119 u64 start;
2141 u64 end; 2120 u64 end;
2142 struct walk_control wc = { 2121 struct walk_control wc = {
@@ -2144,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2144 .process_func = process_one_buffer 2123 .process_func = process_one_buffer
2145 }; 2124 };
2146 2125
2147 if (!root->log_root || root->fs_info->log_root_recovering)
2148 return 0;
2149
2150 log = root->log_root;
2151 ret = walk_log_tree(trans, log, &wc); 2126 ret = walk_log_tree(trans, log, &wc);
2152 BUG_ON(ret); 2127 BUG_ON(ret);
2153 2128
@@ -2161,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2161 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2136 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2162 } 2137 }
2163 2138
2164 if (log->log_transid > 0) {
2165 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2166 &log->root_key);
2167 BUG_ON(ret);
2168 }
2169 root->log_root = NULL;
2170 free_extent_buffer(log->node); 2139 free_extent_buffer(log->node);
2171 kfree(log); 2140 kfree(log);
2141}
2142
2143/*
2144 * free all the extents used by the tree log. This should be called
2145 * at commit time of the full transaction
2146 */
2147int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2148{
2149 if (root->log_root) {
2150 free_log_tree(trans, root->log_root);
2151 root->log_root = NULL;
2152 }
2153 return 0;
2154}
2155
2156int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2157 struct btrfs_fs_info *fs_info)
2158{
2159 if (fs_info->log_root_tree) {
2160 free_log_tree(trans, fs_info->log_root_tree);
2161 fs_info->log_root_tree = NULL;
2162 }
2172 return 0; 2163 return 0;
2173} 2164}
2174 2165
@@ -2202,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2202 struct btrfs_dir_item *di; 2193 struct btrfs_dir_item *di;
2203 struct btrfs_path *path; 2194 struct btrfs_path *path;
2204 int ret; 2195 int ret;
2196 int err = 0;
2205 int bytes_del = 0; 2197 int bytes_del = 0;
2206 2198
2207 if (BTRFS_I(dir)->logged_trans < trans->transid) 2199 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2217,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2217 path = btrfs_alloc_path(); 2209 path = btrfs_alloc_path();
2218 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2219 name, name_len, -1); 2211 name, name_len, -1);
2220 if (di && !IS_ERR(di)) { 2212 if (IS_ERR(di)) {
2213 err = PTR_ERR(di);
2214 goto fail;
2215 }
2216 if (di) {
2221 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2222 bytes_del += name_len; 2218 bytes_del += name_len;
2223 BUG_ON(ret); 2219 BUG_ON(ret);
@@ -2225,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2225 btrfs_release_path(log, path); 2221 btrfs_release_path(log, path);
2226 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2227 index, name, name_len, -1); 2223 index, name, name_len, -1);
2228 if (di && !IS_ERR(di)) { 2224 if (IS_ERR(di)) {
2225 err = PTR_ERR(di);
2226 goto fail;
2227 }
2228 if (di) {
2229 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2230 bytes_del += name_len; 2230 bytes_del += name_len;
2231 BUG_ON(ret); 2231 BUG_ON(ret);
@@ -2243,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2243 btrfs_release_path(log, path); 2243 btrfs_release_path(log, path);
2244 2244
2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) {
2247 err = ret;
2248 goto fail;
2249 }
2246 if (ret == 0) { 2250 if (ret == 0) {
2247 struct btrfs_inode_item *item; 2251 struct btrfs_inode_item *item;
2248 u64 i_size; 2252 u64 i_size;
@@ -2260,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2260 ret = 0; 2264 ret = 0;
2261 btrfs_release_path(log, path); 2265 btrfs_release_path(log, path);
2262 } 2266 }
2263 2267fail:
2264 btrfs_free_path(path); 2268 btrfs_free_path(path);
2265 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2269 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid;
2272 ret = 0;
2273 }
2266 btrfs_end_log_trans(root); 2274 btrfs_end_log_trans(root);
2267 2275
2268 return 0; 2276 return 0;
@@ -2290,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2290 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2291 dirid, &index); 2299 dirid, &index);
2292 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2300 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) {
2302 root->fs_info->last_trans_log_full_commit = trans->transid;
2303 ret = 0;
2304 }
2293 btrfs_end_log_trans(root); 2305 btrfs_end_log_trans(root);
2294 2306
2295 return ret; 2307 return ret;
@@ -2317,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2317 else 2329 else
2318 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2330 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2319 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2331 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2320 BUG_ON(ret); 2332 if (ret)
2333 return ret;
2321 2334
2322 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2335 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2323 struct btrfs_dir_log_item); 2336 struct btrfs_dir_log_item);
@@ -2342,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2342 struct btrfs_key max_key; 2355 struct btrfs_key max_key;
2343 struct btrfs_root *log = root->log_root; 2356 struct btrfs_root *log = root->log_root;
2344 struct extent_buffer *src; 2357 struct extent_buffer *src;
2358 int err = 0;
2345 int ret; 2359 int ret;
2346 int i; 2360 int i;
2347 int nritems; 2361 int nritems;
@@ -2404,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2404 ret = overwrite_item(trans, log, dst_path, 2418 ret = overwrite_item(trans, log, dst_path,
2405 path->nodes[0], path->slots[0], 2419 path->nodes[0], path->slots[0],
2406 &tmp); 2420 &tmp);
2421 if (ret) {
2422 err = ret;
2423 goto done;
2424 }
2407 } 2425 }
2408 } 2426 }
2409 btrfs_release_path(root, path); 2427 btrfs_release_path(root, path);
@@ -2431,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2431 goto done; 2449 goto done;
2432 ret = overwrite_item(trans, log, dst_path, src, i, 2450 ret = overwrite_item(trans, log, dst_path, src, i,
2433 &min_key); 2451 &min_key);
2434 BUG_ON(ret); 2452 if (ret) {
2453 err = ret;
2454 goto done;
2455 }
2435 } 2456 }
2436 path->slots[0] = nritems; 2457 path->slots[0] = nritems;
2437 2458
@@ -2453,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2453 ret = overwrite_item(trans, log, dst_path, 2474 ret = overwrite_item(trans, log, dst_path,
2454 path->nodes[0], path->slots[0], 2475 path->nodes[0], path->slots[0],
2455 &tmp); 2476 &tmp);
2456 2477 if (ret)
2457 BUG_ON(ret); 2478 err = ret;
2458 last_offset = tmp.offset; 2479 else
2480 last_offset = tmp.offset;
2459 goto done; 2481 goto done;
2460 } 2482 }
2461 } 2483 }
2462done: 2484done:
2463 *last_offset_ret = last_offset;
2464 btrfs_release_path(root, path); 2485 btrfs_release_path(root, path);
2465 btrfs_release_path(log, dst_path); 2486 btrfs_release_path(log, dst_path);
2466 2487
2467 /* insert the log range keys to indicate where the log is valid */ 2488 if (err == 0) {
2468 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2489 *last_offset_ret = last_offset;
2469 first_offset, last_offset); 2490 /*
2470 BUG_ON(ret); 2491 * insert the log range keys to indicate where the log
2471 return 0; 2492 * is valid
2493 */
2494 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset,
2496 last_offset);
2497 if (ret)
2498 err = ret;
2499 }
2500 return err;
2472} 2501}
2473 2502
2474/* 2503/*
@@ -2500,7 +2529,8 @@ again:
2500 ret = log_dir_items(trans, root, inode, path, 2529 ret = log_dir_items(trans, root, inode, path,
2501 dst_path, key_type, min_key, 2530 dst_path, key_type, min_key,
2502 &max_key); 2531 &max_key);
2503 BUG_ON(ret); 2532 if (ret)
2533 return ret;
2504 if (max_key == (u64)-1) 2534 if (max_key == (u64)-1)
2505 break; 2535 break;
2506 min_key = max_key + 1; 2536 min_key = max_key + 1;
@@ -2534,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2534 2564
2535 while (1) { 2565 while (1) {
2536 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2566 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2537 2567 BUG_ON(ret == 0);
2538 if (ret != 1) 2568 if (ret < 0)
2539 break; 2569 break;
2540 2570
2541 if (path->slots[0] == 0) 2571 if (path->slots[0] == 0)
@@ -2553,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2553 btrfs_release_path(log, path); 2583 btrfs_release_path(log, path);
2554 } 2584 }
2555 btrfs_release_path(log, path); 2585 btrfs_release_path(log, path);
2556 return 0; 2586 return ret;
2557} 2587}
2558 2588
2559static noinline int copy_items(struct btrfs_trans_handle *trans, 2589static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2586,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2586 } 2616 }
2587 ret = btrfs_insert_empty_items(trans, log, dst_path, 2617 ret = btrfs_insert_empty_items(trans, log, dst_path,
2588 ins_keys, ins_sizes, nr); 2618 ins_keys, ins_sizes, nr);
2589 BUG_ON(ret); 2619 if (ret) {
2620 kfree(ins_data);
2621 return ret;
2622 }
2590 2623
2591 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2624 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2592 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2625 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2659,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2659 * we have to do this after the loop above to avoid changing the 2692 * we have to do this after the loop above to avoid changing the
2660 * log tree while trying to change the log tree. 2693 * log tree while trying to change the log tree.
2661 */ 2694 */
2695 ret = 0;
2662 while (!list_empty(&ordered_sums)) { 2696 while (!list_empty(&ordered_sums)) {
2663 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2697 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2664 struct btrfs_ordered_sum, 2698 struct btrfs_ordered_sum,
2665 list); 2699 list);
2666 ret = btrfs_csum_file_blocks(trans, log, sums); 2700 if (!ret)
2667 BUG_ON(ret); 2701 ret = btrfs_csum_file_blocks(trans, log, sums);
2668 list_del(&sums->list); 2702 list_del(&sums->list);
2669 kfree(sums); 2703 kfree(sums);
2670 } 2704 }
2671 return 0; 2705 return ret;
2672} 2706}
2673 2707
2674/* log a single inode in the tree log. 2708/* log a single inode in the tree log.
@@ -2696,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2696 struct btrfs_root *log = root->log_root; 2730 struct btrfs_root *log = root->log_root;
2697 struct extent_buffer *src = NULL; 2731 struct extent_buffer *src = NULL;
2698 u32 size; 2732 u32 size;
2733 int err = 0;
2699 int ret; 2734 int ret;
2700 int nritems; 2735 int nritems;
2701 int ins_start_slot = 0; 2736 int ins_start_slot = 0;
@@ -2738,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2738 } else { 2773 } else {
2739 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2740 } 2775 }
2741 BUG_ON(ret); 2776 if (ret) {
2777 err = ret;
2778 goto out_unlock;
2779 }
2742 path->keep_locks = 1; 2780 path->keep_locks = 1;
2743 2781
2744 while (1) { 2782 while (1) {
@@ -2767,7 +2805,10 @@ again:
2767 2805
2768 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2806 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2769 ins_nr, inode_only); 2807 ins_nr, inode_only);
2770 BUG_ON(ret); 2808 if (ret) {
2809 err = ret;
2810 goto out_unlock;
2811 }
2771 ins_nr = 1; 2812 ins_nr = 1;
2772 ins_start_slot = path->slots[0]; 2813 ins_start_slot = path->slots[0];
2773next_slot: 2814next_slot:
@@ -2783,7 +2824,10 @@ next_slot:
2783 ret = copy_items(trans, log, dst_path, src, 2824 ret = copy_items(trans, log, dst_path, src,
2784 ins_start_slot, 2825 ins_start_slot,
2785 ins_nr, inode_only); 2826 ins_nr, inode_only);
2786 BUG_ON(ret); 2827 if (ret) {
2828 err = ret;
2829 goto out_unlock;
2830 }
2787 ins_nr = 0; 2831 ins_nr = 0;
2788 } 2832 }
2789 btrfs_release_path(root, path); 2833 btrfs_release_path(root, path);
@@ -2801,7 +2845,10 @@ next_slot:
2801 ret = copy_items(trans, log, dst_path, src, 2845 ret = copy_items(trans, log, dst_path, src,
2802 ins_start_slot, 2846 ins_start_slot,
2803 ins_nr, inode_only); 2847 ins_nr, inode_only);
2804 BUG_ON(ret); 2848 if (ret) {
2849 err = ret;
2850 goto out_unlock;
2851 }
2805 ins_nr = 0; 2852 ins_nr = 0;
2806 } 2853 }
2807 WARN_ON(ins_nr); 2854 WARN_ON(ins_nr);
@@ -2809,14 +2856,18 @@ next_slot:
2809 btrfs_release_path(root, path); 2856 btrfs_release_path(root, path);
2810 btrfs_release_path(log, dst_path); 2857 btrfs_release_path(log, dst_path);
2811 ret = log_directory_changes(trans, root, inode, path, dst_path); 2858 ret = log_directory_changes(trans, root, inode, path, dst_path);
2812 BUG_ON(ret); 2859 if (ret) {
2860 err = ret;
2861 goto out_unlock;
2862 }
2813 } 2863 }
2814 BTRFS_I(inode)->logged_trans = trans->transid; 2864 BTRFS_I(inode)->logged_trans = trans->transid;
2865out_unlock:
2815 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2866 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2816 2867
2817 btrfs_free_path(path); 2868 btrfs_free_path(path);
2818 btrfs_free_path(dst_path); 2869 btrfs_free_path(dst_path);
2819 return 0; 2870 return err;
2820} 2871}
2821 2872
2822/* 2873/*
@@ -2941,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2941 goto end_no_trans; 2992 goto end_no_trans;
2942 } 2993 }
2943 2994
2944 start_log_trans(trans, root); 2995 ret = start_log_trans(trans, root);
2996 if (ret)
2997 goto end_trans;
2945 2998
2946 ret = btrfs_log_inode(trans, root, inode, inode_only); 2999 ret = btrfs_log_inode(trans, root, inode, inode_only);
2947 BUG_ON(ret); 3000 if (ret)
3001 goto end_trans;
2948 3002
2949 /* 3003 /*
2950 * for regular files, if its inode is already on disk, we don't 3004 * for regular files, if its inode is already on disk, we don't
@@ -2954,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2954 */ 3008 */
2955 if (S_ISREG(inode->i_mode) && 3009 if (S_ISREG(inode->i_mode) &&
2956 BTRFS_I(inode)->generation <= last_committed && 3010 BTRFS_I(inode)->generation <= last_committed &&
2957 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3011 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2958 goto no_parent; 3012 ret = 0;
3013 goto end_trans;
3014 }
2959 3015
2960 inode_only = LOG_INODE_EXISTS; 3016 inode_only = LOG_INODE_EXISTS;
2961 while (1) { 3017 while (1) {
@@ -2969,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2969 if (BTRFS_I(inode)->generation > 3025 if (BTRFS_I(inode)->generation >
2970 root->fs_info->last_trans_committed) { 3026 root->fs_info->last_trans_committed) {
2971 ret = btrfs_log_inode(trans, root, inode, inode_only); 3027 ret = btrfs_log_inode(trans, root, inode, inode_only);
2972 BUG_ON(ret); 3028 if (ret)
3029 goto end_trans;
2973 } 3030 }
2974 if (IS_ROOT(parent)) 3031 if (IS_ROOT(parent))
2975 break; 3032 break;
2976 3033
2977 parent = parent->d_parent; 3034 parent = parent->d_parent;
2978 } 3035 }
2979no_parent:
2980 ret = 0; 3036 ret = 0;
3037end_trans:
3038 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid;
3041 ret = 1;
3042 }
2981 btrfs_end_log_trans(root); 3043 btrfs_end_log_trans(root);
2982end_no_trans: 3044end_no_trans:
2983 return ret; 3045 return ret;
@@ -3019,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3019 path = btrfs_alloc_path(); 3081 path = btrfs_alloc_path();
3020 BUG_ON(!path); 3082 BUG_ON(!path);
3021 3083
3022 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3084 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3023 3085
3024 wc.trans = trans; 3086 wc.trans = trans;
3025 wc.pin = 1; 3087 wc.pin = 1;
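
Taken together, the tree-log.c hunks apply one idiom: each BUG_ON(ret) after a fallible call becomes explicit propagation into a local err, with a single label doing the cleanup and the function returning err instead of a hard-coded 0. A minimal standalone sketch of that idiom (helper names are illustrative, not kernel APIs):

    #include <errno.h>

    /* stand-ins for fallible btrfs helpers; names are illustrative only */
    static int log_one_item(void)      { return 0; }
    static int insert_range_key(void)  { return -ENOSPC; }

    static int log_items_sketch(void)
    {
            int err = 0;    /* first error seen; what the caller gets */
            int ret;

            ret = log_one_item();
            if (ret) {
                    err = ret;
                    goto done;      /* unwind through one cleanup point */
            }

            ret = insert_range_key();
            if (ret)
                    err = ret;
    done:
            /* release paths and locks here, exactly once */
            return err;             /* 0 on success, -ENOSPC etc. on failure */
    }

The payoff is visible in btrfs_log_inode_parent() above: instead of crashing on -ENOSPC, the error is asserted to be -ENOSPC and converted into a forced full transaction commit by setting last_trans_log_full_commit.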
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9df8e3f1ccab..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/random.h> 23#include <linux/random.h>
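
One change here is purely mechanical: the new #include <linux/slab.h> lines in this file (and in the cachefiles and ceph files further down) come from the tree-wide cleanup that stopped slab.h from being pulled in implicitly via percpu.h, so any file calling kmalloc()/kfree() now has to include it directly.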
@@ -1096,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1096 if (!path) 1097 if (!path)
1097 return -ENOMEM; 1098 return -ENOMEM;
1098 1099
1099 trans = btrfs_start_transaction(root, 1); 1100 trans = btrfs_start_transaction(root, 0);
1100 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1101 key.type = BTRFS_DEV_ITEM_KEY; 1102 key.type = BTRFS_DEV_ITEM_KEY;
1102 key.offset = device->devid; 1103 key.offset = device->devid;
@@ -1485,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1485 goto error; 1486 goto error;
1486 } 1487 }
1487 1488
1488 trans = btrfs_start_transaction(root, 1); 1489 trans = btrfs_start_transaction(root, 0);
1489 lock_chunks(root); 1490 lock_chunks(root);
1490 1491
1491 device->barriers = 1; 1492 device->barriers = 1;
@@ -1750,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1750 1751
1751 /* step one, relocate all the extents inside this chunk */ 1752 /* step one, relocate all the extents inside this chunk */
1752 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1753 BUG_ON(ret); 1754 if (ret)
1755 return ret;
1754 1756
1755 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 0);
1756 BUG_ON(!trans); 1758 BUG_ON(!trans);
1757 1759
1758 lock_chunks(root); 1760 lock_chunks(root);
@@ -1924,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1924 break; 1926 break;
1925 BUG_ON(ret); 1927 BUG_ON(ret);
1926 1928
1927 trans = btrfs_start_transaction(dev_root, 1); 1929 trans = btrfs_start_transaction(dev_root, 0);
1928 BUG_ON(!trans); 1930 BUG_ON(!trans);
1929 1931
1930 ret = btrfs_grow_device(trans, device, old_size); 1932 ret = btrfs_grow_device(trans, device, old_size);
@@ -2093,11 +2095,7 @@ again:
2093 } 2095 }
2094 2096
2095 /* Shrinking succeeded, else we would be at "done". */ 2097 /* Shrinking succeeded, else we would be at "done". */
2096 trans = btrfs_start_transaction(root, 1); 2098 trans = btrfs_start_transaction(root, 0);
2097 if (!trans) {
2098 ret = -ENOMEM;
2099 goto done;
2100 }
2101 lock_chunks(root); 2099 lock_chunks(root);
2102 2100
2103 device->disk_total_bytes = new_size; 2101 device->disk_total_bytes = new_size;
@@ -2198,9 +2196,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2198 min_stripes = 2; 2196 min_stripes = 2;
2199 } 2197 }
2200 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2198 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2201 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2199 if (fs_devices->rw_devices < 2)
2202 if (num_stripes < 2)
2203 return -ENOSPC; 2200 return -ENOSPC;
2201 num_stripes = 2;
2204 min_stripes = 2; 2202 min_stripes = 2;
2205 } 2203 }
2206 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2204 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -2244,8 +2242,16 @@ again:
2244 do_div(calc_size, stripe_len); 2242 do_div(calc_size, stripe_len);
2245 calc_size *= stripe_len; 2243 calc_size *= stripe_len;
2246 } 2244 }
2245
2247 /* we don't want tiny stripes */ 2246 /* we don't want tiny stripes */
2248 calc_size = max_t(u64, min_stripe_size, calc_size); 2247 if (!looped)
2248 calc_size = max_t(u64, min_stripe_size, calc_size);
2249
2250 /*
2251 * we're about to do_div by the stripe_len so lets make sure
2252 * we end up with something bigger than a stripe
2253 */
2254 calc_size = max_t(u64, calc_size, stripe_len * 4);
2249 2255
2250 do_div(calc_size, stripe_len); 2256 do_div(calc_size, stripe_len);
2251 calc_size *= stripe_len; 2257 calc_size *= stripe_len;
@@ -3389,6 +3395,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3389 key.type = 0; 3395 key.type = 0;
3390again: 3396again:
3391 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3397 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3398 if (ret < 0)
3399 goto error;
3392 while (1) { 3400 while (1) {
3393 leaf = path->nodes[0]; 3401 leaf = path->nodes[0];
3394 slot = path->slots[0]; 3402 slot = path->slots[0];
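
A second theme in volumes.c (and in xattr.c just below) is the reworked btrfs_start_transaction() convention: the second argument now names how many metadata items the caller wants reserved (0 for none), and failure comes back as an ERR_PTR value rather than NULL, which is why the old "if (!trans)" checks disappear. Sketched as a caller would use it, assuming the usual err.h macros:

            struct btrfs_trans_handle *trans;

            /* reserve room for two metadata items up front */
            trans = btrfs_start_transaction(root, 2);
            if (IS_ERR(trans))
                    return PTR_ERR(trans);  /* typically -ENOSPC */

            /* ... do the reserved modifications ... */

            btrfs_end_transaction(trans, root);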
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
@@ -282,7 +276,7 @@ err:
282 * List of handlers for synthetic system.* attributes. All real ondisk 276 * List of handlers for synthetic system.* attributes. All real ondisk
283 * attributes are handled directly. 277 * attributes are handled directly.
284 */ 278 */
285struct xattr_handler *btrfs_xattr_handlers[] = { 279const struct xattr_handler *btrfs_xattr_handlers[] = {
286#ifdef CONFIG_BTRFS_FS_POSIX_ACL 280#ifdef CONFIG_BTRFS_FS_POSIX_ACL
287 &btrfs_xattr_acl_access_handler, 281 &btrfs_xattr_acl_access_handler,
288 &btrfs_xattr_acl_default_handler, 282 &btrfs_xattr_acl_default_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern struct xattr_handler btrfs_xattr_acl_access_handler; 24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler; 25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[]; 26extern const struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);
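
The xattr changes themselves are a constification: the handler table and its entries become const, matching the VFS-wide switch to const struct xattr_handler *. After the patch the table has this shape (entries as in the hunk above; the array is NULL-terminated as the VFS expects):

    const struct xattr_handler *btrfs_xattr_handlers[] = {
    #ifdef CONFIG_BTRFS_FS_POSIX_ACL
            &btrfs_xattr_acl_access_handler,
            &btrfs_xattr_acl_default_handler,
    #endif
            NULL,   /* the VFS walks this array until it hits NULL */
    };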
diff --git a/fs/buffer.c b/fs/buffer.c
index c9c266db0624..d54812b198e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
275 return; 275 return;
276 276
277 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
278 lru_add_drain_all(); /* make sure all lru add caches are flushed */
278 invalidate_mapping_pages(mapping, 0, -1); 279 invalidate_mapping_pages(mapping, 0, -1);
279} 280}
280EXPORT_SYMBOL(invalidate_bdev); 281EXPORT_SYMBOL(invalidate_bdev);
@@ -560,26 +561,17 @@ repeat:
560 return err; 561 return err;
561} 562}
562 563
563static void do_thaw_all(struct work_struct *work) 564static void do_thaw_one(struct super_block *sb, void *unused)
564{ 565{
565 struct super_block *sb;
566 char b[BDEVNAME_SIZE]; 566 char b[BDEVNAME_SIZE];
567 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
568 printk(KERN_WARNING "Emergency Thaw on %s\n",
569 bdevname(sb->s_bdev, b));
570}
567 571
568 spin_lock(&sb_lock); 572static void do_thaw_all(struct work_struct *work)
569restart: 573{
570 list_for_each_entry(sb, &super_blocks, s_list) { 574 iterate_supers(do_thaw_one, NULL);
571 sb->s_count++;
572 spin_unlock(&sb_lock);
573 down_read(&sb->s_umount);
574 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
575 printk(KERN_WARNING "Emergency Thaw on %s\n",
576 bdevname(sb->s_bdev, b));
577 up_read(&sb->s_umount);
578 spin_lock(&sb_lock);
579 if (__put_super_and_need_restart(sb))
580 goto restart;
581 }
582 spin_unlock(&sb_lock);
583 kfree(work); 575 kfree(work);
584 printk(KERN_WARNING "Emergency Thaw complete\n"); 576 printk(KERN_WARNING "Emergency Thaw complete\n");
585} 577}
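
The thaw rework above is also a good example of the iterate_supers() conversion pattern: the open-coded walk of super_blocks, with its manual s_count references, lock drops and restart label, collapses into a callback plus a cookie, with the VFS iterator owning the locking and refcounting. In outline:

    /* per-superblock work; the iterator holds the superblock for us */
    static void thaw_one(struct super_block *sb, void *unused)
    {
            /* ... act on sb ... */
    }

            /* caller side: no list locking, refcounting or restarts */
            iterate_supers(thaw_one, NULL);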
@@ -1957,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1957} 1949}
1958 1950
1959/* 1951/*
1960 * block_write_begin takes care of the basic task of block allocation and 1952 * Filesystems implementing the new truncate sequence should use the
1961 * bringing partial write blocks uptodate first. 1953 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
1962 * 1954 * The filesystem needs to handle block truncation upon failure.
1963 * If *pagep is not NULL, then block_write_begin uses the locked page
1964 * at *pagep rather than allocating its own. In this case, the page will
1965 * not be unlocked or deallocated on failure.
1966 */ 1955 */
1967int block_write_begin(struct file *file, struct address_space *mapping, 1956int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
1968 loff_t pos, unsigned len, unsigned flags, 1957 loff_t pos, unsigned len, unsigned flags,
1969 struct page **pagep, void **fsdata, 1958 struct page **pagep, void **fsdata,
1970 get_block_t *get_block) 1959 get_block_t *get_block)
@@ -2000,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
2000 unlock_page(page); 1989 unlock_page(page);
2001 page_cache_release(page); 1990 page_cache_release(page);
2002 *pagep = NULL; 1991 *pagep = NULL;
2003
2004 /*
2005 * prepare_write() may have instantiated a few blocks
2006 * outside i_size. Trim these off again. Don't need
2007 * i_size_read because we hold i_mutex.
2008 */
2009 if (pos + len > inode->i_size)
2010 vmtruncate(inode, inode->i_size);
2011 } 1992 }
2012 } 1993 }
2013 1994
2014out: 1995out:
2015 return status; 1996 return status;
2016} 1997}
1998EXPORT_SYMBOL(block_write_begin_newtrunc);
1999
2000/*
2001 * block_write_begin takes care of the basic task of block allocation and
2002 * bringing partial write blocks uptodate first.
2003 *
2004 * If *pagep is not NULL, then block_write_begin uses the locked page
2005 * at *pagep rather than allocating its own. In this case, the page will
2006 * not be unlocked or deallocated on failure.
2007 */
2008int block_write_begin(struct file *file, struct address_space *mapping,
2009 loff_t pos, unsigned len, unsigned flags,
2010 struct page **pagep, void **fsdata,
2011 get_block_t *get_block)
2012{
2013 int ret;
2014
2015 ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
2016 pagep, fsdata, get_block);
2017
2018 /*
2019 * prepare_write() may have instantiated a few blocks
2020 * outside i_size. Trim these off again. Don't need
2021 * i_size_read because we hold i_mutex.
2022 *
2023 * Filesystems which pass down their own page also cannot
2024 * call into vmtruncate here because it would lead to lock
2025 * inversion problems (*pagep is locked). This is a further
2026 * example of where the old truncate sequence is inadequate.
2027 */
2028 if (unlikely(ret) && *pagep == NULL) {
2029 loff_t isize = mapping->host->i_size;
2030 if (pos + len > isize)
2031 vmtruncate(mapping->host, isize);
2032 }
2033
2034 return ret;
2035}
2017EXPORT_SYMBOL(block_write_begin); 2036EXPORT_SYMBOL(block_write_begin);
2018 2037
2019int block_write_end(struct file *file, struct address_space *mapping, 2038int block_write_end(struct file *file, struct address_space *mapping,
@@ -2332,7 +2351,7 @@ out:
2332 * For moronic filesystems that do not allow holes in file. 2351 * For moronic filesystems that do not allow holes in file.
2333 * We may have to extend the file. 2352 * We may have to extend the file.
2334 */ 2353 */
2335int cont_write_begin(struct file *file, struct address_space *mapping, 2354int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2336 loff_t pos, unsigned len, unsigned flags, 2355 loff_t pos, unsigned len, unsigned flags,
2337 struct page **pagep, void **fsdata, 2356 struct page **pagep, void **fsdata,
2338 get_block_t *get_block, loff_t *bytes) 2357 get_block_t *get_block, loff_t *bytes)
@@ -2353,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2353 } 2372 }
2354 2373
2355 *pagep = NULL; 2374 *pagep = NULL;
2356 err = block_write_begin(file, mapping, pos, len, 2375 err = block_write_begin_newtrunc(file, mapping, pos, len,
2357 flags, pagep, fsdata, get_block); 2376 flags, pagep, fsdata, get_block);
2358out: 2377out:
2359 return err; 2378 return err;
2360} 2379}
2380EXPORT_SYMBOL(cont_write_begin_newtrunc);
2381
2382int cont_write_begin(struct file *file, struct address_space *mapping,
2383 loff_t pos, unsigned len, unsigned flags,
2384 struct page **pagep, void **fsdata,
2385 get_block_t *get_block, loff_t *bytes)
2386{
2387 int ret;
2388
2389 ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
2390 pagep, fsdata, get_block, bytes);
2391 if (unlikely(ret)) {
2392 loff_t isize = mapping->host->i_size;
2393 if (pos + len > isize)
2394 vmtruncate(mapping->host, isize);
2395 }
2396
2397 return ret;
2398}
2361EXPORT_SYMBOL(cont_write_begin); 2399EXPORT_SYMBOL(cont_write_begin);
2362 2400
2363int block_prepare_write(struct page *page, unsigned from, unsigned to, 2401int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2389,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
2389 * 2427 *
2390 * We are not allowed to take the i_mutex here so we have to play games to 2428 * We are not allowed to take the i_mutex here so we have to play games to
2391 * protect against truncate races as the page could now be beyond EOF. Because 2429 * protect against truncate races as the page could now be beyond EOF. Because
2392 * vmtruncate() writes the inode size before removing pages, once we have the 2430 * truncate writes the inode size before removing pages, once we have the
2393 * page lock we can determine safely if the page is beyond EOF. If it is not 2431 * page lock we can determine safely if the page is beyond EOF. If it is not
2394 * beyond EOF, then the page is guaranteed safe against truncation until we 2432 * beyond EOF, then the page is guaranteed safe against truncation until we
2395 * unlock the page. 2433 * unlock the page.
@@ -2472,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2472} 2510}
2473 2511
2474/* 2512/*
2475 * On entry, the page is fully not uptodate. 2513 * Filesystems implementing the new truncate sequence should use the
2476 * On exit the page is fully uptodate in the areas outside (from,to) 2514 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
2515 * The filesystem needs to handle block truncation upon failure.
2477 */ 2516 */
2478int nobh_write_begin(struct file *file, struct address_space *mapping, 2517int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2479 loff_t pos, unsigned len, unsigned flags, 2518 loff_t pos, unsigned len, unsigned flags,
2480 struct page **pagep, void **fsdata, 2519 struct page **pagep, void **fsdata,
2481 get_block_t *get_block) 2520 get_block_t *get_block)
@@ -2508,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
2508 unlock_page(page); 2547 unlock_page(page);
2509 page_cache_release(page); 2548 page_cache_release(page);
2510 *pagep = NULL; 2549 *pagep = NULL;
2511 return block_write_begin(file, mapping, pos, len, flags, pagep, 2550 return block_write_begin_newtrunc(file, mapping, pos, len,
2512 fsdata, get_block); 2551 flags, pagep, fsdata, get_block);
2513 } 2552 }
2514 2553
2515 if (PageMappedToDisk(page)) 2554 if (PageMappedToDisk(page))
@@ -2613,8 +2652,34 @@ out_release:
2613 page_cache_release(page); 2652 page_cache_release(page);
2614 *pagep = NULL; 2653 *pagep = NULL;
2615 2654
2616 if (pos + len > inode->i_size) 2655 return ret;
2617 vmtruncate(inode, inode->i_size); 2656}
2657EXPORT_SYMBOL(nobh_write_begin_newtrunc);
2658
2659/*
2660 * On entry, the page is fully not uptodate.
2661 * On exit the page is fully uptodate in the areas outside (from,to)
2662 */
2663int nobh_write_begin(struct file *file, struct address_space *mapping,
2664 loff_t pos, unsigned len, unsigned flags,
2665 struct page **pagep, void **fsdata,
2666 get_block_t *get_block)
2667{
2668 int ret;
2669
2670 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
2671 pagep, fsdata, get_block);
2672
2673 /*
2674 * prepare_write() may have instantiated a few blocks
2675 * outside i_size. Trim these off again. Don't need
2676 * i_size_read because we hold i_mutex.
2677 */
2678 if (unlikely(ret)) {
2679 loff_t isize = mapping->host->i_size;
2680 if (pos + len > isize)
2681 vmtruncate(mapping->host, isize);
2682 }
2618 2683
2619 return ret; 2684 return ret;
2620} 2685}
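
The net effect for filesystems: a converted ("new truncate sequence") filesystem calls the _newtrunc variant and trims any blocks instantiated past EOF itself, while unconverted callers keep the old names and the old vmtruncate() behaviour. A hypothetical converted caller might look like this (the myfs_* names are placeholders, not real kernel symbols; the block_write_begin_newtrunc signature is as shown in the hunk above):

    static int myfs_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)
    {
            int ret;

            *pagep = NULL;
            ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
                                             pagep, fsdata, myfs_get_block);
            if (unlikely(ret)) {
                    /* new sequence: the fs, not the helper, trims blocks
                     * instantiated beyond i_size */
                    myfs_truncate_failed_write(mapping->host);
            }
            return ret;
    }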
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 27089311fbea..37fe101a4e0d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <linux/mount.h> 13#include <linux/mount.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
14#include "internal.h" 15#include "internal.h"
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index f7c255f9c624..a8cd821226da 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -34,6 +34,7 @@ struct cachefiles_object {
34 loff_t i_size; /* object size */ 34 loff_t i_size; /* object size */
35 unsigned long flags; 35 unsigned long flags;
36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ 36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
37#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */
37 atomic_t usage; /* object usage count */ 38 atomic_t usage; /* object usage count */
38 uint8_t type; /* object type */ 39 uint8_t type; /* object type */
39 uint8_t new; /* T if object new */ 40 uint8_t new; /* T if object new */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index eeb4986ea7db..f4a7840bf42c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -19,6 +19,7 @@
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/security.h> 21#include <linux/security.h>
22#include <linux/slab.h>
22#include "internal.h" 23#include "internal.h"
23 24
24#define CACHEFILES_KEYBUF_SIZE 512 25#define CACHEFILES_KEYBUF_SIZE 512
@@ -92,6 +93,59 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object,
92} 93}
93 94
94/* 95/*
96 * mark the owner of a dentry, if there is one, to indicate that that dentry
97 * has been preemptively deleted
98 * - the caller must hold the i_mutex on the dentry's parent as required to
99 * call vfs_unlink(), vfs_rmdir() or vfs_rename()
100 */
101static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
102 struct dentry *dentry)
103{
104 struct cachefiles_object *object;
105 struct rb_node *p;
106
107 _enter(",'%*.*s'",
108 dentry->d_name.len, dentry->d_name.len, dentry->d_name.name);
109
110 write_lock(&cache->active_lock);
111
112 p = cache->active_nodes.rb_node;
113 while (p) {
114 object = rb_entry(p, struct cachefiles_object, active_node);
115 if (object->dentry > dentry)
116 p = p->rb_left;
117 else if (object->dentry < dentry)
118 p = p->rb_right;
119 else
120 goto found_dentry;
121 }
122
123 write_unlock(&cache->active_lock);
124 _leave(" [no owner]");
125 return;
126
127 /* found the dentry for */
128found_dentry:
129 kdebug("preemptive burial: OBJ%x [%s] %p",
130 object->fscache.debug_id,
131 fscache_object_states[object->fscache.state],
132 dentry);
133
134 if (object->fscache.state < FSCACHE_OBJECT_DYING) {
135 printk(KERN_ERR "\n");
136 printk(KERN_ERR "CacheFiles: Error:"
137 " Can't preemptively bury live object\n");
138 cachefiles_printk_object(object, NULL);
139 } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
140 printk(KERN_ERR "CacheFiles: Error:"
141 " Object already preemptively buried\n");
142 }
143
144 write_unlock(&cache->active_lock);
145 _leave(" [owner marked]");
146}
147
148/*
95 * record the fact that an object is now active 149 * record the fact that an object is now active
96 */ 150 */
97static int cachefiles_mark_object_active(struct cachefiles_cache *cache, 151static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
@@ -218,7 +272,8 @@ requeue:
218 */ 272 */
219static int cachefiles_bury_object(struct cachefiles_cache *cache, 273static int cachefiles_bury_object(struct cachefiles_cache *cache,
220 struct dentry *dir, 274 struct dentry *dir,
221 struct dentry *rep) 275 struct dentry *rep,
276 bool preemptive)
222{ 277{
223 struct dentry *grave, *trap; 278 struct dentry *grave, *trap;
224 char nbuffer[8 + 8 + 1]; 279 char nbuffer[8 + 8 + 1];
@@ -228,11 +283,16 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
228 dir->d_name.len, dir->d_name.len, dir->d_name.name, 283 dir->d_name.len, dir->d_name.len, dir->d_name.name,
229 rep->d_name.len, rep->d_name.len, rep->d_name.name); 284 rep->d_name.len, rep->d_name.len, rep->d_name.name);
230 285
286 _debug("remove %p from %p", rep, dir);
287
231 /* non-directories can just be unlinked */ 288 /* non-directories can just be unlinked */
232 if (!S_ISDIR(rep->d_inode->i_mode)) { 289 if (!S_ISDIR(rep->d_inode->i_mode)) {
233 _debug("unlink stale object"); 290 _debug("unlink stale object");
234 ret = vfs_unlink(dir->d_inode, rep); 291 ret = vfs_unlink(dir->d_inode, rep);
235 292
293 if (preemptive)
294 cachefiles_mark_object_buried(cache, rep);
295
236 mutex_unlock(&dir->d_inode->i_mutex); 296 mutex_unlock(&dir->d_inode->i_mutex);
237 297
238 if (ret == -EIO) 298 if (ret == -EIO)
@@ -324,6 +384,9 @@ try_again:
324 if (ret != 0 && ret != -ENOMEM) 384 if (ret != 0 && ret != -ENOMEM)
325 cachefiles_io_error(cache, "Rename failed with error %d", ret); 385 cachefiles_io_error(cache, "Rename failed with error %d", ret);
326 386
387 if (preemptive)
388 cachefiles_mark_object_buried(cache, rep);
389
327 unlock_rename(cache->graveyard, dir); 390 unlock_rename(cache->graveyard, dir);
328 dput(grave); 391 dput(grave);
329 _leave(" = 0"); 392 _leave(" = 0");
@@ -339,7 +402,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
339 struct dentry *dir; 402 struct dentry *dir;
340 int ret; 403 int ret;
341 404
342 _enter(",{%p}", object->dentry); 405 _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry);
343 406
344 ASSERT(object->dentry); 407 ASSERT(object->dentry);
345 ASSERT(object->dentry->d_inode); 408 ASSERT(object->dentry->d_inode);
@@ -349,15 +412,25 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
349 412
350 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 413 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
351 414
352 /* we need to check that our parent is _still_ our parent - it may have 415 if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
353 * been renamed */ 416 /* object allocation for the same key preemptively deleted this
354 if (dir == object->dentry->d_parent) { 417 * object's file so that it could create its own file */
355 ret = cachefiles_bury_object(cache, dir, object->dentry); 418 _debug("object preemptively buried");
356 } else {
357 /* it got moved, presumably by cachefilesd culling it, so it's
358 * no longer in the key path and we can ignore it */
359 mutex_unlock(&dir->d_inode->i_mutex); 419 mutex_unlock(&dir->d_inode->i_mutex);
360 ret = 0; 420 ret = 0;
421 } else {
422 /* we need to check that our parent is _still_ our parent - it
423 * may have been renamed */
424 if (dir == object->dentry->d_parent) {
425 ret = cachefiles_bury_object(cache, dir,
426 object->dentry, false);
427 } else {
428 /* it got moved, presumably by cachefilesd culling it,
429 * so it's no longer in the key path and we can ignore
430 * it */
431 mutex_unlock(&dir->d_inode->i_mutex);
432 ret = 0;
433 }
361 } 434 }
362 435
363 dput(dir); 436 dput(dir);
@@ -380,7 +453,9 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
380 const char *name; 453 const char *name;
381 int ret, nlen; 454 int ret, nlen;
382 455
383 _enter("{%p},,%s,", parent->dentry, key); 456 _enter("OBJ%x{%p},OBJ%x,%s,",
457 parent->fscache.debug_id, parent->dentry,
458 object->fscache.debug_id, key);
384 459
385 cache = container_of(parent->fscache.cache, 460 cache = container_of(parent->fscache.cache,
386 struct cachefiles_cache, cache); 461 struct cachefiles_cache, cache);
@@ -508,7 +583,7 @@ lookup_again:
508 * mutex) */ 583 * mutex) */
509 object->dentry = NULL; 584 object->dentry = NULL;
510 585
511 ret = cachefiles_bury_object(cache, dir, next); 586 ret = cachefiles_bury_object(cache, dir, next, true);
512 dput(next); 587 dput(next);
513 next = NULL; 588 next = NULL;
514 589
@@ -827,7 +902,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
827 /* actually remove the victim (drops the dir mutex) */ 902 /* actually remove the victim (drops the dir mutex) */
828 _debug("bury"); 903 _debug("bury");
829 904
830 ret = cachefiles_bury_object(cache, dir, victim); 905 ret = cachefiles_bury_object(cache, dir, victim, false);
831 if (ret < 0) 906 if (ret < 0)
832 goto error; 907 goto error;
833 908
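
Note how cachefiles_mark_object_buried() finds the owning object: the cache's active_nodes rbtree is keyed on the raw dentry pointer value, so the walk is a pointer comparison, not a name comparison. The lookup shape, as a standalone sketch:

    #include <stdint.h>

    struct node {
            struct node *left, *right;
            const void *key;        /* here: the dentry pointer itself */
    };

    static struct node *find_by_ptr(struct node *p, const void *dentry)
    {
            uintptr_t k = (uintptr_t)dentry;

            while (p) {
                    uintptr_t n = (uintptr_t)p->key;

                    if (n > k)
                            p = p->left;
                    else if (n < k)
                            p = p->right;
                    else
                            return p;       /* found the owner to mark buried */
            }
            return NULL;                    /* no owner: nothing to mark */
    }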
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 1d8332563863..0f0d41fbb03f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/slab.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include "internal.h" 15#include "internal.h"
15 16
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index b5808cdb2232..039b5011d83b 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -77,6 +77,8 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
77/* 77/*
78 * check the security details of the on-disk cache 78 * check the security details of the on-disk cache
79 * - must be called with security override in force 79 * - must be called with security override in force
80 * - must return with a security override in force - even in the case of an
81 * error
80 */ 82 */
81int cachefiles_determine_cache_security(struct cachefiles_cache *cache, 83int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
82 struct dentry *root, 84 struct dentry *root,
@@ -99,6 +101,8 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
99 * which create files */ 101 * which create files */
100 ret = set_create_files_as(new, root->d_inode); 102 ret = set_create_files_as(new, root->d_inode);
101 if (ret < 0) { 103 if (ret < 0) {
104 abort_creds(new);
105 cachefiles_begin_secure(cache, _saved_cred);
102 _leave(" = %d [cfa]", ret); 106 _leave(" = %d [cfa]", ret);
103 return ret; 107 return ret;
104 } 108 }
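
The security.c hunk is a contract fix rather than a feature: cachefiles_determine_cache_security() is documented (by the new comment) to return with a security override in force even on error, so the set_create_files_as() failure path now drops the half-built credentials with abort_creds() and re-enters the cache's secure state before returning.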
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index f3e7a0bf068b..e18b183b47e1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -16,6 +16,7 @@
16#include <linux/fsnotify.h> 16#include <linux/fsnotify.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/xattr.h> 18#include <linux/xattr.h>
19#include <linux/slab.h>
19#include "internal.h" 20#include "internal.h"
20 21
21static const char cachefiles_xattr_cache[] = 22static const char cachefiles_xattr_cache[] =
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 23bb0ceabe31..d9c60b84949a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -5,6 +5,7 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/pagemap.h> 6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */ 7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/slab.h>
8#include <linux/pagevec.h> 9#include <linux/pagevec.h>
9#include <linux/task_io_accounting_ops.h> 10#include <linux/task_io_accounting_ops.h>
10 11
@@ -273,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
273 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
274 int rc = 0; 275 int rc = 0;
275 struct page **pages; 276 struct page **pages;
276 struct pagevec pvec;
277 loff_t offset; 277 loff_t offset;
278 u64 len; 278 u64 len;
279 279
@@ -296,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
296 if (rc < 0) 296 if (rc < 0)
297 goto out; 297 goto out;
298 298
299 /* set uptodate and add to lru in pagevec-sized chunks */
300 pagevec_init(&pvec, 0);
301 for (; !list_empty(page_list) && len > 0; 299 for (; !list_empty(page_list) && len > 0;
302 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { 300 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
303 struct page *page = 301 struct page *page =
@@ -311,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
311 zero_user_segment(page, s, PAGE_CACHE_SIZE); 309 zero_user_segment(page, s, PAGE_CACHE_SIZE);
312 } 310 }
313 311
314 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { 312 if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
315 page_cache_release(page); 313 page_cache_release(page);
316 dout("readpages %p add_to_page_cache failed %p\n", 314 dout("readpages %p add_to_page_cache failed %p\n",
317 inode, page); 315 inode, page);
@@ -322,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
322 flush_dcache_page(page); 320 flush_dcache_page(page);
323 SetPageUptodate(page); 321 SetPageUptodate(page);
324 unlock_page(page); 322 unlock_page(page);
325 if (pagevec_add(&pvec, page) == 0) 323 page_cache_release(page);
326 pagevec_lru_add_file(&pvec); /* add to lru */
327 } 324 }
328 pagevec_lru_add_file(&pvec);
329 rc = 0; 325 rc = 0;
330 326
331out: 327out:
@@ -336,16 +332,15 @@ out:
336/* 332/*
337 * Get ref for the oldest snapc for an inode with dirty data... that is, the 333 * Get ref for the oldest snapc for an inode with dirty data... that is, the
338 * only snap context we are allowed to write back. 334 * only snap context we are allowed to write back.
339 *
340 * Caller holds i_lock.
341 */ 335 */
342static struct ceph_snap_context *__get_oldest_context(struct inode *inode, 336static struct ceph_snap_context *get_oldest_context(struct inode *inode,
343 u64 *snap_size) 337 u64 *snap_size)
344{ 338{
345 struct ceph_inode_info *ci = ceph_inode(inode); 339 struct ceph_inode_info *ci = ceph_inode(inode);
346 struct ceph_snap_context *snapc = NULL; 340 struct ceph_snap_context *snapc = NULL;
347 struct ceph_cap_snap *capsnap = NULL; 341 struct ceph_cap_snap *capsnap = NULL;
348 342
343 spin_lock(&inode->i_lock);
349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 344 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, 345 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
351 capsnap->context, capsnap->dirty_pages); 346 capsnap->context, capsnap->dirty_pages);
@@ -356,21 +351,11 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
356 break; 351 break;
357 } 352 }
358 } 353 }
359 if (!snapc && ci->i_snap_realm) { 354 if (!snapc && ci->i_head_snapc) {
360 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); 355 snapc = ceph_get_snap_context(ci->i_head_snapc);
361 dout(" head snapc %p has %d dirty pages\n", 356 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head); 357 snapc, ci->i_wrbuffer_ref_head);
363 } 358 }
364 return snapc;
365}
366
367static struct ceph_snap_context *get_oldest_context(struct inode *inode,
368 u64 *snap_size)
369{
370 struct ceph_snap_context *snapc = NULL;
371
372 spin_lock(&inode->i_lock);
373 snapc = __get_oldest_context(inode, snap_size);
374 spin_unlock(&inode->i_lock); 359 spin_unlock(&inode->i_lock);
375 return snapc; 360 return snapc;
376} 361}
@@ -391,7 +376,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
391 int len = PAGE_CACHE_SIZE; 376 int len = PAGE_CACHE_SIZE;
392 loff_t i_size; 377 loff_t i_size;
393 int err = 0; 378 int err = 0;
394 struct ceph_snap_context *snapc; 379 struct ceph_snap_context *snapc, *oldest;
395 u64 snap_size = 0; 380 u64 snap_size = 0;
396 long writeback_stat; 381 long writeback_stat;
397 382
@@ -412,13 +397,16 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
412 dout("writepage %p page %p not dirty?\n", inode, page); 397 dout("writepage %p page %p not dirty?\n", inode, page);
413 goto out; 398 goto out;
414 } 399 }
415 if (snapc != get_oldest_context(inode, &snap_size)) { 400 oldest = get_oldest_context(inode, &snap_size);
401 if (snapc->seq > oldest->seq) {
416 dout("writepage %p page %p snapc %p not writeable - noop\n", 402 dout("writepage %p page %p snapc %p not writeable - noop\n",
417 inode, page, (void *)page->private); 403 inode, page, (void *)page->private);
418 /* we should only noop if called by kswapd */ 404 /* we should only noop if called by kswapd */
419 WARN_ON((current->flags & PF_MEMALLOC) == 0); 405 WARN_ON((current->flags & PF_MEMALLOC) == 0);
406 ceph_put_snap_context(oldest);
420 goto out; 407 goto out;
421 } 408 }
409 ceph_put_snap_context(oldest);
422 410
423 /* is this a partial page at end of file? */ 411 /* is this a partial page at end of file? */
424 if (snap_size) 412 if (snap_size)
@@ -457,7 +445,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
457 ClearPagePrivate(page); 445 ClearPagePrivate(page);
458 end_page_writeback(page); 446 end_page_writeback(page);
459 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 447 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
460 ceph_put_snap_context(snapc); 448 ceph_put_snap_context(snapc); /* page's reference */
461out: 449out:
462 return err; 450 return err;
463} 451}
@@ -511,12 +499,11 @@ static void writepages_finish(struct ceph_osd_request *req,
511 int i; 499 int i;
512 struct ceph_snap_context *snapc = req->r_snapc; 500 struct ceph_snap_context *snapc = req->r_snapc;
513 struct address_space *mapping = inode->i_mapping; 501 struct address_space *mapping = inode->i_mapping;
514 struct writeback_control *wbc = req->r_wbc;
515 __s32 rc = -EIO; 502 __s32 rc = -EIO;
516 u64 bytes = 0; 503 u64 bytes = 0;
517 struct ceph_client *client = ceph_inode_to_client(inode); 504 struct ceph_client *client = ceph_inode_to_client(inode);
518 long writeback_stat; 505 long writeback_stat;
519 unsigned issued = __ceph_caps_issued(ci, NULL); 506 unsigned issued = ceph_caps_issued(ci);
520 507
521 /* parse reply */ 508 /* parse reply */
522 replyhead = msg->front.iov_base; 509 replyhead = msg->front.iov_base;
@@ -553,13 +540,9 @@ static void writepages_finish(struct ceph_osd_request *req,
553 clear_bdi_congested(&client->backing_dev_info, 540 clear_bdi_congested(&client->backing_dev_info,
554 BLK_RW_ASYNC); 541 BLK_RW_ASYNC);
555 542
556 if (i >= wrote) { 543 ceph_put_snap_context((void *)page->private);
557 dout("inode %p skipping page %p\n", inode, page);
558 wbc->pages_skipped++;
559 }
560 page->private = 0; 544 page->private = 0;
561 ClearPagePrivate(page); 545 ClearPagePrivate(page);
562 ceph_put_snap_context(snapc);
563 dout("unlocking %d %p\n", i, page); 546 dout("unlocking %d %p\n", i, page);
564 end_page_writeback(page); 547 end_page_writeback(page);
565 548
@@ -580,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
580 ceph_release_pages(req->r_pages, req->r_num_pages); 563 ceph_release_pages(req->r_pages, req->r_num_pages);
581 if (req->r_pages_from_pool) 564 if (req->r_pages_from_pool)
582 mempool_free(req->r_pages, 565 mempool_free(req->r_pages,
583 ceph_client(inode->i_sb)->wb_pagevec_pool); 566 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
584 else 567 else
585 kfree(req->r_pages); 568 kfree(req->r_pages);
586 ceph_osdc_put_request(req); 569 ceph_osdc_put_request(req);
@@ -617,7 +600,7 @@ static int ceph_writepages_start(struct address_space *mapping,
617 int range_whole = 0; 600 int range_whole = 0;
618 int should_loop = 1; 601 int should_loop = 1;
619 pgoff_t max_pages = 0, max_pages_ever = 0; 602 pgoff_t max_pages = 0, max_pages_ever = 0;
620 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL; 603 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
621 struct pagevec pvec; 604 struct pagevec pvec;
622 int done = 0; 605 int done = 0;
623 int rc = 0; 606 int rc = 0;
@@ -769,9 +752,10 @@ get_more_pages:
769 } 752 }
770 753
771 /* only if matching snap context */ 754 /* only if matching snap context */
772 if (snapc != (void *)page->private) { 755 pgsnapc = (void *)page->private;
773 dout("page snapc %p != oldest %p\n", 756 if (pgsnapc->seq > snapc->seq) {
774 (void *)page->private, snapc); 757 dout("page snapc %p %lld > oldest %p %lld\n",
758 pgsnapc, pgsnapc->seq, snapc, snapc->seq);
775 unlock_page(page); 759 unlock_page(page);
776 if (!locked_pages) 760 if (!locked_pages)
777 continue; /* keep looking for snap */ 761 continue; /* keep looking for snap */
@@ -805,7 +789,6 @@ get_more_pages:
805 alloc_page_vec(client, req); 789 alloc_page_vec(client, req);
806 req->r_callback = writepages_finish; 790 req->r_callback = writepages_finish;
807 req->r_inode = inode; 791 req->r_inode = inode;
808 req->r_wbc = wbc;
809 } 792 }
810 793
811 /* note position of first page in pvec */ 794 /* note position of first page in pvec */
@@ -913,12 +896,19 @@ static int context_is_writeable_or_written(struct inode *inode,
913 struct ceph_snap_context *snapc) 896 struct ceph_snap_context *snapc)
914{ 897{
915 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); 898 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
916 return !oldest || snapc->seq <= oldest->seq; 899 int ret = !oldest || snapc->seq <= oldest->seq;
900
901 ceph_put_snap_context(oldest);
902 return ret;
917} 903}
918 904
919/* 905/*
920 * We are only allowed to write into/dirty the page if the page is 906 * We are only allowed to write into/dirty the page if the page is
921 * clean, or already dirty within the same snap context. 907 * clean, or already dirty within the same snap context.
908 *
909 * called with page locked.
910 * return success with page locked,
911 * or any failure (incl -EAGAIN) with page unlocked.
922 */ 912 */
923static int ceph_update_writeable_page(struct file *file, 913static int ceph_update_writeable_page(struct file *file,
924 loff_t pos, unsigned len, 914 loff_t pos, unsigned len,
@@ -931,8 +921,8 @@ static int ceph_update_writeable_page(struct file *file,
931 int pos_in_page = pos & ~PAGE_CACHE_MASK; 921 int pos_in_page = pos & ~PAGE_CACHE_MASK;
932 int end_in_page = pos_in_page + len; 922 int end_in_page = pos_in_page + len;
933 loff_t i_size; 923 loff_t i_size;
934 struct ceph_snap_context *snapc;
935 int r; 924 int r;
925 struct ceph_snap_context *snapc, *oldest;
936 926
937retry_locked: 927retry_locked:
938 /* writepages currently holds page lock, but if we change that later, */ 928 /* writepages currently holds page lock, but if we change that later, */
@@ -942,30 +932,34 @@ retry_locked:
942 BUG_ON(!ci->i_snap_realm); 932 BUG_ON(!ci->i_snap_realm);
943 down_read(&mdsc->snap_rwsem); 933 down_read(&mdsc->snap_rwsem);
944 BUG_ON(!ci->i_snap_realm->cached_context); 934 BUG_ON(!ci->i_snap_realm->cached_context);
945 if (page->private && 935 snapc = (void *)page->private;
946 (void *)page->private != ci->i_snap_realm->cached_context) { 936 if (snapc && snapc != ci->i_head_snapc) {
947 /* 937 /*
948 * this page is already dirty in another (older) snap 938 * this page is already dirty in another (older) snap
949 * context! is it writeable now? 939 * context! is it writeable now?
950 */ 940 */
951 snapc = get_oldest_context(inode, NULL); 941 oldest = get_oldest_context(inode, NULL);
952 up_read(&mdsc->snap_rwsem); 942 up_read(&mdsc->snap_rwsem);
953 943
954 if (snapc != (void *)page->private) { 944 if (snapc->seq > oldest->seq) {
945 ceph_put_snap_context(oldest);
955 dout(" page %p snapc %p not current or oldest\n", 946 dout(" page %p snapc %p not current or oldest\n",
956 page, (void *)page->private); 947 page, snapc);
957 /* 948 /*
958 * queue for writeback, and wait for snapc to 949 * queue for writeback, and wait for snapc to
959 * be writeable or written 950 * be writeable or written
960 */ 951 */
961 snapc = ceph_get_snap_context((void *)page->private); 952 snapc = ceph_get_snap_context(snapc);
962 unlock_page(page); 953 unlock_page(page);
963 ceph_queue_writeback(inode); 954 ceph_queue_writeback(inode);
964 wait_event_interruptible(ci->i_cap_wq, 955 r = wait_event_interruptible(ci->i_cap_wq,
965 context_is_writeable_or_written(inode, snapc)); 956 context_is_writeable_or_written(inode, snapc));
966 ceph_put_snap_context(snapc); 957 ceph_put_snap_context(snapc);
958 if (r == -ERESTARTSYS)
959 return r;
967 return -EAGAIN; 960 return -EAGAIN;
968 } 961 }
962 ceph_put_snap_context(oldest);
969 963
970 /* yay, writeable, do it now (without dropping page lock) */ 964 /* yay, writeable, do it now (without dropping page lock) */
971 dout(" page %p snapc %p not current, but oldest\n", 965 dout(" page %p snapc %p not current, but oldest\n",
@@ -1035,7 +1029,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1035 int r; 1029 int r;
1036 1030
1037 do { 1031 do {
1038 /* get a page*/ 1032 /* get a page */
1039 page = grab_cache_page_write_begin(mapping, index, 0); 1033 page = grab_cache_page_write_begin(mapping, index, 0);
1040 if (!page) 1034 if (!page)
1041 return -ENOMEM; 1035 return -ENOMEM;
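
Several of the addr.c hunks fix the same two bugs: get_oldest_context() returns a referenced snap context, so callers that merely compare against it must ceph_put_snap_context() it on every path; and the comparison itself moves from pointer equality to seq ordering, since distinct context objects can describe the same point in snap history. The corrected caller pattern, in outline:

            struct ceph_snap_context *oldest;

            oldest = get_oldest_context(inode, NULL);  /* takes a reference */
            if (snapc->seq > oldest->seq) {
                    /* page is dirty in a newer context: not writeable yet */
                    ceph_put_snap_context(oldest);
                    goto out;
            }
            ceph_put_snap_context(oldest);  /* drop it on success too */
            /* ... safe to write back under snapc ... */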
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index abb204fea6c7..89490beaf537 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -2,6 +2,7 @@
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/slab.h>
5 6
6#include "types.h" 7#include "types.h"
7#include "auth_none.h" 8#include "auth_none.h"
@@ -148,7 +149,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
148 149
149 ret = ac->ops->build_request(ac, p + sizeof(u32), end); 150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
150 if (ret < 0) { 151 if (ret < 0) {
151 pr_err("error %d building request\n", ret); 152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
152 return ret; 154 return ret;
153 } 155 }
154 dout(" built request %d bytes\n", ret); 156 dout(" built request %d bytes\n", ret);
@@ -227,7 +229,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
227 if (ret == -EAGAIN) { 229 if (ret == -EAGAIN) {
228 return ceph_build_auth_request(ac, reply_buf, reply_len); 230 return ceph_build_auth_request(ac, reply_buf, reply_len);
229 } else if (ret) { 231 } else if (ret) {
230 pr_err("authentication error %d\n", ret); 232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
231 return ret; 233 return ret;
232 } 234 }
233 return 0; 235 return 0;
@@ -244,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
244 if (!ac->protocol) 246 if (!ac->protocol)
245 return ceph_auth_build_hello(ac, msg_buf, msg_len); 247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
246 BUG_ON(!ac->ops); 248 BUG_ON(!ac->ops);
247 if (!ac->ops->is_authenticated(ac)) 249 if (ac->ops->should_authenticate(ac))
248 return ceph_build_auth_request(ac, msg_buf, msg_len); 250 return ceph_build_auth_request(ac, msg_buf, msg_len);
249 return 0; 251 return 0;
250} 252}
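
After this hunk, ceph_build_auth() keys off the new predicate, so a method can ask to (re)authenticate while still technically authenticated; auth_x uses this to renew ageing tickets (see auth_x.c below), while auth_none simply mirrors its old behaviour. The function as it reads with the change applied (signature per the surrounding code):

    int ceph_build_auth(struct ceph_auth_client *ac,
                        void *msg_buf, size_t msg_len)
    {
            if (!ac->protocol)
                    return ceph_auth_build_hello(ac, msg_buf, msg_len);
            BUG_ON(!ac->ops);
            if (ac->ops->should_authenticate(ac))
                    return ceph_build_auth_request(ac, msg_buf, msg_len);
            return 0;
    }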
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb267..d38a2fb4a137 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
15struct ceph_authorizer; 15struct ceph_authorizer;
16 16
17struct ceph_auth_client_ops { 17struct ceph_auth_client_ops {
18 const char *name;
19
18 /* 20 /*
19 * true if we are authenticated and can connect to 21 * true if we are authenticated and can connect to
20 * services. 22 * services.
@@ -22,6 +24,12 @@ struct ceph_auth_client_ops {
22 int (*is_authenticated)(struct ceph_auth_client *ac); 24 int (*is_authenticated)(struct ceph_auth_client *ac);
23 25
24 /* 26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
25 * build requests and process replies during monitor 33 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build 34 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request. 35 * another request.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index b4ef6f0a6c85..ad1dc21286c7 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -4,6 +4,7 @@
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/random.h> 6#include <linux/random.h>
7#include <linux/slab.h>
7 8
8#include "auth_none.h" 9#include "auth_none.h"
9#include "auth.h" 10#include "auth.h"
@@ -30,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac)
30 return !xi->starting; 31 return !xi->starting;
31} 32}
32 33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
33/* 41/*
34 * the generic auth code decode the global_id, and we carry no actual 42 * the generic auth code decode the global_id, and we carry no actual
35 * authenticate state, so nothing happens here. 43 * authenticate state, so nothing happens here.
@@ -93,9 +101,11 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
93} 101}
94 102
95static const struct ceph_auth_client_ops ceph_auth_none_ops = { 103static const struct ceph_auth_client_ops ceph_auth_none_ops = {
104 .name = "none",
96 .reset = reset, 105 .reset = reset,
97 .destroy = destroy, 106 .destroy = destroy,
98 .is_authenticated = is_authenticated, 107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
99 .handle_reply = handle_reply, 109 .handle_reply = handle_reply,
100 .create_authorizer = ceph_auth_none_create_authorizer, 110 .create_authorizer = ceph_auth_none_create_authorizer,
101 .destroy_authorizer = ceph_auth_none_destroy_authorizer, 111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
index 56c05533a31c..8164df1a08be 100644
--- a/fs/ceph/auth_none.h
+++ b/fs/ceph/auth_none.h
@@ -1,6 +1,8 @@
1#ifndef _FS_CEPH_AUTH_NONE_H 1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H 2#define _FS_CEPH_AUTH_NONE_H
3 3
4#include <linux/slab.h>
5
4#include "auth.h" 6#include "auth.h"
5 7
6/* 8/*
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index f0318427b6da..83d4d2785ffe 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -4,6 +4,7 @@
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/slab.h>

 #include "auth_x.h"
 #include "auth_x_protocol.h"
@@ -11,8 +12,6 @@
 #include "auth.h"
 #include "decode.h"

-struct kmem_cache *ceph_x_ticketbuf_cachep;
-
 #define TEMP_TICKET_BUF_LEN	256

 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
@@ -28,6 +27,23 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
 	return (ac->want_keys & xi->have_keys) == ac->want_keys;
 }

+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+
+	ceph_x_validate_tickets(ac, &need);
+	dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
+	     ac->want_keys, need, xi->have_keys);
+	return need != 0;
+}
+
+static int ceph_x_encrypt_buflen(int ilen)
+{
+	return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+	       sizeof(u32);
+}
+
 static int ceph_x_encrypt(struct ceph_crypto_key *secret,
 			  void *ibuf, int ilen, void *obuf, size_t olen)
 {
@@ -122,27 +138,26 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 	int ret;
 	char *dbuf;
 	char *ticket_buf;
-	u8 struct_v;
+	u8 reply_struct_v;

-	dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
+	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
 	if (!dbuf)
 		return -ENOMEM;

 	ret = -ENOMEM;
-	ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
-				      GFP_NOFS | GFP_ATOMIC);
+	ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
 	if (!ticket_buf)
 		goto out_dbuf;

 	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-	struct_v = ceph_decode_8(&p);
-	if (struct_v != 1)
+	reply_struct_v = ceph_decode_8(&p);
+	if (reply_struct_v != 1)
 		goto bad;
 	num = ceph_decode_32(&p);
 	dout("%d tickets\n", num);
 	while (num--) {
 		int type;
-		u8 struct_v;
+		u8 tkt_struct_v, blob_struct_v;
 		struct ceph_x_ticket_handler *th;
 		void *dp, *dend;
 		int dlen;
@@ -150,14 +165,19 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		struct timespec validity;
 		struct ceph_crypto_key old_key;
 		void *tp, *tpend;
+		struct ceph_timespec new_validity;
+		struct ceph_crypto_key new_session_key;
+		struct ceph_buffer *new_ticket_blob;
+		unsigned long new_expires, new_renew_after;
+		u64 new_secret_id;

 		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);

 		type = ceph_decode_32(&p);
 		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));

-		struct_v = ceph_decode_8(&p);
-		if (struct_v != 1)
+		tkt_struct_v = ceph_decode_8(&p);
+		if (tkt_struct_v != 1)
 			goto bad;

 		th = get_ticket_handler(ac, type);
@@ -177,21 +197,21 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		dend = dbuf + dlen;
 		dp = dbuf;

-		struct_v = ceph_decode_8(&dp);
-		if (struct_v != 1)
+		tkt_struct_v = ceph_decode_8(&dp);
+		if (tkt_struct_v != 1)
 			goto bad;

 		memcpy(&old_key, &th->session_key, sizeof(old_key));
-		ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
+		ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
 		if (ret)
 			goto out;

-		ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
-		ceph_decode_timespec(&validity, &th->validity);
-		th->expires = get_seconds() + validity.tv_sec;
-		th->renew_after = th->expires - (validity.tv_sec / 4);
-		dout(" expires=%lu renew_after=%lu\n", th->expires,
-		     th->renew_after);
+		ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+		ceph_decode_timespec(&validity, &new_validity);
+		new_expires = get_seconds() + validity.tv_sec;
+		new_renew_after = new_expires - (validity.tv_sec / 4);
+		dout(" expires=%lu renew_after=%lu\n", new_expires,
+		     new_renew_after);

 		/* ticket blob for service */
 		ceph_decode_8_safe(&p, end, is_enc, bad);
@@ -215,11 +235,22 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		tpend = tp + dlen;
 		dout(" ticket blob is %d bytes\n", dlen);
 		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
-		struct_v = ceph_decode_8(&tp);
-		th->secret_id = ceph_decode_64(&tp);
-		ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
+		blob_struct_v = ceph_decode_8(&tp);
+		new_secret_id = ceph_decode_64(&tp);
+		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
 		if (ret)
 			goto out;
+
+		/* all is well, update our ticket */
+		ceph_crypto_key_destroy(&th->session_key);
+		if (th->ticket_blob)
+			ceph_buffer_put(th->ticket_blob);
+		th->session_key = new_session_key;
+		th->ticket_blob = new_ticket_blob;
+		th->validity = new_validity;
+		th->secret_id = new_secret_id;
+		th->expires = new_expires;
+		th->renew_after = new_renew_after;
 		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
 		     type, ceph_entity_type_name(type), th->secret_id,
 		     (int)th->ticket_blob->vec.iov_len);
@@ -228,9 +259,9 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,

 	ret = 0;
 out:
-	kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
+	kfree(ticket_buf);
 out_dbuf:
-	kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
+	kfree(dbuf);
 	return ret;

 bad:
@@ -242,7 +273,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 				  struct ceph_x_ticket_handler *th,
 				  struct ceph_x_authorizer *au)
 {
-	int len;
+	int maxlen;
 	struct ceph_x_authorize_a *msg_a;
 	struct ceph_x_authorize_b msg_b;
 	void *p, *end;
@@ -253,15 +284,15 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 	dout("build_authorizer for %s %p\n",
 	     ceph_entity_type_name(th->service), au);

-	len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
-		ticket_blob_len + 16;
-	dout(" need len %d\n", len);
-	if (au->buf && au->buf->alloc_len < len) {
+	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+		ceph_x_encrypt_buflen(ticket_blob_len);
+	dout(" need len %d\n", maxlen);
+	if (au->buf && au->buf->alloc_len < maxlen) {
 		ceph_buffer_put(au->buf);
 		au->buf = NULL;
 	}
 	if (!au->buf) {
-		au->buf = ceph_buffer_new(len, GFP_NOFS);
+		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
 		if (!au->buf)
 			return -ENOMEM;
 	}
@@ -296,6 +327,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
 	dout(" built authorizer nonce %llx len %d\n", au->nonce,
 	     (int)au->buf->vec.iov_len);
+	BUG_ON(au->buf->vec.iov_len > maxlen);
 	return 0;

 out_buf:
@@ -581,8 +613,6 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
 		remove_ticket_handler(ac, th);
 	}

-	kmem_cache_destroy(ceph_x_ticketbuf_cachep);
-
 	kfree(ac->private);
 	ac->private = NULL;
 }
@@ -599,7 +629,9 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,


 static const struct ceph_auth_client_ops ceph_x_ops = {
+	.name = "x",
 	.is_authenticated = ceph_x_is_authenticated,
+	.should_authenticate = ceph_x_should_authenticate,
 	.build_request = ceph_x_build_request,
 	.handle_reply = ceph_x_handle_reply,
 	.create_authorizer = ceph_x_create_authorizer,
@@ -617,26 +649,20 @@ int ceph_x_init(struct ceph_auth_client *ac)
 	int ret;

 	dout("ceph_x_init %p\n", ac);
+	ret = -ENOMEM;
 	xi = kzalloc(sizeof(*xi), GFP_NOFS);
 	if (!xi)
-		return -ENOMEM;
+		goto out;

-	ret = -ENOMEM;
-	ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
-				      TEMP_TICKET_BUF_LEN, 8,
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      NULL);
-	if (!ceph_x_ticketbuf_cachep)
-		goto done_nomem;
 	ret = -EINVAL;
 	if (!ac->secret) {
 		pr_err("no secret set (for auth_x protocol)\n");
-		goto done_nomem;
+		goto out_nomem;
 	}

 	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
 	if (ret)
-		goto done_nomem;
+		goto out_nomem;

 	xi->starting = true;
 	xi->ticket_handlers = RB_ROOT;
@@ -646,10 +672,9 @@ int ceph_x_init(struct ceph_auth_client *ac)
 	ac->ops = &ceph_x_ops;
 	return 0;

-done_nomem:
+out_nomem:
 	kfree(xi);
-	if (ceph_x_ticketbuf_cachep)
-		kmem_cache_destroy(ceph_x_ticketbuf_cachep);
+out:
 	return ret;
 }

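The ticket-reply rework above is a decode-then-commit pattern: each field is parsed into locals (new_session_key, new_validity, new_ticket_blob, ...) and the live ticket handler is updated only after the whole record decodes cleanly, so a truncated or corrupt reply can no longer leave th half-updated. A generic sketch of the same idiom, with invented names, assuming a decode_record() that fails without side effects:

	/* decode into a scratch copy; commit only on full success */
	int update_record(struct record *r, void **p, void *end)
	{
		struct record tmp;
		int ret = decode_record(&tmp, p, end);	/* invented decoder */

		if (ret)
			return ret;	/* *r untouched on any decode error */
		release_record(r);	/* drop the old record's resources */
		*r = tmp;		/* commit the new contents */
		return 0;
	}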
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index b98086c7aeba..c67535d70aa6 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -1,5 +1,8 @@

 #include "ceph_debug.h"
+
+#include <linux/slab.h>
+
 #include "buffer.h"
 #include "decode.h"

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index db122bb357b8..ae3e3a306445 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/wait.h>
 #include <linux/writeback.h>
@@ -857,6 +858,8 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
 }

 /*
+ * Remove a cap. Take steps to deal with a racing iterate_session_caps.
+ *
  * caller should hold i_lock.
  * caller will not hold session s_mutex if called from destroy_inode.
  */
@@ -864,16 +867,12 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 {
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
-	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	struct ceph_mds_client *mdsc =
+		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+	int removed = 0;

 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);

-	/* remove from inode list */
-	rb_erase(&cap->ci_node, &ci->i_caps);
-	cap->ci = NULL;
-	if (ci->i_auth_cap == cap)
-		ci->i_auth_cap = NULL;
-
 	/* remove from session list */
 	spin_lock(&session->s_cap_lock);
 	if (session->s_cap_iterator == cap) {
@@ -884,10 +883,18 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 		list_del_init(&cap->session_caps);
 		session->s_nr_caps--;
 		cap->session = NULL;
+		removed = 1;
 	}
+	/* protect backpointer with s_cap_lock: see iterate_session_caps */
+	cap->ci = NULL;
 	spin_unlock(&session->s_cap_lock);

-	if (cap->session == NULL)
+	/* remove from inode list */
+	rb_erase(&cap->ci_node, &ci->i_caps);
+	if (ci->i_auth_cap == cap)
+		ci->i_auth_cap = NULL;
+
+	if (removed)
 		ceph_put_cap(cap);

 	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
@@ -931,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	     seq, issue_seq, mseq, follows, size, max_size,
 	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);

-	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
+	if (!msg)
+		return -ENOMEM;

 	msg->hdr.tid = cpu_to_le64(flush_tid);

@@ -1204,6 +1211,12 @@ retry:
 		if (capsnap->dirty_pages || capsnap->writing)
 			continue;

+		/*
+		 * if cap writeback already occurred, we should have dropped
+		 * the capsnap in ceph_put_wrbuffer_cap_refs.
+		 */
+		BUG_ON(capsnap->dirty == 0);
+
 		/* pick mds, take s_mutex */
 		mds = __ceph_get_cap_mds(ci, &mseq);
 		if (session && session->s_mds != mds) {
@@ -1286,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  */
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	struct ceph_mds_client *mdsc =
+		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	int was = ci->i_dirty_caps;
 	int dirty = 0;
@@ -1324,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
 				struct ceph_mds_session *session)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int flushing;

@@ -1407,6 +1421,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
  */
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		     struct ceph_mds_session *session)
+	__releases(session->s_mutex)
 {
 	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
 	struct ceph_mds_client *mdsc = &client->mdsc;
@@ -1414,7 +1429,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	struct ceph_cap *cap;
 	int file_wanted, used;
 	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
-	int drop_session_lock = session ? 0 : 1;
 	int issued, implemented, want, retain, revoking, flushing = 0;
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
 			   to avoid an infinite loop on retry */
@@ -1639,7 +1653,7 @@ ack:
 	if (queue_invalidate)
 		ceph_queue_invalidate(inode);

-	if (session && drop_session_lock)
+	if (session)
 		mutex_unlock(&session->s_mutex);
 	if (took_snap_rwsem)
 		up_read(&mdsc->snap_rwsem);
@@ -1651,7 +1665,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
 			  unsigned *flush_tid)
 {
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int unlock_session = session ? 0 : 1;
 	int flushing = 0;
@@ -1704,10 +1718,9 @@ out_unlocked:
 static int caps_are_flushed(struct inode *inode, unsigned tid)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int dirty, i, ret = 1;
+	int i, ret = 1;

 	spin_lock(&inode->i_lock);
-	dirty = __ceph_caps_dirty(ci);
 	for (i = 0; i < CEPH_CAP_BITS; i++)
 		if ((ci->i_flushing_caps & (1 << i)) &&
 		    ci->i_cap_flush_tid[i] <= tid) {
@@ -1763,9 +1776,9 @@ out:
 	spin_unlock(&ci->i_unsafe_lock);
 }

-int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ceph_fsync(struct file *file, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	unsigned flush_tid;
 	int ret;
@@ -1817,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 		err = wait_event_interruptible(ci->i_cap_wq,
 				       caps_are_flushed(inode, flush_tid));
 	} else {
-		struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+		struct ceph_mds_client *mdsc =
+			&ceph_sb_to_client(inode->i_sb)->mdsc;

 		spin_lock(&inode->i_lock);
 		if (__ceph_caps_dirty(ci))
@@ -1854,8 +1868,8 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
 		} else {
 			pr_err("%p auth cap %p not mds%d ???\n", inode,
 			       cap, session->s_mds);
-			spin_unlock(&inode->i_lock);
 		}
+		spin_unlock(&inode->i_lock);
 	}
 }

@@ -2117,8 +2131,8 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 	}
 	spin_unlock(&inode->i_lock);

-	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
-	     last ? "last" : "");
+	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
+	     last ? " last" : "", put ? " put" : "");

 	if (last && !flushsnaps)
 		ceph_check_caps(ci, 0, NULL);
@@ -2142,7 +2156,8 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 {
 	struct inode *inode = &ci->vfs_inode;
 	int last = 0;
-	int last_snap = 0;
+	int complete_capsnap = 0;
+	int drop_capsnap = 0;
 	int found = 0;
 	struct ceph_cap_snap *capsnap = NULL;

@@ -2165,19 +2180,32 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
 			if (capsnap->context == snapc) {
 				found = 1;
-				capsnap->dirty_pages -= nr;
-				last_snap = !capsnap->dirty_pages;
 				break;
 			}
 		}
 		BUG_ON(!found);
+		capsnap->dirty_pages -= nr;
+		if (capsnap->dirty_pages == 0) {
+			complete_capsnap = 1;
+			if (capsnap->dirty == 0)
+				/* cap writeback completed before we created
+				 * the cap_snap; no FLUSHSNAP is needed */
+				drop_capsnap = 1;
+		}
 		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
-		     " snap %lld %d/%d -> %d/%d %s%s\n",
+		     " snap %lld %d/%d -> %d/%d %s%s%s\n",
 		     inode, capsnap, capsnap->context->seq,
 		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
 		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
 		     last ? " (wrbuffer last)" : "",
-		     last_snap ? " (capsnap last)" : "");
+		     complete_capsnap ? " (complete capsnap)" : "",
+		     drop_capsnap ? " (drop capsnap)" : "");
+		if (drop_capsnap) {
+			ceph_put_snap_context(capsnap->context);
+			list_del(&capsnap->ci_item);
+			list_del(&capsnap->flushing_item);
+			ceph_put_cap_snap(capsnap);
+		}
 	}

 	spin_unlock(&inode->i_lock);
@@ -2185,28 +2213,31 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 	if (last) {
 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 		iput(inode);
-	} else if (last_snap) {
+	} else if (complete_capsnap) {
 		ceph_flush_snaps(ci);
 		wake_up(&ci->i_cap_wq);
 	}
+	if (drop_capsnap)
+		iput(inode);
 }

 /*
  * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
  * actually be a revocation if it specifies a smaller cap set.)
  *
- * caller holds s_mutex.
+ * caller holds s_mutex and i_lock, we drop both.
+ *
  * return value:
  *  0 - ok
  *  1 - check_caps on auth cap only (writeback)
  *  2 - check_caps (ack revoke)
  */
-static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 			     struct ceph_mds_session *session,
 			     struct ceph_cap *cap,
 			     struct ceph_buffer *xattr_buf)
 	__releases(inode->i_lock)
-
+	__releases(session->s_mutex)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
@@ -2216,7 +2247,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	u64 size = le64_to_cpu(grant->size);
 	u64 max_size = le64_to_cpu(grant->max_size);
 	struct timespec mtime, atime, ctime;
-	int reply = 0;
+	int check_caps = 0;
 	int wake = 0;
 	int writeback = 0;
 	int revoked_rdcache = 0;
@@ -2329,11 +2360,12 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
 			writeback = 1; /* will delay ack */
 		else if (dirty & ~newcaps)
-			reply = 1;     /* initiate writeback in check_caps */
+			check_caps = 1; /* initiate writeback in check_caps */
 		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
 			 revoked_rdcache)
-			reply = 2;     /* send revoke ack in check_caps */
+			check_caps = 2; /* send revoke ack in check_caps */
 		cap->issued = newcaps;
+		cap->implemented |= newcaps;
 	} else if (cap->issued == newcaps) {
 		dout("caps unchanged: %s -> %s\n",
 		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
@@ -2346,6 +2378,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		 * pending revocation */
 		wake = 1;
 	}
+	BUG_ON(cap->issued & ~cap->implemented);

 	spin_unlock(&inode->i_lock);
 	if (writeback)
@@ -2359,7 +2392,14 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		ceph_queue_invalidate(inode);
 	if (wake)
 		wake_up(&ci->i_cap_wq);
-	return reply;
+
+	if (check_caps == 1)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+				session);
+	else if (check_caps == 2)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+	else
+		mutex_unlock(&session->s_mutex);
 }

 /*
@@ -2373,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	__releases(inode->i_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
 	unsigned seq = le32_to_cpu(m->seq);
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
@@ -2454,8 +2494,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 			break;
 		}
 		WARN_ON(capsnap->dirty_pages || capsnap->writing);
-		dout(" removing cap_snap %p follows %lld\n",
-		     capsnap, follows);
+		dout(" removing %p cap_snap %p follows %lld\n",
+		     inode, capsnap, follows);
 		ceph_put_snap_context(capsnap->context);
 		list_del(&capsnap->ci_item);
 		list_del(&capsnap->flushing_item);
@@ -2548,9 +2588,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			ci->i_cap_exporting_issued = cap->issued;
 		}
 		__ceph_remove_cap(cap);
-	} else {
-		WARN_ON(!cap);
 	}
+	/* else, we already released it */

 	spin_unlock(&inode->i_lock);
 }
@@ -2621,9 +2660,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	u64 cap_id;
 	u64 size, max_size;
 	u64 tid;
-	int check_caps = 0;
 	void *snaptrace;
-	int r;

 	dout("handle_caps from mds%d\n", mds);

@@ -2668,8 +2705,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, session,
 				  snaptrace, le32_to_cpu(h->snap_trace_len));
-		check_caps = 1; /* we may have sent a RELEASE to the old auth */
-		goto done;
+		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
+				session);
+		goto done_unlocked;
 	}

 	/* the rest require a cap */
@@ -2686,16 +2724,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	switch (op) {
 	case CEPH_CAP_OP_REVOKE:
 	case CEPH_CAP_OP_GRANT:
-		r = handle_cap_grant(inode, h, session, cap, msg->middle);
-		if (r == 1)
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
-					session);
-		else if (r == 2)
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY,
-					session);
-		break;
+		handle_cap_grant(inode, h, session, cap, msg->middle);
+		goto done_unlocked;

 	case CEPH_CAP_OP_FLUSH_ACK:
 		handle_cap_flush_ack(inode, tid, h, session, cap);
@@ -2713,9 +2743,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,

 done:
 	mutex_unlock(&session->s_mutex);
-
-	if (check_caps)
-		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
+done_unlocked:
 	if (inode)
 		iput(inode);
 	return;
@@ -2838,11 +2866,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 	struct ceph_cap *cap;
 	struct ceph_mds_request_release *rel = *p;
 	int ret = 0;
-
-	dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
-	     mds, ceph_cap_string(drop), ceph_cap_string(unless));
+	int used = 0;

 	spin_lock(&inode->i_lock);
+	used = __ceph_caps_used(ci);
+
+	dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
+	     mds, ceph_cap_string(used), ceph_cap_string(drop),
+	     ceph_cap_string(unless));
+
+	/* only drop unused caps */
+	drop &= ~used;
+
 	cap = __get_cap_for_mds(ci, mds);
 	if (cap && __cap_is_valid(cap)) {
 		if (force ||
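ceph_encode_inode_release() now masks the caps it is asked to drop against the caps currently in use, so an open file cannot have a cap it is actively relying on released out from under it. The masking is plain bit arithmetic; a worked example with illustrative values (not taken from the patch):

	/* illustrative only: which caps actually get dropped */
	int drop = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD;  /* requested */
	int used = CEPH_CAP_FILE_RD;                        /* in active use */

	drop &= ~used;	/* only CEPH_CAP_FILE_CACHE remains to be dropped */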
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef3653..2fa992eaf7da 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
  * Ceph release version
  */
 #define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 19
+#define CEPH_VERSION_MINOR 20
 #define CEPH_VERSION_PATCH 0

 #define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
  * client-facing protocol.
  */
 #define CEPH_OSD_PROTOCOL     8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL     9 /* cluster internal */
+#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   24 /* server/client */
 #define CEPH_MDSC_PROTOCOL   32 /* server/client */
@@ -53,8 +53,18 @@
 /*
  * feature bits
  */
-#define CEPH_FEATURE_SUPPORTED 0
-#define CEPH_FEATURE_REQUIRED 0
+#define CEPH_FEATURE_UID 1
+#define CEPH_FEATURE_NOSRCADDR 2
+#define CEPH_FEATURE_FLOCK 4
+
+#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
+#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR


 /*
@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_AUTH_NONE		0x1
 #define CEPH_AUTH_CEPHX		0x2

+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+

 /*********************************************
  * message layer
@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_MSG_CLIENT_SNAP            0x312
 #define CEPH_MSG_CLIENT_CAPRELEASE      0x313

+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY           48
+#define CEPH_MSG_POOLOP                 49
+
+
 /* osd */
 #define CEPH_MSG_OSD_MAP                41
 #define CEPH_MSG_OSD_OP                 42
 #define CEPH_MSG_OSD_OPREPLY            43

+/* pool operations */
+enum {
+	POOL_OP_CREATE			= 0x01,
+	POOL_OP_DELETE			= 0x02,
+	POOL_OP_AUID_CHANGE		= 0x03,
+	POOL_OP_CREATE_SNAP		= 0x11,
+	POOL_OP_DELETE_SNAP		= 0x12,
+	POOL_OP_CREATE_UNMANAGED_SNAP	= 0x21,
+	POOL_OP_DELETE_UNMANAGED_SNAP	= 0x22,
+};
+
 struct ceph_mon_request_header {
 	__le64 have_version;
 	__le16 session_mon;
@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
 	struct ceph_statfs st;
 } __attribute__ ((packed));

+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 pool;
+	__le32 op;
+	__le64 auid;
+	__le64 snapid;
+	__le32 name_len;
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 reply_code;
+	__le32 epoch;
+	char has_data;
+	char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+	__le64 snapid;
+} __attribute__ ((packed));
+
 struct ceph_osd_getmap {
 	struct ceph_mon_request_header monhdr;
 	struct ceph_fsid fsid;
@@ -212,16 +265,17 @@ extern const char *ceph_mds_state_name(int s);
  * - they also define the lock ordering by the MDS
  * - a few of these are internal to the mds
  */
-#define CEPH_LOCK_DN          1
-#define CEPH_LOCK_ISNAP       2
-#define CEPH_LOCK_IVERSION    4     /* mds internal */
-#define CEPH_LOCK_IFILE       8     /* mds internal */
-#define CEPH_LOCK_IAUTH       32
-#define CEPH_LOCK_ILINK       64
-#define CEPH_LOCK_IDFT        128   /* dir frag tree */
-#define CEPH_LOCK_INEST       256   /* mds internal */
-#define CEPH_LOCK_IXATTR      512
-#define CEPH_LOCK_INO         2048  /* immutable inode bits; not a lock */
+#define CEPH_LOCK_DVERSION    1
+#define CEPH_LOCK_DN          2
+#define CEPH_LOCK_ISNAP       16
+#define CEPH_LOCK_IVERSION    32    /* mds internal */
+#define CEPH_LOCK_IFILE       64
+#define CEPH_LOCK_IAUTH       128
+#define CEPH_LOCK_ILINK       256
+#define CEPH_LOCK_IDFT        512   /* dir frag tree */
+#define CEPH_LOCK_INEST       1024  /* mds internal */
+#define CEPH_LOCK_IXATTR      2048
+#define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */

 /* client_session ops */
 enum {
@@ -308,6 +362,7 @@ union ceph_mds_request_args {
 	struct {
 		__le32 frag;                 /* which dir fragment */
 		__le32 max_entries;          /* how many dentries to grab */
+		__le32 max_bytes;
 	} __attribute__ ((packed)) readdir;
 	struct {
 		__le32 mode;
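The per-role feature macros above expand to bare OR-lists with no surrounding parentheses, so a call site combining them with other operators should parenthesize the macro itself. A hedged sketch of a handshake check (the peer_features variable and the error path are invented for the example):

	/* sketch: confirm the peer advertises every required feature bit */
	u64 required = CEPH_FEATURE_REQUIRED_CLIENT;

	if ((peer_features & required) != required)
		return -EPROTO;	/* peer lacks a required feature */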
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c62..7503aee828ce 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
 	case CEPH_ENTITY_TYPE_OSD: return "osd";
 	case CEPH_ENTITY_TYPE_MON: return "mon";
 	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_ADMIN: return "admin";
 	case CEPH_ENTITY_TYPE_AUTH: return "auth";
 	default: return "unknown";
 	}
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
 	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
 	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
 	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";

 	case CEPH_OSD_OP_PULL: return "pull";
 	case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o)
 	}
 	return "???";
 }
+
+const char *ceph_pool_op_name(int op)
+{
+	switch (op) {
+	case POOL_OP_CREATE: return "create";
+	case POOL_OP_DELETE: return "delete";
+	case POOL_OP_AUID_CHANGE: return "auid change";
+	case POOL_OP_CREATE_SNAP: return "create snap";
+	case POOL_OP_DELETE_SNAP: return "delete snap";
+	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+	}
+	return "???";
+}
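ceph_pool_op_name() follows the same pattern as the other string tables in this file and is handy in debug output; a hypothetical use:

	dout("poolop reply op %d (%s)\n", op, ceph_pool_op_name(op));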
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 291ac288e791..f704b3b62424 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -3,6 +3,7 @@

 #include <linux/err.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <crypto/hash.h>

 #include "crypto.h"
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index e159f1415110..3be33fb066cc 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/slab.h>
4#include <linux/module.h> 5#include <linux/module.h>
5#include <linux/ctype.h> 6#include <linux/ctype.h>
6#include <linux/debugfs.h> 7#include <linux/debugfs.h>
@@ -112,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
112static int monc_show(struct seq_file *s, void *p) 113static int monc_show(struct seq_file *s, void *p)
113{ 114{
114 struct ceph_client *client = s->private; 115 struct ceph_client *client = s->private;
115 struct ceph_mon_statfs_request *req; 116 struct ceph_mon_generic_request *req;
116 struct ceph_mon_client *monc = &client->monc; 117 struct ceph_mon_client *monc = &client->monc;
117 struct rb_node *rp; 118 struct rb_node *rp;
118 119
@@ -125,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
125 if (monc->want_next_osdmap) 126 if (monc->want_next_osdmap)
126 seq_printf(s, "want next osdmap\n"); 127 seq_printf(s, "want next osdmap\n");
127 128
128 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { 129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
129 req = rb_entry(rp, struct ceph_mon_statfs_request, node); 130 __u16 op;
130 seq_printf(s, "%lld statfs\n", req->tid); 131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
131 } 137 }
132 138
133 mutex_unlock(&monc->mutex); 139 mutex_unlock(&monc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 5107384ee029..f85719310db2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -3,6 +3,7 @@
 #include <linux/spinlock.h>
 #include <linux/fs_struct.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/sched.h>

 #include "super.h"
@@ -50,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
 		return -ENOMEM;          /* oh well */

 	spin_lock(&dentry->d_lock);
-	if (dentry->d_fsdata) /* lost a race */
+	if (dentry->d_fsdata) {
+		/* lost a race */
+		kmem_cache_free(ceph_dentry_cachep, di);
 		goto out_unlock;
+	}
 	di->dentry = dentry;
 	di->lease_session = NULL;
 	dentry->d_fsdata = di;
@@ -124,7 +128,8 @@ more:
 	dentry = list_entry(p, struct dentry, d_u.d_child);
 	di = ceph_dentry(dentry);
 	while (1) {
-		dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
+		dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
+		     d_unhashed(dentry) ? "!hashed" : "hashed",
 		     parent->d_subdirs.prev, parent->d_subdirs.next);
 		if (p == &parent->d_subdirs) {
 			fi->at_end = 1;
@@ -170,11 +175,11 @@ more:
 	spin_lock(&inode->i_lock);
 	spin_lock(&dcache_lock);

+	last = dentry;
+
 	if (err < 0)
 		goto out_unlock;

-	last = dentry;
-
 	p = p->prev;
 	filp->f_pos++;

@@ -228,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
 	const int max_entries = client->mount_args->max_readdir;
+	const int max_bytes = client->mount_args->max_readdir_bytes;

 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
 	if (fi->at_end)
@@ -288,8 +294,10 @@ more:
 			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

 		/* discard old result, if any */
-		if (fi->last_readdir)
+		if (fi->last_readdir) {
 			ceph_mdsc_put_request(fi->last_readdir);
+			fi->last_readdir = NULL;
+		}

 		/* requery frag tree, as the frag topology may have changed */
 		frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
@@ -309,7 +317,8 @@ more:
 		req->r_readdir_offset = fi->next_offset;
 		req->r_args.readdir.frag = cpu_to_le32(frag);
 		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
-		req->r_num_caps = max_entries;
+		req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
+		req->r_num_caps = max_entries + 1;
 		err = ceph_mdsc_do_request(mdsc, NULL, req);
 		if (err < 0) {
 			ceph_mdsc_put_request(req);
@@ -332,7 +341,7 @@ more:
 		if (req->r_reply_info.dir_end) {
 			kfree(fi->last_name);
 			fi->last_name = NULL;
-			fi->next_offset = 0;
+			fi->next_offset = 2;
 		} else {
 			rinfo = &req->r_reply_info;
 			err = note_last_dentry(fi,
@@ -475,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 				  struct dentry *dentry, int err)
 {
-	struct ceph_client *client = ceph_client(dentry->d_sb);
+	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
 	struct inode *parent = dentry->d_parent->d_inode;

 	/* .snap dir? */
@@ -486,6 +495,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 		struct inode *inode = ceph_get_snapdir(parent);
 		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
 		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
+		BUG_ON(!d_unhashed(dentry));
 		d_add(dentry, inode);
 		err = 0;
 	}
@@ -564,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 	    !is_root_ceph_dentry(dir, dentry) &&
 	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 	    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
-		di->offset = ci->i_max_offset++;
 		spin_unlock(&dir->i_lock);
 		dout(" dir %p complete, -ENOENT\n", dir);
 		d_add(dentry, NULL);
@@ -578,7 +587,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
 	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
 	if (IS_ERR(req))
-		return ERR_PTR(PTR_ERR(req));
+		return ERR_CAST(req);
 	req->r_dentry = dget(dentry);
 	req->r_num_caps = 2;
 	/* we only need inode linkage */
@@ -876,12 +885,30 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * do_request, above).  If there is no trace, we need
 	 * to do it here.
 	 */
+
+		/* d_move screws up d_subdirs order */
+		ceph_i_clear(new_dir, CEPH_I_COMPLETE);
+
 		d_move(old_dentry, new_dentry);
+
+		/* ensure target dentry is invalidated, despite
+		   rehashing bug in vfs_rename_dir */
+		ceph_invalidate_dentry_lease(new_dentry);
 	}
 	ceph_mdsc_put_request(req);
 	return err;
 }

+/*
+ * Ensure a dentry lease will no longer revalidate.
+ */
+void ceph_invalidate_dentry_lease(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	dentry->d_time = jiffies;
+	ceph_dentry(dentry)->lease_shared_gen = 0;
+	spin_unlock(&dentry->d_lock);
+}

 /*
  * Check if dentry lease is valid.  If not, delete the lease.  Try to
@@ -959,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir = dentry->d_parent->d_inode;

-	dout("d_revalidate %p '%.*s' inode %p\n", dentry,
-	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
+	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
+	     ceph_dentry(dentry)->offset);

 	/* always trust cached snapped dentries, snapdir dentry */
 	if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1037,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;

-	if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
+	if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;

 	if (!cf->dir_info) {
@@ -1079,10 +1107,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
  * an fsync() on a dir will wait for any uncommitted directory
  * operations to commit.
  */
-static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
-			  int datasync)
+static int ceph_dir_fsync(struct file *file, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct list_head *head = &ci->i_unsafe_dirops;
 	struct ceph_mds_request *req;
@@ -1139,7 +1166,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
 	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_add_tail(&di->lru, &mdsc->dentry_lru);
 		mdsc->num_dentry++;
@@ -1152,10 +1179,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 	struct ceph_dentry_info *di = ceph_dentry(dn);
 	struct ceph_mds_client *mdsc;

-	dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
-	     dn->d_name.len, dn->d_name.name);
+	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
+	     dn->d_name.len, dn->d_name.name, di->offset);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_move_tail(&di->lru, &mdsc->dentry_lru);
 		spin_unlock(&mdsc->dentry_lru_lock);
@@ -1170,7 +1197,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_del_init(&di->lru);
 		mdsc->num_dentry--;
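Two of the readdir changes above are related: fi->next_offset restarts at 2 (not 0) when a fragment ends because directory offsets 0 and 1 are taken by the locally synthesized "." and ".." entries, and the new max_bytes field bounds reply size alongside max_entries. The offset convention, sketched as illustrative comments (not code from the patch):

	/* ceph directory stream offsets:
	 *   f_pos 0  -> "."  (synthesized by the client)
	 *   f_pos 1  -> ".." (synthesized by the client)
	 *   f_pos 2+ -> entries from MDS readdir replies,
	 *               so a fresh fragment resumes at offset 2
	 */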
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index fc68e39cbad6..4480cb1c63e7 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/exportfs.h> 3#include <linux/exportfs.h>
4#include <linux/slab.h>
4#include <asm/unaligned.h> 5#include <asm/unaligned.h>
5 6
6#include "super.h" 7#include "super.h"
@@ -92,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
92 return ERR_PTR(-ESTALE); 93 return ERR_PTR(-ESTALE);
93 94
94 dentry = d_obtain_alias(inode); 95 dentry = d_obtain_alias(inode);
95 if (!dentry) { 96 if (IS_ERR(dentry)) {
96 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", 97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
97 fh->ino, inode); 98 fh->ino, inode);
98 iput(inode); 99 iput(inode);
99 return ERR_PTR(-ENOMEM); 100 return dentry;
100 } 101 }
101 err = ceph_init_dentry(dentry); 102 err = ceph_init_dentry(dentry);
102 103
@@ -114,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
114static struct dentry *__cfh_to_dentry(struct super_block *sb, 115static struct dentry *__cfh_to_dentry(struct super_block *sb,
115 struct ceph_nfs_confh *cfh) 116 struct ceph_nfs_confh *cfh)
116{ 117{
117 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; 118 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
118 struct inode *inode; 119 struct inode *inode;
119 struct dentry *dentry; 120 struct dentry *dentry;
120 struct ceph_vino vino; 121 struct ceph_vino vino;
@@ -132,7 +133,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
132 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, 133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
133 USE_ANY_MDS); 134 USE_ANY_MDS);
134 if (IS_ERR(req)) 135 if (IS_ERR(req))
135 return ERR_PTR(PTR_ERR(req)); 136 return ERR_CAST(req);
136 137
137 req->r_ino1 = vino; 138 req->r_ino1 = vino;
138 req->r_ino2.ino = cfh->parent_ino; 139 req->r_ino2.ino = cfh->parent_ino;
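Note on the hunk above: ERR_CAST() replaces the ERR_PTR(PTR_ERR(...)) round-trip. Both forward the same encoded errno; the cast simply re-types the error pointer without decoding and re-encoding it. A minimal sketch of the two forms, reusing the names from this hunk:

    /* before: decode the errno, then re-encode it */
    if (IS_ERR(req))
            return ERR_PTR(PTR_ERR(req));

    /* after: same errno, re-typed for this function's return type */
    if (IS_ERR(req))
            return ERR_CAST(req);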
@@ -148,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
148 } 149 }
149 150
150 dentry = d_obtain_alias(inode); 151 dentry = d_obtain_alias(inode);
151 if (!dentry) { 152 if (IS_ERR(dentry)) {
152 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", 153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
153 cfh->ino, inode); 154 cfh->ino, inode);
154 iput(inode); 155 iput(inode);
155 return ERR_PTR(-ENOMEM); 156 return dentry;
156 } 157 }
157 err = ceph_init_dentry(dentry); 158 err = ceph_init_dentry(dentry);
158 if (err < 0) { 159 if (err < 0) {
@@ -201,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
201 return ERR_PTR(-ESTALE); 202 return ERR_PTR(-ESTALE);
202 203
203 dentry = d_obtain_alias(inode); 204 dentry = d_obtain_alias(inode);
204 if (!dentry) { 205 if (IS_ERR(dentry)) {
205 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
206 cfh->ino, inode); 207 cfh->ino, inode);
207 iput(inode); 208 iput(inode);
208 return ERR_PTR(-ENOMEM); 209 return dentry;
209 } 210 }
210 err = ceph_init_dentry(dentry); 211 err = ceph_init_dentry(dentry);
211 if (err < 0) { 212 if (err < 0) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5d2af8464f6a..6251a1574b94 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/sched.h> 3#include <linux/sched.h>
4#include <linux/slab.h>
4#include <linux/file.h> 5#include <linux/file.h>
5#include <linux/namei.h> 6#include <linux/namei.h>
6#include <linux/writeback.h> 7#include <linux/writeback.h>
@@ -229,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
229 /* do the open */ 230 /* do the open */
230 req = prepare_open_request(dir->i_sb, flags, mode); 231 req = prepare_open_request(dir->i_sb, flags, mode);
231 if (IS_ERR(req)) 232 if (IS_ERR(req))
232 return ERR_PTR(PTR_ERR(req)); 233 return ERR_CAST(req);
233 req->r_dentry = dget(dentry); 234 req->r_dentry = dget(dentry);
234 req->r_num_caps = 2; 235 req->r_num_caps = 2;
235 if (flags & O_CREAT) { 236 if (flags & O_CREAT) {
@@ -316,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
316/* 317/*
 317 * allocate a vector of new pages 318 * allocate a vector of new pages
318 */ 319 */
319static struct page **alloc_page_vector(int num_pages) 320struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
320{ 321{
321 struct page **pages; 322 struct page **pages;
322 int i; 323 int i;
323 324
324 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); 325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
325 if (!pages) 326 if (!pages)
326 return ERR_PTR(-ENOMEM); 327 return ERR_PTR(-ENOMEM);
327 for (i = 0; i < num_pages; i++) { 328 for (i = 0; i < num_pages; i++) {
328 pages[i] = alloc_page(GFP_NOFS); 329 pages[i] = __page_cache_alloc(flags);
329 if (pages[i] == NULL) { 330 if (pages[i] == NULL) {
330 ceph_release_page_vector(pages, i); 331 ceph_release_page_vector(pages, i);
331 return ERR_PTR(-ENOMEM); 332 return ERR_PTR(-ENOMEM);
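Note on the hunk above: the allocator is renamed ceph_alloc_page_vector(), made non-static, and given a caller-supplied gfp_t so other callers can pick their own allocation context; __page_cache_alloc() also honours page-cache NUMA policy where a bare alloc_page() does not. The patched function, consolidated here for readability:

    #include <linux/err.h>
    #include <linux/pagemap.h>
    #include <linux/slab.h>

    struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
    {
            struct page **pages;
            int i;

            pages = kmalloc(sizeof(*pages) * num_pages, flags);
            if (!pages)
                    return ERR_PTR(-ENOMEM);
            for (i = 0; i < num_pages; i++) {
                    pages[i] = __page_cache_alloc(flags);
                    if (pages[i] == NULL) {
                            /* free the pages allocated so far, then the vector */
                            ceph_release_page_vector(pages, i);
                            return ERR_PTR(-ENOMEM);
                    }
            }
            return pages;
    }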
@@ -539,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
539 * in sequence. 540 * in sequence.
540 */ 541 */
541 } else { 542 } else {
542 pages = alloc_page_vector(num_pages); 543 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
543 } 544 }
544 if (IS_ERR(pages)) 545 if (IS_ERR(pages))
545 return PTR_ERR(pages); 546 return PTR_ERR(pages);
@@ -648,8 +649,8 @@ more:
648 do_sync, 649 do_sync,
649 ci->i_truncate_seq, ci->i_truncate_size, 650 ci->i_truncate_seq, ci->i_truncate_size,
650 &mtime, false, 2); 651 &mtime, false, 2);
651 if (IS_ERR(req)) 652 if (!req)
652 return PTR_ERR(req); 653 return -ENOMEM;
653 654
654 num_pages = calc_pages_for(pos, len); 655 num_pages = calc_pages_for(pos, len);
655 656
@@ -664,9 +665,10 @@ more:
664 * throw out any page cache pages in this range. this 665 * throw out any page cache pages in this range. this
665 * may block. 666 * may block.
666 */ 667 */
667 truncate_inode_pages_range(inode->i_mapping, pos, pos+len); 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1));
668 } else { 670 } else {
669 pages = alloc_page_vector(num_pages); 671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
670 if (IS_ERR(pages)) { 672 if (IS_ERR(pages)) {
671 ret = PTR_ERR(pages); 673 ret = PTR_ERR(pages);
672 goto out; 674 goto out;
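Note on the truncate hunk above: truncate_inode_pages_range() takes an inclusive byte range whose end is expected to fall on the last byte of a page, so the old exclusive pos+len could leave the final partial page in the cache. ORing with PAGE_CACHE_SIZE-1 rounds the end up, assuming the page size is a power of two. A tiny sketch of the arithmetic with 4 KiB pages:

    #define PAGE_CACHE_SIZE 4096UL  /* sketch value; power of two assumed */

    static unsigned long inclusive_page_end(unsigned long pos, unsigned long len)
    {
            /* last byte of the page containing offset pos + len */
            return (pos + len) | (PAGE_CACHE_SIZE - 1);
    }

    /* inclusive_page_end(0, 5000) == 8191, i.e. the final byte of page 1 */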
@@ -807,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
807 struct file *file = iocb->ki_filp; 809 struct file *file = iocb->ki_filp;
808 struct inode *inode = file->f_dentry->d_inode; 810 struct inode *inode = file->f_dentry->d_inode;
809 struct ceph_inode_info *ci = ceph_inode(inode); 811 struct ceph_inode_info *ci = ceph_inode(inode);
810 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 812 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
811 loff_t endoff = pos + iov->iov_len; 813 loff_t endoff = pos + iov->iov_len;
812 int got = 0; 814 int got = 0;
813 int ret, err; 815 int ret, err;
@@ -842,8 +844,7 @@ retry_snap:
842 if ((ret >= 0 || ret == -EIOCBQUEUED) && 844 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
843 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
844 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
845 err = vfs_fsync_range(file, file->f_path.dentry, 847 err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
846 pos, pos + ret - 1, 1);
847 if (err < 0) 848 if (err < 0)
848 ret = err; 849 ret = err;
849 } 850 }
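Note on the fsync hunk above: the call is updated to a vfs_fsync_range() that no longer takes a dentry argument, presumably tracking a VFS API change of the period; start/end are an inclusive byte range and the final 1 requests datasync behaviour. Sketch of the updated call:

    err = vfs_fsync_range(file, pos, pos + ret - 1, 1 /* datasync */);
    if (err < 0)
            ret = err;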
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7abe1aed819b..226f5a50d362 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
69 69
70 BUG_ON(!S_ISDIR(parent->i_mode)); 70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode)) 71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode)); 72 return inode;
73 inode->i_mode = parent->i_mode; 73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid; 74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid; 75 inode->i_gid = parent->i_gid;
@@ -378,6 +378,22 @@ void ceph_destroy_inode(struct inode *inode)
378 378
379 ceph_queue_caps_release(inode); 379 ceph_queue_caps_release(inode);
380 380
381 /*
382 * we may still have a snap_realm reference if there are stray
383 * caps in i_cap_exporting_issued or i_snap_caps.
384 */
385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc =
387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389
390 dout(" dropping residual ref to snap realm %p\n", realm);
391 spin_lock(&realm->inodes_with_caps_lock);
392 list_del_init(&ci->i_snap_realm_item);
393 spin_unlock(&realm->inodes_with_caps_lock);
394 ceph_put_snap_realm(mdsc, realm);
395 }
396
381 kfree(ci->i_symlink); 397 kfree(ci->i_symlink);
382 while ((n = rb_first(&ci->i_fragtree)) != NULL) { 398 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
383 frag = rb_entry(n, struct ceph_inode_frag, node); 399 frag = rb_entry(n, struct ceph_inode_frag, node);
@@ -603,11 +619,12 @@ static int fill_inode(struct inode *inode,
603 memcpy(ci->i_xattrs.blob->vec.iov_base, 619 memcpy(ci->i_xattrs.blob->vec.iov_base,
604 iinfo->xattr_data, iinfo->xattr_len); 620 iinfo->xattr_data, iinfo->xattr_len);
605 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 xattr_blob = NULL;
606 } 623 }
607 624
608 inode->i_mapping->a_ops = &ceph_aops; 625 inode->i_mapping->a_ops = &ceph_aops;
609 inode->i_mapping->backing_dev_info = 626 inode->i_mapping->backing_dev_info =
610 &ceph_client(inode->i_sb)->backing_dev_info; 627 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
611 628
612 switch (inode->i_mode & S_IFMT) { 629 switch (inode->i_mode & S_IFMT) {
613 case S_IFIFO: 630 case S_IFIFO:
@@ -658,14 +675,15 @@ static int fill_inode(struct inode *inode,
658 /* set dir completion flag? */ 675 /* set dir completion flag? */
659 if (ci->i_files == 0 && ci->i_subdirs == 0 && 676 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
660 ceph_snap(inode) == CEPH_NOSNAP && 677 ceph_snap(inode) == CEPH_NOSNAP &&
661 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { 678 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
679 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
662 dout(" marking %p complete (empty)\n", inode); 680 dout(" marking %p complete (empty)\n", inode);
663 ci->i_ceph_flags |= CEPH_I_COMPLETE; 681 ci->i_ceph_flags |= CEPH_I_COMPLETE;
664 ci->i_max_offset = 2; 682 ci->i_max_offset = 2;
665 } 683 }
666 684
667 /* it may be better to set st_size in getattr instead? */ 685 /* it may be better to set st_size in getattr instead? */
668 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) 686 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
669 inode->i_size = ci->i_rbytes; 687 inode->i_size = ci->i_rbytes;
670 break; 688 break;
671 default: 689 default:
@@ -717,6 +735,10 @@ no_change:
717 __ceph_get_fmode(ci, cap_fmode); 735 __ceph_get_fmode(ci, cap_fmode);
718 spin_unlock(&inode->i_lock); 736 spin_unlock(&inode->i_lock);
719 } 737 }
738 } else if (cap_fmode >= 0) {
739 pr_warning("mds issued no caps on %llx.%llx\n",
740 ceph_vinop(inode));
741 __ceph_get_fmode(ci, cap_fmode);
720 } 742 }
721 743
722 /* update delegation info? */ 744 /* update delegation info? */
@@ -782,6 +804,37 @@ out_unlock:
782} 804}
783 805
784/* 806/*
807 * Set dentry's directory position based on the current dir's max, and
808 * order it in d_subdirs, so that dcache_readdir behaves.
809 */
810static void ceph_set_dentry_offset(struct dentry *dn)
811{
812 struct dentry *dir = dn->d_parent;
813 struct inode *inode = dn->d_parent->d_inode;
814 struct ceph_dentry_info *di;
815
816 BUG_ON(!inode);
817
818 di = ceph_dentry(dn);
819
820 spin_lock(&inode->i_lock);
821 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
822 spin_unlock(&inode->i_lock);
823 return;
824 }
825 di->offset = ceph_inode(inode)->i_max_offset++;
826 spin_unlock(&inode->i_lock);
827
828 spin_lock(&dcache_lock);
829 spin_lock(&dn->d_lock);
830 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
831 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
832 dn->d_u.d_child.prev, dn->d_u.d_child.next);
833 spin_unlock(&dn->d_lock);
834 spin_unlock(&dcache_lock);
835}
836
837/*
785 * splice a dentry to an inode. 838 * splice a dentry to an inode.
786 * caller must hold directory i_mutex for this to be safe. 839 * caller must hold directory i_mutex for this to be safe.
787 * 840 *
@@ -794,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
794{ 847{
795 struct dentry *realdn; 848 struct dentry *realdn;
796 849
850 BUG_ON(dn->d_inode);
851
797 /* dn must be unhashed */ 852 /* dn must be unhashed */
798 if (!d_unhashed(dn)) 853 if (!d_unhashed(dn))
799 d_drop(dn); 854 d_drop(dn);
@@ -815,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
815 dn = realdn; 870 dn = realdn;
816 } else { 871 } else {
817 BUG_ON(!ceph_dentry(dn)); 872 BUG_ON(!ceph_dentry(dn));
818
819 dout("dn %p attached to %p ino %llx.%llx\n", 873 dout("dn %p attached to %p ino %llx.%llx\n",
820 dn, dn->d_inode, ceph_vinop(dn->d_inode)); 874 dn, dn->d_inode, ceph_vinop(dn->d_inode));
821 } 875 }
822 if ((!prehash || *prehash) && d_unhashed(dn)) 876 if ((!prehash || *prehash) && d_unhashed(dn))
823 d_rehash(dn); 877 d_rehash(dn);
878 ceph_set_dentry_offset(dn);
824out: 879out:
825 return dn; 880 return dn;
826} 881}
827 882
828/* 883/*
829 * Set dentry's directory position based on the current dir's max, and
830 * order it in d_subdirs, so that dcache_readdir behaves.
831 */
832static void ceph_set_dentry_offset(struct dentry *dn)
833{
834 struct dentry *dir = dn->d_parent;
835 struct inode *inode = dn->d_parent->d_inode;
836 struct ceph_dentry_info *di;
837
838 BUG_ON(!inode);
839
840 di = ceph_dentry(dn);
841
842 spin_lock(&inode->i_lock);
843 di->offset = ceph_inode(inode)->i_max_offset++;
844 spin_unlock(&inode->i_lock);
845
846 spin_lock(&dcache_lock);
847 spin_lock(&dn->d_lock);
848 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
849 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
850 dn->d_u.d_child.prev, dn->d_u.d_child.next);
851 spin_unlock(&dn->d_lock);
852 spin_unlock(&dcache_lock);
853}
854
855/*
856 * Incorporate results into the local cache. This is either just 884 * Incorporate results into the local cache. This is either just
857 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 885 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
858 * after a lookup). 886 * after a lookup).
@@ -870,6 +898,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
870 struct inode *in = NULL; 898 struct inode *in = NULL;
871 struct ceph_mds_reply_inode *ininfo; 899 struct ceph_mds_reply_inode *ininfo;
872 struct ceph_vino vino; 900 struct ceph_vino vino;
901 struct ceph_client *client = ceph_sb_to_client(sb);
873 int i = 0; 902 int i = 0;
874 int err = 0; 903 int err = 0;
875 904
@@ -912,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
912 941
913 if (!rinfo->head->is_target && !rinfo->head->is_dentry) { 942 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
914 dout("fill_trace reply is empty!\n"); 943 dout("fill_trace reply is empty!\n");
915 if (rinfo->head->result == 0 && req->r_locked_dir) { 944 if (rinfo->head->result == 0 && req->r_locked_dir)
916 struct ceph_inode_info *ci = 945 ceph_invalidate_dir_request(req);
917 ceph_inode(req->r_locked_dir);
918 dout(" clearing %p complete (empty trace)\n",
919 req->r_locked_dir);
920 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
921 ci->i_release_count++;
922 }
923 return 0; 946 return 0;
924 } 947 }
925 948
@@ -933,7 +956,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
933 return err; 956 return err;
934 } 957 }
935 958
936 if (rinfo->head->is_dentry && !req->r_aborted) { 959 /*
960 * ignore null lease/binding on snapdir ENOENT, or else we
961 * will have trouble splicing in the virtual snapdir later
962 */
963 if (rinfo->head->is_dentry && !req->r_aborted &&
964 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
965 client->mount_args->snapdir_name,
966 req->r_dentry->d_name.len))) {
937 /* 967 /*
938 * lookup link rename : null -> possibly existing inode 968 * lookup link rename : null -> possibly existing inode
939 * mknod symlink mkdir : null -> new inode 969 * mknod symlink mkdir : null -> new inode
@@ -973,19 +1003,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
973 dn, dn->d_name.len, dn->d_name.name); 1003 dn, dn->d_name.len, dn->d_name.name);
974 dout("fill_trace doing d_move %p -> %p\n", 1004 dout("fill_trace doing d_move %p -> %p\n",
975 req->r_old_dentry, dn); 1005 req->r_old_dentry, dn);
1006
1007 /* d_move screws up d_subdirs order */
1008 ceph_i_clear(dir, CEPH_I_COMPLETE);
1009
976 d_move(req->r_old_dentry, dn); 1010 d_move(req->r_old_dentry, dn);
977 dout(" src %p '%.*s' dst %p '%.*s'\n", 1011 dout(" src %p '%.*s' dst %p '%.*s'\n",
978 req->r_old_dentry, 1012 req->r_old_dentry,
979 req->r_old_dentry->d_name.len, 1013 req->r_old_dentry->d_name.len,
980 req->r_old_dentry->d_name.name, 1014 req->r_old_dentry->d_name.name,
981 dn, dn->d_name.len, dn->d_name.name); 1015 dn, dn->d_name.len, dn->d_name.name);
1016
982 /* ensure target dentry is invalidated, despite 1017 /* ensure target dentry is invalidated, despite
983 rehashing bug in vfs_rename_dir */ 1018 rehashing bug in vfs_rename_dir */
984 dn->d_time = jiffies; 1019 ceph_invalidate_dentry_lease(dn);
985 ceph_dentry(dn)->lease_shared_gen = 0; 1020
986 /* take overwritten dentry's readdir offset */ 1021 /* take overwritten dentry's readdir offset */
1022 dout("dn %p gets %p offset %lld (old offset %lld)\n",
1023 req->r_old_dentry, dn, ceph_dentry(dn)->offset,
1024 ceph_dentry(req->r_old_dentry)->offset);
987 ceph_dentry(req->r_old_dentry)->offset = 1025 ceph_dentry(req->r_old_dentry)->offset =
988 ceph_dentry(dn)->offset; 1026 ceph_dentry(dn)->offset;
1027
989 dn = req->r_old_dentry; /* use old_dentry */ 1028 dn = req->r_old_dentry; /* use old_dentry */
990 in = dn->d_inode; 1029 in = dn->d_inode;
991 } 1030 }
@@ -1027,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1027 goto done; 1066 goto done;
1028 } 1067 }
1029 req->r_dentry = dn; /* may have spliced */ 1068 req->r_dentry = dn; /* may have spliced */
1030 ceph_set_dentry_offset(dn);
1031 igrab(in); 1069 igrab(in);
1032 } else if (ceph_ino(in) == vino.ino && 1070 } else if (ceph_ino(in) == vino.ino &&
1033 ceph_snap(in) == vino.snap) { 1071 ceph_snap(in) == vino.snap) {
@@ -1070,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1070 err = PTR_ERR(dn); 1108 err = PTR_ERR(dn);
1071 goto done; 1109 goto done;
1072 } 1110 }
1073 ceph_set_dentry_offset(dn);
1074 req->r_dentry = dn; /* may have spliced */ 1111 req->r_dentry = dn; /* may have spliced */
1075 igrab(in); 1112 igrab(in);
1076 rinfo->head->is_dentry = 1; /* fool notrace handlers */ 1113 rinfo->head->is_dentry = 1; /* fool notrace handlers */
@@ -1397,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1397{ 1434{
1398 struct ceph_inode_info *ci = ceph_inode(inode); 1435 struct ceph_inode_info *ci = ceph_inode(inode);
1399 1436
1400 if (queue_work(ceph_client(inode->i_sb)->trunc_wq, 1437 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1401 &ci->i_vmtruncate_work)) { 1438 &ci->i_vmtruncate_work)) {
1402 dout("ceph_queue_vmtruncate %p\n", inode); 1439 dout("ceph_queue_vmtruncate %p\n", inode);
1403 igrab(inode); 1440 igrab(inode);
@@ -1486,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1486 struct inode *parent_inode = dentry->d_parent->d_inode; 1523 struct inode *parent_inode = dentry->d_parent->d_inode;
1487 const unsigned int ia_valid = attr->ia_valid; 1524 const unsigned int ia_valid = attr->ia_valid;
1488 struct ceph_mds_request *req; 1525 struct ceph_mds_request *req;
1489 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; 1526 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
1490 int issued; 1527 int issued;
1491 int release = 0, dirtied = 0; 1528 int release = 0, dirtied = 0;
1492 int mask = 0; 1529 int mask = 0;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae62846..d085f07756b4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
102 u64 len = 1, olen; 102 u64 len = 1, olen;
103 u64 tmp; 103 u64 tmp;
104 struct ceph_object_layout ol; 104 struct ceph_object_layout ol;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a2600101ec22..b49f12822cbc 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/wait.h> 3#include <linux/wait.h>
4#include <linux/slab.h>
4#include <linux/sched.h> 5#include <linux/sched.h>
5 6
6#include "mds_client.h" 7#include "mds_client.h"
@@ -39,7 +40,7 @@
39static void __wake_requests(struct ceph_mds_client *mdsc, 40static void __wake_requests(struct ceph_mds_client *mdsc,
40 struct list_head *head); 41 struct list_head *head);
41 42
42const static struct ceph_connection_operations mds_con_ops; 43static const struct ceph_connection_operations mds_con_ops;
43 44
44 45
45/* 46/*
@@ -328,6 +329,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
328 struct ceph_mds_session *s; 329 struct ceph_mds_session *s;
329 330
330 s = kzalloc(sizeof(*s), GFP_NOFS); 331 s = kzalloc(sizeof(*s), GFP_NOFS);
332 if (!s)
333 return ERR_PTR(-ENOMEM);
331 s->s_mdsc = mdsc; 334 s->s_mdsc = mdsc;
332 s->s_mds = mds; 335 s->s_mds = mds;
333 s->s_state = CEPH_MDS_SESSION_NEW; 336 s->s_state = CEPH_MDS_SESSION_NEW;
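Note on the hunk above: kzalloc() returns NULL on failure and the old code dereferenced s unconditionally. Returning ERR_PTR(-ENOMEM) lets the __do_request() hunk further down propagate the failure with PTR_ERR(). The producer/consumer pair, sketched:

    s = kzalloc(sizeof(*s), GFP_NOFS);
    if (!s)
            return ERR_PTR(-ENOMEM);        /* callee side */

    session = register_session(mdsc, mds);
    if (IS_ERR(session)) {                  /* caller side */
            err = PTR_ERR(session);
            goto finish;
    }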
@@ -529,7 +532,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
529{ 532{
530 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 533 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
531 rb_erase(&req->r_node, &mdsc->request_tree); 534 rb_erase(&req->r_node, &mdsc->request_tree);
532 ceph_mdsc_put_request(req); 535 RB_CLEAR_NODE(&req->r_node);
533 536
534 if (req->r_unsafe_dir) { 537 if (req->r_unsafe_dir) {
535 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); 538 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -538,6 +541,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
538 list_del_init(&req->r_unsafe_dir_item); 541 list_del_init(&req->r_unsafe_dir_item);
539 spin_unlock(&ci->i_unsafe_lock); 542 spin_unlock(&ci->i_unsafe_lock);
540 } 543 }
544
545 ceph_mdsc_put_request(req);
541} 546}
542 547
543/* 548/*
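Note on the __unregister_request hunks above: two things change. RB_CLEAR_NODE() marks the erased node so that RB_EMPTY_NODE() can later detect a request that already left the tree (the reworked wait_unsafe_requests() near the end of this patch depends on this), and the final ceph_mdsc_put_request() moves after the unsafe-dir cleanup so the request cannot be freed while its fields are still in use. The marker idiom, sketched generically:

    #include <linux/rbtree.h>
    #include <linux/types.h>

    static void unhook(struct rb_root *tree, struct rb_node *node)
    {
            rb_erase(node, tree);
            RB_CLEAR_NODE(node);            /* node now reads as "in no tree" */
    }

    /* later, under the same lock that guards the tree: */
    static bool still_linked(struct rb_node *node)
    {
            return !RB_EMPTY_NODE(node);    /* false once unhook() has run */
    }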
@@ -660,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
660 struct ceph_msg *msg; 665 struct ceph_msg *msg;
661 struct ceph_mds_session_head *h; 666 struct ceph_mds_session_head *h;
662 667
663 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); 668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
664 if (IS_ERR(msg)) { 669 if (!msg) {
665 pr_err("create_session_msg ENOMEM creating msg\n"); 670 pr_err("create_session_msg ENOMEM creating msg\n");
666 return ERR_PTR(PTR_ERR(msg)); 671 return NULL;
667 } 672 }
668 h = msg->front.iov_base; 673 h = msg->front.iov_base;
669 h->op = cpu_to_le32(op); 674 h->op = cpu_to_le32(op);
@@ -682,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
682 struct ceph_msg *msg; 687 struct ceph_msg *msg;
683 int mstate; 688 int mstate;
684 int mds = session->s_mds; 689 int mds = session->s_mds;
685 int err = 0;
686 690
687 /* wait for mds to go active? */ 691 /* wait for mds to go active? */
688 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); 692 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -693,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
693 697
694 /* send connect message */ 698 /* send connect message */
695 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 699 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
696 if (IS_ERR(msg)) { 700 if (!msg)
697 err = PTR_ERR(msg); 701 return -ENOMEM;
698 goto out;
699 }
700 ceph_con_send(&session->s_con, msg); 702 ceph_con_send(&session->s_con, msg);
701
702out:
703 return 0; 703 return 0;
704} 704}
705 705
@@ -731,9 +731,10 @@ static void cleanup_cap_releases(struct ceph_mds_session *session)
731} 731}
732 732
733/* 733/*
734 * Helper to safely iterate over all caps associated with a session. 734 * Helper to safely iterate over all caps associated with a session, with
735 * special care taken to handle a racing __ceph_remove_cap().
735 * 736 *
736 * caller must hold session s_mutex 737 * Caller must hold session s_mutex.
737 */ 738 */
738static int iterate_session_caps(struct ceph_mds_session *session, 739static int iterate_session_caps(struct ceph_mds_session *session,
739 int (*cb)(struct inode *, struct ceph_cap *, 740 int (*cb)(struct inode *, struct ceph_cap *,
@@ -798,12 +799,49 @@ out:
798} 799}
799 800
800static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 801static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
801 void *arg) 802 void *arg)
802{ 803{
803 struct ceph_inode_info *ci = ceph_inode(inode); 804 struct ceph_inode_info *ci = ceph_inode(inode);
805 int drop = 0;
806
804 dout("removing cap %p, ci is %p, inode is %p\n", 807 dout("removing cap %p, ci is %p, inode is %p\n",
805 cap, ci, &ci->vfs_inode); 808 cap, ci, &ci->vfs_inode);
806 ceph_remove_cap(cap); 809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 if (!__ceph_is_any_real_caps(ci)) {
812 struct ceph_mds_client *mdsc =
813 &ceph_sb_to_client(inode->i_sb)->mdsc;
814
815 spin_lock(&mdsc->cap_dirty_lock);
816 if (!list_empty(&ci->i_dirty_item)) {
817 pr_info(" dropping dirty %s state for %p %lld\n",
818 ceph_cap_string(ci->i_dirty_caps),
819 inode, ceph_ino(inode));
820 ci->i_dirty_caps = 0;
821 list_del_init(&ci->i_dirty_item);
822 drop = 1;
823 }
824 if (!list_empty(&ci->i_flushing_item)) {
825 pr_info(" dropping dirty+flushing %s state for %p %lld\n",
826 ceph_cap_string(ci->i_flushing_caps),
827 inode, ceph_ino(inode));
828 ci->i_flushing_caps = 0;
829 list_del_init(&ci->i_flushing_item);
830 mdsc->num_cap_flushing--;
831 drop = 1;
832 }
833 if (drop && ci->i_wrbuffer_ref) {
834 pr_info(" dropping dirty data for %p %lld\n",
835 inode, ceph_ino(inode));
836 ci->i_wrbuffer_ref = 0;
837 ci->i_wrbuffer_ref_head = 0;
838 drop++;
839 }
840 spin_unlock(&mdsc->cap_dirty_lock);
841 }
842 spin_unlock(&inode->i_lock);
843 while (drop--)
844 iput(inode);
807 return 0; 845 return 0;
808} 846}
809 847
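Note on the remove_session_caps_cb hunk above: the callback now removes the cap itself under i_lock and, when the last real cap goes away, strips any dirty/flushing state that would otherwise pin the inode after its session is gone. Each stripped item held an inode reference, so the drops are counted while the spinlocks are held and the matching iput() calls, which can sleep, run only after both locks are released. The shape of that pattern, sketched with a hypothetical predicate standing in for the dirty/flushing checks:

    static void strip_inode_state(struct inode *inode)
    {
            int drop = 0;

            spin_lock(&inode->i_lock);
            if (state_to_strip(inode))      /* hypothetical predicate */
                    drop++;                 /* one per reference being stripped */
            spin_unlock(&inode->i_lock);

            while (drop--)
                    iput(inode);            /* may sleep: must not hold i_lock */
    }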
@@ -815,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
815 dout("remove_session_caps on %p\n", session); 853 dout("remove_session_caps on %p\n", session);
816 iterate_session_caps(session, remove_session_caps_cb, NULL); 854 iterate_session_caps(session, remove_session_caps_cb, NULL);
817 BUG_ON(session->s_nr_caps > 0); 855 BUG_ON(session->s_nr_caps > 0);
856 BUG_ON(!list_empty(&session->s_cap_flushing));
818 cleanup_cap_releases(session); 857 cleanup_cap_releases(session);
819} 858}
820 859
@@ -862,6 +901,7 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
862 if (time_after_eq(jiffies, session->s_cap_ttl) && 901 if (time_after_eq(jiffies, session->s_cap_ttl) &&
863 time_after_eq(session->s_cap_ttl, session->s_renew_requested)) 902 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
864 pr_info("mds%d caps stale\n", session->s_mds); 903 pr_info("mds%d caps stale\n", session->s_mds);
904 session->s_renew_requested = jiffies;
865 905
866 /* do not try to renew caps until a recovering mds has reconnected 906 /* do not try to renew caps until a recovering mds has reconnected
867 * with its clients. */ 907 * with its clients. */
@@ -874,11 +914,10 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
874 914
875 dout("send_renew_caps to mds%d (%s)\n", session->s_mds, 915 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
876 ceph_mds_state_name(state)); 916 ceph_mds_state_name(state));
877 session->s_renew_requested = jiffies;
878 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, 917 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
879 ++session->s_renew_seq); 918 ++session->s_renew_seq);
880 if (IS_ERR(msg)) 919 if (!msg)
881 return PTR_ERR(msg); 920 return -ENOMEM;
882 ceph_con_send(&session->s_con, msg); 921 ceph_con_send(&session->s_con, msg);
883 return 0; 922 return 0;
884} 923}
@@ -925,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
925 struct ceph_mds_session *session) 964 struct ceph_mds_session *session)
926{ 965{
927 struct ceph_msg *msg; 966 struct ceph_msg *msg;
928 int err = 0;
929 967
930 dout("request_close_session mds%d state %s seq %lld\n", 968 dout("request_close_session mds%d state %s seq %lld\n",
931 session->s_mds, session_state_name(session->s_state), 969 session->s_mds, session_state_name(session->s_state),
932 session->s_seq); 970 session->s_seq);
933 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); 971 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
934 if (IS_ERR(msg)) 972 if (!msg)
935 err = PTR_ERR(msg); 973 return -ENOMEM;
936 else 974 ceph_con_send(&session->s_con, msg);
937 ceph_con_send(&session->s_con, msg); 975 return 0;
938 return err;
939} 976}
940 977
941/* 978/*
@@ -1053,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
1053 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1090 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1054 spin_unlock(&session->s_cap_lock); 1091 spin_unlock(&session->s_cap_lock);
1055 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1092 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1056 0, 0, NULL); 1093 GFP_NOFS);
1057 if (!msg) 1094 if (!msg)
1058 goto out_unlocked; 1095 goto out_unlocked;
1059 dout("add_cap_releases %p msg %p now %d\n", session, msg, 1096 dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1145,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1145 struct ceph_msg *msg; 1182 struct ceph_msg *msg;
1146 1183
1147 dout("send_cap_releases mds%d\n", session->s_mds); 1184 dout("send_cap_releases mds%d\n", session->s_mds);
1148 while (1) { 1185 spin_lock(&session->s_cap_lock);
1149 spin_lock(&session->s_cap_lock); 1186 while (!list_empty(&session->s_cap_releases_done)) {
1150 if (list_empty(&session->s_cap_releases_done))
1151 break;
1152 msg = list_first_entry(&session->s_cap_releases_done, 1187 msg = list_first_entry(&session->s_cap_releases_done,
1153 struct ceph_msg, list_head); 1188 struct ceph_msg, list_head);
1154 list_del_init(&msg->list_head); 1189 list_del_init(&msg->list_head);
@@ -1156,7 +1191,46 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1156 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1191 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1157 dout("send_cap_releases mds%d %p\n", session->s_mds, msg); 1192 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1158 ceph_con_send(&session->s_con, msg); 1193 ceph_con_send(&session->s_con, msg);
1194 spin_lock(&session->s_cap_lock);
1195 }
1196 spin_unlock(&session->s_cap_lock);
1197}
1198
1199static void discard_cap_releases(struct ceph_mds_client *mdsc,
1200 struct ceph_mds_session *session)
1201{
1202 struct ceph_msg *msg;
1203 struct ceph_mds_cap_release *head;
1204 unsigned num;
1205
1206 dout("discard_cap_releases mds%d\n", session->s_mds);
1207 spin_lock(&session->s_cap_lock);
1208
1209 /* zero out the in-progress message */
1210 msg = list_first_entry(&session->s_cap_releases,
1211 struct ceph_msg, list_head);
1212 head = msg->front.iov_base;
1213 num = le32_to_cpu(head->num);
1214 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
1215 head->num = cpu_to_le32(0);
1216 session->s_num_cap_releases += num;
1217
1218 /* requeue completed messages */
1219 while (!list_empty(&session->s_cap_releases_done)) {
1220 msg = list_first_entry(&session->s_cap_releases_done,
1221 struct ceph_msg, list_head);
1222 list_del_init(&msg->list_head);
1223
1224 head = msg->front.iov_base;
1225 num = le32_to_cpu(head->num);
1226 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
1227 num);
1228 session->s_num_cap_releases += num;
1229 head->num = cpu_to_le32(0);
1230 msg->front.iov_len = sizeof(*head);
1231 list_add(&msg->list_head, &session->s_cap_releases);
1159 } 1232 }
1233
1160 spin_unlock(&session->s_cap_lock); 1234 spin_unlock(&session->s_cap_lock);
1161} 1235}
1162 1236
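Note on the hunk above: send_cap_releases() becomes the standard list-drain idiom, taking s_cap_lock once, popping the head entry, dropping the lock around the potentially blocking ceph_con_send(), then retaking it until the list is empty; the new discard_cap_releases() walks the same lists but zeroes the messages and requeues them instead of sending. The drain idiom in isolation:

    spin_lock(&session->s_cap_lock);
    while (!list_empty(&session->s_cap_releases_done)) {
            msg = list_first_entry(&session->s_cap_releases_done,
                                   struct ceph_msg, list_head);
            list_del_init(&msg->list_head);
            spin_unlock(&session->s_cap_lock);      /* sending may block */
            ceph_con_send(&session->s_con, msg);
            spin_lock(&session->s_cap_lock);
    }
    spin_unlock(&session->s_cap_lock);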
@@ -1175,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1175 if (!req) 1249 if (!req)
1176 return ERR_PTR(-ENOMEM); 1250 return ERR_PTR(-ENOMEM);
1177 1251
1252 mutex_init(&req->r_fill_mutex);
1178 req->r_started = jiffies; 1253 req->r_started = jiffies;
1179 req->r_resend_mds = -1; 1254 req->r_resend_mds = -1;
1180 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1255 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1245,7 +1320,7 @@ retry:
1245 len += 1 + temp->d_name.len; 1320 len += 1 + temp->d_name.len;
1246 temp = temp->d_parent; 1321 temp = temp->d_parent;
1247 if (temp == NULL) { 1322 if (temp == NULL) {
1248 pr_err("build_path_dentry corrupt dentry %p\n", dentry); 1323 pr_err("build_path corrupt dentry %p\n", dentry);
1249 return ERR_PTR(-EINVAL); 1324 return ERR_PTR(-EINVAL);
1250 } 1325 }
1251 } 1326 }
@@ -1261,7 +1336,7 @@ retry:
1261 struct inode *inode = temp->d_inode; 1336 struct inode *inode = temp->d_inode;
1262 1337
1263 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { 1338 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1264 dout("build_path_dentry path+%d: %p SNAPDIR\n", 1339 dout("build_path path+%d: %p SNAPDIR\n",
1265 pos, temp); 1340 pos, temp);
1266 } else if (stop_on_nosnap && inode && 1341 } else if (stop_on_nosnap && inode &&
1267 ceph_snap(inode) == CEPH_NOSNAP) { 1342 ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1272,20 +1347,18 @@ retry:
1272 break; 1347 break;
1273 strncpy(path + pos, temp->d_name.name, 1348 strncpy(path + pos, temp->d_name.name,
1274 temp->d_name.len); 1349 temp->d_name.len);
1275 dout("build_path_dentry path+%d: %p '%.*s'\n",
1276 pos, temp, temp->d_name.len, path + pos);
1277 } 1350 }
1278 if (pos) 1351 if (pos)
1279 path[--pos] = '/'; 1352 path[--pos] = '/';
1280 temp = temp->d_parent; 1353 temp = temp->d_parent;
1281 if (temp == NULL) { 1354 if (temp == NULL) {
1282 pr_err("build_path_dentry corrupt dentry\n"); 1355 pr_err("build_path corrupt dentry\n");
1283 kfree(path); 1356 kfree(path);
1284 return ERR_PTR(-EINVAL); 1357 return ERR_PTR(-EINVAL);
1285 } 1358 }
1286 } 1359 }
1287 if (pos != 0) { 1360 if (pos != 0) {
1288 pr_err("build_path_dentry did not end path lookup where " 1361 pr_err("build_path did not end path lookup where "
1289 "expected, namelen is %d, pos is %d\n", len, pos); 1362 "expected, namelen is %d, pos is %d\n", len, pos);
1290 /* presumably this is only possible if racing with a 1363 /* presumably this is only possible if racing with a
1291 rename of one of the parent directories (we can not 1364 rename of one of the parent directories (we can not
@@ -1297,7 +1370,7 @@ retry:
1297 1370
1298 *base = ceph_ino(temp->d_inode); 1371 *base = ceph_ino(temp->d_inode);
1299 *plen = len; 1372 *plen = len;
1300 dout("build_path_dentry on %p %d built %llx '%.*s'\n", 1373 dout("build_path on %p %d built %llx '%.*s'\n",
1301 dentry, atomic_read(&dentry->d_count), *base, len, path); 1374 dentry, atomic_read(&dentry->d_count), *base, len, path);
1302 return path; 1375 return path;
1303} 1376}
@@ -1420,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1420 if (req->r_old_dentry_drop) 1493 if (req->r_old_dentry_drop)
1421 len += req->r_old_dentry->d_name.len; 1494 len += req->r_old_dentry->d_name.len;
1422 1495
1423 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); 1496 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
1424 if (IS_ERR(msg)) 1497 if (!msg) {
1498 msg = ERR_PTR(-ENOMEM);
1425 goto out_free2; 1499 goto out_free2;
1500 }
1426 1501
1427 msg->hdr.tid = cpu_to_le64(req->r_tid); 1502 msg->hdr.tid = cpu_to_le64(req->r_tid);
1428 1503
@@ -1511,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1511 } 1586 }
1512 msg = create_request_message(mdsc, req, mds); 1587 msg = create_request_message(mdsc, req, mds);
1513 if (IS_ERR(msg)) { 1588 if (IS_ERR(msg)) {
1514 req->r_reply = ERR_PTR(PTR_ERR(msg)); 1589 req->r_err = PTR_ERR(msg);
1515 complete_request(mdsc, req); 1590 complete_request(mdsc, req);
1516 return -PTR_ERR(msg); 1591 return PTR_ERR(msg);
1517 } 1592 }
1518 req->r_request = msg; 1593 req->r_request = msg;
1519 1594
@@ -1546,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
1546 int mds = -1; 1621 int mds = -1;
1547 int err = -EAGAIN; 1622 int err = -EAGAIN;
1548 1623
1549 if (req->r_reply) 1624 if (req->r_err || req->r_got_result)
1550 goto out; 1625 goto out;
1551 1626
1552 if (req->r_timeout && 1627 if (req->r_timeout &&
@@ -1566,8 +1641,13 @@ static int __do_request(struct ceph_mds_client *mdsc,
1566 1641
1567 /* get, open session */ 1642 /* get, open session */
1568 session = __ceph_lookup_mds_session(mdsc, mds); 1643 session = __ceph_lookup_mds_session(mdsc, mds);
1569 if (!session) 1644 if (!session) {
1570 session = register_session(mdsc, mds); 1645 session = register_session(mdsc, mds);
1646 if (IS_ERR(session)) {
1647 err = PTR_ERR(session);
1648 goto finish;
1649 }
1650 }
1571 dout("do_request mds%d session %p state %s\n", mds, session, 1651 dout("do_request mds%d session %p state %s\n", mds, session,
1572 session_state_name(session->s_state)); 1652 session_state_name(session->s_state));
1573 if (session->s_state != CEPH_MDS_SESSION_OPEN && 1653 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1598,7 +1678,7 @@ out:
1598 return err; 1678 return err;
1599 1679
1600finish: 1680finish:
1601 req->r_reply = ERR_PTR(err); 1681 req->r_err = err;
1602 complete_request(mdsc, req); 1682 complete_request(mdsc, req);
1603 goto out; 1683 goto out;
1604} 1684}
@@ -1619,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
1619 1699
1620/* 1700/*
1621 * Wake up threads with requests pending for @mds, so that they can 1701 * Wake up threads with requests pending for @mds, so that they can
1622 * resubmit their requests to a possibly different mds. If @all is set, 1702 * resubmit their requests to a possibly different mds.
1623 * wake up if their requests has been forwarded to @mds, too.
1624 */ 1703 */
1625static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) 1704static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1626{ 1705{
1627 struct ceph_mds_request *req; 1706 struct ceph_mds_request *req;
1628 struct rb_node *p; 1707 struct rb_node *p;
@@ -1678,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1678 __register_request(mdsc, req, dir); 1757 __register_request(mdsc, req, dir);
1679 __do_request(mdsc, req); 1758 __do_request(mdsc, req);
1680 1759
1681 /* wait */ 1760 if (req->r_err) {
1682 if (!req->r_reply) { 1761 err = req->r_err;
1683 mutex_unlock(&mdsc->mutex); 1762 __unregister_request(mdsc, req);
1684 if (req->r_timeout) { 1763 dout("do_request early error %d\n", err);
1685 err = (long)wait_for_completion_interruptible_timeout( 1764 goto out;
1686 &req->r_completion, req->r_timeout);
1687 if (err == 0)
1688 req->r_reply = ERR_PTR(-EIO);
1689 else if (err < 0)
1690 req->r_reply = ERR_PTR(err);
1691 } else {
1692 err = wait_for_completion_interruptible(
1693 &req->r_completion);
1694 if (err)
1695 req->r_reply = ERR_PTR(err);
1696 }
1697 mutex_lock(&mdsc->mutex);
1698 } 1765 }
1699 1766
1700 if (IS_ERR(req->r_reply)) { 1767 /* wait */
1701 err = PTR_ERR(req->r_reply); 1768 mutex_unlock(&mdsc->mutex);
1702 req->r_reply = NULL; 1769 dout("do_request waiting\n");
1770 if (req->r_timeout) {
1771 err = (long)wait_for_completion_killable_timeout(
1772 &req->r_completion, req->r_timeout);
1773 if (err == 0)
1774 err = -EIO;
1775 } else {
1776 err = wait_for_completion_killable(&req->r_completion);
1777 }
1778 dout("do_request waited, got %d\n", err);
1779 mutex_lock(&mdsc->mutex);
1703 1780
1704 if (err == -ERESTARTSYS) { 1781 /* only abort if we didn't race with a real reply */
1705 /* aborted */ 1782 if (req->r_got_result) {
1706 req->r_aborted = true; 1783 err = le32_to_cpu(req->r_reply_info.head->result);
1784 } else if (err < 0) {
1785 dout("aborted request %lld with %d\n", req->r_tid, err);
1707 1786
1708 if (req->r_locked_dir && 1787 /*
1709 (req->r_op & CEPH_MDS_OP_WRITE)) { 1788 * ensure we aren't running concurrently with
1710 struct ceph_inode_info *ci = 1789 * ceph_fill_trace or ceph_readdir_prepopulate, which
1711 ceph_inode(req->r_locked_dir); 1790 * rely on locks (dir mutex) held by our caller.
1791 */
1792 mutex_lock(&req->r_fill_mutex);
1793 req->r_err = err;
1794 req->r_aborted = true;
1795 mutex_unlock(&req->r_fill_mutex);
1712 1796
1713 dout("aborted, clearing I_COMPLETE on %p\n", 1797 if (req->r_locked_dir &&
1714 req->r_locked_dir); 1798 (req->r_op & CEPH_MDS_OP_WRITE))
1715 spin_lock(&req->r_locked_dir->i_lock); 1799 ceph_invalidate_dir_request(req);
1716 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1717 ci->i_release_count++;
1718 spin_unlock(&req->r_locked_dir->i_lock);
1719 }
1720 } else {
1721 /* clean up this request */
1722 __unregister_request(mdsc, req);
1723 if (!list_empty(&req->r_unsafe_item))
1724 list_del_init(&req->r_unsafe_item);
1725 complete(&req->r_safe_completion);
1726 }
1727 } else if (req->r_err) {
1728 err = req->r_err;
1729 } else { 1800 } else {
1730 err = le32_to_cpu(req->r_reply_info.head->result); 1801 err = req->r_err;
1731 } 1802 }
1732 mutex_unlock(&mdsc->mutex);
1733 1803
1804out:
1805 mutex_unlock(&mdsc->mutex);
1734 dout("do_request %p done, result %d\n", req, err); 1806 dout("do_request %p done, result %d\n", req, err);
1735 return err; 1807 return err;
1736} 1808}
1737 1809
1738/* 1810/*
1811 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
1812 * namespace request.
1813 */
1814void ceph_invalidate_dir_request(struct ceph_mds_request *req)
1815{
1816 struct inode *inode = req->r_locked_dir;
1817 struct ceph_inode_info *ci = ceph_inode(inode);
1818
1819 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
1820 spin_lock(&inode->i_lock);
1821 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1822 ci->i_release_count++;
1823 spin_unlock(&inode->i_lock);
1824
1825 if (req->r_dentry)
1826 ceph_invalidate_dentry_lease(req->r_dentry);
1827 if (req->r_old_dentry)
1828 ceph_invalidate_dentry_lease(req->r_old_dentry);
1829}
1830
1831/*
1739 * Handle mds reply. 1832 * Handle mds reply.
1740 * 1833 *
1741 * We take the session mutex and parse and process the reply immediately. 1834 * We take the session mutex and parse and process the reply immediately.
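Note on the ceph_mdsc_do_request rework above: it replaces the r_reply-as-status convention with explicit r_err/r_got_result fields, switches to killable (rather than interruptible) waits so ordinary signals no longer abort metadata operations, and introduces r_fill_mutex: an aborting waiter and a late handle_reply() both take it, so ceph_fill_trace()/ceph_readdir_prepopulate() can never interleave with a request being marked aborted. The core of the race handling, condensed into one hypothetical helper:

    static int wait_and_settle(struct ceph_mds_client *mdsc,
                               struct ceph_mds_request *req)
    {
            int err = wait_for_completion_killable(&req->r_completion);

            mutex_lock(&mdsc->mutex);
            if (req->r_got_result) {
                    /* a real reply won the race: use its result */
                    err = le32_to_cpu(req->r_reply_info.head->result);
            } else if (err < 0) {
                    /* abort under r_fill_mutex so fill_trace can't interleave */
                    mutex_lock(&req->r_fill_mutex);
                    req->r_err = err;
                    req->r_aborted = true;
                    mutex_unlock(&req->r_fill_mutex);
            } else {
                    err = req->r_err;
            }
            mutex_unlock(&mdsc->mutex);
            return err;
    }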
@@ -1770,7 +1863,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1770 dout("handle_reply %p\n", req); 1863 dout("handle_reply %p\n", req);
1771 1864
1772 /* correct session? */ 1865 /* correct session? */
1773 if (!req->r_session && req->r_session != session) { 1866 if (req->r_session != session) {
1774 pr_err("mdsc_handle_reply got %llu on session mds%d" 1867 pr_err("mdsc_handle_reply got %llu on session mds%d"
1775 " not mds%d\n", tid, session->s_mds, 1868 " not mds%d\n", tid, session->s_mds,
1776 req->r_session ? req->r_session->s_mds : -1); 1869 req->r_session ? req->r_session->s_mds : -1);
@@ -1786,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1786 mutex_unlock(&mdsc->mutex); 1879 mutex_unlock(&mdsc->mutex);
1787 goto out; 1880 goto out;
1788 } 1881 }
1882 if (req->r_got_safe && !head->safe) {
1883 pr_warning("got unsafe after safe on %llu from mds%d\n",
1884 tid, mds);
1885 mutex_unlock(&mdsc->mutex);
1886 goto out;
1887 }
1789 1888
1790 result = le32_to_cpu(head->result); 1889 result = le32_to_cpu(head->result);
1791 1890
@@ -1827,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1827 mutex_unlock(&mdsc->mutex); 1926 mutex_unlock(&mdsc->mutex);
1828 goto out; 1927 goto out;
1829 } 1928 }
1830 } 1929 } else {
1831
1832 BUG_ON(req->r_reply);
1833
1834 if (!head->safe) {
1835 req->r_got_unsafe = true; 1930 req->r_got_unsafe = true;
1836 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); 1931 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1837 } 1932 }
@@ -1860,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1860 } 1955 }
1861 1956
1862 /* insert trace into our cache */ 1957 /* insert trace into our cache */
1958 mutex_lock(&req->r_fill_mutex);
1863 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 1959 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1864 if (err == 0) { 1960 if (err == 0) {
1865 if (result == 0 && rinfo->dir_nr) 1961 if (result == 0 && rinfo->dir_nr)
1866 ceph_readdir_prepopulate(req, req->r_session); 1962 ceph_readdir_prepopulate(req, req->r_session);
1867 ceph_unreserve_caps(&req->r_caps_reservation); 1963 ceph_unreserve_caps(&req->r_caps_reservation);
1868 } 1964 }
1965 mutex_unlock(&req->r_fill_mutex);
1869 1966
1870 up_read(&mdsc->snap_rwsem); 1967 up_read(&mdsc->snap_rwsem);
1871out_err: 1968out_err:
1872 if (err) { 1969 mutex_lock(&mdsc->mutex);
1873 req->r_err = err; 1970 if (!req->r_aborted) {
1971 if (err) {
1972 req->r_err = err;
1973 } else {
1974 req->r_reply = msg;
1975 ceph_msg_get(msg);
1976 req->r_got_result = true;
1977 }
1874 } else { 1978 } else {
1875 req->r_reply = msg; 1979 dout("reply arrived after request %lld was aborted\n", tid);
1876 ceph_msg_get(msg);
1877 } 1980 }
1981 mutex_unlock(&mdsc->mutex);
1878 1982
1879 add_cap_releases(mdsc, req->r_session, -1); 1983 add_cap_releases(mdsc, req->r_session, -1);
1880 mutex_unlock(&session->s_mutex); 1984 mutex_unlock(&session->s_mutex);
@@ -1910,16 +2014,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
1910 mutex_lock(&mdsc->mutex); 2014 mutex_lock(&mdsc->mutex);
1911 req = __lookup_request(mdsc, tid); 2015 req = __lookup_request(mdsc, tid);
1912 if (!req) { 2016 if (!req) {
1913 dout("forward %llu to mds%d - req dne\n", tid, next_mds); 2017 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
1914 goto out; /* dup reply? */ 2018 goto out; /* dup reply? */
1915 } 2019 }
1916 2020
1917 if (fwd_seq <= req->r_num_fwd) { 2021 if (req->r_aborted) {
1918 dout("forward %llu to mds%d - old seq %d <= %d\n", 2022 dout("forward tid %llu aborted, unregistering\n", tid);
2023 __unregister_request(mdsc, req);
2024 } else if (fwd_seq <= req->r_num_fwd) {
2025 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
1919 tid, next_mds, req->r_num_fwd, fwd_seq); 2026 tid, next_mds, req->r_num_fwd, fwd_seq);
1920 } else { 2027 } else {
1921 /* resend. forward race not possible; mds would drop */ 2028 /* resend. forward race not possible; mds would drop */
1922 dout("forward %llu to mds%d (we resend)\n", tid, next_mds); 2029 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2030 BUG_ON(req->r_err);
2031 BUG_ON(req->r_got_result);
1923 req->r_num_fwd = fwd_seq; 2032 req->r_num_fwd = fwd_seq;
1924 req->r_resend_mds = next_mds; 2033 req->r_resend_mds = next_mds;
1925 put_request_session(req); 2034 put_request_session(req);
@@ -1973,6 +2082,8 @@ static void handle_session(struct ceph_mds_session *session,
1973 2082
1974 switch (op) { 2083 switch (op) {
1975 case CEPH_SESSION_OPEN: 2084 case CEPH_SESSION_OPEN:
2085 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2086 pr_info("mds%d reconnect success\n", session->s_mds);
1976 session->s_state = CEPH_MDS_SESSION_OPEN; 2087 session->s_state = CEPH_MDS_SESSION_OPEN;
1977 renewed_caps(mdsc, session, 0); 2088 renewed_caps(mdsc, session, 0);
1978 wake = 1; 2089 wake = 1;
@@ -1986,10 +2097,12 @@ static void handle_session(struct ceph_mds_session *session,
1986 break; 2097 break;
1987 2098
1988 case CEPH_SESSION_CLOSE: 2099 case CEPH_SESSION_CLOSE:
2100 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2101 pr_info("mds%d reconnect denied\n", session->s_mds);
1989 remove_session_caps(session); 2102 remove_session_caps(session);
1990 wake = 1; /* for good measure */ 2103 wake = 1; /* for good measure */
1991 complete(&mdsc->session_close_waiters); 2104 complete(&mdsc->session_close_waiters);
1992 kick_requests(mdsc, mds, 0); /* cur only */ 2105 kick_requests(mdsc, mds);
1993 break; 2106 break;
1994 2107
1995 case CEPH_SESSION_STALE: 2108 case CEPH_SESSION_STALE:
@@ -2121,61 +2234,51 @@ out:
2121 * 2234 *
2122 * called with mdsc->mutex held. 2235 * called with mdsc->mutex held.
2123 */ 2236 */
2124static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) 2237static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2238 struct ceph_mds_session *session)
2125{ 2239{
2126 struct ceph_mds_session *session = NULL;
2127 struct ceph_msg *reply; 2240 struct ceph_msg *reply;
2128 struct rb_node *p; 2241 struct rb_node *p;
2129 int err; 2242 int mds = session->s_mds;
2243 int err = -ENOMEM;
2130 struct ceph_pagelist *pagelist; 2244 struct ceph_pagelist *pagelist;
2131 2245
2132 pr_info("reconnect to recovering mds%d\n", mds); 2246 pr_info("mds%d reconnect start\n", mds);
2133 2247
2134 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 2248 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2135 if (!pagelist) 2249 if (!pagelist)
2136 goto fail_nopagelist; 2250 goto fail_nopagelist;
2137 ceph_pagelist_init(pagelist); 2251 ceph_pagelist_init(pagelist);
2138 2252
2139 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); 2253 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
2140 if (IS_ERR(reply)) { 2254 if (!reply)
2141 err = PTR_ERR(reply);
2142 goto fail_nomsg; 2255 goto fail_nomsg;
2143 }
2144
2145 /* find session */
2146 session = __ceph_lookup_mds_session(mdsc, mds);
2147 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2148
2149 if (session) {
2150 mutex_lock(&session->s_mutex);
2151 2256
2152 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2257 mutex_lock(&session->s_mutex);
2153 session->s_seq = 0; 2258 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2259 session->s_seq = 0;
2154 2260
2155 ceph_con_open(&session->s_con, 2261 ceph_con_open(&session->s_con,
2156 ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 2262 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2157 2263
2158 /* replay unsafe requests */ 2264 /* replay unsafe requests */
2159 replay_unsafe_requests(mdsc, session); 2265 replay_unsafe_requests(mdsc, session);
2160 } else {
2161 dout("no session for mds%d, will send short reconnect\n",
2162 mds);
2163 }
2164 2266
2165 down_read(&mdsc->snap_rwsem); 2267 down_read(&mdsc->snap_rwsem);
2166 2268
2167 if (!session)
2168 goto send;
2169 dout("session %p state %s\n", session, 2269 dout("session %p state %s\n", session,
2170 session_state_name(session->s_state)); 2270 session_state_name(session->s_state));
2171 2271
2272 /* drop old cap expires; we're about to reestablish that state */
2273 discard_cap_releases(mdsc, session);
2274
2172 /* traverse this session's caps */ 2275 /* traverse this session's caps */
2173 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2276 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2174 if (err) 2277 if (err)
2175 goto fail; 2278 goto fail;
2176 err = iterate_session_caps(session, encode_caps_cb, pagelist); 2279 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2177 if (err < 0) 2280 if (err < 0)
2178 goto out; 2281 goto fail;
2179 2282
2180 /* 2283 /*
2181 * snaprealms. we provide mds with the ino, seq (version), and 2284 * snaprealms. we provide mds with the ino, seq (version), and
@@ -2197,34 +2300,30 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2197 goto fail; 2300 goto fail;
2198 } 2301 }
2199 2302
2200send:
2201 reply->pagelist = pagelist; 2303 reply->pagelist = pagelist;
2202 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2304 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2203 reply->nr_pages = calc_pages_for(0, pagelist->length); 2305 reply->nr_pages = calc_pages_for(0, pagelist->length);
2204 ceph_con_send(&session->s_con, reply); 2306 ceph_con_send(&session->s_con, reply);
2205 2307
2206 if (session) { 2308 mutex_unlock(&session->s_mutex);
2207 session->s_state = CEPH_MDS_SESSION_OPEN;
2208 __wake_requests(mdsc, &session->s_waiting);
2209 }
2210 2309
2211out:
2212 up_read(&mdsc->snap_rwsem);
2213 if (session) {
2214 mutex_unlock(&session->s_mutex);
2215 ceph_put_mds_session(session);
2216 }
2217 mutex_lock(&mdsc->mutex); 2310 mutex_lock(&mdsc->mutex);
2311 __wake_requests(mdsc, &session->s_waiting);
2312 mutex_unlock(&mdsc->mutex);
2313
2314 up_read(&mdsc->snap_rwsem);
2218 return; 2315 return;
2219 2316
2220fail: 2317fail:
2221 ceph_msg_put(reply); 2318 ceph_msg_put(reply);
2319 up_read(&mdsc->snap_rwsem);
2320 mutex_unlock(&session->s_mutex);
2222fail_nomsg: 2321fail_nomsg:
2223 ceph_pagelist_release(pagelist); 2322 ceph_pagelist_release(pagelist);
2224 kfree(pagelist); 2323 kfree(pagelist);
2225fail_nopagelist: 2324fail_nopagelist:
2226 pr_err("ENOMEM preparing reconnect for mds%d\n", mds); 2325 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2227 goto out; 2326 return;
2228} 2327}
2229 2328
2230 2329
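Note on the send_mds_reconnect rewrite above: with the session now passed in (and guaranteed non-NULL), the failure paths collapse into ordered labels that unwind in reverse order of acquisition: fail drops the message, snap_rwsem and the session mutex; fail_nomsg releases the pagelist; fail_nopagelist only logs. A generic, self-contained sketch of that staged-unwind idiom (names and sizes are illustrative only):

    static int setup_two(void **a, void **b)
    {
            int err = -ENOMEM;

            *a = kmalloc(64, GFP_NOFS);
            if (!*a)
                    goto fail_no_a;
            *b = kmalloc(64, GFP_NOFS);
            if (!*b)
                    goto fail_no_b;
            return 0;

    fail_no_b:
            kfree(*a);              /* undo in reverse acquisition order */
    fail_no_a:
            pr_err("setup failed: %d\n", err);
            return err;
    }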
@@ -2276,7 +2375,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2276 } 2375 }
2277 2376
2278 /* kick any requests waiting on the recovering mds */ 2377 /* kick any requests waiting on the recovering mds */
2279 kick_requests(mdsc, i, 1); 2378 kick_requests(mdsc, i);
2280 } else if (oldstate == newstate) { 2379 } else if (oldstate == newstate) {
2281 continue; /* nothing new with this mds */ 2380 continue; /* nothing new with this mds */
2282 } 2381 }
@@ -2285,22 +2384,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2285 * send reconnect? 2384 * send reconnect?
2286 */ 2385 */
2287 if (s->s_state == CEPH_MDS_SESSION_RESTARTING && 2386 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2288 newstate >= CEPH_MDS_STATE_RECONNECT) 2387 newstate >= CEPH_MDS_STATE_RECONNECT) {
2289 send_mds_reconnect(mdsc, i); 2388 mutex_unlock(&mdsc->mutex);
2389 send_mds_reconnect(mdsc, s);
2390 mutex_lock(&mdsc->mutex);
2391 }
2290 2392
2291 /* 2393 /*
 2292 * kick requests on any mds that has gone active. 2394 * kick requests on any mds that has gone active.
2293 *
2294 * kick requests on cur or forwarder: we may have sent
2295 * the request to mds1, mds1 told us it forwarded it
2296 * to mds2, but then we learn mds1 failed and can't be
2297 * sure it successfully forwarded our request before
2298 * it died.
2299 */ 2395 */
2300 if (oldstate < CEPH_MDS_STATE_ACTIVE && 2396 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2301 newstate >= CEPH_MDS_STATE_ACTIVE) { 2397 newstate >= CEPH_MDS_STATE_ACTIVE) {
2302 pr_info("mds%d reconnect completed\n", s->s_mds); 2398 if (oldstate != CEPH_MDS_STATE_CREATING &&
2303 kick_requests(mdsc, i, 1); 2399 oldstate != CEPH_MDS_STATE_STARTING)
2400 pr_info("mds%d recovery completed\n", s->s_mds);
2401 kick_requests(mdsc, i);
2304 ceph_kick_flushing_caps(mdsc, s); 2402 ceph_kick_flushing_caps(mdsc, s);
2305 wake_up_session_caps(s, 1); 2403 wake_up_session_caps(s, 1);
2306 } 2404 }
@@ -2443,12 +2541,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2443 dnamelen = dentry->d_name.len; 2541 dnamelen = dentry->d_name.len;
2444 len += dnamelen; 2542 len += dnamelen;
2445 2543
2446 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); 2544 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
2447 if (IS_ERR(msg)) 2545 if (!msg)
2448 return; 2546 return;
2449 lease = msg->front.iov_base; 2547 lease = msg->front.iov_base;
2450 lease->action = action; 2548 lease->action = action;
2451 lease->mask = cpu_to_le16(CEPH_LOCK_DN); 2549 lease->mask = cpu_to_le16(1);
2452 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2550 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2453 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2551 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2454 lease->seq = cpu_to_le32(seq); 2552 lease->seq = cpu_to_le32(seq);
@@ -2478,7 +2576,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2478 2576
2479 BUG_ON(inode == NULL); 2577 BUG_ON(inode == NULL);
2480 BUG_ON(dentry == NULL); 2578 BUG_ON(dentry == NULL);
2481 BUG_ON(mask != CEPH_LOCK_DN); 2579 BUG_ON(mask == 0);
2482 2580
2483 /* is dentry lease valid? */ 2581 /* is dentry lease valid? */
2484 spin_lock(&dentry->d_lock); 2582 spin_lock(&dentry->d_lock);
@@ -2589,7 +2687,9 @@ static void delayed_work(struct work_struct *work)
2589 else 2687 else
2590 ceph_con_keepalive(&s->s_con); 2688 ceph_con_keepalive(&s->s_con);
2591 add_cap_releases(mdsc, s, -1); 2689 add_cap_releases(mdsc, s, -1);
2592 send_cap_releases(mdsc, s); 2690 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2691 s->s_state == CEPH_MDS_SESSION_HUNG)
2692 send_cap_releases(mdsc, s);
2593 mutex_unlock(&s->s_mutex); 2693 mutex_unlock(&s->s_mutex);
2594 ceph_put_mds_session(s); 2694 ceph_put_mds_session(s);
2595 2695
@@ -2606,6 +2706,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2606 mdsc->client = client; 2706 mdsc->client = client;
2607 mutex_init(&mdsc->mutex); 2707 mutex_init(&mdsc->mutex);
2608 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2708 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2709 if (mdsc->mdsmap == NULL)
2710 return -ENOMEM;
2711
2609 init_completion(&mdsc->safe_umount_waiters); 2712 init_completion(&mdsc->safe_umount_waiters);
2610 init_completion(&mdsc->session_close_waiters); 2713 init_completion(&mdsc->session_close_waiters);
2611 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2714 INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2631,6 +2734,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2631 init_waitqueue_head(&mdsc->cap_flushing_wq); 2734 init_waitqueue_head(&mdsc->cap_flushing_wq);
2632 spin_lock_init(&mdsc->dentry_lru_lock); 2735 spin_lock_init(&mdsc->dentry_lru_lock);
2633 INIT_LIST_HEAD(&mdsc->dentry_lru); 2736 INIT_LIST_HEAD(&mdsc->dentry_lru);
2737
2634 return 0; 2738 return 0;
2635} 2739}
2636 2740
@@ -2682,29 +2786,41 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2682 */ 2786 */
2683static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) 2787static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2684{ 2788{
2685 struct ceph_mds_request *req = NULL; 2789 struct ceph_mds_request *req = NULL, *nextreq;
2686 struct rb_node *n; 2790 struct rb_node *n;
2687 2791
2688 mutex_lock(&mdsc->mutex); 2792 mutex_lock(&mdsc->mutex);
2689 dout("wait_unsafe_requests want %lld\n", want_tid); 2793 dout("wait_unsafe_requests want %lld\n", want_tid);
2794restart:
2690 req = __get_oldest_req(mdsc); 2795 req = __get_oldest_req(mdsc);
2691 while (req && req->r_tid <= want_tid) { 2796 while (req && req->r_tid <= want_tid) {
2797 /* find next request */
2798 n = rb_next(&req->r_node);
2799 if (n)
2800 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
2801 else
2802 nextreq = NULL;
2692 if ((req->r_op & CEPH_MDS_OP_WRITE)) { 2803 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2693 /* write op */ 2804 /* write op */
2694 ceph_mdsc_get_request(req); 2805 ceph_mdsc_get_request(req);
2806 if (nextreq)
2807 ceph_mdsc_get_request(nextreq);
2695 mutex_unlock(&mdsc->mutex); 2808 mutex_unlock(&mdsc->mutex);
2696 dout("wait_unsafe_requests wait on %llu (want %llu)\n", 2809 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2697 req->r_tid, want_tid); 2810 req->r_tid, want_tid);
2698 wait_for_completion(&req->r_safe_completion); 2811 wait_for_completion(&req->r_safe_completion);
2699 mutex_lock(&mdsc->mutex); 2812 mutex_lock(&mdsc->mutex);
2700 n = rb_next(&req->r_node);
2701 ceph_mdsc_put_request(req); 2813 ceph_mdsc_put_request(req);
2702 } else { 2814 if (!nextreq)
2703 n = rb_next(&req->r_node); 2815 break; /* next dne before, so we're done! */
2816 if (RB_EMPTY_NODE(&nextreq->r_node)) {
2817 /* next request was removed from tree */
2818 ceph_mdsc_put_request(nextreq);
2819 goto restart;
2820 }
2821 ceph_mdsc_put_request(nextreq); /* won't go away */
2704 } 2822 }
2705 if (!n) 2823 req = nextreq;
2706 break;
2707 req = rb_entry(n, struct ceph_mds_request, r_node);
2708 } 2824 }
2709 mutex_unlock(&mdsc->mutex); 2825 mutex_unlock(&mdsc->mutex);
2710 dout("wait_unsafe_requests done\n"); 2826 dout("wait_unsafe_requests done\n");
@@ -2714,6 +2830,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2714{ 2830{
2715 u64 want_tid, want_flush; 2831 u64 want_tid, want_flush;
2716 2832
2833 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
2834 return;
2835
2717 dout("sync\n"); 2836 dout("sync\n");
2718 mutex_lock(&mdsc->mutex); 2837 mutex_lock(&mdsc->mutex);
2719 want_tid = mdsc->last_tid; 2838 want_tid = mdsc->last_tid;
@@ -2896,9 +3015,10 @@ static void con_put(struct ceph_connection *con)
2896static void peer_reset(struct ceph_connection *con) 3015static void peer_reset(struct ceph_connection *con)
2897{ 3016{
2898 struct ceph_mds_session *s = con->private; 3017 struct ceph_mds_session *s = con->private;
3018 struct ceph_mds_client *mdsc = s->s_mdsc;
2899 3019
2900 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", 3020 pr_warning("mds%d closed our session\n", s->s_mds);
2901 s->s_mds); 3021 send_mds_reconnect(mdsc, s);
2902} 3022}
2903 3023
2904static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 3024static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3005,7 +3125,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
3005 return ceph_monc_validate_auth(&mdsc->client->monc); 3125 return ceph_monc_validate_auth(&mdsc->client->monc);
3006} 3126}
3007 3127
3008const static struct ceph_connection_operations mds_con_ops = { 3128static const struct ceph_connection_operations mds_con_ops = {
3009 .get = con_get, 3129 .get = con_get,
3010 .put = con_put, 3130 .put = con_put,
3011 .dispatch = dispatch, 3131 .dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f65878..d9936c4f1212 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -165,6 +165,8 @@ struct ceph_mds_request {
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ 165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */ 166 struct inode *r_target_inode; /* resulting inode */
167 167
168 struct mutex r_fill_mutex;
169
168 union ceph_mds_request_args r_args; 170 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */ 171 int r_fmode; /* file mode, if expecting cap */
170 172
@@ -213,7 +215,7 @@ struct ceph_mds_request {
213 struct completion r_safe_completion; 215 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback; 216 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */ 217 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe; 218 bool r_got_unsafe, r_got_safe, r_got_result;
217 219
218 bool r_did_prepopulate; 220 bool r_did_prepopulate;
219 u32 r_readdir_offset; 221 u32 r_readdir_offset;
@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode, 303 struct inode *inode,
302 struct dentry *dn, int mask); 304 struct dentry *dn, int mask);
303 305
306extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
307
304extern struct ceph_mds_request * 308extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 309ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 310extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 781656a49bf8..64b8b1f7863d 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -6,6 +6,7 @@
6#include <linux/inet.h> 6#include <linux/inet.h>
7#include <linux/kthread.h> 7#include <linux/kthread.h>
8#include <linux/net.h> 8#include <linux/net.h>
9#include <linux/slab.h>
9#include <linux/socket.h> 10#include <linux/socket.h>
10#include <linux/string.h> 11#include <linux/string.h>
11#include <net/tcp.h> 12#include <net/tcp.h>
@@ -29,23 +30,15 @@ static char tag_msg = CEPH_MSGR_TAG_MSG;
29static char tag_ack = CEPH_MSGR_TAG_ACK; 30static char tag_ack = CEPH_MSGR_TAG_ACK;
30static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; 31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
31 32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
32 37
33static void queue_con(struct ceph_connection *con); 38static void queue_con(struct ceph_connection *con);
34static void con_work(struct work_struct *); 39static void con_work(struct work_struct *);
35static void ceph_fault(struct ceph_connection *con); 40static void ceph_fault(struct ceph_connection *con);
36 41
37const char *ceph_name_type_str(int t)
38{
39 switch (t) {
40 case CEPH_ENTITY_TYPE_MON: return "mon";
41 case CEPH_ENTITY_TYPE_MDS: return "mds";
42 case CEPH_ENTITY_TYPE_OSD: return "osd";
43 case CEPH_ENTITY_TYPE_CLIENT: return "client";
44 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
45 default: return "???";
46 }
47}
48
49/* 42/*
50 * nicely render a sockaddr as a string. 43 * nicely render a sockaddr as a string.
51 */ 44 */
@@ -127,6 +120,12 @@ void ceph_msgr_exit(void)
127 destroy_workqueue(ceph_msgr_wq); 120 destroy_workqueue(ceph_msgr_wq);
128} 121}
129 122
	 	 123	void ceph_msgr_flush(void)
124{
125 flush_workqueue(ceph_msgr_wq);
126}
127
128
130/* 129/*
131 * socket callback functions 130 * socket callback functions
132 */ 131 */
@@ -227,6 +226,10 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
227 con->sock = sock; 226 con->sock = sock;
228 sock->sk->sk_allocation = GFP_NOFS; 227 sock->sk->sk_allocation = GFP_NOFS;
229 228
229#ifdef CONFIG_LOCKDEP
230 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
231#endif
232
230 set_sock_callbacks(sock, con); 233 set_sock_callbacks(sock, con);
231 234
232 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); 235 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
@@ -331,7 +334,9 @@ static void reset_connection(struct ceph_connection *con)
331 ceph_msg_put(con->out_msg); 334 ceph_msg_put(con->out_msg);
332 con->out_msg = NULL; 335 con->out_msg = NULL;
333 } 336 }
337 con->out_keepalive_pending = false;
334 con->in_seq = 0; 338 con->in_seq = 0;
339 con->in_seq_acked = 0;
335} 340}
336 341
337/* 342/*
@@ -347,6 +352,7 @@ void ceph_con_close(struct ceph_connection *con)
347 clear_bit(WRITE_PENDING, &con->state); 352 clear_bit(WRITE_PENDING, &con->state);
348 mutex_lock(&con->mutex); 353 mutex_lock(&con->mutex);
349 reset_connection(con); 354 reset_connection(con);
355 con->peer_global_seq = 0;
350 cancel_delayed_work(&con->work); 356 cancel_delayed_work(&con->work);
351 mutex_unlock(&con->mutex); 357 mutex_unlock(&con->mutex);
352 queue_con(con); 358 queue_con(con);
@@ -366,6 +372,14 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
366} 372}
367 373
368/* 374/*
375 * return true if this connection ever successfully opened
376 */
377bool ceph_con_opened(struct ceph_connection *con)
378{
379 return con->connect_seq > 0;
380}
381
382/*
369 * generic get/put 383 * generic get/put
370 */ 384 */
371struct ceph_connection *ceph_con_get(struct ceph_connection *con) 385struct ceph_connection *ceph_con_get(struct ceph_connection *con)
@@ -474,7 +488,14 @@ static void prepare_write_message(struct ceph_connection *con)
474 list_move_tail(&m->list_head, &con->out_sent); 488 list_move_tail(&m->list_head, &con->out_sent);
475 } 489 }
476 490
477 m->hdr.seq = cpu_to_le64(++con->out_seq); 491 /*
492 * only assign outgoing seq # if we haven't sent this message
	 493	 * yet. if it is requeued, resend with its original seq.
494 */
495 if (m->needs_out_seq) {
496 m->hdr.seq = cpu_to_le64(++con->out_seq);
497 m->needs_out_seq = false;
498 }
478 499
479 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", 500 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
480 m, con->out_seq, le16_to_cpu(m->hdr.type), 501 m, con->out_seq, le16_to_cpu(m->hdr.type),
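The needs_out_seq flag introduced here pairs with the change to ceph_con_send() later in this diff: the flag is set when the message is queued and cleared the first time the writer assigns a wire sequence number, so a message requeued after a connection fault is resent with its original seq and the peer can recognize it as a duplicate. Both halves side by side, as a sketch rather than the full functions:

	/* ceph_con_send(): queue time */
	msg->needs_out_seq = true;

	/* prepare_write_message(): only the first transmission assigns a seq */
	if (msg->needs_out_seq) {
		msg->hdr.seq = cpu_to_le64(++con->out_seq);
		msg->needs_out_seq = false;
	}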
@@ -636,7 +657,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
636 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 657 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
637 con->connect_seq, global_seq, proto); 658 con->connect_seq, global_seq, proto);
638 659
639 con->out_connect.features = CEPH_FEATURE_SUPPORTED; 660 con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
640 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 661 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
641 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 662 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
642 con->out_connect.global_seq = cpu_to_le32(global_seq); 663 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -830,13 +851,6 @@ static void prepare_read_connect(struct ceph_connection *con)
830 con->in_base_pos = 0; 851 con->in_base_pos = 0;
831} 852}
832 853
833static void prepare_read_connect_retry(struct ceph_connection *con)
834{
835 dout("prepare_read_connect_retry %p\n", con);
836 con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
837 + sizeof(con->peer_addr_for_me);
838}
839
840static void prepare_read_ack(struct ceph_connection *con) 854static void prepare_read_ack(struct ceph_connection *con)
841{ 855{
842 dout("prepare_read_ack %p\n", con); 856 dout("prepare_read_ack %p\n", con);
@@ -1106,8 +1120,8 @@ static void fail_protocol(struct ceph_connection *con)
1106 1120
1107static int process_connect(struct ceph_connection *con) 1121static int process_connect(struct ceph_connection *con)
1108{ 1122{
1109 u64 sup_feat = CEPH_FEATURE_SUPPORTED; 1123 u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
1110 u64 req_feat = CEPH_FEATURE_REQUIRED; 1124 u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
1111 u64 server_feat = le64_to_cpu(con->in_reply.features); 1125 u64 server_feat = le64_to_cpu(con->in_reply.features);
1112 1126
1113 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 1127 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1146,7 +1160,7 @@ static int process_connect(struct ceph_connection *con)
1146 } 1160 }
1147 con->auth_retry = 1; 1161 con->auth_retry = 1;
1148 prepare_write_connect(con->msgr, con, 0); 1162 prepare_write_connect(con->msgr, con, 0);
1149 prepare_read_connect_retry(con); 1163 prepare_read_connect(con);
1150 break; 1164 break;
1151 1165
1152 case CEPH_MSGR_TAG_RESETSESSION: 1166 case CEPH_MSGR_TAG_RESETSESSION:
@@ -1215,6 +1229,7 @@ static int process_connect(struct ceph_connection *con)
1215 clear_bit(CONNECTING, &con->state); 1229 clear_bit(CONNECTING, &con->state);
1216 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 1230 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1217 con->connect_seq++; 1231 con->connect_seq++;
1232 con->peer_features = server_feat;
1218 dout("process_connect got READY gseq %d cseq %d (%d)\n", 1233 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1219 con->peer_global_seq, 1234 con->peer_global_seq,
1220 le32_to_cpu(con->in_reply.connect_seq), 1235 le32_to_cpu(con->in_reply.connect_seq),
@@ -1323,6 +1338,7 @@ static int read_partial_message(struct ceph_connection *con)
1323 unsigned front_len, middle_len, data_len, data_off; 1338 unsigned front_len, middle_len, data_len, data_off;
1324 int datacrc = con->msgr->nocrc; 1339 int datacrc = con->msgr->nocrc;
1325 int skip; 1340 int skip;
1341 u64 seq;
1326 1342
1327 dout("read_partial_message con %p msg %p\n", con, m); 1343 dout("read_partial_message con %p msg %p\n", con, m);
1328 1344
@@ -1357,6 +1373,25 @@ static int read_partial_message(struct ceph_connection *con)
1357 return -EIO; 1373 return -EIO;
1358 data_off = le16_to_cpu(con->in_hdr.data_off); 1374 data_off = le16_to_cpu(con->in_hdr.data_off);
1359 1375
1376 /* verify seq# */
1377 seq = le64_to_cpu(con->in_hdr.seq);
1378 if ((s64)seq - (s64)con->in_seq < 1) {
1379 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1380 ENTITY_NAME(con->peer_name),
1381 pr_addr(&con->peer_addr.in_addr),
1382 seq, con->in_seq + 1);
1383 con->in_base_pos = -front_len - middle_len - data_len -
1384 sizeof(m->footer);
1385 con->in_tag = CEPH_MSGR_TAG_READY;
1386 con->in_seq++;
1387 return 0;
1388 } else if ((s64)seq - (s64)con->in_seq > 1) {
1389 pr_err("read_partial_message bad seq %lld expected %lld\n",
1390 seq, con->in_seq + 1);
1391 con->error_msg = "bad message sequence # for incoming message";
1392 return -EBADMSG;
1393 }
1394
1360 /* allocate message? */ 1395 /* allocate message? */
1361 if (!con->in_msg) { 1396 if (!con->in_msg) {
1362 dout("got hdr type %d front %d data %d\n", con->in_hdr.type, 1397 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
@@ -1364,18 +1399,17 @@ static int read_partial_message(struct ceph_connection *con)
1364 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); 1399 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1365 if (skip) { 1400 if (skip) {
1366 /* skip this message */ 1401 /* skip this message */
1367 dout("alloc_msg returned NULL, skipping message\n"); 1402 dout("alloc_msg said skip message\n");
1368 con->in_base_pos = -front_len - middle_len - data_len - 1403 con->in_base_pos = -front_len - middle_len - data_len -
1369 sizeof(m->footer); 1404 sizeof(m->footer);
1370 con->in_tag = CEPH_MSGR_TAG_READY; 1405 con->in_tag = CEPH_MSGR_TAG_READY;
1406 con->in_seq++;
1371 return 0; 1407 return 0;
1372 } 1408 }
1373 if (IS_ERR(con->in_msg)) { 1409 if (!con->in_msg) {
1374 ret = PTR_ERR(con->in_msg);
1375 con->in_msg = NULL;
1376 con->error_msg = 1410 con->error_msg =
1377 "error allocating memory for incoming message"; 1411 "error allocating memory for incoming message";
1378 return ret; 1412 return -ENOMEM;
1379 } 1413 }
1380 m = con->in_msg; 1414 m = con->in_msg;
1381 m->front.iov_len = 0; /* haven't read it yet */ 1415 m->front.iov_len = 0; /* haven't read it yet */
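The sequence check added to read_partial_message() above distinguishes two failure modes: a seq at or below the last one received is a retransmit of something already consumed, so the payload is swallowed (the negative in_base_pos tells the read loop how many bytes to discard before the next tag), while a seq more than one ahead means messages were lost, which is fatal for the connection. Just that branch logic, condensed:

	s64 delta = (s64)seq - (s64)con->in_seq;

	if (delta < 1) {
		/* duplicate: discard front+middle+data+footer, keep reading */
		con->in_base_pos = -front_len - middle_len - data_len -
			sizeof(m->footer);
		con->in_tag = CEPH_MSGR_TAG_READY;
		con->in_seq++;
		return 0;
	}
	if (delta > 1) {
		con->error_msg = "bad message sequence # for incoming message";
		return -EBADMSG;                /* tears the connection down */
	}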
@@ -1475,14 +1509,14 @@ static void process_message(struct ceph_connection *con)
1475 1509
1476 /* if first message, set peer_name */ 1510 /* if first message, set peer_name */
1477 if (con->peer_name.type == 0) 1511 if (con->peer_name.type == 0)
1478 con->peer_name = msg->hdr.src.name; 1512 con->peer_name = msg->hdr.src;
1479 1513
1480 con->in_seq++; 1514 con->in_seq++;
1481 mutex_unlock(&con->mutex); 1515 mutex_unlock(&con->mutex);
1482 1516
1483 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", 1517 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1484 msg, le64_to_cpu(msg->hdr.seq), 1518 msg, le64_to_cpu(msg->hdr.seq),
1485 ENTITY_NAME(msg->hdr.src.name), 1519 ENTITY_NAME(msg->hdr.src),
1486 le16_to_cpu(msg->hdr.type), 1520 le16_to_cpu(msg->hdr.type),
1487 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), 1521 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1488 le32_to_cpu(msg->hdr.front_len), 1522 le32_to_cpu(msg->hdr.front_len),
@@ -1507,7 +1541,6 @@ static int try_write(struct ceph_connection *con)
1507 dout("try_write start %p state %lu nref %d\n", con, con->state, 1541 dout("try_write start %p state %lu nref %d\n", con, con->state,
1508 atomic_read(&con->nref)); 1542 atomic_read(&con->nref));
1509 1543
1510 mutex_lock(&con->mutex);
1511more: 1544more:
1512 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 1545 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1513 1546
@@ -1600,7 +1633,6 @@ do_next:
1600done: 1633done:
1601 ret = 0; 1634 ret = 0;
1602out: 1635out:
1603 mutex_unlock(&con->mutex);
1604 dout("try_write done on %p\n", con); 1636 dout("try_write done on %p\n", con);
1605 return ret; 1637 return ret;
1606} 1638}
@@ -1612,7 +1644,6 @@ out:
1612 */ 1644 */
1613static int try_read(struct ceph_connection *con) 1645static int try_read(struct ceph_connection *con)
1614{ 1646{
1615 struct ceph_messenger *msgr;
1616 int ret = -1; 1647 int ret = -1;
1617 1648
1618 if (!con->sock) 1649 if (!con->sock)
@@ -1622,9 +1653,6 @@ static int try_read(struct ceph_connection *con)
1622 return 0; 1653 return 0;
1623 1654
1624 dout("try_read start on %p\n", con); 1655 dout("try_read start on %p\n", con);
1625 msgr = con->msgr;
1626
1627 mutex_lock(&con->mutex);
1628 1656
1629more: 1657more:
1630 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, 1658 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1719,7 +1747,6 @@ more:
1719done: 1747done:
1720 ret = 0; 1748 ret = 0;
1721out: 1749out:
1722 mutex_unlock(&con->mutex);
1723 dout("try_read done on %p\n", con); 1750 dout("try_read done on %p\n", con);
1724 return ret; 1751 return ret;
1725 1752
@@ -1791,6 +1818,8 @@ more:
1791 dout("con_work %p start, clearing QUEUED\n", con); 1818 dout("con_work %p start, clearing QUEUED\n", con);
1792 clear_bit(QUEUED, &con->state); 1819 clear_bit(QUEUED, &con->state);
1793 1820
1821 mutex_lock(&con->mutex);
1822
1794 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ 1823 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1795 dout("con_work CLOSED\n"); 1824 dout("con_work CLOSED\n");
1796 con_close_socket(con); 1825 con_close_socket(con);
@@ -1805,11 +1834,16 @@ more:
1805 if (test_and_clear_bit(SOCK_CLOSED, &con->state) || 1834 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1806 try_read(con) < 0 || 1835 try_read(con) < 0 ||
1807 try_write(con) < 0) { 1836 try_write(con) < 0) {
1837 mutex_unlock(&con->mutex);
1808 backoff = 1; 1838 backoff = 1;
1809 ceph_fault(con); /* error/fault path */ 1839 ceph_fault(con); /* error/fault path */
1840 goto done_unlocked;
1810 } 1841 }
1811 1842
1812done: 1843done:
1844 mutex_unlock(&con->mutex);
1845
1846done_unlocked:
1813 clear_bit(BUSY, &con->state); 1847 clear_bit(BUSY, &con->state);
1814 dout("con->state=%lu\n", con->state); 1848 dout("con->state=%lu\n", con->state);
1815 if (test_bit(QUEUED, &con->state)) { 1849 if (test_bit(QUEUED, &con->state)) {
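The try_read()/try_write() hunks and this one are a single locking change: con->mutex used to be taken and released inside each helper, and is now held by con_work() across the whole cycle, with a separate done_unlocked exit for the fault path because ceph_fault() acquires the mutex itself. Roughly:

	mutex_lock(&con->mutex);
	/* CLOSED/OPENING checks now run under the lock */
	if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
	    try_read(con) < 0 ||                /* no longer locks internally */
	    try_write(con) < 0) {
		mutex_unlock(&con->mutex);
		ceph_fault(con);                /* takes con->mutex on its own */
		goto done_unlocked;
	}
done:                                           /* other exits jump here */
	mutex_unlock(&con->mutex);
done_unlocked:
	/* BUSY/QUEUED handling needs no mutex */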
@@ -1843,8 +1877,6 @@ static void ceph_fault(struct ceph_connection *con)
1843 goto out; 1877 goto out;
1844 } 1878 }
1845 1879
1846 clear_bit(BUSY, &con->state); /* to avoid an improbable race */
1847
1848 mutex_lock(&con->mutex); 1880 mutex_lock(&con->mutex);
1849 if (test_bit(CLOSED, &con->state)) 1881 if (test_bit(CLOSED, &con->state))
1850 goto out_unlock; 1882 goto out_unlock;
@@ -1910,7 +1942,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1910 1942
1911 /* the zero page is needed if a request is "canceled" while the message 1943 /* the zero page is needed if a request is "canceled" while the message
1912 * is being written over the socket */ 1944 * is being written over the socket */
1913 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 1945 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1914 if (!msgr->zero_page) { 1946 if (!msgr->zero_page) {
1915 kfree(msgr); 1947 kfree(msgr);
1916 return ERR_PTR(-ENOMEM); 1948 return ERR_PTR(-ENOMEM);
@@ -1950,12 +1982,12 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1950 } 1982 }
1951 1983
1952 /* set src+dst */ 1984 /* set src+dst */
1953 msg->hdr.src.name = con->msgr->inst.name; 1985 msg->hdr.src = con->msgr->inst.name;
1954 msg->hdr.src.addr = con->msgr->my_enc_addr;
1955 msg->hdr.orig_src = msg->hdr.src;
1956 1986
1957 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 1987 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1958 1988
1989 msg->needs_out_seq = true;
1990
1959 /* queue */ 1991 /* queue */
1960 mutex_lock(&con->mutex); 1992 mutex_lock(&con->mutex);
1961 BUG_ON(!list_empty(&msg->list_head)); 1993 BUG_ON(!list_empty(&msg->list_head));
@@ -2021,6 +2053,7 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2021 ceph_msg_put(con->in_msg); 2053 ceph_msg_put(con->in_msg);
2022 con->in_msg = NULL; 2054 con->in_msg = NULL;
2023 con->in_tag = CEPH_MSGR_TAG_READY; 2055 con->in_tag = CEPH_MSGR_TAG_READY;
2056 con->in_seq++;
2024 } else { 2057 } else {
2025 dout("con_revoke_pages %p msg %p pages %p no-op\n", 2058 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2026 con, con->in_msg, msg); 2059 con, con->in_msg, msg);
@@ -2043,26 +2076,29 @@ void ceph_con_keepalive(struct ceph_connection *con)
2043 * construct a new message with given type, size 2076 * construct a new message with given type, size
2044 * the new msg has a ref count of 1. 2077 * the new msg has a ref count of 1.
2045 */ 2078 */
2046struct ceph_msg *ceph_msg_new(int type, int front_len, 2079struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2047 int page_len, int page_off, struct page **pages)
2048{ 2080{
2049 struct ceph_msg *m; 2081 struct ceph_msg *m;
2050 2082
2051 m = kmalloc(sizeof(*m), GFP_NOFS); 2083 m = kmalloc(sizeof(*m), flags);
2052 if (m == NULL) 2084 if (m == NULL)
2053 goto out; 2085 goto out;
2054 kref_init(&m->kref); 2086 kref_init(&m->kref);
2055 INIT_LIST_HEAD(&m->list_head); 2087 INIT_LIST_HEAD(&m->list_head);
2056 2088
2089 m->hdr.tid = 0;
2057 m->hdr.type = cpu_to_le16(type); 2090 m->hdr.type = cpu_to_le16(type);
2091 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2092 m->hdr.version = 0;
2058 m->hdr.front_len = cpu_to_le32(front_len); 2093 m->hdr.front_len = cpu_to_le32(front_len);
2059 m->hdr.middle_len = 0; 2094 m->hdr.middle_len = 0;
2060 m->hdr.data_len = cpu_to_le32(page_len); 2095 m->hdr.data_len = 0;
2061 m->hdr.data_off = cpu_to_le16(page_off); 2096 m->hdr.data_off = 0;
2062 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); 2097 m->hdr.reserved = 0;
2063 m->footer.front_crc = 0; 2098 m->footer.front_crc = 0;
2064 m->footer.middle_crc = 0; 2099 m->footer.middle_crc = 0;
2065 m->footer.data_crc = 0; 2100 m->footer.data_crc = 0;
2101 m->footer.flags = 0;
2066 m->front_max = front_len; 2102 m->front_max = front_len;
2067 m->front_is_vmalloc = false; 2103 m->front_is_vmalloc = false;
2068 m->more_to_follow = false; 2104 m->more_to_follow = false;
@@ -2071,11 +2107,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2071 /* front */ 2107 /* front */
2072 if (front_len) { 2108 if (front_len) {
2073 if (front_len > PAGE_CACHE_SIZE) { 2109 if (front_len > PAGE_CACHE_SIZE) {
2074 m->front.iov_base = __vmalloc(front_len, GFP_NOFS, 2110 m->front.iov_base = __vmalloc(front_len, flags,
2075 PAGE_KERNEL); 2111 PAGE_KERNEL);
2076 m->front_is_vmalloc = true; 2112 m->front_is_vmalloc = true;
2077 } else { 2113 } else {
2078 m->front.iov_base = kmalloc(front_len, GFP_NOFS); 2114 m->front.iov_base = kmalloc(front_len, flags);
2079 } 2115 }
2080 if (m->front.iov_base == NULL) { 2116 if (m->front.iov_base == NULL) {
2081 pr_err("msg_new can't allocate %d bytes\n", 2117 pr_err("msg_new can't allocate %d bytes\n",
@@ -2091,19 +2127,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2091 m->middle = NULL; 2127 m->middle = NULL;
2092 2128
2093 /* data */ 2129 /* data */
2094 m->nr_pages = calc_pages_for(page_off, page_len); 2130 m->nr_pages = 0;
2095 m->pages = pages; 2131 m->pages = NULL;
2096 m->pagelist = NULL; 2132 m->pagelist = NULL;
2097 2133
2098 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, 2134 dout("ceph_msg_new %p front %d\n", m, front_len);
2099 m->nr_pages);
2100 return m; 2135 return m;
2101 2136
2102out2: 2137out2:
2103 ceph_msg_put(m); 2138 ceph_msg_put(m);
2104out: 2139out:
2105 pr_err("msg_new can't create type %d len %d\n", type, front_len); 2140 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2106 return ERR_PTR(-ENOMEM); 2141 return NULL;
2107} 2142}
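This is the API change that most of the earlier hunks (lease, auth, subscribe, statfs) adapt to: ceph_msg_new() loses the page-vector parameters, takes explicit gfp flags, and reports failure as NULL instead of an ERR_PTR. A typical caller now looks like this (a sketch; the message type and size are just examples drawn from this diff):

	struct ceph_msg *msg;

	msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(struct ceph_mon_statfs),
			   GFP_NOFS);
	if (!msg)                 /* was: if (IS_ERR(msg)) ... PTR_ERR(msg) */
		return -ENOMEM;
	/* data pages, if any, are now attached to msg->pages separately */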
2108 2143
2109/* 2144/*
@@ -2146,29 +2181,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2146 mutex_unlock(&con->mutex); 2181 mutex_unlock(&con->mutex);
2147 msg = con->ops->alloc_msg(con, hdr, skip); 2182 msg = con->ops->alloc_msg(con, hdr, skip);
2148 mutex_lock(&con->mutex); 2183 mutex_lock(&con->mutex);
2149 if (IS_ERR(msg)) 2184 if (!msg || *skip)
2150 return msg;
2151
2152 if (*skip)
2153 return NULL; 2185 return NULL;
2154 } 2186 }
2155 if (!msg) { 2187 if (!msg) {
2156 *skip = 0; 2188 *skip = 0;
2157 msg = ceph_msg_new(type, front_len, 0, 0, NULL); 2189 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2158 if (!msg) { 2190 if (!msg) {
2159 pr_err("unable to allocate msg type %d len %d\n", 2191 pr_err("unable to allocate msg type %d len %d\n",
2160 type, front_len); 2192 type, front_len);
2161 return ERR_PTR(-ENOMEM); 2193 return NULL;
2162 } 2194 }
2163 } 2195 }
2164 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2196 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2165 2197
2166 if (middle_len) { 2198 if (middle_len && !msg->middle) {
2167 ret = ceph_alloc_middle(con, msg); 2199 ret = ceph_alloc_middle(con, msg);
2168
2169 if (ret < 0) { 2200 if (ret < 0) {
2170 ceph_msg_put(msg); 2201 ceph_msg_put(msg);
2171 return msg; 2202 return NULL;
2172 } 2203 }
2173 } 2204 }
2174 2205
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 4caaa5911110..76fbc957bc13 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
49 int *skip); 49 int *skip);
50}; 50};
51 51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */ 52/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) 53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
56 54
57struct ceph_messenger { 55struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */ 56 struct ceph_entity_inst inst; /* my name+address */
@@ -86,6 +84,7 @@ struct ceph_msg {
86 struct kref kref; 84 struct kref kref;
87 bool front_is_vmalloc; 85 bool front_is_vmalloc;
88 bool more_to_follow; 86 bool more_to_follow;
87 bool needs_out_seq;
89 int front_max; 88 int front_max;
90 89
91 struct ceph_msgpool *pool; 90 struct ceph_msgpool *pool;
@@ -143,6 +142,7 @@ struct ceph_connection {
143 struct ceph_entity_addr peer_addr; /* peer address */ 142 struct ceph_entity_addr peer_addr; /* peer address */
144 struct ceph_entity_name peer_name; /* peer name */ 143 struct ceph_entity_name peer_name; /* peer name */
145 struct ceph_entity_addr peer_addr_for_me; 144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
146 u32 connect_seq; /* identify the most recent connection 146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */ 147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */ 148 u32 peer_global_seq; /* peer's global seq for this connection */
@@ -157,7 +157,6 @@ struct ceph_connection {
157 struct list_head out_queue; 157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */ 158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */ 159 u64 out_seq; /* last message queued for send */
160 u64 out_seq_sent; /* last message sent */
161 bool out_keepalive_pending; 160 bool out_keepalive_pending;
162 161
163 u64 in_seq, in_seq_acked; /* last message received, acked */ 162 u64 in_seq, in_seq_acked; /* last message received, acked */
@@ -214,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end,
214 213
215extern int ceph_msgr_init(void); 214extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void); 215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
217 217
218extern struct ceph_messenger *ceph_messenger_create( 218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr); 219 struct ceph_entity_addr *myaddr);
@@ -223,6 +223,7 @@ extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con); 223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con, 224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr); 225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
226extern void ceph_con_close(struct ceph_connection *con); 227extern void ceph_con_close(struct ceph_connection *con);
227extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); 228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
228extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); 229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
@@ -232,9 +233,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
232extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); 233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
233extern void ceph_con_put(struct ceph_connection *con); 234extern void ceph_con_put(struct ceph_connection *con);
234 235
235extern struct ceph_msg *ceph_msg_new(int type, int front_len, 236extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
236 int page_len, int page_off,
237 struct page **pages);
238extern void ceph_msg_kfree(struct ceph_msg *m); 237extern void ceph_msg_kfree(struct ceph_msg *m);
239 238
240 239
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 890597c09d43..21c62e9b7d1d 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/types.h> 3#include <linux/types.h>
4#include <linux/slab.h>
4#include <linux/random.h> 5#include <linux/random.h>
5#include <linux/sched.h> 6#include <linux/sched.h>
6 7
@@ -27,7 +28,7 @@
27 * resend any outstanding requests. 28 * resend any outstanding requests.
28 */ 29 */
29 30
30const static struct ceph_connection_operations mon_con_ops; 31static const struct ceph_connection_operations mon_con_ops;
31 32
32static int __validate_auth(struct ceph_mon_client *monc); 33static int __validate_auth(struct ceph_mon_client *monc);
33 34
@@ -103,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103 monc->pending_auth = 1; 104 monc->pending_auth = 1;
104 monc->m_auth->front.iov_len = len; 105 monc->m_auth->front.iov_len = len;
105 monc->m_auth->hdr.front_len = cpu_to_le32(len); 106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
106 ceph_msg_get(monc->m_auth); /* keep our ref */ 108 ceph_msg_get(monc->m_auth); /* keep our ref */
107 ceph_con_send(monc->con, monc->m_auth); 109 ceph_con_send(monc->con, monc->m_auth);
108} 110}
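The ceph_con_revoke() added before ceph_con_send() here (and in __send_subscribe() below) is the idiom that makes statically allocated, reusable messages safe: if a previous copy of the same message is still queued or half-written on the connection, it is pulled back before the refreshed copy is queued. Schematically:

	ceph_con_revoke(monc->con, msg);             /* drop any stale queued copy */
	ceph_con_send(monc->con, ceph_msg_get(msg)); /* connection consumes a ref */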
@@ -186,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
186 monc->want_next_osdmap); 188 monc->want_next_osdmap);
187 if ((__sub_expired(monc) && !monc->sub_sent) || 189 if ((__sub_expired(monc) && !monc->sub_sent) ||
188 monc->want_next_osdmap == 1) { 190 monc->want_next_osdmap == 1) {
189 struct ceph_msg *msg; 191 struct ceph_msg *msg = monc->m_subscribe;
190 struct ceph_mon_subscribe_item *i; 192 struct ceph_mon_subscribe_item *i;
191 void *p, *end; 193 void *p, *end;
192 194
193 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
194 if (!msg)
195 return;
196
197 p = msg->front.iov_base; 195 p = msg->front.iov_base;
198 end = p + msg->front.iov_len; 196 end = p + msg->front_max;
199 197
200 dout("__send_subscribe to 'mdsmap' %u+\n", 198 dout("__send_subscribe to 'mdsmap' %u+\n",
201 (unsigned)monc->have_mdsmap); 199 (unsigned)monc->have_mdsmap);
@@ -225,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
225 223
226 msg->front.iov_len = p - msg->front.iov_base; 224 msg->front.iov_len = p - msg->front.iov_base;
227 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
228 ceph_con_send(monc->con, msg); 226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
229 228
230 monc->sub_sent = jiffies | 1; /* never 0 */ 229 monc->sub_sent = jiffies | 1; /* never 0 */
231 } 230 }
@@ -352,14 +351,14 @@ out:
352/* 351/*
353 * statfs 352 * statfs
354 */ 353 */
355static struct ceph_mon_statfs_request *__lookup_statfs( 354static struct ceph_mon_generic_request *__lookup_generic_req(
356 struct ceph_mon_client *monc, u64 tid) 355 struct ceph_mon_client *monc, u64 tid)
357{ 356{
358 struct ceph_mon_statfs_request *req; 357 struct ceph_mon_generic_request *req;
359 struct rb_node *n = monc->statfs_request_tree.rb_node; 358 struct rb_node *n = monc->generic_request_tree.rb_node;
360 359
361 while (n) { 360 while (n) {
362 req = rb_entry(n, struct ceph_mon_statfs_request, node); 361 req = rb_entry(n, struct ceph_mon_generic_request, node);
363 if (tid < req->tid) 362 if (tid < req->tid)
364 n = n->rb_left; 363 n = n->rb_left;
365 else if (tid > req->tid) 364 else if (tid > req->tid)
@@ -370,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
370 return NULL; 369 return NULL;
371} 370}
372 371
373static void __insert_statfs(struct ceph_mon_client *monc, 372static void __insert_generic_request(struct ceph_mon_client *monc,
374 struct ceph_mon_statfs_request *new) 373 struct ceph_mon_generic_request *new)
375{ 374{
376 struct rb_node **p = &monc->statfs_request_tree.rb_node; 375 struct rb_node **p = &monc->generic_request_tree.rb_node;
377 struct rb_node *parent = NULL; 376 struct rb_node *parent = NULL;
378 struct ceph_mon_statfs_request *req = NULL; 377 struct ceph_mon_generic_request *req = NULL;
379 378
380 while (*p) { 379 while (*p) {
381 parent = *p; 380 parent = *p;
382 req = rb_entry(parent, struct ceph_mon_statfs_request, node); 381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
383 if (new->tid < req->tid) 382 if (new->tid < req->tid)
384 p = &(*p)->rb_left; 383 p = &(*p)->rb_left;
385 else if (new->tid > req->tid) 384 else if (new->tid > req->tid)
@@ -389,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc,
389 } 388 }
390 389
391 rb_link_node(&new->node, parent, p); 390 rb_link_node(&new->node, parent, p);
392 rb_insert_color(&new->node, &monc->statfs_request_tree); 391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403}
404
405static void put_generic_request(struct ceph_mon_generic_request *req)
406{
407 kref_put(&req->kref, release_generic_request);
408}
409
410static void get_generic_request(struct ceph_mon_generic_request *req)
411{
412 kref_get(&req->kref);
413}
414
415static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
416 struct ceph_msg_header *hdr,
417 int *skip)
418{
419 struct ceph_mon_client *monc = con->private;
420 struct ceph_mon_generic_request *req;
421 u64 tid = le64_to_cpu(hdr->tid);
422 struct ceph_msg *m;
423
424 mutex_lock(&monc->mutex);
425 req = __lookup_generic_req(monc, tid);
426 if (!req) {
427 dout("get_generic_reply %lld dne\n", tid);
428 *skip = 1;
429 m = NULL;
430 } else {
431 dout("get_generic_reply %lld got %p\n", tid, req->reply);
432 m = ceph_msg_get(req->reply);
433 /*
434 * we don't need to track the connection reading into
435 * this reply because we only have one open connection
436 * at a time, ever.
437 */
438 }
439 mutex_unlock(&monc->mutex);
440 return m;
393} 441}
394 442
395static void handle_statfs_reply(struct ceph_mon_client *monc, 443static void handle_statfs_reply(struct ceph_mon_client *monc,
396 struct ceph_msg *msg) 444 struct ceph_msg *msg)
397{ 445{
398 struct ceph_mon_statfs_request *req; 446 struct ceph_mon_generic_request *req;
399 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 447 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
400 u64 tid; 448 u64 tid = le64_to_cpu(msg->hdr.tid);
401 449
402 if (msg->front.iov_len != sizeof(*reply)) 450 if (msg->front.iov_len != sizeof(*reply))
403 goto bad; 451 goto bad;
404 tid = le64_to_cpu(msg->hdr.tid);
405 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 452 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
406 453
407 mutex_lock(&monc->mutex); 454 mutex_lock(&monc->mutex);
408 req = __lookup_statfs(monc, tid); 455 req = __lookup_generic_req(monc, tid);
409 if (req) { 456 if (req) {
410 *req->buf = reply->st; 457 *(struct ceph_statfs *)req->buf = reply->st;
411 req->result = 0; 458 req->result = 0;
459 get_generic_request(req);
412 } 460 }
413 mutex_unlock(&monc->mutex); 461 mutex_unlock(&monc->mutex);
414 if (req) 462 if (req) {
415 complete(&req->completion); 463 complete(&req->completion);
464 put_generic_request(req);
465 }
416 return; 466 return;
417 467
418bad: 468bad:
419 pr_err("corrupt statfs reply, no tid\n"); 469 pr_err("corrupt generic reply, no tid\n");
420 ceph_msg_dump(msg); 470 ceph_msg_dump(msg);
421} 471}
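The get_generic_request()/put_generic_request() pair around complete() closes a use-after-free window: once the handler drops monc->mutex, the thread waiting in ceph_monc_do_statfs() can wake, erase the request from the tree, and drop its reference, so the handler holds its own reference for the duration of the completion. The shape of the pattern:

	mutex_lock(&monc->mutex);
	req = __lookup_generic_req(monc, tid);
	if (req) {
		/* fill in result fields under the mutex */
		get_generic_request(req);       /* keep req alive past unlock */
	}
	mutex_unlock(&monc->mutex);
	if (req) {
		complete(&req->completion);     /* waiter may drop its ref now */
		put_generic_request(req);
	}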
422 472
423/* 473/*
424 * (re)send a statfs request 474 * Do a synchronous statfs().
425 */ 475 */
426static int send_statfs(struct ceph_mon_client *monc, 476int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
427 struct ceph_mon_statfs_request *req)
428{ 477{
429 struct ceph_msg *msg; 478 struct ceph_mon_generic_request *req;
430 struct ceph_mon_statfs *h; 479 struct ceph_mon_statfs *h;
480 int err;
431 481
432 dout("send_statfs tid %llu\n", req->tid); 482 req = kzalloc(sizeof(*req), GFP_NOFS);
433 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); 483 if (!req)
434 if (IS_ERR(msg)) 484 return -ENOMEM;
435 return PTR_ERR(msg); 485
436 req->request = msg; 486 kref_init(&req->kref);
437 msg->hdr.tid = cpu_to_le64(req->tid); 487 req->buf = buf;
438 h = msg->front.iov_base; 488 init_completion(&req->completion);
489
490 err = -ENOMEM;
491 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
492 if (!req->request)
493 goto out;
494 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
495 if (!req->reply)
496 goto out;
497
498 /* fill out request */
499 h = req->request->front.iov_base;
439 h->monhdr.have_version = 0; 500 h->monhdr.have_version = 0;
440 h->monhdr.session_mon = cpu_to_le16(-1); 501 h->monhdr.session_mon = cpu_to_le16(-1);
441 h->monhdr.session_mon_tid = 0; 502 h->monhdr.session_mon_tid = 0;
442 h->fsid = monc->monmap->fsid; 503 h->fsid = monc->monmap->fsid;
443 ceph_con_send(monc->con, msg);
444 return 0;
445}
446
447/*
448 * Do a synchronous statfs().
449 */
450int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
451{
452 struct ceph_mon_statfs_request req;
453 int err;
454
455 req.buf = buf;
456 init_completion(&req.completion);
457
458 /* allocate memory for reply */
459 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
460 if (err)
461 return err;
462 504
463 /* register request */ 505 /* register request */
464 mutex_lock(&monc->mutex); 506 mutex_lock(&monc->mutex);
465 req.tid = ++monc->last_tid; 507 req->tid = ++monc->last_tid;
466 req.last_attempt = jiffies; 508 req->request->hdr.tid = cpu_to_le64(req->tid);
467 req.delay = BASE_DELAY_INTERVAL; 509 __insert_generic_request(monc, req);
468 __insert_statfs(monc, &req); 510 monc->num_generic_requests++;
469 monc->num_statfs_requests++;
470 mutex_unlock(&monc->mutex); 511 mutex_unlock(&monc->mutex);
471 512
472 /* send request and wait */ 513 /* send request and wait */
473 err = send_statfs(monc, &req); 514 ceph_con_send(monc->con, ceph_msg_get(req->request));
474 if (!err) 515 err = wait_for_completion_interruptible(&req->completion);
475 err = wait_for_completion_interruptible(&req.completion);
476 516
477 mutex_lock(&monc->mutex); 517 mutex_lock(&monc->mutex);
478 rb_erase(&req.node, &monc->statfs_request_tree); 518 rb_erase(&req->node, &monc->generic_request_tree);
479 monc->num_statfs_requests--; 519 monc->num_generic_requests--;
480 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
481 mutex_unlock(&monc->mutex); 520 mutex_unlock(&monc->mutex);
482 521
483 if (!err) 522 if (!err)
484 err = req.result; 523 err = req->result;
524
525out:
526 kref_put(&req->kref, release_generic_request);
485 return err; 527 return err;
486} 528}
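Put together, the rewritten ceph_monc_do_statfs() follows a preallocate-then-wait lifecycle: both the request and the reply message are allocated up front (so nothing is allocated in the receive path), the request is registered in the tid tree so get_generic_reply() can hand the preallocated buffer to the messenger, and the caller simply blocks on the completion. In outline (a sketch, with the allocation details elided):

	req = kzalloc(sizeof(*req), GFP_NOFS);
	kref_init(&req->kref);                  /* one ref held by the caller */
	req->request = ceph_msg_new(...);       /* outbound message */
	req->reply = ceph_msg_new(...);         /* preallocated inbound buffer */

	/* register under monc->mutex: assign tid, insert into the tree */
	ceph_con_send(monc->con, ceph_msg_get(req->request));
	err = wait_for_completion_interruptible(&req->completion);

	/* unregister under monc->mutex, then drop the caller's ref */
	kref_put(&req->kref, release_generic_request);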
487 529
488/* 530/*
489 * Resend pending statfs requests. 531 * Resend pending statfs requests.
490 */ 532 */
491static void __resend_statfs(struct ceph_mon_client *monc) 533static void __resend_generic_request(struct ceph_mon_client *monc)
492{ 534{
493 struct ceph_mon_statfs_request *req; 535 struct ceph_mon_generic_request *req;
494 struct rb_node *p; 536 struct rb_node *p;
495 537
496 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { 538 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
497 req = rb_entry(p, struct ceph_mon_statfs_request, node); 539 req = rb_entry(p, struct ceph_mon_generic_request, node);
498 send_statfs(monc, req); 540 ceph_con_revoke(monc->con, req->request);
541 ceph_con_send(monc->con, ceph_msg_get(req->request));
499 } 542 }
500} 543}
501 544
@@ -585,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
585 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 628 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
586 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; 629 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
587 630
588 /* msg pools */ 631 /* msgs */
589 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 632 err = -ENOMEM;
590 sizeof(struct ceph_mon_subscribe_ack), 1, false); 633 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
591 if (err < 0) 634 sizeof(struct ceph_mon_subscribe_ack),
635 GFP_NOFS);
636 if (!monc->m_subscribe_ack)
592 goto out_monmap; 637 goto out_monmap;
593 err = ceph_msgpool_init(&monc->msgpool_statfs_reply, 638
594 sizeof(struct ceph_mon_statfs_reply), 0, false); 639 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
595 if (err < 0) 640 if (!monc->m_subscribe)
596 goto out_pool1; 641 goto out_subscribe_ack;
597 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); 642
598 if (err < 0) 643 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
599 goto out_pool2; 644 if (!monc->m_auth_reply)
600 645 goto out_subscribe;
601 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); 646
647 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
602 monc->pending_auth = 0; 648 monc->pending_auth = 0;
603 if (IS_ERR(monc->m_auth)) { 649 if (!monc->m_auth)
604 err = PTR_ERR(monc->m_auth); 650 goto out_auth_reply;
605 monc->m_auth = NULL;
606 goto out_pool3;
607 }
608 651
609 monc->cur_mon = -1; 652 monc->cur_mon = -1;
610 monc->hunting = true; 653 monc->hunting = true;
@@ -612,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
612 monc->sub_sent = 0; 655 monc->sub_sent = 0;
613 656
614 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 657 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
615 monc->statfs_request_tree = RB_ROOT; 658 monc->generic_request_tree = RB_ROOT;
616 monc->num_statfs_requests = 0; 659 monc->num_generic_requests = 0;
617 monc->last_tid = 0; 660 monc->last_tid = 0;
618 661
619 monc->have_mdsmap = 0; 662 monc->have_mdsmap = 0;
@@ -621,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
621 monc->want_next_osdmap = 1; 664 monc->want_next_osdmap = 1;
622 return 0; 665 return 0;
623 666
624out_pool3: 667out_auth_reply:
625 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 668 ceph_msg_put(monc->m_auth_reply);
626out_pool2: 669out_subscribe:
627 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 670 ceph_msg_put(monc->m_subscribe);
628out_pool1: 671out_subscribe_ack:
629 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 672 ceph_msg_put(monc->m_subscribe_ack);
630out_monmap: 673out_monmap:
631 kfree(monc->monmap); 674 kfree(monc->monmap);
632out: 675out:
@@ -650,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
650 ceph_auth_destroy(monc->auth); 693 ceph_auth_destroy(monc->auth);
651 694
652 ceph_msg_put(monc->m_auth); 695 ceph_msg_put(monc->m_auth);
653 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 696 ceph_msg_put(monc->m_auth_reply);
654 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 697 ceph_msg_put(monc->m_subscribe);
655 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 698 ceph_msg_put(monc->m_subscribe_ack);
656 699
657 kfree(monc->monmap); 700 kfree(monc->monmap);
658} 701}
@@ -661,8 +704,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
661 struct ceph_msg *msg) 704 struct ceph_msg *msg)
662{ 705{
663 int ret; 706 int ret;
707 int was_auth = 0;
664 708
665 mutex_lock(&monc->mutex); 709 mutex_lock(&monc->mutex);
710 if (monc->auth->ops)
711 was_auth = monc->auth->ops->is_authenticated(monc->auth);
666 monc->pending_auth = 0; 712 monc->pending_auth = 0;
667 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 713 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
668 msg->front.iov_len, 714 msg->front.iov_len,
@@ -673,14 +719,14 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
673 wake_up(&monc->client->auth_wq); 719 wake_up(&monc->client->auth_wq);
674 } else if (ret > 0) { 720 } else if (ret > 0) {
675 __send_prepared_auth_request(monc, ret); 721 __send_prepared_auth_request(monc, ret);
676 } else if (monc->auth->ops->is_authenticated(monc->auth)) { 722 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
677 dout("authenticated, starting session\n"); 723 dout("authenticated, starting session\n");
678 724
679 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 725 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
680 monc->client->msgr->inst.name.num = monc->auth->global_id; 726 monc->client->msgr->inst.name.num = monc->auth->global_id;
681 727
682 __send_subscribe(monc); 728 __send_subscribe(monc);
683 __resend_statfs(monc); 729 __resend_generic_request(monc);
684 } 730 }
685 mutex_unlock(&monc->mutex); 731 mutex_unlock(&monc->mutex);
686} 732}
@@ -769,18 +815,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
769 815
770 switch (type) { 816 switch (type) {
771 case CEPH_MSG_MON_SUBSCRIBE_ACK: 817 case CEPH_MSG_MON_SUBSCRIBE_ACK:
772 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); 818 m = ceph_msg_get(monc->m_subscribe_ack);
773 break; 819 break;
774 case CEPH_MSG_STATFS_REPLY: 820 case CEPH_MSG_STATFS_REPLY:
775 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); 821 return get_generic_reply(con, hdr, skip);
776 break;
777 case CEPH_MSG_AUTH_REPLY: 822 case CEPH_MSG_AUTH_REPLY:
778 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); 823 m = ceph_msg_get(monc->m_auth_reply);
779 break; 824 break;
780 case CEPH_MSG_MON_MAP: 825 case CEPH_MSG_MON_MAP:
781 case CEPH_MSG_MDS_MAP: 826 case CEPH_MSG_MDS_MAP:
782 case CEPH_MSG_OSD_MAP: 827 case CEPH_MSG_OSD_MAP:
783 m = ceph_msg_new(type, front_len, 0, 0, NULL); 828 m = ceph_msg_new(type, front_len, GFP_NOFS);
784 break; 829 break;
785 } 830 }
786 831
@@ -825,7 +870,7 @@ out:
825 mutex_unlock(&monc->mutex); 870 mutex_unlock(&monc->mutex);
826} 871}
827 872
828const static struct ceph_connection_operations mon_con_ops = { 873static const struct ceph_connection_operations mon_con_ops = {
829 .get = ceph_con_get, 874 .get = ceph_con_get,
830 .put = ceph_con_put, 875 .put = ceph_con_put,
831 .dispatch = dispatch, 876 .dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa06..174d794321d0 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
2#define _FS_CEPH_MON_CLIENT_H 2#define _FS_CEPH_MON_CLIENT_H
3 3
4#include <linux/completion.h> 4#include <linux/completion.h>
5#include <linux/kref.h>
5#include <linux/rbtree.h> 6#include <linux/rbtree.h>
6 7
7#include "messenger.h" 8#include "messenger.h"
8#include "msgpool.h"
9 9
10struct ceph_client; 10struct ceph_client;
11struct ceph_mount_args; 11struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
22}; 22};
23 23
24struct ceph_mon_client; 24struct ceph_mon_client;
25struct ceph_mon_statfs_request; 25struct ceph_mon_generic_request;
26 26
27 27
28/* 28/*
@@ -40,17 +40,19 @@ struct ceph_mon_request {
40}; 40};
41 41
42/* 42/*
43 * statfs() is done a bit differently because we need to get data back 43 * ceph_mon_generic_request is being used for the statfs and poolop requests
	 	 44	 * which are being done a bit differently because we need to get data back
44 * to the caller 45 * to the caller
45 */ 46 */
46struct ceph_mon_statfs_request { 47struct ceph_mon_generic_request {
48 struct kref kref;
47 u64 tid; 49 u64 tid;
48 struct rb_node node; 50 struct rb_node node;
49 int result; 51 int result;
50 struct ceph_statfs *buf; 52 void *buf;
51 struct completion completion; 53 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */ 54 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */
54}; 56};
55 57
56struct ceph_mon_client { 58struct ceph_mon_client {
@@ -61,7 +63,7 @@ struct ceph_mon_client {
61 struct delayed_work delayed_work; 63 struct delayed_work delayed_work;
62 64
63 struct ceph_auth_client *auth; 65 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth; 66 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
65 int pending_auth; 67 int pending_auth;
66 68
67 bool hunting; 69 bool hunting;
@@ -70,14 +72,9 @@ struct ceph_mon_client {
70 struct ceph_connection *con; 72 struct ceph_connection *con;
71 bool have_fsid; 73 bool have_fsid;
72 74
73 /* msg pools */ 75 /* pending generic requests */
74 struct ceph_msgpool msgpool_subscribe_ack; 76 struct rb_root generic_request_tree;
75 struct ceph_msgpool msgpool_statfs_reply; 77 int num_generic_requests;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid; 78 u64 last_tid;
82 79
83 /* mds/osd map */ 80 /* mds/osd map */
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2d..dd65a6438131 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
7 7
8#include "msgpool.h" 8#include "msgpool.h"
9 9
10/* 10static void *alloc_fn(gfp_t gfp_mask, void *arg)
@@ -11,176 +11,54 @@
- * We use msg pools to preallocate memory for messages we expect to
- * receive over the wire, to avoid getting ourselves into OOM
- * conditions at unexpected times.  We take use a few different
- * strategies:
- *
- * - for request/response type interactions, we preallocate the
- *   memory needed for the response when we generate the request.
- *
- * - for messages we can receive at any time from the MDS, we preallocate
- *   a pool of messages we can re-use.
- *
- * - for writeback, we preallocate some number of messages to use for
- *   requests and their replies, so that we always make forward
- *   progress.
- *
- * The msgpool behaves like a mempool_t, but keeps preallocated
- * ceph_msgs strung together on a list_head instead of using a pointer
- * vector.  This avoids vector reallocation when we adjust the number
- * of preallocated items (which happens frequently).
- */
-
-
-/*
- * Allocate or release as necessary to meet our target pool size.
- */
-static int __fill_msgpool(struct ceph_msgpool *pool)
-{
-	struct ceph_msg *msg;
-
-	while (pool->num < pool->min) {
-		dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
-		     pool->min);
-		spin_unlock(&pool->lock);
-		msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
-		spin_lock(&pool->lock);
-		if (IS_ERR(msg))
-			return PTR_ERR(msg);
-		msg->pool = pool;
-		list_add(&msg->list_head, &pool->msgs);
-		pool->num++;
-	}
-	while (pool->num > pool->min) {
-		msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
-		dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
-		     pool->min, msg);
-		list_del_init(&msg->list_head);
-		pool->num--;
-		ceph_msg_kfree(msg);
-	}
-	return 0;
-}
-
-int ceph_msgpool_init(struct ceph_msgpool *pool,
-		      int front_len, int min, bool blocking)
-{
-	int ret;
-
-	dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
-	spin_lock_init(&pool->lock);
-	pool->front_len = front_len;
-	INIT_LIST_HEAD(&pool->msgs);
-	pool->num = 0;
-	pool->min = min;
-	pool->blocking = blocking;
-	init_waitqueue_head(&pool->wait);
-
-	spin_lock(&pool->lock);
-	ret = __fill_msgpool(pool);
-	spin_unlock(&pool->lock);
-	return ret;
-}
-
-void ceph_msgpool_destroy(struct ceph_msgpool *pool)
-{
-	dout("msgpool_destroy %p\n", pool);
-	spin_lock(&pool->lock);
-	pool->min = 0;
-	__fill_msgpool(pool);
-	spin_unlock(&pool->lock);
-}
-
-int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
-{
-	int ret;
-
-	spin_lock(&pool->lock);
-	dout("msgpool_resv %p delta %d\n", pool, delta);
-	pool->min += delta;
-	ret = __fill_msgpool(pool);
-	spin_unlock(&pool->lock);
-	return ret;
-}
-
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
-{
-	wait_queue_t wait;
-	struct ceph_msg *msg;
-
-	if (front_len && front_len > pool->front_len) {
-		pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
-		       pool, front_len, pool->front_len);
-		WARN_ON(1);
-
-		/* try to alloc a fresh message */
-		msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-		if (!IS_ERR(msg))
-			return msg;
-	}
-
-	if (!front_len)
-		front_len = pool->front_len;
-
-	if (pool->blocking) {
-		/* mempool_t behavior; first try to alloc */
-		msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-		if (!IS_ERR(msg))
-			return msg;
-	}
-
-	while (1) {
-		spin_lock(&pool->lock);
-		if (likely(pool->num)) {
-			msg = list_entry(pool->msgs.next, struct ceph_msg,
-					 list_head);
-			list_del_init(&msg->list_head);
-			pool->num--;
-			dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
-			     pool->num, pool->min);
-			spin_unlock(&pool->lock);
-			return msg;
-		}
-		pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
-		       pool->min, pool->blocking ? "waiting" : "may fail");
-		spin_unlock(&pool->lock);
-
-		if (!pool->blocking) {
-			WARN_ON(1);
-
-			/* maybe we can allocate it now? */
-			msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-			if (!IS_ERR(msg))
-				return msg;
-
-			pr_err("msgpool_get %p empty + alloc failed\n", pool);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		init_wait(&wait);
-		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
-		schedule();
-		finish_wait(&pool->wait, &wait);
-	}
-}
-
-void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
-{
-	spin_lock(&pool->lock);
-	if (pool->num < pool->min) {
-		/* reset msg front_len; user may have changed it */
-		msg->front.iov_len = pool->front_len;
-		msg->hdr.front_len = cpu_to_le32(pool->front_len);
-
-		kref_set(&msg->kref, 1);  /* retake a single ref */
-		list_add(&msg->list_head, &pool->msgs);
-		pool->num++;
-		dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
-		     pool->num, pool->min);
-		spin_unlock(&pool->lock);
-		wake_up(&pool->wait);
-	} else {
-		dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
-		     pool->num, pool->min);
-		spin_unlock(&pool->lock);
-		ceph_msg_kfree(msg);
-	}
-}
+{
+	struct ceph_msgpool *pool = arg;
+	void *p;
+
+	p = ceph_msg_new(0, pool->front_len, gfp_mask);
+	if (!p)
+		pr_err("msgpool %s alloc failed\n", pool->name);
+	return p;
+}
+
+static void free_fn(void *element, void *arg)
+{
+	ceph_msg_put(element);
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+		      int front_len, int size, bool blocking, const char *name)
+{
+	pool->front_len = front_len;
+	pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+	if (!pool->pool)
+		return -ENOMEM;
+	pool->name = name;
+	return 0;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+	mempool_destroy(pool->pool);
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+				  int front_len)
+{
+	if (front_len > pool->front_len) {
+		pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+		       pool->name, front_len, pool->front_len);
+		WARN_ON(1);
+
+		/* try to alloc a fresh message */
+		return ceph_msg_new(0, front_len, GFP_NOFS);
+	}
+
+	return mempool_alloc(pool->pool, GFP_NOFS);
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+	/* reset msg front_len; user may have changed it */
+	msg->front.iov_len = pool->front_len;
+	msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+	kref_init(&msg->kref);  /* retake single ref */
+}
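
For reference, the rewritten msgpool above is a thin wrapper around the kernel's generic mempool_t. A minimal sketch of that pattern, with a hypothetical element type and names (not ceph code):

#include <linux/mempool.h>
#include <linux/slab.h>

struct item {				/* hypothetical pool element */
	char payload[64];
};

static void *item_alloc(gfp_t gfp_mask, void *pool_data)
{
	/* pool_data is the last argument given to mempool_create() */
	return kmalloc(sizeof(struct item), gfp_mask);
}

static void item_free(void *element, void *pool_data)
{
	kfree(element);
}

static mempool_t *item_pool;

static int item_pool_setup(void)
{
	/* keep a reserve of 10 preallocated elements */
	item_pool = mempool_create(10, item_alloc, item_free, NULL);
	return item_pool ? 0 : -ENOMEM;
}

With a sleeping gfp mask such as GFP_NOFS, mempool_alloc() first tries the underlying allocator, then the reserve, and finally waits for an element to come back via mempool_free(); that built-in behavior is what lets the new ceph_msgpool_get() shed the hand-rolled wait queue and __fill_msgpool() accounting above.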
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd720..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
1#ifndef _FS_CEPH_MSGPOOL 1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL 2#define _FS_CEPH_MSGPOOL
3 3
4#include <linux/mempool.h>
4#include "messenger.h" 5#include "messenger.h"
5 6
6/* 7/*
@@ -8,18 +9,15 @@
8 * avoid unexpected OOM conditions. 9 * avoid unexpected OOM conditions.
9 */ 10 */
10struct ceph_msgpool { 11struct ceph_msgpool {
11 spinlock_t lock; 12 const char *name;
13 mempool_t *pool;
12 int front_len; /* preallocated payload size */ 14 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17}; 15};
18 16
19extern int ceph_msgpool_init(struct ceph_msgpool *pool, 17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking); 18 int front_len, int size, bool blocking,
19 const char *name);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); 20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, 21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len); 22 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); 23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
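
Condensed from the osd_client changes later in this patch, the call pattern for the revised API looks like the sketch below (sizes illustrative, function hypothetical). Note that the blocking flag survives in the signature but the new init body never reads it; mempool_alloc() now supplies the blocking behavior.

static int msgpool_demo(void)
{
	struct ceph_msgpool pool;
	struct ceph_msg *msg;
	int err;

	err = ceph_msgpool_init(&pool, 4096, 10, true, "demo");
	if (err < 0)
		return err;

	msg = ceph_msgpool_get(&pool, 0);	/* 0 <= front_len: pool alloc */
	if (msg)
		ceph_msgpool_put(&pool, msg);	/* reset front, retake ref */

	ceph_msgpool_destroy(&pool);
	return 0;
}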
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f8..892a0298dfdf 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -50,7 +50,6 @@ struct ceph_entity_name {
50#define CEPH_ENTITY_TYPE_MDS 0x02 50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04 51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08 52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20 53#define CEPH_ENTITY_TYPE_AUTH 0x20
55 54
56#define CEPH_ENTITY_TYPE_ANY 0xFF 55#define CEPH_ENTITY_TYPE_ANY 0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
120/* 119/*
121 * message header 120 * message header
122 */ 121 */
123struct ceph_msg_header { 122struct ceph_msg_header_old {
124 __le64 seq; /* message seq# for this session */ 123 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */ 124 __le64 tid; /* transaction id */
126 __le16 type; /* message type */ 125 __le16 type; /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
138 __le32 crc; /* header crc32c */ 137 __le32 crc; /* header crc32c */
139} __attribute__ ((packed)); 138} __attribute__ ((packed));
140 139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
141#define CEPH_MSG_PRIO_LOW 64 158#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127 159#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196 160#define CEPH_MSG_PRIO_HIGH 196
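
The reworked header keeps a trailing crc32c. A sketch of how a sender could fill that field, assuming (an assumption, not shown in this patch) that the checksum covers every byte of the header before crc itself:

#include <linux/crc32c.h>
#include <linux/stddef.h>

static void msg_header_sign(struct ceph_msg_header *hdr)
{
	/* crc32c over the header up to, but not including, crc */
	hdr->crc = cpu_to_le32(crc32c(0, hdr,
				      offsetof(struct ceph_msg_header, crc)));
}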
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index dbe63db9762f..d25b4add85b4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
16#define OSD_OP_FRONT_LEN 4096 16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512 17#define OSD_OPREPLY_FRONT_LEN 512
18 18
19const static struct ceph_connection_operations osd_con_ops; 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc, 20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd); 21 struct ceph_osd *kickosd);
22 22
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
147 req = kzalloc(sizeof(*req), GFP_NOFS); 147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 } 148 }
149 if (req == NULL) 149 if (req == NULL)
150 return ERR_PTR(-ENOMEM); 150 return NULL;
151 151
152 req->r_osdc = osdc; 152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool; 153 req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else 165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); 167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (IS_ERR(msg)) { 168 if (!msg) {
169 ceph_osdc_put_request(req); 169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg)); 170 return NULL;
171 } 171 }
172 req->r_reply = msg; 172 req->r_reply = msg;
173 173
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
178 if (use_mempool) 178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else 180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); 181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (IS_ERR(msg)) { 182 if (!msg) {
183 ceph_osdc_put_request(req); 183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg)); 184 return NULL;
185 } 185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); 186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len); 187 memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd)
361{ 361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1); 363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) 364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
365 kfree(osd); 369 kfree(osd);
370 }
366} 371}
367 372
368/* 373/*
@@ -413,11 +418,22 @@ static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
413 */ 418 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 419static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{ 420{
421 struct ceph_osd_request *req;
416 int ret = 0; 422 int ret = 0;
417 423
418 dout("__reset_osd %p osd%d\n", osd, osd->o_osd); 424 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
419 if (list_empty(&osd->o_requests)) { 425 if (list_empty(&osd->o_requests)) {
420 __remove_osd(osdc, osd); 426 __remove_osd(osdc, osd);
427 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
428 &osd->o_con.peer_addr,
429 sizeof(osd->o_con.peer_addr)) == 0 &&
430 !ceph_con_opened(&osd->o_con)) {
431 dout(" osd addr hasn't changed and connection never opened,"
432 " letting msgr retry");
 433 /* touch each r_stamp for handle_timeout()'s benefit */
434 list_for_each_entry(req, &osd->o_requests, r_osd_item)
435 req->r_stamp = jiffies;
436 ret = -EAGAIN;
421 } else { 437 } else {
422 ceph_con_close(&osd->o_con); 438 ceph_con_close(&osd->o_con);
423 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); 439 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
@@ -554,7 +570,8 @@ static int __map_osds(struct ceph_osd_client *osdc,
554{ 570{
555 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; 571 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
556 struct ceph_pg pgid; 572 struct ceph_pg pgid;
557 int o = -1; 573 int acting[CEPH_PG_MAX_SIZE];
574 int o = -1, num = 0;
558 int err; 575 int err;
559 576
560 dout("map_osds %p tid %lld\n", req, req->r_tid); 577 dout("map_osds %p tid %lld\n", req, req->r_tid);
@@ -565,10 +582,16 @@ static int __map_osds(struct ceph_osd_client *osdc,
565 pgid = reqhead->layout.ol_pgid; 582 pgid = reqhead->layout.ol_pgid;
566 req->r_pgid = pgid; 583 req->r_pgid = pgid;
567 584
568 o = ceph_calc_pg_primary(osdc->osdmap, pgid); 585 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
586 if (err > 0) {
587 o = acting[0];
588 num = err;
589 }
569 590
570 if ((req->r_osd && req->r_osd->o_osd == o && 591 if ((req->r_osd && req->r_osd->o_osd == o &&
571 req->r_sent >= req->r_osd->o_incarnation) || 592 req->r_sent >= req->r_osd->o_incarnation &&
593 req->r_num_pg_osds == num &&
594 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
572 (req->r_osd == NULL && o == -1)) 595 (req->r_osd == NULL && o == -1))
573 return 0; /* no change */ 596 return 0; /* no change */
574 597
@@ -576,6 +599,10 @@ static int __map_osds(struct ceph_osd_client *osdc,
576 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, 599 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
577 req->r_osd ? req->r_osd->o_osd : -1); 600 req->r_osd ? req->r_osd->o_osd : -1);
578 601
602 /* record full pg acting set */
603 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
604 req->r_num_pg_osds = num;
605
579 if (req->r_osd) { 606 if (req->r_osd) {
580 __cancel_request(req); 607 __cancel_request(req);
581 list_del_init(&req->r_osd_item); 608 list_del_init(&req->r_osd_item);
@@ -601,7 +628,7 @@ static int __map_osds(struct ceph_osd_client *osdc,
601 __remove_osd_from_lru(req->r_osd); 628 __remove_osd_from_lru(req->r_osd);
602 list_add(&req->r_osd_item, &req->r_osd->o_requests); 629 list_add(&req->r_osd_item, &req->r_osd->o_requests);
603 } 630 }
604 err = 1; /* osd changed */ 631 err = 1; /* osd or pg changed */
605 632
606out: 633out:
607 return err; 634 return err;
@@ -633,7 +660,7 @@ static int __send_request(struct ceph_osd_client *osdc,
633 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ 660 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
634 reqhead->reassert_version = req->r_reassert_version; 661 reqhead->reassert_version = req->r_reassert_version;
635 662
636 req->r_sent_stamp = jiffies; 663 req->r_stamp = jiffies;
637 list_move_tail(&osdc->req_lru, &req->r_req_lru_item); 664 list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
638 665
639 ceph_msg_get(req->r_request); /* send consumes a ref */ 666 ceph_msg_get(req->r_request); /* send consumes a ref */
@@ -660,7 +687,7 @@ static void handle_timeout(struct work_struct *work)
660 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; 687 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
661 unsigned long keepalive = 688 unsigned long keepalive =
662 osdc->client->mount_args->osd_keepalive_timeout * HZ; 689 osdc->client->mount_args->osd_keepalive_timeout * HZ;
663 unsigned long last_sent = 0; 690 unsigned long last_stamp = 0;
664 struct rb_node *p; 691 struct rb_node *p;
665 struct list_head slow_osds; 692 struct list_head slow_osds;
666 693
@@ -693,16 +720,16 @@ static void handle_timeout(struct work_struct *work)
693 * should mark the osd as failed and we should find out about 720 * should mark the osd as failed and we should find out about
694 * it from an updated osd map. 721 * it from an updated osd map.
695 */ 722 */
696 while (!list_empty(&osdc->req_lru)) { 723 while (timeout && !list_empty(&osdc->req_lru)) {
697 req = list_entry(osdc->req_lru.next, struct ceph_osd_request, 724 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
698 r_req_lru_item); 725 r_req_lru_item);
699 726
700 if (time_before(jiffies, req->r_sent_stamp + timeout)) 727 if (time_before(jiffies, req->r_stamp + timeout))
701 break; 728 break;
702 729
703 BUG_ON(req == last_req && req->r_sent_stamp == last_sent); 730 BUG_ON(req == last_req && req->r_stamp == last_stamp);
704 last_req = req; 731 last_req = req;
705 last_sent = req->r_sent_stamp; 732 last_stamp = req->r_stamp;
706 733
707 osd = req->r_osd; 734 osd = req->r_osd;
708 BUG_ON(!osd); 735 BUG_ON(!osd);
@@ -718,7 +745,7 @@ static void handle_timeout(struct work_struct *work)
718 */ 745 */
719 INIT_LIST_HEAD(&slow_osds); 746 INIT_LIST_HEAD(&slow_osds);
720 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { 747 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
721 if (time_before(jiffies, req->r_sent_stamp + keepalive)) 748 if (time_before(jiffies, req->r_stamp + keepalive))
722 break; 749 break;
723 750
724 osd = req->r_osd; 751 osd = req->r_osd;
@@ -768,16 +795,18 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
768 struct ceph_osd_request *req; 795 struct ceph_osd_request *req;
769 u64 tid; 796 u64 tid;
770 int numops, object_len, flags; 797 int numops, object_len, flags;
798 s32 result;
771 799
772 tid = le64_to_cpu(msg->hdr.tid); 800 tid = le64_to_cpu(msg->hdr.tid);
773 if (msg->front.iov_len < sizeof(*rhead)) 801 if (msg->front.iov_len < sizeof(*rhead))
774 goto bad; 802 goto bad;
775 numops = le32_to_cpu(rhead->num_ops); 803 numops = le32_to_cpu(rhead->num_ops);
776 object_len = le32_to_cpu(rhead->object_len); 804 object_len = le32_to_cpu(rhead->object_len);
805 result = le32_to_cpu(rhead->result);
777 if (msg->front.iov_len != sizeof(*rhead) + object_len + 806 if (msg->front.iov_len != sizeof(*rhead) + object_len +
778 numops * sizeof(struct ceph_osd_op)) 807 numops * sizeof(struct ceph_osd_op))
779 goto bad; 808 goto bad;
780 dout("handle_reply %p tid %llu\n", msg, tid); 809 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
781 810
782 /* lookup */ 811 /* lookup */
783 mutex_lock(&osdc->request_mutex); 812 mutex_lock(&osdc->request_mutex);
@@ -823,7 +852,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
823 dout("handle_reply tid %llu flags %d\n", tid, flags); 852 dout("handle_reply tid %llu flags %d\n", tid, flags);
824 853
825 /* either this is a read, or we got the safe response */ 854 /* either this is a read, or we got the safe response */
826 if ((flags & CEPH_OSD_FLAG_ONDISK) || 855 if (result < 0 ||
856 (flags & CEPH_OSD_FLAG_ONDISK) ||
827 ((flags & CEPH_OSD_FLAG_WRITE) == 0)) 857 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
828 __unregister_request(osdc, req); 858 __unregister_request(osdc, req);
829 859
@@ -862,7 +892,9 @@ static int __kick_requests(struct ceph_osd_client *osdc,
862 892
863 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); 893 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
864 if (kickosd) { 894 if (kickosd) {
865 __reset_osd(osdc, kickosd); 895 err = __reset_osd(osdc, kickosd);
896 if (err == -EAGAIN)
897 return 1;
866 } else { 898 } else {
867 for (p = rb_first(&osdc->osds); p; p = n) { 899 for (p = rb_first(&osdc->osds); p; p = n) {
868 struct ceph_osd *osd = 900 struct ceph_osd *osd =
@@ -913,7 +945,7 @@ static int __kick_requests(struct ceph_osd_client *osdc,
913 945
914kick: 946kick:
915 dout("kicking %p tid %llu osd%d\n", req, req->r_tid, 947 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
916 req->r_osd->o_osd); 948 req->r_osd ? req->r_osd->o_osd : -1);
917 req->r_flags |= CEPH_OSD_FLAG_RETRY; 949 req->r_flags |= CEPH_OSD_FLAG_RETRY;
918 err = __send_request(osdc, req); 950 err = __send_request(osdc, req);
919 if (err) { 951 if (err) {
@@ -1051,6 +1083,7 @@ done:
1051 if (newmap) 1083 if (newmap)
1052 kick_requests(osdc, NULL); 1084 kick_requests(osdc, NULL);
1053 up_read(&osdc->map_sem); 1085 up_read(&osdc->map_sem);
1086 wake_up(&osdc->client->auth_wq);
1054 return; 1087 return;
1055 1088
1056bad: 1089bad:
@@ -1060,45 +1093,6 @@ bad:
1060 return; 1093 return;
1061} 1094}
1062 1095
1063
1064/*
1065 * A read request prepares specific pages that data is to be read into.
1066 * When a message is being read off the wire, we call prepare_pages to
1067 * find those pages.
1068 * 0 = success, -1 failure.
1069 */
1070static int __prepare_pages(struct ceph_connection *con,
1071 struct ceph_msg_header *hdr,
1072 struct ceph_osd_request *req,
1073 u64 tid,
1074 struct ceph_msg *m)
1075{
1076 struct ceph_osd *osd = con->private;
1077 struct ceph_osd_client *osdc;
1078 int ret = -1;
1079 int data_len = le32_to_cpu(hdr->data_len);
1080 unsigned data_off = le16_to_cpu(hdr->data_off);
1081
1082 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1083
1084 if (!osd)
1085 return -1;
1086
1087 osdc = osd->o_osdc;
1088
1089 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1090 tid, req->r_num_pages, want);
1091 if (unlikely(req->r_num_pages < want))
1092 goto out;
1093 m->pages = req->r_pages;
1094 m->nr_pages = req->r_num_pages;
1095 ret = 0; /* success */
1096out:
1097 BUG_ON(ret < 0 || m->nr_pages < want);
1098
1099 return ret;
1100}
1101
1102/* 1096/*
1103 * Register request, send initial attempt. 1097 * Register request, send initial attempt.
1104 */ 1098 */
@@ -1225,11 +1219,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1225 if (!osdc->req_mempool) 1219 if (!osdc->req_mempool)
1226 goto out; 1220 goto out;
1227 1221
1228 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); 1222 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1223 "osd_op");
1229 if (err < 0) 1224 if (err < 0)
1230 goto out_mempool; 1225 goto out_mempool;
1231 err = ceph_msgpool_init(&osdc->msgpool_op_reply, 1226 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1232 OSD_OPREPLY_FRONT_LEN, 10, true); 1227 OSD_OPREPLY_FRONT_LEN, 10, true,
1228 "osd_op_reply");
1233 if (err < 0) 1229 if (err < 0)
1234 goto out_msgpool; 1230 goto out_msgpool;
1235 return 0; 1231 return 0;
@@ -1275,8 +1271,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1275 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1271 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1276 NULL, 0, truncate_seq, truncate_size, NULL, 1272 NULL, 0, truncate_seq, truncate_size, NULL,
1277 false, 1); 1273 false, 1);
1278 if (IS_ERR(req)) 1274 if (!req)
1279 return PTR_ERR(req); 1275 return -ENOMEM;
1280 1276
1281 /* it may be a short read due to an object boundary */ 1277 /* it may be a short read due to an object boundary */
1282 req->r_pages = pages; 1278 req->r_pages = pages;
@@ -1318,8 +1314,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1318 snapc, do_sync, 1314 snapc, do_sync,
1319 truncate_seq, truncate_size, mtime, 1315 truncate_seq, truncate_size, mtime,
1320 nofail, 1); 1316 nofail, 1);
1321 if (IS_ERR(req)) 1317 if (!req)
1322 return PTR_ERR(req); 1318 return -ENOMEM;
1323 1319
1324 /* it may be a short write due to an object boundary */ 1320 /* it may be a short write due to an object boundary */
1325 req->r_pages = pages; 1321 req->r_pages = pages;
@@ -1367,7 +1363,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1367} 1363}
1368 1364
1369/* 1365/*
1370 * lookup and return message for incoming reply 1366 * lookup and return message for incoming reply. set up reply message
1367 * pages.
1371 */ 1368 */
1372static struct ceph_msg *get_reply(struct ceph_connection *con, 1369static struct ceph_msg *get_reply(struct ceph_connection *con,
1373 struct ceph_msg_header *hdr, 1370 struct ceph_msg_header *hdr,
@@ -1380,7 +1377,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1380 int front = le32_to_cpu(hdr->front_len); 1377 int front = le32_to_cpu(hdr->front_len);
1381 int data_len = le32_to_cpu(hdr->data_len); 1378 int data_len = le32_to_cpu(hdr->data_len);
1382 u64 tid; 1379 u64 tid;
1383 int err;
1384 1380
1385 tid = le64_to_cpu(hdr->tid); 1381 tid = le64_to_cpu(hdr->tid);
1386 mutex_lock(&osdc->request_mutex); 1382 mutex_lock(&osdc->request_mutex);
@@ -1398,13 +1394,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1398 req->r_reply, req->r_con_filling_msg); 1394 req->r_reply, req->r_con_filling_msg);
1399 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); 1395 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1400 ceph_con_put(req->r_con_filling_msg); 1396 ceph_con_put(req->r_con_filling_msg);
1397 req->r_con_filling_msg = NULL;
1401 } 1398 }
1402 1399
1403 if (front > req->r_reply->front.iov_len) { 1400 if (front > req->r_reply->front.iov_len) {
1404 pr_warning("get_reply front %d > preallocated %d\n", 1401 pr_warning("get_reply front %d > preallocated %d\n",
1405 front, (int)req->r_reply->front.iov_len); 1402 front, (int)req->r_reply->front.iov_len);
1406 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); 1403 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1407 if (IS_ERR(m)) 1404 if (!m)
1408 goto out; 1405 goto out;
1409 ceph_msg_put(req->r_reply); 1406 ceph_msg_put(req->r_reply);
1410 req->r_reply = m; 1407 req->r_reply = m;
@@ -1412,12 +1409,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1412 m = ceph_msg_get(req->r_reply); 1409 m = ceph_msg_get(req->r_reply);
1413 1410
1414 if (data_len > 0) { 1411 if (data_len > 0) {
1415 err = __prepare_pages(con, hdr, req, tid, m); 1412 unsigned data_off = le16_to_cpu(hdr->data_off);
1416 if (err < 0) { 1413 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1414
1415 if (unlikely(req->r_num_pages < want)) {
1416 pr_warning("tid %lld reply %d > expected %d pages\n",
1417 tid, want, m->nr_pages);
1417 *skip = 1; 1418 *skip = 1;
1418 ceph_msg_put(m); 1419 ceph_msg_put(m);
1419 m = ERR_PTR(err); 1420 m = NULL;
1421 goto out;
1420 } 1422 }
1423 m->pages = req->r_pages;
1424 m->nr_pages = req->r_num_pages;
1421 } 1425 }
1422 *skip = 0; 1426 *skip = 0;
1423 req->r_con_filling_msg = ceph_con_get(con); 1427 req->r_con_filling_msg = ceph_con_get(con);
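
calc_pages_for(), used above to size the receive, turns an in-page offset plus byte length into a page count. Its definition is outside this patch; the sketch below is an assumed equivalent. For example, off = 512 and len = 8192 with 4k pages touches pages 0..2, so want = 3:

#include <linux/mm.h>

/* pages touched by an extent of 'len' bytes starting 'off' into a page */
static int pages_for(unsigned int off, unsigned int len)
{
	return ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
	       (off >> PAGE_SHIFT);
}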
@@ -1439,7 +1443,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1439 1443
1440 switch (type) { 1444 switch (type) {
1441 case CEPH_MSG_OSD_MAP: 1445 case CEPH_MSG_OSD_MAP:
1442 return ceph_msg_new(type, front, 0, 0, NULL); 1446 return ceph_msg_new(type, front, GFP_NOFS);
1443 case CEPH_MSG_OSD_OPREPLY: 1447 case CEPH_MSG_OSD_OPREPLY:
1444 return get_reply(con, hdr, skip); 1448 return get_reply(con, hdr, skip);
1445 default: 1449 default:
@@ -1525,7 +1529,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
1525 return ceph_monc_validate_auth(&osdc->client->monc); 1529 return ceph_monc_validate_auth(&osdc->client->monc);
1526} 1530}
1527 1531
1528const static struct ceph_connection_operations osd_con_ops = { 1532static const struct ceph_connection_operations osd_con_ops = {
1529 .get = get_osd_con, 1533 .get = get_osd_con,
1530 .put = put_osd_con, 1534 .put = put_osd_con,
1531 .dispatch = dispatch, 1535 .dispatch = dispatch,
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 1b1a3ca43afc..ce776989ef6a 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -48,6 +48,8 @@ struct ceph_osd_request {
48 struct list_head r_osd_item; 48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd; 49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid; 50 struct ceph_pg r_pgid;
51 int r_pg_osds[CEPH_PG_MAX_SIZE];
52 int r_num_pg_osds;
51 53
52 struct ceph_connection *r_con_filling_msg; 54 struct ceph_connection *r_con_filling_msg;
53 55
@@ -66,11 +68,10 @@ struct ceph_osd_request {
66 struct list_head r_unsafe_item; 68 struct list_head r_unsafe_item;
67 69
68 struct inode *r_inode; /* for use by callbacks */ 70 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70 71
71 char r_oid[40]; /* object name */ 72 char r_oid[40]; /* object name */
72 int r_oid_len; 73 int r_oid_len;
73 unsigned long r_sent_stamp; 74 unsigned long r_stamp; /* send OR check time */
74 bool r_resend; /* msg send failed, needs retry */ 75 bool r_resend; /* msg send failed, needs retry */
75 76
76 struct ceph_file_layout r_file_layout; 77 struct ceph_file_layout r_file_layout;
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index b83f2692b835..ddc656fb5c05 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -1,4 +1,7 @@
1 1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
2#include <asm/div64.h> 5#include <asm/div64.h>
3 6
4#include "super.h" 7#include "super.h"
@@ -6,7 +9,6 @@
6#include "crush/hash.h" 9#include "crush/hash.h"
7#include "crush/mapper.h" 10#include "crush/mapper.h"
8#include "decode.h" 11#include "decode.h"
9#include "ceph_debug.h"
10 12
11char *ceph_osdmap_state_str(char *str, int len, int state) 13char *ceph_osdmap_state_str(char *str, int len, int state)
12{ 14{
@@ -312,71 +314,6 @@ bad:
312 return ERR_PTR(err); 314 return ERR_PTR(err);
313} 315}
314 316
315
316/*
317 * osd map
318 */
319void ceph_osdmap_destroy(struct ceph_osdmap *map)
320{
321 dout("osdmap_destroy %p\n", map);
322 if (map->crush)
323 crush_destroy(map->crush);
324 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
325 struct ceph_pg_mapping *pg =
326 rb_entry(rb_first(&map->pg_temp),
327 struct ceph_pg_mapping, node);
328 rb_erase(&pg->node, &map->pg_temp);
329 kfree(pg);
330 }
331 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
332 struct ceph_pg_pool_info *pi =
333 rb_entry(rb_first(&map->pg_pools),
334 struct ceph_pg_pool_info, node);
335 rb_erase(&pi->node, &map->pg_pools);
336 kfree(pi);
337 }
338 kfree(map->osd_state);
339 kfree(map->osd_weight);
340 kfree(map->osd_addr);
341 kfree(map);
342}
343
344/*
345 * adjust max osd value. reallocate arrays.
346 */
347static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
348{
349 u8 *state;
350 struct ceph_entity_addr *addr;
351 u32 *weight;
352
353 state = kcalloc(max, sizeof(*state), GFP_NOFS);
354 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
355 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
356 if (state == NULL || addr == NULL || weight == NULL) {
357 kfree(state);
358 kfree(addr);
359 kfree(weight);
360 return -ENOMEM;
361 }
362
363 /* copy old? */
364 if (map->osd_state) {
365 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
366 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
367 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
368 kfree(map->osd_state);
369 kfree(map->osd_addr);
370 kfree(map->osd_weight);
371 }
372
373 map->osd_state = state;
374 map->osd_weight = weight;
375 map->osd_addr = addr;
376 map->max_osd = max;
377 return 0;
378}
379
380/* 317/*
381 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid 318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
382 * to a set of osds) 319 * to a set of osds)
@@ -480,6 +417,113 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
480 return NULL; 417 return NULL;
481} 418}
482 419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
428{
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
433}
434
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
436{
437 struct ceph_pg_pool_info *pi;
438 u32 num, len, pool;
439
440 ceph_decode_32_safe(p, end, num, bad);
441 dout(" %d pool names\n", num);
442 while (num--) {
443 ceph_decode_32_safe(p, end, pool, bad);
444 ceph_decode_32_safe(p, end, len, bad);
445 dout(" pool %d len %d\n", pool, len);
446 pi = __lookup_pg_pool(&map->pg_pools, pool);
447 if (pi) {
448 kfree(pi->name);
449 pi->name = kmalloc(len + 1, GFP_NOFS);
450 if (pi->name) {
451 memcpy(pi->name, *p, len);
452 pi->name[len] = '\0';
453 dout(" name is %s\n", pi->name);
454 }
455 }
456 *p += len;
457 }
458 return 0;
459
460bad:
461 return -EINVAL;
462}
463
464/*
465 * osd map
466 */
467void ceph_osdmap_destroy(struct ceph_osdmap *map)
468{
469 dout("osdmap_destroy %p\n", map);
470 if (map->crush)
471 crush_destroy(map->crush);
472 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
473 struct ceph_pg_mapping *pg =
474 rb_entry(rb_first(&map->pg_temp),
475 struct ceph_pg_mapping, node);
476 rb_erase(&pg->node, &map->pg_temp);
477 kfree(pg);
478 }
479 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
480 struct ceph_pg_pool_info *pi =
481 rb_entry(rb_first(&map->pg_pools),
482 struct ceph_pg_pool_info, node);
483 __remove_pg_pool(&map->pg_pools, pi);
484 }
485 kfree(map->osd_state);
486 kfree(map->osd_weight);
487 kfree(map->osd_addr);
488 kfree(map);
489}
490
491/*
492 * adjust max osd value. reallocate arrays.
493 */
494static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
495{
496 u8 *state;
497 struct ceph_entity_addr *addr;
498 u32 *weight;
499
500 state = kcalloc(max, sizeof(*state), GFP_NOFS);
501 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
502 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
503 if (state == NULL || addr == NULL || weight == NULL) {
504 kfree(state);
505 kfree(addr);
506 kfree(weight);
507 return -ENOMEM;
508 }
509
510 /* copy old? */
511 if (map->osd_state) {
512 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
513 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
514 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
515 kfree(map->osd_state);
516 kfree(map->osd_addr);
517 kfree(map->osd_weight);
518 }
519
520 map->osd_state = state;
521 map->osd_weight = weight;
522 map->osd_addr = addr;
523 map->max_osd = max;
524 return 0;
525}
526
483/* 527/*
484 * decode a full map. 528 * decode a full map.
485 */ 529 */
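
Read back from __decode_pool_names() in the hunk above, the pool-name blob it parses appears to be laid out as follows (inferred from the decoder; integers little-endian on the wire):

/*
 * u32 num;			number of (pool, name) entries
 * repeated num times:
 *	u32 pool;		pool id
 *	u32 len;		name length in bytes
 *	char name[len];		raw bytes, not NUL-terminated
 */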
@@ -516,7 +560,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
516 ceph_decode_32_safe(p, end, max, bad); 560 ceph_decode_32_safe(p, end, max, bad);
517 while (max--) { 561 while (max--) {
518 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); 562 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
519 pi = kmalloc(sizeof(*pi), GFP_NOFS); 563 pi = kzalloc(sizeof(*pi), GFP_NOFS);
520 if (!pi) 564 if (!pi)
521 goto bad; 565 goto bad;
522 pi->id = ceph_decode_32(p); 566 pi->id = ceph_decode_32(p);
@@ -526,13 +570,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
526 ev, CEPH_PG_POOL_VERSION); 570 ev, CEPH_PG_POOL_VERSION);
527 goto bad; 571 goto bad;
528 } 572 }
529 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 573 __decode_pool(p, pi);
530 __insert_pg_pool(&map->pg_pools, pi); 574 __insert_pg_pool(&map->pg_pools, pi);
531 calc_pg_masks(pi);
532 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
533 *p += le32_to_cpu(pi->v.num_removed_snap_intervals)
534 * sizeof(u64) * 2;
535 } 575 }
576
577 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
578 goto bad;
579
536 ceph_decode_32_safe(p, end, map->pool_max, bad); 580 ceph_decode_32_safe(p, end, map->pool_max, bad);
537 581
538 ceph_decode_32_safe(p, end, map->flags, bad); 582 ceph_decode_32_safe(p, end, map->flags, bad);
@@ -662,7 +706,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
662 len, *p, end); 706 len, *p, end);
663 newcrush = crush_decode(*p, min(*p+len, end)); 707 newcrush = crush_decode(*p, min(*p+len, end));
664 if (IS_ERR(newcrush)) 708 if (IS_ERR(newcrush))
665 return ERR_PTR(PTR_ERR(newcrush)); 709 return ERR_CAST(newcrush);
666 } 710 }
667 711
668 /* new flags? */ 712 /* new flags? */
@@ -706,7 +750,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
706 } 750 }
707 pi = __lookup_pg_pool(&map->pg_pools, pool); 751 pi = __lookup_pg_pool(&map->pg_pools, pool);
708 if (!pi) { 752 if (!pi) {
709 pi = kmalloc(sizeof(*pi), GFP_NOFS); 753 pi = kzalloc(sizeof(*pi), GFP_NOFS);
710 if (!pi) { 754 if (!pi) {
711 err = -ENOMEM; 755 err = -ENOMEM;
712 goto bad; 756 goto bad;
@@ -714,9 +758,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
714 pi->id = pool; 758 pi->id = pool;
715 __insert_pg_pool(&map->pg_pools, pi); 759 __insert_pg_pool(&map->pg_pools, pi);
716 } 760 }
717 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 761 __decode_pool(p, pi);
718 calc_pg_masks(pi);
719 } 762 }
763 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
764 goto bad;
720 765
721 /* old_pool */ 766 /* old_pool */
722 ceph_decode_32_safe(p, end, len, bad); 767 ceph_decode_32_safe(p, end, len, bad);
@@ -725,10 +770,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
725 770
726 ceph_decode_32_safe(p, end, pool, bad); 771 ceph_decode_32_safe(p, end, pool, bad);
727 pi = __lookup_pg_pool(&map->pg_pools, pool); 772 pi = __lookup_pg_pool(&map->pg_pools, pool);
728 if (pi) { 773 if (pi)
729 rb_erase(&pi->node, &map->pg_pools); 774 __remove_pg_pool(&map->pg_pools, pi);
730 kfree(pi);
731 }
732 } 775 }
733 776
734 /* new_up */ 777 /* new_up */
@@ -998,12 +1041,33 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
998} 1041}
999 1042
1000/* 1043/*
1044 * Return acting set for given pgid.
1045 */
1046int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1047 int *acting)
1048{
1049 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1050 int i, o, num = CEPH_PG_MAX_SIZE;
1051
1052 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1053 if (!osds)
1054 return -1;
1055
1056 /* primary is first up osd */
1057 o = 0;
1058 for (i = 0; i < num; i++)
1059 if (ceph_osd_is_up(osdmap, osds[i]))
1060 acting[o++] = osds[i];
1061 return o;
1062}
1063
1064/*
1001 * Return primary osd for given pgid, or -1 if none. 1065 * Return primary osd for given pgid, or -1 if none.
1002 */ 1066 */
1003int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) 1067int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1004{ 1068{
1005 int rawosds[10], *osds; 1069 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1006 int i, num = ARRAY_SIZE(rawosds); 1070 int i, num = CEPH_PG_MAX_SIZE;
1007 1071
1008 osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1072 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1009 if (!osds) 1073 if (!osds)
@@ -1011,9 +1075,7 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1011 1075
1012 /* primary is first up osd */ 1076 /* primary is first up osd */
1013 for (i = 0; i < num; i++) 1077 for (i = 0; i < num; i++)
1014 if (ceph_osd_is_up(osdmap, osds[i])) { 1078 if (ceph_osd_is_up(osdmap, osds[i]))
1015 return osds[i]; 1079 return osds[i];
1016 break;
1017 }
1018 return -1; 1080 return -1;
1019} 1081}
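
ceph_calc_pg_acting() is what the __map_osds() hunk in osd_client.c above consumes; a condensed sketch of the call pattern (hypothetical wrapper):

static int pg_primary_of(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
{
	int acting[CEPH_PG_MAX_SIZE];
	int num = ceph_calc_pg_acting(osdmap, pgid, acting);

	/* num <= 0: CRUSH lookup failed or no osd in the set is up */
	return num > 0 ? acting[0] : -1;
}

Recording the whole set in r_pg_osds/r_num_pg_osds, rather than just the primary, is what lets __map_osds() notice when any replica in the acting set changes and resend accordingly.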
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index 1fb55afb2642..970b547e510d 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -23,6 +23,7 @@ struct ceph_pg_pool_info {
23 int id; 23 int id;
24 struct ceph_pg_pool v; 24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; 25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
26}; 27};
27 28
28struct ceph_pg_mapping { 29struct ceph_pg_mapping {
@@ -119,6 +120,8 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
119 const char *oid, 120 const char *oid,
120 struct ceph_file_layout *fl, 121 struct ceph_file_layout *fl,
121 struct ceph_osdmap *osdmap); 122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
122extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
123 struct ceph_pg pgid); 126 struct ceph_pg pgid);
124 127
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 370e93695474..b6859f47d364 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -1,4 +1,5 @@
1 1
2#include <linux/gfp.h>
2#include <linux/pagemap.h> 3#include <linux/pagemap.h>
3#include <linux/highmem.h> 4#include <linux/highmem.h>
4 5
@@ -19,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
19 20
20static int ceph_pagelist_addpage(struct ceph_pagelist *pl) 21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
21{ 22{
22 struct page *page = alloc_page(GFP_NOFS); 23 struct page *page = __page_cache_alloc(GFP_NOFS);
23 if (!page) 24 if (!page)
24 return -ENOMEM; 25 return -ENOMEM;
25 pl->room += PAGE_SIZE; 26 pl->room += PAGE_SIZE;
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 26ac8b89a676..8fcc023056c7 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -11,8 +11,10 @@
11/* 11/*
12 * osdmap encoding versions 12 * osdmap encoding versions
13 */ 13 */
14#define CEPH_OSDMAP_INC_VERSION 4 14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_VERSION 4 15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
16 18
17/* 19/*
18 * fs id 20 * fs id
@@ -56,6 +58,7 @@ struct ceph_timespec {
56#define CEPH_PG_LAYOUT_LINEAR 2 58#define CEPH_PG_LAYOUT_LINEAR 2
57#define CEPH_PG_LAYOUT_HYBRID 3 59#define CEPH_PG_LAYOUT_HYBRID 3
58 60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
59 62
60/* 63/*
61 * placement group. 64 * placement group.
@@ -98,8 +101,8 @@ struct ceph_pg_pool {
98 __le64 snap_seq; /* seq for per-pool snapshot */ 101 __le64 snap_seq; /* seq for per-pool snapshot */
99 __le32 snap_epoch; /* epoch of last snap */ 102 __le32 snap_epoch; /* epoch of last snap */
100 __le32 num_snaps; 103 __le32 num_snaps;
101 __le32 num_removed_snap_intervals; 104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
102 __le64 uid; 105 __le64 auid; /* who owns the pg */
103} __attribute__ ((packed)); 106} __attribute__ ((packed));
104 107
105/* 108/*
@@ -205,6 +208,7 @@ enum {
205 /* read */ 208 /* read */
206 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
207 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, 210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
211 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
208 212
209 /* write */ 213 /* write */
210 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, 214 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -302,6 +306,22 @@ enum {
302#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 306#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
303#define EBLACKLISTED ESHUTDOWN /* blacklisted */ 307#define EBLACKLISTED ESHUTDOWN /* blacklisted */
304 308
309/* xattr comparison */
310enum {
311 CEPH_OSD_CMPXATTR_OP_NOP = 0,
312 CEPH_OSD_CMPXATTR_OP_EQ = 1,
313 CEPH_OSD_CMPXATTR_OP_NE = 2,
314 CEPH_OSD_CMPXATTR_OP_GT = 3,
315 CEPH_OSD_CMPXATTR_OP_GTE = 4,
316 CEPH_OSD_CMPXATTR_OP_LT = 5,
317 CEPH_OSD_CMPXATTR_OP_LTE = 6
318};
319
320enum {
321 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
322 CEPH_OSD_CMPXATTR_MODE_U64 = 2
323};
324
305/* 325/*
306 * an individual object operation. each may be accompanied by some data 326 * an individual object operation. each may be accompanied by some data
307 * payload 327 * payload
@@ -318,6 +338,8 @@ struct ceph_osd_op {
318 struct { 338 struct {
319 __le32 name_len; 339 __le32 name_len;
320 __le32 value_len; 340 __le32 value_len;
341 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
342 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
321 } __attribute__ ((packed)) xattr; 343 } __attribute__ ((packed)) xattr;
322 struct { 344 struct {
323 __u8 class_len; 345 __u8 class_len;
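
A sketch of how a client might fill the extended xattr union for the new compare op (hypothetical helper; only the field layout comes from the struct above, and the name/value bytes themselves would travel in the op's data payload):

#include <linux/string.h>

static void prep_cmpxattr_eq(struct ceph_osd_op *op,
			     const char *name, const char *value)
{
	op->op = cpu_to_le16(CEPH_OSD_OP_CMPXATTR);
	op->xattr.name_len = cpu_to_le32(strlen(name));
	op->xattr.value_len = cpu_to_le32(strlen(value));
	op->xattr.cmp_op = CEPH_OSD_CMPXATTR_OP_EQ;	/* __u8: no swab */
	op->xattr.cmp_mode = CEPH_OSD_CMPXATTR_MODE_STRING;
}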
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index bf2a5f3846a4..c0b26b6badba 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,6 +1,7 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/sort.h> 3#include <linux/sort.h>
4#include <linux/slab.h>
4 5
5#include "super.h" 6#include "super.h"
6#include "decode.h" 7#include "decode.h"
@@ -314,9 +315,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
314 because we rebuild_snap_realms() works _downward_ in 315 because we rebuild_snap_realms() works _downward_ in
315 hierarchy after each update.) */ 316 hierarchy after each update.) */
316 if (realm->cached_context && 317 if (realm->cached_context &&
317 realm->cached_context->seq <= realm->seq && 318 realm->cached_context->seq == realm->seq &&
318 (!parent || 319 (!parent ||
319 realm->cached_context->seq <= parent->cached_context->seq)) { 320 realm->cached_context->seq >= parent->cached_context->seq)) {
320 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" 321 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
321 " (unchanged)\n", 322 " (unchanged)\n",
322 realm->ino, realm, realm->cached_context, 323 realm->ino, realm, realm->cached_context,
@@ -430,8 +431,7 @@ static int dup_array(u64 **dst, __le64 *src, int num)
430 * Caller must hold snap_rwsem for read (i.e., the realm topology won't 431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
431 * change). 432 * change).
432 */ 433 */
433void ceph_queue_cap_snap(struct ceph_inode_info *ci, 434void ceph_queue_cap_snap(struct ceph_inode_info *ci)
434 struct ceph_snap_context *snapc)
435{ 435{
436 struct inode *inode = &ci->vfs_inode; 436 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap; 437 struct ceph_cap_snap *capsnap;
@@ -450,10 +450,11 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
450 as no new writes are allowed to start when pending, so any 450 as no new writes are allowed to start when pending, so any
451 writes in progress now were started before the previous 451 writes in progress now were started before the previous
452 cap_snap. lucky us. */ 452 cap_snap. lucky us. */
453 dout("queue_cap_snap %p snapc %p seq %llu used %d" 453 dout("queue_cap_snap %p already pending\n", inode);
454 " already pending\n", inode, snapc, snapc->seq, used);
455 kfree(capsnap); 454 kfree(capsnap);
456 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { 455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc;
457
457 igrab(inode); 458 igrab(inode);
458 459
459 atomic_set(&capsnap->nref, 1); 460 atomic_set(&capsnap->nref, 1);
@@ -462,7 +463,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
462 INIT_LIST_HEAD(&capsnap->flushing_item); 463 INIT_LIST_HEAD(&capsnap->flushing_item);
463 464
464 capsnap->follows = snapc->seq - 1; 465 capsnap->follows = snapc->seq - 1;
465 capsnap->context = ceph_get_snap_context(snapc);
466 capsnap->issued = __ceph_caps_issued(ci, NULL); 466 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci); 467 capsnap->dirty = __ceph_caps_dirty(ci);
468 468
@@ -479,7 +479,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
479 snapshot. */ 479 snapshot. */
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head; 480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0; 481 ci->i_wrbuffer_ref_head = 0;
482 ceph_put_snap_context(ci->i_head_snapc); 482 capsnap->context = snapc;
483 ci->i_head_snapc = NULL; 483 ci->i_head_snapc = NULL;
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485 485
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap) 512 struct ceph_cap_snap *capsnap)
513{ 513{
514 struct inode *inode = &ci->vfs_inode; 514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 515 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
516 516
517 BUG_ON(capsnap->writing); 517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size; 518 capsnap->size = inode->i_size;
@@ -521,15 +521,17 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
521 capsnap->ctime = inode->i_ctime; 521 capsnap->ctime = inode->i_ctime;
522 capsnap->time_warp_seq = ci->i_time_warp_seq; 522 capsnap->time_warp_seq = ci->i_time_warp_seq;
523 if (capsnap->dirty_pages) { 523 if (capsnap->dirty_pages) {
524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu " 524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
525 "still has %d dirty pages\n", inode, capsnap, 525 "still has %d dirty pages\n", inode, capsnap,
526 capsnap->context, capsnap->context->seq, 526 capsnap->context, capsnap->context->seq,
527 capsnap->size, capsnap->dirty_pages); 527 ceph_cap_string(capsnap->dirty), capsnap->size,
528 capsnap->dirty_pages);
528 return 0; 529 return 0;
529 } 530 }
530 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n", 531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
531 inode, capsnap, capsnap->context, 532 inode, capsnap, capsnap->context,
532 capsnap->context->seq, capsnap->size); 533 capsnap->context->seq, ceph_cap_string(capsnap->dirty),
534 capsnap->size);
533 535
534 spin_lock(&mdsc->snap_flush_lock); 536 spin_lock(&mdsc->snap_flush_lock);
535 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); 537 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
@@ -601,7 +603,7 @@ more:
601 if (lastinode) 603 if (lastinode)
602 iput(lastinode); 604 iput(lastinode);
603 lastinode = inode; 605 lastinode = inode;
604 ceph_queue_cap_snap(ci, realm->cached_context); 606 ceph_queue_cap_snap(ci);
605 spin_lock(&realm->inodes_with_caps_lock); 607 spin_lock(&realm->inodes_with_caps_lock);
606 } 608 }
607 spin_unlock(&realm->inodes_with_caps_lock); 609 spin_unlock(&realm->inodes_with_caps_lock);
@@ -818,11 +820,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
818 * queued (again) by ceph_update_snap_trace() 820 * queued (again) by ceph_update_snap_trace()
819 * below. Queue it _now_, under the old context. 821 * below. Queue it _now_, under the old context.
820 */ 822 */
823 spin_lock(&realm->inodes_with_caps_lock);
821 list_del_init(&ci->i_snap_realm_item); 824 list_del_init(&ci->i_snap_realm_item);
825 spin_unlock(&realm->inodes_with_caps_lock);
822 spin_unlock(&inode->i_lock); 826 spin_unlock(&inode->i_lock);
823 827
824 ceph_queue_cap_snap(ci, 828 ceph_queue_cap_snap(ci);
825 ci->i_snap_realm->cached_context);
826 829
827 iput(inode); 830 iput(inode);
828 continue; 831 continue;
@@ -866,16 +869,20 @@ skip_inode:
866 continue; 869 continue;
867 ci = ceph_inode(inode); 870 ci = ceph_inode(inode);
868 spin_lock(&inode->i_lock); 871 spin_lock(&inode->i_lock);
869 if (!ci->i_snap_realm) 872 if (list_empty(&ci->i_snap_realm_item)) {
870 goto split_skip_inode; 873 struct ceph_snap_realm *oldrealm =
871 ceph_put_snap_realm(mdsc, ci->i_snap_realm); 874 ci->i_snap_realm;
872 spin_lock(&realm->inodes_with_caps_lock); 875
873 list_add(&ci->i_snap_realm_item, 876 dout(" moving %p to split realm %llx %p\n",
874 &realm->inodes_with_caps); 877 inode, realm->ino, realm);
875 ci->i_snap_realm = realm; 878 spin_lock(&realm->inodes_with_caps_lock);
876 spin_unlock(&realm->inodes_with_caps_lock); 879 list_add(&ci->i_snap_realm_item,
877 ceph_get_snap_realm(mdsc, realm); 880 &realm->inodes_with_caps);
878split_skip_inode: 881 ci->i_snap_realm = realm;
882 spin_unlock(&realm->inodes_with_caps_lock);
883 ceph_get_snap_realm(mdsc, realm);
884 ceph_put_snap_realm(mdsc, oldrealm);
885 }
879 spin_unlock(&inode->i_lock); 886 spin_unlock(&inode->i_lock);
880 iput(inode); 887 iput(inode);
881 } 888 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 4290a6e860b0..4e0bee240b9d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,13 +8,11 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/parser.h> 10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/statfs.h> 14#include <linux/statfs.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/version.h>
17#include <linux/vmalloc.h>
18 16
19#include "decode.h" 17#include "decode.h"
20#include "super.h" 18#include "super.h"
@@ -46,10 +44,20 @@ const char *ceph_file_part(const char *s, int len)
46 */ 44 */
47static void ceph_put_super(struct super_block *s) 45static void ceph_put_super(struct super_block *s)
48{ 46{
49 struct ceph_client *cl = ceph_client(s); 47 struct ceph_client *client = ceph_sb_to_client(s);
50 48
51 dout("put_super\n"); 49 dout("put_super\n");
52 ceph_mdsc_close_sessions(&cl->mdsc); 50 ceph_mdsc_close_sessions(&client->mdsc);
51
52 /*
53 * ensure we release the bdi before put_anon_super releases
54 * the device name.
55 */
56 if (s->s_bdi == &client->backing_dev_info) {
57 bdi_unregister(&client->backing_dev_info);
58 s->s_bdi = NULL;
59 }
60
53 return; 61 return;
54} 62}
55 63
@@ -96,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
96static int ceph_syncfs(struct super_block *sb, int wait) 104static int ceph_syncfs(struct super_block *sb, int wait)
97{ 105{
98 dout("sync_fs %d\n", wait); 106 dout("sync_fs %d\n", wait);
99 ceph_osdc_sync(&ceph_client(sb)->osdc); 107 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
100 ceph_mdsc_sync(&ceph_client(sb)->mdsc); 108 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
101 dout("sync_fs %d done\n", wait); 109 dout("sync_fs %d done\n", wait);
102 return 0; 110 return 0;
103} 111}
104 112
113static int default_congestion_kb(void)
114{
115 int congestion_kb;
116
117 /*
118 * Copied from NFS
119 *
120 * congestion size, scale with available memory.
121 *
122 * 64MB: 8192k
123 * 128MB: 11585k
124 * 256MB: 16384k
125 * 512MB: 23170k
126 * 1GB: 32768k
127 * 2GB: 46340k
128 * 4GB: 65536k
129 * 8GB: 92681k
130 * 16GB: 131072k
131 *
132 * This allows larger machines to have larger/more transfers.
133 * Limit the default to 256M
134 */
135 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
136 if (congestion_kb > 256*1024)
137 congestion_kb = 256*1024;
138
139 return congestion_kb;
140}
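
Working one row of the table: with 1GB of RAM and 4k pages, totalram_pages is 262144, int_sqrt() of that is 512, and 16*512 = 8192 shifted left by PAGE_SHIFT-10 = 2 gives the listed 32768k. The same arithmetic as a standalone userspace check (assumes 4k pages):

#include <math.h>
#include <stdio.h>

int main(void)
{
	long pages = (1L << 30) / 4096;			/* 1GB of 4k pages */
	long kb = (16L * (long)sqrt(pages)) << 2;	/* PAGE_SHIFT-10 == 2 */

	if (kb > 256 * 1024)				/* cap at 256M */
		kb = 256 * 1024;
	printf("%ldk\n", kb);				/* prints 32768k */
	return 0;
}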
105 141
106/** 142/**
107 * ceph_show_options - Show mount options in /proc/mounts 143 * ceph_show_options - Show mount options in /proc/mounts
@@ -127,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
127 seq_puts(m, ",nocrc"); 163 seq_puts(m, ",nocrc");
128 if (args->flags & CEPH_OPT_NOASYNCREADDIR) 164 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
129 seq_puts(m, ",noasyncreaddir"); 165 seq_puts(m, ",noasyncreaddir");
166
167 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
168 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
169 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
170 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
171 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
172 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
173 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
174 seq_printf(m, ",osdkeepalivetimeout=%d",
175 args->osd_keepalive_timeout);
176 if (args->wsize)
177 seq_printf(m, ",wsize=%d", args->wsize);
178 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
179 seq_printf(m, ",rsize=%d", args->rsize);
180 if (args->congestion_kb != default_congestion_kb())
181 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
182 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
183 seq_printf(m, ",caps_wanted_delay_min=%d",
184 args->caps_wanted_delay_min);
185 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
186 seq_printf(m, ",caps_wanted_delay_max=%d",
187 args->caps_wanted_delay_max);
188 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
189 seq_printf(m, ",cap_release_safety=%d",
190 args->cap_release_safety);
191 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
192 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
193 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
194 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
130 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 195 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
131 seq_printf(m, ",snapdirname=%s", args->snapdir_name); 196 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
132 if (args->name) 197 if (args->name)
@@ -150,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
150 inode_init_once(&ci->vfs_inode); 215 inode_init_once(&ci->vfs_inode);
151} 216}
152 217
153static int default_congestion_kb(void)
154{
155 int congestion_kb;
156
157 /*
158 * Copied from NFS
159 *
160 * congestion size, scale with available memory.
161 *
162 * 64MB: 8192k
163 * 128MB: 11585k
164 * 256MB: 16384k
165 * 512MB: 23170k
166 * 1GB: 32768k
167 * 2GB: 46340k
168 * 4GB: 65536k
169 * 8GB: 92681k
170 * 16GB: 131072k
171 *
172 * This allows larger machines to have larger/more transfers.
173 * Limit the default to 256M
174 */
175 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
176 if (congestion_kb > 256*1024)
177 congestion_kb = 256*1024;
178
179 return congestion_kb;
180}
181
182static int __init init_caches(void) 218static int __init init_caches(void)
183{ 219{
184 ceph_inode_cachep = kmem_cache_create("ceph_inode_info", 220 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -297,7 +333,9 @@ enum {
297 Opt_osd_idle_ttl, 333 Opt_osd_idle_ttl,
298 Opt_caps_wanted_delay_min, 334 Opt_caps_wanted_delay_min,
299 Opt_caps_wanted_delay_max, 335 Opt_caps_wanted_delay_max,
336 Opt_cap_release_safety,
300 Opt_readdir_max_entries, 337 Opt_readdir_max_entries,
338 Opt_readdir_max_bytes,
301 Opt_congestion_kb, 339 Opt_congestion_kb,
302 Opt_last_int, 340 Opt_last_int,
303 /* int args above */ 341 /* int args above */
@@ -328,7 +366,9 @@ static match_table_t arg_tokens = {
328 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 366 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
329 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 367 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
330 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 368 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
369 {Opt_cap_release_safety, "cap_release_safety=%d"},
331 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 370 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
371 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
332 {Opt_congestion_kb, "write_congestion_kb=%d"}, 372 {Opt_congestion_kb, "write_congestion_kb=%d"},
333 /* int args above */ 373 /* int args above */
334 {Opt_snapdirname, "snapdirname=%s"}, 374 {Opt_snapdirname, "snapdirname=%s"},
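
The two new tokens slot into the standard <linux/parser.h> flow. A sketch of how such a table is typically consumed (illustrative, not the verbatim ceph parser; c, err, and args stand in for locals of the surrounding parse function):

/* illustrative consumption of a match_table_t such as arg_tokens */
substring_t argstr[MAX_OPT_ARGS];
int intval;
int token = match_token(c, arg_tokens, argstr);

if (token < Opt_last_int) {
	err = match_int(&argstr[0], &intval);  /* decode the "%d" argument */
	if (err < 0)
		goto out;                      /* malformed integer */
}

switch (token) {
case Opt_readdir_max_bytes:                    /* matches the hunk below */
	args->max_readdir_bytes = intval;
	break;
/* ... remaining cases ... */
}
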
@@ -377,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
377 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 417 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
378 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 418 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
379 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 419 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
380 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; 420 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
381 args->max_readdir = 1024; 421 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
422 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
382 args->congestion_kb = default_congestion_kb(); 423 args->congestion_kb = default_congestion_kb();
383 424
384 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 425 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -486,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
486 case Opt_readdir_max_entries: 527 case Opt_readdir_max_entries:
487 args->max_readdir = intval; 528 args->max_readdir = intval;
488 break; 529 break;
530 case Opt_readdir_max_bytes:
531 args->max_readdir_bytes = intval;
532 break;
489 case Opt_congestion_kb: 533 case Opt_congestion_kb:
490 args->congestion_kb = intval; 534 args->congestion_kb = intval;
491 break; 535 break;
@@ -625,9 +669,17 @@ static void ceph_destroy_client(struct ceph_client *client)
625 669
626 /* unmount */ 670 /* unmount */
627 ceph_mdsc_stop(&client->mdsc); 671 ceph_mdsc_stop(&client->mdsc);
628 ceph_monc_stop(&client->monc);
629 ceph_osdc_stop(&client->osdc); 672 ceph_osdc_stop(&client->osdc);
630 673
674 /*
675 * make sure mds and osd connections close out before destroying
676 * the auth module, which is needed to free those connections'
677 * ceph_authorizers.
678 */
679 ceph_msgr_flush();
680
681 ceph_monc_stop(&client->monc);
682
631 ceph_adjust_min_caps(-client->min_caps); 683 ceph_adjust_min_caps(-client->min_caps);
632 684
633 ceph_debugfs_client_cleanup(client); 685 ceph_debugfs_client_cleanup(client);
@@ -635,6 +687,8 @@ static void ceph_destroy_client(struct ceph_client *client)
635 destroy_workqueue(client->pg_inv_wq); 687 destroy_workqueue(client->pg_inv_wq);
636 destroy_workqueue(client->trunc_wq); 688 destroy_workqueue(client->trunc_wq);
637 689
690 bdi_destroy(&client->backing_dev_info);
691
638 if (client->msgr) 692 if (client->msgr)
639 ceph_messenger_destroy(client->msgr); 693 ceph_messenger_destroy(client->msgr);
640 mempool_destroy(client->wb_pagevec_pool); 694 mempool_destroy(client->wb_pagevec_pool);
@@ -669,9 +723,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
669/* 723/*
670 * true if we have the mon map (and have thus joined the cluster) 724 * true if we have the mon map (and have thus joined the cluster)
671 */ 725 */
672static int have_mon_map(struct ceph_client *client) 726static int have_mon_and_osd_map(struct ceph_client *client)
673{ 727{
674 return client->monc.monmap && client->monc.monmap->epoch; 728 return client->monc.monmap && client->monc.monmap->epoch &&
729 client->osdc.osdmap && client->osdc.osdmap->epoch;
675} 730}
676 731
677/* 732/*
@@ -691,7 +746,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
691 dout("open_root_inode opening '%s'\n", path); 746 dout("open_root_inode opening '%s'\n", path);
692 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 747 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
693 if (IS_ERR(req)) 748 if (IS_ERR(req))
694 return ERR_PTR(PTR_ERR(req)); 749 return ERR_CAST(req);
695 req->r_path1 = kstrdup(path, GFP_NOFS); 750 req->r_path1 = kstrdup(path, GFP_NOFS);
696 req->r_ino1.ino = CEPH_INO_ROOT; 751 req->r_ino1.ino = CEPH_INO_ROOT;
697 req->r_ino1.snap = CEPH_NOSNAP; 752 req->r_ino1.snap = CEPH_NOSNAP;
@@ -749,7 +804,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
749 if (err < 0) 804 if (err < 0)
750 goto out; 805 goto out;
751 806
752 while (!have_mon_map(client)) { 807 while (!have_mon_and_osd_map(client)) {
753 err = -EIO; 808 err = -EIO;
754 if (timeout && time_after_eq(jiffies, started + timeout)) 809 if (timeout && time_after_eq(jiffies, started + timeout))
755 goto out; 810 goto out;
@@ -757,8 +812,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
757 /* wait */ 812 /* wait */
758 dout("mount waiting for mon_map\n"); 813 dout("mount waiting for mon_map\n");
759 err = wait_event_interruptible_timeout(client->auth_wq, 814 err = wait_event_interruptible_timeout(client->auth_wq,
760 have_mon_map(client) || (client->auth_err < 0), 815 have_mon_and_osd_map(client) || (client->auth_err < 0),
761 timeout); 816 timeout);
762 if (err == -EINTR || err == -ERESTARTSYS) 817 if (err == -EINTR || err == -ERESTARTSYS)
763 goto out; 818 goto out;
764 if (client->auth_err < 0) { 819 if (client->auth_err < 0) {
@@ -871,18 +926,21 @@ static int ceph_compare_super(struct super_block *sb, void *data)
871/* 926/*
872 * construct our own bdi so we can control readahead, etc. 927 * construct our own bdi so we can control readahead, etc.
873 */ 928 */
929static atomic_long_t bdi_seq = ATOMIC_INIT(0);
930
874static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 931static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
875{ 932{
876 int err; 933 int err;
877 934
878 sb->s_bdi = &client->backing_dev_info;
879
880 /* set ra_pages based on rsize mount option? */ 935 /* set ra_pages based on rsize mount option? */
881 if (client->mount_args->rsize >= PAGE_CACHE_SIZE) 936 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
882 client->backing_dev_info.ra_pages = 937 client->backing_dev_info.ra_pages =
883 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 938 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
884 >> PAGE_SHIFT; 939 >> PAGE_SHIFT;
885 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); 940 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
941 atomic_long_inc_return(&bdi_seq));
942 if (!err)
943 sb->s_bdi = &client->backing_dev_info;
886 return err; 944 return err;
887} 945}
888 946
@@ -919,9 +977,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
919 goto out; 977 goto out;
920 } 978 }
921 979
922 if (ceph_client(sb) != client) { 980 if (ceph_sb_to_client(sb) != client) {
923 ceph_destroy_client(client); 981 ceph_destroy_client(client);
924 client = ceph_client(sb); 982 client = ceph_sb_to_client(sb);
925 dout("get_sb got existing client %p\n", client); 983 dout("get_sb got existing client %p\n", client);
926 } else { 984 } else {
927 dout("get_sb using new client %p\n", client); 985 dout("get_sb using new client %p\n", client);
@@ -939,8 +997,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
939 997
940out_splat: 998out_splat:
941 ceph_mdsc_close_sessions(&client->mdsc); 999 ceph_mdsc_close_sessions(&client->mdsc);
942 up_write(&sb->s_umount); 1000 deactivate_locked_super(sb);
943 deactivate_super(sb);
944 goto out_final; 1001 goto out_final;
945 1002
946out: 1003out:
@@ -956,9 +1013,6 @@ static void ceph_kill_sb(struct super_block *s)
956 dout("kill_sb %p\n", s); 1013 dout("kill_sb %p\n", s);
957 ceph_mdsc_pre_umount(&client->mdsc); 1014 ceph_mdsc_pre_umount(&client->mdsc);
958 kill_anon_super(s); /* will call put_super after sb is r/o */ 1015 kill_anon_super(s); /* will call put_super after sb is r/o */
959 if (s->s_bdi == &client->backing_dev_info)
960 bdi_unregister(&client->backing_dev_info);
961 bdi_destroy(&client->backing_dev_info);
962 ceph_destroy_client(client); 1016 ceph_destroy_client(client);
963} 1017}
964 1018
@@ -995,9 +1049,10 @@ static int __init init_ceph(void)
995 if (ret) 1049 if (ret)
996 goto out_icache; 1050 goto out_icache;
997 1051
998 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n", 1052 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
999 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH, 1053 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
1000 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL); 1054 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1055 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1001 return 0; 1056 return 0;
1002 1057
1003out_icache: 1058out_icache:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 65d12036b670..10a4a406e887 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -12,6 +12,7 @@
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/wait.h> 13#include <linux/wait.h>
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/slab.h>
15 16
16#include "types.h" 17#include "types.h"
17#include "messenger.h" 18#include "messenger.h"
@@ -50,24 +51,25 @@
50 51
51struct ceph_mount_args { 52struct ceph_mount_args {
52 int sb_flags; 53 int sb_flags;
54 int flags;
55 struct ceph_fsid fsid;
56 struct ceph_entity_addr my_addr;
53 int num_mon; 57 int num_mon;
54 struct ceph_entity_addr *mon_addr; 58 struct ceph_entity_addr *mon_addr;
55 int flags;
56 int mount_timeout; 59 int mount_timeout;
57 int osd_idle_ttl; 60 int osd_idle_ttl;
58 int caps_wanted_delay_min, caps_wanted_delay_max;
59 struct ceph_fsid fsid;
60 struct ceph_entity_addr my_addr;
61 int wsize;
62 int rsize; /* max readahead */
63 int max_readdir; /* max readdir size */
64 int congestion_kb; /* max readdir size */
65 int osd_timeout; 61 int osd_timeout;
66 int osd_keepalive_timeout; 62 int osd_keepalive_timeout;
63 int wsize;
64 int rsize; /* max readahead */
65 int congestion_kb; /* max writeback in flight */
66 int caps_wanted_delay_min, caps_wanted_delay_max;
67 int cap_release_safety;
 68 int max_readdir; /* max readdir result (entries) */
69 int max_readdir_bytes; /* max readdir result (bytes) */
67 char *snapdir_name; /* default ".snap" */ 70 char *snapdir_name; /* default ".snap" */
68 char *name; 71 char *name;
69 char *secret; 72 char *secret;
70 int cap_release_safety;
71}; 73};
72 74
73/* 75/*
@@ -78,13 +80,14 @@ struct ceph_mount_args {
78#define CEPH_OSD_KEEPALIVE_DEFAULT 5 80#define CEPH_OSD_KEEPALIVE_DEFAULT 5
79#define CEPH_OSD_IDLE_TTL_DEFAULT 60 81#define CEPH_OSD_IDLE_TTL_DEFAULT 60
80#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ 82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
83#define CEPH_MAX_READDIR_DEFAULT 1024
84#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
81 85
82#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 86#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
83#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) 87#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
84 88
85#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 89#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
86#define CEPH_AUTH_NAME_DEFAULT "guest" 90#define CEPH_AUTH_NAME_DEFAULT "guest"
87
88/* 91/*
89 * Delay telling the MDS we no longer want caps, in case we reopen 92 * Delay telling the MDS we no longer want caps, in case we reopen
90 * the file. Delay a minimum amount of time, even if we send a cap 93 * the file. Delay a minimum amount of time, even if we send a cap
@@ -94,6 +97,7 @@ struct ceph_mount_args {
94#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ 97#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
95#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ 98#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
96 99
100#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
97 101
98/* mount state */ 102/* mount state */
99enum { 103enum {
@@ -158,12 +162,6 @@ struct ceph_client {
158#endif 162#endif
159}; 163};
160 164
161static inline struct ceph_client *ceph_client(struct super_block *sb)
162{
163 return sb->s_fs_info;
164}
165
166
167/* 165/*
168 * File i/o capability. This tracks shared state with the metadata 166 * File i/o capability. This tracks shared state with the metadata
169 * server that allows us to cache or writeback attributes or to read 167 * server that allows us to cache or writeback attributes or to read
@@ -714,8 +712,7 @@ extern int ceph_update_snap_trace(struct ceph_mds_client *m,
714extern void ceph_handle_snap(struct ceph_mds_client *mdsc, 712extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
715 struct ceph_mds_session *session, 713 struct ceph_mds_session *session,
716 struct ceph_msg *msg); 714 struct ceph_msg *msg);
717extern void ceph_queue_cap_snap(struct ceph_inode_info *ci, 715extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
718 struct ceph_snap_context *snapc);
719extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 716extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
720 struct ceph_cap_snap *capsnap); 717 struct ceph_cap_snap *capsnap);
721extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 718extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
@@ -813,7 +810,7 @@ extern void ceph_put_cap(struct ceph_cap *cap);
813 810
814extern void ceph_queue_caps_release(struct inode *inode); 811extern void ceph_queue_caps_release(struct inode *inode);
815extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 812extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
816extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); 813extern int ceph_fsync(struct file *file, int datasync);
817extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 814extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
818 struct ceph_mds_session *session); 815 struct ceph_mds_session *session);
819extern int ceph_get_cap_mds(struct inode *inode); 816extern int ceph_get_cap_mds(struct inode *inode);
@@ -870,6 +867,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
870extern void ceph_dentry_lru_add(struct dentry *dn); 867extern void ceph_dentry_lru_add(struct dentry *dn);
871extern void ceph_dentry_lru_touch(struct dentry *dn); 868extern void ceph_dentry_lru_touch(struct dentry *dn);
872extern void ceph_dentry_lru_del(struct dentry *dn); 869extern void ceph_dentry_lru_del(struct dentry *dn);
870extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
873 871
874/* 872/*
875 * our d_ops vary depending on whether the inode is live, 873 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 37d6ce645691..68aeebc69681 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -3,10 +3,12 @@
3#include "decode.h" 3#include "decode.h"
4 4
5#include <linux/xattr.h> 5#include <linux/xattr.h>
6#include <linux/slab.h>
6 7
7static bool ceph_is_valid_xattr(const char *name) 8static bool ceph_is_valid_xattr(const char *name)
8{ 9{
9 return !strncmp(name, XATTR_SECURITY_PREFIX, 10 return !strncmp(name, "ceph.", 5) ||
11 !strncmp(name, XATTR_SECURITY_PREFIX,
10 XATTR_SECURITY_PREFIX_LEN) || 12 XATTR_SECURITY_PREFIX_LEN) ||
11 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 13 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
12 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 14 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -75,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
75} 77}
76 78
77static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { 79static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
78 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, 80 { true, "ceph.dir.entries", ceph_vxattrcb_entries},
79 { true, "user.ceph.dir.files", ceph_vxattrcb_files}, 81 { true, "ceph.dir.files", ceph_vxattrcb_files},
80 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, 82 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
81 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, 83 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
82 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, 84 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
83 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, 85 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
84 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, 86 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
85 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, 87 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
86 { true, NULL, NULL } 88 { true, NULL, NULL }
87}; 89};
88 90
@@ -106,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
106} 108}
107 109
108static struct ceph_vxattr_cb ceph_file_vxattrs[] = { 110static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
109 { true, "user.ceph.layout", ceph_vxattrcb_layout}, 111 { true, "ceph.layout", ceph_vxattrcb_layout},
110 { NULL, NULL } 112 { NULL, NULL }
111}; 113};
112 114
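
With the "user.ceph.*" prefix dropped, the virtual xattrs live in their own "ceph." namespace, matching the new check in ceph_is_valid_xattr() above. A small userspace sketch of reading one; the mount path is a placeholder:

/* userspace sketch: read a renamed ceph virtual xattr */
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[64];
	ssize_t n = getxattr("/mnt/ceph/somedir",   /* placeholder path */
			     "ceph.dir.entries", buf, sizeof(buf) - 1);

	if (n < 0) {
		perror("getxattr");
		return 1;
	}
	buf[n] = '\0';
	printf("ceph.dir.entries = %s\n", buf);
	return 0;
}
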
@@ -185,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
185 ci->i_xattrs.names_size -= xattr->name_len; 187 ci->i_xattrs.names_size -= xattr->name_len;
186 ci->i_xattrs.vals_size -= xattr->val_len; 188 ci->i_xattrs.vals_size -= xattr->val_len;
187 } 189 }
188 if (!xattr) {
189 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
190 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
191 xattr->val);
192 return -ENOMEM;
193 }
194 ci->i_xattrs.names_size += name_len; 190 ci->i_xattrs.names_size += name_len;
195 ci->i_xattrs.vals_size += val_len; 191 ci->i_xattrs.vals_size += val_len;
196 if (val) 192 if (val)
@@ -573,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
573 ci->i_xattrs.version, ci->i_xattrs.index_version); 569 ci->i_xattrs.version, ci->i_xattrs.index_version);
574 570
575 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 571 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
576 (ci->i_xattrs.index_version > ci->i_xattrs.version)) { 572 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
577 goto list_xattr; 573 goto list_xattr;
578 } else { 574 } else {
579 spin_unlock(&inode->i_lock); 575 spin_unlock(&inode->i_lock);
@@ -621,7 +617,7 @@ out:
621static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 617static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
622 const char *value, size_t size, int flags) 618 const char *value, size_t size, int flags)
623{ 619{
624 struct ceph_client *client = ceph_client(dentry->d_sb); 620 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
625 struct inode *inode = dentry->d_inode; 621 struct inode *inode = dentry->d_inode;
626 struct ceph_inode_info *ci = ceph_inode(inode); 622 struct ceph_inode_info *ci = ceph_inode(inode);
627 struct inode *parent_inode = dentry->d_parent->d_inode; 623 struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -640,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
640 return -ENOMEM; 636 return -ENOMEM;
641 err = -ENOMEM; 637 err = -ENOMEM;
642 for (i = 0; i < nr_pages; i++) { 638 for (i = 0; i < nr_pages; i++) {
643 pages[i] = alloc_page(GFP_NOFS); 639 pages[i] = __page_cache_alloc(GFP_NOFS);
644 if (!pages[i]) { 640 if (!pages[i]) {
645 nr_pages = i; 641 nr_pages = i;
646 goto out; 642 goto out;
@@ -778,7 +774,7 @@ out:
778 774
779static int ceph_send_removexattr(struct dentry *dentry, const char *name) 775static int ceph_send_removexattr(struct dentry *dentry, const char *name)
780{ 776{
781 struct ceph_client *client = ceph_client(dentry->d_sb); 777 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
782 struct ceph_mds_client *mdsc = &client->mdsc; 778 struct ceph_mds_client *mdsc = &client->mdsc;
783 struct inode *inode = dentry->d_inode; 779 struct inode *inode = dentry->d_inode;
784 struct inode *parent_inode = dentry->d_parent->d_inode; 780 struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index a20bea598933..cfd1ce34e0bc 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -492,17 +492,13 @@ compare_oid(unsigned long *oid1, unsigned int oid1len,
492 492
493int 493int
494decode_negTokenInit(unsigned char *security_blob, int length, 494decode_negTokenInit(unsigned char *security_blob, int length,
495 enum securityEnum *secType) 495 struct TCP_Server_Info *server)
496{ 496{
497 struct asn1_ctx ctx; 497 struct asn1_ctx ctx;
498 unsigned char *end; 498 unsigned char *end;
499 unsigned char *sequence_end; 499 unsigned char *sequence_end;
500 unsigned long *oid = NULL; 500 unsigned long *oid = NULL;
501 unsigned int cls, con, tag, oidlen, rc; 501 unsigned int cls, con, tag, oidlen, rc;
502 bool use_ntlmssp = false;
503 bool use_kerberos = false;
504 bool use_kerberosu2u = false;
505 bool use_mskerberos = false;
506 502
507 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ 503 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
508 504
@@ -510,11 +506,11 @@ decode_negTokenInit(unsigned char *security_blob, int length,
510 506
511 /* GSSAPI header */ 507 /* GSSAPI header */
512 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 508 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
513 cFYI(1, ("Error decoding negTokenInit header")); 509 cFYI(1, "Error decoding negTokenInit header");
514 return 0; 510 return 0;
515 } else if ((cls != ASN1_APL) || (con != ASN1_CON) 511 } else if ((cls != ASN1_APL) || (con != ASN1_CON)
516 || (tag != ASN1_EOC)) { 512 || (tag != ASN1_EOC)) {
517 cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag)); 513 cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag);
518 return 0; 514 return 0;
519 } 515 }
520 516
@@ -535,56 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length,
535 531
536 /* SPNEGO OID not present or garbled -- bail out */ 532 /* SPNEGO OID not present or garbled -- bail out */
537 if (!rc) { 533 if (!rc) {
538 cFYI(1, ("Error decoding negTokenInit header")); 534 cFYI(1, "Error decoding negTokenInit header");
539 return 0; 535 return 0;
540 } 536 }
541 537
542 /* SPNEGO */ 538 /* SPNEGO */
543 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 539 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
544 cFYI(1, ("Error decoding negTokenInit")); 540 cFYI(1, "Error decoding negTokenInit");
545 return 0; 541 return 0;
546 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 542 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
547 || (tag != ASN1_EOC)) { 543 || (tag != ASN1_EOC)) {
548 cFYI(1, 544 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
549 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 545 cls, con, tag, end, *end);
550 cls, con, tag, end, *end));
551 return 0; 546 return 0;
552 } 547 }
553 548
554 /* negTokenInit */ 549 /* negTokenInit */
555 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 550 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
556 cFYI(1, ("Error decoding negTokenInit")); 551 cFYI(1, "Error decoding negTokenInit");
557 return 0; 552 return 0;
558 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 553 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
559 || (tag != ASN1_SEQ)) { 554 || (tag != ASN1_SEQ)) {
560 cFYI(1, 555 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
561 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 556 cls, con, tag, end, *end);
562 cls, con, tag, end, *end));
563 return 0; 557 return 0;
564 } 558 }
565 559
566 /* sequence */ 560 /* sequence */
567 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 561 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
568 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 562 cFYI(1, "Error decoding 2nd part of negTokenInit");
569 return 0; 563 return 0;
570 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 564 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
571 || (tag != ASN1_EOC)) { 565 || (tag != ASN1_EOC)) {
572 cFYI(1, 566 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
573 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 567 cls, con, tag, end, *end);
574 cls, con, tag, end, *end));
575 return 0; 568 return 0;
576 } 569 }
577 570
578 /* sequence of */ 571 /* sequence of */
579 if (asn1_header_decode 572 if (asn1_header_decode
580 (&ctx, &sequence_end, &cls, &con, &tag) == 0) { 573 (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
581 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 574 cFYI(1, "Error decoding 2nd part of negTokenInit");
582 return 0; 575 return 0;
583 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 576 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
584 || (tag != ASN1_SEQ)) { 577 || (tag != ASN1_SEQ)) {
585 cFYI(1, 578 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
586 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 579 cls, con, tag, end, *end);
587 cls, con, tag, end, *end));
588 return 0; 580 return 0;
589 } 581 }
590 582
@@ -592,37 +584,33 @@ decode_negTokenInit(unsigned char *security_blob, int length,
592 while (!asn1_eoc_decode(&ctx, sequence_end)) { 584 while (!asn1_eoc_decode(&ctx, sequence_end)) {
593 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); 585 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
594 if (!rc) { 586 if (!rc) {
595 cFYI(1, 587 cFYI(1, "Error decoding negTokenInit hdr exit2");
596 ("Error decoding negTokenInit hdr exit2"));
597 return 0; 588 return 0;
598 } 589 }
599 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { 590 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
600 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { 591 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
601 592
602 cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx " 593 cFYI(1, "OID len = %d oid = 0x%lx 0x%lx "
603 "0x%lx 0x%lx", oidlen, *oid, 594 "0x%lx 0x%lx", oidlen, *oid,
604 *(oid + 1), *(oid + 2), *(oid + 3))); 595 *(oid + 1), *(oid + 2), *(oid + 3));
605 596
606 if (compare_oid(oid, oidlen, MSKRB5_OID, 597 if (compare_oid(oid, oidlen, MSKRB5_OID,
607 MSKRB5_OID_LEN) && 598 MSKRB5_OID_LEN))
608 !use_mskerberos) 599 server->sec_mskerberos = true;
609 use_mskerberos = true;
610 else if (compare_oid(oid, oidlen, KRB5U2U_OID, 600 else if (compare_oid(oid, oidlen, KRB5U2U_OID,
611 KRB5U2U_OID_LEN) && 601 KRB5U2U_OID_LEN))
612 !use_kerberosu2u) 602 server->sec_kerberosu2u = true;
613 use_kerberosu2u = true;
614 else if (compare_oid(oid, oidlen, KRB5_OID, 603 else if (compare_oid(oid, oidlen, KRB5_OID,
615 KRB5_OID_LEN) && 604 KRB5_OID_LEN))
616 !use_kerberos) 605 server->sec_kerberos = true;
617 use_kerberos = true;
618 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 606 else if (compare_oid(oid, oidlen, NTLMSSP_OID,
619 NTLMSSP_OID_LEN)) 607 NTLMSSP_OID_LEN))
620 use_ntlmssp = true; 608 server->sec_ntlmssp = true;
621 609
622 kfree(oid); 610 kfree(oid);
623 } 611 }
624 } else { 612 } else {
625 cFYI(1, ("Should be an oid what is going on?")); 613 cFYI(1, "Should be an oid what is going on?");
626 } 614 }
627 } 615 }
628 616
@@ -632,54 +620,47 @@ decode_negTokenInit(unsigned char *security_blob, int length,
632 no mechListMic (e.g. NTLMSSP instead of KRB5) */ 620 no mechListMic (e.g. NTLMSSP instead of KRB5) */
633 if (ctx.error == ASN1_ERR_DEC_EMPTY) 621 if (ctx.error == ASN1_ERR_DEC_EMPTY)
634 goto decode_negtoken_exit; 622 goto decode_negtoken_exit;
635 cFYI(1, ("Error decoding last part negTokenInit exit3")); 623 cFYI(1, "Error decoding last part negTokenInit exit3");
636 return 0; 624 return 0;
637 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 625 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
638 /* tag = 3 indicating mechListMIC */ 626 /* tag = 3 indicating mechListMIC */
639 cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)", 627 cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
640 cls, con, tag, end, *end)); 628 cls, con, tag, end, *end);
641 return 0; 629 return 0;
642 } 630 }
643 631
644 /* sequence */ 632 /* sequence */
645 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 633 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
646 cFYI(1, ("Error decoding last part negTokenInit exit5")); 634 cFYI(1, "Error decoding last part negTokenInit exit5");
647 return 0; 635 return 0;
648 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 636 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
649 || (tag != ASN1_SEQ)) { 637 || (tag != ASN1_SEQ)) {
650 cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)", 638 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)",
651 cls, con, tag, end, *end)); 639 cls, con, tag, end, *end);
652 } 640 }
653 641
654 /* sequence of */ 642 /* sequence of */
655 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 643 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
656 cFYI(1, ("Error decoding last part negTokenInit exit 7")); 644 cFYI(1, "Error decoding last part negTokenInit exit 7");
657 return 0; 645 return 0;
658 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 646 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
659 cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)", 647 cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
660 cls, con, tag, end, *end)); 648 cls, con, tag, end, *end);
661 return 0; 649 return 0;
662 } 650 }
663 651
664 /* general string */ 652 /* general string */
665 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 653 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
666 cFYI(1, ("Error decoding last part negTokenInit exit9")); 654 cFYI(1, "Error decoding last part negTokenInit exit9");
667 return 0; 655 return 0;
668 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) 656 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
669 || (tag != ASN1_GENSTR)) { 657 || (tag != ASN1_GENSTR)) {
670 cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)", 658 cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)",
671 cls, con, tag, end, *end)); 659 cls, con, tag, end, *end);
672 return 0; 660 return 0;
673 } 661 }
674 cFYI(1, ("Need to call asn1_octets_decode() function for %s", 662 cFYI(1, "Need to call asn1_octets_decode() function for %s",
675 ctx.pointer)); /* is this UTF-8 or ASCII? */ 663 ctx.pointer); /* is this UTF-8 or ASCII? */
676decode_negtoken_exit: 664decode_negtoken_exit:
677 if (use_kerberos)
678 *secType = Kerberos;
679 else if (use_mskerberos)
680 *secType = MSKerberos;
681 else if (use_ntlmssp)
682 *secType = RawNTLMSSP;
683
684 return 1; 665 return 1;
685} 666}
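
Instead of collapsing the advertised mechanisms into a single secType out-parameter, the decoder now records each one as a flag on TCP_Server_Info and lets the caller choose later; the cifs_spnego.c hunk further down is one such consumer. A sketch of the pattern, with hypothetical helper names:

/* illustrative: negotiate from the recorded capability flags */
if (server->sec_kerberos || server->sec_mskerberos)
	rc = setup_krb5_session(server);    /* hypothetical helper */
else if (server->sec_ntlmssp)
	rc = setup_ntlmssp_session(server); /* hypothetical helper */
else
	rc = -EOPNOTSUPP;                   /* no usable mechanism */
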
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 42cec2a7c0cf..4fce6e61b34e 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -60,10 +60,10 @@ cifs_dump_mem(char *label, void *data, int length)
60#ifdef CONFIG_CIFS_DEBUG2 60#ifdef CONFIG_CIFS_DEBUG2
61void cifs_dump_detail(struct smb_hdr *smb) 61void cifs_dump_detail(struct smb_hdr *smb)
62{ 62{
63 cERROR(1, ("Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", 63 cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
64 smb->Command, smb->Status.CifsError, 64 smb->Command, smb->Status.CifsError,
65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid)); 65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
66 cERROR(1, ("smb buf %p len %d", smb, smbCalcSize_LE(smb))); 66 cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
67} 67}
68 68
69 69
@@ -75,25 +75,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
75 if (server == NULL) 75 if (server == NULL)
76 return; 76 return;
77 77
78 cERROR(1, ("Dump pending requests:")); 78 cERROR(1, "Dump pending requests:");
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->tsk,
87 mid_entry->mid)); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
90 mid_entry->largeBuf, 90 mid_entry->largeBuf,
91 mid_entry->resp_buf, 91 mid_entry->resp_buf,
92 mid_entry->when_received, 92 mid_entry->when_received,
93 jiffies)); 93 jiffies);
94#endif /* STATS2 */ 94#endif /* STATS2 */
95 cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp, 95 cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp,
96 mid_entry->multiEnd)); 96 mid_entry->multiEnd);
97 if (mid_entry->resp_buf) { 97 if (mid_entry->resp_buf) {
98 cifs_dump_detail(mid_entry->resp_buf); 98 cifs_dump_detail(mid_entry->resp_buf);
99 cifs_dump_mem("existing buf: ", 99 cifs_dump_mem("existing buf: ",
@@ -716,7 +716,7 @@ static const struct file_operations cifs_multiuser_mount_proc_fops = {
716 716
717static int cifs_security_flags_proc_show(struct seq_file *m, void *v) 717static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
718{ 718{
719 seq_printf(m, "0x%x\n", extended_security); 719 seq_printf(m, "0x%x\n", global_secflags);
720 return 0; 720 return 0;
721} 721}
722 722
@@ -744,13 +744,13 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
744 /* single char or single char followed by null */ 744 /* single char or single char followed by null */
745 c = flags_string[0]; 745 c = flags_string[0];
746 if (c == '0' || c == 'n' || c == 'N') { 746 if (c == '0' || c == 'n' || c == 'N') {
747 extended_security = CIFSSEC_DEF; /* default */ 747 global_secflags = CIFSSEC_DEF; /* default */
748 return count; 748 return count;
749 } else if (c == '1' || c == 'y' || c == 'Y') { 749 } else if (c == '1' || c == 'y' || c == 'Y') {
750 extended_security = CIFSSEC_MAX; 750 global_secflags = CIFSSEC_MAX;
751 return count; 751 return count;
752 } else if (!isdigit(c)) { 752 } else if (!isdigit(c)) {
753 cERROR(1, ("invalid flag %c", c)); 753 cERROR(1, "invalid flag %c", c);
754 return -EINVAL; 754 return -EINVAL;
755 } 755 }
756 } 756 }
@@ -758,26 +758,26 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
758 758
759 flags = simple_strtoul(flags_string, NULL, 0); 759 flags = simple_strtoul(flags_string, NULL, 0);
760 760
761 cFYI(1, ("sec flags 0x%x", flags)); 761 cFYI(1, "sec flags 0x%x", flags);
762 762
763 if (flags <= 0) { 763 if (flags <= 0) {
764 cERROR(1, ("invalid security flags %s", flags_string)); 764 cERROR(1, "invalid security flags %s", flags_string);
765 return -EINVAL; 765 return -EINVAL;
766 } 766 }
767 767
768 if (flags & ~CIFSSEC_MASK) { 768 if (flags & ~CIFSSEC_MASK) {
769 cERROR(1, ("attempt to set unsupported security flags 0x%x", 769 cERROR(1, "attempt to set unsupported security flags 0x%x",
770 flags & ~CIFSSEC_MASK)); 770 flags & ~CIFSSEC_MASK);
771 return -EINVAL; 771 return -EINVAL;
772 } 772 }
773 /* flags look ok - update the global security flags for cifs module */ 773 /* flags look ok - update the global security flags for cifs module */
774 extended_security = flags; 774 global_secflags = flags;
775 if (extended_security & CIFSSEC_MUST_SIGN) { 775 if (global_secflags & CIFSSEC_MUST_SIGN) {
776 /* requiring signing implies signing is allowed */ 776 /* requiring signing implies signing is allowed */
777 extended_security |= CIFSSEC_MAY_SIGN; 777 global_secflags |= CIFSSEC_MAY_SIGN;
778 cFYI(1, ("packet signing now required")); 778 cFYI(1, "packet signing now required");
779 } else if ((extended_security & CIFSSEC_MAY_SIGN) == 0) { 779 } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
780 cFYI(1, ("packet signing disabled")); 780 cFYI(1, "packet signing disabled");
781 } 781 }
782 /* BB should we turn on MAY flags for other MUST options? */ 782 /* BB should we turn on MAY flags for other MUST options? */
783 return count; 783 return count;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 5eb3b83bbfa7..aa316891ac0c 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -43,34 +43,54 @@ void dump_smb(struct smb_hdr *, int);
43 */ 43 */
44#ifdef CIFS_DEBUG 44#ifdef CIFS_DEBUG
45 45
46
47/* information message: e.g., configuration, major event */ 46/* information message: e.g., configuration, major event */
48extern int cifsFYI; 47extern int cifsFYI;
49#define cifsfyi(format,arg...) if (cifsFYI & CIFS_INFO) printk(KERN_DEBUG " " __FILE__ ": " format "\n" "" , ## arg) 48#define cifsfyi(fmt, arg...) \
49do { \
50 if (cifsFYI & CIFS_INFO) \
51 printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \
52} while (0)
50 53
51#define cFYI(button,prspec) if (button) cifsfyi prspec 54#define cFYI(set, fmt, arg...) \
55do { \
56 if (set) \
57 cifsfyi(fmt, ##arg); \
58} while (0)
52 59
53#define cifswarn(format, arg...) printk(KERN_WARNING ": " format "\n" , ## arg) 60#define cifswarn(fmt, arg...) \
61 printk(KERN_WARNING fmt "\n", ##arg)
54 62
55/* debug event message: */ 63/* debug event message: */
56extern int cifsERROR; 64extern int cifsERROR;
57 65
58#define cEVENT(format,arg...) if (cifsERROR) printk(KERN_EVENT __FILE__ ": " format "\n" , ## arg) 66#define cEVENT(fmt, arg...) \
67do { \
68 if (cifsERROR) \
69 printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \
70} while (0)
59 71
60/* error event message: e.g., i/o error */ 72/* error event message: e.g., i/o error */
61#define cifserror(format,arg...) if (cifsERROR) printk(KERN_ERR " CIFS VFS: " format "\n" "" , ## arg) 73#define cifserror(fmt, arg...) \
74do { \
75 if (cifsERROR) \
76 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
77} while (0)
62 78
63#define cERROR(button, prspec) if (button) cifserror prspec 79#define cERROR(set, fmt, arg...) \
80do { \
81 if (set) \
82 cifserror(fmt, ##arg); \
83} while (0)
64 84
65/* 85/*
66 * debug OFF 86 * debug OFF
67 * --------- 87 * ---------
68 */ 88 */
69#else /* _CIFS_DEBUG */ 89#else /* _CIFS_DEBUG */
70#define cERROR(button, prspec) 90#define cERROR(set, fmt, arg...)
71#define cEVENT(format, arg...) 91#define cEVENT(fmt, arg...)
72#define cFYI(button, prspec) 92#define cFYI(set, fmt, arg...)
73#define cifserror(format, arg...) 93#define cifserror(fmt, arg...)
74#endif /* _CIFS_DEBUG */ 94#endif /* _CIFS_DEBUG */
75 95
76#endif /* _H_CIFS_DEBUG */ 96#endif /* _H_CIFS_DEBUG */
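
This rework explains the mechanical churn across the cifs files in this patch: the old macros took a single parenthesized printf argument pack, the new ones are ordinary variadic macros. Before and after at a typical call site:

/* old style: double parentheses, expanded via "cifsfyi prspec" */
cFYI(1, ("Error decoding negTokenInit header"));

/* new style: plain variadic macro call */
cFYI(1, "Error decoding negTokenInit header");
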
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b1d61d0bdfc7..ac19a6f3dae0 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -15,6 +15,7 @@
15#include <linux/dcache.h> 15#include <linux/dcache.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/vfs.h> 19#include <linux/vfs.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include "cifsglob.h" 21#include "cifsglob.h"
@@ -84,8 +85,8 @@ static char *cifs_get_share_name(const char *node_name)
84 /* find server name end */ 85 /* find server name end */
85 pSep = memchr(UNC+2, '\\', len-2); 86 pSep = memchr(UNC+2, '\\', len-2);
86 if (!pSep) { 87 if (!pSep) {
87 cERROR(1, ("%s: no server name end in node name: %s", 88 cERROR(1, "%s: no server name end in node name: %s",
88 __func__, node_name)); 89 __func__, node_name);
89 kfree(UNC); 90 kfree(UNC);
90 return ERR_PTR(-EINVAL); 91 return ERR_PTR(-EINVAL);
91 } 92 }
@@ -141,8 +142,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
141 142
142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 143 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
143 if (rc != 0) { 144 if (rc != 0) {
144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 145 cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
145 __func__, *devname, rc)); 146 __func__, *devname, rc);
146 goto compose_mount_options_err; 147 goto compose_mount_options_err;
147 } 148 }
148 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 149 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
@@ -216,8 +217,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
216 strcat(mountdata, fullpath + ref->path_consumed); 217 strcat(mountdata, fullpath + ref->path_consumed);
217 } 218 }
218 219
219 /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/ 220 /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/
220 /*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/ 221 /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/
221 222
222compose_mount_options_out: 223compose_mount_options_out:
223 kfree(srvIP); 224 kfree(srvIP);
@@ -293,11 +294,11 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
293 294
294static void dump_referral(const struct dfs_info3_param *ref) 295static void dump_referral(const struct dfs_info3_param *ref)
295{ 296{
296 cFYI(1, ("DFS: ref path: %s", ref->path_name)); 297 cFYI(1, "DFS: ref path: %s", ref->path_name);
297 cFYI(1, ("DFS: node path: %s", ref->node_name)); 298 cFYI(1, "DFS: node path: %s", ref->node_name);
298 cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type)); 299 cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type);
299 cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag, 300 cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
300 ref->path_consumed)); 301 ref->path_consumed);
301} 302}
302 303
303 304
@@ -313,7 +314,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
313 int rc = 0; 314 int rc = 0;
314 struct vfsmount *mnt = ERR_PTR(-ENOENT); 315 struct vfsmount *mnt = ERR_PTR(-ENOENT);
315 316
316 cFYI(1, ("in %s", __func__)); 317 cFYI(1, "in %s", __func__);
317 BUG_ON(IS_ROOT(dentry)); 318 BUG_ON(IS_ROOT(dentry));
318 319
319 xid = GetXid(); 320 xid = GetXid();
@@ -351,15 +352,15 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
351 /* connect to a node */ 352 /* connect to a node */
352 len = strlen(referrals[i].node_name); 353 len = strlen(referrals[i].node_name);
353 if (len < 2) { 354 if (len < 2) {
354 cERROR(1, ("%s: Net Address path too short: %s", 355 cERROR(1, "%s: Net Address path too short: %s",
355 __func__, referrals[i].node_name)); 356 __func__, referrals[i].node_name);
356 rc = -EINVAL; 357 rc = -EINVAL;
357 goto out_err; 358 goto out_err;
358 } 359 }
359 mnt = cifs_dfs_do_refmount(nd->path.mnt, 360 mnt = cifs_dfs_do_refmount(nd->path.mnt,
360 nd->path.dentry, referrals + i); 361 nd->path.dentry, referrals + i);
361 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 362 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
362 referrals[i].node_name, mnt)); 363 referrals[i].node_name, mnt);
363 364
 364 /* complete mount procedure if we acquired submount */ 365 /* complete mount procedure if we acquired submount */
365 if (!IS_ERR(mnt)) 366 if (!IS_ERR(mnt))
@@ -377,7 +378,7 @@ out:
377 FreeXid(xid); 378 FreeXid(xid);
378 free_dfs_info_array(referrals, num_referrals); 379 free_dfs_info_array(referrals, num_referrals);
379 kfree(full_path); 380 kfree(full_path);
380 cFYI(1, ("leaving %s" , __func__)); 381 cFYI(1, "leaving %s" , __func__);
381 return ERR_PTR(rc); 382 return ERR_PTR(rc);
382out_err: 383out_err:
383 path_put(&nd->path); 384 path_put(&nd->path);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 4797787c6a44..246a167cb913 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -18,6 +18,8 @@
18#ifndef _CIFS_FS_SB_H 18#ifndef _CIFS_FS_SB_H
19#define _CIFS_FS_SB_H 19#define _CIFS_FS_SB_H
20 20
21#include <linux/backing-dev.h>
22
21#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ 23#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */
22#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */ 24#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */
23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ 25#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */
@@ -50,5 +52,6 @@ struct cifs_sb_info {
50#ifdef CONFIG_CIFS_DFS_UPCALL 52#ifdef CONFIG_CIFS_DFS_UPCALL
51 char *mountdata; /* mount options received at mount time */ 53 char *mountdata; /* mount options received at mount time */
52#endif 54#endif
55 struct backing_dev_info bdi;
53}; 56};
54#endif /* _CIFS_FS_SB_H */ 57#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8ec7736ce954..379bd7d9c05f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h>
23#include <linux/string.h> 24#include <linux/string.h>
24#include <keys/user-type.h> 25#include <keys/user-type.h>
25#include <linux/key-type.h> 26#include <linux/key-type.h>
@@ -132,9 +133,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
132 dp = description + strlen(description); 133 dp = description + strlen(description);
133 134
134 /* for now, only sec=krb5 and sec=mskrb5 are valid */ 135 /* for now, only sec=krb5 and sec=mskrb5 are valid */
135 if (server->secType == Kerberos) 136 if (server->sec_kerberos)
136 sprintf(dp, ";sec=krb5"); 137 sprintf(dp, ";sec=krb5");
137 else if (server->secType == MSKerberos) 138 else if (server->sec_mskerberos)
138 sprintf(dp, ";sec=mskrb5"); 139 sprintf(dp, ";sec=mskrb5");
139 else 140 else
140 goto out; 141 goto out;
@@ -148,7 +149,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
148 dp = description + strlen(description); 149 dp = description + strlen(description);
149 sprintf(dp, ";pid=0x%x", current->pid); 150 sprintf(dp, ";pid=0x%x", current->pid);
150 151
151 cFYI(1, ("key description = %s", description)); 152 cFYI(1, "key description = %s", description);
152 spnego_key = request_key(&cifs_spnego_key_type, description, ""); 153 spnego_key = request_key(&cifs_spnego_key_type, description, "");
153 154
154#ifdef CONFIG_CIFS_DEBUG2 155#ifdef CONFIG_CIFS_DEBUG2
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 714a542cbafc..430f510a1720 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -19,6 +19,7 @@
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/slab.h>
22#include "cifs_unicode.h" 23#include "cifs_unicode.h"
23#include "cifs_uniupr.h" 24#include "cifs_uniupr.h"
24#include "cifspdu.h" 25#include "cifspdu.h"
@@ -199,9 +200,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
199 /* works for 2.4.0 kernel or later */ 200 /* works for 2.4.0 kernel or later */
200 charlen = codepage->char2uni(from, len, &wchar_to[i]); 201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
201 if (charlen < 1) { 202 if (charlen < 1) {
202 cERROR(1, 203 cERROR(1, "strtoUCS: char2uni of %d returned %d",
203 ("strtoUCS: char2uni of %d returned %d", 204 (int)*from, charlen);
204 (int)*from, charlen));
205 /* A question mark */ 205 /* A question mark */
206 to[i] = cpu_to_le16(0x003f); 206 to[i] = cpu_to_le16(0x003f);
207 charlen = 1; 207 charlen = 1;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7dfe0842a6f6..85d7cf7ff2c8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/slab.h>
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
27#include "cifsacl.h" 28#include "cifsacl.h"
@@ -86,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid)
86 continue; /* all sub_auth values do not match */ 87 continue; /* all sub_auth values do not match */
87 } 88 }
88 89
89 cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname)); 90 cFYI(1, "matching sid: %s\n", wksidarr[i].sidname);
90 return 0; /* sids compare/match */ 91 return 0; /* sids compare/match */
91 } 92 }
92 93
93 cFYI(1, ("No matching sid")); 94 cFYI(1, "No matching sid");
94 return -1; 95 return -1;
95} 96}
96 97
@@ -207,14 +208,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
207 *pbits_to_set &= ~S_IXUGO; 208 *pbits_to_set &= ~S_IXUGO;
208 return; 209 return;
209 } else if (type != ACCESS_ALLOWED) { 210 } else if (type != ACCESS_ALLOWED) {
210 cERROR(1, ("unknown access control type %d", type)); 211 cERROR(1, "unknown access control type %d", type);
211 return; 212 return;
212 } 213 }
213 /* else ACCESS_ALLOWED type */ 214 /* else ACCESS_ALLOWED type */
214 215
215 if (flags & GENERIC_ALL) { 216 if (flags & GENERIC_ALL) {
216 *pmode |= (S_IRWXUGO & (*pbits_to_set)); 217 *pmode |= (S_IRWXUGO & (*pbits_to_set));
217 cFYI(DBG2, ("all perms")); 218 cFYI(DBG2, "all perms");
218 return; 219 return;
219 } 220 }
220 if ((flags & GENERIC_WRITE) || 221 if ((flags & GENERIC_WRITE) ||
@@ -227,7 +228,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
227 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) 228 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
228 *pmode |= (S_IXUGO & (*pbits_to_set)); 229 *pmode |= (S_IXUGO & (*pbits_to_set));
229 230
230 cFYI(DBG2, ("access flags 0x%x mode now 0x%x", flags, *pmode)); 231 cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode);
231 return; 232 return;
232} 233}
233 234
@@ -256,7 +257,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
256 if (mode & S_IXUGO) 257 if (mode & S_IXUGO)
257 *pace_flags |= SET_FILE_EXEC_RIGHTS; 258 *pace_flags |= SET_FILE_EXEC_RIGHTS;
258 259
259 cFYI(DBG2, ("mode: 0x%x, access flags now 0x%x", mode, *pace_flags)); 260 cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags);
260 return; 261 return;
261} 262}
262 263
@@ -296,24 +297,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
296 /* validate that we do not go past end of acl */ 297 /* validate that we do not go past end of acl */
297 298
298 if (le16_to_cpu(pace->size) < 16) { 299 if (le16_to_cpu(pace->size) < 16) {
299 cERROR(1, ("ACE too small, %d", le16_to_cpu(pace->size))); 300 cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
300 return; 301 return;
301 } 302 }
302 303
303 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { 304 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
304 cERROR(1, ("ACL too small to parse ACE")); 305 cERROR(1, "ACL too small to parse ACE");
305 return; 306 return;
306 } 307 }
307 308
308 num_subauth = pace->sid.num_subauth; 309 num_subauth = pace->sid.num_subauth;
309 if (num_subauth) { 310 if (num_subauth) {
310 int i; 311 int i;
311 cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d", 312 cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d",
312 pace->sid.revision, pace->sid.num_subauth, pace->type, 313 pace->sid.revision, pace->sid.num_subauth, pace->type,
313 pace->flags, le16_to_cpu(pace->size))); 314 pace->flags, le16_to_cpu(pace->size));
314 for (i = 0; i < num_subauth; ++i) { 315 for (i = 0; i < num_subauth; ++i) {
315 cFYI(1, ("ACE sub_auth[%d]: 0x%x", i, 316 cFYI(1, "ACE sub_auth[%d]: 0x%x", i,
316 le32_to_cpu(pace->sid.sub_auth[i]))); 317 le32_to_cpu(pace->sid.sub_auth[i]));
317 } 318 }
318 319
319 /* BB add length check to make sure that we do not have huge 320 /* BB add length check to make sure that we do not have huge
@@ -346,13 +347,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
346 347
347 /* validate that we do not go past end of acl */ 348 /* validate that we do not go past end of acl */
348 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { 349 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
349 cERROR(1, ("ACL too small to parse DACL")); 350 cERROR(1, "ACL too small to parse DACL");
350 return; 351 return;
351 } 352 }
352 353
353 cFYI(DBG2, ("DACL revision %d size %d num aces %d", 354 cFYI(DBG2, "DACL revision %d size %d num aces %d",
354 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), 355 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
355 le32_to_cpu(pdacl->num_aces))); 356 le32_to_cpu(pdacl->num_aces));
356 357
357 /* reset rwx permissions for user/group/other. 358 /* reset rwx permissions for user/group/other.
358 Also, if num_aces is 0 i.e. DACL has no ACEs, 359 Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -436,25 +437,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
436 /* validate that we do not go past end of ACL - sid must be at least 8 437 /* validate that we do not go past end of ACL - sid must be at least 8
437 bytes long (assuming no sub-auths - e.g. the null SID */ 438 bytes long (assuming no sub-auths - e.g. the null SID */
438 if (end_of_acl < (char *)psid + 8) { 439 if (end_of_acl < (char *)psid + 8) {
439 cERROR(1, ("ACL too small to parse SID %p", psid)); 440 cERROR(1, "ACL too small to parse SID %p", psid);
440 return -EINVAL; 441 return -EINVAL;
441 } 442 }
442 443
443 if (psid->num_subauth) { 444 if (psid->num_subauth) {
444#ifdef CONFIG_CIFS_DEBUG2 445#ifdef CONFIG_CIFS_DEBUG2
445 int i; 446 int i;
446 cFYI(1, ("SID revision %d num_auth %d", 447 cFYI(1, "SID revision %d num_auth %d",
447 psid->revision, psid->num_subauth)); 448 psid->revision, psid->num_subauth);
448 449
449 for (i = 0; i < psid->num_subauth; i++) { 450 for (i = 0; i < psid->num_subauth; i++) {
450 cFYI(1, ("SID sub_auth[%d]: 0x%x ", i, 451 cFYI(1, "SID sub_auth[%d]: 0x%x ", i,
451 le32_to_cpu(psid->sub_auth[i]))); 452 le32_to_cpu(psid->sub_auth[i]));
452 } 453 }
453 454
454 /* BB add length check to make sure that we do not have huge 455 /* BB add length check to make sure that we do not have huge
455 num auths and therefore go off the end */ 456 num auths and therefore go off the end */
456 cFYI(1, ("RID 0x%x", 457 cFYI(1, "RID 0x%x",
457 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]))); 458 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
458#endif 459#endif
459 } 460 }
460 461
@@ -481,11 +482,11 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
481 le32_to_cpu(pntsd->gsidoffset)); 482 le32_to_cpu(pntsd->gsidoffset));
482 dacloffset = le32_to_cpu(pntsd->dacloffset); 483 dacloffset = le32_to_cpu(pntsd->dacloffset);
483 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); 484 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
484 cFYI(DBG2, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x " 485 cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x "
485 "sacloffset 0x%x dacloffset 0x%x", 486 "sacloffset 0x%x dacloffset 0x%x",
486 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), 487 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
487 le32_to_cpu(pntsd->gsidoffset), 488 le32_to_cpu(pntsd->gsidoffset),
488 le32_to_cpu(pntsd->sacloffset), dacloffset)); 489 le32_to_cpu(pntsd->sacloffset), dacloffset);
489/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ 490/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
490 rc = parse_sid(owner_sid_ptr, end_of_acl); 491 rc = parse_sid(owner_sid_ptr, end_of_acl);
491 if (rc) 492 if (rc)
@@ -499,7 +500,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
499 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, 500 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
500 group_sid_ptr, fattr); 501 group_sid_ptr, fattr);
501 else 502 else
502 cFYI(1, ("no ACL")); /* BB grant all or default perms? */ 503 cFYI(1, "no ACL"); /* BB grant all or default perms? */
503 504
504/* cifscred->uid = owner_sid_ptr->rid; 505/* cifscred->uid = owner_sid_ptr->rid;
505 cifscred->gid = group_sid_ptr->rid; 506 cifscred->gid = group_sid_ptr->rid;
@@ -562,7 +563,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
562 FreeXid(xid); 563 FreeXid(xid);
563 564
564 565
565 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 566 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
566 return pntsd; 567 return pntsd;
567} 568}
568 569
@@ -580,12 +581,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
580 &fid, &oplock, NULL, cifs_sb->local_nls, 581 &fid, &oplock, NULL, cifs_sb->local_nls,
581 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
582 if (rc) { 583 if (rc) {
583 cERROR(1, ("Unable to open file to get ACL")); 584 cERROR(1, "Unable to open file to get ACL");
584 goto out; 585 goto out;
585 } 586 }
586 587
587 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
588 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 589 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
589 590
590 CIFSSMBClose(xid, cifs_sb->tcon, fid); 591 CIFSSMBClose(xid, cifs_sb->tcon, fid);
591 out: 592 out:
@@ -620,7 +621,7 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
620 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
621 FreeXid(xid); 622 FreeXid(xid);
622 623
623 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 624 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
624 return rc; 625 return rc;
625} 626}
626 627
@@ -637,12 +638,12 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
637 &fid, &oplock, NULL, cifs_sb->local_nls, 638 &fid, &oplock, NULL, cifs_sb->local_nls,
638 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
639 if (rc) { 640 if (rc) {
640 cERROR(1, ("Unable to open file to set ACL")); 641 cERROR(1, "Unable to open file to set ACL");
641 goto out; 642 goto out;
642 } 643 }
643 644
644 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
645 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 646 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
646 647
647 CIFSSMBClose(xid, cifs_sb->tcon, fid); 648 CIFSSMBClose(xid, cifs_sb->tcon, fid);
648 out: 649 out:
@@ -658,7 +659,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
658 struct cifsFileInfo *open_file; 659 struct cifsFileInfo *open_file;
659 int rc; 660 int rc;
660 661
661 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); 662 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
662 663
663 open_file = find_readable_file(CIFS_I(inode)); 664 open_file = find_readable_file(CIFS_I(inode));
664 if (!open_file) 665 if (!open_file)
@@ -678,7 +679,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
678 u32 acllen = 0; 679 u32 acllen = 0;
679 int rc = 0; 680 int rc = 0;
680 681
681 cFYI(DBG2, ("converting ACL to mode for %s", path)); 682 cFYI(DBG2, "converting ACL to mode for %s", path);
682 683
683 if (pfid) 684 if (pfid)
684 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); 685 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
@@ -689,7 +690,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
689 if (pntsd) 690 if (pntsd)
690 rc = parse_sec_desc(pntsd, acllen, fattr); 691 rc = parse_sec_desc(pntsd, acllen, fattr);
691 if (rc) 692 if (rc)
692 cFYI(1, ("parse sec desc failed rc = %d", rc)); 693 cFYI(1, "parse sec desc failed rc = %d", rc);
693 694
694 kfree(pntsd); 695 kfree(pntsd);
695 return; 696 return;
@@ -703,7 +704,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
703 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ 704 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
704 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ 705 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
705 706
706 cFYI(DBG2, ("set ACL from mode for %s", path)); 707 cFYI(DBG2, "set ACL from mode for %s", path);
707 708
708 /* Get the security descriptor */ 709 /* Get the security descriptor */
709 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); 710 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
@@ -720,19 +721,19 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
720 DEFSECDESCLEN : secdesclen; 721 DEFSECDESCLEN : secdesclen;
721 pnntsd = kmalloc(secdesclen, GFP_KERNEL); 722 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
722 if (!pnntsd) { 723 if (!pnntsd) {
723 cERROR(1, ("Unable to allocate security descriptor")); 724 cERROR(1, "Unable to allocate security descriptor");
724 kfree(pntsd); 725 kfree(pntsd);
725 return -ENOMEM; 726 return -ENOMEM;
726 } 727 }
727 728
728 rc = build_sec_desc(pntsd, pnntsd, inode, nmode); 729 rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
729 730
730 cFYI(DBG2, ("build_sec_desc rc: %d", rc)); 731 cFYI(DBG2, "build_sec_desc rc: %d", rc);
731 732
732 if (!rc) { 733 if (!rc) {
733 /* Set the security descriptor */ 734 /* Set the security descriptor */
734 rc = set_cifs_acl(pnntsd, secdesclen, inode, path); 735 rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
735 cFYI(DBG2, ("set_cifs_acl rc: %d", rc)); 736 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
736 } 737 }
737 738
738 kfree(pnntsd); 739 kfree(pnntsd);
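
The change running through the cifsacl.c hunks above is mechanical: the extra parentheses around the cFYI()/cERROR() argument lists are dropped, which only works once the macros themselves accept a variable argument list. A minimal sketch of what such a variadic debug macro can look like (the gating and printk prefix here are illustrative; the real definitions live in fs/cifs/cifs_debug.h and may differ):

/* Illustrative variadic debug macro; cifsFYI is the existing global switch. */
#define cFYI(set, fmt, ...)						\
do {									\
	if ((set) && cifsFYI)						\
		printk(KERN_DEBUG "CIFS: " fmt "\n", ##__VA_ARGS__);	\
} while (0)

With a variadic macro in place, the old double-parenthesized form cFYI(1, ("x %d", v)) would pass the comma expression ("x %d", v), which evaluates to just v, as the format argument, so the conversion in these hunks is required for the code to keep compiling, not merely cosmetic.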
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7efe1745494d..847628dfdc44 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/slab.h>
23#include "cifspdu.h" 24#include "cifspdu.h"
24#include "cifsglob.h" 25#include "cifsglob.h"
25#include "cifs_debug.h" 26#include "cifs_debug.h"
@@ -102,7 +103,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
102 if (iov[i].iov_len == 0) 103 if (iov[i].iov_len == 0)
103 continue; 104 continue;
104 if (iov[i].iov_base == NULL) { 105 if (iov[i].iov_base == NULL) {
105 cERROR(1, ("null iovec entry")); 106 cERROR(1, "null iovec entry");
106 return -EIO; 107 return -EIO;
107 } 108 }
108 /* The first entry includes a length field (which does not get 109 /* The first entry includes a length field (which does not get
@@ -180,8 +181,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
180 181
181 /* Do not need to verify session setups with signature "BSRSPYL " */ 182 /* Do not need to verify session setups with signature "BSRSPYL " */
182 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0) 183 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
183 cFYI(1, ("dummy signature received for smb command 0x%x", 184 cFYI(1, "dummy signature received for smb command 0x%x",
184 cifs_pdu->Command)); 185 cifs_pdu->Command);
185 186
186 /* save off the original signature so we can modify the smb and check 187 /* save off the original signature so we can modify the smb and check
187 its signature against what the server sent */ 188 its signature against what the server sent */
@@ -290,7 +291,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
290 if (password) 291 if (password)
291 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); 292 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
292 293
293 if (!encrypt && extended_security & CIFSSEC_MAY_PLNTXT) { 294 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
294 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); 295 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
295 memcpy(lnm_session_key, password_with_pad, 296 memcpy(lnm_session_key, password_with_pad,
296 CIFS_ENCPWD_SIZE); 297 CIFS_ENCPWD_SIZE);
@@ -397,7 +398,7 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
397 /* calculate buf->ntlmv2_hash */ 398 /* calculate buf->ntlmv2_hash */
398 rc = calc_ntlmv2_hash(ses, nls_cp); 399 rc = calc_ntlmv2_hash(ses, nls_cp);
399 if (rc) 400 if (rc)
400 cERROR(1, ("could not get v2 hash rc %d", rc)); 401 cERROR(1, "could not get v2 hash rc %d", rc);
401 CalcNTLMv2_response(ses, resp_buf); 402 CalcNTLMv2_response(ses, resp_buf);
402 403
403 /* now calculate the MAC key for NTLMv2 */ 404 /* now calculate the MAC key for NTLMv2 */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5183bc2a1916..78c02eb4cb1f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -49,10 +49,6 @@
49#include "cifs_spnego.h" 49#include "cifs_spnego.h"
50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ 50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
51 51
52#ifdef CONFIG_CIFS_QUOTA
53static const struct quotactl_ops cifs_quotactl_ops;
54#endif /* QUOTA */
55
56int cifsFYI = 0; 52int cifsFYI = 0;
57int cifsERROR = 1; 53int cifsERROR = 1;
58int traceSMB = 0; 54int traceSMB = 0;
@@ -61,7 +57,7 @@ unsigned int experimEnabled = 0;
61unsigned int linuxExtEnabled = 1; 57unsigned int linuxExtEnabled = 1;
62unsigned int lookupCacheEnabled = 1; 58unsigned int lookupCacheEnabled = 1;
63unsigned int multiuser_mount = 0; 59unsigned int multiuser_mount = 0;
64unsigned int extended_security = CIFSSEC_DEF; 60unsigned int global_secflags = CIFSSEC_DEF;
65/* unsigned int ntlmv2_support = 0; */ 61/* unsigned int ntlmv2_support = 0; */
66unsigned int sign_CIFS_PDUs = 1; 62unsigned int sign_CIFS_PDUs = 1;
67static const struct super_operations cifs_super_ops; 63static const struct super_operations cifs_super_ops;
@@ -86,8 +82,6 @@ extern mempool_t *cifs_sm_req_poolp;
86extern mempool_t *cifs_req_poolp; 82extern mempool_t *cifs_req_poolp;
87extern mempool_t *cifs_mid_poolp; 83extern mempool_t *cifs_mid_poolp;
88 84
89extern struct kmem_cache *cifs_oplock_cachep;
90
91static int 85static int
92cifs_read_super(struct super_block *sb, void *data, 86cifs_read_super(struct super_block *sb, void *data,
93 const char *devname, int silent) 87 const char *devname, int silent)
@@ -103,6 +97,12 @@ cifs_read_super(struct super_block *sb, void *data,
103 if (cifs_sb == NULL) 97 if (cifs_sb == NULL)
104 return -ENOMEM; 98 return -ENOMEM;
105 99
100 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
101 if (rc) {
102 kfree(cifs_sb);
103 return rc;
104 }
105
106#ifdef CONFIG_CIFS_DFS_UPCALL 106#ifdef CONFIG_CIFS_DFS_UPCALL
107 /* copy mount params to sb for use in submounts */ 107 /* copy mount params to sb for use in submounts */
108 /* BB: should we move this after the mount so we 108 /* BB: should we move this after the mount so we
@@ -115,6 +115,7 @@ cifs_read_super(struct super_block *sb, void *data,
115 int len = strlen(data); 115 int len = strlen(data);
116 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); 116 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
117 if (cifs_sb->mountdata == NULL) { 117 if (cifs_sb->mountdata == NULL) {
118 bdi_destroy(&cifs_sb->bdi);
118 kfree(sb->s_fs_info); 119 kfree(sb->s_fs_info);
119 sb->s_fs_info = NULL; 120 sb->s_fs_info = NULL;
120 return -ENOMEM; 121 return -ENOMEM;
@@ -128,19 +129,16 @@ cifs_read_super(struct super_block *sb, void *data,
128 129
129 if (rc) { 130 if (rc) {
130 if (!silent) 131 if (!silent)
131 cERROR(1, 132 cERROR(1, "cifs_mount failed w/return code = %d", rc);
132 ("cifs_mount failed w/return code = %d", rc));
133 goto out_mount_failed; 133 goto out_mount_failed;
134 } 134 }
135 135
136 sb->s_magic = CIFS_MAGIC_NUMBER; 136 sb->s_magic = CIFS_MAGIC_NUMBER;
137 sb->s_op = &cifs_super_ops; 137 sb->s_op = &cifs_super_ops;
138 sb->s_bdi = &cifs_sb->bdi;
138/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) 139/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
139 sb->s_blocksize = 140 sb->s_blocksize =
140 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ 141 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
141#ifdef CONFIG_CIFS_QUOTA
142 sb->s_qcop = &cifs_quotactl_ops;
143#endif
144 sb->s_blocksize = CIFS_MAX_MSGSIZE; 142 sb->s_blocksize = CIFS_MAX_MSGSIZE;
145 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 143 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
146 inode = cifs_root_iget(sb, ROOT_I); 144 inode = cifs_root_iget(sb, ROOT_I);
@@ -160,7 +158,7 @@ cifs_read_super(struct super_block *sb, void *data,
160 158
161#ifdef CONFIG_CIFS_EXPERIMENTAL 159#ifdef CONFIG_CIFS_EXPERIMENTAL
162 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 160 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
163 cFYI(1, ("export ops supported")); 161 cFYI(1, "export ops supported");
164 sb->s_export_op = &cifs_export_ops; 162 sb->s_export_op = &cifs_export_ops;
165 } 163 }
166#endif /* EXPERIMENTAL */ 164#endif /* EXPERIMENTAL */
@@ -168,7 +166,7 @@ cifs_read_super(struct super_block *sb, void *data,
168 return 0; 166 return 0;
169 167
170out_no_root: 168out_no_root:
171 cERROR(1, ("cifs_read_super: get root inode failed")); 169 cERROR(1, "cifs_read_super: get root inode failed");
172 if (inode) 170 if (inode)
173 iput(inode); 171 iput(inode);
174 172
@@ -183,6 +181,7 @@ out_mount_failed:
183 } 181 }
184#endif 182#endif
185 unload_nls(cifs_sb->local_nls); 183 unload_nls(cifs_sb->local_nls);
184 bdi_destroy(&cifs_sb->bdi);
186 kfree(cifs_sb); 185 kfree(cifs_sb);
187 } 186 }
188 return rc; 187 return rc;
@@ -194,10 +193,10 @@ cifs_put_super(struct super_block *sb)
194 int rc = 0; 193 int rc = 0;
195 struct cifs_sb_info *cifs_sb; 194 struct cifs_sb_info *cifs_sb;
196 195
197 cFYI(1, ("In cifs_put_super")); 196 cFYI(1, "In cifs_put_super");
198 cifs_sb = CIFS_SB(sb); 197 cifs_sb = CIFS_SB(sb);
199 if (cifs_sb == NULL) { 198 if (cifs_sb == NULL) {
200 cFYI(1, ("Empty cifs superblock info passed to unmount")); 199 cFYI(1, "Empty cifs superblock info passed to unmount");
201 return; 200 return;
202 } 201 }
203 202
@@ -205,7 +204,7 @@ cifs_put_super(struct super_block *sb)
205 204
206 rc = cifs_umount(sb, cifs_sb); 205 rc = cifs_umount(sb, cifs_sb);
207 if (rc) 206 if (rc)
208 cERROR(1, ("cifs_umount failed with return code %d", rc)); 207 cERROR(1, "cifs_umount failed with return code %d", rc);
209#ifdef CONFIG_CIFS_DFS_UPCALL 208#ifdef CONFIG_CIFS_DFS_UPCALL
210 if (cifs_sb->mountdata) { 209 if (cifs_sb->mountdata) {
211 kfree(cifs_sb->mountdata); 210 kfree(cifs_sb->mountdata);
@@ -214,6 +213,7 @@ cifs_put_super(struct super_block *sb)
214#endif 213#endif
215 214
216 unload_nls(cifs_sb->local_nls); 215 unload_nls(cifs_sb->local_nls);
216 bdi_destroy(&cifs_sb->bdi);
217 kfree(cifs_sb); 217 kfree(cifs_sb);
218 218
219 unlock_kernel(); 219 unlock_kernel();
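
The bdi_setup_and_register()/bdi_destroy() calls threaded through cifs_read_super() and cifs_put_super() above give each cifs superblock its own backing_dev_info, published through sb->s_bdi so the writeback code can find it. The invariant to preserve is that every exit path taken after a successful setup also destroys the bdi, which is why both the out_mount_failed path and cifs_put_super() gain bdi_destroy() calls. A condensed sketch of the same pattern in a generic fill_super (the example_* names and the example_connect() helper are stand-ins, not cifs code):

#include <linux/fs.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>

struct example_sb_info {
	struct backing_dev_info bdi;	/* embedded per-mount bdi */
	/* ... fs-private fields ... */
};

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	struct example_sb_info *sbi;
	int rc;

	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
	if (!sbi)
		return -ENOMEM;

	/* register a per-mount bdi; "example" names it in sysfs */
	rc = bdi_setup_and_register(&sbi->bdi, "example", BDI_CAP_MAP_COPY);
	if (rc) {
		kfree(sbi);
		return rc;
	}
	sb->s_fs_info = sbi;
	sb->s_bdi = &sbi->bdi;		/* writeback now targets this bdi */

	rc = example_connect(sb, data, silent);	/* assumed mount helper */
	if (rc) {
		/* any failure after setup must unwind the bdi too */
		bdi_destroy(&sbi->bdi);
		kfree(sbi);
		sb->s_fs_info = NULL;
	}
	return rc;
}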
@@ -290,7 +290,6 @@ static int cifs_permission(struct inode *inode, int mask)
290static struct kmem_cache *cifs_inode_cachep; 290static struct kmem_cache *cifs_inode_cachep;
291static struct kmem_cache *cifs_req_cachep; 291static struct kmem_cache *cifs_req_cachep;
292static struct kmem_cache *cifs_mid_cachep; 292static struct kmem_cache *cifs_mid_cachep;
293struct kmem_cache *cifs_oplock_cachep;
294static struct kmem_cache *cifs_sm_req_cachep; 293static struct kmem_cache *cifs_sm_req_cachep;
295mempool_t *cifs_sm_req_poolp; 294mempool_t *cifs_sm_req_poolp;
296mempool_t *cifs_req_poolp; 295mempool_t *cifs_req_poolp;
@@ -422,106 +421,6 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
422 return 0; 421 return 0;
423} 422}
424 423
425#ifdef CONFIG_CIFS_QUOTA
426int cifs_xquota_set(struct super_block *sb, int quota_type, qid_t qid,
427 struct fs_disk_quota *pdquota)
428{
429 int xid;
430 int rc = 0;
431 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
432 struct cifsTconInfo *pTcon;
433
434 if (cifs_sb)
435 pTcon = cifs_sb->tcon;
436 else
437 return -EIO;
438
439
440 xid = GetXid();
441 if (pTcon) {
442 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
443 } else
444 rc = -EIO;
445
446 FreeXid(xid);
447 return rc;
448}
449
450int cifs_xquota_get(struct super_block *sb, int quota_type, qid_t qid,
451 struct fs_disk_quota *pdquota)
452{
453 int xid;
454 int rc = 0;
455 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
456 struct cifsTconInfo *pTcon;
457
458 if (cifs_sb)
459 pTcon = cifs_sb->tcon;
460 else
461 return -EIO;
462
463 xid = GetXid();
464 if (pTcon) {
465 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
466 } else
467 rc = -EIO;
468
469 FreeXid(xid);
470 return rc;
471}
472
473int cifs_xstate_set(struct super_block *sb, unsigned int flags, int operation)
474{
475 int xid;
476 int rc = 0;
477 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
478 struct cifsTconInfo *pTcon;
479
480 if (cifs_sb)
481 pTcon = cifs_sb->tcon;
482 else
483 return -EIO;
484
485 xid = GetXid();
486 if (pTcon) {
487 cFYI(1, ("flags: 0x%x operation: 0x%x", flags, operation));
488 } else
489 rc = -EIO;
490
491 FreeXid(xid);
492 return rc;
493}
494
495int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
496{
497 int xid;
498 int rc = 0;
499 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
500 struct cifsTconInfo *pTcon;
501
502 if (cifs_sb)
503 pTcon = cifs_sb->tcon;
504 else
505 return -EIO;
506
507 xid = GetXid();
508 if (pTcon) {
509 cFYI(1, ("pqstats %p", qstats));
510 } else
511 rc = -EIO;
512
513 FreeXid(xid);
514 return rc;
515}
516
517static const struct quotactl_ops cifs_quotactl_ops = {
518 .set_xquota = cifs_xquota_set,
519 .get_xquota = cifs_xquota_get,
520 .set_xstate = cifs_xstate_set,
521 .get_xstate = cifs_xstate_get,
522};
523#endif
524
525static void cifs_umount_begin(struct super_block *sb) 424static void cifs_umount_begin(struct super_block *sb)
526{ 425{
527 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 426 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -548,7 +447,7 @@ static void cifs_umount_begin(struct super_block *sb)
548 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 447 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
549 /* cancel_notify_requests(tcon); */ 448 /* cancel_notify_requests(tcon); */
550 if (tcon->ses && tcon->ses->server) { 449 if (tcon->ses && tcon->ses->server) {
551 cFYI(1, ("wake up tasks now - umount begin not complete")); 450 cFYI(1, "wake up tasks now - umount begin not complete");
552 wake_up_all(&tcon->ses->server->request_q); 451 wake_up_all(&tcon->ses->server->request_q);
553 wake_up_all(&tcon->ses->server->response_q); 452 wake_up_all(&tcon->ses->server->response_q);
554 msleep(1); /* yield */ 453 msleep(1); /* yield */
@@ -599,7 +498,7 @@ cifs_get_sb(struct file_system_type *fs_type,
599 int rc; 498 int rc;
600 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); 499 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
601 500
602 cFYI(1, ("Devname: %s flags: %d ", dev_name, flags)); 501 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
603 502
604 if (IS_ERR(sb)) 503 if (IS_ERR(sb))
605 return PTR_ERR(sb); 504 return PTR_ERR(sb);
@@ -646,7 +545,6 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
646 return generic_file_llseek_unlocked(file, offset, origin); 545 return generic_file_llseek_unlocked(file, offset, origin);
647} 546}
648 547
649#ifdef CONFIG_CIFS_EXPERIMENTAL
650static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 548static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
651{ 549{
652 /* note that this is called by vfs setlease with the BKL held 550 /* note that this is called by vfs setlease with the BKL held
@@ -675,7 +573,6 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
675 else 573 else
676 return -EAGAIN; 574 return -EAGAIN;
677} 575}
678#endif
679 576
680struct file_system_type cifs_fs_type = { 577struct file_system_type cifs_fs_type = {
681 .owner = THIS_MODULE, 578 .owner = THIS_MODULE,
@@ -752,10 +649,7 @@ const struct file_operations cifs_file_ops = {
752#ifdef CONFIG_CIFS_POSIX 649#ifdef CONFIG_CIFS_POSIX
753 .unlocked_ioctl = cifs_ioctl, 650 .unlocked_ioctl = cifs_ioctl,
754#endif /* CONFIG_CIFS_POSIX */ 651#endif /* CONFIG_CIFS_POSIX */
755
756#ifdef CONFIG_CIFS_EXPERIMENTAL
757 .setlease = cifs_setlease, 652 .setlease = cifs_setlease,
758#endif /* CONFIG_CIFS_EXPERIMENTAL */
759}; 653};
760 654
761const struct file_operations cifs_file_direct_ops = { 655const struct file_operations cifs_file_direct_ops = {
@@ -774,9 +668,7 @@ const struct file_operations cifs_file_direct_ops = {
774 .unlocked_ioctl = cifs_ioctl, 668 .unlocked_ioctl = cifs_ioctl,
775#endif /* CONFIG_CIFS_POSIX */ 669#endif /* CONFIG_CIFS_POSIX */
776 .llseek = cifs_llseek, 670 .llseek = cifs_llseek,
777#ifdef CONFIG_CIFS_EXPERIMENTAL
778 .setlease = cifs_setlease, 671 .setlease = cifs_setlease,
779#endif /* CONFIG_CIFS_EXPERIMENTAL */
780}; 672};
781const struct file_operations cifs_file_nobrl_ops = { 673const struct file_operations cifs_file_nobrl_ops = {
782 .read = do_sync_read, 674 .read = do_sync_read,
@@ -793,10 +685,7 @@ const struct file_operations cifs_file_nobrl_ops = {
793#ifdef CONFIG_CIFS_POSIX 685#ifdef CONFIG_CIFS_POSIX
794 .unlocked_ioctl = cifs_ioctl, 686 .unlocked_ioctl = cifs_ioctl,
795#endif /* CONFIG_CIFS_POSIX */ 687#endif /* CONFIG_CIFS_POSIX */
796
797#ifdef CONFIG_CIFS_EXPERIMENTAL
798 .setlease = cifs_setlease, 688 .setlease = cifs_setlease,
799#endif /* CONFIG_CIFS_EXPERIMENTAL */
800}; 689};
801 690
802const struct file_operations cifs_file_direct_nobrl_ops = { 691const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -808,14 +697,13 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
808 .release = cifs_close, 697 .release = cifs_close,
809 .fsync = cifs_fsync, 698 .fsync = cifs_fsync,
810 .flush = cifs_flush, 699 .flush = cifs_flush,
700 .mmap = cifs_file_mmap,
811 .splice_read = generic_file_splice_read, 701 .splice_read = generic_file_splice_read,
812#ifdef CONFIG_CIFS_POSIX 702#ifdef CONFIG_CIFS_POSIX
813 .unlocked_ioctl = cifs_ioctl, 703 .unlocked_ioctl = cifs_ioctl,
814#endif /* CONFIG_CIFS_POSIX */ 704#endif /* CONFIG_CIFS_POSIX */
815 .llseek = cifs_llseek, 705 .llseek = cifs_llseek,
816#ifdef CONFIG_CIFS_EXPERIMENTAL
817 .setlease = cifs_setlease, 706 .setlease = cifs_setlease,
818#endif /* CONFIG_CIFS_EXPERIMENTAL */
819}; 707};
820 708
821const struct file_operations cifs_dir_ops = { 709const struct file_operations cifs_dir_ops = {
@@ -867,7 +755,7 @@ cifs_init_request_bufs(void)
867 } else { 755 } else {
868 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ 756 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
869 } 757 }
870/* cERROR(1,("CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize)); */ 758/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
871 cifs_req_cachep = kmem_cache_create("cifs_request", 759 cifs_req_cachep = kmem_cache_create("cifs_request",
872 CIFSMaxBufSize + 760 CIFSMaxBufSize +
873 MAX_CIFS_HDR_SIZE, 0, 761 MAX_CIFS_HDR_SIZE, 0,
@@ -879,7 +767,7 @@ cifs_init_request_bufs(void)
879 cifs_min_rcv = 1; 767 cifs_min_rcv = 1;
880 else if (cifs_min_rcv > 64) { 768 else if (cifs_min_rcv > 64) {
881 cifs_min_rcv = 64; 769 cifs_min_rcv = 64;
882 cERROR(1, ("cifs_min_rcv set to maximum (64)")); 770 cERROR(1, "cifs_min_rcv set to maximum (64)");
883 } 771 }
884 772
885 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, 773 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
@@ -910,7 +798,7 @@ cifs_init_request_bufs(void)
910 cifs_min_small = 2; 798 cifs_min_small = 2;
911 else if (cifs_min_small > 256) { 799 else if (cifs_min_small > 256) {
912 cifs_min_small = 256; 800 cifs_min_small = 256;
913 cFYI(1, ("cifs_min_small set to maximum (256)")); 801 cFYI(1, "cifs_min_small set to maximum (256)");
914 } 802 }
915 803
916 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, 804 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
@@ -951,15 +839,6 @@ cifs_init_mids(void)
951 return -ENOMEM; 839 return -ENOMEM;
952 } 840 }
953 841
954 cifs_oplock_cachep = kmem_cache_create("cifs_oplock_structs",
955 sizeof(struct oplock_q_entry), 0,
956 SLAB_HWCACHE_ALIGN, NULL);
957 if (cifs_oplock_cachep == NULL) {
958 mempool_destroy(cifs_mid_poolp);
959 kmem_cache_destroy(cifs_mid_cachep);
960 return -ENOMEM;
961 }
962
963 return 0; 842 return 0;
964} 843}
965 844
@@ -968,7 +847,6 @@ cifs_destroy_mids(void)
968{ 847{
969 mempool_destroy(cifs_mid_poolp); 848 mempool_destroy(cifs_mid_poolp);
970 kmem_cache_destroy(cifs_mid_cachep); 849 kmem_cache_destroy(cifs_mid_cachep);
971 kmem_cache_destroy(cifs_oplock_cachep);
972} 850}
973 851
974static int __init 852static int __init
@@ -1008,10 +886,10 @@ init_cifs(void)
1008 886
1009 if (cifs_max_pending < 2) { 887 if (cifs_max_pending < 2) {
1010 cifs_max_pending = 2; 888 cifs_max_pending = 2;
1011 cFYI(1, ("cifs_max_pending set to min of 2")); 889 cFYI(1, "cifs_max_pending set to min of 2");
1012 } else if (cifs_max_pending > 256) { 890 } else if (cifs_max_pending > 256) {
1013 cifs_max_pending = 256; 891 cifs_max_pending = 256;
1014 cFYI(1, ("cifs_max_pending set to max of 256")); 892 cFYI(1, "cifs_max_pending set to max of 256");
1015 } 893 }
1016 894
1017 rc = cifs_init_inodecache(); 895 rc = cifs_init_inodecache();
@@ -1069,7 +947,7 @@ init_cifs(void)
1069static void __exit 947static void __exit
1070exit_cifs(void) 948exit_cifs(void)
1071{ 949{
1072 cFYI(DBG2, ("exit_cifs")); 950 cFYI(DBG2, "exit_cifs");
1073 cifs_proc_clean(); 951 cifs_proc_clean();
1074#ifdef CONFIG_CIFS_DFS_UPCALL 952#ifdef CONFIG_CIFS_DFS_UPCALL
1075 cifs_dfs_release_automount_timer(); 953 cifs_dfs_release_automount_timer();
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7aa57ecdc437..a7eb65c84b1c 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
85 size_t write_size, loff_t *poffset); 85 size_t write_size, loff_t *poffset);
86extern int cifs_lock(struct file *, int, struct file_lock *); 86extern int cifs_lock(struct file *, int, struct file_lock *);
87extern int cifs_fsync(struct file *, struct dentry *, int); 87extern int cifs_fsync(struct file *, int);
88extern int cifs_flush(struct file *, fl_owner_t id); 88extern int cifs_flush(struct file *, fl_owner_t id);
89extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 89extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
90extern const struct file_operations cifs_dir_ops; 90extern const struct file_operations cifs_dir_ops;
@@ -114,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
114extern const struct export_operations cifs_export_ops; 114extern const struct export_operations cifs_export_ops;
115#endif /* EXPERIMENTAL */ 115#endif /* EXPERIMENTAL */
116 116
117#define CIFS_VERSION "1.62" 117#define CIFS_VERSION "1.64"
118#endif /* _CIFSFS_H */ 118#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 63c89d1d70b5..a88479ceaad5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
18 */ 18 */
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/slab.h>
21#include <linux/slow-work.h> 22#include <linux/slow-work.h>
22#include "cifs_fs_sb.h" 23#include "cifs_fs_sb.h"
23#include "cifsacl.h" 24#include "cifsacl.h"
@@ -86,7 +87,6 @@ enum securityEnum {
86 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ 87 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
87/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */ 88/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
88 Kerberos, /* Kerberos via SPNEGO */ 89 Kerberos, /* Kerberos via SPNEGO */
89 MSKerberos, /* MS Kerberos via SPNEGO */
90}; 90};
91 91
92enum protocolEnum { 92enum protocolEnum {
@@ -184,6 +184,12 @@ struct TCP_Server_Info {
184 struct mac_key mac_signing_key; 184 struct mac_key mac_signing_key;
185 char ntlmv2_hash[16]; 185 char ntlmv2_hash[16];
186 unsigned long lstrp; /* when we got last response from this server */ 186 unsigned long lstrp; /* when we got last response from this server */
187 u16 dialect; /* dialect index that server chose */
188 /* extended security flavors that server supports */
189 bool sec_kerberos; /* supports plain Kerberos */
190 bool sec_mskerberos; /* supports legacy MS Kerberos */
191 bool sec_kerberosu2u; /* supports U2U Kerberos */
192 bool sec_ntlmssp; /* supports NTLMSSP */
187}; 193};
188 194
189/* 195/*
@@ -501,6 +507,7 @@ struct dfs_info3_param {
501#define CIFS_FATTR_DFS_REFERRAL 0x1 507#define CIFS_FATTR_DFS_REFERRAL 0x1
502#define CIFS_FATTR_DELETE_PENDING 0x2 508#define CIFS_FATTR_DELETE_PENDING 0x2
503#define CIFS_FATTR_NEED_REVAL 0x4 509#define CIFS_FATTR_NEED_REVAL 0x4
510#define CIFS_FATTR_INO_COLLISION 0x8
504 511
505struct cifs_fattr { 512struct cifs_fattr {
506 u32 cf_flags; 513 u32 cf_flags;
@@ -716,7 +723,7 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
716GLOBAL_EXTERN unsigned int oplockEnabled; 723GLOBAL_EXTERN unsigned int oplockEnabled;
717GLOBAL_EXTERN unsigned int experimEnabled; 724GLOBAL_EXTERN unsigned int experimEnabled;
718GLOBAL_EXTERN unsigned int lookupCacheEnabled; 725GLOBAL_EXTERN unsigned int lookupCacheEnabled;
719GLOBAL_EXTERN unsigned int extended_security; /* if on, session setup sent 726GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
720 with more secure ntlmssp2 challenge/resp */ 727 with more secure ntlmssp2 challenge/resp */
721GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ 728GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
722GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ 729GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 39e47f46dea5..fb1657e0fdb8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -39,8 +39,20 @@ extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
39 unsigned int /* length */); 39 unsigned int /* length */);
40extern unsigned int _GetXid(void); 40extern unsigned int _GetXid(void);
41extern void _FreeXid(unsigned int); 41extern void _FreeXid(unsigned int);
42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid())); 42#define GetXid() \
43#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));} 43({ \
44 int __xid = (int)_GetXid(); \
45 cFYI(1, "CIFS VFS: in %s as Xid: %d with uid: %d", \
46 __func__, __xid, current_fsuid()); \
47 __xid; \
48})
49
50#define FreeXid(curr_xid) \
51do { \
52 _FreeXid(curr_xid); \
53 cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d", \
54 __func__, curr_xid, (int)rc); \
55} while (0)
44extern char *build_path_from_dentry(struct dentry *); 56extern char *build_path_from_dentry(struct dentry *);
45extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); 57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
46extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 58extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
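
The GetXid() rewrite above fixes a macro that expanded to two statements: with the old definition, int xid = GetXid(); ended the declaration at the first semicolon and left the cFYI() call as a separate trailing statement that quietly referenced a variable named xid at the call site, and the macro could not be used inside an if () at all. The new definition is a GNU statement expression, which executes a block and yields its last expression as the macro's value, while FreeXid() gets the standard do { } while (0) wrapper. A small standalone illustration of the statement-expression idiom (the _get_id() helper is hypothetical; compile with gcc, since ({ }) is a GNU extension):

#include <stdio.h>

static unsigned int _get_id(void)	/* hypothetical stand-in for _GetXid() */
{
	static unsigned int next;
	return ++next;
}

/* Runs its statements, then yields __id as the expression's value. */
#define get_id()						\
({								\
	int __id = (int)_get_id();				\
	printf("in %s as id %d\n", __func__, __id);		\
	__id;							\
})

int main(void)
{
	int id = get_id();	/* safe as an initializer, in if (), etc. */
	printf("got %d\n", id);
	return 0;
}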
@@ -73,7 +85,7 @@ extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
73extern unsigned int smbCalcSize(struct smb_hdr *ptr); 85extern unsigned int smbCalcSize(struct smb_hdr *ptr);
74extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
75extern int decode_negTokenInit(unsigned char *security_blob, int length, 87extern int decode_negTokenInit(unsigned char *security_blob, int length,
76 enum securityEnum *secType); 88 struct TCP_Server_Info *server);
77extern int cifs_convert_address(char *src, void *dst); 89extern int cifs_convert_address(char *src, void *dst);
78extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); 90extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
79extern void header_assemble(struct smb_hdr *, char /* command */ , 91extern void header_assemble(struct smb_hdr *, char /* command */ ,
@@ -83,7 +95,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
83 struct cifsSesInfo *ses, 95 struct cifsSesInfo *ses,
84 void **request_buf); 96 void **request_buf);
85extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, 97extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
86 const int stage,
87 const struct nls_table *nls_cp); 98 const struct nls_table *nls_cp);
88extern __u16 GetNextMid(struct TCP_Server_Info *server); 99extern __u16 GetNextMid(struct TCP_Server_Info *server);
89extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); 100extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -95,8 +106,11 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
95 __u16 fileHandle, struct file *file, 106 __u16 fileHandle, struct file *file,
96 struct vfsmount *mnt, unsigned int oflags); 107 struct vfsmount *mnt, unsigned int oflags);
97extern int cifs_posix_open(char *full_path, struct inode **pinode, 108extern int cifs_posix_open(char *full_path, struct inode **pinode,
98 struct vfsmount *mnt, int mode, int oflags, 109 struct vfsmount *mnt,
99 __u32 *poplock, __u16 *pnetfid, int xid); 110 struct super_block *sb,
111 int mode, int oflags,
112 __u32 *poplock, __u16 *pnetfid, int xid);
113void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
100extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 114extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
101 FILE_UNIX_BASIC_INFO *info, 115 FILE_UNIX_BASIC_INFO *info,
102 struct cifs_sb_info *cifs_sb); 116 struct cifs_sb_info *cifs_sb);
@@ -125,7 +139,9 @@ extern void cifs_dfs_release_automount_timer(void);
125void cifs_proc_init(void); 139void cifs_proc_init(void);
126void cifs_proc_clean(void); 140void cifs_proc_clean(void);
127 141
128extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, 142extern int cifs_negotiate_protocol(unsigned int xid,
143 struct cifsSesInfo *ses);
144extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
129 struct nls_table *nls_info); 145 struct nls_table *nls_info);
130extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses); 146extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses);
131 147
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7cc7f83e9314..c65c3419dd37 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifssmb.c 2 * fs/cifs/cifssmb.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2009 4 * Copyright (C) International Business Machines Corp., 2002,2010
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * Contains the routines for constructing the SMB PDUs themselves 7 * Contains the routines for constructing the SMB PDUs themselves
@@ -30,6 +30,7 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/vfs.h> 32#include <linux/vfs.h>
33#include <linux/slab.h>
33#include <linux/posix_acl_xattr.h> 34#include <linux/posix_acl_xattr.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include "cifspdu.h" 36#include "cifspdu.h"
@@ -129,8 +130,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
129 if (smb_command != SMB_COM_WRITE_ANDX && 130 if (smb_command != SMB_COM_WRITE_ANDX &&
130 smb_command != SMB_COM_OPEN_ANDX && 131 smb_command != SMB_COM_OPEN_ANDX &&
131 smb_command != SMB_COM_TREE_DISCONNECT) { 132 smb_command != SMB_COM_TREE_DISCONNECT) {
132 cFYI(1, ("can not send cmd %d while umounting", 133 cFYI(1, "can not send cmd %d while umounting",
133 smb_command)); 134 smb_command);
134 return -ENODEV; 135 return -ENODEV;
135 } 136 }
136 } 137 }
@@ -156,7 +157,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
156 * back on-line 157 * back on-line
157 */ 158 */
158 if (!tcon->retry || ses->status == CifsExiting) { 159 if (!tcon->retry || ses->status == CifsExiting) {
159 cFYI(1, ("gave up waiting on reconnect in smb_init")); 160 cFYI(1, "gave up waiting on reconnect in smb_init");
160 return -EHOSTDOWN; 161 return -EHOSTDOWN;
161 } 162 }
162 } 163 }
@@ -171,7 +172,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
171 * reconnect the same SMB session 172 * reconnect the same SMB session
172 */ 173 */
173 mutex_lock(&ses->session_mutex); 174 mutex_lock(&ses->session_mutex);
174 if (ses->need_reconnect) 175 rc = cifs_negotiate_protocol(0, ses);
176 if (rc == 0 && ses->need_reconnect)
175 rc = cifs_setup_session(0, ses, nls_codepage); 177 rc = cifs_setup_session(0, ses, nls_codepage);
176 178
177 /* do we need to reconnect tcon? */ 179 /* do we need to reconnect tcon? */
@@ -183,7 +185,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
183 mark_open_files_invalid(tcon); 185 mark_open_files_invalid(tcon);
184 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); 186 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
185 mutex_unlock(&ses->session_mutex); 187 mutex_unlock(&ses->session_mutex);
186 cFYI(1, ("reconnect tcon rc = %d", rc)); 188 cFYI(1, "reconnect tcon rc = %d", rc);
187 189
188 if (rc) 190 if (rc)
189 goto out; 191 goto out;
@@ -354,7 +356,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
354 struct TCP_Server_Info *server; 356 struct TCP_Server_Info *server;
355 u16 count; 357 u16 count;
356 unsigned int secFlags; 358 unsigned int secFlags;
357 u16 dialect;
358 359
359 if (ses->server) 360 if (ses->server)
360 server = ses->server; 361 server = ses->server;
@@ -371,9 +372,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
371 if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) 372 if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
372 secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */ 373 secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */
373 else /* if override flags set only sign/seal OR them with global auth */ 374 else /* if override flags set only sign/seal OR them with global auth */
374 secFlags = extended_security | ses->overrideSecFlg; 375 secFlags = global_secflags | ses->overrideSecFlg;
375 376
376 cFYI(1, ("secFlags 0x%x", secFlags)); 377 cFYI(1, "secFlags 0x%x", secFlags);
377 378
378 pSMB->hdr.Mid = GetNextMid(server); 379 pSMB->hdr.Mid = GetNextMid(server);
379 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); 380 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
@@ -381,14 +382,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
381 if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) 382 if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
382 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 383 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
383 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { 384 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
384 cFYI(1, ("Kerberos only mechanism, enable extended security")); 385 cFYI(1, "Kerberos only mechanism, enable extended security");
385 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 386 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
386 } 387 }
387#ifdef CONFIG_CIFS_EXPERIMENTAL 388#ifdef CONFIG_CIFS_EXPERIMENTAL
388 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) 389 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
389 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 390 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
390 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { 391 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
391 cFYI(1, ("NTLMSSP only mechanism, enable extended security")); 392 cFYI(1, "NTLMSSP only mechanism, enable extended security");
392 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 393 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
393 } 394 }
394#endif 395#endif
@@ -407,10 +408,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
407 if (rc != 0) 408 if (rc != 0)
408 goto neg_err_exit; 409 goto neg_err_exit;
409 410
410 dialect = le16_to_cpu(pSMBr->DialectIndex); 411 server->dialect = le16_to_cpu(pSMBr->DialectIndex);
411 cFYI(1, ("Dialect: %d", dialect)); 412 cFYI(1, "Dialect: %d", server->dialect);
412 /* Check wct = 1 error case */ 413 /* Check wct = 1 error case */
413 if ((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) { 414 if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
414 /* core returns wct = 1, but we do not ask for core - otherwise 415 /* core returns wct = 1, but we do not ask for core - otherwise
415 small wct just comes when dialect index is -1 indicating we 416 small wct just comes when dialect index is -1 indicating we
416 could not negotiate a common dialect */ 417 could not negotiate a common dialect */
@@ -418,8 +419,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
418 goto neg_err_exit; 419 goto neg_err_exit;
419#ifdef CONFIG_CIFS_WEAK_PW_HASH 420#ifdef CONFIG_CIFS_WEAK_PW_HASH
420 } else if ((pSMBr->hdr.WordCount == 13) 421 } else if ((pSMBr->hdr.WordCount == 13)
421 && ((dialect == LANMAN_PROT) 422 && ((server->dialect == LANMAN_PROT)
422 || (dialect == LANMAN2_PROT))) { 423 || (server->dialect == LANMAN2_PROT))) {
423 __s16 tmp; 424 __s16 tmp;
424 struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; 425 struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
425 426
@@ -427,8 +428,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
427 (secFlags & CIFSSEC_MAY_PLNTXT)) 428 (secFlags & CIFSSEC_MAY_PLNTXT))
428 server->secType = LANMAN; 429 server->secType = LANMAN;
429 else { 430 else {
430 cERROR(1, ("mount failed weak security disabled" 431 cERROR(1, "mount failed weak security disabled"
431 " in /proc/fs/cifs/SecurityFlags")); 432 " in /proc/fs/cifs/SecurityFlags");
432 rc = -EOPNOTSUPP; 433 rc = -EOPNOTSUPP;
433 goto neg_err_exit; 434 goto neg_err_exit;
434 } 435 }
@@ -461,9 +462,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
461 utc = CURRENT_TIME; 462 utc = CURRENT_TIME;
462 ts = cnvrtDosUnixTm(rsp->SrvTime.Date, 463 ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
463 rsp->SrvTime.Time, 0); 464 rsp->SrvTime.Time, 0);
464 cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d", 465 cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d",
465 (int)ts.tv_sec, (int)utc.tv_sec, 466 (int)ts.tv_sec, (int)utc.tv_sec,
466 (int)(utc.tv_sec - ts.tv_sec))); 467 (int)(utc.tv_sec - ts.tv_sec));
467 val = (int)(utc.tv_sec - ts.tv_sec); 468 val = (int)(utc.tv_sec - ts.tv_sec);
468 seconds = abs(val); 469 seconds = abs(val);
469 result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; 470 result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
@@ -477,7 +478,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
477 server->timeAdj = (int)tmp; 478 server->timeAdj = (int)tmp;
478 server->timeAdj *= 60; /* also in seconds */ 479 server->timeAdj *= 60; /* also in seconds */
479 } 480 }
480 cFYI(1, ("server->timeAdj: %d seconds", server->timeAdj)); 481 cFYI(1, "server->timeAdj: %d seconds", server->timeAdj);
481 482
482 483
483 /* BB get server time for time conversions and add 484 /* BB get server time for time conversions and add
@@ -492,14 +493,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
492 goto neg_err_exit; 493 goto neg_err_exit;
493 } 494 }
494 495
495 cFYI(1, ("LANMAN negotiated")); 496 cFYI(1, "LANMAN negotiated");
496 /* we will not end up setting signing flags - as no signing 497 /* we will not end up setting signing flags - as no signing
497 was in LANMAN and server did not return the flags on */ 498 was in LANMAN and server did not return the flags on */
498 goto signing_check; 499 goto signing_check;
499#else /* weak security disabled */ 500#else /* weak security disabled */
500 } else if (pSMBr->hdr.WordCount == 13) { 501 } else if (pSMBr->hdr.WordCount == 13) {
501 cERROR(1, ("mount failed, cifs module not built " 502 cERROR(1, "mount failed, cifs module not built "
502 "with CIFS_WEAK_PW_HASH support")); 503 "with CIFS_WEAK_PW_HASH support");
503 rc = -EOPNOTSUPP; 504 rc = -EOPNOTSUPP;
504#endif /* WEAK_PW_HASH */ 505#endif /* WEAK_PW_HASH */
505 goto neg_err_exit; 506 goto neg_err_exit;
@@ -511,14 +512,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
511 /* else wct == 17 NTLM */ 512 /* else wct == 17 NTLM */
512 server->secMode = pSMBr->SecurityMode; 513 server->secMode = pSMBr->SecurityMode;
513 if ((server->secMode & SECMODE_USER) == 0) 514 if ((server->secMode & SECMODE_USER) == 0)
514 cFYI(1, ("share mode security")); 515 cFYI(1, "share mode security");
515 516
516 if ((server->secMode & SECMODE_PW_ENCRYPT) == 0) 517 if ((server->secMode & SECMODE_PW_ENCRYPT) == 0)
517#ifdef CONFIG_CIFS_WEAK_PW_HASH 518#ifdef CONFIG_CIFS_WEAK_PW_HASH
518 if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0) 519 if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
519#endif /* CIFS_WEAK_PW_HASH */ 520#endif /* CIFS_WEAK_PW_HASH */
520 cERROR(1, ("Server requests plain text password" 521 cERROR(1, "Server requests plain text password"
521 " but client support disabled")); 522 " but client support disabled");
522 523
523 if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) 524 if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
524 server->secType = NTLMv2; 525 server->secType = NTLMv2;
@@ -538,7 +539,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
538#endif */ 539#endif */
539 else { 540 else {
540 rc = -EOPNOTSUPP; 541 rc = -EOPNOTSUPP;
541 cERROR(1, ("Invalid security type")); 542 cERROR(1, "Invalid security type");
542 goto neg_err_exit; 543 goto neg_err_exit;
543 } 544 }
544 /* else ... any others ...? */ 545 /* else ... any others ...? */
@@ -550,7 +551,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
550 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), 551 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
551 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 552 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
552 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); 553 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
553 cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf)); 554 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
554 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); 555 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
555 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 556 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
556 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 557 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
@@ -581,7 +582,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
581 if (memcmp(server->server_GUID, 582 if (memcmp(server->server_GUID,
582 pSMBr->u.extended_response. 583 pSMBr->u.extended_response.
583 GUID, 16) != 0) { 584 GUID, 16) != 0) {
584 cFYI(1, ("server UID changed")); 585 cFYI(1, "server UID changed");
585 memcpy(server->server_GUID, 586 memcpy(server->server_GUID,
586 pSMBr->u.extended_response.GUID, 587 pSMBr->u.extended_response.GUID,
587 16); 588 16);
@@ -596,13 +597,19 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
596 server->secType = RawNTLMSSP; 597 server->secType = RawNTLMSSP;
597 } else { 598 } else {
598 rc = decode_negTokenInit(pSMBr->u.extended_response. 599 rc = decode_negTokenInit(pSMBr->u.extended_response.
599 SecurityBlob, 600 SecurityBlob, count - 16,
600 count - 16, 601 server);
601 &server->secType);
602 if (rc == 1) 602 if (rc == 1)
603 rc = 0; 603 rc = 0;
604 else 604 else
605 rc = -EINVAL; 605 rc = -EINVAL;
606
607 if (server->sec_kerberos || server->sec_mskerberos)
608 server->secType = Kerberos;
609 else if (server->sec_ntlmssp)
610 server->secType = RawNTLMSSP;
611 else
612 rc = -EOPNOTSUPP;
606 } 613 }
607 } else 614 } else
608 server->capabilities &= ~CAP_EXTENDED_SECURITY; 615 server->capabilities &= ~CAP_EXTENDED_SECURITY;
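
Since decode_negTokenInit() now records each advertised mechanism in per-server booleans instead of returning a single enum (which is why the MSKerberos value disappears from the securityEnum in cifsglob.h), the caller chooses the flavor itself, preferring Kerberos over raw NTLMSSP. Factored out of the hunk above, the selection logic reads roughly as follows (the helper name is ours, not from the patch; it presumes the cifsglob.h definitions):

/* Hypothetical helper mirroring the selection in CIFSSMBNegotiate(). */
static int pick_sectype(struct TCP_Server_Info *server)
{
	if (server->sec_kerberos || server->sec_mskerberos)
		server->secType = Kerberos;	/* strongest offered mech */
	else if (server->sec_ntlmssp)
		server->secType = RawNTLMSSP;
	else
		return -EOPNOTSUPP;	/* no mutually supported mechanism */
	return 0;
}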
@@ -613,22 +620,21 @@ signing_check:
613 if ((secFlags & CIFSSEC_MAY_SIGN) == 0) { 620 if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
614 /* MUST_SIGN already includes the MAY_SIGN FLAG 621 /* MUST_SIGN already includes the MAY_SIGN FLAG
615 so if this is zero it means that signing is disabled */ 622 so if this is zero it means that signing is disabled */
616 cFYI(1, ("Signing disabled")); 623 cFYI(1, "Signing disabled");
617 if (server->secMode & SECMODE_SIGN_REQUIRED) { 624 if (server->secMode & SECMODE_SIGN_REQUIRED) {
618 cERROR(1, ("Server requires " 625 cERROR(1, "Server requires "
619 "packet signing to be enabled in " 626 "packet signing to be enabled in "
620 "/proc/fs/cifs/SecurityFlags.")); 627 "/proc/fs/cifs/SecurityFlags.");
621 rc = -EOPNOTSUPP; 628 rc = -EOPNOTSUPP;
622 } 629 }
623 server->secMode &= 630 server->secMode &=
624 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); 631 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
625 } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { 632 } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
626 /* signing required */ 633 /* signing required */
627 cFYI(1, ("Must sign - secFlags 0x%x", secFlags)); 634 cFYI(1, "Must sign - secFlags 0x%x", secFlags);
628 if ((server->secMode & 635 if ((server->secMode &
629 (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) { 636 (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
630 cERROR(1, 637 cERROR(1, "signing required but server lacks support");
631 ("signing required but server lacks support"));
632 rc = -EOPNOTSUPP; 638 rc = -EOPNOTSUPP;
633 } else 639 } else
634 server->secMode |= SECMODE_SIGN_REQUIRED; 640 server->secMode |= SECMODE_SIGN_REQUIRED;
@@ -642,7 +648,7 @@ signing_check:
642neg_err_exit: 648neg_err_exit:
643 cifs_buf_release(pSMB); 649 cifs_buf_release(pSMB);
644 650
645 cFYI(1, ("negprot rc %d", rc)); 651 cFYI(1, "negprot rc %d", rc);
646 return rc; 652 return rc;
647} 653}
648 654
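
The signing logic above leans on a flag convention: each CIFSSEC_MUST_* constant is defined with the matching CIFSSEC_MAY_* bit included, so testing (secFlags & CIFSSEC_MAY_SIGN) == 0 proves signing is fully disabled (MUST cannot be set either), while (secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN detects the required case. A toy model of that flag lattice (bit values invented for the example):

#include <assert.h>

#define MAY_SIGN	0x00001			/* illustrative values only */
#define MUST_SIGN	(0x10000 | MAY_SIGN)	/* MUST implies MAY */

static int signing_disabled(unsigned int flags)
{
	return (flags & MAY_SIGN) == 0;		/* rules out MUST as well */
}

static int signing_required(unsigned int flags)
{
	return (flags & MUST_SIGN) == MUST_SIGN;
}

int main(void)
{
	assert(signing_disabled(0));
	assert(!signing_disabled(MUST_SIGN));	/* MUST includes MAY */
	assert(signing_required(MUST_SIGN));
	assert(!signing_required(MAY_SIGN));	/* MAY alone is not MUST */
	return 0;
}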
@@ -652,7 +658,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
652 struct smb_hdr *smb_buffer; 658 struct smb_hdr *smb_buffer;
653 int rc = 0; 659 int rc = 0;
654 660
655 cFYI(1, ("In tree disconnect")); 661 cFYI(1, "In tree disconnect");
656 662
657 /* BB: do we need to check this? These should never be NULL. */ 663 /* BB: do we need to check this? These should never be NULL. */
658 if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) 664 if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
@@ -674,7 +680,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
674 680
675 rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0); 681 rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
676 if (rc) 682 if (rc)
677 cFYI(1, ("Tree disconnect failed %d", rc)); 683 cFYI(1, "Tree disconnect failed %d", rc);
678 684
679 /* No need to return error on this operation if tid invalidated and 685 /* No need to return error on this operation if tid invalidated and
680 closed on server already e.g. due to tcp session crashing */ 686 closed on server already e.g. due to tcp session crashing */
@@ -690,7 +696,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
690 LOGOFF_ANDX_REQ *pSMB; 696 LOGOFF_ANDX_REQ *pSMB;
691 int rc = 0; 697 int rc = 0;
692 698
693 cFYI(1, ("In SMBLogoff for session disconnect")); 699 cFYI(1, "In SMBLogoff for session disconnect");
694 700
695 /* 701 /*
696 * BB: do we need to check validity of ses and server? They should 702 * BB: do we need to check validity of ses and server? They should
@@ -743,7 +749,7 @@ CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
743 int bytes_returned = 0; 749 int bytes_returned = 0;
744 __u16 params, param_offset, offset, byte_count; 750 __u16 params, param_offset, offset, byte_count;
745 751
746 cFYI(1, ("In POSIX delete")); 752 cFYI(1, "In POSIX delete");
747PsxDelete: 753PsxDelete:
748 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 754 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
749 (void **) &pSMBr); 755 (void **) &pSMBr);
@@ -795,7 +801,7 @@ PsxDelete:
795 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 801 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
796 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 802 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
797 if (rc) 803 if (rc)
798 cFYI(1, ("Posix delete returned %d", rc)); 804 cFYI(1, "Posix delete returned %d", rc);
799 cifs_buf_release(pSMB); 805 cifs_buf_release(pSMB);
800 806
801 cifs_stats_inc(&tcon->num_deletes); 807 cifs_stats_inc(&tcon->num_deletes);
@@ -842,7 +848,7 @@ DelFileRetry:
842 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 848 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
843 cifs_stats_inc(&tcon->num_deletes); 849 cifs_stats_inc(&tcon->num_deletes);
844 if (rc) 850 if (rc)
845 cFYI(1, ("Error in RMFile = %d", rc)); 851 cFYI(1, "Error in RMFile = %d", rc);
846 852
847 cifs_buf_release(pSMB); 853 cifs_buf_release(pSMB);
848 if (rc == -EAGAIN) 854 if (rc == -EAGAIN)
@@ -861,7 +867,7 @@ CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, const char *dirName,
861 int bytes_returned; 867 int bytes_returned;
862 int name_len; 868 int name_len;
863 869
864 cFYI(1, ("In CIFSSMBRmDir")); 870 cFYI(1, "In CIFSSMBRmDir");
865RmDirRetry: 871RmDirRetry:
866 rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB, 872 rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
867 (void **) &pSMBr); 873 (void **) &pSMBr);
@@ -886,7 +892,7 @@ RmDirRetry:
886 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 892 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
887 cifs_stats_inc(&tcon->num_rmdirs); 893 cifs_stats_inc(&tcon->num_rmdirs);
888 if (rc) 894 if (rc)
889 cFYI(1, ("Error in RMDir = %d", rc)); 895 cFYI(1, "Error in RMDir = %d", rc);
890 896
891 cifs_buf_release(pSMB); 897 cifs_buf_release(pSMB);
892 if (rc == -EAGAIN) 898 if (rc == -EAGAIN)
@@ -904,7 +910,7 @@ CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
904 int bytes_returned; 910 int bytes_returned;
905 int name_len; 911 int name_len;
906 912
907 cFYI(1, ("In CIFSSMBMkDir")); 913 cFYI(1, "In CIFSSMBMkDir");
908MkDirRetry: 914MkDirRetry:
909 rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB, 915 rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
910 (void **) &pSMBr); 916 (void **) &pSMBr);
@@ -929,7 +935,7 @@ MkDirRetry:
929 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 935 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
930 cifs_stats_inc(&tcon->num_mkdirs); 936 cifs_stats_inc(&tcon->num_mkdirs);
931 if (rc) 937 if (rc)
932 cFYI(1, ("Error in Mkdir = %d", rc)); 938 cFYI(1, "Error in Mkdir = %d", rc);
933 939
934 cifs_buf_release(pSMB); 940 cifs_buf_release(pSMB);
935 if (rc == -EAGAIN) 941 if (rc == -EAGAIN)
@@ -952,7 +958,7 @@ CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
952 OPEN_PSX_REQ *pdata; 958 OPEN_PSX_REQ *pdata;
953 OPEN_PSX_RSP *psx_rsp; 959 OPEN_PSX_RSP *psx_rsp;
954 960
955 cFYI(1, ("In POSIX Create")); 961 cFYI(1, "In POSIX Create");
956PsxCreat: 962PsxCreat:
957 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 963 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
958 (void **) &pSMBr); 964 (void **) &pSMBr);
@@ -1006,11 +1012,11 @@ PsxCreat:
1006 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1012 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1007 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 1013 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
1008 if (rc) { 1014 if (rc) {
1009 cFYI(1, ("Posix create returned %d", rc)); 1015 cFYI(1, "Posix create returned %d", rc);
1010 goto psx_create_err; 1016 goto psx_create_err;
1011 } 1017 }
1012 1018
1013 cFYI(1, ("copying inode info")); 1019 cFYI(1, "copying inode info");
1014 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 1020 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
1015 1021
1016 if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) { 1022 if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
@@ -1032,11 +1038,11 @@ PsxCreat:
1032 /* check to make sure response data is there */ 1038 /* check to make sure response data is there */
1033 if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) { 1039 if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
1034 pRetData->Type = cpu_to_le32(-1); /* unknown */ 1040 pRetData->Type = cpu_to_le32(-1); /* unknown */
1035 cFYI(DBG2, ("unknown type")); 1041 cFYI(DBG2, "unknown type");
1036 } else { 1042 } else {
1037 if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP) 1043 if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
1038 + sizeof(FILE_UNIX_BASIC_INFO)) { 1044 + sizeof(FILE_UNIX_BASIC_INFO)) {
1039 cERROR(1, ("Open response data too small")); 1045 cERROR(1, "Open response data too small");
1040 pRetData->Type = cpu_to_le32(-1); 1046 pRetData->Type = cpu_to_le32(-1);
1041 goto psx_create_err; 1047 goto psx_create_err;
1042 } 1048 }
@@ -1083,7 +1089,7 @@ static __u16 convert_disposition(int disposition)
1083 ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; 1089 ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
1084 break; 1090 break;
1085 default: 1091 default:
1086 cFYI(1, ("unknown disposition %d", disposition)); 1092 cFYI(1, "unknown disposition %d", disposition);
1087 ofun = SMBOPEN_OAPPEND; /* regular open */ 1093 ofun = SMBOPEN_OAPPEND; /* regular open */
1088 } 1094 }
1089 return ofun; 1095 return ofun;
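The hunk above touches convert_disposition(), which maps an NT-style create disposition onto the legacy SMBOPEN_* "open function" bits and logs any value it does not recognize before falling back to a plain open. A minimal standalone sketch of that pattern; the bit values and the case label are illustrative stand-ins, not the real cifspdu.h definitions:

#include <stdio.h>

/* Illustrative stand-ins for the SMBOPEN_* bits defined in cifspdu.h. */
#define SMBOPEN_OAPPEND 0x0001
#define SMBOPEN_OTRUNC  0x0002
#define SMBOPEN_OCREATE 0x0010

static unsigned short convert_disp(int disposition)
{
	switch (disposition) {
	case 5:			/* assumed: an overwrite-if disposition */
		return SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
	default:
		fprintf(stderr, "unknown disposition %d\n", disposition);
		return SMBOPEN_OAPPEND;	/* treated as a regular open */
	}
}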
@@ -1174,7 +1180,7 @@ OldOpenRetry:
1174 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1180 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
1175 cifs_stats_inc(&tcon->num_opens); 1181 cifs_stats_inc(&tcon->num_opens);
1176 if (rc) { 1182 if (rc) {
1177 cFYI(1, ("Error in Open = %d", rc)); 1183 cFYI(1, "Error in Open = %d", rc);
1178 } else { 1184 } else {
1179 /* BB verify if wct == 15 */ 1185 /* BB verify if wct == 15 */
1180 1186
@@ -1287,7 +1293,7 @@ openRetry:
1287 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1293 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
1288 cifs_stats_inc(&tcon->num_opens); 1294 cifs_stats_inc(&tcon->num_opens);
1289 if (rc) { 1295 if (rc) {
1290 cFYI(1, ("Error in Open = %d", rc)); 1296 cFYI(1, "Error in Open = %d", rc);
1291 } else { 1297 } else {
1292 *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */ 1298 *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
1293 *netfid = pSMBr->Fid; /* cifs fid stays in le */ 1299 *netfid = pSMBr->Fid; /* cifs fid stays in le */
@@ -1325,7 +1331,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1325 int resp_buf_type = 0; 1331 int resp_buf_type = 0;
1326 struct kvec iov[1]; 1332 struct kvec iov[1];
1327 1333
1328 cFYI(1, ("Reading %d bytes on fid %d", count, netfid)); 1334 cFYI(1, "Reading %d bytes on fid %d", count, netfid);
1329 if (tcon->ses->capabilities & CAP_LARGE_FILES) 1335 if (tcon->ses->capabilities & CAP_LARGE_FILES)
1330 wct = 12; 1336 wct = 12;
1331 else { 1337 else {
@@ -1370,7 +1376,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1370 cifs_stats_inc(&tcon->num_reads); 1376 cifs_stats_inc(&tcon->num_reads);
1371 pSMBr = (READ_RSP *)iov[0].iov_base; 1377 pSMBr = (READ_RSP *)iov[0].iov_base;
1372 if (rc) { 1378 if (rc) {
1373 cERROR(1, ("Send error in read = %d", rc)); 1379 cERROR(1, "Send error in read = %d", rc);
1374 } else { 1380 } else {
1375 int data_length = le16_to_cpu(pSMBr->DataLengthHigh); 1381 int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
1376 data_length = data_length << 16; 1382 data_length = data_length << 16;
@@ -1380,15 +1386,15 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1380 /*check that DataLength would not go beyond end of SMB */ 1386 /*check that DataLength would not go beyond end of SMB */
1381 if ((data_length > CIFSMaxBufSize) 1387 if ((data_length > CIFSMaxBufSize)
1382 || (data_length > count)) { 1388 || (data_length > count)) {
1383 cFYI(1, ("bad length %d for count %d", 1389 cFYI(1, "bad length %d for count %d",
1384 data_length, count)); 1390 data_length, count);
1385 rc = -EIO; 1391 rc = -EIO;
1386 *nbytes = 0; 1392 *nbytes = 0;
1387 } else { 1393 } else {
1388 pReadData = (char *) (&pSMBr->hdr.Protocol) + 1394 pReadData = (char *) (&pSMBr->hdr.Protocol) +
1389 le16_to_cpu(pSMBr->DataOffset); 1395 le16_to_cpu(pSMBr->DataOffset);
1390 /* if (rc = copy_to_user(buf, pReadData, data_length)) { 1396 /* if (rc = copy_to_user(buf, pReadData, data_length)) {
1391 cERROR(1,("Faulting on read rc = %d",rc)); 1397 cERROR(1, "Faulting on read rc = %d",rc);
1392 rc = -EFAULT; 1398 rc = -EFAULT;
1393 }*/ /* can not use copy_to_user when using page cache*/ 1399 }*/ /* can not use copy_to_user when using page cache*/
1394 if (*buf) 1400 if (*buf)
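In the read-response decode above, DataLengthHigh and DataLength are two 16-bit halves of one 32-bit payload length, which is then bounded by both the negotiated buffer size and the byte count the caller requested before DataOffset is used to locate the payload. A sketch of that reconstruction, assuming le16_to_cpu conversion has already been applied (max_buf stands in for CIFSMaxBufSize):

#include <errno.h>
#include <stdint.h>

/* Rebuild the split length and refuse values the server cannot
 * legitimately have produced. */
static int smb_read_len(uint16_t len_high, uint16_t len_low,
			uint32_t count, uint32_t max_buf, uint32_t *out)
{
	uint32_t data_length = ((uint32_t)len_high << 16) | len_low;

	if (data_length > max_buf || data_length > count)
		return -EIO;	/* claimed payload exceeds buffer/request */
	*out = data_length;
	return 0;
}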
@@ -1430,7 +1436,9 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1430 __u32 bytes_sent; 1436 __u32 bytes_sent;
1431 __u16 byte_count; 1437 __u16 byte_count;
1432 1438
1433 /* cFYI(1, ("write at %lld %d bytes", offset, count));*/ 1439 *nbytes = 0;
1440
1441 /* cFYI(1, "write at %lld %d bytes", offset, count);*/
1434 if (tcon->ses == NULL) 1442 if (tcon->ses == NULL)
1435 return -ECONNABORTED; 1443 return -ECONNABORTED;
1436 1444
@@ -1511,12 +1519,19 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1511 (struct smb_hdr *) pSMBr, &bytes_returned, long_op); 1519 (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
1512 cifs_stats_inc(&tcon->num_writes); 1520 cifs_stats_inc(&tcon->num_writes);
1513 if (rc) { 1521 if (rc) {
1514 cFYI(1, ("Send error in write = %d", rc)); 1522 cFYI(1, "Send error in write = %d", rc);
1515 *nbytes = 0;
1516 } else { 1523 } else {
1517 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1524 *nbytes = le16_to_cpu(pSMBr->CountHigh);
1518 *nbytes = (*nbytes) << 16; 1525 *nbytes = (*nbytes) << 16;
1519 *nbytes += le16_to_cpu(pSMBr->Count); 1526 *nbytes += le16_to_cpu(pSMBr->Count);
1527
1528 /*
1529 * Mask off high 16 bits when bytes written as returned by the
1530 * server is greater than bytes requested by the client. Some
1531 * OS/2 servers are known to set incorrect CountHigh values.
1532 */
1533 if (*nbytes > count)
1534 *nbytes &= 0xFFFF;
1520 } 1535 }
1521 1536
1522 cifs_buf_release(pSMB); 1537 cifs_buf_release(pSMB);
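The comment and mask introduced above (and repeated in the CIFSSMBWrite2 hunk below) handle servers, notably OS/2, that return a bogus CountHigh. Because a write can never complete more bytes than were requested, an implausible combined count is reduced to its low 16 bits. A host-order sketch of the same guard:

#include <stdint.h>

static uint32_t smb_write_count(uint16_t count_high, uint16_t count_low,
				uint32_t requested)
{
	uint32_t nbytes = ((uint32_t)count_high << 16) | count_low;

	/* Bogus high word from the server: keep only the low half. */
	if (nbytes > requested)
		nbytes &= 0xFFFF;
	return nbytes;
}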
@@ -1541,7 +1556,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1541 1556
1542 *nbytes = 0; 1557 *nbytes = 0;
1543 1558
1544 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count)); 1559 cFYI(1, "write2 at %lld %d bytes", (long long)offset, count);
1545 1560
1546 if (tcon->ses->capabilities & CAP_LARGE_FILES) { 1561 if (tcon->ses->capabilities & CAP_LARGE_FILES) {
1547 wct = 14; 1562 wct = 14;
@@ -1596,7 +1611,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1596 long_op); 1611 long_op);
1597 cifs_stats_inc(&tcon->num_writes); 1612 cifs_stats_inc(&tcon->num_writes);
1598 if (rc) { 1613 if (rc) {
1599 cFYI(1, ("Send error Write2 = %d", rc)); 1614 cFYI(1, "Send error Write2 = %d", rc);
1600 } else if (resp_buf_type == 0) { 1615 } else if (resp_buf_type == 0) {
1601 /* presumably this can not happen, but best to be safe */ 1616 /* presumably this can not happen, but best to be safe */
1602 rc = -EIO; 1617 rc = -EIO;
@@ -1605,6 +1620,14 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1605 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1620 *nbytes = le16_to_cpu(pSMBr->CountHigh);
1606 *nbytes = (*nbytes) << 16; 1621 *nbytes = (*nbytes) << 16;
1607 *nbytes += le16_to_cpu(pSMBr->Count); 1622 *nbytes += le16_to_cpu(pSMBr->Count);
1623
1624 /*
1625 * Mask off high 16 bits when bytes written as returned by the
1626 * server is greater than bytes requested by the client. OS/2
1627 * servers are known to set incorrect CountHigh values.
1628 */
1629 if (*nbytes > count)
1630 *nbytes &= 0xFFFF;
1608 } 1631 }
1609 1632
1610 /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 1633 /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
@@ -1633,7 +1656,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1633 int timeout = 0; 1656 int timeout = 0;
1634 __u16 count; 1657 __u16 count;
1635 1658
1636 cFYI(1, ("CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock)); 1659 cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock);
1637 rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); 1660 rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
1638 1661
1639 if (rc) 1662 if (rc)
@@ -1681,7 +1704,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1681 } 1704 }
1682 cifs_stats_inc(&tcon->num_locks); 1705 cifs_stats_inc(&tcon->num_locks);
1683 if (rc) 1706 if (rc)
1684 cFYI(1, ("Send error in Lock = %d", rc)); 1707 cFYI(1, "Send error in Lock = %d", rc);
1685 1708
1686 /* Note: On -EAGAIN error only caller can retry on handle based calls 1709 /* Note: On -EAGAIN error only caller can retry on handle based calls
1687 since file handle passed in no longer valid */ 1710 since file handle passed in no longer valid */
@@ -1704,7 +1727,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1704 __u16 params, param_offset, offset, byte_count, count; 1727 __u16 params, param_offset, offset, byte_count, count;
1705 struct kvec iov[1]; 1728 struct kvec iov[1];
1706 1729
1707 cFYI(1, ("Posix Lock")); 1730 cFYI(1, "Posix Lock");
1708 1731
1709 if (pLockData == NULL) 1732 if (pLockData == NULL)
1710 return -EINVAL; 1733 return -EINVAL;
@@ -1774,7 +1797,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1774 } 1797 }
1775 1798
1776 if (rc) { 1799 if (rc) {
1777 cFYI(1, ("Send error in Posix Lock = %d", rc)); 1800 cFYI(1, "Send error in Posix Lock = %d", rc);
1778 } else if (get_flag) { 1801 } else if (get_flag) {
1779 /* lock structure can be returned on get */ 1802 /* lock structure can be returned on get */
1780 __u16 data_offset; 1803 __u16 data_offset;
@@ -1793,8 +1816,21 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1793 } 1816 }
1794 parm_data = (struct cifs_posix_lock *) 1817 parm_data = (struct cifs_posix_lock *)
1795 ((char *)&pSMBr->hdr.Protocol + data_offset); 1818 ((char *)&pSMBr->hdr.Protocol + data_offset);
1796 if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) 1819 if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
1797 pLockData->fl_type = F_UNLCK; 1820 pLockData->fl_type = F_UNLCK;
1821 else {
1822 if (parm_data->lock_type ==
1823 __constant_cpu_to_le16(CIFS_RDLCK))
1824 pLockData->fl_type = F_RDLCK;
1825 else if (parm_data->lock_type ==
1826 __constant_cpu_to_le16(CIFS_WRLCK))
1827 pLockData->fl_type = F_WRLCK;
1828
1829 pLockData->fl_start = parm_data->start;
1830 pLockData->fl_end = parm_data->start +
1831 parm_data->length - 1;
1832 pLockData->fl_pid = parm_data->pid;
1833 }
1798 } 1834 }
1799 1835
1800 plk_err_exit: 1836 plk_err_exit:
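The else-branch added above fills in the caller's lock from a granted conflicting lock in the reply: the little-endian lock_type selects F_RDLCK or F_WRLCK, and the wire (start, length) pair becomes the inclusive [fl_start, fl_end] range. A simplified host-order sketch with hypothetical struct layouts (the driver compares __constant_cpu_to_le16 values in place and uses the CIFS_RDLCK/CIFS_WRLCK/CIFS_UNLCK constants from cifspdu.h):

#include <fcntl.h>	/* F_RDLCK, F_WRLCK, F_UNLCK */
#include <stdint.h>

enum { RD = 0, WR = 1, UN = 2 };	/* assumed wire values */

struct wire_lock { uint16_t lock_type; uint32_t pid; uint64_t start, length; };
struct host_lock { short type; uint64_t start, end; uint32_t pid; };

static void decode_posix_lock(const struct wire_lock *w, struct host_lock *h)
{
	if (w->lock_type == UN) {
		h->type = F_UNLCK;
		return;
	}
	h->type  = (w->lock_type == RD) ? F_RDLCK : F_WRLCK;
	h->start = w->start;
	h->end   = w->start + w->length - 1;	/* inclusive last byte */
	h->pid   = w->pid;
}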
@@ -1818,7 +1854,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1818{ 1854{
1819 int rc = 0; 1855 int rc = 0;
1820 CLOSE_REQ *pSMB = NULL; 1856 CLOSE_REQ *pSMB = NULL;
1821 cFYI(1, ("In CIFSSMBClose")); 1857 cFYI(1, "In CIFSSMBClose");
1822 1858
1823 /* do not retry on dead session on close */ 1859 /* do not retry on dead session on close */
1824 rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB); 1860 rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
@@ -1835,7 +1871,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1835 if (rc) { 1871 if (rc) {
1836 if (rc != -EINTR) { 1872 if (rc != -EINTR) {
1837 /* EINTR is expected when user ctl-c to kill app */ 1873 /* EINTR is expected when user ctl-c to kill app */
1838 cERROR(1, ("Send error in Close = %d", rc)); 1874 cERROR(1, "Send error in Close = %d", rc);
1839 } 1875 }
1840 } 1876 }
1841 1877
@@ -1851,7 +1887,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1851{ 1887{
1852 int rc = 0; 1888 int rc = 0;
1853 FLUSH_REQ *pSMB = NULL; 1889 FLUSH_REQ *pSMB = NULL;
1854 cFYI(1, ("In CIFSSMBFlush")); 1890 cFYI(1, "In CIFSSMBFlush");
1855 1891
1856 rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB); 1892 rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
1857 if (rc) 1893 if (rc)
@@ -1862,7 +1898,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1862 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 1898 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
1863 cifs_stats_inc(&tcon->num_flushes); 1899 cifs_stats_inc(&tcon->num_flushes);
1864 if (rc) 1900 if (rc)
1865 cERROR(1, ("Send error in Flush = %d", rc)); 1901 cERROR(1, "Send error in Flush = %d", rc);
1866 1902
1867 return rc; 1903 return rc;
1868} 1904}
@@ -1879,7 +1915,7 @@ CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
1879 int name_len, name_len2; 1915 int name_len, name_len2;
1880 __u16 count; 1916 __u16 count;
1881 1917
1882 cFYI(1, ("In CIFSSMBRename")); 1918 cFYI(1, "In CIFSSMBRename");
1883 renameRetry: 1919 renameRetry:
1884 rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB, 1920 rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
1885 (void **) &pSMBr); 1921 (void **) &pSMBr);
@@ -1925,7 +1961,7 @@ renameRetry:
1925 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 1961 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
1926 cifs_stats_inc(&tcon->num_renames); 1962 cifs_stats_inc(&tcon->num_renames);
1927 if (rc) 1963 if (rc)
1928 cFYI(1, ("Send error in rename = %d", rc)); 1964 cFYI(1, "Send error in rename = %d", rc);
1929 1965
1930 cifs_buf_release(pSMB); 1966 cifs_buf_release(pSMB);
1931 1967
@@ -1949,7 +1985,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
1949 int len_of_str; 1985 int len_of_str;
1950 __u16 params, param_offset, offset, count, byte_count; 1986 __u16 params, param_offset, offset, count, byte_count;
1951 1987
1952 cFYI(1, ("Rename to File by handle")); 1988 cFYI(1, "Rename to File by handle");
1953 rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB, 1989 rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
1954 (void **) &pSMBr); 1990 (void **) &pSMBr);
1955 if (rc) 1991 if (rc)
@@ -2004,7 +2040,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2004 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2040 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2005 cifs_stats_inc(&pTcon->num_t2renames); 2041 cifs_stats_inc(&pTcon->num_t2renames);
2006 if (rc) 2042 if (rc)
2007 cFYI(1, ("Send error in Rename (by file handle) = %d", rc)); 2043 cFYI(1, "Send error in Rename (by file handle) = %d", rc);
2008 2044
2009 cifs_buf_release(pSMB); 2045 cifs_buf_release(pSMB);
2010 2046
@@ -2026,7 +2062,7 @@ CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
2026 int name_len, name_len2; 2062 int name_len, name_len2;
2027 __u16 count; 2063 __u16 count;
2028 2064
2029 cFYI(1, ("In CIFSSMBCopy")); 2065 cFYI(1, "In CIFSSMBCopy");
2030 copyRetry: 2066 copyRetry:
2031 rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB, 2067 rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
2032 (void **) &pSMBr); 2068 (void **) &pSMBr);
@@ -2071,8 +2107,8 @@ copyRetry:
2071 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2107 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2072 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2108 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2073 if (rc) { 2109 if (rc) {
2074 cFYI(1, ("Send error in copy = %d with %d files copied", 2110 cFYI(1, "Send error in copy = %d with %d files copied",
2075 rc, le16_to_cpu(pSMBr->CopyCount))); 2111 rc, le16_to_cpu(pSMBr->CopyCount));
2076 } 2112 }
2077 cifs_buf_release(pSMB); 2113 cifs_buf_release(pSMB);
2078 2114
@@ -2096,7 +2132,7 @@ CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
2096 int bytes_returned = 0; 2132 int bytes_returned = 0;
2097 __u16 params, param_offset, offset, byte_count; 2133 __u16 params, param_offset, offset, byte_count;
2098 2134
2099 cFYI(1, ("In Symlink Unix style")); 2135 cFYI(1, "In Symlink Unix style");
2100 createSymLinkRetry: 2136 createSymLinkRetry:
2101 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2137 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2102 (void **) &pSMBr); 2138 (void **) &pSMBr);
@@ -2161,7 +2197,7 @@ createSymLinkRetry:
2161 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2197 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2162 cifs_stats_inc(&tcon->num_symlinks); 2198 cifs_stats_inc(&tcon->num_symlinks);
2163 if (rc) 2199 if (rc)
2164 cFYI(1, ("Send error in SetPathInfo create symlink = %d", rc)); 2200 cFYI(1, "Send error in SetPathInfo create symlink = %d", rc);
2165 2201
2166 cifs_buf_release(pSMB); 2202 cifs_buf_release(pSMB);
2167 2203
@@ -2185,7 +2221,7 @@ CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
2185 int bytes_returned = 0; 2221 int bytes_returned = 0;
2186 __u16 params, param_offset, offset, byte_count; 2222 __u16 params, param_offset, offset, byte_count;
2187 2223
2188 cFYI(1, ("In Create Hard link Unix style")); 2224 cFYI(1, "In Create Hard link Unix style");
2189 createHardLinkRetry: 2225 createHardLinkRetry:
2190 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2226 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2191 (void **) &pSMBr); 2227 (void **) &pSMBr);
@@ -2247,7 +2283,7 @@ createHardLinkRetry:
2247 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2283 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2248 cifs_stats_inc(&tcon->num_hardlinks); 2284 cifs_stats_inc(&tcon->num_hardlinks);
2249 if (rc) 2285 if (rc)
2250 cFYI(1, ("Send error in SetPathInfo (hard link) = %d", rc)); 2286 cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc);
2251 2287
2252 cifs_buf_release(pSMB); 2288 cifs_buf_release(pSMB);
2253 if (rc == -EAGAIN) 2289 if (rc == -EAGAIN)
@@ -2268,7 +2304,7 @@ CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
2268 int name_len, name_len2; 2304 int name_len, name_len2;
2269 __u16 count; 2305 __u16 count;
2270 2306
2271 cFYI(1, ("In CIFSCreateHardLink")); 2307 cFYI(1, "In CIFSCreateHardLink");
2272 winCreateHardLinkRetry: 2308 winCreateHardLinkRetry:
2273 2309
2274 rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB, 2310 rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
@@ -2319,7 +2355,7 @@ winCreateHardLinkRetry:
2319 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2355 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2320 cifs_stats_inc(&tcon->num_hardlinks); 2356 cifs_stats_inc(&tcon->num_hardlinks);
2321 if (rc) 2357 if (rc)
2322 cFYI(1, ("Send error in hard link (NT rename) = %d", rc)); 2358 cFYI(1, "Send error in hard link (NT rename) = %d", rc);
2323 2359
2324 cifs_buf_release(pSMB); 2360 cifs_buf_release(pSMB);
2325 if (rc == -EAGAIN) 2361 if (rc == -EAGAIN)
@@ -2342,7 +2378,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
2342 __u16 params, byte_count; 2378 __u16 params, byte_count;
2343 char *data_start; 2379 char *data_start;
2344 2380
2345 cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName)); 2381 cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName);
2346 2382
2347 querySymLinkRetry: 2383 querySymLinkRetry:
2348 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2384 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2389,7 +2425,7 @@ querySymLinkRetry:
2389 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2425 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2390 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2426 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2391 if (rc) { 2427 if (rc) {
2392 cFYI(1, ("Send error in QuerySymLinkInfo = %d", rc)); 2428 cFYI(1, "Send error in QuerySymLinkInfo = %d", rc);
2393 } else { 2429 } else {
2394 /* decode response */ 2430 /* decode response */
2395 2431
@@ -2490,21 +2526,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
2490 2526
2491 /* should we also check that parm and data areas do not overlap? */ 2527 /* should we also check that parm and data areas do not overlap? */
2492 if (*ppparm > end_of_smb) { 2528 if (*ppparm > end_of_smb) {
2493 cFYI(1, ("parms start after end of smb")); 2529 cFYI(1, "parms start after end of smb");
2494 return -EINVAL; 2530 return -EINVAL;
2495 } else if (parm_count + *ppparm > end_of_smb) { 2531 } else if (parm_count + *ppparm > end_of_smb) {
2496 cFYI(1, ("parm end after end of smb")); 2532 cFYI(1, "parm end after end of smb");
2497 return -EINVAL; 2533 return -EINVAL;
2498 } else if (*ppdata > end_of_smb) { 2534 } else if (*ppdata > end_of_smb) {
2499 cFYI(1, ("data starts after end of smb")); 2535 cFYI(1, "data starts after end of smb");
2500 return -EINVAL; 2536 return -EINVAL;
2501 } else if (data_count + *ppdata > end_of_smb) { 2537 } else if (data_count + *ppdata > end_of_smb) {
2502 cFYI(1, ("data %p + count %d (%p) ends after end of smb %p start %p", 2538 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
2503 *ppdata, data_count, (data_count + *ppdata), 2539 *ppdata, data_count, (data_count + *ppdata),
2504 end_of_smb, pSMBr)); 2540 end_of_smb, pSMBr);
2505 return -EINVAL; 2541 return -EINVAL;
2506 } else if (parm_count + data_count > pSMBr->ByteCount) { 2542 } else if (parm_count + data_count > pSMBr->ByteCount) {
2507 cFYI(1, ("parm count and data count larger than SMB")); 2543 cFYI(1, "parm count and data count larger than SMB");
2508 return -EINVAL; 2544 return -EINVAL;
2509 } 2545 }
2510 *pdatalen = data_count; 2546 *pdatalen = data_count;
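validate_ntransact above rejects any response whose advertised parameter or data region starts or ends past the received SMB, and whose combined counts exceed ByteCount. The same discipline expressed in overflow-safe index arithmetic rather than the pointer comparisons the driver uses:

#include <stddef.h>

/* Nonzero iff [offset, offset + count) fits in a buf_len-byte buffer;
 * phrased so that offset + count cannot wrap. */
static int region_in_buf(size_t buf_len, size_t offset, size_t count)
{
	return offset <= buf_len && count <= buf_len - offset;
}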
@@ -2523,7 +2559,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2523 struct smb_com_transaction_ioctl_req *pSMB; 2559 struct smb_com_transaction_ioctl_req *pSMB;
2524 struct smb_com_transaction_ioctl_rsp *pSMBr; 2560 struct smb_com_transaction_ioctl_rsp *pSMBr;
2525 2561
2526 cFYI(1, ("In Windows reparse style QueryLink for path %s", searchName)); 2562 cFYI(1, "In Windows reparse style QueryLink for path %s", searchName);
2527 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, 2563 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
2528 (void **) &pSMBr); 2564 (void **) &pSMBr);
2529 if (rc) 2565 if (rc)
@@ -2552,7 +2588,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2552 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2588 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2553 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2589 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2554 if (rc) { 2590 if (rc) {
2555 cFYI(1, ("Send error in QueryReparseLinkInfo = %d", rc)); 2591 cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc);
2556 } else { /* decode response */ 2592 } else { /* decode response */
2557 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); 2593 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
2558 __u32 data_count = le32_to_cpu(pSMBr->DataCount); 2594 __u32 data_count = le32_to_cpu(pSMBr->DataCount);
@@ -2576,7 +2612,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2576 if ((reparse_buf->LinkNamesBuf + 2612 if ((reparse_buf->LinkNamesBuf +
2577 reparse_buf->TargetNameOffset + 2613 reparse_buf->TargetNameOffset +
2578 reparse_buf->TargetNameLen) > end_of_smb) { 2614 reparse_buf->TargetNameLen) > end_of_smb) {
2579 cFYI(1, ("reparse buf beyond SMB")); 2615 cFYI(1, "reparse buf beyond SMB");
2580 rc = -EIO; 2616 rc = -EIO;
2581 goto qreparse_out; 2617 goto qreparse_out;
2582 } 2618 }
@@ -2597,12 +2633,12 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2597 } 2633 }
2598 } else { 2634 } else {
2599 rc = -EIO; 2635 rc = -EIO;
2600 cFYI(1, ("Invalid return data count on " 2636 cFYI(1, "Invalid return data count on "
2601 "get reparse info ioctl")); 2637 "get reparse info ioctl");
2602 } 2638 }
2603 symlinkinfo[buflen] = 0; /* just in case so the caller 2639 symlinkinfo[buflen] = 0; /* just in case so the caller
2604 does not go off the end of the buffer */ 2640 does not go off the end of the buffer */
2605 cFYI(1, ("readlink result - %s", symlinkinfo)); 2641 cFYI(1, "readlink result - %s", symlinkinfo);
2606 } 2642 }
2607 2643
2608 qreparse_out: 2644 qreparse_out:
@@ -2625,7 +2661,7 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace,
2625 ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); 2661 ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
2626 ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); 2662 ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
2627 ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); 2663 ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
2628 /* cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id)); */ 2664 /* cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
2629 2665
2630 return; 2666 return;
2631} 2667}
@@ -2651,8 +2687,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
2651 size += sizeof(struct cifs_posix_ace) * count; 2687 size += sizeof(struct cifs_posix_ace) * count;
2652 /* check if we would go beyond end of SMB */ 2688 /* check if we would go beyond end of SMB */
2653 if (size_of_data_area < size) { 2689 if (size_of_data_area < size) {
2654 cFYI(1, ("bad CIFS POSIX ACL size %d vs. %d", 2690 cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
2655 size_of_data_area, size)); 2691 size_of_data_area, size);
2656 return -EINVAL; 2692 return -EINVAL;
2657 } 2693 }
2658 } else if (acl_type & ACL_TYPE_DEFAULT) { 2694 } else if (acl_type & ACL_TYPE_DEFAULT) {
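cifs_copy_posix_acl above computes the expected blob size as a fixed header plus one fixed-width record per advertised ACE, and bails out when that would overrun the data area the server actually sent. A guard of the same shape with hypothetical types (a production check would also bound ace_count itself so the multiply cannot overflow):

#include <errno.h>
#include <stddef.h>

struct wire_ace { unsigned char perm, tag; unsigned long long uid; };

static int acl_size_ok(size_t data_area, size_t hdr_size, size_t ace_count)
{
	size_t need = hdr_size + ace_count * sizeof(struct wire_ace);

	return data_area >= need ? 0 : -EINVAL;	/* -EINVAL on short data */
}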
@@ -2699,7 +2735,7 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
2699 cifs_ace->cifs_uid = cpu_to_le64(-1); 2735 cifs_ace->cifs_uid = cpu_to_le64(-1);
2700 } else 2736 } else
2701 cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id)); 2737 cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
2702 /*cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id));*/ 2738 /*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
2703 return rc; 2739 return rc;
2704} 2740}
2705 2741
@@ -2717,12 +2753,12 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
2717 return 0; 2753 return 0;
2718 2754
2719 count = posix_acl_xattr_count((size_t)buflen); 2755 count = posix_acl_xattr_count((size_t)buflen);
2720 cFYI(1, ("setting acl with %d entries from buf of length %d and " 2756 cFYI(1, "setting acl with %d entries from buf of length %d and "
2721 "version of %d", 2757 "version of %d",
2722 count, buflen, le32_to_cpu(local_acl->a_version))); 2758 count, buflen, le32_to_cpu(local_acl->a_version));
2723 if (le32_to_cpu(local_acl->a_version) != 2) { 2759 if (le32_to_cpu(local_acl->a_version) != 2) {
2724 cFYI(1, ("unknown POSIX ACL version %d", 2760 cFYI(1, "unknown POSIX ACL version %d",
2725 le32_to_cpu(local_acl->a_version))); 2761 le32_to_cpu(local_acl->a_version));
2726 return 0; 2762 return 0;
2727 } 2763 }
2728 cifs_acl->version = cpu_to_le16(1); 2764 cifs_acl->version = cpu_to_le16(1);
@@ -2731,7 +2767,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
2731 else if (acl_type == ACL_TYPE_DEFAULT) 2767 else if (acl_type == ACL_TYPE_DEFAULT)
2732 cifs_acl->default_entry_count = cpu_to_le16(count); 2768 cifs_acl->default_entry_count = cpu_to_le16(count);
2733 else { 2769 else {
2734 cFYI(1, ("unknown ACL type %d", acl_type)); 2770 cFYI(1, "unknown ACL type %d", acl_type);
2735 return 0; 2771 return 0;
2736 } 2772 }
2737 for (i = 0; i < count; i++) { 2773 for (i = 0; i < count; i++) {
@@ -2764,7 +2800,7 @@ CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
2764 int name_len; 2800 int name_len;
2765 __u16 params, byte_count; 2801 __u16 params, byte_count;
2766 2802
2767 cFYI(1, ("In GetPosixACL (Unix) for path %s", searchName)); 2803 cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
2768 2804
2769 queryAclRetry: 2805 queryAclRetry:
2770 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2806 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2816,7 +2852,7 @@ queryAclRetry:
2816 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2852 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2817 cifs_stats_inc(&tcon->num_acl_get); 2853 cifs_stats_inc(&tcon->num_acl_get);
2818 if (rc) { 2854 if (rc) {
2819 cFYI(1, ("Send error in Query POSIX ACL = %d", rc)); 2855 cFYI(1, "Send error in Query POSIX ACL = %d", rc);
2820 } else { 2856 } else {
2821 /* decode response */ 2857 /* decode response */
2822 2858
@@ -2853,7 +2889,7 @@ CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
2853 int bytes_returned = 0; 2889 int bytes_returned = 0;
2854 __u16 params, byte_count, data_count, param_offset, offset; 2890 __u16 params, byte_count, data_count, param_offset, offset;
2855 2891
2856 cFYI(1, ("In SetPosixACL (Unix) for path %s", fileName)); 2892 cFYI(1, "In SetPosixACL (Unix) for path %s", fileName);
2857 setAclRetry: 2893 setAclRetry:
2858 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2894 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2859 (void **) &pSMBr); 2895 (void **) &pSMBr);
@@ -2908,7 +2944,7 @@ setAclRetry:
2908 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2944 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2909 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2945 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2910 if (rc) 2946 if (rc)
2911 cFYI(1, ("Set POSIX ACL returned %d", rc)); 2947 cFYI(1, "Set POSIX ACL returned %d", rc);
2912 2948
2913 setACLerrorExit: 2949 setACLerrorExit:
2914 cifs_buf_release(pSMB); 2950 cifs_buf_release(pSMB);
@@ -2928,7 +2964,7 @@ CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
2928 int bytes_returned; 2964 int bytes_returned;
2929 __u16 params, byte_count; 2965 __u16 params, byte_count;
2930 2966
2931 cFYI(1, ("In GetExtAttr")); 2967 cFYI(1, "In GetExtAttr");
2932 if (tcon == NULL) 2968 if (tcon == NULL)
2933 return -ENODEV; 2969 return -ENODEV;
2934 2970
@@ -2967,7 +3003,7 @@ GetExtAttrRetry:
2967 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3003 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2968 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3004 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2969 if (rc) { 3005 if (rc) {
2970 cFYI(1, ("error %d in GetExtAttr", rc)); 3006 cFYI(1, "error %d in GetExtAttr", rc);
2971 } else { 3007 } else {
2972 /* decode response */ 3008 /* decode response */
2973 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3009 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -2982,7 +3018,7 @@ GetExtAttrRetry:
2982 struct file_chattr_info *pfinfo; 3018 struct file_chattr_info *pfinfo;
2983 /* BB Do we need a cast or hash here ? */ 3019 /* BB Do we need a cast or hash here ? */
2984 if (count != 16) { 3020 if (count != 16) {
2985 cFYI(1, ("Illegal size ret in GetExtAttr")); 3021 cFYI(1, "Illegal size ret in GetExtAttr");
2986 rc = -EIO; 3022 rc = -EIO;
2987 goto GetExtAttrOut; 3023 goto GetExtAttrOut;
2988 } 3024 }
@@ -3012,7 +3048,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3012 QUERY_SEC_DESC_REQ *pSMB; 3048 QUERY_SEC_DESC_REQ *pSMB;
3013 struct kvec iov[1]; 3049 struct kvec iov[1];
3014 3050
3015 cFYI(1, ("GetCifsACL")); 3051 cFYI(1, "GetCifsACL");
3016 3052
3017 *pbuflen = 0; 3053 *pbuflen = 0;
3018 *acl_inf = NULL; 3054 *acl_inf = NULL;
@@ -3037,7 +3073,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3037 CIFS_STD_OP); 3073 CIFS_STD_OP);
3038 cifs_stats_inc(&tcon->num_acl_get); 3074 cifs_stats_inc(&tcon->num_acl_get);
3039 if (rc) { 3075 if (rc) {
3040 cFYI(1, ("Send error in QuerySecDesc = %d", rc)); 3076 cFYI(1, "Send error in QuerySecDesc = %d", rc);
3041 } else { /* decode response */ 3077 } else { /* decode response */
3042 __le32 *parm; 3078 __le32 *parm;
3043 __u32 parm_len; 3079 __u32 parm_len;
@@ -3052,7 +3088,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3052 goto qsec_out; 3088 goto qsec_out;
3053 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base; 3089 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
3054 3090
3055 cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, *acl_inf)); 3091 cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf);
3056 3092
3057 if (le32_to_cpu(pSMBr->ParameterCount) != 4) { 3093 if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
3058 rc = -EIO; /* bad smb */ 3094 rc = -EIO; /* bad smb */
@@ -3064,8 +3100,8 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3064 3100
3065 acl_len = le32_to_cpu(*parm); 3101 acl_len = le32_to_cpu(*parm);
3066 if (acl_len != *pbuflen) { 3102 if (acl_len != *pbuflen) {
3067 cERROR(1, ("acl length %d does not match %d", 3103 cERROR(1, "acl length %d does not match %d",
3068 acl_len, *pbuflen)); 3104 acl_len, *pbuflen);
3069 if (*pbuflen > acl_len) 3105 if (*pbuflen > acl_len)
3070 *pbuflen = acl_len; 3106 *pbuflen = acl_len;
3071 } 3107 }
@@ -3074,7 +3110,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3074 header followed by the smallest SID */ 3110 header followed by the smallest SID */
3075 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || 3111 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
3076 (*pbuflen >= 64 * 1024)) { 3112 (*pbuflen >= 64 * 1024)) {
3077 cERROR(1, ("bad acl length %d", *pbuflen)); 3113 cERROR(1, "bad acl length %d", *pbuflen);
3078 rc = -EINVAL; 3114 rc = -EINVAL;
3079 *pbuflen = 0; 3115 *pbuflen = 0;
3080 } else { 3116 } else {
@@ -3148,9 +3184,9 @@ setCifsAclRetry:
3148 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3184 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3149 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3185 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3150 3186
3151 cFYI(1, ("SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc)); 3187 cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc);
3152 if (rc) 3188 if (rc)
3153 cFYI(1, ("Set CIFS ACL returned %d", rc)); 3189 cFYI(1, "Set CIFS ACL returned %d", rc);
3154 cifs_buf_release(pSMB); 3190 cifs_buf_release(pSMB);
3155 3191
3156 if (rc == -EAGAIN) 3192 if (rc == -EAGAIN)
@@ -3174,7 +3210,7 @@ int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
3174 int bytes_returned; 3210 int bytes_returned;
3175 int name_len; 3211 int name_len;
3176 3212
3177 cFYI(1, ("In SMBQPath path %s", searchName)); 3213 cFYI(1, "In SMBQPath path %s", searchName);
3178 QInfRetry: 3214 QInfRetry:
3179 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB, 3215 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
3180 (void **) &pSMBr); 3216 (void **) &pSMBr);
@@ -3200,7 +3236,7 @@ QInfRetry:
3200 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3236 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3201 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3237 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3202 if (rc) { 3238 if (rc) {
3203 cFYI(1, ("Send error in QueryInfo = %d", rc)); 3239 cFYI(1, "Send error in QueryInfo = %d", rc);
3204 } else if (pFinfo) { 3240 } else if (pFinfo) {
3205 struct timespec ts; 3241 struct timespec ts;
3206 __u32 time = le32_to_cpu(pSMBr->last_write_time); 3242 __u32 time = le32_to_cpu(pSMBr->last_write_time);
@@ -3274,7 +3310,7 @@ QFileInfoRetry:
3274 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3310 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3275 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3311 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3276 if (rc) { 3312 if (rc) {
3277 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3313 cFYI(1, "Send error in QPathInfo = %d", rc);
3278 } else { /* decode response */ 3314 } else { /* decode response */
3279 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3315 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3280 3316
@@ -3312,7 +3348,7 @@ CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
3312 int name_len; 3348 int name_len;
3313 __u16 params, byte_count; 3349 __u16 params, byte_count;
3314 3350
3315 /* cFYI(1, ("In QPathInfo path %s", searchName)); */ 3351 /* cFYI(1, "In QPathInfo path %s", searchName); */
3316 QPathInfoRetry: 3352 QPathInfoRetry:
3317 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3353 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3318 (void **) &pSMBr); 3354 (void **) &pSMBr);
@@ -3362,7 +3398,7 @@ QPathInfoRetry:
3362 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3398 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3363 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3399 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3364 if (rc) { 3400 if (rc) {
3365 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3401 cFYI(1, "Send error in QPathInfo = %d", rc);
3366 } else { /* decode response */ 3402 } else { /* decode response */
3367 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3403 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3368 3404
@@ -3442,14 +3478,14 @@ UnixQFileInfoRetry:
3442 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3478 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3443 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3479 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3444 if (rc) { 3480 if (rc) {
3445 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3481 cFYI(1, "Send error in QPathInfo = %d", rc);
3446 } else { /* decode response */ 3482 } else { /* decode response */
3447 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3483 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3448 3484
3449 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) { 3485 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3450 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n" 3486 cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
3451 "Unix Extensions can be disabled on mount " 3487 "Unix Extensions can be disabled on mount "
3452 "by specifying the nosfu mount option.")); 3488 "by specifying the nosfu mount option.");
3453 rc = -EIO; /* bad smb */ 3489 rc = -EIO; /* bad smb */
3454 } else { 3490 } else {
3455 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 3491 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3481,7 +3517,7 @@ CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
3481 int name_len; 3517 int name_len;
3482 __u16 params, byte_count; 3518 __u16 params, byte_count;
3483 3519
3484 cFYI(1, ("In QPathInfo (Unix) the path %s", searchName)); 3520 cFYI(1, "In QPathInfo (Unix) the path %s", searchName);
3485 UnixQPathInfoRetry: 3521 UnixQPathInfoRetry:
3486 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3522 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3487 (void **) &pSMBr); 3523 (void **) &pSMBr);
@@ -3528,14 +3564,14 @@ UnixQPathInfoRetry:
3528 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3564 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3529 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3565 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3530 if (rc) { 3566 if (rc) {
3531 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3567 cFYI(1, "Send error in QPathInfo = %d", rc);
3532 } else { /* decode response */ 3568 } else { /* decode response */
3533 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3569 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3534 3570
3535 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) { 3571 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3536 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n" 3572 cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
3537 "Unix Extensions can be disabled on mount " 3573 "Unix Extensions can be disabled on mount "
3538 "by specifying the nosfu mount option.")); 3574 "by specifying the nosfu mount option.");
3539 rc = -EIO; /* bad smb */ 3575 rc = -EIO; /* bad smb */
3540 } else { 3576 } else {
3541 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 3577 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3569,7 +3605,7 @@ CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
3569 int name_len; 3605 int name_len;
3570 __u16 params, byte_count; 3606 __u16 params, byte_count;
3571 3607
3572 cFYI(1, ("In FindFirst for %s", searchName)); 3608 cFYI(1, "In FindFirst for %s", searchName);
3573 3609
3574 findFirstRetry: 3610 findFirstRetry:
3575 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3611 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3646,7 +3682,7 @@ findFirstRetry:
3646 if (rc) {/* BB add logic to retry regular search if Unix search 3682 if (rc) {/* BB add logic to retry regular search if Unix search
3647 rejected unexpectedly by server */ 3683 rejected unexpectedly by server */
3648 /* BB Add code to handle unsupported level rc */ 3684 /* BB Add code to handle unsupported level rc */
3649 cFYI(1, ("Error in FindFirst = %d", rc)); 3685 cFYI(1, "Error in FindFirst = %d", rc);
3650 3686
3651 cifs_buf_release(pSMB); 3687 cifs_buf_release(pSMB);
3652 3688
@@ -3685,7 +3721,7 @@ findFirstRetry:
3685 lnoff = le16_to_cpu(parms->LastNameOffset); 3721 lnoff = le16_to_cpu(parms->LastNameOffset);
3686 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 3722 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3687 lnoff) { 3723 lnoff) {
3688 cERROR(1, ("ignoring corrupt resume name")); 3724 cERROR(1, "ignoring corrupt resume name");
3689 psrch_inf->last_entry = NULL; 3725 psrch_inf->last_entry = NULL;
3690 return rc; 3726 return rc;
3691 } 3727 }
@@ -3713,7 +3749,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3713 int bytes_returned, name_len; 3749 int bytes_returned, name_len;
3714 __u16 params, byte_count; 3750 __u16 params, byte_count;
3715 3751
3716 cFYI(1, ("In FindNext")); 3752 cFYI(1, "In FindNext");
3717 3753
3718 if (psrch_inf->endOfSearch) 3754 if (psrch_inf->endOfSearch)
3719 return -ENOENT; 3755 return -ENOENT;
@@ -3777,7 +3813,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3777 cifs_buf_release(pSMB); 3813 cifs_buf_release(pSMB);
3778 rc = 0; /* search probably was closed at end of search*/ 3814 rc = 0; /* search probably was closed at end of search*/
3779 } else 3815 } else
3780 cFYI(1, ("FindNext returned = %d", rc)); 3816 cFYI(1, "FindNext returned = %d", rc);
3781 } else { /* decode response */ 3817 } else { /* decode response */
3782 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3818 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3783 3819
@@ -3813,15 +3849,15 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3813 lnoff = le16_to_cpu(parms->LastNameOffset); 3849 lnoff = le16_to_cpu(parms->LastNameOffset);
3814 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 3850 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3815 lnoff) { 3851 lnoff) {
3816 cERROR(1, ("ignoring corrupt resume name")); 3852 cERROR(1, "ignoring corrupt resume name");
3817 psrch_inf->last_entry = NULL; 3853 psrch_inf->last_entry = NULL;
3818 return rc; 3854 return rc;
3819 } else 3855 } else
3820 psrch_inf->last_entry = 3856 psrch_inf->last_entry =
3821 psrch_inf->srch_entries_start + lnoff; 3857 psrch_inf->srch_entries_start + lnoff;
3822 3858
3823/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d", 3859/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d",
3824 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */ 3860 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
3825 3861
3826 /* BB fixme add unlock here */ 3862 /* BB fixme add unlock here */
3827 } 3863 }
@@ -3846,7 +3882,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
3846 int rc = 0; 3882 int rc = 0;
3847 FINDCLOSE_REQ *pSMB = NULL; 3883 FINDCLOSE_REQ *pSMB = NULL;
3848 3884
3849 cFYI(1, ("In CIFSSMBFindClose")); 3885 cFYI(1, "In CIFSSMBFindClose");
3850 rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB); 3886 rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
3851 3887
3852 /* no sense returning error if session restarted 3888 /* no sense returning error if session restarted
@@ -3860,7 +3896,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
3860 pSMB->ByteCount = 0; 3896 pSMB->ByteCount = 0;
3861 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 3897 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
3862 if (rc) 3898 if (rc)
3863 cERROR(1, ("Send error in FindClose = %d", rc)); 3899 cERROR(1, "Send error in FindClose = %d", rc);
3864 3900
3865 cifs_stats_inc(&tcon->num_fclose); 3901 cifs_stats_inc(&tcon->num_fclose);
3866 3902
@@ -3883,7 +3919,7 @@ CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
3883 int name_len, bytes_returned; 3919 int name_len, bytes_returned;
3884 __u16 params, byte_count; 3920 __u16 params, byte_count;
3885 3921
3886 cFYI(1, ("In GetSrvInodeNum for %s", searchName)); 3922 cFYI(1, "In GetSrvInodeNum for %s", searchName);
3887 if (tcon == NULL) 3923 if (tcon == NULL)
3888 return -ENODEV; 3924 return -ENODEV;
3889 3925
@@ -3933,7 +3969,7 @@ GetInodeNumberRetry:
3933 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3969 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3934 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3970 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3935 if (rc) { 3971 if (rc) {
3936 cFYI(1, ("error %d in QueryInternalInfo", rc)); 3972 cFYI(1, "error %d in QueryInternalInfo", rc);
3937 } else { 3973 } else {
3938 /* decode response */ 3974 /* decode response */
3939 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3975 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3948,7 +3984,7 @@ GetInodeNumberRetry:
3948 struct file_internal_info *pfinfo; 3984 struct file_internal_info *pfinfo;
3949 /* BB Do we need a cast or hash here ? */ 3985 /* BB Do we need a cast or hash here ? */
3950 if (count < 8) { 3986 if (count < 8) {
3951 cFYI(1, ("Illegal size ret in QryIntrnlInf")); 3987 cFYI(1, "Illegal size ret in QryIntrnlInf");
3952 rc = -EIO; 3988 rc = -EIO;
3953 goto GetInodeNumOut; 3989 goto GetInodeNumOut;
3954 } 3990 }
@@ -3989,16 +4025,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3989 *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); 4025 *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
3990 4026
3991 if (*num_of_nodes < 1) { 4027 if (*num_of_nodes < 1) {
3992 cERROR(1, ("num_referrals: must be at least > 0," 4028 cERROR(1, "num_referrals: must be at least > 0,"
3993 "but we get num_referrals = %d\n", *num_of_nodes)); 4029 "but we get num_referrals = %d\n", *num_of_nodes);
3994 rc = -EINVAL; 4030 rc = -EINVAL;
3995 goto parse_DFS_referrals_exit; 4031 goto parse_DFS_referrals_exit;
3996 } 4032 }
3997 4033
3998 ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); 4034 ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
3999 if (ref->VersionNumber != cpu_to_le16(3)) { 4035 if (ref->VersionNumber != cpu_to_le16(3)) {
4000 cERROR(1, ("Referrals of V%d version are not supported," 4036 cERROR(1, "Referrals of V%d version are not supported,"
4001 "should be V3", le16_to_cpu(ref->VersionNumber))); 4037 "should be V3", le16_to_cpu(ref->VersionNumber));
4002 rc = -EINVAL; 4038 rc = -EINVAL;
4003 goto parse_DFS_referrals_exit; 4039 goto parse_DFS_referrals_exit;
4004 } 4040 }
@@ -4007,14 +4043,14 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4007 data_end = (char *)(&(pSMBr->PathConsumed)) + 4043 data_end = (char *)(&(pSMBr->PathConsumed)) +
4008 le16_to_cpu(pSMBr->t2.DataCount); 4044 le16_to_cpu(pSMBr->t2.DataCount);
4009 4045
4010 cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n", 4046 cFYI(1, "num_referrals: %d dfs flags: 0x%x ...\n",
4011 *num_of_nodes, 4047 *num_of_nodes,
4012 le32_to_cpu(pSMBr->DFSFlags))); 4048 le32_to_cpu(pSMBr->DFSFlags));
4013 4049
4014 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * 4050 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
4015 *num_of_nodes, GFP_KERNEL); 4051 *num_of_nodes, GFP_KERNEL);
4016 if (*target_nodes == NULL) { 4052 if (*target_nodes == NULL) {
4017 cERROR(1, ("Failed to allocate buffer for target_nodes\n")); 4053 cERROR(1, "Failed to allocate buffer for target_nodes\n");
4018 rc = -ENOMEM; 4054 rc = -ENOMEM;
4019 goto parse_DFS_referrals_exit; 4055 goto parse_DFS_referrals_exit;
4020 } 4056 }
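parse_DFS_referrals above gates on two conditions before allocating the result array: the referral count must be positive, and the records must be version 3, the only layout the parser understands. A condensed host-order sketch of those checks and the zeroed allocation (dfs_node is a hypothetical stand-in for dfs_info3_param):

#include <errno.h>
#include <stdlib.h>

struct dfs_node { char *node_name; };	/* hypothetical stand-in */

static int check_referrals(int num, int version, struct dfs_node **out)
{
	if (num < 1)
		return -EINVAL;		/* empty referral list */
	if (version != 3)
		return -EINVAL;		/* only V3 records are parsed */
	*out = calloc(num, sizeof(**out));	/* kzalloc analogue */
	return *out ? 0 : -ENOMEM;
}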
@@ -4090,7 +4126,7 @@ CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
4090 *num_of_nodes = 0; 4126 *num_of_nodes = 0;
4091 *target_nodes = NULL; 4127 *target_nodes = NULL;
4092 4128
4093 cFYI(1, ("In GetDFSRefer the path %s", searchName)); 4129 cFYI(1, "In GetDFSRefer the path %s", searchName);
4094 if (ses == NULL) 4130 if (ses == NULL)
4095 return -ENODEV; 4131 return -ENODEV;
4096 getDFSRetry: 4132 getDFSRetry:
@@ -4157,7 +4193,7 @@ getDFSRetry:
4157 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, 4193 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
4158 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4194 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4159 if (rc) { 4195 if (rc) {
4160 cFYI(1, ("Send error in GetDFSRefer = %d", rc)); 4196 cFYI(1, "Send error in GetDFSRefer = %d", rc);
4161 goto GetDFSRefExit; 4197 goto GetDFSRefExit;
4162 } 4198 }
4163 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4199 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4168,9 +4204,9 @@ getDFSRetry:
4168 goto GetDFSRefExit; 4204 goto GetDFSRefExit;
4169 } 4205 }
4170 4206
4171 cFYI(1, ("Decoding GetDFSRefer response BCC: %d Offset %d", 4207 cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d",
4172 pSMBr->ByteCount, 4208 pSMBr->ByteCount,
4173 le16_to_cpu(pSMBr->t2.DataOffset))); 4209 le16_to_cpu(pSMBr->t2.DataOffset));
4174 4210
4175 /* parse returned result into more usable form */ 4211 /* parse returned result into more usable form */
4176 rc = parse_DFS_referrals(pSMBr, num_of_nodes, 4212 rc = parse_DFS_referrals(pSMBr, num_of_nodes,
@@ -4198,7 +4234,7 @@ SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
4198 int bytes_returned = 0; 4234 int bytes_returned = 0;
4199 __u16 params, byte_count; 4235 __u16 params, byte_count;
4200 4236
4201 cFYI(1, ("OldQFSInfo")); 4237 cFYI(1, "OldQFSInfo");
4202 oldQFSInfoRetry: 4238 oldQFSInfoRetry:
4203 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4239 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4204 (void **) &pSMBr); 4240 (void **) &pSMBr);
@@ -4231,7 +4267,7 @@ oldQFSInfoRetry:
4231 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4267 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4232 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4268 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4233 if (rc) { 4269 if (rc) {
4234 cFYI(1, ("Send error in QFSInfo = %d", rc)); 4270 cFYI(1, "Send error in QFSInfo = %d", rc);
4235 } else { /* decode response */ 4271 } else { /* decode response */
4236 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4272 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4237 4273
@@ -4239,8 +4275,8 @@ oldQFSInfoRetry:
4239 rc = -EIO; /* bad smb */ 4275 rc = -EIO; /* bad smb */
4240 else { 4276 else {
4241 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 4277 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
4242 cFYI(1, ("qfsinf resp BCC: %d Offset %d", 4278 cFYI(1, "qfsinf resp BCC: %d Offset %d",
4243 pSMBr->ByteCount, data_offset)); 4279 pSMBr->ByteCount, data_offset);
4244 4280
4245 response_data = (FILE_SYSTEM_ALLOC_INFO *) 4281 response_data = (FILE_SYSTEM_ALLOC_INFO *)
4246 (((char *) &pSMBr->hdr.Protocol) + data_offset); 4282 (((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4252,11 +4288,10 @@ oldQFSInfoRetry:
4252 le32_to_cpu(response_data->TotalAllocationUnits); 4288 le32_to_cpu(response_data->TotalAllocationUnits);
4253 FSData->f_bfree = FSData->f_bavail = 4289 FSData->f_bfree = FSData->f_bavail =
4254 le32_to_cpu(response_data->FreeAllocationUnits); 4290 le32_to_cpu(response_data->FreeAllocationUnits);
4255 cFYI(1, 4291 cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
4256 ("Blocks: %lld Free: %lld Block size %ld", 4292 (unsigned long long)FSData->f_blocks,
4257 (unsigned long long)FSData->f_blocks, 4293 (unsigned long long)FSData->f_bfree,
4258 (unsigned long long)FSData->f_bfree, 4294 FSData->f_bsize);
4259 FSData->f_bsize));
4260 } 4295 }
4261 } 4296 }
4262 cifs_buf_release(pSMB); 4297 cifs_buf_release(pSMB);
@@ -4278,7 +4313,7 @@ CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
4278 int bytes_returned = 0; 4313 int bytes_returned = 0;
4279 __u16 params, byte_count; 4314 __u16 params, byte_count;
4280 4315
4281 cFYI(1, ("In QFSInfo")); 4316 cFYI(1, "In QFSInfo");
4282 QFSInfoRetry: 4317 QFSInfoRetry:
4283 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4318 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4284 (void **) &pSMBr); 4319 (void **) &pSMBr);
@@ -4311,7 +4346,7 @@ QFSInfoRetry:
4311 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4346 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4312 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4347 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4313 if (rc) { 4348 if (rc) {
4314 cFYI(1, ("Send error in QFSInfo = %d", rc)); 4349 cFYI(1, "Send error in QFSInfo = %d", rc);
4315 } else { /* decode response */ 4350 } else { /* decode response */
4316 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4351 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4317 4352
@@ -4332,11 +4367,10 @@ QFSInfoRetry:
4332 le64_to_cpu(response_data->TotalAllocationUnits); 4367 le64_to_cpu(response_data->TotalAllocationUnits);
4333 FSData->f_bfree = FSData->f_bavail = 4368 FSData->f_bfree = FSData->f_bavail =
4334 le64_to_cpu(response_data->FreeAllocationUnits); 4369 le64_to_cpu(response_data->FreeAllocationUnits);
4335 cFYI(1, 4370 cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
4336 ("Blocks: %lld Free: %lld Block size %ld", 4371 (unsigned long long)FSData->f_blocks,
4337 (unsigned long long)FSData->f_blocks, 4372 (unsigned long long)FSData->f_bfree,
4338 (unsigned long long)FSData->f_bfree, 4373 FSData->f_bsize);
4339 FSData->f_bsize));
4340 } 4374 }
4341 } 4375 }
4342 cifs_buf_release(pSMB); 4376 cifs_buf_release(pSMB);
@@ -4358,7 +4392,7 @@ CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon)
4358 int bytes_returned = 0; 4392 int bytes_returned = 0;
4359 __u16 params, byte_count; 4393 __u16 params, byte_count;
4360 4394
4361 cFYI(1, ("In QFSAttributeInfo")); 4395 cFYI(1, "In QFSAttributeInfo");
4362 QFSAttributeRetry: 4396 QFSAttributeRetry:
4363 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4397 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4364 (void **) &pSMBr); 4398 (void **) &pSMBr);
@@ -4392,7 +4426,7 @@ QFSAttributeRetry:
4392 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4426 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4393 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4427 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4394 if (rc) { 4428 if (rc) {
4395 cERROR(1, ("Send error in QFSAttributeInfo = %d", rc)); 4429 cERROR(1, "Send error in QFSAttributeInfo = %d", rc);
4396 } else { /* decode response */ 4430 } else { /* decode response */
4397 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4431 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4398 4432
@@ -4428,7 +4462,7 @@ CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon)
4428 int bytes_returned = 0; 4462 int bytes_returned = 0;
4429 __u16 params, byte_count; 4463 __u16 params, byte_count;
4430 4464
4431 cFYI(1, ("In QFSDeviceInfo")); 4465 cFYI(1, "In QFSDeviceInfo");
4432 QFSDeviceRetry: 4466 QFSDeviceRetry:
4433 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4467 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4434 (void **) &pSMBr); 4468 (void **) &pSMBr);
@@ -4463,7 +4497,7 @@ QFSDeviceRetry:
4463 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4497 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4464 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4498 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4465 if (rc) { 4499 if (rc) {
4466 cFYI(1, ("Send error in QFSDeviceInfo = %d", rc)); 4500 cFYI(1, "Send error in QFSDeviceInfo = %d", rc);
4467 } else { /* decode response */ 4501 } else { /* decode response */
4468 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4502 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4469 4503
@@ -4498,7 +4532,7 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
4498 int bytes_returned = 0; 4532 int bytes_returned = 0;
4499 __u16 params, byte_count; 4533 __u16 params, byte_count;
4500 4534
4501 cFYI(1, ("In QFSUnixInfo")); 4535 cFYI(1, "In QFSUnixInfo");
4502 QFSUnixRetry: 4536 QFSUnixRetry:
4503 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4537 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4504 (void **) &pSMBr); 4538 (void **) &pSMBr);
@@ -4532,7 +4566,7 @@ QFSUnixRetry:
4532 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4566 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4533 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4567 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4534 if (rc) { 4568 if (rc) {
4535 cERROR(1, ("Send error in QFSUnixInfo = %d", rc)); 4569 cERROR(1, "Send error in QFSUnixInfo = %d", rc);
4536 } else { /* decode response */ 4570 } else { /* decode response */
4537 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4571 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4538 4572
@@ -4567,7 +4601,7 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
4567 int bytes_returned = 0; 4601 int bytes_returned = 0;
4568 __u16 params, param_offset, offset, byte_count; 4602 __u16 params, param_offset, offset, byte_count;
4569 4603
4570 cFYI(1, ("In SETFSUnixInfo")); 4604 cFYI(1, "In SETFSUnixInfo");
4571 SETFSUnixRetry: 4605 SETFSUnixRetry:
4572 /* BB switch to small buf init to save memory */ 4606 /* BB switch to small buf init to save memory */
4573 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4607 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -4615,7 +4649,7 @@ SETFSUnixRetry:
4615 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4649 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4616 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4650 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4617 if (rc) { 4651 if (rc) {
4618 cERROR(1, ("Send error in SETFSUnixInfo = %d", rc)); 4652 cERROR(1, "Send error in SETFSUnixInfo = %d", rc);
4619 } else { /* decode response */ 4653 } else { /* decode response */
4620 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4654 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4621 if (rc) 4655 if (rc)
@@ -4643,7 +4677,7 @@ CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
4643 int bytes_returned = 0; 4677 int bytes_returned = 0;
4644 __u16 params, byte_count; 4678 __u16 params, byte_count;
4645 4679
4646 cFYI(1, ("In QFSPosixInfo")); 4680 cFYI(1, "In QFSPosixInfo");
4647QFSPosixRetry: 4681QFSPosixRetry:
4648 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4682 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4649 (void **) &pSMBr); 4683 (void **) &pSMBr);
@@ -4677,7 +4711,7 @@ QFSPosixRetry:
4677 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4711 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4678 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4712 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4679 if (rc) { 4713 if (rc) {
4680	cFYI(1, ("Send error in QFSPosixInfo = %d", rc)); 4714	cFYI(1, "Send error in QFSPosixInfo = %d", rc);
4681 } else { /* decode response */ 4715 } else { /* decode response */
4682 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4716 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4683 4717
@@ -4737,7 +4771,7 @@ CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName,
4737 int bytes_returned = 0; 4771 int bytes_returned = 0;
4738 __u16 params, byte_count, data_count, param_offset, offset; 4772 __u16 params, byte_count, data_count, param_offset, offset;
4739 4773
4740 cFYI(1, ("In SetEOF")); 4774 cFYI(1, "In SetEOF");
4741SetEOFRetry: 4775SetEOFRetry:
4742 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4776 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4743 (void **) &pSMBr); 4777 (void **) &pSMBr);
@@ -4803,7 +4837,7 @@ SetEOFRetry:
4803 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4837 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4804 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4838 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4805 if (rc) 4839 if (rc)
4806 cFYI(1, ("SetPathInfo (file size) returned %d", rc)); 4840 cFYI(1, "SetPathInfo (file size) returned %d", rc);
4807 4841
4808 cifs_buf_release(pSMB); 4842 cifs_buf_release(pSMB);
4809 4843
@@ -4823,8 +4857,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4823 int rc = 0; 4857 int rc = 0;
4824 __u16 params, param_offset, offset, byte_count, count; 4858 __u16 params, param_offset, offset, byte_count, count;
4825 4859
4826 cFYI(1, ("SetFileSize (via SetFileInfo) %lld", 4860 cFYI(1, "SetFileSize (via SetFileInfo) %lld",
4827 (long long)size)); 4861 (long long)size);
4828 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 4862 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4829 4863
4830 if (rc) 4864 if (rc)
@@ -4883,9 +4917,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4883 pSMB->ByteCount = cpu_to_le16(byte_count); 4917 pSMB->ByteCount = cpu_to_le16(byte_count);
4884 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 4918 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4885 if (rc) { 4919 if (rc) {
4886 cFYI(1, 4920 cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
4887 ("Send error in SetFileInfo (SetFileSize) = %d",
4888 rc));
4889 } 4921 }
4890 4922
4891 /* Note: On -EAGAIN error only caller can retry on handle based calls 4923 /* Note: On -EAGAIN error only caller can retry on handle based calls
@@ -4909,7 +4941,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4909 int rc = 0; 4941 int rc = 0;
4910 __u16 params, param_offset, offset, byte_count, count; 4942 __u16 params, param_offset, offset, byte_count, count;
4911 4943
4912 cFYI(1, ("Set Times (via SetFileInfo)")); 4944 cFYI(1, "Set Times (via SetFileInfo)");
4913 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 4945 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4914 4946
4915 if (rc) 4947 if (rc)
@@ -4954,7 +4986,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4954 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); 4986 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
4955 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 4987 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4956 if (rc) 4988 if (rc)
4957 cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc)); 4989 cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
4958 4990
4959 /* Note: On -EAGAIN error only caller can retry on handle based calls 4991 /* Note: On -EAGAIN error only caller can retry on handle based calls
4960 since file handle passed in no longer valid */ 4992 since file handle passed in no longer valid */
@@ -4971,7 +5003,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
4971 int rc = 0; 5003 int rc = 0;
4972 __u16 params, param_offset, offset, byte_count, count; 5004 __u16 params, param_offset, offset, byte_count, count;
4973 5005
4974 cFYI(1, ("Set File Disposition (via SetFileInfo)")); 5006 cFYI(1, "Set File Disposition (via SetFileInfo)");
4975 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 5007 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4976 5008
4977 if (rc) 5009 if (rc)
@@ -5013,7 +5045,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
5013 *data_offset = delete_file ? 1 : 0; 5045 *data_offset = delete_file ? 1 : 0;
5014 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5046 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
5015 if (rc) 5047 if (rc)
5016 cFYI(1, ("Send error in SetFileDisposition = %d", rc)); 5048 cFYI(1, "Send error in SetFileDisposition = %d", rc);
5017 5049
5018 return rc; 5050 return rc;
5019} 5051}
@@ -5031,7 +5063,7 @@ CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
5031 char *data_offset; 5063 char *data_offset;
5032 __u16 params, param_offset, offset, byte_count, count; 5064 __u16 params, param_offset, offset, byte_count, count;
5033 5065
5034 cFYI(1, ("In SetTimes")); 5066 cFYI(1, "In SetTimes");
5035 5067
5036SetTimesRetry: 5068SetTimesRetry:
5037 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5069 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -5087,7 +5119,7 @@ SetTimesRetry:
5087 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5119 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5088 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5120 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5089 if (rc) 5121 if (rc)
5090 cFYI(1, ("SetPathInfo (times) returned %d", rc)); 5122 cFYI(1, "SetPathInfo (times) returned %d", rc);
5091 5123
5092 cifs_buf_release(pSMB); 5124 cifs_buf_release(pSMB);
5093 5125
@@ -5112,7 +5144,7 @@ CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName,
5112 int bytes_returned; 5144 int bytes_returned;
5113 int name_len; 5145 int name_len;
5114 5146
5115 cFYI(1, ("In SetAttrLegacy")); 5147 cFYI(1, "In SetAttrLegacy");
5116 5148
5117SetAttrLgcyRetry: 5149SetAttrLgcyRetry:
5118 rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, 5150 rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
@@ -5138,7 +5170,7 @@ SetAttrLgcyRetry:
5138 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5170 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5139 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5171 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5140 if (rc) 5172 if (rc)
5141 cFYI(1, ("Error in LegacySetAttr = %d", rc)); 5173 cFYI(1, "Error in LegacySetAttr = %d", rc);
5142 5174
5143 cifs_buf_release(pSMB); 5175 cifs_buf_release(pSMB);
5144 5176
@@ -5200,7 +5232,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5200 int rc = 0; 5232 int rc = 0;
5201 u16 params, param_offset, offset, byte_count, count; 5233 u16 params, param_offset, offset, byte_count, count;
5202 5234
5203 cFYI(1, ("Set Unix Info (via SetFileInfo)")); 5235 cFYI(1, "Set Unix Info (via SetFileInfo)");
5204 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 5236 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
5205 5237
5206 if (rc) 5238 if (rc)
@@ -5245,7 +5277,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5245 5277
5246 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5278 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
5247 if (rc) 5279 if (rc)
5248	cFYI(1, ("Send error in Set Unix Info (SetFileInfo) = %d", rc)); 5280	cFYI(1, "Send error in Set Unix Info (SetFileInfo) = %d", rc);
5249 5281
5250 /* Note: On -EAGAIN error only caller can retry on handle based calls 5282 /* Note: On -EAGAIN error only caller can retry on handle based calls
5251 since file handle passed in no longer valid */ 5283 since file handle passed in no longer valid */
@@ -5266,7 +5298,7 @@ CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
5266 FILE_UNIX_BASIC_INFO *data_offset; 5298 FILE_UNIX_BASIC_INFO *data_offset;
5267 __u16 params, param_offset, offset, count, byte_count; 5299 __u16 params, param_offset, offset, count, byte_count;
5268 5300
5269 cFYI(1, ("In SetUID/GID/Mode")); 5301 cFYI(1, "In SetUID/GID/Mode");
5270setPermsRetry: 5302setPermsRetry:
5271 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5303 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5272 (void **) &pSMBr); 5304 (void **) &pSMBr);
@@ -5322,7 +5354,7 @@ setPermsRetry:
5322 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5354 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5323 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5355 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5324 if (rc) 5356 if (rc)
5325 cFYI(1, ("SetPathInfo (perms) returned %d", rc)); 5357 cFYI(1, "SetPathInfo (perms) returned %d", rc);
5326 5358
5327 cifs_buf_release(pSMB); 5359 cifs_buf_release(pSMB);
5328 if (rc == -EAGAIN) 5360 if (rc == -EAGAIN)
@@ -5341,7 +5373,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5341 struct dir_notify_req *dnotify_req; 5373 struct dir_notify_req *dnotify_req;
5342 int bytes_returned; 5374 int bytes_returned;
5343 5375
5344 cFYI(1, ("In CIFSSMBNotify for file handle %d", (int)netfid)); 5376 cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
5345 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, 5377 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
5346 (void **) &pSMBr); 5378 (void **) &pSMBr);
5347 if (rc) 5379 if (rc)
@@ -5375,7 +5407,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5375 (struct smb_hdr *)pSMBr, &bytes_returned, 5407 (struct smb_hdr *)pSMBr, &bytes_returned,
5376 CIFS_ASYNC_OP); 5408 CIFS_ASYNC_OP);
5377 if (rc) { 5409 if (rc) {
5378 cFYI(1, ("Error in Notify = %d", rc)); 5410 cFYI(1, "Error in Notify = %d", rc);
5379 } else { 5411 } else {
5380 /* Add file to outstanding requests */ 5412 /* Add file to outstanding requests */
5381 /* BB change to kmem cache alloc */ 5413 /* BB change to kmem cache alloc */
@@ -5431,7 +5463,7 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
5431 char *end_of_smb; 5463 char *end_of_smb;
5432 __u16 params, byte_count, data_offset; 5464 __u16 params, byte_count, data_offset;
5433 5465
5434 cFYI(1, ("In Query All EAs path %s", searchName)); 5466 cFYI(1, "In Query All EAs path %s", searchName);
5435QAllEAsRetry: 5467QAllEAsRetry:
5436 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5468 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5437 (void **) &pSMBr); 5469 (void **) &pSMBr);
@@ -5478,7 +5510,7 @@ QAllEAsRetry:
5478 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5510 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5479 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5511 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5480 if (rc) { 5512 if (rc) {
5481 cFYI(1, ("Send error in QueryAllEAs = %d", rc)); 5513 cFYI(1, "Send error in QueryAllEAs = %d", rc);
5482 goto QAllEAsOut; 5514 goto QAllEAsOut;
5483 } 5515 }
5484 5516
@@ -5506,16 +5538,16 @@ QAllEAsRetry:
5506 (((char *) &pSMBr->hdr.Protocol) + data_offset); 5538 (((char *) &pSMBr->hdr.Protocol) + data_offset);
5507 5539
5508 list_len = le32_to_cpu(ea_response_data->list_len); 5540 list_len = le32_to_cpu(ea_response_data->list_len);
5509 cFYI(1, ("ea length %d", list_len)); 5541 cFYI(1, "ea length %d", list_len);
5510 if (list_len <= 8) { 5542 if (list_len <= 8) {
5511 cFYI(1, ("empty EA list returned from server")); 5543 cFYI(1, "empty EA list returned from server");
5512 goto QAllEAsOut; 5544 goto QAllEAsOut;
5513 } 5545 }
5514 5546
5515 /* make sure list_len doesn't go past end of SMB */ 5547 /* make sure list_len doesn't go past end of SMB */
5516 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr); 5548 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
5517 if ((char *)ea_response_data + list_len > end_of_smb) { 5549 if ((char *)ea_response_data + list_len > end_of_smb) {
5518 cFYI(1, ("EA list appears to go beyond SMB")); 5550 cFYI(1, "EA list appears to go beyond SMB");
5519 rc = -EIO; 5551 rc = -EIO;
5520 goto QAllEAsOut; 5552 goto QAllEAsOut;
5521 } 5553 }
@@ -5532,7 +5564,7 @@ QAllEAsRetry:
5532 temp_ptr += 4; 5564 temp_ptr += 4;
5533 /* make sure we can read name_len and value_len */ 5565 /* make sure we can read name_len and value_len */
5534 if (list_len < 0) { 5566 if (list_len < 0) {
5535 cFYI(1, ("EA entry goes beyond length of list")); 5567 cFYI(1, "EA entry goes beyond length of list");
5536 rc = -EIO; 5568 rc = -EIO;
5537 goto QAllEAsOut; 5569 goto QAllEAsOut;
5538 } 5570 }
@@ -5541,7 +5573,7 @@ QAllEAsRetry:
5541 value_len = le16_to_cpu(temp_fea->value_len); 5573 value_len = le16_to_cpu(temp_fea->value_len);
5542 list_len -= name_len + 1 + value_len; 5574 list_len -= name_len + 1 + value_len;
5543 if (list_len < 0) { 5575 if (list_len < 0) {
5544 cFYI(1, ("EA entry goes beyond length of list")); 5576 cFYI(1, "EA entry goes beyond length of list");
5545 rc = -EIO; 5577 rc = -EIO;
5546 goto QAllEAsOut; 5578 goto QAllEAsOut;
5547 } 5579 }
@@ -5608,7 +5640,7 @@ CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
5608 int bytes_returned = 0; 5640 int bytes_returned = 0;
5609 __u16 params, param_offset, byte_count, offset, count; 5641 __u16 params, param_offset, byte_count, offset, count;
5610 5642
5611 cFYI(1, ("In SetEA")); 5643 cFYI(1, "In SetEA");
5612SetEARetry: 5644SetEARetry:
5613 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5645 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5614 (void **) &pSMBr); 5646 (void **) &pSMBr);
@@ -5690,7 +5722,7 @@ SetEARetry:
5690 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5722 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5691 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5723 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5692 if (rc) 5724 if (rc)
5693 cFYI(1, ("SetPathInfo (EA) returned %d", rc)); 5725 cFYI(1, "SetPathInfo (EA) returned %d", rc);
5694 5726
5695 cifs_buf_release(pSMB); 5727 cifs_buf_release(pSMB);
5696 5728
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 45eb6cba793f..2208f06e4c45 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,6 +23,7 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/wait.h> 25#include <linux/wait.h>
26#include <linux/slab.h>
26#include <linux/pagemap.h> 27#include <linux/pagemap.h>
27#include <linux/ctype.h> 28#include <linux/ctype.h>
28#include <linux/utsname.h> 29#include <linux/utsname.h>
@@ -101,6 +102,7 @@ struct smb_vol {
101 bool sockopt_tcp_nodelay:1; 102 bool sockopt_tcp_nodelay:1;
102 unsigned short int port; 103 unsigned short int port;
103 char *prepath; 104 char *prepath;
105 struct nls_table *local_nls;
104}; 106};
105 107
106static int ipv4_connect(struct TCP_Server_Info *server); 108static int ipv4_connect(struct TCP_Server_Info *server);
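
Besides the slab.h include, this hunk grows struct smb_vol by a local_nls pointer, so the parsed mount options can carry the NLS codepage down to the new cifs_get_smb_ses()/cifs_get_tcon() helpers introduced further below. A hypothetical sketch of how the mount path would populate it, using the standard kernel NLS API (the error value here is an assumption):

/* sketch: resolve the iocharset= mount option into a codepage table */
if (volume_info->iocharset == NULL) {
	volume_info->local_nls = load_nls_default();
} else {
	volume_info->local_nls = load_nls(volume_info->iocharset);
	if (volume_info->local_nls == NULL) {
		cERROR(1, "CIFS mount error: iocharset %s not found",
			  volume_info->iocharset);
		return -ELIBACC;	/* assumed errno for a missing codepage */
	}
}
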
@@ -134,7 +136,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
134 spin_unlock(&GlobalMid_Lock); 136 spin_unlock(&GlobalMid_Lock);
135 server->maxBuf = 0; 137 server->maxBuf = 0;
136 138
137 cFYI(1, ("Reconnecting tcp session")); 139 cFYI(1, "Reconnecting tcp session");
138 140
139 /* before reconnecting the tcp session, mark the smb session (uid) 141 /* before reconnecting the tcp session, mark the smb session (uid)
140 and the tid bad so they are not used until reconnected */ 142 and the tid bad so they are not used until reconnected */
@@ -152,12 +154,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
152 /* do not want to be sending data on a socket we are freeing */ 154 /* do not want to be sending data on a socket we are freeing */
153 mutex_lock(&server->srv_mutex); 155 mutex_lock(&server->srv_mutex);
154 if (server->ssocket) { 156 if (server->ssocket) {
155 cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state, 157 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
156 server->ssocket->flags)); 158 server->ssocket->flags);
157 kernel_sock_shutdown(server->ssocket, SHUT_WR); 159 kernel_sock_shutdown(server->ssocket, SHUT_WR);
158 cFYI(1, ("Post shutdown state: 0x%x Flags: 0x%lx", 160 cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx",
159 server->ssocket->state, 161 server->ssocket->state,
160 server->ssocket->flags)); 162 server->ssocket->flags);
161 sock_release(server->ssocket); 163 sock_release(server->ssocket);
162 server->ssocket = NULL; 164 server->ssocket = NULL;
163 } 165 }
@@ -186,7 +188,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
186 else 188 else
187 rc = ipv4_connect(server); 189 rc = ipv4_connect(server);
188 if (rc) { 190 if (rc) {
189 cFYI(1, ("reconnect error %d", rc)); 191 cFYI(1, "reconnect error %d", rc);
190 msleep(3000); 192 msleep(3000);
191 } else { 193 } else {
192 atomic_inc(&tcpSesReconnectCount); 194 atomic_inc(&tcpSesReconnectCount);
@@ -222,7 +224,7 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
222 /* check for plausible wct, bcc and t2 data and parm sizes */ 224 /* check for plausible wct, bcc and t2 data and parm sizes */
223 /* check for parm and data offset going beyond end of smb */ 225 /* check for parm and data offset going beyond end of smb */
224 if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */ 226 if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
225 cFYI(1, ("invalid transact2 word count")); 227 cFYI(1, "invalid transact2 word count");
226 return -EINVAL; 228 return -EINVAL;
227 } 229 }
228 230
@@ -236,15 +238,15 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
236 if (remaining == 0) 238 if (remaining == 0)
237 return 0; 239 return 0;
238 else if (remaining < 0) { 240 else if (remaining < 0) {
239 cFYI(1, ("total data %d smaller than data in frame %d", 241 cFYI(1, "total data %d smaller than data in frame %d",
240 total_data_size, data_in_this_rsp)); 242 total_data_size, data_in_this_rsp);
241 return -EINVAL; 243 return -EINVAL;
242 } else { 244 } else {
243 cFYI(1, ("missing %d bytes from transact2, check next response", 245 cFYI(1, "missing %d bytes from transact2, check next response",
244 remaining)); 246 remaining);
245 if (total_data_size > maxBufSize) { 247 if (total_data_size > maxBufSize) {
246 cERROR(1, ("TotalDataSize %d is over maximum buffer %d", 248 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
247 total_data_size, maxBufSize)); 249 total_data_size, maxBufSize);
248 return -EINVAL; 250 return -EINVAL;
249 } 251 }
250 return remaining; 252 return remaining;
@@ -266,7 +268,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
266 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 268 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
267 269
268 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) { 270 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
269 cFYI(1, ("total data size of primary and secondary t2 differ")); 271 cFYI(1, "total data size of primary and secondary t2 differ");
270 } 272 }
271 273
272 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount); 274 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
@@ -281,7 +283,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
281 283
282 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount); 284 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
283 if (remaining < total_in_buf2) { 285 if (remaining < total_in_buf2) {
284 cFYI(1, ("transact2 2nd response contains too much data")); 286 cFYI(1, "transact2 2nd response contains too much data");
285 } 287 }
286 288
287 /* find end of first SMB data area */ 289 /* find end of first SMB data area */
@@ -310,7 +312,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
310 pTargetSMB->smb_buf_length = byte_count; 312 pTargetSMB->smb_buf_length = byte_count;
311 313
312 if (remaining == total_in_buf2) { 314 if (remaining == total_in_buf2) {
313 cFYI(1, ("found the last secondary response")); 315 cFYI(1, "found the last secondary response");
314 return 0; /* we are done */ 316 return 0; /* we are done */
315 } else /* more responses to go */ 317 } else /* more responses to go */
316 return 1; 318 return 1;
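
For orientation: the two functions touched above reassemble multi-part Transaction2 responses. check2ndT2() sanity-checks a secondary response and reports how many bytes are still outstanding, while coalesce_t2() folds the secondary data into the first buffer, returning 0 once the last piece has arrived and 1 while more responses are expected. A hedged sketch of that 0/1 convention from the caller's side; receive_next_t2() stands in for the real socket-read path in the demultiplex thread and is not an actual function:

int rc = 1;

while (rc == 1) {		/* 1 == more secondary responses due */
	struct smb_hdr *psecond = receive_next_t2(server);	/* assumed */

	if (psecond == NULL)
		return -EIO;	/* read failed mid-reassembly */
	rc = coalesce_t2(psecond, pTargetSMB);
}
return rc;	/* 0: pTargetSMB now holds the complete transact2 response */
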
@@ -338,7 +340,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
338 int reconnect; 340 int reconnect;
339 341
340 current->flags |= PF_MEMALLOC; 342 current->flags |= PF_MEMALLOC;
341 cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current))); 343 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
342 344
343 length = atomic_inc_return(&tcpSesAllocCount); 345 length = atomic_inc_return(&tcpSesAllocCount);
344 if (length > 1) 346 if (length > 1)
@@ -352,7 +354,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
352 if (bigbuf == NULL) { 354 if (bigbuf == NULL) {
353 bigbuf = cifs_buf_get(); 355 bigbuf = cifs_buf_get();
354 if (!bigbuf) { 356 if (!bigbuf) {
355 cERROR(1, ("No memory for large SMB response")); 357 cERROR(1, "No memory for large SMB response");
356 msleep(3000); 358 msleep(3000);
357 /* retry will check if exiting */ 359 /* retry will check if exiting */
358 continue; 360 continue;
@@ -365,7 +367,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
365 if (smallbuf == NULL) { 367 if (smallbuf == NULL) {
366 smallbuf = cifs_small_buf_get(); 368 smallbuf = cifs_small_buf_get();
367 if (!smallbuf) { 369 if (!smallbuf) {
368 cERROR(1, ("No memory for SMB response")); 370 cERROR(1, "No memory for SMB response");
369 msleep(1000); 371 msleep(1000);
370 /* retry will check if exiting */ 372 /* retry will check if exiting */
371 continue; 373 continue;
@@ -390,9 +392,9 @@ incomplete_rcv:
390 if (server->tcpStatus == CifsExiting) { 392 if (server->tcpStatus == CifsExiting) {
391 break; 393 break;
392 } else if (server->tcpStatus == CifsNeedReconnect) { 394 } else if (server->tcpStatus == CifsNeedReconnect) {
393 cFYI(1, ("Reconnect after server stopped responding")); 395 cFYI(1, "Reconnect after server stopped responding");
394 cifs_reconnect(server); 396 cifs_reconnect(server);
395 cFYI(1, ("call to reconnect done")); 397 cFYI(1, "call to reconnect done");
396 csocket = server->ssocket; 398 csocket = server->ssocket;
397 continue; 399 continue;
398 } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) { 400 } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) {
@@ -410,7 +412,7 @@ incomplete_rcv:
410 continue; 412 continue;
411 } else if (length <= 0) { 413 } else if (length <= 0) {
412 if (server->tcpStatus == CifsNew) { 414 if (server->tcpStatus == CifsNew) {
413 cFYI(1, ("tcp session abend after SMBnegprot")); 415 cFYI(1, "tcp session abend after SMBnegprot");
414 /* some servers kill the TCP session rather than 416 /* some servers kill the TCP session rather than
415 returning an SMB negprot error, in which 417 returning an SMB negprot error, in which
416 case reconnecting here is not going to help, 418 case reconnecting here is not going to help,
@@ -418,18 +420,18 @@ incomplete_rcv:
418 break; 420 break;
419 } 421 }
420 if (!try_to_freeze() && (length == -EINTR)) { 422 if (!try_to_freeze() && (length == -EINTR)) {
421 cFYI(1, ("cifsd thread killed")); 423 cFYI(1, "cifsd thread killed");
422 break; 424 break;
423 } 425 }
424 cFYI(1, ("Reconnect after unexpected peek error %d", 426 cFYI(1, "Reconnect after unexpected peek error %d",
425 length)); 427 length);
426 cifs_reconnect(server); 428 cifs_reconnect(server);
427 csocket = server->ssocket; 429 csocket = server->ssocket;
428 wake_up(&server->response_q); 430 wake_up(&server->response_q);
429 continue; 431 continue;
430 } else if (length < pdu_length) { 432 } else if (length < pdu_length) {
431 cFYI(1, ("requested %d bytes but only got %d bytes", 433 cFYI(1, "requested %d bytes but only got %d bytes",
432 pdu_length, length)); 434 pdu_length, length);
433 pdu_length -= length; 435 pdu_length -= length;
434 msleep(1); 436 msleep(1);
435 goto incomplete_rcv; 437 goto incomplete_rcv;
@@ -449,18 +451,18 @@ incomplete_rcv:
449 pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length); 451 pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
450 smb_buffer->smb_buf_length = pdu_length; 452 smb_buffer->smb_buf_length = pdu_length;
451 453
452 cFYI(1, ("rfc1002 length 0x%x", pdu_length+4)); 454 cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
453 455
454 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { 456 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
455 continue; 457 continue;
456 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { 458 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
457 cFYI(1, ("Good RFC 1002 session rsp")); 459 cFYI(1, "Good RFC 1002 session rsp");
458 continue; 460 continue;
459 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { 461 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
460 /* we get this from Windows 98 instead of 462 /* we get this from Windows 98 instead of
461 an error on SMB negprot response */ 463 an error on SMB negprot response */
462 cFYI(1, ("Negative RFC1002 Session Response Error 0x%x)", 464 cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
463 pdu_length)); 465 pdu_length);
464 if (server->tcpStatus == CifsNew) { 466 if (server->tcpStatus == CifsNew) {
465 /* if nack on negprot (rather than 467 /* if nack on negprot (rather than
466 ret of smb negprot error) reconnecting 468 ret of smb negprot error) reconnecting
@@ -483,7 +485,7 @@ incomplete_rcv:
483 continue; 485 continue;
484 } 486 }
485 } else if (temp != (char) 0) { 487 } else if (temp != (char) 0) {
486 cERROR(1, ("Unknown RFC 1002 frame")); 488 cERROR(1, "Unknown RFC 1002 frame");
487 cifs_dump_mem(" Received Data: ", (char *)smb_buffer, 489 cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
488 length); 490 length);
489 cifs_reconnect(server); 491 cifs_reconnect(server);
@@ -494,8 +496,8 @@ incomplete_rcv:
494 /* else we have an SMB response */ 496 /* else we have an SMB response */
495 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || 497 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
496 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { 498 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
497 cERROR(1, ("Invalid size SMB length %d pdu_length %d", 499 cERROR(1, "Invalid size SMB length %d pdu_length %d",
498 length, pdu_length+4)); 500 length, pdu_length+4);
499 cifs_reconnect(server); 501 cifs_reconnect(server);
500 csocket = server->ssocket; 502 csocket = server->ssocket;
501 wake_up(&server->response_q); 503 wake_up(&server->response_q);
@@ -538,8 +540,8 @@ incomplete_rcv:
538 length = 0; 540 length = 0;
539 continue; 541 continue;
540 } else if (length <= 0) { 542 } else if (length <= 0) {
541 cERROR(1, ("Received no data, expecting %d", 543 cERROR(1, "Received no data, expecting %d",
542 pdu_length - total_read)); 544 pdu_length - total_read);
543 cifs_reconnect(server); 545 cifs_reconnect(server);
544 csocket = server->ssocket; 546 csocket = server->ssocket;
545 reconnect = 1; 547 reconnect = 1;
@@ -587,7 +589,7 @@ incomplete_rcv:
587 } 589 }
588 } else { 590 } else {
589 if (!isLargeBuf) { 591 if (!isLargeBuf) {
590 cERROR(1,("1st trans2 resp needs bigbuf")); 592 cERROR(1, "1st trans2 resp needs bigbuf");
591 /* BB maybe we can fix this up, switch 593 /* BB maybe we can fix this up, switch
592 to already allocated large buffer? */ 594 to already allocated large buffer? */
593 } else { 595 } else {
@@ -629,8 +631,8 @@ multi_t2_fnd:
629 wake_up_process(task_to_wake); 631 wake_up_process(task_to_wake);
630 } else if (!is_valid_oplock_break(smb_buffer, server) && 632 } else if (!is_valid_oplock_break(smb_buffer, server) &&
631 !isMultiRsp) { 633 !isMultiRsp) {
632 cERROR(1, ("No task to wake, unknown frame received! " 634 cERROR(1, "No task to wake, unknown frame received! "
633 "NumMids %d", midCount.counter)); 635 "NumMids %d", midCount.counter);
634 cifs_dump_mem("Received Data is: ", (char *)smb_buffer, 636 cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
635 sizeof(struct smb_hdr)); 637 sizeof(struct smb_hdr));
636#ifdef CONFIG_CIFS_DEBUG2 638#ifdef CONFIG_CIFS_DEBUG2
@@ -707,8 +709,8 @@ multi_t2_fnd:
707 list_for_each(tmp, &server->pending_mid_q) { 709 list_for_each(tmp, &server->pending_mid_q) {
708 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 710 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
709 if (mid_entry->midState == MID_REQUEST_SUBMITTED) { 711 if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
710 cFYI(1, ("Clearing Mid 0x%x - waking up ", 712 cFYI(1, "Clearing Mid 0x%x - waking up ",
711 mid_entry->mid)); 713 mid_entry->mid);
712 task_to_wake = mid_entry->tsk; 714 task_to_wake = mid_entry->tsk;
713 if (task_to_wake) 715 if (task_to_wake)
714 wake_up_process(task_to_wake); 716 wake_up_process(task_to_wake);
@@ -727,7 +729,7 @@ multi_t2_fnd:
727 to wait at least 45 seconds before giving up 729 to wait at least 45 seconds before giving up
728 on a request getting a response and going ahead 730 on a request getting a response and going ahead
729 and killing cifsd */ 731 and killing cifsd */
730 cFYI(1, ("Wait for exit from demultiplex thread")); 732 cFYI(1, "Wait for exit from demultiplex thread");
731 msleep(46000); 733 msleep(46000);
732 /* if threads still have not exited they are probably never 734 /* if threads still have not exited they are probably never
733 coming home not much else we can do but free the memory */ 735 coming home not much else we can do but free the memory */
@@ -848,7 +850,7 @@ cifs_parse_mount_options(char *options, const char *devname,
848 separator[0] = options[4]; 850 separator[0] = options[4];
849 options += 5; 851 options += 5;
850 } else { 852 } else {
851 cFYI(1, ("Null separator not allowed")); 853 cFYI(1, "Null separator not allowed");
852 } 854 }
853 } 855 }
854 856
@@ -973,7 +975,7 @@ cifs_parse_mount_options(char *options, const char *devname,
973 } 975 }
974 } else if (strnicmp(data, "sec", 3) == 0) { 976 } else if (strnicmp(data, "sec", 3) == 0) {
975 if (!value || !*value) { 977 if (!value || !*value) {
976 cERROR(1, ("no security value specified")); 978 cERROR(1, "no security value specified");
977 continue; 979 continue;
978 } else if (strnicmp(value, "krb5i", 5) == 0) { 980 } else if (strnicmp(value, "krb5i", 5) == 0) {
979 vol->secFlg |= CIFSSEC_MAY_KRB5 | 981 vol->secFlg |= CIFSSEC_MAY_KRB5 |
@@ -981,7 +983,7 @@ cifs_parse_mount_options(char *options, const char *devname,
981 } else if (strnicmp(value, "krb5p", 5) == 0) { 983 } else if (strnicmp(value, "krb5p", 5) == 0) {
982 /* vol->secFlg |= CIFSSEC_MUST_SEAL | 984 /* vol->secFlg |= CIFSSEC_MUST_SEAL |
983 CIFSSEC_MAY_KRB5; */ 985 CIFSSEC_MAY_KRB5; */
984 cERROR(1, ("Krb5 cifs privacy not supported")); 986 cERROR(1, "Krb5 cifs privacy not supported");
985 return 1; 987 return 1;
986 } else if (strnicmp(value, "krb5", 4) == 0) { 988 } else if (strnicmp(value, "krb5", 4) == 0) {
987 vol->secFlg |= CIFSSEC_MAY_KRB5; 989 vol->secFlg |= CIFSSEC_MAY_KRB5;
@@ -1013,7 +1015,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1013 } else if (strnicmp(value, "none", 4) == 0) { 1015 } else if (strnicmp(value, "none", 4) == 0) {
1014 vol->nullauth = 1; 1016 vol->nullauth = 1;
1015 } else { 1017 } else {
1016 cERROR(1, ("bad security option: %s", value)); 1018 cERROR(1, "bad security option: %s", value);
1017 return 1; 1019 return 1;
1018 } 1020 }
1019 } else if ((strnicmp(data, "unc", 3) == 0) 1021 } else if ((strnicmp(data, "unc", 3) == 0)
@@ -1052,7 +1054,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1052 a domain name and need special handling? */ 1054 a domain name and need special handling? */
1053 if (strnlen(value, 256) < 256) { 1055 if (strnlen(value, 256) < 256) {
1054 vol->domainname = value; 1056 vol->domainname = value;
1055 cFYI(1, ("Domain name set")); 1057 cFYI(1, "Domain name set");
1056 } else { 1058 } else {
1057 printk(KERN_WARNING "CIFS: domain name too " 1059 printk(KERN_WARNING "CIFS: domain name too "
1058 "long\n"); 1060 "long\n");
@@ -1075,7 +1077,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1075 strcpy(vol->prepath+1, value); 1077 strcpy(vol->prepath+1, value);
1076 } else 1078 } else
1077 strcpy(vol->prepath, value); 1079 strcpy(vol->prepath, value);
1078 cFYI(1, ("prefix path %s", vol->prepath)); 1080 cFYI(1, "prefix path %s", vol->prepath);
1079 } else { 1081 } else {
1080 printk(KERN_WARNING "CIFS: prefix too long\n"); 1082 printk(KERN_WARNING "CIFS: prefix too long\n");
1081 return 1; 1083 return 1;
@@ -1091,7 +1093,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1091 vol->iocharset = value; 1093 vol->iocharset = value;
1092 /* if iocharset not set then load_nls_default 1094 /* if iocharset not set then load_nls_default
1093 is used by caller */ 1095 is used by caller */
1094 cFYI(1, ("iocharset set to %s", value)); 1096 cFYI(1, "iocharset set to %s", value);
1095 } else { 1097 } else {
1096 printk(KERN_WARNING "CIFS: iocharset name " 1098 printk(KERN_WARNING "CIFS: iocharset name "
1097 "too long.\n"); 1099 "too long.\n");
@@ -1143,14 +1145,14 @@ cifs_parse_mount_options(char *options, const char *devname,
1143 } 1145 }
1144 } else if (strnicmp(data, "sockopt", 5) == 0) { 1146 } else if (strnicmp(data, "sockopt", 5) == 0) {
1145 if (!value || !*value) { 1147 if (!value || !*value) {
1146 cERROR(1, ("no socket option specified")); 1148 cERROR(1, "no socket option specified");
1147 continue; 1149 continue;
1148 } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) { 1150 } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
1149 vol->sockopt_tcp_nodelay = 1; 1151 vol->sockopt_tcp_nodelay = 1;
1150 } 1152 }
1151 } else if (strnicmp(data, "netbiosname", 4) == 0) { 1153 } else if (strnicmp(data, "netbiosname", 4) == 0) {
1152 if (!value || !*value || (*value == ' ')) { 1154 if (!value || !*value || (*value == ' ')) {
1153 cFYI(1, ("invalid (empty) netbiosname")); 1155 cFYI(1, "invalid (empty) netbiosname");
1154 } else { 1156 } else {
1155 memset(vol->source_rfc1001_name, 0x20, 15); 1157 memset(vol->source_rfc1001_name, 0x20, 15);
1156 for (i = 0; i < 15; i++) { 1158 for (i = 0; i < 15; i++) {
@@ -1174,7 +1176,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1174 } else if (strnicmp(data, "servern", 7) == 0) { 1176 } else if (strnicmp(data, "servern", 7) == 0) {
1175 /* servernetbiosname specified override *SMBSERVER */ 1177 /* servernetbiosname specified override *SMBSERVER */
1176 if (!value || !*value || (*value == ' ')) { 1178 if (!value || !*value || (*value == ' ')) {
1177 cFYI(1, ("empty server netbiosname specified")); 1179 cFYI(1, "empty server netbiosname specified");
1178 } else { 1180 } else {
1179	/* last byte, type, is 0x20 for server type */ 1181	/* last byte, type, is 0x20 for server type */
1180 memset(vol->target_rfc1001_name, 0x20, 16); 1182 memset(vol->target_rfc1001_name, 0x20, 16);
@@ -1433,7 +1435,7 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
1433 1435
1434 ++server->srv_count; 1436 ++server->srv_count;
1435 write_unlock(&cifs_tcp_ses_lock); 1437 write_unlock(&cifs_tcp_ses_lock);
1436 cFYI(1, ("Existing tcp session with server found")); 1438 cFYI(1, "Existing tcp session with server found");
1437 return server; 1439 return server;
1438 } 1440 }
1439 write_unlock(&cifs_tcp_ses_lock); 1441 write_unlock(&cifs_tcp_ses_lock);
@@ -1474,7 +1476,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1474 1476
1475 memset(&addr, 0, sizeof(struct sockaddr_storage)); 1477 memset(&addr, 0, sizeof(struct sockaddr_storage));
1476 1478
1477 cFYI(1, ("UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip)); 1479 cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
1478 1480
1479 if (volume_info->UNCip && volume_info->UNC) { 1481 if (volume_info->UNCip && volume_info->UNC) {
1480 rc = cifs_convert_address(volume_info->UNCip, &addr); 1482 rc = cifs_convert_address(volume_info->UNCip, &addr);
@@ -1486,13 +1488,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1486 } else if (volume_info->UNCip) { 1488 } else if (volume_info->UNCip) {
1487 /* BB using ip addr as tcp_ses name to connect to the 1489 /* BB using ip addr as tcp_ses name to connect to the
1488 DFS root below */ 1490 DFS root below */
1489 cERROR(1, ("Connecting to DFS root not implemented yet")); 1491 cERROR(1, "Connecting to DFS root not implemented yet");
1490 rc = -EINVAL; 1492 rc = -EINVAL;
1491 goto out_err; 1493 goto out_err;
1492	} else /* which tcp_sess DFS root would we connect to */ { 1494	} else /* which tcp_sess DFS root would we connect to */ {
1493 cERROR(1, 1495 cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
1494 ("CIFS mount error: No UNC path (e.g. -o " 1496 "unc=//192.168.1.100/public) specified");
1495 "unc=//192.168.1.100/public) specified"));
1496 rc = -EINVAL; 1497 rc = -EINVAL;
1497 goto out_err; 1498 goto out_err;
1498 } 1499 }
@@ -1539,7 +1540,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1539 ++tcp_ses->srv_count; 1540 ++tcp_ses->srv_count;
1540 1541
1541 if (addr.ss_family == AF_INET6) { 1542 if (addr.ss_family == AF_INET6) {
1542 cFYI(1, ("attempting ipv6 connect")); 1543 cFYI(1, "attempting ipv6 connect");
1543 /* BB should we allow ipv6 on port 139? */ 1544 /* BB should we allow ipv6 on port 139? */
1544 /* other OS never observed in Wild doing 139 with v6 */ 1545 /* other OS never observed in Wild doing 139 with v6 */
1545 sin_server6->sin6_port = htons(volume_info->port); 1546 sin_server6->sin6_port = htons(volume_info->port);
@@ -1553,7 +1554,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1553 rc = ipv4_connect(tcp_ses); 1554 rc = ipv4_connect(tcp_ses);
1554 } 1555 }
1555 if (rc < 0) { 1556 if (rc < 0) {
1556 cERROR(1, ("Error connecting to socket. Aborting operation")); 1557 cERROR(1, "Error connecting to socket. Aborting operation");
1557 goto out_err; 1558 goto out_err;
1558 } 1559 }
1559 1560
@@ -1566,7 +1567,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1566 tcp_ses, "cifsd"); 1567 tcp_ses, "cifsd");
1567 if (IS_ERR(tcp_ses->tsk)) { 1568 if (IS_ERR(tcp_ses->tsk)) {
1568 rc = PTR_ERR(tcp_ses->tsk); 1569 rc = PTR_ERR(tcp_ses->tsk);
1569 cERROR(1, ("error %d create cifsd thread", rc)); 1570 cERROR(1, "error %d create cifsd thread", rc);
1570 module_put(THIS_MODULE); 1571 module_put(THIS_MODULE);
1571 goto out_err; 1572 goto out_err;
1572 } 1573 }
@@ -1615,6 +1616,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1615 int xid; 1616 int xid;
1616 struct TCP_Server_Info *server = ses->server; 1617 struct TCP_Server_Info *server = ses->server;
1617 1618
1619 cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
1618 write_lock(&cifs_tcp_ses_lock); 1620 write_lock(&cifs_tcp_ses_lock);
1619 if (--ses->ses_count > 0) { 1621 if (--ses->ses_count > 0) {
1620 write_unlock(&cifs_tcp_ses_lock); 1622 write_unlock(&cifs_tcp_ses_lock);
@@ -1633,6 +1635,102 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1633 cifs_put_tcp_session(server); 1635 cifs_put_tcp_session(server);
1634} 1636}
1635 1637
1638static struct cifsSesInfo *
1639cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1640{
1641 int rc = -ENOMEM, xid;
1642 struct cifsSesInfo *ses;
1643
1644 xid = GetXid();
1645
1646 ses = cifs_find_smb_ses(server, volume_info->username);
1647 if (ses) {
1648 cFYI(1, "Existing smb sess found (status=%d)", ses->status);
1649
1650 /* existing SMB ses has a server reference already */
1651 cifs_put_tcp_session(server);
1652
1653 mutex_lock(&ses->session_mutex);
1654 rc = cifs_negotiate_protocol(xid, ses);
1655 if (rc) {
1656 mutex_unlock(&ses->session_mutex);
1657 /* problem -- put our ses reference */
1658 cifs_put_smb_ses(ses);
1659 FreeXid(xid);
1660 return ERR_PTR(rc);
1661 }
1662 if (ses->need_reconnect) {
1663 cFYI(1, "Session needs reconnect");
1664 rc = cifs_setup_session(xid, ses,
1665 volume_info->local_nls);
1666 if (rc) {
1667 mutex_unlock(&ses->session_mutex);
1668 /* problem -- put our reference */
1669 cifs_put_smb_ses(ses);
1670 FreeXid(xid);
1671 return ERR_PTR(rc);
1672 }
1673 }
1674 mutex_unlock(&ses->session_mutex);
1675 FreeXid(xid);
1676 return ses;
1677 }
1678
1679 cFYI(1, "Existing smb sess not found");
1680 ses = sesInfoAlloc();
1681 if (ses == NULL)
1682 goto get_ses_fail;
1683
1684 /* new SMB session uses our server ref */
1685 ses->server = server;
1686 if (server->addr.sockAddr6.sin6_family == AF_INET6)
1687 sprintf(ses->serverName, "%pI6",
1688 &server->addr.sockAddr6.sin6_addr);
1689 else
1690 sprintf(ses->serverName, "%pI4",
1691 &server->addr.sockAddr.sin_addr.s_addr);
1692
1693 if (volume_info->username)
1694 strncpy(ses->userName, volume_info->username,
1695 MAX_USERNAME_SIZE);
1696
1697 /* volume_info->password freed at unmount */
1698 if (volume_info->password) {
1699 ses->password = kstrdup(volume_info->password, GFP_KERNEL);
1700 if (!ses->password)
1701 goto get_ses_fail;
1702 }
1703 if (volume_info->domainname) {
1704 int len = strlen(volume_info->domainname);
1705 ses->domainName = kmalloc(len + 1, GFP_KERNEL);
1706 if (ses->domainName)
1707 strcpy(ses->domainName, volume_info->domainname);
1708 }
1709 ses->linux_uid = volume_info->linux_uid;
1710 ses->overrideSecFlg = volume_info->secFlg;
1711
1712 mutex_lock(&ses->session_mutex);
1713 rc = cifs_negotiate_protocol(xid, ses);
1714 if (!rc)
1715 rc = cifs_setup_session(xid, ses, volume_info->local_nls);
1716 mutex_unlock(&ses->session_mutex);
1717 if (rc)
1718 goto get_ses_fail;
1719
1720 /* success, put it on the list */
1721 write_lock(&cifs_tcp_ses_lock);
1722 list_add(&ses->smb_ses_list, &server->smb_ses_list);
1723 write_unlock(&cifs_tcp_ses_lock);
1724
1725 FreeXid(xid);
1726 return ses;
1727
1728get_ses_fail:
1729 sesInfoFree(ses);
1730 FreeXid(xid);
1731 return ERR_PTR(rc);
1732}
1733
1636static struct cifsTconInfo * 1734static struct cifsTconInfo *
1637cifs_find_tcon(struct cifsSesInfo *ses, const char *unc) 1735cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1638{ 1736{
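
cifs_get_smb_ses() is the first of two new refcounted lookup-or-create helpers. The ownership rule to note: the caller hands in a referenced TCP_Server_Info, and on success the returned session owns a server reference, either the one an existing session already held (the extra caller reference is dropped via cifs_put_tcp_session() above) or the one just passed in. A hypothetical pairing sketch:

struct cifsSesInfo *ses;

ses = cifs_get_smb_ses(srvTcp, volume_info);
if (IS_ERR(ses))
	return PTR_ERR(ses);

/* use the session; on the final put, cifs_put_smb_ses() tears the
 * session down and drops its server reference as shown above */
cifs_put_smb_ses(ses);
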
@@ -1661,6 +1759,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1661 int xid; 1759 int xid;
1662 struct cifsSesInfo *ses = tcon->ses; 1760 struct cifsSesInfo *ses = tcon->ses;
1663 1761
1762 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
1664 write_lock(&cifs_tcp_ses_lock); 1763 write_lock(&cifs_tcp_ses_lock);
1665 if (--tcon->tc_count > 0) { 1764 if (--tcon->tc_count > 0) {
1666 write_unlock(&cifs_tcp_ses_lock); 1765 write_unlock(&cifs_tcp_ses_lock);
@@ -1678,6 +1777,80 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1678 cifs_put_smb_ses(ses); 1777 cifs_put_smb_ses(ses);
1679} 1778}
1680 1779
1780static struct cifsTconInfo *
1781cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
1782{
1783 int rc, xid;
1784 struct cifsTconInfo *tcon;
1785
1786 tcon = cifs_find_tcon(ses, volume_info->UNC);
1787 if (tcon) {
1788 cFYI(1, "Found match on UNC path");
1789 /* existing tcon already has a reference */
1790 cifs_put_smb_ses(ses);
1791 if (tcon->seal != volume_info->seal)
1792 cERROR(1, "transport encryption setting "
1793 "conflicts with existing tid");
1794 return tcon;
1795 }
1796
1797 tcon = tconInfoAlloc();
1798 if (tcon == NULL) {
1799 rc = -ENOMEM;
1800 goto out_fail;
1801 }
1802
1803 tcon->ses = ses;
1804 if (volume_info->password) {
1805 tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
1806 if (!tcon->password) {
1807 rc = -ENOMEM;
1808 goto out_fail;
1809 }
1810 }
1811
1812 if (strchr(volume_info->UNC + 3, '\\') == NULL
1813 && strchr(volume_info->UNC + 3, '/') == NULL) {
1814 cERROR(1, "Missing share name");
1815 rc = -ENODEV;
1816 goto out_fail;
1817 }
1818
1819 /* BB Do we need to wrap session_mutex around
1820 * this TCon call and Unix SetFS as
1821 * we do on SessSetup and reconnect? */
1822 xid = GetXid();
1823 rc = CIFSTCon(xid, ses, volume_info->UNC, tcon, volume_info->local_nls);
1824 FreeXid(xid);
1825 cFYI(1, "CIFS Tcon rc = %d", rc);
1826 if (rc)
1827 goto out_fail;
1828
1829 if (volume_info->nodfs) {
1830 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
1831 cFYI(1, "DFS disabled (%d)", tcon->Flags);
1832 }
1833 tcon->seal = volume_info->seal;
1834 /* we can have only one retry value for a connection
1835 to a share so for resources mounted more than once
1836 to the same server share the last value passed in
1837 for the retry flag is used */
1838 tcon->retry = volume_info->retry;
1839 tcon->nocase = volume_info->nocase;
1840 tcon->local_lease = volume_info->local_lease;
1841
1842 write_lock(&cifs_tcp_ses_lock);
1843 list_add(&tcon->tcon_list, &ses->tcon_list);
1844 write_unlock(&cifs_tcp_ses_lock);
1845
1846 return tcon;
1847
1848out_fail:
1849 tconInfoFree(tcon);
1850 return ERR_PTR(rc);
1851}
1852
1853
1681int 1854int
1682get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, 1855get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
1683 const struct nls_table *nls_codepage, unsigned int *pnum_referrals, 1856 const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
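
cifs_get_tcon() applies the same pattern one level up: it either finds a matching tree connection on the session's list (dropping the caller's extra session reference, since the existing tcon already holds one) or allocates a fresh tcon, performs CIFSTCon(), and links it in under cifs_tcp_ses_lock. A hypothetical pairing sketch showing how the references cascade on release:

struct cifsTconInfo *tcon;

tcon = cifs_get_tcon(ses, volume_info);
if (IS_ERR(tcon))
	return PTR_ERR(tcon);

/* the final put walks the chain: cifs_put_tcon() ->
 * cifs_put_smb_ses() -> cifs_put_tcp_session() */
cifs_put_tcon(tcon);
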
@@ -1702,8 +1875,7 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
1702 strcpy(temp_unc + 2, pSesInfo->serverName); 1875 strcpy(temp_unc + 2, pSesInfo->serverName);
1703 strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$"); 1876 strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$");
1704 rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage); 1877 rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage);
1705 cFYI(1, 1878 cFYI(1, "CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid);
1706 ("CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid));
1707 kfree(temp_unc); 1879 kfree(temp_unc);
1708 } 1880 }
1709 if (rc == 0) 1881 if (rc == 0)
@@ -1776,12 +1948,12 @@ ipv4_connect(struct TCP_Server_Info *server)
1776 rc = sock_create_kern(PF_INET, SOCK_STREAM, 1948 rc = sock_create_kern(PF_INET, SOCK_STREAM,
1777 IPPROTO_TCP, &socket); 1949 IPPROTO_TCP, &socket);
1778 if (rc < 0) { 1950 if (rc < 0) {
1779 cERROR(1, ("Error %d creating socket", rc)); 1951 cERROR(1, "Error %d creating socket", rc);
1780 return rc; 1952 return rc;
1781 } 1953 }
1782 1954
1783 /* BB other socket options to set KEEPALIVE, NODELAY? */ 1955 /* BB other socket options to set KEEPALIVE, NODELAY? */
1784 cFYI(1, ("Socket created")); 1956 cFYI(1, "Socket created");
1785 server->ssocket = socket; 1957 server->ssocket = socket;
1786 socket->sk->sk_allocation = GFP_NOFS; 1958 socket->sk->sk_allocation = GFP_NOFS;
1787 cifs_reclassify_socket4(socket); 1959 cifs_reclassify_socket4(socket);
@@ -1826,7 +1998,7 @@ ipv4_connect(struct TCP_Server_Info *server)
1826 if (!connected) { 1998 if (!connected) {
1827 if (orig_port) 1999 if (orig_port)
1828 server->addr.sockAddr.sin_port = orig_port; 2000 server->addr.sockAddr.sin_port = orig_port;
1829 cFYI(1, ("Error %d connecting to server via ipv4", rc)); 2001 cFYI(1, "Error %d connecting to server via ipv4", rc);
1830 sock_release(socket); 2002 sock_release(socket);
1831 server->ssocket = NULL; 2003 server->ssocket = NULL;
1832 return rc; 2004 return rc;
@@ -1854,12 +2026,12 @@ ipv4_connect(struct TCP_Server_Info *server)
1854 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2026 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
1855 (char *)&val, sizeof(val)); 2027 (char *)&val, sizeof(val));
1856 if (rc) 2028 if (rc)
1857 cFYI(1, ("set TCP_NODELAY socket option error %d", rc)); 2029 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
1858 } 2030 }
1859 2031
1860 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx", 2032 cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
1861 socket->sk->sk_sndbuf, 2033 socket->sk->sk_sndbuf,
1862 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo)); 2034 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
1863 2035
1864 /* send RFC1001 sessinit */ 2036 /* send RFC1001 sessinit */
1865 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) { 2037 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
@@ -1937,13 +2109,13 @@ ipv6_connect(struct TCP_Server_Info *server)
1937 rc = sock_create_kern(PF_INET6, SOCK_STREAM, 2109 rc = sock_create_kern(PF_INET6, SOCK_STREAM,
1938 IPPROTO_TCP, &socket); 2110 IPPROTO_TCP, &socket);
1939 if (rc < 0) { 2111 if (rc < 0) {
1940 cERROR(1, ("Error %d creating ipv6 socket", rc)); 2112 cERROR(1, "Error %d creating ipv6 socket", rc);
1941 socket = NULL; 2113 socket = NULL;
1942 return rc; 2114 return rc;
1943 } 2115 }
1944 2116
1945 /* BB other socket options to set KEEPALIVE, NODELAY? */ 2117 /* BB other socket options to set KEEPALIVE, NODELAY? */
1946 cFYI(1, ("ipv6 Socket created")); 2118 cFYI(1, "ipv6 Socket created");
1947 server->ssocket = socket; 2119 server->ssocket = socket;
1948 socket->sk->sk_allocation = GFP_NOFS; 2120 socket->sk->sk_allocation = GFP_NOFS;
1949 cifs_reclassify_socket6(socket); 2121 cifs_reclassify_socket6(socket);
@@ -1987,7 +2159,7 @@ ipv6_connect(struct TCP_Server_Info *server)
1987 if (!connected) { 2159 if (!connected) {
1988 if (orig_port) 2160 if (orig_port)
1989 server->addr.sockAddr6.sin6_port = orig_port; 2161 server->addr.sockAddr6.sin6_port = orig_port;
1990 cFYI(1, ("Error %d connecting to server via ipv6", rc)); 2162 cFYI(1, "Error %d connecting to server via ipv6", rc);
1991 sock_release(socket); 2163 sock_release(socket);
1992 server->ssocket = NULL; 2164 server->ssocket = NULL;
1993 return rc; 2165 return rc;
@@ -2006,7 +2178,7 @@ ipv6_connect(struct TCP_Server_Info *server)
2006 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2178 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2007 (char *)&val, sizeof(val)); 2179 (char *)&val, sizeof(val));
2008 if (rc) 2180 if (rc)
2009 cFYI(1, ("set TCP_NODELAY socket option error %d", rc)); 2181 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
2010 } 2182 }
2011 2183
2012 server->ssocket = socket; 2184 server->ssocket = socket;
@@ -2031,13 +2203,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2031 if (vol_info && vol_info->no_linux_ext) { 2203 if (vol_info && vol_info->no_linux_ext) {
2032 tcon->fsUnixInfo.Capability = 0; 2204 tcon->fsUnixInfo.Capability = 0;
2033 tcon->unix_ext = 0; /* Unix Extensions disabled */ 2205 tcon->unix_ext = 0; /* Unix Extensions disabled */
2034 cFYI(1, ("Linux protocol extensions disabled")); 2206 cFYI(1, "Linux protocol extensions disabled");
2035 return; 2207 return;
2036 } else if (vol_info) 2208 } else if (vol_info)
2037 tcon->unix_ext = 1; /* Unix Extensions supported */ 2209 tcon->unix_ext = 1; /* Unix Extensions supported */
2038 2210
2039 if (tcon->unix_ext == 0) { 2211 if (tcon->unix_ext == 0) {
2040 cFYI(1, ("Unix extensions disabled so not set on reconnect")); 2212 cFYI(1, "Unix extensions disabled so not set on reconnect");
2041 return; 2213 return;
2042 } 2214 }
2043 2215
@@ -2053,12 +2225,11 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2053 cap &= ~CIFS_UNIX_POSIX_ACL_CAP; 2225 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
2054 if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { 2226 if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
2055 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) 2227 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
2056 cERROR(1, ("POSIXPATH support change")); 2228 cERROR(1, "POSIXPATH support change");
2057 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; 2229 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
2058 } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { 2230 } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
2059 cERROR(1, ("possible reconnect error")); 2231 cERROR(1, "possible reconnect error");
2060 cERROR(1, 2232 cERROR(1, "server disabled POSIX path support");
2061 ("server disabled POSIX path support"));
2062 } 2233 }
2063 } 2234 }
2064 2235
@@ -2066,7 +2237,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2066 if (vol_info && vol_info->no_psx_acl) 2237 if (vol_info && vol_info->no_psx_acl)
2067 cap &= ~CIFS_UNIX_POSIX_ACL_CAP; 2238 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
2068 else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { 2239 else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
2069 cFYI(1, ("negotiated posix acl support")); 2240 cFYI(1, "negotiated posix acl support");
2070 if (sb) 2241 if (sb)
2071 sb->s_flags |= MS_POSIXACL; 2242 sb->s_flags |= MS_POSIXACL;
2072 } 2243 }
@@ -2074,7 +2245,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2074 if (vol_info && vol_info->posix_paths == 0) 2245 if (vol_info && vol_info->posix_paths == 0)
2075 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; 2246 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
2076 else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { 2247 else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
2077 cFYI(1, ("negotiate posix pathnames")); 2248 cFYI(1, "negotiate posix pathnames");
2078 if (sb) 2249 if (sb)
2079 CIFS_SB(sb)->mnt_cifs_flags |= 2250 CIFS_SB(sb)->mnt_cifs_flags |=
2080 CIFS_MOUNT_POSIX_PATHS; 2251 CIFS_MOUNT_POSIX_PATHS;
@@ -2089,39 +2260,38 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2089 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) { 2260 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
2090 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { 2261 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
2091 CIFS_SB(sb)->rsize = 127 * 1024; 2262 CIFS_SB(sb)->rsize = 127 * 1024;
2092 cFYI(DBG2, 2263 cFYI(DBG2, "larger reads not supported by srv");
2093 ("larger reads not supported by srv"));
2094 } 2264 }
2095 } 2265 }
2096 2266
2097 2267
2098 cFYI(1, ("Negotiate caps 0x%x", (int)cap)); 2268 cFYI(1, "Negotiate caps 0x%x", (int)cap);
2099#ifdef CONFIG_CIFS_DEBUG2 2269#ifdef CONFIG_CIFS_DEBUG2
2100 if (cap & CIFS_UNIX_FCNTL_CAP) 2270 if (cap & CIFS_UNIX_FCNTL_CAP)
2101 cFYI(1, ("FCNTL cap")); 2271 cFYI(1, "FCNTL cap");
2102 if (cap & CIFS_UNIX_EXTATTR_CAP) 2272 if (cap & CIFS_UNIX_EXTATTR_CAP)
2103 cFYI(1, ("EXTATTR cap")); 2273 cFYI(1, "EXTATTR cap");
2104 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) 2274 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
2105 cFYI(1, ("POSIX path cap")); 2275 cFYI(1, "POSIX path cap");
2106 if (cap & CIFS_UNIX_XATTR_CAP) 2276 if (cap & CIFS_UNIX_XATTR_CAP)
2107 cFYI(1, ("XATTR cap")); 2277 cFYI(1, "XATTR cap");
2108 if (cap & CIFS_UNIX_POSIX_ACL_CAP) 2278 if (cap & CIFS_UNIX_POSIX_ACL_CAP)
2109 cFYI(1, ("POSIX ACL cap")); 2279 cFYI(1, "POSIX ACL cap");
2110 if (cap & CIFS_UNIX_LARGE_READ_CAP) 2280 if (cap & CIFS_UNIX_LARGE_READ_CAP)
2111 cFYI(1, ("very large read cap")); 2281 cFYI(1, "very large read cap");
2112 if (cap & CIFS_UNIX_LARGE_WRITE_CAP) 2282 if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
2113 cFYI(1, ("very large write cap")); 2283 cFYI(1, "very large write cap");
2114#endif /* CIFS_DEBUG2 */ 2284#endif /* CIFS_DEBUG2 */
2115 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { 2285 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
2116 if (vol_info == NULL) { 2286 if (vol_info == NULL) {
2117 cFYI(1, ("resetting capabilities failed")); 2287 cFYI(1, "resetting capabilities failed");
2118 } else 2288 } else
2119 cERROR(1, ("Negotiating Unix capabilities " 2289 cERROR(1, "Negotiating Unix capabilities "
2120 "with the server failed. Consider " 2290 "with the server failed. Consider "
2121 "mounting with the Unix Extensions\n" 2291 "mounting with the Unix Extensions\n"
2122 "disabled, if problems are found, " 2292 "disabled, if problems are found, "
2123 "by specifying the nounix mount " 2293 "by specifying the nounix mount "
2124 "option.")); 2294 "option.");
2125 2295
2126 } 2296 }
2127 } 2297 }
@@ -2151,8 +2321,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2151 struct cifs_sb_info *cifs_sb) 2321 struct cifs_sb_info *cifs_sb)
2152{ 2322{
2153 if (pvolume_info->rsize > CIFSMaxBufSize) { 2323 if (pvolume_info->rsize > CIFSMaxBufSize) {
2154 cERROR(1, ("rsize %d too large, using MaxBufSize", 2324 cERROR(1, "rsize %d too large, using MaxBufSize",
2155 pvolume_info->rsize)); 2325 pvolume_info->rsize);
2156 cifs_sb->rsize = CIFSMaxBufSize; 2326 cifs_sb->rsize = CIFSMaxBufSize;
2157 } else if ((pvolume_info->rsize) && 2327 } else if ((pvolume_info->rsize) &&
2158 (pvolume_info->rsize <= CIFSMaxBufSize)) 2328 (pvolume_info->rsize <= CIFSMaxBufSize))
@@ -2161,8 +2331,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2161 cifs_sb->rsize = CIFSMaxBufSize; 2331 cifs_sb->rsize = CIFSMaxBufSize;
2162 2332
2163 if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) { 2333 if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
2164 cERROR(1, ("wsize %d too large, using 4096 instead", 2334 cERROR(1, "wsize %d too large, using 4096 instead",
2165 pvolume_info->wsize)); 2335 pvolume_info->wsize);
2166 cifs_sb->wsize = 4096; 2336 cifs_sb->wsize = 4096;
2167 } else if (pvolume_info->wsize) 2337 } else if (pvolume_info->wsize)
2168 cifs_sb->wsize = pvolume_info->wsize; 2338 cifs_sb->wsize = pvolume_info->wsize;
@@ -2180,7 +2350,7 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2180 if (cifs_sb->rsize < 2048) { 2350 if (cifs_sb->rsize < 2048) {
2181 cifs_sb->rsize = 2048; 2351 cifs_sb->rsize = 2048;
2182 /* Windows ME may prefer this */ 2352 /* Windows ME may prefer this */
2183 cFYI(1, ("readsize set to minimum: 2048")); 2353 cFYI(1, "readsize set to minimum: 2048");
2184 } 2354 }
2185 /* calculate prepath */ 2355 /* calculate prepath */
2186 cifs_sb->prepath = pvolume_info->prepath; 2356 cifs_sb->prepath = pvolume_info->prepath;
@@ -2198,8 +2368,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2198 cifs_sb->mnt_gid = pvolume_info->linux_gid; 2368 cifs_sb->mnt_gid = pvolume_info->linux_gid;
2199 cifs_sb->mnt_file_mode = pvolume_info->file_mode; 2369 cifs_sb->mnt_file_mode = pvolume_info->file_mode;
2200 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; 2370 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
2201 cFYI(1, ("file mode: 0x%x dir mode: 0x%x", 2371 cFYI(1, "file mode: 0x%x dir mode: 0x%x",
2202 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode)); 2372 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
2203 2373
2204 if (pvolume_info->noperm) 2374 if (pvolume_info->noperm)
2205 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 2375 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -2228,13 +2398,13 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2228 if (pvolume_info->dynperm) 2398 if (pvolume_info->dynperm)
2229 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; 2399 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
2230 if (pvolume_info->direct_io) { 2400 if (pvolume_info->direct_io) {
2231 cFYI(1, ("mounting share using direct i/o")); 2401 cFYI(1, "mounting share using direct i/o");
2232 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2402 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
2233 } 2403 }
2234 2404
2235 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) 2405 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
2236 cERROR(1, ("mount option dynperm ignored if cifsacl " 2406 cERROR(1, "mount option dynperm ignored if cifsacl "
2237 "mount option supported")); 2407 "mount option supported");
2238} 2408}
2239 2409
2240static int 2410static int
@@ -2261,7 +2431,7 @@ cleanup_volume_info(struct smb_vol **pvolume_info)
2261{ 2431{
2262 struct smb_vol *volume_info; 2432 struct smb_vol *volume_info;
2263 2433
2264 if (!pvolume_info && !*pvolume_info) 2434 if (!pvolume_info || !*pvolume_info)
2265 return; 2435 return;
2266 2436
2267 volume_info = *pvolume_info; 2437 volume_info = *pvolume_info;
@@ -2343,11 +2513,11 @@ try_mount_again:
2343 } 2513 }
2344 2514
2345 if (volume_info->nullauth) { 2515 if (volume_info->nullauth) {
2346 cFYI(1, ("null user")); 2516 cFYI(1, "null user");
2347 volume_info->username = ""; 2517 volume_info->username = "";
2348 } else if (volume_info->username) { 2518 } else if (volume_info->username) {
2349 /* BB fixme parse for domain name here */ 2519 /* BB fixme parse for domain name here */
2350 cFYI(1, ("Username: %s", volume_info->username)); 2520 cFYI(1, "Username: %s", volume_info->username);
2351 } else { 2521 } else {
2352 cifserror("No username specified"); 2522 cifserror("No username specified");
2353 /* In userspace mount helper we can get user name from alternate 2523 /* In userspace mount helper we can get user name from alternate
@@ -2356,20 +2526,20 @@ try_mount_again:
2356 goto out; 2526 goto out;
2357 } 2527 }
2358 2528
2359
2360 /* this is needed for ASCII cp to Unicode converts */ 2529 /* this is needed for ASCII cp to Unicode converts */
2361 if (volume_info->iocharset == NULL) { 2530 if (volume_info->iocharset == NULL) {
2362 cifs_sb->local_nls = load_nls_default(); 2531 /* load_nls_default cannot return null */
2363 /* load_nls_default can not return null */ 2532 volume_info->local_nls = load_nls_default();
2364 } else { 2533 } else {
2365 cifs_sb->local_nls = load_nls(volume_info->iocharset); 2534 volume_info->local_nls = load_nls(volume_info->iocharset);
2366 if (cifs_sb->local_nls == NULL) { 2535 if (volume_info->local_nls == NULL) {
2367 cERROR(1, ("CIFS mount error: iocharset %s not found", 2536 cERROR(1, "CIFS mount error: iocharset %s not found",
2368 volume_info->iocharset)); 2537 volume_info->iocharset);
2369 rc = -ELIBACC; 2538 rc = -ELIBACC;
2370 goto out; 2539 goto out;
2371 } 2540 }
2372 } 2541 }
2542 cifs_sb->local_nls = volume_info->local_nls;
2373 2543
2374 /* get a reference to a tcp session */ 2544 /* get a reference to a tcp session */
2375 srvTcp = cifs_get_tcp_session(volume_info); 2545 srvTcp = cifs_get_tcp_session(volume_info);
@@ -2378,148 +2548,30 @@ try_mount_again:
2378 goto out; 2548 goto out;
2379 } 2549 }
2380 2550
2381 pSesInfo = cifs_find_smb_ses(srvTcp, volume_info->username); 2551 /* get a reference to a SMB session */
2382 if (pSesInfo) { 2552 pSesInfo = cifs_get_smb_ses(srvTcp, volume_info);
2383 cFYI(1, ("Existing smb sess found (status=%d)", 2553 if (IS_ERR(pSesInfo)) {
2384 pSesInfo->status)); 2554 rc = PTR_ERR(pSesInfo);
2385 /* 2555 pSesInfo = NULL;
2386 * The existing SMB session already has a reference to srvTcp, 2556 goto mount_fail_check;
2387 * so we can put back the extra one we got before
2388 */
2389 cifs_put_tcp_session(srvTcp);
2390
2391 mutex_lock(&pSesInfo->session_mutex);
2392 if (pSesInfo->need_reconnect) {
2393 cFYI(1, ("Session needs reconnect"));
2394 rc = cifs_setup_session(xid, pSesInfo,
2395 cifs_sb->local_nls);
2396 }
2397 mutex_unlock(&pSesInfo->session_mutex);
2398 } else if (!rc) {
2399 cFYI(1, ("Existing smb sess not found"));
2400 pSesInfo = sesInfoAlloc();
2401 if (pSesInfo == NULL) {
2402 rc = -ENOMEM;
2403 goto mount_fail_check;
2404 }
2405
2406 /* new SMB session uses our srvTcp ref */
2407 pSesInfo->server = srvTcp;
2408 if (srvTcp->addr.sockAddr6.sin6_family == AF_INET6)
2409 sprintf(pSesInfo->serverName, "%pI6",
2410 &srvTcp->addr.sockAddr6.sin6_addr);
2411 else
2412 sprintf(pSesInfo->serverName, "%pI4",
2413 &srvTcp->addr.sockAddr.sin_addr.s_addr);
2414
2415 write_lock(&cifs_tcp_ses_lock);
2416 list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
2417 write_unlock(&cifs_tcp_ses_lock);
2418
2419 /* volume_info->password freed at unmount */
2420 if (volume_info->password) {
2421 pSesInfo->password = kstrdup(volume_info->password,
2422 GFP_KERNEL);
2423 if (!pSesInfo->password) {
2424 rc = -ENOMEM;
2425 goto mount_fail_check;
2426 }
2427 }
2428 if (volume_info->username)
2429 strncpy(pSesInfo->userName, volume_info->username,
2430 MAX_USERNAME_SIZE);
2431 if (volume_info->domainname) {
2432 int len = strlen(volume_info->domainname);
2433 pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
2434 if (pSesInfo->domainName)
2435 strcpy(pSesInfo->domainName,
2436 volume_info->domainname);
2437 }
2438 pSesInfo->linux_uid = volume_info->linux_uid;
2439 pSesInfo->overrideSecFlg = volume_info->secFlg;
2440 mutex_lock(&pSesInfo->session_mutex);
2441
2442 /* BB FIXME need to pass vol->secFlgs BB */
2443 rc = cifs_setup_session(xid, pSesInfo,
2444 cifs_sb->local_nls);
2445 mutex_unlock(&pSesInfo->session_mutex);
2446 } 2557 }
2447 2558
2448 /* search for existing tcon to this server share */ 2559 setup_cifs_sb(volume_info, cifs_sb);
2449 if (!rc) { 2560 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2450 setup_cifs_sb(volume_info, cifs_sb); 2561 sb->s_maxbytes = MAX_LFS_FILESIZE;
2451 2562 else
2452 tcon = cifs_find_tcon(pSesInfo, volume_info->UNC); 2563 sb->s_maxbytes = MAX_NON_LFS;
2453 if (tcon) {
2454 cFYI(1, ("Found match on UNC path"));
2455 /* existing tcon already has a reference */
2456 cifs_put_smb_ses(pSesInfo);
2457 if (tcon->seal != volume_info->seal)
2458 cERROR(1, ("transport encryption setting "
2459 "conflicts with existing tid"));
2460 } else {
2461 tcon = tconInfoAlloc();
2462 if (tcon == NULL) {
2463 rc = -ENOMEM;
2464 goto mount_fail_check;
2465 }
2466
2467 tcon->ses = pSesInfo;
2468 if (volume_info->password) {
2469 tcon->password = kstrdup(volume_info->password,
2470 GFP_KERNEL);
2471 if (!tcon->password) {
2472 rc = -ENOMEM;
2473 goto mount_fail_check;
2474 }
2475 }
2476
2477 if ((strchr(volume_info->UNC + 3, '\\') == NULL)
2478 && (strchr(volume_info->UNC + 3, '/') == NULL)) {
2479 cERROR(1, ("Missing share name"));
2480 rc = -ENODEV;
2481 goto mount_fail_check;
2482 } else {
2483 /* BB Do we need to wrap sesSem around
2484 * this TCon call and Unix SetFS as
2485 * we do on SessSetup and reconnect? */
2486 rc = CIFSTCon(xid, pSesInfo, volume_info->UNC,
2487 tcon, cifs_sb->local_nls);
2488 cFYI(1, ("CIFS Tcon rc = %d", rc));
2489 if (volume_info->nodfs) {
2490 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
2491 cFYI(1, ("DFS disabled (%d)",
2492 tcon->Flags));
2493 }
2494 }
2495 if (rc)
2496 goto remote_path_check;
2497 tcon->seal = volume_info->seal;
2498 write_lock(&cifs_tcp_ses_lock);
2499 list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
2500 write_unlock(&cifs_tcp_ses_lock);
2501 }
2502
2503 /* we can have only one retry value for a connection
2504 to a share so for resources mounted more than once
2505 to the same server share the last value passed in
2506 for the retry flag is used */
2507 tcon->retry = volume_info->retry;
2508 tcon->nocase = volume_info->nocase;
2509 tcon->local_lease = volume_info->local_lease;
2510 }
2511 if (pSesInfo) {
2512 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2513 sb->s_maxbytes = MAX_LFS_FILESIZE;
2514 else
2515 sb->s_maxbytes = MAX_NON_LFS;
2516 }
2517 2564
2518 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 2565 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
2519 sb->s_time_gran = 100; 2566 sb->s_time_gran = 100;
2520 2567
2521 if (rc) 2568 /* search for existing tcon to this server share */
2569 tcon = cifs_get_tcon(pSesInfo, volume_info);
2570 if (IS_ERR(tcon)) {
2571 rc = PTR_ERR(tcon);
2572 tcon = NULL;
2522 goto remote_path_check; 2573 goto remote_path_check;
2574 }
2523 2575
2524 cifs_sb->tcon = tcon; 2576 cifs_sb->tcon = tcon;
2525 2577
@@ -2543,7 +2595,7 @@ try_mount_again:
2543 2595
2544 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { 2596 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
2545 cifs_sb->rsize = 1024 * 127; 2597 cifs_sb->rsize = 1024 * 127;
2546 cFYI(DBG2, ("no very large read support, rsize now 127K")); 2598 cFYI(DBG2, "no very large read support, rsize now 127K");
2547 } 2599 }
2548 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X)) 2600 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
2549 cifs_sb->wsize = min(cifs_sb->wsize, 2601 cifs_sb->wsize = min(cifs_sb->wsize,
@@ -2592,7 +2644,7 @@ remote_path_check:
2592 goto mount_fail_check; 2644 goto mount_fail_check;
2593 } 2645 }
2594 2646
2595 cFYI(1, ("Getting referral for: %s", full_path)); 2647 cFYI(1, "Getting referral for: %s", full_path);
2596 rc = get_dfs_path(xid, pSesInfo , full_path + 1, 2648 rc = get_dfs_path(xid, pSesInfo , full_path + 1,
2597 cifs_sb->local_nls, &num_referrals, &referrals, 2649 cifs_sb->local_nls, &num_referrals, &referrals,
2598 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 2650 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -2706,7 +2758,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2706 by Samba (not sure whether other servers allow 2758 by Samba (not sure whether other servers allow
2707 NTLMv2 password here) */ 2759 NTLMv2 password here) */
2708#ifdef CONFIG_CIFS_WEAK_PW_HASH 2760#ifdef CONFIG_CIFS_WEAK_PW_HASH
2709 if ((extended_security & CIFSSEC_MAY_LANMAN) && 2761 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
2710 (ses->server->secType == LANMAN)) 2762 (ses->server->secType == LANMAN))
2711 calc_lanman_hash(tcon->password, ses->server->cryptKey, 2763 calc_lanman_hash(tcon->password, ses->server->cryptKey,
2712 ses->server->secMode & 2764 ses->server->secMode &
@@ -2777,13 +2829,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2777 if (length == 3) { 2829 if (length == 3) {
2778 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && 2830 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
2779 (bcc_ptr[2] == 'C')) { 2831 (bcc_ptr[2] == 'C')) {
2780 cFYI(1, ("IPC connection")); 2832 cFYI(1, "IPC connection");
2781 tcon->ipc = 1; 2833 tcon->ipc = 1;
2782 } 2834 }
2783 } else if (length == 2) { 2835 } else if (length == 2) {
2784 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { 2836 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
2785 /* the most common case */ 2837 /* the most common case */
2786 cFYI(1, ("disk share connection")); 2838 cFYI(1, "disk share connection");
2787 } 2839 }
2788 } 2840 }
2789 bcc_ptr += length + 1; 2841 bcc_ptr += length + 1;
@@ -2796,7 +2848,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2796 bytes_left, is_unicode, 2848 bytes_left, is_unicode,
2797 nls_codepage); 2849 nls_codepage);
2798 2850
2799 cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem)); 2851 cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem);
2800 2852
2801 if ((smb_buffer_response->WordCount == 3) || 2853 if ((smb_buffer_response->WordCount == 3) ||
2802 (smb_buffer_response->WordCount == 7)) 2854 (smb_buffer_response->WordCount == 7))
@@ -2804,7 +2856,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2804 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); 2856 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
2805 else 2857 else
2806 tcon->Flags = 0; 2858 tcon->Flags = 0;
2807 cFYI(1, ("Tcon flags: 0x%x ", tcon->Flags)); 2859 cFYI(1, "Tcon flags: 0x%x ", tcon->Flags);
2808 } else if ((rc == 0) && tcon == NULL) { 2860 } else if ((rc == 0) && tcon == NULL) {
2809 /* all we need to save for IPC$ connection */ 2861 /* all we need to save for IPC$ connection */
2810 ses->ipc_tid = smb_buffer_response->Tid; 2862 ses->ipc_tid = smb_buffer_response->Tid;
@@ -2832,57 +2884,61 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
2832 return rc; 2884 return rc;
2833} 2885}
2834 2886
2835 int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, struct nls_table *nls_info) 2887 int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
2837{ 2888{
2838 int rc = 0; 2889 int rc = 0;
2839 int first_time = 0; 2890 struct TCP_Server_Info *server = ses->server;
2840 struct TCP_Server_Info *server = pSesInfo->server; 2891
2841 2892 /* only send once per connect */
2842 /* what if server changes its buffer size after dropping the session? */ 2893 if (server->maxBuf != 0)
2843 if (server->maxBuf == 0) /* no need to send on reconnect */ { 2894 return 0;
2844 rc = CIFSSMBNegotiate(xid, pSesInfo); 2895
2845 if (rc == -EAGAIN) { 2896 rc = CIFSSMBNegotiate(xid, ses);
2846 /* retry only once on 1st time connection */ 2897 if (rc == -EAGAIN) {
2847 rc = CIFSSMBNegotiate(xid, pSesInfo); 2898 /* retry only once on 1st time connection */
2848 if (rc == -EAGAIN) 2899 rc = CIFSSMBNegotiate(xid, ses);
2849 rc = -EHOSTDOWN; 2900 if (rc == -EAGAIN)
2850 } 2901 rc = -EHOSTDOWN;
2851 if (rc == 0) { 2902 }
2852 spin_lock(&GlobalMid_Lock); 2903 if (rc == 0) {
2853 if (server->tcpStatus != CifsExiting) 2904 spin_lock(&GlobalMid_Lock);
2854 server->tcpStatus = CifsGood; 2905 if (server->tcpStatus != CifsExiting)
2855 else 2906 server->tcpStatus = CifsGood;
2856 rc = -EHOSTDOWN; 2907 else
2857 spin_unlock(&GlobalMid_Lock); 2908 rc = -EHOSTDOWN;
2909 spin_unlock(&GlobalMid_Lock);
2858 2910
2859 }
2860 first_time = 1;
2861 } 2911 }
2862 2912
2863 if (rc) 2913 return rc;
2864 goto ss_err_exit; 2914}
2915
2916
2917int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2918 struct nls_table *nls_info)
2919{
2920 int rc = 0;
2921 struct TCP_Server_Info *server = ses->server;
2865 2922
2866 pSesInfo->flags = 0; 2923 ses->flags = 0;
2867 pSesInfo->capabilities = server->capabilities; 2924 ses->capabilities = server->capabilities;
2868 if (linuxExtEnabled == 0) 2925 if (linuxExtEnabled == 0)
2869 pSesInfo->capabilities &= (~CAP_UNIX); 2926 ses->capabilities &= (~CAP_UNIX);
2870 2927
2871 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", 2928 cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
2872 server->secMode, server->capabilities, server->timeAdj)); 2929 server->secMode, server->capabilities, server->timeAdj);
2873 2930
2874 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); 2931 rc = CIFS_SessSetup(xid, ses, nls_info);
2875 if (rc) { 2932 if (rc) {
2876 cERROR(1, ("Send error in SessSetup = %d", rc)); 2933 cERROR(1, "Send error in SessSetup = %d", rc);
2877 } else { 2934 } else {
2878 cFYI(1, ("CIFS Session Established successfully")); 2935 cFYI(1, "CIFS Session Established successfully");
2879 spin_lock(&GlobalMid_Lock); 2936 spin_lock(&GlobalMid_Lock);
2880 pSesInfo->status = CifsGood; 2937 ses->status = CifsGood;
2881 pSesInfo->need_reconnect = false; 2938 ses->need_reconnect = false;
2882 spin_unlock(&GlobalMid_Lock); 2939 spin_unlock(&GlobalMid_Lock);
2883 } 2940 }
2884 2941
2885ss_err_exit:
2886 return rc; 2942 return rc;
2887} 2943}
2888 2944
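Most of the connect.c churn above is mechanical fallout from a single change: it assumes the cFYI()/cERROR() debug macros were converted to printf-style variadic macros, which lets every call site drop the extra parentheses that used to wrap the format string and its arguments. A minimal sketch of that macro shape — the real definitions live in fs/cifs/cifs_debug.h and may differ in detail:

	/* sketch only -- assumes the debug macros became variadic */
	#define cFYI(set, fmt, ...)						\
	do {									\
		if (set)							\
			printk(KERN_DEBUG "CIFS: " fmt "\n", ##__VA_ARGS__);	\
	} while (0)

	/* old call style:  cFYI(1, ("rc = %d", rc));   extra parens   */
	/* new call style:  cFYI(1, "rc = %d", rc);     plain arg list */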
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e9f7ecc2714b..391816b461ca 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -73,7 +73,7 @@ cifs_bp_rename_retry:
73 namelen += (1 + temp->d_name.len); 73 namelen += (1 + temp->d_name.len);
74 temp = temp->d_parent; 74 temp = temp->d_parent;
75 if (temp == NULL) { 75 if (temp == NULL) {
76 cERROR(1, ("corrupt dentry")); 76 cERROR(1, "corrupt dentry");
77 return NULL; 77 return NULL;
78 } 78 }
79 } 79 }
@@ -90,19 +90,18 @@ cifs_bp_rename_retry:
90 full_path[namelen] = dirsep; 90 full_path[namelen] = dirsep;
91 strncpy(full_path + namelen + 1, temp->d_name.name, 91 strncpy(full_path + namelen + 1, temp->d_name.name,
92 temp->d_name.len); 92 temp->d_name.len);
93 cFYI(0, ("name: %s", full_path + namelen)); 93 cFYI(0, "name: %s", full_path + namelen);
94 } 94 }
95 temp = temp->d_parent; 95 temp = temp->d_parent;
96 if (temp == NULL) { 96 if (temp == NULL) {
97 cERROR(1, ("corrupt dentry")); 97 cERROR(1, "corrupt dentry");
98 kfree(full_path); 98 kfree(full_path);
99 return NULL; 99 return NULL;
100 } 100 }
101 } 101 }
102 if (namelen != pplen + dfsplen) { 102 if (namelen != pplen + dfsplen) {
103 cERROR(1, ("did not end path lookup where expected namelen is %d", namelen)); 103 cERROR(1, "did not end path lookup where expected namelen is %d", namelen);
106 /* presumably this is only possible if racing with a rename 105 /* presumably this is only possible if racing with a rename
107 of one of the parent directories (we can not lock the dentries 106 of one of the parent directories (we can not lock the dentries
108 above us to prevent this, but retrying should be harmless) */ 107 above us to prevent this, but retrying should be harmless) */
@@ -130,6 +129,12 @@ cifs_bp_rename_retry:
130 return full_path; 129 return full_path;
131} 130}
132 131
132/*
133 * When called with struct file pointer set to NULL, there is no way we could
134 * update file->private_data, but getting it stuck on openFileList provides a
135 * way to access it from cifs_fill_filedata and thereby set file->private_data
136 * from cifs_open.
137 */
133struct cifsFileInfo * 138struct cifsFileInfo *
134cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, 139cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
135 struct file *file, struct vfsmount *mnt, unsigned int oflags) 140 struct file *file, struct vfsmount *mnt, unsigned int oflags)
@@ -173,7 +178,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 178 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
174 pCifsInode->clientCanCacheAll = true; 179 pCifsInode->clientCanCacheAll = true;
175 pCifsInode->clientCanCacheRead = true; 180 pCifsInode->clientCanCacheRead = true;
176 cFYI(1, ("Exclusive Oplock inode %p", newinode)); 181 cFYI(1, "Exclusive Oplock inode %p", newinode);
177 } else if ((oplock & 0xF) == OPLOCK_READ) 182 } else if ((oplock & 0xF) == OPLOCK_READ)
178 pCifsInode->clientCanCacheRead = true; 183 pCifsInode->clientCanCacheRead = true;
179 } 184 }
@@ -183,16 +188,17 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
183} 188}
184 189
185int cifs_posix_open(char *full_path, struct inode **pinode, 190int cifs_posix_open(char *full_path, struct inode **pinode,
186 struct vfsmount *mnt, int mode, int oflags, 191 struct vfsmount *mnt, struct super_block *sb,
187 __u32 *poplock, __u16 *pnetfid, int xid) 192 int mode, int oflags,
193 __u32 *poplock, __u16 *pnetfid, int xid)
188{ 194{
189 int rc; 195 int rc;
190 FILE_UNIX_BASIC_INFO *presp_data; 196 FILE_UNIX_BASIC_INFO *presp_data;
191 __u32 posix_flags = 0; 197 __u32 posix_flags = 0;
192 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb); 198 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
193 struct cifs_fattr fattr; 199 struct cifs_fattr fattr;
194 200
195 cFYI(1, ("posix open %s", full_path)); 201 cFYI(1, "posix open %s", full_path);
196 202
197 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); 203 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
198 if (presp_data == NULL) 204 if (presp_data == NULL)
@@ -242,7 +248,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
242 248
243 /* get new inode and set it up */ 249 /* get new inode and set it up */
244 if (*pinode == NULL) { 250 if (*pinode == NULL) {
245 *pinode = cifs_iget(mnt->mnt_sb, &fattr); 251 cifs_fill_uniqueid(sb, &fattr);
252 *pinode = cifs_iget(sb, &fattr);
246 if (!*pinode) { 253 if (!*pinode) {
247 rc = -ENOMEM; 254 rc = -ENOMEM;
248 goto posix_open_ret; 255 goto posix_open_ret;
@@ -251,7 +258,18 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
251 cifs_fattr_to_inode(*pinode, &fattr); 258 cifs_fattr_to_inode(*pinode, &fattr);
252 } 259 }
253 260
254 cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags); 261 /*
262 * cifs_fill_filedata() takes care of setting cifsFileInfo pointer to
263 * file->private_data.
264 */
265 if (mnt) {
266 struct cifsFileInfo *pfile_info;
267
268 pfile_info = cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt,
269 oflags);
270 if (pfile_info == NULL)
271 rc = -ENOMEM;
272 }
255 273
256posix_open_ret: 274posix_open_ret:
257 kfree(presp_data); 275 kfree(presp_data);
@@ -315,13 +333,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
315 if (nd && (nd->flags & LOOKUP_OPEN)) 333 if (nd && (nd->flags & LOOKUP_OPEN))
316 oflags = nd->intent.open.flags; 334 oflags = nd->intent.open.flags;
317 else 335 else
318 oflags = FMODE_READ; 336 oflags = FMODE_READ | SMB_O_CREAT;
319 337
320 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 338 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
321 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 339 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
322 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 340 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
323 rc = cifs_posix_open(full_path, &newinode, nd->path.mnt, 341 rc = cifs_posix_open(full_path, &newinode,
324 mode, oflags, &oplock, &fileHandle, xid); 342 nd ? nd->path.mnt : NULL,
343 inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
325 /* EIO could indicate that (posix open) operation is not 344 /* EIO could indicate that (posix open) operation is not
326 supported, despite what server claimed in capability 345 supported, despite what server claimed in capability
327 negotation. EREMOTE indicates DFS junction, which is not 346 negotation. EREMOTE indicates DFS junction, which is not
@@ -358,7 +377,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
358 else if ((oflags & O_CREAT) == O_CREAT) 377 else if ((oflags & O_CREAT) == O_CREAT)
359 disposition = FILE_OPEN_IF; 378 disposition = FILE_OPEN_IF;
360 else 379 else
361 cFYI(1, ("Create flag not set in create function")); 380 cFYI(1, "Create flag not set in create function");
362 } 381 }
363 382
364 /* BB add processing to set equivalent of mode - e.g. via CreateX with 383 /* BB add processing to set equivalent of mode - e.g. via CreateX with
@@ -394,7 +413,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
394 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 413 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
395 } 414 }
396 if (rc) { 415 if (rc) {
397 cFYI(1, ("cifs_create returned 0x%x", rc)); 416 cFYI(1, "cifs_create returned 0x%x", rc);
398 goto cifs_create_out; 417 goto cifs_create_out;
399 } 418 }
400 419
@@ -457,15 +476,22 @@ cifs_create_set_dentry:
457 if (rc == 0) 476 if (rc == 0)
458 setup_cifs_dentry(tcon, direntry, newinode); 477 setup_cifs_dentry(tcon, direntry, newinode);
459 else 478 else
460 cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc)); 479 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
461 480
462 /* nfsd case - nfs srv does not set nd */ 481 /* nfsd case - nfs srv does not set nd */
463 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { 482 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
464 /* mknod case - do not leave file open */ 483 /* mknod case - do not leave file open */
465 CIFSSMBClose(xid, tcon, fileHandle); 484 CIFSSMBClose(xid, tcon, fileHandle);
466 } else if (!(posix_create) && (newinode)) { 485 } else if (!(posix_create) && (newinode)) {
467 cifs_new_fileinfo(newinode, fileHandle, NULL, 486 struct cifsFileInfo *pfile_info;
468 nd->path.mnt, oflags); 487 /*
488 * cifs_fill_filedata() takes care of setting cifsFileInfo
489 * pointer to file->private_data.
490 */
491 pfile_info = cifs_new_fileinfo(newinode, fileHandle, NULL,
492 nd->path.mnt, oflags);
493 if (pfile_info == NULL)
494 rc = -ENOMEM;
469 } 495 }
470cifs_create_out: 496cifs_create_out:
471 kfree(buf); 497 kfree(buf);
@@ -531,7 +557,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
531 u16 fileHandle; 557 u16 fileHandle;
532 FILE_ALL_INFO *buf; 558 FILE_ALL_INFO *buf;
533 559
534 cFYI(1, ("sfu compat create special file")); 560 cFYI(1, "sfu compat create special file");
535 561
536 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 562 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
537 if (buf == NULL) { 563 if (buf == NULL) {
@@ -616,8 +642,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
616 642
617 xid = GetXid(); 643 xid = GetXid();
618 644
619 cFYI(1, ("parent inode = 0x%p name is: %s and dentry = 0x%p", 645 cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
620 parent_dir_inode, direntry->d_name.name, direntry)); 646 parent_dir_inode, direntry->d_name.name, direntry);
621 647
622 /* check whether path exists */ 648 /* check whether path exists */
623 649
@@ -632,7 +658,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
632 int i; 658 int i;
633 for (i = 0; i < direntry->d_name.len; i++) 659 for (i = 0; i < direntry->d_name.len; i++)
634 if (direntry->d_name.name[i] == '\\') { 660 if (direntry->d_name.name[i] == '\\') {
635 cFYI(1, ("Invalid file name")); 661 cFYI(1, "Invalid file name");
636 FreeXid(xid); 662 FreeXid(xid);
637 return ERR_PTR(-EINVAL); 663 return ERR_PTR(-EINVAL);
638 } 664 }
@@ -657,11 +683,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
657 } 683 }
658 684
659 if (direntry->d_inode != NULL) { 685 if (direntry->d_inode != NULL) {
660 cFYI(1, ("non-NULL inode in lookup")); 686 cFYI(1, "non-NULL inode in lookup");
661 } else { 687 } else {
662 cFYI(1, ("NULL inode in lookup")); 688 cFYI(1, "NULL inode in lookup");
663 } 689 }
664 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); 690 cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode);
665 691
666 /* Posix open is only called (at lookup time) for file create now. 692 /* Posix open is only called (at lookup time) for file create now.
667 * For opens (rather than creates), because we do not know if it 693 * For opens (rather than creates), because we do not know if it
@@ -678,6 +704,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
678 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 704 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
679 (nd->intent.open.flags & O_CREAT)) { 705 (nd->intent.open.flags & O_CREAT)) {
680 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt, 706 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
707 parent_dir_inode->i_sb,
681 nd->intent.open.create_mode, 708 nd->intent.open.create_mode,
682 nd->intent.open.flags, &oplock, 709 nd->intent.open.flags, &oplock,
683 &fileHandle, xid); 710 &fileHandle, xid);
@@ -723,7 +750,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
723 /* if it was once a directory (but how can we tell?) we could do 750 /* if it was once a directory (but how can we tell?) we could do
724 shrink_dcache_parent(direntry); */ 751 shrink_dcache_parent(direntry); */
725 } else if (rc != -EACCES) { 752 } else if (rc != -EACCES) {
726 cERROR(1, ("Unexpected lookup error %d", rc)); 753 cERROR(1, "Unexpected lookup error %d", rc);
727 /* We special case check for Access Denied - since that 754 /* We special case check for Access Denied - since that
728 is a common return code */ 755 is a common return code */
729 } 756 }
@@ -742,8 +769,8 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
742 if (cifs_revalidate_dentry(direntry)) 769 if (cifs_revalidate_dentry(direntry))
743 return 0; 770 return 0;
744 } else { 771 } else {
745 cFYI(1, ("neg dentry 0x%p name = %s", 772 cFYI(1, "neg dentry 0x%p name = %s",
746 direntry, direntry->d_name.name)); 773 direntry, direntry->d_name.name);
747 if (time_after(jiffies, direntry->d_time + HZ) || 774 if (time_after(jiffies, direntry->d_time + HZ) ||
748 !lookupCacheEnabled) { 775 !lookupCacheEnabled) {
749 d_drop(direntry); 776 d_drop(direntry);
@@ -758,7 +785,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
758{ 785{
759 int rc = 0; 786 int rc = 0;
760 787
761 cFYI(1, ("In cifs d_delete, name = %s", direntry->d_name.name)); 788 cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name);
762 789
763 return rc; 790 return rc;
764} */ 791} */
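The dir.c hunks above thread a struct super_block through cifs_posix_open() instead of deriving it from the vfsmount, so callers that have no nameidata (the nfsd export path) can pass a NULL mnt and still reach the cifs_sb. A simplified sketch of the resulting call pattern, condensed from the hunks above:

	/* sketch: sb passed explicitly; mnt may be NULL on the nfsd path */
	rc = cifs_posix_open(full_path, &newinode,
			     nd ? nd->path.mnt : NULL,
			     inode->i_sb, mode, oflags,
			     &oplock, &fileHandle, xid);

When no mount point is available, no cifsFileInfo is attached for the caller, which is why the create paths above now also treat a NULL return from cifs_new_fileinfo() as -ENOMEM.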
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..4db2c5e7283f 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,6 +23,7 @@
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */ 24 */
25 25
26#include <linux/slab.h>
26#include <keys/user-type.h> 27#include <keys/user-type.h>
27#include "dns_resolve.h" 28#include "dns_resolve.h"
28#include "cifsglob.h" 29#include "cifsglob.h"
@@ -105,14 +106,14 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
105 /* search for server name delimiter */ 106 /* search for server name delimiter */
106 len = strlen(unc); 107 len = strlen(unc);
107 if (len < 3) { 108 if (len < 3) {
108 cFYI(1, ("%s: unc is too short: %s", __func__, unc)); 109 cFYI(1, "%s: unc is too short: %s", __func__, unc);
109 return -EINVAL; 110 return -EINVAL;
110 } 111 }
111 len -= 2; 112 len -= 2;
112 name = memchr(unc+2, '\\', len); 113 name = memchr(unc+2, '\\', len);
113 if (!name) { 114 if (!name) {
114 cFYI(1, ("%s: probably server name is whole unc: %s", 115 cFYI(1, "%s: probably server name is whole unc: %s",
115 __func__, unc)); 116 __func__, unc);
116 } else { 117 } else {
117 len = (name - unc) - 2/* leading // */; 118 len = (name - unc) - 2/* leading // */;
118 } 119 }
@@ -126,8 +127,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
126 name[len] = 0; 127 name[len] = 0;
127 128
128 if (is_ip(name)) { 129 if (is_ip(name)) {
129 cFYI(1, ("%s: it is IP, skipping dns upcall: %s", 130 cFYI(1, "%s: it is IP, skipping dns upcall: %s",
130 __func__, name)); 131 __func__, name);
131 data = name; 132 data = name;
132 goto skip_upcall; 133 goto skip_upcall;
133 } 134 }
@@ -137,7 +138,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
137 len = rkey->type_data.x[0]; 138 len = rkey->type_data.x[0];
138 data = rkey->payload.data; 139 data = rkey->payload.data;
139 } else { 140 } else {
140 cERROR(1, ("%s: unable to resolve: %s", __func__, name)); 141 cERROR(1, "%s: unable to resolve: %s", __func__, name);
141 goto out; 142 goto out;
142 } 143 }
143 144
@@ -147,10 +148,10 @@ skip_upcall:
147 if (*ip_addr) { 148 if (*ip_addr) {
148 memcpy(*ip_addr, data, len + 1); 149 memcpy(*ip_addr, data, len + 1);
149 if (!IS_ERR(rkey)) 150 if (!IS_ERR(rkey))
150 cFYI(1, ("%s: resolved: %s to %s", __func__, 151 cFYI(1, "%s: resolved: %s to %s", __func__,
151 name, 152 name,
152 *ip_addr 153 *ip_addr
153 )); 154 );
154 rc = 0; 155 rc = 0;
155 } else { 156 } else {
156 rc = -ENOMEM; 157 rc = -ENOMEM;
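Beyond the message cleanup, the only functional change to dns_resolve.c is the new #include <linux/slab.h>. This follows the kernel-wide header split of that period, after which kmalloc()/kfree() users could no longer count on slab.h arriving indirectly through other headers. A hedged sketch of the resulting idiom (the helper is hypothetical, for illustration only):

	#include <linux/slab.h>		/* now required explicitly for kmalloc/kfree */
	#include <linux/string.h>

	static char *dup_payload(const char *data, size_t len)
	{
		char *buf = kmalloc(len + 1, GFP_KERNEL);
		if (buf)
			memcpy(buf, data, len + 1);	/* len bytes plus NUL */
		return buf;
	}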
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 6177f7cca16a..993f82045bf6 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -49,7 +49,7 @@
49static struct dentry *cifs_get_parent(struct dentry *dentry) 49static struct dentry *cifs_get_parent(struct dentry *dentry)
50{ 50{
51 /* BB need to add code here eventually to enable export via NFSD */ 51 /* BB need to add code here eventually to enable export via NFSD */
52 cFYI(1, ("get parent for %p", dentry)); 52 cFYI(1, "get parent for %p", dentry);
53 return ERR_PTR(-EACCES); 53 return ERR_PTR(-EACCES);
54} 54}
55 55
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ca2ba7a0193c..f1ff785b2292 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * vfs operations that deal with files 4 * vfs operations that deal with files
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2002,2007 6 * Copyright (C) International Business Machines Corp., 2002,2010
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * Jeremy Allison (jra@samba.org) 8 * Jeremy Allison (jra@samba.org)
9 * 9 *
@@ -31,6 +31,7 @@
31#include <linux/task_io_accounting_ops.h> 31#include <linux/task_io_accounting_ops.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h> 33#include <linux/mount.h>
34#include <linux/slab.h>
34#include <asm/div64.h> 35#include <asm/div64.h>
35#include "cifsfs.h" 36#include "cifsfs.h"
36#include "cifspdu.h" 37#include "cifspdu.h"
@@ -107,8 +108,7 @@ static inline int cifs_get_disposition(unsigned int flags)
107/* all arguments to this function must be checked for validity in caller */ 108/* all arguments to this function must be checked for validity in caller */
108static inline int 109static inline int
109cifs_posix_open_inode_helper(struct inode *inode, struct file *file, 110cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
110 struct cifsInodeInfo *pCifsInode, 111 struct cifsInodeInfo *pCifsInode, __u32 oplock,
111 struct cifsFileInfo *pCifsFile, __u32 oplock,
112 u16 netfid) 112 u16 netfid)
113{ 113{
114 114
@@ -135,15 +135,15 @@ cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
135 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 135 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
136 (file->f_path.dentry->d_inode->i_size == 136 (file->f_path.dentry->d_inode->i_size ==
137 (loff_t)le64_to_cpu(buf->EndOfFile))) { 137 (loff_t)le64_to_cpu(buf->EndOfFile))) {
138 cFYI(1, ("inode unchanged on server")); 138 cFYI(1, "inode unchanged on server");
139 } else { 139 } else {
140 if (file->f_path.dentry->d_inode->i_mapping) { 140 if (file->f_path.dentry->d_inode->i_mapping) {
141 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 141 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
142 if (rc != 0) 142 if (rc != 0)
143 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 143 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
144 } 144 }
145 cFYI(1, ("invalidating remote inode since open detected it " 145 cFYI(1, "invalidating remote inode since open detected it "
146 "changed")); 146 "changed");
147 invalidate_remote_inode(file->f_path.dentry->d_inode); 147 invalidate_remote_inode(file->f_path.dentry->d_inode);
148 } */ 148 } */
149 149
@@ -151,8 +151,8 @@ psx_client_can_cache:
151 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 151 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
152 pCifsInode->clientCanCacheAll = true; 152 pCifsInode->clientCanCacheAll = true;
153 pCifsInode->clientCanCacheRead = true; 153 pCifsInode->clientCanCacheRead = true;
154 cFYI(1, ("Exclusive Oplock granted on inode %p", 154 cFYI(1, "Exclusive Oplock granted on inode %p",
155 file->f_path.dentry->d_inode)); 155 file->f_path.dentry->d_inode);
156 } else if ((oplock & 0xF) == OPLOCK_READ) 156 } else if ((oplock & 0xF) == OPLOCK_READ)
157 pCifsInode->clientCanCacheRead = true; 157 pCifsInode->clientCanCacheRead = true;
158 158
@@ -189,8 +189,8 @@ cifs_fill_filedata(struct file *file)
189 if (file->private_data != NULL) { 189 if (file->private_data != NULL) {
190 return pCifsFile; 190 return pCifsFile;
191 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) 191 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
192 cERROR(1, ("could not find file instance for " 192 cERROR(1, "could not find file instance for "
193 "new file %p", file)); 193 "new file %p", file);
194 return NULL; 194 return NULL;
195} 195}
196 196
@@ -216,7 +216,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
216 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 216 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
217 (file->f_path.dentry->d_inode->i_size == 217 (file->f_path.dentry->d_inode->i_size ==
218 (loff_t)le64_to_cpu(buf->EndOfFile))) { 218 (loff_t)le64_to_cpu(buf->EndOfFile))) {
219 cFYI(1, ("inode unchanged on server")); 219 cFYI(1, "inode unchanged on server");
220 } else { 220 } else {
221 if (file->f_path.dentry->d_inode->i_mapping) { 221 if (file->f_path.dentry->d_inode->i_mapping) {
222 /* BB no need to lock inode until after invalidate 222 /* BB no need to lock inode until after invalidate
@@ -225,8 +225,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
225 if (rc != 0) 225 if (rc != 0)
226 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 226 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
227 } 227 }
228 cFYI(1, ("invalidating remote inode since open detected it " 228 cFYI(1, "invalidating remote inode since open detected it "
229 "changed")); 229 "changed");
230 invalidate_remote_inode(file->f_path.dentry->d_inode); 230 invalidate_remote_inode(file->f_path.dentry->d_inode);
231 } 231 }
232 232
@@ -241,8 +241,8 @@ client_can_cache:
241 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) { 241 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
242 pCifsInode->clientCanCacheAll = true; 242 pCifsInode->clientCanCacheAll = true;
243 pCifsInode->clientCanCacheRead = true; 243 pCifsInode->clientCanCacheRead = true;
244 cFYI(1, ("Exclusive Oplock granted on inode %p", 244 cFYI(1, "Exclusive Oplock granted on inode %p",
245 file->f_path.dentry->d_inode)); 245 file->f_path.dentry->d_inode);
246 } else if ((*oplock & 0xF) == OPLOCK_READ) 246 } else if ((*oplock & 0xF) == OPLOCK_READ)
247 pCifsInode->clientCanCacheRead = true; 247 pCifsInode->clientCanCacheRead = true;
248 248
@@ -284,8 +284,8 @@ int cifs_open(struct inode *inode, struct file *file)
284 return rc; 284 return rc;
285 } 285 }
286 286
287 cFYI(1, ("inode = 0x%p file flags are 0x%x for %s", 287 cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
288 inode, file->f_flags, full_path)); 288 inode, file->f_flags, full_path);
289 289
290 if (oplockEnabled) 290 if (oplockEnabled)
291 oplock = REQ_OPLOCK; 291 oplock = REQ_OPLOCK;
@@ -297,27 +297,29 @@ int cifs_open(struct inode *inode, struct file *file)
297 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 297 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
298 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 298 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
299 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 299 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
300 oflags |= SMB_O_CREAT;
300 /* can not refresh inode info since size could be stale */ 301 /* can not refresh inode info since size could be stale */
301 rc = cifs_posix_open(full_path, &inode, file->f_path.mnt, 302 rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
302 cifs_sb->mnt_file_mode /* ignored */, 303 inode->i_sb,
303 oflags, &oplock, &netfid, xid); 304 cifs_sb->mnt_file_mode /* ignored */,
305 oflags, &oplock, &netfid, xid);
304 if (rc == 0) { 306 if (rc == 0) {
305 cFYI(1, ("posix open succeeded")); 307 cFYI(1, "posix open succeeded");
306 /* no need for special case handling of setting mode 308 /* no need for special case handling of setting mode
307 on read only files needed here */ 309 on read only files needed here */
308 310
309 pCifsFile = cifs_fill_filedata(file); 311 pCifsFile = cifs_fill_filedata(file);
310 cifs_posix_open_inode_helper(inode, file, pCifsInode, 312 cifs_posix_open_inode_helper(inode, file, pCifsInode,
311 pCifsFile, oplock, netfid); 313 oplock, netfid);
312 goto out; 314 goto out;
313 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 315 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
314 if (tcon->ses->serverNOS) 316 if (tcon->ses->serverNOS)
315 cERROR(1, ("server %s of type %s returned" 317 cERROR(1, "server %s of type %s returned"
316 " unexpected error on SMB posix open" 318 " unexpected error on SMB posix open"
317 ", disabling posix open support." 319 ", disabling posix open support."
318 " Check if server update available.", 320 " Check if server update available.",
319 tcon->ses->serverName, 321 tcon->ses->serverName,
320 tcon->ses->serverNOS)); 322 tcon->ses->serverNOS);
321 tcon->broken_posix_open = true; 323 tcon->broken_posix_open = true;
322 } else if ((rc != -EIO) && (rc != -EREMOTE) && 324 } else if ((rc != -EIO) && (rc != -EREMOTE) &&
323 (rc != -EOPNOTSUPP)) /* path not found or net err */ 325 (rc != -EOPNOTSUPP)) /* path not found or net err */
@@ -385,7 +387,7 @@ int cifs_open(struct inode *inode, struct file *file)
385 & CIFS_MOUNT_MAP_SPECIAL_CHR); 387 & CIFS_MOUNT_MAP_SPECIAL_CHR);
386 } 388 }
387 if (rc) { 389 if (rc) {
388 cFYI(1, ("cifs_open returned 0x%x", rc)); 390 cFYI(1, "cifs_open returned 0x%x", rc);
389 goto out; 391 goto out;
390 } 392 }
391 393
@@ -468,7 +470,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
468 } 470 }
469 471
470 if (file->f_path.dentry == NULL) { 472 if (file->f_path.dentry == NULL) {
471 cERROR(1, ("no valid name if dentry freed")); 473 cERROR(1, "no valid name if dentry freed");
472 dump_stack(); 474 dump_stack();
473 rc = -EBADF; 475 rc = -EBADF;
474 goto reopen_error_exit; 476 goto reopen_error_exit;
@@ -476,7 +478,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
476 478
477 inode = file->f_path.dentry->d_inode; 479 inode = file->f_path.dentry->d_inode;
478 if (inode == NULL) { 480 if (inode == NULL) {
479 cERROR(1, ("inode not valid")); 481 cERROR(1, "inode not valid");
480 dump_stack(); 482 dump_stack();
481 rc = -EBADF; 483 rc = -EBADF;
482 goto reopen_error_exit; 484 goto reopen_error_exit;
@@ -498,8 +500,8 @@ reopen_error_exit:
498 return rc; 500 return rc;
499 } 501 }
500 502
501 cFYI(1, ("inode = 0x%p file flags 0x%x for %s", 503 cFYI(1, "inode = 0x%p file flags 0x%x for %s",
502 inode, file->f_flags, full_path)); 504 inode, file->f_flags, full_path);
503 505
504 if (oplockEnabled) 506 if (oplockEnabled)
505 oplock = REQ_OPLOCK; 507 oplock = REQ_OPLOCK;
@@ -512,10 +514,11 @@ reopen_error_exit:
512 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 514 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
513 /* can not refresh inode info since size could be stale */ 515 /* can not refresh inode info since size could be stale */
514 rc = cifs_posix_open(full_path, NULL, file->f_path.mnt, 516 rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
515 cifs_sb->mnt_file_mode /* ignored */, 517 inode->i_sb,
516 oflags, &oplock, &netfid, xid); 518 cifs_sb->mnt_file_mode /* ignored */,
519 oflags, &oplock, &netfid, xid);
517 if (rc == 0) { 520 if (rc == 0) {
518 cFYI(1, ("posix reopen succeeded")); 521 cFYI(1, "posix reopen succeeded");
519 goto reopen_success; 522 goto reopen_success;
520 } 523 }
521 /* fallthrough to retry open the old way on errors, especially 524 /* fallthrough to retry open the old way on errors, especially
@@ -536,8 +539,8 @@ reopen_error_exit:
536 CIFS_MOUNT_MAP_SPECIAL_CHR); 539 CIFS_MOUNT_MAP_SPECIAL_CHR);
537 if (rc) { 540 if (rc) {
538 mutex_unlock(&pCifsFile->fh_mutex); 541 mutex_unlock(&pCifsFile->fh_mutex);
539 cFYI(1, ("cifs_open returned 0x%x", rc)); 542 cFYI(1, "cifs_open returned 0x%x", rc);
540 cFYI(1, ("oplock: %d", oplock)); 543 cFYI(1, "oplock: %d", oplock);
541 } else { 544 } else {
542reopen_success: 545reopen_success:
543 pCifsFile->netfid = netfid; 546 pCifsFile->netfid = netfid;
@@ -569,8 +572,8 @@ reopen_success:
569 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 572 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
570 pCifsInode->clientCanCacheAll = true; 573 pCifsInode->clientCanCacheAll = true;
571 pCifsInode->clientCanCacheRead = true; 574 pCifsInode->clientCanCacheRead = true;
572 cFYI(1, ("Exclusive Oplock granted on inode %p", 575 cFYI(1, "Exclusive Oplock granted on inode %p",
573 file->f_path.dentry->d_inode)); 576 file->f_path.dentry->d_inode);
574 } else if ((oplock & 0xF) == OPLOCK_READ) { 577 } else if ((oplock & 0xF) == OPLOCK_READ) {
575 pCifsInode->clientCanCacheRead = true; 578 pCifsInode->clientCanCacheRead = true;
576 pCifsInode->clientCanCacheAll = false; 579 pCifsInode->clientCanCacheAll = false;
@@ -618,8 +621,7 @@ int cifs_close(struct inode *inode, struct file *file)
618 the struct would be in each open file, 621 the struct would be in each open file,
619 but this should give enough time to 622 but this should give enough time to
620 clear the socket */ 623 clear the socket */
621 cFYI(DBG2, ("close delay, write pending")); 624 cFYI(DBG2, "close delay, write pending");
623 msleep(timeout); 625 msleep(timeout);
624 timeout *= 4; 626 timeout *= 4;
625 } 627 }
@@ -652,7 +654,7 @@ int cifs_close(struct inode *inode, struct file *file)
652 654
653 read_lock(&GlobalSMBSeslock); 655 read_lock(&GlobalSMBSeslock);
654 if (list_empty(&(CIFS_I(inode)->openFileList))) { 656 if (list_empty(&(CIFS_I(inode)->openFileList))) {
655 cFYI(1, ("closing last open instance for inode %p", inode)); 657 cFYI(1, "closing last open instance for inode %p", inode);
656 /* if the file is not open we do not know if we can cache info 658 /* if the file is not open we do not know if we can cache info
657 on this inode, much less write behind and read ahead */ 659 on this inode, much less write behind and read ahead */
658 CIFS_I(inode)->clientCanCacheRead = false; 660 CIFS_I(inode)->clientCanCacheRead = false;
@@ -673,7 +675,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
673 (struct cifsFileInfo *)file->private_data; 675 (struct cifsFileInfo *)file->private_data;
674 char *ptmp; 676 char *ptmp;
675 677
676 cFYI(1, ("Closedir inode = 0x%p", inode)); 678 cFYI(1, "Closedir inode = 0x%p", inode);
677 679
678 xid = GetXid(); 680 xid = GetXid();
679 681
@@ -684,22 +686,22 @@ int cifs_closedir(struct inode *inode, struct file *file)
684 686
685 pTcon = cifs_sb->tcon; 687 pTcon = cifs_sb->tcon;
686 688
687 cFYI(1, ("Freeing private data in close dir")); 689 cFYI(1, "Freeing private data in close dir");
688 write_lock(&GlobalSMBSeslock); 690 write_lock(&GlobalSMBSeslock);
689 if (!pCFileStruct->srch_inf.endOfSearch && 691 if (!pCFileStruct->srch_inf.endOfSearch &&
690 !pCFileStruct->invalidHandle) { 692 !pCFileStruct->invalidHandle) {
691 pCFileStruct->invalidHandle = true; 693 pCFileStruct->invalidHandle = true;
692 write_unlock(&GlobalSMBSeslock); 694 write_unlock(&GlobalSMBSeslock);
693 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); 695 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
694 cFYI(1, ("Closing uncompleted readdir with rc %d", 696 cFYI(1, "Closing uncompleted readdir with rc %d",
695 rc)); 697 rc);
696 /* not much we can do if it fails anyway, ignore rc */ 698 /* not much we can do if it fails anyway, ignore rc */
697 rc = 0; 699 rc = 0;
698 } else 700 } else
699 write_unlock(&GlobalSMBSeslock); 701 write_unlock(&GlobalSMBSeslock);
700 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; 702 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
701 if (ptmp) { 703 if (ptmp) {
702 cFYI(1, ("closedir free smb buf in srch struct")); 704 cFYI(1, "closedir free smb buf in srch struct");
703 pCFileStruct->srch_inf.ntwrk_buf_start = NULL; 705 pCFileStruct->srch_inf.ntwrk_buf_start = NULL;
704 if (pCFileStruct->srch_inf.smallBuf) 706 if (pCFileStruct->srch_inf.smallBuf)
705 cifs_small_buf_release(ptmp); 707 cifs_small_buf_release(ptmp);
@@ -747,49 +749,49 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
747 rc = -EACCES; 749 rc = -EACCES;
748 xid = GetXid(); 750 xid = GetXid();
749 751
750 cFYI(1, ("Lock parm: 0x%x flockflags: " 752 cFYI(1, "Lock parm: 0x%x flockflags: "
751 "0x%x flocktype: 0x%x start: %lld end: %lld", 753 "0x%x flocktype: 0x%x start: %lld end: %lld",
752 cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, 754 cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start,
753 pfLock->fl_end)); 755 pfLock->fl_end);
754 756
755 if (pfLock->fl_flags & FL_POSIX) 757 if (pfLock->fl_flags & FL_POSIX)
756 cFYI(1, ("Posix")); 758 cFYI(1, "Posix");
757 if (pfLock->fl_flags & FL_FLOCK) 759 if (pfLock->fl_flags & FL_FLOCK)
758 cFYI(1, ("Flock")); 760 cFYI(1, "Flock");
759 if (pfLock->fl_flags & FL_SLEEP) { 761 if (pfLock->fl_flags & FL_SLEEP) {
760 cFYI(1, ("Blocking lock")); 762 cFYI(1, "Blocking lock");
761 wait_flag = true; 763 wait_flag = true;
762 } 764 }
763 if (pfLock->fl_flags & FL_ACCESS) 765 if (pfLock->fl_flags & FL_ACCESS)
764 cFYI(1, ("Process suspended by mandatory locking - " 766 cFYI(1, "Process suspended by mandatory locking - "
765 "not implemented yet")); 767 "not implemented yet");
766 if (pfLock->fl_flags & FL_LEASE) 768 if (pfLock->fl_flags & FL_LEASE)
767 cFYI(1, ("Lease on file - not implemented yet")); 769 cFYI(1, "Lease on file - not implemented yet");
768 if (pfLock->fl_flags & 770 if (pfLock->fl_flags &
769 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) 771 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
770 cFYI(1, ("Unknown lock flags 0x%x", pfLock->fl_flags)); 772 cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags);
771 773
772 if (pfLock->fl_type == F_WRLCK) { 774 if (pfLock->fl_type == F_WRLCK) {
773 cFYI(1, ("F_WRLCK ")); 775 cFYI(1, "F_WRLCK ");
774 numLock = 1; 776 numLock = 1;
775 } else if (pfLock->fl_type == F_UNLCK) { 777 } else if (pfLock->fl_type == F_UNLCK) {
776 cFYI(1, ("F_UNLCK")); 778 cFYI(1, "F_UNLCK");
777 numUnlock = 1; 779 numUnlock = 1;
778 /* Check if unlock includes more than 780 /* Check if unlock includes more than
779 one lock range */ 781 one lock range */
780 } else if (pfLock->fl_type == F_RDLCK) { 782 } else if (pfLock->fl_type == F_RDLCK) {
781 cFYI(1, ("F_RDLCK")); 783 cFYI(1, "F_RDLCK");
782 lockType |= LOCKING_ANDX_SHARED_LOCK; 784 lockType |= LOCKING_ANDX_SHARED_LOCK;
783 numLock = 1; 785 numLock = 1;
784 } else if (pfLock->fl_type == F_EXLCK) { 786 } else if (pfLock->fl_type == F_EXLCK) {
785 cFYI(1, ("F_EXLCK")); 787 cFYI(1, "F_EXLCK");
786 numLock = 1; 788 numLock = 1;
787 } else if (pfLock->fl_type == F_SHLCK) { 789 } else if (pfLock->fl_type == F_SHLCK) {
788 cFYI(1, ("F_SHLCK")); 790 cFYI(1, "F_SHLCK");
789 lockType |= LOCKING_ANDX_SHARED_LOCK; 791 lockType |= LOCKING_ANDX_SHARED_LOCK;
790 numLock = 1; 792 numLock = 1;
791 } else 793 } else
792 cFYI(1, ("Unknown type of lock")); 794 cFYI(1, "Unknown type of lock");
793 795
794 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 796 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
795 tcon = cifs_sb->tcon; 797 tcon = cifs_sb->tcon;
@@ -832,14 +834,38 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
832 0 /* wait flag */ ); 834 0 /* wait flag */ );
833 pfLock->fl_type = F_UNLCK; 835 pfLock->fl_type = F_UNLCK;
834 if (rc != 0) 836 if (rc != 0)
835 cERROR(1, ("Error unlocking previously locked " 837 cERROR(1, "Error unlocking previously locked "
836 "range %d during test of lock", rc)); 838 "range %d during test of lock", rc);
837 rc = 0; 839 rc = 0;
838 840
839 } else { 841 } else {
840 /* if rc == ERR_SHARING_VIOLATION ? */ 842 /* if rc == ERR_SHARING_VIOLATION ? */
841 rc = 0; /* do not change lock type to unlock 843 rc = 0;
842 since range in use */ 844
845 if (lockType & LOCKING_ANDX_SHARED_LOCK) {
846 pfLock->fl_type = F_WRLCK;
847 } else {
848 rc = CIFSSMBLock(xid, tcon, netfid, length,
849 pfLock->fl_start, 0, 1,
850 lockType | LOCKING_ANDX_SHARED_LOCK,
851 0 /* wait flag */);
852 if (rc == 0) {
853 rc = CIFSSMBLock(xid, tcon, netfid,
854 length, pfLock->fl_start, 1, 0,
855 lockType |
856 LOCKING_ANDX_SHARED_LOCK,
857 0 /* wait flag */);
858 pfLock->fl_type = F_RDLCK;
859 if (rc != 0)
860 cERROR(1, "Error unlocking "
861 "previously locked range %d "
862 "during test of lock", rc);
863 rc = 0;
864 } else {
865 pfLock->fl_type = F_WRLCK;
866 rc = 0;
867 }
868 }
843 } 869 }
844 870
845 FreeXid(xid); 871 FreeXid(xid);
@@ -898,9 +924,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
898 1, 0, li->type, false); 924 1, 0, li->type, false);
899 if (stored_rc) 925 if (stored_rc)
900 rc = stored_rc; 926 rc = stored_rc;
901 927 else {
902 list_del(&li->llist); 928 list_del(&li->llist);
903 kfree(li); 929 kfree(li);
930 }
904 } 931 }
905 } 932 }
906 mutex_unlock(&fid->lock_mutex); 933 mutex_unlock(&fid->lock_mutex);
@@ -963,9 +990,8 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
963 990
964 pTcon = cifs_sb->tcon; 991 pTcon = cifs_sb->tcon;
965 992
966 /* cFYI(1, (" write %d bytes to offset %lld of %s", write_size, *poffset, file->f_path.dentry->d_name.name)); */ 993 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size, *poffset, file->f_path.dentry->d_name.name); */
969 995
970 if (file->private_data == NULL) 996 if (file->private_data == NULL)
971 return -EBADF; 997 return -EBADF;
@@ -1066,8 +1092,8 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1066 1092
1067 pTcon = cifs_sb->tcon; 1093 pTcon = cifs_sb->tcon;
1068 1094
1069 cFYI(1, ("write %zd bytes to offset %lld of %s", write_size, 1095 cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
1070 *poffset, file->f_path.dentry->d_name.name)); 1096 *poffset, file->f_path.dentry->d_name.name);
1071 1097
1072 if (file->private_data == NULL) 1098 if (file->private_data == NULL)
1073 return -EBADF; 1099 return -EBADF;
@@ -1208,7 +1234,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1208 it being zero) during stress testcases so we need to check for it */ 1234 it being zero) during stress testcases so we need to check for it */
1209 1235
1210 if (cifs_inode == NULL) { 1236 if (cifs_inode == NULL) {
1211 cERROR(1, ("Null inode passed to cifs_writeable_file")); 1237 cERROR(1, "Null inode passed to cifs_writeable_file");
1212 dump_stack(); 1238 dump_stack();
1213 return NULL; 1239 return NULL;
1214 } 1240 }
@@ -1252,7 +1278,7 @@ refind_writable:
1252 again. Note that it would be bad 1278 again. Note that it would be bad
1253 to hold up writepages here (rather than 1279 to hold up writepages here (rather than
1254 in caller) with continuous retries */ 1280 in caller) with continuous retries */
1255 cFYI(1, ("wp failed on reopen file")); 1281 cFYI(1, "wp failed on reopen file");
1256 read_lock(&GlobalSMBSeslock); 1282 read_lock(&GlobalSMBSeslock);
1257 /* can not use this handle, no write 1283 /* can not use this handle, no write
1258 pending on this one after all */ 1284 pending on this one after all */
@@ -1328,7 +1354,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1328 else if (bytes_written < 0) 1354 else if (bytes_written < 0)
1329 rc = bytes_written; 1355 rc = bytes_written;
1330 } else { 1356 } else {
1331 cFYI(1, ("No writeable filehandles for inode")); 1357 cFYI(1, "No writeable filehandles for inode");
1332 rc = -EIO; 1358 rc = -EIO;
1333 } 1359 }
1334 1360
@@ -1500,7 +1526,7 @@ retry:
1500 */ 1526 */
1501 open_file = find_writable_file(CIFS_I(mapping->host)); 1527 open_file = find_writable_file(CIFS_I(mapping->host));
1502 if (!open_file) { 1528 if (!open_file) {
1503 cERROR(1, ("No writable handles for inode")); 1529 cERROR(1, "No writable handles for inode");
1504 rc = -EBADF; 1530 rc = -EBADF;
1505 } else { 1531 } else {
1506 long_op = cifs_write_timeout(cifsi, offset); 1532 long_op = cifs_write_timeout(cifsi, offset);
@@ -1513,8 +1539,8 @@ retry:
1513 cifs_update_eof(cifsi, offset, bytes_written); 1539 cifs_update_eof(cifsi, offset, bytes_written);
1514 1540
1515 if (rc || bytes_written < bytes_to_write) { 1541 if (rc || bytes_written < bytes_to_write) {
1516 cERROR(1, ("Write2 ret %d, wrote %d", 1542 cERROR(1, "Write2 ret %d, wrote %d",
1517 rc, bytes_written)); 1543 rc, bytes_written);
1518 /* BB what if continued retry is 1544 /* BB what if continued retry is
1519 requested via mount flags? */ 1545 requested via mount flags? */
1520 if (rc == -ENOSPC) 1546 if (rc == -ENOSPC)
@@ -1575,7 +1601,7 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
1575/* BB add check for wbc flags */ 1601/* BB add check for wbc flags */
1576 page_cache_get(page); 1602 page_cache_get(page);
1577 if (!PageUptodate(page)) 1603 if (!PageUptodate(page))
1578 cFYI(1, ("ppw - page not up to date")); 1604 cFYI(1, "ppw - page not up to date");
1579 1605
1580 /* 1606 /*
1581 * Set the "writeback" flag, and clear "dirty" in the radix tree. 1607 * Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -1604,8 +1630,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1604 int rc; 1630 int rc;
1605 struct inode *inode = mapping->host; 1631 struct inode *inode = mapping->host;
1606 1632
1607 cFYI(1, ("write_end for page %p from pos %lld with %d bytes", 1633 cFYI(1, "write_end for page %p from pos %lld with %d bytes",
1608 page, pos, copied)); 1634 page, pos, copied);
1609 1635
1610 if (PageChecked(page)) { 1636 if (PageChecked(page)) {
1611 if (copied == len) 1637 if (copied == len)
@@ -1650,7 +1676,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1650 return rc; 1676 return rc;
1651} 1677}
1652 1678
1653int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1679int cifs_fsync(struct file *file, int datasync)
1654{ 1680{
1655 int xid; 1681 int xid;
1656 int rc = 0; 1682 int rc = 0;
@@ -1661,8 +1687,8 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1661 1687
1662 xid = GetXid(); 1688 xid = GetXid();
1663 1689
1664 cFYI(1, ("Sync file - name: %s datasync: 0x%x", 1690 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1665 dentry->d_name.name, datasync)); 1691 file->f_path.dentry->d_name.name, datasync);
1666 1692
1667 rc = filemap_write_and_wait(inode->i_mapping); 1693 rc = filemap_write_and_wait(inode->i_mapping);
1668 if (rc == 0) { 1694 if (rc == 0) {
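
The hunk above tracks a VFS-wide prototype change: ->fsync no longer receives a dentry, only the file and the datasync flag, so the name used for logging is taken from file->f_path.dentry instead. From userspace the datasync distinction is fsync(2) versus fdatasync(2); a small runnable illustration (the ./out.dat path is an assumption):

/* Userspace view of the 'datasync' flag the kernel hook receives:
 * fdatasync() may skip pure-metadata updates (e.g. timestamps) that
 * fsync() would also flush. Illustrative only. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("./out.dat", O_WRONLY | O_CREAT, 0644);
	if (fd < 0) { perror("open"); return 1; }

	if (write(fd, "payload\n", 8) != 8) { perror("write"); return 1; }

	if (fdatasync(fd) < 0)	/* data (+ size) only: datasync != 0 */
		perror("fdatasync");
	if (fsync(fd) < 0)	/* data and all metadata: datasync == 0 */
		perror("fsync");

	close(fd);
	return 0;
}
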
@@ -1686,7 +1712,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1686 unsigned int rpages = 0; 1712 unsigned int rpages = 0;
1687 int rc = 0; 1713 int rc = 0;
1688 1714
1689 cFYI(1, ("sync page %p",page)); 1715 cFYI(1, "sync page %p", page);
1690 mapping = page->mapping; 1716 mapping = page->mapping;
1691 if (!mapping) 1717 if (!mapping)
1692 return 0; 1718 return 0;
@@ -1697,7 +1723,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1697/* fill in rpages then 1723/* fill in rpages then
1698 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */ 1724 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
1699 1725
1700/* cFYI(1, ("rpages is %d for sync page of Index %ld", rpages, index)); 1726/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
1701 1727
1702#if 0 1728#if 0
1703 if (rc < 0) 1729 if (rc < 0)
@@ -1731,7 +1757,7 @@ int cifs_flush(struct file *file, fl_owner_t id)
1731 CIFS_I(inode)->write_behind_rc = 0; 1757 CIFS_I(inode)->write_behind_rc = 0;
1732 } 1758 }
1733 1759
1734 cFYI(1, ("Flush inode %p file %p rc %d", inode, file, rc)); 1760 cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
1735 1761
1736 return rc; 1762 return rc;
1737} 1763}
@@ -1763,7 +1789,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1763 open_file = (struct cifsFileInfo *)file->private_data; 1789 open_file = (struct cifsFileInfo *)file->private_data;
1764 1790
1765 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1791 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1766 cFYI(1, ("attempting read on write only file instance")); 1792 cFYI(1, "attempting read on write only file instance");
1767 1793
1768 for (total_read = 0, current_offset = read_data; 1794 for (total_read = 0, current_offset = read_data;
1769 read_size > total_read; 1795 read_size > total_read;
@@ -1844,7 +1870,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1844 open_file = (struct cifsFileInfo *)file->private_data; 1870 open_file = (struct cifsFileInfo *)file->private_data;
1845 1871
1846 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1872 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1847 cFYI(1, ("attempting read on write only file instance")); 1873 cFYI(1, "attempting read on write only file instance");
1848 1874
1849 for (total_read = 0, current_offset = read_data; 1875 for (total_read = 0, current_offset = read_data;
1850 read_size > total_read; 1876 read_size > total_read;
@@ -1895,7 +1921,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1895 xid = GetXid(); 1921 xid = GetXid();
1896 rc = cifs_revalidate_file(file); 1922 rc = cifs_revalidate_file(file);
1897 if (rc) { 1923 if (rc) {
1898 cFYI(1, ("Validation prior to mmap failed, error=%d", rc)); 1924 cFYI(1, "Validation prior to mmap failed, error=%d", rc);
1899 FreeXid(xid); 1925 FreeXid(xid);
1900 return rc; 1926 return rc;
1901 } 1927 }
@@ -1906,8 +1932,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1906 1932
1907 1933
1908static void cifs_copy_cache_pages(struct address_space *mapping, 1934static void cifs_copy_cache_pages(struct address_space *mapping,
1909 struct list_head *pages, int bytes_read, char *data, 1935 struct list_head *pages, int bytes_read, char *data)
1910 struct pagevec *plru_pvec)
1911{ 1936{
1912 struct page *page; 1937 struct page *page;
1913 char *target; 1938 char *target;
@@ -1919,10 +1944,10 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
1919 page = list_entry(pages->prev, struct page, lru); 1944 page = list_entry(pages->prev, struct page, lru);
1920 list_del(&page->lru); 1945 list_del(&page->lru);
1921 1946
1922 if (add_to_page_cache(page, mapping, page->index, 1947 if (add_to_page_cache_lru(page, mapping, page->index,
1923 GFP_KERNEL)) { 1948 GFP_KERNEL)) {
1924 page_cache_release(page); 1949 page_cache_release(page);
1925 cFYI(1, ("Add page cache failed")); 1950 cFYI(1, "Add page cache failed");
1926 data += PAGE_CACHE_SIZE; 1951 data += PAGE_CACHE_SIZE;
1927 bytes_read -= PAGE_CACHE_SIZE; 1952 bytes_read -= PAGE_CACHE_SIZE;
1928 continue; 1953 continue;
@@ -1945,8 +1970,6 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
1945 flush_dcache_page(page); 1970 flush_dcache_page(page);
1946 SetPageUptodate(page); 1971 SetPageUptodate(page);
1947 unlock_page(page); 1972 unlock_page(page);
1948 if (!pagevec_add(plru_pvec, page))
1949 __pagevec_lru_add_file(plru_pvec);
1950 data += PAGE_CACHE_SIZE; 1973 data += PAGE_CACHE_SIZE;
1951 } 1974 }
1952 return; 1975 return;
@@ -1965,7 +1988,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1965 unsigned int read_size, i; 1988 unsigned int read_size, i;
1966 char *smb_read_data = NULL; 1989 char *smb_read_data = NULL;
1967 struct smb_com_read_rsp *pSMBr; 1990 struct smb_com_read_rsp *pSMBr;
1968 struct pagevec lru_pvec;
1969 struct cifsFileInfo *open_file; 1991 struct cifsFileInfo *open_file;
1970 int buf_type = CIFS_NO_BUFFER; 1992 int buf_type = CIFS_NO_BUFFER;
1971 1993
@@ -1979,8 +2001,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1979 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 2001 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1980 pTcon = cifs_sb->tcon; 2002 pTcon = cifs_sb->tcon;
1981 2003
1982 pagevec_init(&lru_pvec, 0); 2004 cFYI(DBG2, "rpages: num pages %d", num_pages);
1983 cFYI(DBG2, ("rpages: num pages %d", num_pages));
1984 for (i = 0; i < num_pages; ) { 2005 for (i = 0; i < num_pages; ) {
1985 unsigned contig_pages; 2006 unsigned contig_pages;
1986 struct page *tmp_page; 2007 struct page *tmp_page;
@@ -2013,8 +2034,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2013 /* Read size needs to be in multiples of one page */ 2034 /* Read size needs to be in multiples of one page */
2014 read_size = min_t(const unsigned int, read_size, 2035 read_size = min_t(const unsigned int, read_size,
2015 cifs_sb->rsize & PAGE_CACHE_MASK); 2036 cifs_sb->rsize & PAGE_CACHE_MASK);
2016 cFYI(DBG2, ("rpages: read size 0x%x contiguous pages %d", 2037 cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d",
2017 read_size, contig_pages)); 2038 read_size, contig_pages);
2018 rc = -EAGAIN; 2039 rc = -EAGAIN;
2019 while (rc == -EAGAIN) { 2040 while (rc == -EAGAIN) {
2020 if ((open_file->invalidHandle) && 2041 if ((open_file->invalidHandle) &&
@@ -2041,14 +2062,14 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2041 } 2062 }
2042 } 2063 }
2043 if ((rc < 0) || (smb_read_data == NULL)) { 2064 if ((rc < 0) || (smb_read_data == NULL)) {
2044 cFYI(1, ("Read error in readpages: %d", rc)); 2065 cFYI(1, "Read error in readpages: %d", rc);
2045 break; 2066 break;
2046 } else if (bytes_read > 0) { 2067 } else if (bytes_read > 0) {
2047 task_io_account_read(bytes_read); 2068 task_io_account_read(bytes_read);
2048 pSMBr = (struct smb_com_read_rsp *)smb_read_data; 2069 pSMBr = (struct smb_com_read_rsp *)smb_read_data;
2049 cifs_copy_cache_pages(mapping, page_list, bytes_read, 2070 cifs_copy_cache_pages(mapping, page_list, bytes_read,
2050 smb_read_data + 4 /* RFC1001 hdr */ + 2071 smb_read_data + 4 /* RFC1001 hdr */ +
2051 le16_to_cpu(pSMBr->DataOffset), &lru_pvec); 2072 le16_to_cpu(pSMBr->DataOffset));
2052 2073
2053 i += bytes_read >> PAGE_CACHE_SHIFT; 2074 i += bytes_read >> PAGE_CACHE_SHIFT;
2054 cifs_stats_bytes_read(pTcon, bytes_read); 2075 cifs_stats_bytes_read(pTcon, bytes_read);
@@ -2064,9 +2085,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2064 /* break; */ 2085 /* break; */
2065 } 2086 }
2066 } else { 2087 } else {
2067 cFYI(1, ("No bytes read (%d) at offset %lld . " 2088 cFYI(1, "No bytes read (%d) at offset %lld . "
2068 "Cleaning remaining pages from readahead list", 2089 "Cleaning remaining pages from readahead list",
2069 bytes_read, offset)); 2090 bytes_read, offset);
2070 /* BB turn off caching and do new lookup on 2091 /* BB turn off caching and do new lookup on
2071 file size at server? */ 2092 file size at server? */
2072 break; 2093 break;
@@ -2081,8 +2102,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2081 bytes_read = 0; 2102 bytes_read = 0;
2082 } 2103 }
2083 2104
2084 pagevec_lru_add_file(&lru_pvec);
2085
2086/* need to free smb_read_data buf before exit */ 2105/* need to free smb_read_data buf before exit */
2087 if (smb_read_data) { 2106 if (smb_read_data) {
2088 if (buf_type == CIFS_SMALL_BUFFER) 2107 if (buf_type == CIFS_SMALL_BUFFER)
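
The readpages hunks above drop cifs's private LRU batching: add_to_page_cache_lru() inserts a page into the page cache and onto the file LRU in one call, so the struct pagevec argument, the pagevec_add()/__pagevec_lru_add_file() pair, and the final pagevec_lru_add_file() flush all disappear. A kernel-style sketch of the resulting loop, assuming the 2.6.3x page-cache API this patch targets, with copy_into() as a hypothetical stand-in for the memcpy/flush_dcache_page/SetPageUptodate body (this compiles only in a kernel tree of that era):

/* Kernel-style sketch; not a drop-in replacement for the cifs code. */
#include <linux/pagemap.h>
#include <linux/list.h>

extern void copy_into(struct page *page, char *data);	/* hypothetical */

static void fill_pages(struct address_space *mapping,
		       struct list_head *pages, char *data)
{
	struct page *page;

	while (!list_empty(pages)) {
		page = list_entry(pages->prev, struct page, lru);
		list_del(&page->lru);

		/* one call now handles both page-cache insertion
		 * and LRU placement */
		if (add_to_page_cache_lru(page, mapping, page->index,
					  GFP_KERNEL)) {
			page_cache_release(page);
			data += PAGE_CACHE_SIZE;
			continue;
		}

		copy_into(page, data);
		unlock_page(page);
		data += PAGE_CACHE_SIZE;
	}
}
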
@@ -2111,7 +2130,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
2111 if (rc < 0) 2130 if (rc < 0)
2112 goto io_error; 2131 goto io_error;
2113 else 2132 else
2114 cFYI(1, ("Bytes read %d", rc)); 2133 cFYI(1, "Bytes read %d", rc);
2115 2134
2116 file->f_path.dentry->d_inode->i_atime = 2135 file->f_path.dentry->d_inode->i_atime =
2117 current_fs_time(file->f_path.dentry->d_inode->i_sb); 2136 current_fs_time(file->f_path.dentry->d_inode->i_sb);
@@ -2143,8 +2162,8 @@ static int cifs_readpage(struct file *file, struct page *page)
2143 return rc; 2162 return rc;
2144 } 2163 }
2145 2164
2146 cFYI(1, ("readpage %p at offset %d 0x%x\n", 2165 cFYI(1, "readpage %p at offset %d 0x%x\n",
2147 page, (int)offset, (int)offset)); 2166 page, (int)offset, (int)offset);
2148 2167
2149 rc = cifs_readpage_worker(file, page, &offset); 2168 rc = cifs_readpage_worker(file, page, &offset);
2150 2169
@@ -2214,7 +2233,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
2214 struct page *page; 2233 struct page *page;
2215 int rc = 0; 2234 int rc = 0;
2216 2235
2217 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); 2236 cFYI(1, "write_begin from %lld len %d", (long long)pos, len);
2218 2237
2219 page = grab_cache_page_write_begin(mapping, index, flags); 2238 page = grab_cache_page_write_begin(mapping, index, flags);
2220 if (!page) { 2239 if (!page) {
@@ -2286,12 +2305,10 @@ cifs_oplock_break(struct slow_work *work)
2286 int rc, waitrc = 0; 2305 int rc, waitrc = 0;
2287 2306
2288 if (inode && S_ISREG(inode->i_mode)) { 2307 if (inode && S_ISREG(inode->i_mode)) {
2289#ifdef CONFIG_CIFS_EXPERIMENTAL 2308 if (cinode->clientCanCacheRead)
2290 if (cinode->clientCanCacheAll == 0)
2291 break_lease(inode, O_RDONLY); 2309 break_lease(inode, O_RDONLY);
2292 else if (cinode->clientCanCacheRead == 0) 2310 else
2293 break_lease(inode, O_WRONLY); 2311 break_lease(inode, O_WRONLY);
2294#endif
2295 rc = filemap_fdatawrite(inode->i_mapping); 2312 rc = filemap_fdatawrite(inode->i_mapping);
2296 if (cinode->clientCanCacheRead == 0) { 2313 if (cinode->clientCanCacheRead == 0) {
2297 waitrc = filemap_fdatawait(inode->i_mapping); 2314 waitrc = filemap_fdatawait(inode->i_mapping);
@@ -2301,7 +2318,7 @@ cifs_oplock_break(struct slow_work *work)
2301 rc = waitrc; 2318 rc = waitrc;
2302 if (rc) 2319 if (rc)
2303 cinode->write_behind_rc = rc; 2320 cinode->write_behind_rc = rc;
2304 cFYI(1, ("Oplock flush inode %p rc %d", inode, rc)); 2321 cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
2305 } 2322 }
2306 2323
2307 /* 2324 /*
@@ -2313,7 +2330,7 @@ cifs_oplock_break(struct slow_work *work)
2313 if (!cfile->closePend && !cfile->oplock_break_cancelled) { 2330 if (!cfile->closePend && !cfile->oplock_break_cancelled) {
2314 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0, 2331 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
2315 LOCKING_ANDX_OPLOCK_RELEASE, false); 2332 LOCKING_ANDX_OPLOCK_RELEASE, false);
2316 cFYI(1, ("Oplock release rc = %d", rc)); 2333 cFYI(1, "Oplock release rc = %d", rc);
2317 } 2334 }
2318} 2335}
2319 2336
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 723daaccbd0e..62b324f26a56 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/inode.c 2 * fs/cifs/inode.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2010
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <asm/div64.h> 25#include <asm/div64.h>
25#include "cifsfs.h" 26#include "cifsfs.h"
@@ -85,30 +86,30 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
85{ 86{
86 struct cifsInodeInfo *cifs_i = CIFS_I(inode); 87 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
87 88
88 cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid)); 89 cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid);
89 90
90 if (inode->i_state & I_NEW) { 91 if (inode->i_state & I_NEW) {
91 cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid)); 92 cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid);
92 return; 93 return;
93 } 94 }
94 95
95 /* don't bother with revalidation if we have an oplock */ 96 /* don't bother with revalidation if we have an oplock */
96 if (cifs_i->clientCanCacheRead) { 97 if (cifs_i->clientCanCacheRead) {
97 cFYI(1, ("%s: inode %llu is oplocked", __func__, 98 cFYI(1, "%s: inode %llu is oplocked", __func__,
98 cifs_i->uniqueid)); 99 cifs_i->uniqueid);
99 return; 100 return;
100 } 101 }
101 102
102 /* revalidate if mtime or size have changed */ 103 /* revalidate if mtime or size have changed */
103 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) && 104 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
104 cifs_i->server_eof == fattr->cf_eof) { 105 cifs_i->server_eof == fattr->cf_eof) {
105 cFYI(1, ("%s: inode %llu is unchanged", __func__, 106 cFYI(1, "%s: inode %llu is unchanged", __func__,
106 cifs_i->uniqueid)); 107 cifs_i->uniqueid);
107 return; 108 return;
108 } 109 }
109 110
110 cFYI(1, ("%s: invalidating inode %llu mapping", __func__, 111 cFYI(1, "%s: invalidating inode %llu mapping", __func__,
111 cifs_i->uniqueid)); 112 cifs_i->uniqueid);
112 cifs_i->invalid_mapping = true; 113 cifs_i->invalid_mapping = true;
113} 114}
114 115
@@ -136,15 +137,14 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
136 inode->i_mode = fattr->cf_mode; 137 inode->i_mode = fattr->cf_mode;
137 138
138 cifs_i->cifsAttrs = fattr->cf_cifsattrs; 139 cifs_i->cifsAttrs = fattr->cf_cifsattrs;
139 cifs_i->uniqueid = fattr->cf_uniqueid;
140 140
141 if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) 141 if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
142 cifs_i->time = 0; 142 cifs_i->time = 0;
143 else 143 else
144 cifs_i->time = jiffies; 144 cifs_i->time = jiffies;
145 145
146 cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode, 146 cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
147 oldtime, cifs_i->time)); 147 oldtime, cifs_i->time);
148 148
149 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; 149 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
150 150
@@ -169,6 +169,17 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
169 cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL); 169 cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
170} 170}
171 171
172void
173cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr)
174{
175 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
176
177 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
178 return;
179
180 fattr->cf_uniqueid = iunique(sb, ROOT_I);
181}
182
172/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */ 183/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
173void 184void
174cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, 185cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
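
The cifs_fill_uniqueid() helper added above assigns a locally generated inode number via iunique(sb, ROOT_I) whenever the mount is not using server-supplied inode numbers (CIFS_MOUNT_SERVER_INUM clear); iunique() hands out a value unique among inodes on that superblock, above the reserved floor. A small userspace analogue of that allocator, with in_use() as a hypothetical stand-in for the inode-hash probe:

/* Userspace analogue of an iunique()-style allocator: scan upward
 * for a number above a reserved floor that is not currently in use. */
#include <stdbool.h>

#define ROOT_INO 1UL	/* reserved floor, playing the role of ROOT_I */

extern bool in_use(unsigned long ino);	/* hypothetical lookup */

unsigned long iunique_like(void)
{
	static unsigned long counter = ROOT_INO;
	unsigned long ino;

	do {
		if (++counter <= ROOT_INO)	/* handle wraparound */
			counter = ROOT_INO + 1;
		ino = counter;
	} while (in_use(ino));

	return ino;
}
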
@@ -226,7 +237,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
226 /* safest to call it a file if we do not know */ 237 /* safest to call it a file if we do not know */
227 fattr->cf_mode |= S_IFREG; 238 fattr->cf_mode |= S_IFREG;
228 fattr->cf_dtype = DT_REG; 239 fattr->cf_dtype = DT_REG;
229 cFYI(1, ("unknown type %d", le32_to_cpu(info->Type))); 240 cFYI(1, "unknown type %d", le32_to_cpu(info->Type));
230 break; 241 break;
231 } 242 }
232 243
@@ -255,7 +266,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
255{ 266{
256 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 267 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
257 268
258 cFYI(1, ("creating fake fattr for DFS referral")); 269 cFYI(1, "creating fake fattr for DFS referral");
259 270
260 memset(fattr, 0, sizeof(*fattr)); 271 memset(fattr, 0, sizeof(*fattr));
261 fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; 272 fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -304,7 +315,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
304 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 315 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
305 316
306 tcon = cifs_sb->tcon; 317 tcon = cifs_sb->tcon;
307 cFYI(1, ("Getting info on %s", full_path)); 318 cFYI(1, "Getting info on %s", full_path);
308 319
309 /* could have done a find first instead but this returns more info */ 320 /* could have done a find first instead but this returns more info */
310 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, 321 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
@@ -322,6 +333,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
322 333
323 if (*pinode == NULL) { 334 if (*pinode == NULL) {
324 /* get new inode */ 335 /* get new inode */
336 cifs_fill_uniqueid(sb, &fattr);
325 *pinode = cifs_iget(sb, &fattr); 337 *pinode = cifs_iget(sb, &fattr);
326 if (!*pinode) 338 if (!*pinode)
327 rc = -ENOMEM; 339 rc = -ENOMEM;
@@ -372,7 +384,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
372 &bytes_read, &pbuf, &buf_type); 384 &bytes_read, &pbuf, &buf_type);
373 if ((rc == 0) && (bytes_read >= 8)) { 385 if ((rc == 0) && (bytes_read >= 8)) {
374 if (memcmp("IntxBLK", pbuf, 8) == 0) { 386 if (memcmp("IntxBLK", pbuf, 8) == 0) {
375 cFYI(1, ("Block device")); 387 cFYI(1, "Block device");
376 fattr->cf_mode |= S_IFBLK; 388 fattr->cf_mode |= S_IFBLK;
377 fattr->cf_dtype = DT_BLK; 389 fattr->cf_dtype = DT_BLK;
378 if (bytes_read == 24) { 390 if (bytes_read == 24) {
@@ -384,7 +396,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
384 fattr->cf_rdev = MKDEV(mjr, mnr); 396 fattr->cf_rdev = MKDEV(mjr, mnr);
385 } 397 }
386 } else if (memcmp("IntxCHR", pbuf, 8) == 0) { 398 } else if (memcmp("IntxCHR", pbuf, 8) == 0) {
387 cFYI(1, ("Char device")); 399 cFYI(1, "Char device");
388 fattr->cf_mode |= S_IFCHR; 400 fattr->cf_mode |= S_IFCHR;
389 fattr->cf_dtype = DT_CHR; 401 fattr->cf_dtype = DT_CHR;
390 if (bytes_read == 24) { 402 if (bytes_read == 24) {
@@ -396,7 +408,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
396 fattr->cf_rdev = MKDEV(mjr, mnr); 408 fattr->cf_rdev = MKDEV(mjr, mnr);
397 } 409 }
398 } else if (memcmp("IntxLNK", pbuf, 7) == 0) { 410 } else if (memcmp("IntxLNK", pbuf, 7) == 0) {
399 cFYI(1, ("Symlink")); 411 cFYI(1, "Symlink");
400 fattr->cf_mode |= S_IFLNK; 412 fattr->cf_mode |= S_IFLNK;
401 fattr->cf_dtype = DT_LNK; 413 fattr->cf_dtype = DT_LNK;
402 } else { 414 } else {
@@ -438,10 +450,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
438 else if (rc > 3) { 450 else if (rc > 3) {
439 mode = le32_to_cpu(*((__le32 *)ea_value)); 451 mode = le32_to_cpu(*((__le32 *)ea_value));
440 fattr->cf_mode &= ~SFBITS_MASK; 452 fattr->cf_mode &= ~SFBITS_MASK;
441 cFYI(1, ("special bits 0%o org mode 0%o", mode, 453 cFYI(1, "special bits 0%o org mode 0%o", mode,
442 fattr->cf_mode)); 454 fattr->cf_mode);
443 fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode; 455 fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
444 cFYI(1, ("special mode bits 0%o", mode)); 456 cFYI(1, "special mode bits 0%o", mode);
445 } 457 }
446 458
447 return 0; 459 return 0;
@@ -547,11 +559,11 @@ int cifs_get_inode_info(struct inode **pinode,
547 struct cifs_fattr fattr; 559 struct cifs_fattr fattr;
548 560
549 pTcon = cifs_sb->tcon; 561 pTcon = cifs_sb->tcon;
550 cFYI(1, ("Getting info on %s", full_path)); 562 cFYI(1, "Getting info on %s", full_path);
551 563
552 if ((pfindData == NULL) && (*pinode != NULL)) { 564 if ((pfindData == NULL) && (*pinode != NULL)) {
553 if (CIFS_I(*pinode)->clientCanCacheRead) { 565 if (CIFS_I(*pinode)->clientCanCacheRead) {
554 cFYI(1, ("No need to revalidate cached inode sizes")); 566 cFYI(1, "No need to revalidate cached inode sizes");
555 return rc; 567 return rc;
556 } 568 }
557 } 569 }
@@ -617,7 +629,7 @@ int cifs_get_inode_info(struct inode **pinode,
617 cifs_sb->mnt_cifs_flags & 629 cifs_sb->mnt_cifs_flags &
618 CIFS_MOUNT_MAP_SPECIAL_CHR); 630 CIFS_MOUNT_MAP_SPECIAL_CHR);
619 if (rc1 || !fattr.cf_uniqueid) { 631 if (rc1 || !fattr.cf_uniqueid) {
620 cFYI(1, ("GetSrvInodeNum rc %d", rc1)); 632 cFYI(1, "GetSrvInodeNum rc %d", rc1);
621 fattr.cf_uniqueid = iunique(sb, ROOT_I); 633 fattr.cf_uniqueid = iunique(sb, ROOT_I);
622 cifs_autodisable_serverino(cifs_sb); 634 cifs_autodisable_serverino(cifs_sb);
623 } 635 }
@@ -633,13 +645,13 @@ int cifs_get_inode_info(struct inode **pinode,
633 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { 645 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
634 tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); 646 tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
635 if (tmprc) 647 if (tmprc)
636 cFYI(1, ("cifs_sfu_type failed: %d", tmprc)); 648 cFYI(1, "cifs_sfu_type failed: %d", tmprc);
637 } 649 }
638 650
639#ifdef CONFIG_CIFS_EXPERIMENTAL 651#ifdef CONFIG_CIFS_EXPERIMENTAL
640 /* fill in 0777 bits from ACL */ 652 /* fill in 0777 bits from ACL */
641 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 653 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
642 cFYI(1, ("Getting mode bits from ACL")); 654 cFYI(1, "Getting mode bits from ACL");
643 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid); 655 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
644 } 656 }
645#endif 657#endif
@@ -714,6 +726,16 @@ cifs_find_inode(struct inode *inode, void *opaque)
714 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) 726 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
715 return 0; 727 return 0;
716 728
729 /*
730 * uh oh -- it's a directory. We can't use it since hardlinked dirs are
731 * verboten. Disable serverino and return it as if it were found, the
732 * caller can discard it, generate a uniqueid and retry the find
733 */
734 if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
735 fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
736 cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
737 }
738
717 return 1; 739 return 1;
718} 740}
719 741
@@ -733,15 +755,22 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
733 unsigned long hash; 755 unsigned long hash;
734 struct inode *inode; 756 struct inode *inode;
735 757
736 cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid)); 758retry_iget5_locked:
759 cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid);
737 760
738 /* hash down to 32-bits on 32-bit arch */ 761 /* hash down to 32-bits on 32-bit arch */
739 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); 762 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
740 763
741 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); 764 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
742
743 /* we have fattrs in hand, update the inode */
744 if (inode) { 765 if (inode) {
766 /* was there a problematic inode number collision? */
767 if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
768 iput(inode);
769 fattr->cf_uniqueid = iunique(sb, ROOT_I);
770 fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
771 goto retry_iget5_locked;
772 }
773
745 cifs_fattr_to_inode(inode, fattr); 774 cifs_fattr_to_inode(inode, fattr);
746 if (sb->s_flags & MS_NOATIME) 775 if (sb->s_flags & MS_NOATIME)
747 inode->i_flags |= S_NOATIME | S_NOCMTIME; 776 inode->i_flags |= S_NOATIME | S_NOCMTIME;
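
Taken together with the cifs_find_inode() change earlier, the retry loop above handles an inode-number collision: a match that turns out to be a directory already wired into the dentry tree cannot be reused (hardlinked directories are forbidden), so cifs_iget() drops that inode, mints a fresh local uniqueid, and repeats the iget5_locked() lookup; the second pass cannot collide again. A compact sketch of that flag-and-retry shape, where lookup(), drop(), new_key() and the COLLISION flag are hypothetical stand-ins for iget5_locked(), iput(), iunique() and CIFS_FATTR_INO_COLLISION:

#include <stddef.h>

#define COLLISION 0x1

struct attr { unsigned long key; unsigned flags; };

extern void *lookup(unsigned long key, struct attr *a);	/* may set flags */
extern void drop(void *obj);
extern unsigned long new_key(void);

void *get_object(struct attr *a)
{
	void *obj;

retry:
	obj = lookup(a->key, a);
	if (obj && (a->flags & COLLISION)) {
		drop(obj);		/* unusable hit: let it go */
		a->key = new_key();	/* fresh, locally unique key */
		a->flags &= ~COLLISION;
		goto retry;		/* bounded: cannot collide twice */
	}
	return obj;
}
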
@@ -779,7 +808,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
779 return ERR_PTR(-ENOMEM); 808 return ERR_PTR(-ENOMEM);
780 809
781 if (rc && cifs_sb->tcon->ipc) { 810 if (rc && cifs_sb->tcon->ipc) {
782 cFYI(1, ("ipc connection - fake read inode")); 811 cFYI(1, "ipc connection - fake read inode");
783 inode->i_mode |= S_IFDIR; 812 inode->i_mode |= S_IFDIR;
784 inode->i_nlink = 2; 813 inode->i_nlink = 2;
785 inode->i_op = &cifs_ipc_inode_ops; 814 inode->i_op = &cifs_ipc_inode_ops;
@@ -841,7 +870,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
841 * server times. 870 * server times.
842 */ 871 */
843 if (set_time && (attrs->ia_valid & ATTR_CTIME)) { 872 if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
844 cFYI(1, ("CIFS - CTIME changed")); 873 cFYI(1, "CIFS - CTIME changed");
845 info_buf.ChangeTime = 874 info_buf.ChangeTime =
846 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); 875 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
847 } else 876 } else
@@ -876,8 +905,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
876 goto out; 905 goto out;
877 } 906 }
878 907
879 cFYI(1, ("calling SetFileInfo since SetPathInfo for " 908 cFYI(1, "calling SetFileInfo since SetPathInfo for "
880 "times not supported by this server")); 909 "times not supported by this server");
881 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, 910 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
882 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, 911 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
883 CREATE_NOT_DIR, &netfid, &oplock, 912 CREATE_NOT_DIR, &netfid, &oplock,
@@ -1035,7 +1064,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1035 struct iattr *attrs = NULL; 1064 struct iattr *attrs = NULL;
1036 __u32 dosattr = 0, origattr = 0; 1065 __u32 dosattr = 0, origattr = 0;
1037 1066
1038 cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry)); 1067 cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
1039 1068
1040 xid = GetXid(); 1069 xid = GetXid();
1041 1070
@@ -1054,7 +1083,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1054 rc = CIFSPOSIXDelFile(xid, tcon, full_path, 1083 rc = CIFSPOSIXDelFile(xid, tcon, full_path,
1055 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, 1084 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
1056 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1085 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1057 cFYI(1, ("posix del rc %d", rc)); 1086 cFYI(1, "posix del rc %d", rc);
1058 if ((rc == 0) || (rc == -ENOENT)) 1087 if ((rc == 0) || (rc == -ENOENT))
1059 goto psx_del_no_retry; 1088 goto psx_del_no_retry;
1060 } 1089 }
@@ -1128,7 +1157,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1128 struct inode *newinode = NULL; 1157 struct inode *newinode = NULL;
1129 struct cifs_fattr fattr; 1158 struct cifs_fattr fattr;
1130 1159
1131 cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode)); 1160 cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
1132 1161
1133 xid = GetXid(); 1162 xid = GetXid();
1134 1163
@@ -1163,7 +1192,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1163 kfree(pInfo); 1192 kfree(pInfo);
1164 goto mkdir_retry_old; 1193 goto mkdir_retry_old;
1165 } else if (rc) { 1194 } else if (rc) {
1166 cFYI(1, ("posix mkdir returned 0x%x", rc)); 1195 cFYI(1, "posix mkdir returned 0x%x", rc);
1167 d_drop(direntry); 1196 d_drop(direntry);
1168 } else { 1197 } else {
1169 if (pInfo->Type == cpu_to_le32(-1)) { 1198 if (pInfo->Type == cpu_to_le32(-1)) {
@@ -1180,6 +1209,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1180 direntry->d_op = &cifs_dentry_ops; 1209 direntry->d_op = &cifs_dentry_ops;
1181 1210
1182 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); 1211 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
1212 cifs_fill_uniqueid(inode->i_sb, &fattr);
1183 newinode = cifs_iget(inode->i_sb, &fattr); 1213 newinode = cifs_iget(inode->i_sb, &fattr);
1184 if (!newinode) { 1214 if (!newinode) {
1185 kfree(pInfo); 1215 kfree(pInfo);
@@ -1189,12 +1219,12 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1189 d_instantiate(direntry, newinode); 1219 d_instantiate(direntry, newinode);
1190 1220
1191#ifdef CONFIG_CIFS_DEBUG2 1221#ifdef CONFIG_CIFS_DEBUG2
1192 cFYI(1, ("instantiated dentry %p %s to inode %p", 1222 cFYI(1, "instantiated dentry %p %s to inode %p",
1193 direntry, direntry->d_name.name, newinode)); 1223 direntry, direntry->d_name.name, newinode);
1194 1224
1195 if (newinode->i_nlink != 2) 1225 if (newinode->i_nlink != 2)
1196 cFYI(1, ("unexpected number of links %d", 1226 cFYI(1, "unexpected number of links %d",
1197 newinode->i_nlink)); 1227 newinode->i_nlink);
1198#endif 1228#endif
1199 } 1229 }
1200 kfree(pInfo); 1230 kfree(pInfo);
@@ -1205,7 +1235,7 @@ mkdir_retry_old:
1205 rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls, 1235 rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
1206 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1236 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1207 if (rc) { 1237 if (rc) {
1208 cFYI(1, ("cifs_mkdir returned 0x%x", rc)); 1238 cFYI(1, "cifs_mkdir returned 0x%x", rc);
1209 d_drop(direntry); 1239 d_drop(direntry);
1210 } else { 1240 } else {
1211mkdir_get_info: 1241mkdir_get_info:
@@ -1308,7 +1338,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1308 char *full_path = NULL; 1338 char *full_path = NULL;
1309 struct cifsInodeInfo *cifsInode; 1339 struct cifsInodeInfo *cifsInode;
1310 1340
1311 cFYI(1, ("cifs_rmdir, inode = 0x%p", inode)); 1341 cFYI(1, "cifs_rmdir, inode = 0x%p", inode);
1312 1342
1313 xid = GetXid(); 1343 xid = GetXid();
1314 1344
@@ -1510,6 +1540,11 @@ cifs_inode_needs_reval(struct inode *inode)
1510 if (time_after_eq(jiffies, cifs_i->time + HZ)) 1540 if (time_after_eq(jiffies, cifs_i->time + HZ))
1511 return true; 1541 return true;
1512 1542
1543 /* hardlinked files w/ noserverino get "special" treatment */
1544 if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
1545 S_ISREG(inode->i_mode) && inode->i_nlink != 1)
1546 return true;
1547
1513 return false; 1548 return false;
1514} 1549}
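
The check added above exists because, without server inode numbers, two paths to the same multiply-linked file get distinct local inodes whose cached attributes can drift apart, so any regular file with i_nlink != 1 on such a mount is always revalidated. A standalone rendering of that predicate with stand-in types and flag values (not the cifs definitions):

#include <stdbool.h>

#define MNT_SERVER_INUM 0x1	/* stand-in for CIFS_MOUNT_SERVER_INUM */
#define MODE_REG        0x8	/* stand-in for S_ISREG() */

struct inode_info {
	unsigned mnt_flags;	/* per-mount flags */
	unsigned mode;		/* file type bits */
	unsigned nlink;		/* hard link count */
	bool     timed_out;	/* attribute cache older than 1s */
};

bool needs_reval(const struct inode_info *i)
{
	if (i->timed_out)
		return true;

	/* hardlinked regular files w/o server inode numbers always
	 * revalidate: two local inodes may alias one remote file */
	if (!(i->mnt_flags & MNT_SERVER_INUM) &&
	    (i->mode & MODE_REG) && i->nlink != 1)
		return true;

	return false;
}
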
1515 1550
@@ -1576,9 +1611,9 @@ int cifs_revalidate_dentry(struct dentry *dentry)
1576 goto check_inval; 1611 goto check_inval;
1577 } 1612 }
1578 1613
1579 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld " 1614 cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1580 "jiffies %ld", full_path, inode, inode->i_count.counter, 1615 "jiffies %ld", full_path, inode, inode->i_count.counter,
1581 dentry, dentry->d_time, jiffies)); 1616 dentry, dentry->d_time, jiffies);
1582 1617
1583 if (CIFS_SB(sb)->tcon->unix_ext) 1618 if (CIFS_SB(sb)->tcon->unix_ext)
1584 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); 1619 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
@@ -1672,12 +1707,12 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1672 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1707 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1673 npid, false); 1708 npid, false);
1674 cifsFileInfo_put(open_file); 1709 cifsFileInfo_put(open_file);
1675 cFYI(1, ("SetFSize for attrs rc = %d", rc)); 1710 cFYI(1, "SetFSize for attrs rc = %d", rc);
1676 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1711 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1677 unsigned int bytes_written; 1712 unsigned int bytes_written;
1678 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size, 1713 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
1679 &bytes_written, NULL, NULL, 1); 1714 &bytes_written, NULL, NULL, 1);
1680 cFYI(1, ("Wrt seteof rc %d", rc)); 1715 cFYI(1, "Wrt seteof rc %d", rc);
1681 } 1716 }
1682 } else 1717 } else
1683 rc = -EINVAL; 1718 rc = -EINVAL;
@@ -1691,7 +1726,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1691 false, cifs_sb->local_nls, 1726 false, cifs_sb->local_nls,
1692 cifs_sb->mnt_cifs_flags & 1727 cifs_sb->mnt_cifs_flags &
1693 CIFS_MOUNT_MAP_SPECIAL_CHR); 1728 CIFS_MOUNT_MAP_SPECIAL_CHR);
1694 cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); 1729 cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
1695 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1730 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1696 __u16 netfid; 1731 __u16 netfid;
1697 int oplock = 0; 1732 int oplock = 0;
@@ -1708,7 +1743,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1708 attrs->ia_size, 1743 attrs->ia_size,
1709 &bytes_written, NULL, 1744 &bytes_written, NULL,
1710 NULL, 1); 1745 NULL, 1);
1711 cFYI(1, ("wrt seteof rc %d", rc)); 1746 cFYI(1, "wrt seteof rc %d", rc);
1712 CIFSSMBClose(xid, pTcon, netfid); 1747 CIFSSMBClose(xid, pTcon, netfid);
1713 } 1748 }
1714 } 1749 }
@@ -1736,8 +1771,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1736 struct cifs_unix_set_info_args *args = NULL; 1771 struct cifs_unix_set_info_args *args = NULL;
1737 struct cifsFileInfo *open_file; 1772 struct cifsFileInfo *open_file;
1738 1773
1739 cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x", 1774 cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x",
1740 direntry->d_name.name, attrs->ia_valid)); 1775 direntry->d_name.name, attrs->ia_valid);
1741 1776
1742 xid = GetXid(); 1777 xid = GetXid();
1743 1778
@@ -1867,8 +1902,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1867 1902
1868 xid = GetXid(); 1903 xid = GetXid();
1869 1904
1870 cFYI(1, ("setattr on file %s attrs->iavalid 0x%x", 1905 cFYI(1, "setattr on file %s attrs->iavalid 0x%x",
1871 direntry->d_name.name, attrs->ia_valid)); 1906 direntry->d_name.name, attrs->ia_valid);
1872 1907
1873 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { 1908 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
1874 /* check if we have permission to change attrs */ 1909 /* check if we have permission to change attrs */
@@ -1925,7 +1960,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1925 attrs->ia_valid &= ~ATTR_MODE; 1960 attrs->ia_valid &= ~ATTR_MODE;
1926 1961
1927 if (attrs->ia_valid & ATTR_MODE) { 1962 if (attrs->ia_valid & ATTR_MODE) {
1928 cFYI(1, ("Mode changed to 0%o", attrs->ia_mode)); 1963 cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
1929 mode = attrs->ia_mode; 1964 mode = attrs->ia_mode;
1930 } 1965 }
1931 1966
@@ -2011,7 +2046,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
2011#if 0 2046#if 0
2012void cifs_delete_inode(struct inode *inode) 2047void cifs_delete_inode(struct inode *inode)
2013{ 2048{
2014 cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode)); 2049 cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode);
2015 /* may have to add back in if and when safe distributed caching of 2050 /* may have to add back in if and when safe distributed caching of
2016 directories added e.g. via FindNotify */ 2051 directories added e.g. via FindNotify */
2017} 2052}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index f94650683a00..505926f1ee6b 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -47,7 +47,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
47 47
48 xid = GetXid(); 48 xid = GetXid();
49 49
50 cFYI(1, ("ioctl file %p cmd %u arg %lu", filep, command, arg)); 50 cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg);
51 51
52 cifs_sb = CIFS_SB(inode->i_sb); 52 cifs_sb = CIFS_SB(inode->i_sb);
53 53
@@ -64,12 +64,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
64 64
65 switch (command) { 65 switch (command) {
66 case CIFS_IOC_CHECKUMOUNT: 66 case CIFS_IOC_CHECKUMOUNT:
67 cFYI(1, ("User unmount attempted")); 67 cFYI(1, "User unmount attempted");
68 if (cifs_sb->mnt_uid == current_uid()) 68 if (cifs_sb->mnt_uid == current_uid())
69 rc = 0; 69 rc = 0;
70 else { 70 else {
71 rc = -EACCES; 71 rc = -EACCES;
72 cFYI(1, ("uids do not match")); 72 cFYI(1, "uids do not match");
73 } 73 }
74 break; 74 break;
75#ifdef CONFIG_CIFS_POSIX 75#ifdef CONFIG_CIFS_POSIX
@@ -97,11 +97,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
97 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 97 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
98 extAttrBits, &ExtAttrMask);*/ 98 extAttrBits, &ExtAttrMask);*/
99 } 99 }
100 cFYI(1, ("set flags not implemented yet")); 100 cFYI(1, "set flags not implemented yet");
101 break; 101 break;
102#endif /* CONFIG_CIFS_POSIX */ 102#endif /* CONFIG_CIFS_POSIX */
103 default: 103 default:
104 cFYI(1, ("unsupported ioctl")); 104 cFYI(1, "unsupported ioctl");
105 break; 105 break;
106 } 106 }
107 107
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fc1e0487eaee..473ca8033656 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/namei.h> 24#include <linux/namei.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
@@ -138,7 +139,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
138 if (!full_path) 139 if (!full_path)
139 goto out; 140 goto out;
140 141
141 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode)); 142 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
142 143
143 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, 144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
144 cifs_sb->local_nls); 145 cifs_sb->local_nls);
@@ -177,8 +178,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
177 return rc; 178 return rc;
178 } 179 }
179 180
180 cFYI(1, ("Full path: %s", full_path)); 181 cFYI(1, "Full path: %s", full_path);
181 cFYI(1, ("symname is %s", symname)); 182 cFYI(1, "symname is %s", symname);
182 183
183 /* BB what if DFS and this volume is on different share? BB */ 184 /* BB what if DFS and this volume is on different share? BB */
184 if (pTcon->unix_ext) 185 if (pTcon->unix_ext)
@@ -197,8 +198,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
197 inode->i_sb, xid, NULL); 198 inode->i_sb, xid, NULL);
198 199
199 if (rc != 0) { 200 if (rc != 0) {
200 cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d", 201 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
201 rc)); 202 rc);
202 } else { 203 } else {
203 if (pTcon->nocase) 204 if (pTcon->nocase)
204 direntry->d_op = &cifs_ci_dentry_ops; 205 direntry->d_op = &cifs_ci_dentry_ops;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d1474996a812..1394aa37f26c 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -51,7 +51,7 @@ _GetXid(void)
51 if (GlobalTotalActiveXid > GlobalMaxActiveXid) 51 if (GlobalTotalActiveXid > GlobalMaxActiveXid)
52 GlobalMaxActiveXid = GlobalTotalActiveXid; 52 GlobalMaxActiveXid = GlobalTotalActiveXid;
53 if (GlobalTotalActiveXid > 65000) 53 if (GlobalTotalActiveXid > 65000)
54 cFYI(1, ("warning: more than 65000 requests active")); 54 cFYI(1, "warning: more than 65000 requests active");
55 xid = GlobalCurrentXid++; 55 xid = GlobalCurrentXid++;
56 spin_unlock(&GlobalMid_Lock); 56 spin_unlock(&GlobalMid_Lock);
57 return xid; 57 return xid;
@@ -88,7 +88,7 @@ void
88sesInfoFree(struct cifsSesInfo *buf_to_free) 88sesInfoFree(struct cifsSesInfo *buf_to_free)
89{ 89{
90 if (buf_to_free == NULL) { 90 if (buf_to_free == NULL) {
91 cFYI(1, ("Null buffer passed to sesInfoFree")); 91 cFYI(1, "Null buffer passed to sesInfoFree");
92 return; 92 return;
93 } 93 }
94 94
@@ -126,7 +126,7 @@ void
126tconInfoFree(struct cifsTconInfo *buf_to_free) 126tconInfoFree(struct cifsTconInfo *buf_to_free)
127{ 127{
128 if (buf_to_free == NULL) { 128 if (buf_to_free == NULL) {
129 cFYI(1, ("Null buffer passed to tconInfoFree")); 129 cFYI(1, "Null buffer passed to tconInfoFree");
130 return; 130 return;
131 } 131 }
132 atomic_dec(&tconInfoAllocCount); 132 atomic_dec(&tconInfoAllocCount);
@@ -166,7 +166,7 @@ void
166cifs_buf_release(void *buf_to_free) 166cifs_buf_release(void *buf_to_free)
167{ 167{
168 if (buf_to_free == NULL) { 168 if (buf_to_free == NULL) {
169 /* cFYI(1, ("Null buffer passed to cifs_buf_release"));*/ 169 /* cFYI(1, "Null buffer passed to cifs_buf_release");*/
170 return; 170 return;
171 } 171 }
172 mempool_free(buf_to_free, cifs_req_poolp); 172 mempool_free(buf_to_free, cifs_req_poolp);
@@ -202,7 +202,7 @@ cifs_small_buf_release(void *buf_to_free)
202{ 202{
203 203
204 if (buf_to_free == NULL) { 204 if (buf_to_free == NULL) {
205 cFYI(1, ("Null buffer passed to cifs_small_buf_release")); 205 cFYI(1, "Null buffer passed to cifs_small_buf_release");
206 return; 206 return;
207 } 207 }
208 mempool_free(buf_to_free, cifs_sm_req_poolp); 208 mempool_free(buf_to_free, cifs_sm_req_poolp);
@@ -345,19 +345,19 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
345 /* with userid/password pairs found on the smb session */ 345 /* with userid/password pairs found on the smb session */
346 /* for other target tcp/ip addresses BB */ 346 /* for other target tcp/ip addresses BB */
347 if (current_fsuid() != treeCon->ses->linux_uid) { 347 if (current_fsuid() != treeCon->ses->linux_uid) {
348 cFYI(1, ("Multiuser mode and UID " 348 cFYI(1, "Multiuser mode and UID "
349 "did not match tcon uid")); 349 "did not match tcon uid");
350 read_lock(&cifs_tcp_ses_lock); 350 read_lock(&cifs_tcp_ses_lock);
351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
353 if (ses->linux_uid == current_fsuid()) { 353 if (ses->linux_uid == current_fsuid()) {
354 if (ses->server == treeCon->ses->server) { 354 if (ses->server == treeCon->ses->server) {
355 cFYI(1, ("found matching uid substitute right smb_uid")); 355 cFYI(1, "found matching uid substitute right smb_uid");
356 buffer->Uid = ses->Suid; 356 buffer->Uid = ses->Suid;
357 break; 357 break;
358 } else { 358 } else {
359 /* BB eventually call cifs_setup_session here */ 359 /* BB eventually call cifs_setup_session here */
360 cFYI(1, ("local UID found but no smb sess with this server exists")); 360 cFYI(1, "local UID found but no smb sess with this server exists");
361 } 361 }
362 } 362 }
363 } 363 }
@@ -394,17 +394,16 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 394 if (smb->Command == SMB_COM_LOCKING_ANDX)
395 return 0; 395 return 0;
396 else 396 else
397 cERROR(1, ("Received Request not response")); 397 cERROR(1, "Received Request not response");
398 } 398 }
399 } else { /* bad signature or mid */ 399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) 400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, 401 cERROR(1, "Bad protocol string signature header %x",
402 ("Bad protocol string signature header %x", 402 *(unsigned int *) smb->Protocol);
403 *(unsigned int *) smb->Protocol));
404 if (mid != smb->Mid) 403 if (mid != smb->Mid)
405 cERROR(1, ("Mids do not match")); 404 cERROR(1, "Mids do not match");
406 } 405 }
407 cERROR(1, ("bad smb detected. The Mid=%d", smb->Mid)); 406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid);
408 return 1; 407 return 1;
409} 408}
410 409
@@ -413,7 +412,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
413{ 412{
414 __u32 len = smb->smb_buf_length; 413 __u32 len = smb->smb_buf_length;
415 __u32 clc_len; /* calculated length */ 414 __u32 clc_len; /* calculated length */
416 cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len)); 415 cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
417 416
418 if (length < 2 + sizeof(struct smb_hdr)) { 417 if (length < 2 + sizeof(struct smb_hdr)) {
419 if ((length >= sizeof(struct smb_hdr) - 1) 418 if ((length >= sizeof(struct smb_hdr) - 1)
@@ -437,15 +436,15 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
437 tmp[sizeof(struct smb_hdr)+1] = 0; 436 tmp[sizeof(struct smb_hdr)+1] = 0;
438 return 0; 437 return 0;
439 } 438 }
440 cERROR(1, ("rcvd invalid byte count (bcc)")); 439 cERROR(1, "rcvd invalid byte count (bcc)");
441 } else { 440 } else {
442 cERROR(1, ("Length less than smb header size")); 441 cERROR(1, "Length less than smb header size");
443 } 442 }
444 return 1; 443 return 1;
445 } 444 }
446 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 445 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
447 cERROR(1, ("smb length greater than MaxBufSize, mid=%d", 446 cERROR(1, "smb length greater than MaxBufSize, mid=%d",
448 smb->Mid)); 447 smb->Mid);
449 return 1; 448 return 1;
450 } 449 }
451 450
@@ -454,8 +453,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
454 clc_len = smbCalcSize_LE(smb); 453 clc_len = smbCalcSize_LE(smb);
455 454
456 if (4 + len != length) { 455 if (4 + len != length) {
457 cERROR(1, ("Length read does not match RFC1001 length %d", 456 cERROR(1, "Length read does not match RFC1001 length %d",
458 len)); 457 len);
459 return 1; 458 return 1;
460 } 459 }
461 460
@@ -466,8 +465,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
466 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
467 return 0; /* bcc wrapped */ 466 return 0; /* bcc wrapped */
468 } 467 }
469 cFYI(1, ("Calculated size %d vs length %d mismatch for mid %d", 468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d",
470 clc_len, 4 + len, smb->Mid)); 469 clc_len, 4 + len, smb->Mid);
471 /* Windows XP can return a few bytes too much, presumably 470 /* Windows XP can return a few bytes too much, presumably
472 an illegal pad, at the end of byte range lock responses 471 an illegal pad, at the end of byte range lock responses
473 so we allow for that three byte pad, as long as actual 472 so we allow for that three byte pad, as long as actual
@@ -482,8 +481,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
482 if ((4+len > clc_len) && (len <= clc_len + 512)) 481 if ((4+len > clc_len) && (len <= clc_len + 512))
483 return 0; 482 return 0;
484 else { 483 else {
485 cERROR(1, ("RFC1001 size %d bigger than SMB for Mid=%d", 484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
486 len, smb->Mid)); 485 len, smb->Mid);
487 return 1; 486 return 1;
488 } 487 }
489 } 488 }
@@ -501,7 +500,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
501 struct cifsFileInfo *netfile; 500 struct cifsFileInfo *netfile;
502 int rc; 501 int rc;
503 502
504 cFYI(1, ("Checking for oplock break or dnotify response")); 503 cFYI(1, "Checking for oplock break or dnotify response");
505 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && 504 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
506 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) { 505 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
507 struct smb_com_transaction_change_notify_rsp *pSMBr = 506 struct smb_com_transaction_change_notify_rsp *pSMBr =
@@ -513,15 +512,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
513 512
514 pnotify = (struct file_notify_information *) 513 pnotify = (struct file_notify_information *)
515 ((char *)&pSMBr->hdr.Protocol + data_offset); 514 ((char *)&pSMBr->hdr.Protocol + data_offset);
516 cFYI(1, ("dnotify on %s Action: 0x%x", 515 cFYI(1, "dnotify on %s Action: 0x%x",
517 pnotify->FileName, pnotify->Action)); 516 pnotify->FileName, pnotify->Action);
518 /* cifs_dump_mem("Rcvd notify Data: ",buf, 517 /* cifs_dump_mem("Rcvd notify Data: ",buf,
519 sizeof(struct smb_hdr)+60); */ 518 sizeof(struct smb_hdr)+60); */
520 return true; 519 return true;
521 } 520 }
522 if (pSMBr->hdr.Status.CifsError) { 521 if (pSMBr->hdr.Status.CifsError) {
523 cFYI(1, ("notify err 0x%d", 522 cFYI(1, "notify err 0x%d",
524 pSMBr->hdr.Status.CifsError)); 523 pSMBr->hdr.Status.CifsError);
525 return true; 524 return true;
526 } 525 }
527 return false; 526 return false;
@@ -535,7 +534,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
535 large dirty files cached on the client */ 534 large dirty files cached on the client */
536 if ((NT_STATUS_INVALID_HANDLE) == 535 if ((NT_STATUS_INVALID_HANDLE) ==
537 le32_to_cpu(pSMB->hdr.Status.CifsError)) { 536 le32_to_cpu(pSMB->hdr.Status.CifsError)) {
538 cFYI(1, ("invalid handle on oplock break")); 537 cFYI(1, "invalid handle on oplock break");
539 return true; 538 return true;
540 } else if (ERRbadfid == 539 } else if (ERRbadfid ==
541 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { 540 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
@@ -547,8 +546,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
547 if (pSMB->hdr.WordCount != 8) 546 if (pSMB->hdr.WordCount != 8)
548 return false; 547 return false;
549 548
550 cFYI(1, ("oplock type 0x%d level 0x%d", 549 cFYI(1, "oplock type 0x%d level 0x%d",
551 pSMB->LockType, pSMB->OplockLevel)); 550 pSMB->LockType, pSMB->OplockLevel);
552 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) 551 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
553 return false; 552 return false;
554 553
@@ -579,15 +578,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
579 return true; 578 return true;
580 } 579 }
581 580
582 cFYI(1, ("file id match, oplock break")); 581 cFYI(1, "file id match, oplock break");
583 pCifsInode = CIFS_I(netfile->pInode); 582 pCifsInode = CIFS_I(netfile->pInode);
584 pCifsInode->clientCanCacheAll = false; 583 pCifsInode->clientCanCacheAll = false;
585 if (pSMB->OplockLevel == 0) 584 if (pSMB->OplockLevel == 0)
586 pCifsInode->clientCanCacheRead = false; 585 pCifsInode->clientCanCacheRead = false;
587 rc = slow_work_enqueue(&netfile->oplock_break); 586 rc = slow_work_enqueue(&netfile->oplock_break);
588 if (rc) { 587 if (rc) {
589 cERROR(1, ("failed to enqueue oplock " 588 cERROR(1, "failed to enqueue oplock "
590 "break: %d\n", rc)); 589 "break: %d\n", rc);
591 } else { 590 } else {
592 netfile->oplock_break_cancelled = false; 591 netfile->oplock_break_cancelled = false;
593 } 592 }
@@ -597,12 +596,12 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
597 } 596 }
598 read_unlock(&GlobalSMBSeslock); 597 read_unlock(&GlobalSMBSeslock);
599 read_unlock(&cifs_tcp_ses_lock); 598 read_unlock(&cifs_tcp_ses_lock);
600 cFYI(1, ("No matching file for oplock break")); 599 cFYI(1, "No matching file for oplock break");
601 return true; 600 return true;
602 } 601 }
603 } 602 }
604 read_unlock(&cifs_tcp_ses_lock); 603 read_unlock(&cifs_tcp_ses_lock);
605 cFYI(1, ("Can not process oplock break for non-existent connection")); 604 cFYI(1, "Can not process oplock break for non-existent connection");
606 return true; 605 return true;
607} 606}
608 607
@@ -721,11 +720,11 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
721{ 720{
722 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 721 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
723 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 722 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
724 cERROR(1, ("Autodisabling the use of server inode numbers on " 723 cERROR(1, "Autodisabling the use of server inode numbers on "
725 "%s. This server doesn't seem to support them " 724 "%s. This server doesn't seem to support them "
726 "properly. Hardlinks will not be recognized on this " 725 "properly. Hardlinks will not be recognized on this "
727 "mount. Consider mounting with the \"noserverino\" " 726 "mount. Consider mounting with the \"noserverino\" "
728 "option to silence this message.", 727 "option to silence this message.",
729 cifs_sb->tcon->treeName)); 728 cifs_sb->tcon->treeName);
730 } 729 }
731} 730}
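
cifs_autodisable_serverino() above clears CIFS_MOUNT_SERVER_INUM the first time a misbehaving server is detected, and the flag test doubles as a once-only guard so the cERROR() warning fires a single time per mount. A tiny runnable sketch of that clear-flag-and-warn-once idiom:

/* The flag test is the guard: repeated calls stay silent. */
#include <stdio.h>

#define F_SERVER_INUM 0x1

static void autodisable(unsigned *flags, const char *share)
{
	if (*flags & F_SERVER_INUM) {
		*flags &= ~F_SERVER_INUM;
		fprintf(stderr,
			"disabling server inode numbers on %s; hardlinks "
			"will not be recognized on this mount\n", share);
	}
}

int main(void)
{
	unsigned flags = F_SERVER_INUM;

	autodisable(&flags, "//srv/share");	/* warns once */
	autodisable(&flags, "//srv/share");	/* silent thereafter */
	return 0;
}
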
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index bd6d6895730d..d35d52889cb5 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -149,7 +149,7 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
149 else if (address_family == AF_INET6) 149 else if (address_family == AF_INET6)
150 ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); 150 ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
151 151
152 cFYI(DBG2, ("address conversion returned %d for %s", ret, cp)); 152 cFYI(DBG2, "address conversion returned %d for %s", ret, cp);
153 if (ret > 0) 153 if (ret > 0)
154 ret = 1; 154 ret = 1;
155 return ret; 155 return ret;
@@ -870,8 +870,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
870 } 870 }
871 /* else ERRHRD class errors or junk - return EIO */ 871 /* else ERRHRD class errors or junk - return EIO */
872 872
873 cFYI(1, ("Mapping smb error code %d to POSIX err %d", 873 cFYI(1, "Mapping smb error code %d to POSIX err %d",
874 smberrcode, rc)); 874 smberrcode, rc);
875 875
876 /* generic corrective action e.g. reconnect SMB session on 876 /* generic corrective action e.g. reconnect SMB session on
877 * ERRbaduid could be added */ 877 * ERRbaduid could be added */
@@ -940,20 +940,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
940 SMB_TIME *st = (SMB_TIME *)&time; 940 SMB_TIME *st = (SMB_TIME *)&time;
941 SMB_DATE *sd = (SMB_DATE *)&date; 941 SMB_DATE *sd = (SMB_DATE *)&date;
942 942
943 cFYI(1, ("date %d time %d", date, time)); 943 cFYI(1, "date %d time %d", date, time);
944 944
945 sec = 2 * st->TwoSeconds; 945 sec = 2 * st->TwoSeconds;
946 min = st->Minutes; 946 min = st->Minutes;
947 if ((sec > 59) || (min > 59)) 947 if ((sec > 59) || (min > 59))
948 cERROR(1, ("illegal time min %d sec %d", min, sec)); 948 cERROR(1, "illegal time min %d sec %d", min, sec);
949 sec += (min * 60); 949 sec += (min * 60);
950 sec += 60 * 60 * st->Hours; 950 sec += 60 * 60 * st->Hours;
951 if (st->Hours > 24) 951 if (st->Hours > 24)
952 cERROR(1, ("illegal hours %d", st->Hours)); 952 cERROR(1, "illegal hours %d", st->Hours);
953 days = sd->Day; 953 days = sd->Day;
954 month = sd->Month; 954 month = sd->Month;
955 if ((days > 31) || (month > 12)) { 955 if ((days > 31) || (month > 12)) {
956 cERROR(1, ("illegal date, month %d day: %d", month, days)); 956 cERROR(1, "illegal date, month %d day: %d", month, days);
957 if (month > 12) 957 if (month > 12)
958 month = 12; 958 month = 12;
959 } 959 }
@@ -979,7 +979,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
979 979
980 ts.tv_sec = sec + offset; 980 ts.tv_sec = sec + offset;
981 981
982 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ 982 /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */
983 983
984 ts.tv_nsec = 0; 984 ts.tv_nsec = 0;
985 return ts; 985 return ts;
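cnvrtDosUnixTm() above unpacks the legacy DOS-encoded date and time words through the SMB_TIME/SMB_DATE bitfields, which is why the code doubles TwoSeconds (the field stores 2-second units) and range-checks minutes, hours, day and month coming from the server. A standalone sketch of the same packing, with hypothetical raw values:

#include <stdio.h>

/* DOS time: bits 0-4 seconds/2, 5-10 minutes, 11-15 hours.
 * DOS date: bits 0-4 day, 5-8 month, 9-15 years since 1980. */
static void decode_dos_datetime(unsigned short date, unsigned short time)
{
	unsigned sec   = (time & 0x1f) * 2;
	unsigned min   = (time >> 5) & 0x3f;
	unsigned hour  = (time >> 11) & 0x1f;
	unsigned day   = date & 0x1f;
	unsigned month = (date >> 5) & 0x0f;
	unsigned year  = ((date >> 9) & 0x7f) + 1980;

	printf("%04u-%02u-%02u %02u:%02u:%02u\n",
	       year, month, day, hour, min, sec);
}

int main(void)
{
	decode_dos_datetime(0x5021, 0x48a1);	/* 2020-01-01 09:05:02 */
	return 0;
}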
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c343b14ba2d3..daf1753af674 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -22,6 +22,7 @@
22 */ 22 */
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/pagemap.h> 24#include <linux/pagemap.h>
25#include <linux/slab.h>
25#include <linux/stat.h> 26#include <linux/stat.h>
26#include "cifspdu.h" 27#include "cifspdu.h"
27#include "cifsglob.h" 28#include "cifsglob.h"
@@ -46,15 +47,15 @@ static void dump_cifs_file_struct(struct file *file, char *label)
46 if (file) { 47 if (file) {
47 cf = file->private_data; 48 cf = file->private_data;
48 if (cf == NULL) { 49 if (cf == NULL) {
49 cFYI(1, ("empty cifs private file data")); 50 cFYI(1, "empty cifs private file data");
50 return; 51 return;
51 } 52 }
52 if (cf->invalidHandle) 53 if (cf->invalidHandle)
53 cFYI(1, ("invalid handle")); 54 cFYI(1, "invalid handle");
54 if (cf->srch_inf.endOfSearch) 55 if (cf->srch_inf.endOfSearch)
55 cFYI(1, ("end of search")); 56 cFYI(1, "end of search");
56 if (cf->srch_inf.emptyDir) 57 if (cf->srch_inf.emptyDir)
57 cFYI(1, ("empty dir")); 58 cFYI(1, "empty dir");
58 } 59 }
59} 60}
60#else 61#else
@@ -75,7 +76,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
75 struct inode *inode; 76 struct inode *inode;
76 struct super_block *sb = parent->d_inode->i_sb; 77 struct super_block *sb = parent->d_inode->i_sb;
77 78
78 cFYI(1, ("For %s", name->name)); 79 cFYI(1, "For %s", name->name);
79 80
80 if (parent->d_op && parent->d_op->d_hash) 81 if (parent->d_op && parent->d_op->d_hash)
81 parent->d_op->d_hash(parent, name); 82 parent->d_op->d_hash(parent, name);
@@ -213,7 +214,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
213 fid, 214 fid,
214 cifs_sb->local_nls); 215 cifs_sb->local_nls);
215 if (CIFSSMBClose(xid, ptcon, fid)) { 216 if (CIFSSMBClose(xid, ptcon, fid)) {
216 cFYI(1, ("Error closing temporary reparsepoint open)")); 217 cFYI(1, "Error closing temporary reparsepoint open");
217 } 218 }
218 } 219 }
219} 220}
@@ -251,7 +252,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
251 if (full_path == NULL) 252 if (full_path == NULL)
252 return -ENOMEM; 253 return -ENOMEM;
253 254
254 cFYI(1, ("Full path: %s start at: %lld", full_path, file->f_pos)); 255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
255 256
256ffirst_retry: 257ffirst_retry:
257 /* test for Unix extensions */ 258 /* test for Unix extensions */
@@ -296,7 +297,7 @@ static int cifs_unicode_bytelen(char *str)
296 if (ustr[len] == 0) 297 if (ustr[len] == 0)
297 return len << 1; 298 return len << 1;
298 } 299 }
299 cFYI(1, ("Unicode string longer than PATH_MAX found")); 300 cFYI(1, "Unicode string longer than PATH_MAX found");
300 return len << 1; 301 return len << 1;
301} 302}
302 303
@@ -313,19 +314,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
313 pfData->FileNameLength; 314 pfData->FileNameLength;
314 } else 315 } else
315 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); 316 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
316 cFYI(1, ("new entry %p old entry %p", new_entry, old_entry)); 317 cFYI(1, "new entry %p old entry %p", new_entry, old_entry);
317 /* validate that new_entry is not past end of SMB */ 318 /* validate that new_entry is not past end of SMB */
318 if (new_entry >= end_of_smb) { 319 if (new_entry >= end_of_smb) {
319 cERROR(1, 320 cERROR(1, "search entry %p began after end of SMB %p old entry %p",
320 ("search entry %p began after end of SMB %p old entry %p", 321 new_entry, end_of_smb, old_entry);
321 new_entry, end_of_smb, old_entry));
322 return NULL; 322 return NULL;
323 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && 323 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
324 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) 324 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
325 || ((level != SMB_FIND_FILE_INFO_STANDARD) && 325 || ((level != SMB_FIND_FILE_INFO_STANDARD) &&
326 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { 326 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) {
327 cERROR(1, ("search entry %p extends after end of SMB %p", 327 cERROR(1, "search entry %p extends after end of SMB %p",
328 new_entry, end_of_smb)); 328 new_entry, end_of_smb);
329 return NULL; 329 return NULL;
330 } else 330 } else
331 return new_entry; 331 return new_entry;
@@ -379,8 +379,8 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
379 filename = &pFindData->FileName[0]; 379 filename = &pFindData->FileName[0];
380 len = pFindData->FileNameLength; 380 len = pFindData->FileNameLength;
381 } else { 381 } else {
382 cFYI(1, ("Unknown findfirst level %d", 382 cFYI(1, "Unknown findfirst level %d",
383 cfile->srch_inf.info_level)); 383 cfile->srch_inf.info_level);
384 } 384 }
385 385
386 if (filename) { 386 if (filename) {
@@ -480,7 +480,7 @@ static int cifs_save_resume_key(const char *current_entry,
480 len = (unsigned int)pFindData->FileNameLength; 480 len = (unsigned int)pFindData->FileNameLength;
481 cifsFile->srch_inf.resume_key = pFindData->ResumeKey; 481 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
482 } else { 482 } else {
483 cFYI(1, ("Unknown findfirst level %d", level)); 483 cFYI(1, "Unknown findfirst level %d", level);
484 return -EINVAL; 484 return -EINVAL;
485 } 485 }
486 cifsFile->srch_inf.resume_name_len = len; 486 cifsFile->srch_inf.resume_name_len = len;
@@ -524,7 +524,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
524 is_dir_changed(file)) || 524 is_dir_changed(file)) ||
525 (index_to_find < first_entry_in_buffer)) { 525 (index_to_find < first_entry_in_buffer)) {
526 /* close and restart search */ 526 /* close and restart search */
527 cFYI(1, ("search backing up - close and restart search")); 527 cFYI(1, "search backing up - close and restart search");
528 write_lock(&GlobalSMBSeslock); 528 write_lock(&GlobalSMBSeslock);
529 if (!cifsFile->srch_inf.endOfSearch && 529 if (!cifsFile->srch_inf.endOfSearch &&
530 !cifsFile->invalidHandle) { 530 !cifsFile->invalidHandle) {
@@ -534,7 +534,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
534 } else 534 } else
535 write_unlock(&GlobalSMBSeslock); 535 write_unlock(&GlobalSMBSeslock);
536 if (cifsFile->srch_inf.ntwrk_buf_start) { 536 if (cifsFile->srch_inf.ntwrk_buf_start) {
537 cFYI(1, ("freeing SMB ff cache buf on search rewind")); 537 cFYI(1, "freeing SMB ff cache buf on search rewind");
538 if (cifsFile->srch_inf.smallBuf) 538 if (cifsFile->srch_inf.smallBuf)
539 cifs_small_buf_release(cifsFile->srch_inf. 539 cifs_small_buf_release(cifsFile->srch_inf.
540 ntwrk_buf_start); 540 ntwrk_buf_start);
@@ -545,8 +545,8 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
545 } 545 }
546 rc = initiate_cifs_search(xid, file); 546 rc = initiate_cifs_search(xid, file);
547 if (rc) { 547 if (rc) {
548 cFYI(1, ("error %d reinitiating a search on rewind", 548 cFYI(1, "error %d reinitiating a search on rewind",
549 rc)); 549 rc);
550 return rc; 550 return rc;
551 } 551 }
552 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 552 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -554,7 +554,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
554 554
555 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 555 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
556 (rc == 0) && !cifsFile->srch_inf.endOfSearch) { 556 (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
557 cFYI(1, ("calling findnext2")); 557 cFYI(1, "calling findnext2");
558 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, 558 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
559 &cifsFile->srch_inf); 559 &cifsFile->srch_inf);
560 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 560 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -574,7 +574,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
574 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry 574 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
575 - cifsFile->srch_inf.entries_in_buffer; 575 - cifsFile->srch_inf.entries_in_buffer;
576 pos_in_buf = index_to_find - first_entry_in_buffer; 576 pos_in_buf = index_to_find - first_entry_in_buffer;
577 cFYI(1, ("found entry - pos_in_buf %d", pos_in_buf)); 577 cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
578 578
579 for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) { 579 for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) {
580 /* go entry by entry figuring out which is first */ 580 /* go entry by entry figuring out which is first */
@@ -583,19 +583,19 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
583 } 583 }
584 if ((current_entry == NULL) && (i < pos_in_buf)) { 584 if ((current_entry == NULL) && (i < pos_in_buf)) {
585 /* BB fixme - check if we should flag this error */ 585 /* BB fixme - check if we should flag this error */
586 cERROR(1, ("reached end of buf searching for pos in buf" 586 cERROR(1, "reached end of buf searching for pos in buf"
587 " %d index to find %lld rc %d", 587 " %d index to find %lld rc %d",
588 pos_in_buf, index_to_find, rc)); 588 pos_in_buf, index_to_find, rc);
589 } 589 }
590 rc = 0; 590 rc = 0;
591 *ppCurrentEntry = current_entry; 591 *ppCurrentEntry = current_entry;
592 } else { 592 } else {
593 cFYI(1, ("index not in buffer - could not findnext into it")); 593 cFYI(1, "index not in buffer - could not findnext into it");
594 return 0; 594 return 0;
595 } 595 }
596 596
597 if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) { 597 if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) {
598 cFYI(1, ("can not return entries pos_in_buf beyond last")); 598 cFYI(1, "can not return entries pos_in_buf beyond last");
599 *num_to_ret = 0; 599 *num_to_ret = 0;
600 } else 600 } else
601 *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf; 601 *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf;
@@ -655,12 +655,12 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
655 /* one byte length, no name conversion */ 655 /* one byte length, no name conversion */
656 len = (unsigned int)pFindData->FileNameLength; 656 len = (unsigned int)pFindData->FileNameLength;
657 } else { 657 } else {
658 cFYI(1, ("Unknown findfirst level %d", level)); 658 cFYI(1, "Unknown findfirst level %d", level);
659 return -EINVAL; 659 return -EINVAL;
660 } 660 }
661 661
662 if (len > max_len) { 662 if (len > max_len) {
663 cERROR(1, ("bad search response length %d past smb end", len)); 663 cERROR(1, "bad search response length %d past smb end", len);
664 return -EINVAL; 664 return -EINVAL;
665 } 665 }
666 666
@@ -753,7 +753,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
753 * case already. Why should we be clobbering other errors from it? 753 * case already. Why should we be clobbering other errors from it?
754 */ 754 */
755 if (rc) { 755 if (rc) {
756 cFYI(1, ("filldir rc = %d", rc)); 756 cFYI(1, "filldir rc = %d", rc);
757 rc = -EOVERFLOW; 757 rc = -EOVERFLOW;
758 } 758 }
759 dput(tmp_dentry); 759 dput(tmp_dentry);
@@ -785,7 +785,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
785 case 0: 785 case 0:
786 if (filldir(direntry, ".", 1, file->f_pos, 786 if (filldir(direntry, ".", 1, file->f_pos,
787 file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) { 787 file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) {
788 cERROR(1, ("Filldir for current dir failed")); 788 cERROR(1, "Filldir for current dir failed");
789 rc = -ENOMEM; 789 rc = -ENOMEM;
790 break; 790 break;
791 } 791 }
@@ -793,7 +793,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
793 case 1: 793 case 1:
794 if (filldir(direntry, "..", 2, file->f_pos, 794 if (filldir(direntry, "..", 2, file->f_pos,
795 file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { 795 file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
796 cERROR(1, ("Filldir for parent dir failed")); 796 cERROR(1, "Filldir for parent dir failed");
797 rc = -ENOMEM; 797 rc = -ENOMEM;
798 break; 798 break;
799 } 799 }
@@ -806,7 +806,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
806 806
807 if (file->private_data == NULL) { 807 if (file->private_data == NULL) {
808 rc = initiate_cifs_search(xid, file); 808 rc = initiate_cifs_search(xid, file);
809 cFYI(1, ("initiate cifs search rc %d", rc)); 809 cFYI(1, "initiate cifs search rc %d", rc);
810 if (rc) { 810 if (rc) {
811 FreeXid(xid); 811 FreeXid(xid);
812 return rc; 812 return rc;
@@ -820,7 +820,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
820 cifsFile = file->private_data; 820 cifsFile = file->private_data;
821 if (cifsFile->srch_inf.endOfSearch) { 821 if (cifsFile->srch_inf.endOfSearch) {
822 if (cifsFile->srch_inf.emptyDir) { 822 if (cifsFile->srch_inf.emptyDir) {
823 cFYI(1, ("End of search, empty dir")); 823 cFYI(1, "End of search, empty dir");
824 rc = 0; 824 rc = 0;
825 break; 825 break;
826 } 826 }
@@ -832,16 +832,16 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
832 rc = find_cifs_entry(xid, pTcon, file, 832 rc = find_cifs_entry(xid, pTcon, file,
833 &current_entry, &num_to_fill); 833 &current_entry, &num_to_fill);
834 if (rc) { 834 if (rc) {
835 cFYI(1, ("fce error %d", rc)); 835 cFYI(1, "fce error %d", rc);
836 goto rddir2_exit; 836 goto rddir2_exit;
837 } else if (current_entry != NULL) { 837 } else if (current_entry != NULL) {
838 cFYI(1, ("entry %lld found", file->f_pos)); 838 cFYI(1, "entry %lld found", file->f_pos);
839 } else { 839 } else {
840 cFYI(1, ("could not find entry")); 840 cFYI(1, "could not find entry");
841 goto rddir2_exit; 841 goto rddir2_exit;
842 } 842 }
843 cFYI(1, ("loop through %d times filling dir for net buf %p", 843 cFYI(1, "loop through %d times filling dir for net buf %p",
844 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start)); 844 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
845 max_len = smbCalcSize((struct smb_hdr *) 845 max_len = smbCalcSize((struct smb_hdr *)
846 cifsFile->srch_inf.ntwrk_buf_start); 846 cifsFile->srch_inf.ntwrk_buf_start);
847 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; 847 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
@@ -850,8 +850,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
850 for (i = 0; (i < num_to_fill) && (rc == 0); i++) { 850 for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
851 if (current_entry == NULL) { 851 if (current_entry == NULL) {
852 /* evaluate whether this case is an error */ 852 /* evaluate whether this case is an error */
853 cERROR(1, ("past SMB end, num to fill %d i %d", 853 cERROR(1, "past SMB end, num to fill %d i %d",
854 num_to_fill, i)); 854 num_to_fill, i);
855 break; 855 break;
856 } 856 }
857 /* if buggy server returns . and .. late do 857 /* if buggy server returns . and .. late do
@@ -866,8 +866,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
866 file->f_pos++; 866 file->f_pos++;
867 if (file->f_pos == 867 if (file->f_pos ==
868 cifsFile->srch_inf.index_of_last_entry) { 868 cifsFile->srch_inf.index_of_last_entry) {
869 cFYI(1, ("last entry in buf at pos %lld %s", 869 cFYI(1, "last entry in buf at pos %lld %s",
870 file->f_pos, tmp_buf)); 870 file->f_pos, tmp_buf);
871 cifs_save_resume_key(current_entry, cifsFile); 871 cifs_save_resume_key(current_entry, cifsFile);
872 break; 872 break;
873 } else 873 } else
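Beyond the message cleanup, the readdir.c hunks show the bounds discipline nxt_dir_entry() applies while walking variable-length FIND entries: an entry whose NextEntryOffset lands at or past end_of_smb, or whose fixed header would extend past it, is rejected, since the buffer contents come straight from the server. A simplified user-space sketch of that check (the entry layout here is hypothetical; the real code distinguishes FIND_FILE_STANDARD_INFO from FILE_DIRECTORY_INFO headers):

#include <stdint.h>
#include <stdio.h>

struct dir_entry {
	uint32_t next_entry_offset;	/* 0 means "last entry" */
	uint32_t file_name_length;
	/* name bytes follow */
};

static struct dir_entry *next_entry(struct dir_entry *cur,
				    const char *end_of_buf)
{
	char *next = (char *)cur + cur->next_entry_offset;

	if (cur->next_entry_offset == 0)
		return NULL;		/* end of chain */
	if (next >= end_of_buf)
		return NULL;		/* starts past the SMB */
	if (next + sizeof(*cur) > end_of_buf)
		return NULL;		/* header would overrun */
	return (struct dir_entry *)next;
}

int main(void)
{
	uint32_t buf[16] = { 0 };
	struct dir_entry *e = (struct dir_entry *)buf;
	const char *end = (const char *)buf + sizeof(buf);

	e->next_entry_offset = 16;	/* in range */
	printf("ok:       %p\n", (void *)next_entry(e, end));
	e->next_entry_offset = 200;	/* past end of buffer */
	printf("rejected: %p\n", (void *)next_entry(e, end));
	return 0;
}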
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index aaa9c1c5a5bd..7707389bdf2c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,14 +29,17 @@
29#include "ntlmssp.h" 29#include "ntlmssp.h"
30#include "nterr.h" 30#include "nterr.h"
31#include <linux/utsname.h> 31#include <linux/utsname.h>
32#include <linux/slab.h>
32#include "cifs_spnego.h" 33#include "cifs_spnego.h"
33 34
34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
35 unsigned char *p24); 36 unsigned char *p24);
36 37
37/* Checks if this is the first smb session to be reconnected after 38/*
38 the socket has been reestablished (so we know whether to use vc 0). 39 * Checks if this is the first smb session to be reconnected after
39 Called while holding the cifs_tcp_ses_lock, so do not block */ 40 * the socket has been reestablished (so we know whether to use vc 0).
41 * Called while holding the cifs_tcp_ses_lock, so do not block
42 */
40static bool is_first_ses_reconnect(struct cifsSesInfo *ses) 43static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
41{ 44{
42 struct list_head *tmp; 45 struct list_head *tmp;
@@ -283,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
283 int len; 286 int len;
284 char *data = *pbcc_area; 287 char *data = *pbcc_area;
285 288
286 cFYI(1, ("bleft %d", bleft)); 289 cFYI(1, "bleft %d", bleft);
287 290
288 /* 291 /*
289 * Windows servers do not always double null terminate their final 292 * Windows servers do not always double null terminate their final
@@ -300,7 +303,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
300 303
301 kfree(ses->serverOS); 304 kfree(ses->serverOS);
302 ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 305 ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
303 cFYI(1, ("serverOS=%s", ses->serverOS)); 306 cFYI(1, "serverOS=%s", ses->serverOS);
304 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; 307 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
305 data += len; 308 data += len;
306 bleft -= len; 309 bleft -= len;
@@ -309,7 +312,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
309 312
310 kfree(ses->serverNOS); 313 kfree(ses->serverNOS);
311 ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 314 ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
312 cFYI(1, ("serverNOS=%s", ses->serverNOS)); 315 cFYI(1, "serverNOS=%s", ses->serverNOS);
313 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; 316 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
314 data += len; 317 data += len;
315 bleft -= len; 318 bleft -= len;
@@ -318,7 +321,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
318 321
319 kfree(ses->serverDomain); 322 kfree(ses->serverDomain);
320 ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 323 ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
321 cFYI(1, ("serverDomain=%s", ses->serverDomain)); 324 cFYI(1, "serverDomain=%s", ses->serverDomain);
322 325
323 return; 326 return;
324} 327}
@@ -331,7 +334,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
331 int len; 334 int len;
332 char *bcc_ptr = *pbcc_area; 335 char *bcc_ptr = *pbcc_area;
333 336
334 cFYI(1, ("decode sessetup ascii. bleft %d", bleft)); 337 cFYI(1, "decode sessetup ascii. bleft %d", bleft);
335 338
336 len = strnlen(bcc_ptr, bleft); 339 len = strnlen(bcc_ptr, bleft);
337 if (len >= bleft) 340 if (len >= bleft)
@@ -343,7 +346,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
343 if (ses->serverOS) 346 if (ses->serverOS)
344 strncpy(ses->serverOS, bcc_ptr, len); 347 strncpy(ses->serverOS, bcc_ptr, len);
345 if (strncmp(ses->serverOS, "OS/2", 4) == 0) { 348 if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
346 cFYI(1, ("OS/2 server")); 349 cFYI(1, "OS/2 server");
347 ses->flags |= CIFS_SES_OS2; 350 ses->flags |= CIFS_SES_OS2;
348 } 351 }
349 352
@@ -372,7 +375,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
372 /* BB For newer servers which do not support Unicode, 375 /* BB For newer servers which do not support Unicode,
 373 but that do return the domain here, we could add parsing 376 but that do return the domain here, we could add parsing

374 for it later, but it is not very important */ 377 for it later, but it is not very important */
375 cFYI(1, ("ascii: bytes left %d", bleft)); 378 cFYI(1, "ascii: bytes left %d", bleft);
376 379
377 return rc; 380 return rc;
378} 381}
@@ -383,16 +386,16 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
383 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; 386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
384 387
385 if (blob_len < sizeof(CHALLENGE_MESSAGE)) { 388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
386 cERROR(1, ("challenge blob len %d too small", blob_len)); 389 cERROR(1, "challenge blob len %d too small", blob_len);
387 return -EINVAL; 390 return -EINVAL;
388 } 391 }
389 392
390 if (memcmp(pblob->Signature, "NTLMSSP", 8)) { 393 if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
391 cERROR(1, ("blob signature incorrect %s", pblob->Signature)); 394 cERROR(1, "blob signature incorrect %s", pblob->Signature);
392 return -EINVAL; 395 return -EINVAL;
393 } 396 }
394 if (pblob->MessageType != NtLmChallenge) { 397 if (pblob->MessageType != NtLmChallenge) {
395 cERROR(1, ("Incorrect message type %d", pblob->MessageType)); 398 cERROR(1, "Incorrect message type %d", pblob->MessageType);
396 return -EINVAL; 399 return -EINVAL;
397 } 400 }
398 401
@@ -446,7 +449,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
446 This function returns the length of the data in the blob */ 449 This function returns the length of the data in the blob */
447static int build_ntlmssp_auth_blob(unsigned char *pbuffer, 450static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
448 struct cifsSesInfo *ses, 451 struct cifsSesInfo *ses,
449 const struct nls_table *nls_cp, int first) 452 const struct nls_table *nls_cp, bool first)
450{ 453{
451 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; 454 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
452 __u32 flags; 455 __u32 flags;
@@ -545,7 +548,7 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
545 548
546static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB, 549static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
547 struct cifsSesInfo *ses, 550 struct cifsSesInfo *ses,
548 const struct nls_table *nls, int first_time) 551 const struct nls_table *nls, bool first_time)
549{ 552{
550 int bloblen; 553 int bloblen;
551 554
@@ -558,8 +561,8 @@ static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
558#endif 561#endif
559 562
560int 563int
561CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, 564CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
562 const struct nls_table *nls_cp) 565 const struct nls_table *nls_cp)
563{ 566{
564 int rc = 0; 567 int rc = 0;
565 int wct; 568 int wct;
@@ -576,13 +579,18 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
576 int bytes_remaining; 579 int bytes_remaining;
577 struct key *spnego_key = NULL; 580 struct key *spnego_key = NULL;
578 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 581 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
582 bool first_time;
579 583
580 if (ses == NULL) 584 if (ses == NULL)
581 return -EINVAL; 585 return -EINVAL;
582 586
587 read_lock(&cifs_tcp_ses_lock);
588 first_time = is_first_ses_reconnect(ses);
589 read_unlock(&cifs_tcp_ses_lock);
590
583 type = ses->server->secType; 591 type = ses->server->secType;
584 592
585 cFYI(1, ("sess setup type %d", type)); 593 cFYI(1, "sess setup type %d", type);
586ssetup_ntlmssp_authenticate: 594ssetup_ntlmssp_authenticate:
587 if (phase == NtLmChallenge) 595 if (phase == NtLmChallenge)
588 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ 596 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -663,7 +671,7 @@ ssetup_ntlmssp_authenticate:
663 changed to do higher than lanman dialect and 671 changed to do higher than lanman dialect and
664 we reconnected would we ever calc signing_key? */ 672 we reconnected would we ever calc signing_key? */
665 673
666 cFYI(1, ("Negotiating LANMAN setting up strings")); 674 cFYI(1, "Negotiating LANMAN setting up strings");
667 /* Unicode not allowed for LANMAN dialects */ 675 /* Unicode not allowed for LANMAN dialects */
668 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 676 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
669#endif 677#endif
@@ -743,7 +751,7 @@ ssetup_ntlmssp_authenticate:
743 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); 751 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
744 } else 752 } else
745 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 753 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
746 } else if (type == Kerberos || type == MSKerberos) { 754 } else if (type == Kerberos) {
747#ifdef CONFIG_CIFS_UPCALL 755#ifdef CONFIG_CIFS_UPCALL
748 struct cifs_spnego_msg *msg; 756 struct cifs_spnego_msg *msg;
749 spnego_key = cifs_get_spnego_key(ses); 757 spnego_key = cifs_get_spnego_key(ses);
@@ -757,17 +765,17 @@ ssetup_ntlmssp_authenticate:
757 /* check version field to make sure that cifs.upcall is 765 /* check version field to make sure that cifs.upcall is
758 sending us a response in an expected form */ 766 sending us a response in an expected form */
759 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { 767 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
760 cERROR(1, ("incorrect version of cifs.upcall (expected" 768 cERROR(1, "incorrect version of cifs.upcall (expected"
761 " %d but got %d)", 769 " %d but got %d)",
762 CIFS_SPNEGO_UPCALL_VERSION, msg->version)); 770 CIFS_SPNEGO_UPCALL_VERSION, msg->version);
763 rc = -EKEYREJECTED; 771 rc = -EKEYREJECTED;
764 goto ssetup_exit; 772 goto ssetup_exit;
765 } 773 }
766 /* bail out if key is too long */ 774 /* bail out if key is too long */
767 if (msg->sesskey_len > 775 if (msg->sesskey_len >
768 sizeof(ses->server->mac_signing_key.data.krb5)) { 776 sizeof(ses->server->mac_signing_key.data.krb5)) {
769 cERROR(1, ("Kerberos signing key too long (%u bytes)", 777 cERROR(1, "Kerberos signing key too long (%u bytes)",
770 msg->sesskey_len)); 778 msg->sesskey_len);
771 rc = -EOVERFLOW; 779 rc = -EOVERFLOW;
772 goto ssetup_exit; 780 goto ssetup_exit;
773 } 781 }
@@ -795,7 +803,7 @@ ssetup_ntlmssp_authenticate:
795 /* BB: is this right? */ 803 /* BB: is this right? */
796 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 804 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
797#else /* ! CONFIG_CIFS_UPCALL */ 805#else /* ! CONFIG_CIFS_UPCALL */
798 cERROR(1, ("Kerberos negotiated but upcall support disabled!")); 806 cERROR(1, "Kerberos negotiated but upcall support disabled!");
799 rc = -ENOSYS; 807 rc = -ENOSYS;
800 goto ssetup_exit; 808 goto ssetup_exit;
801#endif /* CONFIG_CIFS_UPCALL */ 809#endif /* CONFIG_CIFS_UPCALL */
@@ -803,12 +811,12 @@ ssetup_ntlmssp_authenticate:
803#ifdef CONFIG_CIFS_EXPERIMENTAL 811#ifdef CONFIG_CIFS_EXPERIMENTAL
804 if (type == RawNTLMSSP) { 812 if (type == RawNTLMSSP) {
805 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { 813 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
806 cERROR(1, ("NTLMSSP requires Unicode support")); 814 cERROR(1, "NTLMSSP requires Unicode support");
807 rc = -ENOSYS; 815 rc = -ENOSYS;
808 goto ssetup_exit; 816 goto ssetup_exit;
809 } 817 }
810 818
811 cFYI(1, ("ntlmssp session setup phase %d", phase)); 819 cFYI(1, "ntlmssp session setup phase %d", phase);
812 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 820 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
813 capabilities |= CAP_EXTENDED_SECURITY; 821 capabilities |= CAP_EXTENDED_SECURITY;
814 pSMB->req.Capabilities |= cpu_to_le32(capabilities); 822 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
@@ -826,7 +834,7 @@ ssetup_ntlmssp_authenticate:
826 on the response (challenge) */ 834 on the response (challenge) */
827 smb_buf->Uid = ses->Suid; 835 smb_buf->Uid = ses->Suid;
828 } else { 836 } else {
829 cERROR(1, ("invalid phase %d", phase)); 837 cERROR(1, "invalid phase %d", phase);
830 rc = -ENOSYS; 838 rc = -ENOSYS;
831 goto ssetup_exit; 839 goto ssetup_exit;
832 } 840 }
@@ -838,12 +846,12 @@ ssetup_ntlmssp_authenticate:
838 } 846 }
839 unicode_oslm_strings(&bcc_ptr, nls_cp); 847 unicode_oslm_strings(&bcc_ptr, nls_cp);
840 } else { 848 } else {
841 cERROR(1, ("secType %d not supported!", type)); 849 cERROR(1, "secType %d not supported!", type);
842 rc = -ENOSYS; 850 rc = -ENOSYS;
843 goto ssetup_exit; 851 goto ssetup_exit;
844 } 852 }
845#else 853#else
846 cERROR(1, ("secType %d not supported!", type)); 854 cERROR(1, "secType %d not supported!", type);
847 rc = -ENOSYS; 855 rc = -ENOSYS;
848 goto ssetup_exit; 856 goto ssetup_exit;
849#endif 857#endif
@@ -861,7 +869,7 @@ ssetup_ntlmssp_authenticate:
861 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); 869 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
862 /* SMB request buf freed in SendReceive2 */ 870 /* SMB request buf freed in SendReceive2 */
863 871
864 cFYI(1, ("ssetup rc from sendrecv2 is %d", rc)); 872 cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
865 873
866 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 874 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
867 smb_buf = (struct smb_hdr *)iov[0].iov_base; 875 smb_buf = (struct smb_hdr *)iov[0].iov_base;
@@ -869,7 +877,7 @@ ssetup_ntlmssp_authenticate:
869 if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError == 877 if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
870 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { 878 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
871 if (phase != NtLmNegotiate) { 879 if (phase != NtLmNegotiate) {
872 cERROR(1, ("Unexpected more processing error")); 880 cERROR(1, "Unexpected more processing error");
873 goto ssetup_exit; 881 goto ssetup_exit;
874 } 882 }
875 /* NTLMSSP Negotiate sent now processing challenge (response) */ 883 /* NTLMSSP Negotiate sent now processing challenge (response) */
@@ -881,14 +889,14 @@ ssetup_ntlmssp_authenticate:
881 889
882 if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { 890 if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
883 rc = -EIO; 891 rc = -EIO;
884 cERROR(1, ("bad word count %d", smb_buf->WordCount)); 892 cERROR(1, "bad word count %d", smb_buf->WordCount);
885 goto ssetup_exit; 893 goto ssetup_exit;
886 } 894 }
887 action = le16_to_cpu(pSMB->resp.Action); 895 action = le16_to_cpu(pSMB->resp.Action);
888 if (action & GUEST_LOGIN) 896 if (action & GUEST_LOGIN)
889 cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */ 897 cFYI(1, "Guest login"); /* BB mark SesInfo struct? */
890 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ 898 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
891 cFYI(1, ("UID = %d ", ses->Suid)); 899 cFYI(1, "UID = %d ", ses->Suid);
892 /* response can have either 3 or 4 word count - Samba sends 3 */ 900 /* response can have either 3 or 4 word count - Samba sends 3 */
893 /* and lanman response is 3 */ 901 /* and lanman response is 3 */
894 bytes_remaining = BCC(smb_buf); 902 bytes_remaining = BCC(smb_buf);
@@ -898,7 +906,7 @@ ssetup_ntlmssp_authenticate:
898 __u16 blob_len; 906 __u16 blob_len;
899 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); 907 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
900 if (blob_len > bytes_remaining) { 908 if (blob_len > bytes_remaining) {
901 cERROR(1, ("bad security blob length %d", blob_len)); 909 cERROR(1, "bad security blob length %d", blob_len);
902 rc = -EINVAL; 910 rc = -EINVAL;
903 goto ssetup_exit; 911 goto ssetup_exit;
904 } 912 }
@@ -932,7 +940,7 @@ ssetup_exit:
932 } 940 }
933 kfree(str_area); 941 kfree(str_area);
934 if (resp_buf_type == CIFS_SMALL_BUFFER) { 942 if (resp_buf_type == CIFS_SMALL_BUFFER) {
935 cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base)); 943 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
936 cifs_small_buf_release(iov[0].iov_base); 944 cifs_small_buf_release(iov[0].iov_base);
937 } else if (resp_buf_type == CIFS_LARGE_BUFFER) 945 } else if (resp_buf_type == CIFS_LARGE_BUFFER)
938 cifs_buf_release(iov[0].iov_base); 946 cifs_buf_release(iov[0].iov_base);
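The substantive change in the sess.c hunks is the dropped first_time parameter: CIFS_SessSetup() now computes it internally by calling is_first_ses_reconnect() with cifs_tcp_ses_lock held for reading, since that helper walks the server's session list and must see it stable (the same reason its comment says "do not block"). A user-space analogue of the pattern, with illustrative names and a pthread rwlock standing in for the kernel lock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t ses_list_lock = PTHREAD_RWLOCK_INITIALIZER;

/* stand-in for walking the per-server session list */
static bool is_first_reconnect_locked(void)
{
	return true;
}

static int sess_setup(void)
{
	bool first_time;

	/* sample the list state once, under the lock, instead of
	 * trusting a value the caller computed earlier */
	pthread_rwlock_rdlock(&ses_list_lock);
	first_time = is_first_reconnect_locked();
	pthread_rwlock_unlock(&ses_list_lock);

	/* ... first_time then decides whether to request VC 0 ... */
	return first_time ? 0 : 1;
}

int main(void)
{
	printf("sess_setup: %d\n", sess_setup());
	return 0;
}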
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 93fb09a99c69..192ea51af20f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -24,6 +24,7 @@
24*/ 24*/
25 25
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/slab.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/string.h> 29#include <linux/string.h>
29#include <linux/kernel.h> 30#include <linux/kernel.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 07b8e71544ee..82f78c4d6978 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/gfp.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/net.h> 27#include <linux/net.h>
27#include <linux/delay.h> 28#include <linux/delay.h>
@@ -34,7 +35,6 @@
34#include "cifs_debug.h" 35#include "cifs_debug.h"
35 36
36extern mempool_t *cifs_mid_poolp; 37extern mempool_t *cifs_mid_poolp;
37extern struct kmem_cache *cifs_oplock_cachep;
38 38
39static struct mid_q_entry * 39static struct mid_q_entry *
40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) 40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
@@ -42,7 +42,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
42 struct mid_q_entry *temp; 42 struct mid_q_entry *temp;
43 43
44 if (server == NULL) { 44 if (server == NULL) {
45 cERROR(1, ("Null TCP session in AllocMidQEntry")); 45 cERROR(1, "Null TCP session in AllocMidQEntry");
46 return NULL; 46 return NULL;
47 } 47 }
48 48
@@ -54,7 +54,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
54 temp->mid = smb_buffer->Mid; /* always LE */ 54 temp->mid = smb_buffer->Mid; /* always LE */
55 temp->pid = current->pid; 55 temp->pid = current->pid;
56 temp->command = smb_buffer->Command; 56 temp->command = smb_buffer->Command;
57 cFYI(1, ("For smb_command %d", temp->command)); 57 cFYI(1, "For smb_command %d", temp->command);
58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ 58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
59 /* when mid allocated can be before when sent */ 59 /* when mid allocated can be before when sent */
60 temp->when_alloc = jiffies; 60 temp->when_alloc = jiffies;
@@ -139,7 +139,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
139 total_len += iov[i].iov_len; 139 total_len += iov[i].iov_len;
140 140
141 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length); 141 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
142 cFYI(1, ("Sending smb: total_len %d", total_len)); 142 cFYI(1, "Sending smb: total_len %d", total_len);
143 dump_smb(smb_buffer, len); 143 dump_smb(smb_buffer, len);
144 144
145 i = 0; 145 i = 0;
@@ -167,9 +167,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
167 reconnect which may clear the network problem. 167 reconnect which may clear the network problem.
168 */ 168 */
169 if ((i >= 14) || (!server->noblocksnd && (i > 2))) { 169 if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
170 cERROR(1, 170 cERROR(1, "sends on sock %p stuck for 15 seconds",
171 ("sends on sock %p stuck for 15 seconds", 171 ssocket);
172 ssocket));
173 rc = -EAGAIN; 172 rc = -EAGAIN;
174 break; 173 break;
175 } 174 }
@@ -183,13 +182,13 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
183 total_len = 0; 182 total_len = 0;
184 break; 183 break;
185 } else if (rc > total_len) { 184 } else if (rc > total_len) {
186 cERROR(1, ("sent %d requested %d", rc, total_len)); 185 cERROR(1, "sent %d requested %d", rc, total_len);
187 break; 186 break;
188 } 187 }
189 if (rc == 0) { 188 if (rc == 0) {
190 /* should never happen, letting socket clear before 189 /* should never happen, letting socket clear before
191 retrying is our only obvious option here */ 190 retrying is our only obvious option here */
192 cERROR(1, ("tcp sent no data")); 191 cERROR(1, "tcp sent no data");
193 msleep(500); 192 msleep(500);
194 continue; 193 continue;
195 } 194 }
@@ -212,8 +211,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
212 } 211 }
213 212
214 if ((total_len > 0) && (total_len != smb_buf_length + 4)) { 213 if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
215 cFYI(1, ("partial send (%d remaining), terminating session", 214 cFYI(1, "partial send (%d remaining), terminating session",
216 total_len)); 215 total_len);
217 /* If we have only sent part of an SMB then the next SMB 216 /* If we have only sent part of an SMB then the next SMB
218 could be taken as the remainder of this one. We need 217 could be taken as the remainder of this one. We need
219 to kill the socket so the server throws away the partial 218 to kill the socket so the server throws away the partial
@@ -222,7 +221,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
222 } 221 }
223 222
224 if (rc < 0) { 223 if (rc < 0) {
225 cERROR(1, ("Error %d sending data on socket to server", rc)); 224 cERROR(1, "Error %d sending data on socket to server", rc);
226 } else 225 } else
227 rc = 0; 226 rc = 0;
228 227
@@ -295,7 +294,7 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
295 } 294 }
296 295
297 if (ses->server->tcpStatus == CifsNeedReconnect) { 296 if (ses->server->tcpStatus == CifsNeedReconnect) {
298 cFYI(1, ("tcp session dead - return to caller to retry")); 297 cFYI(1, "tcp session dead - return to caller to retry");
299 return -EAGAIN; 298 return -EAGAIN;
300 } 299 }
301 300
@@ -347,7 +346,7 @@ static int wait_for_response(struct cifsSesInfo *ses,
347 lrt += time_to_wait; 346 lrt += time_to_wait;
348 if (time_after(jiffies, lrt)) { 347 if (time_after(jiffies, lrt)) {
349 /* No replies for time_to_wait. */ 348 /* No replies for time_to_wait. */
350 cERROR(1, ("server not responding")); 349 cERROR(1, "server not responding");
351 return -1; 350 return -1;
352 } 351 }
353 } else { 352 } else {
@@ -378,7 +377,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
378 iov[0].iov_len = in_buf->smb_buf_length + 4; 377 iov[0].iov_len = in_buf->smb_buf_length + 4;
379 flags |= CIFS_NO_RESP; 378 flags |= CIFS_NO_RESP;
380 rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); 379 rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
381 cFYI(DBG2, ("SendRcvNoRsp flags %d rc %d", flags, rc)); 380 cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
382 381
383 return rc; 382 return rc;
384} 383}
@@ -401,7 +400,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
401 400
402 if ((ses == NULL) || (ses->server == NULL)) { 401 if ((ses == NULL) || (ses->server == NULL)) {
403 cifs_small_buf_release(in_buf); 402 cifs_small_buf_release(in_buf);
404 cERROR(1, ("Null session")); 403 cERROR(1, "Null session");
405 return -EIO; 404 return -EIO;
406 } 405 }
407 406
@@ -470,7 +469,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
470 else if (long_op == CIFS_BLOCKING_OP) 469 else if (long_op == CIFS_BLOCKING_OP)
471 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */ 470 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
472 else { 471 else {
473 cERROR(1, ("unknown timeout flag %d", long_op)); 472 cERROR(1, "unknown timeout flag %d", long_op);
474 rc = -EIO; 473 rc = -EIO;
475 goto out; 474 goto out;
476 } 475 }
@@ -489,8 +488,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
489 spin_lock(&GlobalMid_Lock); 488 spin_lock(&GlobalMid_Lock);
490 489
491 if (midQ->resp_buf == NULL) { 490 if (midQ->resp_buf == NULL) {
492 cERROR(1, ("No response to cmd %d mid %d", 491 cERROR(1, "No response to cmd %d mid %d",
493 midQ->command, midQ->mid)); 492 midQ->command, midQ->mid);
494 if (midQ->midState == MID_REQUEST_SUBMITTED) { 493 if (midQ->midState == MID_REQUEST_SUBMITTED) {
495 if (ses->server->tcpStatus == CifsExiting) 494 if (ses->server->tcpStatus == CifsExiting)
496 rc = -EHOSTDOWN; 495 rc = -EHOSTDOWN;
@@ -503,7 +502,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
503 if (rc != -EHOSTDOWN) { 502 if (rc != -EHOSTDOWN) {
504 if (midQ->midState == MID_RETRY_NEEDED) { 503 if (midQ->midState == MID_RETRY_NEEDED) {
505 rc = -EAGAIN; 504 rc = -EAGAIN;
506 cFYI(1, ("marking request for retry")); 505 cFYI(1, "marking request for retry");
507 } else { 506 } else {
508 rc = -EIO; 507 rc = -EIO;
509 } 508 }
@@ -520,8 +519,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
520 receive_len = midQ->resp_buf->smb_buf_length; 519 receive_len = midQ->resp_buf->smb_buf_length;
521 520
522 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 521 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
523 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 522 cERROR(1, "Frame too large received. Length: %d Xid: %d",
524 receive_len, xid)); 523 receive_len, xid);
525 rc = -EIO; 524 rc = -EIO;
526 goto out; 525 goto out;
527 } 526 }
@@ -547,7 +546,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
547 &ses->server->mac_signing_key, 546 &ses->server->mac_signing_key,
548 midQ->sequence_number+1); 547 midQ->sequence_number+1);
549 if (rc) { 548 if (rc) {
550 cERROR(1, ("Unexpected SMB signature")); 549 cERROR(1, "Unexpected SMB signature");
551 /* BB FIXME add code to kill session */ 550 /* BB FIXME add code to kill session */
552 } 551 }
553 } 552 }
@@ -568,7 +567,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
568 DeleteMidQEntry */ 567 DeleteMidQEntry */
569 } else { 568 } else {
570 rc = -EIO; 569 rc = -EIO;
571 cFYI(1, ("Bad MID state?")); 570 cFYI(1, "Bad MID state?");
572 } 571 }
573 572
574out: 573out:
@@ -590,11 +589,11 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
590 struct mid_q_entry *midQ; 589 struct mid_q_entry *midQ;
591 590
592 if (ses == NULL) { 591 if (ses == NULL) {
593 cERROR(1, ("Null smb session")); 592 cERROR(1, "Null smb session");
594 return -EIO; 593 return -EIO;
595 } 594 }
596 if (ses->server == NULL) { 595 if (ses->server == NULL) {
597 cERROR(1, ("Null tcp session")); 596 cERROR(1, "Null tcp session");
598 return -EIO; 597 return -EIO;
599 } 598 }
600 599
@@ -606,8 +605,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
606 use ses->maxReq */ 605 use ses->maxReq */
607 606
608 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 607 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
609 cERROR(1, ("Illegal length, greater than maximum frame, %d", 608 cERROR(1, "Illegal length, greater than maximum frame, %d",
610 in_buf->smb_buf_length)); 609 in_buf->smb_buf_length);
611 return -EIO; 610 return -EIO;
612 } 611 }
613 612
@@ -664,7 +663,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
664 else if (long_op == CIFS_BLOCKING_OP) 663 else if (long_op == CIFS_BLOCKING_OP)
 665 timeout = 0x7FFFFFFF; /* large but not so large as to wrap */ 664 timeout = 0x7FFFFFFF; /* large but not so large as to wrap */
666 else { 665 else {
667 cERROR(1, ("unknown timeout flag %d", long_op)); 666 cERROR(1, "unknown timeout flag %d", long_op);
668 rc = -EIO; 667 rc = -EIO;
669 goto out; 668 goto out;
670 } 669 }
@@ -680,8 +679,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
680 679
681 spin_lock(&GlobalMid_Lock); 680 spin_lock(&GlobalMid_Lock);
682 if (midQ->resp_buf == NULL) { 681 if (midQ->resp_buf == NULL) {
683 cERROR(1, ("No response for cmd %d mid %d", 682 cERROR(1, "No response for cmd %d mid %d",
684 midQ->command, midQ->mid)); 683 midQ->command, midQ->mid);
685 if (midQ->midState == MID_REQUEST_SUBMITTED) { 684 if (midQ->midState == MID_REQUEST_SUBMITTED) {
686 if (ses->server->tcpStatus == CifsExiting) 685 if (ses->server->tcpStatus == CifsExiting)
687 rc = -EHOSTDOWN; 686 rc = -EHOSTDOWN;
@@ -694,7 +693,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
694 if (rc != -EHOSTDOWN) { 693 if (rc != -EHOSTDOWN) {
695 if (midQ->midState == MID_RETRY_NEEDED) { 694 if (midQ->midState == MID_RETRY_NEEDED) {
696 rc = -EAGAIN; 695 rc = -EAGAIN;
697 cFYI(1, ("marking request for retry")); 696 cFYI(1, "marking request for retry");
698 } else { 697 } else {
699 rc = -EIO; 698 rc = -EIO;
700 } 699 }
@@ -711,8 +710,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
711 receive_len = midQ->resp_buf->smb_buf_length; 710 receive_len = midQ->resp_buf->smb_buf_length;
712 711
713 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 712 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
714 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 713 cERROR(1, "Frame too large received. Length: %d Xid: %d",
715 receive_len, xid)); 714 receive_len, xid);
716 rc = -EIO; 715 rc = -EIO;
717 goto out; 716 goto out;
718 } 717 }
@@ -735,7 +734,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
735 &ses->server->mac_signing_key, 734 &ses->server->mac_signing_key,
736 midQ->sequence_number+1); 735 midQ->sequence_number+1);
737 if (rc) { 736 if (rc) {
738 cERROR(1, ("Unexpected SMB signature")); 737 cERROR(1, "Unexpected SMB signature");
739 /* BB FIXME add code to kill session */ 738 /* BB FIXME add code to kill session */
740 } 739 }
741 } 740 }
@@ -752,7 +751,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
752 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 751 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
753 } else { 752 } else {
754 rc = -EIO; 753 rc = -EIO;
755 cERROR(1, ("Bad MID state?")); 754 cERROR(1, "Bad MID state?");
756 } 755 }
757 756
758out: 757out:
@@ -823,13 +822,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
823 struct cifsSesInfo *ses; 822 struct cifsSesInfo *ses;
824 823
825 if (tcon == NULL || tcon->ses == NULL) { 824 if (tcon == NULL || tcon->ses == NULL) {
826 cERROR(1, ("Null smb session")); 825 cERROR(1, "Null smb session");
827 return -EIO; 826 return -EIO;
828 } 827 }
829 ses = tcon->ses; 828 ses = tcon->ses;
830 829
831 if (ses->server == NULL) { 830 if (ses->server == NULL) {
832 cERROR(1, ("Null tcp session")); 831 cERROR(1, "Null tcp session");
833 return -EIO; 832 return -EIO;
834 } 833 }
835 834
@@ -841,8 +840,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
841 use ses->maxReq */ 840 use ses->maxReq */
842 841
843 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 842 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
844 cERROR(1, ("Illegal length, greater than maximum frame, %d", 843 cERROR(1, "Illegal length, greater than maximum frame, %d",
845 in_buf->smb_buf_length)); 844 in_buf->smb_buf_length);
846 return -EIO; 845 return -EIO;
847 } 846 }
848 847
@@ -932,8 +931,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
932 spin_unlock(&GlobalMid_Lock); 931 spin_unlock(&GlobalMid_Lock);
933 receive_len = midQ->resp_buf->smb_buf_length; 932 receive_len = midQ->resp_buf->smb_buf_length;
934 } else { 933 } else {
935 cERROR(1, ("No response for cmd %d mid %d", 934 cERROR(1, "No response for cmd %d mid %d",
936 midQ->command, midQ->mid)); 935 midQ->command, midQ->mid);
937 if (midQ->midState == MID_REQUEST_SUBMITTED) { 936 if (midQ->midState == MID_REQUEST_SUBMITTED) {
938 if (ses->server->tcpStatus == CifsExiting) 937 if (ses->server->tcpStatus == CifsExiting)
939 rc = -EHOSTDOWN; 938 rc = -EHOSTDOWN;
@@ -946,7 +945,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
946 if (rc != -EHOSTDOWN) { 945 if (rc != -EHOSTDOWN) {
947 if (midQ->midState == MID_RETRY_NEEDED) { 946 if (midQ->midState == MID_RETRY_NEEDED) {
948 rc = -EAGAIN; 947 rc = -EAGAIN;
949 cFYI(1, ("marking request for retry")); 948 cFYI(1, "marking request for retry");
950 } else { 949 } else {
951 rc = -EIO; 950 rc = -EIO;
952 } 951 }
@@ -957,8 +956,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
957 } 956 }
958 957
959 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 958 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
960 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 959 cERROR(1, "Frame too large received. Length: %d Xid: %d",
961 receive_len, xid)); 960 receive_len, xid);
962 rc = -EIO; 961 rc = -EIO;
963 goto out; 962 goto out;
964 } 963 }
@@ -967,7 +966,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
967 966
968 if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) { 967 if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
969 rc = -EIO; 968 rc = -EIO;
970 cERROR(1, ("Bad MID state?")); 969 cERROR(1, "Bad MID state?");
971 goto out; 970 goto out;
972 } 971 }
973 972
@@ -985,7 +984,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
985 &ses->server->mac_signing_key, 984 &ses->server->mac_signing_key,
986 midQ->sequence_number+1); 985 midQ->sequence_number+1);
987 if (rc) { 986 if (rc) {
988 cERROR(1, ("Unexpected SMB signature")); 987 cERROR(1, "Unexpected SMB signature");
989 /* BB FIXME add code to kill session */ 988 /* BB FIXME add code to kill session */
990 } 989 }
991 } 990 }
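smb_sendv() above retries busy sends but bounds the retries: after enough consecutive failures ("sends on sock %p stuck for 15 seconds") it returns -EAGAIN so the caller can tear the session down and reconnect, and a zero-byte send ("tcp sent no data") just sleeps 500 ms before retrying. A user-space sketch of that policy (helper name and fixed sleep illustrative; the kernel backs off exponentially with msleep(1 << i)):

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static ssize_t send_fully(int fd, const char *buf, size_t len)
{
	size_t sent = 0;
	int busy = 0;

	while (sent < len) {
		ssize_t rc = write(fd, buf + sent, len - sent);

		if (rc < 0 && (errno == EAGAIN || errno == EINTR)) {
			if (++busy >= 14)
				return -EAGAIN;	/* caller reconnects */
			usleep(500 * 1000);	/* let the socket drain */
			continue;
		}
		if (rc < 0)
			return -errno;
		if (rc == 0) {
			/* "should never happen": wait for the socket
			 * to clear, as the kernel code does */
			usleep(500 * 1000);
			continue;
		}
		busy = 0;
		sent += (size_t)rc;
	}
	return (ssize_t)sent;
}

int main(void)
{
	static const char msg[] = "hello\n";

	return send_fully(STDOUT_FILENO, msg, sizeof(msg) - 1) < 0;
}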
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 3e2ef0de1209..a1509207bfa6 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
@@ -69,12 +70,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
69 return rc; 70 return rc;
70 } 71 }
71 if (ea_name == NULL) { 72 if (ea_name == NULL) {
72 cFYI(1, ("Null xattr names not supported")); 73 cFYI(1, "Null xattr names not supported");
73 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) 74 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
74 && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) { 75 && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
75 cFYI(1, 76 cFYI(1,
76 ("illegal xattr request %s (only user namespace supported)", 77 "illegal xattr request %s (only user namespace supported)",
77 ea_name)); 78 ea_name);
78 /* BB what if no namespace prefix? */ 79 /* BB what if no namespace prefix? */
79 /* Should we just pass them to server, except for 80 /* Should we just pass them to server, except for
80 system and perhaps security prefixes? */ 81 system and perhaps security prefixes? */
@@ -130,19 +131,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
130 search server for EAs or streams to 131 search server for EAs or streams to
131 returns as xattrs */ 132 returns as xattrs */
132 if (value_size > MAX_EA_VALUE_SIZE) { 133 if (value_size > MAX_EA_VALUE_SIZE) {
133 cFYI(1, ("size of EA value too large")); 134 cFYI(1, "size of EA value too large");
134 kfree(full_path); 135 kfree(full_path);
135 FreeXid(xid); 136 FreeXid(xid);
136 return -EOPNOTSUPP; 137 return -EOPNOTSUPP;
137 } 138 }
138 139
139 if (ea_name == NULL) { 140 if (ea_name == NULL) {
140 cFYI(1, ("Null xattr names not supported")); 141 cFYI(1, "Null xattr names not supported");
141 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { 142 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
142 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 143 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
143 goto set_ea_exit; 144 goto set_ea_exit;
144 if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) 145 if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
145 cFYI(1, ("attempt to set cifs inode metadata")); 146 cFYI(1, "attempt to set cifs inode metadata");
146 147
147 ea_name += 5; /* skip past user. prefix */ 148 ea_name += 5; /* skip past user. prefix */
148 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, 149 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
@@ -168,9 +169,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
168 ACL_TYPE_ACCESS, cifs_sb->local_nls, 169 ACL_TYPE_ACCESS, cifs_sb->local_nls,
169 cifs_sb->mnt_cifs_flags & 170 cifs_sb->mnt_cifs_flags &
170 CIFS_MOUNT_MAP_SPECIAL_CHR); 171 CIFS_MOUNT_MAP_SPECIAL_CHR);
171 cFYI(1, ("set POSIX ACL rc %d", rc)); 172 cFYI(1, "set POSIX ACL rc %d", rc);
172#else 173#else
173 cFYI(1, ("set POSIX ACL not supported")); 174 cFYI(1, "set POSIX ACL not supported");
174#endif 175#endif
175 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, 176 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
176 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { 177 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -181,13 +182,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
181 ACL_TYPE_DEFAULT, cifs_sb->local_nls, 182 ACL_TYPE_DEFAULT, cifs_sb->local_nls,
182 cifs_sb->mnt_cifs_flags & 183 cifs_sb->mnt_cifs_flags &
183 CIFS_MOUNT_MAP_SPECIAL_CHR); 184 CIFS_MOUNT_MAP_SPECIAL_CHR);
184 cFYI(1, ("set POSIX default ACL rc %d", rc)); 185 cFYI(1, "set POSIX default ACL rc %d", rc);
185#else 186#else
186 cFYI(1, ("set default POSIX ACL not supported")); 187 cFYI(1, "set default POSIX ACL not supported");
187#endif 188#endif
188 } else { 189 } else {
189 cFYI(1, ("illegal xattr request %s (only user namespace" 190 cFYI(1, "illegal xattr request %s (only user namespace"
190 " supported)", ea_name)); 191 " supported)", ea_name);
191 /* BB what if no namespace prefix? */ 192 /* BB what if no namespace prefix? */
192 /* Should we just pass them to server, except for 193 /* Should we just pass them to server, except for
193 system and perhaps security prefixes? */ 194 system and perhaps security prefixes? */
@@ -234,13 +235,13 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
234 /* return dos attributes as pseudo xattr */ 235 /* return dos attributes as pseudo xattr */
235 /* return alt name if available as pseudo attr */ 236 /* return alt name if available as pseudo attr */
236 if (ea_name == NULL) { 237 if (ea_name == NULL) {
237 cFYI(1, ("Null xattr names not supported")); 238 cFYI(1, "Null xattr names not supported");
238 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { 239 } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
239 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 240 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
240 goto get_ea_exit; 241 goto get_ea_exit;
241 242
242 if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) { 243 if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
243 cFYI(1, ("attempt to query cifs inode metadata")); 244 cFYI(1, "attempt to query cifs inode metadata");
244 /* revalidate/getattr then populate from inode */ 245 /* revalidate/getattr then populate from inode */
245 } /* BB add else when above is implemented */ 246 } /* BB add else when above is implemented */
246 ea_name += 5; /* skip past user. prefix */ 247 ea_name += 5; /* skip past user. prefix */
@@ -286,7 +287,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
286 } 287 }
287#endif /* EXPERIMENTAL */ 288#endif /* EXPERIMENTAL */
288#else 289#else
289 cFYI(1, ("query POSIX ACL not supported yet")); 290 cFYI(1, "query POSIX ACL not supported yet");
290#endif /* CONFIG_CIFS_POSIX */ 291#endif /* CONFIG_CIFS_POSIX */
291 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, 292 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
292 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { 293 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -298,18 +299,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
298 cifs_sb->mnt_cifs_flags & 299 cifs_sb->mnt_cifs_flags &
299 CIFS_MOUNT_MAP_SPECIAL_CHR); 300 CIFS_MOUNT_MAP_SPECIAL_CHR);
300#else 301#else
301 cFYI(1, ("query POSIX default ACL not supported yet")); 302 cFYI(1, "query POSIX default ACL not supported yet");
302#endif 303#endif
303 } else if (strncmp(ea_name, 304 } else if (strncmp(ea_name,
304 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { 305 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
305 cFYI(1, ("Trusted xattr namespace not supported yet")); 306 cFYI(1, "Trusted xattr namespace not supported yet");
306 } else if (strncmp(ea_name, 307 } else if (strncmp(ea_name,
307 CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { 308 CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
308 cFYI(1, ("Security xattr namespace not supported yet")); 309 cFYI(1, "Security xattr namespace not supported yet");
309 } else 310 } else
310 cFYI(1, 311 cFYI(1,
311 ("illegal xattr request %s (only user namespace supported)", 312 "illegal xattr request %s (only user namespace supported)",
312 ea_name)); 313 ea_name);
313 314
314 /* We could add an additional check for streams ie 315 /* We could add an additional check for streams ie
315 if proc/fs/cifs/streamstoxattr is set then 316 if proc/fs/cifs/streamstoxattr is set then
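The cFYI churn above is mechanical: the macro changed from taking one parenthesized printk-style argument to being variadic, so every call site drops a pair of parentheses. A sketch of the two shapes, assuming simplified definitions (the real ones in fs/cifs/cifs_debug.h carry extra prefixes and config guards):

/* Old shape: the caller supplies its own parenthesized argument list,
 * hence cFYI(1, ("size of EA value too large")). */
#define cFYI_OLD(set, arg)			\
do {						\
	if (set)				\
		printk arg;			\
} while (0)

/* New shape: a variadic macro, so the extra parentheses go away:
 * cFYI(1, "size of EA value too large"). */
#define cFYI_NEW(set, fmt, arg...)			\
do {							\
	if (set)					\
		printk(KERN_DEBUG fmt "\n", ##arg);	\
} while (0)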
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index d99860a33890..6b443ff43a19 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,8 +11,7 @@ extern int coda_fake_statfs;
11 11
12void coda_destroy_inodecache(void); 12void coda_destroy_inodecache(void);
13int coda_init_inodecache(void); 13int coda_init_inodecache(void);
14int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, 14int coda_fsync(struct file *coda_file, int datasync);
15 int datasync);
16void coda_sysctl_init(void); 15void coda_sysctl_init(void);
17void coda_sysctl_clean(void); 16void coda_sysctl_clean(void);
18 17
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4bb9d0a5decc..ccd98b0f2b0b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -12,6 +12,7 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/slab.h>
15#include <linux/file.h> 16#include <linux/file.h>
16#include <linux/stat.h> 17#include <linux/stat.h>
17#include <linux/errno.h> 18#include <linux/errno.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ffd42815fda1..ad3cd2abeeb4 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21 22
22#include <linux/coda.h> 23#include <linux/coda.h>
@@ -201,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
201 return 0; 202 return 0;
202} 203}
203 204
204int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) 205int coda_fsync(struct file *coda_file, int datasync)
205{ 206{
206 struct file *host_file; 207 struct file *host_file;
207 struct inode *coda_inode = coda_dentry->d_inode; 208 struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
208 struct coda_file_info *cfi; 209 struct coda_file_info *cfi;
209 int err = 0; 210 int err = 0;
210 211
@@ -216,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
216 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 217 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
217 host_file = cfi->cfi_container; 218 host_file = cfi->cfi_container;
218 219
219 err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); 220 err = vfs_fsync(host_file, datasync);
220 if ( !err && !datasync ) { 221 if ( !err && !datasync ) {
221 lock_kernel(); 222 lock_kernel();
222 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
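The coda_fsync() change above tracks a tree-wide ->fsync() API cleanup: the dentry argument is gone, so implementations recover the inode from the struct file itself. A minimal sketch of the post-change convention; example_sync_inode() is a hypothetical metadata-flush helper:

static int example_fsync(struct file *file, int datasync)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	int err;

	/* flush dirty data pages first */
	err = filemap_write_and_wait(inode->i_mapping);

	/* a datasync request may skip pure metadata updates */
	if (!err && !datasync)
		err = example_sync_inode(inode);	/* hypothetical */
	return err;
}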
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 830f51abb971..d97f9935a028 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -18,6 +18,7 @@
18#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
19#include <linux/file.h> 19#include <linux/file.h>
20#include <linux/vfs.h> 20#include <linux/vfs.h>
21#include <linux/slab.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
@@ -166,6 +167,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
166 return -EBUSY; 167 return -EBUSY;
167 } 168 }
168 169
170 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
171 if (error)
172 goto bdi_err;
173
169 vc->vc_sb = sb; 174 vc->vc_sb = sb;
170 175
171 sb->s_fs_info = vc; 176 sb->s_fs_info = vc;
@@ -174,6 +179,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
174 sb->s_blocksize_bits = 12; 179 sb->s_blocksize_bits = 12;
175 sb->s_magic = CODA_SUPER_MAGIC; 180 sb->s_magic = CODA_SUPER_MAGIC;
176 sb->s_op = &coda_super_operations; 181 sb->s_op = &coda_super_operations;
182 sb->s_bdi = &vc->bdi;
177 183
178 /* get root fid from Venus: this needs the root inode */ 184 /* get root fid from Venus: this needs the root inode */
179 error = venus_rootfid(sb, &fid); 185 error = venus_rootfid(sb, &fid);
@@ -199,6 +205,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
199 return 0; 205 return 0;
200 206
201 error: 207 error:
208 bdi_destroy(&vc->bdi);
209 bdi_err:
202 if (root) 210 if (root)
203 iput(root); 211 iput(root);
204 if (vc) 212 if (vc)
@@ -209,6 +217,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
209 217
210static void coda_put_super(struct super_block *sb) 218static void coda_put_super(struct super_block *sb)
211{ 219{
220 bdi_destroy(&coda_vcp(sb)->bdi);
212 coda_vcp(sb)->vc_sb = NULL; 221 coda_vcp(sb)->vc_sb = NULL;
213 sb->s_fs_info = NULL; 222 sb->s_fs_info = NULL;
214 223
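Coda has no backing block device, so the hunks above give each mount a private backing_dev_info for writeback. A condensed sketch of the lifecycle, assuming vc is the per-mount venus_comm and example_setup_root() stands in for the root-fid and root-inode work; ->put_super must call bdi_destroy() as well, as the hunk above does:

static int example_fill_super(struct super_block *sb, struct venus_comm *vc)
{
	int error;

	error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
	if (error)
		return error;		/* nothing registered yet */

	sb->s_fs_info = vc;
	sb->s_bdi = &vc->bdi;		/* route writeback to this bdi */

	error = example_setup_root(sb);	/* hypothetical */
	if (error)
		goto out_bdi;		/* undo the registration */
	return 0;

out_bdi:
	bdi_destroy(&vc->bdi);
	return error;
}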
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa06..ca25d96d45c9 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Pioctl operations for Coda. 2 * Pioctl operations for Coda.
3 * Original version: (C) 1996 Peter Braam 3 * Original version: (C) 1996 Peter Braam
4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University 4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
5 * 5 *
6 * Carnegie Mellon encourages users of this code to contribute improvements 6 * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
23#include <linux/coda_fs_i.h> 23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
25 25
26#include <linux/smp_lock.h>
27
26/* pioctl ops */ 28/* pioctl ops */
27static int coda_ioctl_permission(struct inode *inode, int mask); 29static int coda_ioctl_permission(struct inode *inode, int mask);
28static int coda_pioctl(struct inode * inode, struct file * filp, 30static long coda_pioctl(struct file *filp, unsigned int cmd,
29 unsigned int cmd, unsigned long user_data); 31 unsigned long user_data);
30 32
31/* exported from this file */ 33/* exported from this file */
32const struct inode_operations coda_ioctl_inode_operations = 34const struct inode_operations coda_ioctl_inode_operations = {
33{
34 .permission = coda_ioctl_permission, 35 .permission = coda_ioctl_permission,
35 .setattr = coda_setattr, 36 .setattr = coda_setattr,
36}; 37};
37 38
38const struct file_operations coda_ioctl_operations = { 39const struct file_operations coda_ioctl_operations = {
39 .owner = THIS_MODULE, 40 .owner = THIS_MODULE,
40 .ioctl = coda_pioctl, 41 .unlocked_ioctl = coda_pioctl,
41}; 42};
42 43
43/* the coda pioctl inode ops */ 44/* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
46 return (mask & MAY_EXEC) ? -EACCES : 0; 47 return (mask & MAY_EXEC) ? -EACCES : 0;
47} 48}
48 49
49static int coda_pioctl(struct inode * inode, struct file * filp, 50static long coda_pioctl(struct file *filp, unsigned int cmd,
50 unsigned int cmd, unsigned long user_data) 51 unsigned long user_data)
51{ 52{
52 struct path path; 53 struct path path;
53 int error; 54 int error;
54 struct PioctlData data; 55 struct PioctlData data;
55 struct inode *target_inode = NULL; 56 struct inode *inode = filp->f_dentry->d_inode;
56 struct coda_inode_info *cnp; 57 struct inode *target_inode = NULL;
58 struct coda_inode_info *cnp;
57 59
58 /* get the Pioctl data arguments from user space */ 60 lock_kernel();
59 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { 61
60 return -EINVAL; 62 /* get the Pioctl data arguments from user space */
61 } 63 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
62 64 error = -EINVAL;
63 /* 65 goto out;
64 * Look up the pathname. Note that the pathname is in
65 * user memory, and namei takes care of this
66 */
67 if (data.follow) {
68 error = user_path(data.path, &path);
69 } else {
70 error = user_lpath(data.path, &path);
71 } 66 }
72 67
73 if ( error ) { 68 /*
74 return error; 69 * Look up the pathname. Note that the pathname is in
75 } else { 70 * user memory, and namei takes care of this
71 */
72 if (data.follow)
73 error = user_path(data.path, &path);
74 else
75 error = user_lpath(data.path, &path);
76
77 if (error)
78 goto out;
79 else
76 target_inode = path.dentry->d_inode; 80 target_inode = path.dentry->d_inode;
77 } 81
78
79 /* return if it is not a Coda inode */ 82 /* return if it is not a Coda inode */
80 if ( target_inode->i_sb != inode->i_sb ) { 83 if (target_inode->i_sb != inode->i_sb) {
81 path_put(&path); 84 path_put(&path);
82 return -EINVAL; 85 error = -EINVAL;
86 goto out;
83 } 87 }
84 88
85 /* now proceed to make the upcall */ 89 /* now proceed to make the upcall */
86 cnp = ITOC(target_inode); 90 cnp = ITOC(target_inode);
87 91
88 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); 92 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
89 93
90 path_put(&path); 94 path_put(&path);
91 return error;
92}
93 95
96out:
97 unlock_kernel();
98 return error;
99}
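This is the standard recipe for retiring the locked ->ioctl hook, also applied to coda_psdev_ioctl() below: switch to .unlocked_ioctl, recover the inode from the file, and take the BKL explicitly so behaviour is unchanged while the VFS stops acquiring it. A generic sketch; example_do_ioctl() is hypothetical:

#include <linux/smp_lock.h>

static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
				   unsigned long arg)
{
	struct inode *inode = filp->f_dentry->d_inode;
	long error;

	lock_kernel();			/* was implicit with .ioctl */
	error = example_do_ioctl(inode, cmd, arg);	/* hypothetical */
	unlock_kernel();
	return error;
}

static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= example_unlocked_ioctl,
};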
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca2098..66b9cf79c5ba 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
73 return mask; 73 return mask;
74} 74}
75 75
76static int coda_psdev_ioctl(struct inode * inode, struct file * filp, 76static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
77 unsigned int cmd, unsigned long arg)
78{ 77{
79 unsigned int data; 78 unsigned int data;
80 79
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
344 .read = coda_psdev_read, 343 .read = coda_psdev_read,
345 .write = coda_psdev_write, 344 .write = coda_psdev_write,
346 .poll = coda_psdev_poll, 345 .poll = coda_psdev_poll,
347 .ioctl = coda_psdev_ioctl, 346 .unlocked_ioctl = coda_psdev_ioctl,
348 .open = coda_psdev_open, 347 .open = coda_psdev_open,
349 .release = coda_psdev_release, 348 .release = coda_psdev_release,
350}; 349};
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c274d949179d..f09c5ed76f6c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -26,6 +26,7 @@
26#include <linux/stat.h> 26#include <linux/stat.h>
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/slab.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
31#include <linux/vfs.h> 32#include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 030602d453b7..f0b391c50552 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -49,6 +49,7 @@
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/eventpoll.h> 50#include <linux/eventpoll.h>
51#include <linux/fs_struct.h> 51#include <linux/fs_struct.h>
52#include <linux/slab.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
@@ -567,6 +568,79 @@ out:
567 return ret; 568 return ret;
568} 569}
569 570
571/* A write operation does a read from user space and vice versa */
572#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
573
574ssize_t compat_rw_copy_check_uvector(int type,
575 const struct compat_iovec __user *uvector, unsigned long nr_segs,
576 unsigned long fast_segs, struct iovec *fast_pointer,
577 struct iovec **ret_pointer)
578{
579 compat_ssize_t tot_len;
580 struct iovec *iov = *ret_pointer = fast_pointer;
581 ssize_t ret = 0;
582 int seg;
583
584 /*
585 * SuS says "The readv() function *may* fail if the iovcnt argument
586 * was less than or equal to 0, or greater than {IOV_MAX}." Linux has
587 * traditionally returned zero for zero segments, so...
588 */
589 if (nr_segs == 0)
590 goto out;
591
592 ret = -EINVAL;
593 if (nr_segs > UIO_MAXIOV || nr_segs < 0)
594 goto out;
595 if (nr_segs > fast_segs) {
596 ret = -ENOMEM;
597 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
598 if (iov == NULL) {
599 *ret_pointer = fast_pointer;
600 goto out;
601 }
602 }
603 *ret_pointer = iov;
604
605 /*
606 * Single unix specification:
607 * We should return -EINVAL if an element length is negative or does not
608 * fit in an ssize_t; the total length must also fit in an ssize_t.
609 *
610 * Be careful here because iov_len is a size_t not an ssize_t
611 */
612 tot_len = 0;
613 ret = -EINVAL;
614 for (seg = 0; seg < nr_segs; seg++) {
615 compat_ssize_t tmp = tot_len;
616 compat_uptr_t buf;
617 compat_ssize_t len;
618
619 if (__get_user(len, &uvector->iov_len) ||
620 __get_user(buf, &uvector->iov_base)) {
621 ret = -EFAULT;
622 goto out;
623 }
624 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
625 goto out;
626 tot_len += len;
627 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
628 goto out;
629 if (!access_ok(vrfy_dir(type), buf, len)) {
630 ret = -EFAULT;
631 goto out;
632 }
633 iov->iov_base = compat_ptr(buf);
634 iov->iov_len = (compat_size_t) len;
635 uvector++;
636 iov++;
637 }
638 ret = tot_len;
639
640out:
641 return ret;
642}
643
570static inline long 644static inline long
571copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) 645copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
572{ 646{
@@ -599,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
599 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); 673 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
600 ret = copy_iocb(nr, iocb, iocb64); 674 ret = copy_iocb(nr, iocb, iocb64);
601 if (!ret) 675 if (!ret)
602 ret = sys_io_submit(ctx_id, nr, iocb64); 676 ret = do_io_submit(ctx_id, nr, iocb64, 1);
603 return ret; 677 return ret;
604} 678}
605 679
@@ -1076,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1076{ 1150{
1077 compat_ssize_t tot_len; 1151 compat_ssize_t tot_len;
1078 struct iovec iovstack[UIO_FASTIOV]; 1152 struct iovec iovstack[UIO_FASTIOV];
1079 struct iovec *iov=iovstack, *vector; 1153 struct iovec *iov;
1080 ssize_t ret; 1154 ssize_t ret;
1081 int seg;
1082 io_fn_t fn; 1155 io_fn_t fn;
1083 iov_fn_t fnv; 1156 iov_fn_t fnv;
1084 1157
1085 /*
1086 * SuS says "The readv() function *may* fail if the iovcnt argument
1087 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
1088 * traditionally returned zero for zero segments, so...
1089 */
1090 ret = 0;
1091 if (nr_segs == 0)
1092 goto out;
1093
1094 /*
1095 * First get the "struct iovec" from user memory and
1096 * verify all the pointers
1097 */
1098 ret = -EINVAL; 1158 ret = -EINVAL;
1099 if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
1100 goto out;
1101 if (!file->f_op) 1159 if (!file->f_op)
1102 goto out; 1160 goto out;
1103 if (nr_segs > UIO_FASTIOV) { 1161
1104 ret = -ENOMEM;
1105 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
1106 if (!iov)
1107 goto out;
1108 }
1109 ret = -EFAULT; 1162 ret = -EFAULT;
1110 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) 1163 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
1111 goto out; 1164 goto out;
1112 1165
1113 /* 1166 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1114 * Single unix specification: 1167 UIO_FASTIOV, iovstack, &iov);
1115 * We should -EINVAL if an element length is not >= 0 and fitting an
1116 * ssize_t. The total length is fitting an ssize_t
1117 *
1118 * Be careful here because iov_len is a size_t not an ssize_t
1119 */
1120 tot_len = 0;
1121 vector = iov;
1122 ret = -EINVAL;
1123 for (seg = 0 ; seg < nr_segs; seg++) {
1124 compat_ssize_t tmp = tot_len;
1125 compat_ssize_t len;
1126 compat_uptr_t buf;
1127
1128 if (__get_user(len, &uvector->iov_len) ||
1129 __get_user(buf, &uvector->iov_base)) {
1130 ret = -EFAULT;
1131 goto out;
1132 }
1133 if (len < 0) /* size_t not fitting an compat_ssize_t .. */
1134 goto out;
1135 tot_len += len;
1136 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
1137 goto out;
1138 vector->iov_base = compat_ptr(buf);
1139 vector->iov_len = (compat_size_t) len;
1140 uvector++;
1141 vector++;
1142 }
1143 if (tot_len == 0) { 1168 if (tot_len == 0) {
1144 ret = 0; 1169 ret = 0;
1145 goto out; 1170 goto out;
@@ -1530,8 +1555,6 @@ int compat_do_execve(char * filename,
1530 if (retval < 0) 1555 if (retval < 0)
1531 goto out; 1556 goto out;
1532 1557
1533 current->stack_start = current->mm->start_stack;
1534
1535 /* execve succeeded */ 1558 /* execve succeeded */
1536 current->fs->in_exec = 0; 1559 current->fs->in_exec = 0;
1537 current->in_execve = 0; 1560 current->in_execve = 0;
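compat_rw_copy_check_uvector() is factored out above so other compat paths (the AIO path via do_io_submit(), per this diff) can share the iovec validation. A sketch of the calling convention; example_do_readv() is hypothetical. The caller supplies an on-stack fast array and frees the result only if the helper fell back to kmalloc():

static ssize_t example_compat_readv(struct file *file,
		const struct compat_iovec __user *uvector,
		unsigned long nr_segs, loff_t *pos)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	ssize_t ret;

	ret = compat_rw_copy_check_uvector(READ, uvector, nr_segs,
					   UIO_FASTIOV, iovstack, &iov);
	if (ret <= 0)		/* error, or zero total length */
		goto out;

	ret = example_do_readv(file, iov, nr_segs, pos);	/* hypothetical */
out:
	if (iov != iovstack)
		kfree(iov);	/* only the slow path allocated */
	return ret;
}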
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6d55b61bfa79..641640dc7ae5 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,6 @@
23#include <linux/ioctl.h> 23#include <linux/ioctl.h>
24#include <linux/if.h> 24#include <linux/if.h>
25#include <linux/if_bridge.h> 25#include <linux/if_bridge.h>
26#include <linux/slab.h>
27#include <linux/raid/md_u.h> 26#include <linux/raid/md_u.h>
28#include <linux/kd.h> 27#include <linux/kd.h>
29#include <linux/route.h> 28#include <linux/route.h>
@@ -60,6 +59,7 @@
60#include <linux/i2c.h> 59#include <linux/i2c.h>
61#include <linux/i2c-dev.h> 60#include <linux/i2c-dev.h>
62#include <linux/atalk.h> 61#include <linux/atalk.h>
62#include <linux/gfp.h>
63 63
64#include <net/bluetooth/bluetooth.h> 64#include <net/bluetooth/bluetooth.h>
65#include <net/bluetooth/hci.h> 65#include <net/bluetooth/hci.h>
@@ -102,7 +102,6 @@
102#include <linux/nbd.h> 102#include <linux/nbd.h>
103#include <linux/random.h> 103#include <linux/random.h>
104#include <linux/filter.h> 104#include <linux/filter.h>
105#include <linux/pktcdvd.h>
106 105
107#include <linux/hiddev.h> 106#include <linux/hiddev.h>
108 107
@@ -1126,8 +1125,6 @@ COMPATIBLE_IOCTL(PPGETMODE)
1126COMPATIBLE_IOCTL(PPGETPHASE) 1125COMPATIBLE_IOCTL(PPGETPHASE)
1127COMPATIBLE_IOCTL(PPGETFLAGS) 1126COMPATIBLE_IOCTL(PPGETFLAGS)
1128COMPATIBLE_IOCTL(PPSETFLAGS) 1127COMPATIBLE_IOCTL(PPSETFLAGS)
1129/* pktcdvd */
1130COMPATIBLE_IOCTL(PACKET_CTRL_CMD)
1131/* Big A */ 1128/* Big A */
1132/* sparc only */ 1129/* sparc only */
1133/* Big Q for sound/OSS */ 1130/* Big Q for sound/OSS */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e48b52205aa..0b502f80c691 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -645,6 +645,7 @@ static void detach_groups(struct config_group *group)
645 645
646 configfs_detach_group(sd->s_element); 646 configfs_detach_group(sd->s_element);
647 child->d_inode->i_flags |= S_DEAD; 647 child->d_inode->i_flags |= S_DEAD;
648 dont_mount(child);
648 649
649 mutex_unlock(&child->d_inode->i_mutex); 650 mutex_unlock(&child->d_inode->i_mutex);
650 651
@@ -840,6 +841,7 @@ static int configfs_attach_item(struct config_item *parent_item,
840 mutex_lock(&dentry->d_inode->i_mutex); 841 mutex_lock(&dentry->d_inode->i_mutex);
841 configfs_remove_dir(item); 842 configfs_remove_dir(item);
842 dentry->d_inode->i_flags |= S_DEAD; 843 dentry->d_inode->i_flags |= S_DEAD;
844 dont_mount(dentry);
843 mutex_unlock(&dentry->d_inode->i_mutex); 845 mutex_unlock(&dentry->d_inode->i_mutex);
844 d_delete(dentry); 846 d_delete(dentry);
845 } 847 }
@@ -882,6 +884,7 @@ static int configfs_attach_group(struct config_item *parent_item,
882 if (ret) { 884 if (ret) {
883 configfs_detach_item(item); 885 configfs_detach_item(item);
884 dentry->d_inode->i_flags |= S_DEAD; 886 dentry->d_inode->i_flags |= S_DEAD;
887 dont_mount(dentry);
885 } 888 }
886 configfs_adjust_dir_dirent_depth_after_populate(sd); 889 configfs_adjust_dir_dirent_depth_after_populate(sd);
887 mutex_unlock(&dentry->d_inode->i_mutex); 890 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1725,6 +1728,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
1725 mutex_unlock(&configfs_symlink_mutex); 1728 mutex_unlock(&configfs_symlink_mutex);
1726 configfs_detach_group(&group->cg_item); 1729 configfs_detach_group(&group->cg_item);
1727 dentry->d_inode->i_flags |= S_DEAD; 1730 dentry->d_inode->i_flags |= S_DEAD;
1731 dont_mount(dentry);
1728 mutex_unlock(&dentry->d_inode->i_mutex); 1732 mutex_unlock(&dentry->d_inode->i_mutex);
1729 1733
1730 d_delete(dentry); 1734 d_delete(dentry);
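Each S_DEAD marking above gains a dont_mount() call so a racing mount cannot be grafted onto a dentry whose backing object is going away; d_delete() clears the flag again (see the fs/dcache.c hunk below). Per this series the helper amounts to (sketch):

static inline void dont_mount(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	dentry->d_flags |= DCACHE_CANT_MOUNT;
	spin_unlock(&dentry->d_lock);
}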
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a2f746066c5d..41645142b88b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -34,6 +34,7 @@
34#include <linux/capability.h> 34#include <linux/capability.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/lockdep.h> 36#include <linux/lockdep.h>
37#include <linux/slab.h>
37 38
38#include <linux/configfs.h> 39#include <linux/configfs.h>
39#include "configfs_internal.h" 40#include "configfs_internal.h"
@@ -71,16 +72,11 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
71 if (!sd) 72 if (!sd)
72 return -EINVAL; 73 return -EINVAL;
73 74
74 sd_iattr = sd->s_iattr; 75 error = simple_setattr(dentry, iattr);
75
76 error = inode_change_ok(inode, iattr);
77 if (error)
78 return error;
79
80 error = inode_setattr(inode, iattr);
81 if (error) 76 if (error)
82 return error; 77 return error;
83 78
79 sd_iattr = sd->s_iattr;
84 if (!sd_iattr) { 80 if (!sd_iattr) {
85 /* setting attributes for the first time, allocate now */ 81 /* setting attributes for the first time, allocate now */
86 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); 82 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8421cea7d8c7..8c8d64230c2d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -29,6 +29,7 @@
29#include <linux/mount.h> 29#include <linux/mount.h>
30#include <linux/pagemap.h> 30#include <linux/pagemap.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/slab.h>
32 33
33#include <linux/configfs.h> 34#include <linux/configfs.h>
34#include "configfs_internal.h" 35#include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 32a5f46b1157..0f3eb41d9201 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/slab.h>
30 31
31#include <linux/configfs.h> 32#include <linux/configfs.h>
32#include "configfs_internal.h" 33#include "configfs_internal.h"
diff --git a/fs/dcache.c b/fs/dcache.c
index f1358e5c3a59..d96047b4a633 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -536,7 +536,7 @@ restart:
536 */ 536 */
537static void prune_dcache(int count) 537static void prune_dcache(int count)
538{ 538{
539 struct super_block *sb; 539 struct super_block *sb, *n;
540 int w_count; 540 int w_count;
541 int unused = dentry_stat.nr_unused; 541 int unused = dentry_stat.nr_unused;
542 int prune_ratio; 542 int prune_ratio;
@@ -545,13 +545,14 @@ static void prune_dcache(int count)
545 if (unused == 0 || count == 0) 545 if (unused == 0 || count == 0)
546 return; 546 return;
547 spin_lock(&dcache_lock); 547 spin_lock(&dcache_lock);
548restart:
549 if (count >= unused) 548 if (count >= unused)
550 prune_ratio = 1; 549 prune_ratio = 1;
551 else 550 else
552 prune_ratio = unused / count; 551 prune_ratio = unused / count;
553 spin_lock(&sb_lock); 552 spin_lock(&sb_lock);
554 list_for_each_entry(sb, &super_blocks, s_list) { 553 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
554 if (list_empty(&sb->s_instances))
555 continue;
555 if (sb->s_nr_dentry_unused == 0) 556 if (sb->s_nr_dentry_unused == 0)
556 continue; 557 continue;
557 sb->s_count++; 558 sb->s_count++;
@@ -590,14 +591,10 @@ restart:
590 } 591 }
591 spin_lock(&sb_lock); 592 spin_lock(&sb_lock);
592 count -= pruned; 593 count -= pruned;
593 /* 594 __put_super(sb);
594 * restart only when sb is no longer on the list and 595 /* more work left to do? */
595 * we have more work to do. 596 if (count <= 0)
596 */ 597 break;
597 if (__put_super_and_need_restart(sb) && count > 0) {
598 spin_unlock(&sb_lock);
599 goto restart;
600 }
601 } 598 }
602 spin_unlock(&sb_lock); 599 spin_unlock(&sb_lock);
603 spin_unlock(&dcache_lock); 600 spin_unlock(&dcache_lock);
@@ -1529,6 +1526,7 @@ void d_delete(struct dentry * dentry)
1529 spin_lock(&dentry->d_lock); 1526 spin_lock(&dentry->d_lock);
1530 isdir = S_ISDIR(dentry->d_inode->i_mode); 1527 isdir = S_ISDIR(dentry->d_inode->i_mode);
1531 if (atomic_read(&dentry->d_count) == 1) { 1528 if (atomic_read(&dentry->d_count) == 1) {
1529 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1532 dentry_iput(dentry); 1530 dentry_iput(dentry);
1533 fsnotify_nameremove(dentry, isdir); 1531 fsnotify_nameremove(dentry, isdir);
1534 return; 1532 return;
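The prune_dcache() rework drops the goto-restart dance: a superblock with an empty s_instances list is already on its way out and is skipped, and __put_super() can release the pinned reference in place because the _safe cursor preserves the list position. The skeleton of the new walk, with the per-sb pruning elided:

static void example_for_each_super(void (*work)(struct super_block *))
{
	struct super_block *sb, *n;

	spin_lock(&sb_lock);
	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
		if (list_empty(&sb->s_instances))
			continue;	/* already being torn down */
		sb->s_count++;		/* pin sb across the unlock */
		spin_unlock(&sb_lock);

		work(sb);		/* per-sb work, sb_lock dropped */

		spin_lock(&sb_lock);
		__put_super(sb);	/* safe: 'n' holds our position */
	}
	spin_unlock(&sb_lock);
}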
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4d74fc72c195..0210898458b2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); 277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); 278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
279 279
280DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
281
280/* 282/*
281 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value 283 * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
282 * 284 *
283 * These functions are exactly the same as the above functions (but use a hex 285 * These functions are exactly the same as the above functions (but use a hex
284 * output for the decimal challenged). For details look at the above unsigned 286 * output for the decimal challenged). For details look at the above unsigned
@@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
357} 359}
358EXPORT_SYMBOL_GPL(debugfs_create_x32); 360EXPORT_SYMBOL_GPL(debugfs_create_x32);
359 361
362/**
363 * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
364 * @name: a pointer to a string containing the name of the file to create.
365 * @mode: the permission that the file should have
366 * @parent: a pointer to the parent dentry for this file. This should be a
367 * directory dentry if set. If this parameter is %NULL, then the
368 * file will be created in the root of the debugfs filesystem.
 369 * @value: a pointer to the variable that the file should read from and
 370 * write to.
371 */
372struct dentry *debugfs_create_x64(const char *name, mode_t mode,
373 struct dentry *parent, u64 *value)
374{
375 return debugfs_create_file(name, mode, parent, value, &fops_x64);
376}
377EXPORT_SYMBOL_GPL(debugfs_create_x64);
378
360 379
361static int debugfs_size_t_set(void *data, u64 val) 380static int debugfs_size_t_set(void *data, u64 val)
362{ 381{
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 049d6c36da09..30a87b3dbcac 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,6 +27,7 @@
27#include <linux/fsnotify.h> 27#include <linux/fsnotify.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/magic.h> 29#include <linux/magic.h>
30#include <linux/slab.h>
30 31
31static struct vfsmount *debugfs_mount; 32static struct vfsmount *debugfs_mount;
32static int debugfs_mount_count; 33static int debugfs_mount_count;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8882ecc0f1bf..8b3ffd5b5235 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -15,6 +15,7 @@
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19#include <linux/tty.h> 20#include <linux/tty.h>
20#include <linux/mutex.h> 21#include <linux/mutex.h>
@@ -383,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type,
383 s->s_flags |= MS_ACTIVE; 384 s->s_flags |= MS_ACTIVE;
384 } 385 }
385 386
386 simple_set_mnt(mnt, s);
387
388 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); 387 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
389 388
390 error = mknod_ptmx(s); 389 error = mknod_ptmx(s);
391 if (error) 390 if (error)
392 goto out_dput; 391 goto out_undo_sget;
393 392
394 return 0; 393 simple_set_mnt(mnt, s);
395 394
396out_dput: 395 return 0;
397 dput(s->s_root); /* undo dget() in simple_set_mnt() */
398 396
399out_undo_sget: 397out_undo_sget:
400 deactivate_locked_super(s); 398 deactivate_locked_super(s);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..7600aacf531d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
82 int reap_counter; /* rate limit reaping */ 82 int reap_counter; /* rate limit reaping */
83 get_block_t *get_block; /* block mapping function */ 83 get_block_t *get_block; /* block mapping function */
84 dio_iodone_t *end_io; /* IO completion function */ 84 dio_iodone_t *end_io; /* IO completion function */
 85 dio_submit_t *submit_io; /* IO submission function */
86 loff_t logical_offset_in_bio; /* current first logical block in bio */
85 sector_t final_block_in_bio; /* current final block in bio + 1 */ 87 sector_t final_block_in_bio; /* current final block in bio + 1 */
86 sector_t next_block_for_io; /* next block to be put under IO, 88 sector_t next_block_for_io; /* next block to be put under IO,
87 in dio_blocks units */ 89 in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
96 unsigned cur_page_offset; /* Offset into it, in bytes */ 98 unsigned cur_page_offset; /* Offset into it, in bytes */
97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 99 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
98 sector_t cur_page_block; /* Where it starts */ 100 sector_t cur_page_block; /* Where it starts */
101 loff_t cur_page_fs_offset; /* Offset in file */
99 102
100 /* BIO completion state */ 103 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */ 104 spinlock_t bio_lock; /* protects BIO fields below */
@@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
300 spin_unlock_irqrestore(&dio->bio_lock, flags); 303 spin_unlock_irqrestore(&dio->bio_lock, flags);
301} 304}
302 305
306/**
307 * dio_end_io - handle the end io action for the given bio
 308 * @bio: The direct io bio that's being completed
309 * @error: Error if there was one
310 *
 311 * This is meant to be called by any filesystem that uses its own dio_submit_t
 312 * so that the DIO-specific endio actions are dealt with after the filesystem
 313 * has done its completion work.
314 */
315void dio_end_io(struct bio *bio, int error)
316{
317 struct dio *dio = bio->bi_private;
318
319 if (dio->is_async)
320 dio_bio_end_aio(bio, error);
321 else
322 dio_bio_end_io(bio, error);
323}
324EXPORT_SYMBOL_GPL(dio_end_io);
325
303static int 326static int
304dio_bio_alloc(struct dio *dio, struct block_device *bdev, 327dio_bio_alloc(struct dio *dio, struct block_device *bdev,
305 sector_t first_sector, int nr_vecs) 328 sector_t first_sector, int nr_vecs)
@@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
316 bio->bi_end_io = dio_bio_end_io; 339 bio->bi_end_io = dio_bio_end_io;
317 340
318 dio->bio = bio; 341 dio->bio = bio;
342 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
319 return 0; 343 return 0;
320} 344}
321 345
@@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio)
340 if (dio->is_async && dio->rw == READ) 364 if (dio->is_async && dio->rw == READ)
341 bio_set_pages_dirty(bio); 365 bio_set_pages_dirty(bio);
342 366
343 submit_bio(dio->rw, bio); 367 if (dio->submit_io)
368 dio->submit_io(dio->rw, bio, dio->inode,
369 dio->logical_offset_in_bio);
370 else
371 submit_bio(dio->rw, bio);
344 372
345 dio->bio = NULL; 373 dio->bio = NULL;
346 dio->boundary = 0; 374 dio->boundary = 0;
375 dio->logical_offset_in_bio = 0;
347} 376}
348 377
349/* 378/*
@@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio)
603 int ret = 0; 632 int ret = 0;
604 633
605 if (dio->bio) { 634 if (dio->bio) {
635 loff_t cur_offset = dio->block_in_file << dio->blkbits;
636 loff_t bio_next_offset = dio->logical_offset_in_bio +
637 dio->bio->bi_size;
638
606 /* 639 /*
607 * See whether this new request is contiguous with the old 640 * See whether this new request is contiguous with the old.
641 *
 642 * Btrfs cannot handle having logically non-contiguous requests
 643 * submitted. For example, if you have
644 *
645 * Logical: [0-4095][HOLE][8192-12287]
 646 * Physical: [0-4095] [4096-8191]
647 *
648 * We cannot submit those pages together as one BIO. So if our
649 * current logical offset in the file does not equal what would
650 * be the next logical offset in the bio, submit the bio we
651 * have.
608 */ 652 */
609 if (dio->final_block_in_bio != dio->cur_page_block) 653 if (dio->final_block_in_bio != dio->cur_page_block ||
654 cur_offset != bio_next_offset)
610 dio_bio_submit(dio); 655 dio_bio_submit(dio);
611 /* 656 /*
612 * Submit now if the underlying fs is about to perform a 657 * Submit now if the underlying fs is about to perform a
@@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page,
701 dio->cur_page_offset = offset; 746 dio->cur_page_offset = offset;
702 dio->cur_page_len = len; 747 dio->cur_page_len = len;
703 dio->cur_page_block = blocknr; 748 dio->cur_page_block = blocknr;
749 dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
704out: 750out:
705 return ret; 751 return ret;
706} 752}
@@ -935,7 +981,7 @@ static ssize_t
935direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 981direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
936 const struct iovec *iov, loff_t offset, unsigned long nr_segs, 982 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
937 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, 983 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
938 struct dio *dio) 984 dio_submit_t submit_io, struct dio *dio)
939{ 985{
940 unsigned long user_addr; 986 unsigned long user_addr;
941 unsigned long flags; 987 unsigned long flags;
@@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
952 998
953 dio->get_block = get_block; 999 dio->get_block = get_block;
954 dio->end_io = end_io; 1000 dio->end_io = end_io;
1001 dio->submit_io = submit_io;
955 dio->final_block_in_bio = -1; 1002 dio->final_block_in_bio = -1;
956 dio->next_block_for_io = -1; 1003 dio->next_block_for_io = -1;
957 1004
@@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1008 } 1055 }
1009 } /* end iovec loop */ 1056 } /* end iovec loop */
1010 1057
1011 if (ret == -ENOTBLK && (rw & WRITE)) { 1058 if (ret == -ENOTBLK) {
1012 /* 1059 /*
1013 * The remaining part of the request will be 1060 * The remaining part of the request will be
1014 * be handled by buffered I/O when we return 1061 * be handled by buffered I/O when we return
@@ -1087,30 +1134,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1087 return ret; 1134 return ret;
1088} 1135}
1089 1136
1090/*
1091 * This is a library function for use by filesystem drivers.
1092 *
1093 * The locking rules are governed by the flags parameter:
1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1102 *
1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1104 * internal locking but rather rely on the filesystem to synchronize
1105 * direct I/O reads/writes versus each other and truncate.
1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1107 * entry and are never taken.
1108 */
1109ssize_t 1137ssize_t
1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1138__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
1111 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1139 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1140 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1113 int flags) 1141 dio_submit_t submit_io, int flags)
1114{ 1142{
1115 int seg; 1143 int seg;
1116 size_t size; 1144 size_t size;
@@ -1197,11 +1225,49 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1197 (end > i_size_read(inode))); 1225 (end > i_size_read(inode)));
1198 1226
1199 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1227 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1200 nr_segs, blkbits, get_block, end_io, dio); 1228 nr_segs, blkbits, get_block, end_io,
1229 submit_io, dio);
1230
1231out:
1232 return retval;
1233}
1234EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
1235
1236/*
1237 * This is a library function for use by filesystem drivers.
1238 *
1239 * The locking rules are governed by the flags parameter:
1240 * - if the flags value contains DIO_LOCKING we use a fancy locking
1241 * scheme for dumb filesystems.
1242 * For writes this function is called under i_mutex and returns with
1243 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1244 * taken and dropped again before returning.
1245 * For reads and writes i_alloc_sem is taken in shared mode and released
1246 * on I/O completion (which may happen asynchronously after returning to
1247 * the caller).
1248 *
1249 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1250 * internal locking but rather rely on the filesystem to synchronize
1251 * direct I/O reads/writes versus each other and truncate.
1252 * For reads and writes both i_mutex and i_alloc_sem are not held on
1253 * entry and are never taken.
1254 */
1255ssize_t
1256__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1257 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1258 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1259 dio_submit_t submit_io, int flags)
1260{
1261 ssize_t retval;
1201 1262
1263 retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
1264 offset, nr_segs, get_block, end_io, submit_io, flags);
1202 /* 1265 /*
1203 * In case of error extending write may have instantiated a few 1266 * In case of error extending write may have instantiated a few
1204 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1267 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1268 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
1269 * their own manner. This is a further example of where the old
1270 * truncate sequence is inadequate.
1205 * 1271 *
1206 * NOTE: filesystems with their own locking have to handle this 1272 * NOTE: filesystems with their own locking have to handle this
1207 * on their own. 1273 * on their own.
@@ -1209,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1209 if (flags & DIO_LOCKING) { 1275 if (flags & DIO_LOCKING) {
1210 if (unlikely((rw & WRITE) && retval < 0)) { 1276 if (unlikely((rw & WRITE) && retval < 0)) {
1211 loff_t isize = i_size_read(inode); 1277 loff_t isize = i_size_read(inode);
1278 loff_t end = offset + iov_length(iov, nr_segs);
1279
1212 if (end > isize) 1280 if (end > isize)
1213 vmtruncate(inode, isize); 1281 vmtruncate(inode, isize);
1214 } 1282 }
1215 } 1283 }
1216 1284
1217out:
1218 return retval; 1285 return retval;
1219} 1286}
1220EXPORT_SYMBOL(__blockdev_direct_IO); 1287EXPORT_SYMBOL(__blockdev_direct_IO);
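The new submit_io hook lets a filesystem (btrfs is the intended user) intercept each direct-io bio before it reaches the block layer, and the logical_offset_in_bio bookkeeping guarantees such callers only ever see logically contiguous bios. A sketch of the wiring; example_get_block() is hypothetical, and an implementation that overrides bi_end_io must eventually call the exported dio_end_io():

static void example_submit_io(int rw, struct bio *bio, struct inode *inode,
			      loff_t file_offset)
{
	/* per-bio setup would go here (csums, ordered extents, ...) */
	submit_bio(rw, bio);
}

static ssize_t example_direct_IO(int rw, struct kiocb *iocb,
				 const struct iovec *iov, loff_t offset,
				 unsigned long nr_segs)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;

	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
				    iov, offset, nr_segs,
				    example_get_block,	/* hypothetical */
				    NULL,		/* no end_io hook */
				    example_submit_io, DIO_LOCKING);
}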
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0df243850818..b54bca03d92f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,7 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/configfs.h> 16#include <linux/configfs.h>
17#include <linux/slab.h>
17#include <linux/in.h> 18#include <linux/in.h>
18#include <linux/in6.h> 19#include <linux/in6.h>
19#include <net/ipv6.h> 20#include <net/ipv6.h>
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 29d6139c35fc..c6cf25158746 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/debugfs.h> 17#include <linux/debugfs.h>
18#include <linux/slab.h>
18 19
19#include "dlm_internal.h" 20#include "dlm_internal.h"
20#include "lock.h" 21#include "lock.h"
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 46ffd3eeaaf7..031dbe3a15ca 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,6 +56,7 @@
56 L: receive_xxxx_reply() <- R: send_xxxx_reply() 56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/ 57*/
58#include <linux/types.h> 58#include <linux/types.h>
59#include <linux/slab.h>
59#include "dlm_internal.h" 60#include "dlm_internal.h"
60#include <linux/dlm_device.h> 61#include <linux/dlm_device.h>
61#include "memory.h" 62#include "memory.h"
@@ -732,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head,
732 if (lkb->lkb_rqmode < mode) 733 if (lkb->lkb_rqmode < mode)
733 break; 734 break;
734 735
735 if (!lkb) 736 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
736 list_add_tail(new, head);
737 else
738 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
739} 737}
740 738
741/* add/remove lkb to rsb's grant/convert/wait queue */ 739/* add/remove lkb to rsb's grant/convert/wait queue */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 52cab160893c..c0d35c620526 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -51,6 +51,7 @@
51#include <linux/file.h> 51#include <linux/file.h>
52#include <linux/mutex.h> 52#include <linux/mutex.h>
53#include <linux/sctp.h> 53#include <linux/sctp.h>
54#include <linux/slab.h>
54#include <net/sctp/user.h> 55#include <net/sctp/user.h>
55#include <net/ipv6.h> 56#include <net/ipv6.h>
56 57
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 052095cd592f..2c6ad518100d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -9,6 +9,7 @@
9#include <net/genetlink.h> 9#include <net/genetlink.h>
10#include <linux/dlm.h> 10#include <linux/dlm.h>
11#include <linux/dlm_netlink.h> 11#include <linux/dlm_netlink.h>
12#include <linux/gfp.h>
12 13
13#include "dlm_internal.h" 14#include "dlm_internal.h"
14 15
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index b5f89aef3b29..d45c02db6943 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,7 @@
11#include <linux/poll.h> 11#include <linux/poll.h>
12#include <linux/dlm.h> 12#include <linux/dlm.h>
13#include <linux/dlm_plock.h> 13#include <linux/dlm_plock.h>
14#include <linux/slab.h>
14 15
15#include "dlm_internal.h" 16#include "dlm_internal.h"
16#include "lockspace.h" 17#include "lockspace.h"
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index a4bfd31ac45b..b6272853130c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -17,6 +17,7 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/dlm.h> 18#include <linux/dlm.h>
19#include <linux/dlm_device.h> 19#include <linux/dlm_device.h>
20#include <linux/slab.h>
20 21
21#include "dlm_internal.h" 22#include "dlm_internal.h"
22#include "lockspace.h" 23#include "lockspace.h"
@@ -214,6 +215,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
214 if (!ast_type) { 215 if (!ast_type) {
215 kref_get(&lkb->lkb_ref); 216 kref_get(&lkb->lkb_ref);
216 list_add_tail(&lkb->lkb_astqueue, &proc->asts); 217 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
218 lkb->lkb_ast_first = type;
217 wake_up_interruptible(&proc->wait); 219 wake_up_interruptible(&proc->wait);
218 } 220 }
219 if (type == AST_COMP && (ast_type & AST_COMP)) 221 if (type == AST_COMP && (ast_type & AST_COMP))
@@ -222,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
222 224
223 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type); 225 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
224 if (eol) { 226 if (eol) {
225 lkb->lkb_ast_type &= ~AST_BAST;
226 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; 227 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
227 } 228 }
228 229
@@ -705,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file)
705} 706}
706 707
707static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, 708static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
708 int bmode, char __user *buf, size_t count) 709 int mode, char __user *buf, size_t count)
709{ 710{
710#ifdef CONFIG_COMPAT 711#ifdef CONFIG_COMPAT
711 struct dlm_lock_result32 result32; 712 struct dlm_lock_result32 result32;
@@ -732,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
732 if (type == AST_BAST) { 733 if (type == AST_BAST) {
733 result.user_astaddr = ua->bastaddr; 734 result.user_astaddr = ua->bastaddr;
734 result.user_astparam = ua->bastparam; 735 result.user_astparam = ua->bastparam;
735 result.bast_mode = bmode; 736 result.bast_mode = mode;
736 } else { 737 } else {
737 result.user_astaddr = ua->castaddr; 738 result.user_astaddr = ua->castaddr;
738 result.user_astparam = ua->castparam; 739 result.user_astparam = ua->castparam;
@@ -800,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
800 struct dlm_user_proc *proc = file->private_data; 801 struct dlm_user_proc *proc = file->private_data;
801 struct dlm_lkb *lkb; 802 struct dlm_lkb *lkb;
802 DECLARE_WAITQUEUE(wait, current); 803 DECLARE_WAITQUEUE(wait, current);
803 int error, type=0, bmode=0, removed = 0; 804 int error = 0, removed;
805 int ret_type, ret_mode;
806 int bastmode, castmode, do_bast, do_cast;
804 807
805 if (count == sizeof(struct dlm_device_version)) { 808 if (count == sizeof(struct dlm_device_version)) {
806 error = copy_version_to_user(buf, count); 809 error = copy_version_to_user(buf, count);
@@ -819,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
819#endif 822#endif
820 return -EINVAL; 823 return -EINVAL;
821 824
825 try_another:
826
822 /* do we really need this? can a read happen after a close? */ 827 /* do we really need this? can a read happen after a close? */
823 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) 828 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
824 return -EINVAL; 829 return -EINVAL;
@@ -854,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
854 859
855 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); 860 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
856 861
857 if (lkb->lkb_ast_type & AST_COMP) { 862 removed = 0;
858 lkb->lkb_ast_type &= ~AST_COMP; 863 ret_type = 0;
859 type = AST_COMP; 864 ret_mode = 0;
860 } else if (lkb->lkb_ast_type & AST_BAST) { 865 do_bast = lkb->lkb_ast_type & AST_BAST;
861 lkb->lkb_ast_type &= ~AST_BAST; 866 do_cast = lkb->lkb_ast_type & AST_COMP;
862 type = AST_BAST; 867 bastmode = lkb->lkb_bastmode;
863 bmode = lkb->lkb_bastmode; 868 castmode = lkb->lkb_castmode;
869
 870 /* when both are queued, figure out which to deliver first, and flip
 871 the order so the other goes out in the next read */
872
873 if (do_cast && do_bast) {
874 if (lkb->lkb_ast_first == AST_COMP) {
875 ret_type = AST_COMP;
876 ret_mode = castmode;
877 lkb->lkb_ast_type &= ~AST_COMP;
878 lkb->lkb_ast_first = AST_BAST;
879 } else {
880 ret_type = AST_BAST;
881 ret_mode = bastmode;
882 lkb->lkb_ast_type &= ~AST_BAST;
883 lkb->lkb_ast_first = AST_COMP;
884 }
885 } else {
886 ret_type = lkb->lkb_ast_first;
887 ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
888 lkb->lkb_ast_type &= ~ret_type;
889 lkb->lkb_ast_first = 0;
890 }
891
892 /* if we're doing a bast but the bast is unnecessary, then
893 switch to do nothing or do a cast if that was needed next */
894
895 if ((ret_type == AST_BAST) &&
896 dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) {
897 ret_type = 0;
898 ret_mode = 0;
899
900 if (do_cast) {
901 ret_type = AST_COMP;
902 ret_mode = castmode;
903 lkb->lkb_ast_type &= ~AST_COMP;
904 lkb->lkb_ast_first = 0;
905 }
906 }
907
908 if (lkb->lkb_ast_first != lkb->lkb_ast_type) {
909 log_print("device_read %x ast_first %x ast_type %x",
910 lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
864 } 911 }
865 912
866 if (!lkb->lkb_ast_type) { 913 if (!lkb->lkb_ast_type) {
@@ -869,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
869 } 916 }
870 spin_unlock(&proc->asts_spin); 917 spin_unlock(&proc->asts_spin);
871 918
872 error = copy_result_to_user(lkb->lkb_ua, 919 if (ret_type) {
873 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), 920 error = copy_result_to_user(lkb->lkb_ua,
874 type, bmode, buf, count); 921 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
922 ret_type, ret_mode, buf, count);
923
924 if (ret_type == AST_COMP)
925 lkb->lkb_castmode_done = castmode;
926 if (ret_type == AST_BAST)
927 lkb->lkb_bastmode_done = bastmode;
928 }
875 929
876 /* removes reference for the proc->asts lists added by 930 /* removes reference for the proc->asts lists added by
877 dlm_user_add_ast() and may result in the lkb being freed */ 931 dlm_user_add_ast() and may result in the lkb being freed */
932
878 if (removed) 933 if (removed)
879 dlm_put_lkb(lkb); 934 dlm_put_lkb(lkb);
880 935
 936 /* the queued bast was eliminated (see the "unnecessary" check above),
 937 leaving nothing to return */
938
939 if (!ret_type)
940 goto try_another;
941
881 return error; 942 return error;
882} 943}
883 944
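The device_read() rewrite serializes cast/bast delivery by queueing order instead of always preferring the completion ast. Reduced to a decision helper, and omitting the unnecessary-bast suppression, the core of it is (a simplification, not the verbatim kernel code):

static int example_pick_ast(struct dlm_lkb *lkb, int *mode)
{
	int type = lkb->lkb_ast_first;	/* oldest queued ast wins */

	if (!type)
		return 0;
	*mode = (type == AST_COMP) ? lkb->lkb_castmode : lkb->lkb_bastmode;
	lkb->lkb_ast_type &= ~type;
	/* if the other kind is still pending, it goes first next time */
	lkb->lkb_ast_first = lkb->lkb_ast_type;
	return type;
}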
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 31f4b0e6d72c..83c4f600786a 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -12,7 +12,7 @@
12/* A global variable is a bit ugly, but it keeps the code simple */ 12/* A global variable is a bit ugly, but it keeps the code simple */
13int sysctl_drop_caches; 13int sysctl_drop_caches;
14 14
15static void drop_pagecache_sb(struct super_block *sb) 15static void drop_pagecache_sb(struct super_block *sb, void *unused)
16{ 16{
17 struct inode *inode, *toput_inode = NULL; 17 struct inode *inode, *toput_inode = NULL;
18 18
@@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb)
33 iput(toput_inode); 33 iput(toput_inode);
34} 34}
35 35
36static void drop_pagecache(void)
37{
38 struct super_block *sb;
39
40 spin_lock(&sb_lock);
41restart:
42 list_for_each_entry(sb, &super_blocks, s_list) {
43 sb->s_count++;
44 spin_unlock(&sb_lock);
45 down_read(&sb->s_umount);
46 if (sb->s_root)
47 drop_pagecache_sb(sb);
48 up_read(&sb->s_umount);
49 spin_lock(&sb_lock);
50 if (__put_super_and_need_restart(sb))
51 goto restart;
52 }
53 spin_unlock(&sb_lock);
54}
55
56static void drop_slab(void) 36static void drop_slab(void)
57{ 37{
58 int nr_objects; 38 int nr_objects;
@@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
68 proc_dointvec_minmax(table, write, buffer, length, ppos); 48 proc_dointvec_minmax(table, write, buffer, length, ppos);
69 if (write) { 49 if (write) {
70 if (sysctl_drop_caches & 1) 50 if (sysctl_drop_caches & 1)
71 drop_pagecache(); 51 iterate_supers(drop_pagecache_sb, NULL);
72 if (sysctl_drop_caches & 2) 52 if (sysctl_drop_caches & 2)
73 drop_slab(); 53 drop_slab();
74 } 54 }
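iterate_supers() packages the pinned-superblock walk (compare the fs/dcache.c sketch earlier) behind a callback interface; drop_pagecache_sb() only gains the opaque-cookie argument. A hypothetical second caller:

static void example_count_inodes(struct super_block *sb, void *arg)
{
	unsigned long *total = arg;
	struct inode *inode;

	spin_lock(&inode_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
		(*total)++;
	spin_unlock(&inode_lock);
}

static unsigned long example_count_all_inodes(void)
{
	unsigned long total = 0;

	/* invokes the callback for each live sb with s_umount held shared */
	iterate_supers(example_count_inodes, &total);
	return total;
}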
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 7cb0a59f4b9d..1cc087635a5e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <linux/slab.h>
36#include <asm/unaligned.h> 37#include <asm/unaligned.h>
37#include "ecryptfs_kernel.h" 38#include "ecryptfs_kernel.h"
38 39
@@ -381,8 +382,8 @@ out:
381static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, 382static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
382 struct ecryptfs_crypt_stat *crypt_stat) 383 struct ecryptfs_crypt_stat *crypt_stat)
383{ 384{
384 (*offset) = (crypt_stat->num_header_bytes_at_front 385 (*offset) = ecryptfs_lower_header_size(crypt_stat)
385 + (crypt_stat->extent_size * extent_num)); 386 + (crypt_stat->extent_size * extent_num);
386} 387}
387 388
388/** 389/**
@@ -834,13 +835,13 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
834 set_extent_mask_and_shift(crypt_stat); 835 set_extent_mask_and_shift(crypt_stat);
835 crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES; 836 crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
836 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 837 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
837 crypt_stat->num_header_bytes_at_front = 0; 838 crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
838 else { 839 else {
839 if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) 840 if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
840 crypt_stat->num_header_bytes_at_front = 841 crypt_stat->metadata_size =
841 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; 842 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
842 else 843 else
843 crypt_stat->num_header_bytes_at_front = PAGE_CACHE_SIZE; 844 crypt_stat->metadata_size = PAGE_CACHE_SIZE;
844 } 845 }
845} 846}
846 847
@@ -1107,9 +1108,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
1107 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; 1108 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
1108} 1109}
1109 1110
1110static void 1111void ecryptfs_write_crypt_stat_flags(char *page_virt,
1111write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, 1112 struct ecryptfs_crypt_stat *crypt_stat,
1112 size_t *written) 1113 size_t *written)
1113{ 1114{
1114 u32 flags = 0; 1115 u32 flags = 0;
1115 int i; 1116 int i;
@@ -1237,8 +1238,7 @@ ecryptfs_write_header_metadata(char *virt,
1237 1238
1238 header_extent_size = (u32)crypt_stat->extent_size; 1239 header_extent_size = (u32)crypt_stat->extent_size;
1239 num_header_extents_at_front = 1240 num_header_extents_at_front =
1240 (u16)(crypt_stat->num_header_bytes_at_front 1241 (u16)(crypt_stat->metadata_size / crypt_stat->extent_size);
1241 / crypt_stat->extent_size);
1242 put_unaligned_be32(header_extent_size, virt); 1242 put_unaligned_be32(header_extent_size, virt);
1243 virt += 4; 1243 virt += 4;
1244 put_unaligned_be16(num_header_extents_at_front, virt); 1244 put_unaligned_be16(num_header_extents_at_front, virt);
@@ -1291,7 +1291,8 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
1291 offset = ECRYPTFS_FILE_SIZE_BYTES; 1291 offset = ECRYPTFS_FILE_SIZE_BYTES;
1292 write_ecryptfs_marker((page_virt + offset), &written); 1292 write_ecryptfs_marker((page_virt + offset), &written);
1293 offset += written; 1293 offset += written;
1294 write_ecryptfs_flags((page_virt + offset), crypt_stat, &written); 1294 ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat,
1295 &written);
1295 offset += written; 1296 offset += written;
1296 ecryptfs_write_header_metadata((page_virt + offset), crypt_stat, 1297 ecryptfs_write_header_metadata((page_virt + offset), crypt_stat,
1297 &written); 1298 &written);
@@ -1381,7 +1382,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1381 rc = -EINVAL; 1382 rc = -EINVAL;
1382 goto out; 1383 goto out;
1383 } 1384 }
1384 virt_len = crypt_stat->num_header_bytes_at_front; 1385 virt_len = crypt_stat->metadata_size;
1385 order = get_order(virt_len); 1386 order = get_order(virt_len);
1386 /* Released in this function */ 1387 /* Released in this function */
1387 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); 1388 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
@@ -1427,16 +1428,15 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1427 header_extent_size = get_unaligned_be32(virt); 1428 header_extent_size = get_unaligned_be32(virt);
1428 virt += sizeof(__be32); 1429 virt += sizeof(__be32);
1429 num_header_extents_at_front = get_unaligned_be16(virt); 1430 num_header_extents_at_front = get_unaligned_be16(virt);
1430 crypt_stat->num_header_bytes_at_front = 1431 crypt_stat->metadata_size = (((size_t)num_header_extents_at_front
1431 (((size_t)num_header_extents_at_front 1432 * (size_t)header_extent_size));
1432 * (size_t)header_extent_size));
1433 (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); 1433 (*bytes_read) = (sizeof(__be32) + sizeof(__be16));
1434 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) 1434 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
1435 && (crypt_stat->num_header_bytes_at_front 1435 && (crypt_stat->metadata_size
1436 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { 1436 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
1437 rc = -EINVAL; 1437 rc = -EINVAL;
1438 printk(KERN_WARNING "Invalid header size: [%zd]\n", 1438 printk(KERN_WARNING "Invalid header size: [%zd]\n",
1439 crypt_stat->num_header_bytes_at_front); 1439 crypt_stat->metadata_size);
1440 } 1440 }
1441 return rc; 1441 return rc;
1442} 1442}
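
parse_header_metadata() above reads two big-endian fields and multiplies them to obtain metadata_size. A self-contained sketch of the same decoding, with get_unaligned_be32()/get_unaligned_be16() modeled by hand; the byte values are made up for the example:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hand-rolled stand-ins for the kernel's get_unaligned_be32/be16 */
static uint32_t be32_at(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

static uint16_t be16_at(const unsigned char *p)
{
	return (uint16_t)(((uint16_t)p[0] << 8) | p[1]);
}

int main(void)
{
	/* 4096-byte header extents, 2 of them => 8192 bytes of metadata */
	unsigned char virt[6] = { 0x00, 0x00, 0x10, 0x00, 0x00, 0x02 };
	uint32_t header_extent_size = be32_at(virt);
	uint16_t num_header_extents = be16_at(virt + 4);
	size_t metadata_size = (size_t)num_header_extents * header_extent_size;

	printf("metadata_size = %zu\n", metadata_size);	/* 8192 */
	return 0;
}
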
@@ -1451,8 +1451,7 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1451 */ 1451 */
1452static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) 1452static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
1453{ 1453{
1454 crypt_stat->num_header_bytes_at_front = 1454 crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1455 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1456} 1455}
1457 1456
1458/** 1457/**
@@ -1606,6 +1605,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1606 ecryptfs_dentry, 1605 ecryptfs_dentry,
1607 ECRYPTFS_VALIDATE_HEADER_SIZE); 1606 ECRYPTFS_VALIDATE_HEADER_SIZE);
1608 if (rc) { 1607 if (rc) {
1608 memset(page_virt, 0, PAGE_CACHE_SIZE);
1609 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); 1609 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
1610 if (rc) { 1610 if (rc) {
1611 printk(KERN_DEBUG "Valid eCryptfs headers not found in " 1611 printk(KERN_DEBUG "Valid eCryptfs headers not found in "
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8f006a0d6076..906e803f7f79 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -26,6 +26,7 @@
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/fs_stack.h> 28#include <linux/fs_stack.h>
29#include <linux/slab.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
30 31
31/** 32/**
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 542f625312f3..0032a9f5a3a9 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -35,6 +35,7 @@
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/nsproxy.h> 37#include <linux/nsproxy.h>
38#include <linux/backing-dev.h>
38 39
39/* Version verification for shared data structures w/ userspace */ 40/* Version verification for shared data structures w/ userspace */
40#define ECRYPTFS_VERSION_MAJOR 0x00 41#define ECRYPTFS_VERSION_MAJOR 0x00
@@ -273,7 +274,7 @@ struct ecryptfs_crypt_stat {
273 u32 flags; 274 u32 flags;
274 unsigned int file_version; 275 unsigned int file_version;
275 size_t iv_bytes; 276 size_t iv_bytes;
276 size_t num_header_bytes_at_front; 277 size_t metadata_size;
277 size_t extent_size; /* Data extent size; default is 4096 */ 278 size_t extent_size; /* Data extent size; default is 4096 */
278 size_t key_size; 279 size_t key_size;
279 size_t extent_shift; 280 size_t extent_shift;
@@ -393,6 +394,7 @@ struct ecryptfs_mount_crypt_stat {
393struct ecryptfs_sb_info { 394struct ecryptfs_sb_info {
394 struct super_block *wsi_sb; 395 struct super_block *wsi_sb;
395 struct ecryptfs_mount_crypt_stat mount_crypt_stat; 396 struct ecryptfs_mount_crypt_stat mount_crypt_stat;
397 struct backing_dev_info bdi;
396}; 398};
397 399
398/* file private data. */ 400/* file private data. */
@@ -464,6 +466,14 @@ struct ecryptfs_daemon {
464 466
465extern struct mutex ecryptfs_daemon_hash_mux; 467extern struct mutex ecryptfs_daemon_hash_mux;
466 468
469static inline size_t
470ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
471{
472 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
473 return 0;
474 return crypt_stat->metadata_size;
475}
476
467static inline struct ecryptfs_file_info * 477static inline struct ecryptfs_file_info *
468ecryptfs_file_to_private(struct file *file) 478ecryptfs_file_to_private(struct file *file)
469{ 479{
@@ -651,6 +661,9 @@ int ecryptfs_decrypt_page(struct page *page);
651int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); 661int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry);
652int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); 662int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
653int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); 663int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
664void ecryptfs_write_crypt_stat_flags(char *page_virt,
665 struct ecryptfs_crypt_stat *crypt_stat,
666 size_t *written);
654int ecryptfs_read_and_validate_header_region(char *data, 667int ecryptfs_read_and_validate_header_region(char *data,
655 struct inode *ecryptfs_inode); 668 struct inode *ecryptfs_inode);
656int ecryptfs_read_and_validate_xattr_region(char *page_virt, 669int ecryptfs_read_and_validate_xattr_region(char *page_virt,
@@ -718,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
718int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, 731int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
719 struct page *page_for_lower, 732 struct page *page_for_lower,
720 size_t offset_in_page, size_t size); 733 size_t offset_in_page, size_t size);
721int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 734int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
722 size_t size);
723int ecryptfs_read_lower(char *data, loff_t offset, size_t size, 735int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
724 struct inode *ecryptfs_inode); 736 struct inode *ecryptfs_inode);
725int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, 737int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
726 pgoff_t page_index, 738 pgoff_t page_index,
727 size_t offset_in_page, size_t size, 739 size_t offset_in_page, size_t size,
728 struct inode *ecryptfs_inode); 740 struct inode *ecryptfs_inode);
729struct page *ecryptfs_get_locked_page(struct file *file, loff_t index); 741struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
730int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); 742int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
731int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, 743int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
732 struct user_namespace *user_ns); 744 struct user_namespace *user_ns);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 678172b61be2..e8fcf4e2ed7d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -25,6 +25,7 @@
25 25
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/poll.h> 27#include <linux/poll.h>
28#include <linux/slab.h>
28#include <linux/mount.h> 29#include <linux/mount.h>
29#include <linux/pagemap.h> 30#include <linux/pagemap.h>
30#include <linux/security.h> 31#include <linux/security.h>
@@ -273,11 +274,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
273} 274}
274 275
275static int 276static int
276ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 277ecryptfs_fsync(struct file *file, int datasync)
277{ 278{
278 return vfs_fsync(ecryptfs_file_to_lower(file), 279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
279 ecryptfs_dentry_to_lower(dentry),
280 datasync);
281} 280}
282 281
283static int ecryptfs_fasync(int fd, struct file *file, int flag) 282static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4a430ab4115c..31ef5252f0fe 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/slab.h>
34#include <asm/unaligned.h> 35#include <asm/unaligned.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
@@ -141,19 +142,10 @@ out:
141static int grow_file(struct dentry *ecryptfs_dentry) 142static int grow_file(struct dentry *ecryptfs_dentry)
142{ 143{
143 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; 144 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
144 struct file fake_file;
145 struct ecryptfs_file_info tmp_file_info;
146 char zero_virt[] = { 0x00 }; 145 char zero_virt[] = { 0x00 };
147 int rc = 0; 146 int rc = 0;
148 147
149 memset(&fake_file, 0, sizeof(fake_file)); 148 rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
150 fake_file.f_path.dentry = ecryptfs_dentry;
151 memset(&tmp_file_info, 0, sizeof(tmp_file_info));
152 ecryptfs_set_file_private(&fake_file, &tmp_file_info);
153 ecryptfs_set_file_lower(
154 &fake_file,
155 ecryptfs_inode_to_private(ecryptfs_inode)->lower_file);
156 rc = ecryptfs_write(&fake_file, zero_virt, 0, 1);
157 i_size_write(ecryptfs_inode, 0); 149 i_size_write(ecryptfs_inode, 0);
158 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); 150 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
159 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |= 151 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
@@ -323,6 +315,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
323 rc = ecryptfs_read_and_validate_header_region(page_virt, 315 rc = ecryptfs_read_and_validate_header_region(page_virt,
324 ecryptfs_dentry->d_inode); 316 ecryptfs_dentry->d_inode);
325 if (rc) { 317 if (rc) {
318 memset(page_virt, 0, PAGE_CACHE_SIZE);
326 rc = ecryptfs_read_and_validate_xattr_region(page_virt, 319 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
327 ecryptfs_dentry); 320 ecryptfs_dentry);
328 if (rc) { 321 if (rc) {
@@ -335,7 +328,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
335 ecryptfs_dentry->d_sb)->mount_crypt_stat; 328 ecryptfs_dentry->d_sb)->mount_crypt_stat;
336 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { 329 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
337 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 330 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
338 file_size = (crypt_stat->num_header_bytes_at_front 331 file_size = (crypt_stat->metadata_size
339 + i_size_read(lower_dentry->d_inode)); 332 + i_size_read(lower_dentry->d_inode));
340 else 333 else
341 file_size = i_size_read(lower_dentry->d_inode); 334 file_size = i_size_read(lower_dentry->d_inode);
@@ -387,9 +380,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
387 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 380 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
388 if (IS_ERR(lower_dentry)) { 381 if (IS_ERR(lower_dentry)) {
389 rc = PTR_ERR(lower_dentry); 382 rc = PTR_ERR(lower_dentry);
390 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 383 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
391 "lower_dentry = [%s]\n", __func__, rc, 384 "[%d] on lower_dentry = [%s]\n", __func__, rc,
392 ecryptfs_dentry->d_name.name); 385 encrypted_and_encoded_name);
393 goto out_d_drop; 386 goto out_d_drop;
394 } 387 }
395 if (lower_dentry->d_inode) 388 if (lower_dentry->d_inode)
@@ -416,9 +409,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
416 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 409 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
417 if (IS_ERR(lower_dentry)) { 410 if (IS_ERR(lower_dentry)) {
418 rc = PTR_ERR(lower_dentry); 411 rc = PTR_ERR(lower_dentry);
419 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 412 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
420 "lower_dentry = [%s]\n", __func__, rc, 413 "[%d] on lower_dentry = [%s]\n", __func__, rc,
421 encrypted_and_encoded_name); 414 encrypted_and_encoded_name);
422 goto out_d_drop; 415 goto out_d_drop;
423 } 416 }
424lookup_and_interpose: 417lookup_and_interpose:
@@ -455,8 +448,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
455 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); 448 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
456 if (rc) 449 if (rc)
457 goto out_lock; 450 goto out_lock;
458 fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); 451 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
459 fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); 452 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
460 old_dentry->d_inode->i_nlink = 453 old_dentry->d_inode->i_nlink =
461 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; 454 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
462 i_size_write(new_dentry->d_inode, file_size_save); 455 i_size_write(new_dentry->d_inode, file_size_save);
@@ -647,38 +640,17 @@ out_lock:
647 return rc; 640 return rc;
648} 641}
649 642
650static int 643static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
651ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) 644 size_t *bufsiz)
652{ 645{
646 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
653 char *lower_buf; 647 char *lower_buf;
654 size_t lower_bufsiz; 648 size_t lower_bufsiz = PATH_MAX;
655 struct dentry *lower_dentry;
656 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
657 char *plaintext_name;
658 size_t plaintext_name_size;
659 mm_segment_t old_fs; 649 mm_segment_t old_fs;
660 int rc; 650 int rc;
661 651
662 lower_dentry = ecryptfs_dentry_to_lower(dentry);
663 if (!lower_dentry->d_inode->i_op->readlink) {
664 rc = -EINVAL;
665 goto out;
666 }
667 mount_crypt_stat = &ecryptfs_superblock_to_private(
668 dentry->d_sb)->mount_crypt_stat;
669 /*
670 * If the lower filename is encrypted, it will result in a significantly
671 * longer name. If needed, truncate the name after decode and decrypt.
672 */
673 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
674 lower_bufsiz = PATH_MAX;
675 else
676 lower_bufsiz = bufsiz;
677 /* Released in this function */
678 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); 652 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
679 if (lower_buf == NULL) { 653 if (!lower_buf) {
680 printk(KERN_ERR "%s: Out of memory whilst attempting to "
681 "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
682 rc = -ENOMEM; 654 rc = -ENOMEM;
683 goto out; 655 goto out;
684 } 656 }
@@ -688,29 +660,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
688 (char __user *)lower_buf, 660 (char __user *)lower_buf,
689 lower_bufsiz); 661 lower_bufsiz);
690 set_fs(old_fs); 662 set_fs(old_fs);
691 if (rc >= 0) { 663 if (rc < 0)
692 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, 664 goto out;
693 &plaintext_name_size, 665 lower_bufsiz = rc;
694 dentry, lower_buf, 666 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
695 rc); 667 lower_buf, lower_bufsiz);
696 if (rc) { 668out:
697 printk(KERN_ERR "%s: Error attempting to decode and "
698 "decrypt filename; rc = [%d]\n", __func__,
699 rc);
700 goto out_free_lower_buf;
701 }
702 /* Check for bufsiz <= 0 done in sys_readlinkat() */
703 rc = copy_to_user(buf, plaintext_name,
704 min((size_t) bufsiz, plaintext_name_size));
705 if (rc)
706 rc = -EFAULT;
707 else
708 rc = plaintext_name_size;
709 kfree(plaintext_name);
710 fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
711 }
712out_free_lower_buf:
713 kfree(lower_buf); 669 kfree(lower_buf);
670 return rc;
671}
672
673static int
674ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
675{
676 char *kbuf;
677 size_t kbufsiz, copied;
678 int rc;
679
680 rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
681 if (rc)
682 goto out;
683 copied = min_t(size_t, bufsiz, kbufsiz);
684 rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
685 kfree(kbuf);
686 fsstack_copy_attr_atime(dentry->d_inode,
687 ecryptfs_dentry_to_lower(dentry)->d_inode);
714out: 688out:
715 return rc; 689 return rc;
716} 690}
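
The rewritten ecryptfs_readlink() copies min(bufsiz, kbufsiz) bytes to the caller, which matches the readlink(2) contract: a possibly-truncated target with no NUL terminator. A small userspace check of that contract against any symlink (illustrative):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[64];	/* deliberately small: readlink() may truncate */
	ssize_t n;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <symlink>\n", argv[0]);
		return 1;
	}
	/* Mirrors the kernel side above: the caller gets at most
	 * sizeof(buf) bytes of the (decrypted) target, unterminated. */
	n = readlink(argv[1], buf, sizeof(buf));
	if (n < 0) {
		perror("readlink");
		return 1;
	}
	printf("target (%zd bytes): %.*s\n", n, (int)n, buf);
	return 0;
}
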
@@ -768,7 +742,7 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
768{ 742{
769 loff_t lower_size; 743 loff_t lower_size;
770 744
771 lower_size = crypt_stat->num_header_bytes_at_front; 745 lower_size = ecryptfs_lower_header_size(crypt_stat);
772 if (upper_size != 0) { 746 if (upper_size != 0) {
773 loff_t num_extents; 747 loff_t num_extents;
774 748
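
upper_size_to_lower_size() now starts from ecryptfs_lower_header_size(); in the part of the function elided by this hunk it presumably rounds the upper size up to whole extents, as earlier kernels did. A standalone model of the computation under that assumption:

#include <stdio.h>

/* Illustrative model of upper_size_to_lower_size(): the lower file is
 * the header region plus the upper size rounded up to whole extents.
 * The rounding step is assumed from the elided remainder of the
 * function, not shown in the hunk above. */
static unsigned long long upper_to_lower(unsigned long long upper_size,
					 unsigned long header_size,
					 unsigned long extent_size)
{
	unsigned long long lower_size = header_size;

	if (upper_size != 0) {
		unsigned long long num_extents =
			(upper_size + extent_size - 1) / extent_size;
		lower_size += num_extents * extent_size;
	}
	return lower_size;
}

int main(void)
{
	/* 8 KiB header, 4 KiB extents: a 1-byte file still costs an extent */
	printf("%llu\n", upper_to_lower(1, 8192, 4096));	/* 12288 */
	printf("%llu\n", upper_to_lower(4097, 8192, 4096));	/* 16384 */
	return 0;
}
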
@@ -801,8 +775,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
801{ 775{
802 int rc = 0; 776 int rc = 0;
803 struct inode *inode = dentry->d_inode; 777 struct inode *inode = dentry->d_inode;
804 struct dentry *lower_dentry;
805 struct file fake_ecryptfs_file;
806 struct ecryptfs_crypt_stat *crypt_stat; 778 struct ecryptfs_crypt_stat *crypt_stat;
807 loff_t i_size = i_size_read(inode); 779 loff_t i_size = i_size_read(inode);
808 loff_t lower_size_before_truncate; 780 loff_t lower_size_before_truncate;
@@ -813,23 +785,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
813 goto out; 785 goto out;
814 } 786 }
815 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 787 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
816 /* Set up a fake ecryptfs file, this is used to interface with
817 * the file in the underlying filesystem so that the
818 * truncation has an effect there as well. */
819 memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
820 fake_ecryptfs_file.f_path.dentry = dentry;
821 /* Released at out_free: label */
822 ecryptfs_set_file_private(&fake_ecryptfs_file,
823 kmem_cache_alloc(ecryptfs_file_info_cache,
824 GFP_KERNEL));
825 if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
826 rc = -ENOMEM;
827 goto out;
828 }
829 lower_dentry = ecryptfs_dentry_to_lower(dentry);
830 ecryptfs_set_file_lower(
831 &fake_ecryptfs_file,
832 ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
833 /* Switch on growing or shrinking file */ 788 /* Switch on growing or shrinking file */
834 if (ia->ia_size > i_size) { 789 if (ia->ia_size > i_size) {
835 char zero[] = { 0x00 }; 790 char zero[] = { 0x00 };
@@ -839,7 +794,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
839 * this triggers code that will fill in 0's throughout 794 * this triggers code that will fill in 0's throughout
 840 * the intermediate portion between the previous end of the 795 * the intermediate portion between the previous end of the
 841 * file and the new end of the file */ 796 * file and the new end of the file */
842 rc = ecryptfs_write(&fake_ecryptfs_file, zero, 797 rc = ecryptfs_write(inode, zero,
843 (ia->ia_size - 1), 1); 798 (ia->ia_size - 1), 1);
844 } else { /* ia->ia_size < i_size_read(inode) */ 799 } else { /* ia->ia_size < i_size_read(inode) */
845 /* We're chopping off all the pages down to the page 800 /* We're chopping off all the pages down to the page
@@ -850,12 +805,12 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
850 - (ia->ia_size & ~PAGE_CACHE_MASK)); 805 - (ia->ia_size & ~PAGE_CACHE_MASK));
851 806
852 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 807 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
853 rc = vmtruncate(inode, ia->ia_size); 808 rc = simple_setsize(inode, ia->ia_size);
854 if (rc) 809 if (rc)
855 goto out_free; 810 goto out;
856 lower_ia->ia_size = ia->ia_size; 811 lower_ia->ia_size = ia->ia_size;
857 lower_ia->ia_valid |= ATTR_SIZE; 812 lower_ia->ia_valid |= ATTR_SIZE;
858 goto out_free; 813 goto out;
859 } 814 }
860 if (num_zeros) { 815 if (num_zeros) {
861 char *zeros_virt; 816 char *zeros_virt;
@@ -863,25 +818,25 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
863 zeros_virt = kzalloc(num_zeros, GFP_KERNEL); 818 zeros_virt = kzalloc(num_zeros, GFP_KERNEL);
864 if (!zeros_virt) { 819 if (!zeros_virt) {
865 rc = -ENOMEM; 820 rc = -ENOMEM;
866 goto out_free; 821 goto out;
867 } 822 }
868 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, 823 rc = ecryptfs_write(inode, zeros_virt,
869 ia->ia_size, num_zeros); 824 ia->ia_size, num_zeros);
870 kfree(zeros_virt); 825 kfree(zeros_virt);
871 if (rc) { 826 if (rc) {
872 printk(KERN_ERR "Error attempting to zero out " 827 printk(KERN_ERR "Error attempting to zero out "
873 "the remainder of the end page on " 828 "the remainder of the end page on "
874 "reducing truncate; rc = [%d]\n", rc); 829 "reducing truncate; rc = [%d]\n", rc);
875 goto out_free; 830 goto out;
876 } 831 }
877 } 832 }
878 vmtruncate(inode, ia->ia_size); 833 simple_setsize(inode, ia->ia_size);
879 rc = ecryptfs_write_inode_size_to_metadata(inode); 834 rc = ecryptfs_write_inode_size_to_metadata(inode);
880 if (rc) { 835 if (rc) {
881 printk(KERN_ERR "Problem with " 836 printk(KERN_ERR "Problem with "
882 "ecryptfs_write_inode_size_to_metadata; " 837 "ecryptfs_write_inode_size_to_metadata; "
883 "rc = [%d]\n", rc); 838 "rc = [%d]\n", rc);
884 goto out_free; 839 goto out;
885 } 840 }
886 /* We are reducing the size of the ecryptfs file, and need to 841 /* We are reducing the size of the ecryptfs file, and need to
887 * know if we need to reduce the size of the lower file. */ 842 * know if we need to reduce the size of the lower file. */
@@ -895,10 +850,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
895 } else 850 } else
896 lower_ia->ia_valid &= ~ATTR_SIZE; 851 lower_ia->ia_valid &= ~ATTR_SIZE;
897 } 852 }
898out_free:
899 if (ecryptfs_file_to_private(&fake_ecryptfs_file))
900 kmem_cache_free(ecryptfs_file_info_cache,
901 ecryptfs_file_to_private(&fake_ecryptfs_file));
902out: 853out:
903 return rc; 854 return rc;
904} 855}
@@ -1015,6 +966,28 @@ out:
1015 return rc; 966 return rc;
1016} 967}
1017 968
969int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
970 struct kstat *stat)
971{
972 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
973 int rc = 0;
974
975 mount_crypt_stat = &ecryptfs_superblock_to_private(
976 dentry->d_sb)->mount_crypt_stat;
977 generic_fillattr(dentry->d_inode, stat);
978 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
979 char *target;
980 size_t targetsiz;
981
982 rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz);
983 if (!rc) {
984 kfree(target);
985 stat->size = targetsiz;
986 }
987 }
988 return rc;
989}
990
1018int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, 991int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1019 struct kstat *stat) 992 struct kstat *stat)
1020{ 993{
@@ -1039,7 +1012,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1039 1012
1040 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1013 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1041 if (!lower_dentry->d_inode->i_op->setxattr) { 1014 if (!lower_dentry->d_inode->i_op->setxattr) {
1042 rc = -ENOSYS; 1015 rc = -EOPNOTSUPP;
1043 goto out; 1016 goto out;
1044 } 1017 }
1045 mutex_lock(&lower_dentry->d_inode->i_mutex); 1018 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1057,7 +1030,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
1057 int rc = 0; 1030 int rc = 0;
1058 1031
1059 if (!lower_dentry->d_inode->i_op->getxattr) { 1032 if (!lower_dentry->d_inode->i_op->getxattr) {
1060 rc = -ENOSYS; 1033 rc = -EOPNOTSUPP;
1061 goto out; 1034 goto out;
1062 } 1035 }
1063 mutex_lock(&lower_dentry->d_inode->i_mutex); 1036 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1084,7 +1057,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
1084 1057
1085 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1058 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1086 if (!lower_dentry->d_inode->i_op->listxattr) { 1059 if (!lower_dentry->d_inode->i_op->listxattr) {
1087 rc = -ENOSYS; 1060 rc = -EOPNOTSUPP;
1088 goto out; 1061 goto out;
1089 } 1062 }
1090 mutex_lock(&lower_dentry->d_inode->i_mutex); 1063 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1101,7 +1074,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
1101 1074
1102 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1075 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1103 if (!lower_dentry->d_inode->i_op->removexattr) { 1076 if (!lower_dentry->d_inode->i_op->removexattr) {
1104 rc = -ENOSYS; 1077 rc = -EOPNOTSUPP;
1105 goto out; 1078 goto out;
1106 } 1079 }
1107 mutex_lock(&lower_dentry->d_inode->i_mutex); 1080 mutex_lock(&lower_dentry->d_inode->i_mutex);
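
The four hunks above switch missing-xattr-operation errors from ENOSYS to EOPNOTSUPP, which userspace sees as ENOTSUP (the same value on Linux) and which xattr-aware tools already expect. A quick way to observe the change from userspace (illustrative):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	char value[256];

	if (argc < 2)
		return 1;
	/* On a mount whose lower filesystem lacks xattr support, this
	 * now fails with EOPNOTSUPP/ENOTSUP rather than ENOSYS. */
	if (getxattr(argv[1], "user.test", value, sizeof(value)) < 0)
		printf("getxattr: %s\n", strerror(errno));
	return 0;
}
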
@@ -1132,6 +1105,7 @@ const struct inode_operations ecryptfs_symlink_iops = {
1132 .put_link = ecryptfs_put_link, 1105 .put_link = ecryptfs_put_link,
1133 .permission = ecryptfs_permission, 1106 .permission = ecryptfs_permission,
1134 .setattr = ecryptfs_setattr, 1107 .setattr = ecryptfs_setattr,
1108 .getattr = ecryptfs_getattr_link,
1135 .setxattr = ecryptfs_setxattr, 1109 .setxattr = ecryptfs_setxattr,
1136 .getxattr = ecryptfs_getxattr, 1110 .getxattr = ecryptfs_getxattr,
1137 .listxattr = ecryptfs_listxattr, 1111 .listxattr = ecryptfs_listxattr,
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index a0a7847567e9..89c5476506ef 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -32,6 +32,7 @@
32#include <linux/random.h> 32#include <linux/random.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
37/** 38/**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e14cf7e588db..d8c3a373aafa 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/slab.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
27#include "ecryptfs_kernel.h" 28#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index ea2f92101dfe..cbd4e18adb20 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
35#include <linux/key.h> 35#include <linux/key.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/fs_stack.h> 37#include <linux/fs_stack.h>
38#include <linux/slab.h>
38#include "ecryptfs_kernel.h" 39#include "ecryptfs_kernel.h"
39 40
40/** 41/**
@@ -280,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat(
280 * 281 *
281 * Returns zero on success; non-zero on error 282 * Returns zero on success; non-zero on error
282 */ 283 */
283static int ecryptfs_parse_options(struct super_block *sb, char *options) 284static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
284{ 285{
285 char *p; 286 char *p;
286 int rc = 0; 287 int rc = 0;
@@ -292,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
292 int fn_cipher_key_bytes; 293 int fn_cipher_key_bytes;
293 int fn_cipher_key_bytes_set = 0; 294 int fn_cipher_key_bytes_set = 0;
294 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 295 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
295 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; 296 &sbi->mount_crypt_stat;
296 substring_t args[MAX_OPT_ARGS]; 297 substring_t args[MAX_OPT_ARGS];
297 int token; 298 int token;
298 char *sig_src; 299 char *sig_src;
@@ -482,60 +483,7 @@ out:
482} 483}
483 484
484struct kmem_cache *ecryptfs_sb_info_cache; 485struct kmem_cache *ecryptfs_sb_info_cache;
485 486static struct file_system_type ecryptfs_fs_type;
486/**
487 * ecryptfs_fill_super
488 * @sb: The ecryptfs super block
489 * @raw_data: The options passed to mount
490 * @silent: Not used but required by function prototype
491 *
492 * Sets up what we can of the sb, rest is done in ecryptfs_read_super
493 *
494 * Returns zero on success; non-zero otherwise
495 */
496static int
497ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
498{
499 int rc = 0;
500
501 /* Released in ecryptfs_put_super() */
502 ecryptfs_set_superblock_private(sb,
503 kmem_cache_zalloc(ecryptfs_sb_info_cache,
504 GFP_KERNEL));
505 if (!ecryptfs_superblock_to_private(sb)) {
506 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
507 rc = -ENOMEM;
508 goto out;
509 }
510 sb->s_op = &ecryptfs_sops;
511 /* Released through deactivate_super(sb) from get_sb_nodev */
512 sb->s_root = d_alloc(NULL, &(const struct qstr) {
513 .hash = 0,.name = "/",.len = 1});
514 if (!sb->s_root) {
515 ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
516 rc = -ENOMEM;
517 goto out;
518 }
519 sb->s_root->d_op = &ecryptfs_dops;
520 sb->s_root->d_sb = sb;
521 sb->s_root->d_parent = sb->s_root;
522 /* Released in d_release when dput(sb->s_root) is called */
523 /* through deactivate_super(sb) from get_sb_nodev() */
524 ecryptfs_set_dentry_private(sb->s_root,
525 kmem_cache_zalloc(ecryptfs_dentry_info_cache,
526 GFP_KERNEL));
527 if (!ecryptfs_dentry_to_private(sb->s_root)) {
528 ecryptfs_printk(KERN_ERR,
529 "dentry_info_cache alloc failed\n");
530 rc = -ENOMEM;
531 goto out;
532 }
533 rc = 0;
534out:
535 /* Should be able to rely on deactivate_super called from
536 * get_sb_nodev */
537 return rc;
538}
539 487
540/** 488/**
541 * ecryptfs_read_super 489 * ecryptfs_read_super
@@ -556,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
556 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); 504 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
557 goto out; 505 goto out;
558 } 506 }
507 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
508 rc = -EINVAL;
509 printk(KERN_ERR "Mount on filesystem of type "
510 "eCryptfs explicitly disallowed due to "
511 "known incompatibilities\n");
512 goto out_free;
513 }
559 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); 514 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
560 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; 515 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
561 sb->s_blocksize = path.dentry->d_sb->s_blocksize; 516 sb->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -579,11 +534,8 @@ out:
579 * @dev_name: The path to mount over 534 * @dev_name: The path to mount over
580 * @raw_data: The options passed into the kernel 535 * @raw_data: The options passed into the kernel
581 * 536 *
582 * The whole ecryptfs_get_sb process is broken into 4 functions: 537 * The whole ecryptfs_get_sb process is broken into 3 functions:
583 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any 538 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
584 * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
585 * with as much information as it can before needing
586 * the lower filesystem.
587 * ecryptfs_read_super(): this accesses the lower filesystem and uses 539 * ecryptfs_read_super(): this accesses the lower filesystem and uses
588 * ecryptfs_interpose to perform most of the linking 540 * ecryptfs_interpose to perform most of the linking
589 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) 541 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
@@ -592,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
592 const char *dev_name, void *raw_data, 544 const char *dev_name, void *raw_data,
593 struct vfsmount *mnt) 545 struct vfsmount *mnt)
594{ 546{
547 struct super_block *s;
548 struct ecryptfs_sb_info *sbi;
549 struct ecryptfs_dentry_info *root_info;
550 const char *err = "Getting sb failed";
595 int rc; 551 int rc;
596 struct super_block *sb;
597 552
598 rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt); 553 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
599 if (rc < 0) { 554 if (!sbi) {
600 printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc); 555 rc = -ENOMEM;
601 goto out; 556 goto out;
602 } 557 }
603 sb = mnt->mnt_sb; 558
604 rc = ecryptfs_parse_options(sb, raw_data); 559 rc = ecryptfs_parse_options(sbi, raw_data);
605 if (rc) { 560 if (rc) {
606 printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc); 561 err = "Error parsing options";
607 goto out_abort; 562 goto out;
563 }
564
565 s = sget(fs_type, NULL, set_anon_super, NULL);
566 if (IS_ERR(s)) {
567 rc = PTR_ERR(s);
568 goto out;
608 } 569 }
609 rc = ecryptfs_read_super(sb, dev_name); 570
571 s->s_flags = flags;
572 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
610 if (rc) { 573 if (rc) {
611 printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc); 574 deactivate_locked_super(s);
612 goto out_abort; 575 goto out;
613 } 576 }
614 goto out; 577
615out_abort: 578 ecryptfs_set_superblock_private(s, sbi);
616 dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */ 579 s->s_bdi = &sbi->bdi;
617 deactivate_locked_super(sb); 580
581 /* ->kill_sb() will take care of sbi after that point */
582 sbi = NULL;
583 s->s_op = &ecryptfs_sops;
584
585 rc = -ENOMEM;
586 s->s_root = d_alloc(NULL, &(const struct qstr) {
587 .hash = 0,.name = "/",.len = 1});
588 if (!s->s_root) {
589 deactivate_locked_super(s);
590 goto out;
591 }
592 s->s_root->d_op = &ecryptfs_dops;
593 s->s_root->d_sb = s;
594 s->s_root->d_parent = s->s_root;
595
596 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
597 if (!root_info) {
598 deactivate_locked_super(s);
599 goto out;
600 }
601 /* ->kill_sb() will take care of root_info */
602 ecryptfs_set_dentry_private(s->s_root, root_info);
603 s->s_flags |= MS_ACTIVE;
604 rc = ecryptfs_read_super(s, dev_name);
605 if (rc) {
606 deactivate_locked_super(s);
607 err = "Reading sb failed";
608 goto out;
609 }
610 simple_set_mnt(mnt, s);
611 return 0;
612
618out: 613out:
614 if (sbi) {
615 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
616 kmem_cache_free(ecryptfs_sb_info_cache, sbi);
617 }
618 printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
619 return rc; 619 return rc;
620} 620}
621 621
@@ -624,11 +624,16 @@ out:
624 * @sb: The ecryptfs super block 624 * @sb: The ecryptfs super block
625 * 625 *
626 * Used to bring the superblock down and free the private data. 626 * Used to bring the superblock down and free the private data.
627 * Private data is free'd in ecryptfs_put_super()
628 */ 627 */
629static void ecryptfs_kill_block_super(struct super_block *sb) 628static void ecryptfs_kill_block_super(struct super_block *sb)
630{ 629{
631 generic_shutdown_super(sb); 630 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
631 kill_anon_super(sb);
632 if (!sb_info)
633 return;
634 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
635 bdi_destroy(&sb_info->bdi);
636 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
632} 637}
633 638
634static struct file_system_type ecryptfs_fs_type = { 639static struct file_system_type ecryptfs_fs_type = {
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..2d8dbce9d485 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,7 @@
20 * 02111-1307, USA. 20 * 02111-1307, USA.
21 */ 21 */
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/slab.h>
23#include <linux/user_namespace.h> 24#include <linux/user_namespace.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include "ecryptfs_kernel.h" 26#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 4ec8f61ccf5a..3745f612bcd4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -24,6 +24,7 @@
24#include <linux/random.h> 24#include <linux/random.h>
25#include <linux/miscdevice.h> 25#include <linux/miscdevice.h>
26#include <linux/poll.h> 26#include <linux/poll.h>
27#include <linux/slab.h>
27#include <linux/wait.h> 28#include <linux/wait.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..b1d82756544b 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
@@ -43,17 +44,9 @@
43 * Returns locked and up-to-date page (if ok), with increased 44 * Returns locked and up-to-date page (if ok), with increased
44 * refcnt. 45 * refcnt.
45 */ 46 */
46struct page *ecryptfs_get_locked_page(struct file *file, loff_t index) 47struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
47{ 48{
48 struct dentry *dentry; 49 struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
49 struct inode *inode;
50 struct address_space *mapping;
51 struct page *page;
52
53 dentry = file->f_path.dentry;
54 inode = dentry->d_inode;
55 mapping = inode->i_mapping;
56 page = read_mapping_page(mapping, index, (void *)file);
57 if (!IS_ERR(page)) 50 if (!IS_ERR(page))
58 lock_page(page); 51 lock_page(page);
59 return page; 52 return page;
@@ -82,6 +75,19 @@ out:
82 return rc; 75 return rc;
83} 76}
84 77
78static void strip_xattr_flag(char *page_virt,
79 struct ecryptfs_crypt_stat *crypt_stat)
80{
81 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
82 size_t written;
83
84 crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
85 ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
86 &written);
87 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
88 }
89}
90
85/** 91/**
86 * Header Extent: 92 * Header Extent:
87 * Octets 0-7: Unencrypted file size (big-endian) 93 * Octets 0-7: Unencrypted file size (big-endian)
@@ -97,19 +103,6 @@ out:
97 * (big-endian) 103 * (big-endian)
98 * Octet 26: Begin RFC 2440 authentication token packet set 104 * Octet 26: Begin RFC 2440 authentication token packet set
99 */ 105 */
100static void set_header_info(char *page_virt,
101 struct ecryptfs_crypt_stat *crypt_stat)
102{
103 size_t written;
104 size_t save_num_header_bytes_at_front =
105 crypt_stat->num_header_bytes_at_front;
106
107 crypt_stat->num_header_bytes_at_front =
108 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
109 ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written);
110 crypt_stat->num_header_bytes_at_front =
111 save_num_header_bytes_at_front;
112}
113 106
114/** 107/**
115 * ecryptfs_copy_up_encrypted_with_header 108 * ecryptfs_copy_up_encrypted_with_header
@@ -135,8 +128,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
135 * num_extents_per_page) 128 * num_extents_per_page)
136 + extent_num_in_page); 129 + extent_num_in_page);
137 size_t num_header_extents_at_front = 130 size_t num_header_extents_at_front =
138 (crypt_stat->num_header_bytes_at_front 131 (crypt_stat->metadata_size / crypt_stat->extent_size);
139 / crypt_stat->extent_size);
140 132
141 if (view_extent_num < num_header_extents_at_front) { 133 if (view_extent_num < num_header_extents_at_front) {
142 /* This is a header extent */ 134 /* This is a header extent */
@@ -146,9 +138,14 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
146 memset(page_virt, 0, PAGE_CACHE_SIZE); 138 memset(page_virt, 0, PAGE_CACHE_SIZE);
147 /* TODO: Support more than one header extent */ 139 /* TODO: Support more than one header extent */
148 if (view_extent_num == 0) { 140 if (view_extent_num == 0) {
141 size_t written;
142
149 rc = ecryptfs_read_xattr_region( 143 rc = ecryptfs_read_xattr_region(
150 page_virt, page->mapping->host); 144 page_virt, page->mapping->host);
151 set_header_info(page_virt, crypt_stat); 145 strip_xattr_flag(page_virt + 16, crypt_stat);
146 ecryptfs_write_header_metadata(page_virt + 20,
147 crypt_stat,
148 &written);
152 } 149 }
153 kunmap_atomic(page_virt, KM_USER0); 150 kunmap_atomic(page_virt, KM_USER0);
154 flush_dcache_page(page); 151 flush_dcache_page(page);
@@ -161,7 +158,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
161 /* This is an encrypted data extent */ 158 /* This is an encrypted data extent */
162 loff_t lower_offset = 159 loff_t lower_offset =
163 ((view_extent_num * crypt_stat->extent_size) 160 ((view_extent_num * crypt_stat->extent_size)
164 - crypt_stat->num_header_bytes_at_front); 161 - crypt_stat->metadata_size);
165 162
166 rc = ecryptfs_read_lower_page_segment( 163 rc = ecryptfs_read_lower_page_segment(
167 page, (lower_offset >> PAGE_CACHE_SHIFT), 164 page, (lower_offset >> PAGE_CACHE_SHIFT),
@@ -193,7 +190,7 @@ out:
193static int ecryptfs_readpage(struct file *file, struct page *page) 190static int ecryptfs_readpage(struct file *file, struct page *page)
194{ 191{
195 struct ecryptfs_crypt_stat *crypt_stat = 192 struct ecryptfs_crypt_stat *crypt_stat =
196 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 193 &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
197 int rc = 0; 194 int rc = 0;
198 195
199 if (!crypt_stat 196 if (!crypt_stat
@@ -295,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file,
295 292
296 if (!PageUptodate(page)) { 293 if (!PageUptodate(page)) {
297 struct ecryptfs_crypt_stat *crypt_stat = 294 struct ecryptfs_crypt_stat *crypt_stat =
298 &ecryptfs_inode_to_private( 295 &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
299 file->f_path.dentry->d_inode)->crypt_stat;
300 296
301 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) 297 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
302 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { 298 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
@@ -482,7 +478,7 @@ static int ecryptfs_write_end(struct file *file,
482 unsigned to = from + copied; 478 unsigned to = from + copied;
483 struct inode *ecryptfs_inode = mapping->host; 479 struct inode *ecryptfs_inode = mapping->host;
484 struct ecryptfs_crypt_stat *crypt_stat = 480 struct ecryptfs_crypt_stat *crypt_stat =
485 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 481 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
486 int rc; 482 int rc;
487 483
488 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { 484 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
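
Piecing together the offsets used above (page_virt + 16 for the flags word in strip_xattr_flag(), page_virt + 20 for the header metadata) with the octet map in the comment block, the header extent layout can be modeled as below. This is an editorial sketch inferred from those offsets, not kernel code:

#include <stdio.h>

/* Inferred eCryptfs header-extent layout:
 *   0- 7  unencrypted file size (big-endian)
 *   8-15  eCryptfs marker
 *  16-19  flags (big-endian), incl. the METADATA_IN_XATTR bit that
 *         strip_xattr_flag() masks out for the encrypted view
 *  20-23  header extent size (big-endian u32)
 *  24-25  number of header extents at front (big-endian u16)
 *  26-    RFC 2440 authentication token packet set */
enum {
	OFF_FILE_SIZE	= 0,
	OFF_MARKER	= 8,
	OFF_FLAGS	= 16,
	OFF_HEADER_META	= 20,
	OFF_PACKETS	= 26,
};

int main(void)
{
	printf("flags at %d, header metadata at %d, packets at %d\n",
	       OFF_FLAGS, OFF_HEADER_META, OFF_PACKETS);
	return 0;
}
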
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0cc4fafd6552..db184ef15d3d 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
93 93
94/** 94/**
95 * ecryptfs_write 95 * ecryptfs_write
96 * @ecryptfs_file: The eCryptfs file into which to write 96 * @ecryptfs_inode: The eCryptfs file into which to write
97 * @data: Virtual address where data to write is located 97 * @data: Virtual address where data to write is located
98 * @offset: Offset in the eCryptfs file at which to begin writing the 98 * @offset: Offset in the eCryptfs file at which to begin writing the
99 * data from @data 99 * data from @data
@@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
109 * 109 *
110 * Returns zero on success; non-zero otherwise 110 * Returns zero on success; non-zero otherwise
111 */ 111 */
112int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 112int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
113 size_t size) 113 size_t size)
114{ 114{
115 struct page *ecryptfs_page; 115 struct page *ecryptfs_page;
116 struct ecryptfs_crypt_stat *crypt_stat; 116 struct ecryptfs_crypt_stat *crypt_stat;
117 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
118 char *ecryptfs_page_virt; 117 char *ecryptfs_page_virt;
119 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); 118 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
120 loff_t data_offset = 0; 119 loff_t data_offset = 0;
@@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
145 if (num_bytes > total_remaining_zeros) 144 if (num_bytes > total_remaining_zeros)
146 num_bytes = total_remaining_zeros; 145 num_bytes = total_remaining_zeros;
147 } 146 }
148 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 147 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
149 ecryptfs_page_idx); 148 ecryptfs_page_idx);
150 if (IS_ERR(ecryptfs_page)) { 149 if (IS_ERR(ecryptfs_page)) {
151 rc = PTR_ERR(ecryptfs_page); 150 rc = PTR_ERR(ecryptfs_page);
@@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
302int ecryptfs_read(char *data, loff_t offset, size_t size, 301int ecryptfs_read(char *data, loff_t offset, size_t size,
303 struct file *ecryptfs_file) 302 struct file *ecryptfs_file)
304{ 303{
304 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
305 struct page *ecryptfs_page; 305 struct page *ecryptfs_page;
306 char *ecryptfs_page_virt; 306 char *ecryptfs_page_virt;
307 loff_t ecryptfs_file_size = 307 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
308 i_size_read(ecryptfs_file->f_dentry->d_inode);
309 loff_t data_offset = 0; 308 loff_t data_offset = 0;
310 loff_t pos; 309 loff_t pos;
311 int rc = 0; 310 int rc = 0;
@@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
327 326
328 if (num_bytes > total_remaining_bytes) 327 if (num_bytes > total_remaining_bytes)
329 num_bytes = total_remaining_bytes; 328 num_bytes = total_remaining_bytes;
330 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 329 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
331 ecryptfs_page_idx); 330 ecryptfs_page_idx);
332 if (IS_ERR(ecryptfs_page)) { 331 if (IS_ERR(ecryptfs_page)) {
333 rc = PTR_ERR(ecryptfs_page); 332 rc = PTR_ERR(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..0435886e4a9f 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -26,6 +26,7 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h>
29#include <linux/seq_file.h> 30#include <linux/seq_file.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/file.h> 32#include <linux/file.h>
@@ -85,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
85 if (lower_dentry->d_inode) { 86 if (lower_dentry->d_inode) {
86 fput(inode_info->lower_file); 87 fput(inode_info->lower_file);
87 inode_info->lower_file = NULL; 88 inode_info->lower_file = NULL;
88 d_drop(lower_dentry);
89 } 89 }
90 } 90 }
91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
@@ -109,26 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
109} 109}
110 110
111/** 111/**
112 * ecryptfs_put_super
113 * @sb: Pointer to the ecryptfs super block
114 *
115 * Final actions when unmounting a file system.
116 * This will handle deallocation and release of our private data.
117 */
118static void ecryptfs_put_super(struct super_block *sb)
119{
120 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
121
122 lock_kernel();
123
124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
125 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
126 ecryptfs_set_superblock_private(sb, NULL);
127
128 unlock_kernel();
129}
130
131/**
132 * ecryptfs_statfs 112 * ecryptfs_statfs
133 * @sb: The ecryptfs super block 113 * @sb: The ecryptfs super block
134 * @buf: The struct kstatfs to fill in with stats 114 * @buf: The struct kstatfs to fill in with stats
@@ -202,7 +182,6 @@ const struct super_operations ecryptfs_sops = {
202 .alloc_inode = ecryptfs_alloc_inode, 182 .alloc_inode = ecryptfs_alloc_inode,
203 .destroy_inode = ecryptfs_destroy_inode, 183 .destroy_inode = ecryptfs_destroy_inode,
204 .drop_inode = generic_delete_inode, 184 .drop_inode = generic_delete_inode,
205 .put_super = ecryptfs_put_super,
206 .statfs = ecryptfs_statfs, 185 .statfs = ecryptfs_statfs,
207 .remount_fs = NULL, 186 .remount_fs = NULL,
208 .clear_inode = ecryptfs_clear_inode, 187 .clear_inode = ecryptfs_clear_inode,
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 7758cc382ef0..6bd3f76fdf88 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -11,6 +11,7 @@
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/list.h> 15#include <linux/list.h>
15#include <linux/spinlock.h> 16#include <linux/spinlock.h>
16#include <linux/anon_inodes.h> 17#include <linux/anon_inodes.h>
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5b4efc..3817149919cb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
1140 * ep_poll_callback() when events will become available. 1140 * ep_poll_callback() when events will become available.
1141 */ 1141 */
1142 init_waitqueue_entry(&wait, current); 1142 init_waitqueue_entry(&wait, current);
1143 wait.flags |= WQ_FLAG_EXCLUSIVE; 1143 __add_wait_queue_exclusive(&ep->wq, &wait);
1144 __add_wait_queue(&ep->wq, &wait);
1145 1144
1146 for (;;) { 1145 for (;;) {
1147 /* 1146 /*
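
__add_wait_queue_exclusive() folds the two-step sequence it replaces into one helper; it is presumably equivalent to something like the following sketch (inferred from the open-coded pair removed above, not copied from the patch):

static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
					      wait_queue_t *wait)
{
	/* Same effect as setting WQ_FLAG_EXCLUSIVE by hand and then
	 * calling __add_wait_queue(), as ep_poll() used to do. */
	wait->flags |= WQ_FLAG_EXCLUSIVE;
	__add_wait_queue(q, wait);
}
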
diff --git a/fs/exec.c b/fs/exec.c
index 49cdaa19e5b9..e19de6a80339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
242 * use STACK_TOP because that can depend on attributes which aren't 242 * use STACK_TOP because that can depend on attributes which aren't
243 * configured yet. 243 * configured yet.
244 */ 244 */
245 BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
245 vma->vm_end = STACK_TOP_MAX; 246 vma->vm_end = STACK_TOP_MAX;
246 vma->vm_start = vma->vm_end - PAGE_SIZE; 247 vma->vm_start = vma->vm_end - PAGE_SIZE;
247 vma->vm_flags = VM_STACK_FLAGS; 248 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 249 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain); 250 INIT_LIST_HEAD(&vma->anon_vma_chain);
250 err = insert_vm_struct(mm, vma); 251 err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
616 else if (executable_stack == EXSTACK_DISABLE_X) 617 else if (executable_stack == EXSTACK_DISABLE_X)
617 vm_flags &= ~VM_EXEC; 618 vm_flags &= ~VM_EXEC;
618 vm_flags |= mm->def_flags; 619 vm_flags |= mm->def_flags;
620 vm_flags |= VM_STACK_INCOMPLETE_SETUP;
619 621
620 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, 622 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
621 vm_flags); 623 vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
630 goto out_unlock; 632 goto out_unlock;
631 } 633 }
632 634
635 /* mprotect_fixup is overkill to remove the temporary stack flags */
636 vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
637
633 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ 638 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
634 stack_size = vma->vm_end - vma->vm_start; 639 stack_size = vma->vm_end - vma->vm_start;
635 /* 640 /*
@@ -763,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
763 struct signal_struct *sig = tsk->signal; 768 struct signal_struct *sig = tsk->signal;
764 struct sighand_struct *oldsighand = tsk->sighand; 769 struct sighand_struct *oldsighand = tsk->sighand;
765 spinlock_t *lock = &oldsighand->siglock; 770 spinlock_t *lock = &oldsighand->siglock;
766 int count;
767 771
768 if (thread_group_empty(tsk)) 772 if (thread_group_empty(tsk))
769 goto no_thread_group; 773 goto no_thread_group;
@@ -780,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
780 spin_unlock_irq(lock); 784 spin_unlock_irq(lock);
781 return -EAGAIN; 785 return -EAGAIN;
782 } 786 }
787
783 sig->group_exit_task = tsk; 788 sig->group_exit_task = tsk;
784 zap_other_threads(tsk); 789 sig->notify_count = zap_other_threads(tsk);
790 if (!thread_group_leader(tsk))
791 sig->notify_count--;
785 792
786 /* Account for the thread group leader hanging around: */ 793 while (sig->notify_count) {
787 count = thread_group_leader(tsk) ? 1 : 2;
788 sig->notify_count = count;
789 while (atomic_read(&sig->count) > count) {
790 __set_current_state(TASK_UNINTERRUPTIBLE); 794 __set_current_state(TASK_UNINTERRUPTIBLE);
791 spin_unlock_irq(lock); 795 spin_unlock_irq(lock);
792 schedule(); 796 schedule();
@@ -1387,8 +1391,6 @@ int do_execve(char * filename,
1387 if (retval < 0) 1391 if (retval < 0)
1388 goto out; 1392 goto out;
1389 1393
1390 current->stack_start = current->mm->start_stack;
1391
1392 /* execve succeeded */ 1394 /* execve succeeded */
1393 current->fs->in_exec = 0; 1395 current->fs->in_exec = 0;
1394 current->in_execve = 0; 1396 current->in_execve = 0;
@@ -1659,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
1659 struct task_struct *tsk = current; 1661 struct task_struct *tsk = current;
1660 struct mm_struct *mm = tsk->mm; 1662 struct mm_struct *mm = tsk->mm;
1661 struct completion *vfork_done; 1663 struct completion *vfork_done;
1662 int core_waiters; 1664 int core_waiters = -EBUSY;
1663 1665
1664 init_completion(&core_state->startup); 1666 init_completion(&core_state->startup);
1665 core_state->dumper.task = tsk; 1667 core_state->dumper.task = tsk;
1666 core_state->dumper.next = NULL; 1668 core_state->dumper.next = NULL;
1667 core_waiters = zap_threads(tsk, mm, core_state, exit_code); 1669
1670 down_write(&mm->mmap_sem);
1671 if (!mm->core_state)
1672 core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1668 up_write(&mm->mmap_sem); 1673 up_write(&mm->mmap_sem);
1669 1674
1670 if (unlikely(core_waiters < 0)) 1675 if (unlikely(core_waiters < 0))
@@ -1784,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file)
1784} 1789}
1785 1790
1786 1791
1792/*
 1793 * umh_pipe_setup
1794 * helper function to customize the process used
1795 * to collect the core in userspace. Specifically
1796 * it sets up a pipe and installs it as fd 0 (stdin)
1797 * for the process. Returns 0 on success, or
1798 * PTR_ERR on failure.
1799 * Note that it also sets the core limit to 1. This
1800 * is a special value that we use to trap recursive
1801 * core dumps
1802 */
1803static int umh_pipe_setup(struct subprocess_info *info)
1804{
1805 struct file *rp, *wp;
1806 struct fdtable *fdt;
1807 struct coredump_params *cp = (struct coredump_params *)info->data;
1808 struct files_struct *cf = current->files;
1809
1810 wp = create_write_pipe(0);
1811 if (IS_ERR(wp))
1812 return PTR_ERR(wp);
1813
1814 rp = create_read_pipe(wp, 0);
1815 if (IS_ERR(rp)) {
1816 free_write_pipe(wp);
1817 return PTR_ERR(rp);
1818 }
1819
1820 cp->file = wp;
1821
1822 sys_close(0);
1823 fd_install(0, rp);
1824 spin_lock(&cf->file_lock);
1825 fdt = files_fdtable(cf);
1826 FD_SET(0, fdt->open_fds);
1827 FD_CLR(0, fdt->close_on_exec);
1828 spin_unlock(&cf->file_lock);
1829
1830 /* and disallow core files too */
1831 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
1832
1833 return 0;
1834}
1835
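
Since umh_pipe_setup() above installs the read end of the pipe as fd 0 of the usermode helper, a core_pattern pipe handler reduces to a program that drains stdin. A minimal sketch of such a helper; the destination path, the copy buffer size, and the registration line are illustrative choices, not anything this patch mandates:

/*
 * Sketch of a core_pattern pipe helper.  Registered with, e.g.:
 *   echo '|/usr/local/bin/core-catcher %p' > /proc/sys/kernel/core_pattern
 * The kernel hands us the core image on stdin (fd 0), courtesy of
 * umh_pipe_setup() above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[256], buf[65536];
	ssize_t n;
	int fd;

	/* %p in core_pattern becomes the dumping pid (argv[1] here) */
	snprintf(path, sizeof(path), "/var/tmp/core.%s",
		 argc > 1 ? argv[1] : "unknown");

	fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0600);
	if (fd < 0)
		return 1;

	while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0)
		if (write(fd, buf, n) != n)
			break;

	close(fd);
	return 0;
}
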
1787void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1836void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1788{ 1837{
1789 struct core_state core_state; 1838 struct core_state core_state;
1790 char corename[CORENAME_MAX_SIZE + 1]; 1839 char corename[CORENAME_MAX_SIZE + 1];
1791 struct mm_struct *mm = current->mm; 1840 struct mm_struct *mm = current->mm;
1792 struct linux_binfmt * binfmt; 1841 struct linux_binfmt * binfmt;
1793 struct inode * inode;
1794 const struct cred *old_cred; 1842 const struct cred *old_cred;
1795 struct cred *cred; 1843 struct cred *cred;
1796 int retval = 0; 1844 int retval = 0;
1797 int flag = 0; 1845 int flag = 0;
1798 int ispipe = 0; 1846 int ispipe;
1799 char **helper_argv = NULL;
1800 int helper_argc = 0;
1801 int dump_count = 0;
1802 static atomic_t core_dump_count = ATOMIC_INIT(0); 1847 static atomic_t core_dump_count = ATOMIC_INIT(0);
1803 struct coredump_params cprm = { 1848 struct coredump_params cprm = {
1804 .signr = signr, 1849 .signr = signr,
@@ -1817,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1817 binfmt = mm->binfmt; 1862 binfmt = mm->binfmt;
1818 if (!binfmt || !binfmt->core_dump) 1863 if (!binfmt || !binfmt->core_dump)
1819 goto fail; 1864 goto fail;
1820 1865 if (!__get_dumpable(cprm.mm_flags))
1821 cred = prepare_creds();
1822 if (!cred) {
1823 retval = -ENOMEM;
1824 goto fail; 1866 goto fail;
1825 }
1826 1867
1827 down_write(&mm->mmap_sem); 1868 cred = prepare_creds();
1828 /* 1869 if (!cred)
1829 * If another thread got here first, or we are not dumpable, bail out.
1830 */
1831 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1832 up_write(&mm->mmap_sem);
1833 put_cred(cred);
1834 goto fail; 1870 goto fail;
1835 }
1836
1837 /* 1871 /*
1838 * We cannot trust fsuid as being the "true" uid of the 1872 * We cannot trust fsuid as being the "true" uid of the
1839 * process nor do we know its entire history. We only know it 1873 * process nor do we know its entire history. We only know it
@@ -1846,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1846 } 1880 }
1847 1881
1848 retval = coredump_wait(exit_code, &core_state); 1882 retval = coredump_wait(exit_code, &core_state);
1849 if (retval < 0) { 1883 if (retval < 0)
1850 put_cred(cred); 1884 goto fail_creds;
1851 goto fail;
1852 }
1853 1885
1854 old_cred = override_creds(cred); 1886 old_cred = override_creds(cred);
1855 1887
@@ -1867,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1867 ispipe = format_corename(corename, signr); 1899 ispipe = format_corename(corename, signr);
1868 unlock_kernel(); 1900 unlock_kernel();
1869 1901
1870 if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1871 goto fail_unlock;
1872
1873 if (ispipe) { 1902 if (ispipe) {
1874 if (cprm.limit == 0) { 1903 int dump_count;
1904 char **helper_argv;
1905
1906 if (cprm.limit == 1) {
1875 /* 1907 /*
1876 * Normally core limits are irrelevant to pipes, since 1908 * Normally core limits are irrelevant to pipes, since
1877 * we're not writing to the file system, but we use 1909 * we're not writing to the file system, but we use
1878 * cprm.limit of 0 here as a special value. Any 1910 * cprm.limit of 1 here as a special value. Any
1879 * non-zero limit gets set to RLIM_INFINITY below, but 1911 * non-1 limit gets set to RLIM_INFINITY below, but
1880 * a limit of 0 skips the dump. This is a consistent 1912 * a limit of 0 skips the dump. This is a consistent
1881 * way to catch recursive crashes. We can still crash 1913 * way to catch recursive crashes. We can still crash
1882 * if the core_pattern binary sets RLIM_CORE = !0 1914 * if the core_pattern binary sets RLIM_CORE = !1
1883 * but it runs as root, and can do lots of stupid things 1915 * but it runs as root, and can do lots of stupid things
1884 * Note that we use task_tgid_vnr here to grab the pid 1916 * Note that we use task_tgid_vnr here to grab the pid
1885 * of the process group leader. That way we get the 1917 * of the process group leader. That way we get the
@@ -1887,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1887 * core_pattern process dies. 1919 * core_pattern process dies.
1888 */ 1920 */
1889 printk(KERN_WARNING 1921 printk(KERN_WARNING
1890 "Process %d(%s) has RLIMIT_CORE set to 0\n", 1922 "Process %d(%s) has RLIMIT_CORE set to 1\n",
1891 task_tgid_vnr(current), current->comm); 1923 task_tgid_vnr(current), current->comm);
1892 printk(KERN_WARNING "Aborting core\n"); 1924 printk(KERN_WARNING "Aborting core\n");
1893 goto fail_unlock; 1925 goto fail_unlock;
1894 } 1926 }
1927 cprm.limit = RLIM_INFINITY;
1895 1928
1896 dump_count = atomic_inc_return(&core_dump_count); 1929 dump_count = atomic_inc_return(&core_dump_count);
1897 if (core_pipe_limit && (core_pipe_limit < dump_count)) { 1930 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1901,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1901 goto fail_dropcount; 1934 goto fail_dropcount;
1902 } 1935 }
1903 1936
1904 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1937 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
1905 if (!helper_argv) { 1938 if (!helper_argv) {
1906 printk(KERN_WARNING "%s failed to allocate memory\n", 1939 printk(KERN_WARNING "%s failed to allocate memory\n",
1907 __func__); 1940 __func__);
1908 goto fail_dropcount; 1941 goto fail_dropcount;
1909 } 1942 }
1910 1943
1911 cprm.limit = RLIM_INFINITY; 1944 retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
1912 1945 NULL, UMH_WAIT_EXEC, umh_pipe_setup,
1913 /* SIGPIPE can happen, but it's just never processed */ 1946 NULL, &cprm);
1914 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, 1947 argv_free(helper_argv);
1915 &cprm.file)) { 1948 if (retval) {
1916 printk(KERN_INFO "Core dump to %s pipe failed\n", 1949 printk(KERN_INFO "Core dump to %s pipe failed\n",
1917 corename); 1950 corename);
1918 goto fail_dropcount; 1951 goto close_fail;
1919 } 1952 }
1920 } else 1953 } else {
1954 struct inode *inode;
1955
1956 if (cprm.limit < binfmt->min_coredump)
1957 goto fail_unlock;
1958
1921 cprm.file = filp_open(corename, 1959 cprm.file = filp_open(corename,
1922 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1960 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1923 0600); 1961 0600);
1924 if (IS_ERR(cprm.file)) 1962 if (IS_ERR(cprm.file))
1925 goto fail_dropcount; 1963 goto fail_unlock;
1926 inode = cprm.file->f_path.dentry->d_inode;
1927 if (inode->i_nlink > 1)
1928 goto close_fail; /* multiple links - don't dump */
1929 if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1930 goto close_fail;
1931
1932 /* AK: actually I see no reason not to allow this for named pipes etc.,
1933 but keep the previous behaviour for now. */
1934 if (!ispipe && !S_ISREG(inode->i_mode))
1935 goto close_fail;
1936 /*
1937 * Don't allow local users to get cute and trick others into
1938 * coredumping into their pre-created files:
1939 * Note, this is not relevant for pipes
1940 */
1941 if (!ispipe && (inode->i_uid != current_fsuid()))
1942 goto close_fail;
1943 if (!cprm.file->f_op)
1944 goto close_fail;
1945 if (!cprm.file->f_op->write)
1946 goto close_fail;
1947 if (!ispipe &&
1948 do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1949 goto close_fail;
1950 1964
1951 retval = binfmt->core_dump(&cprm); 1965 inode = cprm.file->f_path.dentry->d_inode;
1966 if (inode->i_nlink > 1)
1967 goto close_fail;
1968 if (d_unhashed(cprm.file->f_path.dentry))
1969 goto close_fail;
1970 /*
1971 * AK: actually I see no reason not to allow this for named
1972 * pipes etc., but keep the previous behaviour for now.
1973 */
1974 if (!S_ISREG(inode->i_mode))
1975 goto close_fail;
1976 /*
1977 * Don't allow local users to get cute and trick others into
1978 * coredumping into their pre-created files.
1979 */
1980 if (inode->i_uid != current_fsuid())
1981 goto close_fail;
1982 if (!cprm.file->f_op || !cprm.file->f_op->write)
1983 goto close_fail;
1984 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
1985 goto close_fail;
1986 }
1952 1987
1988 retval = binfmt->core_dump(&cprm);
1953 if (retval) 1989 if (retval)
1954 current->signal->group_exit_code |= 0x80; 1990 current->signal->group_exit_code |= 0x80;
1955close_fail: 1991
1956 if (ispipe && core_pipe_limit) 1992 if (ispipe && core_pipe_limit)
1957 wait_for_dump_helpers(cprm.file); 1993 wait_for_dump_helpers(cprm.file);
1958 filp_close(cprm.file, NULL); 1994close_fail:
1995 if (cprm.file)
1996 filp_close(cprm.file, NULL);
1959fail_dropcount: 1997fail_dropcount:
1960 if (dump_count) 1998 if (ispipe)
1961 atomic_dec(&core_dump_count); 1999 atomic_dec(&core_dump_count);
1962fail_unlock: 2000fail_unlock:
1963 if (helper_argv) 2001 coredump_finish(mm);
1964 argv_free(helper_argv);
1965
1966 revert_creds(old_cred); 2002 revert_creds(old_cred);
2003fail_creds:
1967 put_cred(cred); 2004 put_cred(cred);
1968 coredump_finish(mm);
1969fail: 2005fail:
1970 return; 2006 return;
1971} 2007}
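
One consequence of the rework above: umh_pipe_setup() pins the helper's RLIMIT_CORE to {1, 1}, and do_coredump() aborts when it sees cprm.limit == 1, so a crash of the helper itself cannot recurse. A hedged userspace check a helper could perform (the sentinel value 1 comes from this patch; the rest is illustrative):

/* Inside a core_pattern helper: detect the recursion-trap rlimit. */
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_CORE, &rl) == 0 &&
	    rl.rlim_cur == 1 && rl.rlim_max == 1)
		fprintf(stderr, "running under the kernel coredump path; "
				"our own crashes will not recurse\n");
	return 0;
}
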
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
608 de->inode_no = cpu_to_le64(parent->i_ino); 608 de->inode_no = cpu_to_le64(parent->i_ino);
609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); 609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
610 exofs_set_de_type(de, inode); 610 exofs_set_de_type(de, inode);
611 kunmap_atomic(page, KM_USER0); 611 kunmap_atomic(kaddr, KM_USER0);
612 err = exofs_commit_chunk(page, 0, chunk_size); 612 err = exofs_commit_chunk(page, 0, chunk_size);
613fail: 613fail:
614 page_cache_release(page); 614 page_cache_release(page);
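
The exofs/dir.c fix above is the classic kmap_atomic pairing bug: kunmap_atomic() wants the kernel virtual address that kmap_atomic() returned, not the struct page that was mapped. A sketch of the correct pairing, assuming kernel context (linux/highmem.h); the helper name is illustrative:

/*
 * Pairing rule restored by the fix: unmap the address, not the page.
 */
static void zero_page_tail(struct page *page, unsigned int from)
{
	char *kaddr = kmap_atomic(page, KM_USER0);

	memset(kaddr + from, 0, PAGE_SIZE - from);
	kunmap_atomic(kaddr, KM_USER0);		/* kaddr, never page */
}
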
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 8442e353309f..22721b2fd890 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -35,6 +35,7 @@
35 35
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/backing-dev.h>
38#include "common.h" 39#include "common.h"
39 40
40/* FIXME: Remove once pnfs hits mainline 41/* FIXME: Remove once pnfs hits mainline
@@ -84,6 +85,7 @@ struct exofs_sb_info {
84 u32 s_next_generation; /* next gen # to use */ 85 u32 s_next_generation; /* next gen # to use */
85 atomic_t s_curr_pending; /* number of pending commands */ 86 atomic_t s_curr_pending; /* number of pending commands */
86 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ 87 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
88 struct backing_dev_info bdi; /* register our bdi with VFS */
87 89
88 struct pnfs_osd_data_map data_map; /* Default raid to use 90 struct pnfs_osd_data_map data_map; /* Default raid to use
89 * FIXME: Needed ? 91 * FIXME: Needed ?
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc1e70f..fef6899be397 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
40 return 0; 40 return 0;
41} 41}
42 42
43static int exofs_file_fsync(struct file *filp, struct dentry *dentry, 43static int exofs_file_fsync(struct file *filp, int datasync)
44 int datasync)
45{ 44{
46 int ret; 45 int ret;
47 struct address_space *mapping = filp->f_mapping; 46 struct address_space *mapping = filp->f_mapping;
48 struct inode *inode = dentry->d_inode; 47 struct inode *inode = mapping->host;
49 struct super_block *sb; 48 struct super_block *sb;
50 49
51 ret = filemap_write_and_wait(mapping); 50 ret = filemap_write_and_wait(mapping);
@@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
66 65
67static int exofs_flush(struct file *file, fl_owner_t id) 66static int exofs_flush(struct file *file, fl_owner_t id)
68{ 67{
69 exofs_file_fsync(file, file->f_path.dentry, 1); 68 exofs_file_fsync(file, 1);
70 /* TODO: Flush the OSD target */ 69 /* TODO: Flush the OSD target */
71 return 0; 70 return 0;
72} 71}
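
Both exofs hunks track the tree-wide prototype change in which ->fsync() no longer receives a dentry; implementations recover the inode from file->f_mapping->host instead. A minimal sketch of the new shape, assuming kernel context; example_fsync and the final write_inode_now() call are illustrative, not exofs code:

/* New-style ->fsync(): no dentry argument, inode via the mapping. */
static int example_fsync(struct file *file, int datasync)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	int err;

	err = filemap_write_and_wait(mapping);	/* flush the data pages */
	if (err)
		return err;
	/* datasync-only optimizations elided for brevity */
	return write_inode_now(inode, 1);	/* then the metadata */
}
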
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a17e4b733e35..4bb6ef822e46 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -31,6 +31,7 @@
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33 33
34#include <linux/slab.h>
34#include <linux/writeback.h> 35#include <linux/writeback.h>
35#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
36#include <scsi/scsi_device.h> 37#include <scsi/scsi_device.h>
@@ -754,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
754 return ret; 755 return ret;
755} 756}
756 757
758static int exofs_releasepage(struct page *page, gfp_t gfp)
759{
760 EXOFS_DBGMSG("page 0x%lx\n", page->index);
761 WARN_ON(1);
762 return try_to_free_buffers(page);
763}
764
765static void exofs_invalidatepage(struct page *page, unsigned long offset)
766{
767 EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
768 WARN_ON(1);
769
770 block_invalidatepage(page, offset);
771}
772
757const struct address_space_operations exofs_aops = { 773const struct address_space_operations exofs_aops = {
758 .readpage = exofs_readpage, 774 .readpage = exofs_readpage,
759 .readpages = exofs_readpages, 775 .readpages = exofs_readpages,
@@ -761,6 +777,21 @@ const struct address_space_operations exofs_aops = {
761 .writepages = exofs_writepages, 777 .writepages = exofs_writepages,
762 .write_begin = exofs_write_begin_export, 778 .write_begin = exofs_write_begin_export,
763 .write_end = exofs_write_end, 779 .write_end = exofs_write_end,
780 .releasepage = exofs_releasepage,
781 .set_page_dirty = __set_page_dirty_nobuffers,
782 .invalidatepage = exofs_invalidatepage,
783
784 /* Not implemented yet */
785 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
786 .direct_IO = NULL, /* TODO: Should be trivial to do */
787
788 /* For these, NULL has a special meaning or the default is not exported */
789 .sync_page = NULL,
790 .get_xip_mem = NULL,
791 .migratepage = NULL,
792 .launder_page = NULL,
793 .is_partially_uptodate = NULL,
794 .error_remove_page = NULL,
764}; 795};
765 796
766/****************************************************************************** 797/******************************************************************************
@@ -1122,16 +1153,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1122 sbi = sb->s_fs_info; 1153 sbi = sb->s_fs_info;
1123 1154
1124 sb->s_dirt = 1; 1155 sb->s_dirt = 1;
1125 inode->i_uid = current->cred->fsuid; 1156 inode_init_owner(inode, dir, mode);
1126 if (dir->i_mode & S_ISGID) {
1127 inode->i_gid = dir->i_gid;
1128 if (S_ISDIR(mode))
1129 mode |= S_ISGID;
1130 } else {
1131 inode->i_gid = current->cred->fsgid;
1132 }
1133 inode->i_mode = mode;
1134
1135 inode->i_ino = sbi->s_nextid++; 1157 inode->i_ino = sbi->s_nextid++;
1136 inode->i_blkbits = EXOFS_BLKSHIFT; 1158 inode->i_blkbits = EXOFS_BLKSHIFT;
1137 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1159 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5293bc411d17..4337cad7777b 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -22,6 +22,7 @@
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */ 23 */
24 24
25#include <linux/slab.h>
25#include <scsi/scsi_device.h> 26#include <scsi/scsi_device.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27 28
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 6cf5e4e84d61..03149b9a5178 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
37#include <linux/vfs.h> 37#include <linux/vfs.h>
38#include <linux/random.h> 38#include <linux/random.h>
39#include <linux/exportfs.h> 39#include <linux/exportfs.h>
40#include <linux/slab.h>
40 41
41#include "exofs.h" 42#include "exofs.h"
42 43
@@ -301,6 +302,7 @@ static void exofs_put_super(struct super_block *sb)
301 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], 302 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
302 sbi->layout.s_pid); 303 sbi->layout.s_pid);
303 304
305 bdi_destroy(&sbi->bdi);
304 exofs_free_sbi(sbi); 306 exofs_free_sbi(sbi);
305 sb->s_fs_info = NULL; 307 sb->s_fs_info = NULL;
306} 308}
@@ -545,6 +547,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
545 if (!sbi) 547 if (!sbi)
546 return -ENOMEM; 548 return -ENOMEM;
547 549
550 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
551 if (ret)
552 goto free_bdi;
553
548 /* use mount options to fill superblock */ 554 /* use mount options to fill superblock */
549 od = osduld_path_lookup(opts->dev_name); 555 od = osduld_path_lookup(opts->dev_name);
550 if (IS_ERR(od)) { 556 if (IS_ERR(od)) {
@@ -611,6 +617,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
611 } 617 }
612 618
613 /* set up operation vectors */ 619 /* set up operation vectors */
620 sb->s_bdi = &sbi->bdi;
614 sb->s_fs_info = sbi; 621 sb->s_fs_info = sbi;
615 sb->s_op = &exofs_sops; 622 sb->s_op = &exofs_sops;
616 sb->s_export_op = &exofs_export_ops; 623 sb->s_export_op = &exofs_export_ops;
@@ -642,6 +649,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
642 return 0; 649 return 0;
643 650
644free_sbi: 651free_sbi:
652 bdi_destroy(&sbi->bdi);
653free_bdi:
645 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 654 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
646 opts->dev_name, sbi->layout.s_pid, ret); 655 opts->dev_name, sbi->layout.s_pid, ret);
647 exofs_free_sbi(sbi); 656 exofs_free_sbi(sbi);
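
The exofs superblock changes follow the standard shape for a filesystem that owns its backing_dev_info: register a per-sb bdi during fill_super, point sb->s_bdi at it, and destroy it on both the error path and put_super. A condensed sketch of that pattern; the example_* names and error labels are illustrative, and the filesystem-specific steps are elided:

struct example_sb_info {
	struct backing_dev_info bdi;
	/* ... filesystem-private state ... */
};

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	struct example_sb_info *sbi;
	int ret;

	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
	if (!sbi)
		return -ENOMEM;

	ret = bdi_setup_and_register(&sbi->bdi, "example", BDI_CAP_MAP_COPY);
	if (ret)
		goto free_sbi;

	sb->s_bdi = &sbi->bdi;	/* VFS writeback now targets our bdi */
	sb->s_fs_info = sbi;
	/* ... device lookup, root inode, etc. ... */
	return 0;

free_sbi:
	kfree(sbi);
	return ret;
}

static void example_put_super(struct super_block *sb)
{
	struct example_sb_info *sbi = sb->s_fs_info;

	bdi_destroy(&sbi->bdi);	/* pairs with bdi_setup_and_register() */
	kfree(sbi);
	sb->s_fs_info = NULL;
}
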
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a99e54318c3d..ca7e2a0ed98a 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -420,7 +420,7 @@ release_and_out:
420 return error; 420 return error;
421} 421}
422 422
423struct xattr_handler ext2_xattr_acl_access_handler = { 423const struct xattr_handler ext2_xattr_acl_access_handler = {
424 .prefix = POSIX_ACL_XATTR_ACCESS, 424 .prefix = POSIX_ACL_XATTR_ACCESS,
425 .flags = ACL_TYPE_ACCESS, 425 .flags = ACL_TYPE_ACCESS,
426 .list = ext2_xattr_list_acl_access, 426 .list = ext2_xattr_list_acl_access,
@@ -428,7 +428,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = {
428 .set = ext2_xattr_set_acl, 428 .set = ext2_xattr_set_acl,
429}; 429};
430 430
431struct xattr_handler ext2_xattr_acl_default_handler = { 431const struct xattr_handler ext2_xattr_acl_default_handler = {
432 .prefix = POSIX_ACL_XATTR_DEFAULT, 432 .prefix = POSIX_ACL_XATTR_DEFAULT,
433 .flags = ACL_TYPE_DEFAULT, 433 .flags = ACL_TYPE_DEFAULT,
434 .list = ext2_xattr_list_acl_default, 434 .list = ext2_xattr_list_acl_default,
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 1d081f0cfec2..e8766a396776 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -13,6 +13,7 @@
13 13
14#include "ext2.h" 14#include "ext2.h"
15#include <linux/quotaops.h> 15#include <linux/quotaops.h>
16#include <linux/slab.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
18#include <linux/capability.h> 19#include <linux/capability.h>
@@ -1331,6 +1332,12 @@ retry_alloc:
1331 1332
1332 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); 1333 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1333 /* 1334 /*
1335 * skip this group (and avoid loading bitmap) if there
1336 * are no free blocks
1337 */
1338 if (!free_blocks)
1339 continue;
1340 /*
1334 * skip this group if the number of 1341 * skip this group if the number of
1335 * free blocks is less than half of the reservation 1342 * free blocks is less than half of the reservation
1336 * window size. 1343 * window size.
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b038e47ad2f..52b34f1d2738 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -122,7 +122,6 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_delete_inode (struct inode *); 122extern void ext2_delete_inode (struct inode *);
123extern int ext2_sync_inode (struct inode *); 123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
125extern void ext2_truncate (struct inode *);
126extern int ext2_setattr (struct dentry *, struct iattr *); 125extern int ext2_setattr (struct dentry *, struct iattr *);
127extern void ext2_set_inode_flags(struct inode *inode); 126extern void ext2_set_inode_flags(struct inode *inode);
128extern void ext2_get_inode_flags(struct ext2_inode_info *); 127extern void ext2_get_inode_flags(struct ext2_inode_info *);
@@ -155,7 +154,7 @@ extern void ext2_write_super (struct super_block *);
155extern const struct file_operations ext2_dir_operations; 154extern const struct file_operations ext2_dir_operations;
156 155
157/* file.c */ 156/* file.c */
158extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync); 157extern int ext2_fsync(struct file *file, int datasync);
159extern const struct inode_operations ext2_file_inode_operations; 158extern const struct inode_operations ext2_file_inode_operations;
160extern const struct file_operations ext2_file_operations; 159extern const struct file_operations ext2_file_operations;
161extern const struct file_operations ext2_xip_file_operations; 160extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5d198d0697fb..49eec9456c5b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
40 return 0; 40 return 0;
41} 41}
42 42
43int ext2_fsync(struct file *file, struct dentry *dentry, int datasync) 43int ext2_fsync(struct file *file, int datasync)
44{ 44{
45 int ret; 45 int ret;
46 struct super_block *sb = dentry->d_inode->i_sb; 46 struct super_block *sb = file->f_mapping->host->i_sb;
47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; 47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
48 48
49 ret = simple_fsync(file, dentry, datasync); 49 ret = generic_file_fsync(file, datasync);
50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { 50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
51 /* We don't really know where the IO error happened... */ 51 /* We don't really know where the IO error happened... */
52 ext2_error(sb, __func__, 52 ext2_error(sb, __func__,
@@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = {
95#endif 95#endif
96 96
97const struct inode_operations ext2_file_inode_operations = { 97const struct inode_operations ext2_file_inode_operations = {
98 .truncate = ext2_truncate,
99#ifdef CONFIG_EXT2_FS_XATTR 98#ifdef CONFIG_EXT2_FS_XATTR
100 .setxattr = generic_setxattr, 99 .setxattr = generic_setxattr,
101 .getxattr = generic_getxattr, 100 .getxattr = generic_getxattr,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad7d572ee8dc..938dbc739d00 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -106,7 +106,7 @@ void ext2_free_inode (struct inode * inode)
106 struct super_block * sb = inode->i_sb; 106 struct super_block * sb = inode->i_sb;
107 int is_directory; 107 int is_directory;
108 unsigned long ino; 108 unsigned long ino;
109 struct buffer_head *bitmap_bh = NULL; 109 struct buffer_head *bitmap_bh;
110 unsigned long block_group; 110 unsigned long block_group;
111 unsigned long bit; 111 unsigned long bit;
112 struct ext2_super_block * es; 112 struct ext2_super_block * es;
@@ -135,14 +135,13 @@ void ext2_free_inode (struct inode * inode)
135 ino > le32_to_cpu(es->s_inodes_count)) { 135 ino > le32_to_cpu(es->s_inodes_count)) {
136 ext2_error (sb, "ext2_free_inode", 136 ext2_error (sb, "ext2_free_inode",
137 "reserved or nonexistent inode %lu", ino); 137 "reserved or nonexistent inode %lu", ino);
138 goto error_return; 138 return;
139 } 139 }
140 block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); 140 block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
141 bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb); 141 bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb);
142 brelse(bitmap_bh);
143 bitmap_bh = read_inode_bitmap(sb, block_group); 142 bitmap_bh = read_inode_bitmap(sb, block_group);
144 if (!bitmap_bh) 143 if (!bitmap_bh)
145 goto error_return; 144 return;
146 145
147 /* Ok, now we can actually update the inode bitmaps.. */ 146 /* Ok, now we can actually update the inode bitmaps.. */
148 if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group), 147 if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group),
@@ -154,7 +153,7 @@ void ext2_free_inode (struct inode * inode)
154 mark_buffer_dirty(bitmap_bh); 153 mark_buffer_dirty(bitmap_bh);
155 if (sb->s_flags & MS_SYNCHRONOUS) 154 if (sb->s_flags & MS_SYNCHRONOUS)
156 sync_dirty_buffer(bitmap_bh); 155 sync_dirty_buffer(bitmap_bh);
157error_return: 156
158 brelse(bitmap_bh); 157 brelse(bitmap_bh);
159} 158}
160 159
@@ -550,16 +549,12 @@ got:
550 549
551 sb->s_dirt = 1; 550 sb->s_dirt = 1;
552 mark_buffer_dirty(bh2); 551 mark_buffer_dirty(bh2);
553 inode->i_uid = current_fsuid(); 552 if (test_opt(sb, GRPID)) {
554 if (test_opt (sb, GRPID)) 553 inode->i_mode = mode;
555 inode->i_gid = dir->i_gid; 554 inode->i_uid = current_fsuid();
556 else if (dir->i_mode & S_ISGID) {
557 inode->i_gid = dir->i_gid; 555 inode->i_gid = dir->i_gid;
558 if (S_ISDIR(mode))
559 mode |= S_ISGID;
560 } else 556 } else
561 inode->i_gid = current_fsgid(); 557 inode_init_owner(inode, dir, mode);
562 inode->i_mode = mode;
563 558
564 inode->i_ino = ino; 559 inode->i_ino = ino;
565 inode->i_blocks = 0; 560 inode->i_blocks = 0;
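
The exofs and ext2 hunks above replace the same open-coded ownership block with inode_init_owner(). Reconstructed from the lines being removed, the logic the helper centralizes is roughly the following sketch (not the helper's actual source):

static void sketch_init_owner(struct inode *inode, const struct inode *dir,
			      mode_t mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;	/* inherit group from parent */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* new dirs inherit setgid too */
	} else {
		inode->i_gid = current_fsgid();
	}
	inode->i_mode = mode;
}

Note that the ext2 GRPID mount option keeps its own branch in the caller, as the ialloc.c hunk shows; only the common default policy moves into the helper.
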
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fc13cc119aad..19214435b752 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -22,7 +22,6 @@
22 * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000 22 * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
23 */ 23 */
24 24
25#include <linux/smp_lock.h>
26#include <linux/time.h> 25#include <linux/time.h>
27#include <linux/highuid.h> 26#include <linux/highuid.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
@@ -55,6 +54,18 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
55 inode->i_blocks - ea_blocks == 0); 54 inode->i_blocks - ea_blocks == 0);
56} 55}
57 56
57static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
58
59static void ext2_write_failed(struct address_space *mapping, loff_t to)
60{
61 struct inode *inode = mapping->host;
62
63 if (to > inode->i_size) {
64 truncate_pagecache(inode, to, inode->i_size);
65 ext2_truncate_blocks(inode, inode->i_size);
66 }
67}
68
58/* 69/*
59 * Called at the last iput() if i_nlink is zero. 70 * Called at the last iput() if i_nlink is zero.
60 */ 71 */
@@ -72,7 +83,7 @@ void ext2_delete_inode (struct inode * inode)
72 83
73 inode->i_size = 0; 84 inode->i_size = 0;
74 if (inode->i_blocks) 85 if (inode->i_blocks)
75 ext2_truncate (inode); 86 ext2_truncate_blocks(inode, 0);
76 ext2_free_inode (inode); 87 ext2_free_inode (inode);
77 88
78 return; 89 return;
@@ -758,8 +769,8 @@ int __ext2_write_begin(struct file *file, struct address_space *mapping,
758 loff_t pos, unsigned len, unsigned flags, 769 loff_t pos, unsigned len, unsigned flags,
759 struct page **pagep, void **fsdata) 770 struct page **pagep, void **fsdata)
760{ 771{
761 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 772 return block_write_begin_newtrunc(file, mapping, pos, len, flags,
762 ext2_get_block); 773 pagep, fsdata, ext2_get_block);
763} 774}
764 775
765static int 776static int
@@ -767,8 +778,25 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
767 loff_t pos, unsigned len, unsigned flags, 778 loff_t pos, unsigned len, unsigned flags,
768 struct page **pagep, void **fsdata) 779 struct page **pagep, void **fsdata)
769{ 780{
781 int ret;
782
770 *pagep = NULL; 783 *pagep = NULL;
771 return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata); 784 ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
785 if (ret < 0)
786 ext2_write_failed(mapping, pos + len);
787 return ret;
788}
789
790static int ext2_write_end(struct file *file, struct address_space *mapping,
791 loff_t pos, unsigned len, unsigned copied,
792 struct page *page, void *fsdata)
793{
794 int ret;
795
796 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
797 if (ret < len)
798 ext2_write_failed(mapping, pos + len);
799 return ret;
772} 800}
773 801
774static int 802static int
@@ -776,13 +804,18 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
776 loff_t pos, unsigned len, unsigned flags, 804 loff_t pos, unsigned len, unsigned flags,
777 struct page **pagep, void **fsdata) 805 struct page **pagep, void **fsdata)
778{ 806{
807 int ret;
808
779 /* 809 /*
780 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework 810 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
781 * directory handling code to pass around offsets rather than struct 811 * directory handling code to pass around offsets rather than struct
782 * pages in order to make this work easily. 812 * pages in order to make this work easily.
783 */ 813 */
784 return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 814 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
785 ext2_get_block); 815 fsdata, ext2_get_block);
816 if (ret < 0)
817 ext2_write_failed(mapping, pos + len);
818 return ret;
786} 819}
787 820
788static int ext2_nobh_writepage(struct page *page, 821static int ext2_nobh_writepage(struct page *page,
@@ -801,10 +834,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
801 loff_t offset, unsigned long nr_segs) 834 loff_t offset, unsigned long nr_segs)
802{ 835{
803 struct file *file = iocb->ki_filp; 836 struct file *file = iocb->ki_filp;
804 struct inode *inode = file->f_mapping->host; 837 struct address_space *mapping = file->f_mapping;
805 838 struct inode *inode = mapping->host;
806 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 839 ssize_t ret;
807 offset, nr_segs, ext2_get_block, NULL); 840
841 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
842 iov, offset, nr_segs, ext2_get_block, NULL);
843 if (ret < 0 && (rw & WRITE))
844 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
845 return ret;
808} 846}
809 847
810static int 848static int
@@ -819,7 +857,7 @@ const struct address_space_operations ext2_aops = {
819 .writepage = ext2_writepage, 857 .writepage = ext2_writepage,
820 .sync_page = block_sync_page, 858 .sync_page = block_sync_page,
821 .write_begin = ext2_write_begin, 859 .write_begin = ext2_write_begin,
822 .write_end = generic_write_end, 860 .write_end = ext2_write_end,
823 .bmap = ext2_bmap, 861 .bmap = ext2_bmap,
824 .direct_IO = ext2_direct_IO, 862 .direct_IO = ext2_direct_IO,
825 .writepages = ext2_writepages, 863 .writepages = ext2_writepages,
@@ -1028,7 +1066,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
1028 ext2_free_data(inode, p, q); 1066 ext2_free_data(inode, p, q);
1029} 1067}
1030 1068
1031void ext2_truncate(struct inode *inode) 1069static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
1032{ 1070{
1033 __le32 *i_data = EXT2_I(inode)->i_data; 1071 __le32 *i_data = EXT2_I(inode)->i_data;
1034 struct ext2_inode_info *ei = EXT2_I(inode); 1072 struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1040,27 +1078,8 @@ void ext2_truncate(struct inode *inode)
1040 int n; 1078 int n;
1041 long iblock; 1079 long iblock;
1042 unsigned blocksize; 1080 unsigned blocksize;
1043
1044 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1045 S_ISLNK(inode->i_mode)))
1046 return;
1047 if (ext2_inode_is_fast_symlink(inode))
1048 return;
1049 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1050 return;
1051
1052 blocksize = inode->i_sb->s_blocksize; 1081 blocksize = inode->i_sb->s_blocksize;
1053 iblock = (inode->i_size + blocksize-1) 1082 iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1054 >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1055
1056 if (mapping_is_xip(inode->i_mapping))
1057 xip_truncate_page(inode->i_mapping, inode->i_size);
1058 else if (test_opt(inode->i_sb, NOBH))
1059 nobh_truncate_page(inode->i_mapping,
1060 inode->i_size, ext2_get_block);
1061 else
1062 block_truncate_page(inode->i_mapping,
1063 inode->i_size, ext2_get_block);
1064 1083
1065 n = ext2_block_to_path(inode, iblock, offsets, NULL); 1084 n = ext2_block_to_path(inode, iblock, offsets, NULL);
1066 if (n == 0) 1085 if (n == 0)
@@ -1128,6 +1147,62 @@ do_indirects:
1128 ext2_discard_reservation(inode); 1147 ext2_discard_reservation(inode);
1129 1148
1130 mutex_unlock(&ei->truncate_mutex); 1149 mutex_unlock(&ei->truncate_mutex);
1150}
1151
1152static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
1153{
1154 /*
1155 * XXX: it seems like a bug here that we don't allow
1156 * IS_APPEND inode to have blocks-past-i_size trimmed off.
1157 * review and fix this.
1158 *
1159 * Also would be nice to be able to handle IO errors and such,
1160 * but that's probably too much to ask.
1161 */
1162 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1163 S_ISLNK(inode->i_mode)))
1164 return;
1165 if (ext2_inode_is_fast_symlink(inode))
1166 return;
1167 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1168 return;
1169 __ext2_truncate_blocks(inode, offset);
1170}
1171
1172int ext2_setsize(struct inode *inode, loff_t newsize)
1173{
1174 loff_t oldsize;
1175 int error;
1176
1177 error = inode_newsize_ok(inode, newsize);
1178 if (error)
1179 return error;
1180
1181 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1182 S_ISLNK(inode->i_mode)))
1183 return -EINVAL;
1184 if (ext2_inode_is_fast_symlink(inode))
1185 return -EINVAL;
1186 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1187 return -EPERM;
1188
1189 if (mapping_is_xip(inode->i_mapping))
1190 error = xip_truncate_page(inode->i_mapping, newsize);
1191 else if (test_opt(inode->i_sb, NOBH))
1192 error = nobh_truncate_page(inode->i_mapping,
1193 newsize, ext2_get_block);
1194 else
1195 error = block_truncate_page(inode->i_mapping,
1196 newsize, ext2_get_block);
1197 if (error)
1198 return error;
1199
1200 oldsize = inode->i_size;
1201 i_size_write(inode, newsize);
1202 truncate_pagecache(inode, oldsize, newsize);
1203
1204 __ext2_truncate_blocks(inode, newsize);
1205
1131 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1206 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1132 if (inode_needs_sync(inode)) { 1207 if (inode_needs_sync(inode)) {
1133 sync_mapping_buffers(inode->i_mapping); 1208 sync_mapping_buffers(inode->i_mapping);
@@ -1135,6 +1210,8 @@ do_indirects:
1135 } else { 1210 } else {
1136 mark_inode_dirty(inode); 1211 mark_inode_dirty(inode);
1137 } 1212 }
1213
1214 return 0;
1138} 1215}
1139 1216
1140static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, 1217static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
@@ -1406,11 +1483,11 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
1406 /* If this is the first large file 1483 /* If this is the first large file
1407 * created, add a flag to the superblock. 1484 * created, add a flag to the superblock.
1408 */ 1485 */
1409 lock_kernel(); 1486 spin_lock(&EXT2_SB(sb)->s_lock);
1410 ext2_update_dynamic_rev(sb); 1487 ext2_update_dynamic_rev(sb);
1411 EXT2_SET_RO_COMPAT_FEATURE(sb, 1488 EXT2_SET_RO_COMPAT_FEATURE(sb,
1412 EXT2_FEATURE_RO_COMPAT_LARGE_FILE); 1489 EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
1413 unlock_kernel(); 1490 spin_unlock(&EXT2_SB(sb)->s_lock);
1414 ext2_write_super(sb); 1491 ext2_write_super(sb);
1415 } 1492 }
1416 } 1493 }
@@ -1467,7 +1544,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1467 if (error) 1544 if (error)
1468 return error; 1545 return error;
1469 1546
1470 if (iattr->ia_valid & ATTR_SIZE) 1547 if (is_quota_modification(inode, iattr))
1471 dquot_initialize(inode); 1548 dquot_initialize(inode);
1472 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 1549 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
1473 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 1550 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
@@ -1475,8 +1552,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1475 if (error) 1552 if (error)
1476 return error; 1553 return error;
1477 } 1554 }
1478 error = inode_setattr(inode, iattr); 1555 if (iattr->ia_valid & ATTR_SIZE) {
1479 if (!error && (iattr->ia_valid & ATTR_MODE)) 1556 error = ext2_setsize(inode, iattr->ia_size);
1557 if (error)
1558 return error;
1559 }
1560 generic_setattr(inode, iattr);
1561 if (iattr->ia_valid & ATTR_MODE)
1480 error = ext2_acl_chmod(inode); 1562 error = ext2_acl_chmod(inode);
1563 mark_inode_dirty(inode);
1564
1481 return error; 1565 return error;
1482} 1566}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 42e4a303b675..7ff43f4a59cd 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -26,7 +26,6 @@
26#include <linux/random.h> 26#include <linux/random.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/exportfs.h> 28#include <linux/exportfs.h>
29#include <linux/smp_lock.h>
30#include <linux/vfs.h> 29#include <linux/vfs.h>
31#include <linux/seq_file.h> 30#include <linux/seq_file.h>
32#include <linux/mount.h> 31#include <linux/mount.h>
@@ -39,7 +38,7 @@
39#include "xip.h" 38#include "xip.h"
40 39
41static void ext2_sync_super(struct super_block *sb, 40static void ext2_sync_super(struct super_block *sb,
42 struct ext2_super_block *es); 41 struct ext2_super_block *es, int wait);
43static int ext2_remount (struct super_block * sb, int * flags, char * data); 42static int ext2_remount (struct super_block * sb, int * flags, char * data);
44static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); 43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
45static int ext2_sync_fs(struct super_block *sb, int wait); 44static int ext2_sync_fs(struct super_block *sb, int wait);
@@ -52,9 +51,11 @@ void ext2_error (struct super_block * sb, const char * function,
52 struct ext2_super_block *es = sbi->s_es; 51 struct ext2_super_block *es = sbi->s_es;
53 52
54 if (!(sb->s_flags & MS_RDONLY)) { 53 if (!(sb->s_flags & MS_RDONLY)) {
54 spin_lock(&sbi->s_lock);
55 sbi->s_mount_state |= EXT2_ERROR_FS; 55 sbi->s_mount_state |= EXT2_ERROR_FS;
56 es->s_state |= cpu_to_le16(EXT2_ERROR_FS); 56 es->s_state |= cpu_to_le16(EXT2_ERROR_FS);
57 ext2_sync_super(sb, es); 57 spin_unlock(&sbi->s_lock);
58 ext2_sync_super(sb, es, 1);
58 } 59 }
59 60
60 va_start(args, fmt); 61 va_start(args, fmt);
@@ -84,6 +85,9 @@ void ext2_msg(struct super_block *sb, const char *prefix,
84 va_end(args); 85 va_end(args);
85} 86}
86 87
88/*
89 * This must be called with sbi->s_lock held.
90 */
87void ext2_update_dynamic_rev(struct super_block *sb) 91void ext2_update_dynamic_rev(struct super_block *sb)
88{ 92{
89 struct ext2_super_block *es = EXT2_SB(sb)->s_es; 93 struct ext2_super_block *es = EXT2_SB(sb)->s_es;
@@ -115,7 +119,7 @@ static void ext2_put_super (struct super_block * sb)
115 int i; 119 int i;
116 struct ext2_sb_info *sbi = EXT2_SB(sb); 120 struct ext2_sb_info *sbi = EXT2_SB(sb);
117 121
118 lock_kernel(); 122 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
119 123
120 if (sb->s_dirt) 124 if (sb->s_dirt)
121 ext2_write_super(sb); 125 ext2_write_super(sb);
@@ -124,8 +128,10 @@ static void ext2_put_super (struct super_block * sb)
124 if (!(sb->s_flags & MS_RDONLY)) { 128 if (!(sb->s_flags & MS_RDONLY)) {
125 struct ext2_super_block *es = sbi->s_es; 129 struct ext2_super_block *es = sbi->s_es;
126 130
131 spin_lock(&sbi->s_lock);
127 es->s_state = cpu_to_le16(sbi->s_mount_state); 132 es->s_state = cpu_to_le16(sbi->s_mount_state);
128 ext2_sync_super(sb, es); 133 spin_unlock(&sbi->s_lock);
134 ext2_sync_super(sb, es, 1);
129 } 135 }
130 db_count = sbi->s_gdb_count; 136 db_count = sbi->s_gdb_count;
131 for (i = 0; i < db_count; i++) 137 for (i = 0; i < db_count; i++)
@@ -140,8 +146,6 @@ static void ext2_put_super (struct super_block * sb)
140 sb->s_fs_info = NULL; 146 sb->s_fs_info = NULL;
141 kfree(sbi->s_blockgroup_lock); 147 kfree(sbi->s_blockgroup_lock);
142 kfree(sbi); 148 kfree(sbi);
143
144 unlock_kernel();
145} 149}
146 150
147static struct kmem_cache * ext2_inode_cachep; 151static struct kmem_cache * ext2_inode_cachep;
@@ -209,6 +213,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
209 struct ext2_super_block *es = sbi->s_es; 213 struct ext2_super_block *es = sbi->s_es;
210 unsigned long def_mount_opts; 214 unsigned long def_mount_opts;
211 215
216 spin_lock(&sbi->s_lock);
212 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 217 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
213 218
214 if (sbi->s_sb_block != 1) 219 if (sbi->s_sb_block != 1)
@@ -281,6 +286,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
281 if (!test_opt(sb, RESERVATION)) 286 if (!test_opt(sb, RESERVATION))
282 seq_puts(seq, ",noreservation"); 287 seq_puts(seq, ",noreservation");
283 288
289 spin_unlock(&sbi->s_lock);
284 return 0; 290 return 0;
285} 291}
286 292
@@ -606,7 +612,6 @@ static int ext2_setup_super (struct super_block * sb,
606 if (!le16_to_cpu(es->s_max_mnt_count)) 612 if (!le16_to_cpu(es->s_max_mnt_count))
607 es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); 613 es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
608 le16_add_cpu(&es->s_mnt_count, 1); 614 le16_add_cpu(&es->s_mnt_count, 1);
609 ext2_write_super(sb);
610 if (test_opt (sb, DEBUG)) 615 if (test_opt (sb, DEBUG))
611 ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, " 616 ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
612 "bpg=%lu, ipg=%lu, mo=%04lx]", 617 "bpg=%lu, ipg=%lu, mo=%04lx]",
@@ -767,6 +772,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
767 sb->s_fs_info = sbi; 772 sb->s_fs_info = sbi;
768 sbi->s_sb_block = sb_block; 773 sbi->s_sb_block = sb_block;
769 774
775 spin_lock_init(&sbi->s_lock);
776
770 /* 777 /*
771 * See what the current blocksize for the device is, and 778 * See what the current blocksize for the device is, and
772 * use that as the blocksize. Otherwise (or if the blocksize 779 * use that as the blocksize. Otherwise (or if the blocksize
@@ -1058,6 +1065,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1058 sb->s_op = &ext2_sops; 1065 sb->s_op = &ext2_sops;
1059 sb->s_export_op = &ext2_export_ops; 1066 sb->s_export_op = &ext2_export_ops;
1060 sb->s_xattr = ext2_xattr_handlers; 1067 sb->s_xattr = ext2_xattr_handlers;
1068
1069#ifdef CONFIG_QUOTA
1070 sb->dq_op = &dquot_operations;
1071 sb->s_qcop = &dquot_quotactl_ops;
1072#endif
1073
1061 root = ext2_iget(sb, EXT2_ROOT_INO); 1074 root = ext2_iget(sb, EXT2_ROOT_INO);
1062 if (IS_ERR(root)) { 1075 if (IS_ERR(root)) {
1063 ret = PTR_ERR(root); 1076 ret = PTR_ERR(root);
@@ -1079,7 +1092,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1079 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) 1092 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
1080 ext2_msg(sb, KERN_WARNING, 1093 ext2_msg(sb, KERN_WARNING,
1081 "warning: mounting ext3 filesystem as ext2"); 1094 "warning: mounting ext3 filesystem as ext2");
1082 ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY); 1095 if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
1096 sb->s_flags |= MS_RDONLY;
1097 ext2_write_super(sb);
1083 return 0; 1098 return 0;
1084 1099
1085cantfind_ext2: 1100cantfind_ext2:
@@ -1120,30 +1135,26 @@ static void ext2_clear_super_error(struct super_block *sb)
1120 * be remapped. Nothing we can do but to retry the 1135 * be remapped. Nothing we can do but to retry the
1121 * write and hope for the best. 1136 * write and hope for the best.
1122 */ 1137 */
1123 printk(KERN_ERR "EXT2-fs: %s previous I/O error to " 1138 ext2_msg(sb, KERN_ERR,
1124 "superblock detected", sb->s_id); 1139 "previous I/O error to superblock detected\n");
1125 clear_buffer_write_io_error(sbh); 1140 clear_buffer_write_io_error(sbh);
1126 set_buffer_uptodate(sbh); 1141 set_buffer_uptodate(sbh);
1127 } 1142 }
1128} 1143}
1129 1144
1130static void ext2_commit_super (struct super_block * sb, 1145static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
1131 struct ext2_super_block * es) 1146 int wait)
1132{
1133 ext2_clear_super_error(sb);
1134 es->s_wtime = cpu_to_le32(get_seconds());
1135 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
1136 sb->s_dirt = 0;
1137}
1138
1139static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
1140{ 1147{
1141 ext2_clear_super_error(sb); 1148 ext2_clear_super_error(sb);
1149 spin_lock(&EXT2_SB(sb)->s_lock);
1142 es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); 1150 es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
1143 es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); 1151 es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
1144 es->s_wtime = cpu_to_le32(get_seconds()); 1152 es->s_wtime = cpu_to_le32(get_seconds());
1153 /* unlock before we do IO */
1154 spin_unlock(&EXT2_SB(sb)->s_lock);
1145 mark_buffer_dirty(EXT2_SB(sb)->s_sbh); 1155 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
1146 sync_dirty_buffer(EXT2_SB(sb)->s_sbh); 1156 if (wait)
1157 sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
1147 sb->s_dirt = 0; 1158 sb->s_dirt = 0;
1148} 1159}
1149 1160
@@ -1157,43 +1168,18 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
1157 * may have been checked while mounted and e2fsck may have 1168 * may have been checked while mounted and e2fsck may have
1158 * set s_state to EXT2_VALID_FS after some corrections. 1169 * set s_state to EXT2_VALID_FS after some corrections.
1159 */ 1170 */
1160
1161static int ext2_sync_fs(struct super_block *sb, int wait) 1171static int ext2_sync_fs(struct super_block *sb, int wait)
1162{ 1172{
1173 struct ext2_sb_info *sbi = EXT2_SB(sb);
1163 struct ext2_super_block *es = EXT2_SB(sb)->s_es; 1174 struct ext2_super_block *es = EXT2_SB(sb)->s_es;
1164 struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
1165
1166 lock_kernel();
1167 if (buffer_write_io_error(sbh)) {
1168 /*
1169 * Oh, dear. A previous attempt to write the
1170 * superblock failed. This could happen because the
1171 * USB device was yanked out. Or it could happen to
1172 * be a transient write error and maybe the block will
1173 * be remapped. Nothing we can do but to retry the
1174 * write and hope for the best.
1175 */
1176 ext2_msg(sb, KERN_ERR,
1177 "previous I/O error to superblock detected\n");
1178 clear_buffer_write_io_error(sbh);
1179 set_buffer_uptodate(sbh);
1180 }
1181 1175
1176 spin_lock(&sbi->s_lock);
1182 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { 1177 if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
1183 ext2_debug("setting valid to 0\n"); 1178 ext2_debug("setting valid to 0\n");
1184 es->s_state &= cpu_to_le16(~EXT2_VALID_FS); 1179 es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
1185 es->s_free_blocks_count =
1186 cpu_to_le32(ext2_count_free_blocks(sb));
1187 es->s_free_inodes_count =
1188 cpu_to_le32(ext2_count_free_inodes(sb));
1189 es->s_mtime = cpu_to_le32(get_seconds());
1190 ext2_sync_super(sb, es);
1191 } else {
1192 ext2_commit_super(sb, es);
1193 } 1180 }
1194 sb->s_dirt = 0; 1181 spin_unlock(&sbi->s_lock);
1195 unlock_kernel(); 1182 ext2_sync_super(sb, es, wait);
1196
1197 return 0; 1183 return 0;
1198} 1184}
1199 1185
@@ -1215,7 +1201,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1215 unsigned long old_sb_flags; 1201 unsigned long old_sb_flags;
1216 int err; 1202 int err;
1217 1203
1218 lock_kernel(); 1204 spin_lock(&sbi->s_lock);
1219 1205
1220 /* Store the old options */ 1206 /* Store the old options */
1221 old_sb_flags = sb->s_flags; 1207 old_sb_flags = sb->s_flags;
@@ -1254,21 +1240,31 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1254 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; 1240 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1255 } 1241 }
1256 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1242 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1257 unlock_kernel(); 1243 spin_unlock(&sbi->s_lock);
1258 return 0; 1244 return 0;
1259 } 1245 }
1260 if (*flags & MS_RDONLY) { 1246 if (*flags & MS_RDONLY) {
1261 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || 1247 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
1262 !(sbi->s_mount_state & EXT2_VALID_FS)) { 1248 !(sbi->s_mount_state & EXT2_VALID_FS)) {
1263 unlock_kernel(); 1249 spin_unlock(&sbi->s_lock);
1264 return 0; 1250 return 0;
1265 } 1251 }
1252
1266 /* 1253 /*
1267 * OK, we are remounting a valid rw partition rdonly, so set 1254 * OK, we are remounting a valid rw partition rdonly, so set
1268 * the rdonly flag and then mark the partition as valid again. 1255 * the rdonly flag and then mark the partition as valid again.
1269 */ 1256 */
1270 es->s_state = cpu_to_le16(sbi->s_mount_state); 1257 es->s_state = cpu_to_le16(sbi->s_mount_state);
1271 es->s_mtime = cpu_to_le32(get_seconds()); 1258 es->s_mtime = cpu_to_le32(get_seconds());
1259 spin_unlock(&sbi->s_lock);
1260
1261 err = dquot_suspend(sb, -1);
1262 if (err < 0) {
1263 spin_lock(&sbi->s_lock);
1264 goto restore_opts;
1265 }
1266
1267 ext2_sync_super(sb, es, 1);
1272 } else { 1268 } else {
1273 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, 1269 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
1274 ~EXT2_FEATURE_RO_COMPAT_SUPP); 1270 ~EXT2_FEATURE_RO_COMPAT_SUPP);
@@ -1288,16 +1284,20 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1288 sbi->s_mount_state = le16_to_cpu(es->s_state); 1284 sbi->s_mount_state = le16_to_cpu(es->s_state);
1289 if (!ext2_setup_super (sb, es, 0)) 1285 if (!ext2_setup_super (sb, es, 0))
1290 sb->s_flags &= ~MS_RDONLY; 1286 sb->s_flags &= ~MS_RDONLY;
1287 spin_unlock(&sbi->s_lock);
1288
1289 ext2_write_super(sb);
1290
1291 dquot_resume(sb, -1);
1291 } 1292 }
1292 ext2_sync_super(sb, es); 1293
1293 unlock_kernel();
1294 return 0; 1294 return 0;
1295restore_opts: 1295restore_opts:
1296 sbi->s_mount_opt = old_opts.s_mount_opt; 1296 sbi->s_mount_opt = old_opts.s_mount_opt;
1297 sbi->s_resuid = old_opts.s_resuid; 1297 sbi->s_resuid = old_opts.s_resuid;
1298 sbi->s_resgid = old_opts.s_resgid; 1298 sbi->s_resgid = old_opts.s_resgid;
1299 sb->s_flags = old_sb_flags; 1299 sb->s_flags = old_sb_flags;
1300 unlock_kernel(); 1300 spin_unlock(&sbi->s_lock);
1301 return err; 1301 return err;
1302} 1302}
1303 1303
@@ -1308,6 +1308,8 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1308 struct ext2_super_block *es = sbi->s_es; 1308 struct ext2_super_block *es = sbi->s_es;
1309 u64 fsid; 1309 u64 fsid;
1310 1310
1311 spin_lock(&sbi->s_lock);
1312
1311 if (test_opt (sb, MINIX_DF)) 1313 if (test_opt (sb, MINIX_DF))
1312 sbi->s_overhead_last = 0; 1314 sbi->s_overhead_last = 0;
1313 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { 1315 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
@@ -1362,6 +1364,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1362 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 1364 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
1363 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; 1365 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1364 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; 1366 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1367 spin_unlock(&sbi->s_lock);
1365 return 0; 1368 return 0;
1366} 1369}
1367 1370
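
The super.c conversion swaps the BKL for sbi->s_lock, a spinlock guarding the in-memory superblock fields, with one invariant repeated throughout: update es under the lock, then drop it before any buffer IO, since sync_dirty_buffer() sleeps. Condensed from ext2_sync_super() above as a reminder of the shape (kernel context assumed; example_sync_super is an illustrative name):

static void example_sync_super(struct super_block *sb,
			       struct ext2_super_block *es, int wait)
{
	spin_lock(&EXT2_SB(sb)->s_lock);
	es->s_wtime = cpu_to_le32(get_seconds());
	/* ... other in-memory superblock updates ... */
	spin_unlock(&EXT2_SB(sb)->s_lock);	/* never do IO under s_lock */

	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
	if (wait)
		sync_dirty_buffer(EXT2_SB(sb)->s_sbh);	/* may sleep */
	sb->s_dirt = 0;
}
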
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 4e2426e22bbe..565cf817bbf1 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -32,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
32 .readlink = generic_readlink, 32 .readlink = generic_readlink,
33 .follow_link = page_follow_link_light, 33 .follow_link = page_follow_link_light,
34 .put_link = page_put_link, 34 .put_link = page_put_link,
35 .setattr = ext2_setattr,
35#ifdef CONFIG_EXT2_FS_XATTR 36#ifdef CONFIG_EXT2_FS_XATTR
36 .setxattr = generic_setxattr, 37 .setxattr = generic_setxattr,
37 .getxattr = generic_getxattr, 38 .getxattr = generic_getxattr,
@@ -43,6 +44,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
43const struct inode_operations ext2_fast_symlink_inode_operations = { 44const struct inode_operations ext2_fast_symlink_inode_operations = {
44 .readlink = generic_readlink, 45 .readlink = generic_readlink,
45 .follow_link = ext2_follow_link, 46 .follow_link = ext2_follow_link,
47 .setattr = ext2_setattr,
46#ifdef CONFIG_EXT2_FS_XATTR 48#ifdef CONFIG_EXT2_FS_XATTR
47 .setxattr = generic_setxattr, 49 .setxattr = generic_setxattr,
48 .getxattr = generic_getxattr, 50 .getxattr = generic_getxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index e44dc92609be..7c3915780b19 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
101 101
102static struct mb_cache *ext2_xattr_cache; 102static struct mb_cache *ext2_xattr_cache;
103 103
104static struct xattr_handler *ext2_xattr_handler_map[] = { 104static const struct xattr_handler *ext2_xattr_handler_map[] = {
105 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, 105 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
106#ifdef CONFIG_EXT2_FS_POSIX_ACL 106#ifdef CONFIG_EXT2_FS_POSIX_ACL
107 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, 107 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler,
@@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = {
113#endif 113#endif
114}; 114};
115 115
116struct xattr_handler *ext2_xattr_handlers[] = { 116const struct xattr_handler *ext2_xattr_handlers[] = {
117 &ext2_xattr_user_handler, 117 &ext2_xattr_user_handler,
118 &ext2_xattr_trusted_handler, 118 &ext2_xattr_trusted_handler,
119#ifdef CONFIG_EXT2_FS_POSIX_ACL 119#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = {
126 NULL 126 NULL
127}; 127};
128 128
129static inline struct xattr_handler * 129static inline const struct xattr_handler *
130ext2_xattr_handler(int name_index) 130ext2_xattr_handler(int name_index)
131{ 131{
132 struct xattr_handler *handler = NULL; 132 const struct xattr_handler *handler = NULL;
133 133
134 if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) 134 if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map))
135 handler = ext2_xattr_handler_map[name_index]; 135 handler = ext2_xattr_handler_map[name_index];
@@ -298,7 +298,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
298 /* list the attribute names */ 298 /* list the attribute names */
299 for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); 299 for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
300 entry = EXT2_XATTR_NEXT(entry)) { 300 entry = EXT2_XATTR_NEXT(entry)) {
301 struct xattr_handler *handler = 301 const struct xattr_handler *handler =
302 ext2_xattr_handler(entry->e_name_index); 302 ext2_xattr_handler(entry->e_name_index);
303 303
304 if (handler) { 304 if (handler) {
@@ -345,7 +345,9 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
345 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) 345 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
346 return; 346 return;
347 347
348 spin_lock(&EXT2_SB(sb)->s_lock);
348 EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); 349 EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
350 spin_unlock(&EXT2_SB(sb)->s_lock);
349 sb->s_dirt = 1; 351 sb->s_dirt = 1;
350 mark_buffer_dirty(EXT2_SB(sb)->s_sbh); 352 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
351} 353}
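The theme of this file (and the matching ext3/ext4 hunks further down) is constifying the xattr handler tables so both the array and the handler objects can live in read-only data; the last hunk additionally serializes the compat-feature update under s_lock. A self-contained sketch of the constified table-dispatch pattern, with made-up handlers and prefixes:

#include <stdio.h>
#include <string.h>

/* Minimal model of a const xattr handler table: handlers are
 * selected by name prefix and never modified at run time. */
struct xattr_handler {
	const char *prefix;
	int (*get)(const char *name);
};

static int user_get(const char *name)  { printf("user: %s\n", name);    return 0; }
static int trust_get(const char *name) { printf("trusted: %s\n", name); return 0; }

static const struct xattr_handler user_handler    = { "user.",    user_get  };
static const struct xattr_handler trusted_handler = { "trusted.", trust_get };

/* NULL-terminated, mirroring the kernel table layout */
static const struct xattr_handler *handlers[] = {
	&user_handler,
	&trusted_handler,
	NULL,
};

static const struct xattr_handler *find_handler(const char *name)
{
	for (const struct xattr_handler **h = handlers; *h; h++)
		if (!strncmp(name, (*h)->prefix, strlen((*h)->prefix)))
			return *h;
	return NULL;
}

int main(void)
{
	const struct xattr_handler *h = find_handler("user.comment");

	return h ? h->get("user.comment") : 1;
}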
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index bf8175b2ced9..a1a1c2184616 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -55,11 +55,11 @@ struct ext2_xattr_entry {
55 55
56# ifdef CONFIG_EXT2_FS_XATTR 56# ifdef CONFIG_EXT2_FS_XATTR
57 57
58extern struct xattr_handler ext2_xattr_user_handler; 58extern const struct xattr_handler ext2_xattr_user_handler;
59extern struct xattr_handler ext2_xattr_trusted_handler; 59extern const struct xattr_handler ext2_xattr_trusted_handler;
60extern struct xattr_handler ext2_xattr_acl_access_handler; 60extern const struct xattr_handler ext2_xattr_acl_access_handler;
61extern struct xattr_handler ext2_xattr_acl_default_handler; 61extern const struct xattr_handler ext2_xattr_acl_default_handler;
62extern struct xattr_handler ext2_xattr_security_handler; 62extern const struct xattr_handler ext2_xattr_security_handler;
63 63
64extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); 64extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
65 65
@@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *);
72extern int init_ext2_xattr(void); 72extern int init_ext2_xattr(void);
73extern void exit_ext2_xattr(void); 73extern void exit_ext2_xattr(void);
74 74
75extern struct xattr_handler *ext2_xattr_handlers[]; 75extern const struct xattr_handler *ext2_xattr_handlers[];
76 76
77# else /* CONFIG_EXT2_FS_XATTR */ 77# else /* CONFIG_EXT2_FS_XATTR */
78 78
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c8155845ac05..3004e15d5da5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext2_fs.h> 10#include <linux/ext2_fs.h>
@@ -66,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir)
66 return err; 67 return err;
67} 68}
68 69
69struct xattr_handler ext2_xattr_security_handler = { 70const struct xattr_handler ext2_xattr_security_handler = {
70 .prefix = XATTR_SECURITY_PREFIX, 71 .prefix = XATTR_SECURITY_PREFIX,
71 .list = ext2_xattr_security_list, 72 .list = ext2_xattr_security_list,
72 .get = ext2_xattr_security_get, 73 .get = ext2_xattr_security_get,
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2a26d71f4771..667e46a8d62d 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
50 value, size, flags); 50 value, size, flags);
51} 51}
52 52
53struct xattr_handler ext2_xattr_trusted_handler = { 53const struct xattr_handler ext2_xattr_trusted_handler = {
54 .prefix = XATTR_TRUSTED_PREFIX, 54 .prefix = XATTR_TRUSTED_PREFIX,
55 .list = ext2_xattr_trusted_list, 55 .list = ext2_xattr_trusted_list,
56 .get = ext2_xattr_trusted_get, 56 .get = ext2_xattr_trusted_get,
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 3f6caf3684b4..099d20f47163 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext2_xattr_user_handler = { 57const struct xattr_handler ext2_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext2_xattr_user_list, 59 .list = ext2_xattr_user_list,
60 .get = ext2_xattr_user_get, 60 .get = ext2_xattr_user_get,
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 82ba34158661..01552abbca3c 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -456,7 +456,7 @@ release_and_out:
456 return error; 456 return error;
457} 457}
458 458
459struct xattr_handler ext3_xattr_acl_access_handler = { 459const struct xattr_handler ext3_xattr_acl_access_handler = {
460 .prefix = POSIX_ACL_XATTR_ACCESS, 460 .prefix = POSIX_ACL_XATTR_ACCESS,
461 .flags = ACL_TYPE_ACCESS, 461 .flags = ACL_TYPE_ACCESS,
462 .list = ext3_xattr_list_acl_access, 462 .list = ext3_xattr_list_acl_access,
@@ -464,7 +464,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = {
464 .set = ext3_xattr_set_acl, 464 .set = ext3_xattr_set_acl,
465}; 465};
466 466
467struct xattr_handler ext3_xattr_acl_default_handler = { 467const struct xattr_handler ext3_xattr_acl_default_handler = {
468 .prefix = POSIX_ACL_XATTR_DEFAULT, 468 .prefix = POSIX_ACL_XATTR_DEFAULT,
469 .flags = ACL_TYPE_DEFAULT, 469 .flags = ACL_TYPE_DEFAULT,
470 .list = ext3_xattr_list_acl_default, 470 .list = ext3_xattr_list_acl_default,
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 161da2d3f890..4a32511f4ded 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/slab.h>
17#include <linux/jbd.h> 18#include <linux/jbd.h>
18#include <linux/ext3_fs.h> 19#include <linux/ext3_fs.h>
19#include <linux/ext3_jbd.h> 20#include <linux/ext3_jbd.h>
@@ -1583,6 +1584,12 @@ retry_alloc:
1583 goto io_error; 1584 goto io_error;
1584 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); 1585 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1585 /* 1586 /*
1587 * skip this group (and avoid loading bitmap) if there
1588 * are no free blocks
1589 */
1590 if (!free_blocks)
1591 continue;
1592 /*
1586 * skip this group if the number of 1593 * skip this group if the number of
1587 * free blocks is less than half of the reservation 1594 * free blocks is less than half of the reservation
1588 * window size. 1595 * window size.
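The added check consults the cheap per-group free-block counter before paying for bitmap I/O at all. Roughly, in a toy allocator (all names invented for illustration):

#include <stdio.h>

/* Toy allocator scan modeled on the hunk above: look at the per-group
 * free counter first, and only "load the bitmap" (the expensive step)
 * when the group can possibly satisfy the request. */
struct group {
	int free_blocks;
};

static int load_bitmap_and_alloc(int group_no)
{
	printf("scanning bitmap of group %d\n", group_no);
	return group_no;	/* pretend success */
}

static int find_block(struct group *groups, int ngroups)
{
	for (int i = 0; i < ngroups; i++) {
		if (!groups[i].free_blocks)
			continue;	/* skip: avoids bitmap I/O */
		return load_bitmap_and_alloc(i);
	}
	return -1;
}

int main(void)
{
	struct group g[] = { {0}, {0}, {12} };

	printf("allocated from group %d\n", find_block(g, 3));
	return 0;
}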
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90c796a..e2e72c367cf6 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root)
297 kfree (old); 297 kfree (old);
298 } 298 }
299 if (!parent) 299 if (!parent)
300 root->rb_node = NULL; 300 *root = RB_ROOT;
301 else if (parent->rb_left == n) 301 else if (parent->rb_left == n)
302 parent->rb_left = NULL; 302 parent->rb_left = NULL;
303 else if (parent->rb_right == n) 303 else if (parent->rb_right == n)
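The one-line fix replaces a by-hand "root->rb_node = NULL" with a whole-struct assignment from RB_ROOT, so the reset stays correct even if struct rb_root ever grows more members and matches how the root is initialized everywhere else. Sketch:

#include <stdio.h>
#include <stddef.h>

/* Simplified rb_root shapes; the kernel's RB_ROOT is likewise a
 * compound-literal initializer for the whole struct. */
struct rb_node { struct rb_node *left, *right; };
struct rb_root { struct rb_node *rb_node; };
#define RB_ROOT ((struct rb_root) { NULL, })

int main(void)
{
	struct rb_node node = { NULL, NULL };
	struct rb_root root = { &node };

	root = RB_ROOT;		/* was: root.rb_node = NULL; */
	printf("empty: %d\n", root.rb_node == NULL);
	return 0;
}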
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 8209f266e9ad..d7e9f74dc3a6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,12 +43,12 @@
43 * inode to disk. 43 * inode to disk.
44 */ 44 */
45 45
46int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) 46int ext3_sync_file(struct file *file, int datasync)
47{ 47{
48 struct inode *inode = dentry->d_inode; 48 struct inode *inode = file->f_mapping->host;
49 struct ext3_inode_info *ei = EXT3_I(inode); 49 struct ext3_inode_info *ei = EXT3_I(inode);
50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
51 int ret = 0; 51 int ret, needs_barrier = 0;
52 tid_t commit_tid; 52 tid_t commit_tid;
53 53
54 if (inode->i_sb->s_flags & MS_RDONLY) 54 if (inode->i_sb->s_flags & MS_RDONLY)
@@ -70,28 +70,27 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
70 * (they were dirtied by commit). But that's OK - the blocks are 70 * (they were dirtied by commit). But that's OK - the blocks are
71 * safe in-journal, which is all fsync() needs to ensure. 71 * safe in-journal, which is all fsync() needs to ensure.
72 */ 72 */
73 if (ext3_should_journal_data(inode)) { 73 if (ext3_should_journal_data(inode))
74 ret = ext3_force_commit(inode->i_sb); 74 return ext3_force_commit(inode->i_sb);
75 goto out;
76 }
77 75
78 if (datasync) 76 if (datasync)
79 commit_tid = atomic_read(&ei->i_datasync_tid); 77 commit_tid = atomic_read(&ei->i_datasync_tid);
80 else 78 else
81 commit_tid = atomic_read(&ei->i_sync_tid); 79 commit_tid = atomic_read(&ei->i_sync_tid);
82 80
83 if (log_start_commit(journal, commit_tid)) { 81 if (test_opt(inode->i_sb, BARRIER) &&
84 log_wait_commit(journal, commit_tid); 82 !journal_trans_will_send_data_barrier(journal, commit_tid))
85 goto out; 83 needs_barrier = 1;
86 } 84 log_start_commit(journal, commit_tid);
85 ret = log_wait_commit(journal, commit_tid);
87 86
88 /* 87 /*
89 * In case we didn't commit a transaction, we have to flush 88 * In case we didn't commit a transaction, we have to flush
90 * disk caches manually so that data really is on persistent 89 * disk caches manually so that data really is on persistent
91 * storage 90 * storage
92 */ 91 */
93 if (test_opt(inode->i_sb, BARRIER)) 92 if (needs_barrier)
94 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
95out: 94 BLKDEV_IFL_WAIT);
96 return ret; 95 return ret;
97} 96}
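The rewritten ext3_sync_file decides up front whether the journal commit it waits on will already carry a write barrier, and only falls back to an explicit block-device flush when it will not. A compilable model of that control flow — the helpers below are stand-ins, not the jbd API:

#include <stdbool.h>
#include <stdio.h>

static bool commit_will_send_barrier(int tid) { return tid % 2 == 0; }
static void start_commit(int tid)  { printf("commit %d started\n", tid); }
static void wait_commit(int tid)   { printf("commit %d done\n", tid); }
static void issue_flush(void)      { printf("explicit disk flush\n"); }

/* Mirror of the hunk above: compute needs_barrier before kicking off
 * the commit, then flush manually only if the commit did not. */
static void toy_fsync(int commit_tid, bool barriers_enabled)
{
	bool needs_flush = barriers_enabled &&
			   !commit_will_send_barrier(commit_tid);

	start_commit(commit_tid);
	wait_commit(commit_tid);
	if (needs_flush)
		issue_flush();
}

int main(void)
{
	toy_fsync(7, true);	/* commit lacks barrier -> flush */
	toy_fsync(8, true);	/* barrier piggybacks on commit */
	return 0;
}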
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index ef9008b885b5..498021eb88fb 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -538,16 +538,13 @@ got:
538 if (S_ISDIR(mode)) 538 if (S_ISDIR(mode))
539 percpu_counter_inc(&sbi->s_dirs_counter); 539 percpu_counter_inc(&sbi->s_dirs_counter);
540 540
541 inode->i_uid = current_fsuid(); 541
542 if (test_opt (sb, GRPID)) 542 if (test_opt(sb, GRPID)) {
543 inode->i_gid = dir->i_gid; 543 inode->i_mode = mode;
544 else if (dir->i_mode & S_ISGID) { 544 inode->i_uid = current_fsuid();
545 inode->i_gid = dir->i_gid; 545 inode->i_gid = dir->i_gid;
546 if (S_ISDIR(mode))
547 mode |= S_ISGID;
548 } else 546 } else
549 inode->i_gid = current_fsgid(); 547 inode_init_owner(inode, dir, mode);
550 inode->i_mode = mode;
551 548
552 inode->i_ino = ino; 549 inode->i_ino = ino;
553 /* This is the optimal IO size (for stat), not the fs block size */ 550 /* This is the optimal IO size (for stat), not the fs block size */
@@ -582,7 +579,9 @@ got:
582 inode->i_generation = sbi->s_next_generation++; 579 inode->i_generation = sbi->s_next_generation++;
583 spin_unlock(&sbi->s_next_gen_lock); 580 spin_unlock(&sbi->s_next_gen_lock);
584 581
585 ei->i_state = EXT3_STATE_NEW; 582 ei->i_state_flags = 0;
583 ext3_set_inode_state(inode, EXT3_STATE_NEW);
584
586 ei->i_extra_isize = 585 ei->i_extra_isize =
587 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 586 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
588 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 587 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
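The open-coded uid/gid/setgid logic collapses into inode_init_owner(), leaving only the GRPID mount-option special case in the filesystem. What inode_init_owner centralizes, rendered as standalone C (uid 1000 stands in for current_fsuid/current_fsgid):

#include <stdio.h>
#include <sys/stat.h>

struct toy_inode { unsigned uid, gid; mode_t mode; };

/* Pick the new inode's gid from the parent when the parent directory
 * is setgid, and propagate S_ISGID onto new directories. */
static void init_owner(struct toy_inode *inode,
		       const struct toy_inode *dir, mode_t mode)
{
	inode->uid = 1000;		/* current_fsuid() stand-in */
	if (dir->mode & S_ISGID) {
		inode->gid = dir->gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else {
		inode->gid = 1000;	/* current_fsgid() stand-in */
	}
	inode->mode = mode;
}

int main(void)
{
	struct toy_inode dir = { 0, 50, S_IFDIR | S_ISGID | 0775 };
	struct toy_inode child;

	init_owner(&child, &dir, S_IFDIR | 0755);
	printf("child gid=%u setgid=%d\n", child.gid,
	       !!(child.mode & S_ISGID));
	return 0;
}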
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7f920b7263a4..735f0190ec2a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2811,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2813 2813
2814 ei->i_state = 0; 2814 ei->i_state_flags = 0;
2815 ei->i_dir_start_lookup = 0; 2815 ei->i_dir_start_lookup = 0;
2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2817 /* We now have enough fields to check if the inode was active or not. 2817 /* We now have enough fields to check if the inode was active or not.
@@ -3151,7 +3151,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3151 if (error) 3151 if (error)
3152 return error; 3152 return error;
3153 3153
3154 if (ia_valid & ATTR_SIZE) 3154 if (is_quota_modification(inode, attr))
3155 dquot_initialize(inode); 3155 dquot_initialize(inode);
3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1bee604cc6cd..6c953bb255e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb)
410 struct ext3_super_block *es = sbi->s_es; 410 struct ext3_super_block *es = sbi->s_es;
411 int i, err; 411 int i, err;
412 412
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
413 lock_kernel(); 415 lock_kernel();
414 416
415 ext3_xattr_put_super(sb); 417 ext3_xattr_put_super(sb);
@@ -653,8 +655,12 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
653 seq_printf(seq, ",commit=%u", 655 seq_printf(seq, ",commit=%u",
654 (unsigned) (sbi->s_commit_interval / HZ)); 656 (unsigned) (sbi->s_commit_interval / HZ));
655 } 657 }
656 if (test_opt(sb, BARRIER)) 658
657 seq_puts(seq, ",barrier=1"); 659 /*
660 * Always display barrier state so it's clear what the status is.
661 */
662 seq_puts(seq, ",barrier=");
663 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
658 if (test_opt(sb, NOBH)) 664 if (test_opt(sb, NOBH))
659 seq_puts(seq, ",nobh"); 665 seq_puts(seq, ",nobh");
660 666
@@ -744,7 +750,7 @@ static int ext3_release_dquot(struct dquot *dquot);
744static int ext3_mark_dquot_dirty(struct dquot *dquot); 750static int ext3_mark_dquot_dirty(struct dquot *dquot);
745static int ext3_write_info(struct super_block *sb, int type); 751static int ext3_write_info(struct super_block *sb, int type);
746static int ext3_quota_on(struct super_block *sb, int type, int format_id, 752static int ext3_quota_on(struct super_block *sb, int type, int format_id,
747 char *path, int remount); 753 char *path);
748static int ext3_quota_on_mount(struct super_block *sb, int type); 754static int ext3_quota_on_mount(struct super_block *sb, int type);
749static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 755static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
750 size_t len, loff_t off); 756 size_t len, loff_t off);
@@ -763,12 +769,12 @@ static const struct dquot_operations ext3_quota_operations = {
763 769
764static const struct quotactl_ops ext3_qctl_operations = { 770static const struct quotactl_ops ext3_qctl_operations = {
765 .quota_on = ext3_quota_on, 771 .quota_on = ext3_quota_on,
766 .quota_off = vfs_quota_off, 772 .quota_off = dquot_quota_off,
767 .quota_sync = vfs_quota_sync, 773 .quota_sync = dquot_quota_sync,
768 .get_info = vfs_get_dqinfo, 774 .get_info = dquot_get_dqinfo,
769 .set_info = vfs_set_dqinfo, 775 .set_info = dquot_set_dqinfo,
770 .get_dqblk = vfs_get_dqblk, 776 .get_dqblk = dquot_get_dqblk,
771 .set_dqblk = vfs_set_dqblk 777 .set_dqblk = dquot_set_dqblk
772}; 778};
773#endif 779#endif
774 780
@@ -810,8 +816,8 @@ enum {
810 Opt_data_err_abort, Opt_data_err_ignore, 816 Opt_data_err_abort, Opt_data_err_ignore,
811 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 817 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
812 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 818 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
813 Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, 819 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
814 Opt_usrquota, Opt_grpquota 820 Opt_resize, Opt_usrquota, Opt_grpquota
815}; 821};
816 822
817static const match_table_t tokens = { 823static const match_table_t tokens = {
@@ -865,6 +871,8 @@ static const match_table_t tokens = {
865 {Opt_quota, "quota"}, 871 {Opt_quota, "quota"},
866 {Opt_usrquota, "usrquota"}, 872 {Opt_usrquota, "usrquota"},
867 {Opt_barrier, "barrier=%u"}, 873 {Opt_barrier, "barrier=%u"},
874 {Opt_barrier, "barrier"},
875 {Opt_nobarrier, "nobarrier"},
868 {Opt_resize, "resize"}, 876 {Opt_resize, "resize"},
869 {Opt_err, NULL}, 877 {Opt_err, NULL},
870}; 878};
@@ -967,7 +975,11 @@ static int parse_options (char *options, struct super_block *sb,
967 int token; 975 int token;
968 if (!*p) 976 if (!*p)
969 continue; 977 continue;
970 978 /*
979 * Initialize args struct so we know whether arg was
980 * found; some options take optional arguments.
981 */
982 args[0].to = args[0].from = 0;
971 token = match_token(p, tokens, args); 983 token = match_token(p, tokens, args);
972 switch (token) { 984 switch (token) {
973 case Opt_bsd_df: 985 case Opt_bsd_df:
@@ -1215,9 +1227,15 @@ set_qf_format:
1215 case Opt_abort: 1227 case Opt_abort:
1216 set_opt(sbi->s_mount_opt, ABORT); 1228 set_opt(sbi->s_mount_opt, ABORT);
1217 break; 1229 break;
1230 case Opt_nobarrier:
1231 clear_opt(sbi->s_mount_opt, BARRIER);
1232 break;
1218 case Opt_barrier: 1233 case Opt_barrier:
1219 if (match_int(&args[0], &option)) 1234 if (args[0].from) {
1220 return 0; 1235 if (match_int(&args[0], &option))
1236 return 0;
1237 } else
1238 option = 1; /* No argument, default to 1 */
1221 if (option) 1239 if (option)
1222 set_opt(sbi->s_mount_opt, BARRIER); 1240 set_opt(sbi->s_mount_opt, BARRIER);
1223 else 1241 else
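Supporting a bare "barrier" token alongside "barrier=%u" requires knowing whether match_token() actually captured an argument, which is why args[0] is zeroed before each parse above. The same optional-argument idea in a tiny standalone parser (string handling deliberately simplified; this is not the kernel match_table API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct arg_span { const char *from, *to; };

/* Returns 1 if opt is a barrier option; fills *arg only when an
 * "=<value>" argument was present. */
static int parse_barrier(const char *opt, struct arg_span *arg)
{
	arg->from = arg->to = NULL;	/* like args[0].to = args[0].from = 0 */
	if (!strcmp(opt, "barrier"))
		return 1;		/* bare token, no argument */
	if (!strncmp(opt, "barrier=", 8)) {
		arg->from = opt + 8;
		arg->to = opt + strlen(opt);
		return 1;
	}
	return 0;
}

int main(void)
{
	const char *opts[] = { "barrier", "barrier=0", "barrier=1" };

	for (int i = 0; i < 3; i++) {
		struct arg_span arg;

		if (!parse_barrier(opts[i], &arg))
			continue;
		int val = arg.from ? atoi(arg.from) : 1; /* no arg: default 1 */
		printf("%-10s -> barrier=%d\n", opts[i], val);
	}
	return 0;
}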
@@ -1511,7 +1529,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1511 /* Turn quotas off */ 1529 /* Turn quotas off */
1512 for (i = 0; i < MAXQUOTAS; i++) { 1530 for (i = 0; i < MAXQUOTAS; i++) {
1513 if (sb_dqopt(sb)->files[i]) 1531 if (sb_dqopt(sb)->files[i])
1514 vfs_quota_off(sb, i, 0); 1532 dquot_quota_off(sb, i);
1515 } 1533 }
1516#endif 1534#endif
1517 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 1535 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -1890,21 +1908,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1890 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 1908 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1891 spin_lock_init(&sbi->s_next_gen_lock); 1909 spin_lock_init(&sbi->s_next_gen_lock);
1892 1910
1893 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1894 ext3_count_free_blocks(sb));
1895 if (!err) {
1896 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1897 ext3_count_free_inodes(sb));
1898 }
1899 if (!err) {
1900 err = percpu_counter_init(&sbi->s_dirs_counter,
1901 ext3_count_dirs(sb));
1902 }
1903 if (err) {
1904 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1905 goto failed_mount3;
1906 }
1907
 1908	/* per filesystem reservation list head & lock */ 1911	/* per filesystem reservation list head & lock */
1909 spin_lock_init(&sbi->s_rsv_window_lock); 1912 spin_lock_init(&sbi->s_rsv_window_lock);
1910 sbi->s_rsv_window_root = RB_ROOT; 1913 sbi->s_rsv_window_root = RB_ROOT;
@@ -1945,15 +1948,29 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1945 if (!test_opt(sb, NOLOAD) && 1948 if (!test_opt(sb, NOLOAD) &&
1946 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { 1949 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
1947 if (ext3_load_journal(sb, es, journal_devnum)) 1950 if (ext3_load_journal(sb, es, journal_devnum))
1948 goto failed_mount3; 1951 goto failed_mount2;
1949 } else if (journal_inum) { 1952 } else if (journal_inum) {
1950 if (ext3_create_journal(sb, es, journal_inum)) 1953 if (ext3_create_journal(sb, es, journal_inum))
1951 goto failed_mount3; 1954 goto failed_mount2;
1952 } else { 1955 } else {
1953 if (!silent) 1956 if (!silent)
1954 ext3_msg(sb, KERN_ERR, 1957 ext3_msg(sb, KERN_ERR,
1955 "error: no journal found. " 1958 "error: no journal found. "
1956 "mounting ext3 over ext2?"); 1959 "mounting ext3 over ext2?");
1960 goto failed_mount2;
1961 }
1962 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1963 ext3_count_free_blocks(sb));
1964 if (!err) {
1965 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1966 ext3_count_free_inodes(sb));
1967 }
1968 if (!err) {
1969 err = percpu_counter_init(&sbi->s_dirs_counter,
1970 ext3_count_dirs(sb));
1971 }
1972 if (err) {
1973 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1957 goto failed_mount3; 1974 goto failed_mount3;
1958 } 1975 }
1959 1976
@@ -1978,7 +1995,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1978 ext3_msg(sb, KERN_ERR, 1995 ext3_msg(sb, KERN_ERR,
1979 "error: journal does not support " 1996 "error: journal does not support "
1980 "requested data journaling mode"); 1997 "requested data journaling mode");
1981 goto failed_mount4; 1998 goto failed_mount3;
1982 } 1999 }
1983 default: 2000 default:
1984 break; 2001 break;
@@ -2001,19 +2018,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2001 if (IS_ERR(root)) { 2018 if (IS_ERR(root)) {
2002 ext3_msg(sb, KERN_ERR, "error: get root inode failed"); 2019 ext3_msg(sb, KERN_ERR, "error: get root inode failed");
2003 ret = PTR_ERR(root); 2020 ret = PTR_ERR(root);
2004 goto failed_mount4; 2021 goto failed_mount3;
2005 } 2022 }
2006 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 2023 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
2007 iput(root); 2024 iput(root);
2008 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); 2025 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
2009 goto failed_mount4; 2026 goto failed_mount3;
2010 } 2027 }
2011 sb->s_root = d_alloc_root(root); 2028 sb->s_root = d_alloc_root(root);
2012 if (!sb->s_root) { 2029 if (!sb->s_root) {
2013 ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); 2030 ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
2014 iput(root); 2031 iput(root);
2015 ret = -ENOMEM; 2032 ret = -ENOMEM;
2016 goto failed_mount4; 2033 goto failed_mount3;
2017 } 2034 }
2018 2035
2019 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); 2036 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -2039,12 +2056,11 @@ cantfind_ext3:
2039 sb->s_id); 2056 sb->s_id);
2040 goto failed_mount; 2057 goto failed_mount;
2041 2058
2042failed_mount4:
2043 journal_destroy(sbi->s_journal);
2044failed_mount3: 2059failed_mount3:
2045 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2060 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2046 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2061 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2047 percpu_counter_destroy(&sbi->s_dirs_counter); 2062 percpu_counter_destroy(&sbi->s_dirs_counter);
2063 journal_destroy(sbi->s_journal);
2048failed_mount2: 2064failed_mount2:
2049 for (i = 0; i < db_count; i++) 2065 for (i = 0; i < db_count; i++)
2050 brelse(sbi->s_group_desc[i]); 2066 brelse(sbi->s_group_desc[i]);
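The relabeling here goes hand in hand with moving the percpu counter setup after journal load: each failed_mountN label must unwind exactly the resources acquired before the jump, in reverse order of acquisition. A generic sketch of the convention, using malloc/free as stand-in resources:

#include <stdlib.h>

static int toy_mount(void)
{
	char *groups, *journal, *counters;

	groups = malloc(16);
	if (!groups)
		goto fail;
	journal = malloc(16);
	if (!journal)
		goto fail_groups;
	counters = malloc(16);	/* init moved after the journal, as above */
	if (!counters)
		goto fail_journal;

	/* "mounted"; toy teardown so the example is leak-free */
	free(counters);
	free(journal);
	free(groups);
	return 0;

fail_journal:
	free(journal);
fail_groups:
	free(groups);
fail:
	return -1;
}

int main(void)
{
	return toy_mount() ? 1 : 0;
}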
@@ -2278,6 +2294,9 @@ static int ext3_load_journal(struct super_block *sb,
2278 return -EINVAL; 2294 return -EINVAL;
2279 } 2295 }
2280 2296
2297 if (!(journal->j_flags & JFS_BARRIER))
2298 printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
2299
2281 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2300 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2282 err = journal_update_format(journal); 2301 err = journal_update_format(journal);
2283 if (err) { 2302 if (err) {
@@ -2534,6 +2553,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2534 ext3_fsblk_t n_blocks_count = 0; 2553 ext3_fsblk_t n_blocks_count = 0;
2535 unsigned long old_sb_flags; 2554 unsigned long old_sb_flags;
2536 struct ext3_mount_options old_opts; 2555 struct ext3_mount_options old_opts;
2556 int enable_quota = 0;
2537 int err; 2557 int err;
2538#ifdef CONFIG_QUOTA 2558#ifdef CONFIG_QUOTA
2539 int i; 2559 int i;
@@ -2580,6 +2600,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2580 } 2600 }
2581 2601
2582 if (*flags & MS_RDONLY) { 2602 if (*flags & MS_RDONLY) {
2603 err = dquot_suspend(sb, -1);
2604 if (err < 0)
2605 goto restore_opts;
2606
2583 /* 2607 /*
2584 * First of all, the unconditional stuff we have to do 2608 * First of all, the unconditional stuff we have to do
2585 * to disable replay of the journal when we next remount 2609 * to disable replay of the journal when we next remount
@@ -2634,6 +2658,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2634 goto restore_opts; 2658 goto restore_opts;
2635 if (!ext3_setup_super (sb, es, 0)) 2659 if (!ext3_setup_super (sb, es, 0))
2636 sb->s_flags &= ~MS_RDONLY; 2660 sb->s_flags &= ~MS_RDONLY;
2661 enable_quota = 1;
2637 } 2662 }
2638 } 2663 }
2639#ifdef CONFIG_QUOTA 2664#ifdef CONFIG_QUOTA
@@ -2645,6 +2670,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2645#endif 2670#endif
2646 unlock_super(sb); 2671 unlock_super(sb);
2647 unlock_kernel(); 2672 unlock_kernel();
2673
2674 if (enable_quota)
2675 dquot_resume(sb, -1);
2648 return 0; 2676 return 0;
2649restore_opts: 2677restore_opts:
2650 sb->s_flags = old_sb_flags; 2678 sb->s_flags = old_sb_flags;
@@ -2834,24 +2862,21 @@ static int ext3_write_info(struct super_block *sb, int type)
2834 */ 2862 */
2835static int ext3_quota_on_mount(struct super_block *sb, int type) 2863static int ext3_quota_on_mount(struct super_block *sb, int type)
2836{ 2864{
2837 return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], 2865 return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
2838 EXT3_SB(sb)->s_jquota_fmt, type); 2866 EXT3_SB(sb)->s_jquota_fmt, type);
2839} 2867}
2840 2868
2841/* 2869/*
2842 * Standard function to be called on quota_on 2870 * Standard function to be called on quota_on
2843 */ 2871 */
2844static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2872static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2845 char *name, int remount) 2873 char *name)
2846{ 2874{
2847 int err; 2875 int err;
2848 struct path path; 2876 struct path path;
2849 2877
2850 if (!test_opt(sb, QUOTA)) 2878 if (!test_opt(sb, QUOTA))
2851 return -EINVAL; 2879 return -EINVAL;
2852 /* When remounting, no checks are needed and in fact, name is NULL */
2853 if (remount)
2854 return vfs_quota_on(sb, type, format_id, name, remount);
2855 2880
2856 err = kern_path(name, LOOKUP_FOLLOW, &path); 2881 err = kern_path(name, LOOKUP_FOLLOW, &path);
2857 if (err) 2882 if (err)
@@ -2889,7 +2914,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2889 } 2914 }
2890 } 2915 }
2891 2916
2892 err = vfs_quota_on_path(sb, type, format_id, &path); 2917 err = dquot_quota_on_path(sb, type, format_id, &path);
2893 path_put(&path); 2918 path_put(&path);
2894 return err; 2919 return err;
2895} 2920}
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index ff7b4ccd8983..7c4898207776 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext3_setattr,
37#ifdef CONFIG_EXT3_FS_XATTR 38#ifdef CONFIG_EXT3_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
45const struct inode_operations ext3_fast_symlink_inode_operations = { 46const struct inode_operations ext3_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext3_follow_link, 48 .follow_link = ext3_follow_link,
49 .setattr = ext3_setattr,
48#ifdef CONFIG_EXT3_FS_XATTR 50#ifdef CONFIG_EXT3_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 534a94c3a933..71fb8d65e54c 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer,
104 104
105static struct mb_cache *ext3_xattr_cache; 105static struct mb_cache *ext3_xattr_cache;
106 106
107static struct xattr_handler *ext3_xattr_handler_map[] = { 107static const struct xattr_handler *ext3_xattr_handler_map[] = {
108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, 108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
109#ifdef CONFIG_EXT3_FS_POSIX_ACL 109#ifdef CONFIG_EXT3_FS_POSIX_ACL
110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, 110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
@@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = {
116#endif 116#endif
117}; 117};
118 118
119struct xattr_handler *ext3_xattr_handlers[] = { 119const struct xattr_handler *ext3_xattr_handlers[] = {
120 &ext3_xattr_user_handler, 120 &ext3_xattr_user_handler,
121 &ext3_xattr_trusted_handler, 121 &ext3_xattr_trusted_handler,
122#ifdef CONFIG_EXT3_FS_POSIX_ACL 122#ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = {
129 NULL 129 NULL
130}; 130};
131 131
132static inline struct xattr_handler * 132static inline const struct xattr_handler *
133ext3_xattr_handler(int name_index) 133ext3_xattr_handler(int name_index)
134{ 134{
135 struct xattr_handler *handler = NULL; 135 const struct xattr_handler *handler = NULL;
136 136
137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) 137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
138 handler = ext3_xattr_handler_map[name_index]; 138 handler = ext3_xattr_handler_map[name_index];
@@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
338 size_t rest = buffer_size; 338 size_t rest = buffer_size;
339 339
340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { 340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
341 struct xattr_handler *handler = 341 const struct xattr_handler *handler =
342 ext3_xattr_handler(entry->e_name_index); 342 ext3_xattr_handler(entry->e_name_index);
343 343
344 if (handler) { 344 if (handler) {
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 148a4dfc82ab..377fe7201169 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -58,11 +58,11 @@ struct ext3_xattr_entry {
58 58
59# ifdef CONFIG_EXT3_FS_XATTR 59# ifdef CONFIG_EXT3_FS_XATTR
60 60
61extern struct xattr_handler ext3_xattr_user_handler; 61extern const struct xattr_handler ext3_xattr_user_handler;
62extern struct xattr_handler ext3_xattr_trusted_handler; 62extern const struct xattr_handler ext3_xattr_trusted_handler;
63extern struct xattr_handler ext3_xattr_acl_access_handler; 63extern const struct xattr_handler ext3_xattr_acl_access_handler;
64extern struct xattr_handler ext3_xattr_acl_default_handler; 64extern const struct xattr_handler ext3_xattr_acl_default_handler;
65extern struct xattr_handler ext3_xattr_security_handler; 65extern const struct xattr_handler ext3_xattr_security_handler;
66 66
67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); 67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
68 68
@@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *);
76extern int init_ext3_xattr(void); 76extern int init_ext3_xattr(void);
77extern void exit_ext3_xattr(void); 77extern void exit_ext3_xattr(void);
78 78
79extern struct xattr_handler *ext3_xattr_handlers[]; 79extern const struct xattr_handler *ext3_xattr_handlers[];
80 80
81# else /* CONFIG_EXT3_FS_XATTR */ 81# else /* CONFIG_EXT3_FS_XATTR */
82 82
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 474348788dd9..03a99bfc59f9 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext3_jbd.h> 10#include <linux/ext3_jbd.h>
@@ -68,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
68 return err; 69 return err;
69} 70}
70 71
71struct xattr_handler ext3_xattr_security_handler = { 72const struct xattr_handler ext3_xattr_security_handler = {
72 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
73 .list = ext3_xattr_security_list, 74 .list = ext3_xattr_security_list,
74 .get = ext3_xattr_security_get, 75 .get = ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index e5562845ed96..dc8edda9ffe0 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
51 value, size, flags); 51 value, size, flags);
52} 52}
53 53
54struct xattr_handler ext3_xattr_trusted_handler = { 54const struct xattr_handler ext3_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext3_xattr_trusted_list, 56 .list = ext3_xattr_trusted_list,
57 .get = ext3_xattr_trusted_get, 57 .get = ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 3bcfe9ee0a68..7a321974d584 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext3_xattr_user_handler = { 57const struct xattr_handler ext3_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext3_xattr_user_list, 59 .list = ext3_xattr_user_list,
60 .get = ext3_xattr_user_get, 60 .get = ext3_xattr_user_get,
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8a2a29d35a6f..feaf498feaa6 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -454,7 +454,7 @@ release_and_out:
454 return error; 454 return error;
455} 455}
456 456
457struct xattr_handler ext4_xattr_acl_access_handler = { 457const struct xattr_handler ext4_xattr_acl_access_handler = {
458 .prefix = POSIX_ACL_XATTR_ACCESS, 458 .prefix = POSIX_ACL_XATTR_ACCESS,
459 .flags = ACL_TYPE_ACCESS, 459 .flags = ACL_TYPE_ACCESS,
460 .list = ext4_xattr_list_acl_access, 460 .list = ext4_xattr_list_acl_access,
@@ -462,7 +462,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = {
462 .set = ext4_xattr_set_acl, 462 .set = ext4_xattr_set_acl,
463}; 463};
464 464
465struct xattr_handler ext4_xattr_acl_default_handler = { 465const struct xattr_handler ext4_xattr_acl_default_handler = {
466 .prefix = POSIX_ACL_XATTR_DEFAULT, 466 .prefix = POSIX_ACL_XATTR_DEFAULT,
467 .flags = ACL_TYPE_DEFAULT, 467 .flags = ACL_TYPE_DEFAULT,
468 .list = ext4_xattr_list_acl_default, 468 .list = ext4_xattr_list_acl_default,
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 ret = ext4_mb_new_blocks(handle, &ar, errp); 591 ret = ext4_mb_new_blocks(handle, &ar, errp);
592 if (count) 592 if (count)
593 *count = ar.len; 593 *count = ar.len;
594
595 /* 594 /*
596 * Account for the allocated meta blocks 595 * Account for the allocated meta blocks. We will never
 596	 * fail EDQUOT for metadata, but we do account for it.
597 */ 597 */
598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
602 dquot_alloc_block_nofail(inode, ar.len);
602 } 603 }
603 return ret; 604 return ret;
604} 605}
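The new dquot_alloc_block_nofail() call charges metadata blocks to the owner's quota without letting that charge fail the allocation — the blocks are already committed by this point. Modeled in plain C (struct quota and the helper are invented for illustration):

#include <stdio.h>

struct quota { long used, limit; };

/* Charge unconditionally: may exceed the limit, never returns EDQUOT.
 * The point is that the books stay accurate even for metadata. */
static void alloc_block_nofail(struct quota *q, long nblocks)
{
	q->used += nblocks;
	if (q->used > q->limit)
		printf("over quota by %ld blocks (allowed for metadata)\n",
		       q->used - q->limit);
}

int main(void)
{
	struct quota q = { .used = 98, .limit = 100 };

	alloc_block_nofail(&q, 4);
	printf("used=%ld limit=%ld\n", q.used, q.limit);
	return 0;
}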
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 983f0e127493..5b6973fbf1bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -18,6 +18,7 @@
18#include <linux/pagemap.h> 18#include <linux/pagemap.h>
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/slab.h>
21#include "ext4.h" 22#include "ext4.h"
22 23
23struct ext4_system_zone { 24struct ext4_system_zone {
@@ -71,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
71 else if (start_blk >= (entry->start_blk + entry->count)) 72 else if (start_blk >= (entry->start_blk + entry->count))
72 n = &(*n)->rb_right; 73 n = &(*n)->rb_right;
73 else { 74 else {
74 if (start_blk + count > (entry->start_blk + 75 if (start_blk + count > (entry->start_blk +
75 entry->count)) 76 entry->count))
76 entry->count = (start_blk + count - 77 entry->count = (start_blk + count -
77 entry->start_blk); 78 entry->start_blk);
78 new_node = *n; 79 new_node = *n;
79 new_entry = rb_entry(new_node, struct ext4_system_zone, 80 new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..ea5e6cb7e2a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 84
85 if (error_msg != NULL) 85 if (error_msg != NULL)
86 __ext4_error(dir->i_sb, function, 86 ext4_error_inode(function, dir,
87 "bad entry in directory #%lu: %s - block=%llu" 87 "bad entry in directory: %s - block=%llu"
88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, 89 error_msg, (unsigned long long) bh->b_blocknr,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset, 90 (unsigned) (offset%bh->b_size), offset,
92 le32_to_cpu(de->inode), 91 le32_to_cpu(de->inode),
93 rlen, de->name_len); 92 rlen, de->name_len);
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
111 110
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 111 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) && 112 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || 113 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) { 114 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir); 115 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) { 116 if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp,
122 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
124 */ 123 */
125 EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; 124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
126 } 125 }
127 stored = 0; 126 stored = 0;
128 offset = filp->f_pos & (sb->s_blocksize - 1); 127 offset = filp->f_pos & (sb->s_blocksize - 1);
129 128
130 while (!error && !stored && filp->f_pos < inode->i_size) { 129 while (!error && !stored && filp->f_pos < inode->i_size) {
131 ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 130 struct ext4_map_blocks map;
132 struct buffer_head map_bh;
133 struct buffer_head *bh = NULL; 131 struct buffer_head *bh = NULL;
134 132
135 map_bh.b_state = 0; 133 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
136 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); 134 map.m_len = 1;
135 err = ext4_map_blocks(NULL, inode, &map, 0);
137 if (err > 0) { 136 if (err > 0) {
138 pgoff_t index = map_bh.b_blocknr >> 137 pgoff_t index = map.m_pblk >>
139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 138 (PAGE_CACHE_SHIFT - inode->i_blkbits);
140 if (!ra_has_index(&filp->f_ra, index)) 139 if (!ra_has_index(&filp->f_ra, index))
141 page_cache_sync_readahead( 140 page_cache_sync_readahead(
@@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp,
143 &filp->f_ra, filp, 142 &filp->f_ra, filp,
144 index, 1); 143 index, 1);
145 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 144 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
146 bh = ext4_bread(NULL, inode, blk, 0, &err); 145 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
147 } 146 }
148 147
149 /* 148 /*
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
152 */ 151 */
153 if (!bh) { 152 if (!bh) {
154 if (!dir_has_error) { 153 if (!dir_has_error) {
155 ext4_error(sb, "directory #%lu " 154 EXT4_ERROR_INODE(inode, "directory "
156 "contains a hole at offset %Lu", 155 "contains a hole at offset %Lu",
157 inode->i_ino,
158 (unsigned long long) filp->f_pos); 156 (unsigned long long) filp->f_pos);
159 dir_has_error = 1; 157 dir_has_error = 1;
160 } 158 }
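ext4_readdir is one of the first callers converted to the new struct ext4_map_blocks, which carries only the logical range in and the physical block plus flags out — far less stack than the buffer_head it replaces. A sketch of that calling convention (toy_map_blocks pretends the file is laid out linearly):

#include <stdio.h>

typedef unsigned long long fsblk_t;

/* Request/response struct in the spirit of ext4_map_blocks */
struct map_blocks {
	fsblk_t      m_pblk;	/* out: first physical block */
	unsigned int m_lblk;	/* in:  first logical block */
	unsigned int m_len;	/* in/out: number of blocks */
	unsigned int m_flags;	/* out: MAPPED/NEW/... */
};

#define MAP_MAPPED 0x1

static int toy_map_blocks(struct map_blocks *map)
{
	map->m_pblk = 8192 + map->m_lblk;	/* pretend linear layout */
	map->m_flags = MAP_MAPPED;
	return (int)map->m_len;		/* number of blocks mapped */
}

int main(void)
{
	struct map_blocks map = { .m_lblk = 42, .m_len = 1 };

	if (toy_map_blocks(&map) > 0 && (map.m_flags & MAP_MAPPED))
		printf("lblk %u -> pblk %llu\n", map.m_lblk, map.m_pblk);
	return 0;
}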
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..19a4de57128a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h> 30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#ifdef __KERNEL__
33#include <linux/compat.h>
34#endif
32 35
33/* 36/*
34 * The fourth extended filesystem constants/structures 37 * The fourth extended filesystem constants/structures
@@ -54,10 +57,10 @@
54#endif 57#endif
55 58
56#define EXT4_ERROR_INODE(inode, fmt, a...) \ 59#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a); 60 ext4_error_inode(__func__, (inode), (fmt), ## a)
58 61
59#define EXT4_ERROR_FILE(file, fmt, a...) \ 62#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a); 63 ext4_error_file(__func__, (file), (fmt), ## a)
61 64
62/* data type for block offset of block group */ 65/* data type for block offset of block group */
63typedef int ext4_grpblk_t; 66typedef int ext4_grpblk_t;
@@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t;
72typedef unsigned int ext4_group_t; 75typedef unsigned int ext4_group_t;
73 76
74/* 77/*
75 * Flags used in mballoc's allocation_context flags field. 78 * Flags used in mballoc's allocation_context flags field.
76 * 79 *
77 * Also used to show what's going on for debugging purposes when the 80 * Also used to show what's going on for debugging purposes when the
78 * flag field is exported via the traceport interface 81 * flag field is exported via the traceport interface
@@ -126,6 +129,29 @@ struct ext4_allocation_request {
126}; 129};
127 130
128/* 131/*
132 * Logical to physical block mapping, used by ext4_map_blocks()
133 *
134 * This structure is used to pass requests into ext4_map_blocks() as
135 * well as to store the information returned by ext4_map_blocks(). It
136 * takes less room on the stack than a struct buffer_head.
137 */
138#define EXT4_MAP_NEW (1 << BH_New)
139#define EXT4_MAP_MAPPED (1 << BH_Mapped)
140#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
141#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
142#define EXT4_MAP_UNINIT (1 << BH_Uninit)
143#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
144 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
145 EXT4_MAP_UNINIT)
146
147struct ext4_map_blocks {
148 ext4_fsblk_t m_pblk;
149 ext4_lblk_t m_lblk;
150 unsigned int m_len;
151 unsigned int m_flags;
152};
153
154/*
129 * For delayed allocation tracking 155 * For delayed allocation tracking
130 */ 156 */
131struct mpage_da_data { 157struct mpage_da_data {
@@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
321 return flags & EXT4_OTHER_FLMASK; 347 return flags & EXT4_OTHER_FLMASK;
322} 348}
323 349
350/*
351 * Inode flags used for atomic set/get
352 */
353enum {
354 EXT4_INODE_SECRM = 0, /* Secure deletion */
355 EXT4_INODE_UNRM = 1, /* Undelete */
356 EXT4_INODE_COMPR = 2, /* Compress file */
357 EXT4_INODE_SYNC = 3, /* Synchronous updates */
358 EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
359 EXT4_INODE_APPEND = 5, /* writes to file may only append */
360 EXT4_INODE_NODUMP = 6, /* do not dump file */
361 EXT4_INODE_NOATIME = 7, /* do not update atime */
362/* Reserved for compression usage... */
363 EXT4_INODE_DIRTY = 8,
364 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
365 EXT4_INODE_NOCOMPR = 10, /* Don't compress */
366 EXT4_INODE_ECOMPR = 11, /* Compression error */
367/* End compression flags --- maybe not all used */
368 EXT4_INODE_INDEX = 12, /* hash-indexed directory */
369 EXT4_INODE_IMAGIC = 13, /* AFS directory */
370 EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
371 EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
372 EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
373 EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
374 EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
375 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
376 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
377 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
378 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
379};
380
381#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
382#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
383 printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
384 EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
385
386/*
387 * Since it's pretty easy to mix up bit numbers and hex values, and we
388 * can't do a compile-time test for ENUM values, we use a run-time
389 * test to make sure that EXT4_XXX_FL is consistent with respect to
390 * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
391 * out so it won't cost any extra space in the compiled kernel image.
392 * But it's important that these values are the same, since we are
393 * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
394 * must be consistent with the values of FS_XXX_FL defined in
395 * include/linux/fs.h and the on-disk values found in ext2, ext3, and
396 * ext4 filesystems, and of course the values defined in e2fsprogs.
397 *
398 * It's not paranoia if the Murphy's Law really *is* out to get you. :-)
399 */
400static inline void ext4_check_flag_values(void)
401{
402 CHECK_FLAG_VALUE(SECRM);
403 CHECK_FLAG_VALUE(UNRM);
404 CHECK_FLAG_VALUE(COMPR);
405 CHECK_FLAG_VALUE(SYNC);
406 CHECK_FLAG_VALUE(IMMUTABLE);
407 CHECK_FLAG_VALUE(APPEND);
408 CHECK_FLAG_VALUE(NODUMP);
409 CHECK_FLAG_VALUE(NOATIME);
410 CHECK_FLAG_VALUE(DIRTY);
411 CHECK_FLAG_VALUE(COMPRBLK);
412 CHECK_FLAG_VALUE(NOCOMPR);
413 CHECK_FLAG_VALUE(ECOMPR);
414 CHECK_FLAG_VALUE(INDEX);
415 CHECK_FLAG_VALUE(IMAGIC);
416 CHECK_FLAG_VALUE(JOURNAL_DATA);
417 CHECK_FLAG_VALUE(NOTAIL);
418 CHECK_FLAG_VALUE(DIRSYNC);
419 CHECK_FLAG_VALUE(TOPDIR);
420 CHECK_FLAG_VALUE(HUGE_FILE);
421 CHECK_FLAG_VALUE(EXTENTS);
422 CHECK_FLAG_VALUE(EA_INODE);
423 CHECK_FLAG_VALUE(EOFBLOCKS);
424 CHECK_FLAG_VALUE(RESERVED);
425}
426
324/* Used to pass group descriptor data when online resize is done */ 427/* Used to pass group descriptor data when online resize is done */
325struct ext4_new_group_input { 428struct ext4_new_group_input {
326 __u32 group; /* Group number for this data */ 429 __u32 group; /* Group number for this data */
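The CHECK_FLAG_VALUE machinery verifies at boot that each EXT4_*_FL mask equals 1 << EXT4_INODE_*, relying on the optimizer to drop the checks when they hold. With C11 the same invariant could instead be a compile-time assertion; a minimal sketch using only the SYNC pair (values taken from the flag definitions above, the remaining pairs omitted):

#include <assert.h>

#define SYNC_FL    0x00000008	/* on-disk mask */
#define INODE_SYNC 3		/* bit number from the enum */

static_assert(SYNC_FL == (1 << INODE_SYNC), "SYNC flag mismatch");

int main(void)
{
	return 0;
}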
@@ -332,6 +435,18 @@ struct ext4_new_group_input {
332 __u16 unused; 435 __u16 unused;
333}; 436};
334 437
438#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
439struct compat_ext4_new_group_input {
440 u32 group;
441 compat_u64 block_bitmap;
442 compat_u64 inode_bitmap;
443 compat_u64 inode_table;
444 u32 blocks_count;
445 u16 reserved_blocks;
446 u16 unused;
447};
448#endif
449
335/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ 450/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
336struct ext4_new_group_data { 451struct ext4_new_group_data {
337 __u32 group; 452 __u32 group;
@@ -355,7 +470,7 @@ struct ext4_new_group_data {
355#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ 470#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
356 EXT4_GET_BLOCKS_CREATE) 471 EXT4_GET_BLOCKS_CREATE)
357 /* Caller is from the delayed allocation writeout path, 472 /* Caller is from the delayed allocation writeout path,
358 so set the magic i_delalloc_reserve_flag after taking the 473 so set the magic i_delalloc_reserve_flag after taking the
359 inode allocation semaphore for */ 474 inode allocation semaphore for */
360#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 475#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
361 /* caller is from the direct IO path, request to creation of an 476 /* caller is from the direct IO path, request to creation of an
@@ -398,6 +513,7 @@ struct ext4_new_group_data {
398#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 513#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
399#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 514#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
400 515
516#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
401/* 517/*
402 * ioctl commands in 32 bit emulation 518 * ioctl commands in 32 bit emulation
403 */ 519 */
@@ -408,11 +524,13 @@ struct ext4_new_group_data {
408#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) 524#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
409#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 525#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
410#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) 526#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
527#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
411#ifdef CONFIG_JBD2_DEBUG 528#ifdef CONFIG_JBD2_DEBUG
412#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) 529#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
413#endif 530#endif
414#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 531#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
415#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 532#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
533#endif
416 534
417 535
418/* 536/*
@@ -616,9 +734,8 @@ struct ext4_ext_cache {
616 */ 734 */
617struct ext4_inode_info { 735struct ext4_inode_info {
618 __le32 i_data[15]; /* unconverted */ 736 __le32 i_data[15]; /* unconverted */
619 __u32 i_flags;
620 ext4_fsblk_t i_file_acl;
621 __u32 i_dtime; 737 __u32 i_dtime;
738 ext4_fsblk_t i_file_acl;
622 739
623 /* 740 /*
624 * i_block_group is the number of the block group which contains 741 * i_block_group is the number of the block group which contains
@@ -629,6 +746,7 @@ struct ext4_inode_info {
629 */ 746 */
630 ext4_group_t i_block_group; 747 ext4_group_t i_block_group;
631 unsigned long i_state_flags; /* Dynamic state flags */ 748 unsigned long i_state_flags; /* Dynamic state flags */
749 unsigned long i_flags;
632 750
633 ext4_lblk_t i_dir_start_lookup; 751 ext4_lblk_t i_dir_start_lookup;
634#ifdef CONFIG_EXT4_FS_XATTR 752#ifdef CONFIG_EXT4_FS_XATTR
@@ -1062,22 +1180,25 @@ enum {
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ 1180 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1181 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1182 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1183 EXT4_STATE_NEWENTRY, /* File just added to dir */
1065}; 1184};
1066 1185
1067static inline int ext4_test_inode_state(struct inode *inode, int bit) 1186#define EXT4_INODE_BIT_FNS(name, field) \
1068{ 1187static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags); 1188{ \
1070} 1189 return test_bit(bit, &EXT4_I(inode)->i_##field); \
1071 1190} \
1072static inline void ext4_set_inode_state(struct inode *inode, int bit) 1191static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1073{ 1192{ \
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags); 1193 set_bit(bit, &EXT4_I(inode)->i_##field); \
1194} \
1195static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1196{ \
1197 clear_bit(bit, &EXT4_I(inode)->i_##field); \
1075} 1198}
1076 1199
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit) 1200EXT4_INODE_BIT_FNS(flag, flags)
1078{ 1201EXT4_INODE_BIT_FNS(state, state_flags)
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
1081#else 1202#else
1082/* Assume that user mode programs are passing in an ext4fs superblock, not 1203/* Assume that user mode programs are passing in an ext4fs superblock, not
1083 * a kernel struct super_block. This will allow us to call the feature-test 1204 * a kernel struct super_block. This will allow us to call the feature-test
@@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 {
1264 1385
1265#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ 1386#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
1266 EXT4_FEATURE_COMPAT_DIR_INDEX) && \ 1387 EXT4_FEATURE_COMPAT_DIR_INDEX) && \
1267 (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) 1388 ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
1268#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) 1389#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
1269#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) 1390#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
1270 1391
@@ -1398,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1398extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1519extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1399 1520
1400/* fsync.c */ 1521/* fsync.c */
1401extern int ext4_sync_file(struct file *, struct dentry *, int); 1522extern int ext4_sync_file(struct file *, int);
1402 1523
1403/* hash.c */ 1524/* hash.c */
1404extern int ext4fs_dirhash(const char *name, int len, struct 1525extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1678,6 +1799,7 @@ struct ext4_group_info {
1678 ext4_grpblk_t bb_first_free; /* first free block */ 1799 ext4_grpblk_t bb_first_free; /* first free block */
1679 ext4_grpblk_t bb_free; /* total free blocks */ 1800 ext4_grpblk_t bb_free; /* total free blocks */
1680 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 1801 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1802 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
1681 struct list_head bb_prealloc_list; 1803 struct list_head bb_prealloc_list;
1682#ifdef DOUBLE_CHECK 1804#ifdef DOUBLE_CHECK
1683 void *bb_bitmap; 1805 void *bb_bitmap;
@@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1772extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1894extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1773extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1895extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1774 int chunk); 1896 int chunk);
1775extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1897extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
1776 ext4_lblk_t iblock, unsigned int max_blocks, 1898 struct ext4_map_blocks *map, int flags);
1777 struct buffer_head *bh_result, int flags);
1778extern void ext4_ext_truncate(struct inode *); 1899extern void ext4_ext_truncate(struct inode *);
1779extern void ext4_ext_init(struct super_block *); 1900extern void ext4_ext_init(struct super_block *);
1780extern void ext4_ext_release(struct super_block *); 1901extern void ext4_ext_release(struct super_block *);
@@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1782 loff_t len); 1903 loff_t len);
1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1904extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1784 ssize_t len); 1905 ssize_t len);
1906extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
1907 struct ext4_map_blocks *map, int flags);
1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1908extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1786 sector_t block, unsigned int max_blocks, 1909 sector_t block, unsigned int max_blocks,
1787 struct buffer_head *bh, int flags); 1910 struct buffer_head *bh, int flags);
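EXT4_INODE_BIT_FNS stamps out matching test/set/clear accessors for both i_flags and i_state_flags, which is what lets the rest of this series replace raw "& EXT4_*_FL" tests with ext4_test_inode_flag() calls. A userspace rendering of the macro — plain bit operations here, where the kernel versions use the atomic test_bit/set_bit/clear_bit:

#include <stdio.h>

struct toy_inode {
	unsigned long flags;
	unsigned long state_flags;
};

/* One macro generates the whole accessor triplet per flags word */
#define INODE_BIT_FNS(name, field)					\
static int test_inode_##name(struct toy_inode *i, int bit)		\
{ return !!(i->field & (1UL << bit)); }					\
static void set_inode_##name(struct toy_inode *i, int bit)		\
{ i->field |= (1UL << bit); }						\
static void clear_inode_##name(struct toy_inode *i, int bit)		\
{ i->field &= ~(1UL << bit); }

INODE_BIT_FNS(flag, flags)
INODE_BIT_FNS(state, state_flags)

enum { INODE_INDEX = 12 };	/* hash-indexed directory */

int main(void)
{
	struct toy_inode inode = { 0, 0 };

	set_inode_flag(&inode, INODE_INDEX);
	printf("index flag: %d\n", test_inode_flag(&inode, INODE_INDEX));
	clear_inode_flag(&inode, INODE_INDEX);
	printf("index flag: %d\n", test_inode_flag(&inode, INODE_INDEX));

	set_inode_state(&inode, 1);
	printf("state bit 1: %d\n", test_inode_state(&inode, 1));
	clear_inode_state(&inode, 1);
	return 0;
}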
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..dade0c024797 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
273 return 1; 273 return 1;
274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
275 return 1; 275 return 1;
276 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 276 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
277 return 1; 277 return 1;
278 return 0; 278 return 0;
279} 279}
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
284 return 0; 284 return 0;
285 if (!S_ISREG(inode->i_mode)) 285 if (!S_ISREG(inode->i_mode))
286 return 0; 286 return 0;
287 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 287 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
288 return 0; 288 return 0;
289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
290 return 1; 290 return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
297 return 0; 297 return 0;
298 if (EXT4_JOURNAL(inode) == NULL) 298 if (EXT4_JOURNAL(inode) == NULL)
299 return 1; 299 return 1;
300 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 300 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
301 return 0; 301 return 0;
302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
303 return 1; 303 return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
321 return 0; 321 return 0;
322 if (!S_ISREG(inode->i_mode)) 322 if (!S_ISREG(inode->i_mode))
323 return 0; 323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 324 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
325 return 0; 325 return 0;
326 if (ext4_should_journal_data(inode)) 326 if (ext4_should_journal_data(inode))
327 return 0; 327 return 0;
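All four predicates in this file move from open-coded i_flags mask tests to ext4_test_inode_flag(). The helper family is essentially atomic bitops on EXT4_I(inode)->i_flags; a sketch, assuming each EXT4_INODE_* enum value equals the bit position of the matching EXT4_*_FL mask and that i_flags is an unsigned long:

        static inline int ext4_test_inode_flag(struct inode *inode, int bit)
        {
                return test_bit(bit, &EXT4_I(inode)->i_flags);
        }

        static inline void ext4_set_inode_flag(struct inode *inode, int bit)
        {
                set_bit(bit, &EXT4_I(inode)->i_flags);
        }

        static inline void ext4_clear_inode_flag(struct inode *inode, int bit)
        {
                clear_bit(bit, &EXT4_I(inode)->i_flags);
        }

The atomic bitops close the read-modify-write races that concurrent i_flags |= / i_flags &= ~ updates were exposed to.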
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 94c8ee81f5e1..377309c1af65 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
107 if (err <= 0) 107 if (err <= 0)
108 return err; 108 return err;
109 err = ext4_truncate_restart_trans(handle, inode, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /* 110 if (err == 0)
111 * We have dropped i_data_sem so someone might have cached again 111 err = -EAGAIN;
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115 112
116 return err; 113 return err;
117} 114}
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
185 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 182 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
186 /* 183 /*
187 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 184 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
188 * block groups per flexgroup, reserve the first block 185 * block groups per flexgroup, reserve the first block
189 * group for directories and special files. Regular 186 * group for directories and special files. Regular
190 * files will start at the second block group. This 187 * files will start at the second block group. This
191 * tends to speed up directory access and improves 188 * tends to speed up directory access and improves
192 * fsck times. 189 * fsck times.
193 */ 190 */
194 block_group &= ~(flex_size-1); 191 block_group &= ~(flex_size-1);
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
439 return 0; 436 return 0;
440 437
441corrupted: 438corrupted:
442 __ext4_error(inode->i_sb, function, 439 ext4_error_inode(function, inode,
443 "bad header/extent in inode #%lu: %s - magic %x, " 440 "bad header/extent: %s - magic %x, "
444 "entries %u, max %u(%u), depth %u(%u)", 441 "entries %u, max %u(%u), depth %u(%u)",
445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 442 error_msg, le16_to_cpu(eh->eh_magic),
446 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 443 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
447 max, le16_to_cpu(eh->eh_depth), depth); 444 max, le16_to_cpu(eh->eh_depth), depth);
448 445
@@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
1622 merge_done = 1; 1619 merge_done = 1;
1623 WARN_ON(eh->eh_entries == 0); 1620 WARN_ON(eh->eh_entries == 0);
1624 if (!eh->eh_entries) 1621 if (!eh->eh_entries)
1625 ext4_error(inode->i_sb, 1622 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1628 } 1623 }
1629 1624
1630 return merge_done; 1625 return merge_done;
@@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2039 struct ext4_ext_cache *cex; 2034 struct ext4_ext_cache *cex;
2040 int ret = EXT4_EXT_CACHE_NO; 2035 int ret = EXT4_EXT_CACHE_NO;
2041 2036
2042 /* 2037 /*
2043 * We borrow i_block_reservation_lock to protect i_cached_extent 2038 * We borrow i_block_reservation_lock to protect i_cached_extent
2044 */ 2039 */
2045 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2040 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2361 int depth = ext_depth(inode); 2356 int depth = ext_depth(inode);
2362 struct ext4_ext_path *path; 2357 struct ext4_ext_path *path;
2363 handle_t *handle; 2358 handle_t *handle;
2364 int i = 0, err = 0; 2359 int i, err;
2365 2360
2366 ext_debug("truncate since %u\n", start); 2361 ext_debug("truncate since %u\n", start);
2367 2362
@@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2370 if (IS_ERR(handle)) 2365 if (IS_ERR(handle))
2371 return PTR_ERR(handle); 2366 return PTR_ERR(handle);
2372 2367
2368again:
2373 ext4_ext_invalidate_cache(inode); 2369 ext4_ext_invalidate_cache(inode);
2374 2370
2375 /* 2371 /*
2376 * We start scanning from right side, freeing all the blocks 2372 * We start scanning from right side, freeing all the blocks
2377 * after i_size and walking into the tree depth-wise. 2373 * after i_size and walking into the tree depth-wise.
2378 */ 2374 */
2375 depth = ext_depth(inode);
2379 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2376 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2380 if (path == NULL) { 2377 if (path == NULL) {
2381 ext4_journal_stop(handle); 2378 ext4_journal_stop(handle);
2382 return -ENOMEM; 2379 return -ENOMEM;
2383 } 2380 }
2381 path[0].p_depth = depth;
2384 path[0].p_hdr = ext_inode_hdr(inode); 2382 path[0].p_hdr = ext_inode_hdr(inode);
2385 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2383 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2386 err = -EIO; 2384 err = -EIO;
2387 goto out; 2385 goto out;
2388 } 2386 }
2389 path[0].p_depth = depth; 2387 i = err = 0;
2390 2388
2391 while (i >= 0 && err == 0) { 2389 while (i >= 0 && err == 0) {
2392 if (i == depth) { 2390 if (i == depth) {
@@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2480out: 2478out:
2481 ext4_ext_drop_refs(path); 2479 ext4_ext_drop_refs(path);
2482 kfree(path); 2480 kfree(path);
2481 if (err == -EAGAIN)
2482 goto again;
2483 ext4_journal_stop(handle); 2483 ext4_journal_stop(handle);
2484 2484
2485 return err; 2485 return err;
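Together with the ext4_ext_truncate_extend_restart() hunk above, which now reports -EAGAIN after a successful journal restart instead of invalidating the extent cache in place, the remove-space path becomes a restartable loop: whenever a restart dropped i_data_sem, the whole tree walk is redone from a fresh path. The resulting shape, condensed into a sketch (walk_and_free_extents() is a hypothetical stand-in for the `while (i >= 0 && err == 0)` walk):

again:
        ext4_ext_invalidate_cache(inode);

        depth = ext_depth(inode);       /* re-read: may have changed */
        path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
        if (path == NULL) {
                ext4_journal_stop(handle);
                return -ENOMEM;
        }
        path[0].p_depth = depth;
        path[0].p_hdr = ext_inode_hdr(inode);

        /* returns -EAGAIN if a journal restart made the cached walk stale */
        err = walk_and_free_extents(inode, path, start);

        ext4_ext_drop_refs(path);
        kfree(path);
        if (err == -EAGAIN)
                goto again;
        ext4_journal_stop(handle);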
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
2544/* FIXME!! we need to try to merge to left or right after zero-out */ 2544/* FIXME!! we need to try to merge to left or right after zero-out */
2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2546{ 2546{
2547 int ret = -EIO; 2547 int ret;
2548 struct bio *bio; 2548 struct bio *bio;
2549 int blkbits, blocksize; 2549 int blkbits, blocksize;
2550 sector_t ee_pblock; 2550 sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2568 len = ee_len; 2568 len = ee_len;
2569 2569
2570 bio = bio_alloc(GFP_NOIO, len); 2570 bio = bio_alloc(GFP_NOIO, len);
2571 if (!bio)
2572 return -ENOMEM;
2573
2571 bio->bi_sector = ee_pblock; 2574 bio->bi_sector = ee_pblock;
2572 bio->bi_bdev = inode->i_sb->s_bdev; 2575 bio->bi_bdev = inode->i_sb->s_bdev;
2573 2576
@@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2595 submit_bio(WRITE, bio); 2598 submit_bio(WRITE, bio);
2596 wait_for_completion(&event); 2599 wait_for_completion(&event);
2597 2600
2598 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 2601 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2599 ret = 0; 2602 bio_put(bio);
2600 else { 2603 return -EIO;
2601 ret = -EIO;
2602 break;
2603 } 2604 }
2604 bio_put(bio); 2605 bio_put(bio);
2605 ee_len -= done; 2606 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9); 2607 ee_pblock += done << (blkbits - 9);
2607 } 2608 }
2608 return ret; 2609 return 0;
2609} 2610}
2610 2611
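The reworked ext4_ext_zeroout() now fails fast: -ENOMEM when bio_alloc() returns NULL, -EIO as soon as one bio completes without BIO_UPTODATE, and 0 only after the whole extent has been written. The synchronous submit-and-wait core that each loop iteration runs looks roughly like this (a sketch; adding the ZERO_PAGE-backed segments is elided into a comment):

        struct completion event;
        struct bio *bio;

        bio = bio_alloc(GFP_NOIO, len);         /* len: pages this pass can hold */
        if (!bio)
                return -ENOMEM;
        bio->bi_sector = ee_pblock;             /* in 512-byte sectors */
        bio->bi_bdev = inode->i_sb->s_bdev;

        /* bio_add_page() zero pages here, counting the blocks covered */

        init_completion(&event);
        bio->bi_private = &event;
        bio->bi_end_io = bi_complete;           /* completes bio->bi_private */
        submit_bio(WRITE, bio);
        wait_for_completion(&event);

        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
                bio_put(bio);
                return -EIO;
        }
        bio_put(bio);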
2611#define EXT4_EXT_ZERO_LEN 7 2612#define EXT4_EXT_ZERO_LEN 7
2612/* 2613/*
2613 * This function is called by ext4_ext_get_blocks() if someone tries to write 2614 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized 2615 * to an uninitialized extent. It may result in splitting the uninitialized
2615 * extent into multiple extents (up to three - one initialized and two 2616 * extent into multiple extents (up to three - one initialized and two
2616 * uninitialized). 2617 * uninitialized).
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2620 * c> Splits in three extents: Someone is writing in the middle of the extent 2621 * c> Splits in three extents: Someone is writing in the middle of the extent
2621 */ 2622 */
2622static int ext4_ext_convert_to_initialized(handle_t *handle, 2623static int ext4_ext_convert_to_initialized(handle_t *handle,
2623 struct inode *inode, 2624 struct inode *inode,
2624 struct ext4_ext_path *path, 2625 struct ext4_map_blocks *map,
2625 ext4_lblk_t iblock, 2626 struct ext4_ext_path *path)
2626 unsigned int max_blocks)
2627{ 2627{
2628 struct ext4_extent *ex, newex, orig_ex; 2628 struct ext4_extent *ex, newex, orig_ex;
2629 struct ext4_extent *ex1 = NULL; 2629 struct ext4_extent *ex1 = NULL;
2630 struct ext4_extent *ex2 = NULL; 2630 struct ext4_extent *ex2 = NULL;
2631 struct ext4_extent *ex3 = NULL; 2631 struct ext4_extent *ex3 = NULL;
2632 struct ext4_extent_header *eh; 2632 struct ext4_extent_header *eh;
2633 ext4_lblk_t ee_block; 2633 ext4_lblk_t ee_block, eof_block;
2634 unsigned int allocated, ee_len, depth; 2634 unsigned int allocated, ee_len, depth;
2635 ext4_fsblk_t newblock; 2635 ext4_fsblk_t newblock;
2636 int err = 0; 2636 int err = 0;
2637 int ret = 0; 2637 int ret = 0;
2638 int may_zeroout;
2639
2640 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
2641 "block %llu, max_blocks %u\n", inode->i_ino,
2642 (unsigned long long)map->m_lblk, map->m_len);
2643
2644 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2645 inode->i_sb->s_blocksize_bits;
2646 if (eof_block < map->m_lblk + map->m_len)
2647 eof_block = map->m_lblk + map->m_len;
2638 2648
2639 depth = ext_depth(inode); 2649 depth = ext_depth(inode);
2640 eh = path[depth].p_hdr; 2650 eh = path[depth].p_hdr;
2641 ex = path[depth].p_ext; 2651 ex = path[depth].p_ext;
2642 ee_block = le32_to_cpu(ex->ee_block); 2652 ee_block = le32_to_cpu(ex->ee_block);
2643 ee_len = ext4_ext_get_actual_len(ex); 2653 ee_len = ext4_ext_get_actual_len(ex);
2644 allocated = ee_len - (iblock - ee_block); 2654 allocated = ee_len - (map->m_lblk - ee_block);
2645 newblock = iblock - ee_block + ext_pblock(ex); 2655 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2656
2646 ex2 = ex; 2657 ex2 = ex;
2647 orig_ex.ee_block = ex->ee_block; 2658 orig_ex.ee_block = ex->ee_block;
2648 orig_ex.ee_len = cpu_to_le16(ee_len); 2659 orig_ex.ee_len = cpu_to_le16(ee_len);
2649 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2660 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2650 2661
2662 /*
2663 * It is safe to convert extent to initialized via explicit
2664 * zeroout only if extent is fully inside i_size or new_size.
2665 */
2666 may_zeroout = ee_block + ee_len <= eof_block;
2667
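Zeroing out converts the extent to initialized, so it is only legal when no part of the extent lies beyond EOF; otherwise blocks preallocated past i_size (fallocate with KEEP_SIZE) would silently become initialized. A worked example of the gate just computed, assuming 4096-byte blocks:

        /* i_size = 10000 bytes, write of m_len = 2 blocks at m_lblk = 1:
         *   eof_block = (10000 + 4095) >> 12 = 3
         *   m_lblk + m_len = 3, so eof_block stays 3
         *
         * extent A: ee_block = 0, ee_len = 3  ->  0 + 3 <= 3, may_zeroout = 1
         * extent B: ee_block = 0, ee_len = 8  ->  0 + 8 >  3, may_zeroout = 0
         *           (B was preallocated past EOF and must stay uninitialized)
         */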
2651 err = ext4_ext_get_access(handle, inode, path + depth); 2668 err = ext4_ext_get_access(handle, inode, path + depth);
2652 if (err) 2669 if (err)
2653 goto out; 2670 goto out;
2654 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */ 2671 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */
2655 if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { 2672 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2656 err = ext4_ext_zeroout(inode, &orig_ex); 2673 err = ext4_ext_zeroout(inode, &orig_ex);
2657 if (err) 2674 if (err)
2658 goto fix_extent_len; 2675 goto fix_extent_len;
@@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2665 return allocated; 2682 return allocated;
2666 } 2683 }
2667 2684
2668 /* ex1: ee_block to iblock - 1 : uninitialized */ 2685 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2669 if (iblock > ee_block) { 2686 if (map->m_lblk > ee_block) {
2670 ex1 = ex; 2687 ex1 = ex;
2671 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2688 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2672 ext4_ext_mark_uninitialized(ex1); 2689 ext4_ext_mark_uninitialized(ex1);
2673 ex2 = &newex; 2690 ex2 = &newex;
2674 } 2691 }
@@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2677 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2694 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2678 * overlap of blocks. 2695 * overlap of blocks.
2679 */ 2696 */
2680 if (!ex1 && allocated > max_blocks) 2697 if (!ex1 && allocated > map->m_len)
2681 ex2->ee_len = cpu_to_le16(max_blocks); 2698 ex2->ee_len = cpu_to_le16(map->m_len);
2682 /* ex3: to ee_block + ee_len : uninitialised */ 2699 /* ex3: to ee_block + ee_len : uninitialised */
2683 if (allocated > max_blocks) { 2700 if (allocated > map->m_len) {
2684 unsigned int newdepth; 2701 unsigned int newdepth;
2685 /* If extent has less than EXT4_EXT_ZERO_LEN zero out directly */ 2702 /* If extent has less than EXT4_EXT_ZERO_LEN zero out directly */
2686 if (allocated <= EXT4_EXT_ZERO_LEN) { 2703 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2687 /* 2704 /*
2688 * iblock == ee_block is handled by the zeroout 2705 * map->m_lblk == ee_block is handled by the zeroout
2689 * at the beginning. 2706 * at the beginning.
2690 * Mark first half uninitialized. 2707 * Mark first half uninitialized.
2691 * Mark second half initialized and zero out the 2708 * Mark second half initialized and zero out the
@@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2698 ext4_ext_dirty(handle, inode, path + depth); 2715 ext4_ext_dirty(handle, inode, path + depth);
2699 2716
2700 ex3 = &newex; 2717 ex3 = &newex;
2701 ex3->ee_block = cpu_to_le32(iblock); 2718 ex3->ee_block = cpu_to_le32(map->m_lblk);
2702 ext4_ext_store_pblock(ex3, newblock); 2719 ext4_ext_store_pblock(ex3, newblock);
2703 ex3->ee_len = cpu_to_le16(allocated); 2720 ex3->ee_len = cpu_to_le16(allocated);
2704 err = ext4_ext_insert_extent(handle, inode, path, 2721 err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2711 ex->ee_len = orig_ex.ee_len; 2728 ex->ee_len = orig_ex.ee_len;
2712 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2729 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2713 ext4_ext_dirty(handle, inode, path + depth); 2730 ext4_ext_dirty(handle, inode, path + depth);
2714 /* blocks available from iblock */ 2731 /* blocks available from map->m_lblk */
2715 return allocated; 2732 return allocated;
2716 2733
2717 } else if (err) 2734 } else if (err)
@@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2733 */ 2750 */
2734 depth = ext_depth(inode); 2751 depth = ext_depth(inode);
2735 ext4_ext_drop_refs(path); 2752 ext4_ext_drop_refs(path);
2736 path = ext4_ext_find_extent(inode, 2753 path = ext4_ext_find_extent(inode, map->m_lblk,
2737 iblock, path); 2754 path);
2738 if (IS_ERR(path)) { 2755 if (IS_ERR(path)) {
2739 err = PTR_ERR(path); 2756 err = PTR_ERR(path);
2740 return err; 2757 return err;
@@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2754 return allocated; 2771 return allocated;
2755 } 2772 }
2756 ex3 = &newex; 2773 ex3 = &newex;
2757 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 2774 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2758 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2775 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2759 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2776 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2760 ext4_ext_mark_uninitialized(ex3); 2777 ext4_ext_mark_uninitialized(ex3);
2761 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2778 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2762 if (err == -ENOSPC) { 2779 if (err == -ENOSPC && may_zeroout) {
2763 err = ext4_ext_zeroout(inode, &orig_ex); 2780 err = ext4_ext_zeroout(inode, &orig_ex);
2764 if (err) 2781 if (err)
2765 goto fix_extent_len; 2782 goto fix_extent_len;
@@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2769 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2786 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2770 ext4_ext_dirty(handle, inode, path + depth); 2787 ext4_ext_dirty(handle, inode, path + depth);
2771 /* zeroed the full extent */ 2788 /* zeroed the full extent */
2772 /* blocks available from iblock */ 2789 /* blocks available from map->m_lblk */
2773 return allocated; 2790 return allocated;
2774 2791
2775 } else if (err) 2792 } else if (err)
@@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2783 * update the extent length after successful insert of the 2800 * update the extent length after successful insert of the
2784 * split extent 2801 * split extent
2785 */ 2802 */
2786 orig_ex.ee_len = cpu_to_le16(ee_len - 2803 ee_len -= ext4_ext_get_actual_len(ex3);
2787 ext4_ext_get_actual_len(ex3)); 2804 orig_ex.ee_len = cpu_to_le16(ee_len);
2805 may_zeroout = ee_block + ee_len <= eof_block;
2806
2788 depth = newdepth; 2807 depth = newdepth;
2789 ext4_ext_drop_refs(path); 2808 ext4_ext_drop_refs(path);
2790 path = ext4_ext_find_extent(inode, iblock, path); 2809 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2791 if (IS_ERR(path)) { 2810 if (IS_ERR(path)) {
2792 err = PTR_ERR(path); 2811 err = PTR_ERR(path);
2793 goto out; 2812 goto out;
@@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2801 if (err) 2820 if (err)
2802 goto out; 2821 goto out;
2803 2822
2804 allocated = max_blocks; 2823 allocated = map->m_len;
2805 2824
2806 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2825 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
2807 * to insert an extent in the middle, zero out directly 2826 * to insert an extent in the middle, zero out directly
2808 * otherwise give the extent a chance to merge to left 2827 * otherwise give the extent a chance to merge to left
2809 */ 2828 */
2810 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2829 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2811 iblock != ee_block) { 2830 map->m_lblk != ee_block && may_zeroout) {
2812 err = ext4_ext_zeroout(inode, &orig_ex); 2831 err = ext4_ext_zeroout(inode, &orig_ex);
2813 if (err) 2832 if (err)
2814 goto fix_extent_len; 2833 goto fix_extent_len;
@@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2818 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2837 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2819 ext4_ext_dirty(handle, inode, path + depth); 2838 ext4_ext_dirty(handle, inode, path + depth);
2820 /* zero out the first half */ 2839 /* zero out the first half */
2821 /* blocks available from iblock */ 2840 /* blocks available from map->m_lblk */
2822 return allocated; 2841 return allocated;
2823 } 2842 }
2824 } 2843 }
@@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2829 */ 2848 */
2830 if (ex1 && ex1 != ex) { 2849 if (ex1 && ex1 != ex) {
2831 ex1 = ex; 2850 ex1 = ex;
2832 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2851 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2833 ext4_ext_mark_uninitialized(ex1); 2852 ext4_ext_mark_uninitialized(ex1);
2834 ex2 = &newex; 2853 ex2 = &newex;
2835 } 2854 }
2836 /* ex2: iblock to iblock + maxblocks-1 : initialised */ 2855 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2837 ex2->ee_block = cpu_to_le32(iblock); 2856 ex2->ee_block = cpu_to_le32(map->m_lblk);
2838 ext4_ext_store_pblock(ex2, newblock); 2857 ext4_ext_store_pblock(ex2, newblock);
2839 ex2->ee_len = cpu_to_le16(allocated); 2858 ex2->ee_len = cpu_to_le16(allocated);
2840 if (ex2 != ex) 2859 if (ex2 != ex)
@@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2877 goto out; 2896 goto out;
2878insert: 2897insert:
2879 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 2898 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2880 if (err == -ENOSPC) { 2899 if (err == -ENOSPC && may_zeroout) {
2881 err = ext4_ext_zeroout(inode, &orig_ex); 2900 err = ext4_ext_zeroout(inode, &orig_ex);
2882 if (err) 2901 if (err)
2883 goto fix_extent_len; 2902 goto fix_extent_len;
@@ -2904,7 +2923,7 @@ fix_extent_len:
2904} 2923}
2905 2924
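Every insert in the split paths now shares the same fallback, newly gated on may_zeroout: when ext4_ext_insert_extent() cannot make room for the split (-ENOSPC), the original extent is zeroed and restored as one initialized extent instead. Assembled from the hunks above, the recurring pattern is:

        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
        if (err == -ENOSPC && may_zeroout) {
                /* no room to grow the tree: zero the whole original
                 * extent and put it back as one initialized extent */
                err = ext4_ext_zeroout(inode, &orig_ex);
                if (err)
                        goto fix_extent_len;
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len = orig_ex.ee_len;
                ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                return allocated;       /* the full range is now written */
        } else if (err)
                goto fix_extent_len;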
2906/* 2925/*
2907 * This function is called by ext4_ext_get_blocks() from 2926 * This function is called by ext4_ext_map_blocks() from
2908 * ext4_get_blocks_dio_write() when DIO to write 2927 * ext4_get_blocks_dio_write() when DIO to write
2909 * to an uninitialized extent. 2928 * to an uninitialized extent.
2910 * 2929 *
@@ -2927,9 +2946,8 @@ fix_extent_len:
2927 */ 2946 */
2928static int ext4_split_unwritten_extents(handle_t *handle, 2947static int ext4_split_unwritten_extents(handle_t *handle,
2929 struct inode *inode, 2948 struct inode *inode,
2949 struct ext4_map_blocks *map,
2930 struct ext4_ext_path *path, 2950 struct ext4_ext_path *path,
2931 ext4_lblk_t iblock,
2932 unsigned int max_blocks,
2933 int flags) 2951 int flags)
2934{ 2952{
2935 struct ext4_extent *ex, newex, orig_ex; 2953 struct ext4_extent *ex, newex, orig_ex;
@@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2937 struct ext4_extent *ex2 = NULL; 2955 struct ext4_extent *ex2 = NULL;
2938 struct ext4_extent *ex3 = NULL; 2956 struct ext4_extent *ex3 = NULL;
2939 struct ext4_extent_header *eh; 2957 struct ext4_extent_header *eh;
2940 ext4_lblk_t ee_block; 2958 ext4_lblk_t ee_block, eof_block;
2941 unsigned int allocated, ee_len, depth; 2959 unsigned int allocated, ee_len, depth;
2942 ext4_fsblk_t newblock; 2960 ext4_fsblk_t newblock;
2943 int err = 0; 2961 int err = 0;
2962 int may_zeroout;
2963
2964 ext_debug("ext4_split_unwritten_extents: inode %lu, logical "
2965 "block %llu, max_blocks %u\n", inode->i_ino,
2966 (unsigned long long)map->m_lblk, map->m_len);
2967
2968 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2969 inode->i_sb->s_blocksize_bits;
2970 if (eof_block < map->m_lblk + map->m_len)
2971 eof_block = map->m_lblk + map->m_len;
2944 2972
2945 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2946 "iblock %llu, max_blocks %u\n", inode->i_ino,
2947 (unsigned long long)iblock, max_blocks);
2948 depth = ext_depth(inode); 2973 depth = ext_depth(inode);
2949 eh = path[depth].p_hdr; 2974 eh = path[depth].p_hdr;
2950 ex = path[depth].p_ext; 2975 ex = path[depth].p_ext;
2951 ee_block = le32_to_cpu(ex->ee_block); 2976 ee_block = le32_to_cpu(ex->ee_block);
2952 ee_len = ext4_ext_get_actual_len(ex); 2977 ee_len = ext4_ext_get_actual_len(ex);
2953 allocated = ee_len - (iblock - ee_block); 2978 allocated = ee_len - (map->m_lblk - ee_block);
2954 newblock = iblock - ee_block + ext_pblock(ex); 2979 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2980
2955 ex2 = ex; 2981 ex2 = ex;
2956 orig_ex.ee_block = ex->ee_block; 2982 orig_ex.ee_block = ex->ee_block;
2957 orig_ex.ee_len = cpu_to_le16(ee_len); 2983 orig_ex.ee_len = cpu_to_le16(ee_len);
2958 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2984 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2959 2985
2960 /* 2986 /*
2987 * It is safe to convert extent to initialized via explicit
2988 * zeroout only if extent is fully inside i_size or new_size.
2989 */
2990 may_zeroout = ee_block + ee_len <= eof_block;
2991
2992 /*
2961 * If the uninitialized extent begins at the same logical 2993 * If the uninitialized extent begins at the same logical
2962 * block where the write begins, and the write completely 2994 * block where the write begins, and the write completely
2963 * covers the extent, then we don't need to split it. 2995 * covers the extent, then we don't need to split it.
2964 */ 2996 */
2965 if ((iblock == ee_block) && (allocated <= max_blocks)) 2997 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2966 return allocated; 2998 return allocated;
2967 2999
2968 err = ext4_ext_get_access(handle, inode, path + depth); 3000 err = ext4_ext_get_access(handle, inode, path + depth);
2969 if (err) 3001 if (err)
2970 goto out; 3002 goto out;
2971 /* ex1: ee_block to iblock - 1 : uninitialized */ 3003 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2972 if (iblock > ee_block) { 3004 if (map->m_lblk > ee_block) {
2973 ex1 = ex; 3005 ex1 = ex;
2974 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3006 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2975 ext4_ext_mark_uninitialized(ex1); 3007 ext4_ext_mark_uninitialized(ex1);
2976 ex2 = &newex; 3008 ex2 = &newex;
2977 } 3009 }
@@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2980 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3012 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2981 * overlap of blocks. 3013 * overlap of blocks.
2982 */ 3014 */
2983 if (!ex1 && allocated > max_blocks) 3015 if (!ex1 && allocated > map->m_len)
2984 ex2->ee_len = cpu_to_le16(max_blocks); 3016 ex2->ee_len = cpu_to_le16(map->m_len);
2985 /* ex3: to ee_block + ee_len : uninitialised */ 3017 /* ex3: to ee_block + ee_len : uninitialised */
2986 if (allocated > max_blocks) { 3018 if (allocated > map->m_len) {
2987 unsigned int newdepth; 3019 unsigned int newdepth;
2988 ex3 = &newex; 3020 ex3 = &newex;
2989 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 3021 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2990 ext4_ext_store_pblock(ex3, newblock + max_blocks); 3022 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2991 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 3023 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2992 ext4_ext_mark_uninitialized(ex3); 3024 ext4_ext_mark_uninitialized(ex3);
2993 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3025 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2994 if (err == -ENOSPC) { 3026 if (err == -ENOSPC && may_zeroout) {
2995 err = ext4_ext_zeroout(inode, &orig_ex); 3027 err = ext4_ext_zeroout(inode, &orig_ex);
2996 if (err) 3028 if (err)
2997 goto fix_extent_len; 3029 goto fix_extent_len;
@@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3001 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3033 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3002 ext4_ext_dirty(handle, inode, path + depth); 3034 ext4_ext_dirty(handle, inode, path + depth);
3003 /* zeroed the full extent */ 3035 /* zeroed the full extent */
3004 /* blocks available from iblock */ 3036 /* blocks available from map->m_lblk */
3005 return allocated; 3037 return allocated;
3006 3038
3007 } else if (err) 3039 } else if (err)
@@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3015 * update the extent length after successful insert of the 3047 * update the extent length after successful insert of the
3016 * split extent 3048 * split extent
3017 */ 3049 */
3018 orig_ex.ee_len = cpu_to_le16(ee_len - 3050 ee_len -= ext4_ext_get_actual_len(ex3);
3019 ext4_ext_get_actual_len(ex3)); 3051 orig_ex.ee_len = cpu_to_le16(ee_len);
3052 may_zeroout = ee_block + ee_len <= eof_block;
3053
3020 depth = newdepth; 3054 depth = newdepth;
3021 ext4_ext_drop_refs(path); 3055 ext4_ext_drop_refs(path);
3022 path = ext4_ext_find_extent(inode, iblock, path); 3056 path = ext4_ext_find_extent(inode, map->m_lblk, path);
3023 if (IS_ERR(path)) { 3057 if (IS_ERR(path)) {
3024 err = PTR_ERR(path); 3058 err = PTR_ERR(path);
3025 goto out; 3059 goto out;
@@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3033 if (err) 3067 if (err)
3034 goto out; 3068 goto out;
3035 3069
3036 allocated = max_blocks; 3070 allocated = map->m_len;
3037 } 3071 }
3038 /* 3072 /*
3039 * If there was a change of depth as part of the 3073 * If there was a change of depth as part of the
@@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3042 */ 3076 */
3043 if (ex1 && ex1 != ex) { 3077 if (ex1 && ex1 != ex) {
3044 ex1 = ex; 3078 ex1 = ex;
3045 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3079 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3046 ext4_ext_mark_uninitialized(ex1); 3080 ext4_ext_mark_uninitialized(ex1);
3047 ex2 = &newex; 3081 ex2 = &newex;
3048 } 3082 }
3049 /* 3083 /*
3050 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, 3084 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3051 * uninitialised still. 3085 * using direct I/O, uninitialised still.
3052 */ 3086 */
3053 ex2->ee_block = cpu_to_le32(iblock); 3087 ex2->ee_block = cpu_to_le32(map->m_lblk);
3054 ext4_ext_store_pblock(ex2, newblock); 3088 ext4_ext_store_pblock(ex2, newblock);
3055 ex2->ee_len = cpu_to_le16(allocated); 3089 ex2->ee_len = cpu_to_le16(allocated);
3056 ext4_ext_mark_uninitialized(ex2); 3090 ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3062 goto out; 3096 goto out;
3063insert: 3097insert:
3064 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3098 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3065 if (err == -ENOSPC) { 3099 if (err == -ENOSPC && may_zeroout) {
3066 err = ext4_ext_zeroout(inode, &orig_ex); 3100 err = ext4_ext_zeroout(inode, &orig_ex);
3067 if (err) 3101 if (err)
3068 goto fix_extent_len; 3102 goto fix_extent_len;
@@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3152 3186
3153static int 3187static int
3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3188ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3155 ext4_lblk_t iblock, unsigned int max_blocks, 3189 struct ext4_map_blocks *map,
3156 struct ext4_ext_path *path, int flags, 3190 struct ext4_ext_path *path, int flags,
3157 unsigned int allocated, struct buffer_head *bh_result, 3191 unsigned int allocated, ext4_fsblk_t newblock)
3158 ext4_fsblk_t newblock)
3159{ 3192{
3160 int ret = 0; 3193 int ret = 0;
3161 int err = 0; 3194 int err = 0;
@@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3163 3196
3164 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " 3197 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3165 "block %llu, max_blocks %u, flags %d, allocated %u", 3198 "block %llu, max_blocks %u, flags %d, allocated %u",
3166 inode->i_ino, (unsigned long long)iblock, max_blocks, 3199 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3167 flags, allocated); 3200 flags, allocated);
3168 ext4_ext_show_leaf(inode, path); 3201 ext4_ext_show_leaf(inode, path);
3169 3202
3170 /* get_block() before submit the IO, split the extent */ 3203 /* get_block() before submit the IO, split the extent */
3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3204 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3172 ret = ext4_split_unwritten_extents(handle, 3205 ret = ext4_split_unwritten_extents(handle, inode, map,
3173 inode, path, iblock, 3206 path, flags);
3174 max_blocks, flags);
3175 /* 3207 /*
3176 * Flag the inode (non-aio case) or end_io struct (aio case) 3208 * Flag the inode (non-aio case) or end_io struct (aio case)
3177 * that this IO needs conversion to written when IO is 3209 * that this IO needs conversion to written when IO is
@@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3182 else 3214 else
3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3215 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode)) 3216 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result); 3217 map->m_flags |= EXT4_MAP_UNINIT;
3186 goto out; 3218 goto out;
3187 } 3219 }
3188 /* IO end_io complete, convert the filled extent to written */ 3220 /* IO end_io complete, convert the filled extent to written */
@@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3210 * the buffer head will be unmapped so that 3242 * the buffer head will be unmapped so that
3211 * a read from the block returns 0s. 3243 * a read from the block returns 0s.
3212 */ 3244 */
3213 set_buffer_unwritten(bh_result); 3245 map->m_flags |= EXT4_MAP_UNWRITTEN;
3214 goto out1; 3246 goto out1;
3215 } 3247 }
3216 3248
3217 /* buffered write, writepage time, convert*/ 3249 /* buffered write, writepage time, convert*/
3218 ret = ext4_ext_convert_to_initialized(handle, inode, 3250 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3219 path, iblock,
3220 max_blocks);
3221 if (ret >= 0) 3251 if (ret >= 0)
3222 ext4_update_inode_fsync_trans(handle, inode, 1); 3252 ext4_update_inode_fsync_trans(handle, inode, 1);
3223out: 3253out:
@@ -3226,7 +3256,7 @@ out:
3226 goto out2; 3256 goto out2;
3227 } else 3257 } else
3228 allocated = ret; 3258 allocated = ret;
3229 set_buffer_new(bh_result); 3259 map->m_flags |= EXT4_MAP_NEW;
3230 /* 3260 /*
3231 * if we allocated more blocks than requested 3261 * if we allocated more blocks than requested
3232 * we need to make sure we unmap the extra block 3262 * we need to make sure we unmap the extra block
@@ -3234,11 +3264,11 @@ out:
3234 * unmapped later when we find the buffer_head marked 3264 * unmapped later when we find the buffer_head marked
3235 * new. 3265 * new.
3236 */ 3266 */
3237 if (allocated > max_blocks) { 3267 if (allocated > map->m_len) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3268 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks, 3269 newblock + map->m_len,
3240 allocated - max_blocks); 3270 allocated - map->m_len);
3241 allocated = max_blocks; 3271 allocated = map->m_len;
3242 } 3272 }
3243 3273
3244 /* 3274 /*
@@ -3252,13 +3282,13 @@ out:
3252 ext4_da_update_reserve_space(inode, allocated, 0); 3282 ext4_da_update_reserve_space(inode, allocated, 0);
3253 3283
3254map_out: 3284map_out:
3255 set_buffer_mapped(bh_result); 3285 map->m_flags |= EXT4_MAP_MAPPED;
3256out1: 3286out1:
3257 if (allocated > max_blocks) 3287 if (allocated > map->m_len)
3258 allocated = max_blocks; 3288 allocated = map->m_len;
3259 ext4_ext_show_leaf(inode, path); 3289 ext4_ext_show_leaf(inode, path);
3260 bh_result->b_bdev = inode->i_sb->s_bdev; 3290 map->m_pblk = newblock;
3261 bh_result->b_blocknr = newblock; 3291 map->m_len = allocated;
3262out2: 3292out2:
3263 if (path) { 3293 if (path) {
3264 ext4_ext_drop_refs(path); 3294 ext4_ext_drop_refs(path);
@@ -3284,26 +3314,23 @@ out2:
3284 * 3314 *
3285 * return < 0, error case. 3315 * return < 0, error case.
3286 */ 3316 */
3287int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 3317int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3288 ext4_lblk_t iblock, 3318 struct ext4_map_blocks *map, int flags)
3289 unsigned int max_blocks, struct buffer_head *bh_result,
3290 int flags)
3291{ 3319{
3292 struct ext4_ext_path *path = NULL; 3320 struct ext4_ext_path *path = NULL;
3293 struct ext4_extent_header *eh; 3321 struct ext4_extent_header *eh;
3294 struct ext4_extent newex, *ex, *last_ex; 3322 struct ext4_extent newex, *ex, *last_ex;
3295 ext4_fsblk_t newblock; 3323 ext4_fsblk_t newblock;
3296 int err = 0, depth, ret, cache_type; 3324 int i, err = 0, depth, ret, cache_type;
3297 unsigned int allocated = 0; 3325 unsigned int allocated = 0;
3298 struct ext4_allocation_request ar; 3326 struct ext4_allocation_request ar;
3299 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3327 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3300 3328
3301 __clear_bit(BH_New, &bh_result->b_state);
3302 ext_debug("blocks %u/%u requested for inode %lu\n", 3329 ext_debug("blocks %u/%u requested for inode %lu\n",
3303 iblock, max_blocks, inode->i_ino); 3330 map->m_lblk, map->m_len, inode->i_ino);
3304 3331
3305 /* check in cache */ 3332 /* check in cache */
3306 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 3333 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3307 if (cache_type) { 3334 if (cache_type) {
3308 if (cache_type == EXT4_EXT_CACHE_GAP) { 3335 if (cache_type == EXT4_EXT_CACHE_GAP) {
3309 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3336 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3316 /* we should allocate requested block */ 3343 /* we should allocate requested block */
3317 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3344 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3318 /* block is already allocated */ 3345 /* block is already allocated */
3319 newblock = iblock 3346 newblock = map->m_lblk
3320 - le32_to_cpu(newex.ee_block) 3347 - le32_to_cpu(newex.ee_block)
3321 + ext_pblock(&newex); 3348 + ext_pblock(&newex);
3322 /* number of remaining blocks in the extent */ 3349 /* number of remaining blocks in the extent */
3323 allocated = ext4_ext_get_actual_len(&newex) - 3350 allocated = ext4_ext_get_actual_len(&newex) -
3324 (iblock - le32_to_cpu(newex.ee_block)); 3351 (map->m_lblk - le32_to_cpu(newex.ee_block));
3325 goto out; 3352 goto out;
3326 } else { 3353 } else {
3327 BUG(); 3354 BUG();
@@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3329 } 3356 }
3330 3357
3331 /* find extent for this block */ 3358 /* find extent for this block */
3332 path = ext4_ext_find_extent(inode, iblock, NULL); 3359 path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3333 if (IS_ERR(path)) { 3360 if (IS_ERR(path)) {
3334 err = PTR_ERR(path); 3361 err = PTR_ERR(path);
3335 path = NULL; 3362 path = NULL;
@@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3345 */ 3372 */
3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3373 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address " 3374 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld", 3375 "lblock: %lu, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block); 3376 (unsigned long) map->m_lblk, depth,
3377 path[depth].p_block);
3350 err = -EIO; 3378 err = -EIO;
3351 goto out2; 3379 goto out2;
3352 } 3380 }
@@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3364 */ 3392 */
3365 ee_len = ext4_ext_get_actual_len(ex); 3393 ee_len = ext4_ext_get_actual_len(ex);
3366 /* if found extent covers block, simply return it */ 3394 /* if found extent covers block, simply return it */
3367 if (in_range(iblock, ee_block, ee_len)) { 3395 if (in_range(map->m_lblk, ee_block, ee_len)) {
3368 newblock = iblock - ee_block + ee_start; 3396 newblock = map->m_lblk - ee_block + ee_start;
3369 /* number of remaining blocks in the extent */ 3397 /* number of remaining blocks in the extent */
3370 allocated = ee_len - (iblock - ee_block); 3398 allocated = ee_len - (map->m_lblk - ee_block);
3371 ext_debug("%u fit into %u:%d -> %llu\n", iblock, 3399 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3372 ee_block, ee_len, newblock); 3400 ee_block, ee_len, newblock);
3373 3401
3374 /* Do not put uninitialized extent in the cache */ 3402 /* Do not put uninitialized extent in the cache */
3375 if (!ext4_ext_is_uninitialized(ex)) { 3403 if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3379 goto out; 3407 goto out;
3380 } 3408 }
3381 ret = ext4_ext_handle_uninitialized_extents(handle, 3409 ret = ext4_ext_handle_uninitialized_extents(handle,
3382 inode, iblock, max_blocks, path, 3410 inode, map, path, flags, allocated,
3383 flags, allocated, bh_result, newblock); 3411 newblock);
3384 return ret; 3412 return ret;
3385 } 3413 }
3386 } 3414 }
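With the conversion a lookup caller no longer prepares a buffer_head at all; it fills m_lblk/m_len and reads the answer back from the same struct. A minimal, hypothetical lookup (flags = 0 and a NULL handle, so nothing is allocated):

        struct ext4_map_blocks map;
        int ret;

        map.m_lblk = lblk;              /* logical block to resolve */
        map.m_len = len;                /* maximum blocks wanted */

        ret = ext4_map_blocks(NULL, inode, &map, 0);
        if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
                printk(KERN_DEBUG "lblk %u -> pblk %llu, %u block(s)\n",
                       map.m_lblk, (unsigned long long) map.m_pblk,
                       map.m_len);
        else if (ret == 0)
                printk(KERN_DEBUG "hole at lblk %u\n", map.m_lblk);

A positive return value is the number of blocks mapped; the EXT4_MAP_* bits in m_flags replace the buffer_head state bits the old interface mutated.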
@@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3394 * put just found gap into cache to speed up 3422 * put just found gap into cache to speed up
3395 * subsequent requests 3423 * subsequent requests
3396 */ 3424 */
3397 ext4_ext_put_gap_in_cache(inode, path, iblock); 3425 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3398 goto out2; 3426 goto out2;
3399 } 3427 }
3400 /* 3428 /*
@@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3402 */ 3430 */
3403 3431
3404 /* find neighbour allocated blocks */ 3432 /* find neighbour allocated blocks */
3405 ar.lleft = iblock; 3433 ar.lleft = map->m_lblk;
3406 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3434 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3407 if (err) 3435 if (err)
3408 goto out2; 3436 goto out2;
3409 ar.lright = iblock; 3437 ar.lright = map->m_lblk;
3410 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3438 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3411 if (err) 3439 if (err)
3412 goto out2; 3440 goto out2;
@@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3417 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3445 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3418 * EXT_UNINIT_MAX_LEN. 3446 * EXT_UNINIT_MAX_LEN.
3419 */ 3447 */
3420 if (max_blocks > EXT_INIT_MAX_LEN && 3448 if (map->m_len > EXT_INIT_MAX_LEN &&
3421 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3449 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3422 max_blocks = EXT_INIT_MAX_LEN; 3450 map->m_len = EXT_INIT_MAX_LEN;
3423 else if (max_blocks > EXT_UNINIT_MAX_LEN && 3451 else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3424 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3452 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3425 max_blocks = EXT_UNINIT_MAX_LEN; 3453 map->m_len = EXT_UNINIT_MAX_LEN;
3426 3454
3427 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 3455 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3428 newex.ee_block = cpu_to_le32(iblock); 3456 newex.ee_block = cpu_to_le32(map->m_lblk);
3429 newex.ee_len = cpu_to_le16(max_blocks); 3457 newex.ee_len = cpu_to_le16(map->m_len);
3430 err = ext4_ext_check_overlap(inode, &newex, path); 3458 err = ext4_ext_check_overlap(inode, &newex, path);
3431 if (err) 3459 if (err)
3432 allocated = ext4_ext_get_actual_len(&newex); 3460 allocated = ext4_ext_get_actual_len(&newex);
3433 else 3461 else
3434 allocated = max_blocks; 3462 allocated = map->m_len;
3435 3463
3436 /* allocate new block */ 3464 /* allocate new block */
3437 ar.inode = inode; 3465 ar.inode = inode;
3438 ar.goal = ext4_ext_find_goal(inode, path, iblock); 3466 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3439 ar.logical = iblock; 3467 ar.logical = map->m_lblk;
3440 ar.len = allocated; 3468 ar.len = allocated;
3441 if (S_ISREG(inode->i_mode)) 3469 if (S_ISREG(inode->i_mode))
3442 ar.flags = EXT4_MB_HINT_DATA; 3470 ar.flags = EXT4_MB_HINT_DATA;
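One detail behind the clamp above: an initialized extent can encode 2^15 blocks, but an uninitialized one gives up a block because the most significant bit of ee_len is the uninitialized marker. Sketching the values as defined in ext4_extents.h:

        /* EXT_INIT_MAX_LEN   = 1 << 15 = 32768 blocks
         * EXT_UNINIT_MAX_LEN = EXT_INIT_MAX_LEN - 1 = 32767 blocks,
         * since ee_len's MSB flags the extent as uninitialized */
        if (map->m_len > EXT_INIT_MAX_LEN &&
            !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
                map->m_len = EXT_INIT_MAX_LEN;
        else if (map->m_len > EXT_UNINIT_MAX_LEN &&
                 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
                map->m_len = EXT_UNINIT_MAX_LEN;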
@@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3498 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3499 }
3472 if (ext4_should_dioread_nolock(inode)) 3500 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result); 3501 map->m_flags |= EXT4_MAP_UNINIT;
3474 } 3502 }
3475 3503
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { 3504 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
3477 if (unlikely(!eh->eh_entries)) { 3505 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode, 3506 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d", 3507 "eh->eh_entries == 0 and "
3480 ex->ee_block); 3508 "EOFBLOCKS_FL set");
3481 err = -EIO; 3509 err = -EIO;
3482 goto out2; 3510 goto out2;
3483 } 3511 }
3484 last_ex = EXT_LAST_EXTENT(eh); 3512 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) 3513 /*
3486 + ext4_ext_get_actual_len(last_ex)) 3514 * If the current leaf block was reached by looking at
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 3515 * the last index block all the way down the tree, and
3516 * we are extending the inode beyond the last extent
3517 * in the current leaf block, then clear the
3518 * EOFBLOCKS_FL flag.
3519 */
3520 for (i = depth-1; i >= 0; i--) {
3521 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3522 break;
3523 }
3524 if ((i < 0) &&
3525 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3526 ext4_ext_get_actual_len(last_ex)))
3527 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3488 } 3528 }
3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3529 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3490 if (err) { 3530 if (err) {
@@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3500 /* previous routine could use block we allocated */ 3540 /* previous routine could use block we allocated */
3501 newblock = ext_pblock(&newex); 3541 newblock = ext_pblock(&newex);
3502 allocated = ext4_ext_get_actual_len(&newex); 3542 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks) 3543 if (allocated > map->m_len)
3504 allocated = max_blocks; 3544 allocated = map->m_len;
3505 set_buffer_new(bh_result); 3545 map->m_flags |= EXT4_MAP_NEW;
3506 3546
3507 /* 3547 /*
3508 * Update reserved blocks/metadata blocks after successful 3548 * Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3516 * when it is _not_ an uninitialized extent. 3556 * when it is _not_ an uninitialized extent.
3517 */ 3557 */
3518 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3558 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3519 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3559 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3520 EXT4_EXT_CACHE_EXTENT); 3560 EXT4_EXT_CACHE_EXTENT);
3521 ext4_update_inode_fsync_trans(handle, inode, 1); 3561 ext4_update_inode_fsync_trans(handle, inode, 1);
3522 } else 3562 } else
3523 ext4_update_inode_fsync_trans(handle, inode, 0); 3563 ext4_update_inode_fsync_trans(handle, inode, 0);
3524out: 3564out:
3525 if (allocated > max_blocks) 3565 if (allocated > map->m_len)
3526 allocated = max_blocks; 3566 allocated = map->m_len;
3527 ext4_ext_show_leaf(inode, path); 3567 ext4_ext_show_leaf(inode, path);
3528 set_buffer_mapped(bh_result); 3568 map->m_flags |= EXT4_MAP_MAPPED;
3529 bh_result->b_bdev = inode->i_sb->s_bdev; 3569 map->m_pblk = newblock;
3530 bh_result->b_blocknr = newblock; 3570 map->m_len = allocated;
3531out2: 3571out2:
3532 if (path) { 3572 if (path) {
3533 ext4_ext_drop_refs(path); 3573 ext4_ext_drop_refs(path);
@@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
3625 * can proceed even if the new size is the same as i_size. 3665 * can proceed even if the new size is the same as i_size.
3626 */ 3666 */
3627 if (new_size > i_size_read(inode)) 3667 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; 3668 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3629 } 3669 }
3630 3670
3631} 3671}
@@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
3640long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3680long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3641{ 3681{
3642 handle_t *handle; 3682 handle_t *handle;
3643 ext4_lblk_t block;
3644 loff_t new_size; 3683 loff_t new_size;
3645 unsigned int max_blocks; 3684 unsigned int max_blocks;
3646 int ret = 0; 3685 int ret = 0;
3647 int ret2 = 0; 3686 int ret2 = 0;
3648 int retries = 0; 3687 int retries = 0;
3649 struct buffer_head map_bh; 3688 struct ext4_map_blocks map;
3650 unsigned int credits, blkbits = inode->i_blkbits; 3689 unsigned int credits, blkbits = inode->i_blkbits;
3651 3690
3652 /* 3691 /*
3653 * currently supporting (pre)allocate mode for extent-based 3692 * currently supporting (pre)allocate mode for extent-based
3654 * files _only_ 3693 * files _only_
3655 */ 3694 */
3656 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3695 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3657 return -EOPNOTSUPP; 3696 return -EOPNOTSUPP;
3658 3697
3659 /* preallocation to directories is currently not supported */ 3698 /* preallocation to directories is currently not supported */
3660 if (S_ISDIR(inode->i_mode)) 3699 if (S_ISDIR(inode->i_mode))
3661 return -ENODEV; 3700 return -ENODEV;
3662 3701
3663 block = offset >> blkbits; 3702 map.m_lblk = offset >> blkbits;
3664 /* 3703 /*
3665 * We can't just convert len to max_blocks because of alignment: 3704 * We can't just convert len to max_blocks because of alignment:
3666 * if blocksize = 4096, offset = 3072 and len = 2048 span two blocks 3705 * if blocksize = 4096, offset = 3072 and len = 2048 span two blocks
3667 */ 3706 */
3668 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3707 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3669 - block; 3708 - map.m_lblk;
3670 /* 3709 /*
3671 * credits to insert 1 extent into extent tree 3710 * credits to insert 1 extent into extent tree
3672 */ 3711 */
3673 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3712 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3674 mutex_lock(&inode->i_mutex); 3713 mutex_lock(&inode->i_mutex);
3714 ret = inode_newsize_ok(inode, (len + offset));
3715 if (ret) {
3716 mutex_unlock(&inode->i_mutex);
3717 return ret;
3718 }
3675retry: 3719retry:
3676 while (ret >= 0 && ret < max_blocks) { 3720 while (ret >= 0 && ret < max_blocks) {
3677 block = block + ret; 3721 map.m_lblk = map.m_lblk + ret;
3678 max_blocks = max_blocks - ret; 3722 map.m_len = max_blocks = max_blocks - ret;
3679 handle = ext4_journal_start(inode, credits); 3723 handle = ext4_journal_start(inode, credits);
3680 if (IS_ERR(handle)) { 3724 if (IS_ERR(handle)) {
3681 ret = PTR_ERR(handle); 3725 ret = PTR_ERR(handle);
3682 break; 3726 break;
3683 } 3727 }
3684 map_bh.b_state = 0; 3728 ret = ext4_map_blocks(handle, inode, &map,
3685 ret = ext4_get_blocks(handle, inode, block,
3686 max_blocks, &map_bh,
3687 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3729 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3688 if (ret <= 0) { 3730 if (ret <= 0) {
3689#ifdef EXT4FS_DEBUG 3731#ifdef EXT4FS_DEBUG
3690 WARN_ON(ret <= 0); 3732 WARN_ON(ret <= 0);
3691 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3733 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3692 "returned error inode#%lu, block=%u, " 3734 "returned error inode#%lu, block=%u, "
3693 "max_blocks=%u", __func__, 3735 "max_blocks=%u", __func__,
3694 inode->i_ino, block, max_blocks); 3736 inode->i_ino, map.m_lblk, max_blocks);
@@ -3697,14 +3739,14 @@ retry:
3697 ret2 = ext4_journal_stop(handle); 3739 ret2 = ext4_journal_stop(handle);
3698 break; 3740 break;
3699 } 3741 }
3700 if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3742 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3701 blkbits) >> blkbits)) 3743 blkbits) >> blkbits))
3702 new_size = offset + len; 3744 new_size = offset + len;
3703 else 3745 else
3704 new_size = (block + ret) << blkbits; 3746 new_size = (map.m_lblk + ret) << blkbits;
3705 3747
3706 ext4_falloc_update_inode(inode, mode, new_size, 3748 ext4_falloc_update_inode(inode, mode, new_size,
3707 buffer_new(&map_bh)); 3749 (map.m_flags & EXT4_MAP_NEW));
3708 ext4_mark_inode_dirty(handle, inode); 3750 ext4_mark_inode_dirty(handle, inode);
3709 ret2 = ext4_journal_stop(handle); 3751 ret2 = ext4_journal_stop(handle);
3710 if (ret2) 3752 if (ret2)
@@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3733 ssize_t len) 3775 ssize_t len)
3734{ 3776{
3735 handle_t *handle; 3777 handle_t *handle;
3736 ext4_lblk_t block;
3737 unsigned int max_blocks; 3778 unsigned int max_blocks;
3738 int ret = 0; 3779 int ret = 0;
3739 int ret2 = 0; 3780 int ret2 = 0;
3740 struct buffer_head map_bh; 3781 struct ext4_map_blocks map;
3741 unsigned int credits, blkbits = inode->i_blkbits; 3782 unsigned int credits, blkbits = inode->i_blkbits;
3742 3783
3743 block = offset >> blkbits; 3784 map.m_lblk = offset >> blkbits;
3744 /* 3785 /*
3745 * We can't just convert len to max_blocks because of alignment: 3786 * We can't just convert len to max_blocks because of alignment:
3746 * if blocksize = 4096, offset = 3072 and len = 2048 span two blocks 3787 * if blocksize = 4096, offset = 3072 and len = 2048 span two blocks
3747 */ 3788 */
3748 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3789 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3749 - block; 3790 map.m_lblk);
3750 /* 3791 /*
3751 * credits to insert 1 extent into extent tree 3792 * credits to insert 1 extent into extent tree
3752 */ 3793 */
3753 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3794 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3754 while (ret >= 0 && ret < max_blocks) { 3795 while (ret >= 0 && ret < max_blocks) {
3755 block = block + ret; 3796 map.m_lblk += ret;
3756 max_blocks = max_blocks - ret; 3797 map.m_len = (max_blocks -= ret);
3757 handle = ext4_journal_start(inode, credits); 3798 handle = ext4_journal_start(inode, credits);
3758 if (IS_ERR(handle)) { 3799 if (IS_ERR(handle)) {
3759 ret = PTR_ERR(handle); 3800 ret = PTR_ERR(handle);
3760 break; 3801 break;
3761 } 3802 }
3762 map_bh.b_state = 0; 3803 ret = ext4_map_blocks(handle, inode, &map,
3763 ret = ext4_get_blocks(handle, inode, block,
3764 max_blocks, &map_bh,
3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3804 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3766 if (ret <= 0) { 3805 if (ret <= 0) {
3767 WARN_ON(ret <= 0); 3806 WARN_ON(ret <= 0);
3768 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3807 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3769 "returned error inode#%lu, block=%u, " 3808 "returned error inode#%lu, block=%u, "
3770 "max_blocks=%u", __func__, 3809 "max_blocks=%u", __func__,
3771 inode->i_ino, block, max_blocks); 3810 inode->i_ino, map.m_lblk, map.m_len);
3772 } 3811 }
3773 ext4_mark_inode_dirty(handle, inode); 3812 ext4_mark_inode_dirty(handle, inode);
3774 ret2 = ext4_journal_stop(handle); 3813 ret2 = ext4_journal_stop(handle);
@@ -3879,6 +3918,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
3879 physical += offset; 3918 physical += offset;
3880 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 3919 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3881 flags |= FIEMAP_EXTENT_DATA_INLINE; 3920 flags |= FIEMAP_EXTENT_DATA_INLINE;
3921 brelse(iloc.bh);
3882 } else { /* external block */ 3922 } else { /* external block */
3883 physical = EXT4_I(inode)->i_file_acl << blockbits; 3923 physical = EXT4_I(inode)->i_file_acl << blockbits;
3884 length = inode->i_sb->s_blocksize; 3924 length = inode->i_sb->s_blocksize;
@@ -3897,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3897 int error = 0; 3937 int error = 0;
3898 3938
3899 /* fallback to generic here if not in extents fmt */ 3939 /* fallback to generic here if not in extents fmt */
3900 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3940 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3901 return generic_block_fiemap(inode, fieinfo, start, len, 3941 return generic_block_fiemap(inode, fieinfo, start, len,
3902 ext4_get_block); 3942 ext4_get_block);
3903 3943
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..5313ae4cda2d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
66 * is smaller than s_maxbytes, which is for extent-mapped files. 66 * is smaller than s_maxbytes, which is for extent-mapped files.
67 */ 67 */
68 68
69 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 69 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
71 size_t length = iov_length(iov, nr_segs); 71 size_t length = iov_length(iov, nr_segs);
72 72
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..592adf2e546e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37/* 37/*
38 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since
40 * otherwise it will only be written by writeback, leaving a huge
41 * window during which a crash may lose the file. This may apply for
42 * the parent directory's parent as well, and so on recursively, if
43 * they are also freshly created.
44 */
45static void ext4_sync_parent(struct inode *inode)
46{
47 struct dentry *dentry = NULL;
48
49 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
50 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
51 dentry = list_entry(inode->i_dentry.next,
52 struct dentry, d_alias);
53 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
54 break;
55 inode = dentry->d_parent->d_inode;
56 sync_mapping_buffers(inode->i_mapping);
57 }
58}
59
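For the walk above to make progress, something must set EXT4_STATE_NEWENTRY in the first place. The namei.c side of this series is not visible in this hunk, so the producer below is an assumption made for illustration, consistent with how the flag is consumed here: the inode is tagged when its directory entry is created, and the tag is cleared as fsync works its way up the d_parent chain.

	/* assumed sketch: in ext4_add_entry(), once the entry is written */
	if (!retval)
		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);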
60/*
38 * akpm: A new design for ext4_sync_file(). 61 * akpm: A new design for ext4_sync_file().
39 * 62 *
40 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). 63 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -48,9 +71,9 @@
48 * i_mutex lock is held when entering and exiting this function 71 * i_mutex lock is held when entering and exiting this function
49 */ 72 */
50 73
51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 74int ext4_sync_file(struct file *file, int datasync)
52{ 75{
53 struct inode *inode = dentry->d_inode; 76 struct inode *inode = file->f_mapping->host;
54 struct ext4_inode_info *ei = EXT4_I(inode); 77 struct ext4_inode_info *ei = EXT4_I(inode);
55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 78 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
56 int ret; 79 int ret;
@@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
58 81
59 J_ASSERT(ext4_journal_current_handle() == NULL); 82 J_ASSERT(ext4_journal_current_handle() == NULL);
60 83
61 trace_ext4_sync_file(file, dentry, datasync); 84 trace_ext4_sync_file(file, datasync);
62 85
63 if (inode->i_sb->s_flags & MS_RDONLY) 86 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0; 87 return 0;
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
66 ret = flush_completed_IO(inode); 89 ret = flush_completed_IO(inode);
67 if (ret < 0) 90 if (ret < 0)
68 return ret; 91 return ret;
69 92
70 if (!journal) 93 if (!journal) {
71 return simple_fsync(file, dentry, datasync); 94 ret = generic_file_fsync(file, datasync);
95 if (!ret && !list_empty(&inode->i_dentry))
96 ext4_sync_parent(inode);
97 return ret;
98 }
72 99
73 /* 100 /*
74 * data=writeback,ordered: 101 * data=writeback,ordered:
@@ -100,9 +127,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
100 if (ext4_should_writeback_data(inode) && 127 if (ext4_should_writeback_data(inode) &&
101 (journal->j_fs_dev != journal->j_dev) && 128 (journal->j_fs_dev != journal->j_dev) &&
102 (journal->j_flags & JBD2_BARRIER)) 129 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 jbd2_log_wait_commit(journal, commit_tid); 131 NULL, BLKDEV_IFL_WAIT);
132 ret = jbd2_log_wait_commit(journal, commit_tid);
105 } else if (journal->j_flags & JBD2_BARRIER) 133 } else if (journal->j_flags & JBD2_BARRIER)
106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
135 BLKDEV_IFL_WAIT);
107 return ret; 136 return ret;
108} 137}
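Only the tail of ext4_sync_file() is visible in these hunks, so the barrier handling is easier to follow as a condensed sketch. The enclosing branch structure below is reconstructed from the visible context (an assumption, not a quotation from the patch); the blkdev_issue_flush() signature is the new four-argument form adopted above.

	if (jbd2_log_start_commit(journal, commit_tid)) {
		/*
		 * A commit was kicked off.  With data=writeback on an
		 * external journal device, flush the data device before
		 * waiting, so file data is durable no later than the
		 * commit record.
		 */
		if (ext4_should_writeback_data(inode) &&
		    (journal->j_fs_dev != journal->j_dev) &&
		    (journal->j_flags & JBD2_BARRIER))
			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
					   NULL, BLKDEV_IFL_WAIT);
		ret = jbd2_log_wait_commit(journal, commit_tid);
	} else if (journal->j_flags & JBD2_BARRIER) {
		/* nothing to commit: a plain cache flush suffices */
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
				   BLKDEV_IFL_WAIT);
	}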
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 361c0b9962a8..25c4b3173fd9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
240 if (fatal) 240 if (fatal)
241 goto error_return; 241 goto error_return;
242 242
243 /* Ok, now we can actually update the inode bitmaps.. */ 243 fatal = -ESRCH;
244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 gdp = ext4_get_group_desc(sb, block_group, &bh2);
245 bit, bitmap_bh->b_data); 245 if (gdp) {
246 if (!cleared)
247 ext4_error(sb, "bit already cleared for inode %lu", ino);
248 else {
249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
250
251 BUFFER_TRACE(bh2, "get_write_access"); 246 BUFFER_TRACE(bh2, "get_write_access");
252 fatal = ext4_journal_get_write_access(handle, bh2); 247 fatal = ext4_journal_get_write_access(handle, bh2);
253 if (fatal) goto error_return; 248 }
254 249 ext4_lock_group(sb, block_group);
255 if (gdp) { 250 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
256 ext4_lock_group(sb, block_group); 251 if (fatal || !cleared) {
257 count = ext4_free_inodes_count(sb, gdp) + 1; 252 ext4_unlock_group(sb, block_group);
258 ext4_free_inodes_set(sb, gdp, count); 253 goto out;
259 if (is_directory) { 254 }
260 count = ext4_used_dirs_count(sb, gdp) - 1;
261 ext4_used_dirs_set(sb, gdp, count);
262 if (sbi->s_log_groups_per_flex) {
263 ext4_group_t f;
264
265 f = ext4_flex_group(sbi, block_group);
266 atomic_dec(&sbi->s_flex_groups[f].free_inodes);
267 }
268 255
269 } 256 count = ext4_free_inodes_count(sb, gdp) + 1;
270 gdp->bg_checksum = ext4_group_desc_csum(sbi, 257 ext4_free_inodes_set(sb, gdp, count);
271 block_group, gdp); 258 if (is_directory) {
272 ext4_unlock_group(sb, block_group); 259 count = ext4_used_dirs_count(sb, gdp) - 1;
273 percpu_counter_inc(&sbi->s_freeinodes_counter); 260 ext4_used_dirs_set(sb, gdp, count);
274 if (is_directory) 261 percpu_counter_dec(&sbi->s_dirs_counter);
275 percpu_counter_dec(&sbi->s_dirs_counter);
276
277 if (sbi->s_log_groups_per_flex) {
278 ext4_group_t f;
279
280 f = ext4_flex_group(sbi, block_group);
281 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
282 }
283 }
284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
286 if (!fatal) fatal = err;
287 } 262 }
288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); 263 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 264 ext4_unlock_group(sb, block_group);
290 if (!fatal) 265
291 fatal = err; 266 percpu_counter_inc(&sbi->s_freeinodes_counter);
292 sb->s_dirt = 1; 267 if (sbi->s_log_groups_per_flex) {
268 ext4_group_t f = ext4_flex_group(sbi, block_group);
269
270 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
271 if (is_directory)
272 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
273 }
274 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
275 fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
276out:
277 if (cleared) {
278 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
280 if (!fatal)
281 fatal = err;
282 sb->s_dirt = 1;
283 } else
284 ext4_error(sb, "bit already cleared for inode %lu", ino);
285
293error_return: 286error_return:
294 brelse(bitmap_bh); 287 brelse(bitmap_bh);
295 ext4_std_error(sb, fatal); 288 ext4_std_error(sb, fatal);
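The rearranged hunks make the new ext4_free_inode() ordering hard to read in diff form. Condensed, with the counter updates elided, the post-patch flow is:

	fatal = -ESRCH;
	gdp = ext4_get_group_desc(sb, block_group, &bh2);
	if (gdp)
		fatal = ext4_journal_get_write_access(handle, bh2);

	ext4_lock_group(sb, block_group);
	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
	if (fatal || !cleared) {
		ext4_unlock_group(sb, block_group);
		goto out;
	}
	/* ... free_inodes/used_dirs counts and group checksum ... */
	ext4_unlock_group(sb, block_group);
	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
out:
	if (cleared)
		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
	else
		ext4_error(sb, "bit already cleared for inode %lu", ino);

The point of the change: the bit is now cleared with plain ext4_clear_bit() under ext4_lock_group() rather than with the lock-free ext4_clear_bit_atomic(), journal write access is obtained before the group lock is taken, and the bitmap buffer is dirtied only when the bit was actually cleared.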
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
499 492
500 if (S_ISDIR(mode) && 493 if (S_ISDIR(mode) &&
501 ((parent == sb->s_root->d_inode) || 494 ((parent == sb->s_root->d_inode) ||
502 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { 495 (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
503 int best_ndir = inodes_per_group; 496 int best_ndir = inodes_per_group;
504 int ret = -1; 497 int ret = -1;
505 498
@@ -773,7 +766,7 @@ static int ext4_claim_inode(struct super_block *sb,
773 if (sbi->s_log_groups_per_flex) { 766 if (sbi->s_log_groups_per_flex) {
774 ext4_group_t f = ext4_flex_group(sbi, group); 767 ext4_group_t f = ext4_flex_group(sbi, group);
775 768
776 atomic_inc(&sbi->s_flex_groups[f].free_inodes); 769 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
777 } 770 }
778 } 771 }
779 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
@@ -979,16 +972,12 @@ got:
979 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 972 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
980 } 973 }
981 974
982 inode->i_uid = current_fsuid(); 975 if (test_opt(sb, GRPID)) {
983 if (test_opt(sb, GRPID)) 976 inode->i_mode = mode;
984 inode->i_gid = dir->i_gid; 977 inode->i_uid = current_fsuid();
985 else if (dir->i_mode & S_ISGID) {
986 inode->i_gid = dir->i_gid; 978 inode->i_gid = dir->i_gid;
987 if (S_ISDIR(mode))
988 mode |= S_ISGID;
989 } else 979 } else
990 inode->i_gid = current_fsgid(); 980 inode_init_owner(inode, dir, mode);
991 inode->i_mode = mode;
992 981
993 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); 982 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
994 /* This is the optimal IO size (for stat), not the fs block size */ 983 /* This is the optimal IO size (for stat), not the fs block size */
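The open-coded uid/gid/setgid logic above is replaced by the new VFS helper. For reference, inode_init_owner() in this kernel generation behaves roughly as follows (quoted from memory, so treat it as a sketch rather than the authoritative body):

	void inode_init_owner(struct inode *inode, const struct inode *dir,
			      mode_t mode)
	{
		inode->i_uid = current_fsuid();
		if (dir && (dir->i_mode & S_ISGID)) {
			inode->i_gid = dir->i_gid;
			if (S_ISDIR(mode))
				mode |= S_ISGID;
		} else
			inode->i_gid = current_fsgid();
		inode->i_mode = mode;
	}

That is why ext4 keeps only its GRPID special case (gid always inherited from the parent directory) open-coded.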
@@ -1045,7 +1034,7 @@ got:
1045 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 1034 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1046 /* set extent flag only for directory, file and normal symlink*/ 1035 /* set extent flag only for directory, file and normal symlink*/
1047 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 1036 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
1048 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 1037 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
1049 ext4_ext_tree_init(handle, inode); 1038 ext4_ext_tree_init(handle, inode);
1050 } 1039 }
1051 } 1040 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986120f30066..19df61c321fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,6 +39,7 @@
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/slab.h>
42 43
43#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
44#include "xattr.h" 45#include "xattr.h"
@@ -148,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
148 int ret; 149 int ret;
149 150
150 /* 151 /*
151 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this 152 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
152 * moment, get_block can be called only for blocks inside i_size since 153 * moment, get_block can be called only for blocks inside i_size since
153 * page cache has been already dropped and writes are blocked by 154 * page cache has been already dropped and writes are blocked by
154 * i_mutex. So we can safely drop the i_data_sem here. 155 * i_mutex. So we can safely drop the i_data_sem here.
@@ -347,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
347 if (blk && 348 if (blk &&
348 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
349 blk, 1))) { 350 blk, 1))) {
350 __ext4_error(inode->i_sb, function, 351 ext4_error_inode(function, inode,
351 "invalid block reference %u " 352 "invalid block reference %u", blk);
352 "in inode #%lu", blk, inode->i_ino);
353 return -EIO; 353 return -EIO;
354 } 354 }
355 } 355 }
@@ -784,7 +784,7 @@ failed:
784 /* Allocation failed, free what we already allocated */ 784 /* Allocation failed, free what we already allocated */
785 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 785 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
786 for (i = 1; i <= n ; i++) { 786 for (i = 1; i <= n ; i++) {
787 /* 787 /*
788 * branch[i].bh is newly allocated, so there is no 788 * branch[i].bh is newly allocated, so there is no
789 * need to revoke the block, which is why we don't 789 * need to revoke the block, which is why we don't
790 * need to set EXT4_FREE_BLOCKS_METADATA. 790 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -874,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
874 874
875err_out: 875err_out:
876 for (i = 1; i <= num; i++) { 876 for (i = 1; i <= num; i++) {
877 /* 877 /*
878 * branch[i].bh is newly allocated, so there is no 878 * branch[i].bh is newly allocated, so there is no
879 * need to revoke the block, which is why we don't 879 * need to revoke the block, which is why we don't
880 * need to set EXT4_FREE_BLOCKS_METADATA. 880 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -889,9 +889,9 @@ err_out:
889} 889}
890 890
891/* 891/*
892 * The ext4_ind_get_blocks() function handles non-extents inodes 892 * The ext4_ind_map_blocks() function handles non-extents inodes
893 * (i.e., using the traditional indirect/double-indirect i_blocks 893 * (i.e., using the traditional indirect/double-indirect i_blocks
894 * scheme) for ext4_get_blocks(). 894 * scheme) for ext4_map_blocks().
895 * 895 *
896 * Allocation strategy is simple: if we have to allocate something, we will 896 * Allocation strategy is simple: if we have to allocate something, we will
897 * have to go the whole way to leaf. So let's do it before attaching anything 897 * have to go the whole way to leaf. So let's do it before attaching anything
@@ -916,9 +916,8 @@ err_out:
916 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 916 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
917 * blocks. 917 * blocks.
918 */ 918 */
919static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, 919static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
920 ext4_lblk_t iblock, unsigned int maxblocks, 920 struct ext4_map_blocks *map,
921 struct buffer_head *bh_result,
922 int flags) 921 int flags)
923{ 922{
924 int err = -EIO; 923 int err = -EIO;
@@ -932,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
932 int count = 0; 931 int count = 0;
933 ext4_fsblk_t first_block = 0; 932 ext4_fsblk_t first_block = 0;
934 933
935 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 934 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
936 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 935 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
937 depth = ext4_block_to_path(inode, iblock, offsets, 936 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
938 &blocks_to_boundary); 937 &blocks_to_boundary);
939 938
940 if (depth == 0) 939 if (depth == 0)
@@ -945,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
945 /* Simplest case - block found, no allocation needed */ 944 /* Simplest case - block found, no allocation needed */
946 if (!partial) { 945 if (!partial) {
947 first_block = le32_to_cpu(chain[depth - 1].key); 946 first_block = le32_to_cpu(chain[depth - 1].key);
948 clear_buffer_new(bh_result);
949 count++; 947 count++;
950 /*map more blocks*/ 948 /*map more blocks*/
951 while (count < maxblocks && count <= blocks_to_boundary) { 949 while (count < map->m_len && count <= blocks_to_boundary) {
952 ext4_fsblk_t blk; 950 ext4_fsblk_t blk;
953 951
954 blk = le32_to_cpu(*(chain[depth-1].p + count)); 952 blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -968,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
968 /* 966 /*
969 * Okay, we need to do block allocation. 967 * Okay, we need to do block allocation.
970 */ 968 */
971 goal = ext4_find_goal(inode, iblock, partial); 969 goal = ext4_find_goal(inode, map->m_lblk, partial);
972 970
973 /* the number of blocks need to allocate for [d,t]indirect blocks */ 971 /* the number of blocks need to allocate for [d,t]indirect blocks */
974 indirect_blks = (chain + depth) - partial - 1; 972 indirect_blks = (chain + depth) - partial - 1;
@@ -978,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
978 * direct blocks to allocate for this branch. 976 * direct blocks to allocate for this branch.
979 */ 977 */
980 count = ext4_blks_to_allocate(partial, indirect_blks, 978 count = ext4_blks_to_allocate(partial, indirect_blks,
981 maxblocks, blocks_to_boundary); 979 map->m_len, blocks_to_boundary);
982 /* 980 /*
983 * Block out ext4_truncate while we alter the tree 981 * Block out ext4_truncate while we alter the tree
984 */ 982 */
985 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 983 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
986 &count, goal, 984 &count, goal,
987 offsets + (partial - chain), partial); 985 offsets + (partial - chain), partial);
988 986
@@ -994,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
994 * may need to return -EAGAIN upwards in the worst case. --sct 992 * may need to return -EAGAIN upwards in the worst case. --sct
995 */ 993 */
996 if (!err) 994 if (!err)
997 err = ext4_splice_branch(handle, inode, iblock, 995 err = ext4_splice_branch(handle, inode, map->m_lblk,
998 partial, indirect_blks, count); 996 partial, indirect_blks, count);
999 if (err) 997 if (err)
1000 goto cleanup; 998 goto cleanup;
1001 999
1002 set_buffer_new(bh_result); 1000 map->m_flags |= EXT4_MAP_NEW;
1003 1001
1004 ext4_update_inode_fsync_trans(handle, inode, 1); 1002 ext4_update_inode_fsync_trans(handle, inode, 1);
1005got_it: 1003got_it:
1006 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1004 map->m_flags |= EXT4_MAP_MAPPED;
1005 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1006 map->m_len = count;
1007 if (count > blocks_to_boundary) 1007 if (count > blocks_to_boundary)
1008 set_buffer_boundary(bh_result); 1008 map->m_flags |= EXT4_MAP_BOUNDARY;
1009 err = count; 1009 err = count;
1010 /* Clean up and exit */ 1010 /* Clean up and exit */
1011 partial = chain + depth - 1; /* the whole chain */ 1011 partial = chain + depth - 1; /* the whole chain */
@@ -1015,7 +1015,6 @@ cleanup:
1015 brelse(partial->bh); 1015 brelse(partial->bh);
1016 partial--; 1016 partial--;
1017 } 1017 }
1018 BUFFER_TRACE(bh_result, "returned");
1019out: 1018out:
1020 return err; 1019 return err;
1021} 1020}
@@ -1035,7 +1034,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1035 sector_t lblock) 1034 sector_t lblock)
1036{ 1035{
1037 struct ext4_inode_info *ei = EXT4_I(inode); 1036 struct ext4_inode_info *ei = EXT4_I(inode);
1038 int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1; 1037 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1039 int blk_bits; 1038 int blk_bits;
1040 1039
1041 if (lblock < EXT4_NDIR_BLOCKS) 1040 if (lblock < EXT4_NDIR_BLOCKS)
@@ -1050,7 +1049,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1050 } 1049 }
1051 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; 1050 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1052 ei->i_da_metadata_calc_len = 1; 1051 ei->i_da_metadata_calc_len = 1;
1053 blk_bits = roundup_pow_of_two(lblock + 1); 1052 blk_bits = order_base_2(lblock);
1054 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 1053 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1055} 1054}
1056 1055
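Two bugs are fixed in ext4_indirect_calc_metadata_amount() here. First, dind_mask was an int holding EXT4_ADDR_PER_BLOCK - 1, so `lblock & dind_mask` kept the low bits instead of clearing them; the new sector_t mask rounds lblock down to an indirect-block boundary. Second, blk_bits received roundup_pow_of_two(lblock + 1), which is a value, where a bit count was needed. A worked example with 4 KiB blocks (EXT4_ADDR_PER_BLOCK_BITS == 10); the numbers follow from the formulas shown, not from a trace:

	/*
	 * lblock = 1 << 20 (around the double/triple-indirect boundary):
	 *   old: blk_bits = roundup_pow_of_two(lblock + 1) = 2097152,
	 *        giving an estimate of 2097152/10 + 1, which is nonsense.
	 *   new: blk_bits = order_base_2(lblock) = 20,
	 *        giving 20/10 + 1 = 3 metadata blocks, i.e. one block per
	 *        level of indirection a mapping this deep can touch.
	 */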
@@ -1060,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1060 */ 1059 */
1061static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1060static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1062{ 1061{
1063 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1062 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1064 return ext4_ext_calc_metadata_amount(inode, lblock); 1063 return ext4_ext_calc_metadata_amount(inode, lblock);
1065 1064
1066 return ext4_indirect_calc_metadata_amount(inode, lblock); 1065 return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1075,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
1075{ 1074{
1076 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1075 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1077 struct ext4_inode_info *ei = EXT4_I(inode); 1076 struct ext4_inode_info *ei = EXT4_I(inode);
1078 int mdb_free = 0, allocated_meta_blocks = 0;
1079 1077
1080 spin_lock(&ei->i_block_reservation_lock); 1078 spin_lock(&ei->i_block_reservation_lock);
1081 trace_ext4_da_update_reserve_space(inode, used); 1079 trace_ext4_da_update_reserve_space(inode, used);
@@ -1090,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
1090 1088
1091 /* Update per-inode reservations */ 1089 /* Update per-inode reservations */
1092 ei->i_reserved_data_blocks -= used; 1090 ei->i_reserved_data_blocks -= used;
1093 used += ei->i_allocated_meta_blocks;
1094 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1091 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1095 allocated_meta_blocks = ei->i_allocated_meta_blocks; 1092 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1093 used + ei->i_allocated_meta_blocks);
1096 ei->i_allocated_meta_blocks = 0; 1094 ei->i_allocated_meta_blocks = 0;
1097 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1098 1095
1099 if (ei->i_reserved_data_blocks == 0) { 1096 if (ei->i_reserved_data_blocks == 0) {
1100 /* 1097 /*
@@ -1102,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
1102 * only when we have written all of the delayed 1099 * only when we have written all of the delayed
1103 * allocation blocks. 1100 * allocation blocks.
1104 */ 1101 */
1105 mdb_free = ei->i_reserved_meta_blocks; 1102 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1103 ei->i_reserved_meta_blocks);
1106 ei->i_reserved_meta_blocks = 0; 1104 ei->i_reserved_meta_blocks = 0;
1107 ei->i_da_metadata_calc_len = 0; 1105 ei->i_da_metadata_calc_len = 0;
1108 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1109 } 1106 }
1110 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1107 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1111 1108
1112 /* Update quota subsystem */ 1109 /* Update quota subsystem for data blocks */
1113 if (quota_claim) { 1110 if (quota_claim)
1114 dquot_claim_block(inode, used); 1111 dquot_claim_block(inode, used);
1115 if (mdb_free) 1112 else {
1116 dquot_release_reservation_block(inode, mdb_free);
1117 } else {
1118 /* 1113 /*
1119 * We did fallocate with an offset that is already delayed 1114 * We did fallocate with an offset that is already delayed
1120 * allocated. So on delayed allocated writeback we should 1115 * allocated. So on delayed allocated writeback we should
1121 * not update the quota for allocated blocks. But then 1116 * not re-claim the quota for fallocated blocks.
1122 * converting an fallocate region to initialized region would
1123 * have caused a metadata allocation. So claim quota for
1124 * that
1125 */ 1117 */
1126 if (allocated_meta_blocks) 1118 dquot_release_reservation_block(inode, used);
1127 dquot_claim_block(inode, allocated_meta_blocks);
1128 dquot_release_reservation_block(inode, mdb_free + used);
1129 } 1119 }
1130 1120
1131 /* 1121 /*
@@ -1138,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
1138 ext4_discard_preallocations(inode); 1128 ext4_discard_preallocations(inode);
1139} 1129}
1140 1130
1141static int check_block_validity(struct inode *inode, const char *msg, 1131static int check_block_validity(struct inode *inode, const char *func,
1142 sector_t logical, sector_t phys, int len) 1132 struct ext4_map_blocks *map)
1143{ 1133{
1144 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1134 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1145 __ext4_error(inode->i_sb, msg, 1135 map->m_len)) {
1146 "inode #%lu logical block %llu mapped to %llu " 1136 ext4_error_inode(func, inode,
1147 "(size %d)", inode->i_ino, 1137 "lblock %lu mapped to illegal pblock %llu "
1148 (unsigned long long) logical, 1138 "(length %d)", (unsigned long) map->m_lblk,
1149 (unsigned long long) phys, len); 1139 map->m_pblk, map->m_len);
1150 return -EIO; 1140 return -EIO;
1151 } 1141 }
1152 return 0; 1142 return 0;
@@ -1211,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1211} 1201}
1212 1202
1213/* 1203/*
1214 * The ext4_get_blocks() function tries to look up the requested blocks, 1204 * The ext4_map_blocks() function tries to look up the requested blocks,
1215 * and returns immediately if the blocks are already mapped. 1205 * and returns immediately if the blocks are already mapped.
1216 * 1206 *
1217 * Otherwise it takes the write lock of the i_data_sem and allocates 1207 * Otherwise it takes the write lock of the i_data_sem and allocates
1218 * blocks, stores the allocated blocks in the result buffer head, and 1208 * blocks, stores the allocated blocks in the result buffer head, and
1219 * marks it mapped. 1209 * marks it mapped.
1220 * 1210 *
1221 * If file type is extents based, it will call ext4_ext_get_blocks(), 1211 * If file type is extents based, it will call ext4_ext_map_blocks(),
1222 * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping 1212 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
1223 * based files 1213 * based files
1224 * 1214 *
1225 * On success, it returns the number of blocks being mapped or allocated. 1215 * On success, it returns the number of blocks being mapped or allocated.
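After the conversion, a typical caller looks roughly like this (a sketch; the struct ext4_map_blocks fields and EXT4_MAP_* flags are the ones this series introduces):

	struct ext4_map_blocks map;
	int ret;

	map.m_lblk = iblock;		/* first logical block of the range */
	map.m_len = len;		/* number of blocks to map */
	ret = ext4_map_blocks(handle, inode, &map,
			      create ? EXT4_GET_BLOCKS_CREATE : 0);
	if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) {
		/* blocks map.m_pblk .. map.m_pblk + map.m_len - 1 back it */
	} else if (ret == 0) {
		/* hole: nothing mapped and (with create == 0) nothing done */
	} else {
		/* ret is a negative errno */
	}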
@@ -1232,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1232 * 1222 *
1233 * It returns the error in case of allocation failure. 1223 * It returns the error in case of allocation failure.
1234 */ 1224 */
1235int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, 1225int ext4_map_blocks(handle_t *handle, struct inode *inode,
1236 unsigned int max_blocks, struct buffer_head *bh, 1226 struct ext4_map_blocks *map, int flags)
1237 int flags)
1238{ 1227{
1239 int retval; 1228 int retval;
1240 1229
1241 clear_buffer_mapped(bh); 1230 map->m_flags = 0;
1242 clear_buffer_unwritten(bh); 1231 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1243 1232 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1244 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," 1233 (unsigned long) map->m_lblk);
1245 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1246 (unsigned long)block);
1247 /* 1234 /*
1248 * Try to see if we can get the block without requesting a new 1235 * Try to see if we can get the block without requesting a new
1249 * file system block. 1236 * file system block.
1250 */ 1237 */
1251 down_read((&EXT4_I(inode)->i_data_sem)); 1238 down_read((&EXT4_I(inode)->i_data_sem));
1252 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1239 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1253 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1240 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1254 bh, 0);
1255 } else { 1241 } else {
1256 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, 1242 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1257 bh, 0);
1258 } 1243 }
1259 up_read((&EXT4_I(inode)->i_data_sem)); 1244 up_read((&EXT4_I(inode)->i_data_sem));
1260 1245
1261 if (retval > 0 && buffer_mapped(bh)) { 1246 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1262 int ret = check_block_validity(inode, "file system corruption", 1247 int ret = check_block_validity(inode, __func__, map);
1263 block, bh->b_blocknr, retval);
1264 if (ret != 0) 1248 if (ret != 0)
1265 return ret; 1249 return ret;
1266 } 1250 }
@@ -1276,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1276 * ext4_ext_get_block(), called with create = 0, returns 1260 * ext4_ext_get_block(), called with create = 0, returns
1277 * with the buffer head unmapped. 1261 * with the buffer head unmapped.
1278 */ 1262 */
1279 if (retval > 0 && buffer_mapped(bh)) 1263 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1280 return retval; 1264 return retval;
1281 1265
1282 /* 1266 /*
@@ -1289,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1289 * of BH_Unwritten and BH_Mapped flags being simultaneously 1273 * of BH_Unwritten and BH_Mapped flags being simultaneously
1290 * set on the buffer_head. 1274 * set on the buffer_head.
1291 */ 1275 */
1292 clear_buffer_unwritten(bh); 1276 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1293 1277
1294 /* 1278 /*
1295 * New blocks allocate and/or writing to uninitialized extent 1279 * New blocks allocate and/or writing to uninitialized extent
@@ -1311,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1311 * We need to check for EXT4 here because migrate 1295 * We need to check for EXT4 here because migrate
1312 * could have changed the inode type in between 1296 * could have changed the inode type in between
1313 */ 1297 */
1314 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1298 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1315 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1299 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1316 bh, flags);
1317 } else { 1300 } else {
1318 retval = ext4_ind_get_blocks(handle, inode, block, 1301 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1319 max_blocks, bh, flags);
1320 1302
1321 if (retval > 0 && buffer_new(bh)) { 1303 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1322 /* 1304 /*
1323 * We allocated new blocks which will result in 1305 * We allocated new blocks which will result in
1324 * i_data's format changing. Force the migrate 1306 * i_data's format changing. Force the migrate
@@ -1341,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1341 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1323 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1342 1324
1343 up_write((&EXT4_I(inode)->i_data_sem)); 1325 up_write((&EXT4_I(inode)->i_data_sem));
1344 if (retval > 0 && buffer_mapped(bh)) { 1326 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1345 int ret = check_block_validity(inode, "file system " 1327 int ret = check_block_validity(inode,
1346 "corruption after allocation", 1328 "ext4_map_blocks_after_alloc",
1347 block, bh->b_blocknr, retval); 1329 map);
1348 if (ret != 0) 1330 if (ret != 0)
1349 return ret; 1331 return ret;
1350 } 1332 }
@@ -1354,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1354/* Maximum number of blocks we map for direct IO at once. */ 1336/* Maximum number of blocks we map for direct IO at once. */
1355#define DIO_MAX_BLOCKS 4096 1337#define DIO_MAX_BLOCKS 4096
1356 1338
1357int ext4_get_block(struct inode *inode, sector_t iblock, 1339static int _ext4_get_block(struct inode *inode, sector_t iblock,
1358 struct buffer_head *bh_result, int create) 1340 struct buffer_head *bh, int flags)
1359{ 1341{
1360 handle_t *handle = ext4_journal_current_handle(); 1342 handle_t *handle = ext4_journal_current_handle();
1343 struct ext4_map_blocks map;
1361 int ret = 0, started = 0; 1344 int ret = 0, started = 0;
1362 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1363 int dio_credits; 1345 int dio_credits;
1364 1346
1365 if (create && !handle) { 1347 map.m_lblk = iblock;
1348 map.m_len = bh->b_size >> inode->i_blkbits;
1349
1350 if (flags && !handle) {
1366 /* Direct IO write... */ 1351 /* Direct IO write... */
1367 if (max_blocks > DIO_MAX_BLOCKS) 1352 if (map.m_len > DIO_MAX_BLOCKS)
1368 max_blocks = DIO_MAX_BLOCKS; 1353 map.m_len = DIO_MAX_BLOCKS;
1369 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1354 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1370 handle = ext4_journal_start(inode, dio_credits); 1355 handle = ext4_journal_start(inode, dio_credits);
1371 if (IS_ERR(handle)) { 1356 if (IS_ERR(handle)) {
1372 ret = PTR_ERR(handle); 1357 ret = PTR_ERR(handle);
1373 goto out; 1358 return ret;
1374 } 1359 }
1375 started = 1; 1360 started = 1;
1376 } 1361 }
1377 1362
1378 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 1363 ret = ext4_map_blocks(handle, inode, &map, flags);
1379 create ? EXT4_GET_BLOCKS_CREATE : 0);
1380 if (ret > 0) { 1364 if (ret > 0) {
1381 bh_result->b_size = (ret << inode->i_blkbits); 1365 map_bh(bh, inode->i_sb, map.m_pblk);
1366 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1367 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1382 ret = 0; 1368 ret = 0;
1383 } 1369 }
1384 if (started) 1370 if (started)
1385 ext4_journal_stop(handle); 1371 ext4_journal_stop(handle);
1386out:
1387 return ret; 1372 return ret;
1388} 1373}
1389 1374
1375int ext4_get_block(struct inode *inode, sector_t iblock,
1376 struct buffer_head *bh, int create)
1377{
1378 return _ext4_get_block(inode, iblock, bh,
1379 create ? EXT4_GET_BLOCKS_CREATE : 0);
1380}
1381
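The b_state splice above only works if the EXT4_MAP_* flags occupy the same bit positions as the corresponding buffer_head flags. The ext4.h hunk is not shown here, so take the following as an assumption consistent with that line rather than a quotation:

	#define EXT4_MAP_NEW		(1 << BH_New)
	#define EXT4_MAP_MAPPED		(1 << BH_Mapped)
	#define EXT4_MAP_UNWRITTEN	(1 << BH_Unwritten)
	#define EXT4_MAP_BOUNDARY	(1 << BH_Boundary)
	#define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED | \
					 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)

With those definitions, `(bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags` clears exactly the four mapping-related bits and installs the fresh ones without disturbing the rest of b_state.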
1390/* 1382/*
1391 * `handle' can be NULL if create is zero 1383 * `handle' can be NULL if create is zero
1392 */ 1384 */
1393struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1385struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1394 ext4_lblk_t block, int create, int *errp) 1386 ext4_lblk_t block, int create, int *errp)
1395{ 1387{
1396 struct buffer_head dummy; 1388 struct ext4_map_blocks map;
1389 struct buffer_head *bh;
1397 int fatal = 0, err; 1390 int fatal = 0, err;
1398 int flags = 0;
1399 1391
1400 J_ASSERT(handle != NULL || create == 0); 1392 J_ASSERT(handle != NULL || create == 0);
1401 1393
1402 dummy.b_state = 0; 1394 map.m_lblk = block;
1403 dummy.b_blocknr = -1000; 1395 map.m_len = 1;
1404 buffer_trace_init(&dummy.b_history); 1396 err = ext4_map_blocks(handle, inode, &map,
1405 if (create) 1397 create ? EXT4_GET_BLOCKS_CREATE : 0);
1406 flags |= EXT4_GET_BLOCKS_CREATE; 1398
1407 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); 1399 if (err < 0)
1408 /* 1400 *errp = err;
1409 * ext4_get_blocks() returns number of blocks mapped. 0 in 1401 if (err <= 0)
1410 * case of a HOLE. 1402 return NULL;
1411 */ 1403 *errp = 0;
1412 if (err > 0) { 1404
1413 if (err > 1) 1405 bh = sb_getblk(inode->i_sb, map.m_pblk);
1414 WARN_ON(1); 1406 if (!bh) {
1415 err = 0; 1407 *errp = -EIO;
1408 return NULL;
1416 } 1409 }
1417 *errp = err; 1410 if (map.m_flags & EXT4_MAP_NEW) {
1418 if (!err && buffer_mapped(&dummy)) { 1411 J_ASSERT(create != 0);
1419 struct buffer_head *bh; 1412 J_ASSERT(handle != NULL);
1420 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1421 if (!bh) {
1422 *errp = -EIO;
1423 goto err;
1424 }
1425 if (buffer_new(&dummy)) {
1426 J_ASSERT(create != 0);
1427 J_ASSERT(handle != NULL);
1428 1413
1429 /* 1414 /*
1430 * Now that we do not always journal data, we should 1415 * Now that we do not always journal data, we should
1431 * keep in mind whether this should always journal the 1416 * keep in mind whether this should always journal the
1432 * new buffer as metadata. For now, regular file 1417 * new buffer as metadata. For now, regular file
1433 * writes use ext4_get_block instead, so it's not a 1418 * writes use ext4_get_block instead, so it's not a
1434 * problem. 1419 * problem.
1435 */ 1420 */
1436 lock_buffer(bh); 1421 lock_buffer(bh);
1437 BUFFER_TRACE(bh, "call get_create_access"); 1422 BUFFER_TRACE(bh, "call get_create_access");
1438 fatal = ext4_journal_get_create_access(handle, bh); 1423 fatal = ext4_journal_get_create_access(handle, bh);
1439 if (!fatal && !buffer_uptodate(bh)) { 1424 if (!fatal && !buffer_uptodate(bh)) {
1440 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1425 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1441 set_buffer_uptodate(bh); 1426 set_buffer_uptodate(bh);
1442 }
1443 unlock_buffer(bh);
1444 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1445 err = ext4_handle_dirty_metadata(handle, inode, bh);
1446 if (!fatal)
1447 fatal = err;
1448 } else {
1449 BUFFER_TRACE(bh, "not a new buffer");
1450 }
1451 if (fatal) {
1452 *errp = fatal;
1453 brelse(bh);
1454 bh = NULL;
1455 } 1427 }
1456 return bh; 1428 unlock_buffer(bh);
1429 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1430 err = ext4_handle_dirty_metadata(handle, inode, bh);
1431 if (!fatal)
1432 fatal = err;
1433 } else {
1434 BUFFER_TRACE(bh, "not a new buffer");
1457 } 1435 }
1458err: 1436 if (fatal) {
1459 return NULL; 1437 *errp = fatal;
1438 brelse(bh);
1439 bh = NULL;
1440 }
1441 return bh;
1460} 1442}
1461 1443
1462struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1444struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1859,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1859 int retries = 0; 1841 int retries = 0;
1860 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1842 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1861 struct ext4_inode_info *ei = EXT4_I(inode); 1843 struct ext4_inode_info *ei = EXT4_I(inode);
1862 unsigned long md_needed, md_reserved; 1844 unsigned long md_needed;
1863 int ret; 1845 int ret;
1864 1846
1865 /* 1847 /*
@@ -1869,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1869 */ 1851 */
1870repeat: 1852repeat:
1871 spin_lock(&ei->i_block_reservation_lock); 1853 spin_lock(&ei->i_block_reservation_lock);
1872 md_reserved = ei->i_reserved_meta_blocks;
1873 md_needed = ext4_calc_metadata_amount(inode, lblock); 1854 md_needed = ext4_calc_metadata_amount(inode, lblock);
1874 trace_ext4_da_reserve_space(inode, md_needed); 1855 trace_ext4_da_reserve_space(inode, md_needed);
1875 spin_unlock(&ei->i_block_reservation_lock); 1856 spin_unlock(&ei->i_block_reservation_lock);
1876 1857
1877 /* 1858 /*
1878 * Make quota reservation here to prevent quota overflow 1859 * We will charge metadata quota at writeout time; this saves
1879 * later. Real quota accounting is done at pages writeout 1860 * us from metadata over-estimation, though we may go over by
1880 * time. 1861 * a small amount in the end. Here we just reserve for data.
1881 */ 1862 */
1882 ret = dquot_reserve_block(inode, md_needed + 1); 1863 ret = dquot_reserve_block(inode, 1);
1883 if (ret) 1864 if (ret)
1884 return ret; 1865 return ret;
1885 1866 /*
1867 * We do still charge estimated metadata to the sb though;
1868 * we cannot afford to run out of free blocks.
1869 */
1886 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1870 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1887 dquot_release_reservation_block(inode, md_needed + 1); 1871 dquot_release_reservation_block(inode, 1);
1888 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1872 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1889 yield(); 1873 yield();
1890 goto repeat; 1874 goto repeat;
@@ -1909,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1909 1893
1910 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1894 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1911 1895
1896 trace_ext4_da_release_space(inode, to_free);
1912 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1897 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1913 /* 1898 /*
1914 * if there aren't enough reserved blocks, then the 1899 * if there aren't enough reserved blocks, then the
@@ -1931,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1931 * only when we have written all of the delayed 1916 * only when we have written all of the delayed
1932 * allocation blocks. 1917 * allocation blocks.
1933 */ 1918 */
1934 to_free += ei->i_reserved_meta_blocks; 1919 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1920 ei->i_reserved_meta_blocks);
1935 ei->i_reserved_meta_blocks = 0; 1921 ei->i_reserved_meta_blocks = 0;
1936 ei->i_da_metadata_calc_len = 0; 1922 ei->i_da_metadata_calc_len = 0;
1937 } 1923 }
1938 1924
1939 /* update fs dirty blocks counter */ 1925 /* update fs dirty data blocks counter */
1940 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1926 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1941 1927
1942 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1928 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2041,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2041/* 2027/*
2042 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2028 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2043 * 2029 *
2044 * @mpd->inode - inode to walk through
2045 * @exbh->b_blocknr - first block on a disk
2046 * @exbh->b_size - amount of space in bytes
2047 * @logical - first logical block to start assignment with
2048 *
2049 * the function goes through all passed space and put actual disk 2030 * the function goes through all passed space and put actual disk
2050 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten 2031 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2051 */ 2032 */
2052static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 2033static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2053 struct buffer_head *exbh) 2034 struct ext4_map_blocks *map)
2054{ 2035{
2055 struct inode *inode = mpd->inode; 2036 struct inode *inode = mpd->inode;
2056 struct address_space *mapping = inode->i_mapping; 2037 struct address_space *mapping = inode->i_mapping;
2057 int blocks = exbh->b_size >> inode->i_blkbits; 2038 int blocks = map->m_len;
2058 sector_t pblock = exbh->b_blocknr, cur_logical; 2039 sector_t pblock = map->m_pblk, cur_logical;
2059 struct buffer_head *head, *bh; 2040 struct buffer_head *head, *bh;
2060 pgoff_t index, end; 2041 pgoff_t index, end;
2061 struct pagevec pvec; 2042 struct pagevec pvec;
2062 int nr_pages, i; 2043 int nr_pages, i;
2063 2044
2064 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2045 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2065 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2046 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2066 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2047 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2067 2048
2068 pagevec_init(&pvec, 0); 2049 pagevec_init(&pvec, 0);
@@ -2089,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2089 2070
2090 /* skip blocks out of the range */ 2071 /* skip blocks out of the range */
2091 do { 2072 do {
2092 if (cur_logical >= logical) 2073 if (cur_logical >= map->m_lblk)
2093 break; 2074 break;
2094 cur_logical++; 2075 cur_logical++;
2095 } while ((bh = bh->b_this_page) != head); 2076 } while ((bh = bh->b_this_page) != head);
2096 2077
2097 do { 2078 do {
2098 if (cur_logical >= logical + blocks) 2079 if (cur_logical >= map->m_lblk + blocks)
2099 break; 2080 break;
2100 2081
2101 if (buffer_delay(bh) || 2082 if (buffer_delay(bh) || buffer_unwritten(bh)) {
2102 buffer_unwritten(bh)) {
2103 2083
2104 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); 2084 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2105 2085
@@ -2118,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2118 } else if (buffer_mapped(bh)) 2098 } else if (buffer_mapped(bh))
2119 BUG_ON(bh->b_blocknr != pblock); 2099 BUG_ON(bh->b_blocknr != pblock);
2120 2100
2121 if (buffer_uninit(exbh)) 2101 if (map->m_flags & EXT4_MAP_UNINIT)
2122 set_buffer_uninit(bh); 2102 set_buffer_uninit(bh);
2123 cur_logical++; 2103 cur_logical++;
2124 pblock++; 2104 pblock++;
@@ -2129,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2129} 2109}
2130 2110
2131 2111
2132/*
2133 * __unmap_underlying_blocks - just a helper function to unmap
2134 * set of blocks described by @bh
2135 */
2136static inline void __unmap_underlying_blocks(struct inode *inode,
2137 struct buffer_head *bh)
2138{
2139 struct block_device *bdev = inode->i_sb->s_bdev;
2140 int blocks, i;
2141
2142 blocks = bh->b_size >> inode->i_blkbits;
2143 for (i = 0; i < blocks; i++)
2144 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
2145}
2146
2147static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2112static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2148 sector_t logical, long blk_cnt) 2113 sector_t logical, long blk_cnt)
2149{ 2114{
@@ -2205,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
2205static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2170static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2206{ 2171{
2207 int err, blks, get_blocks_flags; 2172 int err, blks, get_blocks_flags;
2208 struct buffer_head new; 2173 struct ext4_map_blocks map;
2209 sector_t next = mpd->b_blocknr; 2174 sector_t next = mpd->b_blocknr;
2210 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2175 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2211 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2176 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2246,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2246 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2211 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2247 * variables are updated after the blocks have been allocated. 2212 * variables are updated after the blocks have been allocated.
2248 */ 2213 */
2249 new.b_state = 0; 2214 map.m_lblk = next;
2215 map.m_len = max_blocks;
2250 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2216 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2251 if (ext4_should_dioread_nolock(mpd->inode)) 2217 if (ext4_should_dioread_nolock(mpd->inode))
2252 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2218 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2253 if (mpd->b_state & (1 << BH_Delay)) 2219 if (mpd->b_state & (1 << BH_Delay))
2254 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2220 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2255 2221
2256 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2222 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2257 &new, get_blocks_flags);
2258 if (blks < 0) { 2223 if (blks < 0) {
2259 err = blks; 2224 err = blks;
2260 /* 2225 /*
@@ -2281,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2281 ext4_msg(mpd->inode->i_sb, KERN_CRIT, 2246 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2282 "delayed block allocation failed for inode %lu at " 2247 "delayed block allocation failed for inode %lu at "
2283 "logical offset %llu with max blocks %zd with " 2248 "logical offset %llu with max blocks %zd with "
2284 "error %d\n", mpd->inode->i_ino, 2249 "error %d", mpd->inode->i_ino,
2285 (unsigned long long) next, 2250 (unsigned long long) next,
2286 mpd->b_size >> mpd->inode->i_blkbits, err); 2251 mpd->b_size >> mpd->inode->i_blkbits, err);
2287 printk(KERN_CRIT "This should not happen!! " 2252 printk(KERN_CRIT "This should not happen!! "
@@ -2296,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2296 } 2261 }
2297 BUG_ON(blks == 0); 2262 BUG_ON(blks == 0);
2298 2263
2299 new.b_size = (blks << mpd->inode->i_blkbits); 2264 if (map.m_flags & EXT4_MAP_NEW) {
2265 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2266 int i;
2300 2267
2301 if (buffer_new(&new)) 2268 for (i = 0; i < map.m_len; i++)
2302 __unmap_underlying_blocks(mpd->inode, &new); 2269 unmap_underlying_metadata(bdev, map.m_pblk + i);
2270 }
2303 2271
2304 /* 2272 /*
2305 * If blocks are delayed marked, we need to 2273 * If blocks are delayed marked, we need to
@@ -2307,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2307 */ 2275 */
2308 if ((mpd->b_state & (1 << BH_Delay)) || 2276 if ((mpd->b_state & (1 << BH_Delay)) ||
2309 (mpd->b_state & (1 << BH_Unwritten))) 2277 (mpd->b_state & (1 << BH_Unwritten)))
2310 mpage_put_bnr_to_bhs(mpd, next, &new); 2278 mpage_put_bnr_to_bhs(mpd, &map);
2311 2279
2312 if (ext4_should_order_data(mpd->inode)) { 2280 if (ext4_should_order_data(mpd->inode)) {
2313 err = ext4_jbd2_file_inode(handle, mpd->inode); 2281 err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2348,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2348 sector_t next; 2316 sector_t next;
2349 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2317 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2350 2318
2319 /*
2320 * XXX Don't go larger than mballoc is willing to allocate
2321 * This is a stopgap solution. We eventually need to fold
2322 * mpage_da_submit_io() into this function and then call
2323 * ext4_get_blocks() multiple times in a loop
2324 */
2325 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2326 goto flush_it;
2327
2351 /* check if the reserved journal credits might overflow */ 2328 /* check if the reserved journal credits might overflow */
2352 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2329 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2353 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2330 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2354 /* 2331 /*
2355 * With non-extent format we are limited by the journal 2332 * With non-extent format we are limited by the journal
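The stopgap cap added in the previous hunk is easy to quantify. With the common 4 KiB block size:

	8*1024*1024 / 4096 == 2048 blocks

so an in-progress delalloc extent is flushed once it would ask mballoc for more than 8 MiB in a single call; with 1 KiB blocks the same expression yields 8192 blocks, still 8 MiB.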
@@ -2422,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
2422 struct buffer_head *bh, *head; 2399 struct buffer_head *bh, *head;
2423 sector_t logical; 2400 sector_t logical;
2424 2401
2425 if (mpd->io_done) {
2426 /*
2427 * Rest of the page in the page_vec
2428 * redirty them and skip them. We will
2429 * try to write them again after
2430 * starting a new transaction
2431 */
2432 redirty_page_for_writepage(wbc, page);
2433 unlock_page(page);
2434 return MPAGE_DA_EXTENT_TAIL;
2435 }
2436 /* 2402 /*
2437 * Can we merge this page to current extent? 2403 * Can we merge this page to current extent?
2438 */ 2404 */
@@ -2527,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
2527 * initialized properly. 2493 * initialized properly.
2528 */ 2494 */
2529static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2495static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2530 struct buffer_head *bh_result, int create) 2496 struct buffer_head *bh, int create)
2531{ 2497{
2498 struct ext4_map_blocks map;
2532 int ret = 0; 2499 int ret = 0;
2533 sector_t invalid_block = ~((sector_t) 0xffff); 2500 sector_t invalid_block = ~((sector_t) 0xffff);
2534 2501
@@ -2536,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2536 invalid_block = ~0; 2503 invalid_block = ~0;
2537 2504
2538 BUG_ON(create == 0); 2505 BUG_ON(create == 0);
2539 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2506 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2507
2508 map.m_lblk = iblock;
2509 map.m_len = 1;
2540 2510
2541 /* 2511 /*
2542 * first, we need to know whether the block is allocated already 2512 * first, we need to know whether the block is allocated already
2543 * preallocated blocks are unmapped but should be treated 2513 * preallocated blocks are unmapped but should be treated
2544 * the same as allocated blocks. 2514 * the same as allocated blocks.
2545 */ 2515 */
2546 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); 2516 ret = ext4_map_blocks(NULL, inode, &map, 0);
2547 if ((ret == 0) && !buffer_delay(bh_result)) { 2517 if (ret < 0)
2548 /* the block isn't (pre)allocated yet, let's reserve space */ 2518 return ret;
2519 if (ret == 0) {
2520 if (buffer_delay(bh))
2521 return 0; /* Not sure this could or should happen */
2549 /* 2522 /*
2550 * XXX: __block_prepare_write() unmaps passed block, 2523 * XXX: __block_prepare_write() unmaps passed block,
2551 * is it OK? 2524 * is it OK?
@@ -2555,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2555 /* not enough space to reserve */ 2528 /* not enough space to reserve */
2556 return ret; 2529 return ret;
2557 2530
2558 map_bh(bh_result, inode->i_sb, invalid_block); 2531 map_bh(bh, inode->i_sb, invalid_block);
2559 set_buffer_new(bh_result); 2532 set_buffer_new(bh);
2560 set_buffer_delay(bh_result); 2533 set_buffer_delay(bh);
2561 } else if (ret > 0) { 2534 return 0;
2562 bh_result->b_size = (ret << inode->i_blkbits);
2563 if (buffer_unwritten(bh_result)) {
2564 /* A delayed write to unwritten bh should
2565 * be marked new and mapped. Mapped ensures
2566 * that we don't do get_block multiple times
2567 * when we write to the same offset and new
2568 * ensures that we do proper zero out for
2569 * partial write.
2570 */
2571 set_buffer_new(bh_result);
2572 set_buffer_mapped(bh_result);
2573 }
2574 ret = 0;
2575 } 2535 }
2576 2536
2577 return ret; 2537 map_bh(bh, inode->i_sb, map.m_pblk);
2538 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2539
2540 if (buffer_unwritten(bh)) {
2541 /* A delayed write to unwritten bh should be marked
2542 * new and mapped. Mapped ensures that we don't do
2543 * get_block multiple times when we write to the same
2544 * offset and new ensures that we do proper zero out
2545 * for partial write.
2546 */
2547 set_buffer_new(bh);
2548 set_buffer_mapped(bh);
2549 }
2550 return 0;
2578} 2551}
2579 2552
2580/* 2553/*
@@ -2596,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2596static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2569static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2597 struct buffer_head *bh_result, int create) 2570 struct buffer_head *bh_result, int create)
2598{ 2571{
2599 int ret = 0;
2600 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2601
2602 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2572 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2603 2573 return _ext4_get_block(inode, iblock, bh_result, 0);
2604 /*
2605 * we don't want to do block allocation in writepage
2606 * so call get_block_wrap with create = 0
2607 */
2608 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2609 if (ret > 0) {
2610 bh_result->b_size = (ret << inode->i_blkbits);
2611 ret = 0;
2612 }
2613 return ret;
2614} 2574}
2615 2575
2616static int bget_one(handle_t *handle, struct buffer_head *bh) 2576static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2820,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2820 * number of contiguous block. So we will limit 2780 * number of contiguous block. So we will limit
2821 * number of contiguous block to a sane value 2781 * number of contiguous block to a sane value
2822 */ 2782 */
2823 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && 2783 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2824 (max_blocks > EXT4_MAX_TRANS_DATA)) 2784 (max_blocks > EXT4_MAX_TRANS_DATA))
2825 max_blocks = EXT4_MAX_TRANS_DATA; 2785 max_blocks = EXT4_MAX_TRANS_DATA;
2826 2786
2827 return ext4_chunk_trans_blocks(inode, max_blocks); 2787 return ext4_chunk_trans_blocks(inode, max_blocks);
2828} 2788}
2829 2789
2790/*
2791 * write_cache_pages_da - walk the list of dirty pages of the given
2792 * address space and call the callback function (which usually writes
2793 * the pages).
2794 *
2795 * This is a forked version of write_cache_pages(). Differences:
2796 * Range cyclic is ignored.
2797 * no_nrwrite_index_update is always presumed true
2798 */
2799static int write_cache_pages_da(struct address_space *mapping,
2800 struct writeback_control *wbc,
2801 struct mpage_da_data *mpd)
2802{
2803 int ret = 0;
2804 int done = 0;
2805 struct pagevec pvec;
2806 int nr_pages;
2807 pgoff_t index;
2808 pgoff_t end; /* Inclusive */
2809 long nr_to_write = wbc->nr_to_write;
2810
2811 pagevec_init(&pvec, 0);
2812 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2813 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2814
2815 while (!done && (index <= end)) {
2816 int i;
2817
2818 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2819 PAGECACHE_TAG_DIRTY,
2820 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2821 if (nr_pages == 0)
2822 break;
2823
2824 for (i = 0; i < nr_pages; i++) {
2825 struct page *page = pvec.pages[i];
2826
2827 /*
2828 * At this point, the page may be truncated or
2829 * invalidated (changing page->mapping to NULL), or
2830 * even swizzled back from swapper_space to tmpfs file
2831 * mapping. However, page->index will not change
2832 * because we have a reference on the page.
2833 */
2834 if (page->index > end) {
2835 done = 1;
2836 break;
2837 }
2838
2839 lock_page(page);
2840
2841 /*
2842 * Page truncated or invalidated. We can freely skip it
2843 * then, even for data integrity operations: the page
2844 * has disappeared concurrently, so there could be no
2845 * real expectation of this data integrity operation
2846 * even if there is now a new, dirty page at the same
2847 * pagecache address.
2848 */
2849 if (unlikely(page->mapping != mapping)) {
2850continue_unlock:
2851 unlock_page(page);
2852 continue;
2853 }
2854
2855 if (!PageDirty(page)) {
2856 /* someone wrote it for us */
2857 goto continue_unlock;
2858 }
2859
2860 if (PageWriteback(page)) {
2861 if (wbc->sync_mode != WB_SYNC_NONE)
2862 wait_on_page_writeback(page);
2863 else
2864 goto continue_unlock;
2865 }
2866
2867 BUG_ON(PageWriteback(page));
2868 if (!clear_page_dirty_for_io(page))
2869 goto continue_unlock;
2870
2871 ret = __mpage_da_writepage(page, wbc, mpd);
2872 if (unlikely(ret)) {
2873 if (ret == AOP_WRITEPAGE_ACTIVATE) {
2874 unlock_page(page);
2875 ret = 0;
2876 } else {
2877 done = 1;
2878 break;
2879 }
2880 }
2881
2882 if (nr_to_write > 0) {
2883 nr_to_write--;
2884 if (nr_to_write == 0 &&
2885 wbc->sync_mode == WB_SYNC_NONE) {
2886 /*
2887 * We stop writing back only if we are
2888 * not doing integrity sync. In case of
2889 * integrity sync we have to keep going
2890 * because someone may be concurrently
2891 * dirtying pages, and we might have
2892 * synced a lot of newly appeared dirty
2893 * pages, but have not synced all of the
2894 * old dirty pages.
2895 */
2896 done = 1;
2897 break;
2898 }
2899 }
2900 }
2901 pagevec_release(&pvec);
2902 cond_resched();
2903 }
2904 return ret;
2905}
2906
2907
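
write_cache_pages_da() above is essentially a batched scan: look up at most PAGEVEC_SIZE dirty pages at a time, write each one, and stop early once nr_to_write runs out (but only for non-integrity sync). The sketch below models just the batching and early-stop logic in plain C; it deliberately omits the page locking, the page->mapping recheck, and the writeback wait that the real function performs, and all names in it are illustrative.

#include <stdio.h>

#define NPAGES 32
#define BATCH  14               /* stands in for PAGEVEC_SIZE */

static int dirty[NPAGES] = { [2] = 1, [3] = 1, [9] = 1, [20] = 1 };

static long toy_writeback(unsigned long start, unsigned long end,
                          long nr_to_write)
{
    unsigned long index = start;

    while (index <= end) {
        unsigned long stop = index + BATCH - 1 < end ? index + BATCH - 1 : end;
        unsigned long i;

        for (i = index; i <= stop; i++) {
            if (!dirty[i])
                continue;       /* like !PageDirty: someone else wrote it */
            dirty[i] = 0;       /* clear_page_dirty_for_io, then "write" */
            printf("wrote page %lu\n", i);
            if (--nr_to_write == 0)
                return 0;       /* WB_SYNC_NONE-style early stop */
        }
        index = stop + 1;       /* advance past this batch */
    }
    return nr_to_write;
}

int main(void)
{
    long left = toy_writeback(0, NPAGES - 1, 3);

    printf("nr_to_write left: %ld\n", left);
    return 0;
}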
2830static int ext4_da_writepages(struct address_space *mapping, 2908static int ext4_da_writepages(struct address_space *mapping,
2831 struct writeback_control *wbc) 2909 struct writeback_control *wbc)
2832{ 2910{
@@ -2835,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2835 handle_t *handle = NULL; 2913 handle_t *handle = NULL;
2836 struct mpage_da_data mpd; 2914 struct mpage_da_data mpd;
2837 struct inode *inode = mapping->host; 2915 struct inode *inode = mapping->host;
2838 int no_nrwrite_index_update;
2839 int pages_written = 0; 2916 int pages_written = 0;
2840 long pages_skipped; 2917 long pages_skipped;
2841 unsigned int max_pages; 2918 unsigned int max_pages;
@@ -2915,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2915 mpd.wbc = wbc; 2992 mpd.wbc = wbc;
2916 mpd.inode = mapping->host; 2993 mpd.inode = mapping->host;
2917 2994
2918 /*
2919 * we don't want write_cache_pages to update
2920 * nr_to_write and writeback_index
2921 */
2922 no_nrwrite_index_update = wbc->no_nrwrite_index_update;
2923 wbc->no_nrwrite_index_update = 1;
2924 pages_skipped = wbc->pages_skipped; 2995 pages_skipped = wbc->pages_skipped;
2925 2996
2926retry: 2997retry:
@@ -2940,7 +3011,7 @@ retry:
2940 if (IS_ERR(handle)) { 3011 if (IS_ERR(handle)) {
2941 ret = PTR_ERR(handle); 3012 ret = PTR_ERR(handle);
2942 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 3013 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2943 "%ld pages, ino %lu; err %d\n", __func__, 3014 "%ld pages, ino %lu; err %d", __func__,
2944 wbc->nr_to_write, inode->i_ino, ret); 3015 wbc->nr_to_write, inode->i_ino, ret);
2945 goto out_writepages; 3016 goto out_writepages;
2946 } 3017 }
@@ -2962,8 +3033,7 @@ retry:
2962 mpd.io_done = 0; 3033 mpd.io_done = 0;
2963 mpd.pages_written = 0; 3034 mpd.pages_written = 0;
2964 mpd.retval = 0; 3035 mpd.retval = 0;
2965 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 3036 ret = write_cache_pages_da(mapping, wbc, &mpd);
2966 &mpd);
2967 /* 3037 /*
2968 * If we have a contiguous extent of pages and we 3038 * If we have a contiguous extent of pages and we
2969 * haven't done the I/O yet, map the blocks and submit 3039 * haven't done the I/O yet, map the blocks and submit
@@ -3015,7 +3085,7 @@ retry:
3015 if (pages_skipped != wbc->pages_skipped) 3085 if (pages_skipped != wbc->pages_skipped)
3016 ext4_msg(inode->i_sb, KERN_CRIT, 3086 ext4_msg(inode->i_sb, KERN_CRIT,
3017 "This should not happen leaving %s " 3087 "This should not happen leaving %s "
3018 "with nr_to_write = %ld ret = %d\n", 3088 "with nr_to_write = %ld ret = %d",
3019 __func__, wbc->nr_to_write, ret); 3089 __func__, wbc->nr_to_write, ret);
3020 3090
3021 /* Update index */ 3091 /* Update index */
@@ -3029,8 +3099,6 @@ retry:
3029 mapping->writeback_index = index; 3099 mapping->writeback_index = index;
3030 3100
3031out_writepages: 3101out_writepages:
3032 if (!no_nrwrite_index_update)
3033 wbc->no_nrwrite_index_update = 0;
3034 wbc->nr_to_write -= nr_to_writebump; 3102 wbc->nr_to_write -= nr_to_writebump;
3035 wbc->range_start = range_start; 3103 wbc->range_start = range_start;
3036 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3104 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3075,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3075 loff_t pos, unsigned len, unsigned flags, 3143 loff_t pos, unsigned len, unsigned flags,
3076 struct page **pagep, void **fsdata) 3144 struct page **pagep, void **fsdata)
3077{ 3145{
3078 int ret, retries = 0, quota_retries = 0; 3146 int ret, retries = 0;
3079 struct page *page; 3147 struct page *page;
3080 pgoff_t index; 3148 pgoff_t index;
3081 unsigned from, to; 3149 unsigned from, to;
@@ -3134,22 +3202,6 @@ retry:
3134 3202
3135 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3203 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3136 goto retry; 3204 goto retry;
3137
3138 if ((ret == -EDQUOT) &&
3139 EXT4_I(inode)->i_reserved_meta_blocks &&
3140 (quota_retries++ < 3)) {
3141 /*
3142 * Since we often over-estimate the number of meta
3143 * data blocks required, we may sometimes get a
3144 * spurios out of quota error even though there would
3145 * be enough space once we write the data blocks and
3146 * find out how many meta data blocks were _really_
3147 * required. So try forcing the inode write to see if
3148 * that helps.
3149 */
3150 write_inode_now(inode, (quota_retries == 3));
3151 goto retry;
3152 }
3153out: 3205out:
3154 return ret; 3206 return ret;
3155} 3207}
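
The surviving retry logic above (ret == -ENOSPC with ext4_should_retry_alloc()) is a bounded retry: a committing transaction may be about to free blocks, so the allocation is retried a capped number of times. A minimal model of that shape, assuming a helper that simply counts attempts (the real ext4_should_retry_alloc() also waits on a journal commit); all names here are illustrative:

#include <errno.h>
#include <stdio.h>

static int attempts_left = 2;

static int toy_alloc(void)
{
    if (attempts_left-- > 0)
        return -ENOSPC;         /* freed space not visible yet */
    return 0;                   /* commit released space, allocation works */
}

static int toy_should_retry(int *retries)
{
    return (*retries)++ < 3;    /* like ext4_should_retry_alloc()'s cap */
}

int main(void)
{
    int ret, retries = 0;
retry:
    ret = toy_alloc();
    if (ret == -ENOSPC && toy_should_retry(&retries))
        goto retry;
    printf("ret=%d after %d retries\n", ret, retries);
    return 0;
}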
@@ -3545,46 +3597,18 @@ out:
3545 return ret; 3597 return ret;
3546} 3598}
3547 3599
3600/*
3601 * ext4_get_block variant used when preparing for a DIO write or buffered write.
3602 * We allocate an uninitialized extent if blocks haven't been allocated.
3603 * The extent will be converted to initialized after the IO is complete.
3604 */
3548static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3605static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3549 struct buffer_head *bh_result, int create) 3606 struct buffer_head *bh_result, int create)
3550{ 3607{
3551 handle_t *handle = ext4_journal_current_handle();
3552 int ret = 0;
3553 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3554 int dio_credits;
3555 int started = 0;
3556
3557 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3608 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3558 inode->i_ino, create); 3609 inode->i_ino, create);
3559 /* 3610 return _ext4_get_block(inode, iblock, bh_result,
3560 * ext4_get_block in prepare for a DIO write or buffer write. 3611 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3561 * We allocate an uinitialized extent if blocks haven't been allocated.
3562 * The extent will be converted to initialized after IO complete.
3563 */
3564 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3565
3566 if (!handle) {
3567 if (max_blocks > DIO_MAX_BLOCKS)
3568 max_blocks = DIO_MAX_BLOCKS;
3569 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3570 handle = ext4_journal_start(inode, dio_credits);
3571 if (IS_ERR(handle)) {
3572 ret = PTR_ERR(handle);
3573 goto out;
3574 }
3575 started = 1;
3576 }
3577
3578 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3579 create);
3580 if (ret > 0) {
3581 bh_result->b_size = (ret << inode->i_blkbits);
3582 ret = 0;
3583 }
3584 if (started)
3585 ext4_journal_stop(handle);
3586out:
3587 return ret;
3588} 3612}
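
After this hunk, both get_block variants in this file are thin wrappers: noalloc_get_block_write() passes flags 0 and ext4_get_block_write() passes EXT4_GET_BLOCKS_IO_CREATE_EXT, with all the journal-handle boilerplate moved into the shared _ext4_get_block(). The refactor pattern, sketched with hypothetical names:

#include <stdio.h>

#define GB_CREATE_UNINIT 0x1    /* stands in for EXT4_GET_BLOCKS_IO_CREATE_EXT */

static int get_block_common(unsigned iblock, int flags)
{
    printf("block %u, flags %#x\n", iblock, (unsigned)flags);
    return 0;                   /* map (and maybe allocate) here */
}

static int get_block_noalloc(unsigned iblock)
{
    return get_block_common(iblock, 0);                 /* never allocate */
}

static int get_block_dio_write(unsigned iblock)
{
    return get_block_common(iblock, GB_CREATE_UNINIT);  /* uninit extent */
}

int main(void)
{
    get_block_noalloc(7);
    return get_block_dio_write(7);
}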
3589 3613
3590static void dump_completed_IO(struct inode * inode) 3614static void dump_completed_IO(struct inode * inode)
@@ -3972,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3972 struct file *file = iocb->ki_filp; 3996 struct file *file = iocb->ki_filp;
3973 struct inode *inode = file->f_mapping->host; 3997 struct inode *inode = file->f_mapping->host;
3974 3998
3975 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 3999 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3976 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 4000 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3977 4001
3978 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 4002 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
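
This is one instance of a conversion repeated throughout the series: open-coded tests of EXT4_I(inode)->i_flags become ext4_test_inode_flag()/ext4_set_inode_flag()/ext4_clear_inode_flag() calls, so the flags word can later be manipulated with atomic bitops. A plausible userspace shape for such helpers, with bit positions chosen to match the classic flag values (treat the exact numbers, and all toy_* names, as illustrative):

#include <stdio.h>

enum toy_inode_flag { TOY_INODE_EXTENTS = 19, TOY_INODE_EOFBLOCKS = 22 };

struct toy_inode { unsigned long i_flags; };

static int toy_test_inode_flag(struct toy_inode *i, enum toy_inode_flag f)
{
    return (i->i_flags >> f) & 1UL;
}

static void toy_set_inode_flag(struct toy_inode *i, enum toy_inode_flag f)
{
    i->i_flags |= 1UL << f;
}

static void toy_clear_inode_flag(struct toy_inode *i, enum toy_inode_flag f)
{
    i->i_flags &= ~(1UL << f);
}

int main(void)
{
    struct toy_inode ino = { 0 };

    toy_set_inode_flag(&ino, TOY_INODE_EXTENTS);
    printf("extents=%d\n", toy_test_inode_flag(&ino, TOY_INODE_EXTENTS));
    toy_clear_inode_flag(&ino, TOY_INODE_EXTENTS);
    return 0;
}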
@@ -4301,10 +4325,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4301 4325
4302 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4326 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4303 count)) { 4327 count)) {
4304 ext4_error(inode->i_sb, "inode #%lu: " 4328 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4305 "attempt to clear blocks %llu len %lu, invalid", 4329 "blocks %llu len %lu",
4306 inode->i_ino, (unsigned long long) block_to_free, 4330 (unsigned long long) block_to_free, count);
4307 count);
4308 return 1; 4331 return 1;
4309 } 4332 }
4310 4333
@@ -4409,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4409 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4432 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4410 ext4_handle_dirty_metadata(handle, inode, this_bh); 4433 ext4_handle_dirty_metadata(handle, inode, this_bh);
4411 else 4434 else
4412 ext4_error(inode->i_sb, 4435 EXT4_ERROR_INODE(inode,
4413 "circular indirect block detected, " 4436 "circular indirect block detected at "
4414 "inode=%lu, block=%llu", 4437 "block %llu",
4415 inode->i_ino, 4438 (unsigned long long) this_bh->b_blocknr);
4416 (unsigned long long) this_bh->b_blocknr);
4417 } 4439 }
4418} 4440}
4419 4441
@@ -4451,11 +4473,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4451 4473
4452 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4474 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4453 nr, 1)) { 4475 nr, 1)) {
4454 ext4_error(inode->i_sb, 4476 EXT4_ERROR_INODE(inode,
4455 "indirect mapped block in inode " 4477 "invalid indirect mapped "
4456 "#%lu invalid (level %d, blk #%lu)", 4478 "block %lu (level %d)",
4457 inode->i_ino, depth, 4479 (unsigned long) nr, depth);
4458 (unsigned long) nr);
4459 break; 4480 break;
4460 } 4481 }
4461 4482
@@ -4467,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4467 * (should be rare). 4488 * (should be rare).
4468 */ 4489 */
4469 if (!bh) { 4490 if (!bh) {
4470 ext4_error(inode->i_sb, 4491 EXT4_ERROR_INODE(inode,
4471 "Read failure, inode=%lu, block=%llu", 4492 "Read failure block=%llu",
4472 inode->i_ino, nr); 4493 (unsigned long long) nr);
4473 continue; 4494 continue;
4474 } 4495 }
4475 4496
@@ -4611,12 +4632,12 @@ void ext4_truncate(struct inode *inode)
4611 if (!ext4_can_truncate(inode)) 4632 if (!ext4_can_truncate(inode))
4612 return; 4633 return;
4613 4634
4614 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 4635 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4615 4636
4616 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4637 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4617 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4638 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4618 4639
4619 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4640 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4620 ext4_ext_truncate(inode); 4641 ext4_ext_truncate(inode);
4621 return; 4642 return;
4622 } 4643 }
@@ -4784,8 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4784 4805
4785 bh = sb_getblk(sb, block); 4806 bh = sb_getblk(sb, block);
4786 if (!bh) { 4807 if (!bh) {
4787 ext4_error(sb, "unable to read inode block - " 4808 EXT4_ERROR_INODE(inode, "unable to read inode block - "
4788 "inode=%lu, block=%llu", inode->i_ino, block); 4809 "block %llu", block);
4789 return -EIO; 4810 return -EIO;
4790 } 4811 }
4791 if (!buffer_uptodate(bh)) { 4812 if (!buffer_uptodate(bh)) {
@@ -4883,8 +4904,8 @@ make_io:
4883 submit_bh(READ_META, bh); 4904 submit_bh(READ_META, bh);
4884 wait_on_buffer(bh); 4905 wait_on_buffer(bh);
4885 if (!buffer_uptodate(bh)) { 4906 if (!buffer_uptodate(bh)) {
4886 ext4_error(sb, "unable to read inode block - inode=%lu," 4907 EXT4_ERROR_INODE(inode, "unable to read inode "
4887 " block=%llu", inode->i_ino, block); 4908 "block %llu", block);
4888 brelse(bh); 4909 brelse(bh);
4889 return -EIO; 4910 return -EIO;
4890 } 4911 }
@@ -5095,8 +5116,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5095 ret = 0; 5116 ret = 0;
5096 if (ei->i_file_acl && 5117 if (ei->i_file_acl &&
5097 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5118 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
5098 ext4_error(sb, "bad extended attribute block %llu inode #%lu", 5119 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
5099 ei->i_file_acl, inode->i_ino); 5120 ei->i_file_acl);
5100 ret = -EIO; 5121 ret = -EIO;
5101 goto bad_inode; 5122 goto bad_inode;
5102 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 5123 } else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5141,8 +5162,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5141 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5162 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5142 } else { 5163 } else {
5143 ret = -EIO; 5164 ret = -EIO;
5144 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", 5165 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5145 inode->i_mode, inode->i_ino);
5146 goto bad_inode; 5166 goto bad_inode;
5147 } 5167 }
5148 brelse(iloc.bh); 5168 brelse(iloc.bh);
@@ -5374,17 +5394,18 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5374 } else { 5394 } else {
5375 struct ext4_iloc iloc; 5395 struct ext4_iloc iloc;
5376 5396
5377 err = ext4_get_inode_loc(inode, &iloc); 5397 err = __ext4_get_inode_loc(inode, &iloc, 0);
5378 if (err) 5398 if (err)
5379 return err; 5399 return err;
5380 if (wbc->sync_mode == WB_SYNC_ALL) 5400 if (wbc->sync_mode == WB_SYNC_ALL)
5381 sync_dirty_buffer(iloc.bh); 5401 sync_dirty_buffer(iloc.bh);
5382 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5402 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5383 ext4_error(inode->i_sb, "IO error syncing inode, " 5403 EXT4_ERROR_INODE(inode,
5384 "inode=%lu, block=%llu", inode->i_ino, 5404 "IO error syncing inode (block=%llu)",
5385 (unsigned long long)iloc.bh->b_blocknr); 5405 (unsigned long long) iloc.bh->b_blocknr);
5386 err = -EIO; 5406 err = -EIO;
5387 } 5407 }
5408 brelse(iloc.bh);
5388 } 5409 }
5389 return err; 5410 return err;
5390} 5411}
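
The added brelse(iloc.bh) fixes a reference leak: __ext4_get_inode_loc() hands back iloc.bh with an elevated refcount, and before this change the non-journal path returned without ever dropping it. The generic get/put pairing, sketched with hypothetical names:

#include <stdlib.h>

struct toy_buf { int refcount; };

static struct toy_buf *toy_get(void)
{
    struct toy_buf *b = calloc(1, sizeof(*b));

    if (b)
        b->refcount = 1;        /* caller now owns a reference */
    return b;
}

static void toy_release(struct toy_buf *b)
{
    if (b && --b->refcount == 0)
        free(b);                /* last reference: really free */
}

int main(void)
{
    struct toy_buf *b = toy_get();

    /* ... use b, possibly sync it to storage ... */
    toy_release(b);             /* the step the old code forgot */
    return 0;
}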
@@ -5423,7 +5444,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5423 if (error) 5444 if (error)
5424 return error; 5445 return error;
5425 5446
5426 if (ia_valid & ATTR_SIZE) 5447 if (is_quota_modification(inode, attr))
5427 dquot_initialize(inode); 5448 dquot_initialize(inode);
5428 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5449 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5429 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5450 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
@@ -5453,7 +5474,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5453 } 5474 }
5454 5475
5455 if (attr->ia_valid & ATTR_SIZE) { 5476 if (attr->ia_valid & ATTR_SIZE) {
5456 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 5477 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5457 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5478 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5458 5479
5459 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5480 if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5466,7 +5487,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5466 if (S_ISREG(inode->i_mode) && 5487 if (S_ISREG(inode->i_mode) &&
5467 attr->ia_valid & ATTR_SIZE && 5488 attr->ia_valid & ATTR_SIZE &&
5468 (attr->ia_size < inode->i_size || 5489 (attr->ia_size < inode->i_size ||
5469 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { 5490 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5470 handle_t *handle; 5491 handle_t *handle;
5471 5492
5472 handle = ext4_journal_start(inode, 3); 5493 handle = ext4_journal_start(inode, 3);
@@ -5498,7 +5519,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5498 } 5519 }
5499 } 5520 }
5500 /* ext4_truncate will clear the flag */ 5521 /* ext4_truncate will clear the flag */
5501 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) 5522 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5502 ext4_truncate(inode); 5523 ext4_truncate(inode);
5503 } 5524 }
5504 5525
@@ -5574,7 +5595,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5574 5595
5575static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5596static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5576{ 5597{
5577 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 5598 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5578 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5599 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5579 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5600 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5580} 5601}
@@ -5909,9 +5930,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5909 */ 5930 */
5910 5931
5911 if (val) 5932 if (val)
5912 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5933 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5913 else 5934 else
5914 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5935 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5915 ext4_set_aops(inode); 5936 ext4_set_aops(inode);
5916 5937
5917 jbd2_journal_unlock_updates(journal); 5938 jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
258 if (me.moved_len > 0) 258 if (me.moved_len > 0)
259 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
260 260
261 if (copy_to_user((struct move_extent __user *)arg, 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me))) 262 &me, sizeof(me)))
263 err = -EFAULT; 263 err = -EFAULT;
264mext_out: 264mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
373 case EXT4_IOC32_SETRSVSZ: 373 case EXT4_IOC32_SETRSVSZ:
374 cmd = EXT4_IOC_SETRSVSZ; 374 cmd = EXT4_IOC_SETRSVSZ;
375 break; 375 break;
376 case EXT4_IOC_GROUP_ADD: 376 case EXT4_IOC32_GROUP_ADD: {
377 struct compat_ext4_new_group_input __user *uinput;
378 struct ext4_new_group_input input;
379 mm_segment_t old_fs;
380 int err;
381
382 uinput = compat_ptr(arg);
383 err = get_user(input.group, &uinput->group);
384 err |= get_user(input.block_bitmap, &uinput->block_bitmap);
385 err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
386 err |= get_user(input.inode_table, &uinput->inode_table);
387 err |= get_user(input.blocks_count, &uinput->blocks_count);
388 err |= get_user(input.reserved_blocks,
389 &uinput->reserved_blocks);
390 if (err)
391 return -EFAULT;
392 old_fs = get_fs();
393 set_fs(KERNEL_DS);
394 err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
395 (unsigned long) &input);
396 set_fs(old_fs);
397 return err;
398 }
399 case EXT4_IOC_MOVE_EXT:
377 break; 400 break;
378 default: 401 default:
379 return -ENOIOCTLCMD; 402 return -ENOIOCTLCMD;
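
The new EXT4_IOC32_GROUP_ADD case is a classic compat thunk: read each field of the 32-bit layout with get_user(), assemble the native struct, then call the 64-bit handler under set_fs(KERNEL_DS) so the pointer to the kernel-side copy passes the user-access checks. set_fs() is kernel-only; the userspace sketch below shows just the field-by-field repacking, and the struct layouts in it are hypothetical:

#include <stdint.h>
#include <stdio.h>

struct compat_group_input {         /* what a 32-bit caller passes */
    uint32_t group;
    uint32_t block_bitmap;
    uint32_t blocks_count;
} __attribute__((packed));

struct group_input {                /* what the native handler wants */
    uint32_t group;
    uint64_t block_bitmap;
    uint64_t blocks_count;
};

static int native_group_add(const struct group_input *in)
{
    printf("group %u, bitmap at %llu\n", (unsigned)in->group,
           (unsigned long long)in->block_bitmap);
    return 0;
}

int main(void)
{
    struct compat_group_input c = { 3, 4096, 32768 };
    struct group_input in;

    in.group        = c.group;      /* widen each field explicitly */
    in.block_bitmap = c.block_bitmap;
    in.blocks_count = c.blocks_count;
    return native_group_add(&in);
}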
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 54df209d2eed..12b3bc026a68 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,6 +23,7 @@
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/slab.h>
26#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
27 28
28/* 29/*
@@ -657,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
657 } 658 }
658} 659}
659 660
661/*
662 * Cache the order of the largest free extent we have available in this block
663 * group.
664 */
665static void
666mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
667{
668 int i;
669 int bits;
670
671 grp->bb_largest_free_order = -1; /* uninit */
672
673 bits = sb->s_blocksize_bits + 1;
674 for (i = bits; i >= 0; i--) {
675 if (grp->bb_counters[i] > 0) {
676 grp->bb_largest_free_order = i;
677 break;
678 }
679 }
680}
681
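
mb_set_largest_free_order() caches, per block group, the highest order at which a free extent exists; a later hunk in this file uses it so ext4_mb_good_group() can reject a group with a single compare against ac->ac_2order instead of scanning bb_counters[]. A minimal model of the compute-and-check pair (the names and MAX_ORDER value are illustrative):

#include <stdio.h>

#define MAX_ORDER 14            /* roughly s_blocksize_bits + 1 */

struct toy_group {
    unsigned counters[MAX_ORDER + 1];   /* free extents per order */
    int largest_free_order;
};

static void toy_set_largest_free_order(struct toy_group *g)
{
    int i;

    g->largest_free_order = -1;         /* -1 == nothing free / uninit */
    for (i = MAX_ORDER; i >= 0; i--) {
        if (g->counters[i] > 0) {
            g->largest_free_order = i;
            break;
        }
    }
}

static int toy_good_group(const struct toy_group *g, int wanted_order)
{
    return g->largest_free_order >= wanted_order;   /* O(1) check */
}

int main(void)
{
    struct toy_group g = { .counters = { [3] = 2, [7] = 1 } };

    toy_set_largest_free_order(&g);
    printf("largest=%d good(5)=%d good(8)=%d\n",
           g.largest_free_order, toy_good_group(&g, 5),
           toy_good_group(&g, 8));
    return 0;
}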
660static noinline_for_stack 682static noinline_for_stack
661void ext4_mb_generate_buddy(struct super_block *sb, 683void ext4_mb_generate_buddy(struct super_block *sb,
662 void *buddy, void *bitmap, ext4_group_t group) 684 void *buddy, void *bitmap, ext4_group_t group)
@@ -699,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
699 */ 721 */
700 grp->bb_free = free; 722 grp->bb_free = free;
701 } 723 }
724 mb_set_largest_free_order(sb, grp);
702 725
703 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 726 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
704 727
@@ -724,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
724 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 747 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
725 * So it can have information regarding groups_per_page which 748 * So it can have information regarding groups_per_page which
726 * is blocks_per_page/2 749 * is blocks_per_page/2
750 *
751 * Locking note: This routine takes the block group lock of all groups
752 * for this page; do not hold this lock when calling this routine!
727 */ 753 */
728 754
729static int ext4_mb_init_cache(struct page *page, char *incore) 755static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -864,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
864 BUG_ON(incore == NULL); 890 BUG_ON(incore == NULL);
865 mb_debug(1, "put buddy for group %u in page %lu/%x\n", 891 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
866 group, page->index, i * blocksize); 892 group, page->index, i * blocksize);
893 trace_ext4_mb_buddy_bitmap_load(sb, group);
867 grinfo = ext4_get_group_info(sb, group); 894 grinfo = ext4_get_group_info(sb, group);
868 grinfo->bb_fragments = 0; 895 grinfo->bb_fragments = 0;
869 memset(grinfo->bb_counters, 0, 896 memset(grinfo->bb_counters, 0,
@@ -881,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
881 BUG_ON(incore != NULL); 908 BUG_ON(incore != NULL);
882 mb_debug(1, "put bitmap for group %u in page %lu/%x\n", 909 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
883 group, page->index, i * blocksize); 910 group, page->index, i * blocksize);
911 trace_ext4_mb_bitmap_load(sb, group);
884 912
885 /* see comments in ext4_mb_put_pa() */ 913 /* see comments in ext4_mb_put_pa() */
886 ext4_lock_group(sb, group); 914 ext4_lock_group(sb, group);
@@ -909,6 +937,11 @@ out:
909 return err; 937 return err;
910} 938}
911 939
940/*
941 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
942 * block group lock of all groups for this page; do not hold the BG lock when
943 * calling this routine!
944 */
912static noinline_for_stack 945static noinline_for_stack
913int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 946int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
914{ 947{
@@ -1003,6 +1036,11 @@ err:
1003 return ret; 1036 return ret;
1004} 1037}
1005 1038
1039/*
1040 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1041 * block group lock of all groups for this page; do not hold the BG lock when
1042 * calling this routine!
1043 */
1006static noinline_for_stack int 1044static noinline_for_stack int
1007ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1045ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1008 struct ext4_buddy *e4b) 1046 struct ext4_buddy *e4b)
@@ -1149,7 +1187,7 @@ err:
1149 return ret; 1187 return ret;
1150} 1188}
1151 1189
1152static void ext4_mb_release_desc(struct ext4_buddy *e4b) 1190static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1153{ 1191{
1154 if (e4b->bd_bitmap_page) 1192 if (e4b->bd_bitmap_page)
1155 page_cache_release(e4b->bd_bitmap_page); 1193 page_cache_release(e4b->bd_bitmap_page);
@@ -1298,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1298 buddy = buddy2; 1336 buddy = buddy2;
1299 } while (1); 1337 } while (1);
1300 } 1338 }
1339 mb_set_largest_free_order(sb, e4b->bd_info);
1301 mb_check_buddy(e4b); 1340 mb_check_buddy(e4b);
1302} 1341}
1303 1342
@@ -1426,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1426 e4b->bd_info->bb_counters[ord]++; 1465 e4b->bd_info->bb_counters[ord]++;
1427 e4b->bd_info->bb_counters[ord]++; 1466 e4b->bd_info->bb_counters[ord]++;
1428 } 1467 }
1468 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1429 1469
1430 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1470 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1431 mb_check_buddy(e4b); 1471 mb_check_buddy(e4b);
@@ -1616,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1616 } 1656 }
1617 1657
1618 ext4_unlock_group(ac->ac_sb, group); 1658 ext4_unlock_group(ac->ac_sb, group);
1619 ext4_mb_release_desc(e4b); 1659 ext4_mb_unload_buddy(e4b);
1620 1660
1621 return 0; 1661 return 0;
1622} 1662}
@@ -1671,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1671 ext4_mb_use_best_found(ac, e4b); 1711 ext4_mb_use_best_found(ac, e4b);
1672 } 1712 }
1673 ext4_unlock_group(ac->ac_sb, group); 1713 ext4_unlock_group(ac->ac_sb, group);
1674 ext4_mb_release_desc(e4b); 1714 ext4_mb_unload_buddy(e4b);
1675 1715
1676 return 0; 1716 return 0;
1677} 1717}
@@ -1820,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1820 } 1860 }
1821} 1861}
1822 1862
1863/* This is now called BEFORE we load the buddy bitmap. */
1823static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1864static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1824 ext4_group_t group, int cr) 1865 ext4_group_t group, int cr)
1825{ 1866{
1826 unsigned free, fragments; 1867 unsigned free, fragments;
1827 unsigned i, bits;
1828 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1868 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1829 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1869 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1830 1870
1831 BUG_ON(cr < 0 || cr >= 4); 1871 BUG_ON(cr < 0 || cr >= 4);
1832 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); 1872
1873 /* We only do this if the grp has never been initialized */
1874 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1875 int ret = ext4_mb_init_group(ac->ac_sb, group);
1876 if (ret)
1877 return 0;
1878 }
1833 1879
1834 free = grp->bb_free; 1880 free = grp->bb_free;
1835 fragments = grp->bb_fragments; 1881 fragments = grp->bb_fragments;
@@ -1842,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1842 case 0: 1888 case 0:
1843 BUG_ON(ac->ac_2order == 0); 1889 BUG_ON(ac->ac_2order == 0);
1844 1890
1891 if (grp->bb_largest_free_order < ac->ac_2order)
1892 return 0;
1893
1845 /* Avoid using the first bg of a flexgroup for data files */ 1894 /* Avoid using the first bg of a flexgroup for data files */
1846 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1895 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1847 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 1896 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1848 ((group % flex_size) == 0)) 1897 ((group % flex_size) == 0))
1849 return 0; 1898 return 0;
1850 1899
1851 bits = ac->ac_sb->s_blocksize_bits + 1; 1900 return 1;
1852 for (i = ac->ac_2order; i <= bits; i++)
1853 if (grp->bb_counters[i] > 0)
1854 return 1;
1855 break;
1856 case 1: 1901 case 1:
1857 if ((free / fragments) >= ac->ac_g_ex.fe_len) 1902 if ((free / fragments) >= ac->ac_g_ex.fe_len)
1858 return 1; 1903 return 1;
@@ -1963,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1963 sbi = EXT4_SB(sb); 2008 sbi = EXT4_SB(sb);
1964 ngroups = ext4_get_groups_count(sb); 2009 ngroups = ext4_get_groups_count(sb);
1965 /* non-extent files are limited to low blocks/groups */ 2010 /* non-extent files are limited to low blocks/groups */
1966 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) 2011 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
1967 ngroups = sbi->s_blockfile_groups; 2012 ngroups = sbi->s_blockfile_groups;
1968 2013
1969 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2014 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2023,15 +2068,11 @@ repeat:
2023 group = ac->ac_g_ex.fe_group; 2068 group = ac->ac_g_ex.fe_group;
2024 2069
2025 for (i = 0; i < ngroups; group++, i++) { 2070 for (i = 0; i < ngroups; group++, i++) {
2026 struct ext4_group_info *grp;
2027 struct ext4_group_desc *desc;
2028
2029 if (group == ngroups) 2071 if (group == ngroups)
2030 group = 0; 2072 group = 0;
2031 2073
2032 /* quick check to skip empty groups */ 2074 /* This now checks without needing the buddy page */
2033 grp = ext4_get_group_info(sb, group); 2075 if (!ext4_mb_good_group(ac, group, cr))
2034 if (grp->bb_free == 0)
2035 continue; 2076 continue;
2036 2077
2037 err = ext4_mb_load_buddy(sb, group, &e4b); 2078 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2039,15 +2080,18 @@ repeat:
2039 goto out; 2080 goto out;
2040 2081
2041 ext4_lock_group(sb, group); 2082 ext4_lock_group(sb, group);
2083
2084 /*
2085 * We need to check again after locking the
2086 * block group
2087 */
2042 if (!ext4_mb_good_group(ac, group, cr)) { 2088 if (!ext4_mb_good_group(ac, group, cr)) {
2043 /* someone did allocation from this group */
2044 ext4_unlock_group(sb, group); 2089 ext4_unlock_group(sb, group);
2045 ext4_mb_release_desc(&e4b); 2090 ext4_mb_unload_buddy(&e4b);
2046 continue; 2091 continue;
2047 } 2092 }
2048 2093
2049 ac->ac_groups_scanned++; 2094 ac->ac_groups_scanned++;
2050 desc = ext4_get_group_desc(sb, group, NULL);
2051 if (cr == 0) 2095 if (cr == 0)
2052 ext4_mb_simple_scan_group(ac, &e4b); 2096 ext4_mb_simple_scan_group(ac, &e4b);
2053 else if (cr == 1 && 2097 else if (cr == 1 &&
@@ -2057,7 +2101,7 @@ repeat:
2057 ext4_mb_complex_scan_group(ac, &e4b); 2101 ext4_mb_complex_scan_group(ac, &e4b);
2058 2102
2059 ext4_unlock_group(sb, group); 2103 ext4_unlock_group(sb, group);
2060 ext4_mb_release_desc(&e4b); 2104 ext4_mb_unload_buddy(&e4b);
2061 2105
2062 if (ac->ac_status != AC_STATUS_CONTINUE) 2106 if (ac->ac_status != AC_STATUS_CONTINUE)
2063 break; 2107 break;
@@ -2147,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2147 ext4_lock_group(sb, group); 2191 ext4_lock_group(sb, group);
2148 memcpy(&sg, ext4_get_group_info(sb, group), i); 2192 memcpy(&sg, ext4_get_group_info(sb, group), i);
2149 ext4_unlock_group(sb, group); 2193 ext4_unlock_group(sb, group);
2150 ext4_mb_release_desc(&e4b); 2194 ext4_mb_unload_buddy(&e4b);
2151 2195
2152 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2196 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2153 sg.info.bb_fragments, sg.info.bb_first_free); 2197 sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2254,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2254 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2298 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2255 init_rwsem(&meta_group_info[i]->alloc_sem); 2299 init_rwsem(&meta_group_info[i]->alloc_sem);
2256 meta_group_info[i]->bb_free_root = RB_ROOT; 2300 meta_group_info[i]->bb_free_root = RB_ROOT;
2301 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
2257 2302
2258#ifdef DOUBLE_CHECK 2303#ifdef DOUBLE_CHECK
2259 { 2304 {
@@ -2534,6 +2579,23 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2534 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2579 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2535 entry->count, entry->group, entry); 2580 entry->count, entry->group, entry);
2536 2581
2582 if (test_opt(sb, DISCARD)) {
2583 int ret;
2584 ext4_fsblk_t discard_block;
2585
2586 discard_block = entry->start_blk +
2587 ext4_group_first_block_no(sb, entry->group);
2588 trace_ext4_discard_blocks(sb,
2589 (unsigned long long)discard_block,
2590 entry->count);
2591 ret = sb_issue_discard(sb, discard_block, entry->count);
2592 if (ret == EOPNOTSUPP) {
2593 ext4_warning(sb,
2594 "discard not supported, disabling");
2595 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2596 }
2597 }
2598
2537 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2538 /* we expect to find existing buddy because it's pinned */ 2600 /* we expect to find existing buddy because it's pinned */
2539 BUG_ON(err != 0); 2601 BUG_ON(err != 0);
@@ -2555,18 +2617,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2555 page_cache_release(e4b.bd_bitmap_page); 2617 page_cache_release(e4b.bd_bitmap_page);
2556 } 2618 }
2557 ext4_unlock_group(sb, entry->group); 2619 ext4_unlock_group(sb, entry->group);
2558 if (test_opt(sb, DISCARD)) {
2559 ext4_fsblk_t discard_block;
2560
2561 discard_block = entry->start_blk +
2562 ext4_group_first_block_no(sb, entry->group);
2563 trace_ext4_discard_blocks(sb,
2564 (unsigned long long)discard_block,
2565 entry->count);
2566 sb_issue_discard(sb, discard_block, entry->count);
2567 }
2568 kmem_cache_free(ext4_free_ext_cachep, entry); 2620 kmem_cache_free(ext4_free_ext_cachep, entry);
2569 ext4_mb_release_desc(&e4b); 2621 ext4_mb_unload_buddy(&e4b);
2570 } 2622 }
2571 2623
2572 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2624 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
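
Besides moving the discard earlier in the loop, the relocated code now checks sb_issue_discard()'s result: on EOPNOTSUPP it warns once and clears the DISCARD mount option, so no further TRIMs are attempted for the rest of the mount. The general feature-fallback shape, sketched with illustrative names:

#include <errno.h>
#include <stdio.h>

static unsigned long mount_opts = 0x1;  /* bit 0 ~ "discard" enabled */

static int toy_issue_discard(unsigned long long block, unsigned count)
{
    (void)block; (void)count;
    return EOPNOTSUPP;                  /* device has no TRIM support */
}

static void toy_free_blocks(unsigned long long block, unsigned count)
{
    if (mount_opts & 0x1) {
        int ret = toy_issue_discard(block, count);

        if (ret == EOPNOTSUPP) {
            fprintf(stderr, "discard not supported, disabling\n");
            mount_opts &= ~0x1;         /* never try again this mount */
        }
    }
    /* ... actually return the blocks to the allocator ... */
}

int main(void)
{
    toy_free_blocks(100, 8);
    toy_free_blocks(200, 8);            /* no second warning */
    return 0;
}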
@@ -2639,7 +2691,7 @@ int __init init_ext4_mballoc(void)
2639 2691
2640void exit_ext4_mballoc(void) 2692void exit_ext4_mballoc(void)
2641{ 2693{
2642 /* 2694 /*
2643 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2644 * before destroying the slab cache. 2696 * before destroying the slab cache.
2645 */ 2697 */
@@ -2979,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
2979 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3031 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
2980 atomic_inc(&sbi->s_bal_reqs); 3032 atomic_inc(&sbi->s_bal_reqs);
2981 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3033 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
2982 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) 3034 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
2983 atomic_inc(&sbi->s_bal_success); 3035 atomic_inc(&sbi->s_bal_success);
2984 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3036 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
2985 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3037 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
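
The one-line change above is a semantics fix for the allocator statistics: s_bal_success now increments when the best extent actually found (ac_b_ex) is at least as long as the original request (ac_o_ex), where it previously compared the request against the normalized goal (ac_g_ex) and so never measured the outcome at all. In assertion form, with illustrative values:

#include <assert.h>

int main(void)
{
    int requested = 8;      /* ac_o_ex.fe_len */
    int goal      = 16;     /* ac_g_ex.fe_len, after normalization */
    int best      = 8;      /* ac_b_ex.fe_len, what we actually got */

    (void)goal;             /* the old test compared requested vs goal */
    assert(best >= requested);  /* new test: caller got what it asked for */
    return 0;
}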
@@ -3121,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3121 continue; 3173 continue;
3122 3174
3123 /* non-extent files can't have physical blocks past 2^32 */ 3175 /* non-extent files can't have physical blocks past 2^32 */
3124 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && 3176 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3125 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) 3177 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3126 continue; 3178 continue;
3127 3179
@@ -3278,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3278 spin_unlock(&pa->pa_lock); 3330 spin_unlock(&pa->pa_lock);
3279 3331
3280 grp_blk = pa->pa_pstart; 3332 grp_blk = pa->pa_pstart;
3281 /* 3333 /*
3282 * If doing group-based preallocation, pa_pstart may be in the 3334 * If doing group-based preallocation, pa_pstart may be in the
3283 * next group when pa is used up 3335 * next group when pa is used up
3284 */ 3336 */
@@ -3695,7 +3747,7 @@ out:
3695 ext4_unlock_group(sb, group); 3747 ext4_unlock_group(sb, group);
3696 if (ac) 3748 if (ac)
3697 kmem_cache_free(ext4_ac_cachep, ac); 3749 kmem_cache_free(ext4_ac_cachep, ac);
3698 ext4_mb_release_desc(&e4b); 3750 ext4_mb_unload_buddy(&e4b);
3699 put_bh(bitmap_bh); 3751 put_bh(bitmap_bh);
3700 return free; 3752 return free;
3701} 3753}
@@ -3799,7 +3851,7 @@ repeat:
3799 if (bitmap_bh == NULL) { 3851 if (bitmap_bh == NULL) {
3800 ext4_error(sb, "Error reading block bitmap for %u", 3852 ext4_error(sb, "Error reading block bitmap for %u",
3801 group); 3853 group);
3802 ext4_mb_release_desc(&e4b); 3854 ext4_mb_unload_buddy(&e4b);
3803 continue; 3855 continue;
3804 } 3856 }
3805 3857
@@ -3808,7 +3860,7 @@ repeat:
3808 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3860 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
3809 ext4_unlock_group(sb, group); 3861 ext4_unlock_group(sb, group);
3810 3862
3811 ext4_mb_release_desc(&e4b); 3863 ext4_mb_unload_buddy(&e4b);
3812 put_bh(bitmap_bh); 3864 put_bh(bitmap_bh);
3813 3865
3814 list_del(&pa->u.pa_tmp_list); 3866 list_del(&pa->u.pa_tmp_list);
@@ -4072,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4072 ext4_mb_release_group_pa(&e4b, pa, ac); 4124 ext4_mb_release_group_pa(&e4b, pa, ac);
4073 ext4_unlock_group(sb, group); 4125 ext4_unlock_group(sb, group);
4074 4126
4075 ext4_mb_release_desc(&e4b); 4127 ext4_mb_unload_buddy(&e4b);
4076 list_del(&pa->u.pa_tmp_list); 4128 list_del(&pa->u.pa_tmp_list);
4077 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4129 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4078 } 4130 }
@@ -4482,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4482 if (!bh) 4534 if (!bh)
4483 tbh = sb_find_get_block(inode->i_sb, 4535 tbh = sb_find_get_block(inode->i_sb,
4484 block + i); 4536 block + i);
4485 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4537 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4486 inode, tbh, block + i); 4538 inode, tbh, block + i);
4487 } 4539 }
4488 } 4540 }
4489 4541
4490 /* 4542 /*
4491 * We need to make sure we don't reuse the freed block until 4543 * We need to make sure we don't reuse the freed block until
4492 * after the transaction is committed, which we can do by 4544 * after the transaction is committed, which we can do by
4493 * treating the block as metadata, below. We make an 4545 * treating the block as metadata, below. We make an
@@ -4608,7 +4660,7 @@ do_more:
4608 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4660 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4609 } 4661 }
4610 4662
4611 ext4_mb_release_desc(&e4b); 4663 ext4_mb_unload_buddy(&e4b);
4612 4664
4613 freed += count; 4665 freed += count;
4614 4666
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 8b87bd0eac95..6f3a27ec30bf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -13,6 +13,7 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
17#include "ext4_extents.h" 18#include "ext4_extents.h"
18 19
@@ -474,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
474 */ 475 */
475 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, 476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
476 EXT4_FEATURE_INCOMPAT_EXTENTS) || 477 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
477 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 478 (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
478 return -EINVAL; 479 return -EINVAL;
479 480
480 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index aa5fe28d180f..3a6c92ac131c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/slab.h>
18#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
19#include "ext4_extents.h" 20#include "ext4_extents.h"
20#include "ext4.h" 21#include "ext4.h"
@@ -481,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
481 int depth = ext_depth(orig_inode); 482 int depth = ext_depth(orig_inode);
482 int ret; 483 int ret;
483 484
485 start_ext.ee_block = end_ext.ee_block = 0;
484 o_start = o_end = oext = orig_path[depth].p_ext; 486 o_start = o_end = oext = orig_path[depth].p_ext;
485 oext_alen = ext4_ext_get_actual_len(oext); 487 oext_alen = ext4_ext_get_actual_len(oext);
486 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
@@ -528,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
528 * new_ext |-------| 530 * new_ext |-------|
529 */ 531 */
530 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 532 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
531 ext4_error(orig_inode->i_sb, 533 EXT4_ERROR_INODE(orig_inode,
532 "new_ext_end(%u) should be less than or equal to " 534 "new_ext_end(%u) should be less than or equal to "
533 "oext->ee_block(%u) + oext_alen(%d) - 1", 535 "oext->ee_block(%u) + oext_alen(%d) - 1",
534 new_ext_end, le32_to_cpu(oext->ee_block), 536 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -691,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
691 while (1) { 693 while (1) {
692 /* The extent for donor must be found. */ 694 /* The extent for donor must be found. */
693 if (!dext) { 695 if (!dext) {
694 ext4_error(donor_inode->i_sb, 696 EXT4_ERROR_INODE(donor_inode,
695 "The extent for donor must be found"); 697 "The extent for donor must be found");
696 *err = -EIO; 698 *err = -EIO;
697 goto out; 699 goto out;
698 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 700 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
699 ext4_error(donor_inode->i_sb, 701 EXT4_ERROR_INODE(donor_inode,
700 "Donor offset(%u) and the first block of donor " 702 "Donor offset(%u) and the first block of donor "
701 "extent(%u) should be equal", 703 "extent(%u) should be equal",
702 donor_off, 704 donor_off,
@@ -975,11 +977,11 @@ mext_check_arguments(struct inode *orig_inode,
975 } 977 }
976 978
977 /* Ext4 move extent supports only extent based file */ 979 /* Ext4 move extent supports only extent based file */
978 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 980 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
979 ext4_debug("ext4 move extent: orig file is not extents " 981 ext4_debug("ext4 move extent: orig file is not extents "
980 "based file [ino:orig %lu]\n", orig_inode->i_ino); 982 "based file [ino:orig %lu]\n", orig_inode->i_ino);
981 return -EOPNOTSUPP; 983 return -EOPNOTSUPP;
982 } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { 984 } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
983 ext4_debug("ext4 move extent: donor file is not extents " 985 ext4_debug("ext4 move extent: donor file is not extents "
984 "based file [ino:donor %lu]\n", donor_inode->i_ino); 986 "based file [ino:donor %lu]\n", donor_inode->i_ino);
985 return -EOPNOTSUPP; 987 return -EOPNOTSUPP;
@@ -1353,7 +1355,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1353 if (ret1 < 0) 1355 if (ret1 < 0)
1354 break; 1356 break;
1355 if (*moved_len > len) { 1357 if (*moved_len > len) {
1356 ext4_error(orig_inode->i_sb, 1358 EXT4_ERROR_INODE(orig_inode,
1357 "We replaced blocks too much! " 1359 "We replaced blocks too much! "
1358 "sum of replaced: %llu requested: %llu", 1360 "sum of replaced: %llu requested: %llu",
1359 *moved_len, len); 1361 *moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..a43e6617b351 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
187 return blocksize; 187 return blocksize;
188 return (len & 65532) | ((len & 3) << 16); 188 return (len & 65532) | ((len & 3) << 16);
189} 189}
190 190
191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) 191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
192{ 192{
193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) 193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
197 if (len == blocksize) { 197 if (len == blocksize) {
198 if (blocksize == 65536) 198 if (blocksize == 65536)
199 return cpu_to_le16(EXT4_MAX_REC_LEN); 199 return cpu_to_le16(EXT4_MAX_REC_LEN);
200 else 200 else
201 return cpu_to_le16(0); 201 return cpu_to_le16(0);
202 } 202 }
203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); 203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
349 brelse(bh); 349 brelse(bh);
350 } 350 }
351 if (bcount) 351 if (bcount)
352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
353 levels ? "" : " ", names, space/bcount, 353 levels ? "" : " ", names, space/bcount,
354 (space/bcount)*100/blocksize); 354 (space/bcount)*100/blocksize);
355 return (struct stats) { names, space, bcount}; 355 return (struct stats) { names, space, bcount};
@@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
653 int ret, err; 653 int ret, err;
654 __u32 hashval; 654 __u32 hashval;
655 655
656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
657 start_hash, start_minor_hash)); 657 start_hash, start_minor_hash));
658 dir = dir_file->f_path.dentry->d_inode; 658 dir = dir_file->f_path.dentry->d_inode;
659 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 659 if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
661 if (hinfo.hash_version <= DX_HASH_TEA) 661 if (hinfo.hash_version <= DX_HASH_TEA)
662 hinfo.hash_version += 662 hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
801{ 801{
802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
803 EXT4_FEATURE_COMPAT_DIR_INDEX)) 803 EXT4_FEATURE_COMPAT_DIR_INDEX))
804 EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; 804 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
805} 805}
806 806
807/* 807/*
@@ -943,8 +943,8 @@ restart:
943 wait_on_buffer(bh); 943 wait_on_buffer(bh);
944 if (!buffer_uptodate(bh)) { 944 if (!buffer_uptodate(bh)) {
945 /* read error, skip block & hope for the best */ 945 /* read error, skip block & hope for the best */
946 ext4_error(sb, "reading directory #%lu offset %lu", 946 EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
947 dir->i_ino, (unsigned long)block); 947 (unsigned long) block);
948 brelse(bh); 948 brelse(bh);
949 goto next; 949 goto next;
950 } 950 }
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1066 __u32 ino = le32_to_cpu(de->inode); 1066 __u32 ino = le32_to_cpu(de->inode);
1067 brelse(bh); 1067 brelse(bh);
1068 if (!ext4_valid_inum(dir->i_sb, ino)) { 1068 if (!ext4_valid_inum(dir->i_sb, ino)) {
1069 ext4_error(dir->i_sb, "bad inode number: %u", ino); 1069 EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
1070 return ERR_PTR(-EIO); 1070 return ERR_PTR(-EIO);
1071 } 1071 }
1072 inode = ext4_iget(dir->i_sb, ino); 1072 inode = ext4_iget(dir->i_sb, ino);
1073 if (unlikely(IS_ERR(inode))) { 1073 if (unlikely(IS_ERR(inode))) {
1074 if (PTR_ERR(inode) == -ESTALE) { 1074 if (PTR_ERR(inode) == -ESTALE) {
1075 ext4_error(dir->i_sb, 1075 EXT4_ERROR_INODE(dir,
1076 "deleted inode referenced: %u", 1076 "deleted inode referenced: %u",
1077 ino); 1077 ino);
1078 return ERR_PTR(-EIO); 1078 return ERR_PTR(-EIO);
1079 } else { 1079 } else {
1080 return ERR_CAST(inode); 1080 return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
1104 brelse(bh); 1104 brelse(bh);
1105 1105
1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1107 ext4_error(child->d_inode->i_sb, 1107 EXT4_ERROR_INODE(child->d_inode,
1108 "bad inode number: %u", ino); 1108 "bad parent inode number: %u", ino);
1109 return ERR_PTR(-EIO); 1109 return ERR_PTR(-EIO);
1110 } 1110 }
1111 1111
@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1141 unsigned rec_len = 0; 1141 unsigned rec_len = 0;
1142 1142
1143 while (count--) { 1143 while (count--) {
1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
1145 (from + (map->offs<<2)); 1145 (from + (map->offs<<2));
1146 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1146 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1147 memcpy (to, de, rec_len); 1147 memcpy (to, de, rec_len);
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1404 de = (struct ext4_dir_entry_2 *)((char *)fde + 1404 de = (struct ext4_dir_entry_2 *)((char *)fde +
1405 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1405 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1406 if ((char *) de >= (((char *) root) + blocksize)) { 1406 if ((char *) de >= (((char *) root) + blocksize)) {
1407 ext4_error(dir->i_sb, 1407 EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
1408 "invalid rec_len for '..' in inode %lu",
1409 dir->i_ino);
1410 brelse(bh); 1408 brelse(bh);
1411 return -EIO; 1409 return -EIO;
1412 } 1410 }
@@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1418 brelse(bh); 1416 brelse(bh);
1419 return retval; 1417 return retval;
1420 } 1418 }
1421 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; 1419 ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
1422 data1 = bh2->b_data; 1420 data1 = bh2->b_data;
1423 1421
1424 memcpy (data1, de, len); 1422 memcpy (data1, de, len);
@@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1491 retval = ext4_dx_add_entry(handle, dentry, inode); 1489 retval = ext4_dx_add_entry(handle, dentry, inode);
1492 if (!retval || (retval != ERR_BAD_DX_DIR)) 1490 if (!retval || (retval != ERR_BAD_DX_DIR))
1493 return retval; 1491 return retval;
1494 EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; 1492 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1495 dx_fallback++; 1493 dx_fallback++;
1496 ext4_mark_inode_dirty(handle, dir); 1494 ext4_mark_inode_dirty(handle, dir);
1497 } 1495 }
@@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1519 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1517 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1520 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1518 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1521 brelse(bh); 1519 brelse(bh);
1520 if (retval == 0)
1521 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
1522 return retval; 1522 return retval;
1523} 1523}
1524 1524
@@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode)
1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1917 if (err) 1917 if (err)
1918 ext4_error(inode->i_sb, 1918 EXT4_ERROR_INODE(inode,
1919 "error %d reading directory #%lu offset 0", 1919 "error %d reading directory lblock 0", err);
1920 err, inode->i_ino);
1921 else 1920 else
1922 ext4_warning(inode->i_sb, 1921 ext4_warning(inode->i_sb,
1923 "bad directory (dir #%lu) - no data block", 1922 "bad directory (dir #%lu) - no data block",
@@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode)
1941 de = ext4_next_entry(de1, sb->s_blocksize); 1940 de = ext4_next_entry(de1, sb->s_blocksize);
1942 while (offset < inode->i_size) { 1941 while (offset < inode->i_size) {
1943 if (!bh || 1942 if (!bh ||
1944 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1943 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1944 unsigned int lblock;
1945 err = 0; 1945 err = 0;
1946 brelse(bh); 1946 brelse(bh);
1947 bh = ext4_bread(NULL, inode, 1947 lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
1948 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1948 bh = ext4_bread(NULL, inode, lblock, 0, &err);
1949 if (!bh) { 1949 if (!bh) {
1950 if (err) 1950 if (err)
1951 ext4_error(sb, 1951 EXT4_ERROR_INODE(inode,
1952 "error %d reading directory" 1952 "error %d reading directory "
1953 " #%lu offset %u", 1953 "lblock %u", err, lblock);
1954 err, inode->i_ino, offset);
1955 offset += sb->s_blocksize; 1954 offset += sb->s_blocksize;
1956 continue; 1955 continue;
1957 } 1956 }
@@ -2297,7 +2296,7 @@ retry:
2297 } 2296 }
2298 } else { 2297 } else {
2299 /* clear the extent format for fast symlink */ 2298 /* clear the extent format for fast symlink */
2300 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; 2299 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
2301 inode->i_op = &ext4_fast_symlink_inode_operations; 2300 inode->i_op = &ext4_fast_symlink_inode_operations;
2302 memcpy((char *)&EXT4_I(inode)->i_data, symname, l); 2301 memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2303 inode->i_size = l-1; 2302 inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..6df797eb9aeb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
911 percpu_counter_add(&sbi->s_freeinodes_counter, 911 percpu_counter_add(&sbi->s_freeinodes_counter,
912 EXT4_INODES_PER_GROUP(sb)); 912 EXT4_INODES_PER_GROUP(sb));
913 913
914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
915 sbi->s_log_groups_per_flex) {
915 ext4_group_t flex_group; 916 ext4_group_t flex_group;
916 flex_group = ext4_flex_group(sbi, input->group); 917 flex_group = ext4_flex_group(sbi, input->group);
917 atomic_add(input->free_blocks_count, 918 atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ba191dae8730..4e8983a9811b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 68static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 69static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 70static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt);
71 73
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE,
77 .name = "ext3",
78 .get_sb = ext4_get_sb,
79 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV,
81};
82#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
83#else
84#define IS_EXT3_SB(sb) (0)
85#endif
72 86
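
The added block defines a second file_system_type named "ext3" that reuses ext4's get_sb callback whenever the real ext3 driver is not built and CONFIG_EXT4_USE_FOR_EXT23 is set, plus an IS_EXT3_SB() test so mount-time code can tell which name the superblock came in under. A tiny userspace model of the two-names-one-constructor idea (illustrative only; registration itself happens elsewhere in the kernel):

#include <stdio.h>
#include <string.h>

struct fs_type { const char *name; int (*mount)(const char *dev); };

static int ext4_mount(const char *dev)
{
    printf("mounting %s with the ext4 code\n", dev);
    return 0;
}

static const struct fs_type fs_types[] = {
    { "ext4", ext4_mount },
    { "ext3", ext4_mount },     /* alias: same callback, second name */
};

int main(void)
{
    size_t i;

    for (i = 0; i < sizeof(fs_types) / sizeof(fs_types[0]); i++)
        if (strcmp(fs_types[i].name, "ext3") == 0)
            return fs_types[i].mount("/dev/sda1");
    return 1;
}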
73ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 87ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
74 struct ext4_group_desc *bg) 88 struct ext4_group_desc *bg)
@@ -227,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
227 if (sb->s_flags & MS_RDONLY) 241 if (sb->s_flags & MS_RDONLY)
228 return ERR_PTR(-EROFS); 242 return ERR_PTR(-EROFS);
229 243
244 vfs_check_frozen(sb, SB_FREEZE_WRITE);
230 /* Special case here: if the journal has aborted behind our 245 /* Special case here: if the journal has aborted behind our
231 * backs (eg. EIO in the commit thread), then we still need to 246 * backs (eg. EIO in the commit thread), then we still need to
232 * take the FS itself readonly cleanly. */ 247 * take the FS itself readonly cleanly. */
@@ -631,6 +646,8 @@ static void ext4_put_super(struct super_block *sb)
631 struct ext4_super_block *es = sbi->s_es; 646 struct ext4_super_block *es = sbi->s_es;
632 int i, err; 647 int i, err;
633 648
649 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
650
634 flush_workqueue(sbi->dio_unwritten_wq); 651 flush_workqueue(sbi->dio_unwritten_wq);
635 destroy_workqueue(sbi->dio_unwritten_wq); 652 destroy_workqueue(sbi->dio_unwritten_wq);
636 653
@@ -927,6 +944,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
927 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); 944 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
928 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 945 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
929 seq_puts(seq, ",journal_async_commit"); 946 seq_puts(seq, ",journal_async_commit");
947 else if (test_opt(sb, JOURNAL_CHECKSUM))
948 seq_puts(seq, ",journal_checksum");
930 if (test_opt(sb, NOBH)) 949 if (test_opt(sb, NOBH))
931 seq_puts(seq, ",nobh"); 950 seq_puts(seq, ",nobh");
932 if (test_opt(sb, I_VERSION)) 951 if (test_opt(sb, I_VERSION))
@@ -1045,7 +1064,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1045static int ext4_mark_dquot_dirty(struct dquot *dquot); 1064static int ext4_mark_dquot_dirty(struct dquot *dquot);
1046static int ext4_write_info(struct super_block *sb, int type); 1065static int ext4_write_info(struct super_block *sb, int type);
1047static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1066static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1048 char *path, int remount); 1067 char *path);
1049static int ext4_quota_on_mount(struct super_block *sb, int type); 1068static int ext4_quota_on_mount(struct super_block *sb, int type);
1050static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1069static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1051 size_t len, loff_t off); 1070 size_t len, loff_t off);
@@ -1067,12 +1086,12 @@ static const struct dquot_operations ext4_quota_operations = {
1067 1086
1068static const struct quotactl_ops ext4_qctl_operations = { 1087static const struct quotactl_ops ext4_qctl_operations = {
1069 .quota_on = ext4_quota_on, 1088 .quota_on = ext4_quota_on,
1070 .quota_off = vfs_quota_off, 1089 .quota_off = dquot_quota_off,
1071 .quota_sync = vfs_quota_sync, 1090 .quota_sync = dquot_quota_sync,
1072 .get_info = vfs_get_dqinfo, 1091 .get_info = dquot_get_dqinfo,
1073 .set_info = vfs_set_dqinfo, 1092 .set_info = dquot_set_dqinfo,
1074 .get_dqblk = vfs_get_dqblk, 1093 .get_dqblk = dquot_get_dqblk,
1075 .set_dqblk = vfs_set_dqblk 1094 .set_dqblk = dquot_set_dqblk
1076}; 1095};
1077#endif 1096#endif
1078 1097
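[Note: this series renames the generic quota entry points from vfs_* to dquot_* tree-wide; only ->quota_on keeps a filesystem wrapper, since it must validate the quota file's location before handing off. A sketch of the resulting table for a hypothetical filesystem, assuming the post-rename exports from fs/quota/dquot.c:]

static const struct quotactl_ops myfs_qctl_operations = {
	.quota_on	= myfs_quota_on,	/* fs checks, then dquot_quota_on_path() */
	.quota_off	= dquot_quota_off,	/* the rest use the dquot defaults */
	.quota_sync	= dquot_quota_sync,
	.get_info	= dquot_get_dqinfo,
	.set_info	= dquot_set_dqinfo,
	.get_dqblk	= dquot_get_dqblk,
	.set_dqblk	= dquot_set_dqblk,
};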
@@ -2037,7 +2056,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2037 /* Turn quotas off */ 2056 /* Turn quotas off */
2038 for (i = 0; i < MAXQUOTAS; i++) { 2057 for (i = 0; i < MAXQUOTAS; i++) {
2039 if (sb_dqopt(sb)->files[i]) 2058 if (sb_dqopt(sb)->files[i])
2040 vfs_quota_off(sb, i, 0); 2059 dquot_quota_off(sb, i);
2041 } 2060 }
2042#endif 2061#endif
2043 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 2062 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2199,7 +2218,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2199struct ext4_attr { 2218struct ext4_attr {
2200 struct attribute attr; 2219 struct attribute attr;
2201 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2220 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2202 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2221 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2203 const char *, size_t); 2222 const char *, size_t);
2204 int offset; 2223 int offset;
2205}; 2224};
@@ -2416,6 +2435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2416 __releases(kernel_lock) 2435 __releases(kernel_lock)
2417 __acquires(kernel_lock) 2436 __acquires(kernel_lock)
2418{ 2437{
2438 char *orig_data = kstrdup(data, GFP_KERNEL);
2419 struct buffer_head *bh; 2439 struct buffer_head *bh;
2420 struct ext4_super_block *es = NULL; 2440 struct ext4_super_block *es = NULL;
2421 struct ext4_sb_info *sbi; 2441 struct ext4_sb_info *sbi;
@@ -2539,7 +2559,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2539 * enable delayed allocation by default 2559 * enable delayed allocation by default
2540 * Use -o nodelalloc to turn it off 2560 * Use -o nodelalloc to turn it off
2541 */ 2561 */
2542 set_opt(sbi->s_mount_opt, DELALLOC); 2562 if (!IS_EXT3_SB(sb))
2563 set_opt(sbi->s_mount_opt, DELALLOC);
2543 2564
2544 if (!parse_options((char *) data, sb, &journal_devnum, 2565 if (!parse_options((char *) data, sb, &journal_devnum,
2545 &journal_ioprio, NULL, 0)) 2566 &journal_ioprio, NULL, 0))
@@ -2778,24 +2799,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2778 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 2799 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2779 spin_lock_init(&sbi->s_next_gen_lock); 2800 spin_lock_init(&sbi->s_next_gen_lock);
2780 2801
2781 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2782 ext4_count_free_blocks(sb));
2783 if (!err) {
2784 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2785 ext4_count_free_inodes(sb));
2786 }
2787 if (!err) {
2788 err = percpu_counter_init(&sbi->s_dirs_counter,
2789 ext4_count_dirs(sb));
2790 }
2791 if (!err) {
2792 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2793 }
2794 if (err) {
2795 ext4_msg(sb, KERN_ERR, "insufficient memory");
2796 goto failed_mount3;
2797 }
2798
2799 sbi->s_stripe = ext4_get_stripe_size(sbi); 2802 sbi->s_stripe = ext4_get_stripe_size(sbi);
2800 sbi->s_max_writeback_mb_bump = 128; 2803 sbi->s_max_writeback_mb_bump = 128;
2801 2804
@@ -2895,6 +2898,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2895 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 2898 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2896 2899
2897no_journal: 2900no_journal:
2901 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2902 ext4_count_free_blocks(sb));
2903 if (!err)
2904 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2905 ext4_count_free_inodes(sb));
2906 if (!err)
2907 err = percpu_counter_init(&sbi->s_dirs_counter,
2908 ext4_count_dirs(sb));
2909 if (!err)
2910 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2911 if (err) {
2912 ext4_msg(sb, KERN_ERR, "insufficient memory");
2913 goto failed_mount_wq;
2914 }
2898 if (test_opt(sb, NOBH)) { 2915 if (test_opt(sb, NOBH)) {
2899 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2916 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2900 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 2917 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
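[Note on the move above: jbd2 replay can allocate and free blocks, so counters seeded from the on-disk totals before recovery would be stale as soon as the journal is replayed; initializing below the no_journal: label makes ext4_count_free_blocks() and friends run against the recovered image. The chained form keeps a single error check; a condensed sketch with a hypothetical helper name (same 2.6.34 percpu_counter_init() signature, no GFP argument yet):]

static int init_sb_counters(struct super_block *sb, struct ext4_sb_info *sbi)
{
	int err;

	err = percpu_counter_init(&sbi->s_freeblocks_counter,
				  ext4_count_free_blocks(sb));
	if (!err)
		err = percpu_counter_init(&sbi->s_freeinodes_counter,
					  ext4_count_free_inodes(sb));
	if (!err)
		err = percpu_counter_init(&sbi->s_dirs_counter,
					  ext4_count_dirs(sb));
	if (!err)
		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
	return err;	/* caller unwinds via failed_mount_wq */
}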
@@ -2986,7 +3003,7 @@ no_journal:
2986 err = ext4_setup_system_zone(sb); 3003 err = ext4_setup_system_zone(sb);
2987 if (err) { 3004 if (err) {
2988 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3005 ext4_msg(sb, KERN_ERR, "failed to initialize system "
2989 "zone (%d)\n", err); 3006 "zone (%d)", err);
2990 goto failed_mount4; 3007 goto failed_mount4;
2991 } 3008 }
2992 3009
@@ -3025,9 +3042,11 @@ no_journal:
3025 } else 3042 } else
3026 descr = "out journal"; 3043 descr = "out journal";
3027 3044
3028 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); 3045 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
3046 "Opts: %s", descr, orig_data);
3029 3047
3030 lock_kernel(); 3048 lock_kernel();
3049 kfree(orig_data);
3031 return 0; 3050 return 0;
3032 3051
3033cantfind_ext4: 3052cantfind_ext4:
@@ -3044,6 +3063,10 @@ failed_mount_wq:
3044 jbd2_journal_destroy(sbi->s_journal); 3063 jbd2_journal_destroy(sbi->s_journal);
3045 sbi->s_journal = NULL; 3064 sbi->s_journal = NULL;
3046 } 3065 }
3066 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3067 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3068 percpu_counter_destroy(&sbi->s_dirs_counter);
3069 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3047failed_mount3: 3070failed_mount3:
3048 if (sbi->s_flex_groups) { 3071 if (sbi->s_flex_groups) {
3049 if (is_vmalloc_addr(sbi->s_flex_groups)) 3072 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3051,10 +3074,6 @@ failed_mount3:
3051 else 3074 else
3052 kfree(sbi->s_flex_groups); 3075 kfree(sbi->s_flex_groups);
3053 } 3076 }
3054 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3055 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3056 percpu_counter_destroy(&sbi->s_dirs_counter);
3057 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3058failed_mount2: 3077failed_mount2:
3059 for (i = 0; i < db_count; i++) 3078 for (i = 0; i < db_count; i++)
3060 brelse(sbi->s_group_desc[i]); 3079 brelse(sbi->s_group_desc[i]);
@@ -3074,6 +3093,7 @@ out_fail:
3074 kfree(sbi->s_blockgroup_lock); 3093 kfree(sbi->s_blockgroup_lock);
3075 kfree(sbi); 3094 kfree(sbi);
3076 lock_kernel(); 3095 lock_kernel();
3096 kfree(orig_data);
3077 return ret; 3097 return ret;
3078} 3098}
3079 3099
@@ -3365,7 +3385,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3365 if (!(sb->s_flags & MS_RDONLY)) 3385 if (!(sb->s_flags & MS_RDONLY))
3366 es->s_wtime = cpu_to_le32(get_seconds()); 3386 es->s_wtime = cpu_to_le32(get_seconds());
3367 es->s_kbytes_written = 3387 es->s_kbytes_written =
3368 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3388 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3369 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3389 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3370 EXT4_SB(sb)->s_sectors_written_start) >> 1)); 3390 EXT4_SB(sb)->s_sectors_written_start) >> 1));
3371 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3391 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3470,8 +3490,10 @@ int ext4_force_commit(struct super_block *sb)
3470 return 0; 3490 return 0;
3471 3491
3472 journal = EXT4_SB(sb)->s_journal; 3492 journal = EXT4_SB(sb)->s_journal;
3473 if (journal) 3493 if (journal) {
3494 vfs_check_frozen(sb, SB_FREEZE_WRITE);
3474 ret = ext4_journal_force_commit(journal); 3495 ret = ext4_journal_force_commit(journal);
3496 }
3475 3497
3476 return ret; 3498 return ret;
3477} 3499}
@@ -3520,18 +3542,16 @@ static int ext4_freeze(struct super_block *sb)
3520 * the journal. 3542 * the journal.
3521 */ 3543 */
3522 error = jbd2_journal_flush(journal); 3544 error = jbd2_journal_flush(journal);
3523 if (error < 0) { 3545 if (error < 0)
3524 out: 3546 goto out;
3525 jbd2_journal_unlock_updates(journal);
3526 return error;
3527 }
3528 3547
3529 /* Journal blocked and flushed, clear needs_recovery flag. */ 3548 /* Journal blocked and flushed, clear needs_recovery flag. */
3530 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3549 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3531 error = ext4_commit_super(sb, 1); 3550 error = ext4_commit_super(sb, 1);
3532 if (error) 3551out:
3533 goto out; 3552 /* we rely on s_frozen to stop further updates */
3534 return 0; 3553 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3554 return error;
3535} 3555}
3536 3556
3537/* 3557/*
@@ -3548,7 +3568,6 @@ static int ext4_unfreeze(struct super_block *sb)
3548 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3568 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3549 ext4_commit_super(sb, 1); 3569 ext4_commit_super(sb, 1);
3550 unlock_super(sb); 3570 unlock_super(sb);
3551 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3552 return 0; 3571 return 0;
3553} 3572}
3554 3573
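[Note: after this change the journal stays locked for the whole freeze window, and writers are fenced purely by the superblock's frozen state. The vfs_check_frozen() call sites added above rely on the stock fs.h macro, reproduced here for context as it stood in this kernel generation:]

#define vfs_check_frozen(sb, level) \
	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))

[So ext4_journal_start_sb() now sleeps at SB_FREEZE_WRITE rather than starting a transaction against a frozen filesystem, which is what lets ext4_freeze() keep jbd2_journal_unlock_updates() on the single exit path above.]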
@@ -3559,12 +3578,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3559 ext4_fsblk_t n_blocks_count = 0; 3578 ext4_fsblk_t n_blocks_count = 0;
3560 unsigned long old_sb_flags; 3579 unsigned long old_sb_flags;
3561 struct ext4_mount_options old_opts; 3580 struct ext4_mount_options old_opts;
3581 int enable_quota = 0;
3562 ext4_group_t g; 3582 ext4_group_t g;
3563 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3583 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3564 int err; 3584 int err;
3565#ifdef CONFIG_QUOTA 3585#ifdef CONFIG_QUOTA
3566 int i; 3586 int i;
3567#endif 3587#endif
3588 char *orig_data = kstrdup(data, GFP_KERNEL);
3568 3589
3569 lock_kernel(); 3590 lock_kernel();
3570 3591
@@ -3615,6 +3636,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3615 } 3636 }
3616 3637
3617 if (*flags & MS_RDONLY) { 3638 if (*flags & MS_RDONLY) {
3639 err = dquot_suspend(sb, -1);
3640 if (err < 0)
3641 goto restore_opts;
3642
3618 /* 3643 /*
3619 * First of all, the unconditional stuff we have to do 3644 * First of all, the unconditional stuff we have to do
3620 * to disable replay of the journal when we next remount 3645 * to disable replay of the journal when we next remount
@@ -3683,6 +3708,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3683 goto restore_opts; 3708 goto restore_opts;
3684 if (!ext4_setup_super(sb, es, 0)) 3709 if (!ext4_setup_super(sb, es, 0))
3685 sb->s_flags &= ~MS_RDONLY; 3710 sb->s_flags &= ~MS_RDONLY;
3711 enable_quota = 1;
3686 } 3712 }
3687 } 3713 }
3688 ext4_setup_system_zone(sb); 3714 ext4_setup_system_zone(sb);
@@ -3698,6 +3724,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3698#endif 3724#endif
3699 unlock_super(sb); 3725 unlock_super(sb);
3700 unlock_kernel(); 3726 unlock_kernel();
3727 if (enable_quota)
3728 dquot_resume(sb, -1);
3729
3730 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
3731 kfree(orig_data);
3701 return 0; 3732 return 0;
3702 3733
3703restore_opts: 3734restore_opts:
@@ -3719,6 +3750,7 @@ restore_opts:
3719#endif 3750#endif
3720 unlock_super(sb); 3751 unlock_super(sb);
3721 unlock_kernel(); 3752 unlock_kernel();
3753 kfree(orig_data);
3722 return err; 3754 return err;
3723} 3755}
3724 3756
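[Note: quota state across ro/rw remounts now flows through dquot_suspend()/dquot_resume(), with -1 meaning all quota types, instead of the remount flag that ->quota_on loses in the next hunk. A minimal sketch of the pattern, assuming the helper signatures introduced by this series:]

static int myfs_remount(struct super_block *sb, int *flags, char *data)
{
	int err;

	if (*flags & MS_RDONLY) {
		err = dquot_suspend(sb, -1);	/* park quotas before going ro */
		if (err < 0)
			return err;
		sb->s_flags |= MS_RDONLY;
	} else {
		sb->s_flags &= ~MS_RDONLY;
		dquot_resume(sb, -1);		/* re-enable what was suspended */
	}
	return 0;
}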
@@ -3891,24 +3923,21 @@ static int ext4_write_info(struct super_block *sb, int type)
3891 */ 3923 */
3892static int ext4_quota_on_mount(struct super_block *sb, int type) 3924static int ext4_quota_on_mount(struct super_block *sb, int type)
3893{ 3925{
3894 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], 3926 return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
3895 EXT4_SB(sb)->s_jquota_fmt, type); 3927 EXT4_SB(sb)->s_jquota_fmt, type);
3896} 3928}
3897 3929
3898/* 3930/*
3899 * Standard function to be called on quota_on 3931 * Standard function to be called on quota_on
3900 */ 3932 */
3901static int ext4_quota_on(struct super_block *sb, int type, int format_id, 3933static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3902 char *name, int remount) 3934 char *name)
3903{ 3935{
3904 int err; 3936 int err;
3905 struct path path; 3937 struct path path;
3906 3938
3907 if (!test_opt(sb, QUOTA)) 3939 if (!test_opt(sb, QUOTA))
3908 return -EINVAL; 3940 return -EINVAL;
3909 /* When remounting, no checks are needed and in fact, name is NULL */
3910 if (remount)
3911 return vfs_quota_on(sb, type, format_id, name, remount);
3912 3941
3913 err = kern_path(name, LOOKUP_FOLLOW, &path); 3942 err = kern_path(name, LOOKUP_FOLLOW, &path);
3914 if (err) 3943 if (err)
@@ -3947,7 +3976,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3947 } 3976 }
3948 } 3977 }
3949 3978
3950 err = vfs_quota_on_path(sb, type, format_id, &path); 3979 err = dquot_quota_on_path(sb, type, format_id, &path);
3951 path_put(&path); 3980 path_put(&path);
3952 return err; 3981 return err;
3953} 3982}
@@ -4068,7 +4097,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
4068 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 4097 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
4069} 4098}
4070 4099
4071#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4100#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4072static struct file_system_type ext2_fs_type = { 4101static struct file_system_type ext2_fs_type = {
4073 .owner = THIS_MODULE, 4102 .owner = THIS_MODULE,
4074 .name = "ext2", 4103 .name = "ext2",
@@ -4095,15 +4124,7 @@ static inline void register_as_ext2(void) { }
4095static inline void unregister_as_ext2(void) { } 4124static inline void unregister_as_ext2(void) { }
4096#endif 4125#endif
4097 4126
4098#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4127#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4099static struct file_system_type ext3_fs_type = {
4100 .owner = THIS_MODULE,
4101 .name = "ext3",
4102 .get_sb = ext4_get_sb,
4103 .kill_sb = kill_block_super,
4104 .fs_flags = FS_REQUIRES_DEV,
4105};
4106
4107static inline void register_as_ext3(void) 4128static inline void register_as_ext3(void)
4108{ 4129{
4109 int err = register_filesystem(&ext3_fs_type); 4130 int err = register_filesystem(&ext3_fs_type);
@@ -4134,6 +4155,7 @@ static int __init init_ext4_fs(void)
4134{ 4155{
4135 int err; 4156 int err;
4136 4157
4158 ext4_check_flag_values();
4137 err = init_ext4_system_zone(); 4159 err = init_ext4_system_zone();
4138 if (err) 4160 if (err)
4139 return err; 4161 return err;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext4_setattr,
37#ifdef CONFIG_EXT4_FS_XATTR 38#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 46const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 48 .follow_link = ext4_follow_link,
49 .setattr = ext4_setattr,
48#ifdef CONFIG_EXT4_FS_XATTR 50#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b4c5aa8489d8..04338009793a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer,
97 97
98static struct mb_cache *ext4_xattr_cache; 98static struct mb_cache *ext4_xattr_cache;
99 99
100static struct xattr_handler *ext4_xattr_handler_map[] = { 100static const struct xattr_handler *ext4_xattr_handler_map[] = {
101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
102#ifdef CONFIG_EXT4_FS_POSIX_ACL 102#ifdef CONFIG_EXT4_FS_POSIX_ACL
103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, 103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
@@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
109#endif 109#endif
110}; 110};
111 111
112struct xattr_handler *ext4_xattr_handlers[] = { 112const struct xattr_handler *ext4_xattr_handlers[] = {
113 &ext4_xattr_user_handler, 113 &ext4_xattr_user_handler,
114 &ext4_xattr_trusted_handler, 114 &ext4_xattr_trusted_handler,
115#ifdef CONFIG_EXT4_FS_POSIX_ACL 115#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = {
122 NULL 122 NULL
123}; 123};
124 124
125static inline struct xattr_handler * 125static inline const struct xattr_handler *
126ext4_xattr_handler(int name_index) 126ext4_xattr_handler(int name_index)
127{ 127{
128 struct xattr_handler *handler = NULL; 128 const struct xattr_handler *handler = NULL;
129 129
130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) 130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
131 handler = ext4_xattr_handler_map[name_index]; 131 handler = ext4_xattr_handler_map[name_index];
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: 230bad_block:
231 ext4_error(inode->i_sb, 231 EXT4_ERROR_INODE(inode, "bad block %llu",
232 "inode %lu: bad block %llu", inode->i_ino, 232 EXT4_I(inode)->i_file_acl);
233 EXT4_I(inode)->i_file_acl);
234 error = -EIO; 233 error = -EIO;
235 goto cleanup; 234 goto cleanup;
236 } 235 }
@@ -332,7 +331,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
332 size_t rest = buffer_size; 331 size_t rest = buffer_size;
333 332
334 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { 333 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
335 struct xattr_handler *handler = 334 const struct xattr_handler *handler =
336 ext4_xattr_handler(entry->e_name_index); 335 ext4_xattr_handler(entry->e_name_index);
337 336
338 if (handler) { 337 if (handler) {
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
372 ea_bdebug(bh, "b_count=%d, refcount=%d", 371 ea_bdebug(bh, "b_count=%d, refcount=%d",
373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 372 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
374 if (ext4_xattr_check_block(bh)) { 373 if (ext4_xattr_check_block(bh)) {
375 ext4_error(inode->i_sb, 374 EXT4_ERROR_INODE(inode, "bad block %llu",
376 "inode %lu: bad block %llu", inode->i_ino, 375 EXT4_I(inode)->i_file_acl);
377 EXT4_I(inode)->i_file_acl);
378 error = -EIO; 376 error = -EIO;
379 goto cleanup; 377 goto cleanup;
380 } 378 }
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
666 atomic_read(&(bs->bh->b_count)), 664 atomic_read(&(bs->bh->b_count)),
667 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 665 le32_to_cpu(BHDR(bs->bh)->h_refcount));
668 if (ext4_xattr_check_block(bs->bh)) { 666 if (ext4_xattr_check_block(bs->bh)) {
669 ext4_error(sb, "inode %lu: bad block %llu", 667 EXT4_ERROR_INODE(inode, "bad block %llu",
670 inode->i_ino, EXT4_I(inode)->i_file_acl); 668 EXT4_I(inode)->i_file_acl);
671 error = -EIO; 669 error = -EIO;
672 goto cleanup; 670 goto cleanup;
673 } 671 }
@@ -820,7 +818,7 @@ inserted:
820 EXT4_I(inode)->i_block_group); 818 EXT4_I(inode)->i_block_group);
821 819
822 /* non-extent files can't have physical blocks past 2^32 */ 820 /* non-extent files can't have physical blocks past 2^32 */
823 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 821 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
824 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 822 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
825 823
826 block = ext4_new_meta_blocks(handle, inode, 824 block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
828 if (error) 826 if (error)
829 goto cleanup; 827 goto cleanup;
830 828
831 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 829 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
832 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); 830 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
833 831
834 ea_idebug(inode, "creating block %d", block); 832 ea_idebug(inode, "creating block %d", block);
@@ -880,8 +878,8 @@ cleanup_dquot:
880 goto cleanup; 878 goto cleanup;
881 879
882bad_block: 880bad_block:
883 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 881 EXT4_ERROR_INODE(inode, "bad block %llu",
884 inode->i_ino, EXT4_I(inode)->i_file_acl); 882 EXT4_I(inode)->i_file_acl);
885 goto cleanup; 883 goto cleanup;
886 884
887#undef header 885#undef header
@@ -1194,8 +1192,8 @@ retry:
1194 if (!bh) 1192 if (!bh)
1195 goto cleanup; 1193 goto cleanup;
1196 if (ext4_xattr_check_block(bh)) { 1194 if (ext4_xattr_check_block(bh)) {
1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1195 EXT4_ERROR_INODE(inode, "bad block %llu",
1198 inode->i_ino, EXT4_I(inode)->i_file_acl); 1196 EXT4_I(inode)->i_file_acl);
1199 error = -EIO; 1197 error = -EIO;
1200 goto cleanup; 1198 goto cleanup;
1201 } 1199 }
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1372 goto cleanup; 1370 goto cleanup;
1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1371 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1374 if (!bh) { 1372 if (!bh) {
1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error", 1373 EXT4_ERROR_INODE(inode, "block %llu read error",
1376 inode->i_ino, EXT4_I(inode)->i_file_acl); 1374 EXT4_I(inode)->i_file_acl);
1377 goto cleanup; 1375 goto cleanup;
1378 } 1376 }
1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1377 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1378 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1379 EXT4_ERROR_INODE(inode, "bad block %llu",
1382 inode->i_ino, EXT4_I(inode)->i_file_acl); 1380 EXT4_I(inode)->i_file_acl);
1383 goto cleanup; 1381 goto cleanup;
1384 } 1382 }
1385 ext4_xattr_release_block(handle, inode, bh); 1383 ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
1504 } 1502 }
1505 bh = sb_bread(inode->i_sb, ce->e_block); 1503 bh = sb_bread(inode->i_sb, ce->e_block);
1506 if (!bh) { 1504 if (!bh) {
1507 ext4_error(inode->i_sb, 1505 EXT4_ERROR_INODE(inode, "block %lu read error",
1508 "inode %lu: block %lu read error", 1506 (unsigned long) ce->e_block);
1509 inode->i_ino, (unsigned long) ce->e_block);
1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1507 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1511 EXT4_XATTR_REFCOUNT_MAX) { 1508 EXT4_XATTR_REFCOUNT_MAX) {
1512 ea_idebug(inode, "block %lu refcount %d>=%d", 1509 ea_idebug(inode, "block %lu refcount %d>=%d",
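[Note: EXT4_ERROR_INODE() is a convenience added in this series that prepends the inode number itself, which is why every converted call site above sheds its explicit "inode %lu" / inode->i_ino pair. Roughly, as a sketch of the ext4.h definition (modulo the exact argument list):]

#define EXT4_ERROR_INODE(inode, fmt, a...) \
	ext4_error_inode(__func__, (inode), (fmt), ## a)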
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 8ede88b18c29..518e96e43905 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -65,11 +65,11 @@ struct ext4_xattr_entry {
65 65
66# ifdef CONFIG_EXT4_FS_XATTR 66# ifdef CONFIG_EXT4_FS_XATTR
67 67
68extern struct xattr_handler ext4_xattr_user_handler; 68extern const struct xattr_handler ext4_xattr_user_handler;
69extern struct xattr_handler ext4_xattr_trusted_handler; 69extern const struct xattr_handler ext4_xattr_trusted_handler;
70extern struct xattr_handler ext4_xattr_acl_access_handler; 70extern const struct xattr_handler ext4_xattr_acl_access_handler;
71extern struct xattr_handler ext4_xattr_acl_default_handler; 71extern const struct xattr_handler ext4_xattr_acl_default_handler;
72extern struct xattr_handler ext4_xattr_security_handler; 72extern const struct xattr_handler ext4_xattr_security_handler;
73 73
74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); 74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
75 75
@@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
86extern int init_ext4_xattr(void); 86extern int init_ext4_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void exit_ext4_xattr(void);
88 88
89extern struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
91# else /* CONFIG_EXT4_FS_XATTR */ 91# else /* CONFIG_EXT4_FS_XATTR */
92 92
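[Note: the const-ification of struct xattr_handler runs tree-wide in this series; both the per-prefix handlers and the NULL-terminated table that sb->s_xattr points at can then live in rodata. The resulting shape for a hypothetical filesystem:]

static const struct xattr_handler myfs_xattr_user_handler = {
	.prefix	= XATTR_USER_PREFIX,
	.list	= myfs_xattr_user_list,
	.get	= myfs_xattr_user_get,
	.set	= myfs_xattr_user_set,
};

const struct xattr_handler *myfs_xattr_handlers[] = {
	&myfs_xattr_user_handler,
	NULL,				/* table is NULL-terminated */
};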
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 983c253999a7..9b21268e121c 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -7,6 +7,7 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/security.h> 9#include <linux/security.h>
10#include <linux/slab.h>
10#include "ext4_jbd2.h" 11#include "ext4_jbd2.h"
11#include "ext4.h" 12#include "ext4.h"
12#include "xattr.h" 13#include "xattr.h"
@@ -68,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
68 return err; 69 return err;
69} 70}
70 71
71struct xattr_handler ext4_xattr_security_handler = { 72const struct xattr_handler ext4_xattr_security_handler = {
72 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
73 .list = ext4_xattr_security_list, 74 .list = ext4_xattr_security_list,
74 .get = ext4_xattr_security_get, 75 .get = ext4_xattr_security_get,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 15b50edc6587..37e6ebca2cc3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
51 name, value, size, flags); 51 name, value, size, flags);
52} 52}
53 53
54struct xattr_handler ext4_xattr_trusted_handler = { 54const struct xattr_handler ext4_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext4_xattr_trusted_list, 56 .list = ext4_xattr_trusted_list,
57 .get = ext4_xattr_trusted_get, 57 .get = ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index c4ce05746ce1..98c375352d0e 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext4_xattr_user_handler = { 57const struct xattr_handler ext4_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext4_xattr_user_list, 59 .list = ext4_xattr_user_list,
60 .get = ext4_xattr_user_get, 60 .get = ext4_xattr_user_get,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 923990e4f16e..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/slab.h>
12#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
13#include "fat.h" 14#include "fat.h"
14 15
@@ -241,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
241 while (*fclus < cluster) { 242 while (*fclus < cluster) {
242 /* prevent the infinite loop of cluster chain */ 243 /* prevent the infinite loop of cluster chain */
243 if (*fclus > limit) { 244 if (*fclus > limit) {
244 fat_fs_error(sb, "%s: detected the cluster chain loop" 245 fat_fs_error_ratelimit(sb,
245 " (i_pos %lld)", __func__, 246 "%s: detected the cluster chain loop"
246 MSDOS_I(inode)->i_pos); 247 " (i_pos %lld)", __func__,
248 MSDOS_I(inode)->i_pos);
247 nr = -EIO; 249 nr = -EIO;
248 goto out; 250 goto out;
249 } 251 }
@@ -252,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
252 if (nr < 0) 254 if (nr < 0)
253 goto out; 255 goto out;
254 else if (nr == FAT_ENT_FREE) { 256 else if (nr == FAT_ENT_FREE) {
255 fat_fs_error(sb, "%s: invalid cluster chain" 257 fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
256 " (i_pos %lld)", __func__, 258 " (i_pos %lld)", __func__,
257 MSDOS_I(inode)->i_pos); 259 MSDOS_I(inode)->i_pos);
258 nr = -EIO; 260 nr = -EIO;
259 goto out; 261 goto out;
260 } else if (nr == FAT_ENT_EOF) { 262 } else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca01510..ee42b9e0b16a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20#include <linux/compat.h> 20#include <linux/compat.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <linux/kernel.h>
22#include "fat.h" 23#include "fat.h"
23 24
24/* 25/*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
140{ 141{
141 const wchar_t *ip; 142 const wchar_t *ip;
142 wchar_t ec; 143 wchar_t ec;
143 unsigned char *op, nc; 144 unsigned char *op;
144 int charlen; 145 int charlen;
145 int k;
146 146
147 ip = uni; 147 ip = uni;
148 op = ascii; 148 op = ascii;
149 149
150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { 150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
151 ec = *ip++; 151 ec = *ip++;
152 if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { 152 if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
153 op += charlen; 153 op += charlen;
154 len -= charlen; 154 len -= charlen;
155 } else { 155 } else {
156 if (uni_xlate == 1) { 156 if (uni_xlate == 1) {
157 *op = ':'; 157 *op++ = ':';
158 for (k = 4; k > 0; k--) { 158 op = pack_hex_byte(op, ec >> 8);
159 nc = ec & 0xF; 159 op = pack_hex_byte(op, ec);
160 op[k] = nc > 9 ? nc + ('a' - 10)
161 : nc + '0';
162 ec >>= 4;
163 }
164 op += 5;
165 len -= 5; 160 len -= 5;
166 } else { 161 } else {
167 *op++ = '?'; 162 *op++ = '?';
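[Note: pack_hex_byte() (linux/kernel.h) stores one byte as two lowercase hex digits and returns the advanced pointer, so the old four-iteration nibble loop collapses into two calls. A standalone sketch of the ':'-escape emission for an unmappable character, with a hypothetical helper name:]

static char *emit_uni_escape(char *op, wchar_t ec)
{
	*op++ = ':';
	op = pack_hex_byte(op, ec >> 8);	/* high byte first */
	op = pack_hex_byte(op, ec);		/* then low byte */
	return op;				/* 5 bytes total, matching len -= 5 */
}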
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
758 return ret; 753 return ret;
759} 754}
760 755
761static int fat_dir_ioctl(struct inode *inode, struct file *filp, 756static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
762 unsigned int cmd, unsigned long arg) 757 unsigned long arg)
763{ 758{
759 struct inode *inode = filp->f_path.dentry->d_inode;
764 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; 760 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
765 int short_only, both; 761 int short_only, both;
766 762
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
774 both = 1; 770 both = 1;
775 break; 771 break;
776 default: 772 default:
777 return fat_generic_ioctl(inode, filp, cmd, arg); 773 return fat_generic_ioctl(filp, cmd, arg);
778 } 774 }
779 775
780 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) 776 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
814 both = 1; 810 both = 1;
815 break; 811 break;
816 default: 812 default:
817 return -ENOIOCTLCMD; 813 return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
818 } 814 }
819 815
820 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) 816 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
836 .llseek = generic_file_llseek, 832 .llseek = generic_file_llseek,
837 .read = generic_read_dir, 833 .read = generic_read_dir,
838 .readdir = fat_readdir, 834 .readdir = fat_readdir,
839 .ioctl = fat_dir_ioctl, 835 .unlocked_ioctl = fat_dir_ioctl,
840#ifdef CONFIG_COMPAT 836#ifdef CONFIG_COMPAT
841 .compat_ioctl = fat_compat_dir_ioctl, 837 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 838#endif
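[Note: dropping ->ioctl for ->unlocked_ioctl loses both the BKL and the inode parameter; the inode is recovered from the struct file, and the compat path now forwards unknown commands (after compat_ptr() translation) instead of failing with -ENOIOCTLCMD. Skeleton of the pair, with hypothetical myfs names:]

static long myfs_dir_ioctl(struct file *filp, unsigned int cmd,
			   unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;

	/* ... per-command handling using inode ... */
	return myfs_generic_ioctl(filp, cmd, arg);
}

#ifdef CONFIG_COMPAT
static long myfs_compat_dir_ioctl(struct file *filp, unsigned int cmd,
				  unsigned long arg)
{
	/* 32-bit user pointers must be widened before reuse */
	return myfs_dir_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
}
#endif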
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6db..27ac25725954 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
6#include <linux/nls.h> 6#include <linux/nls.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/mutex.h> 8#include <linux/mutex.h>
9#include <linux/ratelimit.h>
9#include <linux/msdos_fs.h> 10#include <linux/msdos_fs.h>
10 11
11/* 12/*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
82 struct fatent_operations *fatent_ops; 83 struct fatent_operations *fatent_ops;
83 struct inode *fat_inode; 84 struct inode *fat_inode;
84 85
86 struct ratelimit_state ratelimit;
87
85 spinlock_t inode_hash_lock; 88 spinlock_t inode_hash_lock;
86 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 89 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
87}; 90};
@@ -298,16 +301,16 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
298extern int fat_count_free_clusters(struct super_block *sb); 301extern int fat_count_free_clusters(struct super_block *sb);
299 302
300/* fat/file.c */ 303/* fat/file.c */
301extern int fat_generic_ioctl(struct inode *inode, struct file *filp, 304extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
302 unsigned int cmd, unsigned long arg); 305 unsigned long arg);
303extern const struct file_operations fat_file_operations; 306extern const struct file_operations fat_file_operations;
304extern const struct inode_operations fat_file_inode_operations; 307extern const struct inode_operations fat_file_inode_operations;
305extern int fat_setattr(struct dentry * dentry, struct iattr * attr); 308extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
306extern void fat_truncate(struct inode *inode); 309extern int fat_setsize(struct inode *inode, loff_t offset);
310extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
307extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, 311extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
308 struct kstat *stat); 312 struct kstat *stat);
309extern int fat_file_fsync(struct file *file, struct dentry *dentry, 313extern int fat_file_fsync(struct file *file, int datasync);
310 int datasync);
311 314
312/* fat/inode.c */ 315/* fat/inode.c */
313extern void fat_attach(struct inode *inode, loff_t i_pos); 316extern void fat_attach(struct inode *inode, loff_t i_pos);
@@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323 struct inode *i2); 326 struct inode *i2);
324/* fat/misc.c */ 327/* fat/misc.c */
325extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 328extern void
326 __attribute__ ((format (printf, 2, 3))) __cold; 329__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
330 __attribute__ ((format (printf, 3, 4))) __cold;
331#define fat_fs_error(s, fmt, args...) \
332 __fat_fs_error(s, 1, fmt , ## args)
333#define fat_fs_error_ratelimit(s, fmt, args...) \
334 __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
327extern int fat_clusters_flush(struct super_block *sb); 335extern int fat_clusters_flush(struct super_block *sb);
328extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 336extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
329extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 337extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
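[Note: __ratelimit() returns non-zero while the current window's burst allowance lasts, so the fat_fs_error_ratelimit() macro above decides per call whether __fat_fs_error() actually prints. A sketch of the underlying mechanics with hypothetical names; in this kernel generation DEFAULT_RATELIMIT_INTERVAL is 5*HZ and DEFAULT_RATELIMIT_BURST is 10:]

#include <linux/ratelimit.h>

static struct ratelimit_state myfs_rs;	/* one per superblock in the fat code */

static void myfs_init_ratelimit(void)
{
	/* at most 10 reports per 5-second window; excess calls get 0
	 * back from __ratelimit() and are silently suppressed */
	ratelimit_state_init(&myfs_rs, DEFAULT_RATELIMIT_INTERVAL,
			     DEFAULT_RATELIMIT_BURST);
}

static void myfs_report_corruption(void)
{
	if (__ratelimit(&myfs_rs))
		printk(KERN_ERR "myfs: corruption detected\n");
}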
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236b..990dfae022e5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/capability.h> 9#include <linux/capability.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/compat.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/time.h> 13#include <linux/time.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
114 return err; 115 return err;
115} 116}
116 117
117int fat_generic_ioctl(struct inode *inode, struct file *filp, 118long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118 unsigned int cmd, unsigned long arg)
119{ 119{
120 struct inode *inode = filp->f_path.dentry->d_inode;
120 u32 __user *user_attr = (u32 __user *)arg; 121 u32 __user *user_attr = (u32 __user *)arg;
121 122
122 switch (cmd) { 123 switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
129 } 130 }
130} 131}
131 132
133#ifdef CONFIG_COMPAT
134static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
135 unsigned long arg)
136
137{
138 return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
139}
140#endif
141
132static int fat_file_release(struct inode *inode, struct file *filp) 142static int fat_file_release(struct inode *inode, struct file *filp)
133{ 143{
134 if ((filp->f_mode & FMODE_WRITE) && 144 if ((filp->f_mode & FMODE_WRITE) &&
@@ -139,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
139 return 0; 149 return 0;
140} 150}
141 151
142int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 152int fat_file_fsync(struct file *filp, int datasync)
143{ 153{
144 struct inode *inode = dentry->d_inode; 154 struct inode *inode = filp->f_mapping->host;
145 int res, err; 155 int res, err;
146 156
147 res = simple_fsync(filp, dentry, datasync); 157 res = generic_file_fsync(filp, datasync);
148 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); 158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
149 159
150 return res ? res : err; 160 return res ? res : err;
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
159 .aio_write = generic_file_aio_write, 169 .aio_write = generic_file_aio_write,
160 .mmap = generic_file_mmap, 170 .mmap = generic_file_mmap,
161 .release = fat_file_release, 171 .release = fat_file_release,
162 .ioctl = fat_generic_ioctl, 172 .unlocked_ioctl = fat_generic_ioctl,
173#ifdef CONFIG_COMPAT
174 .compat_ioctl = fat_generic_compat_ioctl,
175#endif
163 .fsync = fat_file_fsync, 176 .fsync = fat_file_fsync,
164 .splice_read = generic_file_splice_read, 177 .splice_read = generic_file_splice_read,
165}; 178};
@@ -270,7 +283,7 @@ static int fat_free(struct inode *inode, int skip)
270 return fat_free_clusters(inode, free_start); 283 return fat_free_clusters(inode, free_start);
271} 284}
272 285
273void fat_truncate(struct inode *inode) 286void fat_truncate_blocks(struct inode *inode, loff_t offset)
274{ 287{
275 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
276 const unsigned int cluster_size = sbi->cluster_size; 289 const unsigned int cluster_size = sbi->cluster_size;
@@ -280,10 +293,10 @@ void fat_truncate(struct inode *inode)
280 * This protects against truncating a file bigger than it was then 293 * This protects against truncating a file bigger than it was then
281 * trying to write into the hole. 294 * trying to write into the hole.
282 */ 295 */
283 if (MSDOS_I(inode)->mmu_private > inode->i_size) 296 if (MSDOS_I(inode)->mmu_private > offset)
284 MSDOS_I(inode)->mmu_private = inode->i_size; 297 MSDOS_I(inode)->mmu_private = offset;
285 298
286 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; 299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
287 300
288 fat_free(inode, nr_clusters); 301 fat_free(inode, nr_clusters);
289 fat_flush_inodes(inode->i_sb, inode, NULL); 302 fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -351,6 +364,18 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
351 return 0; 364 return 0;
352} 365}
353 366
367int fat_setsize(struct inode *inode, loff_t offset)
368{
369 int error;
370
371 error = simple_setsize(inode, offset);
372 if (error)
373 return error;
374 fat_truncate_blocks(inode, offset);
375
376 return error;
377}
378
354#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) 379#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
355/* valid file mode bits */ 380/* valid file mode bits */
356#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) 381#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -365,7 +390,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
365 /* 390 /*
366 * Expand the file. Since inode_setattr() updates ->i_size 391 * Expand the file. Since inode_setattr() updates ->i_size
367 * before calling the ->truncate(), but FAT needs to fill the 392 * before calling the ->truncate(), but FAT needs to fill the
368 * hole before it. 393 * hole before it. XXX: this is no longer true with new truncate
394 * sequence.
369 */ 395 */
370 if (attr->ia_valid & ATTR_SIZE) { 396 if (attr->ia_valid & ATTR_SIZE) {
371 if (attr->ia_size > inode->i_size) { 397 if (attr->ia_size > inode->i_size) {
@@ -414,15 +440,20 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
414 attr->ia_valid &= ~ATTR_MODE; 440 attr->ia_valid &= ~ATTR_MODE;
415 } 441 }
416 442
417 if (attr->ia_valid) 443 if (attr->ia_valid & ATTR_SIZE) {
418 error = inode_setattr(inode, attr); 444 error = fat_setsize(inode, attr->ia_size);
445 if (error)
446 goto out;
447 }
448
449 generic_setattr(inode, attr);
450 mark_inode_dirty(inode);
419out: 451out:
420 return error; 452 return error;
421} 453}
422EXPORT_SYMBOL_GPL(fat_setattr); 454EXPORT_SYMBOL_GPL(fat_setattr);
423 455
424const struct inode_operations fat_file_inode_operations = { 456const struct inode_operations fat_file_inode_operations = {
425 .truncate = fat_truncate,
426 .setattr = fat_setattr, 457 .setattr = fat_setattr,
427 .getattr = fat_getattr, 458 .getattr = fat_getattr,
428}; 459};
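[Note: with ->truncate gone from fat_file_inode_operations, size changes are sequenced explicitly inside ->setattr: shrink the VFS-visible size and page cache first, then release on-disk clusters, then copy the remaining attributes. A sketch of the ATTR_SIZE leg with hypothetical myfs names, assuming the simple_setsize()/generic_setattr() helpers of this kernel generation:]

static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error = inode_change_ok(inode, attr);

	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE) {
		error = simple_setsize(inode, attr->ia_size);	/* i_size + pagecache */
		if (error)
			return error;
		myfs_truncate_blocks(inode, attr->ia_size);	/* on-disk blocks */
	}

	generic_setattr(inode, attr);	/* times, mode, uid/gid */
	mark_inode_dirty(inode);
	return 0;
}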
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0ce143bd7d56..7bf45aee56d7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping,
142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block); 142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
143} 143}
144 144
145static void fat_write_failed(struct address_space *mapping, loff_t to)
146{
147 struct inode *inode = mapping->host;
148
149 if (to > inode->i_size) {
150 truncate_pagecache(inode, to, inode->i_size);
151 fat_truncate_blocks(inode, inode->i_size);
152 }
153}
154
145static int fat_write_begin(struct file *file, struct address_space *mapping, 155static int fat_write_begin(struct file *file, struct address_space *mapping,
146 loff_t pos, unsigned len, unsigned flags, 156 loff_t pos, unsigned len, unsigned flags,
147 struct page **pagep, void **fsdata) 157 struct page **pagep, void **fsdata)
148{ 158{
159 int err;
160
149 *pagep = NULL; 161 *pagep = NULL;
150 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 162 err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
151 fat_get_block, 163 pagep, fsdata, fat_get_block,
152 &MSDOS_I(mapping->host)->mmu_private); 164 &MSDOS_I(mapping->host)->mmu_private);
165 if (err < 0)
166 fat_write_failed(mapping, pos + len);
167 return err;
153} 168}
154 169
155static int fat_write_end(struct file *file, struct address_space *mapping, 170static int fat_write_end(struct file *file, struct address_space *mapping,
@@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
159 struct inode *inode = mapping->host; 174 struct inode *inode = mapping->host;
160 int err; 175 int err;
161 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); 176 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
177 if (err < len)
178 fat_write_failed(mapping, pos + len);
162 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { 179 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
163 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 180 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
164 MSDOS_I(inode)->i_attrs |= ATTR_ARCH; 181 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
@@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
172 loff_t offset, unsigned long nr_segs) 189 loff_t offset, unsigned long nr_segs)
173{ 190{
174 struct file *file = iocb->ki_filp; 191 struct file *file = iocb->ki_filp;
175 struct inode *inode = file->f_mapping->host; 192 struct address_space *mapping = file->f_mapping;
193 struct inode *inode = mapping->host;
194 ssize_t ret;
176 195
177 if (rw == WRITE) { 196 if (rw == WRITE) {
178 /* 197 /*
@@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
193 * FAT need to use the DIO_LOCKING for avoiding the race 212 * FAT need to use the DIO_LOCKING for avoiding the race
194 * condition of fat_get_block() and ->truncate(). 213 * condition of fat_get_block() and ->truncate().
195 */ 214 */
196 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 215 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
197 offset, nr_segs, fat_get_block, NULL); 216 iov, offset, nr_segs, fat_get_block, NULL);
217 if (ret < 0 && (rw & WRITE))
218 fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
219
220 return ret;
198} 221}
199 222
200static sector_t _fat_bmap(struct address_space *mapping, sector_t block) 223static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
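[Note: the _newtrunc variants of cont_write_begin() and blockdev_direct_IO() leave any blocks instantiated past i_size in place on failure, so the filesystem unwinds them itself; fat_write_failed() above is that unwind. The pattern distilled, with hypothetical names (truncate_pagecache() keeps its (inode, old, new) signature here):]

static void myfs_write_failed(struct address_space *mapping, loff_t to)
{
	struct inode *inode = mapping->host;

	if (to > inode->i_size) {
		/* drop pages exposed beyond the still-valid i_size ... */
		truncate_pagecache(inode, to, inode->i_size);
		/* ... then free the blocks that backed them */
		myfs_truncate_blocks(inode, inode->i_size);
	}
}

[It is called with pos + len, or offset + iov_length(), whenever ->write_begin, ->write_end, or the direct-IO path comes back short on a write.]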
@@ -429,7 +452,7 @@ static void fat_delete_inode(struct inode *inode)
429{ 452{
430 truncate_inode_pages(&inode->i_data, 0); 453 truncate_inode_pages(&inode->i_data, 0);
431 inode->i_size = 0; 454 inode->i_size = 0;
432 fat_truncate(inode); 455 fat_truncate_blocks(inode, 0);
433 clear_inode(inode); 456 clear_inode(inode);
434} 457}
435 458
@@ -1250,6 +1273,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1250 sb->s_op = &fat_sops; 1273 sb->s_op = &fat_sops;
1251 sb->s_export_op = &fat_export_ops; 1274 sb->s_export_op = &fat_export_ops;
1252 sbi->dir_ops = fs_dir_inode_ops; 1275 sbi->dir_ops = fs_dir_inode_ops;
1276 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
1277 DEFAULT_RATELIMIT_BURST);
1253 1278
1254 error = parse_options(data, isvfat, silent, &debug, &sbi->options); 1279 error = parse_options(data, isvfat, silent, &debug, &sbi->options);
1255 if (error) 1280 if (error)
@@ -1497,10 +1522,8 @@ out_fail:
1497 iput(fat_inode); 1522 iput(fat_inode);
1498 if (root_inode) 1523 if (root_inode)
1499 iput(root_inode); 1524 iput(root_inode);
1500 if (sbi->nls_io) 1525 unload_nls(sbi->nls_io);
1501 unload_nls(sbi->nls_io); 1526 unload_nls(sbi->nls_disk);
1502 if (sbi->nls_disk)
1503 unload_nls(sbi->nls_disk);
1504 if (sbi->options.iocharset != fat_default_iocharset) 1527 if (sbi->options.iocharset != fat_default_iocharset)
1505 kfree(sbi->options.iocharset); 1528 kfree(sbi->options.iocharset);
1506 sb->s_fs_info = NULL; 1529 sb->s_fs_info = NULL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1fa23f6ffba5 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
20 * In case the file system is remounted read-only, it can be made writable 20 * In case the file system is remounted read-only, it can be made writable
21 * again by remounting it. 21 * again by remounting it.
22 */ 22 */
23void fat_fs_error(struct super_block *s, const char *fmt, ...) 23void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
24{ 24{
25 struct fat_mount_options *opts = &MSDOS_SB(s)->options; 25 struct fat_mount_options *opts = &MSDOS_SB(s)->options;
26 va_list args; 26 va_list args;
27 27
28 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); 28 if (report) {
29 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
29 30
30 printk(KERN_ERR " "); 31 printk(KERN_ERR " ");
31 va_start(args, fmt); 32 va_start(args, fmt);
32 vprintk(fmt, args); 33 vprintk(fmt, args);
33 va_end(args); 34 va_end(args);
34 printk("\n"); 35 printk("\n");
36 }
35 37
36 if (opts->errors == FAT_ERRORS_PANIC) 38 if (opts->errors == FAT_ERRORS_PANIC)
37 panic(" FAT fs panic from previous error\n"); 39 panic("FAT: fs panic from previous error\n");
38 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { 40 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
39 s->s_flags |= MS_RDONLY; 41 s->s_flags |= MS_RDONLY;
40 printk(KERN_ERR " File system has been set read-only\n"); 42 printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
41 } 43 }
42} 44}
43EXPORT_SYMBOL_GPL(fat_fs_error); 45EXPORT_SYMBOL_GPL(__fat_fs_error);
44 46
45/* Flushes the number of free clusters on FAT32 */ 47/* Flushes the number of free clusters on FAT32 */
46/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 48/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c1ef50154868..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
309{ 309{
310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options; 310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
311 wchar_t *ip, *ext_start, *end, *name_start; 311 wchar_t *ip, *ext_start, *end, *name_start;
312 unsigned char base[9], ext[4], buf[8], *p; 312 unsigned char base[9], ext[4], buf[5], *p;
313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE]; 313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
314 int chl, chi; 314 int chl, chi;
315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen; 315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
467 return 0; 467 return 0;
468 } 468 }
469 469
470 i = jiffies & 0xffff; 470 i = jiffies;
471 sz = (jiffies >> 16) & 0x7; 471 sz = (jiffies >> 16) & 0x7;
472 if (baselen > 2) { 472 if (baselen > 2) {
473 baselen = numtail2_baselen; 473 baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
476 name_res[baselen + 4] = '~'; 476 name_res[baselen + 4] = '~';
477 name_res[baselen + 5] = '1' + sz; 477 name_res[baselen + 5] = '1' + sz;
478 while (1) { 478 while (1) {
479 sprintf(buf, "%04X", i); 479 snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
480 memcpy(&name_res[baselen], buf, 4); 480 memcpy(&name_res[baselen], buf, 4);
481 if (vfat_find_form(dir, name_res) < 0) 481 if (vfat_find_form(dir, name_res) < 0)
482 break; 482 break;
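[Note on the hunk above: "%04X" sets a minimum field width, not a cap, so once i carries the full jiffies value it is the mask at format time plus snprintf() that bounds the output to exactly four digits within the shrunken 5-byte buffer; presumably keeping i unmasked preserves more of the jiffies value for the collision-retry arithmetic. A userspace-checkable sketch of the bounding:]

#include <stdio.h>

int main(void)
{
	char buf[5];			/* 4 hex digits + NUL */
	unsigned long i = 0x12345;	/* e.g. raw jiffies */

	snprintf(buf, sizeof(buf), "%04X", (unsigned)(i & 0xffff));
	printf("%s\n", buf);		/* "2345", never wider */
	return 0;
}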
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f9075e..f74d270ba155 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
14#include <linux/dnotify.h> 14#include <linux/dnotify.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/pipe_fs_i.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/ptrace.h> 19#include <linux/ptrace.h>
19#include <linux/signal.h> 20#include <linux/signal.h>
@@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
412 case F_NOTIFY: 413 case F_NOTIFY:
413 err = fcntl_dirnotify(fd, filp, arg); 414 err = fcntl_dirnotify(fd, filp, arg);
414 break; 415 break;
416 case F_SETPIPE_SZ:
417 case F_GETPIPE_SZ:
418 err = pipe_fcntl(filp, cmd, arg);
419 break;
415 default: 420 default:
416 break; 421 break;
417 } 422 }
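[Note: F_SETPIPE_SZ and F_GETPIPE_SZ are new fcntl commands that resize and query a pipe's ring, dispatched to pipe_fcntl() in fs/pipe.c. A userspace sketch of the interface as it settled (byte-based sizes, rounded up to a power-of-two number of pages; assumes fcntl.h exposes the two constants):]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];

	if (pipe(fds))
		return 1;
	if (fcntl(fds[1], F_SETPIPE_SZ, 1 << 20) < 0)	/* ask for 1 MiB */
		perror("F_SETPIPE_SZ");
	printf("pipe buffer: %d bytes\n", fcntl(fds[1], F_GETPIPE_SZ));
	return 0;
}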
@@ -614,9 +619,15 @@ int send_sigurg(struct fown_struct *fown)
614 return ret; 619 return ret;
615} 620}
616 621
617static DEFINE_RWLOCK(fasync_lock); 622static DEFINE_SPINLOCK(fasync_lock);
618static struct kmem_cache *fasync_cache __read_mostly; 623static struct kmem_cache *fasync_cache __read_mostly;
619 624
625static void fasync_free_rcu(struct rcu_head *head)
626{
627 kmem_cache_free(fasync_cache,
628 container_of(head, struct fasync_struct, fa_rcu));
629}
630
620/* 631/*
621 * Remove a fasync entry. If successfully removed, return 632 * Remove a fasync entry. If successfully removed, return
622 * positive and clear the FASYNC flag. If no entry exists, 633 * positive and clear the FASYNC flag. If no entry exists,
@@ -625,8 +636,6 @@ static struct kmem_cache *fasync_cache __read_mostly;
625 * NOTE! It is very important that the FASYNC flag always 636 * NOTE! It is very important that the FASYNC flag always
626 * match the state "is the filp on a fasync list". 637 * match the state "is the filp on a fasync list".
627 * 638 *
628 * We always take the 'filp->f_lock', in since fasync_lock
629 * needs to be irq-safe.
630 */ 639 */
631static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) 640static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
632{ 641{
@@ -634,17 +643,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
634 int result = 0; 643 int result = 0;
635 644
636 spin_lock(&filp->f_lock); 645 spin_lock(&filp->f_lock);
637 write_lock_irq(&fasync_lock); 646 spin_lock(&fasync_lock);
638 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 647 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
639 if (fa->fa_file != filp) 648 if (fa->fa_file != filp)
640 continue; 649 continue;
650
651 spin_lock_irq(&fa->fa_lock);
652 fa->fa_file = NULL;
653 spin_unlock_irq(&fa->fa_lock);
654
641 *fp = fa->fa_next; 655 *fp = fa->fa_next;
642 kmem_cache_free(fasync_cache, fa); 656 call_rcu(&fa->fa_rcu, fasync_free_rcu);
643 filp->f_flags &= ~FASYNC; 657 filp->f_flags &= ~FASYNC;
644 result = 1; 658 result = 1;
645 break; 659 break;
646 } 660 }
647 write_unlock_irq(&fasync_lock); 661 spin_unlock(&fasync_lock);
648 spin_unlock(&filp->f_lock); 662 spin_unlock(&filp->f_lock);
649 return result; 663 return result;
650} 664}
@@ -666,25 +680,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
666 return -ENOMEM; 680 return -ENOMEM;
667 681
668 spin_lock(&filp->f_lock); 682 spin_lock(&filp->f_lock);
669 write_lock_irq(&fasync_lock); 683 spin_lock(&fasync_lock);
670 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 684 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
671 if (fa->fa_file != filp) 685 if (fa->fa_file != filp)
672 continue; 686 continue;
687
688 spin_lock_irq(&fa->fa_lock);
673 fa->fa_fd = fd; 689 fa->fa_fd = fd;
690 spin_unlock_irq(&fa->fa_lock);
691
674 kmem_cache_free(fasync_cache, new); 692 kmem_cache_free(fasync_cache, new);
675 goto out; 693 goto out;
676 } 694 }
677 695
696 spin_lock_init(&new->fa_lock);
678 new->magic = FASYNC_MAGIC; 697 new->magic = FASYNC_MAGIC;
679 new->fa_file = filp; 698 new->fa_file = filp;
680 new->fa_fd = fd; 699 new->fa_fd = fd;
681 new->fa_next = *fapp; 700 new->fa_next = *fapp;
682 *fapp = new; 701 rcu_assign_pointer(*fapp, new);
683 result = 1; 702 result = 1;
684 filp->f_flags |= FASYNC; 703 filp->f_flags |= FASYNC;
685 704
686out: 705out:
687 write_unlock_irq(&fasync_lock); 706 spin_unlock(&fasync_lock);
688 spin_unlock(&filp->f_lock); 707 spin_unlock(&filp->f_lock);
689 return result; 708 return result;
690} 709}
@@ -704,37 +723,41 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
704 723
705EXPORT_SYMBOL(fasync_helper); 724EXPORT_SYMBOL(fasync_helper);
706 725
707void __kill_fasync(struct fasync_struct *fa, int sig, int band) 726/*
727 * rcu_read_lock() is held
728 */
729static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
708{ 730{
709 while (fa) { 731 while (fa) {
710 struct fown_struct * fown; 732 struct fown_struct *fown;
711 if (fa->magic != FASYNC_MAGIC) { 733 if (fa->magic != FASYNC_MAGIC) {
712 printk(KERN_ERR "kill_fasync: bad magic number in " 734 printk(KERN_ERR "kill_fasync: bad magic number in "
713 "fasync_struct!\n"); 735 "fasync_struct!\n");
714 return; 736 return;
715 } 737 }
716 fown = &fa->fa_file->f_owner; 738 spin_lock(&fa->fa_lock);
717 /* Don't send SIGURG to processes which have not set a 739 if (fa->fa_file) {
718 queued signum: SIGURG has its own default signalling 740 fown = &fa->fa_file->f_owner;
719 mechanism. */ 741 /* Don't send SIGURG to processes which have not set a
720 if (!(sig == SIGURG && fown->signum == 0)) 742 queued signum: SIGURG has its own default signalling
721 send_sigio(fown, fa->fa_fd, band); 743 mechanism. */
722 fa = fa->fa_next; 744 if (!(sig == SIGURG && fown->signum == 0))
745 send_sigio(fown, fa->fa_fd, band);
746 }
747 spin_unlock(&fa->fa_lock);
748 fa = rcu_dereference(fa->fa_next);
723 } 749 }
724} 750}
725 751
726EXPORT_SYMBOL(__kill_fasync);
727
728void kill_fasync(struct fasync_struct **fp, int sig, int band) 752void kill_fasync(struct fasync_struct **fp, int sig, int band)
729{ 753{
730 /* First a quick test without locking: usually 754 /* First a quick test without locking: usually
731 * the list is empty. 755 * the list is empty.
732 */ 756 */
733 if (*fp) { 757 if (*fp) {
734 read_lock(&fasync_lock); 758 rcu_read_lock();
735 /* reread *fp after obtaining the lock */ 759 kill_fasync_rcu(rcu_dereference(*fp), sig, band);
736 __kill_fasync(*fp, sig, band); 760 rcu_read_unlock();
737 read_unlock(&fasync_lock);
738 } 761 }
739} 762}
740EXPORT_SYMBOL(kill_fasync); 763EXPORT_SYMBOL(kill_fasync);
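
The fcntl.c changes above turn fasync_lock into a plain spinlock for writers and let kill_fasync() walk the list under RCU, deferring frees with call_rcu() so readers never race a kmem_cache_free(). A minimal sketch of that pattern on a hypothetical node type (the names are illustrative, not the kernel's fasync code):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct node {
	struct node *next;
	struct rcu_head rcu;
};

static DEFINE_SPINLOCK(list_lock);

static void node_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct node, rcu));
}

/* Writer: unlink under the spinlock, defer the free past all readers. */
static void node_remove(struct node **list, struct node *victim)
{
	struct node **pp;

	spin_lock(&list_lock);
	for (pp = list; *pp; pp = &(*pp)->next) {
		if (*pp == victim) {
			*pp = victim->next;
			call_rcu(&victim->rcu, node_free_rcu);
			break;
		}
	}
	spin_unlock(&list_lock);
}

/* Reader: lock-free traversal, valid within the rcu_read_lock() window. */
static void node_walk(struct node *list)
{
	struct node *p;

	rcu_read_lock();
	for (p = rcu_dereference(list); p; p = rcu_dereference(p->next))
		;	/* act on *p; it cannot be freed until we unlock */
	rcu_read_unlock();
}

The real patch additionally gives each entry a per-entry fa_lock guarding fa_fd and fa_file against concurrent senders; the sketch omits that for brevity.
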
diff --git a/fs/fifo.c b/fs/fifo.c
index f8f97b8b6d44..5d6606ffc2d2 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/sched.h> 14#include <linux/sched.h>
16#include <linux/pipe_fs_i.h> 15#include <linux/pipe_fs_i.h>
diff --git a/fs/file_table.c b/fs/file_table.c
index 32d12b78bac8..5c7d10ead4ad 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -194,14 +194,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
194} 194}
195EXPORT_SYMBOL(alloc_file); 195EXPORT_SYMBOL(alloc_file);
196 196
197void fput(struct file *file)
198{
199 if (atomic_long_dec_and_test(&file->f_count))
200 __fput(file);
201}
202
203EXPORT_SYMBOL(fput);
204
205/** 197/**
206 * drop_file_write_access - give up ability to write to a file 198 * drop_file_write_access - give up ability to write to a file
207 * @file: the file to which we will stop writing 199 * @file: the file to which we will stop writing
@@ -227,10 +219,9 @@ void drop_file_write_access(struct file *file)
227} 219}
228EXPORT_SYMBOL_GPL(drop_file_write_access); 220EXPORT_SYMBOL_GPL(drop_file_write_access);
229 221
230/* __fput is called from task context when aio completion releases the last 222/* the real guts of fput() - releasing the last reference to file
231 * use of a struct file *. Do not use otherwise.
232 */ 223 */
233void __fput(struct file *file) 224static void __fput(struct file *file)
234{ 225{
235 struct dentry *dentry = file->f_path.dentry; 226 struct dentry *dentry = file->f_path.dentry;
236 struct vfsmount *mnt = file->f_path.mnt; 227 struct vfsmount *mnt = file->f_path.mnt;
@@ -268,6 +259,14 @@ void __fput(struct file *file)
268 mntput(mnt); 259 mntput(mnt);
269} 260}
270 261
262void fput(struct file *file)
263{
264 if (atomic_long_dec_and_test(&file->f_count))
265 __fput(file);
266}
267
268EXPORT_SYMBOL(fput);
269
271struct file *fget(unsigned int fd) 270struct file *fget(unsigned int fd)
272{ 271{
273 struct file *file; 272 struct file *file;
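
With __fput() now static and defined just ahead of its only caller, the release path is the classic dec-and-test idiom. A hedged sketch on a hypothetical object (not the VFS code itself):

#include <asm/atomic.h>
#include <linux/slab.h>

struct obj {
	atomic_long_t count;
};

static void obj_release(struct obj *o)
{
	kfree(o);	/* runs exactly once, on the final put */
}

static void obj_put(struct obj *o)
{
	/* Only the thread that drops the count to zero sees "true",
	 * so the release path needs no further locking. */
	if (atomic_long_dec_and_test(&o->count))
		obj_release(o);
}
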
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e181db..68ba492d8eef 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -10,10 +10,10 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/kmod.h> 13#include <linux/kmod.h>
15#include <linux/init.h> 14#include <linux/init.h>
16#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18 18
19/* 19/*
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
57}; 57};
58 58
59const struct file_operations vxfs_dir_operations = { 59const struct file_operations vxfs_dir_operations = {
60 .llseek = generic_file_llseek,
61 .read = generic_read_dir,
60 .readdir = vxfs_readdir, 62 .readdir = vxfs_readdir,
61}; 63};
62 64
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index ed8f0b0dd880..1429f3ae1e86 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -33,7 +33,6 @@
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/buffer_head.h> 34#include <linux/buffer_head.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/pagemap.h> 36#include <linux/pagemap.h>
38 37
39#include "vxfs_extern.h" 38#include "vxfs_extern.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76fc4d594acb..ea8592b90696 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/slab.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -41,9 +42,10 @@ struct wb_writeback_args {
41 long nr_pages; 42 long nr_pages;
42 struct super_block *sb; 43 struct super_block *sb;
43 enum writeback_sync_modes sync_mode; 44 enum writeback_sync_modes sync_mode;
44 int for_kupdate:1; 45 unsigned int for_kupdate:1;
45 int range_cyclic:1; 46 unsigned int range_cyclic:1;
46 int for_background:1; 47 unsigned int for_background:1;
48 unsigned int sb_pinned:1;
47}; 49};
48 50
49/* 51/*
@@ -191,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
191} 193}
192 194
193static void bdi_alloc_queue_work(struct backing_dev_info *bdi, 195static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
194 struct wb_writeback_args *args) 196 struct wb_writeback_args *args,
197 int wait)
195{ 198{
196 struct bdi_work *work; 199 struct bdi_work *work;
197 200
@@ -203,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
203 if (work) { 206 if (work) {
204 bdi_work_init(work, args); 207 bdi_work_init(work, args);
205 bdi_queue_work(bdi, work); 208 bdi_queue_work(bdi, work);
209 if (wait)
210 bdi_wait_on_work_clear(work);
206 } else { 211 } else {
207 struct bdi_writeback *wb = &bdi->wb; 212 struct bdi_writeback *wb = &bdi->wb;
208 213
@@ -229,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
229 .sync_mode = WB_SYNC_ALL, 234 .sync_mode = WB_SYNC_ALL,
230 .nr_pages = LONG_MAX, 235 .nr_pages = LONG_MAX,
231 .range_cyclic = 0, 236 .range_cyclic = 0,
237 /*
238 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
238 * let's make it explicitly clear.
240 */
241 .sb_pinned = 1,
232 }; 242 };
233 struct bdi_work work; 243 struct bdi_work work;
234 244
@@ -244,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
244 * @bdi: the backing device to write from 254 * @bdi: the backing device to write from
245 * @sb: write inodes from this super_block 255 * @sb: write inodes from this super_block
246 * @nr_pages: the number of pages to write 256 * @nr_pages: the number of pages to write
257 * @sb_locked: caller already holds sb umount sem.
247 * 258 *
248 * Description: 259 * Description:
249 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 260 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
250 * started when this function returns, we make no guarantees on 261 * started when this function returns, we make no guarantees on
251 * completion. Caller need not hold sb s_umount semaphore. 262 * completion. Caller specifies whether sb umount sem is held already or not.
252 * 263 *
253 */ 264 */
254void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 265void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
255 long nr_pages) 266 long nr_pages, int sb_locked)
256{ 267{
257 struct wb_writeback_args args = { 268 struct wb_writeback_args args = {
258 .sb = sb, 269 .sb = sb,
259 .sync_mode = WB_SYNC_NONE, 270 .sync_mode = WB_SYNC_NONE,
260 .nr_pages = nr_pages, 271 .nr_pages = nr_pages,
261 .range_cyclic = 1, 272 .range_cyclic = 1,
273 .sb_pinned = sb_locked,
262 }; 274 };
263 275
264 /* 276 /*
@@ -270,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
270 args.for_background = 1; 282 args.for_background = 1;
271 } 283 }
272 284
273 bdi_alloc_queue_work(bdi, &args); 285 bdi_alloc_queue_work(bdi, &args, sb_locked);
274} 286}
275 287
276/* 288/*
@@ -397,11 +409,11 @@ static void inode_wait_for_writeback(struct inode *inode)
397 wait_queue_head_t *wqh; 409 wait_queue_head_t *wqh;
398 410
399 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 411 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
400 do { 412 while (inode->i_state & I_SYNC) {
401 spin_unlock(&inode_lock); 413 spin_unlock(&inode_lock);
402 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 414 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
403 spin_lock(&inode_lock); 415 spin_lock(&inode_lock);
404 } while (inode->i_state & I_SYNC); 416 }
405} 417}
406 418
407/* 419/*
@@ -451,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
451 463
452 BUG_ON(inode->i_state & I_SYNC); 464 BUG_ON(inode->i_state & I_SYNC);
453 465
454 /* Set I_SYNC, reset I_DIRTY */ 466 /* Set I_SYNC, reset I_DIRTY_PAGES */
455 dirty = inode->i_state & I_DIRTY;
456 inode->i_state |= I_SYNC; 467 inode->i_state |= I_SYNC;
457 inode->i_state &= ~I_DIRTY; 468 inode->i_state &= ~I_DIRTY_PAGES;
458
459 spin_unlock(&inode_lock); 469 spin_unlock(&inode_lock);
460 470
461 ret = do_writepages(mapping, wbc); 471 ret = do_writepages(mapping, wbc);
@@ -471,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
471 ret = err; 481 ret = err;
472 } 482 }
473 483
484 /*
485 * Some filesystems may redirty the inode during the writeback
486 * due to delalloc; clear dirty metadata flags right before
487 * write_inode()
488 */
489 spin_lock(&inode_lock);
490 dirty = inode->i_state & I_DIRTY;
491 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
492 spin_unlock(&inode_lock);
474 /* Don't write the inode if only I_DIRTY_PAGES was set */ 493 /* Don't write the inode if only I_DIRTY_PAGES was set */
475 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 494 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
476 int err = write_inode(inode, wbc); 495 int err = write_inode(inode, wbc);
@@ -553,108 +572,85 @@ select_queue:
553 return ret; 572 return ret;
554} 573}
555 574
556static void unpin_sb_for_writeback(struct super_block **psb) 575static void unpin_sb_for_writeback(struct super_block *sb)
557{ 576{
558 struct super_block *sb = *psb; 577 up_read(&sb->s_umount);
559 578 put_super(sb);
560 if (sb) {
561 up_read(&sb->s_umount);
562 put_super(sb);
563 *psb = NULL;
564 }
565} 579}
566 580
581enum sb_pin_state {
582 SB_PINNED,
583 SB_NOT_PINNED,
584 SB_PIN_FAILED
585};
586
567/* 587/*
568 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned 588 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
569 * before calling writeback. So make sure that we do pin it, so it doesn't 589 * before calling writeback. So make sure that we do pin it, so it doesn't
570 * go away while we are writing inodes from it. 590 * go away while we are writing inodes from it.
571 *
572 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
573 * 1 if we failed.
574 */ 591 */
575static int pin_sb_for_writeback(struct writeback_control *wbc, 592static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
576 struct inode *inode, struct super_block **psb) 593 struct super_block *sb)
577{ 594{
578 struct super_block *sb = inode->i_sb;
579
580 /*
581 * If this sb is already pinned, nothing more to do. If not and
582 * *psb is non-NULL, unpin the old one first
583 */
584 if (sb == *psb)
585 return 0;
586 else if (*psb)
587 unpin_sb_for_writeback(psb);
588
589 /* 595 /*
590 * Caller must already hold the ref for this 596 * Caller must already hold the ref for this
591 */ 597 */
592 if (wbc->sync_mode == WB_SYNC_ALL) { 598 if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
593 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 599 WARN_ON(!rwsem_is_locked(&sb->s_umount));
594 return 0; 600 return SB_NOT_PINNED;
595 } 601 }
596
597 spin_lock(&sb_lock); 602 spin_lock(&sb_lock);
598 sb->s_count++; 603 sb->s_count++;
599 if (down_read_trylock(&sb->s_umount)) { 604 if (down_read_trylock(&sb->s_umount)) {
600 if (sb->s_root) { 605 if (sb->s_root) {
601 spin_unlock(&sb_lock); 606 spin_unlock(&sb_lock);
602 goto pinned; 607 return SB_PINNED;
603 } 608 }
604 /* 609 /*
605 * umounted, drop rwsem again and fall through to failure 610 * umounted, drop rwsem again and fall through to failure
606 */ 611 */
607 up_read(&sb->s_umount); 612 up_read(&sb->s_umount);
608 } 613 }
609
610 sb->s_count--; 614 sb->s_count--;
611 spin_unlock(&sb_lock); 615 spin_unlock(&sb_lock);
612 return 1; 616 return SB_PIN_FAILED;
613pinned:
614 *psb = sb;
615 return 0;
616} 617}
617 618
618static void writeback_inodes_wb(struct bdi_writeback *wb, 619/*
619 struct writeback_control *wbc) 620 * Write a portion of b_io inodes which belong to @sb.
621 * If @wbc->sb != NULL, then find and write all such
622 * inodes. Otherwise write only ones which go sequentially
623 * in reverse order.
624 * Return 1 if the caller's writeback routine should be
625 * interrupted. Otherwise return 0.
626 */
627static int writeback_sb_inodes(struct super_block *sb,
628 struct bdi_writeback *wb,
629 struct writeback_control *wbc)
620{ 630{
621 struct super_block *sb = wbc->sb, *pin_sb = NULL;
622 const unsigned long start = jiffies; /* livelock avoidance */
623
624 spin_lock(&inode_lock);
625
626 if (!wbc->for_kupdate || list_empty(&wb->b_io))
627 queue_io(wb, wbc->older_than_this);
628
629 while (!list_empty(&wb->b_io)) { 631 while (!list_empty(&wb->b_io)) {
630 struct inode *inode = list_entry(wb->b_io.prev,
631 struct inode, i_list);
632 long pages_skipped; 632 long pages_skipped;
633 633 struct inode *inode = list_entry(wb->b_io.prev,
634 /* 634 struct inode, i_list);
635 * super block given and doesn't match, skip this inode 635 if (wbc->sb && sb != inode->i_sb) {
636 */ 636 /* super block given and doesn't
637 if (sb && sb != inode->i_sb) { 637 match, skip this inode */
638 redirty_tail(inode); 638 redirty_tail(inode);
639 continue; 639 continue;
640 } 640 }
641 641 if (sb != inode->i_sb)
642 /* finish with this superblock */
643 return 0;
642 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 644 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
643 requeue_io(inode); 645 requeue_io(inode);
644 continue; 646 continue;
645 } 647 }
646
647 /* 648 /*
648 * Was this inode dirtied after sync_sb_inodes was called? 649 * Was this inode dirtied after sync_sb_inodes was called?
649 * This keeps sync from extra jobs and livelock. 650 * This keeps sync from extra jobs and livelock.
650 */ 651 */
651 if (inode_dirtied_after(inode, start)) 652 if (inode_dirtied_after(inode, wbc->wb_start))
652 break; 653 return 1;
653
654 if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
655 requeue_io(inode);
656 continue;
657 }
658 654
659 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 655 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
660 __iget(inode); 656 __iget(inode);
@@ -673,14 +669,50 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
673 spin_lock(&inode_lock); 669 spin_lock(&inode_lock);
674 if (wbc->nr_to_write <= 0) { 670 if (wbc->nr_to_write <= 0) {
675 wbc->more_io = 1; 671 wbc->more_io = 1;
676 break; 672 return 1;
677 } 673 }
678 if (!list_empty(&wb->b_more_io)) 674 if (!list_empty(&wb->b_more_io))
679 wbc->more_io = 1; 675 wbc->more_io = 1;
680 } 676 }
677 /* b_io is empty */
678 return 1;
679}
680
681static void writeback_inodes_wb(struct bdi_writeback *wb,
682 struct writeback_control *wbc)
683{
684 int ret = 0;
685
686 wbc->wb_start = jiffies; /* livelock avoidance */
687 spin_lock(&inode_lock);
688 if (!wbc->for_kupdate || list_empty(&wb->b_io))
689 queue_io(wb, wbc->older_than_this);
681 690
682 unpin_sb_for_writeback(&pin_sb); 691 while (!list_empty(&wb->b_io)) {
692 struct inode *inode = list_entry(wb->b_io.prev,
693 struct inode, i_list);
694 struct super_block *sb = inode->i_sb;
695 enum sb_pin_state state;
683 696
697 if (wbc->sb && sb != wbc->sb) {
698 /* super block given and doesn't
699 match, skip this inode */
700 redirty_tail(inode);
701 continue;
702 }
703 state = pin_sb_for_writeback(wbc, sb);
704
705 if (state == SB_PIN_FAILED) {
706 requeue_io(inode);
707 continue;
708 }
709 ret = writeback_sb_inodes(sb, wb, wbc);
710
711 if (state == SB_PINNED)
712 unpin_sb_for_writeback(sb);
713 if (ret)
714 break;
715 }
684 spin_unlock(&inode_lock); 716 spin_unlock(&inode_lock);
685 /* Leave any unwritten inodes on b_io */ 717 /* Leave any unwritten inodes on b_io */
686} 718}
@@ -737,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb,
737 .for_kupdate = args->for_kupdate, 769 .for_kupdate = args->for_kupdate,
738 .for_background = args->for_background, 770 .for_background = args->for_background,
739 .range_cyclic = args->range_cyclic, 771 .range_cyclic = args->range_cyclic,
772 .sb_pinned = args->sb_pinned,
740 }; 773 };
741 unsigned long oldest_jif; 774 unsigned long oldest_jif;
742 long wrote = 0; 775 long wrote = 0;
@@ -838,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
838 unsigned long expired; 871 unsigned long expired;
839 long nr_pages; 872 long nr_pages;
840 873
874 /*
875 * When set to zero, disable periodic writeback
876 */
877 if (!dirty_writeback_interval)
878 return 0;
879
841 expired = wb->last_old_flush + 880 expired = wb->last_old_flush +
842 msecs_to_jiffies(dirty_writeback_interval * 10); 881 msecs_to_jiffies(dirty_writeback_interval * 10);
843 if (time_before(jiffies, expired)) 882 if (time_before(jiffies, expired))
@@ -873,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
873 912
874 while ((work = get_next_work_item(bdi, wb)) != NULL) { 913 while ((work = get_next_work_item(bdi, wb)) != NULL) {
875 struct wb_writeback_args args = work->args; 914 struct wb_writeback_args args = work->args;
915 int post_clear;
876 916
877 /* 917 /*
878 * Override sync mode, in case we must wait for completion 918 * Override sync mode, in case we must wait for completion
@@ -880,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
880 if (force_wait) 920 if (force_wait)
881 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; 921 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
882 922
923 post_clear = args.sync_mode == WB_SYNC_ALL || args.sb_pinned;
924
883 /* 925 /*
884 * If this isn't a data integrity operation, just notify 926 * If this isn't a data integrity operation, just notify
885 * that we have seen this work and we are now starting it. 927 * that we have seen this work and we are now starting it.
886 */ 928 */
887 if (args.sync_mode == WB_SYNC_NONE) 929 if (!post_clear)
888 wb_clear_pending(wb, work); 930 wb_clear_pending(wb, work);
889 931
890 wrote += wb_writeback(wb, &args); 932 wrote += wb_writeback(wb, &args);
@@ -893,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
893 * This is a data integrity writeback, so only do the 935 * This is a data integrity writeback, so only do the
894 * notification when we have completed the work. 936 * notification when we have completed the work.
895 */ 937 */
896 if (args.sync_mode == WB_SYNC_ALL) 938 if (post_clear)
897 wb_clear_pending(wb, work); 939 wb_clear_pending(wb, work);
898 } 940 }
899 941
@@ -933,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
933 break; 975 break;
934 } 976 }
935 977
936 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); 978 if (dirty_writeback_interval) {
937 schedule_timeout_interruptible(wait_jiffies); 979 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
980 schedule_timeout_interruptible(wait_jiffies);
981 } else {
982 set_current_state(TASK_INTERRUPTIBLE);
983 if (list_empty_careful(&wb->bdi->work_list) &&
984 !kthread_should_stop())
985 schedule();
986 __set_current_state(TASK_RUNNING);
987 }
988
938 try_to_freeze(); 989 try_to_freeze();
939 } 990 }
940 991
@@ -960,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
960 if (!bdi_has_dirty_io(bdi)) 1011 if (!bdi_has_dirty_io(bdi))
961 continue; 1012 continue;
962 1013
963 bdi_alloc_queue_work(bdi, &args); 1014 bdi_alloc_queue_work(bdi, &args, 0);
964 } 1015 }
965 1016
966 rcu_read_unlock(); 1017 rcu_read_unlock();
@@ -1169,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb)
1169 iput(old_inode); 1220 iput(old_inode);
1170} 1221}
1171 1222
1223static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
1224{
1225 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1226 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1227 long nr_to_write;
1228
1229 nr_to_write = nr_dirty + nr_unstable +
1230 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1231
1232 bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
1233}
1234
1172/** 1235/**
1173 * writeback_inodes_sb - writeback dirty inodes from given super_block 1236 * writeback_inodes_sb - writeback dirty inodes from given super_block
1174 * @sb: the superblock 1237 * @sb: the superblock
@@ -1180,18 +1243,23 @@ static void wait_sb_inodes(struct super_block *sb)
1180 */ 1243 */
1181void writeback_inodes_sb(struct super_block *sb) 1244void writeback_inodes_sb(struct super_block *sb)
1182{ 1245{
1183 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1246 __writeback_inodes_sb(sb, 0);
1184 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1185 long nr_to_write;
1186
1187 nr_to_write = nr_dirty + nr_unstable +
1188 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1189
1190 bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
1191} 1247}
1192EXPORT_SYMBOL(writeback_inodes_sb); 1248EXPORT_SYMBOL(writeback_inodes_sb);
1193 1249
1194/** 1250/**
1251 * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
1252 * @sb: the superblock
1253 *
1254 * Like writeback_inodes_sb(), except the caller already holds the
1255 * sb umount sem.
1256 */
1257void writeback_inodes_sb_locked(struct super_block *sb)
1258{
1259 __writeback_inodes_sb(sb, 1);
1260}
1261
1262/**
1195 * writeback_inodes_sb_if_idle - start writeback if none underway 1263 * writeback_inodes_sb_if_idle - start writeback if none underway
1196 * @sb: the superblock 1264 * @sb: the superblock
1197 * 1265 *
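
writeback_inodes_sb_locked() exists for callers that already hold s_umount; the sb_pinned flag it sets lets pin_sb_for_writeback() skip re-taking the sem. A sketch of a hypothetical caller, assuming it takes the sem for reading first (examplefs is illustrative, not part of the patch):

#include <linux/fs.h>
#include <linux/writeback.h>

static void examplefs_flush(struct super_block *sb)
{
	down_read(&sb->s_umount);
	if (sb->s_root)
		writeback_inodes_sb_locked(sb);	/* sb already pinned */
	up_read(&sb->s_umount);
}

Calling plain writeback_inodes_sb() here could deadlock on the umount sem, which is the case this patch addresses.
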
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 3221a0c7944e..4a8eb31c5338 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -12,6 +12,7 @@
12#define FSCACHE_DEBUG_LEVEL COOKIE 12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/slab.h>
15#include <linux/key.h> 16#include <linux/key.h>
16#include <keys/user-type.h> 17#include <keys/user-type.h>
17#include "internal.h" 18#include "internal.h"
@@ -102,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
102 /* banners (can't represent line 0 by pos 0 as that would involve 103 /* banners (can't represent line 0 by pos 0 as that would involve
103 * returning a NULL pointer) */ 104 * returning a NULL pointer) */
104 if (pos == 0) 105 if (pos == 0)
105 return (struct fscache_object *) ++(*_pos); 106 return (struct fscache_object *)(long)++(*_pos);
106 if (pos < 3) 107 if (pos < 3)
107 return (struct fscache_object *)pos; 108 return (struct fscache_object *)pos;
108 109
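
The object-list fix quiets a 64-bit-int-to-pointer warning: ++(*_pos) yields a loff_t, and converting it to a pointer must pass through a pointer-sized integer first. The idiom in isolation (helper name is hypothetical):

#include <linux/types.h>

static inline void *token_from_pos(loff_t pos)
{
	/* Truncate through a pointer-sized integer to avoid gcc's
	 * "cast to pointer from integer of different size" warning. */
	return (void *)(unsigned long)pos;
}
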
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
53static void fscache_object_slow_work_put_ref(struct slow_work *); 53static void fscache_object_slow_work_put_ref(struct slow_work *);
54static int fscache_object_slow_work_get_ref(struct slow_work *); 54static int fscache_object_slow_work_get_ref(struct slow_work *);
55static void fscache_object_slow_work_execute(struct slow_work *); 55static void fscache_object_slow_work_execute(struct slow_work *);
56#ifdef CONFIG_SLOW_WORK_PROC 56#ifdef CONFIG_SLOW_WORK_DEBUG
57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); 57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
58#endif 58#endif
59static void fscache_initialise_object(struct fscache_object *); 59static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
69 .get_ref = fscache_object_slow_work_get_ref, 69 .get_ref = fscache_object_slow_work_get_ref,
70 .put_ref = fscache_object_slow_work_put_ref, 70 .put_ref = fscache_object_slow_work_put_ref,
71 .execute = fscache_object_slow_work_execute, 71 .execute = fscache_object_slow_work_execute,
72#ifdef CONFIG_SLOW_WORK_PROC 72#ifdef CONFIG_SLOW_WORK_DEBUG
73 .desc = fscache_object_slow_work_desc, 73 .desc = fscache_object_slow_work_desc,
74#endif 74#endif
75}; 75};
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
364/* 364/*
365 * describe an object for slow-work debugging 365 * describe an object for slow-work debugging
366 */ 366 */
367#ifdef CONFIG_SLOW_WORK_PROC 367#ifdef CONFIG_SLOW_WORK_DEBUG
368static void fscache_object_slow_work_desc(struct slow_work *work, 368static void fscache_object_slow_work_desc(struct slow_work *work,
369 struct seq_file *m) 369 struct seq_file *m)
370{ 370{
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..f17cecafae44 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -14,6 +14,7 @@
14#define FSCACHE_DEBUG_LEVEL OPERATION 14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19atomic_t fscache_op_debug_id; 20atomic_t fscache_op_debug_id;
@@ -500,7 +501,7 @@ static void fscache_op_execute(struct slow_work *work)
500/* 501/*
501 * describe an operation for slow-work debugging 502 * describe an operation for slow-work debugging
502 */ 503 */
503#ifdef CONFIG_SLOW_WORK_PROC 504#ifdef CONFIG_SLOW_WORK_DEBUG
504static void fscache_op_desc(struct slow_work *work, struct seq_file *m) 505static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
505{ 506{
506 struct fscache_operation *op = 507 struct fscache_operation *op =
@@ -517,7 +518,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
517 .get_ref = fscache_op_get_ref, 518 .get_ref = fscache_op_get_ref,
518 .put_ref = fscache_op_put_ref, 519 .put_ref = fscache_op_put_ref,
519 .execute = fscache_op_execute, 520 .execute = fscache_op_execute,
520#ifdef CONFIG_SLOW_WORK_PROC 521#ifdef CONFIG_SLOW_WORK_DEBUG
521 .desc = fscache_op_desc, 522 .desc = fscache_op_desc,
522#endif 523#endif
523}; 524};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..47aefd376e54 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -14,6 +14,7 @@
14#include <linux/fscache-cache.h> 14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19/* 20/*
@@ -881,6 +882,7 @@ submit_failed:
881 goto nobufs; 882 goto nobufs;
882 883
883nobufs_unlock_obj: 884nobufs_unlock_obj:
885 spin_unlock(&cookie->stores_lock);
884 spin_unlock(&object->lock); 886 spin_unlock(&object->lock);
885nobufs: 887nobufs:
886 spin_unlock(&cookie->lock); 888 spin_unlock(&cookie->lock);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 46435f3aae68..4765190d537f 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -165,8 +165,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
165 atomic_read(&fscache_n_object_lookups), 165 atomic_read(&fscache_n_object_lookups),
166 atomic_read(&fscache_n_object_lookups_negative), 166 atomic_read(&fscache_n_object_lookups_negative),
167 atomic_read(&fscache_n_object_lookups_positive), 167 atomic_read(&fscache_n_object_lookups_positive),
168 atomic_read(&fscache_n_object_lookups_timed_out), 168 atomic_read(&fscache_n_object_created),
169 atomic_read(&fscache_n_object_created)); 169 atomic_read(&fscache_n_object_lookups_timed_out));
170 170
171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n", 171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
172 atomic_read(&fscache_n_updates), 172 atomic_read(&fscache_n_updates),
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de792dcf3274..e1f8171278bd 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -44,6 +44,7 @@
44#include <linux/magic.h> 44#include <linux/magic.h>
45#include <linux/miscdevice.h> 45#include <linux/miscdevice.h>
46#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/slab.h>
47#include <linux/spinlock.h> 48#include <linux/spinlock.h>
48#include <linux/stat.h> 49#include <linux/stat.h>
49 50
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index eb7e9423691f..9424796d6634 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,8 +16,12 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/file.h> 17#include <linux/file.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/pipe_fs_i.h>
20#include <linux/swap.h>
21#include <linux/splice.h>
19 22
20MODULE_ALIAS_MISCDEV(FUSE_MINOR); 23MODULE_ALIAS_MISCDEV(FUSE_MINOR);
24MODULE_ALIAS("devname:fuse");
21 25
22static struct kmem_cache *fuse_req_cachep; 26static struct kmem_cache *fuse_req_cachep;
23 27
@@ -498,6 +502,9 @@ struct fuse_copy_state {
498 int write; 502 int write;
499 struct fuse_req *req; 503 struct fuse_req *req;
500 const struct iovec *iov; 504 const struct iovec *iov;
505 struct pipe_buffer *pipebufs;
506 struct pipe_buffer *currbuf;
507 struct pipe_inode_info *pipe;
501 unsigned long nr_segs; 508 unsigned long nr_segs;
502 unsigned long seglen; 509 unsigned long seglen;
503 unsigned long addr; 510 unsigned long addr;
@@ -505,16 +512,16 @@ struct fuse_copy_state {
505 void *mapaddr; 512 void *mapaddr;
506 void *buf; 513 void *buf;
507 unsigned len; 514 unsigned len;
515 unsigned move_pages:1;
508}; 516};
509 517
510static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, 518static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
511 int write, struct fuse_req *req, 519 int write,
512 const struct iovec *iov, unsigned long nr_segs) 520 const struct iovec *iov, unsigned long nr_segs)
513{ 521{
514 memset(cs, 0, sizeof(*cs)); 522 memset(cs, 0, sizeof(*cs));
515 cs->fc = fc; 523 cs->fc = fc;
516 cs->write = write; 524 cs->write = write;
517 cs->req = req;
518 cs->iov = iov; 525 cs->iov = iov;
519 cs->nr_segs = nr_segs; 526 cs->nr_segs = nr_segs;
520} 527}
@@ -522,7 +529,18 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
522/* Unmap and put previous page of userspace buffer */ 529/* Unmap and put previous page of userspace buffer */
523static void fuse_copy_finish(struct fuse_copy_state *cs) 530static void fuse_copy_finish(struct fuse_copy_state *cs)
524{ 531{
525 if (cs->mapaddr) { 532 if (cs->currbuf) {
533 struct pipe_buffer *buf = cs->currbuf;
534
535 if (!cs->write) {
536 buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
537 } else {
538 kunmap_atomic(cs->mapaddr, KM_USER0);
539 buf->len = PAGE_SIZE - cs->len;
540 }
541 cs->currbuf = NULL;
542 cs->mapaddr = NULL;
543 } else if (cs->mapaddr) {
526 kunmap_atomic(cs->mapaddr, KM_USER0); 544 kunmap_atomic(cs->mapaddr, KM_USER0);
527 if (cs->write) { 545 if (cs->write) {
528 flush_dcache_page(cs->pg); 546 flush_dcache_page(cs->pg);
@@ -544,26 +562,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
544 562
545 unlock_request(cs->fc, cs->req); 563 unlock_request(cs->fc, cs->req);
546 fuse_copy_finish(cs); 564 fuse_copy_finish(cs);
547 if (!cs->seglen) { 565 if (cs->pipebufs) {
548 BUG_ON(!cs->nr_segs); 566 struct pipe_buffer *buf = cs->pipebufs;
549 cs->seglen = cs->iov[0].iov_len; 567
550 cs->addr = (unsigned long) cs->iov[0].iov_base; 568 if (!cs->write) {
551 cs->iov++; 569 err = buf->ops->confirm(cs->pipe, buf);
552 cs->nr_segs--; 570 if (err)
571 return err;
572
573 BUG_ON(!cs->nr_segs);
574 cs->currbuf = buf;
575 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
576 cs->len = buf->len;
577 cs->buf = cs->mapaddr + buf->offset;
578 cs->pipebufs++;
579 cs->nr_segs--;
580 } else {
581 struct page *page;
582
583 if (cs->nr_segs == cs->pipe->buffers)
584 return -EIO;
585
586 page = alloc_page(GFP_HIGHUSER);
587 if (!page)
588 return -ENOMEM;
589
590 buf->page = page;
591 buf->offset = 0;
592 buf->len = 0;
593
594 cs->currbuf = buf;
595 cs->mapaddr = kmap_atomic(page, KM_USER0);
596 cs->buf = cs->mapaddr;
597 cs->len = PAGE_SIZE;
598 cs->pipebufs++;
599 cs->nr_segs++;
600 }
601 } else {
602 if (!cs->seglen) {
603 BUG_ON(!cs->nr_segs);
604 cs->seglen = cs->iov[0].iov_len;
605 cs->addr = (unsigned long) cs->iov[0].iov_base;
606 cs->iov++;
607 cs->nr_segs--;
608 }
609 err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
610 if (err < 0)
611 return err;
612 BUG_ON(err != 1);
613 offset = cs->addr % PAGE_SIZE;
614 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
615 cs->buf = cs->mapaddr + offset;
616 cs->len = min(PAGE_SIZE - offset, cs->seglen);
617 cs->seglen -= cs->len;
618 cs->addr += cs->len;
553 } 619 }
554 down_read(&current->mm->mmap_sem);
555 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
556 &cs->pg, NULL);
557 up_read(&current->mm->mmap_sem);
558 if (err < 0)
559 return err;
560 BUG_ON(err != 1);
561 offset = cs->addr % PAGE_SIZE;
562 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
563 cs->buf = cs->mapaddr + offset;
564 cs->len = min(PAGE_SIZE - offset, cs->seglen);
565 cs->seglen -= cs->len;
566 cs->addr += cs->len;
567 620
568 return lock_request(cs->fc, cs->req); 621 return lock_request(cs->fc, cs->req);
569} 622}
@@ -585,23 +638,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
585 return ncpy; 638 return ncpy;
586} 639}
587 640
641static int fuse_check_page(struct page *page)
642{
643 if (page_mapcount(page) ||
644 page->mapping != NULL ||
645 page_count(page) != 1 ||
646 (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
647 ~(1 << PG_locked |
648 1 << PG_referenced |
649 1 << PG_uptodate |
650 1 << PG_lru |
651 1 << PG_active |
652 1 << PG_reclaim))) {
653 printk(KERN_WARNING "fuse: trying to steal weird page\n");
654 printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
655 return 1;
656 }
657 return 0;
658}
659
660static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
661{
662 int err;
663 struct page *oldpage = *pagep;
664 struct page *newpage;
665 struct pipe_buffer *buf = cs->pipebufs;
666 struct address_space *mapping;
667 pgoff_t index;
668
669 unlock_request(cs->fc, cs->req);
670 fuse_copy_finish(cs);
671
672 err = buf->ops->confirm(cs->pipe, buf);
673 if (err)
674 return err;
675
676 BUG_ON(!cs->nr_segs);
677 cs->currbuf = buf;
678 cs->len = buf->len;
679 cs->pipebufs++;
680 cs->nr_segs--;
681
682 if (cs->len != PAGE_SIZE)
683 goto out_fallback;
684
685 if (buf->ops->steal(cs->pipe, buf) != 0)
686 goto out_fallback;
687
688 newpage = buf->page;
689
690 if (WARN_ON(!PageUptodate(newpage)))
691 return -EIO;
692
693 ClearPageMappedToDisk(newpage);
694
695 if (fuse_check_page(newpage) != 0)
696 goto out_fallback_unlock;
697
698 mapping = oldpage->mapping;
699 index = oldpage->index;
700
701 /*
702 * This is a new and locked page, it shouldn't be mapped or
703 * have any special flags on it
704 */
705 if (WARN_ON(page_mapped(oldpage)))
706 goto out_fallback_unlock;
707 if (WARN_ON(page_has_private(oldpage)))
708 goto out_fallback_unlock;
709 if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
710 goto out_fallback_unlock;
711 if (WARN_ON(PageMlocked(oldpage)))
712 goto out_fallback_unlock;
713
714 remove_from_page_cache(oldpage);
715 page_cache_release(oldpage);
716
717 err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
718 if (err) {
719 printk(KERN_WARNING "fuse_try_move_page: failed to add page");
720 goto out_fallback_unlock;
721 }
722 page_cache_get(newpage);
723
724 if (!(buf->flags & PIPE_BUF_FLAG_LRU))
725 lru_cache_add_file(newpage);
726
727 err = 0;
728 spin_lock(&cs->fc->lock);
729 if (cs->req->aborted)
730 err = -ENOENT;
731 else
732 *pagep = newpage;
733 spin_unlock(&cs->fc->lock);
734
735 if (err) {
736 unlock_page(newpage);
737 page_cache_release(newpage);
738 return err;
739 }
740
741 unlock_page(oldpage);
742 page_cache_release(oldpage);
743 cs->len = 0;
744
745 return 0;
746
747out_fallback_unlock:
748 unlock_page(newpage);
749out_fallback:
750 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
751 cs->buf = cs->mapaddr + buf->offset;
752
753 err = lock_request(cs->fc, cs->req);
754 if (err)
755 return err;
756
757 return 1;
758}
759
760static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
761 unsigned offset, unsigned count)
762{
763 struct pipe_buffer *buf;
764
765 if (cs->nr_segs == cs->pipe->buffers)
766 return -EIO;
767
768 unlock_request(cs->fc, cs->req);
769 fuse_copy_finish(cs);
770
771 buf = cs->pipebufs;
772 page_cache_get(page);
773 buf->page = page;
774 buf->offset = offset;
775 buf->len = count;
776
777 cs->pipebufs++;
778 cs->nr_segs++;
779 cs->len = 0;
780
781 return 0;
782}
783
588/* 784/*
589 * Copy a page in the request to/from the userspace buffer. Must be 785 * Copy a page in the request to/from the userspace buffer. Must be
590 * done atomically 786 * done atomically
591 */ 787 */
592static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, 788static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
593 unsigned offset, unsigned count, int zeroing) 789 unsigned offset, unsigned count, int zeroing)
594{ 790{
791 int err;
792 struct page *page = *pagep;
793
595 if (page && zeroing && count < PAGE_SIZE) { 794 if (page && zeroing && count < PAGE_SIZE) {
596 void *mapaddr = kmap_atomic(page, KM_USER1); 795 void *mapaddr = kmap_atomic(page, KM_USER1);
597 memset(mapaddr, 0, PAGE_SIZE); 796 memset(mapaddr, 0, PAGE_SIZE);
598 kunmap_atomic(mapaddr, KM_USER1); 797 kunmap_atomic(mapaddr, KM_USER1);
599 } 798 }
600 while (count) { 799 while (count) {
601 if (!cs->len) { 800 if (cs->write && cs->pipebufs && page) {
602 int err = fuse_copy_fill(cs); 801 return fuse_ref_page(cs, page, offset, count);
603 if (err) 802 } else if (!cs->len) {
604 return err; 803 if (cs->move_pages && page &&
804 offset == 0 && count == PAGE_SIZE) {
805 err = fuse_try_move_page(cs, pagep);
806 if (err <= 0)
807 return err;
808 } else {
809 err = fuse_copy_fill(cs);
810 if (err)
811 return err;
812 }
605 } 813 }
606 if (page) { 814 if (page) {
607 void *mapaddr = kmap_atomic(page, KM_USER1); 815 void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -626,8 +834,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
626 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); 834 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
627 835
628 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { 836 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
629 struct page *page = req->pages[i]; 837 int err;
630 int err = fuse_copy_page(cs, page, offset, count, zeroing); 838
839 err = fuse_copy_page(cs, &req->pages[i], offset, count,
840 zeroing);
631 if (err) 841 if (err)
632 return err; 842 return err;
633 843
@@ -704,11 +914,10 @@ __acquires(&fc->lock)
704 * 914 *
705 * Called with fc->lock held, releases it 915 * Called with fc->lock held, releases it
706 */ 916 */
707static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, 917static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
708 const struct iovec *iov, unsigned long nr_segs) 918 size_t nbytes, struct fuse_req *req)
709__releases(&fc->lock) 919__releases(&fc->lock)
710{ 920{
711 struct fuse_copy_state cs;
712 struct fuse_in_header ih; 921 struct fuse_in_header ih;
713 struct fuse_interrupt_in arg; 922 struct fuse_interrupt_in arg;
714 unsigned reqsize = sizeof(ih) + sizeof(arg); 923 unsigned reqsize = sizeof(ih) + sizeof(arg);
@@ -724,14 +933,13 @@ __releases(&fc->lock)
724 arg.unique = req->in.h.unique; 933 arg.unique = req->in.h.unique;
725 934
726 spin_unlock(&fc->lock); 935 spin_unlock(&fc->lock);
727 if (iov_length(iov, nr_segs) < reqsize) 936 if (nbytes < reqsize)
728 return -EINVAL; 937 return -EINVAL;
729 938
730 fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs); 939 err = fuse_copy_one(cs, &ih, sizeof(ih));
731 err = fuse_copy_one(&cs, &ih, sizeof(ih));
732 if (!err) 940 if (!err)
733 err = fuse_copy_one(&cs, &arg, sizeof(arg)); 941 err = fuse_copy_one(cs, &arg, sizeof(arg));
734 fuse_copy_finish(&cs); 942 fuse_copy_finish(cs);
735 943
736 return err ? err : reqsize; 944 return err ? err : reqsize;
737} 945}
@@ -745,18 +953,13 @@ __releases(&fc->lock)
745 * request_end(). Otherwise add it to the processing list, and set 953 * request_end(). Otherwise add it to the processing list, and set
746 * the 'sent' flag. 954 * the 'sent' flag.
747 */ 955 */
748static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, 956static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
749 unsigned long nr_segs, loff_t pos) 957 struct fuse_copy_state *cs, size_t nbytes)
750{ 958{
751 int err; 959 int err;
752 struct fuse_req *req; 960 struct fuse_req *req;
753 struct fuse_in *in; 961 struct fuse_in *in;
754 struct fuse_copy_state cs;
755 unsigned reqsize; 962 unsigned reqsize;
756 struct file *file = iocb->ki_filp;
757 struct fuse_conn *fc = fuse_get_conn(file);
758 if (!fc)
759 return -EPERM;
760 963
761 restart: 964 restart:
762 spin_lock(&fc->lock); 965 spin_lock(&fc->lock);
@@ -776,7 +979,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
776 if (!list_empty(&fc->interrupts)) { 979 if (!list_empty(&fc->interrupts)) {
777 req = list_entry(fc->interrupts.next, struct fuse_req, 980 req = list_entry(fc->interrupts.next, struct fuse_req,
778 intr_entry); 981 intr_entry);
779 return fuse_read_interrupt(fc, req, iov, nr_segs); 982 return fuse_read_interrupt(fc, cs, nbytes, req);
780 } 983 }
781 984
782 req = list_entry(fc->pending.next, struct fuse_req, list); 985 req = list_entry(fc->pending.next, struct fuse_req, list);
@@ -786,7 +989,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
786 in = &req->in; 989 in = &req->in;
787 reqsize = in->h.len; 990 reqsize = in->h.len;
788 /* If request is too large, reply with an error and restart the read */ 991 /* If request is too large, reply with an error and restart the read */
789 if (iov_length(iov, nr_segs) < reqsize) { 992 if (nbytes < reqsize) {
790 req->out.h.error = -EIO; 993 req->out.h.error = -EIO;
791 /* SETXATTR is special, since it may contain too large data */ 994 /* SETXATTR is special, since it may contain too large data */
792 if (in->h.opcode == FUSE_SETXATTR) 995 if (in->h.opcode == FUSE_SETXATTR)
@@ -795,12 +998,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
795 goto restart; 998 goto restart;
796 } 999 }
797 spin_unlock(&fc->lock); 1000 spin_unlock(&fc->lock);
798 fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); 1001 cs->req = req;
799 err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); 1002 err = fuse_copy_one(cs, &in->h, sizeof(in->h));
800 if (!err) 1003 if (!err)
801 err = fuse_copy_args(&cs, in->numargs, in->argpages, 1004 err = fuse_copy_args(cs, in->numargs, in->argpages,
802 (struct fuse_arg *) in->args, 0); 1005 (struct fuse_arg *) in->args, 0);
803 fuse_copy_finish(&cs); 1006 fuse_copy_finish(cs);
804 spin_lock(&fc->lock); 1007 spin_lock(&fc->lock);
805 req->locked = 0; 1008 req->locked = 0;
806 if (req->aborted) { 1009 if (req->aborted) {
@@ -828,6 +1031,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
828 return err; 1031 return err;
829} 1032}
830 1033
1034static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1035 unsigned long nr_segs, loff_t pos)
1036{
1037 struct fuse_copy_state cs;
1038 struct file *file = iocb->ki_filp;
1039 struct fuse_conn *fc = fuse_get_conn(file);
1040 if (!fc)
1041 return -EPERM;
1042
1043 fuse_copy_init(&cs, fc, 1, iov, nr_segs);
1044
1045 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
1046}
1047
1048static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
1049 struct pipe_buffer *buf)
1050{
1051 return 1;
1052}
1053
1054static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
1055 .can_merge = 0,
1056 .map = generic_pipe_buf_map,
1057 .unmap = generic_pipe_buf_unmap,
1058 .confirm = generic_pipe_buf_confirm,
1059 .release = generic_pipe_buf_release,
1060 .steal = fuse_dev_pipe_buf_steal,
1061 .get = generic_pipe_buf_get,
1062};
1063
1064static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1065 struct pipe_inode_info *pipe,
1066 size_t len, unsigned int flags)
1067{
1068 int ret;
1069 int page_nr = 0;
1070 int do_wakeup = 0;
1071 struct pipe_buffer *bufs;
1072 struct fuse_copy_state cs;
1073 struct fuse_conn *fc = fuse_get_conn(in);
1074 if (!fc)
1075 return -EPERM;
1076
1077 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1078 if (!bufs)
1079 return -ENOMEM;
1080
1081 fuse_copy_init(&cs, fc, 1, NULL, 0);
1082 cs.pipebufs = bufs;
1083 cs.pipe = pipe;
1084 ret = fuse_dev_do_read(fc, in, &cs, len);
1085 if (ret < 0)
1086 goto out;
1087
1088 ret = 0;
1089 pipe_lock(pipe);
1090
1091 if (!pipe->readers) {
1092 send_sig(SIGPIPE, current, 0);
1093 if (!ret)
1094 ret = -EPIPE;
1095 goto out_unlock;
1096 }
1097
1098 if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
1099 ret = -EIO;
1100 goto out_unlock;
1101 }
1102
1103 while (page_nr < cs.nr_segs) {
1104 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
1105 struct pipe_buffer *buf = pipe->bufs + newbuf;
1106
1107 buf->page = bufs[page_nr].page;
1108 buf->offset = bufs[page_nr].offset;
1109 buf->len = bufs[page_nr].len;
1110 buf->ops = &fuse_dev_pipe_buf_ops;
1111
1112 pipe->nrbufs++;
1113 page_nr++;
1114 ret += buf->len;
1115
1116 if (pipe->inode)
1117 do_wakeup = 1;
1118 }
1119
1120out_unlock:
1121 pipe_unlock(pipe);
1122
1123 if (do_wakeup) {
1124 smp_mb();
1125 if (waitqueue_active(&pipe->wait))
1126 wake_up_interruptible(&pipe->wait);
1127 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1128 }
1129
1130out:
1131 for (; page_nr < cs.nr_segs; page_nr++)
1132 page_cache_release(bufs[page_nr].page);
1133
1134 kfree(bufs);
1135 return ret;
1136}
1137
831static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, 1138static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
832 struct fuse_copy_state *cs) 1139 struct fuse_copy_state *cs)
833{ 1140{
@@ -987,23 +1294,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
987 * it from the list and copy the rest of the buffer to the request. 1294 * it from the list and copy the rest of the buffer to the request.
988 * The request is finished by calling request_end() 1295 * The request is finished by calling request_end()
989 */ 1296 */
990static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, 1297static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
991 unsigned long nr_segs, loff_t pos) 1298 struct fuse_copy_state *cs, size_t nbytes)
992{ 1299{
993 int err; 1300 int err;
994 size_t nbytes = iov_length(iov, nr_segs);
995 struct fuse_req *req; 1301 struct fuse_req *req;
996 struct fuse_out_header oh; 1302 struct fuse_out_header oh;
997 struct fuse_copy_state cs;
998 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
999 if (!fc)
1000 return -EPERM;
1001 1303
1002 fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
1003 if (nbytes < sizeof(struct fuse_out_header)) 1304 if (nbytes < sizeof(struct fuse_out_header))
1004 return -EINVAL; 1305 return -EINVAL;
1005 1306
1006 err = fuse_copy_one(&cs, &oh, sizeof(oh)); 1307 err = fuse_copy_one(cs, &oh, sizeof(oh));
1007 if (err) 1308 if (err)
1008 goto err_finish; 1309 goto err_finish;
1009 1310
@@ -1016,7 +1317,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1016 * and error contains notification code. 1317 * and error contains notification code.
1017 */ 1318 */
1018 if (!oh.unique) { 1319 if (!oh.unique) {
1019 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); 1320 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
1020 return err ? err : nbytes; 1321 return err ? err : nbytes;
1021 } 1322 }
1022 1323
@@ -1035,7 +1336,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1035 1336
1036 if (req->aborted) { 1337 if (req->aborted) {
1037 spin_unlock(&fc->lock); 1338 spin_unlock(&fc->lock);
1038 fuse_copy_finish(&cs); 1339 fuse_copy_finish(cs);
1039 spin_lock(&fc->lock); 1340 spin_lock(&fc->lock);
1040 request_end(fc, req); 1341 request_end(fc, req);
1041 return -ENOENT; 1342 return -ENOENT;
@@ -1052,7 +1353,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1052 queue_interrupt(fc, req); 1353 queue_interrupt(fc, req);
1053 1354
1054 spin_unlock(&fc->lock); 1355 spin_unlock(&fc->lock);
1055 fuse_copy_finish(&cs); 1356 fuse_copy_finish(cs);
1056 return nbytes; 1357 return nbytes;
1057 } 1358 }
1058 1359
@@ -1060,11 +1361,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1060 list_move(&req->list, &fc->io); 1361 list_move(&req->list, &fc->io);
1061 req->out.h = oh; 1362 req->out.h = oh;
1062 req->locked = 1; 1363 req->locked = 1;
1063 cs.req = req; 1364 cs->req = req;
1365 if (!req->out.page_replace)
1366 cs->move_pages = 0;
1064 spin_unlock(&fc->lock); 1367 spin_unlock(&fc->lock);
1065 1368
1066 err = copy_out_args(&cs, &req->out, nbytes); 1369 err = copy_out_args(cs, &req->out, nbytes);
1067 fuse_copy_finish(&cs); 1370 fuse_copy_finish(cs);
1068 1371
1069 spin_lock(&fc->lock); 1372 spin_lock(&fc->lock);
1070 req->locked = 0; 1373 req->locked = 0;
@@ -1080,10 +1383,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1080 err_unlock: 1383 err_unlock:
1081 spin_unlock(&fc->lock); 1384 spin_unlock(&fc->lock);
1082 err_finish: 1385 err_finish:
1083 fuse_copy_finish(&cs); 1386 fuse_copy_finish(cs);
1084 return err; 1387 return err;
1085} 1388}
1086 1389
1390static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1391 unsigned long nr_segs, loff_t pos)
1392{
1393 struct fuse_copy_state cs;
1394 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
1395 if (!fc)
1396 return -EPERM;
1397
1398 fuse_copy_init(&cs, fc, 0, iov, nr_segs);
1399
1400 return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
1401}
1402
1403static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1404 struct file *out, loff_t *ppos,
1405 size_t len, unsigned int flags)
1406{
1407 unsigned nbuf;
1408 unsigned idx;
1409 struct pipe_buffer *bufs;
1410 struct fuse_copy_state cs;
1411 struct fuse_conn *fc;
1412 size_t rem;
1413 ssize_t ret;
1414
1415 fc = fuse_get_conn(out);
1416 if (!fc)
1417 return -EPERM;
1418
1419 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1420 if (!bufs)
1421 return -ENOMEM;
1422
1423 pipe_lock(pipe);
1424 nbuf = 0;
1425 rem = 0;
1426 for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
1427 rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
1428
1429 ret = -EINVAL;
1430 if (rem < len) {
1431 pipe_unlock(pipe);
1432 goto out;
1433 }
1434
1435 rem = len;
1436 while (rem) {
1437 struct pipe_buffer *ibuf;
1438 struct pipe_buffer *obuf;
1439
1440 BUG_ON(nbuf >= pipe->buffers);
1441 BUG_ON(!pipe->nrbufs);
1442 ibuf = &pipe->bufs[pipe->curbuf];
1443 obuf = &bufs[nbuf];
1444
1445 if (rem >= ibuf->len) {
1446 *obuf = *ibuf;
1447 ibuf->ops = NULL;
1448 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
1449 pipe->nrbufs--;
1450 } else {
1451 ibuf->ops->get(pipe, ibuf);
1452 *obuf = *ibuf;
1453 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1454 obuf->len = rem;
1455 ibuf->offset += obuf->len;
1456 ibuf->len -= obuf->len;
1457 }
1458 nbuf++;
1459 rem -= obuf->len;
1460 }
1461 pipe_unlock(pipe);
1462
1463 fuse_copy_init(&cs, fc, 0, NULL, nbuf);
1464 cs.pipebufs = bufs;
1465 cs.pipe = pipe;
1466
1467 if (flags & SPLICE_F_MOVE)
1468 cs.move_pages = 1;
1469
1470 ret = fuse_dev_do_write(fc, &cs, len);
1471
1472 for (idx = 0; idx < nbuf; idx++) {
1473 struct pipe_buffer *buf = &bufs[idx];
1474 buf->ops->release(pipe, buf);
1475 }
1476out:
1477 kfree(bufs);
1478 return ret;
1479}
1480
1087static unsigned fuse_dev_poll(struct file *file, poll_table *wait) 1481static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
1088{ 1482{
1089 unsigned mask = POLLOUT | POLLWRNORM; 1483 unsigned mask = POLLOUT | POLLWRNORM;
@@ -1225,8 +1619,10 @@ const struct file_operations fuse_dev_operations = {
1225 .llseek = no_llseek, 1619 .llseek = no_llseek,
1226 .read = do_sync_read, 1620 .read = do_sync_read,
1227 .aio_read = fuse_dev_read, 1621 .aio_read = fuse_dev_read,
1622 .splice_read = fuse_dev_splice_read,
1228 .write = do_sync_write, 1623 .write = do_sync_write,
1229 .aio_write = fuse_dev_write, 1624 .aio_write = fuse_dev_write,
1625 .splice_write = fuse_dev_splice_write,
1230 .poll = fuse_dev_poll, 1626 .poll = fuse_dev_poll,
1231 .release = fuse_dev_release, 1627 .release = fuse_dev_release,
1232 .fasync = fuse_dev_fasync, 1628 .fasync = fuse_dev_fasync,
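
With splice_read and splice_write wired up, a FUSE server can shuttle raw requests and replies through a pipe without copying them through a userspace buffer. A hypothetical userspace fragment (the fd names are assumptions, not part of the patch):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* 'fuse_fd' is assumed to be an open, mounted /dev/fuse descriptor
 * and 'pipe_wr' the write end of a pipe. */
static ssize_t pull_request(int fuse_fd, int pipe_wr, size_t max)
{
	/* SPLICE_F_MOVE lets the kernel hand over page references
	 * instead of copying the request payload. */
	return splice(fuse_fd, NULL, pipe_wr, NULL, max, SPLICE_F_MOVE);
}

On the reply side, splicing from a pipe back into /dev/fuse with SPLICE_F_MOVE is what triggers the page-stealing path (fuse_try_move_page) shown above.
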
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4787ae6c5c1c..3cdc5f78a406 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
1156 return 0; 1156 return 0;
1157} 1157}
1158 1158
1159static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync) 1159static int fuse_dir_fsync(struct file *file, int datasync)
1160{ 1160{
1161 /* nfsd can call this with no file */ 1161 return fuse_fsync_common(file, datasync, 1);
1162 return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
1163} 1162}
1164 1163
1165static bool update_mtime(unsigned ivalid) 1164static bool update_mtime(unsigned ivalid)
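
This hunk adapts fuse to the VFS ->fsync() prototype that drops the dentry argument; the inode now comes from file->f_mapping->host. A sketch of the shape a filesystem's method takes after the change (the examplefs names are hypothetical):

#include <linux/fs.h>

/* Stand-in for a real metadata writeback helper. */
static int examplefs_sync_metadata(struct inode *inode)
{
	return 0;
}

static int examplefs_fsync(struct file *file, int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err = 0;

	/* datasync means callers only need the data on stable storage;
	 * pure metadata updates may be skipped. */
	if (!datasync)
		err = examplefs_sync_metadata(inode);
	return err;
}
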
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e137f1d3..ada0adeb3bb5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode)
351 fuse_release_nowrite(inode); 351 fuse_release_nowrite(inode);
352} 352}
353 353
354int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 354int fuse_fsync_common(struct file *file, int datasync, int isdir)
355 int isdir)
356{ 355{
357 struct inode *inode = de->d_inode; 356 struct inode *inode = file->f_mapping->host;
358 struct fuse_conn *fc = get_fuse_conn(inode); 357 struct fuse_conn *fc = get_fuse_conn(inode);
359 struct fuse_file *ff = file->private_data; 358 struct fuse_file *ff = file->private_data;
360 struct fuse_req *req; 359 struct fuse_req *req;
@@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
403 return err; 402 return err;
404} 403}
405 404
406static int fuse_fsync(struct file *file, struct dentry *de, int datasync) 405static int fuse_fsync(struct file *file, int datasync)
407{ 406{
408 return fuse_fsync_common(file, de, datasync, 0); 407 return fuse_fsync_common(file, datasync, 0);
409} 408}
410 409
411void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, 410void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
517 int i; 516 int i;
518 size_t count = req->misc.read.in.size; 517 size_t count = req->misc.read.in.size;
519 size_t num_read = req->out.args[0].size; 518 size_t num_read = req->out.args[0].size;
520 struct inode *inode = req->pages[0]->mapping->host; 519 struct address_space *mapping = NULL;
521 520
522 /* 521 for (i = 0; mapping == NULL && i < req->num_pages; i++)
523 * Short read means EOF. If file size is larger, truncate it 522 mapping = req->pages[i]->mapping;
524 */
525 if (!req->out.h.error && num_read < count) {
526 loff_t pos = page_offset(req->pages[0]) + num_read;
527 fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
528 }
529 523
530 fuse_invalidate_attr(inode); /* atime changed */ 524 if (mapping) {
525 struct inode *inode = mapping->host;
526
527 /*
528 * Short read means EOF. If file size is larger, truncate it
529 */
530 if (!req->out.h.error && num_read < count) {
531 loff_t pos;
532
533 pos = page_offset(req->pages[0]) + num_read;
534 fuse_read_update_size(inode, pos,
535 req->misc.read.attr_ver);
536 }
537 fuse_invalidate_attr(inode); /* atime changed */
538 }
531 539
532 for (i = 0; i < req->num_pages; i++) { 540 for (i = 0; i < req->num_pages; i++) {
533 struct page *page = req->pages[i]; 541 struct page *page = req->pages[i];
@@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
536 else 544 else
537 SetPageError(page); 545 SetPageError(page);
538 unlock_page(page); 546 unlock_page(page);
547 page_cache_release(page);
539 } 548 }
540 if (req->ff) 549 if (req->ff)
541 fuse_file_put(req->ff); 550 fuse_file_put(req->ff);
@@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
550 559
551 req->out.argpages = 1; 560 req->out.argpages = 1;
552 req->out.page_zeroing = 1; 561 req->out.page_zeroing = 1;
562 req->out.page_replace = 1;
553 fuse_read_fill(req, file, pos, count, FUSE_READ); 563 fuse_read_fill(req, file, pos, count, FUSE_READ);
554 req->misc.read.attr_ver = fuse_get_attr_version(fc); 564 req->misc.read.attr_ver = fuse_get_attr_version(fc);
555 if (fc->async_read) { 565 if (fc->async_read) {
@@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
589 return PTR_ERR(req); 599 return PTR_ERR(req);
590 } 600 }
591 } 601 }
602 page_cache_get(page);
592 req->pages[req->num_pages] = page; 603 req->pages[req->num_pages] = page;
593 req->num_pages++; 604 req->num_pages++;
594 return 0; 605 return 0;
@@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
994 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); 1005 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
995 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 1006 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
996 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); 1007 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
997 down_read(&current->mm->mmap_sem); 1008 npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
998 npages = get_user_pages(current, current->mm, user_addr, npages, !write,
999 0, req->pages, NULL);
1000 up_read(&current->mm->mmap_sem);
1001 if (npages < 0) 1009 if (npages < 0)
1002 return npages; 1010 return npages;
1003 1011
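The pinning change above is a mechanical substitution; a hedged summary of how the arguments map (signatures as of this kernel generation):

/*
 *   down_read(&current->mm->mmap_sem);
 *   get_user_pages(current, current->mm, start, nr, write, 0, pages, NULL);
 *   up_read(&current->mm->mmap_sem);
 *
 * collapses to:
 *
 *   get_user_pages_fast(start, nr, write, pages);
 *
 * tsk/mm default to current, force is 0, the vmas argument is gone,
 * and mmap_sem is only taken internally if the lockless fast path
 * cannot pin all of the pages.
 */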
@@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1580 while (iov_iter_count(&ii)) { 1588 while (iov_iter_count(&ii)) {
1581 struct page *page = pages[page_idx++]; 1589 struct page *page = pages[page_idx++];
1582 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); 1590 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1583 void *kaddr, *map; 1591 void *kaddr;
1584 1592
1585 kaddr = map = kmap(page); 1593 kaddr = kmap(page);
1586 1594
1587 while (todo) { 1595 while (todo) {
1588 char __user *uaddr = ii.iov->iov_base + ii.iov_offset; 1596 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462ff45d..8f309f04064e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -177,6 +177,9 @@ struct fuse_out {
177 /** Zero partially or not copied pages */ 177 /** Zero partially or not copied pages */
178 unsigned page_zeroing:1; 178 unsigned page_zeroing:1;
179 179
180 /** Pages may be replaced with new ones */
181 unsigned page_replace:1;
182
180 /** Number of arguments */ 183

181 unsigned numargs; 184 unsigned numargs;
182 185
@@ -568,8 +571,7 @@ void fuse_release_common(struct file *file, int opcode);
568/** 571/**
569 * Send FSYNC or FSYNCDIR request 572 * Send FSYNC or FSYNCDIR request
570 */ 573 */
571int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 574int fuse_fsync_common(struct file *file, int datasync, int isdir);
572 int isdir);
573 575
574/** 576/**
575 * Notify poll wakeup 577 * Notify poll wakeup
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 55458031e501..99800e564157 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/gfp.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/generic_acl.h> 12#include <linux/generic_acl.h>
12#include <linux/posix_acl.h> 13#include <linux/posix_acl.h>
@@ -200,7 +201,7 @@ generic_check_acl(struct inode *inode, int mask)
200 return -EAGAIN; 201 return -EAGAIN;
201} 202}
202 203
203struct xattr_handler generic_acl_access_handler = { 204const struct xattr_handler generic_acl_access_handler = {
204 .prefix = POSIX_ACL_XATTR_ACCESS, 205 .prefix = POSIX_ACL_XATTR_ACCESS,
205 .flags = ACL_TYPE_ACCESS, 206 .flags = ACL_TYPE_ACCESS,
206 .list = generic_acl_list, 207 .list = generic_acl_list,
@@ -208,7 +209,7 @@ struct xattr_handler generic_acl_access_handler = {
208 .set = generic_acl_set, 209 .set = generic_acl_set,
209}; 210};
210 211
211struct xattr_handler generic_acl_default_handler = { 212const struct xattr_handler generic_acl_default_handler = {
212 .prefix = POSIX_ACL_XATTR_DEFAULT, 213 .prefix = POSIX_ACL_XATTR_DEFAULT,
213 .flags = ACL_TYPE_DEFAULT, 214 .flags = ACL_TYPE_DEFAULT,
214 .list = generic_acl_list, 215 .list = generic_acl_list,
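The const-ification here (and in the gfs2 hunks below) lets xattr handler tables live in read-only data. A hedged sketch of the registration pattern, with my_xattr_handlers invented for illustration:

static const struct xattr_handler *my_xattr_handlers[] = {
	&generic_acl_access_handler,
	&generic_acl_default_handler,
	NULL,	/* terminator */
};

/* at mount time: sb->s_xattr = my_xattr_handlers; */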
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 87ee309d4c24..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
236 void *buffer, size_t size, int xtype) 236 void *buffer, size_t size, int xtype)
237{ 237{
238 struct inode *inode = dentry->d_inode; 238 struct inode *inode = dentry->d_inode;
239 struct gfs2_sbd *sdp = GFS2_SB(inode);
239 struct posix_acl *acl; 240 struct posix_acl *acl;
240 int type; 241 int type;
241 int error; 242 int error;
242 243
244 if (!sdp->sd_args.ar_posix_acl)
245 return -EOPNOTSUPP;
246
243 type = gfs2_acl_type(name); 247 type = gfs2_acl_type(name);
244 if (type < 0) 248 if (type < 0)
245 return type; 249 return type;
@@ -335,7 +339,7 @@ out:
335 return error; 339 return error;
336} 340}
337 341
338struct xattr_handler gfs2_xattr_system_handler = { 342const struct xattr_handler gfs2_xattr_system_handler = {
339 .prefix = XATTR_SYSTEM_PREFIX, 343 .prefix = XATTR_SYSTEM_PREFIX,
340 .flags = GFS2_EATYPE_SYS, 344 .flags = GFS2_EATYPE_SYS,
341 .get = gfs2_xattr_system_get, 345 .get = gfs2_xattr_system_get,
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 9306a2e6620c..b522b0cb39ea 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -19,6 +19,6 @@
19extern int gfs2_check_acl(struct inode *inode, int mask); 19extern int gfs2_check_acl(struct inode *inode, int mask);
20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); 20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); 21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
22extern struct xattr_handler gfs2_xattr_system_handler; 22extern const struct xattr_handler gfs2_xattr_system_handler;
23 23
24#endif /* __ACL_DOT_H__ */ 24#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0c1d0b82dcf1..9f8b52500d63 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -418,6 +418,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
418static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) 418static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
419{ 419{
420 struct buffer_head *dibh; 420 struct buffer_head *dibh;
421 u64 dsize = i_size_read(&ip->i_inode);
421 void *kaddr; 422 void *kaddr;
422 int error; 423 int error;
423 424
@@ -437,9 +438,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
437 return error; 438 return error;
438 439
439 kaddr = kmap_atomic(page, KM_USER0); 440 kaddr = kmap_atomic(page, KM_USER0);
440 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 441 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
441 ip->i_disksize); 442 dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
442 memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); 443 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
444 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
443 kunmap_atomic(kaddr, KM_USER0); 445 kunmap_atomic(kaddr, KM_USER0);
444 flush_dcache_page(page); 446 flush_dcache_page(page);
445 brelse(dibh); 447 brelse(dibh);
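A worked example of the clamp introduced above, assuming a 4096-byte block and the usual 232-byte on-disk dinode header (sizes illustrative): a stuffed inode can carry at most 4096 - 232 = 3864 bytes of data, so an i_size beyond that (e.g. from a corrupt dinode) must be clipped to dibh->b_size - sizeof(struct gfs2_dinode) before the memcpy, or the copy would read past the end of the buffer.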
@@ -698,8 +700,14 @@ out:
698 return 0; 700 return 0;
699 701
700 page_cache_release(page); 702 page_cache_release(page);
703
704 /*
705 * XXX(hch): the call below should probably be replaced with
706 * a call to the gfs2-specific truncate blocks helper to actually
707 * release disk blocks..
708 */
701 if (pos + len > ip->i_inode.i_size) 709 if (pos + len > ip->i_inode.i_size)
702 vmtruncate(&ip->i_inode, ip->i_inode.i_size); 710 simple_setsize(&ip->i_inode, ip->i_inode.i_size);
703out_endtrans: 711out_endtrans:
704 gfs2_trans_end(sdp); 712 gfs2_trans_end(sdp);
705out_trans_fail: 713out_trans_fail:
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 583e823307ae..4a48c0f4b402 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -72,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
72 71
73 if (!PageUptodate(page)) { 72 if (!PageUptodate(page)) {
74 void *kaddr = kmap(page); 73 void *kaddr = kmap(page);
74 u64 dsize = i_size_read(inode);
75
76 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
77 dsize = dibh->b_size - sizeof(struct gfs2_dinode);
75 78
76 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 79 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
77 ip->i_disksize); 80 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
78 memset(kaddr + ip->i_disksize, 0,
79 PAGE_CACHE_SIZE - ip->i_disksize);
80 kunmap(page); 81 kunmap(page);
81 82
82 SetPageUptodate(page); 83 SetPageUptodate(page);
@@ -1039,13 +1040,14 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1039 goto out; 1040 goto out;
1040 1041
1041 if (gfs2_is_stuffed(ip)) { 1042 if (gfs2_is_stuffed(ip)) {
1042 ip->i_disksize = size; 1043 u64 dsize = size + sizeof(struct gfs2_dinode);
1043 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1044 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1044 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1045 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1045 gfs2_dinode_out(ip, dibh->b_data); 1046 gfs2_dinode_out(ip, dibh->b_data);
1046 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); 1047 if (dsize > dibh->b_size)
1048 dsize = dibh->b_size;
1049 gfs2_buffer_clear_tail(dibh, dsize);
1047 error = 1; 1050 error = 1;
1048
1049 } else { 1051 } else {
1050 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 1052 if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
1051 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 1053 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 91beddadd388..bb7907bde3d8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 25fddc100f18..8295c5b5d4a9 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1475,7 +1475,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1475 inode = gfs2_inode_lookup(dir->i_sb, 1475 inode = gfs2_inode_lookup(dir->i_sb,
1476 be16_to_cpu(dent->de_type), 1476 be16_to_cpu(dent->de_type),
1477 be64_to_cpu(dent->de_inum.no_addr), 1477 be64_to_cpu(dent->de_inum.no_addr),
1478 be64_to_cpu(dent->de_inum.no_formal_ino), 0); 1478 be64_to_cpu(dent->de_inum.no_formal_ino));
1479 brelse(bh); 1479 brelse(bh);
1480 return inode; 1480 return inode;
1481 } 1481 }
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d15876e9aa26..dfe237a3f8ad 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -169,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
169 if (error) 168 if (error)
170 goto fail; 169 goto fail;
171 170
172 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); 171 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
173 if (IS_ERR(inode)) { 172 if (IS_ERR(inode)) {
174 error = PTR_ERR(inode); 173 error = PTR_ERR(inode);
175 goto fail; 174 goto fail;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e6dd2aec6f82..ed9a94f0ef15 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
218 if (error) 218 if (error)
219 goto out_drop_write; 219 goto out_drop_write;
220 220
221 error = -EACCES;
222 if (!is_owner_or_cap(inode))
223 goto out;
224
225 error = 0;
221 flags = ip->i_diskflags; 226 flags = ip->i_diskflags;
222 new_flags = (flags & ~mask) | (reqflags & mask); 227 new_flags = (flags & ~mask) | (reqflags & mask);
223 if ((new_flags ^ flags) == 0) 228 if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
275{ 280{
276 struct inode *inode = filp->f_path.dentry->d_inode; 281 struct inode *inode = filp->f_path.dentry->d_inode;
277 u32 fsflags, gfsflags; 282 u32 fsflags, gfsflags;
283
278 if (get_user(fsflags, ptr)) 284 if (get_user(fsflags, ptr))
279 return -EFAULT; 285 return -EFAULT;
286
280 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); 287 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
281 if (!S_ISDIR(inode->i_mode)) { 288 if (!S_ISDIR(inode->i_mode)) {
282 if (gfsflags & GFS2_DIF_INHERIT_JDATA) 289 if (gfsflags & GFS2_DIF_INHERIT_JDATA)
@@ -547,9 +554,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
547 * Returns: errno 554 * Returns: errno
548 */ 555 */
549 556
550static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) 557static int gfs2_fsync(struct file *file, int datasync)
551{ 558{
552 struct inode *inode = dentry->d_inode; 559 struct inode *inode = file->f_mapping->host;
553 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 560 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
554 int ret = 0; 561 int ret = 0;
555 562
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 454d4b4eb36b..ddcdbf493536 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -855,6 +855,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
855 gh->gh_flags = flags; 855 gh->gh_flags = flags;
856 gh->gh_iflags = 0; 856 gh->gh_iflags = 0;
857 gh->gh_ip = (unsigned long)__builtin_return_address(0); 857 gh->gh_ip = (unsigned long)__builtin_return_address(0);
858 if (gh->gh_owner_pid)
859 put_pid(gh->gh_owner_pid);
860 gh->gh_owner_pid = get_pid(task_pid(current));
858} 861}
859 862
860/** 863/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 38e3749d476c..49f97d3bb690 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 3aac46f6853e..b5d7363b22da 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -439,9 +439,6 @@ struct gfs2_args {
439struct gfs2_tune { 439struct gfs2_tune {
440 spinlock_t gt_spin; 440 spinlock_t gt_spin;
441 441
442 unsigned int gt_incore_log_blocks;
443 unsigned int gt_log_flush_secs;
444
445 unsigned int gt_logd_secs; 442 unsigned int gt_logd_secs;
446 443
447 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 444 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -462,6 +459,7 @@ enum {
462 SDF_SHUTDOWN = 2, 459 SDF_SHUTDOWN = 2,
463 SDF_NOBARRIERS = 3, 460 SDF_NOBARRIERS = 3,
464 SDF_NORECOVERY = 4, 461 SDF_NORECOVERY = 4,
462 SDF_DEMOTE = 5,
465}; 463};
466 464
467#define GFS2_FSNAME_LEN 256 465#define GFS2_FSNAME_LEN 256
@@ -618,6 +616,7 @@ struct gfs2_sbd {
618 unsigned int sd_log_commited_databuf; 616 unsigned int sd_log_commited_databuf;
619 int sd_log_commited_revoke; 617 int sd_log_commited_revoke;
620 618
619 atomic_t sd_log_pinned;
621 unsigned int sd_log_num_buf; 620 unsigned int sd_log_num_buf;
622 unsigned int sd_log_num_revoke; 621 unsigned int sd_log_num_revoke;
623 unsigned int sd_log_num_rg; 622 unsigned int sd_log_num_rg;
@@ -629,15 +628,17 @@ struct gfs2_sbd {
629 struct list_head sd_log_le_databuf; 628 struct list_head sd_log_le_databuf;
630 struct list_head sd_log_le_ordered; 629 struct list_head sd_log_le_ordered;
631 630
631 atomic_t sd_log_thresh1;
632 atomic_t sd_log_thresh2;
632 atomic_t sd_log_blks_free; 633 atomic_t sd_log_blks_free;
633 struct mutex sd_log_reserve_mutex; 634 wait_queue_head_t sd_log_waitq;
635 wait_queue_head_t sd_logd_waitq;
634 636
635 u64 sd_log_sequence; 637 u64 sd_log_sequence;
636 unsigned int sd_log_head; 638 unsigned int sd_log_head;
637 unsigned int sd_log_tail; 639 unsigned int sd_log_tail;
638 int sd_log_idle; 640 int sd_log_idle;
639 641
640 unsigned long sd_log_flush_time;
641 struct rw_semaphore sd_log_flush_lock; 642 struct rw_semaphore sd_log_flush_lock;
642 atomic_t sd_log_in_flight; 643 atomic_t sd_log_in_flight;
643 wait_queue_head_t sd_log_flush_wait; 644 wait_queue_head_t sd_log_flush_wait;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b1bf2694fb2b..b5612cbb62a5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode)
158 * @sb: The super block 158 * @sb: The super block
159 * @no_addr: The inode number 159 * @no_addr: The inode number
160 * @type: The type of the inode 160 * @type: The type of the inode
161 * @skip_freeing: set this not return an inode if it is currently being freed.
162 * 161 *
163 * Returns: A VFS inode, or an error 162 * Returns: A VFS inode, or an error
164 */ 163 */
@@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode)
166struct inode *gfs2_inode_lookup(struct super_block *sb, 165struct inode *gfs2_inode_lookup(struct super_block *sb,
167 unsigned int type, 166 unsigned int type,
168 u64 no_addr, 167 u64 no_addr,
169 u64 no_formal_ino, int skip_freeing) 168 u64 no_formal_ino)
170{ 169{
171 struct inode *inode; 170 struct inode *inode;
172 struct gfs2_inode *ip; 171 struct gfs2_inode *ip;
173 struct gfs2_glock *io_gl; 172 struct gfs2_glock *io_gl;
174 int error; 173 int error;
175 174
176 if (skip_freeing) 175 inode = gfs2_iget(sb, no_addr);
177 inode = gfs2_iget_skip(sb, no_addr);
178 else
179 inode = gfs2_iget(sb, no_addr);
180 ip = GFS2_I(inode); 176 ip = GFS2_I(inode);
181 177
182 if (!inode) 178 if (!inode)
@@ -234,11 +230,102 @@ fail_glock:
234fail_iopen: 230fail_iopen:
235 gfs2_glock_put(io_gl); 231 gfs2_glock_put(io_gl);
236fail_put: 232fail_put:
233 if (inode->i_state & I_NEW)
234 ip->i_gl->gl_object = NULL;
235 gfs2_glock_put(ip->i_gl);
236fail:
237 if (inode->i_state & I_NEW)
238 iget_failed(inode);
239 else
240 iput(inode);
241 return ERR_PTR(error);
242}
243
244/**
245 * gfs2_process_unlinked_inode - Look up an unlinked inode and try to
246 * reclaim it by doing iput.
247 *
248 * This function assumes no rgrp locks are currently held.
249 *
250 * @sb: The super block
251 * @no_addr: The inode number
252 *
253 */
254
255void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
256{
257 struct gfs2_sbd *sdp;
258 struct gfs2_inode *ip;
259 struct gfs2_glock *io_gl;
260 int error;
261 struct gfs2_holder gh;
262 struct inode *inode;
263
264 inode = gfs2_iget_skip(sb, no_addr);
265
266 if (!inode)
267 return;
268
269 /* If it's not a new inode, someone's using it, so leave it alone. */
270 if (!(inode->i_state & I_NEW)) {
271 iput(inode);
272 return;
273 }
274
275 ip = GFS2_I(inode);
276 sdp = GFS2_SB(inode);
277 ip->i_no_formal_ino = -1;
278
279 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
280 if (unlikely(error))
281 goto fail;
282 ip->i_gl->gl_object = ip;
283
284 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
285 if (unlikely(error))
286 goto fail_put;
287
288 set_bit(GIF_INVALID, &ip->i_flags);
289 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
290 &ip->i_iopen_gh);
291 if (unlikely(error))
292 goto fail_iopen;
293
294 ip->i_iopen_gh.gh_gl->gl_object = ip;
295 gfs2_glock_put(io_gl);
296
297 inode->i_mode = DT2IF(DT_UNKNOWN);
298
299 /*
300 * We must read the inode in order to work out its type in
301 * this case. Note that this doesn't happen often as we normally
302 * know the type beforehand. This code path only occurs during
303 * unlinked inode recovery (where it is safe to take this glock,
304 * which is not true in the general case).
305 */
306 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
307 &gh);
308 if (unlikely(error))
309 goto fail_glock;
310
311 /* Inode is now uptodate */
312 gfs2_glock_dq_uninit(&gh);
313 gfs2_set_iop(inode);
314
315 /* The iput will cause it to be deleted. */
316 iput(inode);
317 return;
318
319fail_glock:
320 gfs2_glock_dq(&ip->i_iopen_gh);
321fail_iopen:
322 gfs2_glock_put(io_gl);
323fail_put:
237 ip->i_gl->gl_object = NULL; 324 ip->i_gl->gl_object = NULL;
238 gfs2_glock_put(ip->i_gl); 325 gfs2_glock_put(ip->i_gl);
239fail: 326fail:
240 iget_failed(inode); 327 iget_failed(inode);
241 return ERR_PTR(error); 328 return;
242} 329}
243 330
244static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 331static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
@@ -862,7 +949,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
862 goto fail_gunlock2; 949 goto fail_gunlock2;
863 950
864 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, 951 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
865 inum.no_formal_ino, 0); 952 inum.no_formal_ino);
866 if (IS_ERR(inode)) 953 if (IS_ERR(inode))
867 goto fail_gunlock2; 954 goto fail_gunlock2;
868 955
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c341aaf67adb..300ada3f21de 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -83,8 +83,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
83 83
84extern void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
86 u64 no_addr, u64 no_formal_ino, 86 u64 no_addr, u64 no_formal_ino);
87 int skip_freeing); 87extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
89 89
90extern int gfs2_inode_refresh(struct gfs2_inode *ip); 90extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 569b46240f61..0e0470ed34c2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/dlm.h> 11#include <linux/dlm.h>
12#include <linux/slab.h>
12#include <linux/types.h> 13#include <linux/types.h>
13#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
14 15
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e5bf4b59d46e..6a857e24f947 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
168 return list_empty(&ai->ai_ail1_list); 168 return list_empty(&ai->ai_ail1_list);
169} 169}
170 170
171static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) 171static void gfs2_ail1_start(struct gfs2_sbd *sdp)
172{ 172{
173 struct list_head *head; 173 struct list_head *head;
174 u64 sync_gen; 174 u64 sync_gen;
175 struct list_head *first; 175 struct gfs2_ail *ai;
176 struct gfs2_ail *first_ai, *ai, *tmp;
177 int done = 0; 176 int done = 0;
178 177
179 gfs2_log_lock(sdp); 178 gfs2_log_lock(sdp);
@@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
184 } 183 }
185 sync_gen = sdp->sd_ail_sync_gen++; 184 sync_gen = sdp->sd_ail_sync_gen++;
186 185
187 first = head->prev;
188 first_ai = list_entry(first, struct gfs2_ail, ai_list);
189 first_ai->ai_sync_gen = sync_gen;
190 gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
191
192 if (flags & DIO_ALL)
193 first = NULL;
194
195 while(!done) { 186 while(!done) {
196 if (first && (head->prev != first ||
197 gfs2_ail1_empty_one(sdp, first_ai, 0)))
198 break;
199
200 done = 1; 187 done = 1;
201 list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) { 188 list_for_each_entry_reverse(ai, head, ai_list) {
202 if (ai->ai_sync_gen >= sync_gen) 189 if (ai->ai_sync_gen >= sync_gen)
203 continue; 190 continue;
204 ai->ai_sync_gen = sync_gen; 191 ai->ai_sync_gen = sync_gen;
@@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
290 * flush time, so we ensure that we have just enough free blocks at all 277 * flush time, so we ensure that we have just enough free blocks at all
291 * times to avoid running out during a log flush. 278 * times to avoid running out during a log flush.
292 * 279 *
280 * We no longer flush the log here, instead we wake up logd to do that
281 * for us. To avoid the thundering herd and to ensure that we deal fairly
282 * with queued waiters, we use an exclusive wait. This means that when we
283 * get woken with enough journal space to get our reservation, we need to
284 * wake the next waiter on the list.
285 *
293 * Returns: errno 286 * Returns: errno
294 */ 287 */
295 288
296int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) 289int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
297{ 290{
298 unsigned int try = 0;
299 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); 291 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
292 unsigned wanted = blks + reserved_blks;
293 DEFINE_WAIT(wait);
294 int did_wait = 0;
295 unsigned int free_blocks;
300 296
301 if (gfs2_assert_warn(sdp, blks) || 297 if (gfs2_assert_warn(sdp, blks) ||
302 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) 298 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
303 return -EINVAL; 299 return -EINVAL;
304 300retry:
305 mutex_lock(&sdp->sd_log_reserve_mutex); 301 free_blocks = atomic_read(&sdp->sd_log_blks_free);
306 gfs2_log_lock(sdp); 302 if (unlikely(free_blocks <= wanted)) {
307 while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) { 303 do {
308 gfs2_log_unlock(sdp); 304 prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
309 gfs2_ail1_empty(sdp, 0); 305 TASK_UNINTERRUPTIBLE);
310 gfs2_log_flush(sdp, NULL); 306 wake_up(&sdp->sd_logd_waitq);
311 307 did_wait = 1;
312 if (try++) 308 if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
313 gfs2_ail1_start(sdp, 0); 309 io_schedule();
314 gfs2_log_lock(sdp); 310 free_blocks = atomic_read(&sdp->sd_log_blks_free);
311 } while(free_blocks <= wanted);
312 finish_wait(&sdp->sd_log_waitq, &wait);
315 } 313 }
316 atomic_sub(blks, &sdp->sd_log_blks_free); 314 if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
315 free_blocks - blks) != free_blocks)
316 goto retry;
317 trace_gfs2_log_blocks(sdp, -blks); 317 trace_gfs2_log_blocks(sdp, -blks);
318 gfs2_log_unlock(sdp); 318
319 mutex_unlock(&sdp->sd_log_reserve_mutex); 319 /*
320 * If we waited, then so might others, wake them up _after_ we get
321 * our share of the log.
322 */
323 if (unlikely(did_wait))
324 wake_up(&sdp->sd_log_waitq);
320 325
321 down_read(&sdp->sd_log_flush_lock); 326 down_read(&sdp->sd_log_flush_lock);
322 327
323 return 0; 328 return 0;
324} 329}
325 330
326/**
327 * gfs2_log_release - Release a given number of log blocks
328 * @sdp: The GFS2 superblock
329 * @blks: The number of blocks
330 *
331 */
332
333void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
334{
335
336 gfs2_log_lock(sdp);
337 atomic_add(blks, &sdp->sd_log_blks_free);
338 trace_gfs2_log_blocks(sdp, blks);
339 gfs2_assert_withdraw(sdp,
340 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
341 gfs2_log_unlock(sdp);
342 up_read(&sdp->sd_log_flush_lock);
343}
344
345static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) 331static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
346{ 332{
347 struct gfs2_journal_extent *je; 333 struct gfs2_journal_extent *je;
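The reservation scheme above replaces a mutex-plus-flush loop with a lock-free grab and an exclusive wait. A hedged userspace analogue of the same shape, with all names invented; a real logd counterpart would update free_blocks and then signal q_cond under q_mtx whenever it frees journal space:

#include <pthread.h>
#include <stdatomic.h>

static atomic_uint free_blocks;                 /* ~sd_log_blks_free */
static pthread_mutex_t q_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t q_cond = PTHREAD_COND_INITIALIZER;

static void log_reserve(unsigned blks, unsigned reserved_blks)
{
	unsigned wanted = blks + reserved_blks;
	unsigned cur;
	int did_wait = 0;

retry:
	cur = atomic_load(&free_blocks);
	if (cur <= wanted) {
		/* Re-check the predicate under the mutex so a signal
		 * cannot slip in between the test and the wait. */
		pthread_mutex_lock(&q_mtx);
		while (atomic_load(&free_blocks) <= wanted) {
			did_wait = 1;
			pthread_cond_wait(&q_cond, &q_mtx);
		}
		pthread_mutex_unlock(&q_mtx);
		cur = atomic_load(&free_blocks);
	}
	/* Take our share without a lock; start over if we raced. */
	if (!atomic_compare_exchange_strong(&free_blocks, &cur, cur - blks))
		goto retry;
	/* We may have consumed a wakeup meant for the next waiter;
	 * pass it on, mirroring the wake_up() after the wait above. */
	if (did_wait)
		pthread_cond_signal(&q_cond);
}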
@@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
559 545
560 ail2_empty(sdp, new_tail); 546 ail2_empty(sdp, new_tail);
561 547
562 gfs2_log_lock(sdp);
563 atomic_add(dist, &sdp->sd_log_blks_free); 548 atomic_add(dist, &sdp->sd_log_blks_free);
564 trace_gfs2_log_blocks(sdp, dist); 549 trace_gfs2_log_blocks(sdp, dist);
565 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 550 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
566 gfs2_log_unlock(sdp); 551 sdp->sd_jdesc->jd_blocks);
567 552
568 sdp->sd_log_tail = new_tail; 553 sdp->sd_log_tail = new_tail;
569} 554}
@@ -615,6 +600,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
615 if (buffer_eopnotsupp(bh)) { 600 if (buffer_eopnotsupp(bh)) {
616 clear_buffer_eopnotsupp(bh); 601 clear_buffer_eopnotsupp(bh);
617 set_buffer_uptodate(bh); 602 set_buffer_uptodate(bh);
603 fs_info(sdp, "barrier sync failed - disabling barriers\n");
618 set_bit(SDF_NOBARRIERS, &sdp->sd_flags); 604 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
619 lock_buffer(bh); 605 lock_buffer(bh);
620skip_barrier: 606skip_barrier:
@@ -710,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
710 * 696 *
711 */ 697 */
712 698
713void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) 699void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
714{ 700{
715 struct gfs2_ail *ai; 701 struct gfs2_ail *ai;
716 702
@@ -822,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
822 * @sdp: the filesystem 808 * @sdp: the filesystem
823 * @tr: the transaction 809 * @tr: the transaction
824 * 810 *
811 * We wake up gfs2_logd if the number of pinned blocks exceeds thresh1
812 * or the total number of used blocks (pinned blocks plus AIL blocks)
813 * is greater than thresh2.
814 *
815 * At mount time thresh1 is set to 2/5ths of the journal size and
816 * thresh2 to 4/5ths of it (see init_journal()).
817 *
825 * Returns: errno 818 * Returns: errno
826 */ 819 */
827 820
@@ -832,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
832 825
833 up_read(&sdp->sd_log_flush_lock); 826 up_read(&sdp->sd_log_flush_lock);
834 827
835 gfs2_log_lock(sdp); 828 if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
836 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) 829 ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
837 wake_up_process(sdp->sd_logd_process); 830 atomic_read(&sdp->sd_log_thresh2)))
838 gfs2_log_unlock(sdp); 831 wake_up(&sdp->sd_logd_waitq);
839} 832}
840 833
841/** 834/**
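To make the wakeup thresholds concrete (journal size invented for the example): with jd_blocks = 32768, init_journal() sets thresh1 = 2*32768/5 = 13107 and thresh2 = 4*32768/5 = 26214, so logd is woken once pinned blocks reach about 40% of the journal, or once the blocks in use (jd_blocks minus sd_log_blks_free) reach about 80%.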
@@ -882,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
882{ 875{
883 gfs2_log_flush(sdp, NULL); 876 gfs2_log_flush(sdp, NULL);
884 for (;;) { 877 for (;;) {
885 gfs2_ail1_start(sdp, DIO_ALL); 878 gfs2_ail1_start(sdp);
886 if (gfs2_ail1_empty(sdp, DIO_ALL)) 879 if (gfs2_ail1_empty(sdp, DIO_ALL))
887 break; 880 break;
888 msleep(10); 881 msleep(10);
889 } 882 }
890} 883}
891 884
885static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
886{
887 return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
888}
889
890static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
891{
892 unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
893 return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
894}
892 895
893/** 896/**
894 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks 897 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
@@ -901,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
901int gfs2_logd(void *data) 904int gfs2_logd(void *data)
902{ 905{
903 struct gfs2_sbd *sdp = data; 906 struct gfs2_sbd *sdp = data;
904 unsigned long t; 907 unsigned long t = 1;
905 int need_flush; 908 DEFINE_WAIT(wait);
909 unsigned preflush;
906 910
907 while (!kthread_should_stop()) { 911 while (!kthread_should_stop()) {
908 /* Advance the log tail */
909 912
910 t = sdp->sd_log_flush_time + 913 preflush = atomic_read(&sdp->sd_log_pinned);
911 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; 914 if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
915 gfs2_ail1_empty(sdp, DIO_ALL);
916 gfs2_log_flush(sdp, NULL);
917 gfs2_ail1_empty(sdp, DIO_ALL);
918 }
912 919
913 gfs2_ail1_empty(sdp, DIO_ALL); 920 if (gfs2_ail_flush_reqd(sdp)) {
914 gfs2_log_lock(sdp); 921 gfs2_ail1_start(sdp);
915 need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); 922 io_schedule();
916 gfs2_log_unlock(sdp); 923 gfs2_ail1_empty(sdp, 0);
917 if (need_flush || time_after_eq(jiffies, t)) {
918 gfs2_log_flush(sdp, NULL); 924 gfs2_log_flush(sdp, NULL);
919 sdp->sd_log_flush_time = jiffies; 925 gfs2_ail1_empty(sdp, DIO_ALL);
920 } 926 }
921 927
928 wake_up(&sdp->sd_log_waitq);
922 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; 929 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
923 if (freezing(current)) 930 if (freezing(current))
924 refrigerator(); 931 refrigerator();
925 schedule_timeout_interruptible(t); 932
933 do {
934 prepare_to_wait(&sdp->sd_logd_waitq, &wait,
935 TASK_UNINTERRUPTIBLE);
936 if (!gfs2_ail_flush_reqd(sdp) &&
937 !gfs2_jrnl_flush_reqd(sdp) &&
938 !kthread_should_stop())
939 t = schedule_timeout(t);
940 } while(t && !gfs2_ail_flush_reqd(sdp) &&
941 !gfs2_jrnl_flush_reqd(sdp) &&
942 !kthread_should_stop());
943 finish_wait(&sdp->sd_logd_waitq, &wait);
926 } 944 }
927 945
928 return 0; 946 return 0;
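The loop above is a timed, compound-condition variant of the canonical open-coded kernel wait idiom; for reference, the basic shape is:

DEFINE_WAIT(wait);

for (;;) {
	prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
	if (condition)
		break;
	schedule();
}
finish_wait(&wq, &wait);

gfs2_logd folds its condition checks into schedule_timeout() so that it also wakes periodically to honour gt_logd_secs.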
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7c64510ccfd2..0d007f920234 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,29 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
47 sdp->sd_log_head = sdp->sd_log_tail = value; 47 sdp->sd_log_head = sdp->sd_log_tail = value;
48} 48}
49 49
50unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, 50extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
51 unsigned int ssize); 51 unsigned int ssize);
52 52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 53extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); 54extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
55void gfs2_log_incr_head(struct gfs2_sbd *sdp);
56 55
57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 56extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
58struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, 57extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
59 struct buffer_head *real); 58 struct buffer_head *real);
60void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); 59extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
60extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
61extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
61 62
62static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) 63extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
63{ 64extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
64 if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) 65extern int gfs2_logd(void *data);
65 __gfs2_log_flush(sbd, gl);
66}
67
68void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
69void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
70
71void gfs2_log_shutdown(struct gfs2_sbd *sdp);
72void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
73int gfs2_logd(void *data);
74 66
75#endif /* __LOG_DOT_H__ */ 67#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index adc260fbea90..bf33f822058d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
54 if (bd->bd_ail) 54 if (bd->bd_ail)
55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
56 get_bh(bh); 56 get_bh(bh);
57 atomic_inc(&sdp->sd_log_pinned);
57 trace_gfs2_pin(bd, 1); 58 trace_gfs2_pin(bd, 1);
58} 59}
59 60
@@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
94 trace_gfs2_pin(bd, 0); 95 trace_gfs2_pin(bd, 0);
95 gfs2_log_unlock(sdp); 96 gfs2_log_unlock(sdp);
96 unlock_buffer(bh); 97 unlock_buffer(bh);
98 atomic_dec(&sdp->sd_log_pinned);
97} 99}
98 100
99 101
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a88fadc704bb..fb2a5f93b7c3 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void)
94 if (!gfs2_glock_cachep) 94 if (!gfs2_glock_cachep)
95 goto fail; 95 goto fail;
96 96
97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)", 97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
98 sizeof(struct gfs2_glock) + 98 sizeof(struct gfs2_glock) +
99 sizeof(struct address_space), 99 sizeof(struct address_space),
100 0, 0, gfs2_init_gl_aspace_once); 100 0, 0, gfs2_init_gl_aspace_once);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0bb12c80937a..18176d0b75d7 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -34,7 +34,6 @@
34 34
35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) 35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
36{ 36{
37 int err;
38 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
39 int nr_underway = 0; 38 int nr_underway = 0;
40 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? 39 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
@@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
86 } while (bh != head); 85 } while (bh != head);
87 unlock_page(page); 86 unlock_page(page);
88 87
89 err = 0;
90 if (nr_underway == 0) 88 if (nr_underway == 0)
91 end_page_writeback(page); 89 end_page_writeback(page);
92 90
93 return err; 91 return 0;
94} 92}
95 93
96const struct address_space_operations gfs2_meta_aops = { 94const struct address_space_operations gfs2_meta_aops = {
@@ -313,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
313 struct gfs2_bufdata *bd = bh->b_private; 311 struct gfs2_bufdata *bd = bh->b_private;
314 312
315 if (test_clear_buffer_pinned(bh)) { 313 if (test_clear_buffer_pinned(bh)) {
314 atomic_dec(&sdp->sd_log_pinned);
316 list_del_init(&bd->bd_le.le_list); 315 list_del_init(&bd->bd_le.le_list);
317 if (meta) { 316 if (meta) {
318 gfs2_assert_warn(sdp, sdp->sd_log_num_buf); 317 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c1309ed1c496..3593b3a7290e 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -57,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
57{ 57{
58 spin_lock_init(&gt->gt_spin); 58 spin_lock_init(&gt->gt_spin);
59 59
60 gt->gt_incore_log_blocks = 1024;
61 gt->gt_logd_secs = 1;
62 gt->gt_quota_simul_sync = 64; 60 gt->gt_quota_simul_sync = 64;
63 gt->gt_quota_warn_period = 10; 61 gt->gt_quota_warn_period = 10;
64 gt->gt_quota_scale_num = 1; 62 gt->gt_quota_scale_num = 1;
@@ -101,14 +99,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
101 spin_lock_init(&sdp->sd_trunc_lock); 99 spin_lock_init(&sdp->sd_trunc_lock);
102 100
103 spin_lock_init(&sdp->sd_log_lock); 101 spin_lock_init(&sdp->sd_log_lock);
104 102 atomic_set(&sdp->sd_log_pinned, 0);
105 INIT_LIST_HEAD(&sdp->sd_log_le_buf); 103 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
106 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 104 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
107 INIT_LIST_HEAD(&sdp->sd_log_le_rg); 105 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
108 INIT_LIST_HEAD(&sdp->sd_log_le_databuf); 106 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
109 INIT_LIST_HEAD(&sdp->sd_log_le_ordered); 107 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
110 108
111 mutex_init(&sdp->sd_log_reserve_mutex); 109 init_waitqueue_head(&sdp->sd_log_waitq);
110 init_waitqueue_head(&sdp->sd_logd_waitq);
112 INIT_LIST_HEAD(&sdp->sd_ail1_list); 111 INIT_LIST_HEAD(&sdp->sd_ail1_list);
113 INIT_LIST_HEAD(&sdp->sd_ail2_list); 112 INIT_LIST_HEAD(&sdp->sd_ail2_list);
114 113
@@ -487,7 +486,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
487 struct dentry *dentry; 486 struct dentry *dentry;
488 struct inode *inode; 487 struct inode *inode;
489 488
490 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 489 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
491 if (IS_ERR(inode)) { 490 if (IS_ERR(inode)) {
492 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); 491 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
493 return PTR_ERR(inode); 492 return PTR_ERR(inode);
@@ -733,6 +732,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
733 if (sdp->sd_args.ar_spectator) { 732 if (sdp->sd_args.ar_spectator) {
734 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); 733 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
735 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); 734 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
735 atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
736 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
736 } else { 737 } else {
737 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { 738 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
738 fs_err(sdp, "can't mount journal #%u\n", 739 fs_err(sdp, "can't mount journal #%u\n",
@@ -770,6 +771,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
770 goto fail_jinode_gh; 771 goto fail_jinode_gh;
771 } 772 }
772 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); 773 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
774 atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
775 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
773 776
774 /* Map the extents for this journal's blocks */ 777 /* Map the extents for this journal's blocks */
775 map_journal_extents(sdp); 778 map_journal_extents(sdp);
@@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
951 if (undo) 954 if (undo)
952 goto fail_quotad; 955 goto fail_quotad;
953 956
954 sdp->sd_log_flush_time = jiffies;
955
956 p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); 957 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
957 error = IS_ERR(p); 958 error = IS_ERR(p);
958 if (error) { 959 if (error) {
@@ -1160,7 +1161,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1160 GFS2_BASIC_BLOCK_SHIFT; 1161 GFS2_BASIC_BLOCK_SHIFT;
1161 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; 1162 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
1162 1163
1163 sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; 1164 sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
1164 sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; 1165 sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
1165 if (sdp->sd_args.ar_statfs_quantum) { 1166 if (sdp->sd_args.ar_statfs_quantum) {
1166 sdp->sd_tune.gt_statfs_slow = 0; 1167 sdp->sd_tune.gt_statfs_slow = 0;
@@ -1323,7 +1324,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1323 memset(&args, 0, sizeof(args)); 1324 memset(&args, 0, sizeof(args));
1324 args.ar_quota = GFS2_QUOTA_DEFAULT; 1325 args.ar_quota = GFS2_QUOTA_DEFAULT;
1325 args.ar_data = GFS2_DATA_DEFAULT; 1326 args.ar_data = GFS2_DATA_DEFAULT;
1326 args.ar_commit = 60; 1327 args.ar_commit = 30;
1327 args.ar_statfs_quantum = 30; 1328 args.ar_statfs_quantum = 30;
1328 args.ar_quota_quantum = 60; 1329 args.ar_quota_quantum = 60;
1329 args.ar_errors = GFS2_ERRORS_DEFAULT; 1330 args.ar_errors = GFS2_ERRORS_DEFAULT;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 4e64352d49de..98cdd05f3316 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask)
1071 return error; 1071 return error;
1072} 1072}
1073 1073
1074/*
1075 * XXX: should be changed to have proper ordering by opencoding simple_setsize
1076 */
1074static int setattr_size(struct inode *inode, struct iattr *attr) 1077static int setattr_size(struct inode *inode, struct iattr *attr)
1075{ 1078{
1076 struct gfs2_inode *ip = GFS2_I(inode); 1079 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1081,7 +1084,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
1081 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1082 if (error) 1085 if (error)
1083 return error; 1086 return error;
1084 error = vmtruncate(inode, attr->ia_size); 1087 error = simple_setsize(inode, attr->ia_size);
1085 gfs2_trans_end(sdp); 1088 gfs2_trans_end(sdp);
1086 if (error) 1089 if (error)
1087 return error; 1090 return error;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6dbcbad6ab17..49667d68769e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
637 unsigned blocksize, iblock, pos; 637 unsigned blocksize, iblock, pos;
638 struct buffer_head *bh, *dibh; 638 struct buffer_head *bh, *dibh;
639 struct page *page; 639 struct page *page;
640 void *kaddr; 640 void *kaddr, *ptr;
641 struct gfs2_quota *qp; 641 struct gfs2_quota q, *qp;
642 s64 value; 642 int err, nbytes;
643 int err = -EIO;
644 u64 size; 643 u64 size;
645 644
646 if (gfs2_is_stuffed(ip)) 645 if (gfs2_is_stuffed(ip))
647 gfs2_unstuff_dinode(ip, NULL); 646 gfs2_unstuff_dinode(ip, NULL);
648 647
648 memset(&q, 0, sizeof(struct gfs2_quota));
649 err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
650 if (err < 0)
651 return err;
652
653 err = -EIO;
654 qp = &q;
655 qp->qu_value = be64_to_cpu(qp->qu_value);
656 qp->qu_value += change;
657 qp->qu_value = cpu_to_be64(qp->qu_value);
658 qd->qd_qb.qb_value = qp->qu_value;
659 if (fdq) {
660 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
661 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
662 qd->qd_qb.qb_warn = qp->qu_warn;
663 }
664 if (fdq->d_fieldmask & FS_DQ_BHARD) {
665 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
666 qd->qd_qb.qb_limit = qp->qu_limit;
667 }
668 }
669
670 /* Write the quota into the quota file on disk */
671 ptr = qp;
672 nbytes = sizeof(struct gfs2_quota);
673get_a_page:
649 page = grab_cache_page(mapping, index); 674 page = grab_cache_page(mapping, index);
650 if (!page) 675 if (!page)
651 return -ENOMEM; 676 return -ENOMEM;
@@ -667,7 +692,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
667 if (!buffer_mapped(bh)) { 692 if (!buffer_mapped(bh)) {
668 gfs2_block_map(inode, iblock, bh, 1); 693 gfs2_block_map(inode, iblock, bh, 1);
669 if (!buffer_mapped(bh)) 694 if (!buffer_mapped(bh))
670 goto unlock; 695 goto unlock_out;
696 /* If it's a newly allocated disk block for quota, zero it */
697 if (buffer_new(bh)) {
698 memset(bh->b_data, 0, bh->b_size);
699 set_buffer_uptodate(bh);
700 }
671 } 701 }
672 702
673 if (PageUptodate(page)) 703 if (PageUptodate(page))
@@ -677,32 +707,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
677 ll_rw_block(READ_META, 1, &bh); 707 ll_rw_block(READ_META, 1, &bh);
678 wait_on_buffer(bh); 708 wait_on_buffer(bh);
679 if (!buffer_uptodate(bh)) 709 if (!buffer_uptodate(bh))
680 goto unlock; 710 goto unlock_out;
681 } 711 }
682 712
683 gfs2_trans_add_bh(ip->i_gl, bh, 0); 713 gfs2_trans_add_bh(ip->i_gl, bh, 0);
684 714
685 kaddr = kmap_atomic(page, KM_USER0); 715 kaddr = kmap_atomic(page, KM_USER0);
686 qp = kaddr + offset; 716 if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
687 value = (s64)be64_to_cpu(qp->qu_value) + change; 717 nbytes = PAGE_CACHE_SIZE - offset;
688 qp->qu_value = cpu_to_be64(value); 718 memcpy(kaddr + offset, ptr, nbytes);
689 qd->qd_qb.qb_value = qp->qu_value;
690 if (fdq) {
691 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
692 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
693 qd->qd_qb.qb_warn = qp->qu_warn;
694 }
695 if (fdq->d_fieldmask & FS_DQ_BHARD) {
696 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
697 qd->qd_qb.qb_limit = qp->qu_limit;
698 }
699 }
700 flush_dcache_page(page); 719 flush_dcache_page(page);
701 kunmap_atomic(kaddr, KM_USER0); 720 kunmap_atomic(kaddr, KM_USER0);
721 unlock_page(page);
722 page_cache_release(page);
702 723
724 /* If the quota straddles a page boundary, we need to update the
725 * rest of it at the beginning of the next page */
726 if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
727 ptr = ptr + nbytes;
728 nbytes = sizeof(struct gfs2_quota) - nbytes;
729 offset = 0;
730 index++;
731 goto get_a_page;
732 }
733
734 /* Update the disk inode timestamp and size (if extended) */
703 err = gfs2_meta_inode_buffer(ip, &dibh); 735 err = gfs2_meta_inode_buffer(ip, &dibh);
704 if (err) 736 if (err)
705 goto unlock; 737 goto out;
706 738
707 size = loc + sizeof(struct gfs2_quota); 739 size = loc + sizeof(struct gfs2_quota);
708 if (size > inode->i_size) { 740 if (size > inode->i_size) {
@@ -715,7 +747,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
715 brelse(dibh); 747 brelse(dibh);
716 mark_inode_dirty(inode); 748 mark_inode_dirty(inode);
717 749
718unlock: 750out:
751 return err;
752unlock_out:
719 unlock_page(page); 753 unlock_page(page);
720 page_cache_release(page); 754 page_cache_release(page);
721 return err; 755 return err;
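The get_a_page loop above writes a record that may straddle a page boundary in at most two passes. A self-contained userspace sketch of the same copy logic (the page array and all names are invented; grab_cache_page()/kmap_atomic() are replaced by a plain lookup):

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096u

static void write_straddling(uint8_t *pages[], unsigned long index,
			     unsigned offset, const void *rec, size_t reclen)
{
	const uint8_t *ptr = rec;
	size_t nbytes = reclen;

	for (;;) {
		uint8_t *kaddr = pages[index];       /* ~kmap_atomic() */
		size_t chunk = nbytes;

		if (offset + chunk > PAGE_SIZE)
			chunk = PAGE_SIZE - offset;  /* clip to page end */
		memcpy(kaddr + offset, ptr, chunk);
		if (chunk == nbytes)
			break;                       /* record complete */
		ptr += chunk;                        /* remainder starts */
		nbytes -= chunk;                     /* on the next page */
		offset = 0;
		index++;
	}
}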
@@ -779,8 +813,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
779 * rgrp since it won't be allocated during the transaction 813 * rgrp since it won't be allocated during the transaction
780 */ 814 */
781 al->al_requested = 1; 815 al->al_requested = 1;
782 /* +1 in the end for block requested above for unstuffing */ 816 /* +3 at the end: one block for unstuffing, one for the inode
783 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1; 817 * size update, and one more in case the quota straddles a page
818 * boundary and two blocks must be updated instead of one */
819 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
784 820
785 if (nalloc) 821 if (nalloc)
786 al->al_requested += nalloc * (data_blocks + ind_blocks); 822 al->al_requested += nalloc * (data_blocks + ind_blocks);
@@ -1418,10 +1454,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1418 1454
1419 memset(fqs, 0, sizeof(struct fs_quota_stat)); 1455 memset(fqs, 0, sizeof(struct fs_quota_stat));
1420 fqs->qs_version = FS_QSTAT_VERSION; 1456 fqs->qs_version = FS_QSTAT_VERSION;
1421 if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON) 1457
1422 fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); 1458 switch (sdp->sd_args.ar_quota) {
1423 else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) 1459 case GFS2_QUOTA_ON:
1424 fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); 1460 fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
1461 /*FALLTHRU*/
1462 case GFS2_QUOTA_ACCOUNT:
1463 fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
1464 break;
1465 case GFS2_QUOTA_OFF:
1466 break;
1467 }
1468
1425 if (sdp->sd_quota_inode) { 1469 if (sdp->sd_quota_inode) {
1426 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; 1470 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
1427 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; 1471 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
@@ -1432,8 +1476,8 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1432 return 0; 1476 return 0;
1433} 1477}
1434 1478
1435static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id, 1479static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1436 struct fs_disk_quota *fdq) 1480 struct fs_disk_quota *fdq)
1437{ 1481{
1438 struct gfs2_sbd *sdp = sb->s_fs_info; 1482 struct gfs2_sbd *sdp = sb->s_fs_info;
1439 struct gfs2_quota_lvb *qlvb; 1483 struct gfs2_quota_lvb *qlvb;
@@ -1477,8 +1521,8 @@ out:
1477/* GFS2 only supports a subset of the XFS fields */ 1521/* GFS2 only supports a subset of the XFS fields */
1478#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) 1522#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
1479 1523
1480static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id, 1524static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1481 struct fs_disk_quota *fdq) 1525 struct fs_disk_quota *fdq)
1482{ 1526{
1483 struct gfs2_sbd *sdp = sb->s_fs_info; 1527 struct gfs2_sbd *sdp = sb->s_fs_info;
1484 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); 1528 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1585,7 +1629,7 @@ out_put:
1585const struct quotactl_ops gfs2_quotactl_ops = { 1629const struct quotactl_ops gfs2_quotactl_ops = {
1586 .quota_sync = gfs2_quota_sync, 1630 .quota_sync = gfs2_quota_sync,
1587 .get_xstate = gfs2_quota_get_xstate, 1631 .get_xstate = gfs2_quota_get_xstate,
1588 .get_xquota = gfs2_xquota_get, 1632 .get_dqblk = gfs2_get_dqblk,
1589 .set_xquota = gfs2_xquota_set, 1633 .set_dqblk = gfs2_set_dqblk,
1590}; 1634};
1591 1635
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842f3ba2..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 854 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 855 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 856 nr_sects, GFP_NOFS,
857 DISCARD_FL_BARRIER); 857 BLKDEV_IFL_WAIT |
858 BLKDEV_IFL_BARRIER);
858 if (rv) 859 if (rv)
859 goto fail; 860 goto fail;
860 nr_sects = 0; 861 nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
869 } 870 }
870 if (nr_sects) { 871 if (nr_sects) {
871 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
872 DISCARD_FL_BARRIER); 873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
873 if (rv) 874 if (rv)
874 goto fail; 875 goto fail;
875 } 876 }
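Beyond the new flag arguments, the surrounding loop batches contiguous free sectors into a single discard per run, issuing the pending run whenever the next sector breaks contiguity. A standalone model of that batching (sector numbers are made up; issue_discard() stands in for blkdev_issue_discard()):

#include <stdio.h>

typedef unsigned long long u64;

static void issue_discard(u64 start, u64 nr_sects)
{
        /* stand-in for blkdev_issue_discard(bdev, start, nr_sects,
         * GFP_NOFS, BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER) */
        printf("discard %llu..%llu\n", start, start + nr_sects - 1);
}

int main(void)
{
        u64 blks[] = { 10, 11, 12, 40, 41, 99 };   /* free sectors */
        u64 start = 0, nr_sects = 0;
        size_t i;

        for (i = 0; i < sizeof(blks) / sizeof(blks[0]); i++) {
                u64 blk = blks[i];

                if (nr_sects && start + nr_sects != blk) {
                        issue_discard(start, nr_sects);   /* run broken */
                        nr_sects = 0;
                }
                if (!nr_sects)
                        start = blk;
                nr_sects++;
        }
        if (nr_sects)
                issue_discard(start, nr_sects);           /* final run */
        return 0;
}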
@@ -948,13 +949,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
948 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes 949 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
949 * @rgd: The rgrp 950 * @rgd: The rgrp
950 * 951 *
951 * Returns: The inode, if one has been found 952 * Returns: the block address of an unlinked
953 * inode, if one was found; otherwise 0.
952 */ 954 */
953 955
954static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 956static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
955 u64 skip) 957 u64 skip)
956{ 958{
957 struct inode *inode;
958 u32 goal = 0, block; 959 u32 goal = 0, block;
959 u64 no_addr; 960 u64 no_addr;
960 struct gfs2_sbd *sdp = rgd->rd_sbd; 961 struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -979,14 +980,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
979 if (no_addr == skip) 980 if (no_addr == skip)
980 continue; 981 continue;
981 *last_unlinked = no_addr; 982 *last_unlinked = no_addr;
982 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, 983 return no_addr;
983 no_addr, -1, 1);
984 if (!IS_ERR(inode))
985 return inode;
986 } 984 }
987 985
988 rgd->rd_flags &= ~GFS2_RDF_CHECK; 986 rgd->rd_flags &= ~GFS2_RDF_CHECK;
989 return NULL; 987 return 0;
990} 988}
991 989
992/** 990/**
@@ -1067,11 +1065,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
1067 * Try to acquire rgrp in way which avoids contending with others. 1065 * Try to acquire rgrp in way which avoids contending with others.
1068 * 1066 *
1069 * Returns: errno 1067 * Returns: errno
1068 * unlinked: the block address of an unlinked block to be reclaimed
1070 */ 1069 */
1071 1070
1072static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) 1071static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1072 u64 *last_unlinked)
1073{ 1073{
1074 struct inode *inode = NULL;
1075 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1074 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1076 struct gfs2_rgrpd *rgd, *begin = NULL; 1075 struct gfs2_rgrpd *rgd, *begin = NULL;
1077 struct gfs2_alloc *al = ip->i_alloc; 1076 struct gfs2_alloc *al = ip->i_alloc;
@@ -1080,6 +1079,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1080 int loops = 0; 1079 int loops = 0;
1081 int error, rg_locked; 1080 int error, rg_locked;
1082 1081
1082 *unlinked = 0;
1083 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); 1083 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1084 1084
1085 while (rgd) { 1085 while (rgd) {
@@ -1096,19 +1096,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1096 case 0: 1096 case 0:
1097 if (try_rgrp_fit(rgd, al)) 1097 if (try_rgrp_fit(rgd, al))
1098 goto out; 1098 goto out;
1099 if (rgd->rd_flags & GFS2_RDF_CHECK) 1099 /* If the rg came in already locked, there's no
1100 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1100 way we can recover from a failed try_rgrp_unlink
1101 because that would require an iput which can only
1102 happen after the rgrp is unlocked. */
1103 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1104 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1105 ip->i_no_addr);
1101 if (!rg_locked) 1106 if (!rg_locked)
1102 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1107 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1103 if (inode) 1108 if (*unlinked)
1104 return inode; 1109 return -EAGAIN;
1105 /* fall through */ 1110 /* fall through */
1106 case GLR_TRYFAILED: 1111 case GLR_TRYFAILED:
1107 rgd = recent_rgrp_next(rgd); 1112 rgd = recent_rgrp_next(rgd);
1108 break; 1113 break;
1109 1114
1110 default: 1115 default:
1111 return ERR_PTR(error); 1116 return error;
1112 } 1117 }
1113 } 1118 }
1114 1119
@@ -1130,12 +1135,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1130 case 0: 1135 case 0:
1131 if (try_rgrp_fit(rgd, al)) 1136 if (try_rgrp_fit(rgd, al))
1132 goto out; 1137 goto out;
1133 if (rgd->rd_flags & GFS2_RDF_CHECK) 1138 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1134 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1139 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1140 ip->i_no_addr);
1135 if (!rg_locked) 1141 if (!rg_locked)
1136 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1142 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1137 if (inode) 1143 if (*unlinked)
1138 return inode; 1144 return -EAGAIN;
1139 break; 1145 break;
1140 1146
1141 case GLR_TRYFAILED: 1147 case GLR_TRYFAILED:
@@ -1143,7 +1149,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1143 break; 1149 break;
1144 1150
1145 default: 1151 default:
1146 return ERR_PTR(error); 1152 return error;
1147 } 1153 }
1148 1154
1149 rgd = gfs2_rgrpd_get_next(rgd); 1155 rgd = gfs2_rgrpd_get_next(rgd);
@@ -1152,7 +1158,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1152 1158
1153 if (rgd == begin) { 1159 if (rgd == begin) {
1154 if (++loops >= 3) 1160 if (++loops >= 3)
1155 return ERR_PTR(-ENOSPC); 1161 return -ENOSPC;
1156 if (!skipped) 1162 if (!skipped)
1157 loops++; 1163 loops++;
1158 flags = 0; 1164 flags = 0;
@@ -1172,7 +1178,7 @@ out:
1172 forward_rgrp_set(sdp, rgd); 1178 forward_rgrp_set(sdp, rgd);
1173 } 1179 }
1174 1180
1175 return NULL; 1181 return 0;
1176} 1182}
1177 1183
1178/** 1184/**
@@ -1186,9 +1192,8 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1186{ 1192{
1187 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1188 struct gfs2_alloc *al = ip->i_alloc; 1194 struct gfs2_alloc *al = ip->i_alloc;
1189 struct inode *inode;
1190 int error = 0; 1195 int error = 0;
1191 u64 last_unlinked = NO_BLOCK; 1196 u64 last_unlinked = NO_BLOCK, unlinked;
1192 1197
1193 if (gfs2_assert_warn(sdp, al->al_requested)) 1198 if (gfs2_assert_warn(sdp, al->al_requested))
1194 return -EINVAL; 1199 return -EINVAL;
@@ -1204,17 +1209,27 @@ try_again:
1204 if (error) 1209 if (error)
1205 return error; 1210 return error;
1206 1211
1207 inode = get_local_rgrp(ip, &last_unlinked); 1212 /* Find an rgrp suitable for allocation. If it encounters any unlinked
1208 if (inode) { 1213 dinodes along the way, error will equal -EAGAIN and unlinked will
1214 contain its block address. We then need to look up that inode and
1215 try to free it, and try the allocation again. */
1216 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1217 if (error) {
1209 if (ip != GFS2_I(sdp->sd_rindex)) 1218 if (ip != GFS2_I(sdp->sd_rindex))
1210 gfs2_glock_dq_uninit(&al->al_ri_gh); 1219 gfs2_glock_dq_uninit(&al->al_ri_gh);
1211 if (IS_ERR(inode)) 1220 if (error != -EAGAIN)
1212 return PTR_ERR(inode); 1221 return error;
1213 iput(inode); 1222
1223 gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1224 /* regardless of whether or not gfs2_process_unlinked_inode
1225 was successful, we don't want to repeat it. */
1226 last_unlinked = unlinked;
1214 gfs2_log_flush(sdp, NULL); 1227 gfs2_log_flush(sdp, NULL);
1228 error = 0;
1229
1215 goto try_again; 1230 goto try_again;
1216 } 1231 }
1217 1232 /* no error, so we have the rgrp set in the inode's allocation. */
1218 al->al_file = file; 1233 al->al_file = file;
1219 al->al_line = line; 1234 al->al_line = line;
1220 1235
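The caller's control flow is worth seeing end to end: get_local_rgrp() no longer returns an inode; when it encounters an unlinked dinode it reports -EAGAIN plus the block address, and gfs2_inplace_reserve_i() reclaims the inode only after the rgrp glock has been dropped, then retries. A standalone model of that retry contract (the stub pretends exactly one unlinked dinode is found; everything here is a userspace stand-in for the kernel functions):

#include <stdio.h>
#include <errno.h>

typedef unsigned long long u64;

static int get_local_rgrp_model(u64 *unlinked, u64 *last_unlinked)
{
        static int found;

        if (!found++) {           /* pretend we hit one unlinked dinode */
                *unlinked = *last_unlinked = 4242;
                return -EAGAIN;
        }
        return 0;                 /* then the allocation succeeds */
}

int main(void)
{
        u64 unlinked = 0, last_unlinked = 0;
        int error;

try_again:
        error = get_local_rgrp_model(&unlinked, &last_unlinked);
        if (error == -EAGAIN) {
                /* reclaim happens here, outside the rgrp glock */
                printf("reclaiming unlinked dinode at block %llu\n",
                       unlinked);
                goto try_again;
        }
        printf("reservation done, error=%d\n", error);
        return 0;
}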
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4106ddaaa98..f07119d89557 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -10,6 +10,8 @@
10#ifndef __RGRP_DOT_H__ 10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__ 11#define __RGRP_DOT_H__
12 12
13#include <linux/slab.h>
14
13struct gfs2_rgrpd; 15struct gfs2_rgrpd;
14struct gfs2_sbd; 16struct gfs2_sbd;
15struct gfs2_holder; 17struct gfs2_holder;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 50aac606b990..4d1aad38f1b1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1113,7 +1113,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1113 int error; 1113 int error;
1114 1114
1115 spin_lock(&gt->gt_spin); 1115 spin_lock(&gt->gt_spin);
1116 args.ar_commit = gt->gt_log_flush_secs; 1116 args.ar_commit = gt->gt_logd_secs;
1117 args.ar_quota_quantum = gt->gt_quota_quantum; 1117 args.ar_quota_quantum = gt->gt_quota_quantum;
1118 if (gt->gt_statfs_slow) 1118 if (gt->gt_statfs_slow)
1119 args.ar_statfs_quantum = 0; 1119 args.ar_statfs_quantum = 0;
@@ -1160,7 +1160,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1160 else 1160 else
1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); 1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1162 spin_lock(&gt->gt_spin); 1162 spin_lock(&gt->gt_spin);
1163 gt->gt_log_flush_secs = args.ar_commit; 1163 gt->gt_logd_secs = args.ar_commit;
1164 gt->gt_quota_quantum = args.ar_quota_quantum; 1164 gt->gt_quota_quantum = args.ar_quota_quantum;
1165 if (args.ar_statfs_quantum) { 1165 if (args.ar_statfs_quantum) {
1166 gt->gt_statfs_slow = 0; 1166 gt->gt_statfs_slow = 0;
@@ -1305,8 +1305,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1305 } 1305 }
1306 if (args->ar_discard) 1306 if (args->ar_discard)
1307 seq_printf(s, ",discard"); 1307 seq_printf(s, ",discard");
1308 val = sdp->sd_tune.gt_log_flush_secs; 1308 val = sdp->sd_tune.gt_logd_secs;
1309 if (val != 60) 1309 if (val != 30)
1310 seq_printf(s, ",commit=%d", val); 1310 seq_printf(s, ",commit=%d", val);
1311 val = sdp->sd_tune.gt_statfs_quantum; 1311 val = sdp->sd_tune.gt_statfs_quantum;
1312 if (val != 30) 1312 if (val != 30)
@@ -1334,7 +1334,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1334 } 1334 }
1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1336 seq_printf(s, ",nobarrier"); 1336 seq_printf(s, ",nobarrier");
1337 1337 if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
1338 seq_printf(s, ",demote_interface_used");
1338 return 0; 1339 return 0;
1339} 1340}
1340 1341
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 3df60f2d84e3..a0464680af0b 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
54extern const struct export_operations gfs2_export_ops; 54extern const struct export_operations gfs2_export_ops;
55extern const struct super_operations gfs2_super_ops; 55extern const struct super_operations gfs2_super_ops;
56extern const struct dentry_operations gfs2_dops; 56extern const struct dentry_operations gfs2_dops;
57extern struct xattr_handler *gfs2_xattr_handlers[]; 57extern const struct xattr_handler *gfs2_xattr_handlers[];
58 58
59#endif /* __SUPER_DOT_H__ */ 59#endif /* __SUPER_DOT_H__ */
60 60
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 419042f7f0b6..37f5393e68e6 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -8,7 +8,6 @@
8 */ 8 */
9 9
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h> 11#include <linux/spinlock.h>
13#include <linux/completion.h> 12#include <linux/completion.h>
14#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
@@ -233,6 +232,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
233 glops = gfs2_glops_list[gltype]; 232 glops = gfs2_glops_list[gltype];
234 if (glops == NULL) 233 if (glops == NULL)
235 return -EINVAL; 234 return -EINVAL;
235 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
236 fs_info(sdp, "demote interface used\n");
236 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); 237 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
237 if (rv) 238 if (rv)
238 return rv; 239 return rv;
@@ -469,8 +470,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
469} \ 470} \
470TUNE_ATTR_2(name, name##_store) 471TUNE_ATTR_2(name, name##_store)
471 472
472TUNE_ATTR(incore_log_blocks, 0);
473TUNE_ATTR(log_flush_secs, 0);
474TUNE_ATTR(quota_warn_period, 0); 473TUNE_ATTR(quota_warn_period, 0);
475TUNE_ATTR(quota_quantum, 0); 474TUNE_ATTR(quota_quantum, 0);
476TUNE_ATTR(max_readahead, 0); 475TUNE_ATTR(max_readahead, 0);
@@ -482,8 +481,6 @@ TUNE_ATTR(statfs_quantum, 1);
482TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 481TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
483 482
484static struct attribute *tune_attrs[] = { 483static struct attribute *tune_attrs[] = {
485 &tune_attr_incore_log_blocks.attr,
486 &tune_attr_log_flush_secs.attr,
487 &tune_attr_quota_warn_period.attr, 484 &tune_attr_quota_warn_period.attr,
488 &tune_attr_quota_quantum.attr, 485 &tune_attr_quota_quantum.attr,
489 &tune_attr_max_readahead.attr, 486 &tune_attr_max_readahead.attr,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 4ef0e9fa3549..9ec73a854111 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -23,6 +23,7 @@
23#include "meta_io.h" 23#include "meta_io.h"
24#include "trans.h" 24#include "trans.h"
25#include "util.h" 25#include "util.h"
26#include "trace_gfs2.h"
26 27
27int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, 28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
28 unsigned int revokes) 29 unsigned int revokes)
@@ -75,6 +76,23 @@ fail_holder_uninit:
75 return error; 76 return error;
76} 77}
77 78
79/**
80 * gfs2_log_release - Release a given number of log blocks
81 * @sdp: The GFS2 superblock
82 * @blks: The number of blocks
83 *
84 */
85
86static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
87{
88
89 atomic_add(blks, &sdp->sd_log_blks_free);
90 trace_gfs2_log_blocks(sdp, blks);
91 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
92 sdp->sd_jdesc->jd_blocks);
93 up_read(&sdp->sd_log_flush_lock);
94}
95
78void gfs2_trans_end(struct gfs2_sbd *sdp) 96void gfs2_trans_end(struct gfs2_sbd *sdp)
79{ 97{
80 struct gfs2_trans *tr = current->journal_info; 98 struct gfs2_trans *tr = current->journal_info;
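gfs2_log_release(), added above as a static helper, returns reserved blocks to the journal's free pool, with a withdraw assertion that the pool can never exceed the journal size. A toy model of that accounting invariant (plain ints stand in for atomic_t; the sizes are made up):

#include <assert.h>
#include <stdio.h>

static int log_blks_free = 5;
static const int jd_blocks = 8;   /* illustrative journal size */

static void log_release_model(int blks)
{
        log_blks_free += blks;
        assert(log_blks_free <= jd_blocks);  /* gfs2_assert_withdraw */
        /* the real code also drops sd_log_flush_lock here */
}

int main(void)
{
        log_release_model(3);
        printf("free log blocks: %d\n", log_blks_free);  /* 8 */
        return 0;
}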
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 226f2bfbf16a..53511291fe36 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index c2ebdf2c01d4..82f93da00d1b 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1535,21 +1535,21 @@ out_alloc:
1535 return error; 1535 return error;
1536} 1536}
1537 1537
1538static struct xattr_handler gfs2_xattr_user_handler = { 1538static const struct xattr_handler gfs2_xattr_user_handler = {
1539 .prefix = XATTR_USER_PREFIX, 1539 .prefix = XATTR_USER_PREFIX,
1540 .flags = GFS2_EATYPE_USR, 1540 .flags = GFS2_EATYPE_USR,
1541 .get = gfs2_xattr_get, 1541 .get = gfs2_xattr_get,
1542 .set = gfs2_xattr_set, 1542 .set = gfs2_xattr_set,
1543}; 1543};
1544 1544
1545static struct xattr_handler gfs2_xattr_security_handler = { 1545static const struct xattr_handler gfs2_xattr_security_handler = {
1546 .prefix = XATTR_SECURITY_PREFIX, 1546 .prefix = XATTR_SECURITY_PREFIX,
1547 .flags = GFS2_EATYPE_SECURITY, 1547 .flags = GFS2_EATYPE_SECURITY,
1548 .get = gfs2_xattr_get, 1548 .get = gfs2_xattr_get,
1549 .set = gfs2_xattr_set, 1549 .set = gfs2_xattr_set,
1550}; 1550};
1551 1551
1552struct xattr_handler *gfs2_xattr_handlers[] = { 1552const struct xattr_handler *gfs2_xattr_handlers[] = {
1553 &gfs2_xattr_user_handler, 1553 &gfs2_xattr_user_handler,
1554 &gfs2_xattr_security_handler, 1554 &gfs2_xattr_security_handler,
1555 &gfs2_xattr_system_handler, 1555 &gfs2_xattr_system_handler,
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 0d200068d0af..cdb41a1f6a64 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 052f214ea6f0..38a0a9917d7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/log2.h> 13#include <linux/log2.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8bbe03c3f6d5..86428f5ac991 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -11,6 +11,7 @@
11#include <linux/cdrom.h> 11#include <linux/cdrom.h>
12#include <linux/genhd.h> 12#include <linux/genhd.h>
13#include <linux/nls.h> 13#include <linux/nls.h>
14#include <linux/slab.h>
14 15
15#include "hfs_fs.h" 16#include "hfs_fs.h"
16#include "btree.h" 17#include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 5ed7252b7b23..0a81eb7111f3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
19#include <linux/nls.h> 19#include <linux/nls.h>
20#include <linux/parser.h> 20#include <linux/parser.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/slab.h>
22#include <linux/smp_lock.h> 23#include <linux/smp_lock.h>
23#include <linux/vfs.h> 24#include <linux/vfs.h>
24 25
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f4023678251..764fd1bdca88 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
494const struct file_operations hfsplus_dir_operations = { 494const struct file_operations hfsplus_dir_operations = {
495 .read = generic_read_dir, 495 .read = generic_read_dir,
496 .readdir = hfsplus_readdir, 496 .readdir = hfsplus_readdir,
497 .ioctl = hfsplus_ioctl, 497 .unlocked_ioctl = hfsplus_ioctl,
498 .llseek = generic_file_llseek, 498 .llseek = generic_file_llseek,
499 .release = hfsplus_dir_release, 499 .release = hfsplus_dir_release,
500}; 500};
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9df..6505c30ad965 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
337void hfsplus_delete_inode(struct inode *); 337void hfsplus_delete_inode(struct inode *);
338 338
339/* ioctl.c */ 339/* ioctl.c */
340int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 340long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
341 unsigned long arg);
342int hfsplus_setxattr(struct dentry *dentry, const char *name, 341int hfsplus_setxattr(struct dentry *dentry, const char *name,
343 const void *value, size_t size, int flags); 342 const void *value, size_t size, int flags);
344ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, 343ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c0562..9bbb82924a22 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = {
285 .fsync = file_fsync, 285 .fsync = file_fsync,
286 .open = hfsplus_file_open, 286 .open = hfsplus_file_open,
287 .release = hfsplus_file_release, 287 .release = hfsplus_file_release,
288 .ioctl = hfsplus_ioctl, 288 .unlocked_ioctl = hfsplus_ioctl,
289}; 289};
290 290
291struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 291struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51ab..ac405f099026 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include "hfsplus_fs.h" 22#include "hfsplus_fs.h"
22 23
23int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 25{
26 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 27 unsigned int flags;
27 28
29 lock_kernel();
28 switch (cmd) { 30 switch (cmd) {
29 case HFSPLUS_IOC_EXT2_GETFLAGS: 31 case HFSPLUS_IOC_EXT2_GETFLAGS:
30 flags = 0; 32 flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
38 case HFSPLUS_IOC_EXT2_SETFLAGS: { 40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
39 int err = 0; 41 int err = 0;
40 err = mnt_want_write(filp->f_path.mnt); 42 err = mnt_want_write(filp->f_path.mnt);
41 if (err) 43 if (err) {
44 unlock_kernel();
42 return err; 45 return err;
46 }
43 47
44 if (!is_owner_or_cap(inode)) { 48 if (!is_owner_or_cap(inode)) {
45 err = -EACCES; 49 err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
85 mark_inode_dirty(inode); 89 mark_inode_dirty(inode);
86setflags_out: 90setflags_out:
87 mnt_drop_write(filp->f_path.mnt); 91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
88 return err; 93 return err;
89 } 94 }
90 default: 95 default:
96 unlock_kernel();
91 return -ENOTTY; 97 return -ENOTTY;
92 } 98 }
93} 99}
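The conversion from .ioctl to .unlocked_ioctl moves responsibility for the big kernel lock into the handler itself, which is why every early return above grows an unlock_kernel() first. The shape of that pattern, modeled in userspace with a pthread mutex standing in for the BKL:

#include <stdio.h>
#include <pthread.h>
#include <errno.h>

static pthread_mutex_t bkl = PTHREAD_MUTEX_INITIALIZER;

static long ioctl_model(unsigned int cmd)
{
        long err;

        pthread_mutex_lock(&bkl);     /* was implicit with .ioctl */
        switch (cmd) {
        case 1:
                err = 0;              /* normal path */
                break;
        default:
                err = -ENOTTY;        /* unknown command */
                break;
        }
        pthread_mutex_unlock(&bkl);   /* every return path unlocks */
        return err;
}

int main(void)
{
        printf("cmd 1 -> %ld, cmd 9 -> %ld\n",
               ioctl_model(1), ioctl_model(9));
        return 0;
}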
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 3fcbb0e1f6fc..572628b4b07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -15,6 +15,7 @@
15#include <linux/nls.h> 15#include <linux/nls.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/slab.h>
18#include "hfsplus_fs.h" 19#include "hfsplus_fs.h"
19 20
20enum { 21enum {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 032604e5ef2c..87ac1891a185 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -11,6 +11,7 @@
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/slab.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
15#include <linux/mount.h> 16#include <linux/mount.h>
16#include "hostfs.h" 17#include "hostfs.h"
@@ -410,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file)
410 return 0; 411 return 0;
411} 412}
412 413
413int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) 414int hostfs_fsync(struct file *file, int datasync)
414{ 415{
415 return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); 416 return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
416} 417}
417 418
418static const struct file_operations hostfs_file_fops = { 419static const struct file_operations hostfs_file_fops = {
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b6fca543544c..eac5f96323e3 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -6,6 +6,7 @@
6 * general buffer i/o 6 * general buffer i/o
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include "hpfs_fn.h" 10#include "hpfs_fn.h"
10 11
11void hpfs_lock_creation(struct super_block *s) 12void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 26e3964a4b8c..2338130cceba 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12static int hpfs_dir_release(struct inode *inode, struct file *filp) 13static int hpfs_dir_release(struct inode *inode, struct file *filp)
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff00367..a9ae9bfa752f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
19 return 0; 19 return 0;
20} 20}
21 21
22int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 22int hpfs_file_fsync(struct file *file, int datasync)
23{ 23{
24 /*return file_fsync(file, dentry);*/ 24 /*return file_fsync(file, datasync);*/
25 return 0; /* Don't fsync :-) */ 25 return 0; /* Don't fsync :-) */
26} 26}
27 27
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 97bf738cd5d6..75f9d4324851 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
268 268
269/* file.c */ 269/* file.c */
270 270
271int hpfs_file_fsync(struct file *, struct dentry *, int); 271int hpfs_file_fsync(struct file *, int);
272extern const struct file_operations hpfs_file_ops; 272extern const struct file_operations hpfs_file_ops;
273extern const struct inode_operations hpfs_file_iops; 273extern const struct inode_operations hpfs_file_iops;
274extern const struct address_space_operations hpfs_aops; 274extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index ff90affb94e1..1042a9bc97f3 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12void hpfs_init_inode(struct inode *i) 13void hpfs_init_inode(struct inode *i)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index cadc4ce48656..aa53842c599c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/slab.h>
18 19
19/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ 20/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
20 21
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2e4dfa8593da..826c3f9d29ac 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
587 return err; 587 return err;
588} 588}
589 589
590static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) 590static int hppfs_fsync(struct file *file, int datasync)
591{ 591{
592 return 0; 592 return 0;
593} 593}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d1b41a..a4e9a7ec3691 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -688,7 +688,7 @@ static void init_once(void *foo)
688const struct file_operations hugetlbfs_file_operations = { 688const struct file_operations hugetlbfs_file_operations = {
689 .read = hugetlbfs_read, 689 .read = hugetlbfs_read,
690 .mmap = hugetlbfs_file_mmap, 690 .mmap = hugetlbfs_file_mmap,
691 .fsync = simple_sync_file, 691 .fsync = noop_fsync,
692 .get_unmapped_area = hugetlb_get_unmapped_area, 692 .get_unmapped_area = hugetlb_get_unmapped_area,
693}; 693};
694 694
diff --git a/fs/inode.c b/fs/inode.c
index 407bf392e20a..2bee20ae3d65 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -286,11 +286,9 @@ static void init_once(void *foo)
286 */ 286 */
287void __iget(struct inode *inode) 287void __iget(struct inode *inode)
288{ 288{
289 if (atomic_read(&inode->i_count)) { 289 if (atomic_inc_return(&inode->i_count) != 1)
290 atomic_inc(&inode->i_count);
291 return; 290 return;
292 } 291
293 atomic_inc(&inode->i_count);
294 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 292 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
295 list_move(&inode->i_list, &inode_in_use); 293 list_move(&inode->i_list, &inode_in_use);
296 inodes_stat.nr_unused--; 294 inodes_stat.nr_unused--;
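The rewritten __iget() collapses a racy read-then-increment into a single atomic increment whose return value says whether this was the first reference. The same idiom in C11 atomics, as a runnable model:

#include <stdio.h>
#include <stdatomic.h>

static atomic_int i_count = 0;

static void iget_model(void)
{
        /* fetch_add returns the old value, so +1 matches the kernel's
         * atomic_inc_return() */
        if (atomic_fetch_add(&i_count, 1) + 1 != 1)
                return;         /* already referenced: nothing to do */

        /* first reference: move inode off the unused list, etc. */
        printf("first reference taken\n");
}

int main(void)
{
        iget_model();   /* prints: first reference taken */
        iget_model();   /* no output: count was already nonzero */
        printf("final count: %d\n", atomic_load(&i_count));
        return 0;
}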
@@ -1205,8 +1203,6 @@ void generic_delete_inode(struct inode *inode)
1205 inodes_stat.nr_inodes--; 1203 inodes_stat.nr_inodes--;
1206 spin_unlock(&inode_lock); 1204 spin_unlock(&inode_lock);
1207 1205
1208 security_inode_delete(inode);
1209
1210 if (op->delete_inode) { 1206 if (op->delete_inode) {
1211 void (*delete)(struct inode *) = op->delete_inode; 1207 void (*delete)(struct inode *) = op->delete_inode;
1212 /* Filesystems implementing their own 1208 /* Filesystems implementing their own
@@ -1610,3 +1606,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1610 inode->i_ino); 1606 inode->i_ino);
1611} 1607}
1612EXPORT_SYMBOL(init_special_inode); 1608EXPORT_SYMBOL(init_special_inode);
1609
1610/**
1611 * Init uid, gid and mode for a new inode according to POSIX standards
1612 * @inode: New inode
1613 * @dir: Directory inode
1614 * @mode: mode of the new inode
1615 */
1616void inode_init_owner(struct inode *inode, const struct inode *dir,
1617 mode_t mode)
1618{
1619 inode->i_uid = current_fsuid();
1620 if (dir && dir->i_mode & S_ISGID) {
1621 inode->i_gid = dir->i_gid;
1622 if (S_ISDIR(mode))
1623 mode |= S_ISGID;
1624 } else
1625 inode->i_gid = current_fsgid();
1626 inode->i_mode = mode;
1627}
1628EXPORT_SYMBOL(inode_init_owner);
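inode_init_owner() centralizes the POSIX setgid-directory rule that filesystems previously open-coded: a new inode created in a setgid directory inherits the directory's group, and a new subdirectory also keeps the setgid bit. A userspace model of just that rule (uid/gid values are illustrative):

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

struct toy_inode { unsigned uid, gid; mode_t mode; };

static void init_owner_model(struct toy_inode *inode,
                             const struct toy_inode *dir, mode_t mode,
                             unsigned fsuid, unsigned fsgid)
{
        inode->uid = fsuid;
        if (dir && (dir->mode & S_ISGID)) {
                inode->gid = dir->gid;
                if (S_ISDIR(mode))
                        mode |= S_ISGID;   /* subdirs stay setgid */
        } else {
                inode->gid = fsgid;
        }
        inode->mode = mode;
}

int main(void)
{
        struct toy_inode dir = { 0, 100, S_IFDIR | S_ISGID | 0775 };
        struct toy_inode child;

        init_owner_model(&child, &dir, S_IFDIR | 0755, 1000, 1000);
        printf("gid=%u setgid=%d\n", child.gid,
               !!(child.mode & S_ISGID));   /* gid=100 setgid=1 */
        return 0;
}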
diff --git a/fs/internal.h b/fs/internal.h
index 8a03a5447bdf..6b706bc60a66 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -87,6 +87,8 @@ extern struct file *get_empty_filp(void);
87 * super.c 87 * super.c
88 */ 88 */
89extern int do_remount_sb(struct super_block *, int, void *, int); 89extern int do_remount_sb(struct super_block *, int, void *, int);
90extern void __put_super(struct super_block *sb);
91extern void put_super(struct super_block *sb);
90 92
91/* 93/*
92 * open.c 94 * open.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 6c751106c2e5..2d140a713861 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -228,14 +228,23 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
228 228
229#ifdef CONFIG_BLOCK 229#ifdef CONFIG_BLOCK
230 230
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) 231static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); 232{
233 return (offset >> inode->i_blkbits);
234}
235
236static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
237{
238 return (blk << inode->i_blkbits);
239}
233 240
234/** 241/**
235 * __generic_block_fiemap - FIEMAP for block based inodes (no locking) 242 * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
236 * @inode - the inode to map 243 * @inode: the inode to map
237 * @arg - the pointer to userspace where we copy everything to 244 * @fieinfo: the fiemap info struct that will be passed back to userspace
238 * @get_block - the fs's get_block function 245 * @start: where to start mapping in the inode
246 * @len: how much space to map
247 * @get_block: the fs's get_block function
239 * 248 *
240 * This does FIEMAP for block based inodes. Basically it will just loop 249 * This does FIEMAP for block based inodes. Basically it will just loop
241 * through get_block until we hit the number of extents we want to map, or we 250 * through get_block until we hit the number of extents we want to map, or we
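Turning the macros into inline functions fixes a real defect: the old logical_to_blk() macro ended in a stray semicolon, and neither macro type-checked its arguments. A runnable demonstration of the shift arithmetic for a 4 KiB block size (i_blkbits == 12; the typedefs are local stand-ins for the kernel types):

#include <stdio.h>

typedef unsigned long long sector_t;
typedef long long loff_t;

struct toy_inode { unsigned int i_blkbits; };

static inline sector_t logical_to_blk(struct toy_inode *inode, loff_t offset)
{
        return offset >> inode->i_blkbits;
}

static inline loff_t blk_to_logical(struct toy_inode *inode, sector_t blk)
{
        return blk << inode->i_blkbits;
}

int main(void)
{
        struct toy_inode inode = { .i_blkbits = 12 };  /* 4096-byte blocks */

        printf("offset 8192 -> block %llu\n",
               logical_to_blk(&inode, 8192));          /* 2 */
        printf("block 3 -> offset %lld\n",
               blk_to_logical(&inode, 3));             /* 12288 */
        return 0;
}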
@@ -250,58 +259,63 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
250 */ 259 */
251 260
252int __generic_block_fiemap(struct inode *inode, 261int __generic_block_fiemap(struct inode *inode,
253 struct fiemap_extent_info *fieinfo, u64 start, 262 struct fiemap_extent_info *fieinfo, loff_t start,
254 u64 len, get_block_t *get_block) 263 loff_t len, get_block_t *get_block)
255{ 264{
256 struct buffer_head tmp; 265 struct buffer_head map_bh;
257 unsigned long long start_blk; 266 sector_t start_blk, last_blk;
258 long long length = 0, map_len = 0; 267 loff_t isize = i_size_read(inode);
259 u64 logical = 0, phys = 0, size = 0; 268 u64 logical = 0, phys = 0, size = 0;
260 u32 flags = FIEMAP_EXTENT_MERGED; 269 u32 flags = FIEMAP_EXTENT_MERGED;
261 int ret = 0, past_eof = 0, whole_file = 0; 270 bool past_eof = false, whole_file = false;
271 int ret = 0;
262 272
263 if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) 273 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
274 if (ret)
264 return ret; 275 return ret;
265 276
266 start_blk = logical_to_blk(inode, start); 277 /*
267 278 * Either the i_mutex or other appropriate locking needs to be held
268 length = (long long)min_t(u64, len, i_size_read(inode)); 279 * since we expect isize to not change at all through the duration of
269 if (length < len) 280 * this call.
270 whole_file = 1; 281 */
282 if (len >= isize) {
283 whole_file = true;
284 len = isize;
285 }
271 286
272 map_len = length; 287 start_blk = logical_to_blk(inode, start);
288 last_blk = logical_to_blk(inode, start + len - 1);
273 289
274 do { 290 do {
275 /* 291 /*
276 * we set b_size to the total size we want so it will map as 292 * we set b_size to the total size we want so it will map as
277 * many contiguous blocks as possible at once 293 * many contiguous blocks as possible at once
278 */ 294 */
279 memset(&tmp, 0, sizeof(struct buffer_head)); 295 memset(&map_bh, 0, sizeof(struct buffer_head));
280 tmp.b_size = map_len; 296 map_bh.b_size = len;
281 297
282 ret = get_block(inode, start_blk, &tmp, 0); 298 ret = get_block(inode, start_blk, &map_bh, 0);
283 if (ret) 299 if (ret)
284 break; 300 break;
285 301
286 /* HOLE */ 302 /* HOLE */
287 if (!buffer_mapped(&tmp)) { 303 if (!buffer_mapped(&map_bh)) {
288 length -= blk_to_logical(inode, 1);
289 start_blk++; 304 start_blk++;
290 305
291 /* 306 /*
292 * we want to handle the case where there is an 307 * We want to handle the case where there is an
293 * allocated block at the front of the file, and then 308 * allocated block at the front of the file, and then
294 * nothing but holes up to the end of the file properly, 309 * nothing but holes up to the end of the file properly,
295 * to make sure that extent at the front gets properly 310 * to make sure that extent at the front gets properly
296 * marked with FIEMAP_EXTENT_LAST 311 * marked with FIEMAP_EXTENT_LAST
297 */ 312 */
298 if (!past_eof && 313 if (!past_eof &&
299 blk_to_logical(inode, start_blk) >= 314 blk_to_logical(inode, start_blk) >= isize)
300 blk_to_logical(inode, 0)+i_size_read(inode))
301 past_eof = 1; 315 past_eof = 1;
302 316
303 /* 317 /*
304 * first hole after going past the EOF, this is our 318 * First hole after going past the EOF, this is our
305 * last extent 319 * last extent
306 */ 320 */
307 if (past_eof && size) { 321 if (past_eof && size) {
@@ -309,15 +323,18 @@ int __generic_block_fiemap(struct inode *inode,
309 ret = fiemap_fill_next_extent(fieinfo, logical, 323 ret = fiemap_fill_next_extent(fieinfo, logical,
310 phys, size, 324 phys, size,
311 flags); 325 flags);
312 break; 326 } else if (size) {
327 ret = fiemap_fill_next_extent(fieinfo, logical,
328 phys, size, flags);
329 size = 0;
313 } 330 }
314 331
315 /* if we have holes up to/past EOF then we're done */ 332 /* if we have holes up to/past EOF then we're done */
316 if (length <= 0 || past_eof) 333 if (start_blk > last_blk || past_eof || ret)
317 break; 334 break;
318 } else { 335 } else {
319 /* 336 /*
320 * we have gone over the length of what we wanted to 337 * We have gone over the length of what we wanted to
321 * map, and it wasn't the entire file, so add the extent 338 * map, and it wasn't the entire file, so add the extent
322 * we got last time and exit. 339 * we got last time and exit.
323 * 340 *
@@ -331,7 +348,7 @@ int __generic_block_fiemap(struct inode *inode,
331 * are good to go, just add the extent to the fieinfo 348 * are good to go, just add the extent to the fieinfo
332 * and break 349 * and break
333 */ 350 */
334 if (length <= 0 && !whole_file) { 351 if (start_blk > last_blk && !whole_file) {
335 ret = fiemap_fill_next_extent(fieinfo, logical, 352 ret = fiemap_fill_next_extent(fieinfo, logical,
336 phys, size, 353 phys, size,
337 flags); 354 flags);
@@ -351,11 +368,10 @@ int __generic_block_fiemap(struct inode *inode,
351 } 368 }
352 369
353 logical = blk_to_logical(inode, start_blk); 370 logical = blk_to_logical(inode, start_blk);
354 phys = blk_to_logical(inode, tmp.b_blocknr); 371 phys = blk_to_logical(inode, map_bh.b_blocknr);
355 size = tmp.b_size; 372 size = map_bh.b_size;
356 flags = FIEMAP_EXTENT_MERGED; 373 flags = FIEMAP_EXTENT_MERGED;
357 374
358 length -= tmp.b_size;
359 start_blk += logical_to_blk(inode, size); 375 start_blk += logical_to_blk(inode, size);
360 376
361 /* 377 /*
@@ -363,15 +379,13 @@ int __generic_block_fiemap(struct inode *inode,
363 * soon as we find a hole that the last extent we found 379 * soon as we find a hole that the last extent we found
364 * is marked with FIEMAP_EXTENT_LAST 380 * is marked with FIEMAP_EXTENT_LAST
365 */ 381 */
366 if (!past_eof && 382 if (!past_eof && logical + size >= isize)
367 logical+size >= 383 past_eof = true;
368 blk_to_logical(inode, 0)+i_size_read(inode))
369 past_eof = 1;
370 } 384 }
371 cond_resched(); 385 cond_resched();
372 } while (1); 386 } while (1);
373 387
374 /* if ret is 1 then we just hit the end of the extent array */ 388 /* If ret is 1 then we just hit the end of the extent array */
375 if (ret == 1) 389 if (ret == 1)
376 ret = 0; 390 ret = 0;
377 391
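The rewritten loop tracks position purely in block numbers (start_blk against last_blk) and uses the first hole past EOF to tag the final extent with FIEMAP_EXTENT_LAST. A toy walk over an array of mapped and hole blocks showing that merge-and-tag behavior (the map[] array stands in for get_block(); all sizes are made up):

#include <stdio.h>

static const int map[] = { 1, 1, 0, 1, 0, 0 };  /* 1 = mapped block */
#define NBLKS 4          /* i_size in blocks; blocks 4,5 are past EOF */

int main(void)
{
        int start = -1, i;

        for (i = 0; i < (int)(sizeof(map) / sizeof(map[0])); i++) {
                if (map[i]) {
                        if (start < 0)
                                start = i;      /* begin a merged run */
                        continue;
                }
                if (start >= 0) {   /* a hole ends the pending extent */
                        printf("extent %d..%d%s\n", start, i - 1,
                               i >= NBLKS ? " [LAST]" : "");
                        start = -1;
                }
        }
        if (start >= 0)             /* file ended inside a mapped run */
                printf("extent %d..%d [LAST]\n", start, i - 1);
        return 0;
}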
@@ -511,15 +525,8 @@ static int ioctl_fsfreeze(struct file *filp)
511 if (sb->s_op->freeze_fs == NULL) 525 if (sb->s_op->freeze_fs == NULL)
512 return -EOPNOTSUPP; 526 return -EOPNOTSUPP;
513 527
514 /* If a blockdevice-backed filesystem isn't specified, return. */
515 if (sb->s_bdev == NULL)
516 return -EINVAL;
517
518 /* Freeze */ 528 /* Freeze */
519 sb = freeze_bdev(sb->s_bdev); 529 return freeze_super(sb);
520 if (IS_ERR(sb))
521 return PTR_ERR(sb);
522 return 0;
523} 530}
524 531
525static int ioctl_fsthaw(struct file *filp) 532static int ioctl_fsthaw(struct file *filp)
@@ -529,12 +536,8 @@ static int ioctl_fsthaw(struct file *filp)
529 if (!capable(CAP_SYS_ADMIN)) 536 if (!capable(CAP_SYS_ADMIN))
530 return -EPERM; 537 return -EPERM;
531 538
532 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
533 if (sb->s_bdev == NULL)
534 return -EINVAL;
535
536 /* Thaw */ 539 /* Thaw */
537 return thaw_bdev(sb->s_bdev, sb); 540 return thaw_super(sb);
538} 541}
539 542
540/* 543/*
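From userspace nothing changes: FIFREEZE and FITHAW still land in ioctl_fsfreeze()/ioctl_fsthaw(), but by going through freeze_super()/thaw_super() they no longer require an underlying block device. A minimal caller (Linux-only; the mount point is hypothetical and CAP_SYS_ADMIN is required):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>   /* FIFREEZE, FITHAW */

int main(void)
{
        int fd = open("/mnt/test", O_RDONLY);   /* hypothetical mount */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ioctl(fd, FIFREEZE, 0))
                perror("FIFREEZE");
        else if (ioctl(fd, FITHAW, 0))
                perror("FITHAW");
        close(fd);
        return 0;
}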
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28d7d21..748cfb92dcc6 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -19,6 +19,7 @@
19 * See also Documentation/block/ioprio.txt 19 * See also Documentation/block/ioprio.txt
20 * 20 *
21 */ 21 */
22#include <linux/gfp.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/ioprio.h> 24#include <linux/ioprio.h>
24#include <linux/blkdev.h> 25#include <linux/blkdev.h>
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 8ba5441063be..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -11,6 +11,7 @@
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
14#include <linux/gfp.h>
14#include "isofs.h" 15#include "isofs.h"
15 16
16int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) 17int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
@@ -271,6 +272,7 @@ static int isofs_readdir(struct file *filp,
271 272
272const struct file_operations isofs_dir_operations = 273const struct file_operations isofs_dir_operations =
273{ 274{
275 .llseek = generic_file_llseek,
274 .read = generic_read_dir, 276 .read = generic_read_dir,
275 .readdir = isofs_readdir, 277 .readdir = isofs_readdir,
276}; 278};
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index eaa831311c9c..ab438beb867c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/gfp.h>
10#include "isofs.h" 11#include "isofs.h"
11 12
12/* 13/*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 2c90e3ef625f..28a9ddaa0c49 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -17,7 +17,6 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd.h> 18#include <linux/jbd.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h> 20#include <linux/mm.h>
22#include <linux/pagemap.h> 21#include <linux/pagemap.h>
23#include <linux/bio.h> 22#include <linux/bio.h>
@@ -787,6 +786,12 @@ wait_for_iobuf:
787 786
788 jbd_debug(3, "JBD: commit phase 6\n"); 787 jbd_debug(3, "JBD: commit phase 6\n");
789 788
789 /* All metadata is written, now write commit record and do cleanup */
790 spin_lock(&journal->j_state_lock);
791 J_ASSERT(commit_transaction->t_state == T_COMMIT);
792 commit_transaction->t_state = T_COMMIT_RECORD;
793 spin_unlock(&journal->j_state_lock);
794
790 if (journal_write_commit_record(journal, commit_transaction)) 795 if (journal_write_commit_record(journal, commit_transaction))
791 err = -EIO; 796 err = -EIO;
792 797
@@ -924,7 +929,7 @@ restart_loop:
924 929
925 jbd_debug(3, "JBD: commit phase 8\n"); 930 jbd_debug(3, "JBD: commit phase 8\n");
926 931
927 J_ASSERT(commit_transaction->t_state == T_COMMIT); 932 J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
928 933
929 commit_transaction->t_state = T_FINISHED; 934 commit_transaction->t_state = T_FINISHED;
930 J_ASSERT(commit_transaction == journal->j_committing_transaction); 935 J_ASSERT(commit_transaction == journal->j_committing_transaction);
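The new T_COMMIT_RECORD state marks the window after all metadata is written but before the commit record is; ordering it after T_COMMIT is what lets journal_trans_will_send_data_barrier(), added below in fs/jbd/journal.c, test t_state >= T_COMMIT_RECORD. A tiny model of that ordering check (values illustrative; the real enum lives in include/linux/jbd.h and has more states):

#include <stdio.h>

enum toy_tstate { T_COMMIT, T_COMMIT_RECORD, T_FINISHED };  /* order matters */

int main(void)
{
        enum toy_tstate s = T_COMMIT_RECORD;

        /* the commit record may already be on disk, so callers can no
         * longer rely on this transaction's barrier */
        printf("past commit record: %d\n", s >= T_COMMIT_RECORD);  /* 1 */
        return 0;
}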
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd224eec9b07..93d1e47647bd 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -565,6 +565,38 @@ int log_wait_commit(journal_t *journal, tid_t tid)
565} 565}
566 566
567/* 567/*
568 * Return 1 if a given transaction has not yet sent a barrier request
569 * connected with a transaction commit. If 0 is returned, the transaction
570 * may or may not have sent the barrier. Used to avoid sending the
571 * barrier twice in common cases.
572 */
573int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
574{
575 int ret = 0;
576 transaction_t *commit_trans;
577
578 if (!(journal->j_flags & JFS_BARRIER))
579 return 0;
580 spin_lock(&journal->j_state_lock);
581 /* Transaction already committed? */
582 if (tid_geq(journal->j_commit_sequence, tid))
583 goto out;
584 /*
585 * Transaction is being committed and we already proceeded to
586 * writing commit record?
587 */
588 commit_trans = journal->j_committing_transaction;
589 if (commit_trans && commit_trans->t_tid == tid &&
590 commit_trans->t_state >= T_COMMIT_RECORD)
591 goto out;
592 ret = 1;
593out:
594 spin_unlock(&journal->j_state_lock);
595 return ret;
596}
597EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
598
599/*
568 * Log buffer allocation routines: 600 * Log buffer allocation routines:
569 */ 601 */
570 602
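The natural consumer is a filesystem fsync path: if the commit for the relevant tid has not yet written its commit record, the barrier that commit sends makes a separate device flush redundant. A userspace model of that decision (the toy journal and stand-in predicate below mimic only the documented contract; all other names are hypothetical):

#include <stdio.h>

typedef unsigned int tid_t;
struct toy_journal { tid_t commit_sequence; int barrier; };

/* Stand-in with the real function's contract: returns 1 only if the
 * commit of `tid` has not yet sent its barrier. */
static int trans_will_send_data_barrier(struct toy_journal *j, tid_t tid)
{
        return j->barrier && j->commit_sequence < tid;
}

static void fsync_model(struct toy_journal *j, tid_t tid)
{
        int needs_flush = !trans_will_send_data_barrier(j, tid);

        /* ... start and wait for the commit here ... */
        if (needs_flush)
                printf("tid %u: issuing our own cache flush\n", tid);
        else
                printf("tid %u: commit will send the barrier\n", tid);
}

int main(void)
{
        struct toy_journal j = { .commit_sequence = 5, .barrier = 1 };

        fsync_model(&j, 4);     /* already committed: flush ourselves */
        fsync_model(&j, 9);     /* commit pending: barrier comes free */
        return 0;
}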
@@ -1157,6 +1189,7 @@ int journal_destroy(journal_t *journal)
1157{ 1189{
1158 int err = 0; 1190 int err = 0;
1159 1191
1192
1160 /* Wait for the commit thread to wake up and die. */ 1193 /* Wait for the commit thread to wake up and die. */
1161 journal_kill_thread(journal); 1194 journal_kill_thread(journal);
1162 1195
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index cb1a49ae605e..54c9bc9e1b17 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd.h> 21#include <linux/jbd.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#endif 23#endif
25 24
26/* 25/*
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..076d1cc44f95 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 530 */
531 if ((journal->j_fs_dev != journal->j_dev) && 531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL); 533 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
534 BLKDEV_IFL_WAIT);
534 if (!(journal->j_flags & JBD2_ABORT)) 535 if (!(journal->j_flags & JBD2_ABORT))
535 jbd2_journal_update_superblock(journal, 1); 536 jbd2_journal_update_superblock(journal, 1);
536 return 0; 537 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..75716d3d2be0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL); 720 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
721 BLKDEV_IFL_WAIT);
721 722
722 /* Done it all: now write the commit record asynchronously. */ 723 /* Done it all: now write the commit record asynchronously. */
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 724 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
727 if (err) 728 if (err)
728 __jbd2_journal_abort_hard(journal); 729 __jbd2_journal_abort_hard(journal);
729 if (journal->j_flags & JBD2_BARRIER) 730 if (journal->j_flags & JBD2_BARRIER)
730 blkdev_issue_flush(journal->j_dev, NULL); 731 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
732 BLKDEV_IFL_WAIT);
731 } 733 }
732 734
733 err = journal_finish_inode_data_buffers(journal, commit_transaction); 735 err = journal_finish_inode_data_buffers(journal, commit_transaction);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c03d4dce4d76..bc2ff5932769 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1889,7 +1889,7 @@ static struct kmem_cache *get_slab(size_t size)
1889 BUG_ON(i >= JBD2_MAX_SLABS); 1889 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0)) 1890 if (unlikely(i < 0))
1891 i = 0; 1891 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0); 1892 BUG_ON(jbd2_slab[i] == NULL);
1893 return jbd2_slab[i]; 1893 return jbd2_slab[i];
1894} 1894}
1895 1895
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 73063285b13f..049281b7cb89 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#include <linux/crc32.h> 23#include <linux/crc32.h>
25#endif 24#endif
26 25
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..e214d68620ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle)
1311 if (handle->h_sync) 1311 if (handle->h_sync)
1312 transaction->t_synchronous_commit = 1; 1312 transaction->t_synchronous_commit = 1;
1313 current->journal_info = NULL; 1313 current->journal_info = NULL;
1314 spin_lock(&journal->j_state_lock);
1315 spin_lock(&transaction->t_handle_lock); 1314 spin_lock(&transaction->t_handle_lock);
1316 transaction->t_outstanding_credits -= handle->h_buffer_credits; 1315 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1317 transaction->t_updates--; 1316 transaction->t_updates--;
@@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle)
1340 jbd_debug(2, "transaction too old, requesting commit for " 1339 jbd_debug(2, "transaction too old, requesting commit for "
1341 "handle %p\n", handle); 1340 "handle %p\n", handle);
1342 /* This is non-blocking */ 1341 /* This is non-blocking */
1343 __jbd2_log_start_commit(journal, transaction->t_tid); 1342 jbd2_log_start_commit(journal, transaction->t_tid);
1344 spin_unlock(&journal->j_state_lock);
1345 1343
1346 /* 1344 /*
1347 * Special case: JBD2_SYNC synchronous updates require us 1345 * Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle)
1351 err = jbd2_log_wait_commit(journal, tid); 1349 err = jbd2_log_wait_commit(journal, tid);
1352 } else { 1350 } else {
1353 spin_unlock(&transaction->t_handle_lock); 1351 spin_unlock(&transaction->t_handle_lock);
1354 spin_unlock(&journal->j_state_lock);
1355 } 1352 }
1356 1353
1357 lock_map_release(&handle->h_lockdep_map); 1354 lock_map_release(&handle->h_lockdep_map);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7cdc3196476a..a33aab6b5e68 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -419,7 +419,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
419 return rc; 419 return rc;
420} 420}
421 421
422struct xattr_handler jffs2_acl_access_xattr_handler = { 422const struct xattr_handler jffs2_acl_access_xattr_handler = {
423 .prefix = POSIX_ACL_XATTR_ACCESS, 423 .prefix = POSIX_ACL_XATTR_ACCESS,
424 .flags = ACL_TYPE_DEFAULT, 424 .flags = ACL_TYPE_DEFAULT,
425 .list = jffs2_acl_access_listxattr, 425 .list = jffs2_acl_access_listxattr,
@@ -427,7 +427,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = {
427 .set = jffs2_acl_setxattr, 427 .set = jffs2_acl_setxattr,
428}; 428};
429 429
430struct xattr_handler jffs2_acl_default_xattr_handler = { 430const struct xattr_handler jffs2_acl_default_xattr_handler = {
431 .prefix = POSIX_ACL_XATTR_DEFAULT, 431 .prefix = POSIX_ACL_XATTR_DEFAULT,
432 .flags = ACL_TYPE_DEFAULT, 432 .flags = ACL_TYPE_DEFAULT,
433 .list = jffs2_acl_default_listxattr, 433 .list = jffs2_acl_default_listxattr,
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index f0ba63e3c36b..5e42de8d9541 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern struct xattr_handler jffs2_acl_access_xattr_handler; 34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
35extern struct xattr_handler jffs2_acl_default_xattr_handler; 35extern const struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3ff50da94789..55f1dde2fa8b 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -23,10 +23,9 @@ static int jffs2_garbage_collect_thread(void *);
23 23
24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) 24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
25{ 25{
26 spin_lock(&c->erase_completion_lock); 26 assert_spin_locked(&c->erase_completion_lock);
27 if (c->gc_task && jffs2_thread_should_wake(c)) 27 if (c->gc_task && jffs2_thread_should_wake(c))
28 send_sig(SIGHUP, c->gc_task, 1); 28 send_sig(SIGHUP, c->gc_task, 1);
29 spin_unlock(&c->erase_completion_lock);
30} 29}
31 30
32/* This must only ever be called when no GC thread is currently running */ 31/* This must only ever be called when no GC thread is currently running */
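jffs2_garbage_collect_trigger() now documents a locking contract instead of enforcing one: callers must already hold erase_completion_lock, and assert_spin_locked() catches violations. A userspace model of the same caller-holds-lock convention (a flag plus assert stands in for assert_spin_locked()):

#include <assert.h>
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int lock_held;                  /* models assert_spin_locked() */

static void gc_trigger_model(void)
{
        assert(lock_held);             /* caller must hold the lock */
        printf("waking GC thread\n");
}

int main(void)
{
        pthread_mutex_lock(&lock);
        lock_held = 1;
        gc_trigger_model();            /* correct usage */
        lock_held = 0;
        pthread_mutex_unlock(&lock);
        return 0;
}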
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 90cb60d09787..cd02acafde8a 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -11,7 +11,6 @@
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/lzo.h> 16#include <linux/lzo.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index cfd301a5edfc..b46661a42758 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,7 +14,6 @@
14#endif 14#endif
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
18#include <linux/zlib.h> 17#include <linux/zlib.h>
19#include <linux/zutil.h> 18#include <linux/zutil.h>
20#include "nodelist.h" 19#include "nodelist.h"
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 5544d31c066b..ec3538413926 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -15,6 +15,7 @@
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/jffs2.h> 16#include <linux/jffs2.h>
17#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
18#include <linux/slab.h>
18#include "nodelist.h" 19#include "nodelist.h"
19#include "debug.h" 20#include "debug.h"
20 21
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index b47679be118a..6286ad9b00f7 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -103,9 +103,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
103 jffs2_erase_failed(c, jeb, bad_offset); 103 jffs2_erase_failed(c, jeb, bad_offset);
104} 104}
105 105
106void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) 106int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
107{ 107{
108 struct jffs2_eraseblock *jeb; 108 struct jffs2_eraseblock *jeb;
109 int work_done = 0;
109 110
110 mutex_lock(&c->erase_free_sem); 111 mutex_lock(&c->erase_free_sem);
111 112
@@ -121,6 +122,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
121 mutex_unlock(&c->erase_free_sem); 122 mutex_unlock(&c->erase_free_sem);
122 jffs2_mark_erased_block(c, jeb); 123 jffs2_mark_erased_block(c, jeb);
123 124
125 work_done++;
124 if (!--count) { 126 if (!--count) {
125 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n")); 127 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n"));
126 goto done; 128 goto done;
@@ -157,6 +159,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
157 mutex_unlock(&c->erase_free_sem); 159 mutex_unlock(&c->erase_free_sem);
158 done: 160 done:
159 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); 161 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n"));
162 return work_done;
160} 163}
161 164
162static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) 165static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
@@ -165,10 +168,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
165 mutex_lock(&c->erase_free_sem); 168 mutex_lock(&c->erase_free_sem);
166 spin_lock(&c->erase_completion_lock); 169 spin_lock(&c->erase_completion_lock);
167 list_move_tail(&jeb->list, &c->erase_complete_list); 170 list_move_tail(&jeb->list, &c->erase_complete_list);
171 /* Wake the GC thread to mark them clean */
172 jffs2_garbage_collect_trigger(c);
168 spin_unlock(&c->erase_completion_lock); 173 spin_unlock(&c->erase_completion_lock);
169 mutex_unlock(&c->erase_free_sem); 174 mutex_unlock(&c->erase_free_sem);
170 /* Ensure that kupdated calls us again to mark them clean */ 175 wake_up(&c->erase_wait);
171 jffs2_erase_pending_trigger(c);
172} 176}
173 177
174static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) 178static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
@@ -487,9 +491,9 @@ filebad:
487 491
488refile: 492refile:
489 /* Stick it back on the list from whence it came and come back later */ 493 /* Stick it back on the list from whence it came and come back later */
490 jffs2_erase_pending_trigger(c);
491 mutex_lock(&c->erase_free_sem); 494 mutex_lock(&c->erase_free_sem);
492 spin_lock(&c->erase_completion_lock); 495 spin_lock(&c->erase_completion_lock);
496 jffs2_garbage_collect_trigger(c);
493 list_move(&jeb->list, &c->erase_complete_list); 497 list_move(&jeb->list, &c->erase_complete_list);
494 spin_unlock(&c->erase_completion_lock); 498 spin_unlock(&c->erase_completion_lock);
495 mutex_unlock(&c->erase_free_sem); 499 mutex_unlock(&c->erase_free_sem);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index b7b74e299142..813497024437 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/time.h> 14#include <linux/time.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
@@ -27,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
27 struct page **pagep, void **fsdata); 26 struct page **pagep, void **fsdata);
28static int jffs2_readpage (struct file *filp, struct page *pg); 27static int jffs2_readpage (struct file *filp, struct page *pg);
29 28
30int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync) 29int jffs2_fsync(struct file *filp, int datasync)
31{ 30{
32 struct inode *inode = dentry->d_inode; 31 struct inode *inode = filp->f_mapping->host;
33 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); 32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
34 33
35 /* Trigger GC to flush any pending writes for this inode */ 34 /* Trigger GC to flush any pending writes for this inode */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 3451a81b2142..8bc2c80ab159 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
169 mutex_unlock(&f->sem); 169 mutex_unlock(&f->sem);
170 jffs2_complete_reservation(c); 170 jffs2_complete_reservation(c);
171 171
172 /* We have to do the vmtruncate() without f->sem held, since 172 /* We have to do the simple_setsize() without f->sem held, since
173 some pages may be locked and waiting for it in readpage(). 173 some pages may be locked and waiting for it in readpage().
174 We are protected from a simultaneous write() extending i_size 174 We are protected from a simultaneous write() extending i_size
175 back past iattr->ia_size, because do_truncate() holds the 175 back past iattr->ia_size, because do_truncate() holds the
176 generic inode semaphore. */ 176 generic inode semaphore. */
177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { 177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
178 vmtruncate(inode, iattr->ia_size); 178 simple_setsize(inode, iattr->ia_size);
179 inode->i_blocks = (inode->i_size + 511) >> 9; 179 inode->i_blocks = (inode->i_size + 511) >> 9;
180 } 180 }
181 181
@@ -313,8 +313,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
313 case S_IFBLK: 313 case S_IFBLK:
314 case S_IFCHR: 314 case S_IFCHR:
315 /* Read the device numbers from the media */ 315 /* Read the device numbers from the media */
316 if (f->metadata->size != sizeof(jdev.old) && 316 if (f->metadata->size != sizeof(jdev.old_id) &&
317 f->metadata->size != sizeof(jdev.new)) { 317 f->metadata->size != sizeof(jdev.new_id)) {
318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); 318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
319 goto error_io; 319 goto error_io;
320 } 320 }
@@ -325,10 +325,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); 325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
326 goto error; 326 goto error;
327 } 327 }
328 if (f->metadata->size == sizeof(jdev.old)) 328 if (f->metadata->size == sizeof(jdev.old_id))
329 rdev = old_decode_dev(je16_to_cpu(jdev.old)); 329 rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
330 else 330 else
331 rdev = new_decode_dev(je32_to_cpu(jdev.new)); 331 rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
332 332
333 case S_IFSOCK: 333 case S_IFSOCK:
334 case S_IFIFO: 334 case S_IFIFO:
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3b6f2fa12cff..f5e96bd656e8 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -214,6 +214,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
214 return ret; 214 return ret;
215 } 215 }
216 216
217 /* If there are any blocks which need erasing, erase them now */
218 if (!list_empty(&c->erase_complete_list) ||
219 !list_empty(&c->erase_pending_list)) {
220 spin_unlock(&c->erase_completion_lock);
221 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
222 if (jffs2_erase_pending_blocks(c, 1)) {
223 mutex_unlock(&c->alloc_sem);
224 return 0;
225 }
226 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
227 spin_lock(&c->erase_completion_lock);
228 }
229
217 /* First, work out which block we're garbage-collecting */ 230 /* First, work out which block we're garbage-collecting */
218 jeb = c->gcblock; 231 jeb = c->gcblock;
219 232
@@ -222,7 +235,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
222 235
223 if (!jeb) { 236 if (!jeb) {
224 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ 237 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */
225 if (!list_empty(&c->erase_pending_list)) { 238 if (c->nr_erasing_blocks) {
226 spin_unlock(&c->erase_completion_lock); 239 spin_unlock(&c->erase_completion_lock);
227 mutex_unlock(&c->alloc_sem); 240 mutex_unlock(&c->alloc_sem);
228 return -EAGAIN; 241 return -EAGAIN;
@@ -435,7 +448,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
435 list_add_tail(&c->gcblock->list, &c->erase_pending_list); 448 list_add_tail(&c->gcblock->list, &c->erase_pending_list);
436 c->gcblock = NULL; 449 c->gcblock = NULL;
437 c->nr_erasing_blocks++; 450 c->nr_erasing_blocks++;
438 jffs2_erase_pending_trigger(c); 451 jffs2_garbage_collect_trigger(c);
439 } 452 }
440 spin_unlock(&c->erase_completion_lock); 453 spin_unlock(&c->erase_completion_lock);
441 454
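Combined with the erase.c change, jffs2_garbage_collect_pass() now performs the cheap erase work itself and treats a completed erase as the pass's progress. A hedged userspace model of the resulting control flow (locking and the real node-copying elided; names illustrative):

#include <stdio.h>

static int erasable = 1;        /* blocks sitting on the erase lists */

static int erase_pending_blocks(int count)
{
        (void)count;            /* limit ignored in this toy version */
        if (!erasable)
                return 0;
        erasable--;
        return 1;               /* one block erased: progress was made */
}

static int garbage_collect_pass(void)
{
        /* Blocks that only need erasing are handled first, and a
         * successful erase counts as this pass's progress. */
        if (erase_pending_blocks(1))
                return 0;

        printf("no erasable blocks; doing real GC\n");
        return 0;               /* real node copying would happen here */
}

int main(void)
{
        garbage_collect_pass(); /* erases the one pending block */
        garbage_collect_pass(); /* falls through to real GC */
        return 0;
}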
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 87c6f555e1a0..af02bd138469 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -15,7 +15,6 @@
15#include <linux/mtd/mtd.h> 15#include <linux/mtd/mtd.h>
16#include <linux/rbtree.h> 16#include <linux/rbtree.h>
17#include <linux/crc32.h> 17#include <linux/crc32.h>
18#include <linux/slab.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include "nodelist.h" 19#include "nodelist.h"
21 20
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 507ed6ec1847..a881a42f19e3 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -312,11 +312,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) 312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
313{ 313{
314 if (old_valid_dev(rdev)) { 314 if (old_valid_dev(rdev)) {
315 jdev->old = cpu_to_je16(old_encode_dev(rdev)); 315 jdev->old_id = cpu_to_je16(old_encode_dev(rdev));
316 return sizeof(jdev->old); 316 return sizeof(jdev->old_id);
317 } else { 317 } else {
318 jdev->new = cpu_to_je32(new_encode_dev(rdev)); 318 jdev->new_id = cpu_to_je32(new_encode_dev(rdev));
319 return sizeof(jdev->new); 319 return sizeof(jdev->new_id);
320 } 320 }
321} 321}
322 322
@@ -464,7 +464,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
464int jffs2_do_mount_fs(struct jffs2_sb_info *c); 464int jffs2_do_mount_fs(struct jffs2_sb_info *c);
465 465
466/* erase.c */ 466/* erase.c */
467void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); 467int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); 468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
469 469
470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
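The old/new to old_id/new_id rename touches JFFS2's on-media device-number union: a device node occupies two bytes when the number fits the legacy 8:8 major/minor split, four bytes otherwise. A self-contained userspace sketch of the same scheme; the 32-bit packing mirrors the kernel's new_encode_dev(), while the on-media endianness conversion (cpu_to_je16/je32) is omitted:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for union jffs2_device_node with the renamed fields. */
union device_node {
        uint16_t old_id;        /* legacy 8-bit major : 8-bit minor */
        uint32_t new_id;        /* wider huge-dev encoding */
};

static size_t encode_dev(union device_node *jdev, unsigned major, unsigned minor)
{
        if (major < 256 && minor < 256) {
                jdev->old_id = (uint16_t)((major << 8) | minor);
                return sizeof(jdev->old_id);    /* 2 bytes on the medium */
        }
        /* Same bit layout as the kernel's new_encode_dev(). */
        jdev->new_id = (minor & 0xff) | (major << 8) | ((minor & ~0xffu) << 12);
        return sizeof(jdev->new_id);            /* 4 bytes on the medium */
}

int main(void)
{
        union device_node d;

        printf("sda1 (8,1):  %zu bytes\n", encode_dev(&d, 8, 1));
        printf("big (300,5): %zu bytes\n", encode_dev(&d, 300, 5));
        return 0;
}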
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 21a052915aa9..694aa5b03505 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/mtd/mtd.h> 13#include <linux/mtd/mtd.h>
15#include <linux/compiler.h> 14#include <linux/compiler.h>
16#include <linux/sched.h> /* For cond_resched() */ 15#include <linux/sched.h> /* For cond_resched() */
@@ -117,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
117 116
118 ret = jffs2_garbage_collect_pass(c); 117 ret = jffs2_garbage_collect_pass(c);
119 118
120 if (ret == -EAGAIN) 119 if (ret == -EAGAIN) {
121 jffs2_erase_pending_blocks(c, 1); 120 spin_lock(&c->erase_completion_lock);
122 else if (ret) 121 if (c->nr_erasing_blocks &&
122 list_empty(&c->erase_pending_list) &&
123 list_empty(&c->erase_complete_list)) {
124 DECLARE_WAITQUEUE(wait, current);
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 add_wait_queue(&c->erase_wait, &wait);
127 D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__));
128 spin_unlock(&c->erase_completion_lock);
129
130 schedule();
131 } else
132 spin_unlock(&c->erase_completion_lock);
133 } else if (ret)
123 return ret; 134 return ret;
124 135
125 cond_resched(); 136 cond_resched();
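The jffs2_reserve_space() hunk above replaces "erase one block inline" with "sleep on c->erase_wait until the eraser has made progress". The add_wait_queue / set_current_state / unlock / schedule sequence registers the sleeper before dropping the lock, so a wake-up cannot be lost in between; in userspace, pthread_cond_wait() gives the same guarantee atomically, as this minimal model shows (names illustrative):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;     /* ~erase_completion_lock */
static pthread_cond_t erase_wait = PTHREAD_COND_INITIALIZER; /* ~c->erase_wait */
static int nr_erasing_blocks = 1;

static void *eraser(void *arg)
{
        (void)arg;
        sleep(1);                               /* pretend the flash erase takes a while */
        pthread_mutex_lock(&lock);
        nr_erasing_blocks = 0;
        pthread_cond_broadcast(&erase_wait);    /* kernel: wake_up(&c->erase_wait) */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, eraser, NULL);

        pthread_mutex_lock(&lock);
        while (nr_erasing_blocks)               /* always re-check after waking */
                pthread_cond_wait(&erase_wait, &lock);  /* unlock + sleep atomically */
        pthread_mutex_unlock(&lock);

        puts("erase finished; retry the space reservation");
        pthread_join(t, NULL);
        return 0;
}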
@@ -218,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
218 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); 229 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
219 list_move_tail(&ejeb->list, &c->erase_pending_list); 230 list_move_tail(&ejeb->list, &c->erase_pending_list);
220 c->nr_erasing_blocks++; 231 c->nr_erasing_blocks++;
221 jffs2_erase_pending_trigger(c); 232 jffs2_garbage_collect_trigger(c);
222 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", 233 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
223 ejeb->offset)); 234 ejeb->offset));
224 } 235 }
@@ -470,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
470void jffs2_complete_reservation(struct jffs2_sb_info *c) 481void jffs2_complete_reservation(struct jffs2_sb_info *c)
471{ 482{
472 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); 483 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n"));
484 spin_lock(&c->erase_completion_lock);
473 jffs2_garbage_collect_trigger(c); 485 jffs2_garbage_collect_trigger(c);
486 spin_unlock(&c->erase_completion_lock);
474 mutex_unlock(&c->alloc_sem); 487 mutex_unlock(&c->alloc_sem);
475} 488}
476 489
@@ -612,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
612 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 625 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
613 list_add_tail(&jeb->list, &c->erase_pending_list); 626 list_add_tail(&jeb->list, &c->erase_pending_list);
614 c->nr_erasing_blocks++; 627 c->nr_erasing_blocks++;
615 jffs2_erase_pending_trigger(c); 628 jffs2_garbage_collect_trigger(c);
616 } else { 629 } else {
617 /* Sometimes, however, we leave it elsewhere so it doesn't get 630 /* Sometimes, however, we leave it elsewhere so it doesn't get
618 immediately reused, and we spread the load a bit. */ 631 immediately reused, and we spread the load a bit. */
@@ -733,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
733 int nr_very_dirty = 0; 746 int nr_very_dirty = 0;
734 struct jffs2_eraseblock *jeb; 747 struct jffs2_eraseblock *jeb;
735 748
749 if (!list_empty(&c->erase_complete_list) ||
750 !list_empty(&c->erase_pending_list))
751 return 1;
752
736 if (c->unchecked_size) { 753 if (c->unchecked_size) {
737 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", 754 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
738 c->unchecked_size, c->checked_ino)); 755 c->unchecked_size, c->checked_ino));
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index a7f03b7ebcb3..4791aacf3084 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
140 140
141#endif /* WRITEBUFFER */ 141#endif /* WRITEBUFFER */
142 142
143/* erase.c */ 143static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
144static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c)
145{ 144{
146 OFNI_BS_2SFFJ(c)->s_dirt = 1; 145 OFNI_BS_2SFFJ(c)->s_dirt = 1;
147} 146}
@@ -159,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
159extern const struct file_operations jffs2_file_operations; 158extern const struct file_operations jffs2_file_operations;
160extern const struct inode_operations jffs2_file_inode_operations; 159extern const struct inode_operations jffs2_file_inode_operations;
161extern const struct address_space_operations jffs2_file_address_operations; 160extern const struct address_space_operations jffs2_file_address_operations;
162int jffs2_fsync(struct file *, struct dentry *, int); 161int jffs2_fsync(struct file *, int);
163int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); 162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
164 163
165/* ioctl.c */ 164/* ioctl.c */
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 696686cc206e..46f870d1cc36 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
260 ret = -EIO; 260 ret = -EIO;
261 goto out; 261 goto out;
262 } 262 }
263 jffs2_erase_pending_trigger(c); 263 spin_lock(&c->erase_completion_lock);
264 jffs2_garbage_collect_trigger(c);
265 spin_unlock(&c->erase_completion_lock);
264 } 266 }
265 ret = 0; 267 ret = 0;
266 out: 268 out:
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index eaccee058583..239f51216a68 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
77 return retlen; 77 return retlen;
78} 78}
79 79
80struct xattr_handler jffs2_security_xattr_handler = { 80const struct xattr_handler jffs2_security_xattr_handler = {
81 .prefix = XATTR_SECURITY_PREFIX, 81 .prefix = XATTR_SECURITY_PREFIX,
82 .list = jffs2_security_listxattr, 82 .list = jffs2_security_listxattr,
83 .set = jffs2_security_setxattr, 83 .set = jffs2_security_setxattr,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9a80e8e595d0..511e2d609d12 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb)
63 63
64 if (!(sb->s_flags & MS_RDONLY)) { 64 if (!(sb->s_flags & MS_RDONLY)) {
65 D1(printk(KERN_DEBUG "jffs2_write_super()\n")); 65 D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
66 jffs2_garbage_collect_trigger(c);
67 jffs2_erase_pending_blocks(c, 0);
68 jffs2_flush_wbuf_gc(c, 0); 66 jffs2_flush_wbuf_gc(c, 0);
69 } 67 }
70 68
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 4ec11e8bda8c..b955626071c2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
16#include "nodelist.h" 15#include "nodelist.h"
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5ef7bac265e5..07ee1546b2fa 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
84 struct jffs2_inodirty *new; 84 struct jffs2_inodirty *new;
85 85
86 /* Mark the superblock dirty so that kupdated will flush... */ 86 /* Mark the superblock dirty so that kupdated will flush... */
87 jffs2_erase_pending_trigger(c); 87 jffs2_dirty_trigger(c);
88 88
89 if (jffs2_wbuf_pending_for_ino(c, ino)) 89 if (jffs2_wbuf_pending_for_ino(c, ino))
90 return; 90 return;
@@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
122 list_add_tail(&jeb->list, &c->erase_pending_list); 122 list_add_tail(&jeb->list, &c->erase_pending_list);
123 c->nr_erasing_blocks++; 123 c->nr_erasing_blocks++;
124 jffs2_erase_pending_trigger(c); 124 jffs2_garbage_collect_trigger(c);
125 } else { 125 } else {
126 /* Sometimes, however, we leave it elsewhere so it doesn't get 126 /* Sometimes, however, we leave it elsewhere so it doesn't get
127 immediately reused, and we spread the load a bit. */ 127 immediately reused, and we spread the load a bit. */
@@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); 152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset));
153 list_add(&jeb->list, &c->erase_pending_list); 153 list_add(&jeb->list, &c->erase_pending_list);
154 c->nr_erasing_blocks++; 154 c->nr_erasing_blocks++;
155 jffs2_erase_pending_trigger(c); 155 jffs2_garbage_collect_trigger(c);
156 } 156 }
157 157
158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { 158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
@@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); 543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
544 list_move(&jeb->list, &c->erase_pending_list); 544 list_move(&jeb->list, &c->erase_pending_list);
545 c->nr_erasing_blocks++; 545 c->nr_erasing_blocks++;
546 jffs2_erase_pending_trigger(c); 546 jffs2_garbage_collect_trigger(c);
547 } 547 }
548 548
549 jffs2_dbg_acct_sanity_check_nolock(c, jeb); 549 jffs2_dbg_acct_sanity_check_nolock(c, jeb);
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ca29440e9435..c819eb0e982d 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/crc32.h> 14#include <linux/crc32.h>
15#include <linux/slab.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
17#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
18#include "nodelist.h" 17#include "nodelist.h"
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9e75c62c85d6..a2d58c96f1b4 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) 904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
905 * is an implementation of setxattr handler on jffs2. 905 * is an implementation of setxattr handler on jffs2.
906 * -------------------------------------------------- */ 906 * -------------------------------------------------- */
907struct xattr_handler *jffs2_xattr_handlers[] = { 907const struct xattr_handler *jffs2_xattr_handlers[] = {
908 &jffs2_user_xattr_handler, 908 &jffs2_user_xattr_handler,
909#ifdef CONFIG_JFFS2_FS_SECURITY 909#ifdef CONFIG_JFFS2_FS_SECURITY
910 &jffs2_security_xattr_handler, 910 &jffs2_security_xattr_handler,
@@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = {
917 NULL 917 NULL
918}; 918};
919 919
920static struct xattr_handler *xprefix_to_handler(int xprefix) { 920static const struct xattr_handler *xprefix_to_handler(int xprefix) {
921 struct xattr_handler *ret; 921 const struct xattr_handler *ret;
922 922
923 switch (xprefix) { 923 switch (xprefix) {
924 case JFFS2_XPREFIX_USER: 924 case JFFS2_XPREFIX_USER:
@@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
955 struct jffs2_inode_cache *ic = f->inocache; 955 struct jffs2_inode_cache *ic = f->inocache;
956 struct jffs2_xattr_ref *ref, **pref; 956 struct jffs2_xattr_ref *ref, **pref;
957 struct jffs2_xattr_datum *xd; 957 struct jffs2_xattr_datum *xd;
958 struct xattr_handler *xhandle; 958 const struct xattr_handler *xhandle;
959 ssize_t len, rc; 959 ssize_t len, rc;
960 int retry = 0; 960 int retry = 0;
961 961
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 6e3b5ddfb7ab..cf4f5759b42b 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, 93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
94 const char *buffer, size_t size, int flags); 94 const char *buffer, size_t size, int flags);
95 95
96extern struct xattr_handler *jffs2_xattr_handlers[]; 96extern const struct xattr_handler *jffs2_xattr_handlers[];
97extern struct xattr_handler jffs2_user_xattr_handler; 97extern const struct xattr_handler jffs2_user_xattr_handler;
98extern struct xattr_handler jffs2_trusted_xattr_handler; 98extern const struct xattr_handler jffs2_trusted_xattr_handler;
99 99
100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); 100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
101#define jffs2_getxattr generic_getxattr 101#define jffs2_getxattr generic_getxattr
@@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir);
125extern struct xattr_handler jffs2_security_xattr_handler; 125extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 126#else
127#define jffs2_init_security(inode,dir) (0) 127#define jffs2_init_security(inode,dir) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 128#endif /* CONFIG_JFFS2_FS_SECURITY */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 3e5a5e356e05..1c868194c504 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_trusted_xattr_handler = { 50const struct xattr_handler jffs2_trusted_xattr_handler = {
51 .prefix = XATTR_TRUSTED_PREFIX, 51 .prefix = XATTR_TRUSTED_PREFIX,
52 .list = jffs2_trusted_listxattr, 52 .list = jffs2_trusted_listxattr,
53 .set = jffs2_trusted_setxattr, 53 .set = jffs2_trusted_setxattr,
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8544af67dffe..916b5c966039 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_user_xattr_handler = { 50const struct xattr_handler jffs2_user_xattr_handler = {
51 .prefix = XATTR_USER_PREFIX, 51 .prefix = XATTR_USER_PREFIX,
52 .list = jffs2_user_listxattr, 52 .list = jffs2_user_listxattr,
53 .set = jffs2_user_setxattr, 53 .set = jffs2_user_setxattr,
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 213169780b6c..1057a4998e4e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -19,6 +19,7 @@
19 */ 19 */
20 20
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/slab.h>
22#include <linux/fs.h> 23#include <linux/fs.h>
23#include <linux/posix_acl_xattr.h> 24#include <linux/posix_acl_xattr.h>
24#include "jfs_incore.h" 25#include "jfs_incore.h"
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 14ba982b3f24..127263cc8657 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -27,9 +27,9 @@
27#include "jfs_acl.h" 27#include "jfs_acl.h"
28#include "jfs_debug.h" 28#include "jfs_debug.h"
29 29
30int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) 30int jfs_fsync(struct file *file, int datasync)
31{ 31{
32 struct inode *inode = dentry->d_inode; 32 struct inode *inode = file->f_mapping->host;
33 int rc = 0; 33 int rc = 0;
34 34
35 if (!(inode->i_state & I_DIRTY) || 35 if (!(inode->i_state & I_DIRTY) ||
@@ -98,7 +98,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
98 if (rc) 98 if (rc)
99 return rc; 99 return rc;
100 100
101 if (iattr->ia_valid & ATTR_SIZE) 101 if (is_quota_modification(inode, iattr))
102 dquot_initialize(inode); 102 dquot_initialize(inode);
103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9dd126276c9f..ed9ba6fe04f5 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -61,7 +61,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
61 inode->i_op = &page_symlink_inode_operations; 61 inode->i_op = &page_symlink_inode_operations;
62 inode->i_mapping->a_ops = &jfs_aops; 62 inode->i_mapping->a_ops = &jfs_aops;
63 } else { 63 } else {
64 inode->i_op = &jfs_symlink_inode_operations; 64 inode->i_op = &jfs_fast_symlink_inode_operations;
65 /* 65 /*
66 * The inline data should be null-terminated, but 66 * The inline data should be null-terminated, but
67 * don't let on-disk corruption crash the kernel 67 * don't let on-disk corruption crash the kernel
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d9b031cf69f5..c92ea3b3ea5e 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include "jfs_incore.h" 21#include "jfs_incore.h"
21#include "jfs_superblock.h" 22#include "jfs_superblock.h"
22#include "jfs_dmap.h" 23#include "jfs_dmap.h"
@@ -195,7 +196,7 @@ int dbMount(struct inode *ipbmap)
195 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); 196 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
196 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); 197 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
197 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); 198 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
198 bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); 199 bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
199 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); 200 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
200 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); 201 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
201 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); 202 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
@@ -287,7 +288,7 @@ int dbSync(struct inode *ipbmap)
287 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); 288 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
288 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); 289 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
289 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); 290 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
290 dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); 291 dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
291 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); 292 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
292 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); 293 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
293 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); 294 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
@@ -1440,7 +1441,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1440 * tree index of this allocation group within the control page. 1441 * tree index of this allocation group within the control page.
1441 */ 1442 */
1442 agperlev = 1443 agperlev =
1443 (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; 1444 (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
1444 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); 1445 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
1445 1446
1446 /* dmap control page trees fan-out by 4 and a single allocation 1447 /* dmap control page trees fan-out by 4 and a single allocation
@@ -1459,7 +1460,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1459 * the subtree to find the leftmost leaf that describes this 1460 * the subtree to find the leftmost leaf that describes this
1460 * free space. 1461 * free space.
1461 */ 1462 */
1462 for (k = bmp->db_agheigth; k > 0; k--) { 1463 for (k = bmp->db_agheight; k > 0; k--) {
1463 for (n = 0, m = (ti << 2) + 1; n < 4; n++) { 1464 for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
1464 if (l2nb <= dcp->stree[m + n]) { 1465 if (l2nb <= dcp->stree[m + n]) {
1465 ti = m + n; 1466 ti = m + n;
@@ -2437,7 +2438,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2437 2438
2438 /* check if this is a control page update for an allocation. 2439 /* check if this is a control page update for an allocation.
2439 * if so, update the leaf to reflect the new leaf value using 2440 * if so, update the leaf to reflect the new leaf value using
2440 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate 2441 * dbSplit(); otherwise (deallocation), use dbJoin() to update
2441 * the leaf with the new value. in addition to updating the 2442 * the leaf with the new value. in addition to updating the
2442 * leaf, dbSplit() will also split the binary buddy system of 2443 * leaf, dbSplit() will also split the binary buddy system of
2443 * the leaves, if required, and bubble new values within the 2444 * the leaves, if required, and bubble new values within the
@@ -3606,7 +3607,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3606 } 3607 }
3607 3608
3608 /* 3609 /*
3609 * compute db_aglevel, db_agheigth, db_width, db_agstart: 3610 * compute db_aglevel, db_agheight, db_width, db_agstart:
3610 * an ag is covered in aglevel dmapctl summary tree, 3611 * an ag is covered in aglevel dmapctl summary tree,
3611 * at agheight level height (from leaf) with agwidth number of nodes 3612 * at agheight level height (from leaf) with agwidth number of nodes
3612 * each, which starts at agstart index node of the summary tree node 3613 * each, which starts at agstart index node of the summary tree node
@@ -3615,9 +3616,9 @@ void dbFinalizeBmap(struct inode *ipbmap)
3615 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); 3616 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
3616 l2nl = 3617 l2nl =
3617 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); 3618 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
3618 bmp->db_agheigth = l2nl >> 1; 3619 bmp->db_agheight = l2nl >> 1;
3619 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1)); 3620 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
3620 for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0; 3621 for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
3621 i--) { 3622 i--) {
3622 bmp->db_agstart += n; 3623 bmp->db_agstart += n;
3623 n <<= 2; 3624 n <<= 2;
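Because a dmapctl summary tree fans out by four, each level of AG height consumes two bits of l2nl and any leftover bit becomes the AG width; the loop then skips the 4^0 + 4^1 + ... nodes that sit above that height to find the start index. A standalone check of the dbFinalizeBmap() arithmetic above (the l2nl inputs are made up):

#include <stdio.h>

int main(void)
{
        int l2nl, agheight, agwidth, agstart, n, i;

        for (l2nl = 0; l2nl <= 5; l2nl++) {
                agheight = l2nl >> 1;                    /* 2 bits per level: fan-out of 4 */
                agwidth = 1 << (l2nl - (agheight << 1)); /* leftover bit, if any */
                for (i = 5 - agheight, agstart = 0, n = 1; i > 0; i--) {
                        agstart += n;                    /* skip 4^0, 4^1, ... nodes */
                        n <<= 2;
                }
                printf("l2nl=%d -> agheight=%d agwidth=%d agstart=%d\n",
                       l2nl, agheight, agwidth, agstart);
        }
        return 0;
}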
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 1a6eb41569bc..6dcb906c55d8 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -210,7 +210,7 @@ struct dbmap_disk {
210 __le32 dn_maxag; /* 4: max active alloc group number */ 210 __le32 dn_maxag; /* 4: max active alloc group number */
211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */ 211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */
212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ 212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */
213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ 213 __le32 dn_agheight; /* 4: height in dmapctl of the AG */
214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ 214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */
215 __le32 dn_agstart; /* 4: start tree index at AG height */ 215 __le32 dn_agstart; /* 4: start tree index at AG height */
216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ 216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */
@@ -229,7 +229,7 @@ struct dbmap {
229 int dn_maxag; /* max active alloc group number */ 229 int dn_maxag; /* max active alloc group number */
230 int dn_agpref; /* preferred alloc group (hint) */ 230 int dn_agpref; /* preferred alloc group (hint) */
231 int dn_aglevel; /* dmapctl level holding the AG */ 231 int dn_aglevel; /* dmapctl level holding the AG */
232 int dn_agheigth; /* height in dmapctl of the AG */ 232 int dn_agheight; /* height in dmapctl of the AG */
233 int dn_agwidth; /* width in dmapctl of the AG */ 233 int dn_agwidth; /* width in dmapctl of the AG */
234 int dn_agstart; /* start tree index at AG height */ 234 int dn_agstart; /* start tree index at AG height */
235 int dn_agl2size; /* l2 num of blks per alloc group */ 235 int dn_agl2size; /* l2 num of blks per alloc group */
@@ -255,7 +255,7 @@ struct bmap {
255#define db_agsize db_bmap.dn_agsize 255#define db_agsize db_bmap.dn_agsize
256#define db_agl2size db_bmap.dn_agl2size 256#define db_agl2size db_bmap.dn_agl2size
257#define db_agwidth db_bmap.dn_agwidth 257#define db_agwidth db_bmap.dn_agwidth
258#define db_agheigth db_bmap.dn_agheigth 258#define db_agheight db_bmap.dn_agheight
259#define db_agstart db_bmap.dn_agstart 259#define db_agstart db_bmap.dn_agstart
260#define db_numag db_bmap.dn_numag 260#define db_numag db_bmap.dn_numag
261#define db_maxlevel db_bmap.dn_maxlevel 261#define db_maxlevel db_bmap.dn_maxlevel
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0e4623be70ce..9197a1b0d02d 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -102,6 +102,7 @@
102 102
103#include <linux/fs.h> 103#include <linux/fs.h>
104#include <linux/quotaops.h> 104#include <linux/quotaops.h>
105#include <linux/slab.h>
105#include "jfs_incore.h" 106#include "jfs_incore.h"
106#include "jfs_superblock.h" 107#include "jfs_superblock.h"
107#include "jfs_filsys.h" 108#include "jfs_filsys.h"
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0fc30407f039..f8332dc8eeb2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -45,6 +45,7 @@
45#include <linux/buffer_head.h> 45#include <linux/buffer_head.h>
46#include <linux/pagemap.h> 46#include <linux/pagemap.h>
47#include <linux/quotaops.h> 47#include <linux/quotaops.h>
48#include <linux/slab.h>
48 49
49#include "jfs_incore.h" 50#include "jfs_incore.h"
50#include "jfs_inode.h" 51#include "jfs_inode.h"
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 829921b67765..2686531e235a 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
98 goto fail_unlock; 98 goto fail_unlock;
99 } 99 }
100 100
101 inode->i_uid = current_fsuid(); 101 inode_init_owner(inode, parent, mode);
102 if (parent->i_mode & S_ISGID) {
103 inode->i_gid = parent->i_gid;
104 if (S_ISDIR(mode))
105 mode |= S_ISGID;
106 } else
107 inode->i_gid = current_fsgid();
108
109 /* 102 /*
110 * New inodes need to save sane values on disk when 103 * New inodes need to save sane values on disk when
111 * uid & gid mount options are used 104 * uid & gid mount options are used
@@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
121 if (rc) 114 if (rc)
122 goto fail_drop; 115 goto fail_drop;
123 116
124 inode->i_mode = mode;
125 /* inherit flags from parent */ 117 /* inherit flags from parent */
126 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; 118 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT;
127 119
@@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
134 if (S_ISLNK(mode)) 126 if (S_ISLNK(mode))
135 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); 127 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL);
136 } 128 }
137 jfs_inode->mode2 |= mode; 129 jfs_inode->mode2 |= inode->i_mode;
138 130
139 inode->i_blocks = 0; 131 inode->i_blocks = 0;
140 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 132 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
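ialloc() above swaps its open-coded ownership setup for the then-new inode_init_owner() helper. The behaviour being preserved: the inode takes the caller's fsuid, a setgid parent directory donates its gid, and a new subdirectory keeps the setgid bit. A userspace model of just that rule (this is not the kernel helper itself):

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

struct mini_inode { uid_t uid; gid_t gid; mode_t mode; };

static void init_owner(struct mini_inode *inode, const struct mini_inode *dir,
                       mode_t mode, uid_t fsuid, gid_t fsgid)
{
        inode->uid = fsuid;                     /* owner is always the creator */
        if (dir && (dir->mode & S_ISGID)) {
                inode->gid = dir->gid;          /* setgid dir donates its group */
                if (S_ISDIR(mode))
                        mode |= S_ISGID;        /* subdirectories stay setgid */
        } else {
                inode->gid = fsgid;
        }
        inode->mode = mode;
}

int main(void)
{
        struct mini_inode dir = { 0, 100, S_IFDIR | S_ISGID | 0775 };
        struct mini_inode child;

        init_owner(&child, &dir, S_IFDIR | 0755, 1000, 1000);
        printf("uid=%u gid=%u setgid=%d\n", (unsigned)child.uid,
               (unsigned)child.gid, !!(child.mode & S_ISGID));
        return 0;
}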
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 79e2c79661df..11042b1f44b5 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,7 +21,7 @@
21struct fid; 21struct fid;
22 22
23extern struct inode *ialloc(struct inode *, umode_t); 23extern struct inode *ialloc(struct inode *, umode_t);
24extern int jfs_fsync(struct file *, struct dentry *, int); 24extern int jfs_fsync(struct file *, int);
25extern long jfs_ioctl(struct file *, unsigned int, unsigned long); 25extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
@@ -48,5 +48,6 @@ extern const struct file_operations jfs_dir_operations;
48extern const struct inode_operations jfs_file_inode_operations; 48extern const struct inode_operations jfs_file_inode_operations;
49extern const struct file_operations jfs_file_operations; 49extern const struct file_operations jfs_file_operations;
50extern const struct inode_operations jfs_symlink_inode_operations; 50extern const struct inode_operations jfs_symlink_inode_operations;
51extern const struct inode_operations jfs_fast_symlink_inode_operations;
51extern const struct dentry_operations jfs_ci_dentry_operations; 52extern const struct dentry_operations jfs_ci_dentry_operations;
52#endif /* _H_JFS_INODE */ 53#endif /* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 335c4de6552d..c51af2a14516 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -70,6 +70,7 @@
70#include <linux/delay.h> 70#include <linux/delay.h>
71#include <linux/mutex.h> 71#include <linux/mutex.h>
72#include <linux/seq_file.h> 72#include <linux/seq_file.h>
73#include <linux/slab.h>
73#include "jfs_incore.h" 74#include "jfs_incore.h"
74#include "jfs_filsys.h" 75#include "jfs_filsys.h"
75#include "jfs_metapage.h" 76#include "jfs_metapage.h"
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 07b6c5dfb4b6..48b44bd8267b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/bio.h> 23#include <linux/bio.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
26#include <linux/mempool.h> 27#include <linux/mempool.h>
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 3fbb3a225590..8f0f02cb6ca6 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -19,6 +19,7 @@
19#ifndef _H_JFS_UNICODE 19#ifndef _H_JFS_UNICODE
20#define _H_JFS_UNICODE 20#define _H_JFS_UNICODE
21 21
22#include <linux/slab.h>
22#include <asm/byteorder.h> 23#include <asm/byteorder.h>
23#include "jfs_types.h" 24#include "jfs_types.h"
24 25
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4a3e9f39c21d..a9cf8e8675be 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -956,7 +956,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
956 */ 956 */
957 957
958 if (ssize <= IDATASIZE) { 958 if (ssize <= IDATASIZE) {
959 ip->i_op = &jfs_symlink_inode_operations; 959 ip->i_op = &jfs_fast_symlink_inode_operations;
960 960
961 i_fastsymlink = JFS_IP(ip)->i_inline; 961 i_fastsymlink = JFS_IP(ip)->i_inline;
962 memcpy(i_fastsymlink, name, ssize); 962 memcpy(i_fastsymlink, name, ssize);
@@ -978,7 +978,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
978 else { 978 else {
979 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); 979 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
980 980
981 ip->i_op = &page_symlink_inode_operations; 981 ip->i_op = &jfs_symlink_inode_operations;
982 ip->i_mapping->a_ops = &jfs_aops; 982 ip->i_mapping->a_ops = &jfs_aops;
983 983
984 /* 984 /*
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 7f24a0bb08ca..1aba0039f1c9 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
81 struct inode *iplist[1]; 81 struct inode *iplist[1];
82 struct jfs_superblock *j_sb, *j_sb2; 82 struct jfs_superblock *j_sb, *j_sb2;
83 uint old_agsize; 83 uint old_agsize;
84 int agsizechanged = 0;
84 struct buffer_head *bh, *bh2; 85 struct buffer_head *bh, *bh2;
85 86
86 /* If the volume hasn't grown, get out now */ 87 /* If the volume hasn't grown, get out now */
@@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
333 */ 334 */
334 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) 335 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
335 goto error_out; 336 goto error_out;
337
338 agsizechanged |= (bmp->db_agsize != old_agsize);
339
336 /* 340 /*
337 * the map now has extended to cover additional nblocks: 341 * the map now has extended to cover additional nblocks:
338 * dn_mapsize = oldMapsize + nblocks; 342 * dn_mapsize = oldMapsize + nblocks;
@@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
432 * will correctly identify the new ag); 436 * will correctly identify the new ag);
433 */ 437 */
434 /* if new AG size the same as old AG size, done! */ 438 /* if new AG size the same as old AG size, done! */
435 if (bmp->db_agsize != old_agsize) { 439 if (agsizechanged) {
436 if ((rc = diExtendFS(ipimap, ipbmap))) 440 if ((rc = diExtendFS(ipimap, ipbmap)))
437 goto error_out; 441 goto error_out;
438 442
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 266699deb1c6..b38f96bef829 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -30,6 +30,7 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 31#include <linux/exportfs.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/slab.h>
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
@@ -178,6 +179,8 @@ static void jfs_put_super(struct super_block *sb)
178 179
179 jfs_info("In jfs_put_super"); 180 jfs_info("In jfs_put_super");
180 181
182 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
183
181 lock_kernel(); 184 lock_kernel();
182 185
183 rc = jfs_umount(sb); 186 rc = jfs_umount(sb);
@@ -395,10 +398,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
395 398
396 JFS_SBI(sb)->flag = flag; 399 JFS_SBI(sb)->flag = flag;
397 ret = jfs_mount_rw(sb, 1); 400 ret = jfs_mount_rw(sb, 1);
401
402 /* mark the fs r/w for quota activity */
403 sb->s_flags &= ~MS_RDONLY;
404
398 unlock_kernel(); 405 unlock_kernel();
406 dquot_resume(sb, -1);
399 return ret; 407 return ret;
400 } 408 }
401 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 409 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
410 rc = dquot_suspend(sb, -1);
411 if (rc < 0) {
412 unlock_kernel();
413 return rc;
414 }
402 rc = jfs_umount_rw(sb); 415 rc = jfs_umount_rw(sb);
403 JFS_SBI(sb)->flag = flag; 416 JFS_SBI(sb)->flag = flag;
404 unlock_kernel(); 417 unlock_kernel();
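The remount hunks order the quota work around the read-only transition: quotas are suspended before the filesystem goes read-only, and resumed only once it is writable again. A kernel-style sketch of that ordering under the same assumptions as the patch; examplefs_mount_rw() and examplefs_umount_rw() are hypothetical stand-ins for the filesystem's own helpers, and the BKL handling is elided:

static int examplefs_remount(struct super_block *sb, int *flags, char *data)
{
        if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
                int ret = examplefs_mount_rw(sb);       /* hypothetical helper */

                sb->s_flags &= ~MS_RDONLY;      /* writable before quota I/O */
                dquot_resume(sb, -1);           /* re-enable suspended quotas */
                return ret;
        }
        if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
                int rc = dquot_suspend(sb, -1); /* flush quota state first */

                if (rc < 0)
                        return rc;
                return examplefs_umount_rw(sb); /* hypothetical helper */
        }
        return 0;
}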
@@ -445,10 +458,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
445 /* initialize the mount flag and determine the default error handler */ 458 /* initialize the mount flag and determine the default error handler */
446 flag = JFS_ERR_REMOUNT_RO; 459 flag = JFS_ERR_REMOUNT_RO;
447 460
448 if (!parse_options((char *) data, sb, &newLVSize, &flag)) { 461 if (!parse_options((char *) data, sb, &newLVSize, &flag))
449 kfree(sbi); 462 goto out_kfree;
450 return -EINVAL;
451 }
452 sbi->flag = flag; 463 sbi->flag = flag;
453 464
454#ifdef CONFIG_JFS_POSIX_ACL 465#ifdef CONFIG_JFS_POSIX_ACL
@@ -457,7 +468,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
457 468
458 if (newLVSize) { 469 if (newLVSize) {
459 printk(KERN_ERR "resize option for remount only\n"); 470 printk(KERN_ERR "resize option for remount only\n");
460 return -EINVAL; 471 goto out_kfree;
461 } 472 }
462 473
463 /* 474 /*
@@ -470,6 +481,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
470 */ 481 */
471 sb->s_op = &jfs_super_operations; 482 sb->s_op = &jfs_super_operations;
472 sb->s_export_op = &jfs_export_operations; 483 sb->s_export_op = &jfs_export_operations;
484#ifdef CONFIG_QUOTA
485 sb->dq_op = &dquot_operations;
486 sb->s_qcop = &dquot_quotactl_ops;
487#endif
473 488
474 /* 489 /*
475 * Initialize direct-mapping inode/address-space 490 * Initialize direct-mapping inode/address-space
@@ -477,7 +492,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
477 inode = new_inode(sb); 492 inode = new_inode(sb);
478 if (inode == NULL) { 493 if (inode == NULL) {
479 ret = -ENOMEM; 494 ret = -ENOMEM;
480 goto out_kfree; 495 goto out_unload;
481 } 496 }
482 inode->i_ino = 0; 497 inode->i_ino = 0;
483 inode->i_nlink = 1; 498 inode->i_nlink = 1;
@@ -549,9 +564,10 @@ out_mount_failed:
549 make_bad_inode(sbi->direct_inode); 564 make_bad_inode(sbi->direct_inode);
550 iput(sbi->direct_inode); 565 iput(sbi->direct_inode);
551 sbi->direct_inode = NULL; 566 sbi->direct_inode = NULL;
552out_kfree: 567out_unload:
553 if (sbi->nls_tab) 568 if (sbi->nls_tab)
554 unload_nls(sbi->nls_tab); 569 unload_nls(sbi->nls_tab);
570out_kfree:
555 kfree(sbi); 571 kfree(sbi);
556 return ret; 572 return ret;
557} 573}
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 4af1a05aad0a..205b946d8e0d 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -29,9 +29,21 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
29 return NULL; 29 return NULL;
30} 30}
31 31
32const struct inode_operations jfs_symlink_inode_operations = { 32const struct inode_operations jfs_fast_symlink_inode_operations = {
33 .readlink = generic_readlink, 33 .readlink = generic_readlink,
34 .follow_link = jfs_follow_link, 34 .follow_link = jfs_follow_link,
35 .setattr = jfs_setattr,
36 .setxattr = jfs_setxattr,
37 .getxattr = jfs_getxattr,
38 .listxattr = jfs_listxattr,
39 .removexattr = jfs_removexattr,
40};
41
42const struct inode_operations jfs_symlink_inode_operations = {
43 .readlink = generic_readlink,
44 .follow_link = page_follow_link_light,
45 .put_link = page_put_link,
46 .setattr = jfs_setattr,
35 .setxattr = jfs_setxattr, 47 .setxattr = jfs_setxattr,
36 .getxattr = jfs_getxattr, 48 .getxattr = jfs_getxattr,
37 .listxattr = jfs_listxattr, 49 .listxattr = jfs_listxattr,
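jfs now keeps two symlink operation tables: inline ("fast") symlinks are read straight out of the inode, while long targets go through the page cache via page_follow_link_light(). A userspace model of dispatching on target length; IDATASIZE here is an arbitrary stand-in for the on-disk inline area:

#include <stdio.h>
#include <string.h>

#define IDATASIZE 16    /* stand-in for the on-disk inline area */

struct inode;
struct inode_ops { const char *(*follow_link)(struct inode *); };

struct inode {
        const struct inode_ops *i_op;
        char inline_data[IDATASIZE];    /* "fast" symlink target */
        const char *page_data;          /* stands in for page-cache data */
};

static const char *fast_follow(struct inode *ip) { return ip->inline_data; }
static const char *page_follow(struct inode *ip) { return ip->page_data; }

static const struct inode_ops fast_symlink_ops = { fast_follow };
static const struct inode_ops page_symlink_ops = { page_follow };

static void make_symlink(struct inode *ip, const char *target)
{
        if (strlen(target) + 1 <= IDATASIZE) {
                strcpy(ip->inline_data, target);
                ip->i_op = &fast_symlink_ops;   /* jfs_fast_symlink_inode_operations */
        } else {
                ip->page_data = target;         /* really read via the page cache */
                ip->i_op = &page_symlink_ops;   /* jfs_symlink_inode_operations */
        }
}

int main(void)
{
        struct inode a = {0}, b = {0};

        make_symlink(&a, "short");
        make_symlink(&b, "a/rather/long/target/path/elsewhere");
        printf("%s\n%s\n", a.i_op->follow_link(&a), b.i_op->follow_link(&b));
        return 0;
}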
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 1f594ab21895..fa96bbb26343 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include <linux/quotaops.h> 25#include <linux/quotaops.h>
25#include <linux/security.h> 26#include <linux/security.h>
26#include "jfs_incore.h" 27#include "jfs_incore.h"
diff --git a/fs/libfs.c b/fs/libfs.c
index 9e50bcf55857..09e1016eb774 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -5,8 +5,10 @@
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/slab.h>
8#include <linux/mount.h> 9#include <linux/mount.h>
9#include <linux/vfs.h> 10#include <linux/vfs.h>
11#include <linux/quotaops.h>
10#include <linux/mutex.h> 12#include <linux/mutex.h>
11#include <linux/exportfs.h> 13#include <linux/exportfs.h>
12#include <linux/writeback.h> 14#include <linux/writeback.h>
@@ -57,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
57 return NULL; 59 return NULL;
58} 60}
59 61
60int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
61{
62 return 0;
63}
64
65int dcache_dir_open(struct inode *inode, struct file *file) 62int dcache_dir_open(struct inode *inode, struct file *file)
66{ 63{
67 static struct qstr cursor_name = {.len = 1, .name = "."}; 64 static struct qstr cursor_name = {.len = 1, .name = "."};
@@ -189,7 +186,7 @@ const struct file_operations simple_dir_operations = {
189 .llseek = dcache_dir_lseek, 186 .llseek = dcache_dir_lseek,
190 .read = generic_read_dir, 187 .read = generic_read_dir,
191 .readdir = dcache_readdir, 188 .readdir = dcache_readdir,
192 .fsync = simple_sync_file, 189 .fsync = noop_fsync,
193}; 190};
194 191
195const struct inode_operations simple_dir_inode_operations = { 192const struct inode_operations simple_dir_inode_operations = {
@@ -329,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
329 return 0; 326 return 0;
330} 327}
331 328
329/**
330 * simple_setsize - handle core mm and vfs requirements for file size change
331 * @inode: inode
332 * @newsize: new file size
333 *
334 * Returns 0 on success, -error on failure.
335 *
336 * simple_setsize must be called with inode_mutex held.
337 *
338 * simple_setsize will check that the requested new size is OK (see
339 * inode_newsize_ok), and then will perform the necessary i_size update
340 * and pagecache truncation (if necessary). It will typically be called
341 * from the filesystem's setattr function when ATTR_SIZE is passed in.
342 *
343 * The inode itself must have correct permissions and attributes to allow
344 * i_size to be changed, this function then just checks that the new size
345 * requested is valid.
346 *
347 * In the case of simple in-memory filesystems with inodes stored solely
348 * in the inode cache, and file data in the pagecache, nothing more needs
349 * to be done to satisfy a truncate request. Filesystems with on-disk
350 * blocks, for example, will need to free them in the case of truncate; in
351 * that case it may be easier not to use simple_setsize (but each of its
352 * components will likely be required at some point to update pagecache
353 * and inode etc).
354 */
355int simple_setsize(struct inode *inode, loff_t newsize)
356{
357 loff_t oldsize;
358 int error;
359
360 error = inode_newsize_ok(inode, newsize);
361 if (error)
362 return error;
363
364 oldsize = inode->i_size;
365 i_size_write(inode, newsize);
366 truncate_pagecache(inode, oldsize, newsize);
367
368 return error;
369}
370EXPORT_SYMBOL(simple_setsize);
371
372/**
373 * simple_setattr - setattr for simple in-memory filesystem
374 * @dentry: dentry
375 * @iattr: iattr structure
376 *
377 * Returns 0 on success, -error on failure.
378 *
379 * simple_setattr implements setattr for an in-memory filesystem which
380 * does not store its own file data or metadata (eg. uses the page cache
381 * and inode cache as its data store).
382 */
383int simple_setattr(struct dentry *dentry, struct iattr *iattr)
384{
385 struct inode *inode = dentry->d_inode;
386 int error;
387
388 error = inode_change_ok(inode, iattr);
389 if (error)
390 return error;
391
392 if (iattr->ia_valid & ATTR_SIZE) {
393 error = simple_setsize(inode, iattr->ia_size);
394 if (error)
395 return error;
396 }
397
398 generic_setattr(inode, iattr);
399
400 return error;
401}
402EXPORT_SYMBOL(simple_setattr);
403
332int simple_readpage(struct file *file, struct page *page) 404int simple_readpage(struct file *file, struct page *page)
333{ 405{
334 clear_highpage(page); 406 clear_highpage(page);
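As a usage note, a filesystem whose data lives entirely in the page cache can build its whole ->setattr from the helpers added above. A sketch under the 2.6.35-era API shown in this patch (examplefs is hypothetical; this is essentially what simple_setattr() itself does):

#include <linux/fs.h>

static int examplefs_setattr(struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        error = inode_change_ok(inode, iattr);  /* permission/validity checks */
        if (error)
                return error;

        if (iattr->ia_valid & ATTR_SIZE) {
                /* i_size update + pagecache truncation; nothing on disk
                 * to free for a purely page-cache-backed filesystem. */
                error = simple_setsize(inode, iattr->ia_size);
                if (error)
                        return error;
        }

        generic_setattr(inode, iattr);          /* copy uid/gid/times/mode */
        return 0;
}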
@@ -546,6 +618,40 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
546} 618}
547 619
548/** 620/**
621 * simple_write_to_buffer - copy data from user space to the buffer
622 * @to: the buffer to write to
623 * @available: the size of the buffer
624 * @ppos: the current position in the buffer
625 * @from: the user space buffer to read from
626 * @count: the maximum number of bytes to read
627 *
628 * The simple_write_to_buffer() function reads up to @count bytes from the user
629 * space address starting at @from into the buffer @to at offset @ppos.
630 *
631 * On success, the number of bytes written is returned and the offset @ppos is
632 * advanced by this number, or a negative value is returned on error.
633 **/
634ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
635 const void __user *from, size_t count)
636{
637 loff_t pos = *ppos;
638 size_t res;
639
640 if (pos < 0)
641 return -EINVAL;
642 if (pos >= available || !count)
643 return 0;
644 if (count > available - pos)
645 count = available - pos;
646 res = copy_from_user(to + pos, from, count);
647 if (res == count)
648 return -EFAULT;
649 count -= res;
650 *ppos = pos + count;
651 return count;
652}
653
654/**
549 * memory_read_from_buffer - copy data from the buffer 655 * memory_read_from_buffer - copy data from the buffer
550 * @to: the kernel space buffer to read to 656 * @to: the kernel space buffer to read to
551 * @count: the maximum number of bytes to read 657 * @count: the maximum number of bytes to read
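A typical consumer of the new helper is a debugfs-style write handler backed by a fixed buffer. A kernel-style sketch assuming that setting; examplefs_write() and state_buf are illustrative names, not part of this patch:

static char state_buf[64];      /* fixed backing store, illustrative */

static ssize_t examplefs_write(struct file *file, const char __user *ubuf,
                               size_t count, loff_t *ppos)
{
        /* Copies at most sizeof(state_buf) - *ppos bytes, advances *ppos,
         * and returns the byte count, or -EFAULT if the user copy faults. */
        return simple_write_to_buffer(state_buf, sizeof(state_buf), ppos,
                                      ubuf, count);
}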
@@ -816,13 +922,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
816} 922}
817EXPORT_SYMBOL_GPL(generic_fh_to_parent); 923EXPORT_SYMBOL_GPL(generic_fh_to_parent);
818 924
819int simple_fsync(struct file *file, struct dentry *dentry, int datasync) 925/**
926 * generic_file_fsync - generic fsync implementation for simple filesystems
927 * @file: file to synchronize
928 * @datasync: only synchronize essential metadata if true
929 *
930 * This is a generic implementation of the fsync method for simple
931 * filesystems which track all non-inode metadata in the buffers list
932 * hanging off the address_space structure.
933 */
934int generic_file_fsync(struct file *file, int datasync)
820{ 935{
821 struct writeback_control wbc = { 936 struct writeback_control wbc = {
822 .sync_mode = WB_SYNC_ALL, 937 .sync_mode = WB_SYNC_ALL,
823 .nr_to_write = 0, /* metadata-only; caller takes care of data */ 938 .nr_to_write = 0, /* metadata-only; caller takes care of data */
824 }; 939 };
825 struct inode *inode = dentry->d_inode; 940 struct inode *inode = file->f_mapping->host;
826 int err; 941 int err;
827 int ret; 942 int ret;
828 943
@@ -837,7 +952,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
837 ret = err; 952 ret = err;
838 return ret; 953 return ret;
839} 954}
840EXPORT_SYMBOL(simple_fsync); 955EXPORT_SYMBOL(generic_file_fsync);
956
957/*
958 * No-op implementation of ->fsync for in-memory filesystems.
959 */
960int noop_fsync(struct file *file, int datasync)
961{
962 return 0;
963}
841 964
842EXPORT_SYMBOL(dcache_dir_close); 965EXPORT_SYMBOL(dcache_dir_close);
843EXPORT_SYMBOL(dcache_dir_lseek); 966EXPORT_SYMBOL(dcache_dir_lseek);
@@ -860,9 +983,10 @@ EXPORT_SYMBOL(simple_release_fs);
860EXPORT_SYMBOL(simple_rename); 983EXPORT_SYMBOL(simple_rename);
861EXPORT_SYMBOL(simple_rmdir); 984EXPORT_SYMBOL(simple_rmdir);
862EXPORT_SYMBOL(simple_statfs); 985EXPORT_SYMBOL(simple_statfs);
863EXPORT_SYMBOL(simple_sync_file); 986EXPORT_SYMBOL(noop_fsync);
864EXPORT_SYMBOL(simple_unlink); 987EXPORT_SYMBOL(simple_unlink);
865EXPORT_SYMBOL(simple_read_from_buffer); 988EXPORT_SYMBOL(simple_read_from_buffer);
989EXPORT_SYMBOL(simple_write_to_buffer);
866EXPORT_SYMBOL(memory_read_from_buffer); 990EXPORT_SYMBOL(memory_read_from_buffer);
867EXPORT_SYMBOL(simple_transaction_set); 991EXPORT_SYMBOL(simple_transaction_set);
868EXPORT_SYMBOL(simple_transaction_get); 992EXPORT_SYMBOL(simple_transaction_get);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index fc9032dc8862..64fd427c993c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/slab.h>
11#include <linux/time.h> 12#include <linux/time.h>
12#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
13#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249fef11f..7932c399fab4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/slab.h>
11#include <linux/types.h> 12#include <linux/types.h>
12#include <linux/errno.h> 13#include <linux/errno.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index fefa4df3f005..e3015464fbab 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/ktime.h> 12#include <linux/ktime.h>
13#include <linux/slab.h>
13 14
14#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/xprtsock.h> 16#include <linux/sunrpc/xprtsock.h>
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 7d150517ddf0..f1bacf1a0391 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -21,7 +21,6 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
27#include <linux/mutex.h> 26#include <linux/mutex.h>
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a7966eed3c17..031c6569a134 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d1001790fa9a..84055d31bfc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/slab.h>
24#include <linux/errno.h> 25#include <linux/errno.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 56c9519d900a..0f2ab741ae7c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da7ca63..d0ef94cfb3da 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/in.h> 12#include <linux/in.h>
13#include <linux/slab.h>
13#include <linux/mutex.h> 14#include <linux/mutex.h>
14#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9718c22f186d..9bd2ce2a3040 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -9,6 +9,7 @@
9#include <linux/bio.h> 9#include <linux/bio.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/gfp.h>
12 13
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14 15
@@ -80,6 +81,7 @@ static void writeseg_end_io(struct bio *bio, int err)
80 prefetchw(&bvec->bv_page->flags); 81 prefetchw(&bvec->bv_page->flags);
81 82
82 end_page_writeback(page); 83 end_page_writeback(page);
84 page_cache_release(page);
83 } while (bvec >= bio->bi_io_vec); 85 } while (bvec >= bio->bi_io_vec);
84 bio_put(bio); 86 bio_put(bio);
85 if (atomic_dec_and_test(&super->s_pending_writes)) 87 if (atomic_dec_and_test(&super->s_pending_writes))
@@ -97,8 +99,10 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
97 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); 99 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
98 int i; 100 int i;
99 101
102 if (max_pages > BIO_MAX_PAGES)
103 max_pages = BIO_MAX_PAGES;
100 bio = bio_alloc(GFP_NOFS, max_pages); 104 bio = bio_alloc(GFP_NOFS, max_pages);
101 BUG_ON(!bio); /* FIXME: handle this */ 105 BUG_ON(!bio);
102 106
103 for (i = 0; i < nr_pages; i++) { 107 for (i = 0; i < nr_pages; i++) {
104 if (i >= max_pages) { 108 if (i >= max_pages) {
@@ -191,8 +195,10 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
191 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); 195 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
192 int i; 196 int i;
193 197
198 if (max_pages > BIO_MAX_PAGES)
199 max_pages = BIO_MAX_PAGES;
194 bio = bio_alloc(GFP_NOFS, max_pages); 200 bio = bio_alloc(GFP_NOFS, max_pages);
195 BUG_ON(!bio); /* FIXME: handle this */ 201 BUG_ON(!bio);
196 202
197 for (i = 0; i < nr_pages; i++) { 203 for (i = 0; i < nr_pages; i++) {
198 if (i >= max_pages) { 204 if (i >= max_pages) {
@@ -297,6 +303,11 @@ static void bdev_put_device(struct super_block *sb)
297 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); 303 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
298} 304}
299 305
306static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
307{
308 return 0;
309}
310
300static const struct logfs_device_ops bd_devops = { 311static const struct logfs_device_ops bd_devops = {
301 .find_first_sb = bdev_find_first_sb, 312 .find_first_sb = bdev_find_first_sb,
302 .find_last_sb = bdev_find_last_sb, 313 .find_last_sb = bdev_find_last_sb,
@@ -304,6 +315,7 @@ static const struct logfs_device_ops bd_devops = {
304 .readpage = bdev_readpage, 315 .readpage = bdev_readpage,
305 .writeseg = bdev_writeseg, 316 .writeseg = bdev_writeseg,
306 .erase = bdev_erase, 317 .erase = bdev_erase,
318 .can_write_buf = bdev_can_write_buf,
307 .sync = bdev_sync, 319 .sync = bdev_sync,
308 .put_device = bdev_put_device, 320 .put_device = bdev_put_device,
309}; 321};
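The max_pages clamp added to both __bdev_writeseg() and do_erase() guards the allocation that follows: bio_alloc() cannot return a bio with more than BIO_MAX_PAGES vectors and yields NULL for larger requests, which the BUG_ON would turn into a crash on queues with large hardware limits. The pattern in isolation, as a sketch with an illustrative function name:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Size a bio by the queue's hardware limit, but never request more
 * vectors than bio_alloc() supports. A sketch, not part of the patch. */
static struct bio *example_alloc_seg_bio(struct request_queue *q)
{
        unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);

        if (max_pages > BIO_MAX_PAGES)
                max_pages = BIO_MAX_PAGES;
        return bio_alloc(GFP_NOFS, max_pages);
}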
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index cafb6ef2e05b..a85d47d13e4b 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -9,6 +9,7 @@
9#include <linux/completion.h> 9#include <linux/completion.h>
10#include <linux/mount.h> 10#include <linux/mount.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/slab.h>
12 13
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14 15
@@ -126,7 +127,8 @@ static int mtd_readpage(void *_sb, struct page *page)
126 127
127 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, 128 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
128 page_address(page)); 129 page_address(page));
129 if (err == -EUCLEAN) { 130 if (err == -EUCLEAN || err == -EBADMSG) {
131 /* -EBADMSG happens regularly on power failures */
130 err = 0; 132 err = 0;
131 /* FIXME: force GC this segment */ 133 /* FIXME: force GC this segment */
132 } 134 }
@@ -233,12 +235,32 @@ static void mtd_put_device(struct super_block *sb)
233 put_mtd_device(logfs_super(sb)->s_mtd); 235 put_mtd_device(logfs_super(sb)->s_mtd);
234} 236}
235 237
238static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
239{
240 struct logfs_super *super = logfs_super(sb);
241 void *buf;
242 int err;
243
244 buf = kmalloc(super->s_writesize, GFP_KERNEL);
245 if (!buf)
246 return -ENOMEM;
247 err = mtd_read(sb, ofs, super->s_writesize, buf);
248 if (err)
249 goto out;
250 if (memchr_inv(buf, 0xff, super->s_writesize))
251 err = -EIO;
252 kfree(buf);
253out:
254 return err;
255}
256
236static const struct logfs_device_ops mtd_devops = { 257static const struct logfs_device_ops mtd_devops = {
237 .find_first_sb = mtd_find_first_sb, 258 .find_first_sb = mtd_find_first_sb,
238 .find_last_sb = mtd_find_last_sb, 259 .find_last_sb = mtd_find_last_sb,
239 .readpage = mtd_readpage, 260 .readpage = mtd_readpage,
240 .writeseg = mtd_writeseg, 261 .writeseg = mtd_writeseg,
241 .erase = mtd_erase, 262 .erase = mtd_erase,
263 .can_write_buf = mtd_can_write_buf,
242 .sync = mtd_sync, 264 .sync = mtd_sync,
243 .put_device = mtd_put_device, 265 .put_device = mtd_put_device,
244}; 266};
@@ -250,5 +272,7 @@ int logfs_get_sb_mtd(struct file_system_type *type, int flags,
250 const struct logfs_device_ops *devops = &mtd_devops; 272 const struct logfs_device_ops *devops = &mtd_devops;
251 273
252 mtd = get_mtd_device(NULL, mtdnr); 274 mtd = get_mtd_device(NULL, mtdnr);
275 if (IS_ERR(mtd))
276 return PTR_ERR(mtd);
253 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); 277 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
254} 278}
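mtd_can_write_buf() decides whether the write buffer may still be flushed to ofs: flash can only be programmed while the target range is in its erased, all-0xff state. It leans on memchr_inv(), which returns a pointer to the first byte that differs from the given pattern and NULL when the whole range matches. The predicate reduced to a sketch (assumes the memchr_inv() declaration is in scope, as it is for logfs):

#include <linux/types.h>
#include <linux/errno.h>

/* Sketch: a flash range is append-writable only while still erased. */
static int example_range_is_erased(const void *buf, size_t len)
{
        return memchr_inv(buf, 0xff, len) ? -EIO : 0;
}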
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 56a8bfbb0120..72d1893ddd36 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -6,13 +6,13 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9 9#include <linux/slab.h>
10 10
11/* 11/*
12 * Atomic dir operations 12 * Atomic dir operations
13 * 13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are 14 * Directory operations are by default not atomic. Dentries and Inodes are
15 * created/removed/altered in seperate operations. Therefore we need to do 15 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling. 16 * a small amount of journaling.
17 * 17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do 18 * Create, link, mkdir, mknod and symlink all share the same function to do
@@ -303,12 +303,12 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
303 (filler_t *)logfs_readpage, NULL); 303 (filler_t *)logfs_readpage, NULL);
304 if (IS_ERR(page)) 304 if (IS_ERR(page))
305 return PTR_ERR(page); 305 return PTR_ERR(page);
306 dd = kmap_atomic(page, KM_USER0); 306 dd = kmap(page);
307 BUG_ON(dd->namelen == 0); 307 BUG_ON(dd->namelen == 0);
308 308
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), 309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
310 pos, be64_to_cpu(dd->ino), dd->type); 310 pos, be64_to_cpu(dd->ino), dd->type);
311 kunmap_atomic(dd, KM_USER0); 311 kunmap(page);
312 page_cache_release(page); 312 page_cache_release(page);
313 if (full) 313 if (full)
314 break; 314 break;
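The switch from kmap_atomic()/KM_USER0 to plain kmap() in __logfs_readdir() matters because filldir() copies the entry name out to a user buffer and may fault or sleep, neither of which is permitted inside an atomic kmap section. The sleepable variant has no such restriction; the shape of the safe pattern, as a self-contained sketch:

#include <linux/highmem.h>

/* Illustrative only: map a highmem page, call something that may
 * sleep or fault, then unmap. Under kmap_atomic() the callback
 * would be a bug. */
static void example_with_mapped_page(struct page *page,
                                     void (*may_sleep)(void *addr))
{
        void *addr = kmap(page);        /* sleepable mapping */

        may_sleep(addr);
        kunmap(page);
}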
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 370f367a933e..abe1cafbd4c2 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -161,7 +161,17 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
161 161
162static void logfs_invalidatepage(struct page *page, unsigned long offset) 162static void logfs_invalidatepage(struct page *page, unsigned long offset)
163{ 163{
164 move_page_to_btree(page); 164 struct logfs_block *block = logfs_block(page);
165
166 if (block->reserved_bytes) {
167 struct super_block *sb = page->mapping->host->i_sb;
168 struct logfs_super *super = logfs_super(sb);
169
170 super->s_dirty_pages -= block->reserved_bytes;
171 block->ops->free_block(sb, block);
172 BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR));
173 } else
174 move_page_to_btree(page);
165 BUG_ON(PagePrivate(page) || page->private); 175 BUG_ON(PagePrivate(page) || page->private);
166} 176}
167 177
@@ -209,13 +219,11 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
209 } 219 }
210} 220}
211 221
212int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) 222int logfs_fsync(struct file *file, int datasync)
213{ 223{
214 struct super_block *sb = dentry->d_inode->i_sb; 224 struct super_block *sb = file->f_mapping->host->i_sb;
215 struct logfs_super *super = logfs_super(sb);
216 225
217 /* FIXME: write anchor */ 226 logfs_write_anchor(sb);
218 super->s_devops->sync(sb);
219 return 0; 227 return 0;
220} 228}
221 229
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 92949f95a901..caa4419285dc 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/slab.h>
10 11
11/* 12/*
12 * Wear leveling needs to kick in when the difference between low erase 13 * Wear leveling needs to kick in when the difference between low erase
@@ -121,7 +122,7 @@ static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
121 logfs_safe_iput(inode, cookie); 122 logfs_safe_iput(inode, cookie);
122} 123}
123 124
124static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist) 125static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
125{ 126{
126 struct logfs_super *super = logfs_super(sb); 127 struct logfs_super *super = logfs_super(sb);
127 struct logfs_segment_header sh; 128 struct logfs_segment_header sh;
@@ -400,7 +401,7 @@ static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
400 segno, (u64)segno << super->s_segshift, 401 segno, (u64)segno << super->s_segshift,
401 dist, no_free_segments(sb), valid, 402 dist, no_free_segments(sb), valid,
402 super->s_free_bytes); 403 super->s_free_bytes);
403 cleaned = logfs_gc_segment(sb, segno, dist); 404 cleaned = logfs_gc_segment(sb, segno);
404 log_gc("GC segment #%02x complete - now %x valid\n", segno, 405 log_gc("GC segment #%02x complete - now %x valid\n", segno,
405 valid - cleaned); 406 valid - cleaned);
406 BUG_ON(cleaned != valid); 407 BUG_ON(cleaned != valid);
@@ -458,6 +459,14 @@ static void __logfs_gc_pass(struct super_block *sb, int target)
458 struct logfs_block *block; 459 struct logfs_block *block;
459 int round, progress, last_progress = 0; 460 int round, progress, last_progress = 0;
460 461
462 /*
463 * Doing too many changes to the segfile at once would result
464 * in a large number of aliases. Write the journal before
465 * things get out of hand.
466 */
467 if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
468 logfs_write_anchor(sb);
469
461 if (no_free_segments(sb) >= target && 470 if (no_free_segments(sb) >= target &&
462 super->s_no_object_aliases < MAX_OBJ_ALIASES) 471 super->s_no_object_aliases < MAX_OBJ_ALIASES)
463 return; 472 return;
@@ -623,38 +632,31 @@ static int check_area(struct super_block *sb, int i)
623{ 632{
624 struct logfs_super *super = logfs_super(sb); 633 struct logfs_super *super = logfs_super(sb);
625 struct logfs_area *area = super->s_area[i]; 634 struct logfs_area *area = super->s_area[i];
626 struct logfs_object_header oh; 635 gc_level_t gc_level;
636 u32 cleaned, valid, ec;
627 u32 segno = area->a_segno; 637 u32 segno = area->a_segno;
628 u32 ofs = area->a_used_bytes; 638 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
629 __be32 crc;
630 int err;
631 639
632 if (!area->a_is_open) 640 if (!area->a_is_open)
633 return 0; 641 return 0;
634 642
635 for (ofs = area->a_used_bytes; 643 if (super->s_devops->can_write_buf(sb, ofs) == 0)
636 ofs <= super->s_segsize - sizeof(oh); 644 return 0;
637 ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
638 err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
639 if (err)
640 return err;
641
642 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
643 break;
644 645
645 crc = logfs_crc32(&oh, sizeof(oh) - 4, 4); 646 printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
646 if (crc != oh.crc) { 647 /*
647 printk(KERN_INFO "interrupted header at %llx\n", 648 * The device cannot write back the write buffer. Most likely the
648 dev_ofs(sb, segno, ofs)); 649 * wbuf was already written out and the system crashed at some point
649 return 0; 650 * before the journal commit happened. In that case we wouldn't have
650 } 651 * to do anything. But if the crash happened before the wbuf was
651 } 652 * written out correctly, we must GC this segment. So assume the
652 if (ofs != area->a_used_bytes) { 653 * worst and always do the GC run.
653 printk(KERN_INFO "%x bytes unaccounted data found at %llx\n", 654 */
654 ofs - area->a_used_bytes, 655 area->a_is_open = 0;
655 dev_ofs(sb, segno, area->a_used_bytes)); 656 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
656 area->a_used_bytes = ofs; 657 cleaned = logfs_gc_segment(sb, segno);
657 } 658 if (cleaned != valid)
659 return -EIO;
658 return 0; 660 return 0;
659} 661}
660 662
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 33ec1aeaeec4..f602e230e162 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -6,6 +6,7 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/slab.h>
9#include <linux/writeback.h> 10#include <linux/writeback.h>
10#include <linux/backing-dev.h> 11#include <linux/backing-dev.h>
11 12
@@ -192,6 +193,7 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
192 inode->i_ctime = CURRENT_TIME; 193 inode->i_ctime = CURRENT_TIME;
193 inode->i_mtime = CURRENT_TIME; 194 inode->i_mtime = CURRENT_TIME;
194 inode->i_nlink = 1; 195 inode->i_nlink = 1;
196 li->li_refcount = 1;
195 INIT_LIST_HEAD(&li->li_freeing_list); 197 INIT_LIST_HEAD(&li->li_freeing_list);
196 198
197 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) 199 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
@@ -325,7 +327,7 @@ static void logfs_set_ino_generation(struct super_block *sb,
325 u64 ino; 327 u64 ino;
326 328
327 mutex_lock(&super->s_journal_mutex); 329 mutex_lock(&super->s_journal_mutex);
328 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino); 330 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1);
329 super->s_last_ino = ino; 331 super->s_last_ino = ino;
330 super->s_inos_till_wrap--; 332 super->s_inos_till_wrap--;
331 if (super->s_inos_till_wrap < 0) { 333 if (super->s_inos_till_wrap < 0) {
@@ -356,14 +358,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode)
356 inode->i_mode = mode; 358 inode->i_mode = mode;
357 logfs_set_ino_generation(sb, inode); 359 logfs_set_ino_generation(sb, inode);
358 360
359 inode->i_uid = current_fsuid(); 361 inode_init_owner(inode, dir, mode);
360 inode->i_gid = current_fsgid();
361 if (dir->i_mode & S_ISGID) {
362 inode->i_gid = dir->i_gid;
363 if (S_ISDIR(mode))
364 inode->i_mode |= S_ISGID;
365 }
366
367 logfs_inode_setops(inode); 362 logfs_inode_setops(inode);
368 insert_inode_hash(inode); 363 insert_inode_hash(inode);
369 364
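inode_init_owner() is the then-new VFS helper that factors out exactly the owner/setgid logic deleted above; fs/minix/bitmap.c at the end of this diff makes the same substitution. Its effective behaviour, reconstructed from the removed lines as a sketch (the real helper lives in fs/inode.c):

#include <linux/fs.h>
#include <linux/cred.h>
#include <linux/stat.h>

/* Sketch of the helper's semantics: owner from the caller, group and
 * directory-setgid propagation from the parent, then the final mode. */
static void example_init_owner(struct inode *inode,
                               const struct inode *dir, int mode)
{
        inode->i_uid = current_fsuid();
        if (dir && dir->i_mode & S_ISGID) {
                inode->i_gid = dir->i_gid;
                if (S_ISDIR(mode))
                        mode |= S_ISGID;
        } else
                inode->i_gid = current_fsgid();
        inode->i_mode = mode;
}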
@@ -385,8 +380,7 @@ static void logfs_init_once(void *_li)
385 380
386static int logfs_sync_fs(struct super_block *sb, int wait) 381static int logfs_sync_fs(struct super_block *sb, int wait)
387{ 382{
388 /* FIXME: write anchor */ 383 logfs_write_anchor(sb);
389 logfs_super(sb)->s_devops->sync(sb);
390 return 0; 384 return 0;
391} 385}
392 386
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 6ad30a4c9052..4b0e0616b357 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -6,6 +6,7 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/slab.h>
9 10
10static void logfs_calc_free(struct super_block *sb) 11static void logfs_calc_free(struct super_block *sb)
11{ 12{
@@ -131,10 +132,9 @@ static int read_area(struct super_block *sb, struct logfs_je_area *a)
131 132
132 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); 133 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
133 if (super->s_writesize > 1) 134 if (super->s_writesize > 1)
134 logfs_buf_recover(area, ofs, a + 1, super->s_writesize); 135 return logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
135 else 136 else
136 logfs_buf_recover(area, ofs, NULL, 0); 137 return logfs_buf_recover(area, ofs, NULL, 0);
137 return 0;
138} 138}
139 139
140static void *unpack(void *from, void *to) 140static void *unpack(void *from, void *to)
@@ -244,7 +244,7 @@ static int read_je(struct super_block *sb, u64 ofs)
244 read_erasecount(sb, unpack(jh, scratch)); 244 read_erasecount(sb, unpack(jh, scratch));
245 break; 245 break;
246 case JE_AREA: 246 case JE_AREA:
247 read_area(sb, unpack(jh, scratch)); 247 err = read_area(sb, unpack(jh, scratch));
248 break; 248 break;
249 case JE_OBJ_ALIAS: 249 case JE_OBJ_ALIAS:
250 err = logfs_load_object_aliases(sb, unpack(jh, scratch), 250 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
@@ -388,7 +388,10 @@ static void journal_get_erase_count(struct logfs_area *area)
388static int journal_erase_segment(struct logfs_area *area) 388static int journal_erase_segment(struct logfs_area *area)
389{ 389{
390 struct super_block *sb = area->a_sb; 390 struct super_block *sb = area->a_sb;
391 struct logfs_segment_header sh; 391 union {
392 struct logfs_segment_header sh;
393 unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
394 } u;
392 u64 ofs; 395 u64 ofs;
393 int err; 396 int err;
394 397
@@ -396,20 +399,21 @@ static int journal_erase_segment(struct logfs_area *area)
396 if (err) 399 if (err)
397 return err; 400 return err;
398 401
399 sh.pad = 0; 402 memset(&u, 0, sizeof(u));
400 sh.type = SEG_JOURNAL; 403 u.sh.pad = 0;
401 sh.level = 0; 404 u.sh.type = SEG_JOURNAL;
402 sh.segno = cpu_to_be32(area->a_segno); 405 u.sh.level = 0;
403 sh.ec = cpu_to_be32(area->a_erase_count); 406 u.sh.segno = cpu_to_be32(area->a_segno);
404 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); 407 u.sh.ec = cpu_to_be32(area->a_erase_count);
405 sh.crc = logfs_crc32(&sh, sizeof(sh), 4); 408 u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
409 u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
406 410
407 /* This causes a bug in segment.c. Not yet. */ 411 /* This causes a bug in segment.c. Not yet. */
408 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); 412 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
409 413
410 ofs = dev_ofs(sb, area->a_segno, 0); 414 ofs = dev_ofs(sb, area->a_segno, 0);
411 area->a_used_bytes = ALIGN(sizeof(sh), 16); 415 area->a_used_bytes = sizeof(u);
412 logfs_buf_write(area, ofs, &sh, sizeof(sh)); 416 logfs_buf_write(area, ofs, &u, sizeof(u));
413 return 0; 417 return 0;
414} 418}
415 419
@@ -493,6 +497,8 @@ static void account_shadows(struct super_block *sb)
493 497
494 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); 498 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
495 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); 499 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
500 btree_grim_visitor32(&tree->segment_map, 0, NULL);
501 tree->no_shadowed_segments = 0;
496 502
497 if (li->li_block) { 503 if (li->li_block) {
498 /* 504 /*
@@ -606,9 +612,9 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
606 if (len == 0) 612 if (len == 0)
607 return logfs_write_header(super, header, 0, type); 613 return logfs_write_header(super, header, 0, type);
608 614
615 BUG_ON(len > sb->s_blocksize);
609 compr_len = logfs_compress(buf, data, len, sb->s_blocksize); 616 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
610 if (compr_len < 0 || type == JE_ANCHOR) { 617 if (compr_len < 0 || type == JE_ANCHOR) {
611 BUG_ON(len > sb->s_blocksize);
612 memcpy(data, buf, len); 618 memcpy(data, buf, len);
613 compr_len = len; 619 compr_len = len;
614 compr = COMPR_NONE; 620 compr = COMPR_NONE;
@@ -660,6 +666,7 @@ static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
660 if (ofs < 0) 666 if (ofs < 0)
661 return ofs; 667 return ofs;
662 logfs_buf_write(area, ofs, super->s_compressed_je, len); 668 logfs_buf_write(area, ofs, super->s_compressed_je, len);
669 BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
663 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); 670 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
664 return 0; 671 return 0;
665} 672}
@@ -800,6 +807,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
800{ 807{
801 struct logfs_super *super = logfs_super(sb); 808 struct logfs_super *super = logfs_super(sb);
802 struct logfs_area *area = super->s_journal_area; 809 struct logfs_area *area = super->s_journal_area;
810 struct btree_head32 *head = &super->s_reserved_segments;
803 u32 segno, ec; 811 u32 segno, ec;
804 int i, err; 812 int i, err;
805 813
@@ -807,6 +815,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
807 /* Drop old segments */ 815 /* Drop old segments */
808 journal_for_each(i) 816 journal_for_each(i)
809 if (super->s_journal_seg[i]) { 817 if (super->s_journal_seg[i]) {
818 btree_remove32(head, super->s_journal_seg[i]);
810 logfs_set_segment_unreserved(sb, 819 logfs_set_segment_unreserved(sb,
811 super->s_journal_seg[i], 820 super->s_journal_seg[i],
812 super->s_journal_ec[i]); 821 super->s_journal_ec[i]);
@@ -819,8 +828,13 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
819 super->s_journal_seg[i] = segno; 828 super->s_journal_seg[i] = segno;
820 super->s_journal_ec[i] = ec; 829 super->s_journal_ec[i] = ec;
821 logfs_set_segment_reserved(sb, segno); 830 logfs_set_segment_reserved(sb, segno);
831 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
832 BUG_ON(err); /* mempool should prevent this */
833 err = logfs_erase_segment(sb, segno, 1);
834 BUG_ON(err); /* FIXME: remount-ro would be nicer */
822 } 835 }
823 /* Manually move journal_area */ 836 /* Manually move journal_area */
837 freeseg(sb, area->a_segno);
824 area->a_segno = super->s_journal_seg[0]; 838 area->a_segno = super->s_journal_seg[0];
825 area->a_is_open = 0; 839 area->a_is_open = 0;
826 area->a_used_bytes = 0; 840 area->a_used_bytes = 0;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 129779431373..c838c4d72111 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -144,6 +144,7 @@ struct logfs_area_ops {
144 * @erase: erase one segment 144 * @erase: erase one segment
145 * @read: read from the device 145 * @read: read from the device
146 * @erase: erase part of the device 146 * @erase: erase part of the device
147 * @can_write_buf: decide whether wbuf can be written to ofs
147 */ 148 */
148struct logfs_device_ops { 149struct logfs_device_ops {
149 struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs); 150 struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
@@ -153,6 +154,7 @@ struct logfs_device_ops {
153 void (*writeseg)(struct super_block *sb, u64 ofs, size_t len); 154 void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
154 int (*erase)(struct super_block *sb, loff_t ofs, size_t len, 155 int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
155 int ensure_write); 156 int ensure_write);
157 int (*can_write_buf)(struct super_block *sb, u64 ofs);
156 void (*sync)(struct super_block *sb); 158 void (*sync)(struct super_block *sb);
157 void (*put_device)(struct super_block *sb); 159 void (*put_device)(struct super_block *sb);
158}; 160};
@@ -257,10 +259,14 @@ struct logfs_shadow {
257 * struct shadow_tree 259 * struct shadow_tree
258 * @new: shadows where old_ofs==0, indexed by new_ofs 260 * @new: shadows where old_ofs==0, indexed by new_ofs
259 * @old: shadows where old_ofs!=0, indexed by old_ofs 261 * @old: shadows where old_ofs!=0, indexed by old_ofs
262 * @segment_map: bitfield of segments containing shadows
 263 * @no_shadowed_segments: number of segments containing shadows
260 */ 264 */
261struct shadow_tree { 265struct shadow_tree {
262 struct btree_head64 new; 266 struct btree_head64 new;
263 struct btree_head64 old; 267 struct btree_head64 old;
268 struct btree_head32 segment_map;
269 int no_shadowed_segments;
264}; 270};
265 271
266struct object_alias_item { 272struct object_alias_item {
@@ -305,13 +311,14 @@ typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
305 level_t level, int child_no, __be64 val); 311 level_t level, int child_no, __be64 val);
306struct logfs_block_ops { 312struct logfs_block_ops {
307 void (*write_block)(struct logfs_block *block); 313 void (*write_block)(struct logfs_block *block);
308 gc_level_t (*block_level)(struct logfs_block *block);
309 void (*free_block)(struct super_block *sb, struct logfs_block*block); 314 void (*free_block)(struct super_block *sb, struct logfs_block*block);
310 int (*write_alias)(struct super_block *sb, 315 int (*write_alias)(struct super_block *sb,
311 struct logfs_block *block, 316 struct logfs_block *block,
312 write_alias_t *write_one_alias); 317 write_alias_t *write_one_alias);
313}; 318};
314 319
320#define MAX_JOURNAL_ENTRIES 256
321
315struct logfs_super { 322struct logfs_super {
316 struct mtd_info *s_mtd; /* underlying device */ 323 struct mtd_info *s_mtd; /* underlying device */
317 struct block_device *s_bdev; /* underlying device */ 324 struct block_device *s_bdev; /* underlying device */
@@ -378,7 +385,7 @@ struct logfs_super {
378 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ 385 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
379 u64 s_last_version; 386 u64 s_last_version;
380 struct logfs_area *s_journal_area; /* open journal segment */ 387 struct logfs_area *s_journal_area; /* open journal segment */
381 __be64 s_je_array[64]; 388 __be64 s_je_array[MAX_JOURNAL_ENTRIES];
382 int s_no_je; 389 int s_no_je;
383 390
384 int s_sum_index; /* for the 12 summaries */ 391 int s_sum_index; /* for the 12 summaries */
@@ -389,6 +396,7 @@ struct logfs_super {
389 int s_lock_count; 396 int s_lock_count;
390 mempool_t *s_block_pool; /* struct logfs_block pool */ 397 mempool_t *s_block_pool; /* struct logfs_block pool */
391 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ 398 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
399 struct list_head s_writeback_list; /* writeback pages */
392 /* 400 /*
393 * Space accounting: 401 * Space accounting:
394 * - s_used_bytes specifies space used to store valid data objects. 402 * - s_used_bytes specifies space used to store valid data objects.
@@ -498,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops;
498int logfs_readpage(struct file *file, struct page *page); 506int logfs_readpage(struct file *file, struct page *page);
499int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, 507int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
500 unsigned long arg); 508 unsigned long arg);
501int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); 509int logfs_fsync(struct file *file, int datasync);
502 510
503/* gc.c */ 511/* gc.c */
504u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); 512u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
@@ -587,24 +595,25 @@ void move_page_to_btree(struct page *page);
587int logfs_init_mapping(struct super_block *sb); 595int logfs_init_mapping(struct super_block *sb);
588void logfs_sync_area(struct logfs_area *area); 596void logfs_sync_area(struct logfs_area *area);
589void logfs_sync_segments(struct super_block *sb); 597void logfs_sync_segments(struct super_block *sb);
598void freeseg(struct super_block *sb, u32 segno);
590 599
591/* area handling */ 600/* area handling */
592int logfs_init_areas(struct super_block *sb); 601int logfs_init_areas(struct super_block *sb);
593void logfs_cleanup_areas(struct super_block *sb); 602void logfs_cleanup_areas(struct super_block *sb);
594int logfs_open_area(struct logfs_area *area, size_t bytes); 603int logfs_open_area(struct logfs_area *area, size_t bytes);
595void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, 604int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
596 int use_filler); 605 int use_filler);
597 606
598static inline void logfs_buf_write(struct logfs_area *area, u64 ofs, 607static inline int logfs_buf_write(struct logfs_area *area, u64 ofs,
599 void *buf, size_t len) 608 void *buf, size_t len)
600{ 609{
601 __logfs_buf_write(area, ofs, buf, len, 0); 610 return __logfs_buf_write(area, ofs, buf, len, 0);
602} 611}
603 612
604static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs, 613static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
605 void *buf, size_t len) 614 void *buf, size_t len)
606{ 615{
607 __logfs_buf_write(area, ofs, buf, len, 1); 616 return __logfs_buf_write(area, ofs, buf, len, 1);
608} 617}
609 618
610/* super.c */ 619/* super.c */
@@ -698,7 +707,7 @@ static inline gc_level_t expand_level(u64 ino, level_t __level)
698 u8 level = (__force u8)__level; 707 u8 level = (__force u8)__level;
699 708
700 if (ino == LOGFS_INO_MASTER) { 709 if (ino == LOGFS_INO_MASTER) {
701 /* ifile has seperate areas */ 710 /* ifile has separate areas */
702 level += LOGFS_MAX_LEVELS; 711 level += LOGFS_MAX_LEVELS;
703 } 712 }
704 return (__force gc_level_t)level; 713 return (__force gc_level_t)level;
@@ -721,4 +730,10 @@ static inline struct logfs_area *get_area(struct super_block *sb,
721 return logfs_super(sb)->s_area[(__force u8)gc_level]; 730 return logfs_super(sb)->s_area[(__force u8)gc_level];
722} 731}
723 732
733static inline void logfs_mempool_destroy(mempool_t *pool)
734{
735 if (pool)
736 mempool_destroy(pool);
737}
738
724#endif 739#endif
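logfs_mempool_destroy() is a NULL-safe wrapper around mempool_destroy(), so teardown paths can run unconditionally even when a pool was never created; the readwrite.c and super.c hunks below drop their `if (pool)` guards in its favour. Usage shape, as a sketch:

/* Sketch: pools that failed to allocate are simply skipped. */
static void example_cleanup_pools(struct logfs_super *super)
{
        logfs_mempool_destroy(super->s_block_pool);
        logfs_mempool_destroy(super->s_shadow_pool);
}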
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
index f674725663fe..ae960519c54a 100644
--- a/fs/logfs/logfs_abi.h
+++ b/fs/logfs/logfs_abi.h
@@ -50,9 +50,9 @@ static inline void check_##type(void) \
50 * 12 - gc recycled blocks, long-lived data 50 * 12 - gc recycled blocks, long-lived data
51 * 13 - replacement blocks, short-lived data 51 * 13 - replacement blocks, short-lived data
52 * 52 *
53 * Levels 1-11 are necessary for robust gc operations and help seperate 53 * Levels 1-11 are necessary for robust gc operations and help separate
54 * short-lived metadata from longer-lived file data. In the future, 54 * short-lived metadata from longer-lived file data. In the future,
55 * file data should get seperated into several segments based on simple 55 * file data should get separated into several segments based on simple
56 * heuristics. Old data recycled during gc operation is expected to be 56 * heuristics. Old data recycled during gc operation is expected to be
57 * long-lived. New data is of uncertain life expectancy. New data 57 * long-lived. New data is of uncertain life expectancy. New data
58 * used to replace older blocks in existing files is expected to be 58 * used to replace older blocks in existing files is expected to be
@@ -117,7 +117,7 @@ static inline void check_##type(void) \
117#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) 117#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
118 118
119/* 119/*
120 * LogFS needs to seperate data into levels. Each level is defined as the 120 * LogFS needs to separate data into levels. Each level is defined as the
121 * maximal possible distance from the master inode (inode of the inode file). 121 * maximal possible distance from the master inode (inode of the inode file).
122 * Data blocks reside on level 0, 1x indirect block on level 1, etc. 122 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. 123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
@@ -204,7 +204,7 @@ SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
204 * @ds_crc: crc32 of structure starting with the next field 204 * @ds_crc: crc32 of structure starting with the next field
205 * @ds_ifile_levels: maximum number of levels for ifile 205 * @ds_ifile_levels: maximum number of levels for ifile
206 * @ds_iblock_levels: maximum number of levels for regular files 206 * @ds_iblock_levels: maximum number of levels for regular files
207 * @ds_data_levels: number of seperate levels for data 207 * @ds_data_levels: number of separate levels for data
208 * @pad0: reserved, must be 0 208 * @pad0: reserved, must be 0
209 * @ds_feature_incompat: incompatible filesystem features 209 * @ds_feature_incompat: incompatible filesystem features
210 * @ds_feature_ro_compat: read-only compatible filesystem features 210 * @ds_feature_ro_compat: read-only compatible filesystem features
@@ -456,7 +456,7 @@ enum logfs_vim {
456 * @vim: life expectancy of data 456 * @vim: life expectancy of data
457 * 457 *
458 * "Areas" are segments currently being used for writing. There is at least 458 * "Areas" are segments currently being used for writing. There is at least
459 * one area per GC level. Several may be used to seperate long-living from 459 * one area per GC level. Several may be used to separate long-living from
460 * short-living data. If an area with unknown vim is encountered, it can 460 * short-living data. If an area with unknown vim is encountered, it can
461 * simply be closed. 461 * simply be closed.
462 * The write buffer immediately follow this header. 462 * The write buffer immediately follow this header.
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 7a23b3e7c0a7..0718d112a1a5 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -18,6 +18,7 @@
18 */ 18 */
19#include "logfs.h" 19#include "logfs.h"
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21 22
22static u64 adjust_bix(u64 bix, level_t level) 23static u64 adjust_bix(u64 bix, level_t level)
23{ 24{
@@ -429,25 +430,6 @@ static void inode_write_block(struct logfs_block *block)
429 } 430 }
430} 431}
431 432
432static gc_level_t inode_block_level(struct logfs_block *block)
433{
434 BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
435 return GC_LEVEL(LOGFS_MAX_LEVELS);
436}
437
438static gc_level_t indirect_block_level(struct logfs_block *block)
439{
440 struct page *page;
441 struct inode *inode;
442 u64 bix;
443 level_t level;
444
445 page = block->page;
446 inode = page->mapping->host;
447 logfs_unpack_index(page->index, &bix, &level);
448 return expand_level(inode->i_ino, level);
449}
450
451/* 433/*
452 * This silences a false, yet annoying gcc warning. I hate it when my editor 434 * This silences a false, yet annoying gcc warning. I hate it when my editor
453 * jumps into bitops.h each time I recompile this file. 435 * jumps into bitops.h each time I recompile this file.
@@ -586,14 +568,12 @@ static void indirect_free_block(struct super_block *sb,
586 568
587static struct logfs_block_ops inode_block_ops = { 569static struct logfs_block_ops inode_block_ops = {
588 .write_block = inode_write_block, 570 .write_block = inode_write_block,
589 .block_level = inode_block_level,
590 .free_block = inode_free_block, 571 .free_block = inode_free_block,
591 .write_alias = inode_write_alias, 572 .write_alias = inode_write_alias,
592}; 573};
593 574
594struct logfs_block_ops indirect_block_ops = { 575struct logfs_block_ops indirect_block_ops = {
595 .write_block = indirect_write_block, 576 .write_block = indirect_write_block,
596 .block_level = indirect_block_level,
597 .free_block = indirect_free_block, 577 .free_block = indirect_free_block,
598 .write_alias = indirect_write_alias, 578 .write_alias = indirect_write_alias,
599}; 579};
@@ -912,6 +892,8 @@ u64 logfs_seek_hole(struct inode *inode, u64 bix)
912 return bix; 892 return bix;
913 else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED) 893 else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
914 bix = maxbix(li->li_height); 894 bix = maxbix(li->li_height);
895 else if (bix >= maxbix(li->li_height))
896 return bix;
915 else { 897 else {
916 bix = seek_holedata_loop(inode, bix, 0); 898 bix = seek_holedata_loop(inode, bix, 0);
917 if (bix < maxbix(li->li_height)) 899 if (bix < maxbix(li->li_height))
@@ -1113,17 +1095,25 @@ static int logfs_reserve_bytes(struct inode *inode, int bytes)
1113int get_page_reserve(struct inode *inode, struct page *page) 1095int get_page_reserve(struct inode *inode, struct page *page)
1114{ 1096{
1115 struct logfs_super *super = logfs_super(inode->i_sb); 1097 struct logfs_super *super = logfs_super(inode->i_sb);
1098 struct logfs_block *block = logfs_block(page);
1116 int ret; 1099 int ret;
1117 1100
1118 if (logfs_block(page) && logfs_block(page)->reserved_bytes) 1101 if (block && block->reserved_bytes)
1119 return 0; 1102 return 0;
1120 1103
1121 logfs_get_wblocks(inode->i_sb, page, WF_LOCK); 1104 logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
1122 ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE); 1105 while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) &&
1106 !list_empty(&super->s_writeback_list)) {
1107 block = list_entry(super->s_writeback_list.next,
1108 struct logfs_block, alias_list);
1109 block->ops->write_block(block);
1110 }
1123 if (!ret) { 1111 if (!ret) {
1124 alloc_data_block(inode, page); 1112 alloc_data_block(inode, page);
1125 logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE; 1113 block = logfs_block(page);
1114 block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
1126 super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE; 1115 super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
1116 list_move_tail(&block->alias_list, &super->s_writeback_list);
1127 } 1117 }
1128 logfs_put_wblocks(inode->i_sb, page, WF_LOCK); 1118 logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
1129 return ret; 1119 return ret;
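The loop added to get_page_reserve() implements a reserve-or-flush policy: when logfs_reserve_bytes() fails, the oldest block on the new s_writeback_list is written back, releasing the space it had pinned, and the reservation is retried until it succeeds or the list runs dry. The control shape in isolation (names follow the hunk; a sketch, not the full function):

/* Sketch: retry a failing reservation by flushing pinned blocks. */
static int example_reserve_or_flush(struct inode *inode,
                                    struct logfs_super *super, int bytes)
{
        struct logfs_block *block;
        int ret;

        while ((ret = logfs_reserve_bytes(inode, bytes)) &&
               !list_empty(&super->s_writeback_list)) {
                block = list_entry(super->s_writeback_list.next,
                                   struct logfs_block, alias_list);
                block->ops->write_block(block); /* releases its bytes */
        }
        return ret;
}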
@@ -1240,6 +1230,18 @@ static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1240 mempool_free(shadow, super->s_shadow_pool); 1230 mempool_free(shadow, super->s_shadow_pool);
1241} 1231}
1242 1232
1233static void mark_segment(struct shadow_tree *tree, u32 segno)
1234{
1235 int err;
1236
1237 if (!btree_lookup32(&tree->segment_map, segno)) {
1238 err = btree_insert32(&tree->segment_map, segno, (void *)1,
1239 GFP_NOFS);
1240 BUG_ON(err);
1241 tree->no_shadowed_segments++;
1242 }
1243}
1244
1243/** 1245/**
1244 * fill_shadow_tree - Propagate shadow tree changes due to a write 1246 * fill_shadow_tree - Propagate shadow tree changes due to a write
1245 * @inode: Inode owning the page 1247 * @inode: Inode owning the page
@@ -1287,6 +1289,8 @@ static void fill_shadow_tree(struct inode *inode, struct page *page,
1287 1289
1288 super->s_dirty_used_bytes += shadow->new_len; 1290 super->s_dirty_used_bytes += shadow->new_len;
1289 super->s_dirty_free_bytes += shadow->old_len; 1291 super->s_dirty_free_bytes += shadow->old_len;
1292 mark_segment(tree, shadow->old_ofs >> super->s_segshift);
1293 mark_segment(tree, shadow->new_ofs >> super->s_segshift);
1290 } 1294 }
1291} 1295}
1292 1296
@@ -1594,7 +1598,6 @@ int logfs_delete(struct inode *inode, pgoff_t index,
1594 return ret; 1598 return ret;
1595} 1599}
1596 1600
1597/* Rewrite cannot mark the inode dirty but has to write it immediatly. */
1598int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, 1601int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1599 gc_level_t gc_level, long flags) 1602 gc_level_t gc_level, long flags)
1600{ 1603{
@@ -1611,6 +1614,18 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1611 if (level != 0) 1614 if (level != 0)
1612 alloc_indirect_block(inode, page, 0); 1615 alloc_indirect_block(inode, page, 0);
1613 err = logfs_write_buf(inode, page, flags); 1616 err = logfs_write_buf(inode, page, flags);
1617 if (!err && shrink_level(gc_level) == 0) {
1618 /* Rewrite cannot mark the inode dirty but has to
 1619 * write it immediately.
1620 * Q: Can't we just create an alias for the inode
1621 * instead? And if not, why not?
1622 */
1623 if (inode->i_ino == LOGFS_INO_MASTER)
1624 logfs_write_anchor(inode->i_sb);
1625 else {
1626 err = __logfs_write_inode(inode, flags);
1627 }
1628 }
1614 } 1629 }
1615 logfs_put_write_page(page); 1630 logfs_put_write_page(page);
1616 return err; 1631 return err;
@@ -1833,19 +1848,37 @@ static int __logfs_truncate(struct inode *inode, u64 size)
1833 return logfs_truncate_direct(inode, size); 1848 return logfs_truncate_direct(inode, size);
1834} 1849}
1835 1850
1836int logfs_truncate(struct inode *inode, u64 size) 1851/*
1852 * Truncate, by changing the segment file, can consume a fair amount
1853 * of resources. So back off from time to time and do some GC.
1854 * 8 or 2048 blocks should be well within safety limits even if
1855 * every single block resided in a different segment.
1856 */
1857#define TRUNCATE_STEP (8 * 1024 * 1024)
1858int logfs_truncate(struct inode *inode, u64 target)
1837{ 1859{
1838 struct super_block *sb = inode->i_sb; 1860 struct super_block *sb = inode->i_sb;
1839 int err; 1861 u64 size = i_size_read(inode);
1862 int err = 0;
1840 1863
1841 logfs_get_wblocks(sb, NULL, 1); 1864 size = ALIGN(size, TRUNCATE_STEP);
1842 err = __logfs_truncate(inode, size); 1865 while (size > target) {
1843 if (!err) 1866 if (size > TRUNCATE_STEP)
1844 err = __logfs_write_inode(inode, 0); 1867 size -= TRUNCATE_STEP;
1845 logfs_put_wblocks(sb, NULL, 1); 1868 else
1869 size = 0;
1870 if (size < target)
1871 size = target;
1872
1873 logfs_get_wblocks(sb, NULL, 1);
1874 err = __logfs_truncate(inode, size);
1875 if (!err)
1876 err = __logfs_write_inode(inode, 0);
1877 logfs_put_wblocks(sb, NULL, 1);
1878 }
1846 1879
1847 if (!err) 1880 if (!err)
1848 err = vmtruncate(inode, size); 1881 err = vmtruncate(inode, target);
1849 1882
1850 /* I don't trust error recovery yet. */ 1883 /* I don't trust error recovery yet. */
1851 WARN_ON(err); 1884 WARN_ON(err);
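With TRUNCATE_STEP at 8 MiB, logfs_truncate() now walks down to the target in bounded chunks, dropping and retaking the wblocks between steps so GC can reclaim space in between. The loop arithmetic as a runnable userspace illustration (standalone, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define TRUNCATE_STEP   (8ULL * 1024 * 1024)
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        uint64_t size = 20ULL << 20;    /* i_size: 20 MiB */
        uint64_t target = 0;            /* truncate to zero */

        size = ALIGN_UP(size, TRUNCATE_STEP);   /* rounds up to 24 MiB */
        while (size > target) {
                size = size > TRUNCATE_STEP ? size - TRUNCATE_STEP : 0;
                if (size < target)
                        size = target;
                /* one __logfs_truncate() + inode write per iteration */
                printf("step down to %llu MiB\n",
                       (unsigned long long)(size >> 20)); /* 16, 8, 0 */
        }
        return 0;
}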
@@ -2226,6 +2259,7 @@ int logfs_init_rw(struct super_block *sb)
2226 int min_fill = 3 * super->s_no_blocks; 2259 int min_fill = 3 * super->s_no_blocks;
2227 2260
2228 INIT_LIST_HEAD(&super->s_object_alias); 2261 INIT_LIST_HEAD(&super->s_object_alias);
2262 INIT_LIST_HEAD(&super->s_writeback_list);
2229 mutex_init(&super->s_write_mutex); 2263 mutex_init(&super->s_write_mutex);
2230 super->s_block_pool = mempool_create_kmalloc_pool(min_fill, 2264 super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
2231 sizeof(struct logfs_block)); 2265 sizeof(struct logfs_block));
@@ -2239,8 +2273,6 @@ void logfs_cleanup_rw(struct super_block *sb)
2239 struct logfs_super *super = logfs_super(sb); 2273 struct logfs_super *super = logfs_super(sb);
2240 2274
2241 destroy_meta_inode(super->s_segfile_inode); 2275 destroy_meta_inode(super->s_segfile_inode);
2242 if (super->s_block_pool) 2276 logfs_mempool_destroy(super->s_block_pool);
2243 mempool_destroy(super->s_block_pool); 2277 logfs_mempool_destroy(super->s_shadow_pool);
2244 if (super->s_shadow_pool)
2245 mempool_destroy(super->s_shadow_pool);
2246} 2278}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 1a14f9910d55..a9657afb70ad 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -10,6 +10,7 @@
10 * three kinds of objects: inodes, dentries and blocks, both data and indirect. 10 * three kinds of objects: inodes, dentries and blocks, both data and indirect.
11 */ 11 */
12#include "logfs.h" 12#include "logfs.h"
13#include <linux/slab.h>
13 14
14static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) 15static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
15{ 16{
@@ -66,7 +67,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
66 return page; 67 return page;
67} 68}
68 69
69void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, 70int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
70 int use_filler) 71 int use_filler)
71{ 72{
72 pgoff_t index = ofs >> PAGE_SHIFT; 73 pgoff_t index = ofs >> PAGE_SHIFT;
@@ -80,8 +81,10 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
80 copylen = min((ulong)len, PAGE_SIZE - offset); 81 copylen = min((ulong)len, PAGE_SIZE - offset);
81 82
82 page = get_mapping_page(area->a_sb, index, use_filler); 83 page = get_mapping_page(area->a_sb, index, use_filler);
83 SetPageUptodate(page); 84 if (IS_ERR(page))
85 return PTR_ERR(page);
84 BUG_ON(!page); /* FIXME: reserve a pool */ 86 BUG_ON(!page); /* FIXME: reserve a pool */
87 SetPageUptodate(page);
85 memcpy(page_address(page) + offset, buf, copylen); 88 memcpy(page_address(page) + offset, buf, copylen);
86 SetPagePrivate(page); 89 SetPagePrivate(page);
87 page_cache_release(page); 90 page_cache_release(page);
@@ -91,52 +94,61 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
91 offset = 0; 94 offset = 0;
92 index++; 95 index++;
93 } while (len); 96 } while (len);
97 return 0;
94} 98}
95 99
96/* 100static void pad_partial_page(struct logfs_area *area)
97 * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
98 */
99static void pad_wbuf(struct logfs_area *area, int final)
100{ 101{
101 struct super_block *sb = area->a_sb; 102 struct super_block *sb = area->a_sb;
102 struct logfs_super *super = logfs_super(sb);
103 struct page *page; 103 struct page *page;
104 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); 104 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
105 pgoff_t index = ofs >> PAGE_SHIFT; 105 pgoff_t index = ofs >> PAGE_SHIFT;
106 long offset = ofs & (PAGE_SIZE-1); 106 long offset = ofs & (PAGE_SIZE-1);
107 u32 len = PAGE_SIZE - offset; 107 u32 len = PAGE_SIZE - offset;
108 108
109 if (len == PAGE_SIZE) { 109 if (len % PAGE_SIZE) {
110 /* The math in this function can surely use some love */ 110 page = get_mapping_page(sb, index, 0);
111 len = 0;
112 }
113 if (len) {
114 BUG_ON(area->a_used_bytes >= super->s_segsize);
115
116 page = get_mapping_page(area->a_sb, index, 0);
117 BUG_ON(!page); /* FIXME: reserve a pool */ 111 BUG_ON(!page); /* FIXME: reserve a pool */
118 memset(page_address(page) + offset, 0xff, len); 112 memset(page_address(page) + offset, 0xff, len);
119 SetPagePrivate(page); 113 SetPagePrivate(page);
120 page_cache_release(page); 114 page_cache_release(page);
121 } 115 }
116}
122 117
123 if (!final) 118static void pad_full_pages(struct logfs_area *area)
124 return; 119{
120 struct super_block *sb = area->a_sb;
121 struct logfs_super *super = logfs_super(sb);
122 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
123 u32 len = super->s_segsize - area->a_used_bytes;
124 pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
125 pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
126 struct page *page;
125 127
126 area->a_used_bytes += len; 128 while (no_indizes) {
127 for ( ; area->a_used_bytes < super->s_segsize; 129 page = get_mapping_page(sb, index, 0);
128 area->a_used_bytes += PAGE_SIZE) {
129 /* Memset another page */
130 index++;
131 page = get_mapping_page(area->a_sb, index, 0);
132 BUG_ON(!page); /* FIXME: reserve a pool */ 130 BUG_ON(!page); /* FIXME: reserve a pool */
133 memset(page_address(page), 0xff, PAGE_SIZE); 131 SetPageUptodate(page);
132 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
134 SetPagePrivate(page); 133 SetPagePrivate(page);
135 page_cache_release(page); 134 page_cache_release(page);
135 index++;
136 no_indizes--;
136 } 137 }
137} 138}
138 139
139/* 140/*
141 * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
142 * Also make sure we allocate (and memset) all pages for final writeout.
143 */
144static void pad_wbuf(struct logfs_area *area, int final)
145{
146 pad_partial_page(area);
147 if (final)
148 pad_full_pages(area);
149}
150
151/*
140 * We have to be careful with the alias tree. Since lookup is done by bix, 152 * We have to be careful with the alias tree. Since lookup is done by bix,
141 * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with 153 * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
142 * indirect blocks. So always use it through accessor functions. 154 * indirect blocks. So always use it through accessor functions.
@@ -174,14 +186,8 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
174 return 0; 186 return 0;
175} 187}
176 188
177static gc_level_t btree_block_level(struct logfs_block *block)
178{
179 return expand_level(block->ino, block->level);
180}
181
182static struct logfs_block_ops btree_block_ops = { 189static struct logfs_block_ops btree_block_ops = {
183 .write_block = btree_write_block, 190 .write_block = btree_write_block,
184 .block_level = btree_block_level,
185 .free_block = __free_block, 191 .free_block = __free_block,
186 .write_alias = btree_write_alias, 192 .write_alias = btree_write_alias,
187}; 193};
@@ -683,7 +689,7 @@ int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
683 return 0; 689 return 0;
684} 690}
685 691
686static void freeseg(struct super_block *sb, u32 segno) 692void freeseg(struct super_block *sb, u32 segno)
687{ 693{
688 struct logfs_super *super = logfs_super(sb); 694 struct logfs_super *super = logfs_super(sb);
689 struct address_space *mapping = super->s_mapping_inode->i_mapping; 695 struct address_space *mapping = super->s_mapping_inode->i_mapping;
@@ -910,7 +916,7 @@ err:
910 for (i--; i >= 0; i--) 916 for (i--; i >= 0; i--)
911 free_area(super->s_area[i]); 917 free_area(super->s_area[i]);
912 free_area(super->s_journal_area); 918 free_area(super->s_journal_area);
913 mempool_destroy(super->s_alias_pool); 919 logfs_mempool_destroy(super->s_alias_pool);
914 return -ENOMEM; 920 return -ENOMEM;
915} 921}
916 922
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index c66beab78dee..d651e10a1e9c 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -11,6 +11,8 @@
11 */ 11 */
12#include "logfs.h" 12#include "logfs.h"
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/slab.h>
15#include <linux/blkdev.h>
14#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
15#include <linux/statfs.h> 17#include <linux/statfs.h>
16#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
@@ -136,6 +138,14 @@ static int logfs_sb_set(struct super_block *sb, void *_super)
136 sb->s_fs_info = super; 138 sb->s_fs_info = super;
137 sb->s_mtd = super->s_mtd; 139 sb->s_mtd = super->s_mtd;
138 sb->s_bdev = super->s_bdev; 140 sb->s_bdev = super->s_bdev;
141#ifdef CONFIG_BLOCK
142 if (sb->s_bdev)
143 sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
144#endif
145#ifdef CONFIG_MTD
146 if (sb->s_mtd)
147 sb->s_bdi = sb->s_mtd->backing_dev_info;
148#endif
139 return 0; 149 return 0;
140} 150}
141 151
@@ -277,7 +287,7 @@ static int logfs_recover_sb(struct super_block *sb)
277 } 287 }
278 if (valid0 && valid1 && ds_cmp(ds0, ds1)) { 288 if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
279 printk(KERN_INFO"Superblocks don't match - fixing.\n"); 289 printk(KERN_INFO"Superblocks don't match - fixing.\n");
280 return write_one_sb(sb, super->s_devops->find_last_sb); 290 return logfs_write_sb(sb);
281 } 291 }
282 /* If neither is valid now, something's wrong. Didn't we properly 292 /* If neither is valid now, something's wrong. Didn't we properly
283 * check them before?!? */ 293 * check them before?!? */
@@ -289,6 +299,10 @@ static int logfs_make_writeable(struct super_block *sb)
289{ 299{
290 int err; 300 int err;
291 301
302 err = logfs_open_segfile(sb);
303 if (err)
304 return err;
305
292 /* Repair any broken superblock copies */ 306 /* Repair any broken superblock copies */
293 err = logfs_recover_sb(sb); 307 err = logfs_recover_sb(sb);
294 if (err) 308 if (err)
@@ -299,10 +313,6 @@ static int logfs_make_writeable(struct super_block *sb)
299 if (err) 313 if (err)
300 return err; 314 return err;
301 315
302 err = logfs_open_segfile(sb);
303 if (err)
304 return err;
305
306 /* Do one GC pass before any data gets dirtied */ 316 /* Do one GC pass before any data gets dirtied */
307 logfs_gc_pass(sb); 317 logfs_gc_pass(sb);
308 318
@@ -327,27 +337,27 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
327 goto fail; 337 goto fail;
328 338
329 sb->s_root = d_alloc_root(rootdir); 339 sb->s_root = d_alloc_root(rootdir);
330 if (!sb->s_root) 340 if (!sb->s_root) {
341 iput(rootdir);
331 goto fail; 342 goto fail;
343 }
332 344
333 super->s_erase_page = alloc_pages(GFP_KERNEL, 0); 345 super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
334 if (!super->s_erase_page) 346 if (!super->s_erase_page)
335 goto fail2; 347 goto fail;
336 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); 348 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
337 349
338 /* FIXME: check for read-only mounts */ 350 /* FIXME: check for read-only mounts */
339 err = logfs_make_writeable(sb); 351 err = logfs_make_writeable(sb);
340 if (err) 352 if (err)
341 goto fail3; 353 goto fail1;
342 354
343 log_super("LogFS: Finished mounting\n"); 355 log_super("LogFS: Finished mounting\n");
344 simple_set_mnt(mnt, sb); 356 simple_set_mnt(mnt, sb);
345 return 0; 357 return 0;
346 358
347fail3: 359fail1:
348 __free_page(super->s_erase_page); 360 __free_page(super->s_erase_page);
349fail2:
350 iput(rootdir);
351fail: 361fail:
352 iput(logfs_super(sb)->s_master_inode); 362 iput(logfs_super(sb)->s_master_inode);
353 return -EIO; 363 return -EIO;
@@ -376,7 +386,7 @@ static struct page *find_super_block(struct super_block *sb)
376 if (!first || IS_ERR(first)) 386 if (!first || IS_ERR(first))
377 return NULL; 387 return NULL;
378 last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); 388 last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
379 if (!last || IS_ERR(first)) { 389 if (!last || IS_ERR(last)) {
380 page_cache_release(first); 390 page_cache_release(first);
381 return NULL; 391 return NULL;
382 } 392 }
@@ -407,7 +417,7 @@ static int __logfs_read_sb(struct super_block *sb)
407 417
408 page = find_super_block(sb); 418 page = find_super_block(sb);
409 if (!page) 419 if (!page)
410 return -EIO; 420 return -EINVAL;
411 421
412 ds = page_address(page); 422 ds = page_address(page);
413 super->s_size = be64_to_cpu(ds->ds_filesystem_size); 423 super->s_size = be64_to_cpu(ds->ds_filesystem_size);
@@ -451,6 +461,8 @@ static int logfs_read_sb(struct super_block *sb, int read_only)
451 461
452 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); 462 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
453 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); 463 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
464 btree_init_mempool32(&super->s_shadow_tree.segment_map,
465 super->s_btree_pool);
454 466
455 ret = logfs_init_mapping(sb); 467 ret = logfs_init_mapping(sb);
456 if (ret) 468 if (ret)
@@ -515,8 +527,8 @@ static void logfs_kill_sb(struct super_block *sb)
515 if (super->s_erase_page) 527 if (super->s_erase_page)
516 __free_page(super->s_erase_page); 528 __free_page(super->s_erase_page);
517 super->s_devops->put_device(sb); 529 super->s_devops->put_device(sb);
518 mempool_destroy(super->s_btree_pool); 530 logfs_mempool_destroy(super->s_btree_pool);
519 mempool_destroy(super->s_alias_pool); 531 logfs_mempool_destroy(super->s_alias_pool);
520 kfree(super); 532 kfree(super);
521 log_super("LogFS: Finished unmounting\n"); 533 log_super("LogFS: Finished unmounting\n");
522} 534}
@@ -572,8 +584,7 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
572 return 0; 584 return 0;
573 585
574err1: 586err1:
575 up_write(&sb->s_umount); 587 deactivate_locked_super(sb);
576 deactivate_super(sb);
577 return err; 588 return err;
578err0: 589err0:
579 kfree(super); 590 kfree(super);
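
Note on the logfs_get_sb_final() error paths above: d_alloc_root() consumes the inode reference only on success, so on failure the caller still owns rootdir and must iput() it at the failure site; once sb->s_root exists, dropping the root dentry releases the inode, which is why the old fail2 label (a second iput of rootdir) could go away. A minimal sketch of the idiom, with a hypothetical getter name:

	struct inode *rootdir = fs_get_root_inode(sb);	/* hypothetical */
	if (IS_ERR(rootdir))
		goto fail;
	sb->s_root = d_alloc_root(rootdir);
	if (!sb->s_root) {
		iput(rootdir);	/* we still own the reference here */
		goto fail;
	}
	/* from here on, releasing sb->s_root drops rootdir for us,
	 * so later error labels must not iput() it a second time */
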
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 6ac693faae49..482779fe4e7c 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -221,7 +221,7 @@ void minix_free_inode(struct inode * inode)
221 clear_inode(inode); /* clear in-memory copy */ 221 clear_inode(inode); /* clear in-memory copy */
222} 222}
223 223
224struct inode * minix_new_inode(const struct inode * dir, int * error) 224struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
225{ 225{
226 struct super_block *sb = dir->i_sb; 226 struct super_block *sb = dir->i_sb;
227 struct minix_sb_info *sbi = minix_sb(sb); 227 struct minix_sb_info *sbi = minix_sb(sb);
@@ -263,8 +263,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
263 iput(inode); 263 iput(inode);
264 return NULL; 264 return NULL;
265 } 265 }
266 inode->i_uid = current_fsuid(); 266 inode_init_owner(inode, dir, mode);
267 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
268 inode->i_ino = j; 267 inode->i_ino = j;
269 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 268 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
270 inode->i_blocks = 0; 269 inode->i_blocks = 0;
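
minix_new_inode() now takes the creation mode and defers owner setup to the generic inode_init_owner() helper; the namei.c hunks below drop the matching open-coded assignments, including the setgid-directory handling in minix_mkdir(). A sketch of what the helper centralizes, mirroring the lines deleted in this patch (the real definition lives in fs/inode.c):

	void inode_init_owner(struct inode *inode, const struct inode *dir,
			      int mode)
	{
		inode->i_uid = current_fsuid();
		if (dir && dir->i_mode & S_ISGID) {
			inode->i_gid = dir->i_gid;	/* inherit group */
			if (S_ISDIR(mode))
				mode |= S_ISGID;	/* dirs inherit setgid */
		} else
			inode->i_gid = current_fsgid();
		inode->i_mode = mode;
	}
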
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731d7fcd..91969589131c 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .readdir = minix_readdir,
25 .fsync = simple_fsync, 25 .fsync = generic_file_fsync,
26}; 26};
27 27
28static inline void dir_put_page(struct page *page) 28static inline void dir_put_page(struct page *page)
@@ -72,11 +72,8 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
72{ 72{
73 struct address_space *mapping = dir->i_mapping; 73 struct address_space *mapping = dir->i_mapping;
74 struct page *page = read_mapping_page(mapping, n, NULL); 74 struct page *page = read_mapping_page(mapping, n, NULL);
75 if (!IS_ERR(page)) { 75 if (!IS_ERR(page))
76 kmap(page); 76 kmap(page);
77 if (!PageUptodate(page))
78 goto fail;
79 }
80 return page; 77 return page;
81 78
82fail: 79fail:
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e607a87..d5320ff23faf 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = {
19 .write = do_sync_write, 19 .write = do_sync_write,
20 .aio_write = generic_file_aio_write, 20 .aio_write = generic_file_aio_write,
21 .mmap = generic_file_mmap, 21 .mmap = generic_file_mmap,
22 .fsync = simple_fsync, 22 .fsync = generic_file_fsync,
23 .splice_read = generic_file_splice_read, 23 .splice_read = generic_file_splice_read,
24}; 24};
25 25
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 82d6554b02fe..282e15ad8cd8 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,4 +1,5 @@
1#include <linux/buffer_head.h> 1#include <linux/buffer_head.h>
2#include <linux/slab.h>
2#include "minix.h" 3#include "minix.h"
3 4
4enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */ 5enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f23010969369..13487ad16894 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode)
20 return (block_t *)minix_i(inode)->u.i2_data; 20 return (block_t *)minix_i(inode)->u.i2_data;
21} 21}
22 22
23#define DIRCOUNT 7
24#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
25
23static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) 26static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
24{ 27{
25 int n = 0; 28 int n = 0;
@@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
34 printk("MINIX-fs: block_to_path: " 37 printk("MINIX-fs: block_to_path: "
35 "block %ld too big on dev %s\n", 38 "block %ld too big on dev %s\n",
36 block, bdevname(sb->s_bdev, b)); 39 block, bdevname(sb->s_bdev, b));
37 } else if (block < 7) { 40 } else if (block < DIRCOUNT) {
38 offsets[n++] = block; 41 offsets[n++] = block;
39 } else if ((block -= 7) < 256) { 42 } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
40 offsets[n++] = 7; 43 offsets[n++] = DIRCOUNT;
41 offsets[n++] = block; 44 offsets[n++] = block;
42 } else if ((block -= 256) < 256*256) { 45 } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
43 offsets[n++] = 8; 46 offsets[n++] = DIRCOUNT + 1;
44 offsets[n++] = block>>8; 47 offsets[n++] = block / INDIRCOUNT(sb);
45 offsets[n++] = block & 255; 48 offsets[n++] = block % INDIRCOUNT(sb);
46 } else { 49 } else {
47 block -= 256*256; 50 block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
48 offsets[n++] = 9; 51 offsets[n++] = DIRCOUNT + 2;
49 offsets[n++] = block>>16; 52 offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
50 offsets[n++] = (block>>8) & 255; 53 offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
51 offsets[n++] = block & 255; 54 offsets[n++] = block % INDIRCOUNT(sb);
52 } 55 }
53 return n; 56 return n;
54} 57}
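
The block_to_path() rewrite above parameterizes the branching factor: INDIRCOUNT(sb) = 1 << (s_blocksize_bits - 2) is the number of 4-byte block pointers per indirect block. With 1024-byte blocks that is 1 << (10 - 2) = 256, so the old hardcoded 256, >>8 and & 255 happened to be right; with 2048-byte blocks the factor is 512 and the old arithmetic mapped blocks to the wrong slots. A stand-alone demo of the generalized mapping (hypothetical, not kernel code):

	/* Logical block -> indirection path, 7 direct slots, `ind`
	 * pointers per indirect block (blocksize / 4 on disk). */
	static int demo_block_to_path(long block, long ind, int offsets[4])
	{
		int n = 0;

		if (block < 7) {				/* direct */
			offsets[n++] = block;
		} else if ((block -= 7) < ind) {		/* single indirect */
			offsets[n++] = 7;
			offsets[n++] = block;
		} else if ((block -= ind) < ind * ind) {	/* double indirect */
			offsets[n++] = 8;
			offsets[n++] = block / ind;
			offsets[n++] = block % ind;
		} else {					/* triple indirect */
			block -= ind * ind;
			offsets[n++] = 9;
			offsets[n++] = (block / ind) / ind;
			offsets[n++] = (block / ind) % ind;
			offsets[n++] = block % ind;
		}
		return n;
	}
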
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 9dcf95b42116..111f34ee9e3b 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -46,7 +46,7 @@ struct minix_sb_info {
46extern struct inode *minix_iget(struct super_block *, unsigned long); 46extern struct inode *minix_iget(struct super_block *, unsigned long);
47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); 47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); 48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
49extern struct inode * minix_new_inode(const struct inode * dir, int * error); 49extern struct inode * minix_new_inode(const struct inode *, int, int *);
50extern void minix_free_inode(struct inode * inode); 50extern void minix_free_inode(struct inode * inode);
51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); 51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
52extern int minix_new_block(struct inode * inode); 52extern int minix_new_block(struct inode * inode);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 32b131cd6121..e20ee85955d1 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_
46 if (!old_valid_dev(rdev)) 46 if (!old_valid_dev(rdev))
47 return -EINVAL; 47 return -EINVAL;
48 48
49 inode = minix_new_inode(dir, &error); 49 inode = minix_new_inode(dir, mode, &error);
50 50
51 if (inode) { 51 if (inode) {
52 inode->i_mode = mode;
53 minix_set_inode(inode, rdev); 52 minix_set_inode(inode, rdev);
54 mark_inode_dirty(inode); 53 mark_inode_dirty(inode);
55 error = add_nondir(dentry, inode); 54 error = add_nondir(dentry, inode);
@@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry,
73 if (i > dir->i_sb->s_blocksize) 72 if (i > dir->i_sb->s_blocksize)
74 goto out; 73 goto out;
75 74
76 inode = minix_new_inode(dir, &err); 75 inode = minix_new_inode(dir, S_IFLNK | 0777, &err);
77 if (!inode) 76 if (!inode)
78 goto out; 77 goto out;
79 78
80 inode->i_mode = S_IFLNK | 0777;
81 minix_set_inode(inode, 0); 79 minix_set_inode(inode, 0);
82 err = page_symlink(inode, symname, i); 80 err = page_symlink(inode, symname, i);
83 if (err) 81 if (err)
@@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
117 115
118 inode_inc_link_count(dir); 116 inode_inc_link_count(dir);
119 117
120 inode = minix_new_inode(dir, &err); 118 inode = minix_new_inode(dir, mode, &err);
121 if (!inode) 119 if (!inode)
122 goto out_dir; 120 goto out_dir;
123 121
124 inode->i_mode = S_IFDIR | mode;
125 if (dir->i_mode & S_ISGID)
126 inode->i_mode |= S_ISGID;
127 minix_set_inode(inode, 0); 122 minix_set_inode(inode, 0);
128 123
129 inode_inc_link_count(inode); 124 inode_inc_link_count(inode);
diff --git a/fs/mpage.c b/fs/mpage.c
index 598d54e200eb..fd56ca2ea556 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -16,6 +16,7 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/kdev_t.h> 18#include <linux/kdev_t.h>
19#include <linux/gfp.h>
19#include <linux/bio.h> 20#include <linux/bio.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
diff --git a/fs/namei.c b/fs/namei.c
index 1c0fca6e899e..868d0cb9d473 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -523,9 +523,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
523static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 523static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
524{ 524{
525 dput(nd->path.dentry); 525 dput(nd->path.dentry);
526 if (nd->path.mnt != path->mnt) 526 if (nd->path.mnt != path->mnt) {
527 mntput(nd->path.mnt); 527 mntput(nd->path.mnt);
528 nd->path.mnt = path->mnt; 528 nd->path.mnt = path->mnt;
529 }
529 nd->path.dentry = path->dentry; 530 nd->path.dentry = path->dentry;
530} 531}
531 532
@@ -1610,8 +1611,7 @@ exit:
1610 1611
1611static struct file *do_last(struct nameidata *nd, struct path *path, 1612static struct file *do_last(struct nameidata *nd, struct path *path,
1612 int open_flag, int acc_mode, 1613 int open_flag, int acc_mode,
1613 int mode, const char *pathname, 1614 int mode, const char *pathname)
1614 int *want_dir)
1615{ 1615{
1616 struct dentry *dir = nd->path.dentry; 1616 struct dentry *dir = nd->path.dentry;
1617 struct file *filp; 1617 struct file *filp;
@@ -1621,6 +1621,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1621 case LAST_DOTDOT: 1621 case LAST_DOTDOT:
1622 follow_dotdot(nd); 1622 follow_dotdot(nd);
1623 dir = nd->path.dentry; 1623 dir = nd->path.dentry;
1624 case LAST_DOT:
1624 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 1625 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1625 if (!dir->d_op->d_revalidate(dir, nd)) { 1626 if (!dir->d_op->d_revalidate(dir, nd)) {
1626 error = -ESTALE; 1627 error = -ESTALE;
@@ -1628,7 +1629,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1628 } 1629 }
1629 } 1630 }
1630 /* fallthrough */ 1631 /* fallthrough */
1631 case LAST_DOT:
1632 case LAST_ROOT: 1632 case LAST_ROOT:
1633 if (open_flag & O_CREAT) 1633 if (open_flag & O_CREAT)
1634 goto exit; 1634 goto exit;
@@ -1642,7 +1642,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1642 if (nd->last.name[nd->last.len]) { 1642 if (nd->last.name[nd->last.len]) {
1643 if (open_flag & O_CREAT) 1643 if (open_flag & O_CREAT)
1644 goto exit; 1644 goto exit;
1645 *want_dir = 1; 1645 nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
1646 } 1646 }
1647 1647
1648 /* just plain open? */ 1648 /* just plain open? */
@@ -1656,8 +1656,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1656 if (path->dentry->d_inode->i_op->follow_link) 1656 if (path->dentry->d_inode->i_op->follow_link)
1657 return NULL; 1657 return NULL;
1658 error = -ENOTDIR; 1658 error = -ENOTDIR;
1659 if (*want_dir && !path->dentry->d_inode->i_op->lookup) 1659 if (nd->flags & LOOKUP_DIRECTORY) {
1660 goto exit_dput; 1660 if (!path->dentry->d_inode->i_op->lookup)
1661 goto exit_dput;
1662 }
1661 path_to_nameidata(path, nd); 1663 path_to_nameidata(path, nd);
1662 audit_inode(pathname, nd->path.dentry); 1664 audit_inode(pathname, nd->path.dentry);
1663 goto ok; 1665 goto ok;
@@ -1766,7 +1768,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
1766 int count = 0; 1768 int count = 0;
1767 int flag = open_to_namei_flags(open_flag); 1769 int flag = open_to_namei_flags(open_flag);
1768 int force_reval = 0; 1770 int force_reval = 0;
1769 int want_dir = open_flag & O_DIRECTORY;
1770 1771
1771 if (!(open_flag & O_CREAT)) 1772 if (!(open_flag & O_CREAT))
1772 mode = 0; 1773 mode = 0;
@@ -1828,14 +1829,18 @@ reval:
1828 if (open_flag & O_EXCL) 1829 if (open_flag & O_EXCL)
1829 nd.flags |= LOOKUP_EXCL; 1830 nd.flags |= LOOKUP_EXCL;
1830 } 1831 }
1831 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); 1832 if (open_flag & O_DIRECTORY)
1833 nd.flags |= LOOKUP_DIRECTORY;
1834 if (!(open_flag & O_NOFOLLOW))
1835 nd.flags |= LOOKUP_FOLLOW;
1836 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1832 while (unlikely(!filp)) { /* trailing symlink */ 1837 while (unlikely(!filp)) { /* trailing symlink */
1833 struct path holder; 1838 struct path holder;
1834 struct inode *inode = path.dentry->d_inode; 1839 struct inode *inode = path.dentry->d_inode;
1835 void *cookie; 1840 void *cookie;
1836 error = -ELOOP; 1841 error = -ELOOP;
1837 /* S_ISDIR part is a temporary automount kludge */ 1842 /* S_ISDIR part is a temporary automount kludge */
1838 if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode)) 1843 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
1839 goto exit_dput; 1844 goto exit_dput;
1840 if (count++ == 32) 1845 if (count++ == 32)
1841 goto exit_dput; 1846 goto exit_dput;
@@ -1866,7 +1871,7 @@ reval:
1866 } 1871 }
1867 holder = path; 1872 holder = path;
1868 nd.flags &= ~LOOKUP_PARENT; 1873 nd.flags &= ~LOOKUP_PARENT;
1869 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); 1874 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1870 if (inode->i_op->put_link) 1875 if (inode->i_op->put_link)
1871 inode->i_op->put_link(holder.dentry, &nd, cookie); 1876 inode->i_op->put_link(holder.dentry, &nd, cookie);
1872 path_put(&holder); 1877 path_put(&holder);
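
The namei.c hunks above replace the want_dir out-parameter with nameidata flags computed once from the open flags, so do_last() and the trailing-symlink loop read the same state. The mapping, restated as a sketch:

	/* In do_filp_open(), before calling do_last(): */
	if (open_flag & O_DIRECTORY)
		nd.flags |= LOOKUP_DIRECTORY;	/* last component must be a dir */
	if (!(open_flag & O_NOFOLLOW))
		nd.flags |= LOOKUP_FOLLOW;	/* trailing symlink may be chased */
	/* a trailing slash on the name adds both inside do_last():
	 *	nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW; */
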
@@ -2172,8 +2177,10 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2172 error = security_inode_rmdir(dir, dentry); 2177 error = security_inode_rmdir(dir, dentry);
2173 if (!error) { 2178 if (!error) {
2174 error = dir->i_op->rmdir(dir, dentry); 2179 error = dir->i_op->rmdir(dir, dentry);
2175 if (!error) 2180 if (!error) {
2176 dentry->d_inode->i_flags |= S_DEAD; 2181 dentry->d_inode->i_flags |= S_DEAD;
2182 dont_mount(dentry);
2183 }
2177 } 2184 }
2178 } 2185 }
2179 mutex_unlock(&dentry->d_inode->i_mutex); 2186 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2257,7 +2264,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2257 if (!error) { 2264 if (!error) {
2258 error = dir->i_op->unlink(dir, dentry); 2265 error = dir->i_op->unlink(dir, dentry);
2259 if (!error) 2266 if (!error)
2260 dentry->d_inode->i_flags |= S_DEAD; 2267 dont_mount(dentry);
2261 } 2268 }
2262 } 2269 }
2263 mutex_unlock(&dentry->d_inode->i_mutex); 2270 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2568,17 +2575,20 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2568 return error; 2575 return error;
2569 2576
2570 target = new_dentry->d_inode; 2577 target = new_dentry->d_inode;
2571 if (target) { 2578 if (target)
2572 mutex_lock(&target->i_mutex); 2579 mutex_lock(&target->i_mutex);
2573 dentry_unhash(new_dentry);
2574 }
2575 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 2580 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2576 error = -EBUSY; 2581 error = -EBUSY;
2577 else 2582 else {
2583 if (target)
2584 dentry_unhash(new_dentry);
2578 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2585 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2586 }
2579 if (target) { 2587 if (target) {
2580 if (!error) 2588 if (!error) {
2581 target->i_flags |= S_DEAD; 2589 target->i_flags |= S_DEAD;
2590 dont_mount(new_dentry);
2591 }
2582 mutex_unlock(&target->i_mutex); 2592 mutex_unlock(&target->i_mutex);
2583 if (d_unhashed(new_dentry)) 2593 if (d_unhashed(new_dentry))
2584 d_rehash(new_dentry); 2594 d_rehash(new_dentry);
@@ -2610,7 +2620,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2610 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2620 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2611 if (!error) { 2621 if (!error) {
2612 if (target) 2622 if (target)
2613 target->i_flags |= S_DEAD; 2623 dont_mount(new_dentry);
2614 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 2624 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2615 d_move(old_dentry, new_dentry); 2625 d_move(old_dentry, new_dentry);
2616 } 2626 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 8174c8ab5c70..88058de59c7c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -628,7 +628,6 @@ repeat:
628 mnt->mnt_pinned = 0; 628 mnt->mnt_pinned = 0;
629 spin_unlock(&vfsmount_lock); 629 spin_unlock(&vfsmount_lock);
630 acct_auto_close_mnt(mnt); 630 acct_auto_close_mnt(mnt);
631 security_sb_umount_close(mnt);
632 goto repeat; 631 goto repeat;
633 } 632 }
634} 633}
@@ -1117,8 +1116,6 @@ static int do_umount(struct vfsmount *mnt, int flags)
1117 retval = 0; 1116 retval = 0;
1118 } 1117 }
1119 spin_unlock(&vfsmount_lock); 1118 spin_unlock(&vfsmount_lock);
1120 if (retval)
1121 security_sb_umount_busy(mnt);
1122 up_write(&namespace_sem); 1119 up_write(&namespace_sem);
1123 release_mounts(&umount_list); 1120 release_mounts(&umount_list);
1124 return retval; 1121 return retval;
@@ -1432,20 +1429,13 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1432 1429
1433 err = -ENOENT; 1430 err = -ENOENT;
1434 mutex_lock(&path->dentry->d_inode->i_mutex); 1431 mutex_lock(&path->dentry->d_inode->i_mutex);
1435 if (IS_DEADDIR(path->dentry->d_inode)) 1432 if (cant_mount(path->dentry))
1436 goto out_unlock;
1437
1438 err = security_sb_check_sb(mnt, path);
1439 if (err)
1440 goto out_unlock; 1433 goto out_unlock;
1441 1434
1442 err = -ENOENT;
1443 if (!d_unlinked(path->dentry)) 1435 if (!d_unlinked(path->dentry))
1444 err = attach_recursive_mnt(mnt, path, NULL); 1436 err = attach_recursive_mnt(mnt, path, NULL);
1445out_unlock: 1437out_unlock:
1446 mutex_unlock(&path->dentry->d_inode->i_mutex); 1438 mutex_unlock(&path->dentry->d_inode->i_mutex);
1447 if (!err)
1448 security_sb_post_addmount(mnt, path);
1449 return err; 1439 return err;
1450} 1440}
1451 1441
@@ -1581,8 +1571,6 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1581 } 1571 }
1582 up_write(&sb->s_umount); 1572 up_write(&sb->s_umount);
1583 if (!err) { 1573 if (!err) {
1584 security_sb_post_remount(path->mnt, flags, data);
1585
1586 spin_lock(&vfsmount_lock); 1574 spin_lock(&vfsmount_lock);
1587 touch_mnt_namespace(path->mnt->mnt_ns); 1575 touch_mnt_namespace(path->mnt->mnt_ns);
1588 spin_unlock(&vfsmount_lock); 1576 spin_unlock(&vfsmount_lock);
@@ -1623,7 +1611,7 @@ static int do_move_mount(struct path *path, char *old_name)
1623 1611
1624 err = -ENOENT; 1612 err = -ENOENT;
1625 mutex_lock(&path->dentry->d_inode->i_mutex); 1613 mutex_lock(&path->dentry->d_inode->i_mutex);
1626 if (IS_DEADDIR(path->dentry->d_inode)) 1614 if (cant_mount(path->dentry))
1627 goto out1; 1615 goto out1;
1628 1616
1629 if (d_unlinked(path->dentry)) 1617 if (d_unlinked(path->dentry))
@@ -2234,7 +2222,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2234 if (!check_mnt(root.mnt)) 2222 if (!check_mnt(root.mnt))
2235 goto out2; 2223 goto out2;
2236 error = -ENOENT; 2224 error = -ENOENT;
2237 if (IS_DEADDIR(new.dentry->d_inode)) 2225 if (cant_mount(old.dentry))
2238 goto out2; 2226 goto out2;
2239 if (d_unlinked(new.dentry)) 2227 if (d_unlinked(new.dentry))
2240 goto out2; 2228 goto out2;
@@ -2277,7 +2265,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2277 touch_mnt_namespace(current->nsproxy->mnt_ns); 2265 touch_mnt_namespace(current->nsproxy->mnt_ns);
2278 spin_unlock(&vfsmount_lock); 2266 spin_unlock(&vfsmount_lock);
2279 chroot_fs_refs(&root, &new); 2267 chroot_fs_refs(&root, &new);
2280 security_sb_post_pivotroot(&root, &new);
2281 error = 0; 2268 error = 0;
2282 path_put(&root_parent); 2269 path_put(&root_parent);
2283 path_put(&parent_path); 2270 path_put(&parent_path);
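
Taken together, the namei.c and namespace.c hunks swap the old convention of setting S_DEAD on a victim inode for a per-dentry flag: unlink/rmdir/rename mark the victim via dont_mount(), and every place that used to test IS_DEADDIR() before mounting (graft_tree, do_move_mount, pivot_root) now asks cant_mount() on the dentry. The helpers are introduced elsewhere in this patch; a sketch of their likely shape, assuming a DCACHE_CANT_MOUNT dentry flag:

	static inline void dont_mount(struct dentry *dentry)
	{
		spin_lock(&dentry->d_lock);
		dentry->d_flags |= DCACHE_CANT_MOUNT;
		spin_unlock(&dentry->d_lock);
	}

	static inline int cant_mount(struct dentry *dentry)
	{
		return dentry->d_flags & DCACHE_CANT_MOUNT;
	}
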
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b8b5b30d53f0..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -15,7 +15,6 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/mm.h> 19#include <linux/mm.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
@@ -50,9 +49,10 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
50 49
51const struct file_operations ncp_dir_operations = 50const struct file_operations ncp_dir_operations =
52{ 51{
52 .llseek = generic_file_llseek,
53 .read = generic_read_dir, 53 .read = generic_read_dir,
54 .readdir = ncp_readdir, 54 .readdir = ncp_readdir,
55 .ioctl = ncp_ioctl, 55 .unlocked_ioctl = ncp_ioctl,
56#ifdef CONFIG_COMPAT 56#ifdef CONFIG_COMPAT
57 .compat_ioctl = ncp_compat_ioctl, 57 .compat_ioctl = ncp_compat_ioctl,
58#endif 58#endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6a7d901f1936..3639cc5cbdae 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -15,7 +15,6 @@
15#include <linux/fcntl.h> 15#include <linux/fcntl.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/sched.h> 19#include <linux/sched.h>
21#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
@@ -23,7 +22,7 @@
23#include <linux/ncp_fs.h> 22#include <linux/ncp_fs.h>
24#include "ncplib_kernel.h" 23#include "ncplib_kernel.h"
25 24
26static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync) 25static int ncp_fsync(struct file *file, int datasync)
27{ 26{
28 return 0; 27 return 0;
29} 28}
@@ -296,7 +295,7 @@ const struct file_operations ncp_file_operations =
296 .llseek = ncp_remote_llseek, 295 .llseek = ncp_remote_llseek,
297 .read = ncp_file_read, 296 .read = ncp_file_read,
298 .write = ncp_file_write, 297 .write = ncp_file_write,
299 .ioctl = ncp_ioctl, 298 .unlocked_ioctl = ncp_ioctl,
300#ifdef CONFIG_COMPAT 299#ifdef CONFIG_COMPAT
301 .compat_ioctl = ncp_compat_ioctl, 300 .compat_ioctl = ncp_compat_ioctl,
302#endif 301#endif
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index cf98da1be23e..fa3385154023 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -526,10 +526,15 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
526 sb->s_blocksize_bits = 10; 526 sb->s_blocksize_bits = 10;
527 sb->s_magic = NCP_SUPER_MAGIC; 527 sb->s_magic = NCP_SUPER_MAGIC;
528 sb->s_op = &ncp_sops; 528 sb->s_op = &ncp_sops;
529 sb->s_bdi = &server->bdi;
529 530
530 server = NCP_SBP(sb); 531 server = NCP_SBP(sb);
531 memset(server, 0, sizeof(*server)); 532 memset(server, 0, sizeof(*server));
532 533
534 error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
535 if (error)
536 goto out_bdi;
537
533 server->ncp_filp = ncp_filp; 538 server->ncp_filp = ncp_filp;
534 server->ncp_sock = sock; 539 server->ncp_sock = sock;
535 540
@@ -719,6 +724,8 @@ out_fput2:
719 if (server->info_filp) 724 if (server->info_filp)
720 fput(server->info_filp); 725 fput(server->info_filp);
721out_fput: 726out_fput:
727 bdi_destroy(&server->bdi);
728out_bdi:
722 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: 729 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
723 * 730 *
724 * The previously used put_filp(ncp_filp); was bogus, since 731 * The previously used put_filp(ncp_filp); was bogus, since
@@ -756,6 +763,7 @@ static void ncp_put_super(struct super_block *sb)
756 kill_pid(server->m.wdog_pid, SIGTERM, 1); 763 kill_pid(server->m.wdog_pid, SIGTERM, 1);
757 put_pid(server->m.wdog_pid); 764 put_pid(server->m.wdog_pid);
758 765
766 bdi_destroy(&server->bdi);
759 kfree(server->priv.data); 767 kfree(server->priv.data);
760 kfree(server->auth.object_name); 768 kfree(server->auth.object_name);
761 vfree(server->rxbuf); 769 vfree(server->rxbuf);
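
ncpfs gains a per-mount backing_dev_info here: bdi_setup_and_register() in ncp_fill_super(), bdi_destroy() on the failure path and in ncp_put_super(). The lifecycle pairing, sketched with a hypothetical fs-private struct standing in for ncp_server:

	struct demo_server { struct backing_dev_info bdi; };

	static int demo_fill_super(struct super_block *sb, struct demo_server *srv)
	{
		int error = bdi_setup_and_register(&srv->bdi, "demo",
						   BDI_CAP_MAP_COPY);
		if (error)
			return error;		/* nothing to destroy yet */
		sb->s_bdi = &srv->bdi;		/* writeback now targets this bdi */
		return 0;
	}

	static void demo_kill_bdi(struct demo_server *srv)
	{
		bdi_destroy(&srv->bdi);		/* pairs with setup_and_register */
	}
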
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index ec8f45f12e05..023c03d02070 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -15,10 +15,12 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/slab.h>
18#include <linux/highuid.h> 19#include <linux/highuid.h>
19#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
20#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/smp_lock.h>
22 24
23#include <linux/ncp_fs.h> 25#include <linux/ncp_fs.h>
24 26
@@ -260,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
260} 262}
261#endif /* CONFIG_NCPFS_NLS */ 263#endif /* CONFIG_NCPFS_NLS */
262 264
263static int __ncp_ioctl(struct inode *inode, struct file *filp, 265static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
264 unsigned int cmd, unsigned long arg)
265{ 266{
267 struct inode *inode = filp->f_dentry->d_inode;
266 struct ncp_server *server = NCP_SERVER(inode); 268 struct ncp_server *server = NCP_SERVER(inode);
267 int result; 269 int result;
268 struct ncp_ioctl_request request; 270 struct ncp_ioctl_request request;
@@ -840,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
840 } 842 }
841} 843}
842 844
843int ncp_ioctl(struct inode *inode, struct file *filp, 845long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
844 unsigned int cmd, unsigned long arg)
845{ 846{
846 int ret; 847 long ret;
847 848
849 lock_kernel();
848 if (ncp_ioctl_need_write(cmd)) { 850 if (ncp_ioctl_need_write(cmd)) {
849 /* 851 /*
850 * inside the ioctl(), any failures which 852 * inside the ioctl(), any failures which
@@ -852,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
852 * -EACCESS, so it seems consistent to keep 854 * -EACCESS, so it seems consistent to keep
853 * that here. 855 * that here.
854 */ 856 */
855 if (mnt_want_write(filp->f_path.mnt)) 857 if (mnt_want_write(filp->f_path.mnt)) {
856 return -EACCES; 858 ret = -EACCES;
859 goto out;
860 }
857 } 861 }
858 ret = __ncp_ioctl(inode, filp, cmd, arg); 862 ret = __ncp_ioctl(filp, cmd, arg);
859 if (ncp_ioctl_need_write(cmd)) 863 if (ncp_ioctl_need_write(cmd))
860 mnt_drop_write(filp->f_path.mnt); 864 mnt_drop_write(filp->f_path.mnt);
865
866out:
867 unlock_kernel();
861 return ret; 868 return ret;
862} 869}
863 870
864#ifdef CONFIG_COMPAT 871#ifdef CONFIG_COMPAT
865long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 872long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
866{ 873{
867 struct inode *inode = file->f_path.dentry->d_inode; 874 long ret;
868 int ret;
869 875
870 lock_kernel(); 876 lock_kernel();
871 arg = (unsigned long) compat_ptr(arg); 877 arg = (unsigned long) compat_ptr(arg);
872 ret = ncp_ioctl(inode, file, cmd, arg); 878 ret = ncp_ioctl(file, cmd, arg);
873 unlock_kernel(); 879 unlock_kernel();
874 return ret; 880 return ret;
875} 881}
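
The ioctl changes above are the standard BKL pushdown: the VFS used to take the big kernel lock around file_operations.ioctl, so converting to .unlocked_ioctl means the filesystem takes lock_kernel() itself, keeping the locking behaviour unchanged while the VFS entry point becomes lock-free. The generic shape, with hypothetical foo_* names:

	static long foo_ioctl(struct file *filp, unsigned int cmd,
			      unsigned long arg)
	{
		long ret;

		lock_kernel();		/* preserve the old serialization, locally */
		ret = __foo_ioctl(filp, cmd, arg);
		unlock_kernel();
		return ret;
	}

	/* in struct file_operations:
	 *	.unlocked_ioctl	= foo_ioctl,	(was: .ioctl = ...) */
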
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458decdb8a..56f5b3a0e1ee 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -9,12 +9,12 @@
9#include <linux/stat.h> 9#include <linux/stat.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/shm.h> 14#include <linux/shm.h>
14#include <linux/errno.h> 15#include <linux/errno.h>
15#include <linux/mman.h> 16#include <linux/mman.h>
16#include <linux/string.h> 17#include <linux/string.h>
17#include <linux/slab.h>
18#include <linux/fcntl.h> 18#include <linux/fcntl.h>
19#include <linux/ncp_fs.h> 19#include <linux/ncp_fs.h>
20 20
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e37df8d5fe70..c7ff6c700a6e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/netdevice.h> 22#include <linux/netdevice.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/slab.h>
24#include <net/scm.h> 25#include <net/scm.h>
25#include <net/sock.h> 26#include <net/sock.h>
26#include <linux/ipx.h> 27#include <linux/ipx.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e3d26c1bd105..c634fd17b337 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/ncp_fs.h> 28#include <linux/ncp_fs.h>
29#include <linux/time.h> 29#include <linux/time.h>
30#include <linux/slab.h>
30#include <linux/mm.h> 31#include <linux/mm.h>
31#include <linux/stat.h> 32#include <linux/stat.h>
32#include "ncplib_kernel.h" 33#include "ncplib_kernel.h"
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b4ffd0146ea6..84690319e625 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -10,6 +10,7 @@
10#include <linux/moduleparam.h> 10#include <linux/moduleparam.h>
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/slab.h>
13#include <linux/sunrpc/cache.h> 14#include <linux/sunrpc/cache.h>
14#include <linux/sunrpc/rpc_pipe_fs.h> 15#include <linux/sunrpc/rpc_pipe_fs.h>
15 16
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 84761b5bb8e2..a08770a7e857 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include <linux/nfs4.h> 8#include <linux/nfs4.h>
9#include <linux/nfs_fs.h> 9#include <linux/nfs_fs.h>
10#include <linux/slab.h>
10#include "nfs4_fs.h" 11#include "nfs4_fs.h"
11#include "callback.h" 12#include "callback.h"
12#include "delegation.h" 13#include "delegation.h"
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a2b8b4df125d..05af212f0edf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,7 @@
9#include <linux/sunrpc/svc.h> 9#include <linux/sunrpc/svc.h>
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/slab.h>
12#include "nfs4_fs.h" 13#include "nfs4_fs.h"
13#include "callback.h" 14#include "callback.h"
14 15
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2274f1737336..7ec9b34a59f8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -35,6 +35,7 @@
35#include <linux/vfs.h> 35#include <linux/vfs.h>
36#include <linux/inet.h> 36#include <linux/inet.h>
37#include <linux/in6.h> 37#include <linux/in6.h>
38#include <linux/slab.h>
38#include <net/ipv6.h> 39#include <net/ipv6.h>
39#include <linux/nfs_xdr.h> 40#include <linux/nfs_xdr.h>
40#include <linux/sunrpc/bc_xprt.h> 41#include <linux/sunrpc/bc_xprt.h>
@@ -933,7 +934,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
933 } 934 }
934 935
935 fsinfo.fattr = fattr; 936 fsinfo.fattr = fattr;
936 nfs_fattr_init(fattr);
937 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); 937 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
938 if (error < 0) 938 if (error < 0)
939 goto out_error; 939 goto out_error;
@@ -965,6 +965,8 @@ out_error:
965static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) 965static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
966{ 966{
967 target->flags = source->flags; 967 target->flags = source->flags;
968 target->rsize = source->rsize;
969 target->wsize = source->wsize;
968 target->acregmin = source->acregmin; 970 target->acregmin = source->acregmin;
969 target->acregmax = source->acregmax; 971 target->acregmax = source->acregmax;
970 target->acdirmin = source->acdirmin; 972 target->acdirmin = source->acdirmin;
@@ -1044,13 +1046,18 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1044 struct nfs_fh *mntfh) 1046 struct nfs_fh *mntfh)
1045{ 1047{
1046 struct nfs_server *server; 1048 struct nfs_server *server;
1047 struct nfs_fattr fattr; 1049 struct nfs_fattr *fattr;
1048 int error; 1050 int error;
1049 1051
1050 server = nfs_alloc_server(); 1052 server = nfs_alloc_server();
1051 if (!server) 1053 if (!server)
1052 return ERR_PTR(-ENOMEM); 1054 return ERR_PTR(-ENOMEM);
1053 1055
1056 error = -ENOMEM;
1057 fattr = nfs_alloc_fattr();
1058 if (fattr == NULL)
1059 goto error;
1060
1054 /* Get a client representation */ 1061 /* Get a client representation */
1055 error = nfs_init_server(server, data); 1062 error = nfs_init_server(server, data);
1056 if (error < 0) 1063 if (error < 0)
@@ -1061,7 +1068,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1061 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1068 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1062 1069
1063 /* Probe the root fh to retrieve its FSID */ 1070 /* Probe the root fh to retrieve its FSID */
1064 error = nfs_probe_fsinfo(server, mntfh, &fattr); 1071 error = nfs_probe_fsinfo(server, mntfh, fattr);
1065 if (error < 0) 1072 if (error < 0)
1066 goto error; 1073 goto error;
1067 if (server->nfs_client->rpc_ops->version == 3) { 1074 if (server->nfs_client->rpc_ops->version == 3) {
@@ -1074,14 +1081,14 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1074 server->namelen = NFS2_MAXNAMLEN; 1081 server->namelen = NFS2_MAXNAMLEN;
1075 } 1082 }
1076 1083
1077 if (!(fattr.valid & NFS_ATTR_FATTR)) { 1084 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1078 error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 1085 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
1079 if (error < 0) { 1086 if (error < 0) {
1080 dprintk("nfs_create_server: getattr error = %d\n", -error); 1087 dprintk("nfs_create_server: getattr error = %d\n", -error);
1081 goto error; 1088 goto error;
1082 } 1089 }
1083 } 1090 }
1084 memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); 1091 memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
1085 1092
1086 dprintk("Server FSID: %llx:%llx\n", 1093 dprintk("Server FSID: %llx:%llx\n",
1087 (unsigned long long) server->fsid.major, 1094 (unsigned long long) server->fsid.major,
@@ -1093,9 +1100,11 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1093 spin_unlock(&nfs_client_lock); 1100 spin_unlock(&nfs_client_lock);
1094 1101
1095 server->mount_time = jiffies; 1102 server->mount_time = jiffies;
1103 nfs_free_fattr(fattr);
1096 return server; 1104 return server;
1097 1105
1098error: 1106error:
1107 nfs_free_fattr(fattr);
1099 nfs_free_server(server); 1108 nfs_free_server(server);
1100 return ERR_PTR(error); 1109 return ERR_PTR(error);
1101} 1110}
@@ -1293,7 +1302,8 @@ static int nfs4_init_server(struct nfs_server *server,
1293 1302
1294 /* Initialise the client representation from the mount data */ 1303 /* Initialise the client representation from the mount data */
1295 server->flags = data->flags; 1304 server->flags = data->flags;
1296 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; 1305 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
1306 NFS_CAP_POSIX_LOCK;
1297 server->options = data->options; 1307 server->options = data->options;
1298 1308
1299 /* Get a client record */ 1309 /* Get a client record */
@@ -1336,7 +1346,7 @@ error:
1336struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, 1346struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1337 struct nfs_fh *mntfh) 1347 struct nfs_fh *mntfh)
1338{ 1348{
1339 struct nfs_fattr fattr; 1349 struct nfs_fattr *fattr;
1340 struct nfs_server *server; 1350 struct nfs_server *server;
1341 int error; 1351 int error;
1342 1352
@@ -1346,6 +1356,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1346 if (!server) 1356 if (!server)
1347 return ERR_PTR(-ENOMEM); 1357 return ERR_PTR(-ENOMEM);
1348 1358
1359 error = -ENOMEM;
1360 fattr = nfs_alloc_fattr();
1361 if (fattr == NULL)
1362 goto error;
1363
1349 /* set up the general RPC client */ 1364 /* set up the general RPC client */
1350 error = nfs4_init_server(server, data); 1365 error = nfs4_init_server(server, data);
1351 if (error < 0) 1366 if (error < 0)
@@ -1360,7 +1375,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1360 goto error; 1375 goto error;
1361 1376
1362 /* Probe the root fh to retrieve its FSID */ 1377 /* Probe the root fh to retrieve its FSID */
1363 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); 1378 error = nfs4_get_rootfh(server, mntfh);
1364 if (error < 0) 1379 if (error < 0)
1365 goto error; 1380 goto error;
1366 1381
@@ -1371,7 +1386,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1371 1386
1372 nfs4_session_set_rwsize(server); 1387 nfs4_session_set_rwsize(server);
1373 1388
1374 error = nfs_probe_fsinfo(server, mntfh, &fattr); 1389 error = nfs_probe_fsinfo(server, mntfh, fattr);
1375 if (error < 0) 1390 if (error < 0)
1376 goto error; 1391 goto error;
1377 1392
@@ -1385,9 +1400,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1385 1400
1386 server->mount_time = jiffies; 1401 server->mount_time = jiffies;
1387 dprintk("<-- nfs4_create_server() = %p\n", server); 1402 dprintk("<-- nfs4_create_server() = %p\n", server);
1403 nfs_free_fattr(fattr);
1388 return server; 1404 return server;
1389 1405
1390error: 1406error:
1407 nfs_free_fattr(fattr);
1391 nfs_free_server(server); 1408 nfs_free_server(server);
1392 dprintk("<-- nfs4_create_server() = error %d\n", error); 1409 dprintk("<-- nfs4_create_server() = error %d\n", error);
1393 return ERR_PTR(error); 1410 return ERR_PTR(error);
@@ -1401,7 +1418,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1401{ 1418{
1402 struct nfs_client *parent_client; 1419 struct nfs_client *parent_client;
1403 struct nfs_server *server, *parent_server; 1420 struct nfs_server *server, *parent_server;
1404 struct nfs_fattr fattr; 1421 struct nfs_fattr *fattr;
1405 int error; 1422 int error;
1406 1423
1407 dprintk("--> nfs4_create_referral_server()\n"); 1424 dprintk("--> nfs4_create_referral_server()\n");
@@ -1410,6 +1427,11 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1410 if (!server) 1427 if (!server)
1411 return ERR_PTR(-ENOMEM); 1428 return ERR_PTR(-ENOMEM);
1412 1429
1430 error = -ENOMEM;
1431 fattr = nfs_alloc_fattr();
1432 if (fattr == NULL)
1433 goto error;
1434
1413 parent_server = NFS_SB(data->sb); 1435 parent_server = NFS_SB(data->sb);
1414 parent_client = parent_server->nfs_client; 1436 parent_client = parent_server->nfs_client;
1415 1437
@@ -1439,12 +1461,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1439 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1461 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1440 1462
1441 /* Probe the root fh to retrieve its FSID and filehandle */ 1463 /* Probe the root fh to retrieve its FSID and filehandle */
1442 error = nfs4_path_walk(server, mntfh, data->mnt_path); 1464 error = nfs4_get_rootfh(server, mntfh);
1443 if (error < 0) 1465 if (error < 0)
1444 goto error; 1466 goto error;
1445 1467
1446 /* probe the filesystem info for this server filesystem */ 1468 /* probe the filesystem info for this server filesystem */
1447 error = nfs_probe_fsinfo(server, mntfh, &fattr); 1469 error = nfs_probe_fsinfo(server, mntfh, fattr);
1448 if (error < 0) 1470 if (error < 0)
1449 goto error; 1471 goto error;
1450 1472
@@ -1462,10 +1484,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1462 1484
1463 server->mount_time = jiffies; 1485 server->mount_time = jiffies;
1464 1486
1487 nfs_free_fattr(fattr);
1465 dprintk("<-- nfs_create_referral_server() = %p\n", server); 1488 dprintk("<-- nfs_create_referral_server() = %p\n", server);
1466 return server; 1489 return server;
1467 1490
1468error: 1491error:
1492 nfs_free_fattr(fattr);
1469 nfs_free_server(server); 1493 nfs_free_server(server);
1470 dprintk("<-- nfs4_create_referral_server() = error %d\n", error); 1494 dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
1471 return ERR_PTR(error); 1495 return ERR_PTR(error);
@@ -1481,7 +1505,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1481 struct nfs_fattr *fattr) 1505 struct nfs_fattr *fattr)
1482{ 1506{
1483 struct nfs_server *server; 1507 struct nfs_server *server;
1484 struct nfs_fattr fattr_fsinfo; 1508 struct nfs_fattr *fattr_fsinfo;
1485 int error; 1509 int error;
1486 1510
1487 dprintk("--> nfs_clone_server(,%llx:%llx,)\n", 1511 dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
@@ -1492,6 +1516,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1492 if (!server) 1516 if (!server)
1493 return ERR_PTR(-ENOMEM); 1517 return ERR_PTR(-ENOMEM);
1494 1518
1519 error = -ENOMEM;
1520 fattr_fsinfo = nfs_alloc_fattr();
1521 if (fattr_fsinfo == NULL)
1522 goto out_free_server;
1523
1495 /* Copy data from the source */ 1524 /* Copy data from the source */
1496 server->nfs_client = source->nfs_client; 1525 server->nfs_client = source->nfs_client;
1497 atomic_inc(&server->nfs_client->cl_count); 1526 atomic_inc(&server->nfs_client->cl_count);
@@ -1508,7 +1537,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1508 nfs_init_server_aclclient(server); 1537 nfs_init_server_aclclient(server);
1509 1538
1510 /* probe the filesystem info for this server filesystem */ 1539 /* probe the filesystem info for this server filesystem */
1511 error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo); 1540 error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
1512 if (error < 0) 1541 if (error < 0)
1513 goto out_free_server; 1542 goto out_free_server;
1514 1543
@@ -1530,10 +1559,12 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1530 1559
1531 server->mount_time = jiffies; 1560 server->mount_time = jiffies;
1532 1561
1562 nfs_free_fattr(fattr_fsinfo);
1533 dprintk("<-- nfs_clone_server() = %p\n", server); 1563 dprintk("<-- nfs_clone_server() = %p\n", server);
1534 return server; 1564 return server;
1535 1565
1536out_free_server: 1566out_free_server:
1567 nfs_free_fattr(fattr_fsinfo);
1537 nfs_free_server(server); 1568 nfs_free_server(server);
1538 dprintk("<-- nfs_clone_server() = error %d\n", error); 1569 dprintk("<-- nfs_clone_server() = error %d\n", error);
1539 return ERR_PTR(error); 1570 return ERR_PTR(error);
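
Every constructor in this file used to keep a struct nfs_fattr on the stack; it is a large structure, and these functions can run on already-deep call chains, so the patch heap-allocates it instead. This is also why the explicit nfs_fattr_init() call disappears from nfs_probe_fsinfo() above: nfs_alloc_fattr() is assumed to hand back an already-initialized fattr. The recurring shape of the conversion:

	struct nfs_fattr *fattr;
	int error = -ENOMEM;

	fattr = nfs_alloc_fattr();
	if (fattr == NULL)
		goto error;
	/* ... probe/getattr calls take the pointer instead of &stack_fattr ... */
	nfs_free_fattr(fattr);		/* freed on success ... */
	return server;
error:
	nfs_free_fattr(fattr);		/* ... and on every failure path */
	nfs_free_server(server);
	return ERR_PTR(error);
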
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 2563bebc4c67..301634543974 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h>
13#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
14#include <linux/spinlock.h> 15#include <linux/spinlock.h>
15 16
@@ -23,6 +24,8 @@
23 24
24static void nfs_do_free_delegation(struct nfs_delegation *delegation) 25static void nfs_do_free_delegation(struct nfs_delegation *delegation)
25{ 26{
27 if (delegation->cred)
28 put_rpccred(delegation->cred);
26 kfree(delegation); 29 kfree(delegation);
27} 30}
28 31
@@ -35,13 +38,7 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
35 38
36static void nfs_free_delegation(struct nfs_delegation *delegation) 39static void nfs_free_delegation(struct nfs_delegation *delegation)
37{ 40{
38 struct rpc_cred *cred;
39
40 cred = rcu_dereference(delegation->cred);
41 rcu_assign_pointer(delegation->cred, NULL);
42 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 41 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
43 if (cred)
44 put_rpccred(cred);
45} 42}
46 43
47void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) 44void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
@@ -128,21 +125,35 @@ again:
128 */ 125 */
129void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 126void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
130{ 127{
131 struct nfs_delegation *delegation = NFS_I(inode)->delegation; 128 struct nfs_delegation *delegation;
132 struct rpc_cred *oldcred; 129 struct rpc_cred *oldcred = NULL;
133 130
134 if (delegation == NULL) 131 rcu_read_lock();
135 return; 132 delegation = rcu_dereference(NFS_I(inode)->delegation);
136 memcpy(delegation->stateid.data, res->delegation.data, 133 if (delegation != NULL) {
137 sizeof(delegation->stateid.data)); 134 spin_lock(&delegation->lock);
138 delegation->type = res->delegation_type; 135 if (delegation->inode != NULL) {
139 delegation->maxsize = res->maxsize; 136 memcpy(delegation->stateid.data, res->delegation.data,
140 oldcred = delegation->cred; 137 sizeof(delegation->stateid.data));
141 delegation->cred = get_rpccred(cred); 138 delegation->type = res->delegation_type;
142 clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); 139 delegation->maxsize = res->maxsize;
143 NFS_I(inode)->delegation_state = delegation->type; 140 oldcred = delegation->cred;
144 smp_wmb(); 141 delegation->cred = get_rpccred(cred);
145 put_rpccred(oldcred); 142 clear_bit(NFS_DELEGATION_NEED_RECLAIM,
143 &delegation->flags);
144 NFS_I(inode)->delegation_state = delegation->type;
145 spin_unlock(&delegation->lock);
146 put_rpccred(oldcred);
147 rcu_read_unlock();
148 } else {
149 /* We appear to have raced with a delegation return. */
150 spin_unlock(&delegation->lock);
151 rcu_read_unlock();
152 nfs_inode_set_delegation(inode, cred, res);
153 }
154 } else {
155 rcu_read_unlock();
156 }
146} 157}
147 158
148static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) 159static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
@@ -165,9 +176,13 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
165 return inode; 176 return inode;
166} 177}
167 178
168static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) 179static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi,
180 const nfs4_stateid *stateid,
181 struct nfs_client *clp)
169{ 182{
170 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); 183 struct nfs_delegation *delegation =
184 rcu_dereference_protected(nfsi->delegation,
185 lockdep_is_held(&clp->cl_lock));
171 186
172 if (delegation == NULL) 187 if (delegation == NULL)
173 goto nomatch; 188 goto nomatch;
@@ -194,11 +209,11 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
194{ 209{
195 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 210 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
196 struct nfs_inode *nfsi = NFS_I(inode); 211 struct nfs_inode *nfsi = NFS_I(inode);
197 struct nfs_delegation *delegation; 212 struct nfs_delegation *delegation, *old_delegation;
198 struct nfs_delegation *freeme = NULL; 213 struct nfs_delegation *freeme = NULL;
199 int status = 0; 214 int status = 0;
200 215
201 delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); 216 delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
202 if (delegation == NULL) 217 if (delegation == NULL)
203 return -ENOMEM; 218 return -ENOMEM;
204 memcpy(delegation->stateid.data, res->delegation.data, 219 memcpy(delegation->stateid.data, res->delegation.data,
@@ -212,10 +227,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
212 spin_lock_init(&delegation->lock); 227 spin_lock_init(&delegation->lock);
213 228
214 spin_lock(&clp->cl_lock); 229 spin_lock(&clp->cl_lock);
215 if (rcu_dereference(nfsi->delegation) != NULL) { 230 old_delegation = rcu_dereference_protected(nfsi->delegation,
216 if (memcmp(&delegation->stateid, &nfsi->delegation->stateid, 231 lockdep_is_held(&clp->cl_lock));
217 sizeof(delegation->stateid)) == 0 && 232 if (old_delegation != NULL) {
218 delegation->type == nfsi->delegation->type) { 233 if (memcmp(&delegation->stateid, &old_delegation->stateid,
234 sizeof(old_delegation->stateid)) == 0 &&
235 delegation->type == old_delegation->type) {
219 goto out; 236 goto out;
220 } 237 }
221 /* 238 /*
@@ -225,12 +242,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
225 dfprintk(FILE, "%s: server %s handed out " 242 dfprintk(FILE, "%s: server %s handed out "
226 "a duplicate delegation!\n", 243 "a duplicate delegation!\n",
227 __func__, clp->cl_hostname); 244 __func__, clp->cl_hostname);
228 if (delegation->type <= nfsi->delegation->type) { 245 if (delegation->type <= old_delegation->type) {
229 freeme = delegation; 246 freeme = delegation;
230 delegation = NULL; 247 delegation = NULL;
231 goto out; 248 goto out;
232 } 249 }
233 freeme = nfs_detach_delegation_locked(nfsi, NULL); 250 freeme = nfs_detach_delegation_locked(nfsi, NULL, clp);
234 } 251 }
235 list_add_rcu(&delegation->super_list, &clp->cl_delegations); 252 list_add_rcu(&delegation->super_list, &clp->cl_delegations);
236 nfsi->delegation_state = delegation->type; 253 nfsi->delegation_state = delegation->type;
@@ -300,7 +317,7 @@ restart:
300 if (inode == NULL) 317 if (inode == NULL)
301 continue; 318 continue;
302 spin_lock(&clp->cl_lock); 319 spin_lock(&clp->cl_lock);
303 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 320 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
304 spin_unlock(&clp->cl_lock); 321 spin_unlock(&clp->cl_lock);
305 rcu_read_unlock(); 322 rcu_read_unlock();
306 if (delegation != NULL) { 323 if (delegation != NULL) {
@@ -329,9 +346,9 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
329 struct nfs_inode *nfsi = NFS_I(inode); 346 struct nfs_inode *nfsi = NFS_I(inode);
330 struct nfs_delegation *delegation; 347 struct nfs_delegation *delegation;
331 348
332 if (rcu_dereference(nfsi->delegation) != NULL) { 349 if (rcu_access_pointer(nfsi->delegation) != NULL) {
333 spin_lock(&clp->cl_lock); 350 spin_lock(&clp->cl_lock);
334 delegation = nfs_detach_delegation_locked(nfsi, NULL); 351 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
335 spin_unlock(&clp->cl_lock); 352 spin_unlock(&clp->cl_lock);
336 if (delegation != NULL) 353 if (delegation != NULL)
337 nfs_do_return_delegation(inode, delegation, 0); 354 nfs_do_return_delegation(inode, delegation, 0);
@@ -345,9 +362,9 @@ int nfs_inode_return_delegation(struct inode *inode)
345 struct nfs_delegation *delegation; 362 struct nfs_delegation *delegation;
346 int err = 0; 363 int err = 0;
347 364
348 if (rcu_dereference(nfsi->delegation) != NULL) { 365 if (rcu_access_pointer(nfsi->delegation) != NULL) {
349 spin_lock(&clp->cl_lock); 366 spin_lock(&clp->cl_lock);
350 delegation = nfs_detach_delegation_locked(nfsi, NULL); 367 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
351 spin_unlock(&clp->cl_lock); 368 spin_unlock(&clp->cl_lock);
352 if (delegation != NULL) { 369 if (delegation != NULL) {
353 nfs_msync_inode(inode); 370 nfs_msync_inode(inode);
@@ -525,7 +542,7 @@ restart:
525 if (inode == NULL) 542 if (inode == NULL)
526 continue; 543 continue;
527 spin_lock(&clp->cl_lock); 544 spin_lock(&clp->cl_lock);
528 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 545 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
529 spin_unlock(&clp->cl_lock); 546 spin_unlock(&clp->cl_lock);
530 rcu_read_unlock(); 547 rcu_read_unlock();
531 if (delegation != NULL) 548 if (delegation != NULL)
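
The delegation changes are mostly about using the RCU accessor that matches the calling context (which also keeps lockdep-RCU quiet), plus a GFP_KERNEL to GFP_NOFS switch so the allocation cannot recurse into filesystem writeback. The discipline, as applied in this file:

	/* Bare NULL test, never dereferenced: no RCU section needed. */
	if (rcu_access_pointer(nfsi->delegation) != NULL) {
		spin_lock(&clp->cl_lock);
		/* Update side: the spinlock makes the pointer stable. */
		delegation = rcu_dereference_protected(nfsi->delegation,
					lockdep_is_held(&clp->cl_lock));
		/* ... detach under the lock ... */
		spin_unlock(&clp->cl_lock);
	}

	rcu_read_lock();
	/* Read side: valid only until rcu_read_unlock(). */
	delegation = rcu_dereference(nfsi->delegation);
	rcu_read_unlock();
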
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c6f2750648f4..782b431ef91c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); 53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
54static int nfs_rename(struct inode *, struct dentry *, 54static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 55 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, struct dentry *, int); 56static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 57static loff_t nfs_llseek_dir(struct file *, loff_t, int);
58 58
59const struct file_operations nfs_dir_operations = { 59const struct file_operations nfs_dir_operations = {
@@ -530,9 +530,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
530 nfs_readdir_descriptor_t my_desc, 530 nfs_readdir_descriptor_t my_desc,
531 *desc = &my_desc; 531 *desc = &my_desc;
532 struct nfs_entry my_entry; 532 struct nfs_entry my_entry;
533 struct nfs_fh fh; 533 int res = -ENOMEM;
534 struct nfs_fattr fattr;
535 long res;
536 534
537 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 535 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
538 dentry->d_parent->d_name.name, dentry->d_name.name, 536 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -554,9 +552,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
554 552
555 my_entry.cookie = my_entry.prev_cookie = 0; 553 my_entry.cookie = my_entry.prev_cookie = 0;
556 my_entry.eof = 0; 554 my_entry.eof = 0;
557 my_entry.fh = &fh; 555 my_entry.fh = nfs_alloc_fhandle();
558 my_entry.fattr = &fattr; 556 my_entry.fattr = nfs_alloc_fattr();
559 nfs_fattr_init(&fattr); 557 if (my_entry.fh == NULL || my_entry.fattr == NULL)
558 goto out_alloc_failed;
559
560 desc->entry = &my_entry; 560 desc->entry = &my_entry;
561 561
562 nfs_block_sillyrename(dentry); 562 nfs_block_sillyrename(dentry);
@@ -598,7 +598,10 @@ out:
598 nfs_unblock_sillyrename(dentry); 598 nfs_unblock_sillyrename(dentry);
599 if (res > 0) 599 if (res > 0)
600 res = 0; 600 res = 0;
601 dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", 601out_alloc_failed:
602 nfs_free_fattr(my_entry.fattr);
603 nfs_free_fhandle(my_entry.fh);
604 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
602 dentry->d_parent->d_name.name, dentry->d_name.name, 605 dentry->d_parent->d_name.name, dentry->d_name.name,
603 res); 606 res);
604 return res; 607 return res;
@@ -638,8 +641,10 @@ out:
638 * All directory operations under NFS are synchronous, so fsync() 641 * All directory operations under NFS are synchronous, so fsync()
639 * is a dummy operation. 642 * is a dummy operation.
640 */ 643 */
641static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) 644static int nfs_fsync_dir(struct file *filp, int datasync)
642{ 645{
646 struct dentry *dentry = filp->f_path.dentry;
647
643 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", 648 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
644 dentry->d_parent->d_name.name, dentry->d_name.name, 649 dentry->d_parent->d_name.name, dentry->d_name.name,
645 datasync); 650 datasync);
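
This hunk tracks the VFS prototype change in this series: ->fsync() no longer receives a dentry argument, so implementations recover it from the struct file they already hold. A compile-and-run sketch of the new calling shape, using stand-in types (names hypothetical):

#include <stdio.h>

struct dentry { const char *name; };
struct path   { struct dentry *dentry; };
struct file   { struct path f_path; };

static int fsync_dir(struct file *filp, int datasync)
{
    /* The dentry is derived from the file, as nfs_fsync_dir() now does. */
    struct dentry *dentry = filp->f_path.dentry;

    printf("fsync %s datasync=%d\n", dentry->name, datasync);
    return 0;   /* directory ops are synchronous: nothing to flush */
}

int main(void)
{
    struct dentry d = { "docs" };
    struct file f = { { &d } };
    return fsync_dir(&f, 0);
}
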
@@ -776,9 +781,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
776 struct inode *dir; 781 struct inode *dir;
777 struct inode *inode; 782 struct inode *inode;
778 struct dentry *parent; 783 struct dentry *parent;
784 struct nfs_fh *fhandle = NULL;
785 struct nfs_fattr *fattr = NULL;
779 int error; 786 int error;
780 struct nfs_fh fhandle;
781 struct nfs_fattr fattr;
782 787
783 parent = dget_parent(dentry); 788 parent = dget_parent(dentry);
784 dir = parent->d_inode; 789 dir = parent->d_inode;
@@ -811,14 +816,22 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
811 if (NFS_STALE(inode)) 816 if (NFS_STALE(inode))
812 goto out_bad; 817 goto out_bad;
813 818
814 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 819 error = -ENOMEM;
820 fhandle = nfs_alloc_fhandle();
821 fattr = nfs_alloc_fattr();
822 if (fhandle == NULL || fattr == NULL)
823 goto out_error;
824
825 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
815 if (error) 826 if (error)
816 goto out_bad; 827 goto out_bad;
817 if (nfs_compare_fh(NFS_FH(inode), &fhandle)) 828 if (nfs_compare_fh(NFS_FH(inode), fhandle))
818 goto out_bad; 829 goto out_bad;
819 if ((error = nfs_refresh_inode(inode, &fattr)) != 0) 830 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
820 goto out_bad; 831 goto out_bad;
821 832
833 nfs_free_fattr(fattr);
834 nfs_free_fhandle(fhandle);
822out_set_verifier: 835out_set_verifier:
823 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 836 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
824 out_valid: 837 out_valid:
@@ -837,14 +850,26 @@ out_zap_parent:
837 /* If we have submounts, don't unhash ! */ 850 /* If we have submounts, don't unhash ! */
838 if (have_submounts(dentry)) 851 if (have_submounts(dentry))
839 goto out_valid; 852 goto out_valid;
853 if (dentry->d_flags & DCACHE_DISCONNECTED)
854 goto out_valid;
840 shrink_dcache_parent(dentry); 855 shrink_dcache_parent(dentry);
841 } 856 }
842 d_drop(dentry); 857 d_drop(dentry);
858 nfs_free_fattr(fattr);
859 nfs_free_fhandle(fhandle);
843 dput(parent); 860 dput(parent);
844 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 861 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
845 __func__, dentry->d_parent->d_name.name, 862 __func__, dentry->d_parent->d_name.name,
846 dentry->d_name.name); 863 dentry->d_name.name);
847 return 0; 864 return 0;
865out_error:
866 nfs_free_fattr(fattr);
867 nfs_free_fhandle(fhandle);
868 dput(parent);
869 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
870 __func__, dentry->d_parent->d_name.name,
871 dentry->d_name.name, error);
872 return error;
848} 873}
849 874
850/* 875/*
@@ -909,9 +934,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
909 struct dentry *res; 934 struct dentry *res;
910 struct dentry *parent; 935 struct dentry *parent;
911 struct inode *inode = NULL; 936 struct inode *inode = NULL;
937 struct nfs_fh *fhandle = NULL;
938 struct nfs_fattr *fattr = NULL;
912 int error; 939 int error;
913 struct nfs_fh fhandle;
914 struct nfs_fattr fattr;
915 940
916 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 941 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
917 dentry->d_parent->d_name.name, dentry->d_name.name); 942 dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -921,7 +946,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
921 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 946 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
922 goto out; 947 goto out;
923 948
924 res = ERR_PTR(-ENOMEM);
925 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 949 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
926 950
927 /* 951 /*
@@ -934,17 +958,23 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
934 goto out; 958 goto out;
935 } 959 }
936 960
961 res = ERR_PTR(-ENOMEM);
962 fhandle = nfs_alloc_fhandle();
963 fattr = nfs_alloc_fattr();
964 if (fhandle == NULL || fattr == NULL)
965 goto out;
966
937 parent = dentry->d_parent; 967 parent = dentry->d_parent;
938 /* Protect against concurrent sillydeletes */ 968 /* Protect against concurrent sillydeletes */
939 nfs_block_sillyrename(parent); 969 nfs_block_sillyrename(parent);
940 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 970 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
941 if (error == -ENOENT) 971 if (error == -ENOENT)
942 goto no_entry; 972 goto no_entry;
943 if (error < 0) { 973 if (error < 0) {
944 res = ERR_PTR(error); 974 res = ERR_PTR(error);
945 goto out_unblock_sillyrename; 975 goto out_unblock_sillyrename;
946 } 976 }
947 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); 977 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
948 res = (struct dentry *)inode; 978 res = (struct dentry *)inode;
949 if (IS_ERR(res)) 979 if (IS_ERR(res))
950 goto out_unblock_sillyrename; 980 goto out_unblock_sillyrename;
@@ -960,6 +990,8 @@ no_entry:
960out_unblock_sillyrename: 990out_unblock_sillyrename:
961 nfs_unblock_sillyrename(parent); 991 nfs_unblock_sillyrename(parent);
962out: 992out:
993 nfs_free_fattr(fattr);
994 nfs_free_fhandle(fhandle);
963 return res; 995 return res;
964} 996}
965 997
@@ -1025,12 +1057,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1025 res = NULL; 1057 res = NULL;
1026 goto out; 1058 goto out;
1027 /* This turned out not to be a regular file */ 1059 /* This turned out not to be a regular file */
1060 case -EISDIR:
1028 case -ENOTDIR: 1061 case -ENOTDIR:
1029 goto no_open; 1062 goto no_open;
1030 case -ELOOP: 1063 case -ELOOP:
1031 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1064 if (!(nd->intent.open.flags & O_NOFOLLOW))
1032 goto no_open; 1065 goto no_open;
1033 /* case -EISDIR: */
1034 /* case -EINVAL: */ 1066 /* case -EINVAL: */
1035 default: 1067 default:
1036 goto out; 1068 goto out;
@@ -1050,7 +1082,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1050 struct inode *dir; 1082 struct inode *dir;
1051 int openflags, ret = 0; 1083 int openflags, ret = 0;
1052 1084
1053 if (!is_atomic_open(nd)) 1085 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1054 goto no_open; 1086 goto no_open;
1055 parent = dget_parent(dentry); 1087 parent = dget_parent(dentry);
1056 dir = parent->d_inode; 1088 dir = parent->d_inode;
@@ -1667,28 +1699,33 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry)
1667 smp_mb__after_atomic_dec(); 1699 smp_mb__after_atomic_dec();
1668} 1700}
1669 1701
1702static void nfs_access_free_list(struct list_head *head)
1703{
1704 struct nfs_access_entry *cache;
1705
1706 while (!list_empty(head)) {
1707 cache = list_entry(head->next, struct nfs_access_entry, lru);
1708 list_del(&cache->lru);
1709 nfs_access_free_entry(cache);
1710 }
1711}
1712
1670int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) 1713int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
1671{ 1714{
1672 LIST_HEAD(head); 1715 LIST_HEAD(head);
1673 struct nfs_inode *nfsi; 1716 struct nfs_inode *nfsi;
1674 struct nfs_access_entry *cache; 1717 struct nfs_access_entry *cache;
1675 1718
1676restart: 1719 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1720 return (nr_to_scan == 0) ? 0 : -1;
1721
1677 spin_lock(&nfs_access_lru_lock); 1722 spin_lock(&nfs_access_lru_lock);
1678 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1723 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
1679 struct rw_semaphore *s_umount;
1680 struct inode *inode; 1724 struct inode *inode;
1681 1725
1682 if (nr_to_scan-- == 0) 1726 if (nr_to_scan-- == 0)
1683 break; 1727 break;
1684 s_umount = &nfsi->vfs_inode.i_sb->s_umount; 1728 inode = &nfsi->vfs_inode;
1685 if (!down_read_trylock(s_umount))
1686 continue;
1687 inode = igrab(&nfsi->vfs_inode);
1688 if (inode == NULL) {
1689 up_read(s_umount);
1690 continue;
1691 }
1692 spin_lock(&inode->i_lock); 1729 spin_lock(&inode->i_lock);
1693 if (list_empty(&nfsi->access_cache_entry_lru)) 1730 if (list_empty(&nfsi->access_cache_entry_lru))
1694 goto remove_lru_entry; 1731 goto remove_lru_entry;
@@ -1702,61 +1739,48 @@ restart:
1702 else { 1739 else {
1703remove_lru_entry: 1740remove_lru_entry:
1704 list_del_init(&nfsi->access_cache_inode_lru); 1741 list_del_init(&nfsi->access_cache_inode_lru);
1742 smp_mb__before_clear_bit();
1705 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1743 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1744 smp_mb__after_clear_bit();
1706 } 1745 }
1707 spin_unlock(&inode->i_lock); 1746 spin_unlock(&inode->i_lock);
1708 spin_unlock(&nfs_access_lru_lock);
1709 iput(inode);
1710 up_read(s_umount);
1711 goto restart;
1712 } 1747 }
1713 spin_unlock(&nfs_access_lru_lock); 1748 spin_unlock(&nfs_access_lru_lock);
1714 while (!list_empty(&head)) { 1749 nfs_access_free_list(&head);
1715 cache = list_entry(head.next, struct nfs_access_entry, lru);
1716 list_del(&cache->lru);
1717 nfs_access_free_entry(cache);
1718 }
1719 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; 1750 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
1720} 1751}
1721 1752
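
The rewritten shrinker entry point bails out early unless the caller's allocation mask is a superset of GFP_KERNEL, i.e. unless the reclaim context may block, do I/O and re-enter the filesystem; the same test guards nfs_release_page() further down in this patch. A userspace sketch of the mask check (the flag values are illustrative, not the kernel's):

#include <stdio.h>

#define __GFP_WAIT 0x1u
#define __GFP_IO   0x2u
#define __GFP_FS   0x4u
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)

/* Freeing access-cache entries can re-enter the FS, so it is only
 * safe from a context that allows blocking, I/O and FS recursion.
 * nr_to_scan == 0 means "just report a count", so answer 0 rather
 * than the -1 that tells the VM "cannot shrink from here". */
static int shrinker(int nr_to_scan, unsigned int gfp_mask)
{
    if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
        return (nr_to_scan == 0) ? 0 : -1;
    /* ...scan the LRU, unhook entries, free them... */
    return 0;
}

int main(void)
{
    printf("%d\n", shrinker(128, __GFP_WAIT));  /* -1: unsafe context */
    printf("%d\n", shrinker(128, GFP_KERNEL));  /*  0: scanned */
    return 0;
}
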
1722static void __nfs_access_zap_cache(struct inode *inode) 1753static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
1723{ 1754{
1724 struct nfs_inode *nfsi = NFS_I(inode);
1725 struct rb_root *root_node = &nfsi->access_cache; 1755 struct rb_root *root_node = &nfsi->access_cache;
1726 struct rb_node *n, *dispose = NULL; 1756 struct rb_node *n;
1727 struct nfs_access_entry *entry; 1757 struct nfs_access_entry *entry;
1728 1758
1729 /* Unhook entries from the cache */ 1759 /* Unhook entries from the cache */
1730 while ((n = rb_first(root_node)) != NULL) { 1760 while ((n = rb_first(root_node)) != NULL) {
1731 entry = rb_entry(n, struct nfs_access_entry, rb_node); 1761 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1732 rb_erase(n, root_node); 1762 rb_erase(n, root_node);
1733 list_del(&entry->lru); 1763 list_move(&entry->lru, head);
1734 n->rb_left = dispose;
1735 dispose = n;
1736 } 1764 }
1737 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; 1765 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
1738 spin_unlock(&inode->i_lock);
1739
1740 /* Now kill them all! */
1741 while (dispose != NULL) {
1742 n = dispose;
1743 dispose = n->rb_left;
1744 nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node));
1745 }
1746} 1766}
1747 1767
1748void nfs_access_zap_cache(struct inode *inode) 1768void nfs_access_zap_cache(struct inode *inode)
1749{ 1769{
1770 LIST_HEAD(head);
1771
1772 if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
1773 return;
1750 /* Remove from global LRU init */ 1774 /* Remove from global LRU init */
1751 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1775 spin_lock(&nfs_access_lru_lock);
1752 spin_lock(&nfs_access_lru_lock); 1776 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1753 list_del_init(&NFS_I(inode)->access_cache_inode_lru); 1777 list_del_init(&NFS_I(inode)->access_cache_inode_lru);
1754 spin_unlock(&nfs_access_lru_lock);
1755 }
1756 1778
1757 spin_lock(&inode->i_lock); 1779 spin_lock(&inode->i_lock);
1758 /* This will release the spinlock */ 1780 __nfs_access_zap_cache(NFS_I(inode), &head);
1759 __nfs_access_zap_cache(inode); 1781 spin_unlock(&inode->i_lock);
1782 spin_unlock(&nfs_access_lru_lock);
1783 nfs_access_free_list(&head);
1760} 1784}
1761 1785
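
__nfs_access_zap_cache() no longer frees entries itself, and no longer drops the caller's spinlock behind its back: it moves everything onto a caller-supplied list, and nfs_access_free_list() disposes of the entries once every lock has been released. A pthreads sketch of that unhook-under-lock, free-outside-lock idiom:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry { struct entry *next; };

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *cache;             /* protected by cache_lock */

static void zap_cache(void)
{
    struct entry *head;

    /* Unhook the whole list while the lock is held... */
    pthread_mutex_lock(&cache_lock);
    head = cache;
    cache = NULL;
    pthread_mutex_unlock(&cache_lock);

    /* ...then free at leisure: freeing may be slow, and in the
     * kernel version doing it under a spinlock also invited
     * lock-ordering trouble with the global LRU lock. */
    while (head != NULL) {
        struct entry *e = head;
        head = e->next;
        free(e);
    }
}

int main(void)
{
    for (int i = 0; i < 3; i++) {
        struct entry *e = malloc(sizeof(*e));
        if (e == NULL)
            break;
        e->next = cache;
        cache = e;
    }
    zap_cache();
    printf("cache emptied\n");
    return 0;
}
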
1762static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) 1786static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
@@ -1807,8 +1831,8 @@ out_stale:
1807 nfs_access_free_entry(cache); 1831 nfs_access_free_entry(cache);
1808 return -ENOENT; 1832 return -ENOENT;
1809out_zap: 1833out_zap:
1810 /* This will release the spinlock */ 1834 spin_unlock(&inode->i_lock);
1811 __nfs_access_zap_cache(inode); 1835 nfs_access_zap_cache(inode);
1812 return -ENOENT; 1836 return -ENOENT;
1813} 1837}
1814 1838
@@ -1863,9 +1887,11 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
1863 smp_mb__after_atomic_inc(); 1887 smp_mb__after_atomic_inc();
1864 1888
1865 /* Add inode to global LRU list */ 1889 /* Add inode to global LRU list */
1866 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1890 if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
1867 spin_lock(&nfs_access_lru_lock); 1891 spin_lock(&nfs_access_lru_lock);
1868 list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); 1892 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1893 list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
1894 &nfs_access_lru_list);
1869 spin_unlock(&nfs_access_lru_lock); 1895 spin_unlock(&nfs_access_lru_lock);
1870 } 1896 }
1871} 1897}
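
The LRU insertion above now tests the flag twice: a lock-free test_bit() keeps the common already-listed case off the global spinlock, and the test_and_set_bit() under the lock closes the race window between two first-time callers. A userspace sketch of the same double-checked pattern, using C11 atomics as stand-ins for the kernel bit ops:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_lru;               /* NFS_INO_ACL_LRU_SET stand-in */
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
static int lru_entries;                 /* protected by lru_lock */

static void add_to_lru(void)
{
    /* Fast path: already listed, skip the global lock entirely. */
    if (atomic_load(&on_lru))
        return;
    pthread_mutex_lock(&lru_lock);
    /* Re-test under the lock: only the caller that flips the flag
     * 0 -> 1 performs the insertion, so two racing first-time
     * callers cannot double-add the inode. */
    if (!atomic_exchange(&on_lru, 1))
        lru_entries++;                  /* list_add_tail() stand-in */
    pthread_mutex_unlock(&lru_lock);
}

int main(void)
{
    add_to_lru();
    add_to_lru();
    printf("entries: %d\n", lru_entries);   /* 1, not 2 */
    return 0;
}
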
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0d289823e856..ad4cd31d6050 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -44,6 +44,7 @@
44#include <linux/file.h> 44#include <linux/file.h>
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h>
47 48
48#include <linux/nfs_fs.h> 49#include <linux/nfs_fs.h>
49#include <linux/nfs_page.h> 50#include <linux/nfs_page.h>
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 3f0cd4dfddaf..76fd235d0024 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -9,6 +9,7 @@
9#include <linux/hash.h> 9#include <linux/hash.h>
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/kmod.h> 11#include <linux/kmod.h>
12#include <linux/slab.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/socket.h> 14#include <linux/socket.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ae8d02294e46..36a5e74f51b4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -24,9 +24,9 @@
24#include <linux/nfs_fs.h> 24#include <linux/nfs_fs.h>
25#include <linux/nfs_mount.h> 25#include <linux/nfs_mount.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
29#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/gfp.h>
30 30
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32#include <asm/system.h> 32#include <asm/system.h>
@@ -53,7 +53,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
54 unsigned long nr_segs, loff_t pos); 54 unsigned long nr_segs, loff_t pos);
55static int nfs_file_flush(struct file *, fl_owner_t id); 55static int nfs_file_flush(struct file *, fl_owner_t id);
56static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); 56static int nfs_file_fsync(struct file *, int datasync);
57static int nfs_check_flags(int flags); 57static int nfs_check_flags(int flags);
58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -161,14 +161,17 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
161 struct nfs_server *server = NFS_SERVER(inode); 161 struct nfs_server *server = NFS_SERVER(inode);
162 struct nfs_inode *nfsi = NFS_I(inode); 162 struct nfs_inode *nfsi = NFS_I(inode);
163 163
164 if (server->flags & NFS_MOUNT_NOAC) 164 if (nfs_have_delegated_attributes(inode))
165 goto force_reval; 165 goto out_noreval;
166
166 if (filp->f_flags & O_DIRECT) 167 if (filp->f_flags & O_DIRECT)
167 goto force_reval; 168 goto force_reval;
168 if (nfsi->npages != 0) 169 if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
169 return 0; 170 goto force_reval;
170 if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) 171 if (nfs_attribute_timeout(inode))
171 return 0; 172 goto force_reval;
173out_noreval:
174 return 0;
172force_reval: 175force_reval:
173 return __nfs_revalidate_inode(server, inode); 176 return __nfs_revalidate_inode(server, inode);
174} 177}
@@ -319,8 +322,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
319 * whether any write errors occurred for this process. 322 * whether any write errors occurred for this process.
320 */ 323 */
321static int 324static int
322nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 325nfs_file_fsync(struct file *file, int datasync)
323{ 326{
327 struct dentry *dentry = file->f_path.dentry;
324 struct nfs_open_context *ctx = nfs_file_open_context(file); 328 struct nfs_open_context *ctx = nfs_file_open_context(file);
325 struct inode *inode = dentry->d_inode; 329 struct inode *inode = dentry->d_inode;
326 330
@@ -491,7 +495,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
491{ 495{
492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 496 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
493 497
494 if (gfp & __GFP_WAIT) 498 /* Only do I/O if gfp is a superset of GFP_KERNEL */
499 if ((gfp & GFP_KERNEL) == GFP_KERNEL)
495 nfs_wb_page(page->mapping->host, page); 500 nfs_wb_page(page->mapping->host, page);
496 /* If PagePrivate() is set, then the page is not freeable */ 501 /* If PagePrivate() is set, then the page is not freeable */
497 if (PagePrivate(page)) 502 if (PagePrivate(page))
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 237874f1af23..ce153a6b3aec 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -17,6 +17,7 @@
17#include <linux/nfs_fs_sb.h> 17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h> 18#include <linux/in6.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/slab.h>
20 21
21#include "internal.h" 22#include "internal.h"
22#include "iostat.h" 23#include "iostat.h"
@@ -466,7 +467,8 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
466 struct list_head *pages, 467 struct list_head *pages,
467 unsigned *nr_pages) 468 unsigned *nr_pages)
468{ 469{
469 int ret, npages = *nr_pages; 470 unsigned npages = *nr_pages;
471 int ret;
470 472
471 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", 473 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
472 NFS_I(inode)->fscache, npages, inode); 474 NFS_I(inode)->fscache, npages, inode);
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b35d2a616066..7428f7d6273b 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
78{ 78{
79 struct nfs_server *server = NFS_SB(sb); 79 struct nfs_server *server = NFS_SB(sb);
80 struct nfs_fsinfo fsinfo; 80 struct nfs_fsinfo fsinfo;
81 struct nfs_fattr fattr; 81 struct dentry *ret;
82 struct dentry *mntroot;
83 struct inode *inode; 82 struct inode *inode;
84 int error; 83 int error;
85 84
86 /* get the actual root for this mount */ 85 /* get the actual root for this mount */
87 fsinfo.fattr = &fattr; 86 fsinfo.fattr = nfs_alloc_fattr();
87 if (fsinfo.fattr == NULL)
88 return ERR_PTR(-ENOMEM);
88 89
89 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 90 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
90 if (error < 0) { 91 if (error < 0) {
91 dprintk("nfs_get_root: getattr error = %d\n", -error); 92 dprintk("nfs_get_root: getattr error = %d\n", -error);
92 return ERR_PTR(error); 93 ret = ERR_PTR(error);
94 goto out;
93 } 95 }
94 96
95 inode = nfs_fhget(sb, mntfh, fsinfo.fattr); 97 inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
96 if (IS_ERR(inode)) { 98 if (IS_ERR(inode)) {
97 dprintk("nfs_get_root: get root inode failed\n"); 99 dprintk("nfs_get_root: get root inode failed\n");
98 return ERR_CAST(inode); 100 ret = ERR_CAST(inode);
101 goto out;
99 } 102 }
100 103
101 error = nfs_superblock_set_dummy_root(sb, inode); 104 error = nfs_superblock_set_dummy_root(sb, inode);
102 if (error != 0) 105 if (error != 0) {
103 return ERR_PTR(error); 106 ret = ERR_PTR(error);
107 goto out;
108 }
104 109
105 /* root dentries normally start off anonymous and get spliced in later 110 /* root dentries normally start off anonymous and get spliced in later
106 * if the dentry tree reaches them; however if the dentry already 111 * if the dentry tree reaches them; however if the dentry already
107 * exists, we'll pick it up at this point and use it as the root 112 * exists, we'll pick it up at this point and use it as the root
108 */ 113 */
109 mntroot = d_obtain_alias(inode); 114 ret = d_obtain_alias(inode);
110 if (IS_ERR(mntroot)) { 115 if (IS_ERR(ret)) {
111 dprintk("nfs_get_root: get root dentry failed\n"); 116 dprintk("nfs_get_root: get root dentry failed\n");
112 return mntroot; 117 goto out;
113 } 118 }
114 119
115 security_d_instantiate(mntroot, inode); 120 security_d_instantiate(ret, inode);
116
117 if (!mntroot->d_op)
118 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
119 121
120 return mntroot; 122 if (ret->d_op == NULL)
123 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
124out:
125 nfs_free_fattr(fsinfo.fattr);
126 return ret;
121} 127}
122 128
123#ifdef CONFIG_NFS_V4 129#ifdef CONFIG_NFS_V4
124 130
125/* 131int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
126 * Do a simple pathwalk from the root FH of the server to the nominated target
127 * of the mountpoint
128 * - give error on symlinks
129 * - give error on ".." occurring in the path
130 * - follow traversals
131 */
132int nfs4_path_walk(struct nfs_server *server,
133 struct nfs_fh *mntfh,
134 const char *path)
135{ 132{
136 struct nfs_fsinfo fsinfo; 133 struct nfs_fsinfo fsinfo;
137 struct nfs_fattr fattr; 134 int ret = -ENOMEM;
138 struct nfs_fh lastfh;
139 struct qstr name;
140 int ret;
141 135
142 dprintk("--> nfs4_path_walk(,,%s)\n", path); 136 dprintk("--> nfs4_get_rootfh()\n");
143 137
144 fsinfo.fattr = &fattr; 138 fsinfo.fattr = nfs_alloc_fattr();
145 nfs_fattr_init(&fattr); 139 if (fsinfo.fattr == NULL)
146 140 goto out;
147 /* Eat leading slashes */
148 while (*path == '/')
149 path++;
150 141
151 /* Start by getting the root filehandle from the server */ 142 /* Start by getting the root filehandle from the server */
152 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 143 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
153 if (ret < 0) { 144 if (ret < 0) {
154 dprintk("nfs4_get_root: getroot error = %d\n", -ret); 145 dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
155 return ret; 146 goto out;
156 } 147 }
157 148
158 if (!S_ISDIR(fattr.mode)) { 149 if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE)
159 printk(KERN_ERR "nfs4_get_root:" 150 || !S_ISDIR(fsinfo.fattr->mode)) {
151 printk(KERN_ERR "nfs4_get_rootfh:"
160 " getroot encountered non-directory\n"); 152 " getroot encountered non-directory\n");
161 return -ENOTDIR; 153 ret = -ENOTDIR;
154 goto out;
162 } 155 }
163 156
164 /* FIXME: It is quite valid for the server to return a referral here */ 157 if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
165 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { 158 printk(KERN_ERR "nfs4_get_rootfh:"
166 printk(KERN_ERR "nfs4_get_root:"
167 " getroot obtained referral\n"); 159 " getroot obtained referral\n");
168 return -EREMOTE; 160 ret = -EREMOTE;
169 } 161 goto out;
170
171next_component:
172 dprintk("Next: %s\n", path);
173
174 /* extract the next bit of the path */
175 if (!*path)
176 goto path_walk_complete;
177
178 name.name = path;
179 while (*path && *path != '/')
180 path++;
181 name.len = path - (const char *) name.name;
182
183 if (name.len > NFS4_MAXNAMLEN)
184 return -ENAMETOOLONG;
185
186eat_dot_dir:
187 while (*path == '/')
188 path++;
189
190 if (path[0] == '.' && (path[1] == '/' || !path[1])) {
191 path += 2;
192 goto eat_dot_dir;
193 }
194
195 /* FIXME: Why shouldn't the user be able to use ".." in the path? */
196 if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2])
197 ) {
198 printk(KERN_ERR "nfs4_get_root:"
199 " Mount path contains reference to \"..\"\n");
200 return -EINVAL;
201 } 162 }
202 163
203 /* lookup the next FH in the sequence */ 164 memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
204 memcpy(&lastfh, mntfh, sizeof(lastfh)); 165out:
205 166 nfs_free_fattr(fsinfo.fattr);
206 dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); 167 dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
207 168 return ret;
208 ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name,
209 mntfh, &fattr);
210 if (ret < 0) {
211 dprintk("nfs4_get_root: getroot error = %d\n", -ret);
212 return ret;
213 }
214
215 if (!S_ISDIR(fattr.mode)) {
216 printk(KERN_ERR "nfs4_get_root:"
217 " lookupfh encountered non-directory\n");
218 return -ENOTDIR;
219 }
220
221 /* FIXME: Referrals are quite valid here too */
222 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
223 printk(KERN_ERR "nfs4_get_root:"
224 " lookupfh obtained referral\n");
225 return -EREMOTE;
226 }
227
228 goto next_component;
229
230path_walk_complete:
231 memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
232 dprintk("<-- nfs4_path_walk() = 0\n");
233 return 0;
234} 169}
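
nfs4_path_walk() is gone: the walk from the server root to the mount target leaves the client's hand-rolled loop (elsewhere in this series it becomes a regular VFS path lookup), and the remaining nfs4_get_rootfh() only fetches and sanity-checks the root filehandle. Note the new guard it gains: an NFSv4 server need not return every attribute, so fattr->mode is tested only after its validity bit is confirmed. A sketch of that validity-bit pattern (flag values illustrative, not the kernel's):

#include <stdio.h>

#define FATTR_MODE        0x1u
#define FATTR_V4_REFERRAL 0x2u
#define S_IFDIR           0x4000u

struct fattr { unsigned int valid; unsigned int mode; };

static int check_rootfh(const struct fattr *f)
{
    /* mode is garbage unless the server actually returned it */
    if (!(f->valid & FATTR_MODE) || !(f->mode & S_IFDIR))
        return -20;                     /* -ENOTDIR */
    if (f->valid & FATTR_V4_REFERRAL)
        return -66;                     /* -EREMOTE */
    return 0;
}

int main(void)
{
    struct fattr dir  = { .valid = FATTR_MODE, .mode = S_IFDIR };
    struct fattr bare = { .valid = 0,          .mode = S_IFDIR };
    printf("%d %d\n", check_rootfh(&dir), check_rootfh(&bare));
    return 0;
}
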
235 170
236/* 171/*
@@ -239,8 +174,8 @@ path_walk_complete:
239struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) 174struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
240{ 175{
241 struct nfs_server *server = NFS_SB(sb); 176 struct nfs_server *server = NFS_SB(sb);
242 struct nfs_fattr fattr; 177 struct nfs_fattr *fattr = NULL;
243 struct dentry *mntroot; 178 struct dentry *ret;
244 struct inode *inode; 179 struct inode *inode;
245 int error; 180 int error;
246 181
@@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
254 return ERR_PTR(error); 189 return ERR_PTR(error);
255 } 190 }
256 191
192 fattr = nfs_alloc_fattr();
193 if (fattr == NULL)
 194 return ERR_PTR(-ENOMEM);
195
257 /* get the actual root for this mount */ 196 /* get the actual root for this mount */
258 error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 197 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
259 if (error < 0) { 198 if (error < 0) {
260 dprintk("nfs_get_root: getattr error = %d\n", -error); 199 dprintk("nfs_get_root: getattr error = %d\n", -error);
261 return ERR_PTR(error); 200 ret = ERR_PTR(error);
201 goto out;
262 } 202 }
263 203
264 inode = nfs_fhget(sb, mntfh, &fattr); 204 inode = nfs_fhget(sb, mntfh, fattr);
265 if (IS_ERR(inode)) { 205 if (IS_ERR(inode)) {
266 dprintk("nfs_get_root: get root inode failed\n"); 206 dprintk("nfs_get_root: get root inode failed\n");
267 return ERR_CAST(inode); 207 ret = ERR_CAST(inode);
208 goto out;
268 } 209 }
269 210
270 error = nfs_superblock_set_dummy_root(sb, inode); 211 error = nfs_superblock_set_dummy_root(sb, inode);
271 if (error != 0) 212 if (error != 0) {
272 return ERR_PTR(error); 213 ret = ERR_PTR(error);
214 goto out;
215 }
273 216
274 /* root dentries normally start off anonymous and get spliced in later 217 /* root dentries normally start off anonymous and get spliced in later
275 * if the dentry tree reaches them; however if the dentry already 218 * if the dentry tree reaches them; however if the dentry already
276 * exists, we'll pick it up at this point and use it as the root 219 * exists, we'll pick it up at this point and use it as the root
277 */ 220 */
278 mntroot = d_obtain_alias(inode); 221 ret = d_obtain_alias(inode);
279 if (IS_ERR(mntroot)) { 222 if (IS_ERR(ret)) {
280 dprintk("nfs_get_root: get root dentry failed\n"); 223 dprintk("nfs_get_root: get root dentry failed\n");
281 return mntroot; 224 goto out;
282 } 225 }
283 226
284 security_d_instantiate(mntroot, inode); 227 security_d_instantiate(ret, inode);
285 228
286 if (!mntroot->d_op) 229 if (ret->d_op == NULL)
287 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; 230 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
288 231
232out:
233 nfs_free_fattr(fattr);
289 dprintk("<-- nfs4_get_root()\n"); 234 dprintk("<-- nfs4_get_root()\n");
290 return mntroot; 235 return ret;
291} 236}
292 237
293#endif /* CONFIG_NFS_V4 */ 238#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e358df75a6ad..099b3518feea 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h>
39 40
40#include <asm/system.h> 41#include <asm/system.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -392,8 +393,8 @@ int
392nfs_setattr(struct dentry *dentry, struct iattr *attr) 393nfs_setattr(struct dentry *dentry, struct iattr *attr)
393{ 394{
394 struct inode *inode = dentry->d_inode; 395 struct inode *inode = dentry->d_inode;
395 struct nfs_fattr fattr; 396 struct nfs_fattr *fattr;
396 int error; 397 int error = -ENOMEM;
397 398
398 nfs_inc_stats(inode, NFSIOS_VFSSETATTR); 399 nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
399 400
@@ -416,14 +417,20 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
416 filemap_write_and_wait(inode->i_mapping); 417 filemap_write_and_wait(inode->i_mapping);
417 nfs_wb_all(inode); 418 nfs_wb_all(inode);
418 } 419 }
420
421 fattr = nfs_alloc_fattr();
422 if (fattr == NULL)
423 goto out;
419 /* 424 /*
420 * Return any delegations if we're going to change ACLs 425 * Return any delegations if we're going to change ACLs
421 */ 426 */
422 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) 427 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
423 nfs_inode_return_delegation(inode); 428 nfs_inode_return_delegation(inode);
424 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); 429 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
425 if (error == 0) 430 if (error == 0)
426 nfs_refresh_inode(inode, &fattr); 431 nfs_refresh_inode(inode, fattr);
432 nfs_free_fattr(fattr);
433out:
427 return error; 434 return error;
428} 435}
429 436
@@ -622,10 +629,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
622 list_for_each_entry(pos, &nfsi->open_files, list) { 629 list_for_each_entry(pos, &nfsi->open_files, list) {
623 if (cred != NULL && pos->cred != cred) 630 if (cred != NULL && pos->cred != cred)
624 continue; 631 continue;
625 if ((pos->mode & mode) == mode) { 632 if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
626 ctx = get_nfs_open_context(pos); 633 continue;
627 break; 634 ctx = get_nfs_open_context(pos);
628 } 635 break;
629 } 636 }
630 spin_unlock(&inode->i_lock); 637 spin_unlock(&inode->i_lock);
631 return ctx; 638 return ctx;
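
nfs_find_open_context() previously matched any context whose mode was a superset of the request, so asking for a read-only context could hand back a read/write one; it now demands an exact match on the FMODE_READ/FMODE_WRITE bits. The difference, reduced to the two tests:

#include <stdio.h>

#define FMODE_READ  0x1
#define FMODE_WRITE 0x2

int main(void)
{
    int pos_mode = FMODE_READ | FMODE_WRITE;    /* candidate context */
    int want = FMODE_READ;                      /* requested mode */

    /* Old test: subset match -- a read/write context satisfies a
     * read-only request. */
    printf("subset: %d\n", (pos_mode & want) == want);

    /* New test: exact match on the read/write bits only. */
    printf("exact:  %d\n",
           (pos_mode & (FMODE_READ | FMODE_WRITE)) == want);
    return 0;
}
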
@@ -681,7 +688,7 @@ int
681__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 688__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
682{ 689{
683 int status = -ESTALE; 690 int status = -ESTALE;
684 struct nfs_fattr fattr; 691 struct nfs_fattr *fattr = NULL;
685 struct nfs_inode *nfsi = NFS_I(inode); 692 struct nfs_inode *nfsi = NFS_I(inode);
686 693
687 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", 694 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
@@ -692,8 +699,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
692 if (NFS_STALE(inode)) 699 if (NFS_STALE(inode))
693 goto out; 700 goto out;
694 701
702 status = -ENOMEM;
703 fattr = nfs_alloc_fattr();
704 if (fattr == NULL)
705 goto out;
706
695 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 707 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
696 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 708 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
697 if (status != 0) { 709 if (status != 0) {
698 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 710 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
699 inode->i_sb->s_id, 711 inode->i_sb->s_id,
@@ -706,7 +718,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
706 goto out; 718 goto out;
707 } 719 }
708 720
709 status = nfs_refresh_inode(inode, &fattr); 721 status = nfs_refresh_inode(inode, fattr);
710 if (status) { 722 if (status) {
711 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 723 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
712 inode->i_sb->s_id, 724 inode->i_sb->s_id,
@@ -722,6 +734,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
722 (long long)NFS_FILEID(inode)); 734 (long long)NFS_FILEID(inode));
723 735
724 out: 736 out:
737 nfs_free_fattr(fattr);
725 return status; 738 return status;
726} 739}
727 740
@@ -729,9 +742,14 @@ int nfs_attribute_timeout(struct inode *inode)
729{ 742{
730 struct nfs_inode *nfsi = NFS_I(inode); 743 struct nfs_inode *nfsi = NFS_I(inode);
731 744
745 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
746}
747
748static int nfs_attribute_cache_expired(struct inode *inode)
749{
732 if (nfs_have_delegated_attributes(inode)) 750 if (nfs_have_delegated_attributes(inode))
733 return 0; 751 return 0;
734 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 752 return nfs_attribute_timeout(inode);
735} 753}
736 754
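
nfs_attribute_timeout() is split in two: the raw jiffies-range test keeps its name, while the delegation short-circuit moves into the new nfs_attribute_cache_expired(), which the revalidation paths now call instead. The range test depends on the wraparound-safe comparison that time_in_range_open() provides; a sketch of the underlying unsigned arithmetic (assuming the kernel's time_after_eq/time_before semantics):

#include <stdio.h>

/* Wraparound-safe "a in [b, c)" on an unsigned tick counter. */
static int in_range_open(unsigned long a, unsigned long b,
                         unsigned long c)
{
    return (long)(a - b) >= 0 && (long)(a - c) < 0;
}

int main(void)
{
    unsigned long jiffies = (unsigned long)-5;   /* about to wrap */
    unsigned long stamp = jiffies - 10;          /* read_cache_jiffies */
    unsigned long timeo = 60;                    /* attrtimeo */

    /* Still fresh even though stamp + timeo wraps past zero. */
    printf("fresh: %d\n", in_range_open(jiffies, stamp, stamp + timeo));
    return 0;
}
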
737/** 755/**
@@ -744,7 +762,7 @@ int nfs_attribute_timeout(struct inode *inode)
744int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 762int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
745{ 763{
746 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) 764 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
747 && !nfs_attribute_timeout(inode)) 765 && !nfs_attribute_cache_expired(inode))
748 return NFS_STALE(inode) ? -ESTALE : 0; 766 return NFS_STALE(inode) ? -ESTALE : 0;
749 return __nfs_revalidate_inode(server, inode); 767 return __nfs_revalidate_inode(server, inode);
750} 768}
@@ -781,7 +799,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
781 int ret = 0; 799 int ret = 0;
782 800
783 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) 801 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
784 || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { 802 || nfs_attribute_cache_expired(inode)
803 || NFS_STALE(inode)) {
785 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 804 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
786 if (ret < 0) 805 if (ret < 0)
787 goto out; 806 goto out;
@@ -915,6 +934,26 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
915 fattr->gencount = nfs_inc_attr_generation_counter(); 934 fattr->gencount = nfs_inc_attr_generation_counter();
916} 935}
917 936
937struct nfs_fattr *nfs_alloc_fattr(void)
938{
939 struct nfs_fattr *fattr;
940
941 fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
942 if (fattr != NULL)
943 nfs_fattr_init(fattr);
944 return fattr;
945}
946
947struct nfs_fh *nfs_alloc_fhandle(void)
948{
949 struct nfs_fh *fh;
950
951 fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
952 if (fh != NULL)
953 fh->size = 0;
954 return fh;
955}
956
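
These are the two helpers the whole patch leans on. Both allocate with GFP_NOFS, since these paths can be reached from memory reclaim or writeback and the allocation must not recurse back into the filesystem, and both hand back an object in a defined state (nfs_fattr_init(), fh->size = 0) so callers may compare or free it unconditionally. A sketch of why the constructor invariant matters for a size-prefixed comparison like nfs_compare_fh() (stand-in types, malloc() in place of kmalloc):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fh { unsigned short size; unsigned char data[128]; };

/* Constructor-style allocator: size == 0 marks an empty handle, so
 * a comparison that reads size (and then size bytes of data) never
 * touches uninitialized memory. */
static struct fh *alloc_fh(void)
{
    struct fh *fh = malloc(sizeof(*fh));
    if (fh != NULL)
        fh->size = 0;
    return fh;
}

static int compare_fh(const struct fh *a, const struct fh *b)
{
    if (a->size != b->size)
        return 1;
    return memcmp(a->data, b->data, a->size) != 0;
}

int main(void)
{
    struct fh *a = alloc_fh(), *b = alloc_fh();
    if (a == NULL || b == NULL)
        return 1;
    printf("differ: %d\n", compare_fh(a, b));   /* 0: both empty */
    free(b);
    free(a);
    return 0;
}
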
918/** 957/**
919 * nfs_inode_attrs_need_update - check if the inode attributes need updating 958 * nfs_inode_attrs_need_update - check if the inode attributes need updating
920 * @inode - pointer to inode 959 * @inode - pointer to inode
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 11f82f03c5de..d8bd619e386c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -244,9 +244,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
244#ifdef CONFIG_NFS_V4 244#ifdef CONFIG_NFS_V4
245extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); 245extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *);
246 246
247extern int nfs4_path_walk(struct nfs_server *server, 247extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
248 struct nfs_fh *mntfh,
249 const char *path);
250#endif 248#endif
251 249
252/* read.c */ 250/* read.c */
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 1d8d5c813b01..c5832487c456 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const struct inode *inode,
36 36
37static inline void nfs_add_server_stats(const struct nfs_server *server, 37static inline void nfs_add_server_stats(const struct nfs_server *server,
38 enum nfs_stat_bytecounters stat, 38 enum nfs_stat_bytecounters stat,
39 unsigned long addend) 39 long addend)
40{ 40{
41 this_cpu_add(server->io_stats->bytes[stat], addend); 41 this_cpu_add(server->io_stats->bytes[stat], addend);
42} 42}
43 43
44static inline void nfs_add_stats(const struct inode *inode, 44static inline void nfs_add_stats(const struct inode *inode,
45 enum nfs_stat_bytecounters stat, 45 enum nfs_stat_bytecounters stat,
46 unsigned long addend) 46 long addend)
47{ 47{
48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
49} 49}
@@ -51,7 +51,7 @@ static inline void nfs_add_stats(const struct inode *inode,
51#ifdef CONFIG_NFS_FSCACHE 51#ifdef CONFIG_NFS_FSCACHE
52static inline void nfs_add_fscache_stats(struct inode *inode, 52static inline void nfs_add_fscache_stats(struct inode *inode,
53 enum nfs_stat_fscachecounters stat, 53 enum nfs_stat_fscachecounters stat,
54 unsigned long addend) 54 long addend)
55{ 55{
56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); 56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
57} 57}
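
The iostat addend parameters switch from unsigned long to long. The plausible reading, an assumption rather than anything spelled out in the hunk, is that callers need to pass negative adjustments, for example to back out bytes counted optimistically before a short or failed transfer:

#include <stdio.h>

static long long bytes_stat;            /* per-cpu counter stand-in */

static void add_stat(long addend)       /* signed on purpose */
{
    bytes_stat += addend;
}

int main(void)
{
    add_stat(8192);                     /* counted when issued */
    add_stat(-4096);                    /* hypothetical: back out the
                                           half that did not complete */
    printf("%lld\n", bytes_stat);       /* 4096 */
    return 0;
}
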
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 40c766782891..db6aa3673cf3 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/dcache.h> 10#include <linux/dcache.h>
11#include <linux/gfp.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
@@ -104,8 +105,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
104 struct vfsmount *mnt; 105 struct vfsmount *mnt;
105 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 106 struct nfs_server *server = NFS_SERVER(dentry->d_inode);
106 struct dentry *parent; 107 struct dentry *parent;
107 struct nfs_fh fh; 108 struct nfs_fh *fh = NULL;
108 struct nfs_fattr fattr; 109 struct nfs_fattr *fattr = NULL;
109 int err; 110 int err;
110 111
111 dprintk("--> nfs_follow_mountpoint()\n"); 112 dprintk("--> nfs_follow_mountpoint()\n");
@@ -114,6 +115,12 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
114 if (IS_ROOT(dentry)) 115 if (IS_ROOT(dentry))
115 goto out_err; 116 goto out_err;
116 117
118 err = -ENOMEM;
119 fh = nfs_alloc_fhandle();
120 fattr = nfs_alloc_fattr();
121 if (fh == NULL || fattr == NULL)
122 goto out_err;
123
117 dprintk("%s: enter\n", __func__); 124 dprintk("%s: enter\n", __func__);
118 dput(nd->path.dentry); 125 dput(nd->path.dentry);
119 nd->path.dentry = dget(dentry); 126 nd->path.dentry = dget(dentry);
@@ -122,16 +129,16 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
122 parent = dget_parent(nd->path.dentry); 129 parent = dget_parent(nd->path.dentry);
123 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 130 err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
124 &nd->path.dentry->d_name, 131 &nd->path.dentry->d_name,
125 &fh, &fattr); 132 fh, fattr);
126 dput(parent); 133 dput(parent);
127 if (err != 0) 134 if (err != 0)
128 goto out_err; 135 goto out_err;
129 136
130 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) 137 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
131 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); 138 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
132 else 139 else
133 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh, 140 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
134 &fattr); 141 fattr);
135 err = PTR_ERR(mnt); 142 err = PTR_ERR(mnt);
136 if (IS_ERR(mnt)) 143 if (IS_ERR(mnt))
137 goto out_err; 144 goto out_err;
@@ -150,6 +157,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
150 nd->path.dentry = dget(mnt->mnt_root); 157 nd->path.dentry = dget(mnt->mnt_root);
151 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); 158 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
152out: 159out:
160 nfs_free_fattr(fattr);
161 nfs_free_fhandle(fh);
153 dprintk("%s: done, returned %d\n", __func__, err); 162 dprintk("%s: done, returned %d\n", __func__, err);
154 163
155 dprintk("<-- nfs_follow_mountpoint() = %d\n", err); 164 dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7bc2da8efd4a..81cf14257916 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -12,7 +12,6 @@
12#include <linux/param.h> 12#include <linux/param.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/errno.h> 15#include <linux/errno.h>
17#include <linux/string.h> 16#include <linux/string.h>
18#include <linux/in.h> 17#include <linux/in.h>
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bac60515a4b3..9f88c5f4c7e2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,4 +1,5 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/gfp.h>
2#include <linux/nfs.h> 3#include <linux/nfs.h>
3#include <linux/nfs3.h> 4#include <linux/nfs3.h>
4#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
@@ -184,7 +185,6 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
184struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) 185struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
185{ 186{
186 struct nfs_server *server = NFS_SERVER(inode); 187 struct nfs_server *server = NFS_SERVER(inode);
187 struct nfs_fattr fattr;
188 struct page *pages[NFSACL_MAXPAGES] = { }; 188 struct page *pages[NFSACL_MAXPAGES] = { };
189 struct nfs3_getaclargs args = { 189 struct nfs3_getaclargs args = {
190 .fh = NFS_FH(inode), 190 .fh = NFS_FH(inode),
@@ -192,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
192 .pages = pages, 192 .pages = pages,
193 }; 193 };
194 struct nfs3_getaclres res = { 194 struct nfs3_getaclres res = {
195 .fattr = &fattr, 195 0
196 }; 196 };
197 struct rpc_message msg = { 197 struct rpc_message msg = {
198 .rpc_argp = &args, 198 .rpc_argp = &args,
@@ -227,7 +227,10 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
227 227
228 dprintk("NFS call getacl\n"); 228 dprintk("NFS call getacl\n");
229 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; 229 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
230 nfs_fattr_init(&fattr); 230 res.fattr = nfs_alloc_fattr();
231 if (res.fattr == NULL)
232 return ERR_PTR(-ENOMEM);
233
231 status = rpc_call_sync(server->client_acl, &msg, 0); 234 status = rpc_call_sync(server->client_acl, &msg, 0);
232 dprintk("NFS reply getacl: %d\n", status); 235 dprintk("NFS reply getacl: %d\n", status);
233 236
@@ -237,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
237 240
238 switch (status) { 241 switch (status) {
239 case 0: 242 case 0:
240 status = nfs_refresh_inode(inode, &fattr); 243 status = nfs_refresh_inode(inode, res.fattr);
241 break; 244 break;
242 case -EPFNOSUPPORT: 245 case -EPFNOSUPPORT:
243 case -EPROTONOSUPPORT: 246 case -EPROTONOSUPPORT:
@@ -277,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
277getout: 280getout:
278 posix_acl_release(res.acl_access); 281 posix_acl_release(res.acl_access);
279 posix_acl_release(res.acl_default); 282 posix_acl_release(res.acl_default);
283 nfs_free_fattr(res.fattr);
280 284
281 if (status != 0) { 285 if (status != 0) {
282 posix_acl_release(acl); 286 posix_acl_release(acl);
@@ -289,7 +293,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
289 struct posix_acl *dfacl) 293 struct posix_acl *dfacl)
290{ 294{
291 struct nfs_server *server = NFS_SERVER(inode); 295 struct nfs_server *server = NFS_SERVER(inode);
292 struct nfs_fattr fattr; 296 struct nfs_fattr *fattr;
293 struct page *pages[NFSACL_MAXPAGES]; 297 struct page *pages[NFSACL_MAXPAGES];
294 struct nfs3_setaclargs args = { 298 struct nfs3_setaclargs args = {
295 .inode = inode, 299 .inode = inode,
@@ -334,8 +338,13 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
334 } 338 }
335 339
336 dprintk("NFS call setacl\n"); 340 dprintk("NFS call setacl\n");
341 status = -ENOMEM;
342 fattr = nfs_alloc_fattr();
343 if (fattr == NULL)
344 goto out_freepages;
345
337 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 346 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
338 nfs_fattr_init(&fattr); 347 msg.rpc_resp = fattr;
339 status = rpc_call_sync(server->client_acl, &msg, 0); 348 status = rpc_call_sync(server->client_acl, &msg, 0);
340 nfs_access_zap_cache(inode); 349 nfs_access_zap_cache(inode);
341 nfs_zap_acl_cache(inode); 350 nfs_zap_acl_cache(inode);
@@ -343,7 +352,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
343 352
344 switch (status) { 353 switch (status) {
345 case 0: 354 case 0:
346 status = nfs_refresh_inode(inode, &fattr); 355 status = nfs_refresh_inode(inode, fattr);
347 nfs3_cache_acls(inode, acl, dfacl); 356 nfs3_cache_acls(inode, acl, dfacl);
348 break; 357 break;
349 case -EPFNOSUPPORT: 358 case -EPFNOSUPPORT:
@@ -354,6 +363,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
354 case -ENOTSUPP: 363 case -ENOTSUPP:
355 status = -EOPNOTSUPP; 364 status = -EOPNOTSUPP;
356 } 365 }
366 nfs_free_fattr(fattr);
357out_freepages: 367out_freepages:
358 while (args.npages != 0) { 368 while (args.npages != 0) {
359 args.npages--; 369 args.npages--;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 24992f0a29f2..fabb4f2849a1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -10,6 +10,7 @@
10#include <linux/errno.h> 10#include <linux/errno.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/sunrpc/clnt.h> 12#include <linux/sunrpc/clnt.h>
13#include <linux/slab.h>
13#include <linux/nfs.h> 14#include <linux/nfs.h>
14#include <linux/nfs3.h> 15#include <linux/nfs3.h>
15#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
@@ -143,14 +144,12 @@ static int
143nfs3_proc_lookup(struct inode *dir, struct qstr *name, 144nfs3_proc_lookup(struct inode *dir, struct qstr *name,
144 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 145 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
145{ 146{
146 struct nfs_fattr dir_attr;
147 struct nfs3_diropargs arg = { 147 struct nfs3_diropargs arg = {
148 .fh = NFS_FH(dir), 148 .fh = NFS_FH(dir),
149 .name = name->name, 149 .name = name->name,
150 .len = name->len 150 .len = name->len
151 }; 151 };
152 struct nfs3_diropres res = { 152 struct nfs3_diropres res = {
153 .dir_attr = &dir_attr,
154 .fh = fhandle, 153 .fh = fhandle,
155 .fattr = fattr 154 .fattr = fattr
156 }; 155 };
@@ -162,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
162 int status; 161 int status;
163 162
164 dprintk("NFS call lookup %s\n", name->name); 163 dprintk("NFS call lookup %s\n", name->name);
165 nfs_fattr_init(&dir_attr); 164 res.dir_attr = nfs_alloc_fattr();
165 if (res.dir_attr == NULL)
166 return -ENOMEM;
167
166 nfs_fattr_init(fattr); 168 nfs_fattr_init(fattr);
167 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 169 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
168 nfs_refresh_inode(dir, &dir_attr); 170 nfs_refresh_inode(dir, res.dir_attr);
169 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { 171 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
170 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; 172 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
171 msg.rpc_argp = fhandle; 173 msg.rpc_argp = fhandle;
172 msg.rpc_resp = fattr; 174 msg.rpc_resp = fattr;
173 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 175 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
174 } 176 }
177 nfs_free_fattr(res.dir_attr);
175 dprintk("NFS reply lookup: %d\n", status); 178 dprintk("NFS reply lookup: %d\n", status);
176 return status; 179 return status;
177} 180}
178 181
179static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) 182static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
180{ 183{
181 struct nfs_fattr fattr;
182 struct nfs3_accessargs arg = { 184 struct nfs3_accessargs arg = {
183 .fh = NFS_FH(inode), 185 .fh = NFS_FH(inode),
184 }; 186 };
185 struct nfs3_accessres res = { 187 struct nfs3_accessres res;
186 .fattr = &fattr,
187 };
188 struct rpc_message msg = { 188 struct rpc_message msg = {
189 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], 189 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS],
190 .rpc_argp = &arg, 190 .rpc_argp = &arg,
@@ -192,7 +192,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
192 .rpc_cred = entry->cred, 192 .rpc_cred = entry->cred,
193 }; 193 };
194 int mode = entry->mask; 194 int mode = entry->mask;
195 int status; 195 int status = -ENOMEM;
196 196
197 dprintk("NFS call access\n"); 197 dprintk("NFS call access\n");
198 198
@@ -209,9 +209,13 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
209 if (mode & MAY_EXEC) 209 if (mode & MAY_EXEC)
210 arg.access |= NFS3_ACCESS_EXECUTE; 210 arg.access |= NFS3_ACCESS_EXECUTE;
211 } 211 }
212 nfs_fattr_init(&fattr); 212
213 res.fattr = nfs_alloc_fattr();
214 if (res.fattr == NULL)
215 goto out;
216
213 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 217 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
214 nfs_refresh_inode(inode, &fattr); 218 nfs_refresh_inode(inode, res.fattr);
215 if (status == 0) { 219 if (status == 0) {
216 entry->mask = 0; 220 entry->mask = 0;
217 if (res.access & NFS3_ACCESS_READ) 221 if (res.access & NFS3_ACCESS_READ)
@@ -221,6 +225,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
221 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) 225 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
222 entry->mask |= MAY_EXEC; 226 entry->mask |= MAY_EXEC;
223 } 227 }
228 nfs_free_fattr(res.fattr);
229out:
224 dprintk("NFS reply access: %d\n", status); 230 dprintk("NFS reply access: %d\n", status);
225 return status; 231 return status;
226} 232}
@@ -228,7 +234,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
228static int nfs3_proc_readlink(struct inode *inode, struct page *page, 234static int nfs3_proc_readlink(struct inode *inode, struct page *page,
229 unsigned int pgbase, unsigned int pglen) 235 unsigned int pgbase, unsigned int pglen)
230{ 236{
231 struct nfs_fattr fattr; 237 struct nfs_fattr *fattr;
232 struct nfs3_readlinkargs args = { 238 struct nfs3_readlinkargs args = {
233 .fh = NFS_FH(inode), 239 .fh = NFS_FH(inode),
234 .pgbase = pgbase, 240 .pgbase = pgbase,
@@ -238,14 +244,19 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
238 struct rpc_message msg = { 244 struct rpc_message msg = {
239 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], 245 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK],
240 .rpc_argp = &args, 246 .rpc_argp = &args,
241 .rpc_resp = &fattr,
242 }; 247 };
243 int status; 248 int status = -ENOMEM;
244 249
245 dprintk("NFS call readlink\n"); 250 dprintk("NFS call readlink\n");
246 nfs_fattr_init(&fattr); 251 fattr = nfs_alloc_fattr();
252 if (fattr == NULL)
253 goto out;
254 msg.rpc_resp = fattr;
255
247 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 256 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
248 nfs_refresh_inode(inode, &fattr); 257 nfs_refresh_inode(inode, fattr);
258 nfs_free_fattr(fattr);
259out:
249 dprintk("NFS reply readlink: %d\n", status); 260 dprintk("NFS reply readlink: %d\n", status);
250 return status; 261 return status;
251} 262}
@@ -395,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
395 .rpc_argp = &arg, 406 .rpc_argp = &arg,
396 .rpc_resp = &res, 407 .rpc_resp = &res,
397 }; 408 };
398 int status; 409 int status = -ENOMEM;
399 410
400 dprintk("NFS call remove %s\n", name->name); 411 dprintk("NFS call remove %s\n", name->name);
401 nfs_fattr_init(&res.dir_attr); 412 res.dir_attr = nfs_alloc_fattr();
413 if (res.dir_attr == NULL)
414 goto out;
415
402 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 416 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
403 nfs_post_op_update_inode(dir, &res.dir_attr); 417 nfs_post_op_update_inode(dir, res.dir_attr);
418 nfs_free_fattr(res.dir_attr);
419out:
404 dprintk("NFS reply remove: %d\n", status); 420 dprintk("NFS reply remove: %d\n", status);
405 return status; 421 return status;
406} 422}
@@ -418,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
418 if (nfs3_async_handle_jukebox(task, dir)) 434 if (nfs3_async_handle_jukebox(task, dir))
419 return 0; 435 return 0;
420 res = task->tk_msg.rpc_resp; 436 res = task->tk_msg.rpc_resp;
421 nfs_post_op_update_inode(dir, &res->dir_attr); 437 nfs_post_op_update_inode(dir, res->dir_attr);
422 return 1; 438 return 1;
423} 439}
424 440
@@ -426,7 +442,6 @@ static int
426nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, 442nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
427 struct inode *new_dir, struct qstr *new_name) 443 struct inode *new_dir, struct qstr *new_name)
428{ 444{
429 struct nfs_fattr old_dir_attr, new_dir_attr;
430 struct nfs3_renameargs arg = { 445 struct nfs3_renameargs arg = {
431 .fromfh = NFS_FH(old_dir), 446 .fromfh = NFS_FH(old_dir),
432 .fromname = old_name->name, 447 .fromname = old_name->name,
@@ -435,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
435 .toname = new_name->name, 450 .toname = new_name->name,
436 .tolen = new_name->len 451 .tolen = new_name->len
437 }; 452 };
438 struct nfs3_renameres res = { 453 struct nfs3_renameres res;
439 .fromattr = &old_dir_attr,
440 .toattr = &new_dir_attr
441 };
442 struct rpc_message msg = { 454 struct rpc_message msg = {
443 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], 455 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
444 .rpc_argp = &arg, 456 .rpc_argp = &arg,
445 .rpc_resp = &res, 457 .rpc_resp = &res,
446 }; 458 };
447 int status; 459 int status = -ENOMEM;
448 460
449 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 461 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
450 nfs_fattr_init(&old_dir_attr); 462
451 nfs_fattr_init(&new_dir_attr); 463 res.fromattr = nfs_alloc_fattr();
464 res.toattr = nfs_alloc_fattr();
465 if (res.fromattr == NULL || res.toattr == NULL)
466 goto out;
467
452 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 468 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
453 nfs_post_op_update_inode(old_dir, &old_dir_attr); 469 nfs_post_op_update_inode(old_dir, res.fromattr);
454 nfs_post_op_update_inode(new_dir, &new_dir_attr); 470 nfs_post_op_update_inode(new_dir, res.toattr);
471out:
472 nfs_free_fattr(res.toattr);
473 nfs_free_fattr(res.fromattr);
455 dprintk("NFS reply rename: %d\n", status); 474 dprintk("NFS reply rename: %d\n", status);
456 return status; 475 return status;
457} 476}
@@ -459,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
459static int 478static int
460nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 479nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
461{ 480{
462 struct nfs_fattr dir_attr, fattr;
463 struct nfs3_linkargs arg = { 481 struct nfs3_linkargs arg = {
464 .fromfh = NFS_FH(inode), 482 .fromfh = NFS_FH(inode),
465 .tofh = NFS_FH(dir), 483 .tofh = NFS_FH(dir),
466 .toname = name->name, 484 .toname = name->name,
467 .tolen = name->len 485 .tolen = name->len
468 }; 486 };
469 struct nfs3_linkres res = { 487 struct nfs3_linkres res;
470 .dir_attr = &dir_attr,
471 .fattr = &fattr
472 };
473 struct rpc_message msg = { 488 struct rpc_message msg = {
474 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], 489 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK],
475 .rpc_argp = &arg, 490 .rpc_argp = &arg,
476 .rpc_resp = &res, 491 .rpc_resp = &res,
477 }; 492 };
478 int status; 493 int status = -ENOMEM;
479 494
480 dprintk("NFS call link %s\n", name->name); 495 dprintk("NFS call link %s\n", name->name);
481 nfs_fattr_init(&dir_attr); 496 res.fattr = nfs_alloc_fattr();
482 nfs_fattr_init(&fattr); 497 res.dir_attr = nfs_alloc_fattr();
498 if (res.fattr == NULL || res.dir_attr == NULL)
499 goto out;
500
483 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 501 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
484 nfs_post_op_update_inode(dir, &dir_attr); 502 nfs_post_op_update_inode(dir, res.dir_attr);
485 nfs_post_op_update_inode(inode, &fattr); 503 nfs_post_op_update_inode(inode, res.fattr);
504out:
505 nfs_free_fattr(res.dir_attr);
506 nfs_free_fattr(res.fattr);
486 dprintk("NFS reply link: %d\n", status); 507 dprintk("NFS reply link: %d\n", status);
487 return status; 508 return status;
488} 509}
@@ -553,7 +574,7 @@ out:
553static int 574static int
554nfs3_proc_rmdir(struct inode *dir, struct qstr *name) 575nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
555{ 576{
556 struct nfs_fattr dir_attr; 577 struct nfs_fattr *dir_attr;
557 struct nfs3_diropargs arg = { 578 struct nfs3_diropargs arg = {
558 .fh = NFS_FH(dir), 579 .fh = NFS_FH(dir),
559 .name = name->name, 580 .name = name->name,
@@ -562,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
562 struct rpc_message msg = { 583 struct rpc_message msg = {
563 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], 584 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR],
564 .rpc_argp = &arg, 585 .rpc_argp = &arg,
565 .rpc_resp = &dir_attr,
566 }; 586 };
567 int status; 587 int status = -ENOMEM;
568 588
569 dprintk("NFS call rmdir %s\n", name->name); 589 dprintk("NFS call rmdir %s\n", name->name);
570 nfs_fattr_init(&dir_attr); 590 dir_attr = nfs_alloc_fattr();
591 if (dir_attr == NULL)
592 goto out;
593
594 msg.rpc_resp = dir_attr;
571 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 595 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
572 nfs_post_op_update_inode(dir, &dir_attr); 596 nfs_post_op_update_inode(dir, dir_attr);
597 nfs_free_fattr(dir_attr);
598out:
573 dprintk("NFS reply rmdir: %d\n", status); 599 dprintk("NFS reply rmdir: %d\n", status);
574 return status; 600 return status;
575} 601}
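
rmdir differs slightly from the neighbouring conversions: the rpc_message initializer can no longer name the response buffer, because that buffer does not exist at initializer time. The binding therefore moves after the allocation; restating the hunk above in outline:

	struct rpc_message msg = {
		.rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR],
		.rpc_argp = &arg,
		/* .rpc_resp is bound below, once the buffer exists */
	};

	dir_attr = nfs_alloc_fattr();
	if (dir_attr == NULL)
		goto out;
	msg.rpc_resp = dir_attr;
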
@@ -588,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
588 u64 cookie, struct page *page, unsigned int count, int plus) 614 u64 cookie, struct page *page, unsigned int count, int plus)
589{ 615{
590 struct inode *dir = dentry->d_inode; 616 struct inode *dir = dentry->d_inode;
591 struct nfs_fattr dir_attr;
592 __be32 *verf = NFS_COOKIEVERF(dir); 617 __be32 *verf = NFS_COOKIEVERF(dir);
593 struct nfs3_readdirargs arg = { 618 struct nfs3_readdirargs arg = {
594 .fh = NFS_FH(dir), 619 .fh = NFS_FH(dir),
@@ -599,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
599 .pages = &page 624 .pages = &page
600 }; 625 };
601 struct nfs3_readdirres res = { 626 struct nfs3_readdirres res = {
602 .dir_attr = &dir_attr,
603 .verf = verf, 627 .verf = verf,
604 .plus = plus 628 .plus = plus
605 }; 629 };
@@ -609,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
609 .rpc_resp = &res, 633 .rpc_resp = &res,
610 .rpc_cred = cred 634 .rpc_cred = cred
611 }; 635 };
612 int status; 636 int status = -ENOMEM;
613 637
614 if (plus) 638 if (plus)
615 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; 639 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
@@ -617,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
617 dprintk("NFS call readdir%s %d\n", 641 dprintk("NFS call readdir%s %d\n",
618 plus? "plus" : "", (unsigned int) cookie); 642 plus? "plus" : "", (unsigned int) cookie);
619 643
620 nfs_fattr_init(&dir_attr); 644 res.dir_attr = nfs_alloc_fattr();
645 if (res.dir_attr == NULL)
646 goto out;
647
621 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 648 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
622 649
623 nfs_invalidate_atime(dir); 650 nfs_invalidate_atime(dir);
651 nfs_refresh_inode(dir, res.dir_attr);
624 652
625 nfs_refresh_inode(dir, &dir_attr); 653 nfs_free_fattr(res.dir_attr);
654out:
626 dprintk("NFS reply readdir: %d\n", status); 655 dprintk("NFS reply readdir: %d\n", status);
627 return status; 656 return status;
628} 657}
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 5fe5492fbd29..75dcfc7da365 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -9,7 +9,6 @@
9#include <linux/param.h> 9#include <linux/param.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
14#include <linux/string.h> 13#include <linux/string.h>
15#include <linux/in.h> 14#include <linux/in.h>
@@ -763,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
763static int 762static int
764nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 763nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res)
765{ 764{
766 return nfs3_xdr_wccstat(req, p, &res->dir_attr); 765 return nfs3_xdr_wccstat(req, p, res->dir_attr);
767} 766}
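
nfs3_xdr_removeres now dereferences a caller-owned pointer, which implies struct nfs_removeres itself changed from embedding the attributes to pointing at them. An abbreviated sketch of the adjusted result structure (remaining members and exact field order elided):

	struct nfs_removeres {
		const struct nfs_server	*server;
		struct nfs_fattr	*dir_attr;	/* was: struct nfs_fattr dir_attr; */
		struct nfs4_change_info	cinfo;
	};

The XDR layer keeps decoding into the same memory; only its lifetime moved from the caller's stack frame to the heap.
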
768 767
769/* 768/*
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a187200a7aac..c538c6106e16 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -206,14 +206,14 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
206 206
207 207
208/* nfs4proc.c */ 208/* nfs4proc.c */
209extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); 209extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
210extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); 210extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
211extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); 211extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
212extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); 212extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
213extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 213extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
214extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 214extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
215extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 215extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
216extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); 216extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
217extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 217extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
218extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 218extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
@@ -286,7 +286,7 @@ extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
288 288
289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); 289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
291extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); 291extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
292extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); 292extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
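
These prototype changes thread a gfp_t from the point that knows the calling context down to the allocations. The distinction being plumbed through, in miniature (hypothetical helper, assuming only kmalloc() semantics):

	#include <linux/slab.h>
	#include <linux/types.h>

	/*
	 * GFP_KERNEL may trigger filesystem writeback to satisfy the request;
	 * an NFS path that is itself part of writeback or reclaim must pass
	 * GFP_NOFS so the allocator never re-enters the filesystem.
	 */
	static void *ctx_alloc(size_t len, bool reclaim_path)
	{
		return kmalloc(len, reclaim_path ? GFP_NOFS : GFP_KERNEL);
	}

Hence the new gfp_t parameters on nfs4_do_close() and nfs_alloc_seqid(): the same functions are reached both from ordinary syscalls and from memory-reclaim-driven state teardown.
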
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index fa3408f20112..3c2a1724fbd2 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
14#include <linux/slab.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
16#include <linux/vfs.h> 17#include <linux/vfs.h>
@@ -114,6 +115,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
114 char *page, char *page2, 115 char *page, char *page2,
115 const struct nfs4_fs_location *location) 116 const struct nfs4_fs_location *location)
116{ 117{
118 const size_t addr_bufsize = sizeof(struct sockaddr_storage);
117 struct vfsmount *mnt = ERR_PTR(-ENOENT); 119 struct vfsmount *mnt = ERR_PTR(-ENOENT);
118 char *mnt_path; 120 char *mnt_path;
119 unsigned int maxbuflen; 121 unsigned int maxbuflen;
@@ -125,9 +127,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
125 mountdata->mnt_path = mnt_path; 127 mountdata->mnt_path = mnt_path;
126 maxbuflen = mnt_path - 1 - page2; 128 maxbuflen = mnt_path - 1 - page2;
127 129
130 mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL);
131 if (mountdata->addr == NULL)
132 return ERR_PTR(-ENOMEM);
133
128 for (s = 0; s < location->nservers; s++) { 134 for (s = 0; s < location->nservers; s++) {
129 const struct nfs4_string *buf = &location->servers[s]; 135 const struct nfs4_string *buf = &location->servers[s];
130 struct sockaddr_storage addr;
131 136
132 if (buf->len <= 0 || buf->len >= maxbuflen) 137 if (buf->len <= 0 || buf->len >= maxbuflen)
133 continue; 138 continue;
@@ -136,11 +141,10 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
136 continue; 141 continue;
137 142
138 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, 143 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
139 (struct sockaddr *)&addr, sizeof(addr)); 144 mountdata->addr, addr_bufsize);
140 if (mountdata->addrlen == 0) 145 if (mountdata->addrlen == 0)
141 continue; 146 continue;
142 147
143 mountdata->addr = (struct sockaddr *)&addr;
144 rpc_set_port(mountdata->addr, NFS_PORT); 148 rpc_set_port(mountdata->addr, NFS_PORT);
145 149
146 memcpy(page2, buf->data, buf->len); 150 memcpy(page2, buf->data, buf->len);
@@ -155,6 +159,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
155 if (!IS_ERR(mnt)) 159 if (!IS_ERR(mnt))
156 break; 160 break;
157 } 161 }
162 kfree(mountdata->addr);
158 return mnt; 163 return mnt;
159} 164}
160 165
@@ -220,8 +225,8 @@ out:
220 225
221/* 226/*
222 * nfs_do_refmount - handle crossing a referral on server 227 * nfs_do_refmount - handle crossing a referral on server
228 * @mnt_parent - mountpoint of referral
223 * @dentry - dentry of referral 229 * @dentry - dentry of referral
224 * @nd - nameidata info
225 * 230 *
226 */ 231 */
227struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 232struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f9254fb0c9d0..70015dd60a98 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,7 @@
39#include <linux/delay.h> 39#include <linux/delay.h>
40#include <linux/errno.h> 40#include <linux/errno.h>
41#include <linux/string.h> 41#include <linux/string.h>
42#include <linux/slab.h>
42#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
43#include <linux/nfs.h> 44#include <linux/nfs.h>
44#include <linux/nfs4.h> 45#include <linux/nfs4.h>
@@ -69,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf
69static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 70static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
70static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 71static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
71static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 72static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
73static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
74 struct nfs_fattr *fattr, struct iattr *sattr,
75 struct nfs4_state *state);
72 76
73/* Prevent leaks of NFSv4 errors into userland */ 77/* Prevent leaks of NFSv4 errors into userland */
74static int nfs4_map_errors(int err) 78static int nfs4_map_errors(int err)
@@ -713,17 +717,18 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
713 717
714static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 718static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
715 struct nfs4_state_owner *sp, fmode_t fmode, int flags, 719 struct nfs4_state_owner *sp, fmode_t fmode, int flags,
716 const struct iattr *attrs) 720 const struct iattr *attrs,
721 gfp_t gfp_mask)
717{ 722{
718 struct dentry *parent = dget_parent(path->dentry); 723 struct dentry *parent = dget_parent(path->dentry);
719 struct inode *dir = parent->d_inode; 724 struct inode *dir = parent->d_inode;
720 struct nfs_server *server = NFS_SERVER(dir); 725 struct nfs_server *server = NFS_SERVER(dir);
721 struct nfs4_opendata *p; 726 struct nfs4_opendata *p;
722 727
723 p = kzalloc(sizeof(*p), GFP_KERNEL); 728 p = kzalloc(sizeof(*p), gfp_mask);
724 if (p == NULL) 729 if (p == NULL)
725 goto err; 730 goto err;
726 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 731 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
727 if (p->o_arg.seqid == NULL) 732 if (p->o_arg.seqid == NULL)
728 goto err_free; 733 goto err_free;
729 path_get(path); 734 path_get(path);
@@ -1059,7 +1064,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
1059{ 1064{
1060 struct nfs4_opendata *opendata; 1065 struct nfs4_opendata *opendata;
1061 1066
1062 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL); 1067 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS);
1063 if (opendata == NULL) 1068 if (opendata == NULL)
1064 return ERR_PTR(-ENOMEM); 1069 return ERR_PTR(-ENOMEM);
1065 opendata->state = state; 1070 opendata->state = state;
@@ -1522,6 +1527,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1522 nfs_post_op_update_inode(dir, o_res->dir_attr); 1527 nfs_post_op_update_inode(dir, o_res->dir_attr);
1523 } else 1528 } else
1524 nfs_refresh_inode(dir, o_res->dir_attr); 1529 nfs_refresh_inode(dir, o_res->dir_attr);
1530 if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
1531 server->caps &= ~NFS_CAP_POSIX_LOCK;
1525 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 1532 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
1526 status = _nfs4_proc_open_confirm(data); 1533 status = _nfs4_proc_open_confirm(data);
1527 if (status != 0) 1534 if (status != 0)
@@ -1645,7 +1652,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1645 if (path->dentry->d_inode != NULL) 1652 if (path->dentry->d_inode != NULL)
1646 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); 1653 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
1647 status = -ENOMEM; 1654 status = -ENOMEM;
1648 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr); 1655 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL);
1649 if (opendata == NULL) 1656 if (opendata == NULL)
1650 goto err_put_state_owner; 1657 goto err_put_state_owner;
1651 1658
@@ -1656,15 +1663,24 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1656 if (status != 0) 1663 if (status != 0)
1657 goto err_opendata_put; 1664 goto err_opendata_put;
1658 1665
1659 if (opendata->o_arg.open_flags & O_EXCL)
1660 nfs4_exclusive_attrset(opendata, sattr);
1661
1662 state = nfs4_opendata_to_nfs4_state(opendata); 1666 state = nfs4_opendata_to_nfs4_state(opendata);
1663 status = PTR_ERR(state); 1667 status = PTR_ERR(state);
1664 if (IS_ERR(state)) 1668 if (IS_ERR(state))
1665 goto err_opendata_put; 1669 goto err_opendata_put;
1666 if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0) 1670 if (server->caps & NFS_CAP_POSIX_LOCK)
1667 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 1671 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1672
1673 if (opendata->o_arg.open_flags & O_EXCL) {
1674 nfs4_exclusive_attrset(opendata, sattr);
1675
1676 nfs_fattr_init(opendata->o_res.f_attr);
1677 status = nfs4_do_setattr(state->inode, cred,
1678 opendata->o_res.f_attr, sattr,
1679 state);
1680 if (status == 0)
1681 nfs_setattr_update_inode(state->inode, sattr);
1682 nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
1683 }
1668 nfs4_opendata_put(opendata); 1684 nfs4_opendata_put(opendata);
1669 nfs4_put_state_owner(sp); 1685 nfs4_put_state_owner(sp);
1670 *res = state; 1686 *res = state;
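
Two related moves land here. First, the POSIX-lock capability is latched per server when the OPEN reply is processed (the hunk in _nfs4_proc_open above) and consulted here via server->caps, instead of re-testing o_res->rflags on every open. Second, the O_EXCL attribute fix-up migrates into _nfs4_do_open from nfs4_proc_create (removed further below), so it runs after the nfs4_state exists and its SETATTR result can update the inode. The two halves, read together:

	/* in _nfs4_proc_open(), once per OPEN reply: */
	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
		server->caps &= ~NFS_CAP_POSIX_LOCK;

	/* in _nfs4_do_open(), per open: */
	if (server->caps & NFS_CAP_POSIX_LOCK)
		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);

The follow-up SETATTR is needed because an exclusive (verifier-based) NFSv4 create ignores the requested attributes; doing it here keeps that logic on the one path every O_EXCL open takes.
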
@@ -1911,7 +1927,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1911 * 1927 *
1912 * NOTE: Caller must be holding the sp->so_owner semaphore! 1928 * NOTE: Caller must be holding the sp->so_owner semaphore!
1913 */ 1929 */
1914int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) 1930int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
1915{ 1931{
1916 struct nfs_server *server = NFS_SERVER(state->inode); 1932 struct nfs_server *server = NFS_SERVER(state->inode);
1917 struct nfs4_closedata *calldata; 1933 struct nfs4_closedata *calldata;
@@ -1930,7 +1946,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1930 }; 1946 };
1931 int status = -ENOMEM; 1947 int status = -ENOMEM;
1932 1948
1933 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); 1949 calldata = kzalloc(sizeof(*calldata), gfp_mask);
1934 if (calldata == NULL) 1950 if (calldata == NULL)
1935 goto out; 1951 goto out;
1936 calldata->inode = state->inode; 1952 calldata->inode = state->inode;
@@ -1938,7 +1954,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1938 calldata->arg.fh = NFS_FH(state->inode); 1954 calldata->arg.fh = NFS_FH(state->inode);
1939 calldata->arg.stateid = &state->open_stateid; 1955 calldata->arg.stateid = &state->open_stateid;
1940 /* Serialization for the sequence id */ 1956 /* Serialization for the sequence id */
1941 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); 1957 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask);
1942 if (calldata->arg.seqid == NULL) 1958 if (calldata->arg.seqid == NULL)
1943 goto out_free_calldata; 1959 goto out_free_calldata;
1944 calldata->arg.fmode = 0; 1960 calldata->arg.fmode = 0;
@@ -2067,8 +2083,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
2067 case -EDQUOT: 2083 case -EDQUOT:
2068 case -ENOSPC: 2084 case -ENOSPC:
2069 case -EROFS: 2085 case -EROFS:
2070 lookup_instantiate_filp(nd, (struct dentry *)state, NULL); 2086 return PTR_ERR(state);
2071 return 1;
2072 default: 2087 default:
2073 goto out_drop; 2088 goto out_drop;
2074 } 2089 }
@@ -2402,14 +2417,12 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh
2402static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) 2417static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
2403{ 2418{
2404 struct nfs_server *server = NFS_SERVER(inode); 2419 struct nfs_server *server = NFS_SERVER(inode);
2405 struct nfs_fattr fattr;
2406 struct nfs4_accessargs args = { 2420 struct nfs4_accessargs args = {
2407 .fh = NFS_FH(inode), 2421 .fh = NFS_FH(inode),
2408 .bitmask = server->attr_bitmask, 2422 .bitmask = server->attr_bitmask,
2409 }; 2423 };
2410 struct nfs4_accessres res = { 2424 struct nfs4_accessres res = {
2411 .server = server, 2425 .server = server,
2412 .fattr = &fattr,
2413 }; 2426 };
2414 struct rpc_message msg = { 2427 struct rpc_message msg = {
2415 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], 2428 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
@@ -2436,7 +2449,11 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2436 if (mode & MAY_EXEC) 2449 if (mode & MAY_EXEC)
2437 args.access |= NFS4_ACCESS_EXECUTE; 2450 args.access |= NFS4_ACCESS_EXECUTE;
2438 } 2451 }
2439 nfs_fattr_init(&fattr); 2452
2453 res.fattr = nfs_alloc_fattr();
2454 if (res.fattr == NULL)
2455 return -ENOMEM;
2456
2440 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2457 status = nfs4_call_sync(server, &msg, &args, &res, 0);
2441 if (!status) { 2458 if (!status) {
2442 entry->mask = 0; 2459 entry->mask = 0;
@@ -2446,8 +2463,9 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2446 entry->mask |= MAY_WRITE; 2463 entry->mask |= MAY_WRITE;
2447 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) 2464 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
2448 entry->mask |= MAY_EXEC; 2465 entry->mask |= MAY_EXEC;
2449 nfs_refresh_inode(inode, &fattr); 2466 nfs_refresh_inode(inode, res.fattr);
2450 } 2467 }
2468 nfs_free_fattr(res.fattr);
2451 return status; 2469 return status;
2452} 2470}
2453 2471
@@ -2560,13 +2578,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2560 } 2578 }
2561 d_add(dentry, igrab(state->inode)); 2579 d_add(dentry, igrab(state->inode));
2562 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 2580 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2563 if (flags & O_EXCL) {
2564 struct nfs_fattr fattr;
2565 status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
2566 if (status == 0)
2567 nfs_setattr_update_inode(state->inode, sattr);
2568 nfs_post_op_update_inode(state->inode, &fattr);
2569 }
2570 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) 2581 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
2571 status = nfs4_intent_set_file(nd, &path, state, fmode); 2582 status = nfs4_intent_set_file(nd, &path, state, fmode);
2572 else 2583 else
@@ -2594,14 +2605,19 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
2594 .rpc_argp = &args, 2605 .rpc_argp = &args,
2595 .rpc_resp = &res, 2606 .rpc_resp = &res,
2596 }; 2607 };
2597 int status; 2608 int status = -ENOMEM;
2609
2610 res.dir_attr = nfs_alloc_fattr();
2611 if (res.dir_attr == NULL)
2612 goto out;
2598 2613
2599 nfs_fattr_init(&res.dir_attr);
2600 status = nfs4_call_sync(server, &msg, &args, &res, 1); 2614 status = nfs4_call_sync(server, &msg, &args, &res, 1);
2601 if (status == 0) { 2615 if (status == 0) {
2602 update_changeattr(dir, &res.cinfo); 2616 update_changeattr(dir, &res.cinfo);
2603 nfs_post_op_update_inode(dir, &res.dir_attr); 2617 nfs_post_op_update_inode(dir, res.dir_attr);
2604 } 2618 }
2619 nfs_free_fattr(res.dir_attr);
2620out:
2605 return status; 2621 return status;
2606} 2622}
2607 2623
@@ -2636,7 +2652,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2636 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 2652 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2637 return 0; 2653 return 0;
2638 update_changeattr(dir, &res->cinfo); 2654 update_changeattr(dir, &res->cinfo);
2639 nfs_post_op_update_inode(dir, &res->dir_attr); 2655 nfs_post_op_update_inode(dir, res->dir_attr);
2640 return 1; 2656 return 1;
2641} 2657}
2642 2658
@@ -2651,29 +2667,31 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2651 .new_name = new_name, 2667 .new_name = new_name,
2652 .bitmask = server->attr_bitmask, 2668 .bitmask = server->attr_bitmask,
2653 }; 2669 };
2654 struct nfs_fattr old_fattr, new_fattr;
2655 struct nfs4_rename_res res = { 2670 struct nfs4_rename_res res = {
2656 .server = server, 2671 .server = server,
2657 .old_fattr = &old_fattr,
2658 .new_fattr = &new_fattr,
2659 }; 2672 };
2660 struct rpc_message msg = { 2673 struct rpc_message msg = {
2661 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], 2674 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
2662 .rpc_argp = &arg, 2675 .rpc_argp = &arg,
2663 .rpc_resp = &res, 2676 .rpc_resp = &res,
2664 }; 2677 };
2665 int status; 2678 int status = -ENOMEM;
2666 2679
2667 nfs_fattr_init(res.old_fattr); 2680 res.old_fattr = nfs_alloc_fattr();
2668 nfs_fattr_init(res.new_fattr); 2681 res.new_fattr = nfs_alloc_fattr();
2669 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2682 if (res.old_fattr == NULL || res.new_fattr == NULL)
2683 goto out;
2670 2684
2685 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2671 if (!status) { 2686 if (!status) {
2672 update_changeattr(old_dir, &res.old_cinfo); 2687 update_changeattr(old_dir, &res.old_cinfo);
2673 nfs_post_op_update_inode(old_dir, res.old_fattr); 2688 nfs_post_op_update_inode(old_dir, res.old_fattr);
2674 update_changeattr(new_dir, &res.new_cinfo); 2689 update_changeattr(new_dir, &res.new_cinfo);
2675 nfs_post_op_update_inode(new_dir, res.new_fattr); 2690 nfs_post_op_update_inode(new_dir, res.new_fattr);
2676 } 2691 }
2692out:
2693 nfs_free_fattr(res.new_fattr);
2694 nfs_free_fattr(res.old_fattr);
2677 return status; 2695 return status;
2678} 2696}
2679 2697
@@ -2700,28 +2718,30 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
2700 .name = name, 2718 .name = name,
2701 .bitmask = server->attr_bitmask, 2719 .bitmask = server->attr_bitmask,
2702 }; 2720 };
2703 struct nfs_fattr fattr, dir_attr;
2704 struct nfs4_link_res res = { 2721 struct nfs4_link_res res = {
2705 .server = server, 2722 .server = server,
2706 .fattr = &fattr,
2707 .dir_attr = &dir_attr,
2708 }; 2723 };
2709 struct rpc_message msg = { 2724 struct rpc_message msg = {
2710 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], 2725 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
2711 .rpc_argp = &arg, 2726 .rpc_argp = &arg,
2712 .rpc_resp = &res, 2727 .rpc_resp = &res,
2713 }; 2728 };
2714 int status; 2729 int status = -ENOMEM;
2730
2731 res.fattr = nfs_alloc_fattr();
2732 res.dir_attr = nfs_alloc_fattr();
2733 if (res.fattr == NULL || res.dir_attr == NULL)
2734 goto out;
2715 2735
2716 nfs_fattr_init(res.fattr);
2717 nfs_fattr_init(res.dir_attr);
2718 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2736 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2719 if (!status) { 2737 if (!status) {
2720 update_changeattr(dir, &res.cinfo); 2738 update_changeattr(dir, &res.cinfo);
2721 nfs_post_op_update_inode(dir, res.dir_attr); 2739 nfs_post_op_update_inode(dir, res.dir_attr);
2722 nfs_post_op_update_inode(inode, res.fattr); 2740 nfs_post_op_update_inode(inode, res.fattr);
2723 } 2741 }
2724 2742out:
2743 nfs_free_fattr(res.dir_attr);
2744 nfs_free_fattr(res.fattr);
2725 return status; 2745 return status;
2726} 2746}
2727 2747
@@ -3144,23 +3164,31 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
3144 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 3164 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
3145} 3165}
3146 3166
3167struct nfs4_renewdata {
3168 struct nfs_client *client;
3169 unsigned long timestamp;
3170};
3171
3147/* 3172/*
3148 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special 3173 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
3149 * standalone procedure for queueing an asynchronous RENEW. 3174 * standalone procedure for queueing an asynchronous RENEW.
3150 */ 3175 */
3151static void nfs4_renew_release(void *data) 3176static void nfs4_renew_release(void *calldata)
3152{ 3177{
3153 struct nfs_client *clp = data; 3178 struct nfs4_renewdata *data = calldata;
3179 struct nfs_client *clp = data->client;
3154 3180
3155 if (atomic_read(&clp->cl_count) > 1) 3181 if (atomic_read(&clp->cl_count) > 1)
3156 nfs4_schedule_state_renewal(clp); 3182 nfs4_schedule_state_renewal(clp);
3157 nfs_put_client(clp); 3183 nfs_put_client(clp);
3184 kfree(data);
3158} 3185}
3159 3186
3160static void nfs4_renew_done(struct rpc_task *task, void *data) 3187static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3161{ 3188{
3162 struct nfs_client *clp = data; 3189 struct nfs4_renewdata *data = calldata;
3163 unsigned long timestamp = task->tk_start; 3190 struct nfs_client *clp = data->client;
3191 unsigned long timestamp = data->timestamp;
3164 3192
3165 if (task->tk_status < 0) { 3193 if (task->tk_status < 0) {
3166 /* Unless we're shutting down, schedule state recovery! */ 3194 /* Unless we're shutting down, schedule state recovery! */
@@ -3186,11 +3214,17 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3186 .rpc_argp = clp, 3214 .rpc_argp = clp,
3187 .rpc_cred = cred, 3215 .rpc_cred = cred,
3188 }; 3216 };
3217 struct nfs4_renewdata *data;
3189 3218
3190 if (!atomic_inc_not_zero(&clp->cl_count)) 3219 if (!atomic_inc_not_zero(&clp->cl_count))
3191 return -EIO; 3220 return -EIO;
3221 data = kmalloc(sizeof(*data), GFP_KERNEL);
3222 if (data == NULL)
3223 return -ENOMEM;
3224 data->client = clp;
3225 data->timestamp = jiffies;
3192 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, 3226 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
3193 &nfs4_renew_ops, clp); 3227 &nfs4_renew_ops, data);
3194} 3228}
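
RENEW's async callback data grows from a bare nfs_client pointer to a small struct carrying the client plus a timestamp sampled at submission, so nfs4_renew_done() no longer depends on task->tk_start. The calldata contract this follows: whoever hands data to rpc_call_async() must arrange for the ->rpc_release callback to free it exactly once, on every completion path. In outline (hypothetical callback names, struct nfs4_renewdata as defined in the hunk above):

	static void demo_renew_done(struct rpc_task *task, void *calldata)
	{
		struct nfs4_renewdata *data = calldata;

		/* inspect task->tk_status and data->timestamp; never free here */
	}

	static void demo_renew_release(void *calldata)
	{
		kfree(calldata);	/* runs last, whether the task succeeded or not */
	}
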
3195 3229
3196int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) 3230int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3492,7 +3526,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3492 return _nfs4_async_handle_error(task, server, server->nfs_client, state); 3526 return _nfs4_async_handle_error(task, server, server->nfs_client, state);
3493} 3527}
3494 3528
3495int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) 3529int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3530 unsigned short port, struct rpc_cred *cred,
3531 struct nfs4_setclientid_res *res)
3496{ 3532{
3497 nfs4_verifier sc_verifier; 3533 nfs4_verifier sc_verifier;
3498 struct nfs4_setclientid setclientid = { 3534 struct nfs4_setclientid setclientid = {
@@ -3502,7 +3538,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
3502 struct rpc_message msg = { 3538 struct rpc_message msg = {
3503 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], 3539 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
3504 .rpc_argp = &setclientid, 3540 .rpc_argp = &setclientid,
3505 .rpc_resp = clp, 3541 .rpc_resp = res,
3506 .rpc_cred = cred, 3542 .rpc_cred = cred,
3507 }; 3543 };
3508 __be32 *p; 3544 __be32 *p;
@@ -3545,12 +3581,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
3545 return status; 3581 return status;
3546} 3582}
3547 3583
3548static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) 3584static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3585 struct nfs4_setclientid_res *arg,
3586 struct rpc_cred *cred)
3549{ 3587{
3550 struct nfs_fsinfo fsinfo; 3588 struct nfs_fsinfo fsinfo;
3551 struct rpc_message msg = { 3589 struct rpc_message msg = {
3552 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], 3590 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
3553 .rpc_argp = clp, 3591 .rpc_argp = arg,
3554 .rpc_resp = &fsinfo, 3592 .rpc_resp = &fsinfo,
3555 .rpc_cred = cred, 3593 .rpc_cred = cred,
3556 }; 3594 };
@@ -3568,12 +3606,14 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
3568 return status; 3606 return status;
3569} 3607}
3570 3608
3571int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) 3609int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3610 struct nfs4_setclientid_res *arg,
3611 struct rpc_cred *cred)
3572{ 3612{
3573 long timeout = 0; 3613 long timeout = 0;
3574 int err; 3614 int err;
3575 do { 3615 do {
3576 err = _nfs4_proc_setclientid_confirm(clp, cred); 3616 err = _nfs4_proc_setclientid_confirm(clp, arg, cred);
3577 switch (err) { 3617 switch (err) {
3578 case 0: 3618 case 0:
3579 return err; 3619 return err;
@@ -3665,7 +3705,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3665 }; 3705 };
3666 int status = 0; 3706 int status = 0;
3667 3707
3668 data = kzalloc(sizeof(*data), GFP_KERNEL); 3708 data = kzalloc(sizeof(*data), GFP_NOFS);
3669 if (data == NULL) 3709 if (data == NULL)
3670 return -ENOMEM; 3710 return -ENOMEM;
3671 data->args.fhandle = &data->fh; 3711 data->args.fhandle = &data->fh;
@@ -3821,7 +3861,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
3821 struct nfs4_unlockdata *p; 3861 struct nfs4_unlockdata *p;
3822 struct inode *inode = lsp->ls_state->inode; 3862 struct inode *inode = lsp->ls_state->inode;
3823 3863
3824 p = kzalloc(sizeof(*p), GFP_KERNEL); 3864 p = kzalloc(sizeof(*p), GFP_NOFS);
3825 if (p == NULL) 3865 if (p == NULL)
3826 return NULL; 3866 return NULL;
3827 p->arg.fh = NFS_FH(inode); 3867 p->arg.fh = NFS_FH(inode);
@@ -3959,7 +3999,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
3959 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) 3999 if (test_bit(NFS_DELEGATED_STATE, &state->flags))
3960 goto out; 4000 goto out;
3961 lsp = request->fl_u.nfs4_fl.owner; 4001 lsp = request->fl_u.nfs4_fl.owner;
3962 seqid = nfs_alloc_seqid(&lsp->ls_seqid); 4002 seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
3963 status = -ENOMEM; 4003 status = -ENOMEM;
3964 if (seqid == NULL) 4004 if (seqid == NULL)
3965 goto out; 4005 goto out;
@@ -3987,22 +4027,23 @@ struct nfs4_lockdata {
3987}; 4027};
3988 4028
3989static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, 4029static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3990 struct nfs_open_context *ctx, struct nfs4_lock_state *lsp) 4030 struct nfs_open_context *ctx, struct nfs4_lock_state *lsp,
4031 gfp_t gfp_mask)
3991{ 4032{
3992 struct nfs4_lockdata *p; 4033 struct nfs4_lockdata *p;
3993 struct inode *inode = lsp->ls_state->inode; 4034 struct inode *inode = lsp->ls_state->inode;
3994 struct nfs_server *server = NFS_SERVER(inode); 4035 struct nfs_server *server = NFS_SERVER(inode);
3995 4036
3996 p = kzalloc(sizeof(*p), GFP_KERNEL); 4037 p = kzalloc(sizeof(*p), gfp_mask);
3997 if (p == NULL) 4038 if (p == NULL)
3998 return NULL; 4039 return NULL;
3999 4040
4000 p->arg.fh = NFS_FH(inode); 4041 p->arg.fh = NFS_FH(inode);
4001 p->arg.fl = &p->fl; 4042 p->arg.fl = &p->fl;
4002 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid); 4043 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
4003 if (p->arg.open_seqid == NULL) 4044 if (p->arg.open_seqid == NULL)
4004 goto out_free; 4045 goto out_free;
4005 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); 4046 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask);
4006 if (p->arg.lock_seqid == NULL) 4047 if (p->arg.lock_seqid == NULL)
4007 goto out_free_seqid; 4048 goto out_free_seqid;
4008 p->arg.lock_stateid = &lsp->ls_stateid; 4049 p->arg.lock_stateid = &lsp->ls_stateid;
@@ -4156,7 +4197,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4156 4197
4157 dprintk("%s: begin!\n", __func__); 4198 dprintk("%s: begin!\n", __func__);
4158 data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), 4199 data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
4159 fl->fl_u.nfs4_fl.owner); 4200 fl->fl_u.nfs4_fl.owner,
4201 recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
4160 if (data == NULL) 4202 if (data == NULL)
4161 return -ENOMEM; 4203 return -ENOMEM;
4162 if (IS_SETLKW(cmd)) 4204 if (IS_SETLKW(cmd))
@@ -4645,7 +4687,7 @@ static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
4645 if (max_reqs != tbl->max_slots) { 4687 if (max_reqs != tbl->max_slots) {
4646 ret = -ENOMEM; 4688 ret = -ENOMEM;
4647 new = kmalloc(max_reqs * sizeof(struct nfs4_slot), 4689 new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
4648 GFP_KERNEL); 4690 GFP_NOFS);
4649 if (!new) 4691 if (!new)
4650 goto out; 4692 goto out;
4651 ret = 0; 4693 ret = 0;
@@ -4710,7 +4752,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4710 4752
4711 dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); 4753 dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
4712 4754
4713 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); 4755 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
4714 if (!slot) 4756 if (!slot)
4715 goto out; 4757 goto out;
4716 ret = 0; 4758 ret = 0;
@@ -4759,7 +4801,7 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4759 struct nfs4_session *session; 4801 struct nfs4_session *session;
4760 struct nfs4_slot_table *tbl; 4802 struct nfs4_slot_table *tbl;
4761 4803
4762 session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL); 4804 session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
4763 if (!session) 4805 if (!session)
4764 return NULL; 4806 return NULL;
4765 4807
@@ -5103,8 +5145,8 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
5103 5145
5104 if (!atomic_inc_not_zero(&clp->cl_count)) 5146 if (!atomic_inc_not_zero(&clp->cl_count))
5105 return -EIO; 5147 return -EIO;
5106 args = kzalloc(sizeof(*args), GFP_KERNEL); 5148 args = kzalloc(sizeof(*args), GFP_NOFS);
5107 res = kzalloc(sizeof(*res), GFP_KERNEL); 5149 res = kzalloc(sizeof(*res), GFP_NOFS);
5108 if (!args || !res) { 5150 if (!args || !res) {
5109 kfree(args); 5151 kfree(args);
5110 kfree(res); 5152 kfree(res);
@@ -5205,7 +5247,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5205 int status = -ENOMEM; 5247 int status = -ENOMEM;
5206 5248
5207 dprintk("--> %s\n", __func__); 5249 dprintk("--> %s\n", __func__);
5208 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); 5250 calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
5209 if (calldata == NULL) 5251 if (calldata == NULL)
5210 goto out; 5252 goto out;
5211 calldata->clp = clp; 5253 calldata->clp = clp;
@@ -5216,9 +5258,12 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5216 msg.rpc_resp = &calldata->res; 5258 msg.rpc_resp = &calldata->res;
5217 task_setup_data.callback_data = calldata; 5259 task_setup_data.callback_data = calldata;
5218 task = rpc_run_task(&task_setup_data); 5260 task = rpc_run_task(&task_setup_data);
5219 if (IS_ERR(task)) 5261 if (IS_ERR(task)) {
5220 status = PTR_ERR(task); 5262 status = PTR_ERR(task);
5263 goto out;
5264 }
5221 rpc_put_task(task); 5265 rpc_put_task(task);
5266 return 0;
5222out: 5267out:
5223 dprintk("<-- %s status=%d\n", __func__, status); 5268 dprintk("<-- %s status=%d\n", __func__, status);
5224 return status; 5269 return status;
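
The reclaim-complete error path previously fell through to rpc_put_task() even when rpc_run_task() returned an ERR_PTR-encoded error, handing a non-pointer to a function that dereferences it. The corrected shape, applicable to any rpc_run_task() caller:

	task = rpc_run_task(&task_setup_data);
	if (IS_ERR(task)) {
		status = PTR_ERR(task);
		goto out;		/* never rpc_put_task() an ERR_PTR */
	}
	rpc_put_task(task);
	return 0;

The explicit return 0 also stops the stale status = -ENOMEM initializer from being reported after a successful submission.
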
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6c5ed51f105e..34acf5926fdc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list);
62 62
63int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 63int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
64{ 64{
65 struct nfs4_setclientid_res clid;
65 unsigned short port; 66 unsigned short port;
66 int status; 67 int status;
67 68
@@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
69 if (clp->cl_addr.ss_family == AF_INET6) 70 if (clp->cl_addr.ss_family == AF_INET6)
70 port = nfs_callback_tcpport6; 71 port = nfs_callback_tcpport6;
71 72
72 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); 73 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
73 if (status == 0) 74 if (status != 0)
74 status = nfs4_proc_setclientid_confirm(clp, cred); 75 goto out;
75 if (status == 0) 76 status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
76 nfs4_schedule_state_renewal(clp); 77 if (status != 0)
78 goto out;
79 clp->cl_clientid = clid.clientid;
80 nfs4_schedule_state_renewal(clp);
81out:
77 return status; 82 return status;
78} 83}
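
SETCLIENTID results are now decoded into a caller-provided nfs4_setclientid_res rather than written straight into struct nfs_client, and cl_clientid is published only once SETCLIENTID_CONFIRM succeeds. The control flow above makes the staging visible; condensed:

	struct nfs4_setclientid_res clid;

	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
	if (status != 0)
		goto out;
	status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
	if (status != 0)
		goto out;
	clp->cl_clientid = clid.clientid;	/* commit only once confirmed */

A failed confirm consequently leaves the nfs_client untouched instead of half-updated.
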
79 84
@@ -361,7 +366,7 @@ nfs4_alloc_state_owner(void)
361{ 366{
362 struct nfs4_state_owner *sp; 367 struct nfs4_state_owner *sp;
363 368
364 sp = kzalloc(sizeof(*sp),GFP_KERNEL); 369 sp = kzalloc(sizeof(*sp),GFP_NOFS);
365 if (!sp) 370 if (!sp)
366 return NULL; 371 return NULL;
367 spin_lock_init(&sp->so_lock); 372 spin_lock_init(&sp->so_lock);
@@ -435,7 +440,7 @@ nfs4_alloc_open_state(void)
435{ 440{
436 struct nfs4_state *state; 441 struct nfs4_state *state;
437 442
438 state = kzalloc(sizeof(*state), GFP_KERNEL); 443 state = kzalloc(sizeof(*state), GFP_NOFS);
439 if (!state) 444 if (!state)
440 return NULL; 445 return NULL;
441 atomic_set(&state->count, 1); 446 atomic_set(&state->count, 1);
@@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_state *state)
537/* 542/*
538 * Close the current file. 543 * Close the current file.
539 */ 544 */
540static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait) 545static void __nfs4_close(struct path *path, struct nfs4_state *state,
546 fmode_t fmode, gfp_t gfp_mask, int wait)
541{ 547{
542 struct nfs4_state_owner *owner = state->owner; 548 struct nfs4_state_owner *owner = state->owner;
543 int call_close = 0; 549 int call_close = 0;
@@ -578,17 +584,17 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fm
578 nfs4_put_open_state(state); 584 nfs4_put_open_state(state);
579 nfs4_put_state_owner(owner); 585 nfs4_put_state_owner(owner);
580 } else 586 } else
581 nfs4_do_close(path, state, wait); 587 nfs4_do_close(path, state, gfp_mask, wait);
582} 588}
583 589
584void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) 590void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
585{ 591{
586 __nfs4_close(path, state, fmode, 0); 592 __nfs4_close(path, state, fmode, GFP_NOFS, 0);
587} 593}
588 594
589void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) 595void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
590{ 596{
591 __nfs4_close(path, state, fmode, 1); 597 __nfs4_close(path, state, fmode, GFP_KERNEL, 1);
592} 598}
593 599
594/* 600/*
@@ -618,7 +624,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
618 struct nfs4_lock_state *lsp; 624 struct nfs4_lock_state *lsp;
619 struct nfs_client *clp = state->owner->so_client; 625 struct nfs_client *clp = state->owner->so_client;
620 626
621 lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); 627 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
622 if (lsp == NULL) 628 if (lsp == NULL)
623 return NULL; 629 return NULL;
624 rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); 630 rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
@@ -754,11 +760,11 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
754 nfs4_put_lock_state(lsp); 760 nfs4_put_lock_state(lsp);
755} 761}
756 762
757struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) 763struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
758{ 764{
759 struct nfs_seqid *new; 765 struct nfs_seqid *new;
760 766
761 new = kmalloc(sizeof(*new), GFP_KERNEL); 767 new = kmalloc(sizeof(*new), gfp_mask);
762 if (new != NULL) { 768 if (new != NULL) {
763 new->sequence = counter; 769 new->sequence = counter;
764 INIT_LIST_HEAD(&new->list); 770 INIT_LIST_HEAD(&new->list);
@@ -1347,7 +1353,7 @@ static int nfs4_recall_slot(struct nfs_client *clp)
1347 1353
1348 nfs4_begin_drain_session(clp); 1354 nfs4_begin_drain_session(clp);
1349 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), 1355 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
1350 GFP_KERNEL); 1356 GFP_NOFS);
1351 if (!new) 1357 if (!new)
1352 return -ENOMEM; 1358 return -ENOMEM;
1353 1359
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4d338be492cb..6bdef28efa33 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -38,7 +38,6 @@
38#include <linux/param.h> 38#include <linux/param.h>
39#include <linux/time.h> 39#include <linux/time.h>
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/slab.h>
42#include <linux/errno.h> 41#include <linux/errno.h>
43#include <linux/string.h> 42#include <linux/string.h>
44#include <linux/in.h> 43#include <linux/in.h>
@@ -1505,14 +1504,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1505 hdr->replen += decode_setclientid_maxsz; 1504 hdr->replen += decode_setclientid_maxsz;
1506} 1505}
1507 1506
1508static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr) 1507static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
1509{ 1508{
1510 __be32 *p; 1509 __be32 *p;
1511 1510
1512 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); 1511 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
1513 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); 1512 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
1514 p = xdr_encode_hyper(p, client_state->cl_clientid); 1513 p = xdr_encode_hyper(p, arg->clientid);
1515 xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); 1514 xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE);
1516 hdr->nops++; 1515 hdr->nops++;
1517 hdr->replen += decode_setclientid_confirm_maxsz; 1516 hdr->replen += decode_setclientid_confirm_maxsz;
1518} 1517}
@@ -2325,7 +2324,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
2325/* 2324/*
2326 * a SETCLIENTID_CONFIRM request 2325 * a SETCLIENTID_CONFIRM request
2327 */ 2326 */
2328static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) 2327static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
2329{ 2328{
2330 struct xdr_stream xdr; 2329 struct xdr_stream xdr;
2331 struct compound_hdr hdr = { 2330 struct compound_hdr hdr = {
@@ -2335,7 +2334,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
2335 2334
2336 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2335 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2337 encode_compound_hdr(&xdr, req, &hdr); 2336 encode_compound_hdr(&xdr, req, &hdr);
2338 encode_setclientid_confirm(&xdr, clp, &hdr); 2337 encode_setclientid_confirm(&xdr, arg, &hdr);
2339 encode_putrootfh(&xdr, &hdr); 2338 encode_putrootfh(&xdr, &hdr);
2340 encode_fsinfo(&xdr, lease_bitmap, &hdr); 2339 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2341 encode_nops(&hdr); 2340 encode_nops(&hdr);
@@ -4398,7 +4397,7 @@ out_overflow:
4398 return -EIO; 4397 return -EIO;
4399} 4398}
4400 4399
4401static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) 4400static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
4402{ 4401{
4403 __be32 *p; 4402 __be32 *p;
4404 uint32_t opnum; 4403 uint32_t opnum;
@@ -4418,8 +4417,8 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
4418 p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); 4417 p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
4419 if (unlikely(!p)) 4418 if (unlikely(!p))
4420 goto out_overflow; 4419 goto out_overflow;
4421 p = xdr_decode_hyper(p, &clp->cl_clientid); 4420 p = xdr_decode_hyper(p, &res->clientid);
4422 memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); 4421 memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
4423 } else if (nfserr == NFSERR_CLID_INUSE) { 4422 } else if (nfserr == NFSERR_CLID_INUSE) {
4424 uint32_t len; 4423 uint32_t len;
4425 4424
@@ -4816,7 +4815,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
4816 goto out; 4815 goto out;
4817 if ((status = decode_remove(&xdr, &res->cinfo)) != 0) 4816 if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
4818 goto out; 4817 goto out;
4819 decode_getfattr(&xdr, &res->dir_attr, res->server, 4818 decode_getfattr(&xdr, res->dir_attr, res->server,
4820 !RPC_IS_ASYNC(rqstp->rq_task)); 4819 !RPC_IS_ASYNC(rqstp->rq_task));
4821out: 4820out:
4822 return status; 4821 return status;
@@ -5499,7 +5498,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
5499 * Decode SETCLIENTID response 5498 * Decode SETCLIENTID response
5500 */ 5499 */
5501static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, 5500static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5502 struct nfs_client *clp) 5501 struct nfs4_setclientid_res *res)
5503{ 5502{
5504 struct xdr_stream xdr; 5503 struct xdr_stream xdr;
5505 struct compound_hdr hdr; 5504 struct compound_hdr hdr;
@@ -5508,7 +5507,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5508 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5507 xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
5509 status = decode_compound_hdr(&xdr, &hdr); 5508 status = decode_compound_hdr(&xdr, &hdr);
5510 if (!status) 5509 if (!status)
5511 status = decode_setclientid(&xdr, clp); 5510 status = decode_setclientid(&xdr, res);
5512 return status; 5511 return status;
5513} 5512}
5514 5513
@@ -5552,6 +5551,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
5552 if (status != 0) 5551 if (status != 0)
5553 goto out; 5552 goto out;
5554 status = decode_delegreturn(&xdr); 5553 status = decode_delegreturn(&xdr);
5554 if (status != 0)
5555 goto out;
5555 decode_getfattr(&xdr, res->fattr, res->server, 5556 decode_getfattr(&xdr, res->fattr, res->server,
5556 !RPC_IS_ASYNC(rqstp->rq_task)); 5557 !RPC_IS_ASYNC(rqstp->rq_task));
5557out: 5558out:
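
Before this fix, a failing DELEGRETURN decode fell through into decode_getfattr() with the XDR stream still positioned on the error body, risking garbage attributes. Compound decoding must short-circuit at the first failing mandatory op; the idiom, in the style of this file (surrounding ops abbreviated):

	status = decode_putfh(&xdr);
	if (status != 0)
		goto out;
	status = decode_delegreturn(&xdr);
	if (status != 0)
		goto out;
	decode_getfattr(&xdr, res->fattr, res->server,
			!RPC_IS_ASYNC(rqstp->rq_task));
	out:
		return status;

GETATTR's result is advisory here, which is why its status can still be ignored once the mandatory ops have succeeded.
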
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 8c55b27c0de4..6bd19d843af7 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -488,7 +488,6 @@ static int __init root_nfs_ports(void)
488 */ 488 */
489static int __init root_nfs_get_handle(void) 489static int __init root_nfs_get_handle(void)
490{ 490{
491 struct nfs_fh fh;
492 struct sockaddr_in sin; 491 struct sockaddr_in sin;
493 unsigned int auth_flav_len = 0; 492 unsigned int auth_flav_len = 0;
494 struct nfs_mount_request request = { 493 struct nfs_mount_request request = {
@@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(void)
499 NFS_MNT3_VERSION : NFS_MNT_VERSION, 498 NFS_MNT3_VERSION : NFS_MNT_VERSION,
500 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? 499 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
501 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP, 500 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
502 .fh = &fh,
503 .auth_flav_len = &auth_flav_len, 501 .auth_flav_len = &auth_flav_len,
504 }; 502 };
505 int status; 503 int status = -ENOMEM;
506 504
505 request.fh = nfs_alloc_fhandle();
506 if (!request.fh)
507 goto out;
507 set_sockaddr(&sin, servaddr, htons(mount_port)); 508 set_sockaddr(&sin, servaddr, htons(mount_port));
508 status = nfs_mount(&request); 509 status = nfs_mount(&request);
509 if (status < 0) 510 if (status < 0)
510 printk(KERN_ERR "Root-NFS: Server returned error %d " 511 printk(KERN_ERR "Root-NFS: Server returned error %d "
511 "while mounting %s\n", status, nfs_export_path); 512 "while mounting %s\n", status, nfs_export_path);
512 else { 513 else {
513 nfs_data.root.size = fh.size; 514 nfs_data.root.size = request.fh->size;
514 memcpy(nfs_data.root.data, fh.data, fh.size); 515 memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
515 } 516 }
516 517 nfs_free_fhandle(request.fh);
518out:
517 return status; 519 return status;
518} 520}
519 521
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 29d9d36cd5f4..a3654e57b589 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
60{ 60{
61 struct nfs_page *req; 61 struct nfs_page *req;
62 62
63 for (;;) { 63 /* try to allocate the request struct */
64 /* try to allocate the request struct */ 64 req = nfs_page_alloc();
65 req = nfs_page_alloc(); 65 if (req == NULL)
66 if (req != NULL) 66 return ERR_PTR(-ENOMEM);
67 break;
68
69 if (fatal_signal_pending(current))
70 return ERR_PTR(-ERESTARTSYS);
71 yield();
72 }
73 67
74 /* Initialize the request struct. Initially, we assume a 68 /* Initialize the request struct. Initially, we assume a
75 * long write-back delay. This will be adjusted in 69 * long write-back delay. This will be adjusted in
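
The unbounded retry loop around nfs_page_alloc(), spinning on yield() until memory appeared or a fatal signal arrived, is replaced by an immediate ERR_PTR(-ENOMEM). Busy-wait allocation loops invert priorities and can stall the very reclaim they are waiting on; propagating the failure lets callers and the VM schedule the retry instead. A caller now handles it like any allocation (signature as used in this file):

	struct nfs_page *req;

	req = nfs_create_request(ctx, inode, page, offset, count);
	if (IS_ERR(req))
		return PTR_ERR(req);	/* typically -ENOMEM; writeback is retried later */
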
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c752d944fe9e..611bec22f552 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/param.h> 31#include <linux/param.h>
32#include <linux/slab.h>
33#include <linux/time.h> 32#include <linux/time.h>
34#include <linux/mm.h> 33#include <linux/mm.h>
35#include <linux/errno.h> 34#include <linux/errno.h>
@@ -225,35 +224,60 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page,
225 return status; 224 return status;
226} 225}
227 226
227struct nfs_createdata {
228 struct nfs_createargs arg;
229 struct nfs_diropok res;
230 struct nfs_fh fhandle;
231 struct nfs_fattr fattr;
232};
233
234static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir,
235 struct dentry *dentry, struct iattr *sattr)
236{
237 struct nfs_createdata *data;
238
239 data = kmalloc(sizeof(*data), GFP_KERNEL);
240
241 if (data != NULL) {
242 data->arg.fh = NFS_FH(dir);
243 data->arg.name = dentry->d_name.name;
244 data->arg.len = dentry->d_name.len;
245 data->arg.sattr = sattr;
246 nfs_fattr_init(&data->fattr);
247 data->fhandle.size = 0;
248 data->res.fh = &data->fhandle;
249 data->res.fattr = &data->fattr;
250 }
251 return data;
252};
253
254static void nfs_free_createdata(const struct nfs_createdata *data)
255{
256 kfree(data);
257}
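
nfs_alloc_createdata() packs the argument and result structures together with the file handle and attribute buffers they reference into a single allocation, so the self-referential pointers (data->res.fh = &data->fhandle) stay valid exactly as long as the rest of the bundle and one kfree() tears everything down. Callers bind it to the message only after the NULL check; condensed from the hunks below:

	data = nfs_alloc_createdata(dir, dentry, sattr);
	if (data == NULL)
		goto out;			/* status is already -ENOMEM */
	msg.rpc_argp = &data->arg;
	msg.rpc_resp = &data->res;
	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
	/* ... consume data->res.fh / data->res.fattr ... */
	nfs_free_createdata(data);
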
258
228static int 259static int
229nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 260nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
230 int flags, struct nameidata *nd) 261 int flags, struct nameidata *nd)
231{ 262{
232 struct nfs_fh fhandle; 263 struct nfs_createdata *data;
233 struct nfs_fattr fattr;
234 struct nfs_createargs arg = {
235 .fh = NFS_FH(dir),
236 .name = dentry->d_name.name,
237 .len = dentry->d_name.len,
238 .sattr = sattr
239 };
240 struct nfs_diropok res = {
241 .fh = &fhandle,
242 .fattr = &fattr
243 };
244 struct rpc_message msg = { 264 struct rpc_message msg = {
245 .rpc_proc = &nfs_procedures[NFSPROC_CREATE], 265 .rpc_proc = &nfs_procedures[NFSPROC_CREATE],
246 .rpc_argp = &arg,
247 .rpc_resp = &res,
248 }; 266 };
249 int status; 267 int status = -ENOMEM;
250 268
251 nfs_fattr_init(&fattr);
252 dprintk("NFS call create %s\n", dentry->d_name.name); 269 dprintk("NFS call create %s\n", dentry->d_name.name);
270 data = nfs_alloc_createdata(dir, dentry, sattr);
271 if (data == NULL)
272 goto out;
273 msg.rpc_argp = &data->arg;
274 msg.rpc_resp = &data->res;
253 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 275 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
254 nfs_mark_for_revalidate(dir); 276 nfs_mark_for_revalidate(dir);
255 if (status == 0) 277 if (status == 0)
256 status = nfs_instantiate(dentry, &fhandle, &fattr); 278 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
279 nfs_free_createdata(data);
280out:
257 dprintk("NFS reply create: %d\n", status); 281 dprintk("NFS reply create: %d\n", status);
258 return status; 282 return status;
259} 283}
@@ -265,24 +289,12 @@ static int
 nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	       dev_t rdev)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs arg = {
-		.fh = NFS_FH(dir),
-		.name = dentry->d_name.name,
-		.len = dentry->d_name.len,
-		.sattr = sattr
-	};
-	struct nfs_diropok res = {
-		.fh = &fhandle,
-		.fattr = &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs_procedures[NFSPROC_CREATE],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
 	};
-	int status, mode;
+	umode_t mode;
+	int status = -ENOMEM;
 
 	dprintk("NFS call mknod %s\n", dentry->d_name.name);
 
@@ -295,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
 	}
 
-	nfs_fattr_init(&fattr);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 
 	if (status == -EINVAL && S_ISFIFO(mode)) {
 		sattr->ia_mode = mode;
-		nfs_fattr_init(&fattr);
+		nfs_fattr_init(data->res.fattr);
 		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	}
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply mknod: %d\n", status);
 	return status;
 }
@@ -399,8 +418,8 @@ static int
 nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 		 unsigned int len, struct iattr *sattr)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
 	struct nfs_symlinkargs arg = {
 		.fromfh = NFS_FH(dir),
 		.fromname = dentry->d_name.name,
@@ -413,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 		.rpc_proc = &nfs_procedures[NFSPROC_SYMLINK],
 		.rpc_argp = &arg,
 	};
-	int status;
+	int status = -ENAMETOOLONG;
+
+	dprintk("NFS call symlink %s\n", dentry->d_name.name);
 
 	if (len > NFS2_MAXPATHLEN)
-		return -ENAMETOOLONG;
+		goto out;
 
-	dprintk("NFS call symlink %s\n", dentry->d_name.name);
+	fh = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	status = -ENOMEM;
+	if (fh == NULL || fattr == NULL)
+		goto out;
 
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
@@ -428,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 	 * filehandle size to zero indicates to nfs_instantiate that it
 	 * should fill in the data with a LOOKUP call on the wire.
 	 */
-	if (status == 0) {
-		nfs_fattr_init(&fattr);
-		fhandle.size = 0;
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
-	}
+	if (status == 0)
+		status = nfs_instantiate(dentry, fh, fattr);
 
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fh);
+out:
 	dprintk("NFS reply symlink: %d\n", status);
 	return status;
 }
@@ -441,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 static int
 nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs arg = {
-		.fh = NFS_FH(dir),
-		.name = dentry->d_name.name,
-		.len = dentry->d_name.len,
-		.sattr = sattr
-	};
-	struct nfs_diropok res = {
-		.fh = &fhandle,
-		.fattr = &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs_procedures[NFSPROC_MKDIR],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
 
 	dprintk("NFS call mkdir %s\n", dentry->d_name.name);
-	nfs_fattr_init(&fattr);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply mkdir: %d\n", status);
 	return status;
 }
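
The fs/nfs/proc.c hunks above all apply one refactoring: the nfs_createargs/nfs_diropok pairs that create, mknod and mkdir used to keep on the kernel stack now live in a single kmalloc()ed struct nfs_createdata, with a goto-style unwind when the allocation fails. A minimal user-space sketch of that allocate/use/free shape (the struct layout and names below are stand-ins, not the kernel's):

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Stand-in for the kernel's createargs + diropok bundle. */
	struct createdata {
		char arg[128];		/* request arguments */
		char res[128];		/* reply buffer */
	};

	/* Allocate and initialize in one step, as nfs_alloc_createdata() does. */
	static struct createdata *alloc_createdata(const char *name)
	{
		struct createdata *data = malloc(sizeof(*data));

		if (data != NULL) {
			snprintf(data->arg, sizeof(data->arg), "create %s", name);
			data->res[0] = '\0';
		}
		return data;
	}

	static int do_create(const char *name)
	{
		struct createdata *data;
		int status = -ENOMEM;	/* default: allocation failure */

		data = alloc_createdata(name);
		if (data == NULL)
			goto out;
		status = 0;		/* stands in for rpc_call_sync() */
		free(data);		/* one free on every path past here */
	out:
		return status;
	}

	int main(void)
	{
		printf("do_create: %d\n", do_create("file"));
		return 0;
	}

The payoff is stack footprint: these RPCs can be issued from deep call chains, so a couple of hundred bytes of on-stack argument structs per operation is worth trading for one heap allocation.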
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index db9b360ae19d..6e2b06e6ca79 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool;
 
 struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 {
-	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
+	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
 			if (!p->pagevec) {
 				mempool_free(p, nfs_rdata_mempool);
 				p = NULL;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 6baf9a393466..04214fc5c304 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -48,6 +48,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/netdevice.h>
 #include <linux/nfs_xdr.h>
@@ -140,7 +141,6 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_resvport, "resvport" },
 	{ Opt_noresvport, "noresvport" },
 	{ Opt_fscache, "fsc" },
-	{ Opt_fscache_uniq, "fsc=%s" },
 	{ Opt_nofscache, "nofsc" },
 
 	{ Opt_port, "port=%s" },
@@ -170,6 +170,7 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_mountaddr, "mountaddr=%s" },
 
 	{ Opt_lookupcache, "lookupcache=%s" },
+	{ Opt_fscache_uniq, "fsc=%s" },
 
 	{ Opt_err, NULL }
 };
@@ -422,15 +423,19 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	unsigned char blockbits;
 	unsigned long blockres;
 	struct nfs_fh *fh = NFS_FH(dentry->d_inode);
-	struct nfs_fattr fattr;
-	struct nfs_fsstat res = {
-		.fattr = &fattr,
-	};
-	int error;
+	struct nfs_fsstat res;
+	int error = -ENOMEM;
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		goto out_err;
 
 	error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+
+	nfs_free_fattr(res.fattr);
 	if (error < 0)
 		goto out_err;
+
 	buf->f_type = NFS_SUPER_MAGIC;
 
 	/*
@@ -1045,14 +1050,6 @@ static int nfs_parse_mount_options(char *raw,
 			kfree(mnt->fscache_uniq);
 			mnt->fscache_uniq = NULL;
 			break;
-		case Opt_fscache_uniq:
-			string = match_strdup(args);
-			if (!string)
-				goto out_nomem;
-			kfree(mnt->fscache_uniq);
-			mnt->fscache_uniq = string;
-			mnt->options |= NFS_OPTION_FSCACHE;
-			break;
 
 		/*
 		 * options that take numeric values
@@ -1063,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->nfs_server.port = option;
 			break;
@@ -1184,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->mount_server.port = option;
 			break;
@@ -1383,6 +1380,14 @@ static int nfs_parse_mount_options(char *raw,
 				return 0;
 			};
 			break;
+		case Opt_fscache_uniq:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = string;
+			mnt->options |= NFS_OPTION_FSCACHE;
+			break;
 
 		/*
 		 * Special options
@@ -2171,7 +2176,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	int error = -ENOMEM;
 
 	data = nfs_alloc_parsed_mount_data(3);
-	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
 
@@ -2186,6 +2191,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (data->version == 4) {
 		error = nfs4_try_mount(flags, dev_name, data, mnt);
 		kfree(data->client_address);
+		kfree(data->nfs_server.export_path);
 		goto out;
 	}
 #endif /* CONFIG_NFS_V4 */
@@ -2245,7 +2251,7 @@ out:
 	kfree(data->fscache_uniq);
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
-	kfree(mntfh);
+	nfs_free_fhandle(mntfh);
 	kfree(data);
 	return error;
 
@@ -2554,7 +2560,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 	};
 	int error = -ENOMEM;
 
-	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
 
@@ -2612,7 +2618,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 out:
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
-	kfree(mntfh);
+	nfs_free_fhandle(mntfh);
 	return error;
 
 out_free:
@@ -2656,7 +2662,7 @@ static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
 	devname = nfs_path(path->mnt->mnt_devname,
 			path->mnt->mnt_root, path->dentry,
 			page, PAGE_SIZE);
-	if (devname == NULL)
+	if (IS_ERR(devname))
 		goto out_freepage;
 	tmp = kstrdup(devname, GFP_KERNEL);
 	if (tmp == NULL)
@@ -2667,41 +2673,120 @@ out_freepage:
 	free_page((unsigned long)page);
 }
 
+struct nfs_referral_count {
+	struct list_head list;
+	const struct task_struct *task;
+	unsigned int referral_count;
+};
+
+static LIST_HEAD(nfs_referral_count_list);
+static DEFINE_SPINLOCK(nfs_referral_count_list_lock);
+
+static struct nfs_referral_count *nfs_find_referral_count(void)
+{
+	struct nfs_referral_count *p;
+
+	list_for_each_entry(p, &nfs_referral_count_list, list) {
+		if (p->task == current)
+			return p;
+	}
+	return NULL;
+}
+
+#define NFS_MAX_NESTED_REFERRALS 2
+
+static int nfs_referral_loop_protect(void)
+{
+	struct nfs_referral_count *p, *new;
+	int ret = -ENOMEM;
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		goto out;
+	new->task = current;
+	new->referral_count = 1;
+
+	ret = 0;
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	if (p != NULL) {
+		if (p->referral_count >= NFS_MAX_NESTED_REFERRALS)
+			ret = -ELOOP;
+		else
+			p->referral_count++;
+	} else {
+		list_add(&new->list, &nfs_referral_count_list);
+		new = NULL;
+	}
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(new);
+out:
+	return ret;
+}
+
+static void nfs_referral_loop_unprotect(void)
+{
+	struct nfs_referral_count *p;
+
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	p->referral_count--;
+	if (p->referral_count == 0)
+		list_del(&p->list);
+	else
+		p = NULL;
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(p);
+}
+
 static int nfs_follow_remote_path(struct vfsmount *root_mnt,
 		const char *export_path, struct vfsmount *mnt_target)
 {
+	struct nameidata *nd = NULL;
 	struct mnt_namespace *ns_private;
-	struct nameidata nd;
 	struct super_block *s;
 	int ret;
 
+	nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+	if (nd == NULL)
+		return -ENOMEM;
+
 	ns_private = create_mnt_ns(root_mnt);
 	ret = PTR_ERR(ns_private);
 	if (IS_ERR(ns_private))
 		goto out_mntput;
 
+	ret = nfs_referral_loop_protect();
+	if (ret != 0)
+		goto out_put_mnt_ns;
+
 	ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
-			export_path, LOOKUP_FOLLOW, &nd);
+			export_path, LOOKUP_FOLLOW, nd);
 
+	nfs_referral_loop_unprotect();
 	put_mnt_ns(ns_private);
 
 	if (ret != 0)
 		goto out_err;
 
-	s = nd.path.mnt->mnt_sb;
+	s = nd->path.mnt->mnt_sb;
 	atomic_inc(&s->s_active);
 	mnt_target->mnt_sb = s;
-	mnt_target->mnt_root = dget(nd.path.dentry);
+	mnt_target->mnt_root = dget(nd->path.dentry);
 
 	/* Correct the device pathname */
-	nfs_fix_devname(&nd.path, mnt_target);
+	nfs_fix_devname(&nd->path, mnt_target);
 
-	path_put(&nd.path);
+	path_put(&nd->path);
+	kfree(nd);
 	down_write(&s->s_umount);
 	return 0;
+out_put_mnt_ns:
+	put_mnt_ns(ns_private);
 out_mntput:
 	mntput(root_mnt);
 out_err:
+	kfree(nd);
 	return ret;
 }
 
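
The nfs_referral_count machinery added above caps nested NFSv4 referral traversals at NFS_MAX_NESTED_REFERRALS (2) per task, so a server returning cyclic referrals cannot recurse the client into the ground; note also that nfs_follow_remote_path() now kmalloc()s its struct nameidata rather than keeping that large structure on the stack. One detail worth calling out in the protect path: the list entry is allocated before the spinlock is taken (kmalloc may sleep; a spinlock holder must not), and the preallocated entry is simply freed if the current task already has one on the list. A rough user-space analog of the protect side, with a pthread mutex standing in for the spinlock (names are ours, not the kernel's):

	#include <errno.h>
	#include <pthread.h>
	#include <stdlib.h>

	#define MAX_NESTED 2

	struct refcount_entry {
		struct refcount_entry *next;
		pthread_t task;
		unsigned int count;
	};

	static struct refcount_entry *head;
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	static struct refcount_entry *find_entry(void)
	{
		struct refcount_entry *p;

		for (p = head; p != NULL; p = p->next)
			if (pthread_equal(p->task, pthread_self()))
				return p;
		return NULL;
	}

	int loop_protect(void)
	{
		/* Allocate before taking the lock, as the kernel code must. */
		struct refcount_entry *p, *new = malloc(sizeof(*new));
		int ret = -ENOMEM;

		if (new == NULL)
			return ret;
		new->task = pthread_self();
		new->count = 1;

		ret = 0;
		pthread_mutex_lock(&lock);
		p = find_entry();
		if (p != NULL) {
			if (p->count >= MAX_NESTED)
				ret = -ELOOP;	/* nested too deep: refuse */
			else
				p->count++;
		} else {
			new->next = head;
			head = new;
			new = NULL;	/* ownership moved to the list */
		}
		pthread_mutex_unlock(&lock);
		free(new);		/* no-op if the list took it */
		return ret;
	}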
@@ -2872,17 +2957,21 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	struct super_block *s;
 	struct nfs_server *server;
 	struct dentry *mntroot;
-	struct nfs_fh mntfh;
+	struct nfs_fh *mntfh;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
 		.mntflags = flags,
 	};
-	int error;
+	int error = -ENOMEM;
 
 	dprintk("--> nfs4_referral_get_sb()\n");
 
+	mntfh = nfs_alloc_fhandle();
+	if (mntfh == NULL)
+		goto out_err_nofh;
+
 	/* create a new volume representation */
-	server = nfs4_create_referral_server(data, &mntfh);
+	server = nfs4_create_referral_server(data, mntfh);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out_err_noserver;
@@ -2914,7 +3003,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
-	mntroot = nfs4_get_root(s, &mntfh);
+	mntroot = nfs4_get_root(s, mntfh);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
@@ -2931,12 +3020,15 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 
 	security_sb_clone_mnt_opts(data->sb, s);
 
+	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = 0\n");
 	return 0;
 
 out_err_nosb:
 	nfs_free_server(server);
 out_err_noserver:
+	nfs_free_fhandle(mntfh);
+out_err_nofh:
 	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
 	return error;
 
@@ -2945,6 +3037,7 @@ error_splat_super:
 	bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
 	deactivate_locked_super(s);
+	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
 	return error;
 }
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 2ea9e5c27e55..05c9e02f4153 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -19,7 +19,6 @@
 #include <linux/pagemap.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/namei.h>
 
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 6da3d3ff6edd..a2242af6a17d 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -23,6 +23,7 @@ struct nfs_unlinkdata {
 	struct nfs_removeres res;
 	struct inode *dir;
 	struct rpc_cred	*cred;
+	struct nfs_fattr dir_attr;
 };
 
 /**
@@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 	}
 	nfs_sb_active(dir->i_sb);
 	data->args.fh = NFS_FH(dir);
-	nfs_fattr_init(&data->res.dir_attr);
+	nfs_fattr_init(data->res.dir_attr);
 
 	NFS_PROTO(dir)->unlink_setup(&msg, dir);
 
@@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 		goto out_free;
 	}
 	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+	data->res.dir_attr = &data->dir_attr;
 
 	status = -EBUSY;
 	spin_lock(&dentry->d_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53ff70e23993..91679e2631ee 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -201,6 +201,7 @@ static int nfs_set_page_writeback(struct page *page)
 	struct inode *inode = page->mapping->host;
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
+	page_cache_get(page);
 	if (atomic_long_inc_return(&nfss->writeback) >
 			NFS_CONGESTION_ON_THRESH) {
 		set_bdi_congested(&nfss->backing_dev_info,
@@ -216,6 +217,7 @@ static void nfs_end_page_writeback(struct page *page)
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
+	page_cache_release(page);
 	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
 		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
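
These two hunks pair a page_cache_get() in nfs_set_page_writeback() with a page_cache_release() in nfs_end_page_writeback(), pinning the page for the whole writeback window; the later hunks in this patch can then safely call nfs_end_page_writeback() after the request lock has been dropped. The general acquire-before-start / release-after-completion shape, as a self-contained sketch (C11 atomics; nothing here is NFS API):

	#include <stdatomic.h>
	#include <stdlib.h>

	struct page_like {
		atomic_int refcount;
	};

	static void get(struct page_like *p) { atomic_fetch_add(&p->refcount, 1); }

	static void put(struct page_like *p)
	{
		if (atomic_fetch_sub(&p->refcount, 1) == 1)
			free(p);	/* dropped the last reference */
	}

	static void start_writeback(struct page_like *p)
	{
		get(p);			/* pin for the duration of the async I/O */
		/* ... kick off I/O ... */
	}

	static void end_writeback(struct page_like *p)
	{
		/* ... mark the I/O complete first, then unpin ... */
		put(p);
	}

	int main(void)
	{
		struct page_like *p = malloc(sizeof(*p));

		atomic_init(&p->refcount, 1);	/* caller's reference */
		start_writeback(p);
		end_writeback(p);
		put(p);				/* drop the caller's reference */
		return 0;
	}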
@@ -421,6 +423,7 @@ static void
 nfs_mark_request_dirty(struct nfs_page *req)
 {
 	__set_page_dirty_nobuffers(req->wb_page);
+	__mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -660,9 +663,11 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	req = nfs_setup_write_request(ctx, page, offset, count);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
+	nfs_mark_request_dirty(req);
 	/* Update file length */
 	nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+	nfs_mark_request_dirty(req);
 	nfs_clear_page_tag_locked(req);
 	return 0;
 }
@@ -739,8 +744,6 @@ int nfs_updatepage(struct file *file, struct page *page,
 	status = nfs_writepage_setup(ctx, page, offset, count);
 	if (status < 0)
 		nfs_set_pageerror(page);
-	else
-		__set_page_dirty_nobuffers(page);
 
 	dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
 			status, (long long)i_size_read(inode));
@@ -749,13 +752,12 @@ int nfs_updatepage(struct file *file, struct page *page,
 
 static void nfs_writepage_release(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
 
-	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
-		nfs_end_page_writeback(req->wb_page);
+	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req))
 		nfs_inode_remove_request(req);
-	} else
-		nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 static int flush_task_priority(int how)
@@ -779,7 +781,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		int how)
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
-	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -794,9 +795,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
+	int ret = 0;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -835,10 +837,18 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		(unsigned long long)data->args.offset);
 
 	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	if (how & FLUSH_SYNC) {
+		ret = rpc_wait_for_completion_task(task);
+		if (ret == 0)
+			ret = task->tk_status;
+	}
 	rpc_put_task(task);
-	return 0;
+out:
+	return ret;
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
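
The net effect of the hunk above: the write RPC is now always launched with RPC_TASK_ASYNC, and a FLUSH_SYNC caller waits for the task explicitly and harvests task->tk_status, rather than asking the RPC layer to run it synchronously. The same launch-then-optionally-join control flow in plain pthreads (illustrative only; compile with -pthread):

	#include <pthread.h>
	#include <stdio.h>

	#define FLUSH_SYNC 1

	static void *rpc_body(void *arg)
	{
		/* ... the actual write RPC would run here ... */
		return NULL;
	}

	/* Always start the work asynchronously; block only if asked to. */
	static int run_task(int how)
	{
		pthread_t task;
		int ret;

		ret = pthread_create(&task, NULL, rpc_body, NULL);
		if (ret != 0)
			return -ret;		/* could not even launch */
		if (how & FLUSH_SYNC)
			ret = -pthread_join(task, NULL);	/* wait for completion */
		else
			pthread_detach(task);	/* fire and forget */
		return ret;
	}

	int main(void)
	{
		printf("sync run: %d\n", run_task(FLUSH_SYNC));
		return 0;
	}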
@@ -847,9 +857,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
  */
 static void nfs_redirty_request(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
+
 	nfs_mark_request_dirty(req);
-	nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 /*
@@ -1084,16 +1096,15 @@ static void nfs_writeback_release_full(void *calldata)
 		if (nfs_write_need_commit(data)) {
 			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
 			nfs_mark_request_commit(req);
-			nfs_end_page_writeback(page);
 			dprintk(" marked for commit\n");
 			goto next;
 		}
 		dprintk(" OK\n");
 remove_request:
-		nfs_end_page_writeback(page);
 		nfs_inode_remove_request(req);
 	next:
 		nfs_clear_page_tag_locked(req);
+		nfs_end_page_writeback(page);
 	}
 	nfs_writedata_release(calldata);
 }
@@ -1190,6 +1201,25 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+{
+	if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
+		return 1;
+	if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags,
+				NFS_INO_COMMIT, nfs_wait_bit_killable,
+				TASK_KILLABLE))
+		return 1;
+	return 0;
+}
+
+static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+{
+	clear_bit(NFS_INO_COMMIT, &nfsi->flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+}
+
+
 static void nfs_commitdata_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
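
nfs_commit_set_lock()/nfs_commit_clear_lock() build a tiny mutex out of one inode flag bit: test_and_set_bit() is the trylock, out_of_line_wait_on_bit_lock() is the killable blocking slow path, and the unlock side orders clear_bit() before wake_up_bit() via smp_mb__after_clear_bit() so a sleeper cannot miss the wakeup. The same try-or-wait contract rendered with a condition variable (user-space sketch, our names; the pthread mutex supplies the ordering the kernel gets from the barrier):

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
	static bool commit_in_progress;

	/* Returns true if we own the "commit" flag on return. */
	static bool commit_set_lock(bool may_wait)
	{
		bool got;

		pthread_mutex_lock(&m);
		if (!commit_in_progress) {
			commit_in_progress = true;	/* trylock succeeded */
			got = true;
		} else if (may_wait) {
			while (commit_in_progress)
				pthread_cond_wait(&cv, &m);	/* slow path: sleep */
			commit_in_progress = true;
			got = true;
		} else {
			got = false;	/* someone else is committing; don't wait */
		}
		pthread_mutex_unlock(&m);
		return got;
	}

	static void commit_clear_lock(void)
	{
		pthread_mutex_lock(&m);
		commit_in_progress = false;
		/* The mutex gives us the clear-before-wake ordering the
		 * kernel gets from smp_mb__after_clear_bit(). */
		pthread_cond_broadcast(&cv);
		pthread_mutex_unlock(&m);
	}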
@@ -1207,7 +1237,6 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1207{ 1237{
1208 struct nfs_page *first = nfs_list_entry(head->next); 1238 struct nfs_page *first = nfs_list_entry(head->next);
1209 struct inode *inode = first->wb_context->path.dentry->d_inode; 1239 struct inode *inode = first->wb_context->path.dentry->d_inode;
1210 int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
1211 int priority = flush_task_priority(how); 1240 int priority = flush_task_priority(how);
1212 struct rpc_task *task; 1241 struct rpc_task *task;
1213 struct rpc_message msg = { 1242 struct rpc_message msg = {
@@ -1222,7 +1251,7 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 		.callback_ops = &nfs_commit_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
 
@@ -1282,6 +1311,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 				BDI_RECLAIMABLE);
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_commit_clear_lock(NFS_I(inode));
 	return -ENOMEM;
 }
 
@@ -1337,6 +1367,7 @@ static void nfs_commit_release(void *calldata)
 	next:
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_commit_clear_lock(NFS_I(data->inode));
 	nfs_commitdata_release(calldata);
 }
 
@@ -1351,8 +1382,11 @@ static const struct rpc_call_ops nfs_commit_ops = {
 static int nfs_commit_inode(struct inode *inode, int how)
 {
 	LIST_HEAD(head);
-	int res;
+	int may_wait = how & FLUSH_SYNC;
+	int res = 0;
 
+	if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
+		goto out_mark_dirty;
 	spin_lock(&inode->i_lock);
 	res = nfs_scan_commit(inode, &head, 0, 0);
 	spin_unlock(&inode->i_lock);
@@ -1360,7 +1394,22 @@ static int nfs_commit_inode(struct inode *inode, int how)
 		int error = nfs_commit_list(inode, &head, how);
 		if (error < 0)
 			return error;
-	}
+		if (may_wait)
+			wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
+					nfs_wait_bit_killable,
+					TASK_KILLABLE);
+		else
+			goto out_mark_dirty;
+	} else
+		nfs_commit_clear_lock(NFS_I(inode));
+	return res;
+	/* Note: If we exit without ensuring that the commit is complete,
+	 * we must mark the inode as dirty. Otherwise, future calls to
+	 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
+	 * that the data is on the disk.
+	 */
+out_mark_dirty:
+	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return res;
 }
 
@@ -1432,6 +1481,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 
 	BUG_ON(!PageLocked(page));
 	for (;;) {
+		wait_on_page_writeback(page);
 		req = nfs_page_find_request(page);
 		if (req == NULL)
 			break;
@@ -1466,30 +1516,21 @@ int nfs_wb_page(struct inode *inode, struct page *page)
 		.range_start = range_start,
 		.range_end = range_end,
 	};
-	struct nfs_page *req;
-	int need_commit;
 	int ret;
 
-	while(PagePrivate(page)) {
+	for (;;) {
+		wait_on_page_writeback(page);
 		if (clear_page_dirty_for_io(page)) {
 			ret = nfs_writepage_locked(page, &wbc);
 			if (ret < 0)
 				goto out_error;
+			continue;
 		}
-		req = nfs_find_and_lock_request(page);
-		if (!req)
+		if (!PagePrivate(page))
 			break;
-		if (IS_ERR(req)) {
-			ret = PTR_ERR(req);
-			goto out_error;
-		}
-		need_commit = test_bit(PG_CLEAN, &req->wb_flags);
-		nfs_clear_page_tag_locked(req);
-		if (need_commit) {
-			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-			if (ret < 0)
-				goto out_error;
-		}
+		ret = nfs_commit_inode(inode, FLUSH_SYNC);
+		if (ret < 0)
+			goto out_error;
 	}
 	return 0;
 out_error:
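
The rewritten nfs_wb_page() loop is: wait out any writeback in flight, write the page back if it is dirty and re-check from the top, and once it is clean either stop (no nfs_page left attached, i.e. !PagePrivate) or commit the unstable data and go around again. A toy model of that state machine (purely illustrative; the three states stand in for dirty / written-but-uncommitted / fully clean):

	#include <stdio.h>

	enum page_state { DIRTY, UNSTABLE, CLEAN };

	/* Toy model of the rewritten nfs_wb_page() loop: flush until CLEAN. */
	static int wb_page(enum page_state st)
	{
		int passes = 0;

		for (;;) {
			passes++;
			if (st == DIRTY) {	/* clear_page_dirty_for_io() */
				st = UNSTABLE;	/* a write leaves data uncommitted */
				continue;	/* state changed: re-check */
			}
			if (st == CLEAN)	/* !PagePrivate(page) */
				break;
			st = CLEAN;		/* nfs_commit_inode(FLUSH_SYNC) */
		}
		return passes;
	}

	int main(void)
	{
		printf("dirty page needed %d passes\n", wb_page(DIRTY));
		return 0;
	}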
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 04133aacb1e5..fc1c52571c03 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -22,6 +22,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/nfsacl.h>
 #include <linux/nfs3.h>
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a0c4016413f1..c2a4f71d87dd 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -12,6 +12,7 @@
  * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/module.h>
 #include <linux/exportfs.h>
@@ -258,10 +259,9 @@ static struct cache_detail svc_expkey_cache = {
 	.alloc		= expkey_alloc,
 };
 
-static struct svc_expkey *
-svc_expkey_lookup(struct svc_expkey *item)
+static int
+svc_expkey_hash(struct svc_expkey *item)
 {
-	struct cache_head *ch;
 	int hash = item->ek_fsidtype;
 	char * cp = (char*)item->ek_fsid;
 	int len = key_len(item->ek_fsidtype);
@@ -269,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *item)
 	hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
 	hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS);
 	hash &= EXPKEY_HASHMASK;
+	return hash;
+}
+
+static struct svc_expkey *
+svc_expkey_lookup(struct svc_expkey *item)
+{
+	struct cache_head *ch;
+	int hash = svc_expkey_hash(item);
 
 	ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h,
 				 hash);
@@ -282,13 +290,7 @@ static struct svc_expkey *
 svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
 {
 	struct cache_head *ch;
-	int hash = new->ek_fsidtype;
-	char * cp = (char*)new->ek_fsid;
-	int len = key_len(new->ek_fsidtype);
-
-	hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
-	hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS);
-	hash &= EXPKEY_HASHMASK;
+	int hash = svc_expkey_hash(new);
 
 	ch = sunrpc_cache_update(&svc_expkey_cache, &new->h,
 				 &old->h, hash);
@@ -737,14 +739,22 @@ struct cache_detail svc_export_cache = {
 	.alloc		= svc_export_alloc,
 };
 
-static struct svc_export *
-svc_export_lookup(struct svc_export *exp)
+static int
+svc_export_hash(struct svc_export *exp)
 {
-	struct cache_head *ch;
 	int hash;
+
 	hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS);
 	hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS);
 	hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS);
+	return hash;
+}
+
+static struct svc_export *
+svc_export_lookup(struct svc_export *exp)
+{
+	struct cache_head *ch;
+	int hash = svc_export_hash(exp);
 
 	ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h,
 				 hash);
@@ -758,10 +768,7 @@ static struct svc_export *
 svc_export_update(struct svc_export *new, struct svc_export *old)
 {
 	struct cache_head *ch;
-	int hash;
-	hash = hash_ptr(old->ex_client, EXPORT_HASHBITS);
-	hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS);
-	hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS);
+	int hash = svc_export_hash(old);
 
 	ch = sunrpc_cache_update(&svc_export_cache, &new->h,
 				 &old->h,
@@ -1070,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp)
 	err = 0;
 finish:
 	kfree(new.ex_pathname);
-	if (exp)
+	if (!IS_ERR_OR_NULL(exp))
 		exp_put(exp);
-	if (fsid_key && !IS_ERR(fsid_key))
+	if (!IS_ERR_OR_NULL(fsid_key))
 		cache_put(&fsid_key->h, &svc_expkey_cache);
 	path_put(&path);
 out_put_clp:
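
Both sunrpc caches in this file get the same cleanup: the bucket-hash computation that was copy-pasted between the *_lookup() and *_update() paths is hoisted into svc_expkey_hash()/svc_export_hash(), so the two paths can never disagree about which hash chain an entry lives on. In miniature (ordinary C, toy hash function):

	#include <stdio.h>

	struct key { unsigned a, b; };

	/* One shared hash helper instead of two copy-pasted computations. */
	static unsigned key_hash(const struct key *k)
	{
		return (k->a * 2654435761u) ^ (k->b * 40503u);	/* toy mix */
	}

	static unsigned lookup_bucket(const struct key *k) { return key_hash(k) & 255; }
	static unsigned update_bucket(const struct key *k) { return key_hash(k) & 255; }

	int main(void)
	{
		struct key k = { 1, 2 };

		/* lookup and update can no longer disagree on the bucket: */
		printf("%u %u\n", lookup_bucket(&k), update_bucket(&k));
		return 0;
	}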
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index f20589d2ae27..6aa5590c3679 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -7,6 +7,7 @@
 #include "nfsd.h"
 /* FIXME: nfsacl.h is a broken header */
 #include <linux/nfsacl.h>
+#include <linux/gfp.h>
 #include "cache.h"
 #include "xdr3.h"
 #include "vfs.h"
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index e0c4846bad92..a596e9d987e4 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -7,6 +7,7 @@
 #include "nfsd.h"
 /* FIXME: nfsacl.h is a broken header */
 #include <linux/nfsacl.h>
+#include <linux/gfp.h>
 #include "cache.h"
 #include "xdr3.h"
 #include "vfs.h"
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 88150685df34..e48052615159 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -34,6 +34,7 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <linux/slab.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs4_acl.h>
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4bc22c763de7..eb78e7e22077 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,8 @@
  */
 
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/slab.h>
 #include "nfsd.h"
 #include "state.h"
 
@@ -78,11 +80,6 @@ enum nfs_cb_opnum4 {
 	cb_sequence_dec_sz +                    \
 	op_dec_sz)
 
-struct nfs4_rpc_args {
-	void				*args_op;
-	struct nfsd4_cb_sequence	args_seq;
-};
-
 /*
 * Generic encode routines from fs/nfs/nfs4xdr.c
 */
@@ -427,13 +424,19 @@ static struct rpc_procinfo nfs4_cb_procedures[] = {
 };
 
 static struct rpc_version nfs_cb_version4 = {
+/*
+ * Note on the callback rpc program version number: despite language in rfc
+ * 5661 section 18.36.3 requiring servers to use 4 in this field, the
+ * official xdr descriptions for both 4.0 and 4.1 specify version 1, and
+ * in practice that appears to be what implementations use. The section
+ * 18.36.3 language is expected to be fixed in an erratum.
+ */
 	.number			= 1,
 	.nrprocs		= ARRAY_SIZE(nfs4_cb_procedures),
 	.procs			= nfs4_cb_procedures
 };
 
 static struct rpc_version * nfs_cb_version[] = {
-	NULL,
 	&nfs_cb_version4,
 };
 
@@ -455,15 +458,14 @@ static struct rpc_program cb_program = {
 
 static int max_cb_time(void)
 {
-	return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ;
+	return max(nfsd4_lease/10, (time_t)1) * HZ;
 }
 
 /* Reference counting, callback cleanup, etc., all look racy as heck.
- * And why is cb_set an atomic? */
+ * And why is cl_cb_set an atomic? */
 
-int setup_callback_client(struct nfs4_client *clp)
+int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 {
-	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 	struct rpc_timeout timeparms = {
 		.to_initval = max_cb_time(),
 		.to_retries = 0,
@@ -475,7 +477,7 @@ int setup_callback_client(struct nfs4_client *clp)
 		.timeout = &timeparms,
 		.program = &cb_program,
 		.prognumber = cb->cb_prog,
-		.version = nfs_cb_version[1]->number,
+		.version = 0,
 		.authflavor = clp->cl_flavor,
 		.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
 		.client_name = clp->cl_principal,
@@ -485,7 +487,7 @@ int setup_callback_client(struct nfs4_client *clp)
 	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
 		return -EINVAL;
 	if (cb->cb_minorversion) {
-		args.bc_xprt = clp->cl_cb_xprt;
+		args.bc_xprt = cb->cb_xprt;
 		args.protocol = XPRT_TRANSPORT_BC_TCP;
 	}
 	/* Create RPC client */
@@ -495,7 +497,7 @@ int setup_callback_client(struct nfs4_client *clp)
 			PTR_ERR(client));
 		return PTR_ERR(client);
 	}
-	cb->cb_client = client;
+	nfsd4_set_callback_client(clp, client);
 	return 0;
 
 }
@@ -513,8 +515,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 	if (task->tk_status)
 		warn_no_callback_path(clp, task->tk_status);
 	else
-		atomic_set(&clp->cl_cb_conn.cb_set, 1);
-	put_nfs4_client(clp);
+		atomic_set(&clp->cl_cb_set, 1);
 }
 
 static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -536,7 +537,6 @@ int set_callback_cred(void)
 
 void do_probe_callback(struct nfs4_client *clp)
 {
-	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 	struct rpc_message msg = {
 		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
 		.rpc_argp       = clp,
@@ -544,34 +544,27 @@ void do_probe_callback(struct nfs4_client *clp)
 	};
 	int status;
 
-	status = rpc_call_async(cb->cb_client, &msg,
+	status = rpc_call_async(clp->cl_cb_client, &msg,
 				RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
 				&nfsd4_cb_probe_ops, (void *)clp);
-	if (status) {
+	if (status)
 		warn_no_callback_path(clp, status);
-		put_nfs4_client(clp);
-	}
 }
 
 /*
  * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
  */
-void
-nfsd4_probe_callback(struct nfs4_client *clp)
+void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 {
 	int status;
 
-	BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set));
+	BUG_ON(atomic_read(&clp->cl_cb_set));
 
-	status = setup_callback_client(clp);
+	status = setup_callback_client(clp, cb);
 	if (status) {
 		warn_no_callback_path(clp, status);
 		return;
 	}
-
-	/* the task holds a reference to the nfs4_client struct */
-	atomic_inc(&clp->cl_count);
-
 	do_probe_callback(clp);
 }
 
@@ -657,18 +650,32 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 	}
 }
 
+
 static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegation *dp = calldata;
 	struct nfs4_client *clp = dp->dl_client;
+	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
 
 	nfsd4_cb_done(task, calldata);
 
+	if (current_rpc_client == NULL) {
+		/* We're shutting down; give up. */
+		/* XXX: err, or is it ok just to fall through
+		 * and rpc_restart_call? */
+		return;
+	}
+
 	switch (task->tk_status) {
 	case -EIO:
 		/* Network partition? */
-		atomic_set(&clp->cl_cb_conn.cb_set, 0);
+		atomic_set(&clp->cl_cb_set, 0);
 		warn_no_callback_path(clp, task->tk_status);
+		if (current_rpc_client != task->tk_client) {
+			/* queue a callback on the new connection: */
+			nfsd4_cb_recall(dp);
+			return;
+		}
 	case -EBADHANDLE:
 	case -NFS4ERR_BAD_STATEID:
 		/* Race: client probably got cb_recall
@@ -676,7 +683,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 		break;
 	default:
 		/* success, or error we can't handle */
-		goto done;
+		return;
 	}
 	if (dp->dl_retries--) {
 		rpc_delay(task, 2*HZ);
@@ -684,20 +691,16 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 		rpc_restart_call(task);
 		return;
 	} else {
-		atomic_set(&clp->cl_cb_conn.cb_set, 0);
+		atomic_set(&clp->cl_cb_set, 0);
 		warn_no_callback_path(clp, task->tk_status);
 	}
-done:
-	kfree(task->tk_msg.rpc_argp);
 }
 
 static void nfsd4_cb_recall_release(void *calldata)
 {
 	struct nfs4_delegation *dp = calldata;
-	struct nfs4_client *clp = dp->dl_client;
 
 	nfs4_put_delegation(dp);
-	put_nfs4_client(clp);
 }
 
 static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -706,33 +709,75 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
 	.rpc_release = nfsd4_cb_recall_release,
 };
 
+static struct workqueue_struct *callback_wq;
+
+int nfsd4_create_callback_queue(void)
+{
+	callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
+	if (!callback_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+void nfsd4_destroy_callback_queue(void)
+{
+	destroy_workqueue(callback_wq);
+}
+
+/* must be called under the state lock */
+void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
+{
+	struct rpc_clnt *old = clp->cl_cb_client;
+
+	clp->cl_cb_client = new;
+	/*
+	 * After this, any work that saw the old value of cl_cb_client will
+	 * be gone:
+	 */
+	flush_workqueue(callback_wq);
+	/* So we can safely shut it down: */
+	if (old)
+		rpc_shutdown_client(old);
+}
+
 /*
  * called with dp->dl_count inc'ed.
  */
-void
-nfsd4_cb_recall(struct nfs4_delegation *dp)
+static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
 	struct nfs4_client *clp = dp->dl_client;
-	struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
-	struct nfs4_rpc_args *args;
+	struct rpc_clnt *clnt = clp->cl_cb_client;
+	struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
 		.rpc_cred = callback_cred
 	};
-	int status = -ENOMEM;
+	int status;
+
+	if (clnt == NULL)
+		return; /* Client is shutting down; give up. */
 
-	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
-		goto out;
 	args->args_op = dp;
 	msg.rpc_argp = args;
 	dp->dl_retries = 1;
 	status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
 				&nfsd4_cb_recall_ops, dp);
-out:
-	if (status) {
-		kfree(args);
-		put_nfs4_client(clp);
+	if (status)
 		nfs4_put_delegation(dp);
-	}
+}
+
+void nfsd4_do_callback_rpc(struct work_struct *w)
+{
+	/* XXX: for now, just send off delegation recall. */
+	/* In future, generalize to handle any sort of callback. */
+	struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
+	struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
+
+	_nfsd4_cb_recall(dp);
+}
+
+
+void nfsd4_cb_recall(struct nfs4_delegation *dp)
+{
+	queue_work(callback_wq, &dp->dl_recall.cb_work);
 }
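
The single-threaded nfsd4_callbacks workqueue replaces the old per-callback client refcounting (the deleted put_nfs4_client() calls). All recalls now run from one worker, so nfsd4_set_callback_client() can retire an rpc_clnt with a publish/flush/free sequence: store the new pointer, flush_workqueue() to drain any queued work that may still have seen the old pointer, then rpc_shutdown_client() the old one. A self-contained miniature of that sequence (one worker thread and a counter standing in for the workqueue; our own names; compile with -pthread):

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
	static int pending, stop;
	static char *client;		/* stands in for clp->cl_cb_client */

	static void *worker(void *unused)
	{
		pthread_mutex_lock(&m);
		while (!stop) {
			if (pending == 0) {
				pthread_cond_wait(&cv, &m);
				continue;
			}
			if (client)	/* work sees 'client' as of when it runs */
				printf("recall via %s\n", client);
			pending--;
			pthread_cond_broadcast(&cv);	/* wake any flusher */
		}
		pthread_mutex_unlock(&m);
		return NULL;
	}

	static void queue_one(void)
	{
		pthread_mutex_lock(&m);
		pending++;
		pthread_cond_broadcast(&cv);
		pthread_mutex_unlock(&m);
	}

	static void flush_queue(void)	/* flush_workqueue() analog */
	{
		pthread_mutex_lock(&m);
		while (pending > 0)
			pthread_cond_wait(&cv, &m);
		pthread_mutex_unlock(&m);
	}

	static void set_client(char *new)
	{
		char *old;

		pthread_mutex_lock(&m);
		old = client;
		client = new;		/* 1. publish the new pointer */
		pthread_mutex_unlock(&m);
		flush_queue();		/* 2. drain work that saw the old one */
		free(old);		/* 3. only now is it safe to destroy */
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, worker, NULL);
		set_client(strdup("conn-A"));
		queue_one();
		set_client(strdup("conn-B"));
		pthread_mutex_lock(&m);
		stop = 1;
		pthread_cond_broadcast(&cv);
		pthread_mutex_unlock(&m);
		pthread_join(t, NULL);
		free(client);
		return 0;
	}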
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6e2983b27f3c..c78dbf493424 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
 #include <linux/nfsd_idmap.h>
 #include <linux/seq_file.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 /*
  * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 37514c469846..59ec449b0c7f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -33,6 +33,7 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 #include <linux/file.h>
+#include <linux/slab.h>
 
 #include "cache.h"
 #include "xdr4.h"
@@ -968,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[];
 static const char *nfsd4_op_name(unsigned opnum);
 
 /*
- * Enforce NFSv4.1 COMPOUND ordering rules.
+ * Enforce NFSv4.1 COMPOUND ordering rules:
  *
- * TODO:
- * - enforce NFS4ERR_NOT_ONLY_OP,
- * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+ * Also note, enforced elsewhere:
+ * - SEQUENCE other than as first op results in
+ *   NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
+ * - BIND_CONN_TO_SESSION must be the only op in its compound
+ *   (Will be enforced in nfsd4_bind_conn_to_session().)
+ * - DESTROY_SESSION must be the final operation in a compound, if
+ *   sessionid's in SEQUENCE and DESTROY_SESSION are the same.
+ *   (Enforced in nfsd4_destroy_session().)
  */
-static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
+static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
 {
-	if (args->minorversion && args->opcnt > 0) {
-		struct nfsd4_op *op = &args->ops[0];
-		return (op->status == nfserr_op_illegal) ||
-		       (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
-	}
-	return true;
+	struct nfsd4_op *op = &args->ops[0];
+
+	/* These ordering requirements don't apply to NFSv4.0: */
+	if (args->minorversion == 0)
+		return nfs_ok;
+	/* This is weird, but OK, not our problem: */
+	if (args->opcnt == 0)
+		return nfs_ok;
+	if (op->status == nfserr_op_illegal)
+		return nfs_ok;
+	if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP))
+		return nfserr_op_not_in_session;
+	if (op->opnum == OP_SEQUENCE)
+		return nfs_ok;
+	if (args->opcnt != 1)
+		return nfserr_not_only_op;
+	return nfs_ok;
 }
 
 /*
@@ -1011,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	resp->rqstp = rqstp;
 	resp->cstate.minorversion = args->minorversion;
 	resp->cstate.replay_owner = NULL;
+	resp->cstate.session = NULL;
 	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
 	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
 	/* Use the deferral mechanism only for NFSv4.0 compounds */
@@ -1023,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	if (args->minorversion > nfsd_supported_minorversion)
 		goto out;
 
-	if (!nfs41_op_ordering_ok(args)) {
+	status = nfs41_check_op_ordering(args);
+	if (status) {
 		op = &args->ops[0];
-		op->status = nfserr_sequence_pos;
+		op->status = status;
 		goto encode_op;
 	}
 
-	status = nfs_ok;
 	while (!status && resp->opcnt < args->opcnt) {
 		op = &args->ops[resp->opcnt++];
 
@@ -1294,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1294 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1312 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1295 .op_name = "OP_SEQUENCE", 1313 .op_name = "OP_SEQUENCE",
1296 }, 1314 },
1315 [OP_RECLAIM_COMPLETE] = {
1316 .op_func = (nfsd4op_func)nfsd4_reclaim_complete,
1317 .op_flags = ALLOWED_WITHOUT_FH,
1318 .op_name = "OP_RECLAIM_COMPLETE",
1319 },
1297}; 1320};
1298 1321
1299static const char *nfsd4_op_name(unsigned opnum) 1322static const char *nfsd4_op_name(unsigned opnum)
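
The rewritten check above reduces to a short decision ladder over the first operation of a compound, returning a distinct NFSv4.1 error for each way the ordering rules can be violated. A standalone sketch of the same ladder, in plain C with simplified stand-ins for the opcode table, the ALLOWED_AS_FIRST_OP flag, and the nfserr_* codes (not the kernel code itself):

    /* Sketch of the NFSv4.1 first-op check; the opcode values, flag test
     * and error codes here are simplified stand-ins for nfsd's tables. */
    #include <stdio.h>

    enum { OP_PUTROOTFH = 24, OP_EXCHANGE_ID = 42, OP_SEQUENCE = 53 };
    enum { NFS_OK, ERR_OP_NOT_IN_SESSION, ERR_NOT_ONLY_OP };

    static int allowed_as_first_op(int opnum)
    {
            /* In nfsd this is the ALLOWED_AS_FIRST_OP flag in nfsd4_ops[]. */
            return opnum == OP_SEQUENCE || opnum == OP_EXCHANGE_ID;
    }

    static int check_op_ordering(int minorversion, int opcnt, int first_op)
    {
            if (minorversion == 0)          /* rules only apply to v4.1 */
                    return NFS_OK;
            if (opcnt == 0)                 /* empty compound: not our problem */
                    return NFS_OK;
            if (!allowed_as_first_op(first_op))
                    return ERR_OP_NOT_IN_SESSION;
            if (first_op == OP_SEQUENCE)    /* SEQUENCE may lead any compound */
                    return NFS_OK;
            if (opcnt != 1)                 /* e.g. EXCHANGE_ID must stand alone */
                    return ERR_NOT_ONLY_OP;
            return NFS_OK;
    }

    int main(void)
    {
            printf("%d\n", check_op_ordering(1, 3, OP_PUTROOTFH));   /* 1 */
            printf("%d\n", check_op_ordering(1, 3, OP_EXCHANGE_ID)); /* 2 */
            printf("%d\n", check_op_ordering(1, 3, OP_SEQUENCE));    /* 0 */
            return 0;
    }
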
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 98fb98e330b4..7e26caab2a26 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,6 +32,7 @@
 */
 
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/crypto.h>
 #include <linux/sched.h>
@@ -43,8 +44,7 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
 /* Globals */
-static struct path rec_dir;
-static int rec_dir_init = 0;
+static struct file *rec_file;
 
 static int
 nfs4_save_creds(const struct cred **original_creds)
@@ -116,33 +116,28 @@ out_no_tfm:
 	return status;
 }
 
-static void
-nfsd4_sync_rec_dir(void)
-{
-	vfs_fsync(NULL, rec_dir.dentry, 0);
-}
-
 int
 nfsd4_create_clid_dir(struct nfs4_client *clp)
 {
 	const struct cred *original_cred;
 	char *dname = clp->cl_recdir;
-	struct dentry *dentry;
+	struct dentry *dir, *dentry;
 	int status;
 
 	dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
 
-	if (!rec_dir_init || clp->cl_firststate)
+	if (!rec_file || clp->cl_firststate)
 		return 0;
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
 
+	dir = rec_file->f_path.dentry;
 	/* lock the parent */
-	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_lock(&dir->d_inode->i_mutex);
 
-	dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
+	dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -152,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
 		goto out_put;
 	}
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out_put;
-	status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
-	mnt_drop_write(rec_dir.mnt);
+	status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
+	mnt_drop_write(rec_file->f_path.mnt);
 out_put:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	if (status == 0) {
 		clp->cl_firststate = 1;
-		nfsd4_sync_rec_dir();
+		vfs_fsync(rec_file, 0);
 	}
 	nfs4_reset_creds(original_cred);
 	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
@@ -205,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 	struct dentry *dentry;
 	int status;
 
-	if (!rec_dir_init)
+	if (!rec_file)
 		return 0;
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
 
-	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
+	filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY,
 			   current_cred());
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
@@ -249,13 +244,14 @@ out:
 static int
 nfsd4_unlink_clid_dir(char *name, int namlen)
 {
-	struct dentry *dentry;
+	struct dentry *dir, *dentry;
 	int status;
 
 	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
 
-	mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
+	dir = rec_file->f_path.dentry;
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name, dir, namlen);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -263,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 	status = -ENOENT;
 	if (!dentry->d_inode)
 		goto out;
-	status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
+	status = vfs_rmdir(dir->d_inode, dentry);
 out:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	return status;
 }
 
@@ -277,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	const struct cred *original_cred;
 	int status;
 
-	if (!rec_dir_init || !clp->cl_firststate)
+	if (!rec_file || !clp->cl_firststate)
 		return;
 
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out;
 	clp->cl_firststate = 0;
@@ -292,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
 	nfs4_reset_creds(original_cred);
 	if (status == 0)
-		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.mnt);
+		vfs_fsync(rec_file, 0);
+	mnt_drop_write(rec_file->f_path.mnt);
 out:
 	if (status)
 		printk("NFSD: Failed to remove expired client state directory"
@@ -322,19 +318,19 @@ void
 nfsd4_recdir_purge_old(void) {
 	int status;
 
-	if (!rec_dir_init)
+	if (!rec_file)
 		return;
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out;
-	status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old);
+	status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old);
 	if (status == 0)
-		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.mnt);
+		vfs_fsync(rec_file, 0);
+	mnt_drop_write(rec_file->f_path.mnt);
 out:
 	if (status)
 		printk("nfsd4: failed to purge old clients from recovery"
-			" directory %s\n", rec_dir.dentry->d_name.name);
+			" directory %s\n", rec_file->f_path.dentry->d_name.name);
 }
 
 static int
@@ -354,10 +350,13 @@ int
 nfsd4_recdir_load(void) {
 	int status;
 
-	status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir);
+	if (!rec_file)
+		return 0;
+
+	status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir);
 	if (status)
 		printk("nfsd4: failed loading clients from recovery"
-			" directory %s\n", rec_dir.dentry->d_name.name);
+			" directory %s\n", rec_file->f_path.dentry->d_name.name);
 	return status;
 }
 
@@ -374,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname)
 	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
 			rec_dirname);
 
-	BUG_ON(rec_dir_init);
+	BUG_ON(rec_file);
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0) {
@@ -384,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname)
 		return;
 	}
 
-	status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
-			&rec_dir);
-	if (status)
+	rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
+	if (IS_ERR(rec_file)) {
 		printk("NFSD: unable to find recovery directory %s\n",
 				rec_dirname);
+		rec_file = NULL;
+	}
 
-	if (!status)
-		rec_dir_init = 1;
 	nfs4_reset_creds(original_cred);
 }
 
 void
 nfsd4_shutdown_recdir(void)
 {
-	if (!rec_dir_init)
+	if (!rec_file)
 		return;
-	rec_dir_init = 0;
-	path_put(&rec_dir);
+	fput(rec_file);
+	rec_file = NULL;
}
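
Replacing the struct path / rec_dir_init pair with a single struct file * lets one object serve as both the handle that pins the recovery directory and the NULL/non-NULL "initialized" test, and it hands vfs_fsync() the file it now takes directly. A loose userspace analogue of that idiom (the path here is an example only):

    /* Userspace sketch: one open directory fd doubles as the fsync()
     * handle and the "is it initialized?" flag, mirroring how rec_file
     * replaces the old rec_dir + rec_dir_init pair. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    static int rec_fd = -1;     /* -1 plays the role of rec_file == NULL */

    static void init_recdir(const char *dirname)
    {
            rec_fd = open(dirname, O_RDONLY | O_DIRECTORY);
            if (rec_fd < 0)
                    perror("unable to find recovery directory");
    }

    static void sync_recdir(void)
    {
            if (rec_fd < 0)     /* not initialized: nothing to do */
                    return;
            fsync(rec_fd);      /* flush directory metadata to disk */
    }

    static void shutdown_recdir(void)
    {
            if (rec_fd < 0)
                    return;
            close(rec_fd);
            rec_fd = -1;
    }

    int main(void)
    {
            init_recdir("/tmp");        /* example path only */
            sync_recdir();
            shutdown_recdir();
            return 0;
    }
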
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c97fddbd17db..12f7109720c2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -34,6 +34,7 @@
 
 #include <linux/file.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/swap.h>
 #include <linux/sunrpc/svcauth_gss.h>
@@ -44,8 +45,8 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
 /* Globals */
-static time_t lease_time = 90;     /* default lease time */
-static time_t user_lease_time = 90;
+time_t nfsd4_lease = 90;     /* default lease time */
+time_t nfsd4_grace = 90;
 static time_t boot_time;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
@@ -189,7 +190,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_vfs_file = stp->st_vfs_file;
 	dp->dl_type = type;
 	dp->dl_ident = cb->cb_ident;
-	dp->dl_stateid.si_boot = get_seconds();
+	dp->dl_stateid.si_boot = boot_time;
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
 	dp->dl_stateid.si_generation = 0;
@@ -198,6 +199,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	atomic_set(&dp->dl_count, 1);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
 	list_add(&dp->dl_perclnt, &clp->cl_delegations);
+	INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
 	return dp;
 }
 
@@ -248,6 +250,9 @@ unhash_delegation(struct nfs4_delegation *dp)
  * SETCLIENTID state
  */
 
+/* client_lock protects the client lru list and session hash table */
+static DEFINE_SPINLOCK(client_lock);
+
 /* Hash tables for nfs4_clientid state */
 #define CLIENT_HASH_BITS                 4
 #define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS)
@@ -366,7 +371,6 @@ static void release_openowner(struct nfs4_stateowner *sop)
 	nfs4_put_stateowner(sop);
 }
 
-static DEFINE_SPINLOCK(sessionid_lock);
 #define SESSION_HASH_SIZE	512
 static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
 
@@ -564,10 +568,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 
 	new->se_flags = cses->flags;
 	kref_init(&new->se_ref);
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
 	list_add(&new->se_perclnt, &clp->cl_sessions);
-	spin_unlock(&sessionid_lock);
+	spin_unlock(&client_lock);
 
 	status = nfs_ok;
 out:
@@ -578,7 +582,7 @@ out_free:
 	goto out;
 }
 
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
 static struct nfsd4_session *
 find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 {
@@ -601,7 +605,7 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 	return NULL;
 }
 
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
 static void
 unhash_session(struct nfsd4_session *ses)
 {
@@ -609,15 +613,6 @@ unhash_session(struct nfsd4_session *ses)
 	list_del(&ses->se_perclnt);
 }
 
-static void
-release_session(struct nfsd4_session *ses)
-{
-	spin_lock(&sessionid_lock);
-	unhash_session(ses);
-	spin_unlock(&sessionid_lock);
-	nfsd4_put_session(ses);
-}
-
 void
 free_session(struct kref *kref)
 {
@@ -633,9 +628,18 @@ free_session(struct kref *kref)
 	kfree(ses);
 }
 
+/* must be called under the client_lock */
 static inline void
-renew_client(struct nfs4_client *clp)
+renew_client_locked(struct nfs4_client *clp)
 {
+	if (is_client_expired(clp)) {
+		dprintk("%s: client (clientid %08x/%08x) already expired\n",
+			__func__,
+			clp->cl_clientid.cl_boot,
+			clp->cl_clientid.cl_id);
+		return;
+	}
+
 	/*
 	 * Move client to the end to the LRU list.
 	 */
@@ -646,6 +650,14 @@ renew_client(struct nfs4_client *clp)
 	clp->cl_time = get_seconds();
 }
 
+static inline void
+renew_client(struct nfs4_client *clp)
+{
+	spin_lock(&client_lock);
+	renew_client_locked(clp);
+	spin_unlock(&client_lock);
+}
+
 /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
 static int
 STALE_CLIENTID(clientid_t *clid)
@@ -679,27 +691,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 	return clp;
 }
 
-static void
-shutdown_callback_client(struct nfs4_client *clp)
-{
-	struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
-
-	if (clnt) {
-		/*
-		 * Callback threads take a reference on the client, so there
-		 * should be no outstanding callbacks at this point.
-		 */
-		clp->cl_cb_conn.cb_client = NULL;
-		rpc_shutdown_client(clnt);
-	}
-}
-
 static inline void
 free_client(struct nfs4_client *clp)
 {
-	shutdown_callback_client(clp);
-	if (clp->cl_cb_xprt)
-		svc_xprt_put(clp->cl_cb_xprt);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -708,10 +702,34 @@ free_client(struct nfs4_client *clp)
 }
 
 void
-put_nfs4_client(struct nfs4_client *clp)
+release_session_client(struct nfsd4_session *session)
 {
-	if (atomic_dec_and_test(&clp->cl_count))
+	struct nfs4_client *clp = session->se_client;
+
+	if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
+		return;
+	if (is_client_expired(clp)) {
 		free_client(clp);
+		session->se_client = NULL;
+	} else
+		renew_client_locked(clp);
+	spin_unlock(&client_lock);
+	nfsd4_put_session(session);
+}
+
+/* must be called under the client_lock */
+static inline void
+unhash_client_locked(struct nfs4_client *clp)
+{
+	mark_client_expired(clp);
+	list_del(&clp->cl_lru);
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				se_perclnt);
+		unhash_session(ses);
+		nfsd4_put_session(ses);
+	}
 }
 
 static void
@@ -721,9 +739,6 @@ expire_client(struct nfs4_client *clp)
 	struct nfs4_delegation *dp;
 	struct list_head reaplist;
 
-	dprintk("NFSD: expire_client cl_count %d\n",
-			atomic_read(&clp->cl_count));
-
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	while (!list_empty(&clp->cl_delegations)) {
@@ -739,20 +754,20 @@ expire_client(struct nfs4_client *clp)
 		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
-	list_del(&clp->cl_idhash);
-	list_del(&clp->cl_strhash);
-	list_del(&clp->cl_lru);
 	while (!list_empty(&clp->cl_openowners)) {
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
 		release_openowner(sop);
 	}
-	while (!list_empty(&clp->cl_sessions)) {
-		struct nfsd4_session *ses;
-		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
-				se_perclnt);
-		release_session(ses);
-	}
-	put_nfs4_client(clp);
+	nfsd4_set_callback_client(clp, NULL);
+	if (clp->cl_cb_conn.cb_xprt)
+		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+	list_del(&clp->cl_idhash);
+	list_del(&clp->cl_strhash);
+	spin_lock(&client_lock);
+	unhash_client_locked(clp);
+	if (atomic_read(&clp->cl_refcount) == 0)
+		free_client(clp);
+	spin_unlock(&client_lock);
 }
 
 static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -838,14 +853,15 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	}
 
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
-	atomic_set(&clp->cl_count, 1);
-	atomic_set(&clp->cl_cb_conn.cb_set, 0);
+	atomic_set(&clp->cl_refcount, 0);
+	atomic_set(&clp->cl_cb_set, 0);
 	INIT_LIST_HEAD(&clp->cl_idhash);
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	copy_verf(clp, verf);
@@ -876,8 +892,7 @@ add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
 	list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
 	idhashval = clientid_hashval(clp->cl_clientid.cl_id);
 	list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
-	list_add_tail(&clp->cl_lru, &client_lru);
-	clp->cl_time = get_seconds();
+	renew_client(clp);
 }
 
 static void
@@ -887,10 +902,9 @@ move_to_confirmed(struct nfs4_client *clp)
 	unsigned int strhashval;
 
 	dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
-	list_del_init(&clp->cl_strhash);
 	list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
 	strhashval = clientstr_hashval(clp->cl_recdir);
-	list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
+	list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
 	renew_client(clp);
 }
 
@@ -1326,15 +1340,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		cs_slot->sl_seqid++; /* from 0 to 1 */
 		move_to_confirmed(unconf);
 
-		/*
-		 * We do not support RDMA or persistent sessions
-		 */
-		cr_ses->flags &= ~SESSION4_PERSIST;
-		cr_ses->flags &= ~SESSION4_RDMA;
-
 		if (cr_ses->flags & SESSION4_BACK_CHAN) {
-			unconf->cl_cb_xprt = rqstp->rq_xprt;
-			svc_xprt_get(unconf->cl_cb_xprt);
+			unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+			svc_xprt_get(rqstp->rq_xprt);
 			rpc_copy_addr(
 				(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
 				sa);
@@ -1343,7 +1351,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 				cstate->minorversion;
 			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
 			unconf->cl_cb_seq_nr = 1;
-			nfsd4_probe_callback(unconf);
+			nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
 		}
 		conf = unconf;
 	} else {
@@ -1351,6 +1359,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		goto out;
 	}
 
+	/*
+	 * We do not support RDMA or persistent sessions
+	 */
+	cr_ses->flags &= ~SESSION4_PERSIST;
+	cr_ses->flags &= ~SESSION4_RDMA;
+
 	status = alloc_init_session(rqstp, conf, cr_ses);
 	if (status)
 		goto out;
@@ -1368,6 +1382,21 @@ out:
 	return status;
 }
 
+static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+
+	return argp->opcnt == resp->opcnt;
+}
+
+static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
+{
+	if (!session)
+		return 0;
+	return !memcmp(sid, &session->se_sessionid, sizeof(*sid));
+}
+
 __be32
 nfsd4_destroy_session(struct svc_rqst *r,
 		      struct nfsd4_compound_state *cstate,
@@ -1383,19 +1412,25 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	 * - Do we need to clear any callback info from previous session?
 	 */
 
+	if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
+		if (!nfsd4_last_compound_op(r))
+			return nfserr_not_only_op;
+	}
 	dump_sessionid(__func__, &sessionid->sessionid);
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
 	if (!ses) {
-		spin_unlock(&sessionid_lock);
+		spin_unlock(&client_lock);
 		goto out;
 	}
 
 	unhash_session(ses);
-	spin_unlock(&sessionid_lock);
+	spin_unlock(&client_lock);
 
+	nfs4_lock_state();
 	/* wait for callbacks */
-	shutdown_callback_client(ses->se_client);
+	nfsd4_set_callback_client(ses->se_client, NULL);
+	nfs4_unlock_state();
 	nfsd4_put_session(ses);
 	status = nfs_ok;
 out:
@@ -1416,7 +1451,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (resp->opcnt != 1)
 		return nfserr_sequence_pos;
 
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	status = nfserr_badsession;
 	session = find_in_sessionid_hashtbl(&seq->sessionid);
 	if (!session)
@@ -1455,23 +1490,47 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	cstate->slot = slot;
 	cstate->session = session;
 
-	/* Hold a session reference until done processing the compound:
-	 * nfsd4_put_session called only if the cstate slot is set.
-	 */
-	nfsd4_get_session(session);
 out:
-	spin_unlock(&sessionid_lock);
-	/* Renew the clientid on success and on replay */
+	/* Hold a session reference until done processing the compound. */
 	if (cstate->session) {
-		nfs4_lock_state();
-		renew_client(session->se_client);
-		nfs4_unlock_state();
+		nfsd4_get_session(cstate->session);
+		atomic_inc(&session->se_client->cl_refcount);
 	}
+	spin_unlock(&client_lock);
 	dprintk("%s: return %d\n", __func__, ntohl(status));
 	return status;
 }
 
 __be32
+nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
+{
+	if (rc->rca_one_fs) {
+		if (!cstate->current_fh.fh_dentry)
+			return nfserr_nofilehandle;
+		/*
+		 * We don't take advantage of the rca_one_fs case.
+		 * That's OK, it's optional, we can safely ignore it.
+		 */
+		return nfs_ok;
+	}
+	nfs4_lock_state();
+	if (is_client_expired(cstate->session->se_client)) {
+		nfs4_unlock_state();
+		/*
+		 * The following error isn't really legal.
+		 * But we only get here if the client just explicitly
+		 * destroyed the client.  Surely it no longer cares what
+		 * error it gets back on an operation for the dead
+		 * client.
+		 */
+		return nfserr_stale_clientid;
+	}
+	nfsd4_create_clid_dir(cstate->session->se_client);
+	nfs4_unlock_state();
+	return nfs_ok;
+}
+
+__be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
 {
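
From this point on, a v4.1 compound pins its client with cl_refcount (taken in nfsd4_sequence(), dropped in release_session_client()), and the final put must decide between freeing an expired client and renewing a live one while holding client_lock. A rough pthreads sketch of that drop-side decision, assuming a plain mutex where the kernel uses atomic_dec_and_lock():

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct client {
            int refcount;   /* last put decides free-vs-renew under the lock */
            bool expired;
    };

    static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

    /* In the kernel, atomic_dec_and_lock() only takes the lock on the
     * 1->0 transition; locking unconditionally keeps the sketch short
     * without changing the invariant being illustrated. */
    static void put_client(struct client *clp)
    {
            pthread_mutex_lock(&client_lock);
            if (--clp->refcount > 0) {
                    pthread_mutex_unlock(&client_lock);
                    return;
            }
            if (clp->expired)
                    free(clp);  /* expiry already unhashed it; we free */
            /* else: renew, i.e. move the client to the LRU tail (omitted) */
            pthread_mutex_unlock(&client_lock);
    }

    int main(void)
    {
            struct client *clp = calloc(1, sizeof(*clp));

            clp->refcount = 1;      /* taken when the compound started */
            clp->expired = true;    /* laundromat expired it mid-compound */
            put_client(clp);        /* last reference: freed here, not earlier */
            return 0;
    }
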
@@ -1630,9 +1689,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
 			status = nfserr_clid_inuse;
 		else {
-			/* XXX: We just turn off callbacks until we can handle
-			 * change request correctly. */
-			atomic_set(&conf->cl_cb_conn.cb_set, 0);
+			atomic_set(&conf->cl_cb_set, 0);
+			nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
 			expire_client(unconf);
 			status = nfs_ok;
 
@@ -1666,7 +1724,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			}
 			move_to_confirmed(unconf);
 			conf = unconf;
-			nfsd4_probe_callback(conf);
+			nfsd4_probe_callback(conf, &conf->cl_cb_conn);
 			status = nfs_ok;
 		}
 	} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -1699,12 +1757,12 @@ alloc_init_file(struct inode *ino)
 		INIT_LIST_HEAD(&fp->fi_hash);
 		INIT_LIST_HEAD(&fp->fi_stateids);
 		INIT_LIST_HEAD(&fp->fi_delegations);
-		spin_lock(&recall_lock);
-		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
-		spin_unlock(&recall_lock);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
+		spin_lock(&recall_lock);
+		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+		spin_unlock(&recall_lock);
 		return fp;
 	}
 	return NULL;
@@ -1826,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = get_seconds();
+	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
@@ -2027,7 +2085,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
 	 * lock) we know the server hasn't removed the lease yet, we know
 	 * it's safe to take a reference: */
 	atomic_inc(&dp->dl_count);
-	atomic_inc(&dp->dl_client->cl_count);
 
 	spin_lock(&recall_lock);
 	list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
@@ -2346,7 +2403,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
-	struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn;
+	int cb_up = atomic_read(&sop->so_client->cl_cb_set);
 	struct file_lock fl, *flp = &fl;
 	int status, flag = 0;
 
@@ -2354,7 +2411,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	open->op_recall = 0;
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_PREVIOUS:
-			if (!atomic_read(&cb->cb_set))
+			if (!cb_up)
 				open->op_recall = 1;
 			flag = open->op_delegate_type;
 			if (flag == NFS4_OPEN_DELEGATE_NONE)
@@ -2365,7 +2422,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 			 * had the chance to reclaim theirs.... */
 			if (locks_in_grace())
 				goto out;
-			if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
+			if (!cb_up || !sop->so_confirmed)
 				goto out;
 			if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
 				flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2482,10 +2539,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	}
 	memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
 
-	if (nfsd4_has_session(&resp->cstate)) {
+	if (nfsd4_has_session(&resp->cstate))
 		open->op_stateowner->so_confirmed = 1;
-		nfsd4_create_clid_dir(open->op_stateowner->so_client);
-	}
 
 	/*
 	 * Attempt to hand out a delegation. No error return, because the
@@ -2536,7 +2591,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	renew_client(clp);
 	status = nfserr_cb_path_down;
 	if (!list_empty(&clp->cl_delegations)
-			&& !atomic_read(&clp->cl_cb_conn.cb_set))
+			&& !atomic_read(&clp->cl_cb_set))
 		goto out;
 	status = nfs_ok;
 out:
@@ -2553,6 +2608,12 @@ nfsd4_end_grace(void)
 	dprintk("NFSD: end of grace period\n");
 	nfsd4_recdir_purge_old();
 	locks_end_grace(&nfsd4_manager);
+	/*
+	 * Now that every NFSv4 client has had the chance to recover and
+	 * to see the (possibly new, possibly shorter) lease time, we
+	 * can safely set the next grace time to the current lease time:
+	 */
+	nfsd4_grace = nfsd4_lease;
 }
 
 static time_t
@@ -2562,15 +2623,17 @@ nfs4_laundromat(void)
 	struct nfs4_stateowner *sop;
 	struct nfs4_delegation *dp;
 	struct list_head *pos, *next, reaplist;
-	time_t cutoff = get_seconds() - NFSD_LEASE_TIME;
-	time_t t, clientid_val = NFSD_LEASE_TIME;
-	time_t u, test_val = NFSD_LEASE_TIME;
+	time_t cutoff = get_seconds() - nfsd4_lease;
+	time_t t, clientid_val = nfsd4_lease;
+	time_t u, test_val = nfsd4_lease;
 
 	nfs4_lock_state();
 
 	dprintk("NFSD: laundromat service - starting\n");
 	if (locks_in_grace())
 		nfsd4_end_grace();
+	INIT_LIST_HEAD(&reaplist);
+	spin_lock(&client_lock);
 	list_for_each_safe(pos, next, &client_lru) {
 		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -2579,12 +2642,22 @@ nfs4_laundromat(void)
 				clientid_val = t;
 			break;
 		}
+		if (atomic_read(&clp->cl_refcount)) {
+			dprintk("NFSD: client in use (clientid %08x)\n",
+				clp->cl_clientid.cl_id);
+			continue;
+		}
+		unhash_client_locked(clp);
+		list_add(&clp->cl_lru, &reaplist);
+	}
+	spin_unlock(&client_lock);
+	list_for_each_safe(pos, next, &reaplist) {
+		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		dprintk("NFSD: purging unused client (clientid %08x)\n",
 			clp->cl_clientid.cl_id);
 		nfsd4_remove_clid_dir(clp);
 		expire_client(clp);
 	}
-	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	list_for_each_safe(pos, next, &del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
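
The laundromat now reaps clients in two phases: it unhashes candidates onto a private reaplist while holding client_lock (skipping any client with a live refcount), then runs the heavyweight expire_client() teardown after dropping the lock. A minimal userspace sketch of that collect-then-free pattern, with a simplified singly linked list in place of the kernel's list_head machinery:

    #include <pthread.h>
    #include <stdlib.h>

    struct client {
            struct client *next;
            int refcount;
            long time;              /* last renewal, in seconds */
    };

    static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct client *client_lru;

    static void expire_client(struct client *clp)   /* may sleep: run unlocked */
    {
            free(clp);
    }

    static void laundromat(long cutoff)
    {
            struct client **pp, *clp, *reaplist = NULL;

            pthread_mutex_lock(&client_lock);
            for (pp = &client_lru; (clp = *pp) != NULL; ) {
                    if (clp->time > cutoff || clp->refcount) {
                            pp = &clp->next;    /* recently renewed or in use */
                            continue;
                    }
                    *pp = clp->next;            /* unhash under the lock */
                    clp->next = reaplist;       /* park on the private list */
                    reaplist = clp;
            }
            pthread_mutex_unlock(&client_lock);

            while ((clp = reaplist) != NULL) {  /* teardown without the lock */
                    reaplist = clp->next;
                    expire_client(clp);
            }
    }

    int main(void)
    {
            struct client *c = calloc(1, sizeof(*c));

            c->time = 0;            /* long idle */
            c->next = client_lru;
            client_lru = c;
            laundromat(100);        /* reaps everything idle past the cutoff */
            return 0;
    }
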
@@ -2604,7 +2677,7 @@ nfs4_laundromat(void)
 		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
-	test_val = NFSD_LEASE_TIME;
+	test_val = nfsd4_lease;
 	list_for_each_safe(pos, next, &close_lru) {
 		sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
 		if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
@@ -2660,39 +2733,11 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
 static int
 STALE_STATEID(stateid_t *stateid)
 {
-	if (time_after((unsigned long)boot_time,
-			(unsigned long)stateid->si_boot)) {
-		dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
-			STATEID_VAL(stateid));
-		return 1;
-	}
-	return 0;
-}
-
-static int
-EXPIRED_STATEID(stateid_t *stateid)
-{
-	if (time_before((unsigned long)boot_time,
-			((unsigned long)stateid->si_boot)) &&
-	    time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
-		dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
-			STATEID_VAL(stateid));
-		return 1;
-	}
-	return 0;
-}
-
-static __be32
-stateid_error_map(stateid_t *stateid)
-{
-	if (STALE_STATEID(stateid))
-		return nfserr_stale_stateid;
-	if (EXPIRED_STATEID(stateid))
-		return nfserr_expired;
-
-	dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
-		STATEID_VAL(stateid));
-	return nfserr_bad_stateid;
+	if (stateid->si_boot == boot_time)
+		return 0;
+	dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
+		STATEID_VAL(stateid));
+	return 1;
 }
 
 static inline int
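
With si_boot now recording the server's boot time rather than the stateid's creation time, staleness collapses to an equality test, and the old EXPIRED_STATEID/stateid_error_map distinction disappears; lookup failures simply return nfserr_bad_stateid. A tiny illustration of the simplified rule:

    /* Minimal illustration: a stateid is stale iff it was minted by a
     * different server instance, identified by its boot time. */
    #include <assert.h>
    #include <time.h>

    struct stateid { time_t si_boot; };

    static time_t boot_time;

    static int stale_stateid(const struct stateid *sid)
    {
            return sid->si_boot != boot_time;
    }

    int main(void)
    {
            boot_time = time(NULL);
            struct stateid cur = { boot_time };
            struct stateid old = { boot_time - 100 };   /* pre-reboot stateid */

            assert(!stale_stateid(&cur));
            assert(stale_stateid(&old));
            return 0;
    }
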
@@ -2816,10 +2861,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 	status = nfserr_bad_stateid;
 	if (is_delegation_stateid(stateid)) {
 		dp = find_delegation_stateid(ino, stateid);
-		if (!dp) {
-			status = stateid_error_map(stateid);
+		if (!dp)
 			goto out;
-		}
 		status = check_stateid_generation(stateid, &dp->dl_stateid,
 						  flags);
 		if (status)
@@ -2832,10 +2875,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 		*filpp = dp->dl_vfs_file;
 	} else { /* open or lock stateid */
 		stp = find_stateid(stateid, flags);
-		if (!stp) {
-			status = stateid_error_map(stateid);
+		if (!stp)
 			goto out;
-		}
 		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
@@ -2907,7 +2948,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 		 */
 		sop = search_close_lru(stateid->si_stateownerid, flags);
 		if (sop == NULL)
-			return stateid_error_map(stateid);
+			return nfserr_bad_stateid;
 		*sopp = sop;
 		goto check_replay;
 	}
@@ -3174,10 +3215,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (!is_delegation_stateid(stateid))
 		goto out;
 	dp = find_delegation_stateid(inode, stateid);
-	if (!dp) {
-		status = stateid_error_map(stateid);
+	if (!dp)
 		goto out;
-	}
 	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
 	if (status)
 		goto out;
@@ -3403,7 +3442,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = get_seconds();
+	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
@@ -3975,12 +4014,6 @@ nfsd4_load_reboot_recovery_data(void)
 		printk("NFSD: Failure reading reboot recovery data\n");
 }
 
-unsigned long
-get_nfs4_grace_period(void)
-{
-	return max(user_lease_time, lease_time) * HZ;
-}
-
 /*
  * Since the lifetime of a delegation isn't limited to that of an open, a
  * client may quite reasonably hang on to a delegation as long as it has
@@ -4007,20 +4040,27 @@ set_max_delegations(void)
 static int
 __nfs4_state_start(void)
 {
-	unsigned long grace_time;
+	int ret;
 
 	boot_time = get_seconds();
-	grace_time = get_nfs4_grace_period();
-	lease_time = user_lease_time;
 	locks_start_grace(&nfsd4_manager);
 	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
-	       grace_time/HZ);
+	       nfsd4_grace);
+	ret = set_callback_cred();
+	if (ret)
+		return -ENOMEM;
 	laundry_wq = create_singlethread_workqueue("nfsd4");
 	if (laundry_wq == NULL)
 		return -ENOMEM;
-	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
+	ret = nfsd4_create_callback_queue();
+	if (ret)
+		goto out_free_laundry;
+	queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
 	set_max_delegations();
-	return set_callback_cred();
+	return 0;
+out_free_laundry:
+	destroy_workqueue(laundry_wq);
+	return ret;
 }
 
 int
@@ -4038,12 +4078,6 @@ nfs4_state_start(void)
 	return 0;
 }
 
-time_t
-nfs4_lease_time(void)
-{
-	return lease_time;
-}
-
 static void
 __nfs4_state_shutdown(void)
 {
@@ -4088,6 +4122,7 @@ nfs4_state_shutdown(void)
 	nfs4_lock_state();
 	nfs4_release_reclaim();
 	__nfs4_state_shutdown();
+	nfsd4_destroy_callback_queue();
 	nfs4_unlock_state();
 }
 
@@ -4127,21 +4162,3 @@ nfs4_recoverydir(void)
 {
 	return user_recovery_dirname;
 }
-
-/*
- * Called when leasetime is changed.
- *
- * The only way the protocol gives us to handle on-the-fly lease changes is to
- * simulate a reboot.  Instead of doing that, we just wait till the next time
- * we start to register any changes in lease time.  If the administrator
- * really wants to change the lease time *now*, they can go ahead and bring
- * nfsd down and then back up again after changing the lease time.
- *
- * user_lease_time is protected by nfsd_mutex since it's only really accessed
- * when nfsd is starting
- */
-void
-nfs4_reset_lease(time_t leasetime)
-{
-	user_lease_time = leasetime;
-}
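
__nfs4_state_start() above adopts the usual goto-unwind shape: once the laundry workqueue exists, a later failure (here, creating the callback queue) must destroy it before returning. A minimal sketch of that shape, with placeholder resources standing in for the kernel objects:

    /* Each failure point releases exactly what was set up before it;
     * the names here are placeholders, not the kernel API. */
    #include <stdio.h>
    #include <stdlib.h>

    static void *create_workqueue(void) { return malloc(1); }
    static void destroy_workqueue(void *wq) { free(wq); }
    static int create_callback_queue(void) { return 0; }   /* 0 on success */

    static int state_start(void)
    {
            void *laundry_wq;
            int ret;

            laundry_wq = create_workqueue();
            if (laundry_wq == NULL)
                    return -1;                  /* nothing to unwind yet */
            ret = create_callback_queue();
            if (ret)
                    goto out_free_laundry;      /* undo the workqueue only */
            return 0;

    out_free_laundry:
            destroy_workqueue(laundry_wq);
            return ret;
    }

    int main(void)
    {
            printf("state_start: %d\n", state_start());
            return 0;
    }
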
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c47b4d7bafa7..ac17a7080239 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,6 +40,7 @@
  * at the end of nfs4svc_decode_compoundargs.
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/statfs.h>
 #include <linux/utsname.h>
@@ -160,10 +161,10 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
 	argp->p = page_address(argp->pagelist[0]);
 	argp->pagelist++;
 	if (argp->pagelen < PAGE_SIZE) {
-		argp->end = p + (argp->pagelen>>2);
+		argp->end = argp->p + (argp->pagelen>>2);
 		argp->pagelen = 0;
 	} else {
-		argp->end = p + (PAGE_SIZE>>2);
+		argp->end = argp->p + (PAGE_SIZE>>2);
 		argp->pagelen -= PAGE_SIZE;
 	}
 	memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
@@ -1233,6 +1234,16 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
 	DECODE_TAIL;
 }
 
+static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	READ32(rc->rca_one_fs);
+
+	DECODE_TAIL;
+}
+
 static __be32
 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
 {
@@ -1345,7 +1356,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
 	[OP_TEST_STATEID]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_WANT_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_DESTROY_CLIENTID]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_reclaim_complete,
 };
 
 struct nfsd4_minorversion_ops {
@@ -1425,10 +1436,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 			argp->p = page_address(argp->pagelist[0]);
 			argp->pagelist++;
 			if (argp->pagelen < PAGE_SIZE) {
-				argp->end = p + (argp->pagelen>>2);
+				argp->end = argp->p + (argp->pagelen>>2);
 				argp->pagelen = 0;
 			} else {
-				argp->end = p + (PAGE_SIZE>>2);
+				argp->end = argp->p + (PAGE_SIZE>>2);
 				argp->pagelen -= PAGE_SIZE;
 			}
 		}
@@ -1899,7 +1910,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
 		if ((buflen -= 4) < 0)
 			goto out_resource;
-		WRITE32(NFSD_LEASE_TIME);
+		WRITE32(nfsd4_lease);
 	}
 	if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
 		if ((buflen -= 4) < 0)
@@ -3306,11 +3317,14 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 	iov = &rqstp->rq_res.head[0];
 	iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
 	BUG_ON(iov->iov_len > PAGE_SIZE);
-	if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
-		nfsd4_store_cache_entry(resp);
-		dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
-		resp->cstate.slot->sl_inuse = false;
-		nfsd4_put_session(resp->cstate.session);
+	if (nfsd4_has_session(cs)) {
+		if (cs->status != nfserr_replay_cache) {
+			nfsd4_store_cache_entry(resp);
+			dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+			cs->slot->sl_inuse = false;
+		}
+		/* Renew the clientid on success and on replay */
+		release_session_client(cs->session);
 	}
 	return 1;
 }
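
The two nfs4xdr.c pointer fixes above are the same bug: after stepping argp->p to a fresh page, the old code computed argp->end from a stale local p that still pointed into the previous page, so later bounds checks compared pointers from different buffers. A reduced version of the pattern:

    /* Reduced version of the read_buf() bug: after switching pages, the
     * end pointer must be derived from the updated cursor, not a stale
     * local that still references the old page. */
    #include <assert.h>
    #include <stdint.h>

    struct args {
            uint32_t *p;    /* decode cursor */
            uint32_t *end;  /* end of the current page */
    };

    static void advance_page(struct args *argp, uint32_t *newpage, long words)
    {
            uint32_t *p = argp->p;          /* stale once we switch pages */

            argp->p = newpage;
            /* Buggy: argp->end = p + words;  (end lands in the old page) */
            argp->end = argp->p + words;    /* fixed: use the new cursor */
            assert(argp->end - argp->p == words);
            (void)p;
    }

    int main(void)
    {
            uint32_t oldpage[1024], newpage[1024];
            struct args a = { oldpage + 100, oldpage + 1024 };

            advance_page(&a, newpage, 1024);
            assert(a.p == newpage && a.end == newpage + 1024);
            return 0;
    }
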
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index da08560c4818..4666a209678a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -8,6 +8,8 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
+
 #include "nfsd.h"
 #include "cache.h"
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 0f0e77f2012f..508941c23af7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
5 */ 5 */
6 6
7#include <linux/slab.h>
7#include <linux/namei.h> 8#include <linux/namei.h>
8#include <linux/ctype.h> 9#include <linux/ctype.h>
9 10
@@ -45,6 +46,7 @@ enum {
45 */ 46 */
46#ifdef CONFIG_NFSD_V4 47#ifdef CONFIG_NFSD_V4
47 NFSD_Leasetime, 48 NFSD_Leasetime,
49 NFSD_Gracetime,
48 NFSD_RecoveryDir, 50 NFSD_RecoveryDir,
49#endif 51#endif
50}; 52};
@@ -69,6 +71,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size);
69static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); 71static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
70#ifdef CONFIG_NFSD_V4 72#ifdef CONFIG_NFSD_V4
71static ssize_t write_leasetime(struct file *file, char *buf, size_t size); 73static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
74static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
72static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); 75static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
73#endif 76#endif
74 77
@@ -90,6 +93,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
90 [NFSD_MaxBlkSize] = write_maxblksize, 93 [NFSD_MaxBlkSize] = write_maxblksize,
91#ifdef CONFIG_NFSD_V4 94#ifdef CONFIG_NFSD_V4
92 [NFSD_Leasetime] = write_leasetime, 95 [NFSD_Leasetime] = write_leasetime,
96 [NFSD_Gracetime] = write_gracetime,
93 [NFSD_RecoveryDir] = write_recoverydir, 97 [NFSD_RecoveryDir] = write_recoverydir,
94#endif 98#endif
95}; 99};
@@ -994,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf)
994 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 998 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
995 return -EINVAL; 999 return -EINVAL;
996 1000
997 if (port < 1 || port > USHORT_MAX) 1001 if (port < 1 || port > USHRT_MAX)
998 return -EINVAL; 1002 return -EINVAL;
999 1003
1000 err = nfsd_create_serv(); 1004 err = nfsd_create_serv();
@@ -1036,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf)
1036 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) 1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
1037 return -EINVAL; 1041 return -EINVAL;
1038 1042
1039 if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL) 1043 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
1040 return -EINVAL; 1044 return -EINVAL;
1041 1045
1042 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); 1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
@@ -1203,29 +1207,45 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
1203} 1207}
1204 1208
1205#ifdef CONFIG_NFSD_V4 1209#ifdef CONFIG_NFSD_V4
1206extern time_t nfs4_leasetime(void); 1210static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1207
1208static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1209{ 1211{
1210 /* if size > 10 seconds, call
1211 * nfs4_reset_lease() then write out the new lease (seconds) as reply
1212 */
1213 char *mesg = buf; 1212 char *mesg = buf;
1214 int rv, lease; 1213 int rv, i;
1215 1214
1216 if (size > 0) { 1215 if (size > 0) {
1217 if (nfsd_serv) 1216 if (nfsd_serv)
1218 return -EBUSY; 1217 return -EBUSY;
1219 rv = get_int(&mesg, &lease); 1218 rv = get_int(&mesg, &i);
1220 if (rv) 1219 if (rv)
1221 return rv; 1220 return rv;
1222 if (lease < 10 || lease > 3600) 1221 /*
1222 * Some sanity checking. We don't have a reason for
1223 * these particular numbers, but problems with the
1224 * extremes are:
1225 * - Too short: the briefest network outage may
1226 * cause clients to lose all their locks. Also,
1227 * the frequent polling may be wasteful.
1228 * - Too long: do you really want reboot recovery
1229 * to take more than an hour? Or to make other
1230 * clients wait an hour before being able to
1231 * revoke a dead client's locks?
1232 */
1233 if (i < 10 || i > 3600)
1223 return -EINVAL; 1234 return -EINVAL;
1224 nfs4_reset_lease(lease); 1235 *time = i;
1225 } 1236 }
1226 1237
1227 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", 1238 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
1228 nfs4_lease_time()); 1239}
1240
1241static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1242{
1243 ssize_t rv;
1244
1245 mutex_lock(&nfsd_mutex);
1246 rv = __nfsd4_write_time(file, buf, size, time);
1247 mutex_unlock(&nfsd_mutex);
1248 return rv;
1229} 1249}
1230 1250
1231/** 1251/**
@@ -1251,12 +1271,22 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1251 */ 1271 */
1252static ssize_t write_leasetime(struct file *file, char *buf, size_t size) 1272static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
1253{ 1273{
1254 ssize_t rv; 1274 return nfsd4_write_time(file, buf, size, &nfsd4_lease);
1275}
1255 1276
1256 mutex_lock(&nfsd_mutex); 1277/**
1257 rv = __write_leasetime(file, buf, size); 1278 * write_gracetime - Set or report current NFSv4 grace period time
1258 mutex_unlock(&nfsd_mutex); 1279 *
1259 return rv; 1280 * As above, but sets the time of the NFSv4 grace period.
1281 *
1282 * Note this should never be set to less than the *previous*
1283 * lease-period time, but we don't try to enforce this. (In the common
1284 * case (a new boot), we don't know what the previous lease time was
1285 * anyway.)
1286 */
1287static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
1288{
1289 return nfsd4_write_time(file, buf, size, &nfsd4_grace);
1260} 1290}
1261 1291
1262extern char *nfs4_recoverydir(void); 1292extern char *nfs4_recoverydir(void);
@@ -1350,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1350 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1380 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1351#ifdef CONFIG_NFSD_V4 1381#ifdef CONFIG_NFSD_V4
1352 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1382 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1383 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
1353 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, 1384 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
1354#endif 1385#endif
1355 /* last one */ {""} 1386 /* last one */ {""}
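The refactor above replaces the lease-only __write_leasetime() with __nfsd4_write_time(), which validates the value and stores it through a time_t pointer, so write_leasetime() and the new write_gracetime() share one code path under nfsd_mutex. A user-space sketch of that pointer-parameterized pattern, with pthread locking standing in for the kernel mutex (all names here are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;
static time_t lease_time = 90;
static time_t grace_time = 90;

/* Validate and store a new value; the 10..3600 window matches the
 * sanity check in the patch (too short risks losing locks over a
 * brief outage, too long stalls reboot recovery for everyone). */
static int write_time(const char *buf, time_t *slot)
{
	char *end;
	long v = strtol(buf, &end, 10);

	if (end == buf || v < 10 || v > 3600)
		return -1;
	pthread_mutex_lock(&state_mutex);
	*slot = v;
	pthread_mutex_unlock(&state_mutex);
	return 0;
}

int main(void)
{
	write_time("120", &lease_time);
	write_time("90", &grace_time);
	printf("lease=%ld grace=%ld\n", (long)lease_time, (long)grace_time);
	return 0;
}

One slot-setter per tunable then reduces to a one-line wrapper, exactly as write_leasetime() and write_gracetime() do above.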
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index e942a1aaac92..72377761270e 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -82,7 +82,6 @@ int nfs4_state_init(void);
82void nfsd4_free_slabs(void); 82void nfsd4_free_slabs(void);
83int nfs4_state_start(void); 83int nfs4_state_start(void);
84void nfs4_state_shutdown(void); 84void nfs4_state_shutdown(void);
85time_t nfs4_lease_time(void);
86void nfs4_reset_lease(time_t leasetime); 85void nfs4_reset_lease(time_t leasetime);
87int nfs4_reset_recoverydir(char *recdir); 86int nfs4_reset_recoverydir(char *recdir);
88#else 87#else
@@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) { return 0; }
90static inline void nfsd4_free_slabs(void) { } 89static inline void nfsd4_free_slabs(void) { }
91static inline int nfs4_state_start(void) { return 0; } 90static inline int nfs4_state_start(void) { return 0; }
92static inline void nfs4_state_shutdown(void) { } 91static inline void nfs4_state_shutdown(void) { }
93static inline time_t nfs4_lease_time(void) { return 0; }
94static inline void nfs4_reset_lease(time_t leasetime) { } 92static inline void nfs4_reset_lease(time_t leasetime) { }
95static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } 93static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
96#endif 94#endif
@@ -229,6 +227,9 @@ extern struct timeval nfssvc_boot;
229 227
230#ifdef CONFIG_NFSD_V4 228#ifdef CONFIG_NFSD_V4
231 229
230extern time_t nfsd4_lease;
231extern time_t nfsd4_grace;
232
232/* before processing a COMPOUND operation, we have to check that there 233/* before processing a COMPOUND operation, we have to check that there
233 * is enough space in the buffer for XDR encode to succeed. otherwise, 234 * is enough space in the buffer for XDR encode to succeed. otherwise,
234 * we might process an operation with side effects, and be unable to 235 * we might process an operation with side effects, and be unable to
@@ -247,7 +248,6 @@ extern struct timeval nfssvc_boot;
247#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ 248#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
248#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ 249#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
249 250
250#define NFSD_LEASE_TIME (nfs4_lease_time())
251#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ 251#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */
252 252
253/* 253/*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 171699eb07c8..06b2a26edfe0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -120,7 +120,7 @@ u32 nfsd_supported_minorversion;
120int nfsd_vers(int vers, enum vers_op change) 120int nfsd_vers(int vers, enum vers_op change)
121{ 121{
122 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 122 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
123 return -1; 123 return 0;
124 switch(change) { 124 switch(change) {
125 case NFSD_SET: 125 case NFSD_SET:
126 nfsd_versions[vers] = nfsd_version[vers]; 126 nfsd_versions[vers] = nfsd_version[vers];
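The one-line nfssvc.c change matters because nfsd_vers() effectively answers "is this version enabled?": returning -1 for an out-of-range version reads as true in a boolean test, while 0 gives the correct "no". A toy illustration of the hazard:

#include <stdbool.h>
#include <stdio.h>

#define MINVERS 2
#define NRVERS  5

static bool versions[NRVERS] = { false, false, true, true, true };

/* Returning 0 (false) for an out-of-range version keeps callers that
 * write `if (vers_ok(n))` honest; the old -1 was truthy there. */
static int vers_ok(int vers)
{
	if (vers < MINVERS || vers >= NRVERS)
		return 0;
	return versions[vers];
}

int main(void)
{
	printf("v3: %d, v9: %d\n", vers_ok(3), vers_ok(9));
	return 0;
}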
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index fefeae27f25e..006c84230c7c 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,16 @@ struct nfsd4_cb_sequence {
70 struct nfs4_client *cbs_clp; 70 struct nfs4_client *cbs_clp;
71}; 71};
72 72
73struct nfs4_rpc_args {
74 void *args_op;
75 struct nfsd4_cb_sequence args_seq;
76};
77
78struct nfsd4_callback {
79 struct nfs4_rpc_args cb_args;
80 struct work_struct cb_work;
81};
82
73struct nfs4_delegation { 83struct nfs4_delegation {
74 struct list_head dl_perfile; 84 struct list_head dl_perfile;
75 struct list_head dl_perclnt; 85 struct list_head dl_perclnt;
@@ -86,6 +96,7 @@ struct nfs4_delegation {
86 stateid_t dl_stateid; 96 stateid_t dl_stateid;
87 struct knfsd_fh dl_fh; 97 struct knfsd_fh dl_fh;
88 int dl_retries; 98 int dl_retries;
99 struct nfsd4_callback dl_recall;
89}; 100};
90 101
91/* client delegation callback info */ 102/* client delegation callback info */
@@ -96,9 +107,7 @@ struct nfs4_cb_conn {
96 u32 cb_prog; 107 u32 cb_prog;
97 u32 cb_minorversion; 108 u32 cb_minorversion;
98 u32 cb_ident; /* minorversion 0 only */ 109 u32 cb_ident; /* minorversion 0 only */
99 /* RPC client info */ 110 struct svc_xprt *cb_xprt; /* minorversion 1 only */
100 atomic_t cb_set; /* successful CB_NULL call */
101 struct rpc_clnt * cb_client;
102}; 111};
103 112
104/* Maximum number of slots per session. 160 is useful for long haul TCP */ 113/* Maximum number of slots per session. 160 is useful for long haul TCP */
@@ -157,7 +166,7 @@ struct nfsd4_session {
157 struct list_head se_hash; /* hash by sessionid */ 166 struct list_head se_hash; /* hash by sessionid */
158 struct list_head se_perclnt; 167 struct list_head se_perclnt;
159 u32 se_flags; 168 u32 se_flags;
160 struct nfs4_client *se_client; /* for expire_client */ 169 struct nfs4_client *se_client;
161 struct nfs4_sessionid se_sessionid; 170 struct nfs4_sessionid se_sessionid;
162 struct nfsd4_channel_attrs se_fchannel; 171 struct nfsd4_channel_attrs se_fchannel;
163 struct nfsd4_channel_attrs se_bchannel; 172 struct nfsd4_channel_attrs se_bchannel;
@@ -212,25 +221,41 @@ struct nfs4_client {
212 struct svc_cred cl_cred; /* setclientid principal */ 221 struct svc_cred cl_cred; /* setclientid principal */
213 clientid_t cl_clientid; /* generated by server */ 222 clientid_t cl_clientid; /* generated by server */
214 nfs4_verifier cl_confirm; /* generated by server */ 223 nfs4_verifier cl_confirm; /* generated by server */
215 struct nfs4_cb_conn cl_cb_conn; /* callback info */
216 atomic_t cl_count; /* ref count */
217 u32 cl_firststate; /* recovery dir creation */ 224 u32 cl_firststate; /* recovery dir creation */
218 225
226 /* for v4.0 and v4.1 callbacks: */
227 struct nfs4_cb_conn cl_cb_conn;
228 struct rpc_clnt *cl_cb_client;
229 atomic_t cl_cb_set;
230
219 /* for nfs41 */ 231 /* for nfs41 */
220 struct list_head cl_sessions; 232 struct list_head cl_sessions;
221 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ 233 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
222 u32 cl_exchange_flags; 234 u32 cl_exchange_flags;
223 struct nfs4_sessionid cl_sessionid; 235 struct nfs4_sessionid cl_sessionid;
236 /* number of rpc's in progress over an associated session: */
237 atomic_t cl_refcount;
224 238
225 /* for nfs41 callbacks */ 239 /* for nfs41 callbacks */
226 /* We currently support a single back channel with a single slot */ 240 /* We currently support a single back channel with a single slot */
227 unsigned long cl_cb_slot_busy; 241 unsigned long cl_cb_slot_busy;
228 u32 cl_cb_seq_nr; 242 u32 cl_cb_seq_nr;
229 struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */
230 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ 243 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
231 /* wait here for slots */ 244 /* wait here for slots */
232}; 245};
233 246
247static inline void
248mark_client_expired(struct nfs4_client *clp)
249{
250 clp->cl_time = 0;
251}
252
253static inline bool
254is_client_expired(struct nfs4_client *clp)
255{
256 return clp->cl_time == 0;
257}
258
234/* struct nfs4_client_reset 259/* struct nfs4_client_reset
235 * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl 260 * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
236 * upon lease reset, or from upcall to state_daemon (to read in state 261 * upon lease reset, or from upcall to state_daemon (to read in state
@@ -377,11 +402,14 @@ extern void nfs4_lock_state(void);
377extern void nfs4_unlock_state(void); 402extern void nfs4_unlock_state(void);
378extern int nfs4_in_grace(void); 403extern int nfs4_in_grace(void);
379extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 404extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
380extern void put_nfs4_client(struct nfs4_client *clp);
381extern void nfs4_free_stateowner(struct kref *kref); 405extern void nfs4_free_stateowner(struct kref *kref);
382extern int set_callback_cred(void); 406extern int set_callback_cred(void);
383extern void nfsd4_probe_callback(struct nfs4_client *clp); 407extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
408extern void nfsd4_do_callback_rpc(struct work_struct *);
384extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 409extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
410extern int nfsd4_create_callback_queue(void);
411extern void nfsd4_destroy_callback_queue(void);
412extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
385extern void nfs4_put_delegation(struct nfs4_delegation *dp); 413extern void nfs4_put_delegation(struct nfs4_delegation *dp);
386extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 414extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
387extern void nfsd4_init_recdir(char *recdir_name); 415extern void nfsd4_init_recdir(char *recdir_name);
@@ -392,6 +420,7 @@ extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
392extern void nfsd4_recdir_purge_old(void); 420extern void nfsd4_recdir_purge_old(void);
393extern int nfsd4_create_clid_dir(struct nfs4_client *clp); 421extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
394extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); 422extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
423extern void release_session_client(struct nfsd4_session *);
395 424
396static inline void 425static inline void
397nfs4_put_stateowner(struct nfs4_stateowner *so) 426nfs4_put_stateowner(struct nfs4_stateowner *so)
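The new mark_client_expired()/is_client_expired() helpers in state.h encode expiry by zeroing cl_time rather than keeping a separate flag, so one field carries both the renewal timestamp and the expired state. A standalone sketch of that sentinel pattern (struct and field names are illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct client {
	time_t last_renew;	/* 0 doubles as the "expired" sentinel */
};

static void mark_expired(struct client *c) { c->last_renew = 0; }
static bool is_expired(const struct client *c) { return c->last_renew == 0; }

int main(void)
{
	struct client c = { .last_renew = time(NULL) };

	mark_expired(&c);
	printf("expired: %d\n", is_expired(&c));
	return 0;
}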
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a11b0e8678ee..ebbf3b6b2457 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -25,6 +25,7 @@
25#include <linux/xattr.h> 25#include <linux/xattr.h>
26#include <linux/jhash.h> 26#include <linux/jhash.h>
27#include <linux/ima.h> 27#include <linux/ima.h>
28#include <linux/slab.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include <linux/exportfs.h> 30#include <linux/exportfs.h>
30#include <linux/writeback.h> 31#include <linux/writeback.h>
@@ -723,7 +724,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
723 struct inode *inode; 724 struct inode *inode;
724 int flags = O_RDONLY|O_LARGEFILE; 725 int flags = O_RDONLY|O_LARGEFILE;
725 __be32 err; 726 __be32 err;
726 int host_err; 727 int host_err = 0;
727 728
728 validate_process_creds(); 729 validate_process_creds();
729 730
@@ -760,7 +761,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
760 * Check to see if there are any leases on this file. 761 * Check to see if there are any leases on this file.
761 * This may block while leases are broken. 762 * This may block while leases are broken.
762 */ 763 */
763 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); 764 if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
765 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
764 if (host_err == -EWOULDBLOCK) 766 if (host_err == -EWOULDBLOCK)
765 host_err = -ETIMEDOUT; 767 host_err = -ETIMEDOUT;
766 if (host_err) /* NOMEM or WOULDBLOCK */ 768 if (host_err) /* NOMEM or WOULDBLOCK */
@@ -997,7 +999,7 @@ static int wait_for_concurrent_writes(struct file *file)
997 999
998 if (inode->i_state & I_DIRTY) { 1000 if (inode->i_state & I_DIRTY) {
999 dprintk("nfsd: write sync %d\n", task_pid_nr(current)); 1001 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
1000 err = vfs_fsync(file, file->f_path.dentry, 0); 1002 err = vfs_fsync(file, 0);
1001 } 1003 }
1002 last_ino = inode->i_ino; 1004 last_ino = inode->i_ino;
1003 last_dev = inode->i_sb->s_dev; 1005 last_dev = inode->i_sb->s_dev;
@@ -1168,12 +1170,12 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1168 goto out; 1170 goto out;
1169 } 1171 }
1170 1172
1171 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); 1173 err = nfsd_open(rqstp, fhp, S_IFREG,
1174 NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
1172 if (err) 1175 if (err)
1173 goto out; 1176 goto out;
1174 if (EX_ISSYNC(fhp->fh_export)) { 1177 if (EX_ISSYNC(fhp->fh_export)) {
1175 int err2 = vfs_fsync_range(file, file->f_path.dentry, 1178 int err2 = vfs_fsync_range(file, offset, end, 0);
1176 offset, end, 0);
1177 1179
1178 if (err2 != -EINVAL) 1180 if (err2 != -EINVAL)
1179 err = nfserrno(err2); 1181 err = nfserrno(err2);
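In vfs.c above, nfsd_open() now skips break_lease() when the caller sets NFSD_MAY_NOT_BREAK_LEASE, which nfsd_commit() does since a COMMIT must not recall delegations; initializing host_err to 0 keeps the skipped branch from testing an uninitialized value. A reduced sketch of the gated step, with stand-in names and a stub in place of break_lease():

#include <stdio.h>

#define MAY_WRITE           0x02
#define MAY_NOT_BREAK_LEASE 0x200	/* mirrors the new bit in vfs.h */

/* Stand-in for break_lease(); always succeeds here. */
static int break_lease_stub(void) { return 0; }

static int open_path(unsigned int access)
{
	int err = 0;	/* must start at 0: the gated call may be skipped */

	if (!(access & MAY_NOT_BREAK_LEASE))
		err = break_lease_stub();
	return err;
}

int main(void)
{
	printf("commit open: %d\n",
	       open_path(MAY_WRITE | MAY_NOT_BREAK_LEASE));
	return 0;
}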
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 4b1de0a9ea75..217a62c2a357 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -20,6 +20,7 @@
20#define NFSD_MAY_OWNER_OVERRIDE 64 20#define NFSD_MAY_OWNER_OVERRIDE 64
21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ 21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
23#define NFSD_MAY_NOT_BREAK_LEASE 512
23 24
24#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) 25#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
25#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) 26#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index efa337739534..4d476ff08ae6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -381,6 +381,10 @@ struct nfsd4_destroy_session {
381 struct nfs4_sessionid sessionid; 381 struct nfs4_sessionid sessionid;
382}; 382};
383 383
384struct nfsd4_reclaim_complete {
385 u32 rca_one_fs;
386};
387
384struct nfsd4_op { 388struct nfsd4_op {
385 int opnum; 389 int opnum;
386 __be32 status; 390 __be32 status;
@@ -421,6 +425,7 @@ struct nfsd4_op {
421 struct nfsd4_create_session create_session; 425 struct nfsd4_create_session create_session;
422 struct nfsd4_destroy_session destroy_session; 426 struct nfsd4_destroy_session destroy_session;
423 struct nfsd4_sequence sequence; 427 struct nfsd4_sequence sequence;
428 struct nfsd4_reclaim_complete reclaim_complete;
424 } u; 429 } u;
425 struct nfs4_replay * replay; 430 struct nfs4_replay * replay;
426}; 431};
@@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
513extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, 518extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
514 struct nfsd4_sequence *seq); 519 struct nfsd4_sequence *seq);
515extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 520extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
516 struct nfsd4_compound_state *, 521 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
517struct nfsd4_exchange_id *); 522extern __be32 nfsd4_create_session(struct svc_rqst *,
518 extern __be32 nfsd4_create_session(struct svc_rqst *,
519 struct nfsd4_compound_state *, 523 struct nfsd4_compound_state *,
520 struct nfsd4_create_session *); 524 struct nfsd4_create_session *);
521extern __be32 nfsd4_sequence(struct svc_rqst *, 525extern __be32 nfsd4_sequence(struct svc_rqst *,
@@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
524extern __be32 nfsd4_destroy_session(struct svc_rqst *, 528extern __be32 nfsd4_destroy_session(struct svc_rqst *,
525 struct nfsd4_compound_state *, 529 struct nfsd4_compound_state *,
526 struct nfsd4_destroy_session *); 530 struct nfsd4_destroy_session *);
531__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
527extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, 532extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
528 struct nfsd4_open *open); 533 struct nfsd4_open *open);
529extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, 534extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 3f959f1879d8..d7fd696e595c 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -26,10 +26,16 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/bitops.h> 28#include <linux/bitops.h>
29#include <linux/slab.h>
29#include "mdt.h" 30#include "mdt.h"
30#include "alloc.h" 31#include "alloc.h"
31 32
32 33
34/**
35 * nilfs_palloc_groups_per_desc_block - get the number of groups that a group
36 * descriptor block can maintain
37 * @inode: inode of metadata file using this allocator
38 */
33static inline unsigned long 39static inline unsigned long
34nilfs_palloc_groups_per_desc_block(const struct inode *inode) 40nilfs_palloc_groups_per_desc_block(const struct inode *inode)
35{ 41{
@@ -37,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode)
37 sizeof(struct nilfs_palloc_group_desc); 43 sizeof(struct nilfs_palloc_group_desc);
38} 44}
39 45
46/**
47 * nilfs_palloc_groups_count - get maximum number of groups
48 * @inode: inode of metadata file using this allocator
49 */
40static inline unsigned long 50static inline unsigned long
41nilfs_palloc_groups_count(const struct inode *inode) 51nilfs_palloc_groups_count(const struct inode *inode)
42{ 52{
43 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); 53 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
44} 54}
45 55
56/**
57 * nilfs_palloc_init_blockgroup - initialize private variables for allocator
58 * @inode: inode of metadata file using this allocator
59 * @entry_size: size of the persistent object
60 */
46int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) 61int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
47{ 62{
48 struct nilfs_mdt_info *mi = NILFS_MDT(inode); 63 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -68,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
68 return 0; 83 return 0;
69} 84}
70 85
86/**
87 * nilfs_palloc_group - get group number and offset from an entry number
88 * @inode: inode of metadata file using this allocator
89 * @nr: serial number of the entry (e.g. inode number)
90 * @offset: pointer to store offset number in the group
91 */
71static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, 92static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
72 unsigned long *offset) 93 unsigned long *offset)
73{ 94{
@@ -77,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
77 return group; 98 return group;
78} 99}
79 100
101/**
102 * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block
103 * @inode: inode of metadata file using this allocator
104 * @group: group number
105 *
106 * nilfs_palloc_desc_blkoff() returns block offset of the descriptor
107 * block which contains a descriptor of the specified group.
108 */
80static unsigned long 109static unsigned long
81nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) 110nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
82{ 111{
@@ -85,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
85 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; 114 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
86} 115}
87 116
117/**
118 * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block
119 * @inode: inode of metadata file using this allocator
120 * @group: group number
121 *
122 * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap
123 * block used to allocate/deallocate entries in the specified group.
124 */
88static unsigned long 125static unsigned long
89nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) 126nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
90{ 127{
@@ -94,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
94 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; 131 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
95} 132}
96 133
134/**
135 * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
136 * @inode: inode of metadata file using this allocator
137 * @group: group number
138 * @desc: pointer to descriptor structure for the group
139 */
97static unsigned long 140static unsigned long
98nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, 141nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
99 const struct nilfs_palloc_group_desc *desc) 142 const struct nilfs_palloc_group_desc *desc)
@@ -106,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
106 return nfree; 149 return nfree;
107} 150}
108 151
152/**
153 * nilfs_palloc_group_desc_add_entries - adjust count of free entries
154 * @inode: inode of metadata file using this allocator
155 * @group: group number
156 * @desc: pointer to descriptor structure for the group
157 * @n: delta to be added
158 */
109static void 159static void
110nilfs_palloc_group_desc_add_entries(struct inode *inode, 160nilfs_palloc_group_desc_add_entries(struct inode *inode,
111 unsigned long group, 161 unsigned long group,
@@ -117,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode,
117 spin_unlock(nilfs_mdt_bgl_lock(inode, group)); 167 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
118} 168}
119 169
170/**
171 * nilfs_palloc_entry_blkoff - get block offset of an entry block
172 * @inode: inode of metadata file using this allocator
173 * @nr: serial number of the entry (e.g. inode number)
174 */
120static unsigned long 175static unsigned long
121nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) 176nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
122{ 177{
@@ -128,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
128 group_offset / NILFS_MDT(inode)->mi_entries_per_block; 183 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
129} 184}
130 185
186/**
187 * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
188 * @inode: inode of metadata file
189 * @bh: buffer head of the buffer to be initialized
190 * @kaddr: kernel address mapped for the page including the buffer
191 */
131static void nilfs_palloc_desc_block_init(struct inode *inode, 192static void nilfs_palloc_desc_block_init(struct inode *inode,
132 struct buffer_head *bh, void *kaddr) 193 struct buffer_head *bh, void *kaddr)
133{ 194{
@@ -178,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
178 return ret; 239 return ret;
179} 240}
180 241
242/**
243 * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
244 * @inode: inode of metadata file using this allocator
245 * @group: group number
246 * @create: create flag
247 * @bhp: pointer to store the resultant buffer head
248 */
181static int nilfs_palloc_get_desc_block(struct inode *inode, 249static int nilfs_palloc_get_desc_block(struct inode *inode,
182 unsigned long group, 250 unsigned long group,
183 int create, struct buffer_head **bhp) 251 int create, struct buffer_head **bhp)
@@ -190,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode,
190 bhp, &cache->prev_desc, &cache->lock); 258 bhp, &cache->prev_desc, &cache->lock);
191} 259}
192 260
261/**
262 * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block
263 * @inode: inode of metadata file using this allocator
264 * @group: group number
265 * @create: create flag
266 * @bhp: pointer to store the resultant buffer head
267 */
193static int nilfs_palloc_get_bitmap_block(struct inode *inode, 268static int nilfs_palloc_get_bitmap_block(struct inode *inode,
194 unsigned long group, 269 unsigned long group,
195 int create, struct buffer_head **bhp) 270 int create, struct buffer_head **bhp)
@@ -202,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
202 &cache->prev_bitmap, &cache->lock); 277 &cache->prev_bitmap, &cache->lock);
203} 278}
204 279
280/**
281 * nilfs_palloc_get_entry_block - get buffer head of an entry block
282 * @inode: inode of metadata file using this allocator
283 * @nr: serial number of the entry (e.g. inode number)
284 * @create: create flag
285 * @bhp: pointer to store the resultant buffer head
286 */
205int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, 287int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
206 int create, struct buffer_head **bhp) 288 int create, struct buffer_head **bhp)
207{ 289{
@@ -213,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
213 &cache->prev_entry, &cache->lock); 295 &cache->prev_entry, &cache->lock);
214} 296}
215 297
298/**
299 * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
300 * @inode: inode of metadata file using this allocator
301 * @group: group number
302 * @bh: buffer head of the buffer storing the group descriptor block
303 * @kaddr: kernel address mapped for the page including the buffer
304 */
216static struct nilfs_palloc_group_desc * 305static struct nilfs_palloc_group_desc *
217nilfs_palloc_block_get_group_desc(const struct inode *inode, 306nilfs_palloc_block_get_group_desc(const struct inode *inode,
218 unsigned long group, 307 unsigned long group,
@@ -222,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
222 group % nilfs_palloc_groups_per_desc_block(inode); 311 group % nilfs_palloc_groups_per_desc_block(inode);
223} 312}
224 313
314/**
315 * nilfs_palloc_block_get_entry - get kernel address of an entry
316 * @inode: inode of metadata file using this allocator
317 * @nr: serial number of the entry (e.g. inode number)
318 * @bh: buffer head of the buffer storing the entry block
319 * @kaddr: kernel address mapped for the page including the buffer
320 */
225void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, 321void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
226 const struct buffer_head *bh, void *kaddr) 322 const struct buffer_head *bh, void *kaddr)
227{ 323{
@@ -234,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
234 entry_offset * NILFS_MDT(inode)->mi_entry_size; 330 entry_offset * NILFS_MDT(inode)->mi_entry_size;
235} 331}
236 332
333/**
334 * nilfs_palloc_find_available_slot - find available slot in a group
335 * @inode: inode of metadata file using this allocator
336 * @group: group number
337 * @target: offset number of an entry in the group (start point)
338 * @bitmap: bitmap of the group
339 * @bsize: size in bits
340 */
237static int nilfs_palloc_find_available_slot(struct inode *inode, 341static int nilfs_palloc_find_available_slot(struct inode *inode,
238 unsigned long group, 342 unsigned long group,
239 unsigned long target, 343 unsigned long target,
240 unsigned char *bitmap, 344 unsigned char *bitmap,
241 int bsize) /* size in bits */ 345 int bsize)
242{ 346{
243 int curr, pos, end, i; 347 int curr, pos, end, i;
244 348
@@ -276,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
276 return -ENOSPC; 380 return -ENOSPC;
277} 381}
278 382
383/**
384 * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups
385 * in a group descriptor block
386 * @inode: inode of metadata file using this allocator
387 * @curr: current group number
388 * @max: maximum number of groups
389 */
279static unsigned long 390static unsigned long
280nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, 391nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
281 unsigned long curr, unsigned long max) 392 unsigned long curr, unsigned long max)
@@ -286,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
286 max - curr + 1); 397 max - curr + 1);
287} 398}
288 399
400/**
401 * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
402 * @inode: inode of metadata file using this allocator
403 * @req: nilfs_palloc_req structure exchanged for the allocation
404 */
289int nilfs_palloc_prepare_alloc_entry(struct inode *inode, 405int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
290 struct nilfs_palloc_req *req) 406 struct nilfs_palloc_req *req)
291{ 407{
@@ -365,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
365 return ret; 481 return ret;
366} 482}
367 483
484/**
485 * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object
486 * @inode: inode of metadata file using this allocator
487 * @req: nilfs_palloc_req structure exchanged for the allocation
488 */
368void nilfs_palloc_commit_alloc_entry(struct inode *inode, 489void nilfs_palloc_commit_alloc_entry(struct inode *inode,
369 struct nilfs_palloc_req *req) 490 struct nilfs_palloc_req *req)
370{ 491{
@@ -376,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode,
376 brelse(req->pr_desc_bh); 497 brelse(req->pr_desc_bh);
377} 498}
378 499
500/**
501 * nilfs_palloc_commit_free_entry - finish deallocating a persistent object
502 * @inode: inode of metadata file using this allocator
503 * @req: nilfs_palloc_req structure exchanged for the removal
504 */
379void nilfs_palloc_commit_free_entry(struct inode *inode, 505void nilfs_palloc_commit_free_entry(struct inode *inode,
380 struct nilfs_palloc_req *req) 506 struct nilfs_palloc_req *req)
381{ 507{
@@ -409,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
409 brelse(req->pr_desc_bh); 535 brelse(req->pr_desc_bh);
410} 536}
411 537
538/**
539 * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object
540 * @inode: inode of metadata file using this allocator
541 * @req: nilfs_palloc_req structure exchanged for the allocation
542 */
412void nilfs_palloc_abort_alloc_entry(struct inode *inode, 543void nilfs_palloc_abort_alloc_entry(struct inode *inode,
413 struct nilfs_palloc_req *req) 544 struct nilfs_palloc_req *req)
414{ 545{
@@ -425,7 +556,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
425 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); 556 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
426 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), 557 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
427 group_offset, bitmap)) 558 group_offset, bitmap))
428 printk(KERN_WARNING "%s: entry numer %llu already freed\n", 559 printk(KERN_WARNING "%s: entry number %llu already freed\n",
429 __func__, (unsigned long long)req->pr_entry_nr); 560 __func__, (unsigned long long)req->pr_entry_nr);
430 561
431 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
@@ -441,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
441 req->pr_desc_bh = NULL; 572 req->pr_desc_bh = NULL;
442} 573}
443 574
575/**
576 * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object
577 * @inode: inode of metadata file using this allocator
578 * @req: nilfs_palloc_req structure exchanged for the removal
579 */
444int nilfs_palloc_prepare_free_entry(struct inode *inode, 580int nilfs_palloc_prepare_free_entry(struct inode *inode,
445 struct nilfs_palloc_req *req) 581 struct nilfs_palloc_req *req)
446{ 582{
@@ -463,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode,
463 return 0; 599 return 0;
464} 600}
465 601
602/**
603 * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object
604 * @inode: inode of metadata file using this allocator
605 * @req: nilfs_palloc_req structure exchanged for the removal
606 */
466void nilfs_palloc_abort_free_entry(struct inode *inode, 607void nilfs_palloc_abort_free_entry(struct inode *inode,
467 struct nilfs_palloc_req *req) 608 struct nilfs_palloc_req *req)
468{ 609{
@@ -474,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
474 req->pr_desc_bh = NULL; 615 req->pr_desc_bh = NULL;
475} 616}
476 617
618/**
619 * nilfs_palloc_group_is_in - judge if an entry is in a group
620 * @inode: inode of metadata file using this allocator
621 * @group: group number
622 * @nr: serial number of the entry (e.g. inode number)
623 */
477static int 624static int
478nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) 625nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
479{ 626{
@@ -484,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
484 return (nr >= first) && (nr <= last); 631 return (nr >= first) && (nr <= last);
485} 632}
486 633
634/**
635 * nilfs_palloc_freev - deallocate a set of persistent objects
636 * @inode: inode of metadata file using this allocator
637 * @entry_nrs: array of entry numbers to be deallocated
638 * @nitems: number of entries stored in @entry_nrs
639 */
487int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) 640int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
488{ 641{
489 struct buffer_head *desc_bh, *bitmap_bh; 642 struct buffer_head *desc_bh, *bitmap_bh;
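The kernel-doc added above describes how nilfs2's persistent-object allocator maps an entry number to a (group, offset) pair, where one bitmap block serves one group, so a group holds block-size * 8 entries. A sketch of that arithmetic, assuming a 4 KiB block (4096 * 8 = 32768 entries per group); the constants are assumptions for illustration:

#include <stdio.h>

#define BLOCK_SIZE        4096UL
#define ENTRIES_PER_GROUP (BLOCK_SIZE * 8)	/* one bitmap bit per entry */

/* Split an entry number (e.g. an inode number) into a group number
 * and an in-group offset, as nilfs_palloc_group() does. */
static unsigned long palloc_group(unsigned long long nr, unsigned long *offset)
{
	*offset = nr % ENTRIES_PER_GROUP;
	return nr / ENTRIES_PER_GROUP;
}

int main(void)
{
	unsigned long off;
	unsigned long grp = palloc_group(100000ULL, &off);

	printf("entry 100000 -> group %lu, offset %lu\n", grp, off);
	return 0;
}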
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 5cccf874d692..9af34a7e6e13 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -29,6 +29,13 @@
29#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
30#include <linux/fs.h> 30#include <linux/fs.h>
31 31
32/**
33 * nilfs_palloc_entries_per_group - get the number of entries per group
34 * @inode: inode of metadata file using this allocator
35 *
36 * The number of entries per group is defined by the number of bits
37 * that a bitmap block can maintain.
38 */
32static inline unsigned long 39static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode) 40nilfs_palloc_entries_per_group(const struct inode *inode)
34{ 41{
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 471e269536ae..447ce47a3306 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -27,6 +27,7 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/gfp.h>
30#include "nilfs.h" 31#include "nilfs.h"
31#include "mdt.h" 32#include "mdt.h"
32#include "dat.h" 33#include "dat.h"
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 7cdd98b8d514..b27a342c5af6 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,63 +31,16 @@
31#include "alloc.h" 31#include "alloc.h"
32#include "dat.h" 32#include "dat.h"
33 33
34/** 34static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
35 * struct nilfs_btree_path - A path on which B-tree operations are executed
36 * @bp_bh: buffer head of node block
37 * @bp_sib_bh: buffer head of sibling node block
38 * @bp_index: index of child node
39 * @bp_oldreq: ptr end request for old ptr
40 * @bp_newreq: ptr alloc request for new ptr
41 * @bp_op: rebalance operation
42 */
43struct nilfs_btree_path {
44 struct buffer_head *bp_bh;
45 struct buffer_head *bp_sib_bh;
46 int bp_index;
47 union nilfs_bmap_ptr_req bp_oldreq;
48 union nilfs_bmap_ptr_req bp_newreq;
49 struct nilfs_btnode_chkey_ctxt bp_ctxt;
50 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
51 int, __u64 *, __u64 *);
52};
53
54/*
55 * B-tree path operations
56 */
57
58static struct kmem_cache *nilfs_btree_path_cache;
59
60int __init nilfs_btree_path_cache_init(void)
61{
62 nilfs_btree_path_cache =
63 kmem_cache_create("nilfs2_btree_path_cache",
64 sizeof(struct nilfs_btree_path) *
65 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
66 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
67}
68
69void nilfs_btree_path_cache_destroy(void)
70{
71 kmem_cache_destroy(nilfs_btree_path_cache);
72}
73
74static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
75{
76 return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
77}
78
79static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
80{ 35{
81 kmem_cache_free(nilfs_btree_path_cache, path); 36 struct nilfs_btree_path *path;
82} 37 int level = NILFS_BTREE_LEVEL_DATA;
83 38
84static void nilfs_btree_init_path(struct nilfs_btree_path *path) 39 path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
85{ 40 if (path == NULL)
86 int level; 41 goto out;
87 42
88 for (level = NILFS_BTREE_LEVEL_DATA; 43 for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
89 level < NILFS_BTREE_LEVEL_MAX;
90 level++) {
91 path[level].bp_bh = NULL; 44 path[level].bp_bh = NULL;
92 path[level].bp_sib_bh = NULL; 45 path[level].bp_sib_bh = NULL;
93 path[level].bp_index = 0; 46 path[level].bp_index = 0;
@@ -95,15 +48,19 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
95 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; 48 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
96 path[level].bp_op = NULL; 49 path[level].bp_op = NULL;
97 } 50 }
51
52out:
53 return path;
98} 54}
99 55
100static void nilfs_btree_release_path(struct nilfs_btree_path *path) 56static void nilfs_btree_free_path(struct nilfs_btree_path *path)
101{ 57{
102 int level; 58 int level = NILFS_BTREE_LEVEL_DATA;
103 59
104 for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; 60 for (; level < NILFS_BTREE_LEVEL_MAX; level++)
105 level++)
106 brelse(path[level].bp_bh); 61 brelse(path[level].bp_bh);
62
63 kmem_cache_free(nilfs_btree_path_cache, path);
107} 64}
108 65
109/* 66/*
@@ -566,14 +523,12 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
566 path = nilfs_btree_alloc_path(); 523 path = nilfs_btree_alloc_path();
567 if (path == NULL) 524 if (path == NULL)
568 return -ENOMEM; 525 return -ENOMEM;
569 nilfs_btree_init_path(path);
570 526
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 527 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
572 528
573 if (ptrp != NULL) 529 if (ptrp != NULL)
574 *ptrp = ptr; 530 *ptrp = ptr;
575 531
576 nilfs_btree_release_path(path);
577 nilfs_btree_free_path(path); 532 nilfs_btree_free_path(path);
578 533
579 return ret; 534 return ret;
@@ -594,7 +549,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
594 path = nilfs_btree_alloc_path(); 549 path = nilfs_btree_alloc_path();
595 if (path == NULL) 550 if (path == NULL)
596 return -ENOMEM; 551 return -ENOMEM;
597 nilfs_btree_init_path(path); 552
598 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 553 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
599 if (ret < 0) 554 if (ret < 0)
600 goto out; 555 goto out;
@@ -655,7 +610,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
655 *ptrp = ptr; 610 *ptrp = ptr;
656 ret = cnt; 611 ret = cnt;
657 out: 612 out:
658 nilfs_btree_release_path(path);
659 nilfs_btree_free_path(path); 613 nilfs_btree_free_path(path);
660 return ret; 614 return ret;
661} 615}
@@ -1123,7 +1077,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1123 path = nilfs_btree_alloc_path(); 1077 path = nilfs_btree_alloc_path();
1124 if (path == NULL) 1078 if (path == NULL)
1125 return -ENOMEM; 1079 return -ENOMEM;
1126 nilfs_btree_init_path(path);
1127 1080
1128 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1081 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1129 NILFS_BTREE_LEVEL_NODE_MIN); 1082 NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1140,7 +1093,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1140 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1093 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1141 1094
1142 out: 1095 out:
1143 nilfs_btree_release_path(path);
1144 nilfs_btree_free_path(path); 1096 nilfs_btree_free_path(path);
1145 return ret; 1097 return ret;
1146} 1098}
@@ -1456,7 +1408,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1456 path = nilfs_btree_alloc_path(); 1408 path = nilfs_btree_alloc_path();
1457 if (path == NULL) 1409 if (path == NULL)
1458 return -ENOMEM; 1410 return -ENOMEM;
1459 nilfs_btree_init_path(path); 1411
1460 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1412 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1461 NILFS_BTREE_LEVEL_NODE_MIN); 1413 NILFS_BTREE_LEVEL_NODE_MIN);
1462 if (ret < 0) 1414 if (ret < 0)
@@ -1473,7 +1425,6 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1473 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); 1425 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1474 1426
1475out: 1427out:
1476 nilfs_btree_release_path(path);
1477 nilfs_btree_free_path(path); 1428 nilfs_btree_free_path(path);
1478 return ret; 1429 return ret;
1479} 1430}
@@ -1488,11 +1439,9 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1488 path = nilfs_btree_alloc_path(); 1439 path = nilfs_btree_alloc_path();
1489 if (path == NULL) 1440 if (path == NULL)
1490 return -ENOMEM; 1441 return -ENOMEM;
1491 nilfs_btree_init_path(path);
1492 1442
1493 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); 1443 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1494 1444
1495 nilfs_btree_release_path(path);
1496 nilfs_btree_free_path(path); 1445 nilfs_btree_free_path(path);
1497 1446
1498 return ret; 1447 return ret;
@@ -1879,7 +1828,7 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1879 struct nilfs_btree_path *path, 1828 struct nilfs_btree_path *path,
1880 int level, struct buffer_head *bh) 1829 int level, struct buffer_head *bh)
1881{ 1830{
1882 int maxlevel, ret; 1831 int maxlevel = 0, ret;
1883 struct nilfs_btree_node *parent; 1832 struct nilfs_btree_node *parent;
1884 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1833 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
1885 __u64 ptr; 1834 __u64 ptr;
@@ -1923,7 +1872,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1923 path = nilfs_btree_alloc_path(); 1872 path = nilfs_btree_alloc_path();
1924 if (path == NULL) 1873 if (path == NULL)
1925 return -ENOMEM; 1874 return -ENOMEM;
1926 nilfs_btree_init_path(path);
1927 1875
1928 if (buffer_nilfs_node(bh)) { 1876 if (buffer_nilfs_node(bh)) {
1929 node = (struct nilfs_btree_node *)bh->b_data; 1877 node = (struct nilfs_btree_node *)bh->b_data;
@@ -1947,7 +1895,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1947 nilfs_btree_propagate_p(btree, path, level, bh); 1895 nilfs_btree_propagate_p(btree, path, level, bh);
1948 1896
1949 out: 1897 out:
1950 nilfs_btree_release_path(path);
1951 nilfs_btree_free_path(path); 1898 nilfs_btree_free_path(path);
1952 1899
1953 return ret; 1900 return ret;
@@ -2108,7 +2055,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2108 path = nilfs_btree_alloc_path(); 2055 path = nilfs_btree_alloc_path();
2109 if (path == NULL) 2056 if (path == NULL)
2110 return -ENOMEM; 2057 return -ENOMEM;
2111 nilfs_btree_init_path(path);
2112 2058
2113 if (buffer_nilfs_node(*bh)) { 2059 if (buffer_nilfs_node(*bh)) {
2114 node = (struct nilfs_btree_node *)(*bh)->b_data; 2060 node = (struct nilfs_btree_node *)(*bh)->b_data;
@@ -2130,7 +2076,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2130 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2076 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2131 2077
2132 out: 2078 out:
2133 nilfs_btree_release_path(path);
2134 nilfs_btree_free_path(path); 2079 nilfs_btree_free_path(path);
2135 2080
2136 return ret; 2081 return ret;
@@ -2175,7 +2120,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2175 path = nilfs_btree_alloc_path(); 2120 path = nilfs_btree_alloc_path();
2176 if (path == NULL) 2121 if (path == NULL)
2177 return -ENOMEM; 2122 return -ENOMEM;
2178 nilfs_btree_init_path(path);
2179 2123
2180 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2124 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2181 if (ret < 0) { 2125 if (ret < 0) {
@@ -2195,7 +2139,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2195 nilfs_bmap_set_dirty(&btree->bt_bmap); 2139 nilfs_bmap_set_dirty(&btree->bt_bmap);
2196 2140
2197 out: 2141 out:
2198 nilfs_btree_release_path(path);
2199 nilfs_btree_free_path(path); 2142 nilfs_btree_free_path(path);
2200 return ret; 2143 return ret;
2201} 2144}
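The btree.c rework above folds path initialization into nilfs_btree_alloc_path() and buffer release (brelse) into nilfs_btree_free_path(), deleting the separate init/release calls at every call site. A user-space sketch of the alloc-initializes / free-releases pairing (types and names are stand-ins):

#include <stdio.h>
#include <stdlib.h>

#define LEVEL_MAX 14

struct path_node {
	void *bh;	/* stand-in for a buffer_head reference */
	int   index;
};

/* Allocation and initialization in one step: callers can no longer
 * forget the init pass, which is the bug class the patch removes. */
static struct path_node *path_alloc(void)
{
	return calloc(LEVEL_MAX, sizeof(struct path_node));
}

/* Free releases per-level resources before the container itself,
 * the role brelse() plays in the kernel code. */
static void path_free(struct path_node *p)
{
	for (int i = 0; i < LEVEL_MAX; i++)
		free(p[i].bh);
	free(p);
}

int main(void)
{
	struct path_node *p = path_alloc();

	if (p)
		path_free(p);
	return 0;
}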
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4b82d84ade75..af638d59e3bf 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -30,9 +30,6 @@
30#include "btnode.h" 30#include "btnode.h"
31#include "bmap.h" 31#include "bmap.h"
32 32
33struct nilfs_btree;
34struct nilfs_btree_path;
35
36/** 33/**
37 * struct nilfs_btree - B-tree structure 34 * struct nilfs_btree - B-tree structure
38 * @bt_bmap: bmap base structure 35 * @bt_bmap: bmap base structure
@@ -41,6 +38,25 @@ struct nilfs_btree {
41 struct nilfs_bmap bt_bmap; 38 struct nilfs_bmap bt_bmap;
42}; 39};
43 40
41/**
42 * struct nilfs_btree_path - A path on which B-tree operations are executed
43 * @bp_bh: buffer head of node block
44 * @bp_sib_bh: buffer head of sibling node block
45 * @bp_index: index of child node
46 * @bp_oldreq: ptr end request for old ptr
47 * @bp_newreq: ptr alloc request for new ptr
48 * @bp_op: rebalance operation
49 */
50struct nilfs_btree_path {
51 struct buffer_head *bp_bh;
52 struct buffer_head *bp_sib_bh;
53 int bp_index;
54 union nilfs_bmap_ptr_req bp_oldreq;
55 union nilfs_bmap_ptr_req bp_newreq;
56 struct nilfs_btnode_chkey_ctxt bp_ctxt;
57 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
58 int, __u64 *, __u64 *);
59};
44 60
45#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE 61#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE
46#define NILFS_BTREE_ROOT_NCHILDREN_MAX \ 62#define NILFS_BTREE_ROOT_NCHILDREN_MAX \
@@ -57,6 +73,7 @@ struct nilfs_btree {
57#define NILFS_BTREE_KEY_MIN ((__u64)0) 73#define NILFS_BTREE_KEY_MIN ((__u64)0)
58#define NILFS_BTREE_KEY_MAX (~(__u64)0) 74#define NILFS_BTREE_KEY_MAX (~(__u64)0)
59 75
76extern struct kmem_cache *nilfs_btree_path_cache;
60 77
61int nilfs_btree_path_cache_init(void); 78int nilfs_btree_path_cache_init(void);
62void nilfs_btree_path_cache_destroy(void); 79void nilfs_btree_path_cache_destroy(void);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df443ce..c9a30d7ff6fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
27#include "nilfs.h" 27#include "nilfs.h"
28#include "segment.h" 28#include "segment.h"
29 29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 30int nilfs_sync_file(struct file *file, int datasync)
31{ 31{
32 /* 32 /*
33 * Called from fsync() system call 33 * Called from fsync() system call
@@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
37 * This function should be implemented when the writeback function 37 * This function should be implemented when the writeback function
38 * will be implemented. 38 * will be implemented.
39 */ 39 */
40 struct inode *inode = dentry->d_inode; 40 struct inode *inode = file->f_mapping->host;
41 int err; 41 int err;
42 42
43 if (!nilfs_inode_dirty(inode)) 43 if (!nilfs_inode_dirty(inode))
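The nilfs_sync_file() change tracks the VFS ->fsync prototype: the dentry argument is dropped and the inode is taken from file->f_mapping->host, the same shape as the vfs_fsync(file, 0) call earlier in this series. A toy model of deriving the object from the handle alone, so a caller cannot pass a mismatched pair:

#include <stdio.h>

struct inode   { long ino; };
struct mapping { struct inode *host; };
struct file    { struct mapping *f_mapping; };

/* New-style fsync hook: one handle in, inode derived from it. */
static int sync_file(struct file *f, int datasync)
{
	struct inode *inode = f->f_mapping->host;

	printf("sync ino %ld (datasync=%d)\n", inode->ino, datasync);
	return 0;
}

int main(void)
{
	struct inode i = { .ino = 42 };
	struct mapping m = { .host = &i };
	struct file f = { .f_mapping = &m };

	return sync_file(&f, 0);
}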
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 8880a9e281e7..145f03cd7d3e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -45,6 +45,7 @@
45#include <linux/buffer_head.h> 45#include <linux/buffer_head.h>
46#include <linux/mpage.h> 46#include <linux/mpage.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/slab.h>
48#include <linux/swap.h> 49#include <linux/swap.h>
49#include "nilfs.h" 50#include "nilfs.h"
50#include "page.h" 51#include "page.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7868cc122ac7..39e038ac8fcb 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
25#include <linux/gfp.h>
25#include <linux/mpage.h> 26#include <linux/mpage.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27#include <linux/uio.h> 28#include <linux/uio.h>
@@ -279,16 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
279 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 280 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
280 281
281 atomic_inc(&sbi->s_inodes_count); 282 atomic_inc(&sbi->s_inodes_count);
282 283 inode_init_owner(inode, dir, mode);
283 inode->i_uid = current_fsuid();
284 if (dir->i_mode & S_ISGID) {
285 inode->i_gid = dir->i_gid;
286 if (S_ISDIR(mode))
287 mode |= S_ISGID;
288 } else
289 inode->i_gid = current_fsgid();
290
291 inode->i_mode = mode;
292 inode->i_ino = ino; 284 inode->i_ino = ino;
293 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 285 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
294 286
@@ -450,7 +442,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
450 inode->i_op = &nilfs_special_inode_operations; 442 inode->i_op = &nilfs_special_inode_operations;
451 init_special_inode( 443 init_special_inode(
452 inode, inode->i_mode, 444 inode, inode->i_mode,
453 new_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 445 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
454 } 446 }
455 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 447 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
456 brelse(bh); 448 brelse(bh);
@@ -510,7 +502,7 @@ void nilfs_write_inode_common(struct inode *inode,
510 nilfs_bmap_write(ii->i_bmap, raw_inode); 502 nilfs_bmap_write(ii->i_bmap, raw_inode);
511 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 503 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
512 raw_inode->i_device_code = 504 raw_inode->i_device_code =
513 cpu_to_le64(new_encode_dev(inode->i_rdev)); 505 cpu_to_le64(huge_encode_dev(inode->i_rdev));
514 /* When extending inode, nilfs->ns_inode_size should be checked 506 /* When extending inode, nilfs->ns_inode_size should be checked
515 for substitutions of appended fields */ 507 for substitutions of appended fields */
516} 508}
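The nine deleted lines in nilfs_new_inode() (fsuid ownership, gid inheritance from a setgid directory, and setgid propagation to new directories) are exactly what the shared inode_init_owner() helper performs; the same file also switches to huge_{en,de}code_dev() for wider device numbers. A user-space model of the consolidated ownership logic, with illustrative types and constants:

#include <stdbool.h>
#include <stdio.h>

#define S_IFMT  0170000
#define S_IFDIR 0040000
#define S_ISGID 0002000

struct mini_inode { unsigned uid, gid, mode; };

static bool is_dir(unsigned mode) { return (mode & S_IFMT) == S_IFDIR; }

/* The logic the patch deletes, gathered into one helper; this models
 * what inode_init_owner() provides to all filesystems. */
static void init_owner(struct mini_inode *inode,
		       const struct mini_inode *dir,
		       unsigned mode, unsigned fsuid, unsigned fsgid)
{
	inode->uid = fsuid;
	if (dir->mode & S_ISGID) {
		inode->gid = dir->gid;
		if (is_dir(mode))
			mode |= S_ISGID;	/* directories inherit setgid */
	} else {
		inode->gid = fsgid;
	}
	inode->mode = mode;
}

int main(void)
{
	struct mini_inode dir = { .uid = 0, .gid = 100,
				  .mode = S_IFDIR | S_ISGID };
	struct mini_inode child;

	init_owner(&child, &dir, S_IFDIR, 1000, 1000);
	printf("gid=%u setgid=%d\n", child.gid, !!(child.mode & S_ISGID));
	return 0;
}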
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 313d0a21da48..f90a33d9a5b0 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -23,6 +23,7 @@
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */ 25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/slab.h>
26#include <linux/capability.h> /* capable() */ 27#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 28#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
@@ -648,7 +649,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
648long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 649long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
649{ 650{
650 struct inode *inode = filp->f_dentry->d_inode; 651 struct inode *inode = filp->f_dentry->d_inode;
651 void __user *argp = (void * __user *)arg; 652 void __user *argp = (void __user *)arg;
652 653
653 switch (cmd) { 654 switch (cmd) {
654 case NILFS_IOCTL_CHANGE_CPMODE: 655 case NILFS_IOCTL_CHANGE_CPMODE:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 06713ffcc7f2..024be8c35bb6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -26,6 +26,7 @@
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/swap.h> 28#include <linux/swap.h>
29#include <linux/slab.h>
29#include "nilfs.h" 30#include "nilfs.h"
30#include "segment.h" 31#include "segment.h"
31#include "page.h" 32#include "page.h"
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8723e5bfd071..47d6d7928122 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
228 struct page *, struct inode *); 228 struct page *, struct inode *);
229 229
230/* file.c */ 230/* file.c */
231extern int nilfs_sync_file(struct file *, struct dentry *, int); 231extern int nilfs_sync_file(struct file *, int);
232 232
233/* ioctl.c */ 233/* ioctl.c */
234long nilfs_ioctl(struct file *, unsigned int, unsigned long); 234long nilfs_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index fc246dba112a..8de3e1e48130 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -29,6 +29,7 @@
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/pagevec.h> 31#include <linux/pagevec.h>
32#include <linux/gfp.h>
32#include "nilfs.h" 33#include "nilfs.h"
33#include "page.h" 34#include "page.h"
34#include "mdt.h" 35#include "mdt.h"
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 017bedc761a0..bae2a516b4ee 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -23,6 +23,7 @@
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/swap.h> 25#include <linux/swap.h>
26#include <linux/slab.h>
26#include <linux/crc32.h> 27#include <linux/crc32.h>
27#include "nilfs.h" 28#include "nilfs.h"
28#include "segment.h" 29#include "segment.h"
@@ -104,6 +105,8 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
104 105
105 ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize); 106 ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
106 ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi); 107 ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
108
109 /* need to verify ->ss_bytes field if read ->ss_cno */
107} 110}
108 111
109/** 112/**
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 636eaafd6ea2..2e6a2723b8fa 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -25,6 +25,7 @@
25#include <linux/writeback.h> 25#include <linux/writeback.h>
26#include <linux/crc32.h> 26#include <linux/crc32.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/slab.h>
28#include "page.h" 29#include "page.h"
29#include "segbuf.h" 30#include "segbuf.h"
30 31
@@ -39,35 +40,10 @@ struct nilfs_write_info {
39 sector_t blocknr; 40 sector_t blocknr;
40}; 41};
41 42
42
43static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, 43static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
44 struct the_nilfs *nilfs); 44 struct the_nilfs *nilfs);
45static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); 45static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
46 46
47
48static struct kmem_cache *nilfs_segbuf_cachep;
49
50static void nilfs_segbuf_init_once(void *obj)
51{
52 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
53}
54
55int __init nilfs_init_segbuf_cache(void)
56{
57 nilfs_segbuf_cachep =
58 kmem_cache_create("nilfs2_segbuf_cache",
59 sizeof(struct nilfs_segment_buffer),
60 0, SLAB_RECLAIM_ACCOUNT,
61 nilfs_segbuf_init_once);
62
63 return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
64}
65
66void nilfs_destroy_segbuf_cache(void)
67{
68 kmem_cache_destroy(nilfs_segbuf_cachep);
69}
70
71struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) 47struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
72{ 48{
73 struct nilfs_segment_buffer *segbuf; 49 struct nilfs_segment_buffer *segbuf;
@@ -80,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
80 INIT_LIST_HEAD(&segbuf->sb_list); 56 INIT_LIST_HEAD(&segbuf->sb_list);
81 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); 57 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
82 INIT_LIST_HEAD(&segbuf->sb_payload_buffers); 58 INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
59 segbuf->sb_super_root = NULL;
83 60
84 init_completion(&segbuf->sb_bio_event); 61 init_completion(&segbuf->sb_bio_event);
85 atomic_set(&segbuf->sb_err, 0); 62 atomic_set(&segbuf->sb_err, 0);
@@ -157,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
157} 134}
158 135
159int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, 136int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
160 time_t ctime) 137 time_t ctime, __u64 cno)
161{ 138{
162 int err; 139 int err;
163 140
@@ -170,6 +147,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
170 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); 147 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
171 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; 148 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
172 segbuf->sb_sum.ctime = ctime; 149 segbuf->sb_sum.ctime = ctime;
150 segbuf->sb_sum.cno = cno;
173 return 0; 151 return 0;
174} 152}
175 153
@@ -195,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
195 raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); 173 raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
196 raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); 174 raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
197 raw_sum->ss_pad = 0; 175 raw_sum->ss_pad = 0;
176 raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno);
198} 177}
199 178
200/* 179/*
201 * CRC calculation routines 180 * CRC calculation routines
202 */ 181 */
203void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, 182static void
204 u32 seed) 183nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed)
205{ 184{
206 struct buffer_head *bh; 185 struct buffer_head *bh;
207 struct nilfs_segment_summary *raw_sum; 186 struct nilfs_segment_summary *raw_sum;
@@ -228,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
228 raw_sum->ss_sumsum = cpu_to_le32(crc); 207 raw_sum->ss_sumsum = cpu_to_le32(crc);
229} 208}
230 209
231void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, 210static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
232 u32 seed) 211 u32 seed)
233{ 212{
234 struct buffer_head *bh; 213 struct buffer_head *bh;
235 struct nilfs_segment_summary *raw_sum; 214 struct nilfs_segment_summary *raw_sum;
@@ -255,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
255 raw_sum->ss_datasum = cpu_to_le32(crc); 234 raw_sum->ss_datasum = cpu_to_le32(crc);
256} 235}
257 236
237static void
238nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
239 u32 seed)
240{
241 struct nilfs_super_root *raw_sr;
242 u32 crc;
243
244 raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
245 crc = crc32_le(seed,
246 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
247 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
248 raw_sr->sr_sum = cpu_to_le32(crc);
249}
250
258static void nilfs_release_buffers(struct list_head *list) 251static void nilfs_release_buffers(struct list_head *list)
259{ 252{
260 struct buffer_head *bh, *n; 253 struct buffer_head *bh, *n;
@@ -281,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
281{ 274{
282 nilfs_release_buffers(&segbuf->sb_segsum_buffers); 275 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
283 nilfs_release_buffers(&segbuf->sb_payload_buffers); 276 nilfs_release_buffers(&segbuf->sb_payload_buffers);
277 segbuf->sb_super_root = NULL;
284} 278}
285 279
286/* 280/*
@@ -323,14 +317,31 @@ int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
323int nilfs_wait_on_logs(struct list_head *logs) 317int nilfs_wait_on_logs(struct list_head *logs)
324{ 318{
325 struct nilfs_segment_buffer *segbuf; 319 struct nilfs_segment_buffer *segbuf;
326 int err; 320 int err, ret = 0;
327 321
328 list_for_each_entry(segbuf, logs, sb_list) { 322 list_for_each_entry(segbuf, logs, sb_list) {
329 err = nilfs_segbuf_wait(segbuf); 323 err = nilfs_segbuf_wait(segbuf);
330 if (err) 324 if (err && !ret)
331 return err; 325 ret = err;
326 }
327 return ret;
328}
329
330/**
331 * nilfs_add_checksums_on_logs - add checksums on the logs
332 * @logs: list of segment buffers storing target logs
333 * @seed: checksum seed value
334 */
335void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
336{
337 struct nilfs_segment_buffer *segbuf;
338
339 list_for_each_entry(segbuf, logs, sb_list) {
340 if (segbuf->sb_super_root)
341 nilfs_segbuf_fill_in_super_root_crc(segbuf, seed);
342 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
343 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
332 } 344 }
333 return 0;
334} 345}
335 346
336/* 347/*
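
Two patterns in the segbuf.c hunks above are worth spelling out. First, nilfs_wait_on_logs() no longer bails out on the first failed log; it waits on every log and reports only the first error, presumably so that no in-flight I/O is left unwaited when one write fails. A sketch of that shape, with invented names:

#include <linux/list.h>

struct item {
        struct list_head list;
};

static int item_wait(struct item *it);  /* assumed per-item wait */

/* Wait on every item; remember the first failure, skip none. */
static int wait_on_all(struct list_head *items)
{
        struct item *it;
        int err, ret = 0;

        list_for_each_entry(it, items, list) {
                err = item_wait(it);
                if (err && !ret)
                        ret = err;      /* keep the first failure only */
        }
        return ret;
}

Second, nilfs_segbuf_fill_in_super_root_crc() checksums everything in the super root except the checksum field itself, which is why the crc32_le() call starts sizeof(raw_sr->sr_sum) bytes into the block and covers NILFS_SR_BYTES minus that prefix.
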
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 94dfd3517bc0..fdf1c3b6d673 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -37,6 +37,7 @@
37 * @sumbytes: Byte count of segment summary 37 * @sumbytes: Byte count of segment summary
38 * @nfileblk: Total number of file blocks 38 * @nfileblk: Total number of file blocks
39 * @seg_seq: Segment sequence number 39 * @seg_seq: Segment sequence number
40 * @cno: Checkpoint number
40 * @ctime: Creation time 41 * @ctime: Creation time
41 * @next: Block number of the next full segment 42 * @next: Block number of the next full segment
42 */ 43 */
@@ -48,6 +49,7 @@ struct nilfs_segsum_info {
48 unsigned long sumbytes; 49 unsigned long sumbytes;
49 unsigned long nfileblk; 50 unsigned long nfileblk;
50 u64 seg_seq; 51 u64 seg_seq;
52 __u64 cno;
51 time_t ctime; 53 time_t ctime;
52 sector_t next; 54 sector_t next;
53}; 55};
@@ -76,6 +78,7 @@ struct nilfs_segsum_info {
76 * @sb_rest_blocks: Number of residual blocks in the current segment 78 * @sb_rest_blocks: Number of residual blocks in the current segment
77 * @sb_segsum_buffers: List of buffers for segment summaries 79 * @sb_segsum_buffers: List of buffers for segment summaries
78 * @sb_payload_buffers: List of buffers for segment payload 80 * @sb_payload_buffers: List of buffers for segment payload
81 * @sb_super_root: Pointer to buffer storing a super root block (if exists)
79 * @sb_nbio: Number of flying bio requests 82 * @sb_nbio: Number of flying bio requests
80 * @sb_err: I/O error status 83 * @sb_err: I/O error status
81 * @sb_bio_event: Completion event of log writing 84 * @sb_bio_event: Completion event of log writing
@@ -95,6 +98,7 @@ struct nilfs_segment_buffer {
95 /* Buffers */ 98 /* Buffers */
96 struct list_head sb_segsum_buffers; 99 struct list_head sb_segsum_buffers;
97 struct list_head sb_payload_buffers; /* including super root */ 100 struct list_head sb_payload_buffers; /* including super root */
101 struct buffer_head *sb_super_root;
98 102
99 /* io status */ 103 /* io status */
100 int sb_nbio; 104 int sb_nbio;
@@ -121,6 +125,7 @@ struct nilfs_segment_buffer {
121 b_assoc_buffers)) 125 b_assoc_buffers))
122#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) 126#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
123 127
128extern struct kmem_cache *nilfs_segbuf_cachep;
124 129
125int __init nilfs_init_segbuf_cache(void); 130int __init nilfs_init_segbuf_cache(void);
126void nilfs_destroy_segbuf_cache(void); 131void nilfs_destroy_segbuf_cache(void);
@@ -132,13 +137,11 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
132 struct nilfs_segment_buffer *prev); 137 struct nilfs_segment_buffer *prev);
133void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, 138void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
134 struct the_nilfs *); 139 struct the_nilfs *);
135int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); 140int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
136int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); 141int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
137int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, 142int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
138 struct buffer_head **); 143 struct buffer_head **);
139void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); 144void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
140void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
141void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
142 145
143static inline void 146static inline void
144nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, 147nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
@@ -171,6 +174,7 @@ void nilfs_truncate_logs(struct list_head *logs,
171 struct nilfs_segment_buffer *last); 174 struct nilfs_segment_buffer *last);
172int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs); 175int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
173int nilfs_wait_on_logs(struct list_head *logs); 176int nilfs_wait_on_logs(struct list_head *logs);
177void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed);
174 178
175static inline void nilfs_destroy_logs(struct list_head *logs) 179static inline void nilfs_destroy_logs(struct list_head *logs)
176{ 180{
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 69576a95e13f..c9201649cc49 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -32,6 +32,7 @@
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/pagevec.h> 34#include <linux/pagevec.h>
35#include <linux/slab.h>
35#include "nilfs.h" 36#include "nilfs.h"
36#include "btnode.h" 37#include "btnode.h"
37#include "page.h" 38#include "page.h"
@@ -115,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
115#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) 116#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
116#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) 117#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
117 118
118/*
119 * Transaction
120 */
121static struct kmem_cache *nilfs_transaction_cachep;
122
123/**
124 * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
125 *
126 * nilfs_init_transaction_cache() creates a slab cache for the struct
127 * nilfs_transaction_info.
128 *
129 * Return Value: On success, it returns 0. On error, one of the following
130 * negative error code is returned.
131 *
132 * %-ENOMEM - Insufficient memory available.
133 */
134int nilfs_init_transaction_cache(void)
135{
136 nilfs_transaction_cachep =
137 kmem_cache_create("nilfs2_transaction_cache",
138 sizeof(struct nilfs_transaction_info),
139 0, SLAB_RECLAIM_ACCOUNT, NULL);
140 return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
141}
142
143/**
144 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info.
148 */
149void nilfs_destroy_transaction_cache(void)
150{
151 kmem_cache_destroy(nilfs_transaction_cachep);
152}
153
154static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) 119static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
155{ 120{
156 struct nilfs_transaction_info *cur_ti = current->journal_info; 121 struct nilfs_transaction_info *cur_ti = current->journal_info;
@@ -401,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
401 366
402 if (nilfs_doing_gc()) 367 if (nilfs_doing_gc())
403 flags = NILFS_SS_GC; 368 flags = NILFS_SS_GC;
404 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime); 369 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
370 sci->sc_sbi->s_nilfs->ns_cno);
405 if (unlikely(err)) 371 if (unlikely(err))
406 return err; 372 return err;
407 373
@@ -434,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
434 return err; 400 return err;
435 segbuf = sci->sc_curseg; 401 segbuf = sci->sc_curseg;
436 } 402 }
437 err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root); 403 err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
438 if (likely(!err)) 404 if (likely(!err))
439 segbuf->sb_sum.flags |= NILFS_SS_SR; 405 segbuf->sb_sum.flags |= NILFS_SS_SR;
440 return err; 406 return err;
@@ -598,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
598 *vblocknr = binfo->bi_v.bi_vblocknr; 564 *vblocknr = binfo->bi_v.bi_vblocknr;
599} 565}
600 566
601struct nilfs_sc_operations nilfs_sc_file_ops = { 567static struct nilfs_sc_operations nilfs_sc_file_ops = {
602 .collect_data = nilfs_collect_file_data, 568 .collect_data = nilfs_collect_file_data,
603 .collect_node = nilfs_collect_file_node, 569 .collect_node = nilfs_collect_file_node,
604 .collect_bmap = nilfs_collect_file_bmap, 570 .collect_bmap = nilfs_collect_file_bmap,
@@ -648,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
648 *binfo_dat = binfo->bi_dat; 614 *binfo_dat = binfo->bi_dat;
649} 615}
650 616
651struct nilfs_sc_operations nilfs_sc_dat_ops = { 617static struct nilfs_sc_operations nilfs_sc_dat_ops = {
652 .collect_data = nilfs_collect_dat_data, 618 .collect_data = nilfs_collect_dat_data,
653 .collect_node = nilfs_collect_file_node, 619 .collect_node = nilfs_collect_file_node,
654 .collect_bmap = nilfs_collect_dat_bmap, 620 .collect_bmap = nilfs_collect_dat_bmap,
@@ -656,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = {
656 .write_node_binfo = nilfs_write_dat_node_binfo, 622 .write_node_binfo = nilfs_write_dat_node_binfo,
657}; 623};
658 624
659struct nilfs_sc_operations nilfs_sc_dsync_ops = { 625static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
660 .collect_data = nilfs_collect_file_data, 626 .collect_data = nilfs_collect_file_data,
661 .collect_node = NULL, 627 .collect_node = NULL,
662 .collect_bmap = NULL, 628 .collect_bmap = NULL,
@@ -931,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
931 } 897 }
932} 898}
933 899
934/*
935 * CRC calculation routines
936 */
937static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
938{
939 struct nilfs_super_root *raw_sr =
940 (struct nilfs_super_root *)bh_sr->b_data;
941 u32 crc;
942
943 crc = crc32_le(seed,
944 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
945 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
946 raw_sr->sr_sum = cpu_to_le32(crc);
947}
948
949static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
950 u32 seed)
951{
952 struct nilfs_segment_buffer *segbuf;
953
954 if (sci->sc_super_root)
955 nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
956
957 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
958 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
959 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
960 }
961}
962
963static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, 900static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
964 struct the_nilfs *nilfs) 901 struct the_nilfs *nilfs)
965{ 902{
966 struct buffer_head *bh_sr = sci->sc_super_root; 903 struct buffer_head *bh_sr;
967 struct nilfs_super_root *raw_sr = 904 struct nilfs_super_root *raw_sr;
968 (struct nilfs_super_root *)bh_sr->b_data;
969 unsigned isz = nilfs->ns_inode_size; 905 unsigned isz = nilfs->ns_inode_size;
970 906
907 bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
908 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
909
971 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); 910 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
972 raw_sr->sr_nongc_ctime 911 raw_sr->sr_nongc_ctime
973 = cpu_to_le64(nilfs_doing_gc() ? 912 = cpu_to_le64(nilfs_doing_gc() ?
@@ -1490,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1490 1429
1491 /* Collection retry loop */ 1430 /* Collection retry loop */
1492 for (;;) { 1431 for (;;) {
1493 sci->sc_super_root = NULL;
1494 sci->sc_nblk_this_inc = 0; 1432 sci->sc_nblk_this_inc = 0;
1495 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); 1433 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1496 1434
@@ -1510,6 +1448,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1510 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) 1448 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1511 break; 1449 break;
1512 1450
1451 nilfs_clear_logs(&sci->sc_segbufs);
1452
1453 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1454 if (unlikely(err))
1455 return err;
1456
1513 if (sci->sc_stage.flags & NILFS_CF_SUFREED) { 1457 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1514 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, 1458 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1515 sci->sc_freesegs, 1459 sci->sc_freesegs,
@@ -1517,12 +1461,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1517 NULL); 1461 NULL);
1518 WARN_ON(err); /* do not happen */ 1462 WARN_ON(err); /* do not happen */
1519 } 1463 }
1520 nilfs_clear_logs(&sci->sc_segbufs);
1521
1522 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1523 if (unlikely(err))
1524 return err;
1525
1526 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); 1464 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1527 sci->sc_stage = prev_stage; 1465 sci->sc_stage = prev_stage;
1528 } 1466 }
@@ -1567,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1567 ssp.offset = sizeof(struct nilfs_segment_summary); 1505 ssp.offset = sizeof(struct nilfs_segment_summary);
1568 1506
1569 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { 1507 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
1570 if (bh == sci->sc_super_root) 1508 if (bh == segbuf->sb_super_root)
1571 break; 1509 break;
1572 if (!finfo) { 1510 if (!finfo) {
1573 finfo = nilfs_segctor_map_segsum_entry( 1511 finfo = nilfs_segctor_map_segsum_entry(
@@ -1728,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1728 1666
1729 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1667 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1730 b_assoc_buffers) { 1668 b_assoc_buffers) {
1731 if (bh == sci->sc_super_root) { 1669 if (bh == segbuf->sb_super_root) {
1732 if (bh->b_page != bd_page) { 1670 if (bh->b_page != bd_page) {
1733 lock_page(bd_page); 1671 lock_page(bd_page);
1734 clear_page_dirty_for_io(bd_page); 1672 clear_page_dirty_for_io(bd_page);
@@ -1847,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1847} 1785}
1848 1786
1849static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, 1787static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
1850 struct buffer_head *bh_sr, int err) 1788 int err)
1851{ 1789{
1852 struct nilfs_segment_buffer *segbuf; 1790 struct nilfs_segment_buffer *segbuf;
1853 struct page *bd_page = NULL, *fs_page = NULL; 1791 struct page *bd_page = NULL, *fs_page = NULL;
@@ -1868,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
1868 1806
1869 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1807 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1870 b_assoc_buffers) { 1808 b_assoc_buffers) {
1871 if (bh == bh_sr) { 1809 if (bh == segbuf->sb_super_root) {
1872 if (bh->b_page != bd_page) { 1810 if (bh->b_page != bd_page) {
1873 end_page_writeback(bd_page); 1811 end_page_writeback(bd_page);
1874 bd_page = bh->b_page; 1812 bd_page = bh->b_page;
@@ -1897,8 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1897 1835
1898 list_splice_tail_init(&sci->sc_write_logs, &logs); 1836 list_splice_tail_init(&sci->sc_write_logs, &logs);
1899 ret = nilfs_wait_on_logs(&logs); 1837 ret = nilfs_wait_on_logs(&logs);
1900 if (ret) 1838 nilfs_abort_logs(&logs, NULL, ret ? : err);
1901 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
1902 1839
1903 list_splice_tail_init(&sci->sc_segbufs, &logs); 1840 list_splice_tail_init(&sci->sc_segbufs, &logs);
1904 nilfs_cancel_segusage(&logs, nilfs->ns_sufile); 1841 nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1914,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1914 } 1851 }
1915 1852
1916 nilfs_destroy_logs(&logs); 1853 nilfs_destroy_logs(&logs);
1917 sci->sc_super_root = NULL;
1918} 1854}
1919 1855
1920static void nilfs_set_next_segment(struct the_nilfs *nilfs, 1856static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1933,7 +1869,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1933 struct nilfs_segment_buffer *segbuf; 1869 struct nilfs_segment_buffer *segbuf;
1934 struct page *bd_page = NULL, *fs_page = NULL; 1870 struct page *bd_page = NULL, *fs_page = NULL;
1935 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 1871 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
1936 int update_sr = (sci->sc_super_root != NULL); 1872 int update_sr = false;
1937 1873
1938 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1874 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
1939 struct buffer_head *bh; 1875 struct buffer_head *bh;
@@ -1964,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1964 set_buffer_uptodate(bh); 1900 set_buffer_uptodate(bh);
1965 clear_buffer_dirty(bh); 1901 clear_buffer_dirty(bh);
1966 clear_buffer_nilfs_volatile(bh); 1902 clear_buffer_nilfs_volatile(bh);
1967 if (bh == sci->sc_super_root) { 1903 if (bh == segbuf->sb_super_root) {
1968 if (bh->b_page != bd_page) { 1904 if (bh->b_page != bd_page) {
1969 end_page_writeback(bd_page); 1905 end_page_writeback(bd_page);
1970 bd_page = bh->b_page; 1906 bd_page = bh->b_page;
1971 } 1907 }
1908 update_sr = true;
1972 break; 1909 break;
1973 } 1910 }
1974 if (bh->b_page != fs_page) { 1911 if (bh->b_page != fs_page) {
@@ -2115,7 +2052,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2115 struct nilfs_sb_info *sbi = sci->sc_sbi; 2052 struct nilfs_sb_info *sbi = sci->sc_sbi;
2116 struct the_nilfs *nilfs = sbi->s_nilfs; 2053 struct the_nilfs *nilfs = sbi->s_nilfs;
2117 struct page *failed_page; 2054 struct page *failed_page;
2118 int err, has_sr = 0; 2055 int err;
2119 2056
2120 sci->sc_stage.scnt = NILFS_ST_INIT; 2057 sci->sc_stage.scnt = NILFS_ST_INIT;
2121 2058
@@ -2143,8 +2080,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2143 if (unlikely(err)) 2080 if (unlikely(err))
2144 goto failed; 2081 goto failed;
2145 2082
2146 has_sr = (sci->sc_super_root != NULL);
2147
2148 /* Avoid empty segment */ 2083 /* Avoid empty segment */
2149 if (sci->sc_stage.scnt == NILFS_ST_DONE && 2084 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2150 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { 2085 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
@@ -2159,7 +2094,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2159 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) 2094 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2160 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); 2095 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
2161 2096
2162 if (has_sr) { 2097 if (mode == SC_LSEG_SR &&
2098 sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
2163 err = nilfs_segctor_fill_in_checkpoint(sci); 2099 err = nilfs_segctor_fill_in_checkpoint(sci);
2164 if (unlikely(err)) 2100 if (unlikely(err))
2165 goto failed_to_write; 2101 goto failed_to_write;
@@ -2171,11 +2107,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2171 /* Write partial segments */ 2107 /* Write partial segments */
2172 err = nilfs_segctor_prepare_write(sci, &failed_page); 2108 err = nilfs_segctor_prepare_write(sci, &failed_page);
2173 if (err) { 2109 if (err) {
2174 nilfs_abort_logs(&sci->sc_segbufs, failed_page, 2110 nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
2175 sci->sc_super_root, err);
2176 goto failed_to_write; 2111 goto failed_to_write;
2177 } 2112 }
2178 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); 2113
2114 nilfs_add_checksums_on_logs(&sci->sc_segbufs,
2115 nilfs->ns_crc_seed);
2179 2116
2180 err = nilfs_segctor_write(sci, nilfs); 2117 err = nilfs_segctor_write(sci, nilfs);
2181 if (unlikely(err)) 2118 if (unlikely(err))
@@ -2196,8 +2133,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2196 } 2133 }
2197 } while (sci->sc_stage.scnt != NILFS_ST_DONE); 2134 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2198 2135
2199 sci->sc_super_root = NULL;
2200
2201 out: 2136 out:
2202 nilfs_segctor_check_out_files(sci, sbi); 2137 nilfs_segctor_check_out_files(sci, sbi);
2203 return err; 2138 return err;
@@ -2224,9 +2159,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2224static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) 2159static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
2225{ 2160{
2226 spin_lock(&sci->sc_state_lock); 2161 spin_lock(&sci->sc_state_lock);
2227 if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { 2162 if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
2228 sci->sc_timer->expires = jiffies + sci->sc_interval; 2163 sci->sc_timer.expires = jiffies + sci->sc_interval;
2229 add_timer(sci->sc_timer); 2164 add_timer(&sci->sc_timer);
2230 sci->sc_state |= NILFS_SEGCTOR_COMMIT; 2165 sci->sc_state |= NILFS_SEGCTOR_COMMIT;
2231 } 2166 }
2232 spin_unlock(&sci->sc_state_lock); 2167 spin_unlock(&sci->sc_state_lock);
@@ -2431,9 +2366,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
2431 spin_lock(&sci->sc_state_lock); 2366 spin_lock(&sci->sc_state_lock);
2432 sci->sc_seq_accepted = sci->sc_seq_request; 2367 sci->sc_seq_accepted = sci->sc_seq_request;
2433 spin_unlock(&sci->sc_state_lock); 2368 spin_unlock(&sci->sc_state_lock);
2434 2369 del_timer_sync(&sci->sc_timer);
2435 if (sci->sc_timer)
2436 del_timer_sync(sci->sc_timer);
2437} 2370}
2438 2371
2439/** 2372/**
@@ -2459,9 +2392,9 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2459 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2392 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2460 2393
2461 /* re-enable timer if checkpoint creation was not done */ 2394 /* re-enable timer if checkpoint creation was not done */
2462 if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2395 if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2463 time_before(jiffies, sci->sc_timer->expires)) 2396 time_before(jiffies, sci->sc_timer.expires))
2464 add_timer(sci->sc_timer); 2397 add_timer(&sci->sc_timer);
2465 } 2398 }
2466 spin_unlock(&sci->sc_state_lock); 2399 spin_unlock(&sci->sc_state_lock);
2467} 2400}
@@ -2640,13 +2573,10 @@ static int nilfs_segctor_thread(void *arg)
2640{ 2573{
2641 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2574 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2642 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 2575 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2643 struct timer_list timer;
2644 int timeout = 0; 2576 int timeout = 0;
2645 2577
2646 init_timer(&timer); 2578 sci->sc_timer.data = (unsigned long)current;
2647 timer.data = (unsigned long)current; 2579 sci->sc_timer.function = nilfs_construction_timeout;
2648 timer.function = nilfs_construction_timeout;
2649 sci->sc_timer = &timer;
2650 2580
2651 /* start sync. */ 2581 /* start sync. */
2652 sci->sc_task = current; 2582 sci->sc_task = current;
@@ -2695,7 +2625,7 @@ static int nilfs_segctor_thread(void *arg)
2695 should_sleep = 0; 2625 should_sleep = 0;
2696 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) 2626 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2697 should_sleep = time_before(jiffies, 2627 should_sleep = time_before(jiffies,
2698 sci->sc_timer->expires); 2628 sci->sc_timer.expires);
2699 2629
2700 if (should_sleep) { 2630 if (should_sleep) {
2701 spin_unlock(&sci->sc_state_lock); 2631 spin_unlock(&sci->sc_state_lock);
@@ -2704,7 +2634,7 @@ static int nilfs_segctor_thread(void *arg)
2704 } 2634 }
2705 finish_wait(&sci->sc_wait_daemon, &wait); 2635 finish_wait(&sci->sc_wait_daemon, &wait);
2706 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2636 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2707 time_after_eq(jiffies, sci->sc_timer->expires)); 2637 time_after_eq(jiffies, sci->sc_timer.expires));
2708 2638
2709 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) 2639 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2710 set_nilfs_discontinued(nilfs); 2640 set_nilfs_discontinued(nilfs);
@@ -2713,8 +2643,6 @@ static int nilfs_segctor_thread(void *arg)
2713 2643
2714 end_thread: 2644 end_thread:
2715 spin_unlock(&sci->sc_state_lock); 2645 spin_unlock(&sci->sc_state_lock);
2716 del_timer_sync(sci->sc_timer);
2717 sci->sc_timer = NULL;
2718 2646
2719 /* end sync. */ 2647 /* end sync. */
2720 sci->sc_task = NULL; 2648 sci->sc_task = NULL;
@@ -2750,13 +2678,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2750 } 2678 }
2751} 2679}
2752 2680
2753static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2754{
2755 sci->sc_seq_done = sci->sc_seq_request;
2756
2757 return nilfs_segctor_start_thread(sci);
2758}
2759
2760/* 2681/*
2761 * Setup & clean-up functions 2682 * Setup & clean-up functions
2762 */ 2683 */
@@ -2780,6 +2701,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2780 INIT_LIST_HEAD(&sci->sc_write_logs); 2701 INIT_LIST_HEAD(&sci->sc_write_logs);
2781 INIT_LIST_HEAD(&sci->sc_gc_inodes); 2702 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2782 INIT_LIST_HEAD(&sci->sc_copied_buffers); 2703 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2704 init_timer(&sci->sc_timer);
2783 2705
2784 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; 2706 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2785 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; 2707 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
@@ -2846,6 +2768,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2846 2768
2847 down_write(&sbi->s_nilfs->ns_segctor_sem); 2769 down_write(&sbi->s_nilfs->ns_segctor_sem);
2848 2770
2771 del_timer_sync(&sci->sc_timer);
2849 kfree(sci); 2772 kfree(sci);
2850} 2773}
2851 2774
@@ -2880,7 +2803,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2880 return -ENOMEM; 2803 return -ENOMEM;
2881 2804
2882 nilfs_attach_writer(nilfs, sbi); 2805 nilfs_attach_writer(nilfs, sbi);
2883 err = nilfs_segctor_init(NILFS_SC(sbi)); 2806 err = nilfs_segctor_start_thread(NILFS_SC(sbi));
2884 if (err) { 2807 if (err) {
2885 nilfs_detach_writer(nilfs, sbi); 2808 nilfs_detach_writer(nilfs, sbi);
2886 kfree(sbi->s_sc_info); 2809 kfree(sbi->s_sc_info);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 82dfd6a686b9..dca142361ccf 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -100,7 +100,6 @@ struct nilfs_segsum_pointer {
100 * @sc_write_logs: List of segment buffers to hold logs under writing 100 * @sc_write_logs: List of segment buffers to hold logs under writing
101 * @sc_segbuf_nblocks: Number of available blocks in segment buffers. 101 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
102 * @sc_curseg: Current segment buffer 102 * @sc_curseg: Current segment buffer
103 * @sc_super_root: Pointer to the super root buffer
104 * @sc_stage: Collection stage 103 * @sc_stage: Collection stage
105 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary 104 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
106 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary 105 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
@@ -148,7 +147,6 @@ struct nilfs_sc_info {
148 struct list_head sc_write_logs; 147 struct list_head sc_write_logs;
149 unsigned long sc_segbuf_nblocks; 148 unsigned long sc_segbuf_nblocks;
150 struct nilfs_segment_buffer *sc_curseg; 149 struct nilfs_segment_buffer *sc_curseg;
151 struct buffer_head *sc_super_root;
152 150
153 struct nilfs_cstage sc_stage; 151 struct nilfs_cstage sc_stage;
154 152
@@ -179,7 +177,7 @@ struct nilfs_sc_info {
179 unsigned long sc_lseg_stime; /* in 1/HZ seconds */ 177 unsigned long sc_lseg_stime; /* in 1/HZ seconds */
180 unsigned long sc_watermark; 178 unsigned long sc_watermark;
181 179
182 struct timer_list *sc_timer; 180 struct timer_list sc_timer;
183 struct task_struct *sc_task; 181 struct task_struct *sc_task;
184}; 182};
185 183
@@ -219,6 +217,8 @@ enum {
219 */ 217 */
220#define NILFS_SC_DEFAULT_WATERMARK 3600 218#define NILFS_SC_DEFAULT_WATERMARK 3600
221 219
220/* super.c */
221extern struct kmem_cache *nilfs_transaction_cachep;
222 222
223/* segment.c */ 223/* segment.c */
224extern int nilfs_init_transaction_cache(void); 224extern int nilfs_init_transaction_cache(void);
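
The segment.h change from "struct timer_list *sc_timer" to an embedded "struct timer_list sc_timer" matches the segment.c hunks above: the timer used to live on the segctor thread's stack, so every user had to NULL-check the pointer; embedding it in nilfs_sc_info ties its lifetime to the structure and lets nilfs_segctor_destroy() simply del_timer_sync() it. A generic sketch of the embedded-timer pattern with the timer API of this kernel era, names invented:

#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/sched.h>

struct worker {
        struct timer_list timer;
        struct task_struct *task;       /* woken on timeout */
};

static void worker_timeout(unsigned long data)
{
        wake_up_process((struct task_struct *)data);
}

static void worker_setup(struct worker *w)
{
        init_timer(&w->timer);
        w->timer.data = (unsigned long)w->task;
        w->timer.function = worker_timeout;
}

static void worker_arm(struct worker *w, unsigned long delay)
{
        w->timer.expires = jiffies + delay;
        add_timer(&w->timer);
}
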
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0cdbc5e7655a..03b34b738993 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,6 +67,11 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
67 "(NILFS)"); 67 "(NILFS)");
68MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
69 69
70struct kmem_cache *nilfs_inode_cachep;
71struct kmem_cache *nilfs_transaction_cachep;
72struct kmem_cache *nilfs_segbuf_cachep;
73struct kmem_cache *nilfs_btree_path_cache;
74
70static int nilfs_remount(struct super_block *sb, int *flags, char *data); 75static int nilfs_remount(struct super_block *sb, int *flags, char *data);
71 76
72/** 77/**
@@ -129,7 +134,6 @@ void nilfs_warning(struct super_block *sb, const char *function,
129 va_end(args); 134 va_end(args);
130} 135}
131 136
132static struct kmem_cache *nilfs_inode_cachep;
133 137
134struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) 138struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
135{ 139{
@@ -155,34 +159,6 @@ void nilfs_destroy_inode(struct inode *inode)
155 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 159 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
156} 160}
157 161
158static void init_once(void *obj)
159{
160 struct nilfs_inode_info *ii = obj;
161
162 INIT_LIST_HEAD(&ii->i_dirty);
163#ifdef CONFIG_NILFS_XATTR
164 init_rwsem(&ii->xattr_sem);
165#endif
166 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
167 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
168 inode_init_once(&ii->vfs_inode);
169}
170
171static int nilfs_init_inode_cache(void)
172{
173 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
174 sizeof(struct nilfs_inode_info),
175 0, SLAB_RECLAIM_ACCOUNT,
176 init_once);
177
178 return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
179}
180
181static inline void nilfs_destroy_inode_cache(void)
182{
183 kmem_cache_destroy(nilfs_inode_cachep);
184}
185
186static void nilfs_clear_inode(struct inode *inode) 162static void nilfs_clear_inode(struct inode *inode)
187{ 163{
188 struct nilfs_inode_info *ii = NILFS_I(inode); 164 struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -266,8 +242,8 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
266 int err; 242 int err;
267 243
268 /* nilfs->sem must be locked by the caller. */ 244 /* nilfs->sem must be locked by the caller. */
269 if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) { 245 if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
270 if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC) 246 if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC))
271 nilfs_swap_super_block(nilfs); 247 nilfs_swap_super_block(nilfs);
272 else { 248 else {
273 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", 249 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
@@ -470,10 +446,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
470 if (nilfs_test_opt(sbi, SNAPSHOT)) 446 if (nilfs_test_opt(sbi, SNAPSHOT))
471 seq_printf(seq, ",cp=%llu", 447 seq_printf(seq, ",cp=%llu",
472 (unsigned long long int)sbi->s_snapshot_cno); 448 (unsigned long long int)sbi->s_snapshot_cno);
473 if (nilfs_test_opt(sbi, ERRORS_RO))
474 seq_printf(seq, ",errors=remount-ro");
475 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 449 if (nilfs_test_opt(sbi, ERRORS_PANIC))
476 seq_printf(seq, ",errors=panic"); 450 seq_printf(seq, ",errors=panic");
451 if (nilfs_test_opt(sbi, ERRORS_CONT))
452 seq_printf(seq, ",errors=continue");
477 if (nilfs_test_opt(sbi, STRICT_ORDER)) 453 if (nilfs_test_opt(sbi, STRICT_ORDER))
478 seq_printf(seq, ",order=strict"); 454 seq_printf(seq, ",order=strict");
479 if (nilfs_test_opt(sbi, NORECOVERY)) 455 if (nilfs_test_opt(sbi, NORECOVERY))
@@ -631,7 +607,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
631 struct nilfs_super_block *sbp) 607 struct nilfs_super_block *sbp)
632{ 608{
633 sbi->s_mount_opt = 609 sbi->s_mount_opt =
634 NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER; 610 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
635} 611}
636 612
637static int nilfs_setup_super(struct nilfs_sb_info *sbi) 613static int nilfs_setup_super(struct nilfs_sb_info *sbi)
@@ -749,6 +725,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
749 sb->s_export_op = &nilfs_export_ops; 725 sb->s_export_op = &nilfs_export_ops;
750 sb->s_root = NULL; 726 sb->s_root = NULL;
751 sb->s_time_gran = 1; 727 sb->s_time_gran = 1;
728 sb->s_bdi = nilfs->ns_bdi;
752 729
753 err = load_nilfs(nilfs, sbi); 730 err = load_nilfs(nilfs, sbi);
754 if (err) 731 if (err)
@@ -777,9 +754,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
777 goto failed_sbi; 754 goto failed_sbi;
778 } 755 }
779 cno = sbi->s_snapshot_cno; 756 cno = sbi->s_snapshot_cno;
780 } else 757 }
781 /* Read-only mount */
782 sbi->s_snapshot_cno = cno;
783 } 758 }
784 759
785 err = nilfs_attach_checkpoint(sbi, cno); 760 err = nilfs_attach_checkpoint(sbi, cno);
@@ -848,7 +823,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
848 struct the_nilfs *nilfs = sbi->s_nilfs; 823 struct the_nilfs *nilfs = sbi->s_nilfs;
849 unsigned long old_sb_flags; 824 unsigned long old_sb_flags;
850 struct nilfs_mount_options old_opts; 825 struct nilfs_mount_options old_opts;
851 int err; 826 int was_snapshot, err;
852 827
853 lock_kernel(); 828 lock_kernel();
854 829
@@ -856,6 +831,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
856 old_sb_flags = sb->s_flags; 831 old_sb_flags = sb->s_flags;
857 old_opts.mount_opt = sbi->s_mount_opt; 832 old_opts.mount_opt = sbi->s_mount_opt;
858 old_opts.snapshot_cno = sbi->s_snapshot_cno; 833 old_opts.snapshot_cno = sbi->s_snapshot_cno;
834 was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
859 835
860 if (!parse_options(data, sb)) { 836 if (!parse_options(data, sb)) {
861 err = -EINVAL; 837 err = -EINVAL;
@@ -863,20 +839,32 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
863 } 839 }
864 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 840 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
865 841
866 if ((*flags & MS_RDONLY) && 842 err = -EINVAL;
867 sbi->s_snapshot_cno != old_opts.snapshot_cno) { 843 if (was_snapshot) {
868 printk(KERN_WARNING "NILFS (device %s): couldn't " 844 if (!(*flags & MS_RDONLY)) {
869 "remount to a different snapshot.\n", 845 printk(KERN_ERR "NILFS (device %s): cannot remount "
870 sb->s_id); 846 "snapshot read/write.\n",
871 err = -EINVAL; 847 sb->s_id);
872 goto restore_opts; 848 goto restore_opts;
849 } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
850 printk(KERN_ERR "NILFS (device %s): cannot "
851 "remount to a different snapshot.\n",
852 sb->s_id);
853 goto restore_opts;
854 }
855 } else {
856 if (nilfs_test_opt(sbi, SNAPSHOT)) {
857 printk(KERN_ERR "NILFS (device %s): cannot change "
858 "a regular mount to a snapshot.\n",
859 sb->s_id);
860 goto restore_opts;
861 }
873 } 862 }
874 863
875 if (!nilfs_valid_fs(nilfs)) { 864 if (!nilfs_valid_fs(nilfs)) {
876 printk(KERN_WARNING "NILFS (device %s): couldn't " 865 printk(KERN_WARNING "NILFS (device %s): couldn't "
877 "remount because the filesystem is in an " 866 "remount because the filesystem is in an "
878 "incomplete recovery state.\n", sb->s_id); 867 "incomplete recovery state.\n", sb->s_id);
879 err = -EINVAL;
880 goto restore_opts; 868 goto restore_opts;
881 } 869 }
882 870
@@ -887,9 +875,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
887 nilfs_detach_segment_constructor(sbi); 875 nilfs_detach_segment_constructor(sbi);
888 sb->s_flags |= MS_RDONLY; 876 sb->s_flags |= MS_RDONLY;
889 877
890 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
891 /* nilfs_set_opt(sbi, SNAPSHOT); */
892
893 /* 878 /*
894 * Remounting a valid RW partition RDONLY, so set 879 * Remounting a valid RW partition RDONLY, so set
895 * the RDONLY flag and then mark the partition as valid again. 880 * the RDONLY flag and then mark the partition as valid again.
@@ -908,24 +893,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
908 * store the current valid flag. (It may have been changed 893 * store the current valid flag. (It may have been changed
909 * by fsck since we originally mounted the partition.) 894 * by fsck since we originally mounted the partition.)
910 */ 895 */
911 if (nilfs->ns_current && nilfs->ns_current != sbi) {
912 printk(KERN_WARNING "NILFS (device %s): couldn't "
913 "remount because an RW-mount exists.\n",
914 sb->s_id);
915 err = -EBUSY;
916 goto restore_opts;
917 }
918 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
919 printk(KERN_WARNING "NILFS (device %s): couldn't "
920 "remount because the current RO-mount is not "
921 "the latest one.\n",
922 sb->s_id);
923 err = -EINVAL;
924 goto restore_opts;
925 }
926 sb->s_flags &= ~MS_RDONLY; 896 sb->s_flags &= ~MS_RDONLY;
927 nilfs_clear_opt(sbi, SNAPSHOT);
928 sbi->s_snapshot_cno = 0;
929 897
930 err = nilfs_attach_segment_constructor(sbi); 898 err = nilfs_attach_segment_constructor(sbi);
931 if (err) 899 if (err)
@@ -934,8 +902,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
934 down_write(&nilfs->ns_sem); 902 down_write(&nilfs->ns_sem);
935 nilfs_setup_super(sbi); 903 nilfs_setup_super(sbi);
936 up_write(&nilfs->ns_sem); 904 up_write(&nilfs->ns_sem);
937
938 nilfs->ns_current = sbi;
939 } 905 }
940 out: 906 out:
941 up_write(&nilfs->ns_super_sem); 907 up_write(&nilfs->ns_super_sem);
@@ -1021,10 +987,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1021{ 987{
1022 struct nilfs_super_data sd; 988 struct nilfs_super_data sd;
1023 struct super_block *s; 989 struct super_block *s;
990 fmode_t mode = FMODE_READ;
1024 struct the_nilfs *nilfs; 991 struct the_nilfs *nilfs;
1025 int err, need_to_close = 1; 992 int err, need_to_close = 1;
1026 993
1027 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); 994 if (!(flags & MS_RDONLY))
995 mode |= FMODE_WRITE;
996
997 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
1028 if (IS_ERR(sd.bdev)) 998 if (IS_ERR(sd.bdev))
1029 return PTR_ERR(sd.bdev); 999 return PTR_ERR(sd.bdev);
1030 1000
@@ -1091,10 +1061,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1091 1061
1092 /* New superblock instance created */ 1062 /* New superblock instance created */
1093 s->s_flags = flags; 1063 s->s_flags = flags;
1064 s->s_mode = mode;
1094 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1065 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1095 sb_set_blocksize(s, block_size(sd.bdev)); 1066 sb_set_blocksize(s, block_size(sd.bdev));
1096 1067
1097 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); 1068 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
1069 nilfs);
1098 if (err) 1070 if (err)
1099 goto cancel_new; 1071 goto cancel_new;
1100 1072
@@ -1105,7 +1077,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1105 mutex_unlock(&nilfs->ns_mount_mutex); 1077 mutex_unlock(&nilfs->ns_mount_mutex);
1106 put_nilfs(nilfs); 1078 put_nilfs(nilfs);
1107 if (need_to_close) 1079 if (need_to_close)
1108 close_bdev_exclusive(sd.bdev, flags); 1080 close_bdev_exclusive(sd.bdev, mode);
1109 simple_set_mnt(mnt, s); 1081 simple_set_mnt(mnt, s);
1110 return 0; 1082 return 0;
1111 1083
@@ -1113,7 +1085,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1113 mutex_unlock(&nilfs->ns_mount_mutex); 1085 mutex_unlock(&nilfs->ns_mount_mutex);
1114 put_nilfs(nilfs); 1086 put_nilfs(nilfs);
1115 failed: 1087 failed:
1116 close_bdev_exclusive(sd.bdev, flags); 1088 close_bdev_exclusive(sd.bdev, mode);
1117 1089
1118 return err; 1090 return err;
1119 1091
@@ -1123,7 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1123 put_nilfs(nilfs); 1095 put_nilfs(nilfs);
1124 deactivate_locked_super(s); 1096 deactivate_locked_super(s);
1125 /* 1097 /*
1126 * deactivate_super() invokes close_bdev_exclusive(). 1098 * deactivate_locked_super() invokes close_bdev_exclusive().
1127 * We must finish all post-cleaning before this call; 1099 * We must finish all post-cleaning before this call;
1128 * put_nilfs() needs the block device. 1100 * put_nilfs() needs the block device.
1129 */ 1101 */
@@ -1138,54 +1110,93 @@ struct file_system_type nilfs_fs_type = {
1138 .fs_flags = FS_REQUIRES_DEV, 1110 .fs_flags = FS_REQUIRES_DEV,
1139}; 1111};
1140 1112
1141static int __init init_nilfs_fs(void) 1113static void nilfs_inode_init_once(void *obj)
1142{ 1114{
1143 int err; 1115 struct nilfs_inode_info *ii = obj;
1144
1145 err = nilfs_init_inode_cache();
1146 if (err)
1147 goto failed;
1148 1116
1149 err = nilfs_init_transaction_cache(); 1117 INIT_LIST_HEAD(&ii->i_dirty);
1150 if (err) 1118#ifdef CONFIG_NILFS_XATTR
1151 goto failed_inode_cache; 1119 init_rwsem(&ii->xattr_sem);
1120#endif
1121 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
1122 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
1123 inode_init_once(&ii->vfs_inode);
1124}
1152 1125
1153 err = nilfs_init_segbuf_cache(); 1126static void nilfs_segbuf_init_once(void *obj)
1154 if (err) 1127{
1155 goto failed_transaction_cache; 1128 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
1129}
1156 1130
1157 err = nilfs_btree_path_cache_init(); 1131static void nilfs_destroy_cachep(void)
1158 if (err) 1132{
1159 goto failed_segbuf_cache; 1133 if (nilfs_inode_cachep)
1134 kmem_cache_destroy(nilfs_inode_cachep);
1135 if (nilfs_transaction_cachep)
1136 kmem_cache_destroy(nilfs_transaction_cachep);
1137 if (nilfs_segbuf_cachep)
1138 kmem_cache_destroy(nilfs_segbuf_cachep);
1139 if (nilfs_btree_path_cache)
1140 kmem_cache_destroy(nilfs_btree_path_cache);
1141}
1160 1142
1161 err = register_filesystem(&nilfs_fs_type); 1143static int __init nilfs_init_cachep(void)
1162 if (err) 1144{
1163 goto failed_btree_path_cache; 1145 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
1146 sizeof(struct nilfs_inode_info), 0,
1147 SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
1148 if (!nilfs_inode_cachep)
1149 goto fail;
1150
1151 nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
1152 sizeof(struct nilfs_transaction_info), 0,
1153 SLAB_RECLAIM_ACCOUNT, NULL);
1154 if (!nilfs_transaction_cachep)
1155 goto fail;
1156
1157 nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
1158 sizeof(struct nilfs_segment_buffer), 0,
1159 SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
1160 if (!nilfs_segbuf_cachep)
1161 goto fail;
1162
1163 nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
1164 sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
1165 0, 0, NULL);
1166 if (!nilfs_btree_path_cache)
1167 goto fail;
1164 1168
1165 return 0; 1169 return 0;
1166 1170
1167 failed_btree_path_cache: 1171fail:
1168 nilfs_btree_path_cache_destroy(); 1172 nilfs_destroy_cachep();
1173 return -ENOMEM;
1174}
1175
1176static int __init init_nilfs_fs(void)
1177{
1178 int err;
1169 1179
1170 failed_segbuf_cache: 1180 err = nilfs_init_cachep();
1171 nilfs_destroy_segbuf_cache(); 1181 if (err)
1182 goto fail;
1172 1183
1173 failed_transaction_cache: 1184 err = register_filesystem(&nilfs_fs_type);
1174 nilfs_destroy_transaction_cache(); 1185 if (err)
1186 goto free_cachep;
1175 1187
1176 failed_inode_cache: 1188 printk(KERN_INFO "NILFS version 2 loaded\n");
1177 nilfs_destroy_inode_cache(); 1189 return 0;
1178 1190
1179 failed: 1191free_cachep:
1192 nilfs_destroy_cachep();
1193fail:
1180 return err; 1194 return err;
1181} 1195}
1182 1196
1183static void __exit exit_nilfs_fs(void) 1197static void __exit exit_nilfs_fs(void)
1184{ 1198{
1185 nilfs_destroy_segbuf_cache(); 1199 nilfs_destroy_cachep();
1186 nilfs_destroy_transaction_cache();
1187 nilfs_destroy_inode_cache();
1188 nilfs_btree_path_cache_destroy();
1189 unregister_filesystem(&nilfs_fs_type); 1200 unregister_filesystem(&nilfs_fs_type);
1190} 1201}
1191 1202
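
The super.c rework gathers all four slab caches behind a single nilfs_init_cachep()/nilfs_destroy_cachep() pair, so every allocation failure unwinds through one label instead of the old ladder of failed_* targets. The NULL checks in the destroy path are load-bearing in this era, since kmem_cache_destroy() did not yet tolerate a NULL cache. A trimmed sketch of the same shape, with made-up cache and struct names:

#include <linux/init.h>
#include <linux/slab.h>

struct a_obj { int x; };        /* illustrative payload types */
struct b_obj { long y; };

static struct kmem_cache *a_cachep;
static struct kmem_cache *b_cachep;

static void example_destroy_caches(void)
{
        if (a_cachep)
                kmem_cache_destroy(a_cachep);
        if (b_cachep)
                kmem_cache_destroy(b_cachep);
}

static int __init example_init_caches(void)
{
        a_cachep = kmem_cache_create("example_a", sizeof(struct a_obj),
                                     0, SLAB_RECLAIM_ACCOUNT, NULL);
        if (!a_cachep)
                goto fail;

        b_cachep = kmem_cache_create("example_b", sizeof(struct b_obj),
                                     0, 0, NULL);
        if (!b_cachep)
                goto fail;

        return 0;

fail:
        example_destroy_caches();
        return -ENOMEM;
}

The same file also tightens the remount rules (a snapshot mount can never go read/write, and a regular mount cannot become a snapshot) and opens the block device with an fmode_t derived from MS_RDONLY instead of reusing the raw mount flags.
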
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 33871f7e4f01..8c1097327abc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -486,11 +486,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
486 printk(KERN_WARNING 486 printk(KERN_WARNING
487 "NILFS warning: unable to read secondary superblock\n"); 487 "NILFS warning: unable to read secondary superblock\n");
488 488
489 /*
490 * Compare two super blocks and set 1 in swp if the secondary
491 * super block is valid and newer. Otherwise, set 0 in swp.
492 */
489 valid[0] = nilfs_valid_sb(sbp[0]); 493 valid[0] = nilfs_valid_sb(sbp[0]);
490 valid[1] = nilfs_valid_sb(sbp[1]); 494 valid[1] = nilfs_valid_sb(sbp[1]);
491 swp = valid[1] && 495 swp = valid[1] && (!valid[0] ||
492 (!valid[0] || 496 le64_to_cpu(sbp[1]->s_last_cno) >
493 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); 497 le64_to_cpu(sbp[0]->s_last_cno));
494 498
495 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { 499 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
496 brelse(sbh[1]); 500 brelse(sbh[1]);
@@ -670,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
670 start * sects_per_block, 674 start * sects_per_block,
671 nblocks * sects_per_block, 675 nblocks * sects_per_block,
672 GFP_NOFS, 676 GFP_NOFS,
673 DISCARD_FL_BARRIER); 677 BLKDEV_IFL_BARRIER);
674 if (ret < 0) 678 if (ret < 0)
675 return ret; 679 return ret;
676 nblocks = 0; 680 nblocks = 0;
@@ -680,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
680 ret = blkdev_issue_discard(nilfs->ns_bdev, 684 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block, 685 start * sects_per_block,
682 nblocks * sects_per_block, 686 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER); 687 GFP_NOFS, BLKDEV_IFL_BARRIER);
684 return ret; 688 return ret;
685} 689}
686 690
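
The nilfs_load_super_block() change above alters which super block wins when both copies are valid: the comparison now uses s_last_cno (the last checkpoint number) rather than s_wtime. Checkpoint numbers increase monotonically with every checkpoint, whereas write timestamps can collide or even go backwards with the system clock, so the checkpoint number is presumably the more reliable recency signal. The decision reduces to a predicate like this sketch (field names as in the diff, helper name invented, kernel context assumed):

#include <linux/kernel.h>       /* le64_to_cpu() */

static int prefer_secondary(struct nilfs_super_block **sbp,
                            int valid0, int valid1)
{
        return valid1 && (!valid0 ||
                          le64_to_cpu(sbp[1]->s_last_cno) >
                          le64_to_cpu(sbp[0]->s_last_cno));
}

The blkdev_issue_discard() hunks in the same file just track the block layer's flag rename from DISCARD_FL_BARRIER to BLKDEV_IFL_BARRIER.
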
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index e9795f1724d7..1ab974533697 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/slab.h>
32#include "sb.h" 33#include "sb.h"
33 34
34/* the_nilfs struct */ 35/* the_nilfs struct */
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 037e878e03fc..fcc2f064af83 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/dcache.h> 19#include <linux/dcache.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/gfp.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/srcu.h> 24#include <linux/srcu.h>
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85aada2..0399bcbe09c8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -87,7 +87,6 @@
87#include <linux/kernel.h> 87#include <linux/kernel.h>
88#include <linux/module.h> 88#include <linux/module.h>
89#include <linux/mutex.h> 89#include <linux/mutex.h>
90#include <linux/slab.h>
91#include <linux/spinlock.h> 90#include <linux/spinlock.h>
92#include <linux/writeback.h> /* for inode_lock */ 91#include <linux/writeback.h> /* for inode_lock */
93 92
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 3e56dbffe729..b3a159b21cfd 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,6 +15,7 @@ config INOTIFY
15 15
16config INOTIFY_USER 16config INOTIFY_USER
17 bool "Inotify support for userspace" 17 bool "Inotify support for userspace"
18 select ANON_INODES
18 select FSNOTIFY 19 select FSNOTIFY
19 default y 20 default y
20 ---help--- 21 ---help---
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 40b1cf914ccb..27b75ebc7460 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -110,14 +110,10 @@ EXPORT_SYMBOL_GPL(get_inotify_watch);
110int pin_inotify_watch(struct inotify_watch *watch) 110int pin_inotify_watch(struct inotify_watch *watch)
111{ 111{
112 struct super_block *sb = watch->inode->i_sb; 112 struct super_block *sb = watch->inode->i_sb;
113 spin_lock(&sb_lock); 113 if (atomic_inc_not_zero(&sb->s_active)) {
114 if (sb->s_count >= S_BIAS) {
115 atomic_inc(&sb->s_active);
116 spin_unlock(&sb_lock);
117 atomic_inc(&watch->count); 114 atomic_inc(&watch->count);
118 return 1; 115 return 1;
119 } 116 }
120 spin_unlock(&sb_lock);
121 return 0; 117 return 0;
122} 118}
123 119
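
The pin_inotify_watch() rewrite above replaces the sb_lock/S_BIAS inspection with atomic_inc_not_zero() on s_active: the reference is taken if and only if the superblock's active count has not yet dropped to zero, that is, only while teardown has not begun. The same get-if-still-alive idiom, sketched with invented names:

#include <asm/atomic.h> /* atomic ops, per this kernel era */

struct pinned {
        atomic_t active;        /* hits 0 once teardown starts */
};

static void teardown(struct pinned *p);  /* assumed teardown hook */

/* Succeeds and holds a reference iff the object is still live. */
static int pin(struct pinned *p)
{
        return atomic_inc_not_zero(&p->active);
}

static void unpin(struct pinned *p)
{
        if (atomic_dec_and_test(&p->active))
                teardown(p);
}
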
@@ -515,34 +511,8 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
515 * done. Cleanup is just deactivate_super(). However, that leaves a messy 511 * done. Cleanup is just deactivate_super(). However, that leaves a messy
516 * case - what if we *are* racing with umount() and active references to 512 * case - what if we *are* racing with umount() and active references to
517 * superblock can't be acquired anymore? We can bump ->s_count, grab 513 * superblock can't be acquired anymore? We can bump ->s_count, grab
518 * ->s_umount, which will almost certainly wait until the superblock is shut 514 * ->s_umount, which will wait until the superblock is shut down and the
519 * down and the watch in question is pining for fjords. That's fine, but 515 * watch in question is pining for fjords.
520 * there is a problem - we might have hit the window between ->s_active
521 * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
522 * is past the point of no return and is heading for shutdown) and the
523 * moment when deactivate_super() acquires ->s_umount. We could just do
524 * drop_super() yield() and retry, but that's rather antisocial and this
525 * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having
526 * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
527 * that we won't race with inotify_umount_inodes(). So we could grab a
528 * reference to watch and do the rest as above, just with drop_super() instead
529 * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we
530 * could grab ->s_umount. So the watch could've been gone already.
531 *
532 * That still can be dealt with - we need to save watch->wd, do idr_find()
533 * and compare its result with our pointer. If they match, we either have
534 * the damn thing still alive or we'd lost not one but two races at once,
535 * the watch had been killed and a new one got created with the same ->wd
536 * at the same address. That couldn't have happened in inotify_destroy(),
537 * but inotify_rm_wd() could run into that. Still, "new one got created"
538 * is not a problem - we have every right to kill it or leave it alone,
539 * whatever's more convenient.
540 *
541 * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
542 * "grab it and kill it" check. If it's been our original watch, we are
543 * fine, if it's a newcomer - nevermind, just pretend that we'd won the
544 * race and kill the fscker anyway; we are safe since we know that its
545 * superblock won't be going away.
546 * 516 *
547 * And yes, this is far beyond mere "not very pretty"; so's the entire 517 * And yes, this is far beyond mere "not very pretty"; so's the entire
548 * concept of inotify to start with. 518 * concept of inotify to start with.
@@ -556,57 +526,31 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
556 * Called with ih->mutex held, drops it. Possible return values: 526 * Called with ih->mutex held, drops it. Possible return values:
557 * 0 - nothing to do, it has died 527 * 0 - nothing to do, it has died
558 * 1 - remove it, drop the reference and deactivate_super() 528 * 1 - remove it, drop the reference and deactivate_super()
559 * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
560 * that variant, since it involved a lot of PITA, but that's the best that
561 * could've been done.
562 */ 529 */
563static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) 530static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
564{ 531{
565 struct super_block *sb = watch->inode->i_sb; 532 struct super_block *sb = watch->inode->i_sb;
566 s32 wd = watch->wd;
567 533
568 spin_lock(&sb_lock); 534 if (atomic_inc_not_zero(&sb->s_active)) {
569 if (sb->s_count >= S_BIAS) {
570 atomic_inc(&sb->s_active);
571 spin_unlock(&sb_lock);
572 get_inotify_watch(watch); 535 get_inotify_watch(watch);
573 mutex_unlock(&ih->mutex); 536 mutex_unlock(&ih->mutex);
574 return 1; /* the best outcome */ 537 return 1; /* the best outcome */
575 } 538 }
539 spin_lock(&sb_lock);
576 sb->s_count++; 540 sb->s_count++;
577 spin_unlock(&sb_lock); 541 spin_unlock(&sb_lock);
578 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ 542 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
579 down_read(&sb->s_umount); 543 down_read(&sb->s_umount);
580 if (likely(!sb->s_root)) { 544 /* fs is already shut down; the watch is dead */
581 /* fs is already shut down; the watch is dead */ 545 drop_super(sb);
582 drop_super(sb); 546 return 0;
583 return 0;
584 }
585 /* raced with the final deactivate_super() */
586 mutex_lock(&ih->mutex);
587 if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
588 /* the watch is dead */
589 mutex_unlock(&ih->mutex);
590 drop_super(sb);
591 return 0;
592 }
593 /* still alive or freed and reused with the same sb and wd; kill */
594 get_inotify_watch(watch);
595 mutex_unlock(&ih->mutex);
596 return 2;
597} 547}
598 548
599static void unpin_and_kill(struct inotify_watch *watch, int how) 549static void unpin_and_kill(struct inotify_watch *watch)
600{ 550{
601 struct super_block *sb = watch->inode->i_sb; 551 struct super_block *sb = watch->inode->i_sb;
602 put_inotify_watch(watch); 552 put_inotify_watch(watch);
603 switch (how) { 553 deactivate_super(sb);
604 case 1:
605 deactivate_super(sb);
606 break;
607 case 2:
608 drop_super(sb);
609 }
610} 554}
611 555
612/** 556/**
@@ -628,7 +572,6 @@ void inotify_destroy(struct inotify_handle *ih)
628 struct list_head *watches; 572 struct list_head *watches;
629 struct super_block *sb; 573 struct super_block *sb;
630 struct inode *inode; 574 struct inode *inode;
631 int how;
632 575
633 mutex_lock(&ih->mutex); 576 mutex_lock(&ih->mutex);
634 watches = &ih->watches; 577 watches = &ih->watches;
@@ -638,8 +581,7 @@ void inotify_destroy(struct inotify_handle *ih)
638 } 581 }
639 watch = list_first_entry(watches, struct inotify_watch, h_list); 582 watch = list_first_entry(watches, struct inotify_watch, h_list);
640 sb = watch->inode->i_sb; 583 sb = watch->inode->i_sb;
641 how = pin_to_kill(ih, watch); 584 if (!pin_to_kill(ih, watch))
642 if (!how)
643 continue; 585 continue;
644 586
645 inode = watch->inode; 587 inode = watch->inode;
@@ -654,7 +596,7 @@ void inotify_destroy(struct inotify_handle *ih)
654 596
655 mutex_unlock(&ih->mutex); 597 mutex_unlock(&ih->mutex);
656 mutex_unlock(&inode->inotify_mutex); 598 mutex_unlock(&inode->inotify_mutex);
657 unpin_and_kill(watch, how); 599 unpin_and_kill(watch);
658 } 600 }
659 601
660 /* free this handle: the put matching the get in inotify_init() */ 602 /* free this handle: the put matching the get in inotify_init() */
@@ -857,7 +799,6 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
857 struct inotify_watch *watch; 799 struct inotify_watch *watch;
858 struct super_block *sb; 800 struct super_block *sb;
859 struct inode *inode; 801 struct inode *inode;
860 int how;
861 802
862 mutex_lock(&ih->mutex); 803 mutex_lock(&ih->mutex);
863 watch = idr_find(&ih->idr, wd); 804 watch = idr_find(&ih->idr, wd);
@@ -866,8 +807,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
866 return -EINVAL; 807 return -EINVAL;
867 } 808 }
868 sb = watch->inode->i_sb; 809 sb = watch->inode->i_sb;
869 how = pin_to_kill(ih, watch); 810 if (!pin_to_kill(ih, watch))
870 if (!how)
871 return 0; 811 return 0;
872 812
873 inode = watch->inode; 813 inode = watch->inode;
@@ -881,7 +821,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
881 821
882 mutex_unlock(&ih->mutex); 822 mutex_unlock(&ih->mutex);
883 mutex_unlock(&inode->inotify_mutex); 823 mutex_unlock(&inode->inotify_mutex);
884 unpin_and_kill(watch, how); 824 unpin_and_kill(watch);
885 825
886 return 0; 826 return 0;
887} 827}
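
[Review note on the inotify.c change above: the rewritten pin_to_kill() replaces the old S_BIAS/s_count dance with a single atomic_inc_not_zero() on ->s_active, so it either pins a live superblock or reports the watch dead; the three-way return, and with it the drop_super() variant in unpin_and_kill(), disappears. A minimal userspace sketch of the same "pin only if still live" pattern, using C11 atomics rather than the kernel's atomic_t; the names here are illustrative, not kernel API:]

    #include <stdatomic.h>
    #include <stdbool.h>

    struct obj {
        atomic_int active;    /* plays the role of sb->s_active */
    };

    /* Take a reference only if the count has not already hit zero, so a
     * dying object can never be revived; returns true iff pinned. */
    static bool try_pin(struct obj *o)
    {
        int cur = atomic_load(&o->active);
        while (cur != 0)
            if (atomic_compare_exchange_weak(&o->active, &cur, cur + 1))
                return true;    /* pinned: a matching unpin is required */
        return false;           /* already shutting down */
    }

    static void unpin(struct obj *o)
    {
        atomic_fetch_sub(&o->active, 1);  /* kernel side: deactivate_super() */
    }
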
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1afb0a10229f..e27960cd76ab 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -28,6 +28,7 @@
28#include <linux/path.h> /* struct path */ 28#include <linux/path.h> /* struct path */
29#include <linux/slab.h> /* kmem_* */ 29#include <linux/slab.h> /* kmem_* */
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/sched.h>
31 32
32#include "inotify.h" 33#include "inotify.h"
33 34
@@ -146,6 +147,7 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
146 idr_for_each(&group->inotify_data.idr, idr_callback, group); 147 idr_for_each(&group->inotify_data.idr, idr_callback, group);
147 idr_remove_all(&group->inotify_data.idr); 148 idr_remove_all(&group->inotify_data.idr);
148 idr_destroy(&group->inotify_data.idr); 149 idr_destroy(&group->inotify_data.idr);
150 free_uid(group->inotify_data.user);
149} 151}
150 152
151void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) 153void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 472cdf29ef82..e46ca685b9be 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -546,21 +546,24 @@ retry:
546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) 546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
547 goto out_err; 547 goto out_err;
548 548
549 /* we are putting the mark on the idr, take a reference */
550 fsnotify_get_mark(&tmp_ientry->fsn_entry);
551
549 spin_lock(&group->inotify_data.idr_lock); 552 spin_lock(&group->inotify_data.idr_lock);
550 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, 553 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
551 group->inotify_data.last_wd+1, 554 group->inotify_data.last_wd+1,
552 &tmp_ientry->wd); 555 &tmp_ientry->wd);
553 spin_unlock(&group->inotify_data.idr_lock); 556 spin_unlock(&group->inotify_data.idr_lock);
554 if (ret) { 557 if (ret) {
558 /* we didn't get on the idr, drop the idr reference */
559 fsnotify_put_mark(&tmp_ientry->fsn_entry);
560
555 /* idr was out of memory; allocate and try again */ 561 /* idr was out of memory; allocate and try again */
556 if (ret == -EAGAIN) 562 if (ret == -EAGAIN)
557 goto retry; 563 goto retry;
558 goto out_err; 564 goto out_err;
559 } 565 }
560 566
561 /* we put the mark on the idr, take a reference */
562 fsnotify_get_mark(&tmp_ientry->fsn_entry);
563
564 /* we are on the idr, now get on the inode */ 567 /* we are on the idr, now get on the inode */
565 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); 568 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
566 if (ret) { 569 if (ret) {
@@ -578,16 +581,13 @@ retry:
578 /* return the watch descriptor for this new entry */ 581 /* return the watch descriptor for this new entry */
579 ret = tmp_ientry->wd; 582 ret = tmp_ientry->wd;
580 583
581 /* match the ref from fsnotify_init_markentry() */
582 fsnotify_put_mark(&tmp_ientry->fsn_entry);
583
584 /* if this mark added a new event update the group mask */ 584 /* if this mark added a new event update the group mask */
585 if (mask & ~group->mask) 585 if (mask & ~group->mask)
586 fsnotify_recalc_group_mask(group); 586 fsnotify_recalc_group_mask(group);
587 587
588out_err: 588out_err:
589 if (ret < 0) 589 /* match the ref from fsnotify_init_markentry() */
590 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); 590 fsnotify_put_mark(&tmp_ientry->fsn_entry);
591 591
592 return ret; 592 return ret;
593} 593}
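
[Review note: the inotify_user.c hunks reorder reference counting around the idr. The mark's reference is now taken before idr_get_new_above() publishes it, dropped immediately if insertion fails, and the put matching fsnotify_init_markentry() moves to the common exit path so every return drops it exactly once. A hedged userspace analogue of "take the reference before publishing", with illustrative names rather than fsnotify API:]

    #include <stdatomic.h>
    #include <stddef.h>

    struct mark { atomic_int refs; };

    static _Atomic(struct mark *) slot;    /* stands in for the idr slot */

    /* Count the container's reference before the pointer becomes visible,
     * so a concurrent lookup can never see an unreferenced object. */
    static int publish(struct mark *m)
    {
        atomic_fetch_add(&m->refs, 1);          /* ref the slot will own */
        struct mark *expected = NULL;
        if (!atomic_compare_exchange_strong(&slot, &expected, m)) {
            atomic_fetch_sub(&m->refs, 1);      /* lost the race: drop it */
            return -1;
        }
        return 0;
    }
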
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfce53cb65d7..c3c2c7ac9020 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/gfp.h>
26#include <linux/mm.h> 27#include <linux/mm.h>
27#include <linux/pagemap.h> 28#include <linux/pagemap.h>
28#include <linux/swap.h> 29#include <linux/swap.h>
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 50d3b0c258e3..f5094ee224c1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27 28
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 08f7530e9341..6551c7cbad92 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
25#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/slab.h>
28 29
29#include "attrib.h" 30#include "attrib.h"
30#include "inode.h" 31#include "inode.h"
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 9173e82a45d1..0f48e7c5d9e1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24 25
25#include "dir.h" 26#include "dir.h"
26#include "aops.h" 27#include "aops.h"
@@ -1526,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
1526 * this problem for now. We do write the $BITMAP attribute if it is present, 1527 * this problem for now. We do write the $BITMAP attribute if it is present,
1527 * which is the important one for a directory, so things are not too bad. 1528 * which is the important one for a directory, so things are not too bad.
1528 */ 1529 */
1529static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, 1530static int ntfs_dir_fsync(struct file *filp, int datasync)
1530 int datasync)
1531{ 1531{
1532 struct inode *bmp_vi, *vi = dentry->d_inode; 1532 struct inode *bmp_vi, *vi = filp->f_mapping->host;
1533 int err, ret; 1533 int err, ret;
1534 ntfs_attr na; 1534 ntfs_attr na;
1535 1535
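
[Review note: the ntfs_dir_fsync() hunk above is part of the tree-wide ->fsync prototype change, which recurs in fs/ntfs/file.c below: the dentry argument goes away and the inode is recovered from the file's address_space instead. A sketch of the converted shape, as a kernel-style fragment for illustration only; it will not build outside a kernel tree:]

    static int example_fsync(struct file *filp, int datasync)
    {
        struct inode *inode = filp->f_mapping->host;  /* was dentry->d_inode */

        /* ... write back dirty pages; skip pure metadata if datasync ... */
        return 0;
    }
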
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index b681c71d7069..113ebd9f25a4 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/gfp.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/pagevec.h> 25#include <linux/pagevec.h>
25#include <linux/sched.h> 26#include <linux/sched.h>
@@ -97,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
97 * the page at all. For a more detailed explanation see ntfs_truncate() in 98 * the page at all. For a more detailed explanation see ntfs_truncate() in
98 * fs/ntfs/inode.c. 99 * fs/ntfs/inode.c.
99 * 100 *
100 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
101 * pages.
102 *
103 * Return 0 on success and -errno on error. In the case that an error is 101 * Return 0 on success and -errno on error. In the case that an error is
104 * encountered it is possible that the initialized size will already have been 102 * encountered it is possible that the initialized size will already have been
105 * incremented some way towards @new_init_size but it is guaranteed that if 103 * incremented some way towards @new_init_size but it is guaranteed that if
@@ -109,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
109 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be 107 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
110 * held by the caller. 108 * held by the caller.
111 */ 109 */
112static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, 110static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
113 struct page **cached_page, struct pagevec *lru_pvec)
114{ 111{
115 s64 old_init_size; 112 s64 old_init_size;
116 loff_t old_i_size; 113 loff_t old_i_size;
@@ -402,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
402 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
403 * starting at index @index. 400 * starting at index @index.
404 * 401 *
405 * If a page is newly created, increment its refcount and add it to the 402 * If a page is newly created, add it to the LRU list.
406 * caller's lru-buffering pagevec @lru_pvec.
407 *
408 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
409 * are obtained at once instead of just one page and that 0 is returned on
410 * success and -errno on error.
411 * 403 *
412 * Note, the page locks are obtained in ascending page index order. 404 * Note, the page locks are obtained in ascending page index order.
413 */ 405 */
414static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 406static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
415 pgoff_t index, const unsigned nr_pages, struct page **pages, 407 pgoff_t index, const unsigned nr_pages, struct page **pages,
416 struct page **cached_page, struct pagevec *lru_pvec) 408 struct page **cached_page)
417{ 409{
418 int err, nr; 410 int err, nr;
419 411
@@ -429,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
429 goto err_out; 421 goto err_out;
430 } 422 }
431 } 423 }
432 err = add_to_page_cache(*cached_page, mapping, index, 424 err = add_to_page_cache_lru(*cached_page, mapping, index,
433 GFP_KERNEL); 425 GFP_KERNEL);
434 if (unlikely(err)) { 426 if (unlikely(err)) {
435 if (err == -EEXIST) 427 if (err == -EEXIST)
@@ -437,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
437 goto err_out; 429 goto err_out;
438 } 430 }
439 pages[nr] = *cached_page; 431 pages[nr] = *cached_page;
440 page_cache_get(*cached_page);
441 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
442 __pagevec_lru_add_file(lru_pvec);
443 *cached_page = NULL; 432 *cached_page = NULL;
444 } 433 }
445 index++; 434 index++;
@@ -1799,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1799 ssize_t status, written; 1788 ssize_t status, written;
1800 unsigned nr_pages; 1789 unsigned nr_pages;
1801 int err; 1790 int err;
1802 struct pagevec lru_pvec;
1803 1791
1804 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1792 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1805 "pos 0x%llx, count 0x%lx.", 1793 "pos 0x%llx, count 0x%lx.",
@@ -1911,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1911 } 1899 }
1912 } 1900 }
1913 } 1901 }
1914 pagevec_init(&lru_pvec, 0);
1915 written = 0; 1902 written = 0;
1916 /* 1903 /*
1917 * If the write starts beyond the initialized size, extend it up to the 1904 * If the write starts beyond the initialized size, extend it up to the
@@ -1924,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1924 ll = ni->initialized_size; 1911 ll = ni->initialized_size;
1925 read_unlock_irqrestore(&ni->size_lock, flags); 1912 read_unlock_irqrestore(&ni->size_lock, flags);
1926 if (pos > ll) { 1913 if (pos > ll) {
1927 err = ntfs_attr_extend_initialized(ni, pos, &cached_page, 1914 err = ntfs_attr_extend_initialized(ni, pos);
1928 &lru_pvec);
1929 if (err < 0) { 1915 if (err < 0) {
1930 ntfs_error(vol->sb, "Cannot perform write to inode " 1916 ntfs_error(vol->sb, "Cannot perform write to inode "
1931 "0x%lx, attribute type 0x%x, because " 1917 "0x%lx, attribute type 0x%x, because "
@@ -2011,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2011 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 1997 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2012 /* Get and lock @do_pages starting at index @start_idx. */ 1998 /* Get and lock @do_pages starting at index @start_idx. */
2013 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1999 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2014 pages, &cached_page, &lru_pvec); 2000 pages, &cached_page);
2015 if (unlikely(status)) 2001 if (unlikely(status))
2016 break; 2002 break;
2017 /* 2003 /*
@@ -2076,7 +2062,6 @@ err_out:
2076 *ppos = pos; 2062 *ppos = pos;
2077 if (cached_page) 2063 if (cached_page)
2078 page_cache_release(cached_page); 2064 page_cache_release(cached_page);
2079 pagevec_lru_add_file(&lru_pvec);
2080 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2065 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2081 written ? "written" : "status", (unsigned long)written, 2066 written ? "written" : "status", (unsigned long)written,
2082 (long)status); 2067 (long)status);
@@ -2148,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2148/** 2133/**
2149 * ntfs_file_fsync - sync a file to disk 2134 * ntfs_file_fsync - sync a file to disk
2150 * @filp: file to be synced 2135 * @filp: file to be synced
2151 * @dentry: dentry describing the file to sync
2152 * @datasync: if non-zero only flush user data and not metadata 2136 * @datasync: if non-zero only flush user data and not metadata
2153 * 2137 *
2154 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync 2138 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
@@ -2164,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2164 * Also, if @datasync is true, we do not wait on the inode to be written out 2148 * Also, if @datasync is true, we do not wait on the inode to be written out
2165 * but we always wait on the page cache pages to be written out. 2149 * but we always wait on the page cache pages to be written out.
2166 * 2150 *
2167 * Note: In the past @filp could be NULL so we ignore it as we don't need it
2168 * anyway.
2169 *
2170 * Locking: Caller must hold i_mutex on the inode. 2151 * Locking: Caller must hold i_mutex on the inode.
2171 * 2152 *
2172 * TODO: We should probably also write all attribute/index inodes associated 2153 * TODO: We should probably also write all attribute/index inodes associated
2173 * with this inode but since we have no simple way of getting to them we ignore 2154 * with this inode but since we have no simple way of getting to them we ignore
2174 * this problem for now. 2155 * this problem for now.
2175 */ 2156 */
2176static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, 2157static int ntfs_file_fsync(struct file *filp, int datasync)
2177 int datasync)
2178{ 2158{
2179 struct inode *vi = dentry->d_inode; 2159 struct inode *vi = filp->f_mapping->host;
2180 int err, ret = 0; 2160 int err, ret = 0;
2181 2161
2182 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2162 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
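
[Review note: the fs/ntfs/file.c hunks drop the caller-maintained lru_pvec entirely. add_to_page_cache_lru() both inserts the page into the mapping and places it on the LRU, so the extra page_cache_get()/pagevec_add()/__pagevec_lru_add_file() bookkeeping, and the pagevec threaded through ntfs_attr_extend_initialized() and __ntfs_grab_cache_pages(), is no longer needed. The before/after shape, as a kernel-style fragment for illustration:]

    /* after: one call inserts into the mapping and onto the LRU */
    err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);

    /* before: insert, then batch the LRU addition by hand
     *   err = add_to_page_cache(page, mapping, index, GFP_KERNEL);
     *   page_cache_get(page);
     *   if (!pagevec_add(&lru_pvec, page))
     *       __pagevec_lru_add_file(&lru_pvec);
     */
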
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2194eff49743..096c135691ae 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,6 +19,8 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/slab.h>
23
22#include "aops.h" 24#include "aops.h"
23#include "collate.h" 25#include "collate.h"
24#include "debug.h" 26#include "debug.h"
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 1caa0ef0b2bb..b572b6727181 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24#include <linux/swap.h> 25#include <linux/swap.h>
25 26
26#include "attrib.h" 27#include "attrib.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2ca00153b6ec..358273e59ade 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,6 +23,7 @@
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/exportfs.h> 24#include <linux/exportfs.h>
25#include <linux/security.h> 25#include <linux/security.h>
26#include <linux/slab.h>
26 27
27#include "attrib.h" 28#include "attrib.h"
28#include "debug.h" 29#include "debug.h"
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 791c0886c060..07d9fd854350 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -29,6 +29,7 @@ ocfs2-objs := \
29 mmap.o \ 29 mmap.o \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \
32 resize.o \ 33 resize.o \
33 slot_map.o \ 34 slot_map.o \
34 suballoc.o \ 35 suballoc.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd0..da702294d7e7 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h>
24#include <linux/string.h> 25#include <linux/string.h>
25 26
26#define MLOG_MASK_PREFIX ML_INODE 27#define MLOG_MASK_PREFIX ML_INODE
@@ -30,6 +31,8 @@
30#include "alloc.h" 31#include "alloc.h"
31#include "dlmglue.h" 32#include "dlmglue.h"
32#include "file.h" 33#include "file.h"
34#include "inode.h"
35#include "journal.h"
33#include "ocfs2_fs.h" 36#include "ocfs2_fs.h"
34 37
35#include "xattr.h" 38#include "xattr.h"
@@ -166,6 +169,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
166} 169}
167 170
168/* 171/*
172 * Helper function to set i_mode in memory and on disk. Some call paths
173 * will not have di_bh or a journal handle to pass, in which case it
174 * will create its own.
175 */
176static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
177 handle_t *handle, umode_t new_mode)
178{
179 int ret, commit_handle = 0;
180 struct ocfs2_dinode *di;
181
182 if (di_bh == NULL) {
183 ret = ocfs2_read_inode_block(inode, &di_bh);
184 if (ret) {
185 mlog_errno(ret);
186 goto out;
187 }
188 } else
189 get_bh(di_bh);
190
191 if (handle == NULL) {
192 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
193 OCFS2_INODE_UPDATE_CREDITS);
194 if (IS_ERR(handle)) {
195 ret = PTR_ERR(handle);
196 mlog_errno(ret);
197 goto out_brelse;
198 }
199
200 commit_handle = 1;
201 }
202
203 di = (struct ocfs2_dinode *)di_bh->b_data;
204 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
205 OCFS2_JOURNAL_ACCESS_WRITE);
206 if (ret) {
207 mlog_errno(ret);
208 goto out_commit;
209 }
210
211 inode->i_mode = new_mode;
212 di->i_mode = cpu_to_le16(inode->i_mode);
213
214 ocfs2_journal_dirty(handle, di_bh);
215
216out_commit:
217 if (commit_handle)
218 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
219out_brelse:
220 brelse(di_bh);
221out:
222 return ret;
223}
224
225/*
169 * Set the access or default ACL of an inode. 226 * Set the access or default ACL of an inode.
170 */ 227 */
171static int ocfs2_set_acl(handle_t *handle, 228static int ocfs2_set_acl(handle_t *handle,
@@ -193,9 +250,14 @@ static int ocfs2_set_acl(handle_t *handle,
193 if (ret < 0) 250 if (ret < 0)
194 return ret; 251 return ret;
195 else { 252 else {
196 inode->i_mode = mode;
197 if (ret == 0) 253 if (ret == 0)
198 acl = NULL; 254 acl = NULL;
255
256 ret = ocfs2_acl_set_mode(inode, di_bh,
257 handle, mode);
258 if (ret)
259 return ret;
260
199 } 261 }
200 } 262 }
201 break; 263 break;
@@ -283,6 +345,7 @@ int ocfs2_init_acl(handle_t *handle,
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 struct posix_acl *acl = NULL; 346 struct posix_acl *acl = NULL;
285 int ret = 0; 347 int ret = 0;
348 mode_t mode;
286 349
287 if (!S_ISLNK(inode->i_mode)) { 350 if (!S_ISLNK(inode->i_mode)) {
288 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 351 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -291,12 +354,17 @@ int ocfs2_init_acl(handle_t *handle,
291 if (IS_ERR(acl)) 354 if (IS_ERR(acl))
292 return PTR_ERR(acl); 355 return PTR_ERR(acl);
293 } 356 }
294 if (!acl) 357 if (!acl) {
295 inode->i_mode &= ~current_umask(); 358 mode = inode->i_mode & ~current_umask();
359 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
360 if (ret) {
361 mlog_errno(ret);
362 goto cleanup;
363 }
364 }
296 } 365 }
297 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 366 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
298 struct posix_acl *clone; 367 struct posix_acl *clone;
299 mode_t mode;
300 368
301 if (S_ISDIR(inode->i_mode)) { 369 if (S_ISDIR(inode->i_mode)) {
302 ret = ocfs2_set_acl(handle, inode, di_bh, 370 ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -313,7 +381,7 @@ int ocfs2_init_acl(handle_t *handle,
313 mode = inode->i_mode; 381 mode = inode->i_mode;
314 ret = posix_acl_create_masq(clone, &mode); 382 ret = posix_acl_create_masq(clone, &mode);
315 if (ret >= 0) { 383 if (ret >= 0) {
316 inode->i_mode = mode; 384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
317 if (ret > 0) { 385 if (ret > 0) {
318 ret = ocfs2_set_acl(handle, inode, 386 ret = ocfs2_set_acl(handle, inode,
319 di_bh, ACL_TYPE_ACCESS, 387 di_bh, ACL_TYPE_ACCESS,
@@ -421,7 +489,7 @@ cleanup:
421 return ret; 489 return ret;
422} 490}
423 491
424struct xattr_handler ocfs2_xattr_acl_access_handler = { 492const struct xattr_handler ocfs2_xattr_acl_access_handler = {
425 .prefix = POSIX_ACL_XATTR_ACCESS, 493 .prefix = POSIX_ACL_XATTR_ACCESS,
426 .flags = ACL_TYPE_ACCESS, 494 .flags = ACL_TYPE_ACCESS,
427 .list = ocfs2_xattr_list_acl_access, 495 .list = ocfs2_xattr_list_acl_access,
@@ -429,7 +497,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = {
429 .set = ocfs2_xattr_set_acl, 497 .set = ocfs2_xattr_set_acl,
430}; 498};
431 499
432struct xattr_handler ocfs2_xattr_acl_default_handler = { 500const struct xattr_handler ocfs2_xattr_acl_default_handler = {
433 .prefix = POSIX_ACL_XATTR_DEFAULT, 501 .prefix = POSIX_ACL_XATTR_DEFAULT,
434 .flags = ACL_TYPE_DEFAULT, 502 .flags = ACL_TYPE_DEFAULT,
435 .list = ocfs2_xattr_list_acl_default, 503 .list = ocfs2_xattr_list_acl_default,
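
[Review note: the point of ocfs2_acl_set_mode() above is that a bare "inode->i_mode = mode" never reached the journal. The helper updates both the in-memory and on-disk i_mode under journal access, borrowing the caller's handle and buffer_head when available and creating its own otherwise. Both call shapes, condensed from the hunks above as a fragment rather than standalone code:]

    /* inside an existing transaction, with the dinode buffer in hand */
    ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);

    /* standalone: the helper reads the inode block and commits itself */
    ret = ocfs2_acl_set_mode(inode, NULL, NULL, mode);
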
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9f8bd913c51e..215e12ce1d85 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1006 int count, status, i; 1006 int count, status, i;
1007 u16 suballoc_bit_start; 1007 u16 suballoc_bit_start;
1008 u32 num_got; 1008 u32 num_got;
1009 u64 first_blkno; 1009 u64 suballoc_loc, first_blkno;
1010 struct ocfs2_super *osb = 1010 struct ocfs2_super *osb =
1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1012 struct ocfs2_extent_block *eb; 1012 struct ocfs2_extent_block *eb;
@@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1015 1015
1016 count = 0; 1016 count = 0;
1017 while (count < wanted) { 1017 while (count < wanted) {
1018 status = ocfs2_claim_metadata(osb, 1018 status = ocfs2_claim_metadata(handle,
1019 handle,
1020 meta_ac, 1019 meta_ac,
1021 wanted - count, 1020 wanted - count,
1021 &suballoc_loc,
1022 &suballoc_bit_start, 1022 &suballoc_bit_start,
1023 &num_got, 1023 &num_got,
1024 &first_blkno); 1024 &first_blkno);
@@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot); 1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1055 eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1056 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1056 eb->h_list.l_count = 1057 eb->h_list.l_count =
1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1058 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1061 1062
1062 /* We'll also be dirtied by the caller, so 1063 /* We'll also be dirtied by the caller, so
1063 * this isn't absolutely necessary. */ 1064 * this isn't absolutely necessary. */
1064 status = ocfs2_journal_dirty(handle, bhs[i]); 1065 ocfs2_journal_dirty(handle, bhs[i]);
1065 if (status < 0) {
1066 mlog_errno(status);
1067 goto bail;
1068 }
1069 } 1066 }
1070 1067
1071 count += num_got; 1068 count += num_got;
@@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1129 goto out; 1126 goto out;
1130 } 1127 }
1131 1128
1132 status = ocfs2_extend_trans(handle, path_num_items(path) + 1129 status = ocfs2_extend_trans(handle, path_num_items(path));
1133 handle->h_buffer_credits);
1134 if (status < 0) { 1130 if (status < 0) {
1135 mlog_errno(status); 1131 mlog_errno(status);
1136 goto out; 1132 goto out;
@@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle,
1270 if (!eb_el->l_tree_depth) 1266 if (!eb_el->l_tree_depth)
1271 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 1267 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1272 1268
1273 status = ocfs2_journal_dirty(handle, bh); 1269 ocfs2_journal_dirty(handle, bh);
1274 if (status < 0) {
1275 mlog_errno(status);
1276 goto bail;
1277 }
1278
1279 next_blkno = le64_to_cpu(eb->h_blkno); 1270 next_blkno = le64_to_cpu(eb->h_blkno);
1280 } 1271 }
1281 1272
@@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle,
1321 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 1312 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1322 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 1313 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1323 1314
1324 status = ocfs2_journal_dirty(handle, *last_eb_bh); 1315 ocfs2_journal_dirty(handle, *last_eb_bh);
1325 if (status < 0) 1316 ocfs2_journal_dirty(handle, et->et_root_bh);
1326 mlog_errno(status); 1317 if (eb_bh)
1327 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1318 ocfs2_journal_dirty(handle, eb_bh);
1328 if (status < 0)
1329 mlog_errno(status);
1330 if (eb_bh) {
1331 status = ocfs2_journal_dirty(handle, eb_bh);
1332 if (status < 0)
1333 mlog_errno(status);
1334 }
1335 1319
1336 /* 1320 /*
1337 * Some callers want to track the rightmost leaf so pass it 1321 * Some callers want to track the rightmost leaf so pass it
@@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1399 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1383 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1400 eb_el->l_recs[i] = root_el->l_recs[i]; 1384 eb_el->l_recs[i] = root_el->l_recs[i];
1401 1385
1402 status = ocfs2_journal_dirty(handle, new_eb_bh); 1386 ocfs2_journal_dirty(handle, new_eb_bh);
1403 if (status < 0) {
1404 mlog_errno(status);
1405 goto bail;
1406 }
1407 1387
1408 status = ocfs2_et_root_journal_access(handle, et, 1388 status = ocfs2_et_root_journal_access(handle, et,
1409 OCFS2_JOURNAL_ACCESS_WRITE); 1389 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1428 if (root_el->l_tree_depth == cpu_to_le16(1)) 1408 if (root_el->l_tree_depth == cpu_to_le16(1))
1429 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 1409 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1430 1410
1431 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1411 ocfs2_journal_dirty(handle, et->et_root_bh);
1432 if (status < 0) {
1433 mlog_errno(status);
1434 goto bail;
1435 }
1436 1412
1437 *ret_new_eb_bh = new_eb_bh; 1413 *ret_new_eb_bh = new_eb_bh;
1438 new_eb_bh = NULL; 1414 new_eb_bh = NULL;
@@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2064 struct ocfs2_path *right_path, 2040 struct ocfs2_path *right_path,
2065 int subtree_index) 2041 int subtree_index)
2066{ 2042{
2067 int ret, i, idx; 2043 int i, idx;
2068 struct ocfs2_extent_list *el, *left_el, *right_el; 2044 struct ocfs2_extent_list *el, *left_el, *right_el;
2069 struct ocfs2_extent_rec *left_rec, *right_rec; 2045 struct ocfs2_extent_rec *left_rec, *right_rec;
2070 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2046 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2102 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 2078 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2103 right_el); 2079 right_el);
2104 2080
2105 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 2081 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2106 if (ret) 2082 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2107 mlog_errno(ret);
2108
2109 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2110 if (ret)
2111 mlog_errno(ret);
2112 2083
2113 /* 2084 /*
2114 * Setup our list pointers now so that the current 2085 * Setup our list pointers now so that the current
@@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2132 2103
2133 root_bh = left_path->p_node[subtree_index].bh; 2104 root_bh = left_path->p_node[subtree_index].bh;
2134 2105
2135 ret = ocfs2_journal_dirty(handle, root_bh); 2106 ocfs2_journal_dirty(handle, root_bh);
2136 if (ret)
2137 mlog_errno(ret);
2138} 2107}
2139 2108
2140static int ocfs2_rotate_subtree_right(handle_t *handle, 2109static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2207 2176
2208 ocfs2_create_empty_extent(right_el); 2177 ocfs2_create_empty_extent(right_el);
2209 2178
2210 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 2179 ocfs2_journal_dirty(handle, right_leaf_bh);
2211 if (ret) {
2212 mlog_errno(ret);
2213 goto out;
2214 }
2215 2180
2216 /* Do the copy now. */ 2181 /* Do the copy now. */
2217 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 2182 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2230 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2195 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2231 le16_add_cpu(&left_el->l_next_free_rec, 1); 2196 le16_add_cpu(&left_el->l_next_free_rec, 1);
2232 2197
2233 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 2198 ocfs2_journal_dirty(handle, left_leaf_bh);
2234 if (ret) {
2235 mlog_errno(ret);
2236 goto out;
2237 }
2238 2199
2239 ocfs2_complete_edge_insert(handle, left_path, right_path, 2200 ocfs2_complete_edge_insert(handle, left_path, right_path,
2240 subtree_index); 2201 subtree_index);
@@ -2249,8 +2210,8 @@ out:
2249 * 2210 *
2250 * Will return zero if the path passed in is already the leftmost path. 2211 * Will return zero if the path passed in is already the leftmost path.
2251 */ 2212 */
2252static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 2213int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2253 struct ocfs2_path *path, u32 *cpos) 2214 struct ocfs2_path *path, u32 *cpos)
2254{ 2215{
2255 int i, j, ret = 0; 2216 int i, j, ret = 0;
2256 u64 blkno; 2217 u64 blkno;
@@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2327 int op_credits, 2288 int op_credits,
2328 struct ocfs2_path *path) 2289 struct ocfs2_path *path)
2329{ 2290{
2330 int ret; 2291 int ret = 0;
2331 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2292 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2332 2293
2333 if (handle->h_buffer_credits < credits) { 2294 if (handle->h_buffer_credits < credits)
2334 ret = ocfs2_extend_trans(handle, 2295 ret = ocfs2_extend_trans(handle,
2335 credits - handle->h_buffer_credits); 2296 credits - handle->h_buffer_credits);
2336 if (ret)
2337 return ret;
2338 2297
2339 if (unlikely(handle->h_buffer_credits < credits)) 2298 return ret;
2340 return ocfs2_extend_trans(handle, credits);
2341 }
2342
2343 return 0;
2344} 2299}
2345 2300
2346/* 2301/*
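
[Review note: several hunks in this file stop folding handle->h_buffer_credits into the argument. That reflects a change in ocfs2_extend_trans() semantics: it now takes the number of additional credits wanted rather than a new absolute total, which also lets the old double-extend fallback go. The calling convention before and after, for illustration:]

    /* before: pass an absolute credit count */
    ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + extra);

    /* after: ask only for the extra credits needed */
    ret = ocfs2_extend_trans(handle, extra);
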
@@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2584 * records for all the bh in the path. 2539 * records for all the bh in the path.
2585 * So we have to allocate extra credits and access them. 2540 * So we have to allocate extra credits and access them.
2586 */ 2541 */
2587 ret = ocfs2_extend_trans(handle, 2542 ret = ocfs2_extend_trans(handle, subtree_index);
2588 handle->h_buffer_credits + subtree_index);
2589 if (ret) { 2543 if (ret) {
2590 mlog_errno(ret); 2544 mlog_errno(ret);
2591 goto out; 2545 goto out;
@@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2823 ocfs2_remove_empty_extent(right_leaf_el); 2777 ocfs2_remove_empty_extent(right_leaf_el);
2824 } 2778 }
2825 2779
2826 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2780 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2827 if (ret) 2781 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2828 mlog_errno(ret);
2829 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2830 if (ret)
2831 mlog_errno(ret);
2832 2782
2833 if (del_right_subtree) { 2783 if (del_right_subtree) {
2834 ocfs2_unlink_subtree(handle, et, left_path, right_path, 2784 ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2851 if (right_has_empty) 2801 if (right_has_empty)
2852 ocfs2_remove_empty_extent(left_leaf_el); 2802 ocfs2_remove_empty_extent(left_leaf_el);
2853 2803
2854 ret = ocfs2_journal_dirty(handle, et_root_bh); 2804 ocfs2_journal_dirty(handle, et_root_bh);
2855 if (ret)
2856 mlog_errno(ret);
2857 2805
2858 *deleted = 1; 2806 *deleted = 1;
2859 } else 2807 } else
@@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2962 } 2910 }
2963 2911
2964 ocfs2_remove_empty_extent(el); 2912 ocfs2_remove_empty_extent(el);
2965 2913 ocfs2_journal_dirty(handle, bh);
2966 ret = ocfs2_journal_dirty(handle, bh);
2967 if (ret)
2968 mlog_errno(ret);
2969 2914
2970out: 2915out:
2971 return ret; 2916 return ret;
@@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3506 3451
3507 ocfs2_cleanup_merge(el, index); 3452 ocfs2_cleanup_merge(el, index);
3508 3453
3509 ret = ocfs2_journal_dirty(handle, bh); 3454 ocfs2_journal_dirty(handle, bh);
3510 if (ret)
3511 mlog_errno(ret);
3512
3513 if (right_path) { 3455 if (right_path) {
3514 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3456 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3515 if (ret)
3516 mlog_errno(ret);
3517
3518 ocfs2_complete_edge_insert(handle, left_path, right_path, 3457 ocfs2_complete_edge_insert(handle, left_path, right_path,
3519 subtree_index); 3458 subtree_index);
3520 } 3459 }
@@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3683 3622
3684 ocfs2_cleanup_merge(el, index); 3623 ocfs2_cleanup_merge(el, index);
3685 3624
3686 ret = ocfs2_journal_dirty(handle, bh); 3625 ocfs2_journal_dirty(handle, bh);
3687 if (ret)
3688 mlog_errno(ret);
3689
3690 if (left_path) { 3626 if (left_path) {
3691 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3627 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3692 if (ret)
3693 mlog_errno(ret);
3694 3628
3695 /* 3629 /*
3696 * In the situation that the right_rec is empty and the extent 3630 * In the situation that the right_rec is empty and the extent
@@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
4016 le32_add_cpu(&rec->e_int_clusters, 3950 le32_add_cpu(&rec->e_int_clusters,
4017 -le32_to_cpu(rec->e_cpos)); 3951 -le32_to_cpu(rec->e_cpos));
4018 3952
4019 ret = ocfs2_journal_dirty(handle, bh); 3953 ocfs2_journal_dirty(handle, bh);
4020 if (ret)
4021 mlog_errno(ret);
4022
4023 } 3954 }
4024} 3955}
4025 3956
@@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle,
4203 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 4134 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4204 4135
4205 if (left_path) { 4136 if (left_path) {
4206 int credits = handle->h_buffer_credits;
4207
4208 /* 4137 /*
4209 * There's a chance that left_path got passed back to 4138 * There's a chance that left_path got passed back to
4210 * us without being accounted for in the 4139 * us without being accounted for in the
4211 * journal. Extend our transaction here to be sure we 4140 * journal. Extend our transaction here to be sure we
4212 * can change those blocks. 4141 * can change those blocks.
4213 */ 4142 */
4214 credits += left_path->p_tree_depth; 4143 ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4215
4216 ret = ocfs2_extend_trans(handle, credits);
4217 if (ret < 0) { 4144 if (ret < 0) {
4218 mlog_errno(ret); 4145 mlog_errno(ret);
4219 goto out; 4146 goto out;
@@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle,
4251 * dirty this for us. 4178 * dirty this for us.
4252 */ 4179 */
4253 if (left_path) 4180 if (left_path)
4254 ret = ocfs2_journal_dirty(handle, 4181 ocfs2_journal_dirty(handle,
4255 path_leaf_bh(left_path)); 4182 path_leaf_bh(left_path));
4256 if (ret)
4257 mlog_errno(ret);
4258 } else 4183 } else
4259 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), 4184 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4260 insert); 4185 insert);
4261 4186
4262 ret = ocfs2_journal_dirty(handle, leaf_bh); 4187 ocfs2_journal_dirty(handle, leaf_bh);
4263 if (ret)
4264 mlog_errno(ret);
4265 4188
4266 if (left_path) { 4189 if (left_path) {
4267 /* 4190 /*
@@ -4384,9 +4307,7 @@ out_update_clusters:
4384 ocfs2_et_update_clusters(et, 4307 ocfs2_et_update_clusters(et,
4385 le16_to_cpu(insert_rec->e_leaf_clusters)); 4308 le16_to_cpu(insert_rec->e_leaf_clusters));
4386 4309
4387 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4310 ocfs2_journal_dirty(handle, et->et_root_bh);
4388 if (ret)
4389 mlog_errno(ret);
4390 4311
4391out: 4312out:
4392 ocfs2_free_path(left_path); 4313 ocfs2_free_path(left_path);
@@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4866 goto leave; 4787 goto leave;
4867 } 4788 }
4868 4789
4869 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4790 status = __ocfs2_claim_clusters(handle, data_ac, 1,
4870 clusters_to_add, &bit_off, &num_bits); 4791 clusters_to_add, &bit_off, &num_bits);
4871 if (status < 0) { 4792 if (status < 0) {
4872 if (status != -ENOSPC) 4793 if (status != -ENOSPC)
@@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4895 goto leave; 4816 goto leave;
4896 } 4817 }
4897 4818
4898 status = ocfs2_journal_dirty(handle, et->et_root_bh); 4819 ocfs2_journal_dirty(handle, et->et_root_bh);
4899 if (status < 0) {
4900 mlog_errno(status);
4901 goto leave;
4902 }
4903 4820
4904 clusters_to_add -= num_bits; 4821 clusters_to_add -= num_bits;
4905 *logical_offset += num_bits; 4822 *logical_offset += num_bits;
@@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5309 int index, u32 new_range, 5226 int index, u32 new_range,
5310 struct ocfs2_alloc_context *meta_ac) 5227 struct ocfs2_alloc_context *meta_ac)
5311{ 5228{
5312 int ret, depth, credits = handle->h_buffer_credits; 5229 int ret, depth, credits;
5313 struct buffer_head *last_eb_bh = NULL; 5230 struct buffer_head *last_eb_bh = NULL;
5314 struct ocfs2_extent_block *eb; 5231 struct ocfs2_extent_block *eb;
5315 struct ocfs2_extent_list *rightmost_el, *el; 5232 struct ocfs2_extent_list *rightmost_el, *el;
@@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5340 } else 5257 } else
5341 rightmost_el = path_leaf_el(path); 5258 rightmost_el = path_leaf_el(path);
5342 5259
5343 credits += path->p_tree_depth + 5260 credits = path->p_tree_depth +
5344 ocfs2_extend_meta_needed(et->et_root_el); 5261 ocfs2_extend_meta_needed(et->et_root_el);
5345 ret = ocfs2_extend_trans(handle, credits); 5262 ret = ocfs2_extend_trans(handle, credits);
5346 if (ret) { 5263 if (ret) {
5347 mlog_errno(ret); 5264 mlog_errno(ret);
@@ -5671,19 +5588,97 @@ out:
5671 return ret; 5588 return ret;
5672} 5589}
5673 5590
5591/*
5592 * ocfs2_reserve_blocks_for_rec_trunc() looks basically the
5593 * same as ocfs2_lock_allocators(), except that it accepts a number
5594 * of extra blocks to reserve, and it only handles metadata
5595 * allocations.
5596 *
5597 * Currently, only ocfs2_remove_btree_range() uses it for truncating
5598 * and punching holes.
5599 */
5600static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5601 struct ocfs2_extent_tree *et,
5602 u32 extents_to_split,
5603 struct ocfs2_alloc_context **ac,
5604 int extra_blocks)
5605{
5606 int ret = 0, num_free_extents;
5607 unsigned int max_recs_needed = 2 * extents_to_split;
5608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5609
5610 *ac = NULL;
5611
5612 num_free_extents = ocfs2_num_free_extents(osb, et);
5613 if (num_free_extents < 0) {
5614 ret = num_free_extents;
5615 mlog_errno(ret);
5616 goto out;
5617 }
5618
5619 if (!num_free_extents ||
5620 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5621 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5622
5623 if (extra_blocks) {
5624 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5625 if (ret < 0) {
5626 if (ret != -ENOSPC)
5627 mlog_errno(ret);
5628 goto out;
5629 }
5630 }
5631
5632out:
5633 if (ret) {
5634 if (*ac) {
5635 ocfs2_free_alloc_context(*ac);
5636 *ac = NULL;
5637 }
5638 }
5639
5640 return ret;
5641}
5642
5674int ocfs2_remove_btree_range(struct inode *inode, 5643int ocfs2_remove_btree_range(struct inode *inode,
5675 struct ocfs2_extent_tree *et, 5644 struct ocfs2_extent_tree *et,
5676 u32 cpos, u32 phys_cpos, u32 len, 5645 u32 cpos, u32 phys_cpos, u32 len, int flags,
5677 struct ocfs2_cached_dealloc_ctxt *dealloc) 5646 struct ocfs2_cached_dealloc_ctxt *dealloc,
5647 u64 refcount_loc)
5678{ 5648{
5679 int ret; 5649 int ret, credits = 0, extra_blocks = 0;
5680 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 5650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5681 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5651 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5682 struct inode *tl_inode = osb->osb_tl_inode; 5652 struct inode *tl_inode = osb->osb_tl_inode;
5683 handle_t *handle; 5653 handle_t *handle;
5684 struct ocfs2_alloc_context *meta_ac = NULL; 5654 struct ocfs2_alloc_context *meta_ac = NULL;
5655 struct ocfs2_refcount_tree *ref_tree = NULL;
5656
5657 if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5658 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
5659 OCFS2_HAS_REFCOUNT_FL));
5660
5661 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5662 &ref_tree, NULL);
5663 if (ret) {
5664 mlog_errno(ret);
5665 goto out;
5666 }
5685 5667
5686 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); 5668 ret = ocfs2_prepare_refcount_change_for_del(inode,
5669 refcount_loc,
5670 phys_blkno,
5671 len,
5672 &credits,
5673 &extra_blocks);
5674 if (ret < 0) {
5675 mlog_errno(ret);
5676 goto out;
5677 }
5678 }
5679
5680 ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5681 extra_blocks);
5687 if (ret) { 5682 if (ret) {
5688 mlog_errno(ret); 5683 mlog_errno(ret);
5689 return ret; 5684 return ret;
@@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
5699 } 5694 }
5700 } 5695 }
5701 5696
5702 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); 5697 handle = ocfs2_start_trans(osb,
5698 ocfs2_remove_extent_credits(osb->sb) + credits);
5703 if (IS_ERR(handle)) { 5699 if (IS_ERR(handle)) {
5704 ret = PTR_ERR(handle); 5700 ret = PTR_ERR(handle);
5705 mlog_errno(ret); 5701 mlog_errno(ret);
@@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode,
5724 5720
5725 ocfs2_et_update_clusters(et, -len); 5721 ocfs2_et_update_clusters(et, -len);
5726 5722
5727 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5723 ocfs2_journal_dirty(handle, et->et_root_bh);
5728 if (ret) {
5729 mlog_errno(ret);
5730 goto out_commit;
5731 }
5732 5724
5733 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 5725 if (phys_blkno) {
5734 if (ret) 5726 if (flags & OCFS2_EXT_REFCOUNTED)
5735 mlog_errno(ret); 5727 ret = ocfs2_decrease_refcount(inode, handle,
5728 ocfs2_blocks_to_clusters(osb->sb,
5729 phys_blkno),
5730 len, meta_ac,
5731 dealloc, 1);
5732 else
5733 ret = ocfs2_truncate_log_append(osb, handle,
5734 phys_blkno, len);
5735 if (ret)
5736 mlog_errno(ret);
5737
5738 }
5736 5739
5737out_commit: 5740out_commit:
5738 ocfs2_commit_trans(osb, handle); 5741 ocfs2_commit_trans(osb, handle);
@@ -5742,6 +5745,9 @@ out:
5742 if (meta_ac) 5745 if (meta_ac)
5743 ocfs2_free_alloc_context(meta_ac); 5746 ocfs2_free_alloc_context(meta_ac);
5744 5747
5748 if (ref_tree)
5749 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5750
5745 return ret; 5751 return ret;
5746} 5752}
5747 5753
@@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5850 } 5856 }
5851 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5857 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5852 5858
5853 status = ocfs2_journal_dirty(handle, tl_bh); 5859 ocfs2_journal_dirty(handle, tl_bh);
5854 if (status < 0) {
5855 mlog_errno(status);
5856 goto bail;
5857 }
5858 5860
5859bail: 5861bail:
5860 mlog_exit(status); 5862 mlog_exit(status);
@@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5893 5895
5894 tl->tl_used = cpu_to_le16(i); 5896 tl->tl_used = cpu_to_le16(i);
5895 5897
5896 status = ocfs2_journal_dirty(handle, tl_bh); 5898 ocfs2_journal_dirty(handle, tl_bh);
5897 if (status < 0) {
5898 mlog_errno(status);
5899 goto bail;
5900 }
5901 5899
5902 /* TODO: Perhaps we can calculate the bulk of the 5900 /* TODO: Perhaps we can calculate the bulk of the
5903 * credits up front rather than extending like 5901 * credits up front rather than extending like
@@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6298 */ 6296 */
6299struct ocfs2_cached_block_free { 6297struct ocfs2_cached_block_free {
6300 struct ocfs2_cached_block_free *free_next; 6298 struct ocfs2_cached_block_free *free_next;
6299 u64 free_bg;
6301 u64 free_blk; 6300 u64 free_blk;
6302 unsigned int free_bit; 6301 unsigned int free_bit;
6303}; 6302};
@@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6344 } 6343 }
6345 6344
6346 while (head) { 6345 while (head) {
6347 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6346 if (head->free_bg)
6348 head->free_bit); 6347 bg_blkno = head->free_bg;
6348 else
6349 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6350 head->free_bit);
6349 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 6351 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6350 head->free_bit, (unsigned long long)head->free_blk); 6352 head->free_bit, (unsigned long long)head->free_blk);
6351 6353
@@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6393 int ret = 0; 6395 int ret = 0;
6394 struct ocfs2_cached_block_free *item; 6396 struct ocfs2_cached_block_free *item;
6395 6397
6396 item = kmalloc(sizeof(*item), GFP_NOFS); 6398 item = kzalloc(sizeof(*item), GFP_NOFS);
6397 if (item == NULL) { 6399 if (item == NULL) {
6398 ret = -ENOMEM; 6400 ret = -ENOMEM;
6399 mlog_errno(ret); 6401 mlog_errno(ret);
@@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type,
6533} 6535}
6534 6536
6535int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6537int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6536 int type, int slot, u64 blkno, 6538 int type, int slot, u64 suballoc,
6537 unsigned int bit) 6539 u64 blkno, unsigned int bit)
6538{ 6540{
6539 int ret; 6541 int ret;
6540 struct ocfs2_per_slot_free_list *fl; 6542 struct ocfs2_per_slot_free_list *fl;
@@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6547 goto out; 6549 goto out;
6548 } 6550 }
6549 6551
6550 item = kmalloc(sizeof(*item), GFP_NOFS); 6552 item = kzalloc(sizeof(*item), GFP_NOFS);
6551 if (item == NULL) { 6553 if (item == NULL) {
6552 ret = -ENOMEM; 6554 ret = -ENOMEM;
6553 mlog_errno(ret); 6555 mlog_errno(ret);
@@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6557 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 6559 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6558 type, slot, bit, (unsigned long long)blkno); 6560 type, slot, bit, (unsigned long long)blkno);
6559 6561
6562 item->free_bg = suballoc;
6560 item->free_blk = blkno; 6563 item->free_blk = blkno;
6561 item->free_bit = bit; 6564 item->free_bit = bit;
6562 item->free_next = fl->f_first; 6565 item->free_next = fl->f_first;
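
[Review note: the dealloc cache entries grow a free_bg field recording the suballocator group block at the time the block was cached. Switching the allocations to kzalloc() leaves free_bg zero for callers that do not supply it, and the free path prefers the recorded group over recomputing it, as in ocfs2_free_cached_blocks() above:]

    if (head->free_bg)
        bg_blkno = head->free_bg;
    else
        bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
                                              head->free_bit);
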
@@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6573{ 6576{
6574 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 6577 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6575 le16_to_cpu(eb->h_suballoc_slot), 6578 le16_to_cpu(eb->h_suballoc_slot),
6579 le64_to_cpu(eb->h_suballoc_loc),
6576 le64_to_cpu(eb->h_blkno), 6580 le64_to_cpu(eb->h_blkno),
6577 le16_to_cpu(eb->h_suballoc_bit)); 6581 le16_to_cpu(eb->h_suballoc_bit));
6578} 6582}
6579 6583
6580/* This function will figure out whether the currently last extent
6581 * block will be deleted, and if it will, what the new last extent
6582 * block will be so we can update its h_next_leaf_blk field, as well
6583 * as the dinode's i_last_eb_blk. */
6584static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6585 unsigned int clusters_to_del,
6586 struct ocfs2_path *path,
6587 struct buffer_head **new_last_eb)
6588{
6589 int next_free, ret = 0;
6590 u32 cpos;
6591 struct ocfs2_extent_rec *rec;
6592 struct ocfs2_extent_block *eb;
6593 struct ocfs2_extent_list *el;
6594 struct buffer_head *bh = NULL;
6595
6596 *new_last_eb = NULL;
6597
6598 /* we have no tree, so of course, no last_eb. */
6599 if (!path->p_tree_depth)
6600 goto out;
6601
6602 /* trunc to zero special case - this makes tree_depth = 0
6603 * regardless of what it is. */
6604 if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6605 goto out;
6606
6607 el = path_leaf_el(path);
6608 BUG_ON(!el->l_next_free_rec);
6609
6610 /*
6611 * Make sure that this extent list will actually be empty
6612 * after we clear away the data. We can shortcut out if
6613 * there's more than one non-empty extent in the
6614 * list. Otherwise, a check of the remaining extent is
6615 * necessary.
6616 */
6617 next_free = le16_to_cpu(el->l_next_free_rec);
6618 rec = NULL;
6619 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6620 if (next_free > 2)
6621 goto out;
6622
6623 /* We may have a valid extent in index 1, check it. */
6624 if (next_free == 2)
6625 rec = &el->l_recs[1];
6626
6627 /*
6628 * Fall through - no more nonempty extents, so we want
6629 * to delete this leaf.
6630 */
6631 } else {
6632 if (next_free > 1)
6633 goto out;
6634
6635 rec = &el->l_recs[0];
6636 }
6637
6638 if (rec) {
6639 /*
6640 * Check if we'll only be trimming off the end of this
6641 * cluster.
6642 */
6643 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6644 goto out;
6645 }
6646
6647 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6648 if (ret) {
6649 mlog_errno(ret);
6650 goto out;
6651 }
6652
6653 ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6654 if (ret) {
6655 mlog_errno(ret);
6656 goto out;
6657 }
6658
6659 eb = (struct ocfs2_extent_block *) bh->b_data;
6660 el = &eb->h_list;
6661
6662 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6663 * Any corruption is a code bug. */
6664 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6665
6666 *new_last_eb = bh;
6667 get_bh(*new_last_eb);
6668 mlog(0, "returning block %llu, (cpos: %u)\n",
6669 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6670out:
6671 brelse(bh);
6672
6673 return ret;
6674}
6675
6676/*
6677 * Trim some clusters off the rightmost edge of a tree. Only called
6678 * during truncate.
6679 *
6680 * The caller needs to:
6681 * - start journaling of each path component.
6682 * - compute and fully set up any new last ext block
6683 */
6684static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6685 handle_t *handle, struct ocfs2_truncate_context *tc,
6686 u32 clusters_to_del, u64 *delete_start, u8 *flags)
6687{
6688 int ret, i, index = path->p_tree_depth;
6689 u32 new_edge = 0;
6690 u64 deleted_eb = 0;
6691 struct buffer_head *bh;
6692 struct ocfs2_extent_list *el;
6693 struct ocfs2_extent_rec *rec;
6694
6695 *delete_start = 0;
6696 *flags = 0;
6697
6698 while (index >= 0) {
6699 bh = path->p_node[index].bh;
6700 el = path->p_node[index].el;
6701
6702 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6703 index, (unsigned long long)bh->b_blocknr);
6704
6705 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6706
6707 if (index !=
6708 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6709 ocfs2_error(inode->i_sb,
6710 "Inode %lu has invalid ext. block %llu",
6711 inode->i_ino,
6712 (unsigned long long)bh->b_blocknr);
6713 ret = -EROFS;
6714 goto out;
6715 }
6716
6717find_tail_record:
6718 i = le16_to_cpu(el->l_next_free_rec) - 1;
6719 rec = &el->l_recs[i];
6720
6721 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6722 "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6723 ocfs2_rec_clusters(el, rec),
6724 (unsigned long long)le64_to_cpu(rec->e_blkno),
6725 le16_to_cpu(el->l_next_free_rec));
6726
6727 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6728
6729 if (le16_to_cpu(el->l_tree_depth) == 0) {
6730 /*
6731 * If the leaf block contains a single empty
6732 * extent and no records, we can just remove
6733 * the block.
6734 */
6735 if (i == 0 && ocfs2_is_empty_extent(rec)) {
6736 memset(rec, 0,
6737 sizeof(struct ocfs2_extent_rec));
6738 el->l_next_free_rec = cpu_to_le16(0);
6739
6740 goto delete;
6741 }
6742
6743 /*
6744 * Remove any empty extents by shifting things
6745 * left. That should make life much easier on
6746 * the code below. This condition is rare
6747 * enough that we shouldn't see a performance
6748 * hit.
6749 */
6750 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6751 le16_add_cpu(&el->l_next_free_rec, -1);
6752
6753 for(i = 0;
6754 i < le16_to_cpu(el->l_next_free_rec); i++)
6755 el->l_recs[i] = el->l_recs[i + 1];
6756
6757 memset(&el->l_recs[i], 0,
6758 sizeof(struct ocfs2_extent_rec));
6759
6760 /*
6761 * We've modified our extent list. The
6762 * simplest way to handle this change
6763 * is to begin the search from the
6764 * start again.
6765 */
6766 goto find_tail_record;
6767 }
6768
6769 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6770
6771 /*
6772 * We'll use "new_edge" on our way back up the
6773 * tree to know what our rightmost cpos is.
6774 */
6775 new_edge = le16_to_cpu(rec->e_leaf_clusters);
6776 new_edge += le32_to_cpu(rec->e_cpos);
6777
6778 /*
6779 * The caller will use this to delete data blocks.
6780 */
6781 *delete_start = le64_to_cpu(rec->e_blkno)
6782 + ocfs2_clusters_to_blocks(inode->i_sb,
6783 le16_to_cpu(rec->e_leaf_clusters));
6784 *flags = rec->e_flags;
6785
6786 /*
6787 * If it's now empty, remove this record.
6788 */
6789 if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6790 memset(rec, 0,
6791 sizeof(struct ocfs2_extent_rec));
6792 le16_add_cpu(&el->l_next_free_rec, -1);
6793 }
6794 } else {
6795 if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6796 memset(rec, 0,
6797 sizeof(struct ocfs2_extent_rec));
6798 le16_add_cpu(&el->l_next_free_rec, -1);
6799
6800 goto delete;
6801 }
6802
6803 /* Can this actually happen? */
6804 if (le16_to_cpu(el->l_next_free_rec) == 0)
6805 goto delete;
6806
6807 /*
6808 * We never actually deleted any clusters
6809 * because our leaf was empty. There's no
6810 * reason to adjust the rightmost edge then.
6811 */
6812 if (new_edge == 0)
6813 goto delete;
6814
6815 rec->e_int_clusters = cpu_to_le32(new_edge);
6816 le32_add_cpu(&rec->e_int_clusters,
6817 -le32_to_cpu(rec->e_cpos));
6818
6819 /*
6820 * A deleted child record should have been
6821 * caught above.
6822 */
6823 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6824 }
6825
6826delete:
6827 ret = ocfs2_journal_dirty(handle, bh);
6828 if (ret) {
6829 mlog_errno(ret);
6830 goto out;
6831 }
6832
6833 mlog(0, "extent list container %llu, after: record %d: "
6834 "(%u, %u, %llu), next = %u.\n",
6835 (unsigned long long)bh->b_blocknr, i,
6836 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6837 (unsigned long long)le64_to_cpu(rec->e_blkno),
6838 le16_to_cpu(el->l_next_free_rec));
6839
6840 /*
6841 * We must be careful to only attempt delete of an
6842 * extent block (and not the root inode block).
6843 */
6844 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6845 struct ocfs2_extent_block *eb =
6846 (struct ocfs2_extent_block *)bh->b_data;
6847
6848 /*
6849 * Save this for use when processing the
6850 * parent block.
6851 */
6852 deleted_eb = le64_to_cpu(eb->h_blkno);
6853
6854 mlog(0, "deleting this extent block.\n");
6855
6856 ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6857
6858 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6859 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6860 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6861
6862 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6863 /* An error here is not fatal. */
6864 if (ret < 0)
6865 mlog_errno(ret);
6866 } else {
6867 deleted_eb = 0;
6868 }
6869
6870 index--;
6871 }
6872
6873 ret = 0;
6874out:
6875 return ret;
6876}
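
The byte-order helpers this loop relies on (le16_add_cpu(), le32_add_cpu()) do an in-place read-modify-write while keeping the stored value little-endian; their generic kernel definition is essentially the following sketch:

static inline void le16_add_cpu_sketch(__le16 *var, u16 val)
{
	/* decode, add in CPU order, re-encode */
	*var = cpu_to_le16(le16_to_cpu(*var) + val);
}

Note the code above passes a negative amount (-clusters_to_del), which works because the unsigned addition wraps, giving correct two's-complement subtraction.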
6877
6878static int ocfs2_do_truncate(struct ocfs2_super *osb,
6879 unsigned int clusters_to_del,
6880 struct inode *inode,
6881 struct buffer_head *fe_bh,
6882 handle_t *handle,
6883 struct ocfs2_truncate_context *tc,
6884 struct ocfs2_path *path,
6885 struct ocfs2_alloc_context *meta_ac)
6886{
6887 int status;
6888 struct ocfs2_dinode *fe;
6889 struct ocfs2_extent_block *last_eb = NULL;
6890 struct ocfs2_extent_list *el;
6891 struct buffer_head *last_eb_bh = NULL;
6892 u64 delete_blk = 0;
6893 u8 rec_flags;
6894
6895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
6896
6897 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6898 path, &last_eb_bh);
6899 if (status < 0) {
6900 mlog_errno(status);
6901 goto bail;
6902 }
6903
6904 /*
6905 * Each component will be touched, so we might as well journal
6906 * here to avoid having to handle errors later.
6907 */
6908 status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6909 if (status < 0) {
6910 mlog_errno(status);
6911 goto bail;
6912 }
6913
6914 if (last_eb_bh) {
6915 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6916 OCFS2_JOURNAL_ACCESS_WRITE);
6917 if (status < 0) {
6918 mlog_errno(status);
6919 goto bail;
6920 }
6921
6922 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6923 }
6924
6925 el = &(fe->id2.i_list);
6926
6927 /*
6928 * Lower levels depend on this never happening, but it's best
6929 * to check it up here before changing the tree.
6930 */
6931 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6932 ocfs2_error(inode->i_sb,
6933 "Inode %lu has an empty extent record, depth %u\n",
6934 inode->i_ino, le16_to_cpu(el->l_tree_depth));
6935 status = -EROFS;
6936 goto bail;
6937 }
6938
6939 dquot_free_space_nodirty(inode,
6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6943 clusters_to_del;
6944 spin_unlock(&OCFS2_I(inode)->ip_lock);
6945 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6946 inode->i_blocks = ocfs2_inode_sector_count(inode);
6947
6948 status = ocfs2_trim_tree(inode, path, handle, tc,
6949 clusters_to_del, &delete_blk, &rec_flags);
6950 if (status) {
6951 mlog_errno(status);
6952 goto bail;
6953 }
6954
6955 if (le32_to_cpu(fe->i_clusters) == 0) {
6956 /* trunc to zero is a special case. */
6957 el->l_tree_depth = 0;
6958 fe->i_last_eb_blk = 0;
6959 } else if (last_eb)
6960 fe->i_last_eb_blk = last_eb->h_blkno;
6961
6962 status = ocfs2_journal_dirty(handle, fe_bh);
6963 if (status < 0) {
6964 mlog_errno(status);
6965 goto bail;
6966 }
6967
6968 if (last_eb) {
6969 /* If there will be a new last extent block, then by
6970 * definition, there cannot be any leaves to the right of
6971 * it. */
6972 last_eb->h_next_leaf_blk = 0;
6973 status = ocfs2_journal_dirty(handle, last_eb_bh);
6974 if (status < 0) {
6975 mlog_errno(status);
6976 goto bail;
6977 }
6978 }
6979
6980 if (delete_blk) {
6981 if (rec_flags & OCFS2_EXT_REFCOUNTED)
6982 status = ocfs2_decrease_refcount(inode, handle,
6983 ocfs2_blocks_to_clusters(osb->sb,
6984 delete_blk),
6985 clusters_to_del, meta_ac,
6986 &tc->tc_dealloc, 1);
6987 else
6988 status = ocfs2_truncate_log_append(osb, handle,
6989 delete_blk,
6990 clusters_to_del);
6991 if (status < 0) {
6992 mlog_errno(status);
6993 goto bail;
6994 }
6995 }
6996 status = 0;
6997bail:
6998 brelse(last_eb_bh);
6999 mlog_exit(status);
7000 return status;
7001}
7002
7003static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) 6584static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
7004{ 6585{
7005 set_buffer_uptodate(bh); 6586 set_buffer_uptodate(bh);
@@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7307 goto out_commit; 6888 goto out_commit;
7308 did_quota = 1; 6889 did_quota = 1;
7309 6890
7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6891 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
6892
6893 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
7311 &num); 6894 &num);
7312 if (ret) { 6895 if (ret) {
7313 mlog_errno(ret); 6896 mlog_errno(ret);
@@ -7406,26 +6989,29 @@ out:
7406 */ 6989 */
7407int ocfs2_commit_truncate(struct ocfs2_super *osb, 6990int ocfs2_commit_truncate(struct ocfs2_super *osb,
7408 struct inode *inode, 6991 struct inode *inode,
7409 struct buffer_head *fe_bh, 6992 struct buffer_head *di_bh)
7410 struct ocfs2_truncate_context *tc)
7411{ 6993{
7412 int status, i, credits, tl_sem = 0; 6994 int status = 0, i, flags = 0;
7413 u32 clusters_to_del, new_highest_cpos, range; 6995 u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
7414 u64 blkno = 0; 6996 u64 blkno = 0;
7415 struct ocfs2_extent_list *el; 6997 struct ocfs2_extent_list *el;
7416 handle_t *handle = NULL; 6998 struct ocfs2_extent_rec *rec;
7417 struct inode *tl_inode = osb->osb_tl_inode;
7418 struct ocfs2_path *path = NULL; 6999 struct ocfs2_path *path = NULL;
7419 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 7000 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7420 struct ocfs2_alloc_context *meta_ac = NULL; 7001 struct ocfs2_extent_list *root_el = &(di->id2.i_list);
7421 struct ocfs2_refcount_tree *ref_tree = NULL; 7002 u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
7003 struct ocfs2_extent_tree et;
7004 struct ocfs2_cached_dealloc_ctxt dealloc;
7422 7005
7423 mlog_entry_void(); 7006 mlog_entry_void();
7424 7007
7008 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7009 ocfs2_init_dealloc_ctxt(&dealloc);
7010
7425 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 7011 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7426 i_size_read(inode)); 7012 i_size_read(inode));
7427 7013
7428 path = ocfs2_new_path(fe_bh, &di->id2.i_list, 7014 path = ocfs2_new_path(di_bh, &di->id2.i_list,
7429 ocfs2_journal_access_di); 7015 ocfs2_journal_access_di);
7430 if (!path) { 7016 if (!path) {
7431 status = -ENOMEM; 7017 status = -ENOMEM;
@@ -7444,8 +7030,6 @@ start:
7444 goto bail; 7030 goto bail;
7445 } 7031 }
7446 7032
7447 credits = 0;
7448
7449 /* 7033 /*
7450 * Truncate always works against the rightmost tree branch. 7034 * Truncate always works against the rightmost tree branch.
7451 */ 7035 */
@@ -7480,101 +7064,62 @@ start:
7480 } 7064 }
7481 7065
7482 i = le16_to_cpu(el->l_next_free_rec) - 1; 7066 i = le16_to_cpu(el->l_next_free_rec) - 1;
7483 range = le32_to_cpu(el->l_recs[i].e_cpos) + 7067 rec = &el->l_recs[i];
7484 ocfs2_rec_clusters(el, &el->l_recs[i]); 7068 flags = rec->e_flags;
7485 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 7069 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
7486 clusters_to_del = 0; 7070
7487 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 7071 if (i == 0 && ocfs2_is_empty_extent(rec)) {
7488 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 7072 /*
7489 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 7073 * Lower levels depend on this never happening, but it's best
7074 * to check it up here before changing the tree.
7075 */
7076 if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7077 ocfs2_error(inode->i_sb, "Inode %lu has an empty "
7078 "extent record, depth %u\n", inode->i_ino,
7079 le16_to_cpu(root_el->l_tree_depth));
7080 status = -EROFS;
7081 goto bail;
7082 }
7083 trunc_cpos = le32_to_cpu(rec->e_cpos);
7084 trunc_len = 0;
7085 blkno = 0;
7086 } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7087 /*
7088 * Truncate entire record.
7089 */
7090 trunc_cpos = le32_to_cpu(rec->e_cpos);
7091 trunc_len = ocfs2_rec_clusters(el, rec);
7092 blkno = le64_to_cpu(rec->e_blkno);
7490 } else if (range > new_highest_cpos) { 7093 } else if (range > new_highest_cpos) {
7491 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 7094 /*
7492 le32_to_cpu(el->l_recs[i].e_cpos)) - 7095 * Partial truncate. It should also be
7493 new_highest_cpos; 7096 * the last truncate we're doing.
7494 blkno = le64_to_cpu(el->l_recs[i].e_blkno) + 7097 */
7495 ocfs2_clusters_to_blocks(inode->i_sb, 7098 trunc_cpos = new_highest_cpos;
7496 ocfs2_rec_clusters(el, &el->l_recs[i]) - 7099 trunc_len = range - new_highest_cpos;
7497 clusters_to_del); 7100 coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
7101 blkno = le64_to_cpu(rec->e_blkno) +
7102 ocfs2_clusters_to_blocks(inode->i_sb, coff);
7498 } else { 7103 } else {
7104 /*
7105 * Truncate completed, leave happily.
7106 */
7499 status = 0; 7107 status = 0;
7500 goto bail; 7108 goto bail;
7501 } 7109 }
7502 7110
7503 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 7111 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
7504 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7505
7506 if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
7507 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
7508 OCFS2_HAS_REFCOUNT_FL));
7509
7510 status = ocfs2_lock_refcount_tree(osb,
7511 le64_to_cpu(di->i_refcount_loc),
7512 1, &ref_tree, NULL);
7513 if (status) {
7514 mlog_errno(status);
7515 goto bail;
7516 }
7517
7518 status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
7519 blkno,
7520 clusters_to_del,
7521 &credits,
7522 &meta_ac);
7523 if (status < 0) {
7524 mlog_errno(status);
7525 goto bail;
7526 }
7527 }
7528
7529 mutex_lock(&tl_inode->i_mutex);
7530 tl_sem = 1;
7531 /* ocfs2_truncate_log_needs_flush guarantees us at least one
7532 * record is free for use. If there isn't any, we flush to get
7533 * an empty truncate log. */
7534 if (ocfs2_truncate_log_needs_flush(osb)) {
7535 status = __ocfs2_flush_truncate_log(osb);
7536 if (status < 0) {
7537 mlog_errno(status);
7538 goto bail;
7539 }
7540 }
7541 7112
7542 credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 7113 status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
7543 (struct ocfs2_dinode *)fe_bh->b_data, 7114 phys_cpos, trunc_len, flags, &dealloc,
7544 el); 7115 refcount_loc);
7545 handle = ocfs2_start_trans(osb, credits);
7546 if (IS_ERR(handle)) {
7547 status = PTR_ERR(handle);
7548 handle = NULL;
7549 mlog_errno(status);
7550 goto bail;
7551 }
7552
7553 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7554 tc, path, meta_ac);
7555 if (status < 0) { 7116 if (status < 0) {
7556 mlog_errno(status); 7117 mlog_errno(status);
7557 goto bail; 7118 goto bail;
7558 } 7119 }
7559 7120
7560 mutex_unlock(&tl_inode->i_mutex);
7561 tl_sem = 0;
7562
7563 ocfs2_commit_trans(osb, handle);
7564 handle = NULL;
7565
7566 ocfs2_reinit_path(path, 1); 7121 ocfs2_reinit_path(path, 1);
7567 7122
7568 if (meta_ac) {
7569 ocfs2_free_alloc_context(meta_ac);
7570 meta_ac = NULL;
7571 }
7572
7573 if (ref_tree) {
7574 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7575 ref_tree = NULL;
7576 }
7577
7578 /* 7123 /*
7579 * The check above will catch the case where we've truncated 7124 * The check above will catch the case where we've truncated
7580 * away all allocation. 7125 * away all allocation.
@@ -7585,25 +7130,10 @@ bail:
7585 7130
7586 ocfs2_schedule_truncate_log_flush(osb, 1); 7131 ocfs2_schedule_truncate_log_flush(osb, 1);
7587 7132
7588 if (tl_sem) 7133 ocfs2_run_deallocs(osb, &dealloc);
7589 mutex_unlock(&tl_inode->i_mutex);
7590
7591 if (handle)
7592 ocfs2_commit_trans(osb, handle);
7593
7594 if (meta_ac)
7595 ocfs2_free_alloc_context(meta_ac);
7596
7597 if (ref_tree)
7598 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7599
7600 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7601 7134
7602 ocfs2_free_path(path); 7135 ocfs2_free_path(path);
7603 7136
7604 /* This will drop the ext_alloc cluster lock for us */
7605 ocfs2_free_truncate_context(tc);
7606
7607 mlog_exit(status); 7137 mlog_exit(status);
7608 return status; 7138 return status;
7609} 7139}
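
The visible effect of the rewrite is at the call site: transaction start/commit, truncate-log locking, refcount-tree locking, and the ocfs2_truncate_context are all gone from the interface, folded behind ocfs2_remove_btree_range(). A hypothetical caller against the new prototype (example_truncate is illustrative, not part of this patch):

/* Sketch only; assumes the caller has already updated i_size and holds
 * the usual inode locks, as the existing callers do. */
static int example_truncate(struct ocfs2_super *osb, struct inode *inode,
			    struct buffer_head *di_bh)
{
	int status;

	status = ocfs2_commit_truncate(osb, inode, di_bh);
	if (status < 0)
		mlog_errno(status);
	return status;
}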
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 1db4359ccb90..55762b554b99 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
140 struct ocfs2_cached_dealloc_ctxt *dealloc); 140 struct ocfs2_cached_dealloc_ctxt *dealloc);
141int ocfs2_remove_btree_range(struct inode *inode, 141int ocfs2_remove_btree_range(struct inode *inode,
142 struct ocfs2_extent_tree *et, 142 struct ocfs2_extent_tree *et,
143 u32 cpos, u32 phys_cpos, u32 len, 143 u32 cpos, u32 phys_cpos, u32 len, int flags,
144 struct ocfs2_cached_dealloc_ctxt *dealloc); 144 struct ocfs2_cached_dealloc_ctxt *dealloc,
145 u64 refcount_loc);
145 146
146int ocfs2_num_free_extents(struct ocfs2_super *osb, 147int ocfs2_num_free_extents(struct ocfs2_super *osb,
147 struct ocfs2_extent_tree *et); 148 struct ocfs2_extent_tree *et);
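
The two new parameters let the remove path handle shared extents: callers pass the record's extent flags and the owning dinode's refcount tree location, while trees that can never be refcounted simply pass zero for both. The two call shapes, extracted from this patch:

/* Directory trees are never refcounted: flags = 0, refcount_loc = 0
 * (see the ocfs2_dx_dir_truncate() hunk below). */
ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
			       &dealloc, 0);

/* File data may be refcounted: pass the record flags and
 * di->i_refcount_loc (see ocfs2_commit_truncate() above). */
ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, phys_cpos,
			       trunc_len, flags, &dealloc, refcount_loc);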
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
209int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 210int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
210 u64 blkno, unsigned int bit); 211 u64 blkno, unsigned int bit);
211int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 212int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
212 int type, int slot, u64 blkno, 213 int type, int slot, u64 suballoc, u64 blkno,
213 unsigned int bit); 214 unsigned int bit);
214static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) 215static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
215{ 216{
@@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
233 struct ocfs2_truncate_context **tc); 234 struct ocfs2_truncate_context **tc);
234int ocfs2_commit_truncate(struct ocfs2_super *osb, 235int ocfs2_commit_truncate(struct ocfs2_super *osb,
235 struct inode *inode, 236 struct inode *inode,
236 struct buffer_head *fe_bh, 237 struct buffer_head *di_bh);
237 struct ocfs2_truncate_context *tc);
238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
239 unsigned int start, unsigned int end, int trunc); 239 unsigned int start, unsigned int end, int trunc);
240 240
@@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
319 struct ocfs2_path *path); 319 struct ocfs2_path *path);
320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
321 struct ocfs2_path *path, u32 *cpos); 321 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
323 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, 324int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
323 struct ocfs2_path *left, 325 struct ocfs2_path *left,
324 struct ocfs2_path *right); 326 struct ocfs2_path *right);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 21441ddb5506..3623ca20cc18 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1735,6 +1735,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1735 goto out; 1735 goto out;
1736 } 1736 }
1737 1737
1738 if (data_ac)
1739 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
1740
1738 credits = ocfs2_calc_extend_credits(inode->i_sb, 1741 credits = ocfs2_calc_extend_credits(inode->i_sb,
1739 &di->id2.i_list, 1742 &di->id2.i_list,
1740 clusters_to_alloc); 1743 clusters_to_alloc);
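
This hunk and the inline-data conversion earlier install the same two-line hookup: point the allocation context's ac_resv at the inode's local-alloc data reservation before claiming clusters, so repeated writes to one inode tend to draw from one reserved window. The recurring pattern:

/* Pattern as it appears at the write_begin and convert-to-extents
 * sites in this patch. */
if (data_ac)
	data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;

ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &num);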
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..ec6d12339593 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
404 * larger than 16 bits. 404 * larger than 16 bits.
405 */ 405 */
406 BUG_ON(ecc > USHORT_MAX); 406 BUG_ON(ecc > USHRT_MAX);
407 407
408 bc->bc_crc32e = cpu_to_le32(crc); 408 bc->bc_crc32e = cpu_to_le32(crc);
409 bc->bc_ecc = cpu_to_le16((u16)ecc); 409 bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
509 * larger than 16 bits. 509 * larger than 16 bits.
510 */ 510 */
511 BUG_ON(ecc > USHORT_MAX); 511 BUG_ON(ecc > USHRT_MAX);
512 512
513 bc->bc_crc32e = cpu_to_le32(crc); 513 bc->bc_crc32e = cpu_to_le32(crc);
514 bc->bc_ecc = cpu_to_le16((u16)ecc); 514 bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 21c808f752d8..f9d5d3ffc75a 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
@@ -407,6 +406,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
407 struct buffer_head *bh) 406 struct buffer_head *bh)
408{ 407{
409 int ret = 0; 408 int ret = 0;
409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
410 410
411 mlog_entry_void(); 411 mlog_entry_void();
412 412
@@ -426,6 +426,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
426 426
427 get_bh(bh); /* for end_buffer_write_sync() */ 427 get_bh(bh); /* for end_buffer_write_sync() */
428 bh->b_end_io = end_buffer_write_sync; 428 bh->b_end_io = end_buffer_write_sync;
429 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
429 submit_bh(WRITE, bh); 430 submit_bh(WRITE, bh);
430 431
431 wait_on_buffer(bh); 432 wait_on_buffer(bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5c9890006708..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h> 36#include <linux/debugfs.h>
37#include <linux/slab.h>
37 38
38#include "heartbeat.h" 39#include "heartbeat.h"
39#include "tcp.h" 40#include "tcp.h"
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 3bb928a2bf7d..c7fba396392d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
116 define_mask(ERROR), 116 define_mask(ERROR),
117 define_mask(NOTICE), 117 define_mask(NOTICE),
118 define_mask(KTHREAD), 118 define_mask(KTHREAD),
119 define_mask(RESERVATIONS),
119}; 120};
120 121
121static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; 122static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 3dfddbec32f2..fd96e2a2fa56 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,6 +119,7 @@
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
122 123
123#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 124#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
124#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 125#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
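
The new bit plugs into the existing masklog machinery, so reservation tracing is off by default and can be toggled at runtime through the mask attribute added above. Hypothetical usage from reservation code elsewhere in this series (the message text and the start/len variables are illustrative):

mlog(ML_RESERVATIONS, "resv: start %u len %u for inode %llu\n",
     start, len, (unsigned long long)OCFS2_I(inode)->ip_blkno);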
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index c81142e3ef84..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 */ 20 */
21 21
22#include <linux/slab.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/configfs.h> 25#include <linux/configfs.h>
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 639024033fce..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
44 * and if they're the last, they fire off the decision. 44 * and if they're the last, they fire off the decision.
45 */ 45 */
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/slab.h>
48#include <linux/workqueue.h> 47#include <linux/workqueue.h>
49#include <linux/reboot.h> 48#include <linux/reboot.h>
50 49
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 73e743eea2c8..aa75ca3f78da 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk)
583 o2net_sc_queue_work(sc, &sc->sc_connect_work); 583 o2net_sc_queue_work(sc, &sc->sc_connect_work);
584 break; 584 break;
585 default: 585 default:
586 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
587 " shutdown, state %d\n",
588 SC_NODEF_ARGS(sc), sk->sk_state);
586 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 589 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
587 break; 590 break;
588 } 591 }
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index efd77d071c80..f04ebcfffc4a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1194 else 1194 else
1195 de->inode = 0; 1195 de->inode = 0;
1196 dir->i_version++; 1196 dir->i_version++;
1197 status = ocfs2_journal_dirty(handle, bh); 1197 ocfs2_journal_dirty(handle, bh);
1198 goto bail; 1198 goto bail;
1199 } 1199 }
1200 i += le16_to_cpu(de->rec_len); 1200 i += le16_to_cpu(de->rec_len);
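
This is the first of many call sites in this file converted on the assumption, from earlier in this series, that ocfs2_journal_dirty() now returns void and handles failure internally, so the error branch at each caller is dead code. The repeated before/after shape:

/* before: */
status = ocfs2_journal_dirty(handle, bh);
if (status < 0) {
	mlog_errno(status);
	goto bail;
}

/* after: */
ocfs2_journal_dirty(handle, bh);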
@@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle,
1752 ocfs2_recalc_free_list(dir, handle, lookup); 1752 ocfs2_recalc_free_list(dir, handle, lookup);
1753 1753
1754 dir->i_version++; 1754 dir->i_version++;
1755 status = ocfs2_journal_dirty(handle, insert_bh); 1755 ocfs2_journal_dirty(handle, insert_bh);
1756 retval = 0; 1756 retval = 0;
1757 goto bail; 1757 goto bail;
1758 } 1758 }
@@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2297 } 2297 }
2298 2298
2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); 2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
2300
2301 ocfs2_journal_dirty(handle, di_bh); 2300 ocfs2_journal_dirty(handle, di_bh);
2302 if (ret) {
2303 mlog_errno(ret);
2304 goto out;
2305 }
2306 2301
2307 i_size_write(inode, size); 2302 i_size_write(inode, size);
2308 inode->i_nlink = 2; 2303 inode->i_nlink = 2;
@@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2366 ocfs2_init_dir_trailer(inode, new_bh, size); 2361 ocfs2_init_dir_trailer(inode, new_bh, size);
2367 } 2362 }
2368 2363
2369 status = ocfs2_journal_dirty(handle, new_bh); 2364 ocfs2_journal_dirty(handle, new_bh);
2370 if (status < 0) {
2371 mlog_errno(status);
2372 goto bail;
2373 }
2374 2365
2375 i_size_write(inode, inode->i_sb->s_blocksize); 2366 i_size_write(inode, inode->i_sb->s_blocksize);
2376 inode->i_nlink = 2; 2367 inode->i_nlink = 2;
@@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2404 int ret; 2395 int ret;
2405 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 2396 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2406 u16 dr_suballoc_bit; 2397 u16 dr_suballoc_bit;
2407 u64 dr_blkno; 2398 u64 suballoc_loc, dr_blkno;
2408 unsigned int num_bits; 2399 unsigned int num_bits;
2409 struct buffer_head *dx_root_bh = NULL; 2400 struct buffer_head *dx_root_bh = NULL;
2410 struct ocfs2_dx_root_block *dx_root; 2401 struct ocfs2_dx_root_block *dx_root;
2411 struct ocfs2_dir_block_trailer *trailer = 2402 struct ocfs2_dir_block_trailer *trailer =
2412 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); 2403 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2413 2404
2414 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit, 2405 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
2415 &num_bits, &dr_blkno); 2406 &dr_suballoc_bit, &num_bits, &dr_blkno);
2416 if (ret) { 2407 if (ret) {
2417 mlog_errno(ret); 2408 mlog_errno(ret);
2418 goto out; 2409 goto out;
@@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2431 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2432 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 2433 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2434 dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2435 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2436 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2437 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2458 dx_root->dr_list.l_count = 2450 dx_root->dr_list.l_count =
2459 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); 2451 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2460 } 2452 }
2461 2453 ocfs2_journal_dirty(handle, dx_root_bh);
2462 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2463 if (ret)
2464 mlog_errno(ret);
2465 2454
2466 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, 2455 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2467 OCFS2_JOURNAL_ACCESS_CREATE); 2456 OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2475 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; 2464 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2476 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 2465 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2477 2466
2478 ret = ocfs2_journal_dirty(handle, di_bh); 2467 ocfs2_journal_dirty(handle, di_bh);
2479 if (ret)
2480 mlog_errno(ret);
2481 2468
2482 *ret_dx_root_bh = dx_root_bh; 2469 *ret_dx_root_bh = dx_root_bh;
2483 dx_root_bh = NULL; 2470 dx_root_bh = NULL;
@@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2558 * chance of contiguousness as the directory grows in number 2545 * chance of contiguousness as the directory grows in number
2559 * of entries. 2546 * of entries.
2560 */ 2547 */
2561 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num); 2548 ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
2562 if (ret) { 2549 if (ret) {
2563 mlog_errno(ret); 2550 mlog_errno(ret);
2564 goto out; 2551 goto out;
@@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2991 * if we only get one now, that's enough to continue. The rest 2978 * if we only get one now, that's enough to continue. The rest
2992 * will be claimed after the conversion to extents. 2979 * will be claimed after the conversion to extents.
2993 */ 2980 */
2994 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); 2981 if (ocfs2_dir_resv_allowed(osb))
2982 data_ac->ac_resv = &oi->ip_la_data_resv;
2983 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
2995 if (ret) { 2984 if (ret) {
2996 mlog_errno(ret); 2985 mlog_errno(ret);
2997 goto out_commit; 2986 goto out_commit;
@@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3034 ocfs2_init_dir_trailer(dir, dirdata_bh, i); 3023 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3035 } 3024 }
3036 3025
3037 ret = ocfs2_journal_dirty(handle, dirdata_bh); 3026 ocfs2_journal_dirty(handle, dirdata_bh);
3038 if (ret) {
3039 mlog_errno(ret);
3040 goto out_commit;
3041 }
3042 3027
3043 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 3028 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3044 /* 3029 /*
@@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3104 */ 3089 */
3105 dir->i_blocks = ocfs2_inode_sector_count(dir); 3090 dir->i_blocks = ocfs2_inode_sector_count(dir);
3106 3091
3107 ret = ocfs2_journal_dirty(handle, di_bh); 3092 ocfs2_journal_dirty(handle, di_bh);
3108 if (ret) {
3109 mlog_errno(ret);
3110 goto out_commit;
3111 }
3112 3093
3113 if (ocfs2_supports_indexed_dirs(osb)) { 3094 if (ocfs2_supports_indexed_dirs(osb)) {
3114 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, 3095 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3138 * pass. Claim the 2nd cluster as a separate extent. 3119 * pass. Claim the 2nd cluster as a separate extent.
3139 */ 3120 */
3140 if (alloc > len) { 3121 if (alloc > len) {
3141 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 3122 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
3142 &len); 3123 &len);
3143 if (ret) { 3124 if (ret) {
3144 mlog_errno(ret); 3125 mlog_errno(ret);
@@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3369 goto bail; 3350 goto bail;
3370 } 3351 }
3371 3352
3353 if (ocfs2_dir_resv_allowed(osb))
3354 data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
3355
3372 credits = ocfs2_calc_extend_credits(sb, el, 1); 3356 credits = ocfs2_calc_extend_credits(sb, el, 1);
3373 } else { 3357 } else {
3374 spin_unlock(&OCFS2_I(dir)->ip_lock); 3358 spin_unlock(&OCFS2_I(dir)->ip_lock);
@@ -3423,11 +3407,7 @@ do_extend:
3423 } else { 3407 } else {
3424 de->rec_len = cpu_to_le16(sb->s_blocksize); 3408 de->rec_len = cpu_to_le16(sb->s_blocksize);
3425 } 3409 }
3426 status = ocfs2_journal_dirty(handle, new_bh); 3410 ocfs2_journal_dirty(handle, new_bh);
3427 if (status < 0) {
3428 mlog_errno(status);
3429 goto bail;
3430 }
3431 3411
3432 dir_i_size += dir->i_sb->s_blocksize; 3412 dir_i_size += dir->i_sb->s_blocksize;
3433 i_size_write(dir, dir_i_size); 3413 i_size_write(dir, dir_i_size);
@@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3906 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, 3886 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3907 dx_leaf_sort_swap); 3887 dx_leaf_sort_swap);
3908 3888
3909 ret = ocfs2_journal_dirty(handle, dx_leaf_bh); 3889 ocfs2_journal_dirty(handle, dx_leaf_bh);
3910 if (ret) {
3911 mlog_errno(ret);
3912 goto out_commit;
3913 }
3914 3890
3915 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, 3891 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3916 &split_hash); 3892 &split_hash);
@@ -4490,7 +4466,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4490 4466
4491 blk = le64_to_cpu(dx_root->dr_blkno); 4467 blk = le64_to_cpu(dx_root->dr_blkno);
4492 bit = le16_to_cpu(dx_root->dr_suballoc_bit); 4468 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4493 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 4469 if (dx_root->dr_suballoc_loc)
4470 bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
4471 else
4472 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4494 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, 4473 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4495 bit, bg_blkno, 1); 4474 bit, bg_blkno, 1);
4496 if (ret) 4475 if (ret)
@@ -4551,8 +4530,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4551 4530
4552 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); 4531 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4553 4532
4554 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 4533 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
4555 &dealloc); 4534 &dealloc, 0);
4556 if (ret) { 4535 if (ret) {
4557 mlog_errno(ret); 4536 mlog_errno(ret);
4558 goto out; 4537 goto out;
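
dr_suballoc_loc follows the same pattern as the h_suballoc_loc field added to extent blocks at the top of this diff: newly allocated structures record which suballocator block group they came from (plumbing for discontiguous block groups), and the free path prefers that record over deriving the group from the block number. Extracted:

if (dx_root->dr_suballoc_loc)
	/* new-style: group recorded at allocation time */
	bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
else
	/* older filesystems leave the field zero */
	bg_blkno = ocfs2_which_suballoc_group(blk, bit);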
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index dccc439fa087..f44999156839 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -89,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
89 return 0; 88 return 0;
90} 89}
91 90
92static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
93{ 92{
94 mlog_entry_void(); 93 mlog_entry_void();
95 94
@@ -146,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
146} 145}
147 146
148 147
149static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
150{ 149{
151 mlog_entry_void(); 150 mlog_entry_void();
152 151
@@ -185,9 +184,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
185 BUG_ON(!lksb); 184 BUG_ON(!lksb);
186 185
187 /* only updates if this node masters the lockres */ 186 /* only updates if this node masters the lockres */
187 spin_lock(&res->spinlock);
188 if (res->owner == dlm->node_num) { 188 if (res->owner == dlm->node_num) {
189
190 spin_lock(&res->spinlock);
191 /* check the lksb flags for the direction */ 189 /* check the lksb flags for the direction */
192 if (lksb->flags & DLM_LKSB_GET_LVB) { 190 if (lksb->flags & DLM_LKSB_GET_LVB) {
193 mlog(0, "getting lvb from lockres for %s node\n", 191 mlog(0, "getting lvb from lockres for %s node\n",
@@ -202,8 +200,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
202 * here. In the future we might want to clear it at the time 200 * here. In the future we might want to clear it at the time
203 * the put is actually done. 201 * the put is actually done.
204 */ 202 */
205 spin_unlock(&res->spinlock);
206 } 203 }
204 spin_unlock(&res->spinlock);
207 205
208 /* reset any lvb flags on the lksb */ 206 /* reset any lvb flags on the lksb */
209 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); 207 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
@@ -453,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
453 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
454 lock->ml.node, &status); 452 lock->ml.node, &status);
455 if (ret < 0) 453 if (ret < 0)
456 mlog_errno(ret); 454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
456 lock->ml.node);
457 else { 457 else {
458 if (status == DLM_RECOVERING) { 458 if (status == DLM_RECOVERING) {
459 mlog(ML_ERROR, "sent AST to node %u, it thinks this " 459 mlog(ML_ERROR, "sent AST to node %u, it thinks this "
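
This is the template for the many logging hunks in the dlm files below: a bare mlog_errno() after a failed o2net send becomes a message identifying the errno, the dlm message type, the domain key, and the target node. The generic shape (msg_type, msg, and node stand in for the per-site values):

ret = o2net_send_message(msg_type, dlm->key, &msg, sizeof(msg),
			 node, &status);
if (ret < 0)
	mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
	     "node %u\n", ret, msg_type, dlm->key, node);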
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be35980c..4b6ae2c13b47 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes 37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms 38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39 39
40#define DLM_HASH_SIZE_DEFAULT (1 << 14) 40#define DLM_HASH_SIZE_DEFAULT (1 << 17)
41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE 41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
42# define DLM_HASH_PAGES 1 42# define DLM_HASH_PAGES 1
43#else 43#else
@@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
904 904
905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
908void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void dlm_do_local_ast(struct dlm_ctxt *dlm, 909void dlm_do_local_ast(struct dlm_ctxt *dlm,
908 struct dlm_lock_resource *res, 910 struct dlm_lock_resource *res,
909 struct dlm_lock *lock); 911 struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index f283bce776b4..9f30491e5e88 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -391,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
391 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) 390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
392 dlm_error(ret); 391 dlm_error(ret);
393 } else { 392 } else {
394 mlog_errno(tmpret); 393 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
394 "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
395 res->owner);
395 if (dlm_is_host_down(tmpret)) { 396 if (dlm_is_host_down(tmpret)) {
396 /* instead of logging the same network error over 397 /* instead of logging the same network error over
397 * and over, sleep here and wait for the heartbeat 398 * and over, sleep here and wait for the heartbeat
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 988c9055fd4e..6b5a492e1749 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
511 511
512 assert_spin_locked(&dlm->spinlock); 512 assert_spin_locked(&dlm->spinlock);
513 513
514 printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name); 514 printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
515 515
516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
517 node + 1)) < O2NM_MAX_NODES) { 517 node + 1)) < O2NM_MAX_NODES) {
@@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
534 534
535 node = exit_msg->node_idx; 535 node = exit_msg->node_idx;
536 536
537 printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name); 537 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
538 538
539 spin_lock(&dlm->spinlock); 539 spin_lock(&dlm->spinlock);
540 clear_bit(node, dlm->domain_map); 540 clear_bit(node, dlm->domain_map);
@@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, 565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
566 &leave_msg, sizeof(leave_msg), node, 566 &leave_msg, sizeof(leave_msg), node,
567 NULL); 567 NULL);
568 568 if (status < 0)
569 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
570 "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
569 mlog(0, "status return %d from o2net_send_message\n", status); 571 mlog(0, "status return %d from o2net_send_message\n", status);
570 572
571 return status; 573 return status;
@@ -904,7 +906,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
904 set_bit(assert->node_idx, dlm->domain_map); 906 set_bit(assert->node_idx, dlm->domain_map);
905 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 907 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
906 908
907 printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n", 909 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
908 assert->node_idx, dlm->name); 910 assert->node_idx, dlm->name);
909 __dlm_print_nodes(dlm); 911 __dlm_print_nodes(dlm);
910 912
@@ -962,7 +964,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
962 &cancel_msg, sizeof(cancel_msg), node, 964 &cancel_msg, sizeof(cancel_msg), node,
963 NULL); 965 NULL);
964 if (status < 0) { 966 if (status < 0) {
965 mlog_errno(status); 967 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
968 "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
969 node);
966 goto bail; 970 goto bail;
967 } 971 }
968 972
@@ -1029,10 +1033,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
1029 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); 1033 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
1030 1034
1031 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, 1035 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
1032 sizeof(join_msg), node, 1036 sizeof(join_msg), node, &join_resp);
1033 &join_resp);
1034 if (status < 0 && status != -ENOPROTOOPT) { 1037 if (status < 0 && status != -ENOPROTOOPT) {
1035 mlog_errno(status); 1038 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1039 "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
1040 node);
1036 goto bail; 1041 goto bail;
1037 } 1042 }
1038 dlm_query_join_wire_to_packet(join_resp, &packet); 1043 dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1103,7 +1108,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
1103 &assert_msg, sizeof(assert_msg), node, 1108 &assert_msg, sizeof(assert_msg), node,
1104 NULL); 1109 NULL);
1105 if (status < 0) 1110 if (status < 0)
1106 mlog_errno(status); 1111 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1112 "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1113 node);
1107 1114
1108 return status; 1115 return status;
1109} 1116}
@@ -1516,7 +1523,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1516 goto leave; 1523 goto leave;
1517 } 1524 }
1518 1525
1519 dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); 1526 dlm->name = kstrdup(domain, GFP_KERNEL);
1520 if (dlm->name == NULL) { 1527 if (dlm->name == NULL) {
1521 mlog_errno(-ENOMEM); 1528 mlog_errno(-ENOMEM);
1522 kfree(dlm); 1529 kfree(dlm);
@@ -1550,7 +1557,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1550 for (i = 0; i < DLM_HASH_BUCKETS; i++) 1557 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1551 INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); 1558 INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
1552 1559
1553 strcpy(dlm->name, domain);
1554 dlm->key = key; 1560 dlm->key = key;
1555 dlm->node_num = o2nm_this_node(); 1561 dlm->node_num = o2nm_this_node();
1556 1562
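
The kstrdup() conversion above folds a split allocate-then-copy into one call; equivalent sketch of the before and after:

/* before: two steps, with the strcpy() far from the allocation */
dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
...
strcpy(dlm->name, domain);

/* after: kstrdup() allocates and copies in one call */
dlm->name = kstrdup(domain, GFP_KERNEL);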
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 733337772671..69cf369961c4 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
329 BUG(); 329 BUG();
330 } 330 }
331 } else { 331 } else {
332 mlog_errno(tmpret); 332 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
333 "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
334 res->owner);
333 if (dlm_is_host_down(tmpret)) { 335 if (dlm_is_host_down(tmpret)) {
334 ret = DLM_RECOVERING; 336 ret = DLM_RECOVERING;
335 mlog(0, "node %u died so returning DLM_RECOVERING " 337 mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
429 struct dlm_lock *lock; 431 struct dlm_lock *lock;
430 int kernel_allocated = 0; 432 int kernel_allocated = 0;
431 433
432 lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); 434 lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
433 if (!lock) 435 if (!lock)
434 return NULL; 436 return NULL;
435 437
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a659606dcb95..4a7506a4e314 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -617,13 +617,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
617{ 617{
618 struct dlm_lock_resource *res = NULL; 618 struct dlm_lock_resource *res = NULL;
619 619
620 res = (struct dlm_lock_resource *) 620 res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
621 kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
622 if (!res) 621 if (!res)
623 goto error; 622 goto error;
624 623
625 res->lockname.name = (char *) 624 res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
626 kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
627 if (!res->lockname.name) 625 if (!res->lockname.name)
628 goto error; 626 goto error;
629 627
@@ -757,8 +755,7 @@ lookup:
757 spin_unlock(&dlm->spinlock); 755 spin_unlock(&dlm->spinlock);
758 mlog(0, "allocating a new resource\n"); 756 mlog(0, "allocating a new resource\n");
759 /* nothing found and we need to allocate one. */ 757 /* nothing found and we need to allocate one. */
760 alloc_mle = (struct dlm_master_list_entry *) 758 alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
761 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
762 if (!alloc_mle) 759 if (!alloc_mle)
763 goto leave; 760 goto leave;
764 res = dlm_new_lockres(dlm, lockid, namelen); 761 res = dlm_new_lockres(dlm, lockid, namelen);
@@ -1542,8 +1539,7 @@ way_up_top:
1542 spin_unlock(&dlm->master_lock); 1539 spin_unlock(&dlm->master_lock);
1543 spin_unlock(&dlm->spinlock); 1540 spin_unlock(&dlm->spinlock);
1544 1541
1545 mle = (struct dlm_master_list_entry *) 1542 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1546 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1547 if (!mle) { 1543 if (!mle) {
1548 response = DLM_MASTER_RESP_ERROR; 1544 response = DLM_MASTER_RESP_ERROR;
1549 mlog_errno(-ENOMEM); 1545 mlog_errno(-ENOMEM);
@@ -1666,7 +1662,9 @@ again:
1666 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 1662 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1667 &assert, sizeof(assert), to, &r); 1663 &assert, sizeof(assert), to, &r);
1668 if (tmpret < 0) { 1664 if (tmpret < 0) {
1669 mlog(0, "assert_master returned %d!\n", tmpret); 1665 mlog(ML_ERROR, "Error %d when sending message %u (key "
1666 "0x%x) to node %u\n", tmpret,
1667 DLM_ASSERT_MASTER_MSG, dlm->key, to);
1670 if (!dlm_is_host_down(tmpret)) { 1668 if (!dlm_is_host_down(tmpret)) {
1671 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); 1669 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1672 BUG(); 1670 BUG();
@@ -1875,7 +1873,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1875ok: 1873ok:
1876 spin_unlock(&res->spinlock); 1874 spin_unlock(&res->spinlock);
1877 } 1875 }
1878 spin_unlock(&dlm->spinlock);
1879 1876
1880 // mlog(0, "woo! got an assert_master from node %u!\n", 1877 // mlog(0, "woo! got an assert_master from node %u!\n",
1881 // assert->node_idx); 1878 // assert->node_idx);
@@ -1926,7 +1923,6 @@ ok:
1926 /* master is known, detach if not already detached. 1923 /* master is known, detach if not already detached.
1927 * ensures that only one assert_master call will happen 1924 * ensures that only one assert_master call will happen
1928 * on this mle. */ 1925 * on this mle. */
1929 spin_lock(&dlm->spinlock);
1930 spin_lock(&dlm->master_lock); 1926 spin_lock(&dlm->master_lock);
1931 1927
1932 rr = atomic_read(&mle->mle_refs.refcount); 1928 rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1955,6 @@ ok:
1959 __dlm_put_mle(mle); 1955 __dlm_put_mle(mle);
1960 } 1956 }
1961 spin_unlock(&dlm->master_lock); 1957 spin_unlock(&dlm->master_lock);
1962 spin_unlock(&dlm->spinlock);
1963 } else if (res) { 1958 } else if (res) {
1964 if (res->owner != assert->node_idx) { 1959 if (res->owner != assert->node_idx) {
1965 mlog(0, "assert_master from %u, but current " 1960 mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1962,7 @@ ok:
1967 res->owner, namelen, name); 1962 res->owner, namelen, name);
1968 } 1963 }
1969 } 1964 }
1965 spin_unlock(&dlm->spinlock);
1970 1966
1971done: 1967done:
1972 ret = 0; 1968 ret = 0;
@@ -2207,7 +2203,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2207 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2203 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2208 &deref, sizeof(deref), res->owner, &r); 2204 &deref, sizeof(deref), res->owner, &r);
2209 if (ret < 0) 2205 if (ret < 0)
2210 mlog_errno(ret); 2206 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
2207 "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
2208 res->owner);
2211 else if (r < 0) { 2209 else if (r < 0) {
2212 /* BAD. other node says I did not have a ref. */ 2210 /* BAD. other node says I did not have a ref. */
2213 mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2211 mlog(ML_ERROR,"while dropping ref on %s:%.*s "
@@ -2454,8 +2452,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2454 goto leave; 2452 goto leave;
2455 } 2453 }
2456 2454
2457 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 2455 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
2458 GFP_NOFS);
2459 if (!mle) { 2456 if (!mle) {
2460 mlog_errno(ret); 2457 mlog_errno(ret);
2461 goto leave; 2458 goto leave;
@@ -2977,7 +2974,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2977 &migrate, sizeof(migrate), nodenum, 2974 &migrate, sizeof(migrate), nodenum,
2978 &status); 2975 &status);
2979 if (ret < 0) { 2976 if (ret < 0) {
2980 mlog(0, "migrate_request returned %d!\n", ret); 2977 mlog(ML_ERROR, "Error %d when sending message %u (key "
2978 "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
2979 dlm->key, nodenum);
2981 if (!dlm_is_host_down(ret)) { 2980 if (!dlm_is_host_down(ret)) {
2982 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2981 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2983 BUG(); 2982 BUG();
@@ -3035,8 +3034,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3035 hash = dlm_lockid_hash(name, namelen); 3034 hash = dlm_lockid_hash(name, namelen);
3036 3035
3037 /* preallocate.. if this fails, abort */ 3036 /* preallocate.. if this fails, abort */
3038 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 3037 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
3039 GFP_NOFS);
3040 3038
3041 if (!mle) { 3039 if (!mle) {
3042 ret = -ENOMEM; 3040 ret = -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b4f99de2caf3..f8b75ce4be70 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
803 803
804 /* negative status is handled by caller */ 804 /* negative status is handled by caller */
805 if (ret < 0) 805 if (ret < 0)
806 mlog_errno(ret); 806 mlog(ML_ERROR, "Error %d when sending message %u (key "
807 "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
808 dlm->key, request_from);
807 809
808 // return from here, then 810 // return from here, then
809 // sleep until all received or error 811 // sleep until all received or error
@@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
955 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 957 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
956 sizeof(done_msg), send_to, &tmpret); 958 sizeof(done_msg), send_to, &tmpret);
957 if (ret < 0) { 959 if (ret < 0) {
960 mlog(ML_ERROR, "Error %d when sending message %u (key "
961 "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
962 dlm->key, send_to);
958 if (!dlm_is_host_down(ret)) { 963 if (!dlm_is_host_down(ret)) {
959 mlog_errno(ret);
960 mlog(ML_ERROR, "%s: unknown error sending data-done "
961 "to %u\n", dlm->name, send_to);
962 BUG(); 964 BUG();
963 } 965 }
964 } else 966 } else
@@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1126 if (ret < 0) { 1128 if (ret < 0) {
1127 /* XXX: negative status is not handled. 1129 /* XXX: negative status is not handled.
1128 * this will end up killing this node. */ 1130 * this will end up killing this node. */
1129 mlog_errno(ret); 1131 mlog(ML_ERROR, "Error %d when sending message %u (key "
1132 "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
1133 dlm->key, send_to);
1130 } else { 1134 } else {
1131 /* might get an -ENOMEM back here */ 1135 /* might get an -ENOMEM back here */
1132 ret = status; 1136 ret = status;
@@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1642 &req, sizeof(req), nodenum, &status); 1646 &req, sizeof(req), nodenum, &status);
1643 /* XXX: negative status not handled properly here. */ 1647 /* XXX: negative status not handled properly here. */
1644 if (ret < 0) 1648 if (ret < 0)
1645 mlog_errno(ret); 1649 mlog(ML_ERROR, "Error %d when sending message %u (key "
1650 "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
1651 dlm->key, nodenum);
1646 else { 1652 else {
1647 BUG_ON(status < 0); 1653 BUG_ON(status < 0);
1648 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); 1654 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -2640,7 +2646,7 @@ retry:
2640 if (dlm_is_host_down(ret)) { 2646 if (dlm_is_host_down(ret)) {
2641 /* node is down. not involved in recovery 2647 /* node is down. not involved in recovery
2642 * so just keep going */ 2648 * so just keep going */
2643 mlog(0, "%s: node %u was down when sending " 2649 mlog(ML_NOTICE, "%s: node %u was down when sending "
2644 "begin reco msg (%d)\n", dlm->name, nodenum, ret); 2650 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2645 ret = 0; 2651 ret = 0;
2646 } 2652 }
@@ -2660,11 +2666,12 @@ retry:
2660 } 2666 }
2661 if (ret < 0) { 2667 if (ret < 0) {
2662 struct dlm_lock_resource *res; 2668 struct dlm_lock_resource *res;
2669
2663 /* this is now a serious problem, possibly ENOMEM 2670 /* this is now a serious problem, possibly ENOMEM
2664 * in the network stack. must retry */ 2671 * in the network stack. must retry */
2665 mlog_errno(ret); 2672 mlog_errno(ret);
2666 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2673 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
2667 " returned %d\n", dlm->name, nodenum, ret); 2674 "returned %d\n", dlm->name, nodenum, ret);
2668 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, 2675 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2669 DLM_RECOVERY_LOCK_NAME_LEN); 2676 DLM_RECOVERY_LOCK_NAME_LEN);
2670 if (res) { 2677 if (res) {
@@ -2789,7 +2796,9 @@ stage2:
2789 if (ret >= 0) 2796 if (ret >= 0)
2790 ret = status; 2797 ret = status;
2791 if (ret < 0) { 2798 if (ret < 0) {
2792 mlog_errno(ret); 2799 mlog(ML_ERROR, "Error %d when sending message %u (key "
2800 "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
2801 dlm->key, nodenum);
2793 if (dlm_is_host_down(ret)) { 2802 if (dlm_is_host_down(ret)) {
2794 /* this has no effect on this recovery 2803 /* this has no effect on this recovery
2795 * session, so set the status to zero to 2804 * session, so set the status to zero to
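
Several hunks in this file keep the dlm_is_host_down(ret) check after the
new, noisier logging. A sketch of what that predicate does, assuming the
usual dlmcommon.h shape (the exact errno list there may be longer):

        /* sketch: classify a transport errno as "peer is gone", in which
         * case recovery will pick the node up and the sender continues */
        static inline int dlm_is_host_down(int errno)
        {
                switch (errno) {
                case -EHOSTDOWN:
                case -EHOSTUNREACH:
                case -ETIMEDOUT:
                case -ECONNRESET:
                case -ENOTCONN:
                        return 1;
                }
                return 0;
        }

Callers treat a down host as non-fatal (set ret to 0 and move on) and
BUG() on anything else, as the dlm_do_migrate_request hunk above shows.
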
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..d4f73ca68fe5 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -310,6 +309,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
310 * spinlock, and because we know that it is not migrating/ 309 * spinlock, and because we know that it is not migrating/
311 * recovering/in-progress, it is fine to reserve asts and 310 * recovering/in-progress, it is fine to reserve asts and
312 * basts right before queueing them all throughout */ 311 * basts right before queueing them all throughout */
312 assert_spin_locked(&dlm->ast_lock);
313 assert_spin_locked(&res->spinlock); 313 assert_spin_locked(&res->spinlock);
314 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 314 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
315 DLM_LOCK_RES_RECOVERING| 315 DLM_LOCK_RES_RECOVERING|
@@ -338,7 +338,7 @@ converting:
338 /* queue the BAST if not already */ 338 /* queue the BAST if not already */
339 if (lock->ml.highest_blocked == LKM_IVMODE) { 339 if (lock->ml.highest_blocked == LKM_IVMODE) {
340 __dlm_lockres_reserve_ast(res); 340 __dlm_lockres_reserve_ast(res);
341 dlm_queue_bast(dlm, lock); 341 __dlm_queue_bast(dlm, lock);
342 } 342 }
343 /* update the highest_blocked if needed */ 343 /* update the highest_blocked if needed */
344 if (lock->ml.highest_blocked < target->ml.convert_type) 344 if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -356,7 +356,7 @@ converting:
356 can_grant = 0; 356 can_grant = 0;
357 if (lock->ml.highest_blocked == LKM_IVMODE) { 357 if (lock->ml.highest_blocked == LKM_IVMODE) {
358 __dlm_lockres_reserve_ast(res); 358 __dlm_lockres_reserve_ast(res);
359 dlm_queue_bast(dlm, lock); 359 __dlm_queue_bast(dlm, lock);
360 } 360 }
361 if (lock->ml.highest_blocked < target->ml.convert_type) 361 if (lock->ml.highest_blocked < target->ml.convert_type)
362 lock->ml.highest_blocked = 362 lock->ml.highest_blocked =
@@ -384,7 +384,7 @@ converting:
384 spin_unlock(&target->spinlock); 384 spin_unlock(&target->spinlock);
385 385
386 __dlm_lockres_reserve_ast(res); 386 __dlm_lockres_reserve_ast(res);
387 dlm_queue_ast(dlm, target); 387 __dlm_queue_ast(dlm, target);
388 /* go back and check for more */ 388 /* go back and check for more */
389 goto converting; 389 goto converting;
390 } 390 }
@@ -403,7 +403,7 @@ blocked:
403 can_grant = 0; 403 can_grant = 0;
404 if (lock->ml.highest_blocked == LKM_IVMODE) { 404 if (lock->ml.highest_blocked == LKM_IVMODE) {
405 __dlm_lockres_reserve_ast(res); 405 __dlm_lockres_reserve_ast(res);
406 dlm_queue_bast(dlm, lock); 406 __dlm_queue_bast(dlm, lock);
407 } 407 }
408 if (lock->ml.highest_blocked < target->ml.type) 408 if (lock->ml.highest_blocked < target->ml.type)
409 lock->ml.highest_blocked = target->ml.type; 409 lock->ml.highest_blocked = target->ml.type;
@@ -419,7 +419,7 @@ blocked:
419 can_grant = 0; 419 can_grant = 0;
420 if (lock->ml.highest_blocked == LKM_IVMODE) { 420 if (lock->ml.highest_blocked == LKM_IVMODE) {
421 __dlm_lockres_reserve_ast(res); 421 __dlm_lockres_reserve_ast(res);
422 dlm_queue_bast(dlm, lock); 422 __dlm_queue_bast(dlm, lock);
423 } 423 }
424 if (lock->ml.highest_blocked < target->ml.type) 424 if (lock->ml.highest_blocked < target->ml.type)
425 lock->ml.highest_blocked = target->ml.type; 425 lock->ml.highest_blocked = target->ml.type;
@@ -445,7 +445,7 @@ blocked:
445 spin_unlock(&target->spinlock); 445 spin_unlock(&target->spinlock);
446 446
447 __dlm_lockres_reserve_ast(res); 447 __dlm_lockres_reserve_ast(res);
448 dlm_queue_ast(dlm, target); 448 __dlm_queue_ast(dlm, target);
449 /* go back and check for more */ 449 /* go back and check for more */
450 goto converting; 450 goto converting;
451 } 451 }
@@ -675,6 +675,7 @@ static int dlm_thread(void *data)
675 /* lockres can be re-dirtied/re-added to the 675 /* lockres can be re-dirtied/re-added to the
676 * dirty_list in this gap, but that is ok */ 676 * dirty_list in this gap, but that is ok */
677 677
678 spin_lock(&dlm->ast_lock);
678 spin_lock(&res->spinlock); 679 spin_lock(&res->spinlock);
679 if (res->owner != dlm->node_num) { 680 if (res->owner != dlm->node_num) {
680 __dlm_print_one_lock_resource(res); 681 __dlm_print_one_lock_resource(res);
@@ -695,6 +696,7 @@ static int dlm_thread(void *data)
695 /* move it to the tail and keep going */ 696 /* move it to the tail and keep going */
696 res->state &= ~DLM_LOCK_RES_DIRTY; 697 res->state &= ~DLM_LOCK_RES_DIRTY;
697 spin_unlock(&res->spinlock); 698 spin_unlock(&res->spinlock);
699 spin_unlock(&dlm->ast_lock);
698 mlog(0, "delaying list shuffling for in-" 700 mlog(0, "delaying list shuffling for in-"
699 "progress lockres %.*s, state=%d\n", 701 "progress lockres %.*s, state=%d\n",
700 res->lockname.len, res->lockname.name, 702 res->lockname.len, res->lockname.name,
@@ -716,6 +718,7 @@ static int dlm_thread(void *data)
716 dlm_shuffle_lists(dlm, res); 718 dlm_shuffle_lists(dlm, res);
717 res->state &= ~DLM_LOCK_RES_DIRTY; 719 res->state &= ~DLM_LOCK_RES_DIRTY;
718 spin_unlock(&res->spinlock); 720 spin_unlock(&res->spinlock);
721 spin_unlock(&dlm->ast_lock);
719 722
720 dlm_lockres_calc_usage(dlm, res); 723 dlm_lockres_calc_usage(dlm, res);
721 724
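
The dlmthread.c changes above establish one consistent lock order:
dlm->ast_lock is taken before res->spinlock, and dlm_shuffle_lists(),
which runs with res->spinlock held, now calls the
__dlm_queue_ast/__dlm_queue_bast variants rather than versions that would
take ast_lock themselves. A sketch of the locked/unlocked pairing this
assumes; the real prototypes live in the dlm headers:

        /* caller must hold dlm->ast_lock */
        void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
        {
                assert_spin_locked(&dlm->ast_lock);
                /* ... move 'lock' onto dlm->pending_asts ... */
        }

        /* convenience wrapper for callers not yet holding the lock */
        void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
        {
                spin_lock(&dlm->ast_lock);
                __dlm_queue_ast(dlm, lock);
                spin_unlock(&dlm->ast_lock);
        }
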
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 49e29ecd0201..817287c6a6db 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -355,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
355 mlog(0, "master was in-progress. retry\n"); 354 mlog(0, "master was in-progress. retry\n");
356 ret = status; 355 ret = status;
357 } else { 356 } else {
358 mlog_errno(tmpret); 357 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
358 "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
359 if (dlm_is_host_down(tmpret)) { 359 if (dlm_is_host_down(tmpret)) {
360 /* NOTE: this seems strange, but it is what we want. 360 /* NOTE: this seems strange, but it is what we want.
361 * when the master goes down during a cancel or 361 * when the master goes down during a cancel or
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 1b0de157a08c..b83d6107a1f5 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -112,20 +112,20 @@ MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
112 * O_RDONLY -> PRMODE level 112 * O_RDONLY -> PRMODE level
113 * O_WRONLY -> EXMODE level 113 * O_WRONLY -> EXMODE level
114 * 114 *
115 * O_NONBLOCK -> LKM_NOQUEUE 115 * O_NONBLOCK -> NOQUEUE
116 */ 116 */
117static int dlmfs_decode_open_flags(int open_flags, 117static int dlmfs_decode_open_flags(int open_flags,
118 int *level, 118 int *level,
119 int *flags) 119 int *flags)
120{ 120{
121 if (open_flags & (O_WRONLY|O_RDWR)) 121 if (open_flags & (O_WRONLY|O_RDWR))
122 *level = LKM_EXMODE; 122 *level = DLM_LOCK_EX;
123 else 123 else
124 *level = LKM_PRMODE; 124 *level = DLM_LOCK_PR;
125 125
126 *flags = 0; 126 *flags = 0;
127 if (open_flags & O_NONBLOCK) 127 if (open_flags & O_NONBLOCK)
128 *flags |= LKM_NOQUEUE; 128 *flags |= DLM_LKF_NOQUEUE;
129 129
130 return 0; 130 return 0;
131} 131}
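
With dlmfs on the stackglue constants, the decode helper maps POSIX open
semantics onto cluster lock levels. A hypothetical caller, to make the
mapping concrete:

        int level, flags;

        dlmfs_decode_open_flags(file->f_flags, &level, &flags);
        /* open(path, O_RDWR | O_NONBLOCK) lands here with
         * level == DLM_LOCK_EX and flags == DLM_LKF_NOQUEUE;
         * a plain O_RDONLY open yields DLM_LOCK_PR and no flags. */
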
@@ -166,7 +166,7 @@ static int dlmfs_file_open(struct inode *inode,
166 * to allow userspace to distinguish a 166 * to allow userspace to distinguish a
167 * valid lock request from one that simply couldn't be 167 * valid lock request from one that simply couldn't be
168 * granted. */ 168 * granted. */
169 if (flags & LKM_NOQUEUE && status == -EAGAIN) 169 if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
170 status = -ETXTBSY; 170 status = -ETXTBSY;
171 kfree(fp); 171 kfree(fp);
172 goto bail; 172 goto bail;
@@ -193,7 +193,7 @@ static int dlmfs_file_release(struct inode *inode,
193 status = 0; 193 status = 0;
194 if (fp) { 194 if (fp) {
195 level = fp->fp_lock_level; 195 level = fp->fp_lock_level;
196 if (level != LKM_IVMODE) 196 if (level != DLM_LOCK_IV)
197 user_dlm_cluster_unlock(&ip->ip_lockres, level); 197 user_dlm_cluster_unlock(&ip->ip_lockres, level);
198 198
199 kfree(fp); 199 kfree(fp);
@@ -262,7 +262,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
262 if ((count + *ppos) > i_size_read(inode)) 262 if ((count + *ppos) > i_size_read(inode))
263 readlen = i_size_read(inode) - *ppos; 263 readlen = i_size_read(inode) - *ppos;
264 else 264 else
265 readlen = count - *ppos; 265 readlen = count;
266 266
267 lvb_buf = kmalloc(readlen, GFP_NOFS); 267 lvb_buf = kmalloc(readlen, GFP_NOFS);
268 if (!lvb_buf) 268 if (!lvb_buf)
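
The readlen change above fixes an off-by-*ppos bug in the short-read
path: with i_size 4096, *ppos 100 and count 200, this branch is taken
because count + *ppos (300) does not exceed i_size, yet the old code
computed readlen = count - *ppos = 100 and silently returned half the
requested bytes. The new code reads the full count; only the other
branch, which clamps against end of file, needs to involve *ppos.
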
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 50c4ee805da4..39eb16ac5f98 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3897,7 +3897,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3897 oinfo->dqi_gi.dqi_free_entry = 3897 oinfo->dqi_gi.dqi_free_entry =
3898 be32_to_cpu(lvb->lvb_free_entry); 3898 be32_to_cpu(lvb->lvb_free_entry);
3899 } else { 3899 } else {
3900 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); 3900 status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
3901 oinfo->dqi_giblk, &bh);
3901 if (status) { 3902 if (status) {
3902 mlog_errno(status); 3903 mlog_errno(status);
3903 goto bail; 3904 goto bail;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c562a7581cf9..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/slab.h>
27#include <linux/types.h> 28#include <linux/types.h>
28#include <linux/fiemap.h> 29#include <linux/fiemap.h>
29 30
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 17947dc8341e..6a13ea64c447 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
175 return 0; 175 return 0;
176} 176}
177 177
178static int ocfs2_sync_file(struct file *file, 178static int ocfs2_sync_file(struct file *file, int datasync)
179 struct dentry *dentry,
180 int datasync)
181{ 179{
182 int err = 0; 180 int err = 0;
183 journal_t *journal; 181 journal_t *journal;
184 struct inode *inode = dentry->d_inode; 182 struct dentry *dentry = file->f_path.dentry;
183 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 185
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 186 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
@@ -278,10 +277,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
278 inode->i_atime = CURRENT_TIME; 277 inode->i_atime = CURRENT_TIME;
279 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 278 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
280 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 279 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
281 280 ocfs2_journal_dirty(handle, bh);
282 ret = ocfs2_journal_dirty(handle, bh);
283 if (ret < 0)
284 mlog_errno(ret);
285 281
286out_commit: 282out_commit:
287 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 283 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -430,9 +426,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
430 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 426 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
431 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 427 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
432 428
433 status = ocfs2_journal_dirty(handle, fe_bh); 429 ocfs2_journal_dirty(handle, fe_bh);
434 if (status < 0)
435 mlog_errno(status);
436 430
437out_commit: 431out_commit:
438 ocfs2_commit_trans(osb, handle); 432 ocfs2_commit_trans(osb, handle);
@@ -449,7 +443,6 @@ static int ocfs2_truncate_file(struct inode *inode,
449 int status = 0; 443 int status = 0;
450 struct ocfs2_dinode *fe = NULL; 444 struct ocfs2_dinode *fe = NULL;
451 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 445 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
452 struct ocfs2_truncate_context *tc = NULL;
453 446
454 mlog_entry("(inode = %llu, new_i_size = %llu\n", 447 mlog_entry("(inode = %llu, new_i_size = %llu\n",
455 (unsigned long long)OCFS2_I(inode)->ip_blkno, 448 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -488,6 +481,9 @@ static int ocfs2_truncate_file(struct inode *inode,
488 481
489 down_write(&OCFS2_I(inode)->ip_alloc_sem); 482 down_write(&OCFS2_I(inode)->ip_alloc_sem);
490 483
484 ocfs2_resv_discard(&osb->osb_la_resmap,
485 &OCFS2_I(inode)->ip_la_data_resv);
486
491 /* 487 /*
492 * The inode lock forced other nodes to sync and drop their 488 * The inode lock forced other nodes to sync and drop their
493 * pages, which (correctly) happens even if we have a truncate 489 * pages, which (correctly) happens even if we have a truncate
@@ -517,13 +513,7 @@ static int ocfs2_truncate_file(struct inode *inode,
517 goto bail_unlock_sem; 513 goto bail_unlock_sem;
518 } 514 }
519 515
520 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 516 status = ocfs2_commit_truncate(osb, inode, di_bh);
521 if (status < 0) {
522 mlog_errno(status);
523 goto bail_unlock_sem;
524 }
525
526 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
527 if (status < 0) { 517 if (status < 0) {
528 mlog_errno(status); 518 mlog_errno(status);
529 goto bail_unlock_sem; 519 goto bail_unlock_sem;
@@ -666,11 +656,7 @@ restarted_transaction:
666 goto leave; 656 goto leave;
667 } 657 }
668 658
669 status = ocfs2_journal_dirty(handle, bh); 659 ocfs2_journal_dirty(handle, bh);
670 if (status < 0) {
671 mlog_errno(status);
672 goto leave;
673 }
674 660
675 spin_lock(&OCFS2_I(inode)->ip_lock); 661 spin_lock(&OCFS2_I(inode)->ip_lock);
676 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 662 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
@@ -684,6 +670,7 @@ restarted_transaction:
684 if (why == RESTART_META) { 670 if (why == RESTART_META) {
685 mlog(0, "restarting function.\n"); 671 mlog(0, "restarting function.\n");
686 restart_func = 1; 672 restart_func = 1;
673 status = 0;
687 } else { 674 } else {
688 BUG_ON(why != RESTART_TRANS); 675 BUG_ON(why != RESTART_TRANS);
689 676
@@ -945,9 +932,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
945 struct ocfs2_super *osb = OCFS2_SB(sb); 932 struct ocfs2_super *osb = OCFS2_SB(sb);
946 struct buffer_head *bh = NULL; 933 struct buffer_head *bh = NULL;
947 handle_t *handle = NULL; 934 handle_t *handle = NULL;
948 int qtype;
949 struct dquot *transfer_from[MAXQUOTAS] = { };
950 struct dquot *transfer_to[MAXQUOTAS] = { }; 935 struct dquot *transfer_to[MAXQUOTAS] = { };
936 int qtype;
951 937
952 mlog_entry("(0x%p, '%.*s')\n", dentry, 938 mlog_entry("(0x%p, '%.*s')\n", dentry,
953 dentry->d_name.len, dentry->d_name.name); 939 dentry->d_name.len, dentry->d_name.name);
@@ -978,10 +964,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
978 if (status) 964 if (status)
979 return status; 965 return status;
980 966
967 if (is_quota_modification(inode, attr))
968 dquot_initialize(inode);
981 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 969 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
982 if (size_change) { 970 if (size_change) {
983 dquot_initialize(inode);
984
985 status = ocfs2_rw_lock(inode, 1); 971 status = ocfs2_rw_lock(inode, 1);
986 if (status < 0) { 972 if (status < 0) {
987 mlog_errno(status); 973 mlog_errno(status);
@@ -1031,9 +1017,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1031 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1017 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1032 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1018 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
1033 USRQUOTA); 1019 USRQUOTA);
1034 transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, 1020 if (!transfer_to[USRQUOTA]) {
1035 USRQUOTA);
1036 if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
1037 status = -ESRCH; 1021 status = -ESRCH;
1038 goto bail_unlock; 1022 goto bail_unlock;
1039 } 1023 }
@@ -1043,9 +1027,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1043 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1027 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1044 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1028 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
1045 GRPQUOTA); 1029 GRPQUOTA);
1046 transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, 1030 if (!transfer_to[GRPQUOTA]) {
1047 GRPQUOTA);
1048 if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
1049 status = -ESRCH; 1031 status = -ESRCH;
1050 goto bail_unlock; 1032 goto bail_unlock;
1051 } 1033 }
@@ -1057,7 +1039,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1057 mlog_errno(status); 1039 mlog_errno(status);
1058 goto bail_unlock; 1040 goto bail_unlock;
1059 } 1041 }
1060 status = dquot_transfer(inode, attr); 1042 status = __dquot_transfer(inode, transfer_to);
1061 if (status < 0) 1043 if (status < 0)
1062 goto bail_commit; 1044 goto bail_commit;
1063 } else { 1045 } else {
@@ -1070,7 +1052,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1070 } 1052 }
1071 1053
1072 /* 1054 /*
1073 * This will intentionally not wind up calling vmtruncate(), 1055 * This will intentionally not wind up calling simple_setsize(),
1074 * since all the work for a size change has been done above. 1056 * since all the work for a size change has been done above.
1075 * Otherwise, we could get into problems with truncate as 1057 * Otherwise, we could get into problems with truncate as
1076 * ip_alloc_sem is used there to protect against i_size 1058 * ip_alloc_sem is used there to protect against i_size
@@ -1097,10 +1079,8 @@ bail:
1097 brelse(bh); 1079 brelse(bh);
1098 1080
1099 /* Release quota pointers in case we acquired them */ 1081 /* Release quota pointers in case we acquired them */
1100 for (qtype = 0; qtype < MAXQUOTAS; qtype++) { 1082 for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1101 dqput(transfer_to[qtype]); 1083 dqput(transfer_to[qtype]);
1102 dqput(transfer_from[qtype]);
1103 }
1104 1084
1105 if (!status && attr->ia_valid & ATTR_MODE) { 1085 if (!status && attr->ia_valid & ATTR_MODE) {
1106 status = ocfs2_acl_chmod(inode); 1086 status = ocfs2_acl_chmod(inode);
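
The setattr hunks swap dquot_transfer(), which resolves the destination
dquots from the iattr by itself, for __dquot_transfer(), which takes
dquots the caller already pinned; that lets ocfs2 perform the dqget()
lookups under its own cluster locks and drop the transfer_from[] array
entirely (the quota code derives the source from the inode). The
resulting call pattern, as a sketch:

        struct dquot *transfer_to[MAXQUOTAS] = { };

        transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, USRQUOTA);
        if (!transfer_to[USRQUOTA])
                return -ESRCH;

        status = __dquot_transfer(inode, transfer_to);

        for (qtype = 0; qtype < MAXQUOTAS; qtype++)
                dqput(transfer_to[qtype]);      /* dqput(NULL) is a no-op */
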
@@ -1194,9 +1174,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1194 di = (struct ocfs2_dinode *) bh->b_data; 1174 di = (struct ocfs2_dinode *) bh->b_data;
1195 di->i_mode = cpu_to_le16(inode->i_mode); 1175 di->i_mode = cpu_to_le16(inode->i_mode);
1196 1176
1197 ret = ocfs2_journal_dirty(handle, bh); 1177 ocfs2_journal_dirty(handle, bh);
1198 if (ret < 0)
1199 mlog_errno(ret);
1200 1178
1201out_trans: 1179out_trans:
1202 ocfs2_commit_trans(osb, handle); 1180 ocfs2_commit_trans(osb, handle);
@@ -1433,16 +1411,90 @@ out:
1433 return ret; 1411 return ret;
1434} 1412}
1435 1413
1414static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1415{
1416 int i;
1417 struct ocfs2_extent_rec *rec = NULL;
1418
1419 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1420
1421 rec = &el->l_recs[i];
1422
1423 if (le32_to_cpu(rec->e_cpos) < pos)
1424 break;
1425 }
1426
1427 return i;
1428}
1429
1430/*
1431 * Helper to calculate the punching pos and length in one run, we handle the
1432 * following three cases in order:
1433 *
1434 * - remove the entire record
1435 * - remove a partial record
1436 * - no record needs to be removed (hole-punching completed)
1437*/
1438static void ocfs2_calc_trunc_pos(struct inode *inode,
1439 struct ocfs2_extent_list *el,
1440 struct ocfs2_extent_rec *rec,
1441 u32 trunc_start, u32 *trunc_cpos,
1442 u32 *trunc_len, u32 *trunc_end,
1443 u64 *blkno, int *done)
1444{
1445 int ret = 0;
1446 u32 coff, range;
1447
1448 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1449
1450 if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1451 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1452 /*
1453 * Skip holes if any.
1454 */
1455 if (range < *trunc_end)
1456 *trunc_end = range;
1457 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1458 *blkno = le64_to_cpu(rec->e_blkno);
1459 *trunc_end = le32_to_cpu(rec->e_cpos);
1460 } else if (range > trunc_start) {
1461 *trunc_cpos = trunc_start;
1462 *trunc_len = *trunc_end - trunc_start;
1463 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1464 *blkno = le64_to_cpu(rec->e_blkno) +
1465 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1466 *trunc_end = trunc_start;
1467 } else {
1468 /*
 1469 * There are two possibilities:
 1470 *
 1471 * - the last record has been removed
 1472 * - trunc_start was within a hole
 1473 *
 1474 * Either case means hole punching has completed.
1475 */
1476 ret = 1;
1477 }
1478
1479 *done = ret;
1480}
1481
1436static int ocfs2_remove_inode_range(struct inode *inode, 1482static int ocfs2_remove_inode_range(struct inode *inode,
1437 struct buffer_head *di_bh, u64 byte_start, 1483 struct buffer_head *di_bh, u64 byte_start,
1438 u64 byte_len) 1484 u64 byte_len)
1439{ 1485{
1440 int ret = 0; 1486 int ret = 0, flags = 0, done = 0, i;
1441 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; 1487 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1488 u32 cluster_in_el;
1442 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1443 struct ocfs2_cached_dealloc_ctxt dealloc; 1490 struct ocfs2_cached_dealloc_ctxt dealloc;
1444 struct address_space *mapping = inode->i_mapping; 1491 struct address_space *mapping = inode->i_mapping;
1445 struct ocfs2_extent_tree et; 1492 struct ocfs2_extent_tree et;
1493 struct ocfs2_path *path = NULL;
1494 struct ocfs2_extent_list *el = NULL;
1495 struct ocfs2_extent_rec *rec = NULL;
1496 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1497 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1446 1498
1447 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1499 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1448 ocfs2_init_dealloc_ctxt(&dealloc); 1500 ocfs2_init_dealloc_ctxt(&dealloc);
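
A worked pass through ocfs2_calc_trunc_pos(): suppose one extent record
covers clusters [0,16) and the caller punches [4,12), so trunc_start = 4
and trunc_end = 12. ocfs2_find_rec(el, 12) returns that record (e_cpos 0
is below 12). Its e_cpos is below trunc_start but its range (16) is above
it, so the partial-record branch fires: trunc_cpos = 4, trunc_len = 12 -
4 = 8, blkno is advanced by 4 clusters' worth of blocks, and trunc_end
drops to 4. The caller's loop then sees trunc_end == trunc_start and
stops. A record lying entirely at or right of trunc_start is instead
removed by the first branch (clipping trunc_end to the record's range so
holes are skipped), and the final else branch reports that punching is
done.
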
@@ -1468,17 +1520,35 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1468 goto out; 1520 goto out;
1469 } 1521 }
1470 1522
1523 /*
1524 * For reflinks, we may need to CoW 2 clusters which might be
1525 * partially zero'd later, if hole's start and end offset were
1526 * within one cluster(means is not exactly aligned to clustersize).
1527 */
1528
1529 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
1530
1531 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1532 if (ret) {
1533 mlog_errno(ret);
1534 goto out;
1535 }
1536
1537 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1538 if (ret) {
1539 mlog_errno(ret);
1540 goto out;
1541 }
1542 }
1543
1471 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1544 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1472 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; 1545 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1473 if (trunc_len >= trunc_start) 1546 cluster_in_el = trunc_end;
1474 trunc_len -= trunc_start;
1475 else
1476 trunc_len = 0;
1477 1547
1478 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", 1548 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
1479 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1549 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1480 (unsigned long long)byte_start, 1550 (unsigned long long)byte_start,
1481 (unsigned long long)byte_len, trunc_start, trunc_len); 1551 (unsigned long long)byte_len, trunc_start, trunc_end);
1482 1552
1483 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1553 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1484 if (ret) { 1554 if (ret) {
@@ -1486,31 +1556,79 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1486 goto out; 1556 goto out;
1487 } 1557 }
1488 1558
1489 cpos = trunc_start; 1559 path = ocfs2_new_path_from_et(&et);
1490 while (trunc_len) { 1560 if (!path) {
1491 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1561 ret = -ENOMEM;
1492 &alloc_size, NULL); 1562 mlog_errno(ret);
1563 goto out;
1564 }
1565
1566 while (trunc_end > trunc_start) {
1567
1568 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1569 cluster_in_el);
1493 if (ret) { 1570 if (ret) {
1494 mlog_errno(ret); 1571 mlog_errno(ret);
1495 goto out; 1572 goto out;
1496 } 1573 }
1497 1574
1498 if (alloc_size > trunc_len) 1575 el = path_leaf_el(path);
1499 alloc_size = trunc_len; 1576
1577 i = ocfs2_find_rec(el, trunc_end);
1578 /*
1579 * Need to go to previous extent block.
1580 */
1581 if (i < 0) {
1582 if (path->p_tree_depth == 0)
1583 break;
1500 1584
1501 /* Only do work for non-holes */ 1585 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1502 if (phys_cpos != 0) { 1586 path,
1503 ret = ocfs2_remove_btree_range(inode, &et, cpos, 1587 &cluster_in_el);
1504 phys_cpos, alloc_size,
1505 &dealloc);
1506 if (ret) { 1588 if (ret) {
1507 mlog_errno(ret); 1589 mlog_errno(ret);
1508 goto out; 1590 goto out;
1509 } 1591 }
1592
1593 /*
1594 * We've reached the leftmost extent block,
1595 * it's safe to leave.
1596 */
1597 if (cluster_in_el == 0)
1598 break;
1599
1600 /*
1601 * The 'pos' searched for previous extent block is
1602 * always one cluster less than actual trunc_end.
1603 */
1604 trunc_end = cluster_in_el + 1;
1605
1606 ocfs2_reinit_path(path, 1);
1607
1608 continue;
1609
1610 } else
1611 rec = &el->l_recs[i];
1612
1613 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1614 &trunc_len, &trunc_end, &blkno, &done);
1615 if (done)
1616 break;
1617
1618 flags = rec->e_flags;
1619 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1620
1621 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1622 phys_cpos, trunc_len, flags,
1623 &dealloc, refcount_loc);
1624 if (ret < 0) {
1625 mlog_errno(ret);
1626 goto out;
1510 } 1627 }
1511 1628
1512 cpos += alloc_size; 1629 cluster_in_el = trunc_end;
1513 trunc_len -= alloc_size; 1630
1631 ocfs2_reinit_path(path, 1);
1514 } 1632 }
1515 1633
1516 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1634 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
@@ -1981,18 +2099,18 @@ relock:
1981 /* communicate with ocfs2_dio_end_io */ 2099 /* communicate with ocfs2_dio_end_io */
1982 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2100 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1983 2101
1984 if (direct_io) { 2102 ret = generic_segment_checks(iov, &nr_segs, &ocount,
1985 ret = generic_segment_checks(iov, &nr_segs, &ocount, 2103 VERIFY_READ);
1986 VERIFY_READ); 2104 if (ret)
1987 if (ret) 2105 goto out_dio;
1988 goto out_dio;
1989 2106
1990 count = ocount; 2107 count = ocount;
1991 ret = generic_write_checks(file, ppos, &count, 2108 ret = generic_write_checks(file, ppos, &count,
1992 S_ISBLK(inode->i_mode)); 2109 S_ISBLK(inode->i_mode));
1993 if (ret) 2110 if (ret)
1994 goto out_dio; 2111 goto out_dio;
1995 2112
2113 if (direct_io) {
1996 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2114 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1997 ppos, count, ocount); 2115 ppos, count, ocount);
1998 if (written < 0) { 2116 if (written < 0) {
@@ -2000,14 +2118,21 @@ relock:
2000 * direct write may have instantiated a few 2118 * direct write may have instantiated a few
2001 * blocks outside i_size. Trim these off again. 2119 * blocks outside i_size. Trim these off again.
2002 * Don't need i_size_read because we hold i_mutex. 2120 * Don't need i_size_read because we hold i_mutex.
2121 *
2122 * XXX(hch): this looks buggy because ocfs2 did not
2123 * actually implement ->truncate. Take a look at
2124 * the new truncate sequence and update this accordingly
2003 */ 2125 */
2004 if (*ppos + count > inode->i_size) 2126 if (*ppos + count > inode->i_size)
2005 vmtruncate(inode, inode->i_size); 2127 simple_setsize(inode, inode->i_size);
2006 ret = written; 2128 ret = written;
2007 goto out_dio; 2129 goto out_dio;
2008 } 2130 }
2009 } else { 2131 } else {
2010 written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); 2132 current->backing_dev_info = file->f_mapping->backing_dev_info;
2133 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
2134 ppos, count, 0);
2135 current->backing_dev_info = NULL;
2011 } 2136 }
2012 2137
2013out_dio: 2138out_dio:
@@ -2021,9 +2146,9 @@ out_dio:
2021 if (ret < 0) 2146 if (ret < 0)
2022 written = ret; 2147 written = ret;
2023 2148
2024 if (!ret && (old_size != i_size_read(inode) || 2149 if (!ret && ((old_size != i_size_read(inode)) ||
2025 old_clusters != OCFS2_I(inode)->ip_clusters || 2150 (old_clusters != OCFS2_I(inode)->ip_clusters) ||
2026 has_refcount)) { 2151 has_refcount)) {
2027 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2152 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2028 if (ret < 0) 2153 if (ret < 0)
2029 written = ret; 2154 written = ret;
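
Two things happen in the write path above: the segment and write checks
move out of the direct-I/O branch so both paths share them, and the
buffered path calls generic_file_buffered_write() directly instead of
__generic_file_aio_write(). That helper used to set
current->backing_dev_info, so ocfs2 now does it by hand; the annotated
shape, as shown in the hunk:

        /* balance_dirty_pages_ratelimited() throttles the writer against
         * current->backing_dev_info, so point it at the target device
         * for the duration of the copy */
        current->backing_dev_info = file->f_mapping->backing_dev_info;
        written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
                                              ppos, count, 0);
        current->backing_dev_info = NULL;
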
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h> 29#include <linux/highmem.h>
31 30
32#define MLOG_MASK_PREFIX ML_SUPER 31#define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 278a223aae14..abb0a95cc717 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/quotaops.h> 30#include <linux/quotaops.h>
@@ -377,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
377 376
378 OCFS2_I(inode)->ip_last_used_slot = 0; 377 OCFS2_I(inode)->ip_last_used_slot = 0;
379 OCFS2_I(inode)->ip_last_used_group = 0; 378 OCFS2_I(inode)->ip_last_used_group = 0;
379
380 if (S_ISDIR(inode->i_mode))
381 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
382 OCFS2_RESV_FLAG_DIR);
380 mlog_exit_void(); 383 mlog_exit_void();
381} 384}
382 385
@@ -540,7 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
540 struct buffer_head *fe_bh) 543 struct buffer_head *fe_bh)
541{ 544{
542 int status = 0; 545 int status = 0;
543 struct ocfs2_truncate_context *tc = NULL;
544 struct ocfs2_dinode *fe; 546 struct ocfs2_dinode *fe;
545 handle_t *handle = NULL; 547 handle_t *handle = NULL;
546 548
@@ -559,6 +561,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
559 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 561 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
560 if (IS_ERR(handle)) { 562 if (IS_ERR(handle)) {
561 status = PTR_ERR(handle); 563 status = PTR_ERR(handle);
564 handle = NULL;
562 mlog_errno(status); 565 mlog_errno(status);
563 goto out; 566 goto out;
564 } 567 }
@@ -582,13 +585,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
582 ocfs2_commit_trans(osb, handle); 585 ocfs2_commit_trans(osb, handle);
583 handle = NULL; 586 handle = NULL;
584 587
585 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); 588 status = ocfs2_commit_truncate(osb, inode, fe_bh);
586 if (status < 0) {
587 mlog_errno(status);
588 goto out;
589 }
590
591 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
592 if (status < 0) { 589 if (status < 0) {
593 mlog_errno(status); 590 mlog_errno(status);
594 goto out; 591 goto out;
@@ -640,11 +637,13 @@ static int ocfs2_remove_inode(struct inode *inode,
640 goto bail_unlock; 637 goto bail_unlock;
641 } 638 }
642 639
643 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 640 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
644 orphan_dir_bh); 641 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
645 if (status < 0) { 642 orphan_dir_bh);
646 mlog_errno(status); 643 if (status < 0) {
647 goto bail_commit; 644 mlog_errno(status);
645 goto bail_commit;
646 }
648 } 647 }
649 648
650 /* set the inodes dtime */ 649 /* set the inodes dtime */
@@ -657,12 +656,7 @@ static int ocfs2_remove_inode(struct inode *inode,
657 656
658 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 657 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
659 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 658 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
660 659 ocfs2_journal_dirty(handle, di_bh);
661 status = ocfs2_journal_dirty(handle, di_bh);
662 if (status < 0) {
663 mlog_errno(status);
664 goto bail_commit;
665 }
666 660
667 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 661 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
668 dquot_free_inode(inode); 662 dquot_free_inode(inode);
@@ -723,38 +717,39 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
723static int ocfs2_wipe_inode(struct inode *inode, 717static int ocfs2_wipe_inode(struct inode *inode,
724 struct buffer_head *di_bh) 718 struct buffer_head *di_bh)
725{ 719{
726 int status, orphaned_slot; 720 int status, orphaned_slot = -1;
727 struct inode *orphan_dir_inode = NULL; 721 struct inode *orphan_dir_inode = NULL;
728 struct buffer_head *orphan_dir_bh = NULL; 722 struct buffer_head *orphan_dir_bh = NULL;
729 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 723 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
730 struct ocfs2_dinode *di; 724 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
731 725
732 di = (struct ocfs2_dinode *) di_bh->b_data; 726 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
733 orphaned_slot = le16_to_cpu(di->i_orphaned_slot); 727 orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
734 728
735 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); 729 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
736 if (status) 730 if (status)
737 return status; 731 return status;
738 732
739 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 733 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
740 ORPHAN_DIR_SYSTEM_INODE, 734 ORPHAN_DIR_SYSTEM_INODE,
741 orphaned_slot); 735 orphaned_slot);
742 if (!orphan_dir_inode) { 736 if (!orphan_dir_inode) {
743 status = -EEXIST; 737 status = -EEXIST;
744 mlog_errno(status); 738 mlog_errno(status);
745 goto bail; 739 goto bail;
746 } 740 }
747 741
748 /* Lock the orphan dir. The lock will be held for the entire 742 /* Lock the orphan dir. The lock will be held for the entire
749 * delete_inode operation. We do this now to avoid races with 743 * delete_inode operation. We do this now to avoid races with
750 * recovery completion on other nodes. */ 744 * recovery completion on other nodes. */
751 mutex_lock(&orphan_dir_inode->i_mutex); 745 mutex_lock(&orphan_dir_inode->i_mutex);
752 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); 746 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
753 if (status < 0) { 747 if (status < 0) {
754 mutex_unlock(&orphan_dir_inode->i_mutex); 748 mutex_unlock(&orphan_dir_inode->i_mutex);
755 749
756 mlog_errno(status); 750 mlog_errno(status);
757 goto bail; 751 goto bail;
752 }
758 } 753 }
759 754
760 /* we do this while holding the orphan dir lock because we 755 /* we do this while holding the orphan dir lock because we
@@ -795,6 +790,9 @@ static int ocfs2_wipe_inode(struct inode *inode,
795 mlog_errno(status); 790 mlog_errno(status);
796 791
797bail_unlock_dir: 792bail_unlock_dir:
793 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
794 return status;
795
798 ocfs2_inode_unlock(orphan_dir_inode, 1); 796 ocfs2_inode_unlock(orphan_dir_inode, 1);
799 mutex_unlock(&orphan_dir_inode->i_mutex); 797 mutex_unlock(&orphan_dir_inode->i_mutex);
800 brelse(orphan_dir_bh); 798 brelse(orphan_dir_bh);
@@ -890,7 +888,23 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
890 888
891 /* Do some basic inode verification... */ 889 /* Do some basic inode verification... */
892 di = (struct ocfs2_dinode *) di_bh->b_data; 890 di = (struct ocfs2_dinode *) di_bh->b_data;
893 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { 891 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
892 !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
893 /*
894 * Inodes in the orphan dir must have ORPHANED_FL. The only
895 * inodes that come back out of the orphan dir are reflink
896 * targets. A reflink target may be moved out of the orphan
897 * dir between the time we scan the directory and the time we
898 * process it. This would lead to HAS_REFCOUNT_FL being set but
899 * ORPHANED_FL not.
900 */
901 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
902 mlog(0, "Reflinked inode %llu is no longer orphaned. "
903 "it shouldn't be deleted\n",
904 (unsigned long long)oi->ip_blkno);
905 goto bail;
906 }
907
894 /* for lack of a better error? */ 908 /* for lack of a better error? */
895 status = -EEXIST; 909 status = -EEXIST;
896 mlog(ML_ERROR, 910 mlog(ML_ERROR,
@@ -958,7 +972,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
958void ocfs2_delete_inode(struct inode *inode) 972void ocfs2_delete_inode(struct inode *inode)
959{ 973{
960 int wipe, status; 974 int wipe, status;
961 sigset_t blocked, oldset; 975 sigset_t oldset;
962 struct buffer_head *di_bh = NULL; 976 struct buffer_head *di_bh = NULL;
963 977
964 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 978 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -985,13 +999,7 @@ void ocfs2_delete_inode(struct inode *inode)
985 * messaging paths may return us -ERESTARTSYS. Which would 999 * messaging paths may return us -ERESTARTSYS. Which would
986 * cause us to exit early, resulting in inodes being orphaned 1000 * cause us to exit early, resulting in inodes being orphaned
987 * forever. */ 1001 * forever. */
988 sigfillset(&blocked); 1002 ocfs2_block_signals(&oldset);
989 status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
990 if (status < 0) {
991 mlog_errno(status);
992 ocfs2_cleanup_delete_inode(inode, 1);
993 goto bail;
994 }
995 1003
996 /* 1004 /*
997 * Synchronize us against ocfs2_get_dentry. We take this in 1005 * Synchronize us against ocfs2_get_dentry. We take this in
@@ -1065,9 +1073,7 @@ bail_unlock_nfs_sync:
1065 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); 1073 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
1066 1074
1067bail_unblock: 1075bail_unblock:
1068 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 1076 ocfs2_unblock_signals(&oldset);
1069 if (status < 0)
1070 mlog_errno(status);
1071bail: 1077bail:
1072 clear_inode(inode); 1078 clear_inode(inode);
1073 mlog_exit_void(); 1079 mlog_exit_void();
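
ocfs2_block_signals()/ocfs2_unblock_signals() replace the open-coded
sigprocmask() pair, folding away the cannot-really-fail error handling. A
sketch of the helpers, assuming they wrap sigprocmask() exactly as the
removed code did:

        void ocfs2_block_signals(sigset_t *oldset)
        {
                sigset_t blocked;

                sigfillset(&blocked);
                /* sigprocmask() on the current task cannot fail for
                 * SIG_BLOCK, hence no error path for callers */
                BUG_ON(sigprocmask(SIG_BLOCK, &blocked, oldset));
        }

        void ocfs2_unblock_signals(sigset_t *oldset)
        {
                BUG_ON(sigprocmask(SIG_SETMASK, oldset, NULL));
        }
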
@@ -1101,6 +1107,10 @@ void ocfs2_clear_inode(struct inode *inode)
1101 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); 1107 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
1102 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1108 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1103 1109
1110 ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
1111 &oi->ip_la_data_resv);
1112 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1113
1104 /* We very well may get a clear_inode before all an inodes 1114 /* We very well may get a clear_inode before all an inodes
1105 * metadata has hit disk. Of course, we can't drop any cluster 1115 * metadata has hit disk. Of course, we can't drop any cluster
1106 * locks until the journal has finished with it. The only 1116 * locks until the journal has finished with it. The only
@@ -1276,13 +1286,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1276 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 1286 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
1277 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1287 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1278 1288
1279 status = ocfs2_journal_dirty(handle, bh); 1289 ocfs2_journal_dirty(handle, bh);
1280 if (status < 0)
1281 mlog_errno(status);
1282
1283 status = 0;
1284leave: 1290leave:
1285
1286 mlog_exit(status); 1291 mlog_exit(status);
1287 return status; 1292 return status;
1288} 1293}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ba4fe07b293c..9f5f5fcadc45 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -70,6 +70,8 @@ struct ocfs2_inode_info
70 /* Only valid if the inode is the dir. */ 70 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 71 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 72 u64 ip_last_used_group;
73
74 struct ocfs2_alloc_reservation ip_la_data_resv;
73}; 75};
74 76
75/* 77/*
@@ -100,6 +102,8 @@ struct ocfs2_inode_info
100#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 102#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
101/* Does someone have the file open O_DIRECT */ 103/* Does someone have the file open O_DIRECT */
102#define OCFS2_INODE_OPEN_DIRECT 0x00000040 104#define OCFS2_INODE_OPEN_DIRECT 0x00000040
105/* Tell the inode wipe code it's not in orphan dir */
106#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080
103 107
104static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) 108static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
105{ 109{
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9336c60e3a36..47878cf16418 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
402} 402}
403 403
404/* 404/*
405 * 'nblocks' is what you want to add to the current 405 * 'nblocks' is what you want to add to the current transaction.
406 * transaction. extend_trans will either extend the current handle by
407 * nblocks, or commit it and start a new one with nblocks credits.
408 * 406 *
409 * This might call jbd2_journal_restart() which will commit dirty buffers 407 * This might call jbd2_journal_restart() which will commit dirty buffers
410 * and then restart the transaction. Before calling 408 * and then restart the transaction. Before calling
@@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
422 */ 420 */
423int ocfs2_extend_trans(handle_t *handle, int nblocks) 421int ocfs2_extend_trans(handle_t *handle, int nblocks)
424{ 422{
425 int status; 423 int status, old_nblocks;
426 424
427 BUG_ON(!handle); 425 BUG_ON(!handle);
428 BUG_ON(!nblocks); 426 BUG_ON(nblocks < 0);
427
428 if (!nblocks)
429 return 0;
429 430
431 old_nblocks = handle->h_buffer_credits;
430 mlog_entry_void(); 432 mlog_entry_void();
431 433
432 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 434 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
@@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
445 mlog(0, 447 mlog(0,
446 "jbd2_journal_extend failed, trying " 448 "jbd2_journal_extend failed, trying "
447 "jbd2_journal_restart\n"); 449 "jbd2_journal_restart\n");
448 status = jbd2_journal_restart(handle, nblocks); 450 status = jbd2_journal_restart(handle,
451 old_nblocks + nblocks);
449 if (status < 0) { 452 if (status < 0) {
450 mlog_errno(status); 453 mlog_errno(status);
451 goto bail; 454 goto bail;
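
The extension logic matters because jbd2_journal_restart() takes the
total credit count for the restarted handle, not an increment. Before
this change, a handle started with, say, 32 credits that asked for 8 more
would be restarted with only 8 credits whenever jbd2_journal_extend()
could not grow it in place; capturing h_buffer_credits first preserves
the caller's reservation:

        old_nblocks = handle->h_buffer_credits;

        status = jbd2_journal_extend(handle, nblocks);
        if (status > 0) /* no room left in the running transaction */
                status = jbd2_journal_restart(handle,
                                              old_nblocks + nblocks);
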
@@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
734 return __ocfs2_journal_access(handle, ci, bh, NULL, type); 737 return __ocfs2_journal_access(handle, ci, bh, NULL, type);
735} 738}
736 739
737int ocfs2_journal_dirty(handle_t *handle, 740void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
738 struct buffer_head *bh)
739{ 741{
740 int status; 742 int status;
741 743
@@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle,
743 (unsigned long long)bh->b_blocknr); 745 (unsigned long long)bh->b_blocknr);
744 746
745 status = jbd2_journal_dirty_metadata(handle, bh); 747 status = jbd2_journal_dirty_metadata(handle, bh);
746 if (status < 0) 748 BUG_ON(status);
747 mlog(ML_ERROR, "Could not dirty metadata buffer. "
748 "(bh->b_blocknr=%llu)\n",
749 (unsigned long long)bh->b_blocknr);
750 749
751 mlog_exit(status); 750 mlog_exit_void();
752 return status;
753} 751}
754 752
755#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 753#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3f74e09b0d80..b5baaa8e710f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
325 * <modify the bh> 325 * <modify the bh>
326 * ocfs2_journal_dirty(handle, bh); 326 * ocfs2_journal_dirty(handle, bh);
327 */ 327 */
328int ocfs2_journal_dirty(handle_t *handle, 328void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
329 struct buffer_head *bh);
330 329
331/* 330/*
332 * Credit Macros: 331 * Credit Macros:
@@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
562 return blocks; 561 return blocks;
563} 562}
564 563
564/*
565 * Allocating a discontiguous block group requires the credits from
566 * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
567 * the group descriptor's extent list. The caller already has started
568 * the transaction with ocfs2_calc_group_alloc_credits(). They extend
569 * it with these credits.
570 */
571static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
572{
573 return ocfs2_extent_recs_per_gd(sb);
574}
575
565static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, 576static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
566 unsigned int clusters_to_del, 577 unsigned int clusters_to_del,
567 struct ocfs2_dinode *fe, 578 struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ca992d91f511..3d7419682dc0 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
52 52
53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
54 struct ocfs2_dinode *alloc, 54 struct ocfs2_dinode *alloc,
55 u32 numbits); 55 u32 *numbits,
56 struct ocfs2_alloc_reservation *resv);
56 57
57static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); 58static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
58 59
@@ -74,6 +75,144 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
75 struct inode *local_alloc_inode); 76 struct inode *local_alloc_inode);
76 77
78/*
79 * ocfs2_la_default_mb() - determine a default size, in megabytes of
80 * the local alloc.
81 *
82 * Generally, we'd like to pick as large a local alloc as
83 * possible. Performance on large workloads tends to scale
84 * proportionally to la size. In addition to that, the reservations
85 * code functions more efficiently as it can reserve more windows for
86 * write.
87 *
88 * Some things work against us when trying to choose a large local alloc:
89 *
90 * - We need to ensure our sizing is picked to leave enough space in
91 * group descriptors for other allocations (such as block groups,
92 * etc). Picking default sizes which are a multiple of 4 could help
93 * - block groups are allocated in 2mb and 4mb chunks.
94 *
95 * - Likewise, we don't want to starve other nodes of bits on small
96 * file systems. This can easily be taken care of by limiting our
97 * default to a reasonable size (256M) on larger cluster sizes.
98 *
99 * - Some file systems can't support very large sizes - 4k and 8k in
100 * particular are limited to less than 128 and 256 megabytes respectively.
101 *
102 * The following reference table shows group descriptor and local
103 * alloc maximums at various cluster sizes (4k blocksize)
104 *
105 * csize: 4K group: 126M la: 121M
106 * csize: 8K group: 252M la: 243M
107 * csize: 16K group: 504M la: 486M
108 * csize: 32K group: 1008M la: 972M
109 * csize: 64K group: 2016M la: 1944M
110 * csize: 128K group: 4032M la: 3888M
111 * csize: 256K group: 8064M la: 7776M
112 * csize: 512K group: 16128M la: 15552M
113 * csize: 1024K group: 32256M la: 31104M
114 */
115#define OCFS2_LA_MAX_DEFAULT_MB 256
116#define OCFS2_LA_OLD_DEFAULT 8
117unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
118{
119 unsigned int la_mb;
120 unsigned int gd_mb;
121 unsigned int megs_per_slot;
122 struct super_block *sb = osb->sb;
123
124 gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
125 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
126
127 /*
 128 * This takes care of file systems with very small group
129 * descriptors - 512 byte blocksize at cluster sizes lower
130 * than 16K and also 1k blocksize with 4k cluster size.
131 */
132 if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
133 || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
134 return OCFS2_LA_OLD_DEFAULT;
135
136 /*
137 * Leave enough room for some block groups and make the final
138 * value we work from a multiple of 4.
139 */
140 gd_mb -= 16;
141 gd_mb &= 0xFFFFFFFB;
142
143 la_mb = gd_mb;
144
145 /*
146 * Keep window sizes down to a reasonable default
147 */
148 if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
149 /*
150 * Some clustersize / blocksize combinations will have
151 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
152 * default size, but get poor distribution when
153 * limited to exactly 256 megabytes.
154 *
155 * As an example, 16K clustersize at 4K blocksize
156 * gives us a cluster group size of 504M. Paring the
157 * local alloc size down to 256 however, would give us
158 * only one window and around 200MB left in the
159 * cluster group. Instead, find the first size below
160 * 256 which would give us an even distribution.
161 *
162 * Larger cluster group sizes actually work out pretty
163 * well when pared to 256, so we don't have to do this
164 * for any group that fits more than two
165 * OCFS2_LA_MAX_DEFAULT_MB windows.
166 */
167 if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
168 la_mb = 256;
169 else {
170 unsigned int gd_mult = gd_mb;
171
172 while (gd_mult > 256)
173 gd_mult = gd_mult >> 1;
174
175 la_mb = gd_mult;
176 }
177 }
178
179 megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
180 megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
181 /* Too many nodes, too few disk clusters. */
182 if (megs_per_slot < la_mb)
183 la_mb = megs_per_slot;
184
185 return la_mb;
186}
187
188void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
189{
190 struct super_block *sb = osb->sb;
191 unsigned int la_default_mb = ocfs2_la_default_mb(osb);
192 unsigned int la_max_mb;
193
194 la_max_mb = ocfs2_clusters_to_megabytes(sb,
195 ocfs2_local_alloc_size(sb) * 8);
196
197 mlog(0, "requested: %dM, max: %uM, default: %uM\n",
198 requested_mb, la_max_mb, la_default_mb);
199
200 if (requested_mb == -1) {
201 /* No user request - use defaults */
202 osb->local_alloc_default_bits =
203 ocfs2_megabytes_to_clusters(sb, la_default_mb);
204 } else if (requested_mb > la_max_mb) {
205 /* Request is too big, we give the maximum available */
206 osb->local_alloc_default_bits =
207 ocfs2_megabytes_to_clusters(sb, la_max_mb);
208 } else {
209 osb->local_alloc_default_bits =
210 ocfs2_megabytes_to_clusters(sb, requested_mb);
211 }
212
213 osb->local_alloc_bits = osb->local_alloc_default_bits;
214}
215
77static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) 216static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
78{ 217{
79 return (osb->local_alloc_state == OCFS2_LA_THROTTLED || 218 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
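
Following the new sizing code through the 4K blocksize / 16K clustersize
case from the reference table: the group descriptor covers 504M, minus 16
for other block groups gives 488, which the 0xFFFFFFFB mask leaves
unchanged. Since 488 is above OCFS2_LA_MAX_DEFAULT_MB but below two
maximum windows (512), the halving loop runs once: 488 >> 1 = 244, so the
default local alloc becomes 244M, giving two evenly sized windows per
group. (Note the mask only clears bit 2; rounding down to a true multiple
of 4, as the comment intends, would be gd_mb &= ~3.) The per-slot clamp
then applies on small volumes: a filesystem whose clusters-at-boot work
out to 100M per slot gets a 100M window regardless of the computed
default.
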
@@ -156,7 +295,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
156 osb->local_alloc_bits, (osb->bitmap_cpg - 1)); 295 osb->local_alloc_bits, (osb->bitmap_cpg - 1));
157 osb->local_alloc_bits = 296 osb->local_alloc_bits =
158 ocfs2_megabytes_to_clusters(osb->sb, 297 ocfs2_megabytes_to_clusters(osb->sb,
159 OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); 298 ocfs2_la_default_mb(osb));
160 } 299 }
161 300
162 /* read the alloc off disk */ 301 /* read the alloc off disk */
@@ -262,6 +401,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
262 401
263 osb->local_alloc_state = OCFS2_LA_DISABLED; 402 osb->local_alloc_state = OCFS2_LA_DISABLED;
264 403
404 ocfs2_resmap_uninit(&osb->osb_la_resmap);
405
265 main_bm_inode = ocfs2_get_system_file_inode(osb, 406 main_bm_inode = ocfs2_get_system_file_inode(osb,
266 GLOBAL_BITMAP_SYSTEM_INODE, 407 GLOBAL_BITMAP_SYSTEM_INODE,
267 OCFS2_INVALID_SLOT); 408 OCFS2_INVALID_SLOT);
@@ -305,12 +446,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
305 } 446 }
306 447
307 ocfs2_clear_local_alloc(alloc); 448 ocfs2_clear_local_alloc(alloc);
308 449 ocfs2_journal_dirty(handle, bh);
309 status = ocfs2_journal_dirty(handle, bh);
310 if (status < 0) {
311 mlog_errno(status);
312 goto out_commit;
313 }
314 450
315 brelse(bh); 451 brelse(bh);
316 osb->local_alloc_bh = NULL; 452 osb->local_alloc_bh = NULL;
@@ -481,46 +617,6 @@ out:
481 return status; 617 return status;
482} 618}
483 619
484/* Check to see if the local alloc window is within ac->ac_max_block */
485static int ocfs2_local_alloc_in_range(struct inode *inode,
486 struct ocfs2_alloc_context *ac,
487 u32 bits_wanted)
488{
489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
490 struct ocfs2_dinode *alloc;
491 struct ocfs2_local_alloc *la;
492 int start;
493 u64 block_off;
494
495 if (!ac->ac_max_block)
496 return 1;
497
498 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
499 la = OCFS2_LOCAL_ALLOC(alloc);
500
501 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
502 if (start == -1) {
503 mlog_errno(-ENOSPC);
504 return 0;
505 }
506
507 /*
508 * Converting (bm_off + start + bits_wanted) to blocks gives us
509 * the blkno just past our actual allocation. This is perfect
510 * to compare with ac_max_block.
511 */
512 block_off = ocfs2_clusters_to_blocks(inode->i_sb,
513 le32_to_cpu(la->la_bm_off) +
514 start + bits_wanted);
515 mlog(0, "Checking %llu against %llu\n",
516 (unsigned long long)block_off,
517 (unsigned long long)ac->ac_max_block);
518 if (block_off > ac->ac_max_block)
519 return 0;
520
521 return 1;
522}
523
524/* 620/*
525 * make sure we've got at least bits_wanted contiguous bits in the 621 * make sure we've got at least bits_wanted contiguous bits in the
526 * local alloc. You lose them when you drop i_mutex. 622 * local alloc. You lose them when you drop i_mutex.
@@ -613,17 +709,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
613 mlog(0, "Calling in_range for max block %llu\n", 709 mlog(0, "Calling in_range for max block %llu\n",
614 (unsigned long long)ac->ac_max_block); 710 (unsigned long long)ac->ac_max_block);
615 711
616 if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
617 bits_wanted)) {
618 /*
619 * The window is outside ac->ac_max_block.
620 * This errno tells the caller to keep localalloc enabled
621 * but to get the allocation from the main bitmap.
622 */
623 status = -EFBIG;
624 goto bail;
625 }
626
627 ac->ac_inode = local_alloc_inode; 712 ac->ac_inode = local_alloc_inode;
628 /* We should never use localalloc from another slot */ 713 /* We should never use localalloc from another slot */
629 ac->ac_alloc_slot = osb->slot_num; 714 ac->ac_alloc_slot = osb->slot_num;
@@ -664,7 +749,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
664 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 749 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
665 la = OCFS2_LOCAL_ALLOC(alloc); 750 la = OCFS2_LOCAL_ALLOC(alloc);
666 751
667 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); 752 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
753 ac->ac_resv);
668 if (start == -1) { 754 if (start == -1) {
669 /* TODO: Shouldn't we just BUG here? */ 755 /* TODO: Shouldn't we just BUG here? */
670 status = -ENOSPC; 756 status = -ENOSPC;
@@ -674,8 +760,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
674 760
675 bitmap = la->la_bitmap; 761 bitmap = la->la_bitmap;
676 *bit_off = le32_to_cpu(la->la_bm_off) + start; 762 *bit_off = le32_to_cpu(la->la_bm_off) + start;
677 /* local alloc is always contiguous by nature -- we never
678 * delete bits from it! */
679 *num_bits = bits_wanted; 763 *num_bits = bits_wanted;
680 764
681 status = ocfs2_journal_access_di(handle, 765 status = ocfs2_journal_access_di(handle,
@@ -687,18 +771,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
687 goto bail; 771 goto bail;
688 } 772 }
689 773
774 ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
775 bits_wanted);
776
690 while(bits_wanted--) 777 while(bits_wanted--)
691 ocfs2_set_bit(start++, bitmap); 778 ocfs2_set_bit(start++, bitmap);
692 779
693 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits); 780 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
781 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
694 782
695 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
696 if (status < 0) {
697 mlog_errno(status);
698 goto bail;
699 }
700
701 status = 0;
702bail: 783bail:
703 mlog_exit(status); 784 mlog_exit(status);
704 return status; 785 return status;
@@ -722,13 +803,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
722} 803}
723 804
724static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 805static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
725 struct ocfs2_dinode *alloc, 806 struct ocfs2_dinode *alloc,
726 u32 numbits) 807 u32 *numbits,
808 struct ocfs2_alloc_reservation *resv)
727{ 809{
728 int numfound, bitoff, left, startoff, lastzero; 810 int numfound, bitoff, left, startoff, lastzero;
811 int local_resv = 0;
812 struct ocfs2_alloc_reservation r;
729 void *bitmap = NULL; 813 void *bitmap = NULL;
814 struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
730 815
731 mlog_entry("(numbits wanted = %u)\n", numbits); 816 mlog_entry("(numbits wanted = %u)\n", *numbits);
732 817
733 if (!alloc->id1.bitmap1.i_total) { 818 if (!alloc->id1.bitmap1.i_total) {
734 mlog(0, "No bits in my window!\n"); 819 mlog(0, "No bits in my window!\n");
@@ -736,6 +821,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
736 goto bail; 821 goto bail;
737 } 822 }
738 823
824 if (!resv) {
825 local_resv = 1;
826 ocfs2_resv_init_once(&r);
827 ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
828 resv = &r;
829 }
830
831 numfound = *numbits;
832 if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
833 if (numfound < *numbits)
834 *numbits = numfound;
835 goto bail;
836 }
837
838 /*
839 * Code error. While reservations are enabled, local
840 * allocation should _always_ go through them.
841 */
842 BUG_ON(osb->osb_resv_level != 0);
843
844 /*
845 * Reservations are disabled. Handle this the old way.
846 */
847
739 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; 848 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
740 849
741 numfound = bitoff = startoff = 0; 850 numfound = bitoff = startoff = 0;
@@ -761,7 +870,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
761 startoff = bitoff+1; 870 startoff = bitoff+1;
762 } 871 }
763 /* we got everything we needed */ 872 /* we got everything we needed */
764 if (numfound == numbits) { 873 if (numfound == *numbits) {
765 /* mlog(0, "Found it all!\n"); */ 874 /* mlog(0, "Found it all!\n"); */
766 break; 875 break;
767 } 876 }
@@ -770,12 +879,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
770 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, 879 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
771 numfound); 880 numfound);
772 881
773 if (numfound == numbits) 882 if (numfound == *numbits)
774 bitoff = startoff - numfound; 883 bitoff = startoff - numfound;
775 else 884 else
776 bitoff = -1; 885 bitoff = -1;
777 886
778bail: 887bail:
888 if (local_resv)
889 ocfs2_resv_discard(resmap, resv);
890
779 mlog_exit(bitoff); 891 mlog_exit(bitoff);
780 return bitoff; 892 return bitoff;
781} 893}
@@ -872,8 +984,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
872 (unsigned long long)la_start_blk, 984 (unsigned long long)la_start_blk,
873 (unsigned long long)blkno); 985 (unsigned long long)blkno);
874 986
875 status = ocfs2_free_clusters(handle, main_bm_inode, 987 status = ocfs2_release_clusters(handle,
876 main_bm_bh, blkno, count); 988 main_bm_inode,
989 main_bm_bh, blkno,
990 count);
877 if (status < 0) { 991 if (status < 0) {
878 mlog_errno(status); 992 mlog_errno(status);
879 goto bail; 993 goto bail;
@@ -984,8 +1098,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
984 } 1098 }
985 1099
986retry_enospc: 1100retry_enospc:
987 (*ac)->ac_bits_wanted = osb->local_alloc_bits; 1101 (*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
988
989 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 1102 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
990 if (status == -ENOSPC) { 1103 if (status == -ENOSPC) {
991 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == 1104 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1048,7 +1161,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
1048 /* we used the generic suballoc reserve function, but we set 1161 /* we used the generic suballoc reserve function, but we set
1049 * everything up nicely, so there's no reason why we can't use 1162 * everything up nicely, so there's no reason why we can't use
1050 * the more specific cluster api to claim bits. */ 1163 * the more specific cluster api to claim bits. */
1051 status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, 1164 status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
1052 &cluster_off, &cluster_count); 1165 &cluster_off, &cluster_count);
1053 if (status == -ENOSPC) { 1166 if (status == -ENOSPC) {
1054retry_enospc: 1167retry_enospc:
@@ -1061,7 +1174,8 @@ retry_enospc:
1061 OCFS2_LA_DISABLED) 1174 OCFS2_LA_DISABLED)
1062 goto bail; 1175 goto bail;
1063 1176
1064 status = ocfs2_claim_clusters(osb, handle, ac, 1177 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1178 status = ocfs2_claim_clusters(handle, ac,
1065 osb->local_alloc_bits, 1179 osb->local_alloc_bits,
1066 &cluster_off, 1180 &cluster_off,
1067 &cluster_count); 1181 &cluster_count);
@@ -1096,6 +1210,9 @@ retry_enospc:
1096 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, 1210 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
1097 le16_to_cpu(la->la_size)); 1211 le16_to_cpu(la->la_size));
1098 1212
1213 ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
1214 OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
1215
1099 mlog(0, "New window allocated:\n"); 1216 mlog(0, "New window allocated:\n");
1100 mlog(0, "window la_bm_off = %u\n", 1217 mlog(0, "window la_bm_off = %u\n",
1101 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); 1218 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
@@ -1167,12 +1284,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1167 } 1284 }
1168 1285
1169 ocfs2_clear_local_alloc(alloc); 1286 ocfs2_clear_local_alloc(alloc);
1170 1287 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1171 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1172 if (status < 0) {
1173 mlog_errno(status);
1174 goto bail;
1175 }
1176 1288
1177 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, 1289 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
1178 main_bm_inode, main_bm_bh); 1290 main_bm_inode, main_bm_bh);
@@ -1190,7 +1302,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1190 1302
1191 atomic_inc(&osb->alloc_stats.moves); 1303 atomic_inc(&osb->alloc_stats.moves);
1192 1304
1193 status = 0;
1194bail: 1305bail:
1195 if (handle) 1306 if (handle)
1196 ocfs2_commit_trans(osb, handle); 1307 ocfs2_commit_trans(osb, handle);
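
The localalloc.c changes above reorder allocation so that the reservations code (ocfs2_resmap_resv_bits) is consulted first, with the linear bitmap scan kept only as the fallback for when reservations are disabled. The toy program below models just that control flow; the names and the single-offset "reservation" are simplifications, not the kernel's ocfs2_alloc_reservation machinery, and it ignores the new ability of the kernel function to hand back a shorter run than requested:

#include <stdio.h>
#include <string.h>

#define WINDOW_BITS 64

static char window[WINDOW_BITS];    /* 1 = bit allocated */

/* first-fit scan: the pre-reservations behaviour */
static int scan_clear_bits(int wanted)
{
    int run = 0;

    for (int i = 0; i < WINDOW_BITS; i++) {
        run = window[i] ? 0 : run + 1;
        if (run == wanted)
            return i - wanted + 1;
    }
    return -1;
}

/* try the caller's reservation first, then fall back to scanning */
static int find_clear_bits(int resv_start, int wanted)
{
    if (resv_start >= 0 && resv_start + wanted <= WINDOW_BITS) {
        int free = 1;

        for (int i = 0; i < wanted; i++)
            free &= !window[resv_start + i];
        if (free)
            return resv_start;
    }
    return scan_clear_bits(wanted);
}

int main(void)
{
    memset(window, 1, 8);    /* bits 0-7 already used; calls are independent */
    printf("start = %d\n", find_clear_bits(16, 4));  /* 16, via the hint */
    printf("start = %d\n", find_clear_bits(-1, 4));  /*  8, via the scan */
    return 0;
}
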
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f86653..1be9b5864460 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
30 30
31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); 31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
32 32
33void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
34unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
35
33int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, 36int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
34 int node_num, 37 int node_num,
35 struct ocfs2_dinode **alloc_copy); 38 struct ocfs2_dinode **alloc_copy);
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
133 133
134 if (!(fl->fl_flags & FL_POSIX)) 134 if (!(fl->fl_flags & FL_POSIX))
135 return -ENOLCK; 135 return -ENOLCK;
136 if (__mandatory_lock(inode)) 136 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
137 return -ENOLCK; 137 return -ENOLCK;
138 138
139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); 139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
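
The one-line locks.c fix narrows the mandatory-lock rejection so that unlock requests always reach ocfs2_plock(); before it, presumably, a held lock on a mandatory-lock inode could never be released through this path. A sketch of the fixed predicate in isolation (stub values, illustrative only):

#include <fcntl.h>
#include <stdio.h>

/* the fixed guard: only lock *requests* are refused */
static int would_reject(int mandatory, int fl_type)
{
    return mandatory && fl_type != F_UNLCK;
}

int main(void)
{
    printf("F_WRLCK on mandatory-lock inode: %s\n",
           would_reject(1, F_WRLCK) ? "-ENOLCK" : "passes through");
    printf("F_UNLCK on mandatory-lock inode: %s\n",
           would_reject(1, F_UNLCK) ? "-ENOLCK" : "passes through");
    return 0;
}
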
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..af2b8fe1f139 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/uio.h> 30#include <linux/uio.h>
@@ -42,44 +41,20 @@
42#include "file.h" 41#include "file.h"
43#include "inode.h" 42#include "inode.h"
44#include "mmap.h" 43#include "mmap.h"
44#include "super.h"
45 45
46static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
47{
48 /* The best way to deal with signals in the vm path is
49 * to block them upfront, rather than allowing the
50 * locking paths to return -ERESTARTSYS. */
51 sigfillset(blocked);
52
53 /* We should technically never get a bad return value
54 * from sigprocmask */
55 return sigprocmask(SIG_BLOCK, blocked, oldset);
56}
57
58static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
59{
60 return sigprocmask(SIG_SETMASK, oldset, NULL);
61}
62 46
63static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) 47static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
64{ 48{
65 sigset_t blocked, oldset; 49 sigset_t oldset;
66 int error, ret; 50 int ret;
67 51
68 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff); 52 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
69 53
70 error = ocfs2_vm_op_block_sigs(&blocked, &oldset); 54 ocfs2_block_signals(&oldset);
71 if (error < 0) {
72 mlog_errno(error);
73 ret = VM_FAULT_SIGBUS;
74 goto out;
75 }
76
77 ret = filemap_fault(area, vmf); 55 ret = filemap_fault(area, vmf);
56 ocfs2_unblock_signals(&oldset);
78 57
79 error = ocfs2_vm_op_unblock_sigs(&oldset);
80 if (error < 0)
81 mlog_errno(error);
82out:
83 mlog_exit_ptr(vmf->page); 58 mlog_exit_ptr(vmf->page);
84 return ret; 59 return ret;
85} 60}
@@ -159,14 +134,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
159 struct page *page = vmf->page; 134 struct page *page = vmf->page;
160 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 135 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
161 struct buffer_head *di_bh = NULL; 136 struct buffer_head *di_bh = NULL;
162 sigset_t blocked, oldset; 137 sigset_t oldset;
163 int ret, ret2; 138 int ret;
164 139
165 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); 140 ocfs2_block_signals(&oldset);
166 if (ret < 0) {
167 mlog_errno(ret);
168 return ret;
169 }
170 141
171 /* 142 /*
172 * The cluster locks taken will block a truncate from another 143 * The cluster locks taken will block a truncate from another
@@ -194,9 +165,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
194 ocfs2_inode_unlock(inode, 1); 165 ocfs2_inode_unlock(inode, 1);
195 166
196out: 167out:
197 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 168 ocfs2_unblock_signals(&oldset);
198 if (ret2 < 0)
199 mlog_errno(ret2);
200 if (ret) 169 if (ret)
201 ret = VM_FAULT_SIGBUS; 170 ret = VM_FAULT_SIGBUS;
202 return ret; 171 return ret;
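
ocfs2_block_signals()/ocfs2_unblock_signals() themselves are not part of this hunk (they arrive via the new "super.h" include); judging from the helpers they replace, they wrap the block-everything/restore pattern below. Userspace sketch under that assumption:

#include <signal.h>
#include <stdio.h>

static void block_all_signals(sigset_t *oldset)
{
    sigset_t blocked;

    /* Block signals up front rather than letting deep locking
     * paths fail with -ERESTARTSYS, as the removed comment said. */
    sigfillset(&blocked);
    sigprocmask(SIG_BLOCK, &blocked, oldset);
}

static void restore_signals(const sigset_t *oldset)
{
    sigprocmask(SIG_SETMASK, oldset, NULL);
}

int main(void)
{
    sigset_t oldset;

    block_all_signals(&oldset);
    puts("critical section: no signal delivery here");
    restore_signals(&oldset);
    return 0;
}
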
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d9cd4e373a53..f171b51a74f7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 84static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 85 handle_t *handle,
86 struct inode *inode, 86 struct inode *inode,
87 struct ocfs2_dinode *fe, 87 struct buffer_head *fe_bh,
88 char *name, 88 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 89 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 90 struct inode *orphan_dir_inode);
@@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
204 inode->i_nlink = 2; 204 inode->i_nlink = 2;
205 else 205 else
206 inode->i_nlink = 1; 206 inode->i_nlink = 1;
207 inode->i_uid = current_fsuid(); 207 inode_init_owner(inode, dir, mode);
208 if (dir->i_mode & S_ISGID) {
209 inode->i_gid = dir->i_gid;
210 if (S_ISDIR(mode))
211 mode |= S_ISGID;
212 } else
213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode;
215 dquot_initialize(inode); 208 dquot_initialize(inode);
216 return inode; 209 return inode;
217} 210}
@@ -239,6 +232,8 @@ static int ocfs2_mknod(struct inode *dir,
239 }; 232 };
240 int did_quota_inode = 0; 233 int did_quota_inode = 0;
241 struct ocfs2_dir_lookup_result lookup = { NULL, }; 234 struct ocfs2_dir_lookup_result lookup = { NULL, };
235 sigset_t oldset;
236 int did_block_signals = 0;
242 237
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 238 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 239 (unsigned long)dev, dentry->d_name.len,
@@ -350,6 +345,10 @@ static int ocfs2_mknod(struct inode *dir,
350 goto leave; 345 goto leave;
351 } 346 }
352 347
348 /* Starting to change things, restart is no longer possible. */
349 ocfs2_block_signals(&oldset);
350 did_block_signals = 1;
351
353 status = dquot_alloc_inode(inode); 352 status = dquot_alloc_inode(inode);
354 if (status) 353 if (status)
355 goto leave; 354 goto leave;
@@ -384,11 +383,7 @@ static int ocfs2_mknod(struct inode *dir,
384 goto leave; 383 goto leave;
385 } 384 }
386 ocfs2_add_links_count(dirfe, 1); 385 ocfs2_add_links_count(dirfe, 1);
387 status = ocfs2_journal_dirty(handle, parent_fe_bh); 386 ocfs2_journal_dirty(handle, parent_fe_bh);
388 if (status < 0) {
389 mlog_errno(status);
390 goto leave;
391 }
392 inc_nlink(dir); 387 inc_nlink(dir);
393 } 388 }
394 389
@@ -408,23 +403,28 @@ static int ocfs2_mknod(struct inode *dir,
408 } 403 }
409 } 404 }
410 405
411 status = ocfs2_add_entry(handle, dentry, inode, 406 /*
412 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 407 * Do this before adding the entry to the directory. We
413 &lookup); 408 * also set d_op after success so that ->d_iput() will clean up
414 if (status < 0) { 409 * the dentry lock even if ocfs2_add_entry() fails below.
410 */
411 status = ocfs2_dentry_attach_lock(dentry, inode,
412 OCFS2_I(dir)->ip_blkno);
413 if (status) {
415 mlog_errno(status); 414 mlog_errno(status);
416 goto leave; 415 goto leave;
417 } 416 }
417 dentry->d_op = &ocfs2_dentry_ops;
418 418
419 status = ocfs2_dentry_attach_lock(dentry, inode, 419 status = ocfs2_add_entry(handle, dentry, inode,
420 OCFS2_I(dir)->ip_blkno); 420 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
421 if (status) { 421 &lookup);
422 if (status < 0) {
422 mlog_errno(status); 423 mlog_errno(status);
423 goto leave; 424 goto leave;
424 } 425 }
425 426
426 insert_inode_hash(inode); 427 insert_inode_hash(inode);
427 dentry->d_op = &ocfs2_dentry_ops;
428 d_instantiate(dentry, inode); 428 d_instantiate(dentry, inode);
429 status = 0; 429 status = 0;
430leave: 430leave:
@@ -434,6 +434,8 @@ leave:
434 ocfs2_commit_trans(osb, handle); 434 ocfs2_commit_trans(osb, handle);
435 435
436 ocfs2_inode_unlock(dir, 1); 436 ocfs2_inode_unlock(dir, 1);
437 if (did_block_signals)
438 ocfs2_unblock_signals(&oldset);
437 439
438 if (status == -ENOSPC) 440 if (status == -ENOSPC)
439 mlog(0, "Disk is full\n"); 441 mlog(0, "Disk is full\n");
@@ -445,11 +447,6 @@ leave:
445 447
446 ocfs2_free_dir_lookup_result(&lookup); 448 ocfs2_free_dir_lookup_result(&lookup);
447 449
448 if ((status < 0) && inode) {
449 clear_nlink(inode);
450 iput(inode);
451 }
452
453 if (inode_ac) 450 if (inode_ac)
454 ocfs2_free_alloc_context(inode_ac); 451 ocfs2_free_alloc_context(inode_ac);
455 452
@@ -459,6 +456,17 @@ leave:
459 if (meta_ac) 456 if (meta_ac)
460 ocfs2_free_alloc_context(meta_ac); 457 ocfs2_free_alloc_context(meta_ac);
461 458
459 /*
460 * We should call iput after the i_mutex of the bitmap has
461 * been unlocked in ocfs2_free_alloc_context, or
462 * ocfs2_delete_inode will mutex_lock it again.
463 */
464 if ((status < 0) && inode) {
465 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
466 clear_nlink(inode);
467 iput(inode);
468 }
469
462 mlog_exit(status); 470 mlog_exit(status);
463 471
464 return status; 472 return status;
@@ -476,14 +484,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
476 int status = 0; 484 int status = 0;
477 struct ocfs2_dinode *fe = NULL; 485 struct ocfs2_dinode *fe = NULL;
478 struct ocfs2_extent_list *fel; 486 struct ocfs2_extent_list *fel;
479 u64 fe_blkno = 0; 487 u64 suballoc_loc, fe_blkno = 0;
480 u16 suballoc_bit; 488 u16 suballoc_bit;
481 u16 feat; 489 u16 feat;
482 490
483 *new_fe_bh = NULL; 491 *new_fe_bh = NULL;
484 492
485 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, 493 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
486 inode_ac, &suballoc_bit, &fe_blkno); 494 inode_ac, &suballoc_loc,
495 &suballoc_bit, &fe_blkno);
487 if (status < 0) { 496 if (status < 0) {
488 mlog_errno(status); 497 mlog_errno(status);
489 goto leave; 498 goto leave;
@@ -520,6 +529,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
520 fe->i_generation = cpu_to_le32(inode->i_generation); 529 fe->i_generation = cpu_to_le32(inode->i_generation);
521 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 530 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
522 fe->i_blkno = cpu_to_le64(fe_blkno); 531 fe->i_blkno = cpu_to_le64(fe_blkno);
532 fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
523 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 533 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
524 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 534 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
525 fe->i_uid = cpu_to_le32(inode->i_uid); 535 fe->i_uid = cpu_to_le32(inode->i_uid);
@@ -556,11 +566,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
556 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); 566 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
557 } 567 }
558 568
559 status = ocfs2_journal_dirty(handle, *new_fe_bh); 569 ocfs2_journal_dirty(handle, *new_fe_bh);
560 if (status < 0) {
561 mlog_errno(status);
562 goto leave;
563 }
564 570
565 ocfs2_populate_inode(inode, fe, 1); 571 ocfs2_populate_inode(inode, fe, 1);
566 ocfs2_ci_set_new(osb, INODE_CACHE(inode)); 572 ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -626,6 +632,7 @@ static int ocfs2_link(struct dentry *old_dentry,
626 struct ocfs2_dinode *fe = NULL; 632 struct ocfs2_dinode *fe = NULL;
627 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 633 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
628 struct ocfs2_dir_lookup_result lookup = { NULL, }; 634 struct ocfs2_dir_lookup_result lookup = { NULL, };
635 sigset_t oldset;
629 636
630 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 637 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
631 old_dentry->d_name.len, old_dentry->d_name.name, 638 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -682,6 +689,9 @@ static int ocfs2_link(struct dentry *old_dentry,
682 goto out_unlock_inode; 689 goto out_unlock_inode;
683 } 690 }
684 691
692 /* Starting to change things, restart is no longer possible. */
693 ocfs2_block_signals(&oldset);
694
685 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 695 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
686 OCFS2_JOURNAL_ACCESS_WRITE); 696 OCFS2_JOURNAL_ACCESS_WRITE);
687 if (err < 0) { 697 if (err < 0) {
@@ -694,14 +704,7 @@ static int ocfs2_link(struct dentry *old_dentry,
694 ocfs2_set_links_count(fe, inode->i_nlink); 704 ocfs2_set_links_count(fe, inode->i_nlink);
695 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 705 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
696 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 706 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
697 707 ocfs2_journal_dirty(handle, fe_bh);
698 err = ocfs2_journal_dirty(handle, fe_bh);
699 if (err < 0) {
700 ocfs2_add_links_count(fe, -1);
701 drop_nlink(inode);
702 mlog_errno(err);
703 goto out_commit;
704 }
705 708
706 err = ocfs2_add_entry(handle, dentry, inode, 709 err = ocfs2_add_entry(handle, dentry, inode,
707 OCFS2_I(inode)->ip_blkno, 710 OCFS2_I(inode)->ip_blkno,
@@ -725,6 +728,7 @@ static int ocfs2_link(struct dentry *old_dentry,
725 728
726out_commit: 729out_commit:
727 ocfs2_commit_trans(osb, handle); 730 ocfs2_commit_trans(osb, handle);
731 ocfs2_unblock_signals(&oldset);
728out_unlock_inode: 732out_unlock_inode:
729 ocfs2_inode_unlock(inode, 1); 733 ocfs2_inode_unlock(inode, 1);
730 734
@@ -879,7 +883,7 @@ static int ocfs2_unlink(struct inode *dir,
879 fe = (struct ocfs2_dinode *) fe_bh->b_data; 883 fe = (struct ocfs2_dinode *) fe_bh->b_data;
880 884
881 if (inode_is_unlinkable(inode)) { 885 if (inode_is_unlinkable(inode)) {
882 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 886 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
883 &orphan_insert, orphan_dir); 887 &orphan_insert, orphan_dir);
884 if (status < 0) { 888 if (status < 0) {
885 mlog_errno(status); 889 mlog_errno(status);
@@ -898,12 +902,7 @@ static int ocfs2_unlink(struct inode *dir,
898 drop_nlink(inode); 902 drop_nlink(inode);
899 drop_nlink(inode); 903 drop_nlink(inode);
900 ocfs2_set_links_count(fe, inode->i_nlink); 904 ocfs2_set_links_count(fe, inode->i_nlink);
901 905 ocfs2_journal_dirty(handle, fe_bh);
902 status = ocfs2_journal_dirty(handle, fe_bh);
903 if (status < 0) {
904 mlog_errno(status);
905 goto leave;
906 }
907 906
908 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 907 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
909 if (S_ISDIR(inode->i_mode)) 908 if (S_ISDIR(inode->i_mode))
@@ -1300,7 +1299,7 @@ static int ocfs2_rename(struct inode *old_dir,
1300 if (S_ISDIR(new_inode->i_mode) || 1299 if (S_ISDIR(new_inode->i_mode) ||
1301 (ocfs2_read_links_count(newfe) == 1)) { 1300 (ocfs2_read_links_count(newfe) == 1)) {
1302 status = ocfs2_orphan_add(osb, handle, new_inode, 1301 status = ocfs2_orphan_add(osb, handle, new_inode,
1303 newfe, orphan_name, 1302 newfe_bh, orphan_name,
1304 &orphan_insert, orphan_dir); 1303 &orphan_insert, orphan_dir);
1305 if (status < 0) { 1304 if (status < 0) {
1306 mlog_errno(status); 1305 mlog_errno(status);
@@ -1321,12 +1320,7 @@ static int ocfs2_rename(struct inode *old_dir,
1321 ocfs2_set_links_count(newfe, 0); 1320 ocfs2_set_links_count(newfe, 0);
1322 else 1321 else
1323 ocfs2_add_links_count(newfe, -1); 1322 ocfs2_add_links_count(newfe, -1);
1324 1323 ocfs2_journal_dirty(handle, newfe_bh);
1325 status = ocfs2_journal_dirty(handle, newfe_bh);
1326 if (status < 0) {
1327 mlog_errno(status);
1328 goto bail;
1329 }
1330 } else { 1324 } else {
1331 /* if the name was not found in new_dir, add it now */ 1325 /* if the name was not found in new_dir, add it now */
1332 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1326 status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1345,10 +1339,7 @@ static int ocfs2_rename(struct inode *old_dir,
1345 1339
1346 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); 1340 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
1347 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); 1341 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
1348 1342 ocfs2_journal_dirty(handle, old_inode_bh);
1349 status = ocfs2_journal_dirty(handle, old_inode_bh);
1350 if (status < 0)
1351 mlog_errno(status);
1352 } else 1343 } else
1353 mlog_errno(status); 1344 mlog_errno(status);
1354 1345
@@ -1420,7 +1411,7 @@ static int ocfs2_rename(struct inode *old_dir,
1420 OCFS2_JOURNAL_ACCESS_WRITE); 1411 OCFS2_JOURNAL_ACCESS_WRITE);
1421 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1412 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1422 ocfs2_set_links_count(fe, old_dir->i_nlink); 1413 ocfs2_set_links_count(fe, old_dir->i_nlink);
1423 status = ocfs2_journal_dirty(handle, old_dir_bh); 1414 ocfs2_journal_dirty(handle, old_dir_bh);
1424 } 1415 }
1425 } 1416 }
1426 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1417 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1552,11 +1543,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1552 (bytes_left > sb->s_blocksize) ? sb->s_blocksize : 1543 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1553 bytes_left); 1544 bytes_left);
1554 1545
1555 status = ocfs2_journal_dirty(handle, bhs[virtual]); 1546 ocfs2_journal_dirty(handle, bhs[virtual]);
1556 if (status < 0) {
1557 mlog_errno(status);
1558 goto bail;
1559 }
1560 1547
1561 virtual++; 1548 virtual++;
1562 p_blkno++; 1549 p_blkno++;
@@ -1600,6 +1587,8 @@ static int ocfs2_symlink(struct inode *dir,
1600 }; 1587 };
1601 int did_quota = 0, did_quota_inode = 0; 1588 int did_quota = 0, did_quota_inode = 0;
1602 struct ocfs2_dir_lookup_result lookup = { NULL, }; 1589 struct ocfs2_dir_lookup_result lookup = { NULL, };
1590 sigset_t oldset;
1591 int did_block_signals = 0;
1603 1592
1604 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1593 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1605 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1594 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1695,6 +1684,10 @@ static int ocfs2_symlink(struct inode *dir,
1695 goto bail; 1684 goto bail;
1696 } 1685 }
1697 1686
1687 /* Starting to change things, restart is no longer possible. */
1688 ocfs2_block_signals(&oldset);
1689 did_block_signals = 1;
1690
1698 status = dquot_alloc_inode(inode); 1691 status = dquot_alloc_inode(inode);
1699 if (status) 1692 if (status)
1700 goto bail; 1693 goto bail;
@@ -1771,22 +1764,27 @@ static int ocfs2_symlink(struct inode *dir,
1771 } 1764 }
1772 } 1765 }
1773 1766
1774 status = ocfs2_add_entry(handle, dentry, inode, 1767 /*
1775 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1768 * Do this before adding the entry to the directory. We
1776 &lookup); 1769 * also set d_op after success so that ->d_iput() will clean up
1777 if (status < 0) { 1770 * the dentry lock even if ocfs2_add_entry() fails below.
1771 */
1772 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
1773 if (status) {
1778 mlog_errno(status); 1774 mlog_errno(status);
1779 goto bail; 1775 goto bail;
1780 } 1776 }
1777 dentry->d_op = &ocfs2_dentry_ops;
1781 1778
1782 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); 1779 status = ocfs2_add_entry(handle, dentry, inode,
1783 if (status) { 1780 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1781 &lookup);
1782 if (status < 0) {
1784 mlog_errno(status); 1783 mlog_errno(status);
1785 goto bail; 1784 goto bail;
1786 } 1785 }
1787 1786
1788 insert_inode_hash(inode); 1787 insert_inode_hash(inode);
1789 dentry->d_op = &ocfs2_dentry_ops;
1790 d_instantiate(dentry, inode); 1788 d_instantiate(dentry, inode);
1791bail: 1789bail:
1792 if (status < 0 && did_quota) 1790 if (status < 0 && did_quota)
@@ -1798,6 +1796,8 @@ bail:
1798 ocfs2_commit_trans(osb, handle); 1796 ocfs2_commit_trans(osb, handle);
1799 1797
1800 ocfs2_inode_unlock(dir, 1); 1798 ocfs2_inode_unlock(dir, 1);
1799 if (did_block_signals)
1800 ocfs2_unblock_signals(&oldset);
1801 1801
1802 brelse(new_fe_bh); 1802 brelse(new_fe_bh);
1803 brelse(parent_fe_bh); 1803 brelse(parent_fe_bh);
@@ -1811,6 +1811,7 @@ bail:
1811 if (xattr_ac) 1811 if (xattr_ac)
1812 ocfs2_free_alloc_context(xattr_ac); 1812 ocfs2_free_alloc_context(xattr_ac);
1813 if ((status < 0) && inode) { 1813 if ((status < 0) && inode) {
1814 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
1814 clear_nlink(inode); 1815 clear_nlink(inode);
1815 iput(inode); 1816 iput(inode);
1816 } 1817 }
@@ -1911,7 +1912,7 @@ leave:
1911static int ocfs2_orphan_add(struct ocfs2_super *osb, 1912static int ocfs2_orphan_add(struct ocfs2_super *osb,
1912 handle_t *handle, 1913 handle_t *handle,
1913 struct inode *inode, 1914 struct inode *inode,
1914 struct ocfs2_dinode *fe, 1915 struct buffer_head *fe_bh,
1915 char *name, 1916 char *name,
1916 struct ocfs2_dir_lookup_result *lookup, 1917 struct ocfs2_dir_lookup_result *lookup,
1917 struct inode *orphan_dir_inode) 1918 struct inode *orphan_dir_inode)
@@ -1919,6 +1920,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1919 struct buffer_head *orphan_dir_bh = NULL; 1920 struct buffer_head *orphan_dir_bh = NULL;
1920 int status = 0; 1921 int status = 0;
1921 struct ocfs2_dinode *orphan_fe; 1922 struct ocfs2_dinode *orphan_fe;
1923 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1922 1924
1923 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1925 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1924 1926
@@ -1943,29 +1945,42 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1943 if (S_ISDIR(inode->i_mode)) 1945 if (S_ISDIR(inode->i_mode))
1944 ocfs2_add_links_count(orphan_fe, 1); 1946 ocfs2_add_links_count(orphan_fe, 1);
1945 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 1947 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1948 ocfs2_journal_dirty(handle, orphan_dir_bh);
1946 1949
1947 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 1950 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1951 OCFS2_ORPHAN_NAMELEN, inode,
1952 OCFS2_I(inode)->ip_blkno,
1953 orphan_dir_bh, lookup);
1948 if (status < 0) { 1954 if (status < 0) {
1949 mlog_errno(status); 1955 mlog_errno(status);
1950 goto leave; 1956 goto leave;
1951 } 1957 }
1952 1958
1953 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1959 /*
1954 OCFS2_ORPHAN_NAMELEN, inode, 1960 * We're going to journal the change of i_flags and i_orphaned_slot.
1955 OCFS2_I(inode)->ip_blkno, 1961 * It's safe anyway, though some callers may duplicate the journaling.
1956 orphan_dir_bh, lookup); 1962 * Journaling within the func just make the logic look more
1963 * straightforward.
1964 */
1965 status = ocfs2_journal_access_di(handle,
1966 INODE_CACHE(inode),
1967 fe_bh,
1968 OCFS2_JOURNAL_ACCESS_WRITE);
1957 if (status < 0) { 1969 if (status < 0) {
1958 mlog_errno(status); 1970 mlog_errno(status);
1959 goto leave; 1971 goto leave;
1960 } 1972 }
1961 1973
1962 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 1974 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
1975 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
1963 1976
1964 /* Record which orphan dir our inode now resides 1977 /* Record which orphan dir our inode now resides
1965 * in. delete_inode will use this to determine which orphan 1978 * in. delete_inode will use this to determine which orphan
1966 * dir to lock. */ 1979 * dir to lock. */
1967 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 1980 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
1968 1981
1982 ocfs2_journal_dirty(handle, fe_bh);
1983
1969 mlog(0, "Inode %llu orphaned in slot %d\n", 1984 mlog(0, "Inode %llu orphaned in slot %d\n",
1970 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1985 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
1971 1986
@@ -2029,12 +2044,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2029 if (S_ISDIR(inode->i_mode)) 2044 if (S_ISDIR(inode->i_mode))
2030 ocfs2_add_links_count(orphan_fe, -1); 2045 ocfs2_add_links_count(orphan_fe, -1);
2031 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2046 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2032 2047 ocfs2_journal_dirty(handle, orphan_dir_bh);
2033 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2034 if (status < 0) {
2035 mlog_errno(status);
2036 goto leave;
2037 }
2038 2048
2039leave: 2049leave:
2040 ocfs2_free_dir_lookup_result(&lookup); 2050 ocfs2_free_dir_lookup_result(&lookup);
@@ -2123,7 +2133,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2123 } 2133 }
2124 2134
2125 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2135 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2126 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, 2136 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2127 &orphan_insert, orphan_dir); 2137 &orphan_insert, orphan_dir);
2128 if (status < 0) { 2138 if (status < 0) {
2129 mlog_errno(status); 2139 mlog_errno(status);
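
The ocfs2_get_init_inode() hunk replaces open-coded owner setup with inode_init_owner(). The standalone model below reproduces the semantics of the removed block, which is what the generic helper is expected to provide: uid from the caller, gid (and, for new directories, the setgid bit) inherited from a setgid parent. Plain ints stand in for kuid_t/kgid_t, and the toy_* names are hypothetical:

#include <stdio.h>
#include <sys/stat.h>

struct toy_inode { int uid, gid, mode; };

static void toy_init_owner(struct toy_inode *inode,
                           const struct toy_inode *dir,
                           int mode, int fsuid, int fsgid)
{
    inode->uid = fsuid;
    if (dir->mode & S_ISGID) {
        inode->gid = dir->gid;
        if (S_ISDIR(mode))
            mode |= S_ISGID;    /* new dirs inherit setgid */
    } else
        inode->gid = fsgid;
    inode->mode = mode;
}

int main(void)
{
    struct toy_inode dir = { 0, 100, S_IFDIR | S_ISGID | 0775 };
    struct toy_inode child;

    toy_init_owner(&child, &dir, S_IFDIR | 0755, 1000, 1000);
    printf("gid=%d setgid=%s\n", child.gid,
           (child.mode & S_ISGID) ? "yes" : "no");   /* gid=100 setgid=yes */
    return 0;
}
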
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1238b491db90..c67003b6b5a2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -47,6 +47,7 @@
47/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
48#include "blockcheck.h" 48#include "blockcheck.h"
49 49
50#include "reservations.h"
50 51
51/* Caching of metadata buffers */ 52/* Caching of metadata buffers */
52 53
@@ -341,6 +342,9 @@ struct ocfs2_super
341 */ 342 */
342 unsigned int local_alloc_bits; 343 unsigned int local_alloc_bits;
343 unsigned int local_alloc_default_bits; 344 unsigned int local_alloc_default_bits;
345 /* osb_clusters_at_boot can become stale! Do not trust it to
346 * be up to date. */
347 unsigned int osb_clusters_at_boot;
344 348
345 enum ocfs2_local_alloc_state local_alloc_state; /* protected 349 enum ocfs2_local_alloc_state local_alloc_state; /* protected
346 * by osb_lock */ 350 * by osb_lock */
@@ -349,6 +353,11 @@ struct ocfs2_super
349 353
350 u64 la_last_gd; 354 u64 la_last_gd;
351 355
356 struct ocfs2_reservation_map osb_la_resmap;
357
358 unsigned int osb_resv_level;
359 unsigned int osb_dir_resv_level;
360
352 /* Next three fields are for local node slot recovery during 361 /* Next three fields are for local node slot recovery during
353 * mount. */ 362 * mount. */
354 int dirty; 363 int dirty;
@@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
482 return 0; 491 return 0;
483} 492}
484 493
494static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
495{
496 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
497 return 1;
498 return 0;
499}
500
485static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) 501static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
486{ 502{
487 if (ocfs2_supports_indexed_dirs(osb)) 503 if (ocfs2_supports_indexed_dirs(osb))
@@ -763,8 +779,24 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 779 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
764} 780}
765 781
766#define ocfs2_set_bit ext2_set_bit 782static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
767#define ocfs2_clear_bit ext2_clear_bit 783 unsigned int clusters)
784{
785 return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
786}
787
788static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
789{
790 ext2_set_bit(bit, bitmap);
791}
792#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
793
794static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
795{
796 ext2_clear_bit(bit, bitmap);
797}
798#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
799
768#define ocfs2_test_bit ext2_test_bit 800#define ocfs2_test_bit ext2_test_bit
769#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 801#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
770#define ocfs2_find_next_bit ext2_find_next_bit 802#define ocfs2_find_next_bit ext2_find_next_bit
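
A quick check of the new conversion helpers added to ocfs2.h above. With 4K clusters (s_clustersize_bits = 12) the shift count is 20 - 12 = 8, so one megabyte is 256 clusters and the two functions are exact inverses for whole megabytes; the sample values are illustrative:

#include <stdio.h>

static unsigned int megs_to_clusters(unsigned int megs, int csbits)
{
    return megs << (20 - csbits);
}

static unsigned int clusters_to_megs(unsigned int clusters, int csbits)
{
    return clusters >> (20 - csbits);
}

int main(void)
{
    int csbits = 12;    /* 4K clusters */
    unsigned int c = megs_to_clusters(8, csbits);

    printf("8MB = %u clusters, back to %uMB\n",
           c, clusters_to_megs(c, csbits));   /* 2048 clusters, 8MB */
    return 0;
}
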
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bb37218a7978..33f1c9a8258d 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -100,7 +100,8 @@
100 | OCFS2_FEATURE_INCOMPAT_XATTR \ 100 | OCFS2_FEATURE_INCOMPAT_XATTR \
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
104#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
105 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
106 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -165,6 +166,9 @@
165/* Refcount tree support */ 166/* Refcount tree support */
166#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 167#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
167 168
169/* Discontiguous block groups */
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171
168/* 172/*
169 * backup superblock flag is used to indicate that this volume 173 * backup superblock flag is used to indicate that this volume
170 * has backup superblocks. 174 * has backup superblocks.
@@ -283,14 +287,6 @@
283#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 287#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
284 288
285/* 289/*
286 * Default local alloc size (in megabytes)
287 *
288 * The value chosen should be such that most allocations, including new
289 * block groups, use local alloc.
290 */
291#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
292
293/*
294 * Inline extended attribute size (in bytes) 290 * Inline extended attribute size (in bytes)
295 * The value chosen should be aligned to 16 byte boundaries. 291 * The value chosen should be aligned to 16 byte boundaries.
296 */ 292 */
@@ -512,7 +508,10 @@ struct ocfs2_extent_block
512 block group */ 508 block group */
513 __le32 h_fs_generation; /* Must match super block */ 509 __le32 h_fs_generation; /* Must match super block */
514 __le64 h_blkno; /* Offset on disk, in blocks */ 510 __le64 h_blkno; /* Offset on disk, in blocks */
515/*20*/ __le64 h_reserved3; 511/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
512 eb belongs to. Only valid
513 if allocated from a
514 discontiguous block group */
516 __le64 h_next_leaf_blk; /* Offset on disk, in blocks, 515 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
517 of next leaf header pointing 516 of next leaf header pointing
518 to data */ 517 to data */
@@ -679,7 +678,11 @@ struct ocfs2_dinode {
679/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 678/*80*/ struct ocfs2_block_check i_check; /* Error checking */
680/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ 679/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
681/*90*/ __le64 i_refcount_loc; 680/*90*/ __le64 i_refcount_loc;
682 __le64 i_reserved2[4]; 681 __le64 i_suballoc_loc; /* Suballocator block group this
682 inode belongs to. Only valid
683 if allocated from a
684 discontiguous block group */
685/*A0*/ __le64 i_reserved2[3];
683/*B8*/ union { 686/*B8*/ union {
684 __le64 i_pad1; /* Generic way to refer to this 687 __le64 i_pad1; /* Generic way to refer to this
685 64bit union */ 688 64bit union */
@@ -814,7 +817,12 @@ struct ocfs2_dx_root_block {
814 __le32 dr_reserved2; 817 __le32 dr_reserved2;
815 __le64 dr_free_blk; /* Pointer to head of free 818 __le64 dr_free_blk; /* Pointer to head of free
816 * unindexed block list. */ 819 * unindexed block list. */
817 __le64 dr_reserved3[15]; 820 __le64 dr_suballoc_loc; /* Suballocator block group
821 this root belongs to.
822 Only valid if allocated
823 from a discontiguous
824 block group */
825 __le64 dr_reserved3[14];
818 union { 826 union {
819 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 827 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
820 * bits for maximum space 828 * bits for maximum space
@@ -840,6 +848,13 @@ struct ocfs2_dx_leaf {
840}; 848};
841 849
842/* 850/*
851 * Largest bitmap for a block (suballocator) group in bytes. This limit
852 * does not affect cluster groups (global allocator). Cluster group
853 * bitmaps run to the end of the block.
854 */
855#define OCFS2_MAX_BG_BITMAP_SIZE 256
856
857/*
843 * On disk allocator group structure for OCFS2 858 * On disk allocator group structure for OCFS2
844 */ 859 */
845struct ocfs2_group_desc 860struct ocfs2_group_desc
@@ -860,7 +875,29 @@ struct ocfs2_group_desc
860 __le64 bg_blkno; /* Offset on disk, in blocks */ 875 __le64 bg_blkno; /* Offset on disk, in blocks */
861/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ 876/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
862 __le64 bg_reserved2; 877 __le64 bg_reserved2;
863/*40*/ __u8 bg_bitmap[0]; 878/*40*/ union {
879 __u8 bg_bitmap[0];
880 struct {
881 /*
882 * Block groups may be discontiguous when
883 * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
884 * The extents of a discontiguous block group are
885 * stored in bg_list. It is a flat list.
886 * l_tree_depth must always be zero. A
887 * discontiguous group is signified by a non-zero
888 * bg_list->l_next_free_rec. Only block groups
889 * can be discontiguous; cluster groups cannot.
890 * We've never made a block group with more than
891 * 2048 blocks (256 bytes of bg_bitmap). This
892 * codifies that limit so that we can fit bg_list.
893 * bg_size of a discontiguous block group will
894 * be 256 to match bg_bitmap_filler.
895 */
896 __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
897/*140*/ struct ocfs2_extent_list bg_list;
898 };
899 };
900/* Actual on-disk size is one block */
864}; 901};
865 902
866struct ocfs2_refcount_rec { 903struct ocfs2_refcount_rec {
@@ -905,7 +942,11 @@ struct ocfs2_refcount_block {
905/*40*/ __le32 rf_generation; /* generation number. all be the same 942/*40*/ __le32 rf_generation; /* generation number. all be the same
906 * for the same refcount tree. */ 943 * for the same refcount tree. */
907 __le32 rf_reserved0; 944 __le32 rf_reserved0;
908 __le64 rf_reserved1[7]; 945 __le64 rf_suballoc_loc; /* Suballocator block group this
946 refcount block belongs to. Only
947 valid if allocated from a
948 discontiguous block group */
949/*50*/ __le64 rf_reserved1[6];
909/*80*/ union { 950/*80*/ union {
910 struct ocfs2_refcount_list rf_records; /* List of refcount 951 struct ocfs2_refcount_list rf_records; /* List of refcount
911 records */ 952 records */
@@ -1017,7 +1058,10 @@ struct ocfs2_xattr_block {
1017 real xattr or a xattr tree. */ 1058 real xattr or a xattr tree. */
1018 __le16 xb_reserved0; 1059 __le16 xb_reserved0;
1019 __le32 xb_reserved1; 1060 __le32 xb_reserved1;
1020 __le64 xb_reserved2; 1061 __le64 xb_suballoc_loc; /* Suballocator block group this
1062 xattr block belongs to. Only
1063 valid if allocated from a
1064 discontiguous block group */
1021/*30*/ union { 1065/*30*/ union {
1022 struct ocfs2_xattr_header xb_header; /* xattr header if this 1066 struct ocfs2_xattr_header xb_header; /* xattr header if this
1023 block contains xattr */ 1067 block contains xattr */
@@ -1254,6 +1298,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1254 return size / sizeof(struct ocfs2_extent_rec); 1298 return size / sizeof(struct ocfs2_extent_rec);
1255} 1299}
1256 1300
1301static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
1302{
1303 int size;
1304
1305 size = sb->s_blocksize -
1306 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1307
1308 return size / sizeof(struct ocfs2_extent_rec);
1309}
1310
1257static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) 1311static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1258{ 1312{
1259 int size; 1313 int size;
@@ -1284,13 +1338,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1284 return size; 1338 return size;
1285} 1339}
1286 1340
1287static inline int ocfs2_group_bitmap_size(struct super_block *sb) 1341static inline int ocfs2_group_bitmap_size(struct super_block *sb,
1342 int suballocator,
1343 u32 feature_incompat)
1288{ 1344{
1289 int size; 1345 int size = sb->s_blocksize -
1290
1291 size = sb->s_blocksize -
1292 offsetof(struct ocfs2_group_desc, bg_bitmap); 1346 offsetof(struct ocfs2_group_desc, bg_bitmap);
1293 1347
1348 /*
1349 * The cluster allocator uses the entire block. Suballocators have
1350 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1351 * code expects bg_size set to the maximum. Thus we must keep
1352 * bg_size as-is unless discontig_bg is enabled.
1353 */
1354 if (suballocator &&
1355 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1356 size = OCFS2_MAX_BG_BITMAP_SIZE;
1357
1294 return size; 1358 return size;
1295} 1359}
1296 1360
@@ -1402,23 +1466,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
1402 return size / sizeof(struct ocfs2_extent_rec); 1466 return size / sizeof(struct ocfs2_extent_rec);
1403} 1467}
1404 1468
1405static inline int ocfs2_local_alloc_size(int blocksize) 1469static inline int ocfs2_extent_recs_per_gd(int blocksize)
1406{ 1470{
1407 int size; 1471 int size;
1408 1472
1409 size = blocksize - 1473 size = blocksize -
1410 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); 1474 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1411 1475
1412 return size; 1476 return size / sizeof(struct ocfs2_extent_rec);
1413} 1477}
1414 1478
1415static inline int ocfs2_group_bitmap_size(int blocksize) 1479static inline int ocfs2_local_alloc_size(int blocksize)
1416{ 1480{
1417 int size; 1481 int size;
1418 1482
1419 size = blocksize - 1483 size = blocksize -
1484 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
1485
1486 return size;
1487}
1488
1489static inline int ocfs2_group_bitmap_size(int blocksize,
1490 int suballocator,
1491 uint32_t feature_incompat)
1492{
1493 int size = blocksize -
1420 offsetof(struct ocfs2_group_desc, bg_bitmap); 1494 offsetof(struct ocfs2_group_desc, bg_bitmap);
1421 1495
1496 /*
1497 * The cluster allocator uses the entire block. Suballocators have
1498 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1499 * code expects bg_size set to the maximum. Thus we must keep
1500 * bg_size as-is unless discontig_bg is enabled.
1501 */
1502 if (suballocator &&
1503 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1504 size = OCFS2_MAX_BG_BITMAP_SIZE;
1505
1422 return size; 1506 return size;
1423} 1507}
1424 1508
@@ -1491,5 +1575,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
1491 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 1575 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1492} 1576}
1493 1577
1578static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
1579{
1580 if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
1581 le16_to_cpu(gd->bg_size)) !=
1582 offsetof(struct ocfs2_group_desc, bg_list))
1583 return 0;
1584 /*
1585 * Only valid to check l_next_free_rec if
1586 * bg_bitmap + bg_size == bg_list.
1587 */
1588 if (!gd->bg_list.l_next_free_rec)
1589 return 0;
1590 return 1;
1591}
1494#endif /* _OCFS2_FS_H */ 1592#endif /* _OCFS2_FS_H */
1495 1593
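
The discontiguous group-descriptor layout above can be sanity-checked with offsetof(): bg_bitmap starts at 0x40, so a 256-byte filler puts bg_list at 0x140, matching the 140 offset marker in the hunk, and matching what ocfs2_gd_is_discontig() tests (bg_bitmap offset + bg_size == bg_list offset). Cut-down toy structs, assuming the extent list header packs to three u16s with no padding surprises:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define MAX_BG_BITMAP_SIZE 256

struct toy_extent_list {      /* stand-in for ocfs2_extent_list's header */
    uint16_t l_tree_depth, l_count, l_next_free_rec;
};

struct toy_group_desc {
    uint8_t header[0x40];     /* bg_signature .. bg_reserved2 */
    union {
        uint8_t bg_bitmap[1]; /* [0] on disk; 1 here for strict C */
        struct {
            uint8_t bg_bitmap_filler[MAX_BG_BITMAP_SIZE];
            struct toy_extent_list bg_list;
        };
    };
};

int main(void)
{
    printf("bg_bitmap at 0x%zx, bg_list at 0x%zx\n",
           offsetof(struct toy_group_desc, bg_bitmap),   /* 0x40  */
           offsetof(struct toy_group_desc, bg_list));    /* 0x140 */
    return 0;
}
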
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 123bc520a2c0..196fcb52d95d 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -23,6 +23,7 @@
23struct ocfs2_dquot { 23struct ocfs2_dquot {
24 struct dquot dq_dquot; /* Generic VFS dquot */ 24 struct dquot dq_dquot; /* Generic VFS dquot */
25 loff_t dq_local_off; /* Offset in the local quota file */ 25 loff_t dq_local_off; /* Offset in the local quota file */
26 u64 dq_local_phys_blk; /* Physical block carrying quota structure */
26 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ 27 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
27 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ 28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
28 s64 dq_origspace; /* Last globally synced space usage */ 29 s64 dq_origspace; /* Last globally synced space usage */
@@ -51,8 +52,9 @@ struct ocfs2_mem_dqinfo {
51 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ 52 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
52 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ 53 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
53 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ 54 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
55 u64 dqi_giblk; /* Block number of the global information header */
54 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ 56 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
55 struct buffer_head *dqi_ibh; /* Buffer with information header */ 57 struct buffer_head *dqi_libh; /* Buffer with local information header */
56 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ 58 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
57 struct delayed_work dqi_sync_work; /* Work for syncing dquots */ 59 struct delayed_work dqi_sync_work; /* Work for syncing dquots */
58 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery 60 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
@@ -102,8 +104,12 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot)
102 104
103int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 105int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
104void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 106void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
105int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 107int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
106 struct buffer_head **bh); 108int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
109 struct buffer_head **bh);
110int ocfs2_create_local_dquot(struct dquot *dquot);
111int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
112int ocfs2_local_write_dquot(struct dquot *dquot);
107 113
108extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
109extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
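
The new dq_local_phys_blk and dqi_giblk fields exist so the quota code can resolve a file-relative block to a physical block once and then read it directly with ocfs2_read_quota_phys_block(), instead of walking the extent map on every access the way the removed ocfs2_get_quota_block() (shown in the next hunk) did. A toy illustration of that resolve-once pattern, with made-up block numbers:

#include <stdio.h>

static int extent_map_lookups;    /* how often we "walk the extent map" */

static unsigned long long translate(unsigned long long v_block)
{
    extent_map_lookups++;
    return 0x1000 + v_block;      /* pretend virtual-to-physical mapping */
}

int main(void)
{
    unsigned long long phys = translate(2);    /* resolve once, cache it */

    for (int i = 0; i < 3; i++)                /* ...then reuse directly */
        printf("read physical block 0x%llx\n", phys);
    printf("extent-map lookups: %d\n", extent_map_lookups);   /* 1 */
    return 0;
}
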
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 355f41d1d520..2bb35fe00511 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h> 9#include <linux/dqblk_qtree.h>
@@ -24,8 +25,44 @@
24#include "dlmglue.h" 25#include "dlmglue.h"
25#include "uptodate.h" 26#include "uptodate.h"
26#include "super.h" 27#include "super.h"
28#include "buffer_head_io.h"
27#include "quota.h" 29#include "quota.h"
28 30
31/*
32 * Locking of quotas with OCFS2 is rather complex. Here are rules that
33 * should be obeyed by all the functions:
34 * - any write of a quota structure (either to the local or the global
35 * file) is protected by dqio_mutex or dquot->dq_lock.
36 * - any modification of the global quota file holds the inode cluster
37 * lock, i_mutex, and ip_alloc_sem of the global quota file (achieved by
38 * ocfs2_lock_global_qf). It also has to hold qinfo_lock.
39 * - an allocation of new blocks for the local quota file is protected by
40 * its ip_alloc_sem.
41 *
42 * A rough sketch of locking dependencies (lf = local file, gf = global file):
43 * Normal filesystem operation:
44 * start_trans -> dqio_mutex -> write to lf
45 * Syncing of local and global file:
46 * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
47 * write to gf
48 * -> write to lf
49 * Acquire dquot for the first time:
50 * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
51 * -> alloc space for gf
52 * -> start_trans -> qinfo_lock -> write to gf
53 * -> ip_alloc_sem of lf -> alloc space for lf
54 * -> write to lf
55 * Release last reference to dquot:
56 * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
57 * -> write to lf
58 * Note that all the above operations also hold the inode cluster lock of lf.
59 * Recovery:
60 * inode cluster lock of recovered lf
61 * -> read bitmaps -> ip_alloc_sem of lf
62 * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
63 * write to gf
64 */
65
29static struct workqueue_struct *ocfs2_quota_wq = NULL; 66static struct workqueue_struct *ocfs2_quota_wq = NULL;
30 67
31static void qsync_work_fn(struct work_struct *work); 68static void qsync_work_fn(struct work_struct *work);
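A minimal sketch of the "syncing of local and global file" ordering documented above, written against the helpers this series uses; illustrative only, not part of the patch. Error handling is elided and QSYNC_CREDITS is a placeholder for whatever credit count the caller computes; the other names are as in the surrounding code.

	ocfs2_lock_global_qf(oinfo, 1);		/* cluster lock, i_mutex, ip_alloc_sem */
	handle = ocfs2_start_trans(osb, QSYNC_CREDITS);
	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
	ocfs2_sync_dquot(dquot);		/* takes qinfo_lock, writes global file */
	ocfs2_local_write_dquot(dquot);		/* then the local file */
	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
	ocfs2_commit_trans(osb, handle);
	ocfs2_unlock_global_qf(oinfo, 1);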
@@ -90,8 +127,7 @@ struct qtree_fmt_operations ocfs2_global_ops = {
90 .is_id = ocfs2_global_is_id, 127 .is_id = ocfs2_global_is_id,
91}; 128};
92 129
93static int ocfs2_validate_quota_block(struct super_block *sb, 130int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
94 struct buffer_head *bh)
95{ 131{
96 struct ocfs2_disk_dqtrailer *dqt = 132 struct ocfs2_disk_dqtrailer *dqt =
97 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); 133 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
@@ -109,54 +145,19 @@ static int ocfs2_validate_quota_block(struct super_block *sb,
109 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); 145 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
110} 146}
111 147
112int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 148int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
113 struct buffer_head **bh) 149 struct buffer_head **bhp)
114{ 150{
115 int rc = 0; 151 int rc;
116 struct buffer_head *tmp = *bh; 152
117 153 *bhp = NULL;
118 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { 154 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
119 ocfs2_error(inode->i_sb, 155 ocfs2_validate_quota_block);
120 "Quota file %llu is probably corrupted! Requested "
121 "to read block %Lu but file has size only %Lu\n",
122 (unsigned long long)OCFS2_I(inode)->ip_blkno,
123 (unsigned long long)v_block,
124 (unsigned long long)i_size_read(inode));
125 return -EIO;
126 }
127 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
128 ocfs2_validate_quota_block);
129 if (rc) 156 if (rc)
130 mlog_errno(rc); 157 mlog_errno(rc);
131
132 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
133 if (!rc && !*bh)
134 *bh = tmp;
135
136 return rc; 158 return rc;
137} 159}
138 160
139static int ocfs2_get_quota_block(struct inode *inode, int block,
140 struct buffer_head **bh)
141{
142 u64 pblock, pcount;
143 int err;
144
145 down_read(&OCFS2_I(inode)->ip_alloc_sem);
146 err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
147 up_read(&OCFS2_I(inode)->ip_alloc_sem);
148 if (err) {
149 mlog_errno(err);
150 return err;
151 }
152 *bh = sb_getblk(inode->i_sb, pblock);
153 if (!*bh) {
154 err = -EIO;
155 mlog_errno(err);
156 }
157 return err;
158}
159
160/* Read data from global quotafile - avoid pagecache and such because we cannot 161/* Read data from global quotafile - avoid pagecache and such because we cannot
161 * afford acquiring the locks... We use quota cluster lock to serialize 162 * afford acquiring the locks... We use quota cluster lock to serialize
162 * operations. Caller is responsible for acquiring it. */ 163 * operations. Caller is responsible for acquiring it. */
@@ -171,6 +172,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
171 int err = 0; 172 int err = 0;
172 struct buffer_head *bh; 173 struct buffer_head *bh;
173 size_t toread, tocopy; 174 size_t toread, tocopy;
175 u64 pblock = 0, pcount = 0;
174 176
175 if (off > i_size) 177 if (off > i_size)
176 return 0; 178 return 0;
@@ -179,8 +181,19 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
179 toread = len; 181 toread = len;
180 while (toread > 0) { 182 while (toread > 0) {
181 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); 183 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
184 if (!pcount) {
185 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
186 &pcount, NULL);
187 if (err) {
188 mlog_errno(err);
189 return err;
190 }
191 } else {
192 pcount--;
193 pblock++;
194 }
182 bh = NULL; 195 bh = NULL;
183 err = ocfs2_read_quota_block(gqinode, blk, &bh); 196 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
184 if (err) { 197 if (err) {
185 mlog_errno(err); 198 mlog_errno(err);
186 return err; 199 return err;
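The loop above maps virtual to physical blocks only once per contiguous run: ocfs2_extent_map_get_blocks() returns the first physical block and the run length, and later iterations just step pblock forward while pcount counts the run down. A simplified sketch of the pattern, with get_run(), read_one(), vblk and nr_blocks as hypothetical stand-ins:

	u64 pblock = 0, pcount = 0;

	while (nr_blocks--) {
		if (!pcount)			/* new extent: map once */
			get_run(inode, vblk, &pblock, &pcount);
		read_one(inode, pblock);	/* consume one block of the run */
		pblock++;
		pcount--;
		vblk++;
	}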
@@ -208,6 +221,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
208 int err = 0, new = 0, ja_type; 221 int err = 0, new = 0, ja_type;
209 struct buffer_head *bh = NULL; 222 struct buffer_head *bh = NULL;
210 handle_t *handle = journal_current_handle(); 223 handle_t *handle = journal_current_handle();
224 u64 pblock, pcount;
211 225
212 if (!handle) { 226 if (!handle) {
213 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " 227 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
@@ -220,12 +234,11 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
220 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; 234 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
221 } 235 }
222 236
223 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
224 if (gqinode->i_size < off + len) { 237 if (gqinode->i_size < off + len) {
225 loff_t rounded_end = 238 loff_t rounded_end =
226 ocfs2_align_bytes_to_blocks(sb, off + len); 239 ocfs2_align_bytes_to_blocks(sb, off + len);
227 240
228 /* Space is already allocated in ocfs2_global_read_dquot() */ 241 /* Space is already allocated in ocfs2_acquire_dquot() */
229 err = ocfs2_simple_size_update(gqinode, 242 err = ocfs2_simple_size_update(gqinode,
230 oinfo->dqi_gqi_bh, 243 oinfo->dqi_gqi_bh,
231 rounded_end); 244 rounded_end);
@@ -233,13 +246,20 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
233 goto out; 246 goto out;
234 new = 1; 247 new = 1;
235 } 248 }
249 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
250 if (err) {
251 mlog_errno(err);
252 goto out;
253 }
236 /* Not rewriting whole block? */ 254 /* Not rewriting whole block? */
237 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && 255 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
238 !new) { 256 !new) {
239 err = ocfs2_read_quota_block(gqinode, blk, &bh); 257 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
240 ja_type = OCFS2_JOURNAL_ACCESS_WRITE; 258 ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
241 } else { 259 } else {
242 err = ocfs2_get_quota_block(gqinode, blk, &bh); 260 bh = sb_getblk(sb, pblock);
261 if (!bh)
262 err = -ENOMEM;
243 ja_type = OCFS2_JOURNAL_ACCESS_CREATE; 263 ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
244 } 264 }
245 if (err) { 265 if (err) {
@@ -260,19 +280,15 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
260 brelse(bh); 280 brelse(bh);
261 goto out; 281 goto out;
262 } 282 }
263 err = ocfs2_journal_dirty(handle, bh); 283 ocfs2_journal_dirty(handle, bh);
264 brelse(bh); 284 brelse(bh);
265 if (err < 0)
266 goto out;
267out: 285out:
268 if (err) { 286 if (err) {
269 mutex_unlock(&gqinode->i_mutex);
270 mlog_errno(err); 287 mlog_errno(err);
271 return err; 288 return err;
272 } 289 }
273 gqinode->i_version++; 290 gqinode->i_version++;
274 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); 291 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
275 mutex_unlock(&gqinode->i_mutex);
276 return len; 292 return len;
277} 293}
278 294
@@ -290,11 +306,23 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
290 else 306 else
291 WARN_ON(bh != oinfo->dqi_gqi_bh); 307 WARN_ON(bh != oinfo->dqi_gqi_bh);
292 spin_unlock(&dq_data_lock); 308 spin_unlock(&dq_data_lock);
309 if (ex) {
310 mutex_lock(&oinfo->dqi_gqinode->i_mutex);
311 down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
312 } else {
313 down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
314 }
293 return 0; 315 return 0;
294} 316}
295 317
296void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) 318void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
297{ 319{
320 if (ex) {
321 up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
322 mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
323 } else {
324 up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
325 }
298 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); 326 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
299 brelse(oinfo->dqi_gqi_bh); 327 brelse(oinfo->dqi_gqi_bh);
300 spin_lock(&dq_data_lock); 328 spin_lock(&dq_data_lock);
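The lock and unlock paths key off the same ex flag to decide which of i_mutex and ip_alloc_sem to take and drop, so callers must pass matching values to both. A hedged usage sketch:

	status = ocfs2_lock_global_qf(oinfo, 1);	/* cluster lock + i_mutex + ip_alloc_sem (write) */
	if (status < 0)
		return status;
	/* ... start a transaction and modify the global quota file ... */
	ocfs2_unlock_global_qf(oinfo, 1);		/* must use the same ex value */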
@@ -312,6 +340,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
312 struct ocfs2_global_disk_dqinfo dinfo; 340 struct ocfs2_global_disk_dqinfo dinfo;
313 struct mem_dqinfo *info = sb_dqinfo(sb, type); 341 struct mem_dqinfo *info = sb_dqinfo(sb, type);
314 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; 342 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
343 u64 pcount;
315 int status; 344 int status;
316 345
317 mlog_entry_void(); 346 mlog_entry_void();
@@ -338,9 +367,19 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
338 mlog_errno(status); 367 mlog_errno(status);
339 goto out_err; 368 goto out_err;
340 } 369 }
370
371 status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
372 &pcount, NULL);
373 if (status < 0)
374 goto out_unlock;
375
376 status = ocfs2_qinfo_lock(oinfo, 0);
377 if (status < 0)
378 goto out_unlock;
341 status = sb->s_op->quota_read(sb, type, (char *)&dinfo, 379 status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
342 sizeof(struct ocfs2_global_disk_dqinfo), 380 sizeof(struct ocfs2_global_disk_dqinfo),
343 OCFS2_GLOBAL_INFO_OFF); 381 OCFS2_GLOBAL_INFO_OFF);
382 ocfs2_qinfo_unlock(oinfo, 0);
344 ocfs2_unlock_global_qf(oinfo, 0); 383 ocfs2_unlock_global_qf(oinfo, 0);
345 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { 384 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
346 mlog(ML_ERROR, "Cannot read global quota info (%d).\n", 385 mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
@@ -367,6 +406,10 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
367out_err: 406out_err:
368 mlog_exit(status); 407 mlog_exit(status);
369 return status; 408 return status;
409out_unlock:
410 ocfs2_unlock_global_qf(oinfo, 0);
411 mlog_errno(status);
412 goto out_err;
370} 413}
371 414
 372/* Write information to global quota file. Expects exclusive lock on quota
@@ -425,78 +468,10 @@ static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
425 468
426static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) 469static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
427{ 470{
428 /* We modify all the allocated blocks, tree root, and info block */ 471 /* We modify all the allocated blocks, tree root, info block and
472 * the inode */
429 return (ocfs2_global_qinit_alloc(sb, type) + 2) * 473 return (ocfs2_global_qinit_alloc(sb, type) + 2) *
430 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; 474 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
431}
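The trailing +1 is the credit for the inode update the new comment mentions. A worked example, with the constant's value assumed for illustration (its definition is not part of this hunk):

	/*
	 * With ocfs2_global_qinit_alloc() == 3 and
	 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS == 1 (assumed), this reserves
	 * (3 + 2) * 1 + 1 = 6 credits: the allocated blocks, the tree
	 * root, the info block, plus one for the inode itself.
	 */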
432
433/* Read in information from global quota file and acquire a reference to it.
434 * dquot_acquire() has already started the transaction and locked quota file */
435int ocfs2_global_read_dquot(struct dquot *dquot)
436{
437 int err, err2, ex = 0;
438 struct super_block *sb = dquot->dq_sb;
439 int type = dquot->dq_type;
440 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
441 struct ocfs2_super *osb = OCFS2_SB(sb);
442 struct inode *gqinode = info->dqi_gqinode;
443 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
444 handle_t *handle = NULL;
445
446 err = ocfs2_qinfo_lock(info, 0);
447 if (err < 0)
448 goto out;
449 err = qtree_read_dquot(&info->dqi_gi, dquot);
450 if (err < 0)
451 goto out_qlock;
452 OCFS2_DQUOT(dquot)->dq_use_count++;
453 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
454 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
455 ocfs2_qinfo_unlock(info, 0);
456
457 if (!dquot->dq_off) { /* No real quota entry? */
458 ex = 1;
459 /*
460 * Add blocks to quota file before we start a transaction since
461 * locking allocators ranks above a transaction start
462 */
463 WARN_ON(journal_current_handle());
464 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
465 err = ocfs2_extend_no_holes(gqinode,
466 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
467 gqinode->i_size);
468 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
469 if (err < 0)
470 goto out;
471 }
472
473 handle = ocfs2_start_trans(osb,
474 ocfs2_calc_global_qinit_credits(sb, type));
475 if (IS_ERR(handle)) {
476 err = PTR_ERR(handle);
477 goto out;
478 }
479 err = ocfs2_qinfo_lock(info, ex);
480 if (err < 0)
481 goto out_trans;
482 err = qtree_write_dquot(&info->dqi_gi, dquot);
483 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
484 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
485 if (!err)
486 err = err2;
487 }
488out_qlock:
489 if (ex)
490 ocfs2_qinfo_unlock(info, 1);
491 else
492 ocfs2_qinfo_unlock(info, 0);
493out_trans:
494 if (handle)
495 ocfs2_commit_trans(osb, handle);
496out:
497 if (err < 0)
498 mlog_errno(err);
499 return err;
500} 475}
501 476
502/* Sync local information about quota modifications with global quota file. 477/* Sync local information about quota modifications with global quota file.
@@ -637,14 +612,13 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
637 } 612 }
638 mutex_lock(&sb_dqopt(sb)->dqio_mutex); 613 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
639 status = ocfs2_sync_dquot(dquot); 614 status = ocfs2_sync_dquot(dquot);
640 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
641 if (status < 0) 615 if (status < 0)
642 mlog_errno(status); 616 mlog_errno(status);
643 /* We have to write local structure as well... */ 617 /* We have to write local structure as well... */
644 dquot_mark_dquot_dirty(dquot); 618 status = ocfs2_local_write_dquot(dquot);
645 status = dquot_commit(dquot);
646 if (status < 0) 619 if (status < 0)
647 mlog_errno(status); 620 mlog_errno(status);
621 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
648 ocfs2_commit_trans(osb, handle); 622 ocfs2_commit_trans(osb, handle);
649out_ilock: 623out_ilock:
650 ocfs2_unlock_global_qf(oinfo, 1); 624 ocfs2_unlock_global_qf(oinfo, 1);
@@ -683,7 +657,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
683 mlog_errno(status); 657 mlog_errno(status);
684 goto out; 658 goto out;
685 } 659 }
686 status = dquot_commit(dquot); 660 mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
661 status = ocfs2_local_write_dquot(dquot);
662 mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
687 ocfs2_commit_trans(osb, handle); 663 ocfs2_commit_trans(osb, handle);
688out: 664out:
689 mlog_exit(status); 665 mlog_exit(status);
@@ -714,6 +690,10 @@ static int ocfs2_release_dquot(struct dquot *dquot)
714 690
715 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 691 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
716 692
693 mutex_lock(&dquot->dq_lock);
694 /* Check whether we are not racing with some other dqget() */
695 if (atomic_read(&dquot->dq_count) > 1)
696 goto out;
717 status = ocfs2_lock_global_qf(oinfo, 1); 697 status = ocfs2_lock_global_qf(oinfo, 1);
718 if (status < 0) 698 if (status < 0)
719 goto out; 699 goto out;
@@ -724,30 +704,113 @@ static int ocfs2_release_dquot(struct dquot *dquot)
724 mlog_errno(status); 704 mlog_errno(status);
725 goto out_ilock; 705 goto out_ilock;
726 } 706 }
727 status = dquot_release(dquot); 707
708 status = ocfs2_global_release_dquot(dquot);
709 if (status < 0) {
710 mlog_errno(status);
711 goto out_trans;
712 }
713 status = ocfs2_local_release_dquot(handle, dquot);
714 /*
715 * If we fail here, we cannot do much as global structure is
716 * already released. So just complain...
717 */
718 if (status < 0)
719 mlog_errno(status);
720 clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
721out_trans:
728 ocfs2_commit_trans(osb, handle); 722 ocfs2_commit_trans(osb, handle);
729out_ilock: 723out_ilock:
730 ocfs2_unlock_global_qf(oinfo, 1); 724 ocfs2_unlock_global_qf(oinfo, 1);
731out: 725out:
726 mutex_unlock(&dquot->dq_lock);
732 mlog_exit(status); 727 mlog_exit(status);
733 return status; 728 return status;
734} 729}
735 730
731/*
732 * Read global dquot structure from disk or create it if it does
733 * not exist. Also update use count of the global structure and
734 * create structure in node-local quota file.
735 */
736static int ocfs2_acquire_dquot(struct dquot *dquot) 736static int ocfs2_acquire_dquot(struct dquot *dquot)
737{ 737{
738 struct ocfs2_mem_dqinfo *oinfo = 738 int status = 0, err;
739 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 739 int ex = 0;
740 int status = 0; 740 struct super_block *sb = dquot->dq_sb;
741 struct ocfs2_super *osb = OCFS2_SB(sb);
742 int type = dquot->dq_type;
743 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
744 struct inode *gqinode = info->dqi_gqinode;
745 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
746 handle_t *handle;
741 747
742 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 748 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
743 /* We need an exclusive lock, because we're going to update use count 749 mutex_lock(&dquot->dq_lock);
744 * and instantiate possibly new dquot structure */ 750 /*
745 status = ocfs2_lock_global_qf(oinfo, 1); 751 * We need an exclusive lock, because we're going to update use count
752 * and instantiate possibly new dquot structure
753 */
754 status = ocfs2_lock_global_qf(info, 1);
746 if (status < 0) 755 if (status < 0)
747 goto out; 756 goto out;
748 status = dquot_acquire(dquot); 757 if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
749 ocfs2_unlock_global_qf(oinfo, 1); 758 status = ocfs2_qinfo_lock(info, 0);
759 if (status < 0)
760 goto out_dq;
761 status = qtree_read_dquot(&info->dqi_gi, dquot);
762 ocfs2_qinfo_unlock(info, 0);
763 if (status < 0)
764 goto out_dq;
765 }
766 set_bit(DQ_READ_B, &dquot->dq_flags);
767
768 OCFS2_DQUOT(dquot)->dq_use_count++;
769 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
770 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
771 if (!dquot->dq_off) { /* No real quota entry? */
772 ex = 1;
773 /*
774 * Add blocks to quota file before we start a transaction since
775 * locking allocators ranks above a transaction start
776 */
777 WARN_ON(journal_current_handle());
778 status = ocfs2_extend_no_holes(gqinode,
779 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
780 gqinode->i_size);
781 if (status < 0)
782 goto out_dq;
783 }
784
785 handle = ocfs2_start_trans(osb,
786 ocfs2_calc_global_qinit_credits(sb, type));
787 if (IS_ERR(handle)) {
788 status = PTR_ERR(handle);
789 goto out_dq;
790 }
791 status = ocfs2_qinfo_lock(info, ex);
792 if (status < 0)
793 goto out_trans;
794 status = qtree_write_dquot(&info->dqi_gi, dquot);
795 if (ex && info_dirty(sb_dqinfo(sb, type))) {
796 err = __ocfs2_global_write_info(sb, type);
797 if (!status)
798 status = err;
799 }
800 ocfs2_qinfo_unlock(info, ex);
801out_trans:
802 ocfs2_commit_trans(osb, handle);
803out_dq:
804 ocfs2_unlock_global_qf(info, 1);
805 if (status < 0)
806 goto out;
807
808 status = ocfs2_create_local_dquot(dquot);
809 if (status < 0)
810 goto out;
811 set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
750out: 812out:
813 mutex_unlock(&dquot->dq_lock);
751 mlog_exit(status); 814 mlog_exit(status);
752 return status; 815 return status;
753} 816}
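A hedged summary of the flag protocol the function above relies on (bit names from <linux/quota.h>):

	/*
	 * DQ_READ_B: the global entry has been read into memory
	 * (qtree_read_dquot() succeeded), so a repeated acquire can skip
	 * the read. DQ_ACTIVE_B: the dquot is fully initialized and the
	 * local file entry exists; ocfs2_release_dquot() clears it again
	 * once the last reference is dropped.
	 */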
@@ -769,7 +832,6 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
769 struct ocfs2_super *osb = OCFS2_SB(sb); 832 struct ocfs2_super *osb = OCFS2_SB(sb);
770 833
771 mlog_entry("id=%u, type=%d", dquot->dq_id, type); 834 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
772 dquot_mark_dquot_dirty(dquot);
773 835
774 /* In case user set some limits, sync dquot immediately to global 836 /* In case user set some limits, sync dquot immediately to global
775 * quota file so that information propagates quicker */ 837 * quota file so that information propagates quicker */
@@ -792,14 +854,16 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
792 mlog_errno(status); 854 mlog_errno(status);
793 goto out_ilock; 855 goto out_ilock;
794 } 856 }
857 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
795 status = ocfs2_sync_dquot(dquot); 858 status = ocfs2_sync_dquot(dquot);
796 if (status < 0) { 859 if (status < 0) {
797 mlog_errno(status); 860 mlog_errno(status);
798 goto out_trans; 861 goto out_dlock;
799 } 862 }
800 /* Now write updated local dquot structure */ 863 /* Now write updated local dquot structure */
801 status = dquot_commit(dquot); 864 status = ocfs2_local_write_dquot(dquot);
802out_trans: 865out_dlock:
866 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
803 ocfs2_commit_trans(osb, handle); 867 ocfs2_commit_trans(osb, handle);
804out_ilock: 868out_ilock:
805 ocfs2_unlock_global_qf(oinfo, 1); 869 ocfs2_unlock_global_qf(oinfo, 1);
@@ -851,7 +915,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
851} 915}
852 916
853const struct dquot_operations ocfs2_quota_operations = { 917const struct dquot_operations ocfs2_quota_operations = {
854 .write_dquot = ocfs2_write_dquot, 918 /* We never make dquot dirty so .write_dquot is never called */
855 .acquire_dquot = ocfs2_acquire_dquot, 919 .acquire_dquot = ocfs2_acquire_dquot,
856 .release_dquot = ocfs2_release_dquot, 920 .release_dquot = ocfs2_release_dquot,
857 .mark_dirty = ocfs2_mark_dquot_dirty, 921 .mark_dirty = ocfs2_mark_dquot_dirty,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index a6467f3d262e..8bd70d4d184d 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
3 */ 3 */
4 4
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/module.h> 9#include <linux/module.h>
@@ -21,6 +22,7 @@
21#include "dlmglue.h" 22#include "dlmglue.h"
22#include "quota.h" 23#include "quota.h"
23#include "uptodate.h" 24#include "uptodate.h"
25#include "super.h"
24 26
25/* Number of local quota structures per block */ 27/* Number of local quota structures per block */
26static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) 28static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -118,12 +120,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
118 lock_buffer(bh); 120 lock_buffer(bh);
119 modify(bh, private); 121 modify(bh, private);
120 unlock_buffer(bh); 122 unlock_buffer(bh);
121 status = ocfs2_journal_dirty(handle, bh); 123 ocfs2_journal_dirty(handle, bh);
122 if (status < 0) { 124
123 mlog_errno(status);
124 ocfs2_commit_trans(OCFS2_SB(sb), handle);
125 return status;
126 }
127 status = ocfs2_commit_trans(OCFS2_SB(sb), handle); 125 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
128 if (status < 0) { 126 if (status < 0) {
129 mlog_errno(status); 127 mlog_errno(status);
@@ -132,6 +130,39 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
132 return 0; 130 return 0;
133} 131}
134 132
133/*
134 * Read quota block from a given logical offset.
135 *
136 * This function acquires ip_alloc_sem and thus it must not be called with a
137 * transaction started.
138 */
139static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
140 struct buffer_head **bh)
141{
142 int rc = 0;
143 struct buffer_head *tmp = *bh;
144
145 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
146 ocfs2_error(inode->i_sb,
147 "Quota file %llu is probably corrupted! Requested "
148 "to read block %Lu but file has size only %Lu\n",
149 (unsigned long long)OCFS2_I(inode)->ip_blkno,
150 (unsigned long long)v_block,
151 (unsigned long long)i_size_read(inode));
152 return -EIO;
153 }
154 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
155 ocfs2_validate_quota_block);
156 if (rc)
157 mlog_errno(rc);
158
159 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
160 if (!rc && !*bh)
161 *bh = tmp;
162
163 return rc;
164}
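A hedged sketch of the calling rule from the comment above: because the helper takes ip_alloc_sem, such reads must happen before the transaction starts (osb and credits are placeholders):

	status = ocfs2_read_quota_block(lqinode, v_block, &bh);	/* takes ip_alloc_sem */
	if (status)
		return status;
	handle = ocfs2_start_trans(osb, credits);		/* only then journal */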
165
135/* Check whether we understand format of quota files */ 166/* Check whether we understand format of quota files */
136static int ocfs2_local_check_quota_file(struct super_block *sb, int type) 167static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
137{ 168{
@@ -522,9 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
522 ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 553 ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
523 le32_add_cpu(&dchunk->dqc_free, 1); 554 le32_add_cpu(&dchunk->dqc_free, 1);
524 unlock_buffer(qbh); 555 unlock_buffer(qbh);
525 status = ocfs2_journal_dirty(handle, qbh); 556 ocfs2_journal_dirty(handle, qbh);
526 if (status < 0)
527 mlog_errno(status);
528out_commit: 557out_commit:
529 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 558 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
530 ocfs2_commit_trans(OCFS2_SB(sb), handle); 559 ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -630,9 +659,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
630 lock_buffer(bh); 659 lock_buffer(bh);
631 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); 660 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
632 unlock_buffer(bh); 661 unlock_buffer(bh);
633 status = ocfs2_journal_dirty(handle, bh); 662 ocfs2_journal_dirty(handle, bh);
634 if (status < 0)
635 mlog_errno(status);
636out_trans: 663out_trans:
637 ocfs2_commit_trans(osb, handle); 664 ocfs2_commit_trans(osb, handle);
638out_bh: 665out_bh:
@@ -678,7 +705,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
678 INIT_LIST_HEAD(&oinfo->dqi_chunk); 705 INIT_LIST_HEAD(&oinfo->dqi_chunk);
679 oinfo->dqi_rec = NULL; 706 oinfo->dqi_rec = NULL;
680 oinfo->dqi_lqi_bh = NULL; 707 oinfo->dqi_lqi_bh = NULL;
681 oinfo->dqi_ibh = NULL; 708 oinfo->dqi_libh = NULL;
682 709
683 status = ocfs2_global_read_info(sb, type); 710 status = ocfs2_global_read_info(sb, type);
684 if (status < 0) 711 if (status < 0)
@@ -704,7 +731,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
704 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); 731 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
705 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); 732 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
706 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); 733 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
707 oinfo->dqi_ibh = bh; 734 oinfo->dqi_libh = bh;
708 735
709 /* We crashed when using local quota file? */ 736 /* We crashed when using local quota file? */
710 if (!(info->dqi_flags & OLQF_CLEAN)) { 737 if (!(info->dqi_flags & OLQF_CLEAN)) {
@@ -766,7 +793,7 @@ static int ocfs2_local_write_info(struct super_block *sb, int type)
766{ 793{
767 struct mem_dqinfo *info = sb_dqinfo(sb, type); 794 struct mem_dqinfo *info = sb_dqinfo(sb, type);
768 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) 795 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
769 ->dqi_ibh; 796 ->dqi_libh;
770 int status; 797 int status;
771 798
772 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, 799 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
@@ -789,10 +816,6 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
789 int mark_clean = 1, len; 816 int mark_clean = 1, len;
790 int status; 817 int status;
791 818
792 /* At this point we know there are no more dquots and thus
793 * even if there's some sync in the pdflush queue, it won't
794 * find any dquots and return without doing anything */
795 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
796 iput(oinfo->dqi_gqinode); 819 iput(oinfo->dqi_gqinode);
797 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); 820 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
798 ocfs2_lock_res_free(&oinfo->dqi_gqlock); 821 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
@@ -827,7 +850,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
827 /* Mark local file as clean */ 850 /* Mark local file as clean */
828 info->dqi_flags |= OLQF_CLEAN; 851 info->dqi_flags |= OLQF_CLEAN;
829 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], 852 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
830 oinfo->dqi_ibh, 853 oinfo->dqi_libh,
831 olq_update_info, 854 olq_update_info,
832 info); 855 info);
833 if (status < 0) { 856 if (status < 0) {
@@ -837,7 +860,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
837 860
838out: 861out:
839 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); 862 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
840 brelse(oinfo->dqi_ibh); 863 brelse(oinfo->dqi_libh);
841 brelse(oinfo->dqi_lqi_bh); 864 brelse(oinfo->dqi_lqi_bh);
842 kfree(oinfo); 865 kfree(oinfo);
843 return 0; 866 return 0;
@@ -865,22 +888,21 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
865} 888}
866 889
867/* Write dquot to local quota file */ 890/* Write dquot to local quota file */
868static int ocfs2_local_write_dquot(struct dquot *dquot) 891int ocfs2_local_write_dquot(struct dquot *dquot)
869{ 892{
870 struct super_block *sb = dquot->dq_sb; 893 struct super_block *sb = dquot->dq_sb;
871 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 894 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
872 struct buffer_head *bh = NULL; 895 struct buffer_head *bh;
896 struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type];
873 int status; 897 int status;
874 898
875 status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type], 899 status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
876 ol_dqblk_file_block(sb, od->dq_local_off), 900 &bh);
877 &bh);
878 if (status) { 901 if (status) {
879 mlog_errno(status); 902 mlog_errno(status);
880 goto out; 903 goto out;
881 } 904 }
882 status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh, 905 status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
883 olq_set_dquot, od);
884 if (status < 0) { 906 if (status < 0) {
885 mlog_errno(status); 907 mlog_errno(status);
886 goto out; 908 goto out;
@@ -980,10 +1002,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
980 } 1002 }
981 1003
982 /* Initialize chunk header */ 1004 /* Initialize chunk header */
983 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
984 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1005 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
985 &p_blkno, NULL, NULL); 1006 &p_blkno, NULL, NULL);
986 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
987 if (status < 0) { 1007 if (status < 0) {
988 mlog_errno(status); 1008 mlog_errno(status);
989 goto out_trans; 1009 goto out_trans;
@@ -1008,17 +1028,11 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1008 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - 1028 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
1009 OCFS2_QBLK_RESERVED_SPACE); 1029 OCFS2_QBLK_RESERVED_SPACE);
1010 unlock_buffer(bh); 1030 unlock_buffer(bh);
1011 status = ocfs2_journal_dirty(handle, bh); 1031 ocfs2_journal_dirty(handle, bh);
1012 if (status < 0) {
1013 mlog_errno(status);
1014 goto out_trans;
1015 }
1016 1032
1017 /* Initialize new block with structures */ 1033 /* Initialize new block with structures */
1018 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1019 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, 1034 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
1020 &p_blkno, NULL, NULL); 1035 &p_blkno, NULL, NULL);
1021 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1022 if (status < 0) { 1036 if (status < 0) {
1023 mlog_errno(status); 1037 mlog_errno(status);
1024 goto out_trans; 1038 goto out_trans;
@@ -1039,11 +1053,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1039 lock_buffer(dbh); 1053 lock_buffer(dbh);
1040 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); 1054 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
1041 unlock_buffer(dbh); 1055 unlock_buffer(dbh);
1042 status = ocfs2_journal_dirty(handle, dbh); 1056 ocfs2_journal_dirty(handle, dbh);
1043 if (status < 0) {
1044 mlog_errno(status);
1045 goto out_trans;
1046 }
1047 1057
1048 /* Update local quotafile info */ 1058 /* Update local quotafile info */
1049 oinfo->dqi_blocks += 2; 1059 oinfo->dqi_blocks += 2;
@@ -1119,10 +1129,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1119 } 1129 }
1120 1130
1121 /* Get buffer from the just added block */ 1131 /* Get buffer from the just added block */
1122 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1123 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1132 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
1124 &p_blkno, NULL, NULL); 1133 &p_blkno, NULL, NULL);
1125 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1126 if (status < 0) { 1134 if (status < 0) {
1127 mlog_errno(status); 1135 mlog_errno(status);
1128 goto out; 1136 goto out;
@@ -1154,11 +1162,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1154 lock_buffer(bh); 1162 lock_buffer(bh);
1155 memset(bh->b_data, 0, sb->s_blocksize); 1163 memset(bh->b_data, 0, sb->s_blocksize);
1156 unlock_buffer(bh); 1164 unlock_buffer(bh);
1157 status = ocfs2_journal_dirty(handle, bh); 1165 ocfs2_journal_dirty(handle, bh);
1158 if (status < 0) { 1166
1159 mlog_errno(status);
1160 goto out_trans;
1161 }
1162 /* Update chunk header */ 1167 /* Update chunk header */
1163 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), 1168 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
1164 chunk->qc_headerbh, 1169 chunk->qc_headerbh,
@@ -1172,11 +1177,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1172 lock_buffer(chunk->qc_headerbh); 1177 lock_buffer(chunk->qc_headerbh);
1173 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); 1178 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
1174 unlock_buffer(chunk->qc_headerbh); 1179 unlock_buffer(chunk->qc_headerbh);
1175 status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); 1180 ocfs2_journal_dirty(handle, chunk->qc_headerbh);
1176 if (status < 0) { 1181
1177 mlog_errno(status);
1178 goto out_trans;
1179 }
1180 /* Update file header */ 1182 /* Update file header */
1181 oinfo->dqi_blocks++; 1183 oinfo->dqi_blocks++;
1182 status = ocfs2_local_write_info(sb, type); 1184 status = ocfs2_local_write_info(sb, type);
@@ -1209,7 +1211,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1209} 1211}
1210 1212
1211/* Create dquot in the local file for given id */ 1213/* Create dquot in the local file for given id */
1212static int ocfs2_create_local_dquot(struct dquot *dquot) 1214int ocfs2_create_local_dquot(struct dquot *dquot)
1213{ 1215{
1214 struct super_block *sb = dquot->dq_sb; 1216 struct super_block *sb = dquot->dq_sb;
1215 int type = dquot->dq_type; 1217 int type = dquot->dq_type;
@@ -1218,17 +1220,27 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1218 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 1220 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1219 int offset; 1221 int offset;
1220 int status; 1222 int status;
1223 u64 pcount;
1221 1224
1225 down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1222 chunk = ocfs2_find_free_entry(sb, type, &offset); 1226 chunk = ocfs2_find_free_entry(sb, type, &offset);
1223 if (!chunk) { 1227 if (!chunk) {
1224 chunk = ocfs2_extend_local_quota_file(sb, type, &offset); 1228 chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
1225 if (IS_ERR(chunk)) 1229 if (IS_ERR(chunk)) {
1226 return PTR_ERR(chunk); 1230 status = PTR_ERR(chunk);
1231 goto out;
1232 }
1227 } else if (IS_ERR(chunk)) { 1233 } else if (IS_ERR(chunk)) {
1228 return PTR_ERR(chunk); 1234 status = PTR_ERR(chunk);
1235 goto out;
1229 } 1236 }
1230 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); 1237 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
1231 od->dq_chunk = chunk; 1238 od->dq_chunk = chunk;
1239 status = ocfs2_extent_map_get_blocks(lqinode,
1240 ol_dqblk_block(sb, chunk->qc_num, offset),
1241 &od->dq_local_phys_blk,
1242 &pcount,
1243 NULL);
1232 1244
1233 /* Initialize dquot structure on disk */ 1245 /* Initialize dquot structure on disk */
1234 status = ocfs2_local_write_dquot(dquot); 1246 status = ocfs2_local_write_dquot(dquot);
@@ -1245,39 +1257,15 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1245 goto out; 1257 goto out;
1246 } 1258 }
1247out: 1259out:
1260 up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1248 return status; 1261 return status;
1249} 1262}
1250 1263
1251/* Create entry in local file for dquot, load data from the global file */ 1264/*
1252static int ocfs2_local_read_dquot(struct dquot *dquot) 1265 * Release dquot structure from local quota file. ocfs2_release_dquot() has
1253{ 1266 * already started a transaction and written all changes to global quota file
1254 int status; 1267 */
1255 1268int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
1256 mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
1257
1258 status = ocfs2_global_read_dquot(dquot);
1259 if (status < 0) {
1260 mlog_errno(status);
1261 goto out_err;
1262 }
1263
1264 /* Now create entry in the local quota file */
1265 status = ocfs2_create_local_dquot(dquot);
1266 if (status < 0) {
1267 mlog_errno(status);
1268 goto out_err;
1269 }
1270 mlog_exit(0);
1271 return 0;
1272out_err:
1273 mlog_exit(status);
1274 return status;
1275}
1276
1277/* Release dquot structure from local quota file. ocfs2_release_dquot() has
1278 * already started a transaction and obtained exclusive lock for global
1279 * quota file. */
1280static int ocfs2_local_release_dquot(struct dquot *dquot)
1281{ 1269{
1282 int status; 1270 int status;
1283 int type = dquot->dq_type; 1271 int type = dquot->dq_type;
@@ -1285,15 +1273,6 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1285 struct super_block *sb = dquot->dq_sb; 1273 struct super_block *sb = dquot->dq_sb;
1286 struct ocfs2_local_disk_chunk *dchunk; 1274 struct ocfs2_local_disk_chunk *dchunk;
1287 int offset; 1275 int offset;
1288 handle_t *handle = journal_current_handle();
1289
1290 BUG_ON(!handle);
1291 /* First write all local changes to global file */
1292 status = ocfs2_global_release_dquot(dquot);
1293 if (status < 0) {
1294 mlog_errno(status);
1295 goto out;
1296 }
1297 1276
1298 status = ocfs2_journal_access_dq(handle, 1277 status = ocfs2_journal_access_dq(handle,
1299 INODE_CACHE(sb_dqopt(sb)->files[type]), 1278 INODE_CACHE(sb_dqopt(sb)->files[type]),
@@ -1311,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1311 ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1290 ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
1312 le32_add_cpu(&dchunk->dqc_free, 1); 1291 le32_add_cpu(&dchunk->dqc_free, 1);
1313 unlock_buffer(od->dq_chunk->qc_headerbh); 1292 unlock_buffer(od->dq_chunk->qc_headerbh);
1314 status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); 1293 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
1315 if (status < 0) { 1294
1316 mlog_errno(status);
1317 goto out;
1318 }
1319 status = 0;
1320out: 1295out:
1321 /* Clear the read bit so that next time someone uses this 1296 /* Clear the read bit so that next time someone uses this
1322 * dquot he reads fresh info from disk and allocates local 1297 * dquot he reads fresh info from disk and allocates local
@@ -1330,9 +1305,6 @@ static const struct quota_format_ops ocfs2_format_ops = {
1330 .read_file_info = ocfs2_local_read_info, 1305 .read_file_info = ocfs2_local_read_info,
1331 .write_file_info = ocfs2_global_write_info, 1306 .write_file_info = ocfs2_global_write_info,
1332 .free_file_info = ocfs2_local_free_info, 1307 .free_file_info = ocfs2_local_free_info,
1333 .read_dqblk = ocfs2_local_read_dquot,
1334 .commit_dqblk = ocfs2_local_write_dquot,
1335 .release_dqblk = ocfs2_local_release_dquot,
1336}; 1308};
1337 1309
1338struct quota_format_type ocfs2_quota_format = { 1310struct quota_format_type ocfs2_quota_format = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9e96921dffda..4793f36f6518 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h> 40#include <linux/slab.h>
42#include <linux/writeback.h> 41#include <linux/writeback.h>
43#include <linux/pagevec.h> 42#include <linux/pagevec.h>
@@ -571,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
571 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; 570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
572 u16 suballoc_bit_start; 571 u16 suballoc_bit_start;
573 u32 num_got; 572 u32 num_got;
574 u64 first_blkno; 573 u64 suballoc_loc, first_blkno;
575 574
576 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
577 576
@@ -597,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
597 goto out_commit; 596 goto out_commit;
598 } 597 }
599 598
600 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 599 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
601 &suballoc_bit_start, &num_got, 600 &suballoc_bit_start, &num_got,
602 &first_blkno); 601 &first_blkno);
603 if (ret) { 602 if (ret) {
@@ -627,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
627 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
629 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
629 rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
632 rb->rf_blkno = cpu_to_le64(first_blkno); 632 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -791,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
791 if (le32_to_cpu(rb->rf_count) == 1) { 791 if (le32_to_cpu(rb->rf_count) == 1) {
792 blk = le64_to_cpu(rb->rf_blkno); 792 blk = le64_to_cpu(rb->rf_blkno);
793 bit = le16_to_cpu(rb->rf_suballoc_bit); 793 bit = le16_to_cpu(rb->rf_suballoc_bit);
794 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 794 if (rb->rf_suballoc_loc)
795 bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
796 else
797 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
795 798
796 alloc_inode = ocfs2_get_system_file_inode(osb, 799 alloc_inode = ocfs2_get_system_file_inode(osb,
797 EXTENT_ALLOC_SYSTEM_INODE, 800 EXTENT_ALLOC_SYSTEM_INODE,
@@ -1269,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
1269 } else if (merge) 1272 } else if (merge)
1270 ocfs2_refcount_rec_merge(rb, index); 1273 ocfs2_refcount_rec_merge(rb, index);
1271 1274
1272 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1275 ocfs2_journal_dirty(handle, ref_leaf_bh);
1273 if (ret)
1274 mlog_errno(ret);
1275out: 1276out:
1276 return ret; 1277 return ret;
1277} 1278}
@@ -1285,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1285 int ret; 1286 int ret;
1286 u16 suballoc_bit_start; 1287 u16 suballoc_bit_start;
1287 u32 num_got; 1288 u32 num_got;
1288 u64 blkno; 1289 u64 suballoc_loc, blkno;
1289 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1290 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1290 struct buffer_head *new_bh = NULL; 1291 struct buffer_head *new_bh = NULL;
1291 struct ocfs2_refcount_block *new_rb; 1292 struct ocfs2_refcount_block *new_rb;
@@ -1299,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1299 goto out; 1300 goto out;
1300 } 1301 }
1301 1302
1302 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1303 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1303 &suballoc_bit_start, &num_got, 1304 &suballoc_bit_start, &num_got,
1304 &blkno); 1305 &blkno);
1305 if (ret) { 1306 if (ret) {
@@ -1331,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1331 1332
1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1333 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1333 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1334 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1335 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1336 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1335 new_rb->rf_blkno = cpu_to_le64(blkno); 1337 new_rb->rf_blkno = cpu_to_le64(blkno);
1336 new_rb->rf_cpos = cpu_to_le32(0); 1338 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1525,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1525 int ret; 1527 int ret;
1526 u16 suballoc_bit_start; 1528 u16 suballoc_bit_start;
1527 u32 num_got, new_cpos; 1529 u32 num_got, new_cpos;
1528 u64 blkno; 1530 u64 suballoc_loc, blkno;
1529 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1531 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1530 struct ocfs2_refcount_block *root_rb = 1532 struct ocfs2_refcount_block *root_rb =
1531 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1533 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1549,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1549 goto out; 1551 goto out;
1550 } 1552 }
1551 1553
1552 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1554 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1553 &suballoc_bit_start, &num_got, 1555 &suballoc_bit_start, &num_got,
1554 &blkno); 1556 &blkno);
1555 if (ret) { 1557 if (ret) {
@@ -1577,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1577 memset(new_rb, 0, sb->s_blocksize); 1579 memset(new_rb, 0, sb->s_blocksize);
1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1580 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1579 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1581 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1582 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1583 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1584 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1582 new_rb->rf_blkno = cpu_to_le64(blkno); 1585 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1695,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
1695 * 2 more credits, one for the leaf refcount block, one for 1698 * 2 more credits, one for the leaf refcount block, one for
 1696 * the extent block that contains the extent rec. 1699 * the extent block that contains the extent rec.
1697 */ 1700 */
1698 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); 1701 ret = ocfs2_extend_trans(handle, 2);
1699 if (ret < 0) { 1702 if (ret < 0) {
1700 mlog_errno(ret); 1703 mlog_errno(ret);
1701 goto out; 1704 goto out;
@@ -1803,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
1803 if (merge) 1806 if (merge)
1804 ocfs2_refcount_rec_merge(rb, index); 1807 ocfs2_refcount_rec_merge(rb, index);
1805 1808
1806 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1809 ocfs2_journal_dirty(handle, ref_leaf_bh);
1807 if (ret) {
1808 mlog_errno(ret);
1809 goto out;
1810 }
1811 1810
1812 if (index == 0) { 1811 if (index == 0) {
1813 ret = ocfs2_adjust_refcount_rec(handle, ci, 1812 ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1978,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1978 ocfs2_refcount_rec_merge(rb, index); 1977 ocfs2_refcount_rec_merge(rb, index);
1979 } 1978 }
1980 1979
1981 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1980 ocfs2_journal_dirty(handle, ref_leaf_bh);
1982 if (ret)
1983 mlog_errno(ret);
1984 1981
1985out: 1982out:
1986 brelse(new_bh); 1983 brelse(new_bh);
@@ -2113,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2113 */ 2110 */
2114 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, 2111 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2115 le16_to_cpu(rb->rf_suballoc_slot), 2112 le16_to_cpu(rb->rf_suballoc_slot),
2113 le64_to_cpu(rb->rf_suballoc_loc),
2116 le64_to_cpu(rb->rf_blkno), 2114 le64_to_cpu(rb->rf_blkno),
2117 le16_to_cpu(rb->rf_suballoc_bit)); 2115 le16_to_cpu(rb->rf_suballoc_bit));
2118 if (ret) { 2116 if (ret) {
@@ -2517,20 +2515,19 @@ out:
2517 * 2515 *
 2518 * Normally the refcount blocks storing these refcounts should be 2516 * Normally the refcount blocks storing these refcounts should be
 2519 * contiguous as well, so that we can get the number easily. 2517 * contiguous as well, so that we can get the number easily.
 2520 * As for meta_ac, we will at most add split 2 refcount record and 2518 * We will at most split 2 refcount records and add 2 more
 2521 * 2 more refcount block, so just check it in a rough way. 2519 * refcount blocks, so just check it in a rough way.
2522 * 2520 *
2523 * Caller must hold refcount tree lock. 2521 * Caller must hold refcount tree lock.
2524 */ 2522 */
2525int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 2523int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2526 struct buffer_head *di_bh, 2524 u64 refcount_loc,
2527 u64 phys_blkno, 2525 u64 phys_blkno,
2528 u32 clusters, 2526 u32 clusters,
2529 int *credits, 2527 int *credits,
2530 struct ocfs2_alloc_context **meta_ac) 2528 int *ref_blocks)
2531{ 2529{
2532 int ret, ref_blocks = 0; 2530 int ret;
2533 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2534 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2531 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2535 struct buffer_head *ref_root_bh = NULL; 2532 struct buffer_head *ref_root_bh = NULL;
2536 struct ocfs2_refcount_tree *tree; 2533 struct ocfs2_refcount_tree *tree;
@@ -2547,14 +2544,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2547 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2544 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2548 2545
2549 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), 2546 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2550 le64_to_cpu(di->i_refcount_loc), &tree); 2547 refcount_loc, &tree);
2551 if (ret) { 2548 if (ret) {
2552 mlog_errno(ret); 2549 mlog_errno(ret);
2553 goto out; 2550 goto out;
2554 } 2551 }
2555 2552
2556 ret = ocfs2_read_refcount_block(&tree->rf_ci, 2553 ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2557 le64_to_cpu(di->i_refcount_loc),
2558 &ref_root_bh); 2554 &ref_root_bh);
2559 if (ret) { 2555 if (ret) {
2560 mlog_errno(ret); 2556 mlog_errno(ret);
@@ -2565,21 +2561,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2565 &tree->rf_ci, 2561 &tree->rf_ci,
2566 ref_root_bh, 2562 ref_root_bh,
2567 start_cpos, clusters, 2563 start_cpos, clusters,
2568 &ref_blocks, credits); 2564 ref_blocks, credits);
2569 if (ret) { 2565 if (ret) {
2570 mlog_errno(ret); 2566 mlog_errno(ret);
2571 goto out; 2567 goto out;
2572 } 2568 }
2573 2569
2574 mlog(0, "reserve new metadata %d, credits = %d\n", 2570 mlog(0, "reserve new metadata %d blocks, credits = %d\n",
2575 ref_blocks, *credits); 2571 *ref_blocks, *credits);
2576
2577 if (ref_blocks) {
2578 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2579 ref_blocks, meta_ac);
2580 if (ret)
2581 mlog_errno(ret);
2582 }
2583 2572
2584out: 2573out:
2585 brelse(ref_root_bh); 2574 brelse(ref_root_bh);
@@ -3041,11 +3030,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3041 } 3030 }
3042 3031
3043 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); 3032 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3044 ret = ocfs2_journal_dirty(handle, new_bh); 3033 ocfs2_journal_dirty(handle, new_bh);
3045 if (ret) {
3046 mlog_errno(ret);
3047 break;
3048 }
3049 3034
3050 brelse(new_bh); 3035 brelse(new_bh);
3051 brelse(old_bh); 3036 brelse(old_bh);
@@ -3283,7 +3268,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3283 } else { 3268 } else {
3284 delete = 1; 3269 delete = 1;
3285 3270
3286 ret = __ocfs2_claim_clusters(osb, handle, 3271 ret = __ocfs2_claim_clusters(handle,
3287 context->data_ac, 3272 context->data_ac,
3288 1, set_len, 3273 1, set_len,
3289 &new_bit, &new_len); 3274 &new_bit, &new_len);
@@ -4075,6 +4060,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
4075 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 4060 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4076 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 4061 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4077 i_size_write(t_inode, size); 4062 i_size_write(t_inode, size);
4063 t_inode->i_blocks = s_inode->i_blocks;
4078 4064
4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 4065 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4080 di->i_clusters = s_di->i_clusters; 4066 di->i_clusters = s_di->i_clusters;
@@ -4083,6 +4069,9 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
4083 di->i_attr = s_di->i_attr; 4069 di->i_attr = s_di->i_attr;
4084 4070
4085 if (preserve) { 4071 if (preserve) {
4072 t_inode->i_uid = s_inode->i_uid;
4073 t_inode->i_gid = s_inode->i_gid;
4074 t_inode->i_mode = s_inode->i_mode;
4086 di->i_uid = s_di->i_uid; 4075 di->i_uid = s_di->i_uid;
4087 di->i_gid = s_di->i_gid; 4076 di->i_gid = s_di->i_gid;
4088 di->i_mode = s_di->i_mode; 4077 di->i_mode = s_di->i_mode;
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c1d19b1d3ecc..9983ba1570e2 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode,
47 struct ocfs2_cached_dealloc_ctxt *dealloc, 47 struct ocfs2_cached_dealloc_ctxt *dealloc,
48 int delete); 48 int delete);
49int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 49int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
50 struct buffer_head *di_bh, 50 u64 refcount_loc,
51 u64 phys_blkno, 51 u64 phys_blkno,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 struct ocfs2_alloc_context **meta_ac); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 56 u32 cpos, u32 write_len, u32 max_cpos);
57 57
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 000000000000..40650021fc24
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,847 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.c
5 *
6 * Allocation reservations implementation
7 *
8 * Some code borrowed from fs/ext3/balloc.c and is:
9 *
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 *
15 * The rest is copyright (C) 2010 Novell. All rights reserved.
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public
19 * License version 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 * General Public License for more details.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/bitops.h>
32#include <linux/list.h>
33
34#define MLOG_MASK_PREFIX ML_RESERVATIONS
35#include <cluster/masklog.h>
36
37#include "ocfs2.h"
38
39#ifdef CONFIG_OCFS2_DEBUG_FS
40#define OCFS2_CHECK_RESERVATIONS
41#endif
42
43DEFINE_SPINLOCK(resv_lock);
44
45#define OCFS2_MIN_RESV_WINDOW_BITS 8
46#define OCFS2_MAX_RESV_WINDOW_BITS 1024
47
48int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
49{
50 return (osb->osb_resv_level && osb->osb_dir_resv_level);
51}
52
53static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
54 struct ocfs2_alloc_reservation *resv)
55{
56 struct ocfs2_super *osb = resmap->m_osb;
57 unsigned int bits;
58
59 if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
60 /* 8, 16, 32, 64, 128, 256, 512, 1024 */
61 bits = 4 << osb->osb_resv_level;
62 } else {
63 bits = 4 << osb->osb_dir_resv_level;
64 }
65 return bits;
66}
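A quick worked example of the window sizing (the default level is an assumption here, not stated in this file):

	/*
	 * With osb_resv_level == 2 (assumed default), regular files get a
	 * 4 << 2 == 16 bit window; level 1 yields the 8-bit minimum and
	 * level 8 the 1024-bit maximum, matching
	 * OCFS2_MIN_RESV_WINDOW_BITS and OCFS2_MAX_RESV_WINDOW_BITS above.
	 */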
67
68static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
69{
70 if (resv->r_len)
71 return resv->r_start + resv->r_len - 1;
72 return resv->r_start;
73}
74
75static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
76{
77 return (resv->r_len == 0);
78}
79
80static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
81{
82 if (resmap->m_osb->osb_resv_level == 0)
83 return 1;
84 return 0;
85}
86
87static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
88{
89 struct ocfs2_super *osb = resmap->m_osb;
90 struct rb_node *node;
91 struct ocfs2_alloc_reservation *resv;
92 int i = 0;
93
94 mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
95 osb->dev_str, resmap->m_bitmap_len);
96
97 node = rb_first(&resmap->m_reservations);
98 while (node) {
99 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
100
101 mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
102 "\tlast_len: %u\n", resv->r_start,
103 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
104 resv->r_last_len);
105
106 node = rb_next(node);
107 i++;
108 }
109
110 mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
111
112 i = 0;
113 list_for_each_entry(resv, &resmap->m_lru, r_lru) {
114 mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
115 "last_start: %u\tlast_len: %u\n", i, resv->r_start,
116 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
117 resv->r_last_len);
118
119 i++;
120 }
121}
122
123#ifdef OCFS2_CHECK_RESERVATIONS
124static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
125 int i,
126 struct ocfs2_alloc_reservation *resv)
127{
128 char *disk_bitmap = resmap->m_disk_bitmap;
129 unsigned int start = resv->r_start;
130 unsigned int end = ocfs2_resv_end(resv);
131
132 while (start <= end) {
133 if (ocfs2_test_bit(start, disk_bitmap)) {
134 mlog(ML_ERROR,
135 "reservation %d covers an allocated area "
136 "starting at bit %u!\n", i, start);
137 return 1;
138 }
139
140 start++;
141 }
142 return 0;
143}
144
145static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
146{
147 unsigned int off = 0;
148 int i = 0;
149 struct rb_node *node;
150 struct ocfs2_alloc_reservation *resv;
151
152 node = rb_first(&resmap->m_reservations);
153 while (node) {
154 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
155
156 if (i > 0 && resv->r_start <= off) {
157 mlog(ML_ERROR, "reservation %d has bad start off!\n",
158 i);
159 goto bad;
160 }
161
162 if (resv->r_len == 0) {
163 mlog(ML_ERROR, "reservation %d has no length!\n",
164 i);
165 goto bad;
166 }
167
168 if (resv->r_start > ocfs2_resv_end(resv)) {
169 mlog(ML_ERROR, "reservation %d has invalid range!\n",
170 i);
171 goto bad;
172 }
173
174 if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
175 mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
176 i);
177 goto bad;
178 }
179
180 if (ocfs2_validate_resmap_bits(resmap, i, resv))
181 goto bad;
182
183 off = ocfs2_resv_end(resv);
184 node = rb_next(node);
185
186 i++;
187 }
188 return;
189
190bad:
191 ocfs2_dump_resv(resmap);
192 BUG();
193}
194#else
195static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
196{
197
198}
199#endif
200
201void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
202{
203 memset(resv, 0, sizeof(*resv));
204 INIT_LIST_HEAD(&resv->r_lru);
205}
206
207void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
208 unsigned int flags)
209{
210 BUG_ON(flags & ~OCFS2_RESV_TYPES);
211
212 resv->r_flags |= flags;
213}
214
215int ocfs2_resmap_init(struct ocfs2_super *osb,
216 struct ocfs2_reservation_map *resmap)
217{
218 memset(resmap, 0, sizeof(*resmap));
219
220 resmap->m_osb = osb;
221 resmap->m_reservations = RB_ROOT;
222 /* m_bitmap_len is initialized to zero by the above memset. */
223 INIT_LIST_HEAD(&resmap->m_lru);
224
225 return 0;
226}
227
228static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
229 struct ocfs2_alloc_reservation *resv)
230{
231 assert_spin_locked(&resv_lock);
232
233 if (!list_empty(&resv->r_lru))
234 list_del_init(&resv->r_lru);
235
236 list_add_tail(&resv->r_lru, &resmap->m_lru);
237}
238
239static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
240{
241 resv->r_len = 0;
242 resv->r_start = 0;
243}
244
245static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
246 struct ocfs2_alloc_reservation *resv)
247{
248 if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
249 list_del_init(&resv->r_lru);
250 rb_erase(&resv->r_node, &resmap->m_reservations);
251 resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
252 }
253}
254
255static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
256 struct ocfs2_alloc_reservation *resv)
257{
258 assert_spin_locked(&resv_lock);
259
260 __ocfs2_resv_trunc(resv);
261 /*
262 * last_len and last_start no longer make sense if
263 * we're changing the range of our allocations.
264 */
265 resv->r_last_len = resv->r_last_start = 0;
266
267 ocfs2_resv_remove(resmap, resv);
268}
269
270/* Does nothing if 'resv' is NULL. */
271void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
272 struct ocfs2_alloc_reservation *resv)
273{
274 if (resv) {
275 spin_lock(&resv_lock);
276 __ocfs2_resv_discard(resmap, resv);
277 spin_unlock(&resv_lock);
278 }
279}
280
281static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
282{
283 struct rb_node *node;
284 struct ocfs2_alloc_reservation *resv;
285
286 assert_spin_locked(&resv_lock);
287
288 while ((node = rb_last(&resmap->m_reservations)) != NULL) {
289 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
290
291 __ocfs2_resv_discard(resmap, resv);
292 }
293}
294
295void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
296 unsigned int clen, char *disk_bitmap)
297{
298 if (ocfs2_resmap_disabled(resmap))
299 return;
300
301 spin_lock(&resv_lock);
302
303 ocfs2_resmap_clear_all_resv(resmap);
304 resmap->m_bitmap_len = clen;
305 resmap->m_disk_bitmap = disk_bitmap;
306
307 spin_unlock(&resv_lock);
308}
309
310void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
311{
312 /* Does nothing for now. Keep this around for API symmetry */
313}
314
315static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
316 struct ocfs2_alloc_reservation *new)
317{
318 struct rb_root *root = &resmap->m_reservations;
319 struct rb_node *parent = NULL;
320 struct rb_node **p = &root->rb_node;
321 struct ocfs2_alloc_reservation *tmp;
322
323 assert_spin_locked(&resv_lock);
324
325 mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
326 new->r_len);
327
328 while (*p) {
329 parent = *p;
330
331 tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
332
333 if (new->r_start < tmp->r_start) {
334 p = &(*p)->rb_left;
335
336 /*
337 * This is a good place to check for
338 * overlapping reservations.
339 */
340 BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
341 } else if (new->r_start > ocfs2_resv_end(tmp)) {
342 p = &(*p)->rb_right;
343 } else {
344 /* This should never happen! */
345 mlog(ML_ERROR, "Duplicate reservation window!\n");
346 BUG();
347 }
348 }
349
350 rb_link_node(&new->r_node, parent, p);
351 rb_insert_color(&new->r_node, root);
352 new->r_flags |= OCFS2_RESV_FLAG_INUSE;
353
354 ocfs2_resv_mark_lru(resmap, new);
355
356 ocfs2_check_resmap(resmap);
357}
358
359/**
360 * ocfs2_find_resv_lhs() - find the window which contains goal
361 * @resmap: reservation map to search
362 * @goal: which bit to search for
363 *
364 * If a window containing that goal is not found, we return the window
365 * which comes before goal. Returns NULL on empty rbtree or no window
366 * before goal.
367 */
368static struct ocfs2_alloc_reservation *
369ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
370{
371 struct ocfs2_alloc_reservation *resv = NULL;
372 struct ocfs2_alloc_reservation *prev_resv = NULL;
373 struct rb_node *node = resmap->m_reservations.rb_node;
374
375 assert_spin_locked(&resv_lock);
376
377 if (!node)
378 return NULL;
379
380 node = rb_first(&resmap->m_reservations);
381 while (node) {
382 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
383
384 if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
385 break;
386
387 /* Did we overshoot the reservation just before goal? */
388 if (resv->r_start > goal) {
389 resv = prev_resv;
390 break;
391 }
392
393 prev_resv = resv;
394 node = rb_next(node);
395 }
396
397 return resv;
398}
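/*
 * A user-space reduction (illustrative only, not ocfs2 API) of the
 * contract above, on a plain sorted array: return the window containing
 * 'goal', else the nearest window before it, else none (-1).
 */
#include <stdio.h>

struct win { unsigned int start, len; };

static int find_lhs(const struct win *w, int nr, unsigned int goal)
{
    int i, prev = -1;

    for (i = 0; i < nr; i++) {
        unsigned int end = w[i].start + w[i].len - 1;

        if (w[i].start <= goal && goal <= end)
            return i;    /* goal inside this window */
        if (w[i].start > goal)
            return prev; /* overshot: window just before goal */
        prev = i;
    }
    return prev;
}

int main(void)
{
    struct win w[] = { { 10, 5 }, { 30, 4 } };

    printf("%d %d %d\n",
           find_lhs(w, 2, 12),  /* 0: inside first window */
           find_lhs(w, 2, 20),  /* 0: first window precedes 20 */
           find_lhs(w, 2, 5));  /* -1: nothing before goal */
    return 0;
}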
399
400/*
401 * We are given a range within the bitmap, which corresponds to a gap
402 * inside the reservations tree (search_start, search_len). The range
403 * can be anything from the whole bitmap, to a gap between
404 * reservations.
405 *
406 * The value of *rstart on entry is ignored.
407 *
408 * This function searches the bitmap range starting at search_start
409 * with length search_len for a set of contiguous free bits. We try
410 * to find up to 'wanted' bits, but can sometimes return less.
411 *
412 * Returns the length of allocation, 0 if no free bits are found.
413 *
414 * *rstart and *rlen will also be populated with the result.
415 */
416static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
417 unsigned int wanted,
418 unsigned int search_start,
419 unsigned int search_len,
420 unsigned int *rstart,
421 unsigned int *rlen)
422{
423 void *bitmap = resmap->m_disk_bitmap;
424 unsigned int best_start, best_len = 0;
425 int offset, start, found;
426
427 mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
428 wanted, search_start, search_len, resmap->m_bitmap_len);
429
430 found = best_start = best_len = 0;
431
432 start = search_start;
433 while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
434 start)) != -1) {
435 /* Search reached end of the region */
436 if (offset >= (search_start + search_len))
437 break;
438
439 if (offset == start) {
440 /* we found a zero */
441 found++;
442 /* move start to the next bit to test */
443 start++;
444 } else {
445 /* got a zero after some ones */
446 found = 1;
447 start = offset + 1;
448 }
449 if (found > best_len) {
450 best_len = found;
451 best_start = start - found;
452 }
453
454 if (found >= wanted)
455 break;
456 }
457
458 if (best_len == 0)
459 return 0;
460
461 if (best_len >= wanted)
462 best_len = wanted;
463
464 *rlen = best_len;
465 *rstart = best_start;
466
467 mlog(0, "Found start: %u len: %u\n", best_start, best_len);
468
469 return *rlen;
470}
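/*
 * A user-space reduction (illustrative only, not ocfs2 API) of the scan
 * above, over a plain byte array, so the run bookkeeping ('found'
 * current run vs. the best run seen) can be exercised in isolation.
 */
#include <stdio.h>

static int test_bit(const unsigned char *map, unsigned int bit)
{
    return (map[bit / 8] >> (bit % 8)) & 1;
}

/* Returns length found (<= wanted), 0 if no zero bits in range. */
static unsigned int find_free_bits(const unsigned char *map,
                                   unsigned int start, unsigned int len,
                                   unsigned int wanted,
                                   unsigned int *rstart)
{
    unsigned int best_start = 0, best_len = 0, found = 0;
    unsigned int bit;

    for (bit = start; bit < start + len; bit++) {
        if (test_bit(map, bit)) {
            found = 0;  /* run broken by an allocated bit */
            continue;
        }
        found++;
        if (found > best_len) {
            best_len = found;
            best_start = bit - found + 1;
        }
        if (found >= wanted)
            break;
    }

    if (best_len)
        *rstart = best_start;
    return best_len;
}

int main(void)
{
    /* 0x99 = 10011001b: free runs at bits 1-2 and 5-6 */
    unsigned char map[1] = { 0x99 };
    unsigned int where = 0;
    unsigned int got = find_free_bits(map, 0, 8, 2, &where);

    printf("got %u bits at %u\n", got, where); /* got 2 bits at 1 */
    return 0;
}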
471
472static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
473 struct ocfs2_alloc_reservation *resv,
474 unsigned int goal, unsigned int wanted)
475{
476 struct rb_root *root = &resmap->m_reservations;
477 unsigned int gap_start, gap_end, gap_len;
478 struct ocfs2_alloc_reservation *prev_resv, *next_resv;
479 struct rb_node *prev, *next;
480 unsigned int cstart, clen;
481 unsigned int best_start = 0, best_len = 0;
482
483 /*
484 * Nasty cases to consider:
485 *
486 * - rbtree is empty
487 * - our window should be first in all reservations
488 * - our window should be last in all reservations
489 * - need to make sure we don't go past end of bitmap
490 */
491
492 mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
493 resv->r_start, ocfs2_resv_end(resv), goal, wanted);
494
495 assert_spin_locked(&resv_lock);
496
497 if (RB_EMPTY_ROOT(root)) {
498 /*
499 * Easiest case - empty tree. We can just take
500 * whatever window of free bits we want.
501 */
502
503 mlog(0, "Empty root\n");
504
505 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
506 resmap->m_bitmap_len - goal,
507 &cstart, &clen);
508
509 /*
510 * This should never happen - the local alloc window
511 * will always have free bits when we're called.
512 */
513 BUG_ON(goal == 0 && clen == 0);
514
515 if (clen == 0)
516 return;
517
518 resv->r_start = cstart;
519 resv->r_len = clen;
520
521 ocfs2_resv_insert(resmap, resv);
522 return;
523 }
524
525 prev_resv = ocfs2_find_resv_lhs(resmap, goal);
526
527 if (prev_resv == NULL) {
528 mlog(0, "Goal on LHS of leftmost window\n");
529
530 /*
531 * A NULL here means that the search code couldn't
532 * find a window that starts before goal.
533 *
534 * However, we can take the first window after goal,
535 * which is also by definition, the leftmost window in
536 * the entire tree. If we can find free bits in the
537 * gap between goal and the LHS window, then the
538 * reservation can safely be placed there.
539 *
540 * Otherwise we fall back to a linear search, checking
541 * the gaps in between windows for a place to
542 * allocate.
543 */
544
545 next = rb_first(root);
546 next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
547 r_node);
548
549 /*
550 * The search should never return such a window. (see
551 * comment above
552 */
553 if (next_resv->r_start <= goal) {
554 mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
555 goal, next_resv->r_start, next_resv->r_len);
556 ocfs2_dump_resv(resmap);
557 BUG();
558 }
559
560 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
561 next_resv->r_start - goal,
562 &cstart, &clen);
563 if (clen) {
564 best_len = clen;
565 best_start = cstart;
566 if (best_len == wanted)
567 goto out_insert;
568 }
569
570 prev_resv = next_resv;
571 next_resv = NULL;
572 }
573
574 prev = &prev_resv->r_node;
575
576 /* Now we do a linear search for a window, starting at 'prev_rsv' */
577 while (1) {
578 next = rb_next(prev);
579 if (next) {
580 mlog(0, "One more resv found in linear search\n");
581 next_resv = rb_entry(next,
582 struct ocfs2_alloc_reservation,
583 r_node);
584
585 gap_start = ocfs2_resv_end(prev_resv) + 1;
586 gap_end = next_resv->r_start - 1;
587 gap_len = gap_end - gap_start + 1;
588 } else {
589 mlog(0, "No next node\n");
590 /*
591 * We're at the rightmost edge of the
592 * tree. See if a reservation between this
593 * window and the end of the bitmap will work.
594 */
595 gap_start = ocfs2_resv_end(prev_resv) + 1;
596 gap_len = resmap->m_bitmap_len - gap_start;
597 gap_end = resmap->m_bitmap_len - 1;
598 }
599
600 /*
601 * No need to check this gap if we have already found
602 * a larger region of free bits.
603 */
604 if (gap_len <= best_len)
605 goto next_resv;
606
607 clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
608 gap_len, &cstart, &clen);
609 if (clen == wanted) {
610 best_len = clen;
611 best_start = cstart;
612 goto out_insert;
613 } else if (clen > best_len) {
614 best_len = clen;
615 best_start = cstart;
616 }
617
618next_resv:
619 if (!next)
620 break;
621
622 prev = next;
623 prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
624 r_node);
625 }
626
627out_insert:
628 if (best_len) {
629 resv->r_start = best_start;
630 resv->r_len = best_len;
631 ocfs2_resv_insert(resmap, resv);
632 }
633}
634
635static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
636 struct ocfs2_alloc_reservation *resv,
637 unsigned int wanted)
638{
639 struct ocfs2_alloc_reservation *lru_resv;
640 int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
641 unsigned int min_bits;
642
643 if (!tmpwindow)
644 min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
645 else
646 min_bits = wanted; /* We know the temp window will use
647 * all of these bits */
648
649 /*
650 * Take the first reservation off the LRU as our 'target'. We
651 * don't try to be smart about it. There might be a case for
652 * searching based on size but I don't have enough data to be
653 * sure. --Mark (3/16/2010)
654 */
655 lru_resv = list_first_entry(&resmap->m_lru,
656 struct ocfs2_alloc_reservation, r_lru);
657
658 mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
659 lru_resv->r_len, ocfs2_resv_end(lru_resv));
660
661 /*
662 * Cannibalize (some or all) of the target reservation and
663 * feed it to the current window.
664 */
665 if (lru_resv->r_len <= min_bits) {
666 /*
667 * Discard completely if size is less than or equal to a
668 * reasonable threshold - 50% of window bits for non-temporary
669 * windows.
670 */
671 resv->r_start = lru_resv->r_start;
672 resv->r_len = lru_resv->r_len;
673
674 __ocfs2_resv_discard(resmap, lru_resv);
675 } else {
676 unsigned int shrink;
677 if (tmpwindow)
678 shrink = min_bits;
679 else
680 shrink = lru_resv->r_len / 2;
681
682 lru_resv->r_len -= shrink;
683
684 resv->r_start = ocfs2_resv_end(lru_resv) + 1;
685 resv->r_len = shrink;
686 }
687
688 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
689 "r_len: %u r_last_start: %u r_last_len: %u\n",
690 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
691 resv->r_last_start, resv->r_last_len);
692
693 ocfs2_resv_insert(resmap, resv);
694}
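/*
 * A user-space sketch (illustrative only) of the two cannibalization
 * outcomes above, assuming a non-temporary window whose
 * ocfs2_resv_window_bits() is 16, i.e. a min_bits threshold of 8.
 */
#include <stdio.h>

int main(void)
{
    unsigned int min_bits = 16 >> 1;            /* 50% threshold */
    unsigned int lru_start = 100, lru_len = 20; /* victim: 100..119 */

    if (lru_len <= min_bits) {
        /* small victim: steal the whole window and discard it */
        printf("take all: start %u len %u\n", lru_start, lru_len);
    } else {
        /* big victim: it keeps the left half, we take the right */
        unsigned int shrink = lru_len / 2;
        unsigned int keep = lru_len - shrink;

        printf("victim keeps: start %u len %u\n", lru_start, keep);
        printf("we get:       start %u len %u\n",
               lru_start + keep, shrink);       /* start 110 len 10 */
    }
    return 0;
}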
695
696static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
697 struct ocfs2_alloc_reservation *resv,
698 unsigned int wanted)
699{
700 unsigned int goal = 0;
701
702 BUG_ON(!ocfs2_resv_empty(resv));
703
704 /*
705 * Begin by trying to get a window as close to the previous
706 * one as possible. Using the most recent allocation as a
707 * start goal makes sense.
708 */
709 if (resv->r_last_len) {
710 goal = resv->r_last_start + resv->r_last_len;
711 if (goal >= resmap->m_bitmap_len)
712 goal = 0;
713 }
714
715 __ocfs2_resv_find_window(resmap, resv, goal, wanted);
716
717 /* Search from last alloc didn't work, try once more from beginning. */
718 if (ocfs2_resv_empty(resv) && goal != 0)
719 __ocfs2_resv_find_window(resmap, resv, 0, wanted);
720
721 if (ocfs2_resv_empty(resv)) {
722 /*
723 * Still empty? Pull the oldest one off the LRU, remove it
724 * from the tree, and put this one in its place.
725 */
726 ocfs2_cannibalize_resv(resmap, resv, wanted);
727 }
728
729 BUG_ON(ocfs2_resv_empty(resv));
730}
731
732int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
733 struct ocfs2_alloc_reservation *resv,
734 int *cstart, int *clen)
735{
736 unsigned int wanted = *clen;
737
738 if (resv == NULL || ocfs2_resmap_disabled(resmap))
739 return -ENOSPC;
740
741 spin_lock(&resv_lock);
742
743 /*
744 * We don't want to over-allocate for temporary
745 * windows. Otherwise, we run the risk of fragmenting the
746 * allocation space.
747 */
748 wanted = ocfs2_resv_window_bits(resmap, resv);
749 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
750 wanted = *clen;
751
752 if (ocfs2_resv_empty(resv)) {
753 mlog(0, "empty reservation, find new window\n");
754
755 /*
756 * Try to get a window here. If it works, we must fall
757 * through and test the bitmap. This avoids some
758 * ping-ponging of windows due to non-reserved space
759 * being allocated before we initialize a window for
760 * that inode.
761 */
762 ocfs2_resv_find_window(resmap, resv, wanted);
763 }
764
765 BUG_ON(ocfs2_resv_empty(resv));
766
767 *cstart = resv->r_start;
768 *clen = resv->r_len;
769
770 spin_unlock(&resv_lock);
771 return 0;
772}
773
774static void
775ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
776 struct ocfs2_alloc_reservation *resv,
777 unsigned int start, unsigned int end)
778{
779 unsigned int rhs = 0;
780 unsigned int old_end = ocfs2_resv_end(resv);
781
782 BUG_ON(start != resv->r_start || old_end < end);
783
784 /*
785 * Completely used? We can remove it then.
786 */
787 if (old_end == end) {
788 __ocfs2_resv_discard(resmap, resv);
789 return;
790 }
791
792 rhs = old_end - end;
793
794 /*
795 * This should have been trapped above.
796 */
797 BUG_ON(rhs == 0);
798
799 resv->r_start = end + 1;
800 resv->r_len = old_end - resv->r_start + 1;
801}
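/*
 * A user-space sketch (illustrative only) of the arithmetic above:
 * a window 100..109 from which bits 100..103 were claimed shrinks to
 * 104..109; a fully-used window is discarded instead.
 */
#include <stdio.h>

int main(void)
{
    unsigned int r_start = 100, r_len = 10; /* window 100..109 */
    unsigned int claim_end = 103;           /* claimed 100..103 */
    unsigned int old_end = r_start + r_len - 1;

    if (old_end == claim_end) {
        printf("window fully used - discard\n");
    } else {
        r_start = claim_end + 1;
        r_len = old_end - r_start + 1;
        printf("window now %u..%u (len %u)\n",
               r_start, old_end, r_len);    /* 104..109 (len 6) */
    }
    return 0;
}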
802
803void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
804 struct ocfs2_alloc_reservation *resv,
805 u32 cstart, u32 clen)
806{
807 unsigned int cend = cstart + clen - 1;
808
809 if (resmap == NULL || ocfs2_resmap_disabled(resmap))
810 return;
811
812 if (resv == NULL)
813 return;
814
815 BUG_ON(cstart != resv->r_start);
816
817 spin_lock(&resv_lock);
818
819 mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
820 "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
821 cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
822 resv->r_len, resv->r_last_start, resv->r_last_len);
823
824 BUG_ON(cstart < resv->r_start);
825 BUG_ON(cstart > ocfs2_resv_end(resv));
826 BUG_ON(cend > ocfs2_resv_end(resv));
827
828 ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
829 resv->r_last_start = cstart;
830 resv->r_last_len = clen;
831
832 /*
833 * May have been discarded above from
834 * ocfs2_adjust_resv_from_alloc().
835 */
836 if (!ocfs2_resv_empty(resv))
837 ocfs2_resv_mark_lru(resmap, resv);
838
839 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
840 "r_len: %u r_last_start: %u r_last_len: %u\n",
841 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
842 resv->r_last_start, resv->r_last_len);
843
844 ocfs2_check_resmap(resmap);
845
846 spin_unlock(&resv_lock);
847}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 000000000000..1e49cc29d06c
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.h
5 *
6 * Allocation reservations function prototypes and structures.
7 *
8 * Copyright (C) 2010 Novell. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_RESERVATIONS_H
21#define OCFS2_RESERVATIONS_H
22
23#include <linux/rbtree.h>
24
25#define OCFS2_DEFAULT_RESV_LEVEL 2
26#define OCFS2_MAX_RESV_LEVEL 9
27#define OCFS2_MIN_RESV_LEVEL 0
28
29struct ocfs2_alloc_reservation {
30 struct rb_node r_node;
31
32 unsigned int r_start; /* Beginning of current window */
33 unsigned int r_len; /* Length of the window */
34
35 unsigned int r_last_len; /* Length of most recent alloc */
36 unsigned int r_last_start; /* Start of most recent alloc */
37 struct list_head r_lru; /* LRU list head */
38
39 unsigned int r_flags;
40};
41
42#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of an rbtree */
43#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
44 * destroyed immediately after use */
45#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
46 * directory btree */
47
48struct ocfs2_reservation_map {
49 struct rb_root m_reservations;
50 char *m_disk_bitmap;
51
52 struct ocfs2_super *m_osb;
53
54 /* The following are not initialized to meaningful values until a disk
55 * bitmap is provided. */
56 u32 m_bitmap_len; /* Number of valid
57 * bits available */
58
59 struct list_head m_lru; /* LRU of reservations
60 * structures. */
61
62};
63
64void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
65
66#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
67void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
68 unsigned int flags);
69
70int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
71
72/**
73 * ocfs2_resv_discard() - truncate a reservation
74 * @resmap: reservation map that @resv belongs to
75 * @resv: the reservation to truncate.
76 *
77 * After this function is called, the reservation will be empty, and
78 * unlinked from the rbtree.
79 */
80void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
81 struct ocfs2_alloc_reservation *resv);
82
83
84/**
85 * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
86 * @osb: struct ocfs2_super of the mounted filesystem
87 * @resmap: struct ocfs2_reservation_map to initialize
88 *
89 * Prepares @resmap for use. A disk bitmap must still be provided
90 * via ocfs2_resmap_restart() before reservations can be made
91 * against it. Currently '0' is the only possible return value;
92 * the int return is kept for future expansion.
93 */
94int ocfs2_resmap_init(struct ocfs2_super *osb,
95 struct ocfs2_reservation_map *resmap);
96
97/**
98 * ocfs2_resmap_restart() - "restart" a reservation bitmap
99 * @resmap: reservations bitmap
100 * @clen: Number of valid bits in the bitmap
101 * @disk_bitmap: the disk bitmap this resmap should refer to.
102 *
103 * Re-initialize the parameters of a reservation bitmap. This is
104 * useful for local alloc window slides.
105 *
106 * This function truncates and discards all existing
107 * reservations. A future version will recalculate existing
108 * reservations based on the new bitmap.
109 */
110void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
111 unsigned int clen, char *disk_bitmap);
112
113/**
114 * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
115 * @resmap: the struct ocfs2_reservation_map to uninitialize
116 */
117void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
118
119/**
120 * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
121 * @resmap: reservations bitmap
122 * @resv: reservation to base search from
123 * @cstart: set to the start of the usable window on success
124 * @clen: on entry, the length wanted; on success, the usable length
125 *
126 * Using the reservation data from resv, this function will compare
127 * resmap and resmap->m_disk_bitmap to determine what part (if any) of
128 * the reservation window is still clear to use. If resv is empty,
129 * this function will try to allocate a window for it.
130 *
131 * On success, zero is returned and the valid allocation area is set in cstart
132 * and clen.
133 *
134 * Returns -ENOSPC if reservations are disabled.
135 */
136int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
137 struct ocfs2_alloc_reservation *resv,
138 int *cstart, int *clen);
139
140/**
141 * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
142 * @resmap: reservations bitmap
143 * @resv: optional reservation to recalculate based on new bitmap
144 * @cstart: start of allocation in clusters
145 * @clen: length of allocation in clusters.
146 *
147 * Tell the reservation code that bits were used to fulfill allocation in
148 * resmap. The bits don't have to have been part of any existing
149 * reservation. But we must always call this function when bits are claimed.
150 * Internally, the reservations code will use this information to mark the
151 * reservations bitmap. If resv is passed, its next allocation window
152 * will be calculated. 'cstart' must be the same value that was passed back
153 * from ocfs2_resmap_resv_bits().
154 */
155void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
156 struct ocfs2_alloc_reservation *resv,
157 u32 cstart, u32 clen);
158
159#endif /* OCFS2_RESERVATIONS_H */
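/*
 * A caller-side sketch of the resv_bits/claimed_bits cycle documented
 * above (illustrative only, not part of the patch; the helper name is
 * hypothetical, error handling is trimmed, and the disk-bitmap update
 * is elided).
 */
static int example_resv_cycle(struct ocfs2_reservation_map *resmap,
                              struct ocfs2_alloc_reservation *resv,
                              u32 wanted, u32 *bit_off, u32 *num_bits)
{
    int cstart, clen = wanted; /* *clen carries the request in */
    int ret;

    ret = ocfs2_resmap_resv_bits(resmap, resv, &cstart, &clen);
    if (ret == -ENOSPC)
        return ret; /* reservations disabled - search the bitmap directly */

    *bit_off = cstart;
    *num_bits = (u32)clen < wanted ? (u32)clen : wanted;

    /* ... set the claimed bits in resmap->m_disk_bitmap here ... */

    /*
     * Always report the claim; cstart must be the value that
     * ocfs2_resmap_resv_bits() handed back.
     */
    ocfs2_resmap_claimed_bits(resmap, resv, *bit_off, *num_bits);
    return 0;
}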
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 3c3d673a4d20..dacd553d8617 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups); 134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 } 135 }
136 136
137 ret = ocfs2_journal_dirty(handle, group_bh); 137 ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142 138
143 /* update the inode accordingly. */ 139 /* update the inode accordingly. */
144 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, 140 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
319 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 315 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
320 316
321 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
322 ocfs2_group_bitmap_size(osb->sb) * 8) { 318 ocfs2_group_bitmap_size(osb->sb, 0,
319 osb->s_feature_incompat) * 8) {
323 mlog(ML_ERROR, "The disk is too old and small. " 320 mlog(ML_ERROR, "The disk is too old and small. "
324 "Force to do offline resize."); 321 "Force to do offline resize.");
325 ret = -EINVAL; 322 ret = -EINVAL;
@@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
500 fe = (struct ocfs2_dinode *)main_bm_bh->b_data; 497 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
501 498
502 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 499 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
503 ocfs2_group_bitmap_size(osb->sb) * 8) { 500 ocfs2_group_bitmap_size(osb->sb, 0,
501 osb->s_feature_incompat) * 8) {
504 mlog(ML_ERROR, "The disk is too old and small." 502 mlog(ML_ERROR, "The disk is too old and small."
505 " Force to do offline resize."); 503 " Force to do offline resize.");
506 ret = -EINVAL; 504 ret = -EINVAL;
@@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
545 543
546 group = (struct ocfs2_group_desc *)group_bh->b_data; 544 group = (struct ocfs2_group_desc *)group_bh->b_data;
547 group->bg_next_group = cr->c_blkno; 545 group->bg_next_group = cr->c_blkno;
548 546 ocfs2_journal_dirty(handle, group_bh);
549 ret = ocfs2_journal_dirty(handle, group_bh);
550 if (ret < 0) {
551 mlog_errno(ret);
552 goto out_commit;
553 }
554 547
555 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 548 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
556 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 549 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 7020e1253ffa..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/slab.h>
22#include <linux/module.h> 23#include <linux/module.h>
23 24
24/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ 25/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 5ae8812b2864..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
25#include <linux/reboot.h> 26#include <linux/reboot.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c3c60bc3e072..f4c2a9eb8c4d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -53,6 +53,15 @@
53 53
54#define OCFS2_MAX_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56struct ocfs2_suballoc_result {
57 u64 sr_bg_blkno; /* The bg we allocated from. Set
58 to 0 when a block group is
59 contiguous. */
60 u64 sr_blkno; /* The first allocated block */
61 unsigned int sr_bit_offset; /* The bit in the bg */
62 unsigned int sr_bits; /* How many bits we claimed */
63};
64
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 65static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 66static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 67static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -60,6 +69,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
60 struct inode *alloc_inode, 69 struct inode *alloc_inode,
61 struct buffer_head *bg_bh, 70 struct buffer_head *bg_bh,
62 u64 group_blkno, 71 u64 group_blkno,
72 unsigned int group_clusters,
63 u16 my_chain, 73 u16 my_chain,
64 struct ocfs2_chain_list *cl); 74 struct ocfs2_chain_list *cl);
65static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 75static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,20 +83,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
73 struct buffer_head *group_bh, 83 struct buffer_head *group_bh,
74 u32 bits_wanted, u32 min_bits, 84 u32 bits_wanted, u32 min_bits,
75 u64 max_block, 85 u64 max_block,
76 u16 *bit_off, u16 *bits_found); 86 struct ocfs2_suballoc_result *res);
77static int ocfs2_block_group_search(struct inode *inode, 87static int ocfs2_block_group_search(struct inode *inode,
78 struct buffer_head *group_bh, 88 struct buffer_head *group_bh,
79 u32 bits_wanted, u32 min_bits, 89 u32 bits_wanted, u32 min_bits,
80 u64 max_block, 90 u64 max_block,
81 u16 *bit_off, u16 *bits_found); 91 struct ocfs2_suballoc_result *res);
82static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 92static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
83 struct ocfs2_alloc_context *ac,
84 handle_t *handle, 93 handle_t *handle,
85 u32 bits_wanted, 94 u32 bits_wanted,
86 u32 min_bits, 95 u32 min_bits,
87 u16 *bit_off, 96 struct ocfs2_suballoc_result *res);
88 unsigned int *num_bits,
89 u64 *bg_blkno);
90static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 97static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91 int nr); 98 int nr);
92static inline int ocfs2_block_group_set_bits(handle_t *handle, 99static inline int ocfs2_block_group_set_bits(handle_t *handle,
@@ -95,13 +102,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
95 struct buffer_head *group_bh, 102 struct buffer_head *group_bh,
96 unsigned int bit_off, 103 unsigned int bit_off,
97 unsigned int num_bits); 104 unsigned int num_bits);
98static inline int ocfs2_block_group_clear_bits(handle_t *handle,
99 struct inode *alloc_inode,
100 struct ocfs2_group_desc *bg,
101 struct buffer_head *group_bh,
102 unsigned int bit_off,
103 unsigned int num_bits);
104
105static int ocfs2_relink_block_group(handle_t *handle, 105static int ocfs2_relink_block_group(handle_t *handle,
106 struct inode *alloc_inode, 106 struct inode *alloc_inode,
107 struct buffer_head *fe_bh, 107 struct buffer_head *fe_bh,
@@ -137,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
137 } 137 }
138 brelse(ac->ac_bh); 138 brelse(ac->ac_bh);
139 ac->ac_bh = NULL; 139 ac->ac_bh = NULL;
140 ac->ac_resv = NULL;
140} 141}
141 142
142void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 143void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -152,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
152 153
153#define do_error(fmt, ...) \ 154#define do_error(fmt, ...) \
154 do{ \ 155 do{ \
155 if (clean_error) \ 156 if (resize) \
156 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 157 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
157 else \ 158 else \
158 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 159 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +161,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
160 161
161static int ocfs2_validate_gd_self(struct super_block *sb, 162static int ocfs2_validate_gd_self(struct super_block *sb,
162 struct buffer_head *bh, 163 struct buffer_head *bh,
163 int clean_error) 164 int resize)
164{ 165{
165 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 166 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
166 167
@@ -211,7 +212,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
211static int ocfs2_validate_gd_parent(struct super_block *sb, 212static int ocfs2_validate_gd_parent(struct super_block *sb,
212 struct ocfs2_dinode *di, 213 struct ocfs2_dinode *di,
213 struct buffer_head *bh, 214 struct buffer_head *bh,
214 int clean_error) 215 int resize)
215{ 216{
216 unsigned int max_bits; 217 unsigned int max_bits;
217 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 218 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +234,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 return -EINVAL; 234 return -EINVAL;
234 } 235 }
235 236
236 if (le16_to_cpu(gd->bg_chain) >= 237 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
237 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 238 if ((le16_to_cpu(gd->bg_chain) >
239 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
240 ((le16_to_cpu(gd->bg_chain) ==
241 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
238 do_error("Group descriptor #%llu has bad chain %u", 242 do_error("Group descriptor #%llu has bad chain %u",
239 (unsigned long long)bh->b_blocknr, 243 (unsigned long long)bh->b_blocknr,
240 le16_to_cpu(gd->bg_chain)); 244 le16_to_cpu(gd->bg_chain));
@@ -329,14 +333,38 @@ out:
329 return rc; 333 return rc;
330} 334}
331 335
336static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
337 struct ocfs2_group_desc *bg,
338 struct ocfs2_chain_list *cl,
339 u64 p_blkno, u32 clusters)
340{
341 struct ocfs2_extent_list *el = &bg->bg_list;
342 struct ocfs2_extent_rec *rec;
343
344 BUG_ON(!ocfs2_supports_discontig_bg(osb));
345 if (!el->l_next_free_rec)
346 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
347 rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
348 rec->e_blkno = cpu_to_le64(p_blkno);
349 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
350 le16_to_cpu(cl->cl_bpc));
351 rec->e_leaf_clusters = cpu_to_le16(clusters);
352 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
353 le16_add_cpu(&bg->bg_free_bits_count,
354 clusters * le16_to_cpu(cl->cl_bpc));
355 le16_add_cpu(&el->l_next_free_rec, 1);
356}
357
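/*
 * A user-space sketch (illustrative only) of the extent bookkeeping
 * above, assuming 4 bits per cluster (cl_bpc) and a group already
 * covering 8 clusters (bg_bits == 32) when a 4-cluster extent is
 * appended.
 */
#include <stdio.h>

int main(void)
{
    unsigned int bpc = 4, bg_bits = 32, clusters = 4;

    /* e_cpos: cluster offset of the new extent inside the group */
    printf("e_cpos  = %u\n", bg_bits / bpc);    /* 8 */

    bg_bits += clusters * bpc;                  /* bg_bits grows, as does
                                                 * bg_free_bits_count */
    printf("bg_bits = %u\n", bg_bits);          /* 48 */
    return 0;
}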
332static int ocfs2_block_group_fill(handle_t *handle, 358static int ocfs2_block_group_fill(handle_t *handle,
333 struct inode *alloc_inode, 359 struct inode *alloc_inode,
334 struct buffer_head *bg_bh, 360 struct buffer_head *bg_bh,
335 u64 group_blkno, 361 u64 group_blkno,
362 unsigned int group_clusters,
336 u16 my_chain, 363 u16 my_chain,
337 struct ocfs2_chain_list *cl) 364 struct ocfs2_chain_list *cl)
338{ 365{
339 int status = 0; 366 int status = 0;
367 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
340 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 368 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
341 struct super_block * sb = alloc_inode->i_sb; 369 struct super_block * sb = alloc_inode->i_sb;
342 370
@@ -363,19 +391,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
363 memset(bg, 0, sb->s_blocksize); 391 memset(bg, 0, sb->s_blocksize);
364 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 392 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
365 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 393 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
366 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); 394 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
367 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); 395 osb->s_feature_incompat));
368 bg->bg_chain = cpu_to_le16(my_chain); 396 bg->bg_chain = cpu_to_le16(my_chain);
369 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; 397 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
370 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); 398 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
371 bg->bg_blkno = cpu_to_le64(group_blkno); 399 bg->bg_blkno = cpu_to_le64(group_blkno);
400 if (group_clusters == le16_to_cpu(cl->cl_cpg))
401 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
402 else
403 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
404 group_clusters);
405
372 /* set the 1st bit in the bitmap to account for the descriptor block */ 406 /* set the 1st bit in the bitmap to account for the descriptor block */
373 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); 407 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
374 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); 408 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
375 409
376 status = ocfs2_journal_dirty(handle, bg_bh); 410 ocfs2_journal_dirty(handle, bg_bh);
377 if (status < 0)
378 mlog_errno(status);
379 411
380 /* There is no need to zero out or otherwise initialize the 412 /* There is no need to zero out or otherwise initialize the
381 * other blocks in a group - All valid FS metadata in a block 413 * other blocks in a group - All valid FS metadata in a block
@@ -401,6 +433,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
401 return best; 433 return best;
402} 434}
403 435
436static struct buffer_head *
437ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
438 struct inode *alloc_inode,
439 struct ocfs2_alloc_context *ac,
440 struct ocfs2_chain_list *cl)
441{
442 int status;
443 u32 bit_off, num_bits;
444 u64 bg_blkno;
445 struct buffer_head *bg_bh;
446 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
447
448 status = ocfs2_claim_clusters(handle, ac,
449 le16_to_cpu(cl->cl_cpg), &bit_off,
450 &num_bits);
451 if (status < 0) {
452 if (status != -ENOSPC)
453 mlog_errno(status);
454 goto bail;
455 }
456
457 /* setup the group */
458 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
459 mlog(0, "new descriptor, record %u, at block %llu\n",
460 alloc_rec, (unsigned long long)bg_blkno);
461
462 bg_bh = sb_getblk(osb->sb, bg_blkno);
463 if (!bg_bh) {
464 status = -EIO;
465 mlog_errno(status);
466 goto bail;
467 }
468 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
469
470 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
471 bg_blkno, num_bits, alloc_rec, cl);
472 if (status < 0) {
473 brelse(bg_bh);
474 mlog_errno(status);
475 }
476
477bail:
478 return status ? ERR_PTR(status) : bg_bh;
479}
480
481static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
482 handle_t *handle,
483 struct ocfs2_alloc_context *ac,
484 unsigned int min_bits,
485 u32 *bit_off, u32 *num_bits)
486{
487 int status = 0;
488
489 while (min_bits) {
490 status = ocfs2_claim_clusters(handle, ac, min_bits,
491 bit_off, num_bits);
492 if (status != -ENOSPC)
493 break;
494
495 min_bits >>= 1;
496 }
497
498 return status;
499}
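/*
 * A user-space sketch (illustrative only) of the back-off above: halve
 * the request until a claim succeeds or min_bits reaches zero. The
 * try_claim() stand-in is hypothetical - here only requests of four or
 * fewer clusters succeed.
 */
#include <errno.h>
#include <stdio.h>

static int try_claim(unsigned int bits)
{
    return bits <= 4 ? 0 : -ENOSPC;
}

int main(void)
{
    unsigned int min_bits = 16;
    int status = -ENOSPC;

    while (min_bits) {
        status = try_claim(min_bits);
        if (status != -ENOSPC)
            break;
        min_bits >>= 1; /* back off: 16, 8, 4, ... */
    }
    printf("status %d, claimed at %u bits\n", status, min_bits);
    return 0;
}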
500
501static int ocfs2_block_group_grow_discontig(handle_t *handle,
502 struct inode *alloc_inode,
503 struct buffer_head *bg_bh,
504 struct ocfs2_alloc_context *ac,
505 struct ocfs2_chain_list *cl,
506 unsigned int min_bits)
507{
508 int status;
509 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
510 struct ocfs2_group_desc *bg =
511 (struct ocfs2_group_desc *)bg_bh->b_data;
512 unsigned int needed = le16_to_cpu(cl->cl_cpg) -
513 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
514 u32 p_cpos, clusters;
515 u64 p_blkno;
516 struct ocfs2_extent_list *el = &bg->bg_list;
517
518 status = ocfs2_journal_access_gd(handle,
519 INODE_CACHE(alloc_inode),
520 bg_bh,
521 OCFS2_JOURNAL_ACCESS_CREATE);
522 if (status < 0) {
523 mlog_errno(status);
524 goto bail;
525 }
526
527 while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
528 le16_to_cpu(el->l_count))) {
529 if (min_bits > needed)
530 min_bits = needed;
531 status = ocfs2_block_group_claim_bits(osb, handle, ac,
532 min_bits, &p_cpos,
533 &clusters);
534 if (status < 0) {
535 if (status != -ENOSPC)
536 mlog_errno(status);
537 goto bail;
538 }
539 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
540 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
541 clusters);
542
543 min_bits = clusters;
544 needed = le16_to_cpu(cl->cl_cpg) -
545 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
546 }
547
548 if (needed > 0) {
549 /*
550 * We have used up all the extent records but still can't
551 * fill up the cluster group (cl_cpg clusters), so bail out.
552 */
553 status = -ENOSPC;
554 goto bail;
555 }
556
557 ocfs2_journal_dirty(handle, bg_bh);
558
559bail:
560 return status;
561}
562
563static void ocfs2_bg_alloc_cleanup(handle_t *handle,
564 struct ocfs2_alloc_context *cluster_ac,
565 struct inode *alloc_inode,
566 struct buffer_head *bg_bh)
567{
568 int i, ret;
569 struct ocfs2_group_desc *bg;
570 struct ocfs2_extent_list *el;
571 struct ocfs2_extent_rec *rec;
572
573 if (!bg_bh)
574 return;
575
576 bg = (struct ocfs2_group_desc *)bg_bh->b_data;
577 el = &bg->bg_list;
578 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
579 rec = &el->l_recs[i];
580 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
581 cluster_ac->ac_bh,
582 le64_to_cpu(rec->e_blkno),
583 le32_to_cpu(rec->e_leaf_clusters));
584 if (ret)
585 mlog_errno(ret);
586 /* Try to free all the clusters even if one fails */
587 }
588
589 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
590 brelse(bg_bh);
591}
592
593static struct buffer_head *
594ocfs2_block_group_alloc_discontig(handle_t *handle,
595 struct inode *alloc_inode,
596 struct ocfs2_alloc_context *ac,
597 struct ocfs2_chain_list *cl)
598{
599 int status;
600 u32 bit_off, num_bits;
601 u64 bg_blkno;
602 unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
603 struct buffer_head *bg_bh = NULL;
604 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
605 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
606
607 if (!ocfs2_supports_discontig_bg(osb)) {
608 status = -ENOSPC;
609 goto bail;
610 }
611
612 status = ocfs2_extend_trans(handle,
613 ocfs2_calc_bg_discontig_credits(osb->sb));
614 if (status) {
615 mlog_errno(status);
616 goto bail;
617 }
618
619 /*
620 * We're going to be grabbing from multiple cluster groups.
621 * We don't have enough credits to relink them all, and the
622 * cluster groups will be staying in cache for the duration of
623 * this operation.
624 */
625 ac->ac_allow_chain_relink = 0;
626
627 /* Claim the first region */
628 status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
629 &bit_off, &num_bits);
630 if (status < 0) {
631 if (status != -ENOSPC)
632 mlog_errno(status);
633 goto bail;
634 }
635 min_bits = num_bits;
636
637 /* setup the group */
638 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
639 mlog(0, "new descriptor, record %u, at block %llu\n",
640 alloc_rec, (unsigned long long)bg_blkno);
641
642 bg_bh = sb_getblk(osb->sb, bg_blkno);
643 if (!bg_bh) {
644 status = -EIO;
645 mlog_errno(status);
646 goto bail;
647 }
648 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
649
650 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
651 bg_blkno, num_bits, alloc_rec, cl);
652 if (status < 0) {
653 mlog_errno(status);
654 goto bail;
655 }
656
657 status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
658 bg_bh, ac, cl, min_bits);
659 if (status)
660 mlog_errno(status);
661
662bail:
663 if (status)
664 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
665 return status ? ERR_PTR(status) : bg_bh;
666}
667
404/* 668/*
405 * We expect the block group allocator to already be locked. 669 * We expect the block group allocator to already be locked.
406 */ 670 */
@@ -416,9 +680,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
416 struct ocfs2_chain_list *cl; 680 struct ocfs2_chain_list *cl;
417 struct ocfs2_alloc_context *ac = NULL; 681 struct ocfs2_alloc_context *ac = NULL;
418 handle_t *handle = NULL; 682 handle_t *handle = NULL;
419 u32 bit_off, num_bits;
420 u16 alloc_rec; 683 u16 alloc_rec;
421 u64 bg_blkno;
422 struct buffer_head *bg_bh = NULL; 684 struct buffer_head *bg_bh = NULL;
423 struct ocfs2_group_desc *bg; 685 struct ocfs2_group_desc *bg;
424 686
@@ -451,44 +713,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
451 (unsigned long long)*last_alloc_group); 713 (unsigned long long)*last_alloc_group);
452 ac->ac_last_group = *last_alloc_group; 714 ac->ac_last_group = *last_alloc_group;
453 } 715 }
454 status = ocfs2_claim_clusters(osb, 716
455 handle, 717 bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
456 ac, 718 ac, cl);
457 le16_to_cpu(cl->cl_cpg), 719 if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
458 &bit_off, 720 bg_bh = ocfs2_block_group_alloc_discontig(handle,
459 &num_bits); 721 alloc_inode,
460 if (status < 0) { 722 ac, cl);
723 if (IS_ERR(bg_bh)) {
724 status = PTR_ERR(bg_bh);
725 bg_bh = NULL;
461 if (status != -ENOSPC) 726 if (status != -ENOSPC)
462 mlog_errno(status); 727 mlog_errno(status);
463 goto bail; 728 goto bail;
464 } 729 }
465
466 alloc_rec = ocfs2_find_smallest_chain(cl);
467
468 /* setup the group */
469 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
470 mlog(0, "new descriptor, record %u, at block %llu\n",
471 alloc_rec, (unsigned long long)bg_blkno);
472
473 bg_bh = sb_getblk(osb->sb, bg_blkno);
474 if (!bg_bh) {
475 status = -EIO;
476 mlog_errno(status);
477 goto bail;
478 }
479 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
480
481 status = ocfs2_block_group_fill(handle,
482 alloc_inode,
483 bg_bh,
484 bg_blkno,
485 alloc_rec,
486 cl);
487 if (status < 0) {
488 mlog_errno(status);
489 goto bail;
490 }
491
492 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 730 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
493 731
494 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 732 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -498,10 +736,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
498 goto bail; 736 goto bail;
499 } 737 }
500 738
739 alloc_rec = le16_to_cpu(bg->bg_chain);
501 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, 740 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
502 le16_to_cpu(bg->bg_free_bits_count)); 741 le16_to_cpu(bg->bg_free_bits_count));
503 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); 742 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
504 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); 743 le16_to_cpu(bg->bg_bits));
744 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno);
505 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 745 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
506 le16_add_cpu(&cl->cl_next_free_rec, 1); 746 le16_add_cpu(&cl->cl_next_free_rec, 1);
507 747
@@ -510,11 +750,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
510 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); 750 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
511 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); 751 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
512 752
513 status = ocfs2_journal_dirty(handle, bh); 753 ocfs2_journal_dirty(handle, bh);
514 if (status < 0) {
515 mlog_errno(status);
516 goto bail;
517 }
518 754
519 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 755 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
520 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 756 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -764,7 +1000,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
764 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 1000 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
765 EXTENT_ALLOC_SYSTEM_INODE, 1001 EXTENT_ALLOC_SYSTEM_INODE,
766 (u32)osb->slot_num, NULL, 1002 (u32)osb->slot_num, NULL,
767 ALLOC_NEW_GROUP); 1003 ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
768 1004
769 1005
770 if (status >= 0) { 1006 if (status >= 0) {
@@ -950,11 +1186,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
950 status = ocfs2_reserve_local_alloc_bits(osb, 1186 status = ocfs2_reserve_local_alloc_bits(osb,
951 bits_wanted, 1187 bits_wanted,
952 *ac); 1188 *ac);
953 if (status == -EFBIG) { 1189 if ((status < 0) && (status != -ENOSPC)) {
954 /* The local alloc window is outside ac_max_block.
955 * use the main bitmap. */
956 status = -ENOSPC;
957 } else if ((status < 0) && (status != -ENOSPC)) {
958 mlog_errno(status); 1190 mlog_errno(status);
959 goto bail; 1191 goto bail;
960 } 1192 }
@@ -1037,8 +1269,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1037 struct buffer_head *bg_bh, 1269 struct buffer_head *bg_bh,
1038 unsigned int bits_wanted, 1270 unsigned int bits_wanted,
1039 unsigned int total_bits, 1271 unsigned int total_bits,
1040 u16 *bit_off, 1272 struct ocfs2_suballoc_result *res)
1041 u16 *bits_found)
1042{ 1273{
1043 void *bitmap; 1274 void *bitmap;
1044 u16 best_offset, best_size; 1275 u16 best_offset, best_size;
@@ -1082,14 +1313,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1082 } 1313 }
1083 } 1314 }
1084 1315
1085 /* XXX: I think the first clause is equivalent to the second 1316 if (best_size) {
1086 * - jlbec */ 1317 res->sr_bit_offset = best_offset;
1087 if (found == bits_wanted) { 1318 res->sr_bits = best_size;
1088 *bit_off = start - found;
1089 *bits_found = found;
1090 } else if (best_size) {
1091 *bit_off = best_offset;
1092 *bits_found = best_size;
1093 } else { 1319 } else {
1094 status = -ENOSPC; 1320 status = -ENOSPC;
1095 /* No error log here -- see the comment above 1321 /* No error log here -- see the comment above
@@ -1133,16 +1359,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1133 } 1359 }
1134 1360
1135 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1361 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1136
1137 while(num_bits--) 1362 while(num_bits--)
1138 ocfs2_set_bit(bit_off++, bitmap); 1363 ocfs2_set_bit(bit_off++, bitmap);
1139 1364
1140 status = ocfs2_journal_dirty(handle, 1365 ocfs2_journal_dirty(handle, group_bh);
1141 group_bh);
1142 if (status < 0) {
1143 mlog_errno(status);
1144 goto bail;
1145 }
1146 1366
1147bail: 1367bail:
1148 mlog_exit(status); 1368 mlog_exit(status);
@@ -1206,12 +1426,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1206 } 1426 }
1207 1427
1208 prev_bg->bg_next_group = bg->bg_next_group; 1428 prev_bg->bg_next_group = bg->bg_next_group;
1209 1429 ocfs2_journal_dirty(handle, prev_bg_bh);
1210 status = ocfs2_journal_dirty(handle, prev_bg_bh);
1211 if (status < 0) {
1212 mlog_errno(status);
1213 goto out_rollback;
1214 }
1215 1430
1216 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1431 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1217 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1432 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1221,12 +1436,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1221 } 1436 }
1222 1437
1223 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1438 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1224 1439 ocfs2_journal_dirty(handle, bg_bh);
1225 status = ocfs2_journal_dirty(handle, bg_bh);
1226 if (status < 0) {
1227 mlog_errno(status);
1228 goto out_rollback;
1229 }
1230 1440
1231 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1441 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1232 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1442 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1236,14 +1446,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1236 } 1446 }
1237 1447
1238 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1448 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1449 ocfs2_journal_dirty(handle, fe_bh);
1239 1450
1240 status = ocfs2_journal_dirty(handle, fe_bh);
1241 if (status < 0) {
1242 mlog_errno(status);
1243 goto out_rollback;
1244 }
1245
1246 status = 0;
1247out_rollback: 1451out_rollback:
1248 if (status < 0) { 1452 if (status < 0) {
1249 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); 1453 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
@@ -1267,14 +1471,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1267 struct buffer_head *group_bh, 1471 struct buffer_head *group_bh,
1268 u32 bits_wanted, u32 min_bits, 1472 u32 bits_wanted, u32 min_bits,
1269 u64 max_block, 1473 u64 max_block,
1270 u16 *bit_off, u16 *bits_found) 1474 struct ocfs2_suballoc_result *res)
1271{ 1475{
1272 int search = -ENOSPC; 1476 int search = -ENOSPC;
1273 int ret; 1477 int ret;
1274 u64 blkoff; 1478 u64 blkoff;
1275 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1479 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1276 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1480 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1277 u16 tmp_off, tmp_found;
1278 unsigned int max_bits, gd_cluster_off; 1481 unsigned int max_bits, gd_cluster_off;
1279 1482
1280 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1483 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1301,15 +1504,15 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1301 1504
1302 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1505 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1303 group_bh, bits_wanted, 1506 group_bh, bits_wanted,
1304 max_bits, 1507 max_bits, res);
1305 &tmp_off, &tmp_found);
1306 if (ret) 1508 if (ret)
1307 return ret; 1509 return ret;
1308 1510
1309 if (max_block) { 1511 if (max_block) {
1310 blkoff = ocfs2_clusters_to_blocks(inode->i_sb, 1512 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1311 gd_cluster_off + 1513 gd_cluster_off +
1312 tmp_off + tmp_found); 1514 res->sr_bit_offset +
1515 res->sr_bits);
1313 mlog(0, "Checking %llu against %llu\n", 1516 mlog(0, "Checking %llu against %llu\n",
1314 (unsigned long long)blkoff, 1517 (unsigned long long)blkoff,
1315 (unsigned long long)max_block); 1518 (unsigned long long)max_block);
@@ -1321,16 +1524,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1321 * return success, but we still want to return 1524 * return success, but we still want to return
1322 * -ENOSPC unless it found the minimum number 1525 * -ENOSPC unless it found the minimum number
1323 * of bits. */ 1526 * of bits. */
1324 if (min_bits <= tmp_found) { 1527 if (min_bits <= res->sr_bits)
1325 *bit_off = tmp_off;
1326 *bits_found = tmp_found;
1327 search = 0; /* success */ 1528 search = 0; /* success */
1328 } else if (tmp_found) { 1529 else if (res->sr_bits) {
1329 /* 1530 /*
1330 * Don't show bits which we'll be returning 1531 * Don't show bits which we'll be returning
1331 * for allocation to the local alloc bitmap. 1532 * for allocation to the local alloc bitmap.
1332 */ 1533 */
1333 ocfs2_local_alloc_seen_free_bits(osb, tmp_found); 1534 ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1334 } 1535 }
1335 } 1536 }
1336 1537
@@ -1341,7 +1542,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1341 struct buffer_head *group_bh, 1542 struct buffer_head *group_bh,
1342 u32 bits_wanted, u32 min_bits, 1543 u32 bits_wanted, u32 min_bits,
1343 u64 max_block, 1544 u64 max_block,
1344 u16 *bit_off, u16 *bits_found) 1545 struct ocfs2_suballoc_result *res)
1345{ 1546{
1346 int ret = -ENOSPC; 1547 int ret = -ENOSPC;
1347 u64 blkoff; 1548 u64 blkoff;
@@ -1354,10 +1555,10 @@ static int ocfs2_block_group_search(struct inode *inode,
1354 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1555 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1355 group_bh, bits_wanted, 1556 group_bh, bits_wanted,
1356 le16_to_cpu(bg->bg_bits), 1557 le16_to_cpu(bg->bg_bits),
1357 bit_off, bits_found); 1558 res);
1358 if (!ret && max_block) { 1559 if (!ret && max_block) {
1359 blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + 1560 blkoff = le64_to_cpu(bg->bg_blkno) +
1360 *bits_found; 1561 res->sr_bit_offset + res->sr_bits;
1361 mlog(0, "Checking %llu against %llu\n", 1562 mlog(0, "Checking %llu against %llu\n",
1362 (unsigned long long)blkoff, 1563 (unsigned long long)blkoff,
1363 (unsigned long long)max_block); 1564 (unsigned long long)max_block);
@@ -1390,33 +1591,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1390 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1591 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1391 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1592 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1392 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1593 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1393 1594 ocfs2_journal_dirty(handle, di_bh);
1394 ret = ocfs2_journal_dirty(handle, di_bh);
1395 if (ret < 0)
1396 mlog_errno(ret);
1397 1595
1398out: 1596out:
1399 return ret; 1597 return ret;
1400} 1598}
1401 1599
1600static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1601 struct ocfs2_extent_rec *rec,
1602 struct ocfs2_chain_list *cl)
1603{
1604 unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1605 unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1606 unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1607
1608 if (res->sr_bit_offset < bitoff)
1609 return 0;
1610 if (res->sr_bit_offset >= (bitoff + bitcount))
1611 return 0;
1612 res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1613 (res->sr_bit_offset - bitoff);
1614 if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1615 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1616 return 1;
1617}
1618
1619static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1620 struct ocfs2_group_desc *bg,
1621 struct ocfs2_suballoc_result *res)
1622{
1623 int i;
1624 u64 bg_blkno = res->sr_bg_blkno; /* Save off */
1625 struct ocfs2_extent_rec *rec;
1626 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1627 struct ocfs2_chain_list *cl = &di->id2.i_chain;
1628
1629 if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1630 res->sr_blkno = 0;
1631 return;
1632 }
1633
1634 res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1635 res->sr_bg_blkno = 0; /* Clear it for contig block groups */
1636 if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1637 !bg->bg_list.l_next_free_rec)
1638 return;
1639
1640 for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1641 rec = &bg->bg_list.l_recs[i];
1642 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1643 res->sr_bg_blkno = bg_blkno; /* Restore */
1644 break;
1645 }
1646 }
1647}
1648
1402static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1649static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1403 handle_t *handle, 1650 handle_t *handle,
1404 u32 bits_wanted, 1651 u32 bits_wanted,
1405 u32 min_bits, 1652 u32 min_bits,
1406 u16 *bit_off, 1653 struct ocfs2_suballoc_result *res,
1407 unsigned int *num_bits,
1408 u64 gd_blkno,
1409 u16 *bits_left) 1654 u16 *bits_left)
1410{ 1655{
1411 int ret; 1656 int ret;
1412 u16 found;
1413 struct buffer_head *group_bh = NULL; 1657 struct buffer_head *group_bh = NULL;
1414 struct ocfs2_group_desc *gd; 1658 struct ocfs2_group_desc *gd;
1415 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1659 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1416 struct inode *alloc_inode = ac->ac_inode; 1660 struct inode *alloc_inode = ac->ac_inode;
1417 1661
1418 ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, 1662 ret = ocfs2_read_group_descriptor(alloc_inode, di,
1419 &group_bh); 1663 res->sr_bg_blkno, &group_bh);
1420 if (ret < 0) { 1664 if (ret < 0) {
1421 mlog_errno(ret); 1665 mlog_errno(ret);
1422 return ret; 1666 return ret;
@@ -1424,17 +1668,18 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1424 1668
1425 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1669 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1426 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1670 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1427 ac->ac_max_block, bit_off, &found); 1671 ac->ac_max_block, res);
1428 if (ret < 0) { 1672 if (ret < 0) {
1429 if (ret != -ENOSPC) 1673 if (ret != -ENOSPC)
1430 mlog_errno(ret); 1674 mlog_errno(ret);
1431 goto out; 1675 goto out;
1432 } 1676 }
1433 1677
1434 *num_bits = found; 1678 if (!ret)
1679 ocfs2_bg_discontig_fix_result(ac, gd, res);
1435 1680
1436 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1681 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1437 *num_bits, 1682 res->sr_bits,
1438 le16_to_cpu(gd->bg_chain)); 1683 le16_to_cpu(gd->bg_chain));
1439 if (ret < 0) { 1684 if (ret < 0) {
1440 mlog_errno(ret); 1685 mlog_errno(ret);
@@ -1442,7 +1687,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1442 } 1687 }
1443 1688
1444 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1689 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1445 *bit_off, *num_bits); 1690 res->sr_bit_offset, res->sr_bits);
1446 if (ret < 0) 1691 if (ret < 0)
1447 mlog_errno(ret); 1692 mlog_errno(ret);
1448 1693
@@ -1458,13 +1703,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1458 handle_t *handle, 1703 handle_t *handle,
1459 u32 bits_wanted, 1704 u32 bits_wanted,
1460 u32 min_bits, 1705 u32 min_bits,
1461 u16 *bit_off, 1706 struct ocfs2_suballoc_result *res,
1462 unsigned int *num_bits,
1463 u64 *bg_blkno,
1464 u16 *bits_left) 1707 u16 *bits_left)
1465{ 1708{
1466 int status; 1709 int status;
1467 u16 chain, tmp_bits; 1710 u16 chain;
1468 u32 tmp_used; 1711 u32 tmp_used;
1469 u64 next_group; 1712 u64 next_group;
1470 struct inode *alloc_inode = ac->ac_inode; 1713 struct inode *alloc_inode = ac->ac_inode;
@@ -1493,8 +1736,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1493 * the 1st group with any empty bits. */ 1736 * the 1st group with any empty bits. */
1494 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1737 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1495 bits_wanted, min_bits, 1738 bits_wanted, min_bits,
1496 ac->ac_max_block, bit_off, 1739 ac->ac_max_block,
1497 &tmp_bits)) == -ENOSPC) { 1740 res)) == -ENOSPC) {
1498 if (!bg->bg_next_group) 1741 if (!bg->bg_next_group)
1499 break; 1742 break;
1500 1743
@@ -1519,11 +1762,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1519 } 1762 }
1520 1763
1521 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1764 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1522 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); 1765 res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1523 1766
1524 *num_bits = tmp_bits; 1767 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1768
1769 BUG_ON(res->sr_bits == 0);
1770 if (!status)
1771 ocfs2_bg_discontig_fix_result(ac, bg, res);
1525 1772
1526 BUG_ON(*num_bits == 0);
1527 1773
1528 /* 1774 /*
1529 * Keep track of previous block descriptor read. When 1775 * Keep track of previous block descriptor read. When
@@ -1540,7 +1786,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1540 */ 1786 */
1541 if (ac->ac_allow_chain_relink && 1787 if (ac->ac_allow_chain_relink &&
1542 (prev_group_bh) && 1788 (prev_group_bh) &&
1543 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { 1789 (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1544 status = ocfs2_relink_block_group(handle, alloc_inode, 1790 status = ocfs2_relink_block_group(handle, alloc_inode,
1545 ac->ac_bh, group_bh, 1791 ac->ac_bh, group_bh,
1546 prev_group_bh, chain); 1792 prev_group_bh, chain);
@@ -1562,31 +1808,24 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1562 } 1808 }
1563 1809
1564 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 1810 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1565 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); 1811 fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
1566 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); 1812 le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
1567 1813 ocfs2_journal_dirty(handle, ac->ac_bh);
1568 status = ocfs2_journal_dirty(handle,
1569 ac->ac_bh);
1570 if (status < 0) {
1571 mlog_errno(status);
1572 goto bail;
1573 }
1574 1814
1575 status = ocfs2_block_group_set_bits(handle, 1815 status = ocfs2_block_group_set_bits(handle,
1576 alloc_inode, 1816 alloc_inode,
1577 bg, 1817 bg,
1578 group_bh, 1818 group_bh,
1579 *bit_off, 1819 res->sr_bit_offset,
1580 *num_bits); 1820 res->sr_bits);
1581 if (status < 0) { 1821 if (status < 0) {
1582 mlog_errno(status); 1822 mlog_errno(status);
1583 goto bail; 1823 goto bail;
1584 } 1824 }
1585 1825
1586 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, 1826 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1587 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1827 (unsigned long long)le64_to_cpu(fe->i_blkno));
1588 1828
1589 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1590 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1829 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1591bail: 1830bail:
1592 brelse(group_bh); 1831 brelse(group_bh);
@@ -1597,19 +1836,15 @@ bail:
1597} 1836}
1598 1837
1599/* will give out up to bits_wanted contiguous bits. */ 1838/* will give out up to bits_wanted contiguous bits. */
1600static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 1839static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1601 struct ocfs2_alloc_context *ac,
1602 handle_t *handle, 1840 handle_t *handle,
1603 u32 bits_wanted, 1841 u32 bits_wanted,
1604 u32 min_bits, 1842 u32 min_bits,
1605 u16 *bit_off, 1843 struct ocfs2_suballoc_result *res)
1606 unsigned int *num_bits,
1607 u64 *bg_blkno)
1608{ 1844{
1609 int status; 1845 int status;
1610 u16 victim, i; 1846 u16 victim, i;
1611 u16 bits_left = 0; 1847 u16 bits_left = 0;
1612 u64 hint_blkno = ac->ac_last_group;
1613 struct ocfs2_chain_list *cl; 1848 struct ocfs2_chain_list *cl;
1614 struct ocfs2_dinode *fe; 1849 struct ocfs2_dinode *fe;
1615 1850
@@ -1627,7 +1862,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1627 1862
1628 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1863 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1629 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1864 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1630 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " 1865 ocfs2_error(ac->ac_inode->i_sb,
1866 "Chain allocator dinode %llu has %u used "
1631 "bits but only %u total.", 1867 "bits but only %u total.",
1632 (unsigned long long)le64_to_cpu(fe->i_blkno), 1868 (unsigned long long)le64_to_cpu(fe->i_blkno),
1633 le32_to_cpu(fe->id1.bitmap1.i_used), 1869 le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1636,22 +1872,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1636 goto bail; 1872 goto bail;
1637 } 1873 }
1638 1874
1639 if (hint_blkno) { 1875 res->sr_bg_blkno = ac->ac_last_group;
1876 if (res->sr_bg_blkno) {
1640 /* Attempt to short-circuit the usual search mechanism 1877 /* Attempt to short-circuit the usual search mechanism
1641 * by jumping straight to the most recently used 1878 * by jumping straight to the most recently used
1642 * allocation group. This helps us maintain some 1879 * allocation group. This helps us maintain some
1643 * contiguousness across allocations. */ 1880 * contiguousness across allocations. */
1644 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1881 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1645 min_bits, bit_off, num_bits, 1882 min_bits, res, &bits_left);
1646 hint_blkno, &bits_left); 1883 if (!status)
1647 if (!status) {
1648 /* Be careful to update *bg_blkno here as the
1649 * caller is expecting it to be filled in, and
1650 * ocfs2_search_one_group() won't do that for
1651 * us. */
1652 *bg_blkno = hint_blkno;
1653 goto set_hint; 1884 goto set_hint;
1654 }
1655 if (status < 0 && status != -ENOSPC) { 1885 if (status < 0 && status != -ENOSPC) {
1656 mlog_errno(status); 1886 mlog_errno(status);
1657 goto bail; 1887 goto bail;
@@ -1664,8 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1664 ac->ac_chain = victim; 1894 ac->ac_chain = victim;
1665 ac->ac_allow_chain_relink = 1; 1895 ac->ac_allow_chain_relink = 1;
1666 1896
1667 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off, 1897 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1668 num_bits, bg_blkno, &bits_left); 1898 res, &bits_left);
1669 if (!status) 1899 if (!status)
1670 goto set_hint; 1900 goto set_hint;
1671 if (status < 0 && status != -ENOSPC) { 1901 if (status < 0 && status != -ENOSPC) {
@@ -1689,8 +1919,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1689 1919
1690 ac->ac_chain = i; 1920 ac->ac_chain = i;
1691 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1921 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1692 bit_off, num_bits, bg_blkno, 1922 res, &bits_left);
1693 &bits_left);
1694 if (!status) 1923 if (!status)
1695 break; 1924 break;
1696 if (status < 0 && status != -ENOSPC) { 1925 if (status < 0 && status != -ENOSPC) {
@@ -1707,7 +1936,7 @@ set_hint:
1707 if (bits_left < min_bits) 1936 if (bits_left < min_bits)
1708 ac->ac_last_group = 0; 1937 ac->ac_last_group = 0;
1709 else 1938 else
1710 ac->ac_last_group = *bg_blkno; 1939 ac->ac_last_group = res->sr_bg_blkno;
1711 } 1940 }
1712 1941
1713bail: 1942bail:
@@ -1715,37 +1944,37 @@ bail:
1715 return status; 1944 return status;
1716} 1945}
1717 1946
1718int ocfs2_claim_metadata(struct ocfs2_super *osb, 1947int ocfs2_claim_metadata(handle_t *handle,
1719 handle_t *handle,
1720 struct ocfs2_alloc_context *ac, 1948 struct ocfs2_alloc_context *ac,
1721 u32 bits_wanted, 1949 u32 bits_wanted,
1950 u64 *suballoc_loc,
1722 u16 *suballoc_bit_start, 1951 u16 *suballoc_bit_start,
1723 unsigned int *num_bits, 1952 unsigned int *num_bits,
1724 u64 *blkno_start) 1953 u64 *blkno_start)
1725{ 1954{
1726 int status; 1955 int status;
1727 u64 bg_blkno; 1956 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1728 1957
1729 BUG_ON(!ac); 1958 BUG_ON(!ac);
1730 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); 1959 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1731 BUG_ON(ac->ac_which != OCFS2_AC_USE_META); 1960 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1732 1961
1733 status = ocfs2_claim_suballoc_bits(osb, 1962 status = ocfs2_claim_suballoc_bits(ac,
1734 ac,
1735 handle, 1963 handle,
1736 bits_wanted, 1964 bits_wanted,
1737 1, 1965 1,
1738 suballoc_bit_start, 1966 &res);
1739 num_bits,
1740 &bg_blkno);
1741 if (status < 0) { 1967 if (status < 0) {
1742 mlog_errno(status); 1968 mlog_errno(status);
1743 goto bail; 1969 goto bail;
1744 } 1970 }
1745 atomic_inc(&osb->alloc_stats.bg_allocs); 1971 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1746 1972
1747 *blkno_start = bg_blkno + (u64) *suballoc_bit_start; 1973 *suballoc_loc = res.sr_bg_blkno;
1748 ac->ac_bits_given += (*num_bits); 1974 *suballoc_bit_start = res.sr_bit_offset;
1975 *blkno_start = res.sr_blkno;
1976 ac->ac_bits_given += res.sr_bits;
1977 *num_bits = res.sr_bits;
1749 status = 0; 1978 status = 0;
1750bail: 1979bail:
1751 mlog_exit(status); 1980 mlog_exit(status);
@@ -1753,10 +1982,10 @@ bail:
1753} 1982}
1754 1983
1755static void ocfs2_init_inode_ac_group(struct inode *dir, 1984static void ocfs2_init_inode_ac_group(struct inode *dir,
1756 struct buffer_head *parent_fe_bh, 1985 struct buffer_head *parent_di_bh,
1757 struct ocfs2_alloc_context *ac) 1986 struct ocfs2_alloc_context *ac)
1758{ 1987{
1759 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data; 1988 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
1760 /* 1989 /*
1761 * Try to allocate inodes from some specific group. 1990 * Try to allocate inodes from some specific group.
1762 * 1991 *
@@ -1770,10 +1999,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
1770 if (OCFS2_I(dir)->ip_last_used_group && 1999 if (OCFS2_I(dir)->ip_last_used_group &&
1771 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2000 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1772 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2001 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1773 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot) 2002 else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
1774 ac->ac_last_group = ocfs2_which_suballoc_group( 2003 if (di->i_suballoc_loc)
1775 le64_to_cpu(fe->i_blkno), 2004 ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
1776 le16_to_cpu(fe->i_suballoc_bit)); 2005 else
2006 ac->ac_last_group = ocfs2_which_suballoc_group(
2007 le64_to_cpu(di->i_blkno),
2008 le16_to_cpu(di->i_suballoc_bit));
2009 }
1777} 2010}
1778 2011
1779static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2012static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1783,17 +2016,16 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1783 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2016 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1784} 2017}
1785 2018
1786int ocfs2_claim_new_inode(struct ocfs2_super *osb, 2019int ocfs2_claim_new_inode(handle_t *handle,
1787 handle_t *handle,
1788 struct inode *dir, 2020 struct inode *dir,
1789 struct buffer_head *parent_fe_bh, 2021 struct buffer_head *parent_fe_bh,
1790 struct ocfs2_alloc_context *ac, 2022 struct ocfs2_alloc_context *ac,
2023 u64 *suballoc_loc,
1791 u16 *suballoc_bit, 2024 u16 *suballoc_bit,
1792 u64 *fe_blkno) 2025 u64 *fe_blkno)
1793{ 2026{
1794 int status; 2027 int status;
1795 unsigned int num_bits; 2028 struct ocfs2_suballoc_result res;
1796 u64 bg_blkno;
1797 2029
1798 mlog_entry_void(); 2030 mlog_entry_void();
1799 2031
@@ -1804,23 +2036,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1804 2036
1805 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2037 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1806 2038
1807 status = ocfs2_claim_suballoc_bits(osb, 2039 status = ocfs2_claim_suballoc_bits(ac,
1808 ac,
1809 handle, 2040 handle,
1810 1, 2041 1,
1811 1, 2042 1,
1812 suballoc_bit, 2043 &res);
1813 &num_bits,
1814 &bg_blkno);
1815 if (status < 0) { 2044 if (status < 0) {
1816 mlog_errno(status); 2045 mlog_errno(status);
1817 goto bail; 2046 goto bail;
1818 } 2047 }
1819 atomic_inc(&osb->alloc_stats.bg_allocs); 2048 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1820 2049
1821 BUG_ON(num_bits != 1); 2050 BUG_ON(res.sr_bits != 1);
1822 2051
1823 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 2052 *suballoc_loc = res.sr_bg_blkno;
2053 *suballoc_bit = res.sr_bit_offset;
2054 *fe_blkno = res.sr_blkno;
1824 ac->ac_bits_given++; 2055 ac->ac_bits_given++;
1825 ocfs2_save_inode_ac_group(dir, ac); 2056 ocfs2_save_inode_ac_group(dir, ac);
1826 status = 0; 2057 status = 0;
@@ -1890,8 +2121,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1890 * contig. allocation, set to '1' to indicate we can deal with extents 2121 * contig. allocation, set to '1' to indicate we can deal with extents
1891 * of any size. 2122 * of any size.
1892 */ 2123 */
1893int __ocfs2_claim_clusters(struct ocfs2_super *osb, 2124int __ocfs2_claim_clusters(handle_t *handle,
1894 handle_t *handle,
1895 struct ocfs2_alloc_context *ac, 2125 struct ocfs2_alloc_context *ac,
1896 u32 min_clusters, 2126 u32 min_clusters,
1897 u32 max_clusters, 2127 u32 max_clusters,
@@ -1900,8 +2130,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1900{ 2130{
1901 int status; 2131 int status;
1902 unsigned int bits_wanted = max_clusters; 2132 unsigned int bits_wanted = max_clusters;
1903 u64 bg_blkno = 0; 2133 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1904 u16 bg_bit_off; 2134 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
1905 2135
1906 mlog_entry_void(); 2136 mlog_entry_void();
1907 2137
@@ -1911,6 +2141,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1911 && ac->ac_which != OCFS2_AC_USE_MAIN); 2141 && ac->ac_which != OCFS2_AC_USE_MAIN);
1912 2142
1913 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 2143 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2144 WARN_ON(min_clusters > 1);
2145
1914 status = ocfs2_claim_local_alloc_bits(osb, 2146 status = ocfs2_claim_local_alloc_bits(osb,
1915 handle, 2147 handle,
1916 ac, 2148 ac,
@@ -1933,20 +2165,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1933 if (bits_wanted > (osb->bitmap_cpg - 1)) 2165 if (bits_wanted > (osb->bitmap_cpg - 1))
1934 bits_wanted = osb->bitmap_cpg - 1; 2166 bits_wanted = osb->bitmap_cpg - 1;
1935 2167
1936 status = ocfs2_claim_suballoc_bits(osb, 2168 status = ocfs2_claim_suballoc_bits(ac,
1937 ac,
1938 handle, 2169 handle,
1939 bits_wanted, 2170 bits_wanted,
1940 min_clusters, 2171 min_clusters,
1941 &bg_bit_off, 2172 &res);
1942 num_clusters,
1943 &bg_blkno);
1944 if (!status) { 2173 if (!status) {
2174 BUG_ON(res.sr_blkno); /* cluster alloc can't set */
1945 *cluster_start = 2175 *cluster_start =
1946 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 2176 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1947 bg_blkno, 2177 res.sr_bg_blkno,
1948 bg_bit_off); 2178 res.sr_bit_offset);
1949 atomic_inc(&osb->alloc_stats.bitmap_data); 2179 atomic_inc(&osb->alloc_stats.bitmap_data);
2180 *num_clusters = res.sr_bits;
1950 } 2181 }
1951 } 2182 }
1952 if (status < 0) { 2183 if (status < 0) {
@@ -1962,8 +2193,7 @@ bail:
1962 return status; 2193 return status;
1963} 2194}
1964 2195
1965int ocfs2_claim_clusters(struct ocfs2_super *osb, 2196int ocfs2_claim_clusters(handle_t *handle,
1966 handle_t *handle,
1967 struct ocfs2_alloc_context *ac, 2197 struct ocfs2_alloc_context *ac,
1968 u32 min_clusters, 2198 u32 min_clusters,
1969 u32 *cluster_start, 2199 u32 *cluster_start,
@@ -1971,22 +2201,22 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1971{ 2201{
1972 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 2202 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1973 2203
1974 return __ocfs2_claim_clusters(osb, handle, ac, min_clusters, 2204 return __ocfs2_claim_clusters(handle, ac, min_clusters,
1975 bits_wanted, cluster_start, num_clusters); 2205 bits_wanted, cluster_start, num_clusters);
1976} 2206}
1977 2207
1978static inline int ocfs2_block_group_clear_bits(handle_t *handle, 2208static int ocfs2_block_group_clear_bits(handle_t *handle,
1979 struct inode *alloc_inode, 2209 struct inode *alloc_inode,
1980 struct ocfs2_group_desc *bg, 2210 struct ocfs2_group_desc *bg,
1981 struct buffer_head *group_bh, 2211 struct buffer_head *group_bh,
1982 unsigned int bit_off, 2212 unsigned int bit_off,
1983 unsigned int num_bits) 2213 unsigned int num_bits,
2214 void (*undo_fn)(unsigned int bit,
2215 unsigned long *bmap))
1984{ 2216{
1985 int status; 2217 int status;
1986 unsigned int tmp; 2218 unsigned int tmp;
1987 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1988 struct ocfs2_group_desc *undo_bg = NULL; 2219 struct ocfs2_group_desc *undo_bg = NULL;
1989 int cluster_bitmap = 0;
1990 2220
1991 mlog_entry_void(); 2221 mlog_entry_void();
1992 2222
@@ -1996,20 +2226,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1996 2226
1997 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 2227 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1998 2228
1999 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2229 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2000 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
2001
2002 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 2230 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2003 group_bh, journal_type); 2231 group_bh,
2232 undo_fn ?
2233 OCFS2_JOURNAL_ACCESS_UNDO :
2234 OCFS2_JOURNAL_ACCESS_WRITE);
2004 if (status < 0) { 2235 if (status < 0) {
2005 mlog_errno(status); 2236 mlog_errno(status);
2006 goto bail; 2237 goto bail;
2007 } 2238 }
2008 2239
2009 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2240 if (undo_fn) {
2010 cluster_bitmap = 1;
2011
2012 if (cluster_bitmap) {
2013 jbd_lock_bh_state(group_bh); 2241 jbd_lock_bh_state(group_bh);
2014 undo_bg = (struct ocfs2_group_desc *) 2242 undo_bg = (struct ocfs2_group_desc *)
2015 bh2jh(group_bh)->b_committed_data; 2243 bh2jh(group_bh)->b_committed_data;
@@ -2020,18 +2248,16 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
2020 while(tmp--) { 2248 while(tmp--) {
2021 ocfs2_clear_bit((bit_off + tmp), 2249 ocfs2_clear_bit((bit_off + tmp),
2022 (unsigned long *) bg->bg_bitmap); 2250 (unsigned long *) bg->bg_bitmap);
2023 if (cluster_bitmap) 2251 if (undo_fn)
2024 ocfs2_set_bit(bit_off + tmp, 2252 undo_fn(bit_off + tmp,
2025 (unsigned long *) undo_bg->bg_bitmap); 2253 (unsigned long *) undo_bg->bg_bitmap);
2026 } 2254 }
2027 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2255 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2028 2256
2029 if (cluster_bitmap) 2257 if (undo_fn)
2030 jbd_unlock_bh_state(group_bh); 2258 jbd_unlock_bh_state(group_bh);
2031 2259
2032 status = ocfs2_journal_dirty(handle, group_bh); 2260 ocfs2_journal_dirty(handle, group_bh);
2033 if (status < 0)
2034 mlog_errno(status);
2035bail: 2261bail:
2036 return status; 2262 return status;
2037} 2263}
@@ -2039,12 +2265,14 @@ bail:
2039/* 2265/*
2040 * expects the suballoc inode to already be locked. 2266 * expects the suballoc inode to already be locked.
2041 */ 2267 */
2042int ocfs2_free_suballoc_bits(handle_t *handle, 2268static int _ocfs2_free_suballoc_bits(handle_t *handle,
2043 struct inode *alloc_inode, 2269 struct inode *alloc_inode,
2044 struct buffer_head *alloc_bh, 2270 struct buffer_head *alloc_bh,
2045 unsigned int start_bit, 2271 unsigned int start_bit,
2046 u64 bg_blkno, 2272 u64 bg_blkno,
2047 unsigned int count) 2273 unsigned int count,
2274 void (*undo_fn)(unsigned int bit,
2275 unsigned long *bitmap))
2048{ 2276{
2049 int status = 0; 2277 int status = 0;
2050 u32 tmp_used; 2278 u32 tmp_used;
@@ -2079,7 +2307,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
2079 2307
2080 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2308 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2081 group, group_bh, 2309 group, group_bh,
2082 start_bit, count); 2310 start_bit, count, undo_fn);
2083 if (status < 0) { 2311 if (status < 0) {
2084 mlog_errno(status); 2312 mlog_errno(status);
2085 goto bail; 2313 goto bail;
@@ -2096,12 +2324,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
2096 count); 2324 count);
2097 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2325 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2098 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2326 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2099 2327 ocfs2_journal_dirty(handle, alloc_bh);
2100 status = ocfs2_journal_dirty(handle, alloc_bh);
2101 if (status < 0) {
2102 mlog_errno(status);
2103 goto bail;
2104 }
2105 2328
2106bail: 2329bail:
2107 brelse(group_bh); 2330 brelse(group_bh);
@@ -2110,6 +2333,17 @@ bail:
2110 return status; 2333 return status;
2111} 2334}
2112 2335
2336int ocfs2_free_suballoc_bits(handle_t *handle,
2337 struct inode *alloc_inode,
2338 struct buffer_head *alloc_bh,
2339 unsigned int start_bit,
2340 u64 bg_blkno,
2341 unsigned int count)
2342{
2343 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2344 start_bit, bg_blkno, count, NULL);
2345}
2346
2113int ocfs2_free_dinode(handle_t *handle, 2347int ocfs2_free_dinode(handle_t *handle,
2114 struct inode *inode_alloc_inode, 2348 struct inode *inode_alloc_inode,
2115 struct buffer_head *inode_alloc_bh, 2349 struct buffer_head *inode_alloc_bh,
@@ -2119,15 +2353,19 @@ int ocfs2_free_dinode(handle_t *handle,
2119 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2353 u16 bit = le16_to_cpu(di->i_suballoc_bit);
2120 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2354 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2121 2355
2356 if (di->i_suballoc_loc)
2357 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2122 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2358 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2123 inode_alloc_bh, bit, bg_blkno, 1); 2359 inode_alloc_bh, bit, bg_blkno, 1);
2124} 2360}
2125 2361
2126int ocfs2_free_clusters(handle_t *handle, 2362static int _ocfs2_free_clusters(handle_t *handle,
2127 struct inode *bitmap_inode, 2363 struct inode *bitmap_inode,
2128 struct buffer_head *bitmap_bh, 2364 struct buffer_head *bitmap_bh,
2129 u64 start_blk, 2365 u64 start_blk,
2130 unsigned int num_clusters) 2366 unsigned int num_clusters,
2367 void (*undo_fn)(unsigned int bit,
2368 unsigned long *bitmap))
2131{ 2369{
2132 int status; 2370 int status;
2133 u16 bg_start_bit; 2371 u16 bg_start_bit;
@@ -2154,9 +2392,9 @@ int ocfs2_free_clusters(handle_t *handle,
2154 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2392 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2155 (unsigned long long)bg_blkno, bg_start_bit); 2393 (unsigned long long)bg_blkno, bg_start_bit);
2156 2394
2157 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2395 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2158 bg_start_bit, bg_blkno, 2396 bg_start_bit, bg_blkno,
2159 num_clusters); 2397 num_clusters, undo_fn);
2160 if (status < 0) { 2398 if (status < 0) {
2161 mlog_errno(status); 2399 mlog_errno(status);
2162 goto out; 2400 goto out;
@@ -2170,6 +2408,32 @@ out:
2170 return status; 2408 return status;
2171} 2409}
2172 2410
2411int ocfs2_free_clusters(handle_t *handle,
2412 struct inode *bitmap_inode,
2413 struct buffer_head *bitmap_bh,
2414 u64 start_blk,
2415 unsigned int num_clusters)
2416{
2417 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2418 start_blk, num_clusters,
2419 _ocfs2_set_bit);
2420}
2421
2422/*
2423 * Give never-used clusters back to the global bitmap. We don't need
2424 * to protect these bits in the undo buffer.
2425 */
2426int ocfs2_release_clusters(handle_t *handle,
2427 struct inode *bitmap_inode,
2428 struct buffer_head *bitmap_bh,
2429 u64 start_blk,
2430 unsigned int num_clusters)
2431{
2432 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2433 start_blk, num_clusters,
2434 _ocfs2_clear_bit);
2435}
2436
2173static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2437static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2174{ 2438{
2175 printk("Block Group:\n"); 2439 printk("Block Group:\n");
@@ -2360,7 +2624,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2360 struct buffer_head *alloc_bh, u64 blkno, 2624 struct buffer_head *alloc_bh, u64 blkno,
2361 u16 bit, int *res) 2625 u16 bit, int *res)
2362{ 2626{
2363 struct ocfs2_dinode *alloc_fe; 2627 struct ocfs2_dinode *alloc_di;
2364 struct ocfs2_group_desc *group; 2628 struct ocfs2_group_desc *group;
2365 struct buffer_head *group_bh = NULL; 2629 struct buffer_head *group_bh = NULL;
2366 u64 bg_blkno; 2630 u64 bg_blkno;
@@ -2369,17 +2633,20 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2369 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, 2633 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2370 (unsigned int)bit); 2634 (unsigned int)bit);
2371 2635
2372 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; 2636 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2373 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { 2637 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2374 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 2638 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2375 (unsigned int)bit, 2639 (unsigned int)bit,
2376 ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); 2640 ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2377 status = -EINVAL; 2641 status = -EINVAL;
2378 goto bail; 2642 goto bail;
2379 } 2643 }
2380 2644
2381 bg_blkno = ocfs2_which_suballoc_group(blkno, bit); 2645 if (alloc_di->i_suballoc_loc)
2382 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, 2646 bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
2647 else
2648 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2649 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2383 &group_bh); 2650 &group_bh);
2384 if (status < 0) { 2651 if (status < 0) {
2385 mlog(ML_ERROR, "read group %llu failed %d\n", 2652 mlog(ML_ERROR, "read group %llu failed %d\n",
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index fa60723c43e8..a017dd3ee7d9 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
26#ifndef _CHAINALLOC_H_ 26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_ 27#define _CHAINALLOC_H_
28 28
29struct ocfs2_suballoc_result;
29typedef int (group_search_t)(struct inode *, 30typedef int (group_search_t)(struct inode *,
30 struct buffer_head *, 31 struct buffer_head *,
31 u32, /* bits_wanted */ 32 u32, /* bits_wanted */
32 u32, /* min_bits */ 33 u32, /* min_bits */
33 u64, /* max_block */ 34 u64, /* max_block */
34 u16 *, /* *bit_off */ 35 struct ocfs2_suballoc_result *);
35 u16 *); /* *bits_found */ 36 /* found bits */
36 37
37struct ocfs2_alloc_context { 38struct ocfs2_alloc_context {
38 struct inode *ac_inode; /* which bitmap are we allocating from? */ 39 struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -54,6 +55,8 @@ struct ocfs2_alloc_context {
54 u64 ac_last_group; 55 u64 ac_last_group;
55 u64 ac_max_block; /* Highest block number to allocate. 0 is 56 u64 ac_max_block; /* Highest block number to allocate. 0 is
56 is the same as ~0 - unlimited */ 57 is the same as ~0 - unlimited */
58
59 struct ocfs2_alloc_reservation *ac_resv;
57}; 60};
58 61
59void ocfs2_init_steal_slots(struct ocfs2_super *osb); 62void ocfs2_init_steal_slots(struct ocfs2_super *osb);
@@ -80,22 +83,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
80 u32 bits_wanted, 83 u32 bits_wanted,
81 struct ocfs2_alloc_context **ac); 84 struct ocfs2_alloc_context **ac);
82 85
83int ocfs2_claim_metadata(struct ocfs2_super *osb, 86int ocfs2_claim_metadata(handle_t *handle,
84 handle_t *handle,
85 struct ocfs2_alloc_context *ac, 87 struct ocfs2_alloc_context *ac,
86 u32 bits_wanted, 88 u32 bits_wanted,
89 u64 *suballoc_loc,
87 u16 *suballoc_bit_start, 90 u16 *suballoc_bit_start,
88 u32 *num_bits, 91 u32 *num_bits,
89 u64 *blkno_start); 92 u64 *blkno_start);
90int ocfs2_claim_new_inode(struct ocfs2_super *osb, 93int ocfs2_claim_new_inode(handle_t *handle,
91 handle_t *handle,
92 struct inode *dir, 94 struct inode *dir,
93 struct buffer_head *parent_fe_bh, 95 struct buffer_head *parent_fe_bh,
94 struct ocfs2_alloc_context *ac, 96 struct ocfs2_alloc_context *ac,
97 u64 *suballoc_loc,
95 u16 *suballoc_bit, 98 u16 *suballoc_bit,
96 u64 *fe_blkno); 99 u64 *fe_blkno);
97int ocfs2_claim_clusters(struct ocfs2_super *osb, 100int ocfs2_claim_clusters(handle_t *handle,
98 handle_t *handle,
99 struct ocfs2_alloc_context *ac, 101 struct ocfs2_alloc_context *ac,
100 u32 min_clusters, 102 u32 min_clusters,
101 u32 *cluster_start, 103 u32 *cluster_start,
@@ -104,8 +106,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
104 * Use this variant of ocfs2_claim_clusters to specify a maximum 106 * Use this variant of ocfs2_claim_clusters to specify a maximum
105 * number of clusters smaller than the allocation reserved. 107 * number of clusters smaller than the allocation reserved.
106 */ 108 */
107int __ocfs2_claim_clusters(struct ocfs2_super *osb, 109int __ocfs2_claim_clusters(handle_t *handle,
108 handle_t *handle,
109 struct ocfs2_alloc_context *ac, 110 struct ocfs2_alloc_context *ac,
110 u32 min_clusters, 111 u32 min_clusters,
111 u32 max_clusters, 112 u32 max_clusters,
@@ -127,6 +128,11 @@ int ocfs2_free_clusters(handle_t *handle,
127 struct buffer_head *bitmap_bh, 128 struct buffer_head *bitmap_bh,
128 u64 start_blk, 129 u64 start_blk,
129 unsigned int num_clusters); 130 unsigned int num_clusters);
131int ocfs2_release_clusters(handle_t *handle,
132 struct inode *bitmap_inode,
133 struct buffer_head *bitmap_bh,
134 u64 start_blk,
135 unsigned int num_clusters);
130 136
131static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) 137static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
132{ 138{
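The header now exposes ocfs2_release_clusters() next to ocfs2_free_clusters(); per the suballoc.c hunks, both route to _ocfs2_free_clusters() and differ only in the bit callback threaded down to _ocfs2_free_suballoc_bits() — mirror freed bits into the journal's undo copy versus simply clearing them, since never-used clusters need no undo protection. A hypothetical caller-side sketch; link_clusters_into_tree(), bitmap_inode, and bitmap_bh are illustrative placeholders, not names from this patch:

	u32 cpos, got;
	int status;

	status = ocfs2_claim_clusters(handle, data_ac, 1, &cpos, &got);
	if (status < 0)
		goto out;

	status = link_clusters_into_tree(inode, handle, cpos, got);
	if (status < 0)
		/* Claimed in this transaction but never used: hand the
		 * bits back without journal undo protection. */
		ocfs2_release_clusters(handle, bitmap_inode, bitmap_bh,
				       ocfs2_clusters_to_blocks(inode->i_sb,
								cpos),
				       got);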
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index dee03197a494..0eaa929a4dbf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -94,7 +94,9 @@ struct mount_options
94 unsigned long mount_opt; 94 unsigned long mount_opt;
95 unsigned int atime_quantum; 95 unsigned int atime_quantum;
96 signed short slot; 96 signed short slot;
97 unsigned int localalloc_opt; 97 int localalloc_opt;
98 unsigned int resv_level;
99 int dir_resv_level;
98 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 100 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
99}; 101};
100 102
@@ -176,6 +178,8 @@ enum {
176 Opt_noacl, 178 Opt_noacl,
177 Opt_usrquota, 179 Opt_usrquota,
178 Opt_grpquota, 180 Opt_grpquota,
181 Opt_resv_level,
182 Opt_dir_resv_level,
179 Opt_err, 183 Opt_err,
180}; 184};
181 185
@@ -202,6 +206,8 @@ static const match_table_t tokens = {
202 {Opt_noacl, "noacl"}, 206 {Opt_noacl, "noacl"},
203 {Opt_usrquota, "usrquota"}, 207 {Opt_usrquota, "usrquota"},
204 {Opt_grpquota, "grpquota"}, 208 {Opt_grpquota, "grpquota"},
209 {Opt_resv_level, "resv_level=%u"},
210 {Opt_dir_resv_level, "dir_resv_level=%u"},
205 {Opt_err, NULL} 211 {Opt_err, NULL}
206}; 212};
207 213
@@ -873,13 +879,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
873 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 879 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
874 continue; 880 continue;
875 if (unsuspend) 881 if (unsuspend)
876 status = vfs_quota_enable( 882 status = dquot_resume(sb, type);
877 sb_dqopt(sb)->files[type], 883 else {
878 type, QFMT_OCFS2, 884 struct ocfs2_mem_dqinfo *oinfo;
879 DQUOT_SUSPENDED); 885
880 else 886 /* Cancel periodic syncing before suspending */
881 status = vfs_quota_disable(sb, type, 887 oinfo = sb_dqinfo(sb, type)->dqi_priv;
882 DQUOT_SUSPENDED); 888 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
889 status = dquot_suspend(sb, type);
890 }
883 if (status < 0) 891 if (status < 0)
884 break; 892 break;
885 } 893 }
@@ -910,8 +918,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
910 status = -ENOENT; 918 status = -ENOENT;
911 goto out_quota_off; 919 goto out_quota_off;
912 } 920 }
913 status = vfs_quota_enable(inode[type], type, QFMT_OCFS2, 921 status = dquot_enable(inode[type], type, QFMT_OCFS2,
914 DQUOT_USAGE_ENABLED); 922 DQUOT_USAGE_ENABLED);
915 if (status < 0) 923 if (status < 0)
916 goto out_quota_off; 924 goto out_quota_off;
917 } 925 }
@@ -932,18 +940,22 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
932 int type; 940 int type;
933 struct inode *inode; 941 struct inode *inode;
934 struct super_block *sb = osb->sb; 942 struct super_block *sb = osb->sb;
943 struct ocfs2_mem_dqinfo *oinfo;
935 944
936 /* We mostly ignore errors in this function because there's not much 945 /* We mostly ignore errors in this function because there's not much
937 * we can do when we see them */ 946 * we can do when we see them */
938 for (type = 0; type < MAXQUOTAS; type++) { 947 for (type = 0; type < MAXQUOTAS; type++) {
939 if (!sb_has_quota_loaded(sb, type)) 948 if (!sb_has_quota_loaded(sb, type))
940 continue; 949 continue;
950 /* Cancel periodic syncing before we grab dqonoff_mutex */
951 oinfo = sb_dqinfo(sb, type)->dqi_priv;
952 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
941 inode = igrab(sb->s_dquot.files[type]); 953 inode = igrab(sb->s_dquot.files[type]);
942 /* Turn off quotas. This will remove all dquot structures from 954 /* Turn off quotas. This will remove all dquot structures from
943 * memory and so they will be automatically synced to global 955 * memory and so they will be automatically synced to global
944 * quota files */ 956 * quota files */
945 vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED | 957 dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
946 DQUOT_LIMITS_ENABLED); 958 DQUOT_LIMITS_ENABLED);
947 if (!inode) 959 if (!inode)
948 continue; 960 continue;
949 iput(inode); 961 iput(inode);
@@ -952,7 +964,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
952 964
953/* Handle quota on quotactl */ 965/* Handle quota on quotactl */
954static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 966static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
955 char *path, int remount) 967 char *path)
956{ 968{
957 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 969 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
958 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 970 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -960,30 +972,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
960 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 972 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
961 return -EINVAL; 973 return -EINVAL;
962 974
963 if (remount) 975 return dquot_enable(sb_dqopt(sb)->files[type], type,
964 return 0; /* Just ignore it has been handled in 976 format_id, DQUOT_LIMITS_ENABLED);
965 * ocfs2_remount() */
966 return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
967 format_id, DQUOT_LIMITS_ENABLED);
968} 977}
969 978
970/* Handle quota off quotactl */ 979/* Handle quota off quotactl */
971static int ocfs2_quota_off(struct super_block *sb, int type, int remount) 980static int ocfs2_quota_off(struct super_block *sb, int type)
972{ 981{
973 if (remount) 982 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
974 return 0; /* Ignore now and handle later in
975 * ocfs2_remount() */
976 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
977} 983}
978 984
979static const struct quotactl_ops ocfs2_quotactl_ops = { 985static const struct quotactl_ops ocfs2_quotactl_ops = {
980 .quota_on = ocfs2_quota_on, 986 .quota_on = ocfs2_quota_on,
981 .quota_off = ocfs2_quota_off, 987 .quota_off = ocfs2_quota_off,
982 .quota_sync = vfs_quota_sync, 988 .quota_sync = dquot_quota_sync,
983 .get_info = vfs_get_dqinfo, 989 .get_info = dquot_get_dqinfo,
984 .set_info = vfs_set_dqinfo, 990 .set_info = dquot_set_dqinfo,
985 .get_dqblk = vfs_get_dqblk, 991 .get_dqblk = dquot_get_dqblk,
986 .set_dqblk = vfs_set_dqblk, 992 .set_dqblk = dquot_set_dqblk,
987}; 993};
988 994
989static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 995static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
@@ -1028,8 +1034,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1028 osb->s_atime_quantum = parsed_options.atime_quantum; 1034 osb->s_atime_quantum = parsed_options.atime_quantum;
1029 osb->preferred_slot = parsed_options.slot; 1035 osb->preferred_slot = parsed_options.slot;
1030 osb->osb_commit_interval = parsed_options.commit_interval; 1036 osb->osb_commit_interval = parsed_options.commit_interval;
1031 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 1037
1032 osb->local_alloc_bits = osb->local_alloc_default_bits; 1038 ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
1039 osb->osb_resv_level = parsed_options.resv_level;
1040 osb->osb_dir_resv_level = parsed_options.resv_level;
1041 if (parsed_options.dir_resv_level == -1)
1042 osb->osb_dir_resv_level = parsed_options.resv_level;
1043 else
1044 osb->osb_dir_resv_level = parsed_options.dir_resv_level;
1033 1045
1034 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1046 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1035 if (status) 1047 if (status)
@@ -1285,11 +1297,13 @@ static int ocfs2_parse_options(struct super_block *sb,
1285 options ? options : "(none)"); 1297 options ? options : "(none)");
1286 1298
1287 mopt->commit_interval = 0; 1299 mopt->commit_interval = 0;
1288 mopt->mount_opt = 0; 1300 mopt->mount_opt = OCFS2_MOUNT_NOINTR;
1289 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1301 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1290 mopt->slot = OCFS2_INVALID_SLOT; 1302 mopt->slot = OCFS2_INVALID_SLOT;
1291 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 1303 mopt->localalloc_opt = -1;
1292 mopt->cluster_stack[0] = '\0'; 1304 mopt->cluster_stack[0] = '\0';
1305 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
1306 mopt->dir_resv_level = -1;
1293 1307
1294 if (!options) { 1308 if (!options) {
1295 status = 1; 1309 status = 1;
@@ -1380,7 +1394,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1380 status = 0; 1394 status = 0;
1381 goto bail; 1395 goto bail;
1382 } 1396 }
1383 if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8)) 1397 if (option >= 0)
1384 mopt->localalloc_opt = option; 1398 mopt->localalloc_opt = option;
1385 break; 1399 break;
1386 case Opt_localflocks: 1400 case Opt_localflocks:
@@ -1433,6 +1447,28 @@ static int ocfs2_parse_options(struct super_block *sb,
1433 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; 1447 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1434 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1448 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1435 break; 1449 break;
1450 case Opt_resv_level:
1451 if (is_remount)
1452 break;
1453 if (match_int(&args[0], &option)) {
1454 status = 0;
1455 goto bail;
1456 }
1457 if (option >= OCFS2_MIN_RESV_LEVEL &&
1458 option < OCFS2_MAX_RESV_LEVEL)
1459 mopt->resv_level = option;
1460 break;
1461 case Opt_dir_resv_level:
1462 if (is_remount)
1463 break;
1464 if (match_int(&args[0], &option)) {
1465 status = 0;
1466 goto bail;
1467 }
1468 if (option >= OCFS2_MIN_RESV_LEVEL &&
1469 option < OCFS2_MAX_RESV_LEVEL)
1470 mopt->dir_resv_level = option;
1471 break;
1436 default: 1472 default:
1437 mlog(ML_ERROR, 1473 mlog(ML_ERROR,
1438 "Unrecognized mount option \"%s\" " 1474 "Unrecognized mount option \"%s\" "
@@ -1487,7 +1523,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1487 (unsigned) (osb->osb_commit_interval / HZ)); 1523 (unsigned) (osb->osb_commit_interval / HZ));
1488 1524
1489 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); 1525 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
1490 if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) 1526 if (local_alloc_megs != ocfs2_la_default_mb(osb))
1491 seq_printf(s, ",localalloc=%d", local_alloc_megs); 1527 seq_printf(s, ",localalloc=%d", local_alloc_megs);
1492 1528
1493 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 1529 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1514,6 +1550,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1514 else 1550 else
1515 seq_printf(s, ",noacl"); 1551 seq_printf(s, ",noacl");
1516 1552
1553 if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
1554 seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
1555
1556 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1557 seq_printf(s, ",dir_resv_level=%d", osb->osb_dir_resv_level);
1558
1517 return 0; 1559 return 0;
1518} 1560}
1519 1561
@@ -1688,6 +1730,8 @@ static void ocfs2_inode_init_once(void *data)
1688 oi->ip_blkno = 0ULL; 1730 oi->ip_blkno = 0ULL;
1689 oi->ip_clusters = 0; 1731 oi->ip_clusters = 0;
1690 1732
1733 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1734
1691 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1735 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1692 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1736 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1693 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1737 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -2042,6 +2086,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
2042 2086
2043 init_waitqueue_head(&osb->osb_mount_event); 2087 init_waitqueue_head(&osb->osb_mount_event);
2044 2088
2089 status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
2090 if (status) {
2091 mlog_errno(status);
2092 goto bail;
2093 }
2094
2045 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 2095 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
2046 if (!osb->vol_label) { 2096 if (!osb->vol_label) {
2047 mlog(ML_ERROR, "unable to alloc vol label\n"); 2097 mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2224,9 +2274,11 @@ static int ocfs2_initialize_super(struct super_block *sb,
2224 } 2274 }
2225 2275
2226 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 2276 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
2277 osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
2227 iput(inode); 2278 iput(inode);
2228 2279
2229 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8; 2280 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
2281 osb->s_feature_incompat) * 8;
2230 2282
2231 status = ocfs2_init_slot_info(osb); 2283 status = ocfs2_init_slot_info(osb);
2232 if (status < 0) { 2284 if (status < 0) {
@@ -2509,5 +2561,25 @@ void __ocfs2_abort(struct super_block* sb,
2509 ocfs2_handle_error(sb); 2561 ocfs2_handle_error(sb);
2510} 2562}
2511 2563
2564/*
2565 * Void signal blockers, because in-kernel sigprocmask() only fails
2566 * when SIG_* is wrong.
2567 */
2568void ocfs2_block_signals(sigset_t *oldset)
2569{
2570 int rc;
2571 sigset_t blocked;
2572
2573 sigfillset(&blocked);
2574 rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
2575 BUG_ON(rc);
2576}
2577
2578void ocfs2_unblock_signals(sigset_t *oldset)
2579{
2580 int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
2581 BUG_ON(rc);
2582}
2583
2512module_init(ocfs2_init); 2584module_init(ocfs2_init);
2513module_exit(ocfs2_exit); 2585module_exit(ocfs2_exit);
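One ordering constraint in the ocfs2_susp_quotas()/ocfs2_disable_quotas() hunks above is easy to miss: the per-type delayed sync work must be cancelled before handing off to the generic dquot code, otherwise the periodic worker could fire against quota state that is mid-suspend or mid-teardown. Condensed from the hunks (error handling elided; everything shown appears in the diff):

	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;

	/* Stop the periodic syncer first... */
	cancel_delayed_work_sync(&oinfo->dqi_sync_work);
	/* ...then let the generic dquot code suspend the type. */
	status = dquot_suspend(sb, type);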
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..40c7de084c10 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48/*
49 * Void signal blockers, because in-kernel sigprocmask() only fails
50 * when SIG_* is wrong.
51 */
52void ocfs2_block_signals(sigset_t *oldset);
53void ocfs2_unblock_signals(sigset_t *oldset);
54
48#endif /* OCFS2_SUPER_H */ 55#endif /* OCFS2_SUPER_H */
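A usage sketch for the pair declared above: oldset carries the caller's original mask across the critical section. The actual call sites this change adds live elsewhere in ocfs2; the body between the two calls here is illustrative only:

	sigset_t oldset;

	ocfs2_block_signals(&oldset);	/* mask all signals; cannot fail */
	/* ...work that must not be interrupted or restarted by a signal,
	 * e.g. taking and dropping a cluster lock... */
	ocfs2_unblock_signals(&oldset);	/* restore the caller's mask */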
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#define MLOG_MASK_PREFIX ML_INODE 30#define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1b0d386f6d1..e97b34842cfe 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
79 struct ocfs2_alloc_context *meta_ac; 79 struct ocfs2_alloc_context *meta_ac;
80 struct ocfs2_alloc_context *data_ac; 80 struct ocfs2_alloc_context *data_ac;
81 struct ocfs2_cached_dealloc_ctxt dealloc; 81 struct ocfs2_cached_dealloc_ctxt dealloc;
82 int set_abort;
82}; 83};
83 84
84#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 85#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
@@ -96,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = {
96 .xv.xr_list.l_count = cpu_to_le16(1), 97 .xv.xr_list.l_count = cpu_to_le16(1),
97}; 98};
98 99
99struct xattr_handler *ocfs2_xattr_handlers[] = { 100const struct xattr_handler *ocfs2_xattr_handlers[] = {
100 &ocfs2_xattr_user_handler, 101 &ocfs2_xattr_user_handler,
101 &ocfs2_xattr_acl_access_handler, 102 &ocfs2_xattr_acl_access_handler,
102 &ocfs2_xattr_acl_default_handler, 103 &ocfs2_xattr_acl_default_handler,
@@ -105,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
105 NULL 106 NULL
106}; 107};
107 108
108static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 109static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
109 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
110 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] 111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
111 = &ocfs2_xattr_acl_access_handler, 112 = &ocfs2_xattr_acl_access_handler,
@@ -539,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
539 540
540static inline const char *ocfs2_xattr_prefix(int name_index) 541static inline const char *ocfs2_xattr_prefix(int name_index)
541{ 542{
542 struct xattr_handler *handler = NULL; 543 const struct xattr_handler *handler = NULL;
543 544
544 if (name_index > 0 && name_index < OCFS2_XATTR_MAX) 545 if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
545 handler = ocfs2_xattr_handler_map[name_index]; 546 handler = ocfs2_xattr_handler_map[name_index];
@@ -739,11 +740,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
739 goto leave; 740 goto leave;
740 } 741 }
741 742
742 status = ocfs2_journal_dirty(handle, vb->vb_bh); 743 ocfs2_journal_dirty(handle, vb->vb_bh);
743 if (status < 0) {
744 mlog_errno(status);
745 goto leave;
746 }
747 744
748 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; 745 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
749 746
@@ -786,12 +783,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
786 } 783 }
787 784
788 le32_add_cpu(&vb->vb_xv->xr_clusters, -len); 785 le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
789 786 ocfs2_journal_dirty(handle, vb->vb_bh);
790 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
791 if (ret) {
792 mlog_errno(ret);
793 goto out;
794 }
795 787
796 if (ext_flags & OCFS2_EXT_REFCOUNTED) 788 if (ext_flags & OCFS2_EXT_REFCOUNTED)
797 ret = ocfs2_decrease_refcount(inode, handle, 789 ret = ocfs2_decrease_refcount(inode, handle,
@@ -1374,11 +1366,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1374 memset(bh->b_data + cp_len, 0, 1366 memset(bh->b_data + cp_len, 0,
1375 blocksize - cp_len); 1367 blocksize - cp_len);
1376 1368
1377 ret = ocfs2_journal_dirty(handle, bh); 1369 ocfs2_journal_dirty(handle, bh);
1378 if (ret < 0) {
1379 mlog_errno(ret);
1380 goto out;
1381 }
1382 brelse(bh); 1370 brelse(bh);
1383 bh = NULL; 1371 bh = NULL;
1384 1372
@@ -1622,7 +1610,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1622 /* Now tell xh->xh_entries about it */ 1610 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) { 1611 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); 1612 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 if (offset < namevalue_offset) 1613 if (offset <= namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, 1614 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size); 1615 namevalue_size);
1628 } 1616 }
@@ -2148,15 +2136,19 @@ alloc_value:
2148 orig_clusters = ocfs2_xa_value_clusters(loc); 2136 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); 2137 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) { 2138 if (rc < 0) {
2151 /* 2139 ctxt->set_abort = 1;
2152 * If we tried to grow an existing external value,
2153 * ocfs2_xa_cleanup_value_truncate() is going to
2154 * let it stand. We have to restore its original
2155 * value size.
2156 */
2157 loc->xl_entry->xe_value_size = orig_value_size;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing", 2140 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters); 2141 orig_clusters);
2142 /*
2143 * If we were growing an existing value,
2144 * ocfs2_xa_cleanup_value_truncate() won't remove
2145 * the entry. We need to restore the original value
2146 * size.
2147 */
2148 if (loc->xl_entry) {
2149 BUG_ON(!orig_value_size);
2150 loc->xl_entry->xe_value_size = orig_value_size;
2151 }
2160 mlog_errno(rc); 2152 mlog_errno(rc);
2161 } 2153 }
2162 } 2154 }
@@ -2479,7 +2471,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
2479 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 2471 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
2480 blk = le64_to_cpu(xb->xb_blkno); 2472 blk = le64_to_cpu(xb->xb_blkno);
2481 bit = le16_to_cpu(xb->xb_suballoc_bit); 2473 bit = le16_to_cpu(xb->xb_suballoc_bit);
2482 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2474 if (xb->xb_suballoc_loc)
2475 bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
2476 else
2477 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2483 2478
2484 xb_alloc_inode = ocfs2_get_system_file_inode(osb, 2479 xb_alloc_inode = ocfs2_get_system_file_inode(osb,
2485 EXTENT_ALLOC_SYSTEM_INODE, 2480 EXTENT_ALLOC_SYSTEM_INODE,
@@ -2594,9 +2589,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
2594 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2589 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2595 spin_unlock(&oi->ip_lock); 2590 spin_unlock(&oi->ip_lock);
2596 2591
2597 ret = ocfs2_journal_dirty(handle, di_bh); 2592 ocfs2_journal_dirty(handle, di_bh);
2598 if (ret < 0)
2599 mlog_errno(ret);
2600out_commit: 2593out_commit:
2601 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 2594 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
2602out: 2595out:
@@ -2724,9 +2717,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode,
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2717 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock); 2718 spin_unlock(&oi->ip_lock);
2726 2719
2727 ret = ocfs2_journal_dirty(ctxt->handle, di_bh); 2720 ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 if (ret < 0)
2729 mlog_errno(ret);
2730 2721
2731out: 2722out:
2732 return ret; 2723 return ret;
@@ -2846,9 +2837,8 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2846 int ret; 2837 int ret;
2847 u16 suballoc_bit_start; 2838 u16 suballoc_bit_start;
2848 u32 num_got; 2839 u32 num_got;
2849 u64 first_blkno; 2840 u64 suballoc_loc, first_blkno;
2850 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; 2841 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
2851 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2852 struct buffer_head *new_bh = NULL; 2842 struct buffer_head *new_bh = NULL;
2853 struct ocfs2_xattr_block *xblk; 2843 struct ocfs2_xattr_block *xblk;
2854 2844
@@ -2859,9 +2849,9 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2859 goto end; 2849 goto end;
2860 } 2850 }
2861 2851
2862 ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1, 2852 ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
2863 &suballoc_bit_start, &num_got, 2853 &suballoc_loc, &suballoc_bit_start,
2864 &first_blkno); 2854 &num_got, &first_blkno);
2865 if (ret < 0) { 2855 if (ret < 0) {
2866 mlog_errno(ret); 2856 mlog_errno(ret);
2867 goto end; 2857 goto end;
@@ -2883,8 +2873,10 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2883 memset(xblk, 0, inode->i_sb->s_blocksize); 2873 memset(xblk, 0, inode->i_sb->s_blocksize);
2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2874 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); 2875 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2876 xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2877 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2887 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); 2878 xblk->xb_fs_generation =
2879 cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
2888 xblk->xb_blkno = cpu_to_le64(first_blkno); 2880 xblk->xb_blkno = cpu_to_le64(first_blkno);
2889 if (indexed) { 2881 if (indexed) {
2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2882 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
@@ -2956,7 +2948,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2956 ret = ocfs2_xa_set(&loc, xi, ctxt); 2948 ret = ocfs2_xa_set(&loc, xi, ctxt);
2957 if (!ret) 2949 if (!ret)
2958 xs->here = loc.xl_entry; 2950 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC) 2951 else if ((ret != -ENOSPC) || ctxt->set_abort)
2960 goto end; 2952 goto end;
2961 else { 2953 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2954 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
@@ -3312,14 +3304,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3312 goto out; 3304 goto out;
3313 } 3305 }
3314 3306
3315 ret = ocfs2_extend_trans(ctxt->handle, credits + 3307 ret = ocfs2_extend_trans(ctxt->handle, credits);
3316 ctxt->handle->h_buffer_credits);
3317 if (ret) { 3308 if (ret) {
3318 mlog_errno(ret); 3309 mlog_errno(ret);
3319 goto out; 3310 goto out;
3320 } 3311 }
3321 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); 3312 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
3322 } else if (ret == -ENOSPC) { 3313 } else if ((ret == -ENOSPC) && !ctxt->set_abort) {
3323 if (di->i_xattr_loc && !xbs->xattr_bh) { 3314 if (di->i_xattr_loc && !xbs->xattr_bh) {
3324 ret = ocfs2_xattr_block_find(inode, 3315 ret = ocfs2_xattr_block_find(inode,
3325 xi->xi_name_index, 3316 xi->xi_name_index,
@@ -3343,8 +3334,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3343 goto out; 3334 goto out;
3344 } 3335 }
3345 3336
3346 ret = ocfs2_extend_trans(ctxt->handle, credits + 3337 ret = ocfs2_extend_trans(ctxt->handle, credits);
3347 ctxt->handle->h_buffer_credits);
3348 if (ret) { 3338 if (ret) {
3349 mlog_errno(ret); 3339 mlog_errno(ret);
3350 goto out; 3340 goto out;
@@ -3378,8 +3368,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3378 goto out; 3368 goto out;
3379 } 3369 }
3380 3370
3381 ret = ocfs2_extend_trans(ctxt->handle, credits + 3371 ret = ocfs2_extend_trans(ctxt->handle, credits);
3382 ctxt->handle->h_buffer_credits);
3383 if (ret) { 3372 if (ret) {
3384 mlog_errno(ret); 3373 mlog_errno(ret);
3385 goto out; 3374 goto out;
@@ -4249,7 +4238,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4249 u32 bit_off, len; 4238 u32 bit_off, len;
4250 u64 blkno; 4239 u64 blkno;
4251 handle_t *handle = ctxt->handle; 4240 handle_t *handle = ctxt->handle;
4252 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4253 struct ocfs2_inode_info *oi = OCFS2_I(inode); 4241 struct ocfs2_inode_info *oi = OCFS2_I(inode);
4254 struct buffer_head *xb_bh = xs->xattr_bh; 4242 struct buffer_head *xb_bh = xs->xattr_bh;
4255 struct ocfs2_xattr_block *xb = 4243 struct ocfs2_xattr_block *xb =
@@ -4277,7 +4265,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4277 goto out; 4265 goto out;
4278 } 4266 }
4279 4267
4280 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 4268 ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
4281 1, 1, &bit_off, &len); 4269 1, 1, &bit_off, &len);
4282 if (ret) { 4270 if (ret) {
4283 mlog_errno(ret); 4271 mlog_errno(ret);
@@ -4887,8 +4875,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
4887 * We need to update the first bucket of the old extent and all 4875 * We need to update the first bucket of the old extent and all
4888 * the buckets going to the new extent. 4876 * the buckets going to the new extent.
4889 */ 4877 */
4890 credits = ((num_buckets + 1) * blks_per_bucket) + 4878 credits = ((num_buckets + 1) * blks_per_bucket);
4891 handle->h_buffer_credits;
4892 ret = ocfs2_extend_trans(handle, credits); 4879 ret = ocfs2_extend_trans(handle, credits);
4893 if (ret) { 4880 if (ret) {
4894 mlog_errno(ret); 4881 mlog_errno(ret);
@@ -4958,7 +4945,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
4958 u32 *first_hash) 4945 u32 *first_hash)
4959{ 4946{
4960 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4947 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4961 int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits; 4948 int ret, credits = 2 * blk_per_bucket;
4962 4949
4963 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); 4950 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
4964 4951
@@ -5099,7 +5086,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5099 goto leave; 5086 goto leave;
5100 } 5087 }
5101 5088
5102 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1, 5089 ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
5103 clusters_to_add, &bit_off, &num_bits); 5090 clusters_to_add, &bit_off, &num_bits);
5104 if (ret < 0) { 5091 if (ret < 0) {
5105 if (ret != -ENOSPC) 5092 if (ret != -ENOSPC)
@@ -5153,9 +5140,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5153 goto leave; 5140 goto leave;
5154 } 5141 }
5155 5142
5156 ret = ocfs2_journal_dirty(handle, root_bh); 5143 ocfs2_journal_dirty(handle, root_bh);
5157 if (ret < 0)
5158 mlog_errno(ret);
5159 5144
5160leave: 5145leave:
5161 return ret; 5146 return ret;
@@ -5200,8 +5185,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
5200 * existing bucket. Then we add the last existing bucket, the 5185 * existing bucket. Then we add the last existing bucket, the
5201 * new bucket, and the first bucket (3 * blk_per_bucket). 5186 * new bucket, and the first bucket (3 * blk_per_bucket).
5202 */ 5187 */
5203 credits = (end_blk - target_blk) + (3 * blk_per_bucket) + 5188 credits = (end_blk - target_blk) + (3 * blk_per_bucket);
5204 handle->h_buffer_credits;
5205 ret = ocfs2_extend_trans(handle, credits); 5189 ret = ocfs2_extend_trans(handle, credits);
5206 if (ret) { 5190 if (ret) {
5207 mlog_errno(ret); 5191 mlog_errno(ret);
@@ -5477,12 +5461,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
5477 } 5461 }
5478 5462
5479 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); 5463 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
5480 5464 ocfs2_journal_dirty(handle, root_bh);
5481 ret = ocfs2_journal_dirty(handle, root_bh);
5482 if (ret) {
5483 mlog_errno(ret);
5484 goto out_commit;
5485 }
5486 5465
5487 ret = ocfs2_truncate_log_append(osb, handle, blkno, len); 5466 ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
5488 if (ret) 5467 if (ret)
@@ -6528,13 +6507,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6528 int indexed) 6507 int indexed)
6529{ 6508{
6530 int ret; 6509 int ret;
6531 struct ocfs2_alloc_context *meta_ac;
6532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6510 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6533 struct ocfs2_xattr_set_ctxt ctxt = { 6511 struct ocfs2_xattr_set_ctxt ctxt;
6534 .meta_ac = meta_ac,
6535 };
6536 6512
6537 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6513 memset(&ctxt, 0, sizeof(ctxt));
6514 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
6538 if (ret < 0) { 6515 if (ret < 0) {
6539 mlog_errno(ret); 6516 mlog_errno(ret);
6540 return ret; 6517 return ret;
@@ -6556,7 +6533,7 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6556 6533
6557 ocfs2_commit_trans(osb, ctxt.handle); 6534 ocfs2_commit_trans(osb, ctxt.handle);
6558out: 6535out:
6559 ocfs2_free_alloc_context(meta_ac); 6536 ocfs2_free_alloc_context(ctxt.meta_ac);
6560 return ret; 6537 return ret;
6561} 6538}
6562 6539
@@ -6937,7 +6914,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
6937 goto out; 6914 goto out;
6938 } 6915 }
6939 6916
6940 ret = ocfs2_claim_clusters(osb, handle, data_ac, 6917 ret = ocfs2_claim_clusters(handle, data_ac,
6941 len, &p_cluster, &num_clusters); 6918 len, &p_cluster, &num_clusters);
6942 if (ret) { 6919 if (ret) {
6943 mlog_errno(ret); 6920 mlog_errno(ret);
@@ -7236,7 +7213,7 @@ int ocfs2_init_security_set(handle_t *handle,
7236 xattr_ac, data_ac); 7213 xattr_ac, data_ac);
7237} 7214}
7238 7215
7239struct xattr_handler ocfs2_xattr_security_handler = { 7216const struct xattr_handler ocfs2_xattr_security_handler = {
7240 .prefix = XATTR_SECURITY_PREFIX, 7217 .prefix = XATTR_SECURITY_PREFIX,
7241 .list = ocfs2_xattr_security_list, 7218 .list = ocfs2_xattr_security_list,
7242 .get = ocfs2_xattr_security_get, 7219 .get = ocfs2_xattr_security_get,
@@ -7280,7 +7257,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
7280 name, value, size, flags); 7257 name, value, size, flags);
7281} 7258}
7282 7259
7283struct xattr_handler ocfs2_xattr_trusted_handler = { 7260const struct xattr_handler ocfs2_xattr_trusted_handler = {
7284 .prefix = XATTR_TRUSTED_PREFIX, 7261 .prefix = XATTR_TRUSTED_PREFIX,
7285 .list = ocfs2_xattr_trusted_list, 7262 .list = ocfs2_xattr_trusted_list,
7286 .get = ocfs2_xattr_trusted_get, 7263 .get = ocfs2_xattr_trusted_get,
@@ -7336,7 +7313,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
7336 name, value, size, flags); 7313 name, value, size, flags);
7337} 7314}
7338 7315
7339struct xattr_handler ocfs2_xattr_user_handler = { 7316const struct xattr_handler ocfs2_xattr_user_handler = {
7340 .prefix = XATTR_USER_PREFIX, 7317 .prefix = XATTR_USER_PREFIX,
7341 .list = ocfs2_xattr_user_list, 7318 .list = ocfs2_xattr_user_list,
7342 .get = ocfs2_xattr_user_get, 7319 .get = ocfs2_xattr_user_get,
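
The ocfs2/xattr.c hunks above all converge on two API changes: ocfs2_journal_dirty() no longer returns a status, and ocfs2_extend_trans() is now passed only the additional credits wanted, so callers stop adding handle->h_buffer_credits themselves. A minimal sketch of the resulting call pattern, assuming those post-series semantics; example_update_block() and its arguments are hypothetical, not from this patch:

static int example_update_block(handle_t *handle, struct buffer_head *bh,
				int credits)
{
	int ret;

	/* Request 'credits' extra buffer credits on the running handle;
	 * the old "credits + handle->h_buffer_credits" idiom is gone. */
	ret = ocfs2_extend_trans(handle, credits);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	/* ... modify bh->b_data under the handle here ... */

	/* Void in this series: no status to check or propagate. */
	ocfs2_journal_dirty(handle, bh);
	return 0;
}
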
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index abd72a47f520..aa64bb37a65b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info {
37 size_t value_len; 37 size_t value_len;
38}; 38};
39 39
40extern struct xattr_handler ocfs2_xattr_user_handler; 40extern const struct xattr_handler ocfs2_xattr_user_handler;
41extern struct xattr_handler ocfs2_xattr_trusted_handler; 41extern const struct xattr_handler ocfs2_xattr_trusted_handler;
42extern struct xattr_handler ocfs2_xattr_security_handler; 42extern const struct xattr_handler ocfs2_xattr_security_handler;
43extern struct xattr_handler ocfs2_xattr_acl_access_handler; 43extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
44extern struct xattr_handler ocfs2_xattr_acl_default_handler; 44extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
45extern struct xattr_handler *ocfs2_xattr_handlers[]; 45extern const struct xattr_handler *ocfs2_xattr_handlers[];
46 46
47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
48int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, 48int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
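
The header hunk completes the constification: both the individual handlers and the NULL-terminated table become const. A hypothetical filesystem adopting the same convention would declare its table as below; all example_* names are illustrative, not from this patch:

static const struct xattr_handler example_user_handler = {
	.prefix	= XATTR_USER_PREFIX,
	.list	= example_xattr_list,	/* hypothetical callbacks */
	.get	= example_xattr_get,
	.set	= example_xattr_set,
};

/* NULL-terminated table, wired into the superblock via sb->s_xattr. */
const struct xattr_handler *example_xattr_handlers[] = {
	&example_user_handler,
	NULL,
};
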
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c09364..6e7a3291bbe8 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = {
329 .aio_read = generic_file_aio_read, 329 .aio_read = generic_file_aio_read,
330 .aio_write = generic_file_aio_write, 330 .aio_write = generic_file_aio_write,
331 .mmap = generic_file_mmap, 331 .mmap = generic_file_mmap,
332 .fsync = simple_fsync, 332 .fsync = generic_file_fsync,
333 .splice_read = generic_file_splice_read, 333 .splice_read = generic_file_splice_read,
334}; 334};
335 335
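
The omfs hunk swaps simple_fsync() for generic_file_fsync(), part of this series' consolidation of the default fsync path; for a plain block-device filesystem the behavior should be equivalent. A sketch of the typical wiring, using an illustrative operations table rather than omfs's actual one:

const struct file_operations example_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.mmap		= generic_file_mmap,
	.fsync		= generic_file_fsync,	/* was simple_fsync */
};
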
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 75d9b5ba1d45..089839a6cc64 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -3,9 +3,9 @@
3 * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com> 3 * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
4 * Released under GPL v2. 4 * Released under GPL v2.
5 */ 5 */
6#include <linux/version.h>
7#include <linux/module.h> 6#include <linux/module.h>
8#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/slab.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/vfs.h> 10#include <linux/vfs.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
@@ -37,9 +37,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
37 goto fail; 37 goto fail;
38 38
39 inode->i_ino = new_block; 39 inode->i_ino = new_block;
40 inode->i_mode = mode; 40 inode_init_owner(inode, NULL, mode);
41 inode->i_uid = current_fsuid();
42 inode->i_gid = current_fsgid();
43 inode->i_mapping->a_ops = &omfs_aops; 41 inode->i_mapping->a_ops = &omfs_aops;
44 42
45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 43 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
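
omfs_new_inode() now calls inode_init_owner(inode, NULL, mode); with a NULL directory the helper reduces to exactly the three assignments it replaces. For reference, a simplified sketch of what the helper is assumed to do, including the setgid-directory case omfs does not use:

static void sketch_inode_init_owner(struct inode *inode,
				    const struct inode *dir, int mode)
{
	inode->i_uid = current_fsuid();
	if (dir && (dir->i_mode & S_ISGID)) {
		/* setgid directory: the new inode inherits the group,
		 * and subdirectories inherit the setgid bit as well */
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}
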
diff --git a/fs/open.c b/fs/open.c
index e17f54454b50..5463266db9e6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -10,7 +10,6 @@
10#include <linux/fdtable.h> 10#include <linux/fdtable.h>
11#include <linux/fsnotify.h> 11#include <linux/fsnotify.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h>
14#include <linux/tty.h> 13#include <linux/tty.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
16#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
@@ -18,8 +17,8 @@
18#include <linux/securebits.h> 17#include <linux/securebits.h>
19#include <linux/security.h> 18#include <linux/security.h>
20#include <linux/mount.h> 19#include <linux/mount.h>
21#include <linux/vfs.h>
22#include <linux/fcntl.h> 20#include <linux/fcntl.h>
21#include <linux/slab.h>
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
24#include <linux/fs.h> 23#include <linux/fs.h>
25#include <linux/personality.h> 24#include <linux/personality.h>
@@ -33,171 +32,6 @@
33 32
34#include "internal.h" 33#include "internal.h"
35 34
36int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
37{
38 int retval = -ENODEV;
39
40 if (dentry) {
41 retval = -ENOSYS;
42 if (dentry->d_sb->s_op->statfs) {
43 memset(buf, 0, sizeof(*buf));
44 retval = security_sb_statfs(dentry);
45 if (retval)
46 return retval;
47 retval = dentry->d_sb->s_op->statfs(dentry, buf);
48 if (retval == 0 && buf->f_frsize == 0)
49 buf->f_frsize = buf->f_bsize;
50 }
51 }
52 return retval;
53}
54
55EXPORT_SYMBOL(vfs_statfs);
56
57static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
58{
59 struct kstatfs st;
60 int retval;
61
62 retval = vfs_statfs(dentry, &st);
63 if (retval)
64 return retval;
65
66 if (sizeof(*buf) == sizeof(st))
67 memcpy(buf, &st, sizeof(st));
68 else {
69 if (sizeof buf->f_blocks == 4) {
70 if ((st.f_blocks | st.f_bfree | st.f_bavail |
71 st.f_bsize | st.f_frsize) &
72 0xffffffff00000000ULL)
73 return -EOVERFLOW;
74 /*
75 * f_files and f_ffree may be -1; it's okay to stuff
76 * that into 32 bits
77 */
78 if (st.f_files != -1 &&
79 (st.f_files & 0xffffffff00000000ULL))
80 return -EOVERFLOW;
81 if (st.f_ffree != -1 &&
82 (st.f_ffree & 0xffffffff00000000ULL))
83 return -EOVERFLOW;
84 }
85
86 buf->f_type = st.f_type;
87 buf->f_bsize = st.f_bsize;
88 buf->f_blocks = st.f_blocks;
89 buf->f_bfree = st.f_bfree;
90 buf->f_bavail = st.f_bavail;
91 buf->f_files = st.f_files;
92 buf->f_ffree = st.f_ffree;
93 buf->f_fsid = st.f_fsid;
94 buf->f_namelen = st.f_namelen;
95 buf->f_frsize = st.f_frsize;
96 memset(buf->f_spare, 0, sizeof(buf->f_spare));
97 }
98 return 0;
99}
100
101static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
102{
103 struct kstatfs st;
104 int retval;
105
106 retval = vfs_statfs(dentry, &st);
107 if (retval)
108 return retval;
109
110 if (sizeof(*buf) == sizeof(st))
111 memcpy(buf, &st, sizeof(st));
112 else {
113 buf->f_type = st.f_type;
114 buf->f_bsize = st.f_bsize;
115 buf->f_blocks = st.f_blocks;
116 buf->f_bfree = st.f_bfree;
117 buf->f_bavail = st.f_bavail;
118 buf->f_files = st.f_files;
119 buf->f_ffree = st.f_ffree;
120 buf->f_fsid = st.f_fsid;
121 buf->f_namelen = st.f_namelen;
122 buf->f_frsize = st.f_frsize;
123 memset(buf->f_spare, 0, sizeof(buf->f_spare));
124 }
125 return 0;
126}
127
128SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
129{
130 struct path path;
131 int error;
132
133 error = user_path(pathname, &path);
134 if (!error) {
135 struct statfs tmp;
136 error = vfs_statfs_native(path.dentry, &tmp);
137 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
138 error = -EFAULT;
139 path_put(&path);
140 }
141 return error;
142}
143
144SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
145{
146 struct path path;
147 long error;
148
149 if (sz != sizeof(*buf))
150 return -EINVAL;
151 error = user_path(pathname, &path);
152 if (!error) {
153 struct statfs64 tmp;
154 error = vfs_statfs64(path.dentry, &tmp);
155 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
156 error = -EFAULT;
157 path_put(&path);
158 }
159 return error;
160}
161
162SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
163{
164 struct file * file;
165 struct statfs tmp;
166 int error;
167
168 error = -EBADF;
169 file = fget(fd);
170 if (!file)
171 goto out;
172 error = vfs_statfs_native(file->f_path.dentry, &tmp);
173 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
174 error = -EFAULT;
175 fput(file);
176out:
177 return error;
178}
179
180SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
181{
182 struct file * file;
183 struct statfs64 tmp;
184 int error;
185
186 if (sz != sizeof(*buf))
187 return -EINVAL;
188
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = vfs_statfs64(file->f_path.dentry, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error;
199}
200
201int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, 35int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
202 struct file *filp) 36 struct file *filp)
203{ 37{
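
The entire statfs family leaves fs/open.c; the code is relocated rather than dropped (in mainline it lands as fs/statfs.c). The one non-obvious rule it carries along is vfs_statfs_native()'s overflow check: when struct statfs has 32-bit fields, any high bit in the 64-bit counts means -EOVERFLOW, except that f_files and f_ffree may legitimately be -1 ("unknown"). A self-contained restatement of that rule, with a hypothetical helper name and userspace types:

#include <errno.h>
#include <stdint.h>

static int fits_legacy_statfs(uint64_t blocks, uint64_t bfree,
			      uint64_t bavail, uint64_t bsize,
			      uint64_t frsize, uint64_t files,
			      uint64_t ffree)
{
	/* Any of the size fields overflowing 32 bits is fatal. */
	if ((blocks | bfree | bavail | bsize | frsize) &
	    0xffffffff00000000ULL)
		return -EOVERFLOW;
	/* f_files/f_ffree may be -1 ("unknown"); reject only other
	 * values that do not fit in 32 bits. */
	if (files != (uint64_t)-1 && (files & 0xffffffff00000000ULL))
		return -EOVERFLOW;
	if (ffree != (uint64_t)-1 && (ffree & 0xffffffff00000000ULL))
		return -EOVERFLOW;
	return 0;
}
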
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index a97b477ac0fc..6921e7890be6 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -70,14 +70,14 @@ struct riscix_record {
70 70
71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ 71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
72 defined(CONFIG_ACORN_PARTITION_ADFS) 72 defined(CONFIG_ACORN_PARTITION_ADFS)
73static int 73static int riscix_partition(struct parsed_partitions *state,
74riscix_partition(struct parsed_partitions *state, struct block_device *bdev, 74 unsigned long first_sect, int slot,
75 unsigned long first_sect, int slot, unsigned long nr_sects) 75 unsigned long nr_sects)
76{ 76{
77 Sector sect; 77 Sector sect;
78 struct riscix_record *rr; 78 struct riscix_record *rr;
79 79
80 rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect); 80 rr = read_part_sector(state, first_sect, &sect);
81 if (!rr) 81 if (!rr)
82 return -1; 82 return -1;
83 83
@@ -123,9 +123,9 @@ struct linux_part {
123 123
124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ 124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
125 defined(CONFIG_ACORN_PARTITION_ADFS) 125 defined(CONFIG_ACORN_PARTITION_ADFS)
126static int 126static int linux_partition(struct parsed_partitions *state,
127linux_partition(struct parsed_partitions *state, struct block_device *bdev, 127 unsigned long first_sect, int slot,
128 unsigned long first_sect, int slot, unsigned long nr_sects) 128 unsigned long nr_sects)
129{ 129{
130 Sector sect; 130 Sector sect;
131 struct linux_part *linuxp; 131 struct linux_part *linuxp;
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
135 135
136 put_partition(state, slot++, first_sect, size); 136 put_partition(state, slot++, first_sect, size);
137 137
138 linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect); 138 linuxp = read_part_sector(state, first_sect, &sect);
139 if (!linuxp) 139 if (!linuxp)
140 return -1; 140 return -1;
141 141
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
157#endif 157#endif
158 158
159#ifdef CONFIG_ACORN_PARTITION_CUMANA 159#ifdef CONFIG_ACORN_PARTITION_CUMANA
160int 160int adfspart_check_CUMANA(struct parsed_partitions *state)
161adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
162{ 161{
163 unsigned long first_sector = 0; 162 unsigned long first_sector = 0;
164 unsigned int start_blk = 0; 163 unsigned int start_blk = 0;
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
185 struct adfs_discrecord *dr; 184 struct adfs_discrecord *dr;
186 unsigned int nr_sects; 185 unsigned int nr_sects;
187 186
188 data = read_dev_sector(bdev, start_blk * 2 + 6, &sect); 187 data = read_part_sector(state, start_blk * 2 + 6, &sect);
189 if (!data) 188 if (!data)
190 return -1; 189 return -1;
191 190
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
217#ifdef CONFIG_ACORN_PARTITION_RISCIX 216#ifdef CONFIG_ACORN_PARTITION_RISCIX
218 case PARTITION_RISCIX_SCSI: 217 case PARTITION_RISCIX_SCSI:
219 /* RISCiX - we don't know how to find the next one. */ 218 /* RISCiX - we don't know how to find the next one. */
220 slot = riscix_partition(state, bdev, first_sector, 219 slot = riscix_partition(state, first_sector, slot,
221 slot, nr_sects); 220 nr_sects);
222 break; 221 break;
223#endif 222#endif
224 223
225 case PARTITION_LINUX: 224 case PARTITION_LINUX:
226 slot = linux_partition(state, bdev, first_sector, 225 slot = linux_partition(state, first_sector, slot,
227 slot, nr_sects); 226 nr_sects);
228 break; 227 break;
229 } 228 }
230 put_dev_sector(sect); 229 put_dev_sector(sect);
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
249 * hda1 = ADFS partition on first drive. 248 * hda1 = ADFS partition on first drive.
250 * hda2 = non-ADFS partition. 249 * hda2 = non-ADFS partition.
251 */ 250 */
252int 251int adfspart_check_ADFS(struct parsed_partitions *state)
253adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
254{ 252{
255 unsigned long start_sect, nr_sects, sectscyl, heads; 253 unsigned long start_sect, nr_sects, sectscyl, heads;
256 Sector sect; 254 Sector sect;
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
259 unsigned char id; 257 unsigned char id;
260 int slot = 1; 258 int slot = 1;
261 259
262 data = read_dev_sector(bdev, 6, &sect); 260 data = read_part_sector(state, 6, &sect);
263 if (!data) 261 if (!data)
264 return -1; 262 return -1;
265 263
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
278 /* 276 /*
279 * Work out start of non-adfs partition. 277 * Work out start of non-adfs partition.
280 */ 278 */
281 nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect; 279 nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
282 280
283 if (start_sect) { 281 if (start_sect) {
284 switch (id) { 282 switch (id) {
285#ifdef CONFIG_ACORN_PARTITION_RISCIX 283#ifdef CONFIG_ACORN_PARTITION_RISCIX
286 case PARTITION_RISCIX_SCSI: 284 case PARTITION_RISCIX_SCSI:
287 case PARTITION_RISCIX_MFM: 285 case PARTITION_RISCIX_MFM:
288 slot = riscix_partition(state, bdev, start_sect, 286 slot = riscix_partition(state, start_sect, slot,
289 slot, nr_sects); 287 nr_sects);
290 break; 288 break;
291#endif 289#endif
292 290
293 case PARTITION_LINUX: 291 case PARTITION_LINUX:
294 slot = linux_partition(state, bdev, start_sect, 292 slot = linux_partition(state, start_sect, slot,
295 slot, nr_sects); 293 nr_sects);
296 break; 294 break;
297 } 295 }
298 } 296 }
@@ -308,10 +306,11 @@ struct ics_part {
308 __le32 size; 306 __le32 size;
309}; 307};
310 308
311static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block) 309static int adfspart_check_ICSLinux(struct parsed_partitions *state,
310 unsigned long block)
312{ 311{
313 Sector sect; 312 Sector sect;
314 unsigned char *data = read_dev_sector(bdev, block, &sect); 313 unsigned char *data = read_part_sector(state, block, &sect);
315 int result = 0; 314 int result = 0;
316 315
317 if (data) { 316 if (data) {
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data)
349 * hda2 = ADFS partition 1 on first drive. 348 * hda2 = ADFS partition 1 on first drive.
350 * ..etc.. 349 * ..etc..
351 */ 350 */
352int 351int adfspart_check_ICS(struct parsed_partitions *state)
353adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
354{ 352{
355 const unsigned char *data; 353 const unsigned char *data;
356 const struct ics_part *p; 354 const struct ics_part *p;
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
360 /* 358 /*
361 * Try ICS style partitions - sector 0 contains partition info. 359 * Try ICS style partitions - sector 0 contains partition info.
362 */ 360 */
363 data = read_dev_sector(bdev, 0, &sect); 361 data = read_part_sector(state, 0, &sect);
364 if (!data) 362 if (!data)
365 return -1; 363 return -1;
366 364
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
392 * partition is. We must not make this visible 390 * partition is. We must not make this visible
393 * to the filesystem. 391 * to the filesystem.
394 */ 392 */
395 if (size > 1 && adfspart_check_ICSLinux(bdev, start)) { 393 if (size > 1 && adfspart_check_ICSLinux(state, start)) {
396 start += 1; 394 start += 1;
397 size -= 1; 395 size -= 1;
398 } 396 }
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
446 * hda2 = ADFS partition 1 on first drive. 444 * hda2 = ADFS partition 1 on first drive.
447 * ..etc.. 445 * ..etc..
448 */ 446 */
449int 447int adfspart_check_POWERTEC(struct parsed_partitions *state)
450adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
451{ 448{
452 Sector sect; 449 Sector sect;
453 const unsigned char *data; 450 const unsigned char *data;
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
455 int slot = 1; 452 int slot = 1;
456 int i; 453 int i;
457 454
458 data = read_dev_sector(bdev, 0, &sect); 455 data = read_part_sector(state, 0, &sect);
459 if (!data) 456 if (!data)
460 return -1; 457 return -1;
461 458
@@ -508,8 +505,7 @@ static const char eesox_name[] = {
508 * 1. The individual ADFS boot block entries that are placed on the disk. 505 * 1. The individual ADFS boot block entries that are placed on the disk.
509 * 2. The start address of the next entry. 506 * 2. The start address of the next entry.
510 */ 507 */
511int 508int adfspart_check_EESOX(struct parsed_partitions *state)
512adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
513{ 509{
514 Sector sect; 510 Sector sect;
515 const unsigned char *data; 511 const unsigned char *data;
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
518 sector_t start = 0; 514 sector_t start = 0;
519 int i, slot = 1; 515 int i, slot = 1;
520 516
521 data = read_dev_sector(bdev, 7, &sect); 517 data = read_part_sector(state, 7, &sect);
522 if (!data) 518 if (!data)
523 return -1; 519 return -1;
524 520
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
545 if (i != 0) { 541 if (i != 0) {
546 sector_t size; 542 sector_t size;
547 543
548 size = get_capacity(bdev->bd_disk); 544 size = get_capacity(state->bdev->bd_disk);
549 put_partition(state, slot++, start, size - start); 545 put_partition(state, slot++, start, size - start);
550 printk("\n"); 546 printk("\n");
551 } 547 }
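
From here on, every label parser in fs/partitions gets the same mechanical conversion: drop the block_device parameter, reach the device through state->bdev, and replace read_dev_sector() with the bounds-checking read_part_sector() introduced in the check.h hunk further below. A skeleton of a converted parser, illustrative only:

static int example_partition(struct parsed_partitions *state)
{
	Sector sect;
	unsigned char *data;

	/* Returns NULL and sets state->access_beyond_eod when the
	 * sector lies past the currently visible end of the device. */
	data = read_part_sector(state, 0, &sect);
	if (!data)
		return -1;	/* I/O error, recorded by check.c */

	/* ... validate the label, then put_partition(state, slot, ...) ... */

	put_dev_sector(sect);
	return 1;		/* table recognized */
}
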
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
index 81fd50ecc080..ede828529692 100644
--- a/fs/partitions/acorn.h
+++ b/fs/partitions/acorn.h
@@ -7,8 +7,8 @@
7 * format, and everyone stick to it? 7 * format, and everyone stick to it?
8 */ 8 */
9 9
10int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev); 10int adfspart_check_CUMANA(struct parsed_partitions *state);
11int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev); 11int adfspart_check_ADFS(struct parsed_partitions *state);
12int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev); 12int adfspart_check_ICS(struct parsed_partitions *state);
13int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev); 13int adfspart_check_POWERTEC(struct parsed_partitions *state);
14int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev); 14int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 9917a8c360f2..ba443d4229f8 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
23 return sum; 23 return sum;
24} 24}
25 25
26int 26int amiga_partition(struct parsed_partitions *state)
27amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
28{ 27{
29 Sector sect; 28 Sector sect;
30 unsigned char *data; 29 unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
38 for (blk = 0; ; blk++, put_dev_sector(sect)) { 37 for (blk = 0; ; blk++, put_dev_sector(sect)) {
39 if (blk == RDB_ALLOCATION_LIMIT) 38 if (blk == RDB_ALLOCATION_LIMIT)
40 goto rdb_done; 39 goto rdb_done;
41 data = read_dev_sector(bdev, blk, &sect); 40 data = read_part_sector(state, blk, &sect);
42 if (!data) { 41 if (!data) {
43 if (warn_no_part) 42 if (warn_no_part)
44 printk("Dev %s: unable to read RDB block %d\n", 43 printk("Dev %s: unable to read RDB block %d\n",
45 bdevname(bdev, b), blk); 44 bdevname(state->bdev, b), blk);
46 res = -1; 45 res = -1;
47 goto rdb_done; 46 goto rdb_done;
48 } 47 }
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
64 } 63 }
65 64
66 printk("Dev %s: RDB in block %d has bad checksum\n", 65 printk("Dev %s: RDB in block %d has bad checksum\n",
67 bdevname(bdev, b), blk); 66 bdevname(state->bdev, b), blk);
68 } 67 }
69 68
70 /* blksize is blocks per 512 byte standard block */ 69 /* blksize is blocks per 512 byte standard block */
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
75 put_dev_sector(sect); 74 put_dev_sector(sect);
76 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { 75 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
77 blk *= blksize; /* Read in terms partition table understands */ 76 blk *= blksize; /* Read in terms partition table understands */
78 data = read_dev_sector(bdev, blk, &sect); 77 data = read_part_sector(state, blk, &sect);
79 if (!data) { 78 if (!data) {
80 if (warn_no_part) 79 if (warn_no_part)
81 printk("Dev %s: unable to read partition block %d\n", 80 printk("Dev %s: unable to read partition block %d\n",
82 bdevname(bdev, b), blk); 81 bdevname(state->bdev, b), blk);
83 res = -1; 82 res = -1;
84 goto rdb_done; 83 goto rdb_done;
85 } 84 }
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
index 2f3e9ce22d53..d094585cadaa 100644
--- a/fs/partitions/amiga.h
+++ b/fs/partitions/amiga.h
@@ -2,5 +2,5 @@
2 * fs/partitions/amiga.h 2 * fs/partitions/amiga.h
3 */ 3 */
4 4
5int amiga_partition(struct parsed_partitions *state, struct block_device *bdev); 5int amiga_partition(struct parsed_partitions *state);
6 6
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 1f3572d5b755..4439ff1b6cec 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
30 memcmp (s, "RAW", 3) == 0 ; 30 memcmp (s, "RAW", 3) == 0 ;
31} 31}
32 32
33int atari_partition(struct parsed_partitions *state, struct block_device *bdev) 33int atari_partition(struct parsed_partitions *state)
34{ 34{
35 Sector sect; 35 Sector sect;
36 struct rootsector *rs; 36 struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ 42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
43#endif 43#endif
44 44
45 rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect); 45 rs = read_part_sector(state, 0, &sect);
46 if (!rs) 46 if (!rs)
47 return -1; 47 return -1;
48 48
49 /* Verify this is an Atari rootsector: */ 49 /* Verify this is an Atari rootsector: */
50 hd_size = bdev->bd_inode->i_size >> 9; 50 hd_size = state->bdev->bd_inode->i_size >> 9;
51 if (!VALID_PARTITION(&rs->part[0], hd_size) && 51 if (!VALID_PARTITION(&rs->part[0], hd_size) &&
52 !VALID_PARTITION(&rs->part[1], hd_size) && 52 !VALID_PARTITION(&rs->part[1], hd_size) &&
53 !VALID_PARTITION(&rs->part[2], hd_size) && 53 !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
84 printk(" XGM<"); 84 printk(" XGM<");
85 partsect = extensect = be32_to_cpu(pi->st); 85 partsect = extensect = be32_to_cpu(pi->st);
86 while (1) { 86 while (1) {
87 xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2); 87 xrs = read_part_sector(state, partsect, &sect2);
88 if (!xrs) { 88 if (!xrs) {
89 printk (" block %ld read failed\n", partsect); 89 printk (" block %ld read failed\n", partsect);
90 put_dev_sector(sect); 90 put_dev_sector(sect);
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
index 63186b00e135..fe2d32a89f36 100644
--- a/fs/partitions/atari.h
+++ b/fs/partitions/atari.h
@@ -31,4 +31,4 @@ struct rootsector
31 u16 checksum; /* checksum for bootable disks */ 31 u16 checksum; /* checksum for bootable disks */
32} __attribute__((__packed__)); 32} __attribute__((__packed__));
33 33
34int atari_partition(struct parsed_partitions *state, struct block_device *bdev); 34int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e8865c11777f..5dcd4b0c5533 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -16,6 +16,7 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/slab.h>
19#include <linux/kmod.h> 20#include <linux/kmod.h>
20#include <linux/ctype.h> 21#include <linux/ctype.h>
21#include <linux/genhd.h> 22#include <linux/genhd.h>
@@ -44,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
44 45
45int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ 46int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
46 47
47static int (*check_part[])(struct parsed_partitions *, struct block_device *) = { 48static int (*check_part[])(struct parsed_partitions *) = {
48 /* 49 /*
49 * Probe partition formats with tables at disk address 0 50 * Probe partition formats with tables at disk address 0
50 * that also have an ADFS boot block at 0xdc0. 51 * that also have an ADFS boot block at 0xdc0.
@@ -160,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
160 struct parsed_partitions *state; 161 struct parsed_partitions *state;
161 int i, res, err; 162 int i, res, err;
162 163
163 state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL); 164 state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
164 if (!state) 165 if (!state)
165 return NULL; 166 return NULL;
166 167
168 state->bdev = bdev;
167 disk_name(hd, 0, state->name); 169 disk_name(hd, 0, state->name);
168 printk(KERN_INFO " %s:", state->name); 170 printk(KERN_INFO " %s:", state->name);
169 if (isdigit(state->name[strlen(state->name)-1])) 171 if (isdigit(state->name[strlen(state->name)-1]))
@@ -173,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
173 i = res = err = 0; 175 i = res = err = 0;
174 while (!res && check_part[i]) { 176 while (!res && check_part[i]) {
175 memset(&state->parts, 0, sizeof(state->parts)); 177 memset(&state->parts, 0, sizeof(state->parts));
176 res = check_part[i++](state, bdev); 178 res = check_part[i++](state);
177 if (res < 0) { 179 if (res < 0) {
178 /* We have hit an I/O error which we don't report now. 180 /* We have hit an I/O error which we don't report now.
179 * But record it, and let the others do their job. 181 * But record it, and let the others do their job.
@@ -185,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
185 } 187 }
186 if (res > 0) 188 if (res > 0)
187 return state; 189 return state;
190 if (state->access_beyond_eod)
191 err = -ENOSPC;
188 if (err) 192 if (err)
189 /* The partition is unrecognized. So report I/O errors if there were any */ 193 /* The partition is unrecognized. So report I/O errors if there were any */
190 res = err; 194 res = err;
@@ -537,12 +541,33 @@ exit:
537 disk_part_iter_exit(&piter); 541 disk_part_iter_exit(&piter);
538} 542}
539 543
544static bool disk_unlock_native_capacity(struct gendisk *disk)
545{
546 const struct block_device_operations *bdops = disk->fops;
547
548 if (bdops->unlock_native_capacity &&
549 !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
550 printk(KERN_CONT "enabling native capacity\n");
551 bdops->unlock_native_capacity(disk);
552 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
553 return true;
554 } else {
555 printk(KERN_CONT "truncated\n");
556 return false;
557 }
558}
559
540int rescan_partitions(struct gendisk *disk, struct block_device *bdev) 560int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
541{ 561{
562 struct parsed_partitions *state = NULL;
542 struct disk_part_iter piter; 563 struct disk_part_iter piter;
543 struct hd_struct *part; 564 struct hd_struct *part;
544 struct parsed_partitions *state;
545 int p, highest, res; 565 int p, highest, res;
566rescan:
567 if (state && !IS_ERR(state)) {
568 kfree(state);
569 state = NULL;
570 }
546 571
547 if (bdev->bd_part_count) 572 if (bdev->bd_part_count)
548 return -EBUSY; 573 return -EBUSY;
@@ -561,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
561 bdev->bd_invalidated = 0; 586 bdev->bd_invalidated = 0;
562 if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) 587 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
563 return 0; 588 return 0;
564 if (IS_ERR(state)) /* I/O error reading the partition table */ 589 if (IS_ERR(state)) {
590 /*
591 * I/O error reading the partition table. If any
592 * partition code tried to read beyond EOD, retry
593 * after unlocking native capacity.
594 */
595 if (PTR_ERR(state) == -ENOSPC) {
596 printk(KERN_WARNING "%s: partition table beyond EOD, ",
597 disk->disk_name);
598 if (disk_unlock_native_capacity(disk))
599 goto rescan;
600 }
565 return -EIO; 601 return -EIO;
602 }
603 /*
604 * If any partition code tried to read beyond EOD, try
605 * unlocking native capacity even if partition table is
606 * successfully read as we could be missing some partitions.
607 */
608 if (state->access_beyond_eod) {
609 printk(KERN_WARNING
610 "%s: partition table partially beyond EOD, ",
611 disk->disk_name);
612 if (disk_unlock_native_capacity(disk))
613 goto rescan;
614 }
566 615
567 /* tell userspace that the media / partition table may have changed */ 616 /* tell userspace that the media / partition table may have changed */
568 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); 617 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -580,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
580 /* add partitions */ 629 /* add partitions */
581 for (p = 1; p < state->limit; p++) { 630 for (p = 1; p < state->limit; p++) {
582 sector_t size, from; 631 sector_t size, from;
583try_scan: 632
584 size = state->parts[p].size; 633 size = state->parts[p].size;
585 if (!size) 634 if (!size)
586 continue; 635 continue;
@@ -588,30 +637,21 @@ try_scan:
588 from = state->parts[p].from; 637 from = state->parts[p].from;
589 if (from >= get_capacity(disk)) { 638 if (from >= get_capacity(disk)) {
590 printk(KERN_WARNING 639 printk(KERN_WARNING
591 "%s: p%d ignored, start %llu is behind the end of the disk\n", 640 "%s: p%d start %llu is beyond EOD, ",
592 disk->disk_name, p, (unsigned long long) from); 641 disk->disk_name, p, (unsigned long long) from);
642 if (disk_unlock_native_capacity(disk))
643 goto rescan;
593 continue; 644 continue;
594 } 645 }
595 646
596 if (from + size > get_capacity(disk)) { 647 if (from + size > get_capacity(disk)) {
597 const struct block_device_operations *bdops = disk->fops;
598 unsigned long long capacity;
599
600 printk(KERN_WARNING 648 printk(KERN_WARNING
601 "%s: p%d size %llu exceeds device capacity, ", 649 "%s: p%d size %llu extends beyond EOD, ",
602 disk->disk_name, p, (unsigned long long) size); 650 disk->disk_name, p, (unsigned long long) size);
603 651
604 if (bdops->set_capacity && 652 if (disk_unlock_native_capacity(disk)) {
605 (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { 653 /* free state and restart */
606 printk(KERN_CONT "enabling native capacity\n"); 654 goto rescan;
607 capacity = bdops->set_capacity(disk, ~0ULL);
608 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
609 if (capacity > get_capacity(disk)) {
610 set_capacity(disk, capacity);
611 check_disk_size_change(disk, bdev);
612 bdev->bd_invalidated = 0;
613 }
614 goto try_scan;
615 } else { 655 } else {
616 /* 656 /*
617 * we can not ignore partitions of broken tables 657 * we can not ignore partitions of broken tables
@@ -619,7 +659,6 @@ try_scan:
619 * we limit them to the end of the disk to avoid 659 * we limit them to the end of the disk to avoid
620 * creating invalid block devices 660 * creating invalid block devices
621 */ 661 */
622 printk(KERN_CONT "limited to end of disk\n");
623 size = get_capacity(disk) - from; 662 size = get_capacity(disk) - from;
624 } 663 }
625 } 664 }
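
The rescan_partitions() rework replaces the old per-partition try_scan retry with a whole-table rescan: whenever a parser read past end-of-device, or a partition starts or extends beyond it, the code tries disk_unlock_native_capacity() once and, if the drive exposed more sectors, reparses from scratch. Condensed control flow for orientation only; table_ok() stands in for the NULL/IS_ERR checks and is not a real function:

/* Not literal kernel code -- a condensed view of the "goto rescan"
 * loop added above. */
for (;;) {
	state = check_partition(disk, bdev);
	if (table_ok(state) && !state->access_beyond_eod)
		break;				/* use this table */
	if (!disk_unlock_native_capacity(disk))
		break;				/* capacity already unlocked */
	kfree(state);				/* drive grew: rescan */
}
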
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 98dbe1a84528..52f8bd399396 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -6,6 +6,7 @@
6 * description. 6 * description.
7 */ 7 */
8struct parsed_partitions { 8struct parsed_partitions {
9 struct block_device *bdev;
9 char name[BDEVNAME_SIZE]; 10 char name[BDEVNAME_SIZE];
10 struct { 11 struct {
11 sector_t from; 12 sector_t from;
@@ -14,8 +15,19 @@ struct parsed_partitions {
14 } parts[DISK_MAX_PARTS]; 15 } parts[DISK_MAX_PARTS];
15 int next; 16 int next;
16 int limit; 17 int limit;
18 bool access_beyond_eod;
17}; 19};
18 20
21static inline void *read_part_sector(struct parsed_partitions *state,
22 sector_t n, Sector *p)
23{
24 if (n >= get_capacity(state->bdev->bd_disk)) {
25 state->access_beyond_eod = true;
26 return NULL;
27 }
28 return read_dev_sector(state->bdev, n, p);
29}
30
19static inline void 31static inline void
20put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) 32put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
21{ 33{
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 49cfd5f54238..9efb2cfe2410 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -95,6 +95,7 @@
95 ************************************************************/ 95 ************************************************************/
96#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/math64.h> 97#include <linux/math64.h>
98#include <linux/slab.h>
98#include "check.h" 99#include "check.h"
99#include "efi.h" 100#include "efi.h"
100 101
@@ -139,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
139 * the part[0] entry for this disk, and is the number of 140 * the part[0] entry for this disk, and is the number of
140 * physical sectors available on the disk. 141 * physical sectors available on the disk.
141 */ 142 */
142static u64 143static u64 last_lba(struct block_device *bdev)
143last_lba(struct block_device *bdev)
144{ 144{
145 if (!bdev || !bdev->bd_inode) 145 if (!bdev || !bdev->bd_inode)
146 return 0; 146 return 0;
@@ -180,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
180 180
181/** 181/**
182 * read_lba(): Read bytes from disk, starting at given LBA 182 * read_lba(): Read bytes from disk, starting at given LBA
183 * @bdev 183 * @state
184 * @lba 184 * @lba
185 * @buffer 185 * @buffer
186 * @size_t 186 * @size_t
187 * 187 *
188 * Description: Reads @count bytes from @bdev into @buffer. 188 * Description: Reads @count bytes from @state->bdev into @buffer.
189 * Returns number of bytes read on success, 0 on error. 189 * Returns number of bytes read on success, 0 on error.
190 */ 190 */
191static size_t 191static size_t read_lba(struct parsed_partitions *state,
192read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) 192 u64 lba, u8 *buffer, size_t count)
193{ 193{
194 size_t totalreadcount = 0; 194 size_t totalreadcount = 0;
195 struct block_device *bdev = state->bdev;
195 sector_t n = lba * (bdev_logical_block_size(bdev) / 512); 196 sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
196 197
197 if (!bdev || !buffer || lba > last_lba(bdev)) 198 if (!buffer || lba > last_lba(bdev))
198 return 0; 199 return 0;
199 200
200 while (count) { 201 while (count) {
201 int copied = 512; 202 int copied = 512;
202 Sector sect; 203 Sector sect;
203 unsigned char *data = read_dev_sector(bdev, n++, &sect); 204 unsigned char *data = read_part_sector(state, n++, &sect);
204 if (!data) 205 if (!data)
205 break; 206 break;
206 if (copied > count) 207 if (copied > count)
@@ -216,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
216 217
217/** 218/**
218 * alloc_read_gpt_entries(): reads partition entries from disk 219 * alloc_read_gpt_entries(): reads partition entries from disk
219 * @bdev 220 * @state
220 * @gpt - GPT header 221 * @gpt - GPT header
221 * 222 *
222 * Description: Returns ptes on success, NULL on error. 223 * Description: Returns ptes on success, NULL on error.
223 * Allocates space for PTEs based on information found in @gpt. 224 * Allocates space for PTEs based on information found in @gpt.
224 * Notes: remember to free pte when you're done! 225 * Notes: remember to free pte when you're done!
225 */ 226 */
226static gpt_entry * 227static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
227alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) 228 gpt_header *gpt)
228{ 229{
229 size_t count; 230 size_t count;
230 gpt_entry *pte; 231 gpt_entry *pte;
231 if (!bdev || !gpt) 232
233 if (!gpt)
232 return NULL; 234 return NULL;
233 235
234 count = le32_to_cpu(gpt->num_partition_entries) * 236 count = le32_to_cpu(gpt->num_partition_entries) *
@@ -239,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
239 if (!pte) 241 if (!pte)
240 return NULL; 242 return NULL;
241 243
242 if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), 244 if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
243 (u8 *) pte, 245 (u8 *) pte,
244 count) < count) { 246 count) < count) {
245 kfree(pte); 247 kfree(pte);
@@ -251,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
251 253
252/** 254/**
253 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk 255 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
254 * @bdev 256 * @state
255 * @lba is the Logical Block Address of the partition table 257 * @lba is the Logical Block Address of the partition table
256 * 258 *
257 * Description: returns GPT header on success, NULL on error. Allocates 259 * Description: returns GPT header on success, NULL on error. Allocates
258 * and fills a GPT header starting at @ from @bdev. 260 * and fills a GPT header starting at @ from @state->bdev.
259 * Note: remember to free gpt when finished with it. 261 * Note: remember to free gpt when finished with it.
260 */ 262 */
261static gpt_header * 263static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
262alloc_read_gpt_header(struct block_device *bdev, u64 lba) 264 u64 lba)
263{ 265{
264 gpt_header *gpt; 266 gpt_header *gpt;
265 unsigned ssz = bdev_logical_block_size(bdev); 267 unsigned ssz = bdev_logical_block_size(state->bdev);
266
267 if (!bdev)
268 return NULL;
269 268
270 gpt = kzalloc(ssz, GFP_KERNEL); 269 gpt = kzalloc(ssz, GFP_KERNEL);
271 if (!gpt) 270 if (!gpt)
272 return NULL; 271 return NULL;
273 272
274 if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { 273 if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
275 kfree(gpt); 274 kfree(gpt);
276 gpt=NULL; 275 gpt=NULL;
277 return NULL; 276 return NULL;
@@ -282,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
282 281
283/** 282/**
284 * is_gpt_valid() - tests one GPT header and PTEs for validity 283 * is_gpt_valid() - tests one GPT header and PTEs for validity
285 * @bdev 284 * @state
286 * @lba is the logical block address of the GPT header to test 285 * @lba is the logical block address of the GPT header to test
287 * @gpt is a GPT header ptr, filled on return. 286 * @gpt is a GPT header ptr, filled on return.
288 * @ptes is a PTEs ptr, filled on return. 287 * @ptes is a PTEs ptr, filled on return.
@@ -290,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
290 * Description: returns 1 if valid, 0 on error. 289 * Description: returns 1 if valid, 0 on error.
291 * If valid, returns pointers to newly allocated GPT header and PTEs. 290 * If valid, returns pointers to newly allocated GPT header and PTEs.
292 */ 291 */
293static int 292static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
294is_gpt_valid(struct block_device *bdev, u64 lba, 293 gpt_header **gpt, gpt_entry **ptes)
295 gpt_header **gpt, gpt_entry **ptes)
296{ 294{
297 u32 crc, origcrc; 295 u32 crc, origcrc;
298 u64 lastlba; 296 u64 lastlba;
299 297
300 if (!bdev || !gpt || !ptes) 298 if (!ptes)
301 return 0; 299 return 0;
302 if (!(*gpt = alloc_read_gpt_header(bdev, lba))) 300 if (!(*gpt = alloc_read_gpt_header(state, lba)))
303 return 0; 301 return 0;
304 302
305 /* Check the GUID Partition Table signature */ 303 /* Check the GUID Partition Table signature */
@@ -335,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
335 /* Check the first_usable_lba and last_usable_lba are 333 /* Check the first_usable_lba and last_usable_lba are
336 * within the disk. 334 * within the disk.
337 */ 335 */
338 lastlba = last_lba(bdev); 336 lastlba = last_lba(state->bdev);
339 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { 337 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
340 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", 338 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
341 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), 339 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -349,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
349 goto fail; 347 goto fail;
350 } 348 }
351 349
352 if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt))) 350 if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
353 goto fail; 351 goto fail;
354 352
355 /* Check the GUID Partition Entry Array CRC */ 353 /* Check the GUID Partition Entry Array CRC */
@@ -494,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
494 492
495/** 493/**
496 * find_valid_gpt() - Search disk for valid GPT headers and PTEs 494 * find_valid_gpt() - Search disk for valid GPT headers and PTEs
497 * @bdev 495 * @state
498 * @gpt is a GPT header ptr, filled on return. 496 * @gpt is a GPT header ptr, filled on return.
499 * @ptes is a PTEs ptr, filled on return. 497 * @ptes is a PTEs ptr, filled on return.
500 * Description: Returns 1 if valid, 0 on error. 498 * Description: Returns 1 if valid, 0 on error.
@@ -507,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
507 * This protects against devices which misreport their size, and forces 505 * This protects against devices which misreport their size, and forces
508 * the user to decide to use the Alternate GPT. 506 * the user to decide to use the Alternate GPT.
509 */ 507 */
510static int 508static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
511find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) 509 gpt_entry **ptes)
512{ 510{
513 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; 511 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
514 gpt_header *pgpt = NULL, *agpt = NULL; 512 gpt_header *pgpt = NULL, *agpt = NULL;
515 gpt_entry *pptes = NULL, *aptes = NULL; 513 gpt_entry *pptes = NULL, *aptes = NULL;
516 legacy_mbr *legacymbr; 514 legacy_mbr *legacymbr;
517 u64 lastlba; 515 u64 lastlba;
518 if (!bdev || !gpt || !ptes) 516
517 if (!ptes)
519 return 0; 518 return 0;
520 519
521 lastlba = last_lba(bdev); 520 lastlba = last_lba(state->bdev);
522 if (!force_gpt) { 521 if (!force_gpt) {
523 /* This will be added to the EFI Spec. per Intel after v1.02. */ 522 /* This will be added to the EFI Spec. per Intel after v1.02. */
524 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); 523 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
525 if (legacymbr) { 524 if (legacymbr) {
526 read_lba(bdev, 0, (u8 *) legacymbr, 525 read_lba(state, 0, (u8 *) legacymbr,
527 sizeof (*legacymbr)); 526 sizeof (*legacymbr));
528 good_pmbr = is_pmbr_valid(legacymbr); 527 good_pmbr = is_pmbr_valid(legacymbr);
529 kfree(legacymbr); 528 kfree(legacymbr);
530 } 529 }
@@ -532,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
532 goto fail; 531 goto fail;
533 } 532 }
534 533
535 good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA, 534 good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
536 &pgpt, &pptes); 535 &pgpt, &pptes);
537 if (good_pgpt) 536 if (good_pgpt)
538 good_agpt = is_gpt_valid(bdev, 537 good_agpt = is_gpt_valid(state,
539 le64_to_cpu(pgpt->alternate_lba), 538 le64_to_cpu(pgpt->alternate_lba),
540 &agpt, &aptes); 539 &agpt, &aptes);
541 if (!good_agpt && force_gpt) 540 if (!good_agpt && force_gpt)
542 good_agpt = is_gpt_valid(bdev, lastlba, 541 good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
543 &agpt, &aptes);
544 542
545 /* The obviously unsuccessful case */ 543 /* The obviously unsuccessful case */
546 if (!good_pgpt && !good_agpt) 544 if (!good_pgpt && !good_agpt)
@@ -582,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
582} 580}
583 581
584/** 582/**
585 * efi_partition(struct parsed_partitions *state, struct block_device *bdev) 583 * efi_partition(struct parsed_partitions *state)
586 * @state 584 * @state
587 * @bdev
588 * 585 *
589 * Description: called from check.c, if the disk contains GPT 586 * Description: called from check.c, if the disk contains GPT
590 * partitions, sets up partition entries in the kernel. 587 * partitions, sets up partition entries in the kernel.
@@ -601,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
601 * 1 if successful 598 * 1 if successful
602 * 599 *
603 */ 600 */
604int 601int efi_partition(struct parsed_partitions *state)
605efi_partition(struct parsed_partitions *state, struct block_device *bdev)
606{ 602{
607 gpt_header *gpt = NULL; 603 gpt_header *gpt = NULL;
608 gpt_entry *ptes = NULL; 604 gpt_entry *ptes = NULL;
609 u32 i; 605 u32 i;
610 unsigned ssz = bdev_logical_block_size(bdev) / 512; 606 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
611 607
612 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { 608 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
613 kfree(gpt); 609 kfree(gpt);
614 kfree(ptes); 610 kfree(ptes);
615 return 0; 611 return 0;
@@ -622,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
622 u64 size = le64_to_cpu(ptes[i].ending_lba) - 618 u64 size = le64_to_cpu(ptes[i].ending_lba) -
623 le64_to_cpu(ptes[i].starting_lba) + 1ULL; 619 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
624 620
625 if (!is_pte_valid(&ptes[i], last_lba(bdev))) 621 if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
626 continue; 622 continue;
627 623
628 put_partition(state, i+1, start * ssz, size * ssz); 624 put_partition(state, i+1, start * ssz, size * ssz);
@@ -630,7 +626,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
630 /* If this is a RAID volume, tell md */ 626 /* If this is a RAID volume, tell md */
631 if (!efi_guidcmp(ptes[i].partition_type_guid, 627 if (!efi_guidcmp(ptes[i].partition_type_guid,
632 PARTITION_LINUX_RAID_GUID)) 628 PARTITION_LINUX_RAID_GUID))
633 state->parts[i+1].flags = 1; 629 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
634 } 630 }
635 kfree(ptes); 631 kfree(ptes);
636 kfree(gpt); 632 kfree(gpt);
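The efi.c hunks above set the pattern every parser below repeats: the struct block_device * argument goes away, the device is reached as state->bdev, and raw read_dev_sector() calls become read_part_sector() against the state. The check.h side of the change is outside this diff, so the helper sketched here is a reconstruction from the call sites, not the committed definition; the casts dropped at several call sites (e.g. mac.c below) only type-check if it returns void *.

/*
 * Assumed shape of the read_part_sector() helper; fs/partitions/check.h
 * is not part of this diff, so this is a reconstruction only.
 */
#include <linux/genhd.h>	/* Sector, read_dev_sector() */

/* Minimal stand-in: the real parsed_partitions also carries parts[],
 * next, limit and friends; only ->bdev matters for the helper. */
struct parsed_partitions {
	struct block_device *bdev;
};

static inline void *read_part_sector(struct parsed_partitions *state,
				     sector_t n, Sector *p)
{
	/* the void * return is what lets callers drop their casts */
	return read_dev_sector(state->bdev, n, p);
}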
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 6998b589abf9..b69ab729558f 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
110} __attribute__ ((packed)) legacy_mbr; 110} __attribute__ ((packed)) legacy_mbr;
111 111
112/* Functions */ 112/* Functions */
113extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev); 113extern int efi_partition(struct parsed_partitions *state);
114 114
115#endif 115#endif
116 116
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab08460..3e73de5967ff 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
58 58
59/* 59/*
60 */ 60 */
61int 61int ibm_partition(struct parsed_partitions *state)
62ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
63{ 62{
63 struct block_device *bdev = state->bdev;
64 int blocksize, res; 64 int blocksize, res;
65 loff_t i_size, offset, size, fmt_size; 65 loff_t i_size, offset, size, fmt_size;
66 dasd_information2_t *info; 66 dasd_information2_t *info;
@@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
100 /* 100 /*
101 * Get volume label, extract name and type. 101 * Get volume label, extract name and type.
102 */ 102 */
103 data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect); 103 data = read_part_sector(state, info->label_block*(blocksize/512),
104 &sect);
104 if (data == NULL) 105 if (data == NULL)
105 goto out_readerr; 106 goto out_readerr;
106 107
@@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
193 */ 194 */
194 blk = cchhb2blk(&label->vol.vtoc, geo) + 1; 195 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
195 counter = 0; 196 counter = 0;
196 data = read_dev_sector(bdev, blk * (blocksize/512), 197 data = read_part_sector(state, blk * (blocksize/512),
197 &sect); 198 &sect);
198 while (data != NULL) { 199 while (data != NULL) {
199 struct vtoc_format1_label f1; 200 struct vtoc_format1_label f1;
200 201
@@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
208 || f1.DS1FMTID == _ascebc['7'] 209 || f1.DS1FMTID == _ascebc['7']
209 || f1.DS1FMTID == _ascebc['9']) { 210 || f1.DS1FMTID == _ascebc['9']) {
210 blk++; 211 blk++;
211 data = read_dev_sector(bdev, blk * 212 data = read_part_sector(state,
212 (blocksize/512), 213 blk * (blocksize/512), &sect);
213 &sect);
214 continue; 214 continue;
215 } 215 }
216 216
@@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
230 size * (blocksize >> 9)); 230 size * (blocksize >> 9));
231 counter++; 231 counter++;
232 blk++; 232 blk++;
233 data = read_dev_sector(bdev, 233 data = read_part_sector(state,
234 blk * (blocksize/512), 234 blk * (blocksize/512), &sect);
235 &sect);
236 } 235 }
237 236
238 if (!data) 237 if (!data)
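ibm.c shows the lightest-touch conversion style: pull the device out of the state once at the top (struct block_device *bdev = state->bdev;) and leave the body untouched apart from the sector reads. A minimal sketch of that pattern, with a hypothetical function name and assuming the check.h types sketched earlier:

#include <linux/blkdev.h>	/* bdev_logical_block_size() */

/* Sketch only: alias the device once so the existing body can keep
 * saying 'bdev' exactly as it did before the refactor. */
static int probe_like_ibm(struct parsed_partitions *state)
{
	struct block_device *bdev = state->bdev;

	return bdev_logical_block_size(bdev) / 512;	/* body unchanged */
}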
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
index 31f85a6ac459..08fb0804a812 100644
--- a/fs/partitions/ibm.h
+++ b/fs/partitions/ibm.h
@@ -1 +1 @@
int ibm_partition(struct parsed_partitions *, struct block_device *); int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 176d89bcf123..1cc928bb762f 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "karma.h" 10#include "karma.h"
11 11
12int karma_partition(struct parsed_partitions *state, struct block_device *bdev) 12int karma_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 int slot = 1; 15 int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
29 } __attribute__((packed)) *label; 29 } __attribute__((packed)) *label;
30 struct d_partition *p; 30 struct d_partition *p;
31 31
32 data = read_dev_sector(bdev, 0, &sect); 32 data = read_part_sector(state, 0, &sect);
33 if (!data) 33 if (!data)
34 return -1; 34 return -1;
35 35
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
index ecf7d3f2a3d8..c764b2e9df21 100644
--- a/fs/partitions/karma.h
+++ b/fs/partitions/karma.h
@@ -4,5 +4,5 @@
4 4
5#define KARMA_LABEL_MAGIC 0xAB56 5#define KARMA_LABEL_MAGIC 0xAB56
6 6
7int karma_partition(struct parsed_partitions *state, struct block_device *bdev); 7int karma_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 8652fb99e962..648c9d8f3357 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/stringify.h> 28#include <linux/stringify.h>
29#include <linux/kernel.h>
29#include "ldm.h" 30#include "ldm.h"
30#include "check.h" 31#include "check.h"
31#include "msdos.h" 32#include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
77 int h; 78 int h;
78 79
79 /* high part */ 80 /* high part */
80 if ((x = src[0] - '0') <= '9'-'0') h = x; 81 x = h = hex_to_bin(src[0]);
81 else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10; 82 if (h < 0)
82 else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10; 83 return -1;
83 else return -1;
84 h <<= 4;
85 84
86 /* low part */ 85 /* low part */
87 if ((x = src[1] - '0') <= '9'-'0') return h | x; 86 h = hex_to_bin(src[1]);
88 if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10); 87 if (h < 0)
89 if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10); 88 return -1;
90 return -1; 89
90 return (x << 4) + h;
91} 91}
92 92
93/** 93/**
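ldm_parse_hexbyte() now leans on hex_to_bin() from <linux/kernel.h> (the include added at the top of this file) instead of three hand-rolled range tests per digit: x keeps the high nibble, h is reused for the low one, and any non-hex input propagates out as -1. The same pattern as a standalone sketch, with a hypothetical name:

#include <linux/kernel.h>	/* hex_to_bin(): 0..15, or -1 on non-hex */
#include <linux/types.h>

/* Combine two ASCII hex digits into one byte: "4f" -> 0x4f, "4z" -> -1. */
static int hexpair_to_byte(const u8 *src)
{
	int hi = hex_to_bin(src[0]);
	int lo;

	if (hi < 0)
		return -1;
	lo = hex_to_bin(src[1]);
	if (lo < 0)
		return -1;
	return (hi << 4) | lo;
}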
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
309 309
310/** 310/**
311 * ldm_validate_privheads - Compare the primary privhead with its backups 311 * ldm_validate_privheads - Compare the primary privhead with its backups
312 * @bdev: Device holding the LDM Database 312 * @state: Partition check state including device holding the LDM Database
313 * @ph1: Memory struct to fill with ph contents 313 * @ph1: Memory struct to fill with ph contents
314 * 314 *
315 * Read and compare all three privheads from disk. 315 * Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
321 * Return: 'true' Success 321 * Return: 'true' Success
322 * 'false' Error 322 * 'false' Error
323 */ 323 */
324static bool ldm_validate_privheads (struct block_device *bdev, 324static bool ldm_validate_privheads(struct parsed_partitions *state,
325 struct privhead *ph1) 325 struct privhead *ph1)
326{ 326{
327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; 327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
328 struct privhead *ph[3] = { ph1 }; 328 struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
332 long num_sects; 332 long num_sects;
333 int i; 333 int i;
334 334
335 BUG_ON (!bdev || !ph1); 335 BUG_ON (!state || !ph1);
336 336
337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); 337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); 338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
346 346
347 /* Read and parse privheads */ 347 /* Read and parse privheads */
348 for (i = 0; i < 3; i++) { 348 for (i = 0; i < 3; i++) {
349 data = read_dev_sector (bdev, 349 data = read_part_sector(state, ph[0]->config_start + off[i],
350 ph[0]->config_start + off[i], &sect); 350 &sect);
351 if (!data) { 351 if (!data) {
352 ldm_crit ("Disk read failed."); 352 ldm_crit ("Disk read failed.");
353 goto out; 353 goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
363 } 363 }
364 } 364 }
365 365
366 num_sects = bdev->bd_inode->i_size >> 9; 366 num_sects = state->bdev->bd_inode->i_size >> 9;
367 367
368 if ((ph[0]->config_start > num_sects) || 368 if ((ph[0]->config_start > num_sects) ||
369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { 369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
397 397
398/** 398/**
399 * ldm_validate_tocblocks - Validate the table of contents and its backups 399 * ldm_validate_tocblocks - Validate the table of contents and its backups
400 * @bdev: Device holding the LDM Database 400 * @state: Partition check state including device holding the LDM Database
401 * @base: Offset, into @bdev, of the database 401 * @base: Offset, into @state->bdev, of the database
402 * @ldb: Cache of the database structures 402 * @ldb: Cache of the database structures
403 * 403 *
404 * Find and compare the four tables of contents of the LDM Database stored on 404 * Find and compare the four tables of contents of the LDM Database stored on
405 * @bdev and return the parsed information into @toc1. 405 * @state->bdev and return the parsed information into @toc1.
406 * 406 *
407 * The offsets and sizes of the configs are range-checked against a privhead. 407 * The offsets and sizes of the configs are range-checked against a privhead.
408 * 408 *
409 * Return: 'true' @toc1 contains validated TOCBLOCK info 409 * Return: 'true' @toc1 contains validated TOCBLOCK info
410 * 'false' @toc1 contents are undefined 410 * 'false' @toc1 contents are undefined
411 */ 411 */
412static bool ldm_validate_tocblocks(struct block_device *bdev, 412static bool ldm_validate_tocblocks(struct parsed_partitions *state,
413 unsigned long base, struct ldmdb *ldb) 413 unsigned long base, struct ldmdb *ldb)
414{ 414{
415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; 415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
416 struct tocblock *tb[4]; 416 struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
420 int i, nr_tbs; 420 int i, nr_tbs;
421 bool result = false; 421 bool result = false;
422 422
423 BUG_ON(!bdev || !ldb); 423 BUG_ON(!state || !ldb);
424 ph = &ldb->ph; 424 ph = &ldb->ph;
425 tb[0] = &ldb->toc; 425 tb[0] = &ldb->toc;
426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); 426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
437 * skip any that fail as long as we get at least one valid TOCBLOCK. 437 * skip any that fail as long as we get at least one valid TOCBLOCK.
438 */ 438 */
439 for (nr_tbs = i = 0; i < 4; i++) { 439 for (nr_tbs = i = 0; i < 4; i++) {
440 data = read_dev_sector(bdev, base + off[i], &sect); 440 data = read_part_sector(state, base + off[i], &sect);
441 if (!data) { 441 if (!data) {
442 ldm_error("Disk read failed for TOCBLOCK %d.", i); 442 ldm_error("Disk read failed for TOCBLOCK %d.", i);
443 continue; 443 continue;
@@ -473,7 +473,7 @@ err:
473 473
474/** 474/**
475 * ldm_validate_vmdb - Read the VMDB and validate it 475 * ldm_validate_vmdb - Read the VMDB and validate it
476 * @bdev: Device holding the LDM Database 476 * @state: Partition check state including device holding the LDM Database
477 * @base: Offset, into @bdev, of the database 477 * @base: Offset, into @bdev, of the database
478 * @ldb: Cache of the database structures 478 * @ldb: Cache of the database structures
479 * 479 *
@@ -483,8 +483,8 @@ err:
483 * Return: 'true' @ldb contains validated VBDB info 483 * Return: 'true' @ldb contains validated VBDB info
484 * 'false' @ldb contents are undefined 484 * 'false' @ldb contents are undefined
485 */ 485 */
486static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, 486static bool ldm_validate_vmdb(struct parsed_partitions *state,
487 struct ldmdb *ldb) 487 unsigned long base, struct ldmdb *ldb)
488{ 488{
489 Sector sect; 489 Sector sect;
490 u8 *data; 490 u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
492 struct vmdb *vm; 492 struct vmdb *vm;
493 struct tocblock *toc; 493 struct tocblock *toc;
494 494
495 BUG_ON (!bdev || !ldb); 495 BUG_ON (!state || !ldb);
496 496
497 vm = &ldb->vm; 497 vm = &ldb->vm;
498 toc = &ldb->toc; 498 toc = &ldb->toc;
499 499
500 data = read_dev_sector (bdev, base + OFF_VMDB, &sect); 500 data = read_part_sector(state, base + OFF_VMDB, &sect);
501 if (!data) { 501 if (!data) {
502 ldm_crit ("Disk read failed."); 502 ldm_crit ("Disk read failed.");
503 return false; 503 return false;
@@ -534,21 +534,21 @@ out:
534 534
535/** 535/**
536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk 536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
537 * @bdev: Device holding the LDM Database 537 * @state: Partition check state including device holding the LDM Database
538 * 538 *
539 * This function provides a weak test to decide whether the device is a dynamic 539 * This function provides a weak test to decide whether the device is a dynamic
540 * disk or not. It looks for an MS-DOS-style partition table containing at 540 * disk or not. It looks for an MS-DOS-style partition table containing at
541 * least one partition of type 0x42 (formerly SFS, now used by Windows for 541 * least one partition of type 0x42 (formerly SFS, now used by Windows for
542 * dynamic disks). 542 * dynamic disks).
543 * 543 *
544 * N.B. The only possible error can come from the read_dev_sector and that is 544 * N.B. The only possible error can come from the read_part_sector and that is
545 * only likely to happen if the underlying device is strange. If that IS 545 * only likely to happen if the underlying device is strange. If that IS
546 * the case we should return zero to let someone else try. 546 * the case we should return zero to let someone else try.
547 * 547 *
548 * Return: 'true' @bdev is a dynamic disk 548 * Return: 'true' @state->bdev is a dynamic disk
549 * 'false' @bdev is not a dynamic disk, or an error occurred 549 * 'false' @state->bdev is not a dynamic disk, or an error occurred
550 */ 550 */
551static bool ldm_validate_partition_table (struct block_device *bdev) 551static bool ldm_validate_partition_table(struct parsed_partitions *state)
552{ 552{
553 Sector sect; 553 Sector sect;
554 u8 *data; 554 u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
556 int i; 556 int i;
557 bool result = false; 557 bool result = false;
558 558
559 BUG_ON (!bdev); 559 BUG_ON(!state);
560 560
561 data = read_dev_sector (bdev, 0, &sect); 561 data = read_part_sector(state, 0, &sect);
562 if (!data) { 562 if (!data) {
563 ldm_crit ("Disk read failed."); 563 ldm_crit ("Disk read failed.");
564 return false; 564 return false;
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1391 1391
1392/** 1392/**
1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory 1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory
1394 * @bdev: Device holding the LDM Database 1394 * @state: Partition check state including device holding the LDM Database
1395 * @base: Offset, into @bdev, of the database 1395 * @base: Offset, into @state->bdev, of the database
1396 * @ldb: Cache of the database structures 1396 * @ldb: Cache of the database structures
1397 * 1397 *
1398 * To use the information from the VBLKs, they need to be read from the disk, 1398 * To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1401 * Return: 'true' All the VBLKs were read successfully 1401 * Return: 'true' All the VBLKs were read successfully
1402 * 'false' An error occurred 1402 * 'false' An error occurred
1403 */ 1403 */
1404static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, 1404static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
1405 struct ldmdb *ldb) 1405 struct ldmdb *ldb)
1406{ 1406{
1407 int size, perbuf, skip, finish, s, v, recs; 1407 int size, perbuf, skip, finish, s, v, recs;
1408 u8 *data = NULL; 1408 u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1410 bool result = false; 1410 bool result = false;
1411 LIST_HEAD (frags); 1411 LIST_HEAD (frags);
1412 1412
1413 BUG_ON (!bdev || !ldb); 1413 BUG_ON(!state || !ldb);
1414 1414
1415 size = ldb->vm.vblk_size; 1415 size = ldb->vm.vblk_size;
1416 perbuf = 512 / size; 1416 perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1418 finish = (size * ldb->vm.last_vblk_seq) >> 9; 1418 finish = (size * ldb->vm.last_vblk_seq) >> 9;
1419 1419
1420 for (s = skip; s < finish; s++) { /* For each sector */ 1420 for (s = skip; s < finish; s++) { /* For each sector */
1421 data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect); 1421 data = read_part_sector(state, base + OFF_VMDB + s, &sect);
1422 if (!data) { 1422 if (!data) {
1423 ldm_crit ("Disk read failed."); 1423 ldm_crit ("Disk read failed.");
1424 goto out; 1424 goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
1474 1474
1475/** 1475/**
1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it 1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it
1477 * @pp: List of the partitions parsed so far 1477 * @state: Partition check state including device holding the LDM Database
1478 * @bdev: Device holding the LDM Database
1479 * 1478 *
1480 * This determines whether the device @bdev is a dynamic disk and if so creates 1479 * This determines whether the device @bdev is a dynamic disk and if so creates
1481 * the partitions necessary in the gendisk structure pointed to by @hd. 1480 * the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
1485 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, 1484 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
1486 * and so on: the actual data containing partitions. 1485 * and so on: the actual data containing partitions.
1487 * 1486 *
1488 * Return: 1 Success, @bdev is a dynamic disk and we handled it 1487 * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
1489 * 0 Success, @bdev is not a dynamic disk 1488 * 0 Success, @state->bdev is not a dynamic disk
1490 * -1 An error occurred before enough information had been read 1489 * -1 An error occurred before enough information had been read
1491 * Or @bdev is a dynamic disk, but it may be corrupted 1490 * Or @state->bdev is a dynamic disk, but it may be corrupted
1492 */ 1491 */
1493int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) 1492int ldm_partition(struct parsed_partitions *state)
1494{ 1493{
1495 struct ldmdb *ldb; 1494 struct ldmdb *ldb;
1496 unsigned long base; 1495 unsigned long base;
1497 int result = -1; 1496 int result = -1;
1498 1497
1499 BUG_ON (!pp || !bdev); 1498 BUG_ON(!state);
1500 1499
1501 /* Look for signs of a Dynamic Disk */ 1500 /* Look for signs of a Dynamic Disk */
1502 if (!ldm_validate_partition_table (bdev)) 1501 if (!ldm_validate_partition_table(state))
1503 return 0; 1502 return 0;
1504 1503
1505 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); 1504 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1509 } 1508 }
1510 1509
1511 /* Parse and check privheads. */ 1510 /* Parse and check privheads. */
1512 if (!ldm_validate_privheads (bdev, &ldb->ph)) 1511 if (!ldm_validate_privheads(state, &ldb->ph))
1513 goto out; /* Already logged */ 1512 goto out; /* Already logged */
1514 1513
1515 /* All further references are relative to base (database start). */ 1514 /* All further references are relative to base (database start). */
1516 base = ldb->ph.config_start; 1515 base = ldb->ph.config_start;
1517 1516
1518 /* Parse and check tocs and vmdb. */ 1517 /* Parse and check tocs and vmdb. */
1519 if (!ldm_validate_tocblocks (bdev, base, ldb) || 1518 if (!ldm_validate_tocblocks(state, base, ldb) ||
1520 !ldm_validate_vmdb (bdev, base, ldb)) 1519 !ldm_validate_vmdb(state, base, ldb))
1521 goto out; /* Already logged */ 1520 goto out; /* Already logged */
1522 1521
1523 /* Initialize vblk lists in ldmdb struct */ 1522 /* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1527 INIT_LIST_HEAD (&ldb->v_comp); 1526 INIT_LIST_HEAD (&ldb->v_comp);
1528 INIT_LIST_HEAD (&ldb->v_part); 1527 INIT_LIST_HEAD (&ldb->v_part);
1529 1528
1530 if (!ldm_get_vblks (bdev, base, ldb)) { 1529 if (!ldm_get_vblks(state, base, ldb)) {
1531 ldm_crit ("Failed to read the VBLKs from the database."); 1530 ldm_crit ("Failed to read the VBLKs from the database.");
1532 goto cleanup; 1531 goto cleanup;
1533 } 1532 }
1534 1533
1535 /* Finally, create the data partition devices. */ 1534 /* Finally, create the data partition devices. */
1536 if (ldm_create_data_partitions (pp, ldb)) { 1535 if (ldm_create_data_partitions(state, ldb)) {
1537 ldm_debug ("Parsed LDM database successfully."); 1536 ldm_debug ("Parsed LDM database successfully.");
1538 result = 1; 1537 result = 1;
1539 } 1538 }
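With every stage now consuming the same state, the shape of the LDM probe is easiest to see with allocation, list setup and cleanup stripped away. A condensed reading aid (the helpers are static to ldm.c, so this only makes sense as if pasted into that file):

/* Probe order of ldm_partition() above; error and cleanup paths elided. */
static int ldm_probe_order(struct parsed_partitions *state, struct ldmdb *ldb)
{
	unsigned long base;

	if (!ldm_validate_partition_table(state))	/* MBR entry of type 0x42? */
		return 0;				/* not a dynamic disk */
	if (!ldm_validate_privheads(state, &ldb->ph))
		return -1;
	base = ldb->ph.config_start;	/* everything else is relative to this */
	if (!ldm_validate_tocblocks(state, base, ldb) ||
	    !ldm_validate_vmdb(state, base, ldb))
		return -1;
	if (!ldm_get_vblks(state, base, ldb))
		return -1;
	return ldm_create_data_partitions(state, ldb) ? 1 : -1;
}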
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 30e08e809c1d..d1fb50b28d86 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */
209 struct list_head v_part; 209 struct list_head v_part;
210}; 210};
211 211
212int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); 212int ldm_partition(struct parsed_partitions *state);
213 213
214#endif /* _FS_PT_LDM_H_ */ 214#endif /* _FS_PT_LDM_H_ */
215 215
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index d4a0fad3563b..74465ff7c263 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
27 stg[i] = 0; 27 stg[i] = 0;
28} 28}
29 29
30int mac_partition(struct parsed_partitions *state, struct block_device *bdev) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1; 32 int slot = 1;
33 Sector sect; 33 Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
42 struct mac_driver_desc *md; 42 struct mac_driver_desc *md;
43 43
44 /* Get 0th block and look at the first partition map entry. */ 44 /* Get 0th block and look at the first partition map entry. */
45 md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect); 45 md = read_part_sector(state, 0, &sect);
46 if (!md) 46 if (!md)
47 return -1; 47 return -1;
48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { 48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
51 } 51 }
52 secsize = be16_to_cpu(md->block_size); 52 secsize = be16_to_cpu(md->block_size);
53 put_dev_sector(sect); 53 put_dev_sector(sect);
54 data = read_dev_sector(bdev, secsize/512, &sect); 54 data = read_part_sector(state, secsize/512, &sect);
55 if (!data) 55 if (!data)
56 return -1; 56 return -1;
57 part = (struct mac_partition *) (data + secsize%512); 57 part = (struct mac_partition *) (data + secsize%512);
@@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 64 for (blk = 1; blk <= blocks_in_map; ++blk) {
65 int pos = blk * secsize; 65 int pos = blk * secsize;
66 put_dev_sector(sect); 66 put_dev_sector(sect);
67 data = read_dev_sector(bdev, pos/512, &sect); 67 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 68 if (!data)
69 return -1; 69 return -1;
70 part = (struct mac_partition *) (data + pos%512); 70 part = (struct mac_partition *) (data + pos%512);
@@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
75 be32_to_cpu(part->block_count) * (secsize/512)); 75 be32_to_cpu(part->block_count) * (secsize/512));
76 76
77 if (!strnicmp(part->type, "Linux_RAID", 10)) 77 if (!strnicmp(part->type, "Linux_RAID", 10))
78 state->parts[slot].flags = 1; 78 state->parts[slot].flags = ADDPART_FLAG_RAID;
79#ifdef CONFIG_PPC_PMAC 79#ifdef CONFIG_PPC_PMAC
80 /* 80 /*
81 * If this is the first bootable partition, tell the 81 * If this is the first bootable partition, tell the
@@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
123 } 123 }
124#ifdef CONFIG_PPC_PMAC 124#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 125 if (found_root_goodness)
126 note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); 126 note_bootable_part(state->bdev->bd_dev, found_root,
127 found_root_goodness);
127#endif 128#endif
128 129
129 put_dev_sector(sect); 130 put_dev_sector(sect);
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
index bbf26e1386fa..3c7d98436380 100644
--- a/fs/partitions/mac.h
+++ b/fs/partitions/mac.h
@@ -41,4 +41,4 @@ struct mac_driver_desc {
41 /* ... more stuff */ 41 /* ... more stuff */
42}; 42};
43 43
44int mac_partition(struct parsed_partitions *state, struct block_device *bdev); 44int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..15bfb7b1e044 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
31 */ 31 */
32#include <asm/unaligned.h> 32#include <asm/unaligned.h>
33 33
34#define SYS_IND(p) (get_unaligned(&p->sys_ind)) 34#define SYS_IND(p) get_unaligned(&p->sys_ind)
35#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
36 le32_to_cpu(__a); \
37 })
38 35
39#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \ 36static inline sector_t nr_sects(struct partition *p)
40 le32_to_cpu(__a); \ 37{
41 }) 38 return (sector_t)get_unaligned_le32(&p->nr_sects);
39}
40
41static inline sector_t start_sect(struct partition *p)
42{
43 return (sector_t)get_unaligned_le32(&p->start_sect);
44}
42 45
43static inline int is_extended_partition(struct partition *p) 46static inline int is_extended_partition(struct partition *p)
44{ 47{
@@ -61,7 +64,7 @@ msdos_magic_present(unsigned char *p)
61#define AIX_LABEL_MAGIC2 0xC2 64#define AIX_LABEL_MAGIC2 0xC2
62#define AIX_LABEL_MAGIC3 0xD4 65#define AIX_LABEL_MAGIC3 0xD4
63#define AIX_LABEL_MAGIC4 0xC1 66#define AIX_LABEL_MAGIC4 0xC1
64static int aix_magic_present(unsigned char *p, struct block_device *bdev) 67static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
65{ 68{
66 struct partition *pt = (struct partition *) (p + 0x1be); 69 struct partition *pt = (struct partition *) (p + 0x1be);
67 Sector sect; 70 Sector sect;
@@ -82,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
82 is_extended_partition(pt)) 85 is_extended_partition(pt))
83 return 0; 86 return 0;
84 } 87 }
85 d = read_dev_sector(bdev, 7, &sect); 88 d = read_part_sector(state, 7, &sect);
86 if (d) { 89 if (d) {
87 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') 90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
88 ret = 1; 91 ret = 1;
@@ -102,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
102 * only for the actual data partitions. 105 * only for the actual data partitions.
103 */ 106 */
104 107
105static void 108static void parse_extended(struct parsed_partitions *state,
106parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109 sector_t first_sector, sector_t first_size)
107 u32 first_sector, u32 first_size)
108{ 110{
109 struct partition *p; 111 struct partition *p;
110 Sector sect; 112 Sector sect;
111 unsigned char *data; 113 unsigned char *data;
112 u32 this_sector, this_size; 114 sector_t this_sector, this_size;
113 int sector_size = bdev_logical_block_size(bdev) / 512; 115 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
114 int loopct = 0; /* number of links followed 116 int loopct = 0; /* number of links followed
115 without finding a data partition */ 117 without finding a data partition */
116 int i; 118 int i;
@@ -123,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
123 return; 125 return;
124 if (state->next == state->limit) 126 if (state->next == state->limit)
125 return; 127 return;
126 data = read_dev_sector(bdev, this_sector, &sect); 128 data = read_part_sector(state, this_sector, &sect);
127 if (!data) 129 if (!data)
128 return; 130 return;
129 131
@@ -145,14 +147,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
145 * First process the data partition(s) 147 * First process the data partition(s)
146 */ 148 */
147 for (i=0; i<4; i++, p++) { 149 for (i=0; i<4; i++, p++) {
148 u32 offs, size, next; 150 sector_t offs, size, next;
149 if (!NR_SECTS(p) || is_extended_partition(p)) 151 if (!nr_sects(p) || is_extended_partition(p))
150 continue; 152 continue;
151 153
152 /* Check the 3rd and 4th entries - 154 /* Check the 3rd and 4th entries -
153 these sometimes contain random garbage */ 155 these sometimes contain random garbage */
154 offs = START_SECT(p)*sector_size; 156 offs = start_sect(p)*sector_size;
155 size = NR_SECTS(p)*sector_size; 157 size = nr_sects(p)*sector_size;
156 next = this_sector + offs; 158 next = this_sector + offs;
157 if (i >= 2) { 159 if (i >= 2) {
158 if (offs + size > this_size) 160 if (offs + size > this_size)
@@ -179,13 +181,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
179 */ 181 */
180 p -= 4; 182 p -= 4;
181 for (i=0; i<4; i++, p++) 183 for (i=0; i<4; i++, p++)
182 if (NR_SECTS(p) && is_extended_partition(p)) 184 if (nr_sects(p) && is_extended_partition(p))
183 break; 185 break;
184 if (i == 4) 186 if (i == 4)
185 goto done; /* nothing left to do */ 187 goto done; /* nothing left to do */
186 188
187 this_sector = first_sector + START_SECT(p) * sector_size; 189 this_sector = first_sector + start_sect(p) * sector_size;
188 this_size = NR_SECTS(p) * sector_size; 190 this_size = nr_sects(p) * sector_size;
189 put_dev_sector(sect); 191 put_dev_sector(sect);
190 } 192 }
191done: 193done:
@@ -195,9 +197,8 @@ done:
195/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also 197/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
196 indicates linux swap. Be careful before believing this is Solaris. */ 198 indicates linux swap. Be careful before believing this is Solaris. */
197 199
198static void 200static void parse_solaris_x86(struct parsed_partitions *state,
199parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 201 sector_t offset, sector_t size, int origin)
200 u32 offset, u32 size, int origin)
201{ 202{
202#ifdef CONFIG_SOLARIS_X86_PARTITION 203#ifdef CONFIG_SOLARIS_X86_PARTITION
203 Sector sect; 204 Sector sect;
@@ -205,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
205 int i; 206 int i;
206 short max_nparts; 207 short max_nparts;
207 208
208 v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect); 209 v = read_part_sector(state, offset + 1, &sect);
209 if (!v) 210 if (!v)
210 return; 211 return;
211 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { 212 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
@@ -242,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
242 * Create devices for BSD partitions listed in a disklabel, under a 243 * Create devices for BSD partitions listed in a disklabel, under a
243 * dos-like partition. See parse_extended() for more information. 244 * dos-like partition. See parse_extended() for more information.
244 */ 245 */
245static void 246static void parse_bsd(struct parsed_partitions *state,
246parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 247 sector_t offset, sector_t size, int origin, char *flavour,
247 u32 offset, u32 size, int origin, char *flavour, 248 int max_partitions)
248 int max_partitions)
249{ 249{
250 Sector sect; 250 Sector sect;
251 struct bsd_disklabel *l; 251 struct bsd_disklabel *l;
252 struct bsd_partition *p; 252 struct bsd_partition *p;
253 253
254 l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect); 254 l = read_part_sector(state, offset + 1, &sect);
255 if (!l) 255 if (!l)
256 return; 256 return;
257 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { 257 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
@@ -263,7 +263,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
263 if (le16_to_cpu(l->d_npartitions) < max_partitions) 263 if (le16_to_cpu(l->d_npartitions) < max_partitions)
264 max_partitions = le16_to_cpu(l->d_npartitions); 264 max_partitions = le16_to_cpu(l->d_npartitions);
265 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { 265 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
266 u32 bsd_start, bsd_size; 266 sector_t bsd_start, bsd_size;
267 267
268 if (state->next == state->limit) 268 if (state->next == state->limit)
269 break; 269 break;
@@ -288,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
288} 288}
289#endif 289#endif
290 290
291static void 291static void parse_freebsd(struct parsed_partitions *state,
292parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 292 sector_t offset, sector_t size, int origin)
293 u32 offset, u32 size, int origin)
294{ 293{
295#ifdef CONFIG_BSD_DISKLABEL 294#ifdef CONFIG_BSD_DISKLABEL
296 parse_bsd(state, bdev, offset, size, origin, 295 parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
297 "bsd", BSD_MAXPARTITIONS);
298#endif 296#endif
299} 297}
300 298
301static void 299static void parse_netbsd(struct parsed_partitions *state,
302parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 300 sector_t offset, sector_t size, int origin)
303 u32 offset, u32 size, int origin)
304{ 301{
305#ifdef CONFIG_BSD_DISKLABEL 302#ifdef CONFIG_BSD_DISKLABEL
306 parse_bsd(state, bdev, offset, size, origin, 303 parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
307 "netbsd", BSD_MAXPARTITIONS);
308#endif 304#endif
309} 305}
310 306
311static void 307static void parse_openbsd(struct parsed_partitions *state,
312parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 308 sector_t offset, sector_t size, int origin)
313 u32 offset, u32 size, int origin)
314{ 309{
315#ifdef CONFIG_BSD_DISKLABEL 310#ifdef CONFIG_BSD_DISKLABEL
316 parse_bsd(state, bdev, offset, size, origin, 311 parse_bsd(state, offset, size, origin, "openbsd",
317 "openbsd", OPENBSD_MAXPARTITIONS); 312 OPENBSD_MAXPARTITIONS);
318#endif 313#endif
319} 314}
320 315
@@ -322,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
322 * Create devices for Unixware partitions listed in a disklabel, under a 317 * Create devices for Unixware partitions listed in a disklabel, under a
323 * dos-like partition. See parse_extended() for more information. 318 * dos-like partition. See parse_extended() for more information.
324 */ 319 */
325static void 320static void parse_unixware(struct parsed_partitions *state,
326parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 321 sector_t offset, sector_t size, int origin)
327 u32 offset, u32 size, int origin)
328{ 322{
329#ifdef CONFIG_UNIXWARE_DISKLABEL 323#ifdef CONFIG_UNIXWARE_DISKLABEL
330 Sector sect; 324 Sector sect;
331 struct unixware_disklabel *l; 325 struct unixware_disklabel *l;
332 struct unixware_slice *p; 326 struct unixware_slice *p;
333 327
334 l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect); 328 l = read_part_sector(state, offset + 29, &sect);
335 if (!l) 329 if (!l)
336 return; 330 return;
337 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || 331 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -348,7 +342,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
348 342
349 if (p->s_label != UNIXWARE_FS_UNUSED) 343 if (p->s_label != UNIXWARE_FS_UNUSED)
350 put_partition(state, state->next++, 344 put_partition(state, state->next++,
351 START_SECT(p), NR_SECTS(p)); 345 le32_to_cpu(p->start_sect),
346 le32_to_cpu(p->nr_sects));
352 p++; 347 p++;
353 } 348 }
354 put_dev_sector(sect); 349 put_dev_sector(sect);
@@ -361,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
361 * Anand Krishnamurthy <anandk@wiproge.med.ge.com> 356 * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
362 * Rajeev V. Pillai <rajeevvp@yahoo.com> 357 * Rajeev V. Pillai <rajeevvp@yahoo.com>
363 */ 358 */
364static void 359static void parse_minix(struct parsed_partitions *state,
365parse_minix(struct parsed_partitions *state, struct block_device *bdev, 360 sector_t offset, sector_t size, int origin)
366 u32 offset, u32 size, int origin)
367{ 361{
368#ifdef CONFIG_MINIX_SUBPARTITION 362#ifdef CONFIG_MINIX_SUBPARTITION
369 Sector sect; 363 Sector sect;
@@ -371,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
371 struct partition *p; 365 struct partition *p;
372 int i; 366 int i;
373 367
374 data = read_dev_sector(bdev, offset, &sect); 368 data = read_part_sector(state, offset, &sect);
375 if (!data) 369 if (!data)
376 return; 370 return;
377 371
@@ -390,7 +384,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
390 /* add each partition in use */ 384 /* add each partition in use */
391 if (SYS_IND(p) == MINIX_PARTITION) 385 if (SYS_IND(p) == MINIX_PARTITION)
392 put_partition(state, state->next++, 386 put_partition(state, state->next++,
393 START_SECT(p), NR_SECTS(p)); 387 start_sect(p), nr_sects(p));
394 } 388 }
395 printk(" >\n"); 389 printk(" >\n");
396 } 390 }
@@ -400,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
400 394
401static struct { 395static struct {
402 unsigned char id; 396 unsigned char id;
403 void (*parse)(struct parsed_partitions *, struct block_device *, 397 void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
404 u32, u32, int);
405} subtypes[] = { 398} subtypes[] = {
406 {FREEBSD_PARTITION, parse_freebsd}, 399 {FREEBSD_PARTITION, parse_freebsd},
407 {NETBSD_PARTITION, parse_netbsd}, 400 {NETBSD_PARTITION, parse_netbsd},
@@ -413,16 +406,16 @@ static struct {
413 {0, NULL}, 406 {0, NULL},
414}; 407};
415 408
416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 409int msdos_partition(struct parsed_partitions *state)
417{ 410{
418 int sector_size = bdev_logical_block_size(bdev) / 512; 411 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
419 Sector sect; 412 Sector sect;
420 unsigned char *data; 413 unsigned char *data;
421 struct partition *p; 414 struct partition *p;
422 struct fat_boot_sector *fb; 415 struct fat_boot_sector *fb;
423 int slot; 416 int slot;
424 417
425 data = read_dev_sector(bdev, 0, &sect); 418 data = read_part_sector(state, 0, &sect);
426 if (!data) 419 if (!data)
427 return -1; 420 return -1;
428 if (!msdos_magic_present(data + 510)) { 421 if (!msdos_magic_present(data + 510)) {
@@ -430,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
430 return 0; 423 return 0;
431 } 424 }
432 425
433 if (aix_magic_present(data, bdev)) { 426 if (aix_magic_present(state, data)) {
434 put_dev_sector(sect); 427 put_dev_sector(sect);
435 printk( " [AIX]"); 428 printk( " [AIX]");
436 return 0; 429 return 0;
@@ -483,22 +476,29 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
483 476
484 state->next = 5; 477 state->next = 5;
485 for (slot = 1 ; slot <= 4 ; slot++, p++) { 478 for (slot = 1 ; slot <= 4 ; slot++, p++) {
486 u32 start = START_SECT(p)*sector_size; 479 sector_t start = start_sect(p)*sector_size;
487 u32 size = NR_SECTS(p)*sector_size; 480 sector_t size = nr_sects(p)*sector_size;
488 if (!size) 481 if (!size)
489 continue; 482 continue;
490 if (is_extended_partition(p)) { 483 if (is_extended_partition(p)) {
491 /* prevent someone doing mkfs or mkswap on an 484 /*
492 extended partition, but leave room for LILO */ 485 * prevent someone doing mkfs or mkswap on an
493 put_partition(state, slot, start, size == 1 ? 1 : 2); 486 * extended partition, but leave room for LILO
487 * FIXME: this uses one logical sector for > 512b
488 * sector, although it may not be enough/proper.
489 */
490 sector_t n = 2;
491 n = min(size, max(sector_size, n));
492 put_partition(state, slot, start, n);
493
494 printk(" <"); 494 printk(" <");
495 parse_extended(state, bdev, start, size); 495 parse_extended(state, start, size);
496 printk(" >"); 496 printk(" >");
497 continue; 497 continue;
498 } 498 }
499 put_partition(state, slot, start, size); 499 put_partition(state, slot, start, size);
500 if (SYS_IND(p) == LINUX_RAID_PARTITION) 500 if (SYS_IND(p) == LINUX_RAID_PARTITION)
501 state->parts[slot].flags = 1; 501 state->parts[slot].flags = ADDPART_FLAG_RAID;
502 if (SYS_IND(p) == DM6_PARTITION) 502 if (SYS_IND(p) == DM6_PARTITION)
503 printk("[DM]"); 503 printk("[DM]");
504 if (SYS_IND(p) == EZD_PARTITION) 504 if (SYS_IND(p) == EZD_PARTITION)
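The extended-partition stub used to be pinned at two 512-byte units (size == 1 ? 1 : 2): large enough for LILO's boot area, small enough that mkfs or mkswap on it fails. On a 4K-logical-sector disk two 512-byte units is less than one addressable sector, hence the new clamp n = min(size, max(sector_size, n)). Worked through in 512-byte units: sector_size is 1 on a 512B disk, so n = min(size, 2) as before; on a 4Kn disk sector_size is 8, so n becomes min(size, 8). As a sketch:

#include <linux/kernel.h>	/* min(), max() */
#include <linux/types.h>

/* Stub length (in 512B units) for an extended-partition entry; the
 * wrapper function exists only for illustration. */
static sector_t extended_stub_len(sector_t size, sector_t sector_size)
{
	sector_t n = 2;				/* legacy LILO allowance */

	n = min(size, max(sector_size, n));	/* >= 1 logical sector, <= size */
	return n;	/* 2 on 512B disks, 8 on 4Kn, never past 'size' */
}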
@@ -513,7 +513,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
513 unsigned char id = SYS_IND(p); 513 unsigned char id = SYS_IND(p);
514 int n; 514 int n;
515 515
516 if (!NR_SECTS(p)) 516 if (!nr_sects(p))
517 continue; 517 continue;
518 518
519 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) 519 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
521 521
522 if (!subtypes[n].parse) 522 if (!subtypes[n].parse)
523 continue; 523 continue;
524 subtypes[n].parse(state, bdev, START_SECT(p)*sector_size, 524 subtypes[n].parse(state, start_sect(p) * sector_size,
525 NR_SECTS(p)*sector_size, slot); 525 nr_sects(p) * sector_size, slot);
526 } 526 }
527 put_dev_sector(sect); 527 put_dev_sector(sect);
528 return 1; 528 return 1;
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
index 01e5e0b6902d..38c781c490b3 100644
--- a/fs/partitions/msdos.h
+++ b/fs/partitions/msdos.h
@@ -4,5 +4,5 @@
4 4
5#define MSDOS_LABEL_MAGIC 0xAA55 5#define MSDOS_LABEL_MAGIC 0xAA55
6 6
7int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); 7int msdos_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index c05c17bc5df3..fc22b85d436a 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13int osf_partition(struct parsed_partitions *state, struct block_device *bdev) 13int osf_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 int slot = 1; 16 int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
49 } * label; 49 } * label;
50 struct d_partition * partition; 50 struct d_partition * partition;
51 51
52 data = read_dev_sector(bdev, 0, &sect); 52 data = read_part_sector(state, 0, &sect);
53 if (!data) 53 if (!data)
54 return -1; 54 return -1;
55 55
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
index 427b8eab314b..20ed2315ec16 100644
--- a/fs/partitions/osf.h
+++ b/fs/partitions/osf.h
@@ -4,4 +4,4 @@
4 4
5#define DISKLABELMAGIC (0x82564557UL) 5#define DISKLABELMAGIC (0x82564557UL)
6 6
7int osf_partition(struct parsed_partitions *state, struct block_device *bdev); 7int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index ed5ac83fe83a..43b1df9aa16c 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -27,7 +27,7 @@ struct sgi_disklabel {
27 __be32 _unused1; /* Padding */ 27 __be32 _unused1; /* Padding */
28}; 28};
29 29
30int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) 30int sgi_partition(struct parsed_partitions *state)
31{ 31{
32 int i, csum; 32 int i, csum;
33 __be32 magic; 33 __be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
39 struct sgi_partition *p; 39 struct sgi_partition *p;
40 char b[BDEVNAME_SIZE]; 40 char b[BDEVNAME_SIZE];
41 41
42 label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect); 42 label = read_part_sector(state, 0, &sect);
43 if (!label) 43 if (!label)
44 return -1; 44 return -1;
45 p = &label->partitions[0]; 45 p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
57 } 57 }
58 if(csum) { 58 if(csum) {
59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", 59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
60 bdevname(bdev, b)); 60 bdevname(state->bdev, b));
61 put_dev_sector(sect); 61 put_dev_sector(sect);
62 return 0; 62 return 0;
63 } 63 }
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
index 5d5595c09928..b9553ebdd5a9 100644
--- a/fs/partitions/sgi.h
+++ b/fs/partitions/sgi.h
@@ -2,7 +2,7 @@
2 * fs/partitions/sgi.h 2 * fs/partitions/sgi.h
3 */ 3 */
4 4
5extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); 5extern int sgi_partition(struct parsed_partitions *state);
6 6
7#define SGI_LABEL_MAGIC 0x0be5a941 7#define SGI_LABEL_MAGIC 0x0be5a941
8 8
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index c95e6a62c01d..a32660e25f7f 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "sun.h" 11#include "sun.h"
12 12
13int sun_partition(struct parsed_partitions *state, struct block_device *bdev) 13int sun_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 __be16 csum; 16 __be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
61 int use_vtoc; 61 int use_vtoc;
62 int nparts; 62 int nparts;
63 63
64 label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect); 64 label = read_part_sector(state, 0, &sect);
65 if (!label) 65 if (!label)
66 return -1; 66 return -1;
67 67
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
78 csum ^= *ush--; 78 csum ^= *ush--;
79 if (csum) { 79 if (csum) {
80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", 80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
81 bdevname(bdev, b)); 81 bdevname(state->bdev, b));
82 put_dev_sector(sect); 82 put_dev_sector(sect);
83 return 0; 83 return 0;
84 } 84 }
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
index 7f864d1f86d4..2424baa8319f 100644
--- a/fs/partitions/sun.h
+++ b/fs/partitions/sun.h
@@ -5,4 +5,4 @@
5#define SUN_LABEL_MAGIC 0xDABE 5#define SUN_LABEL_MAGIC 0xDABE
6#define SUN_VTOC_SANITY 0x600DDEEE 6#define SUN_VTOC_SANITY 0x600DDEEE
7 7
8int sun_partition(struct parsed_partitions *state, struct block_device *bdev); 8int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 4eba27b78643..9030c864428e 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -46,7 +46,7 @@ struct slice {
46}; 46};
47 47
48 48
49int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) 49int sysv68_partition(struct parsed_partitions *state)
50{ 50{
51 int i, slices; 51 int i, slices;
52 int slot = 1; 52 int slot = 1;
@@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
55 struct dkblk0 *b; 55 struct dkblk0 *b;
56 struct slice *slice; 56 struct slice *slice;
57 57
58 data = read_dev_sector(bdev, 0, &sect); 58 data = read_part_sector(state, 0, &sect);
59 if (!data) 59 if (!data)
60 return -1; 60 return -1;
61 61
@@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
68 i = be32_to_cpu(b->dk_ios.ios_slcblk); 68 i = be32_to_cpu(b->dk_ios.ios_slcblk);
69 put_dev_sector(sect); 69 put_dev_sector(sect);
70 70
71 data = read_dev_sector(bdev, i, &sect); 71 data = read_part_sector(state, i, &sect);
72 if (!data) 72 if (!data)
73 return -1; 73 return -1;
74 74
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
index fa733f68431b..bf2f5ffa97ac 100644
--- a/fs/partitions/sysv68.h
+++ b/fs/partitions/sysv68.h
@@ -1 +1 @@
extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index ec852c11dce4..db9eef260364 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "ultrix.h" 10#include "ultrix.h"
11 11
12int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) 12int ultrix_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 Sector sect; 15 Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
26#define PT_MAGIC 0x032957 /* Partition magic number */ 26#define PT_MAGIC 0x032957 /* Partition magic number */
27#define PT_VALID 1 /* Indicates if struct is valid */ 27#define PT_VALID 1 /* Indicates if struct is valid */
28 28
29 data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect); 29 data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
30 if (!data) 30 if (!data)
31 return -1; 31 return -1;
32 32
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
index a74bf8e2d370..a3cc00b2bded 100644
--- a/fs/partitions/ultrix.h
+++ b/fs/partitions/ultrix.h
@@ -2,4 +2,4 @@
2 * fs/partitions/ultrix.h 2 * fs/partitions/ultrix.h
3 */ 3 */
4 4
5int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); 5int ultrix_partition(struct parsed_partitions *state);
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff3158..db6eaaba0dd8 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/log2.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/pipe_fs_i.h> 16#include <linux/pipe_fs_i.h>
16#include <linux/uio.h> 17#include <linux/uio.h>
@@ -18,11 +19,18 @@
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/audit.h> 20#include <linux/audit.h>
20#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/fcntl.h>
21 23
22#include <asm/uaccess.h> 24#include <asm/uaccess.h>
23#include <asm/ioctls.h> 25#include <asm/ioctls.h>
24 26
25/* 27/*
28 * The max size that a non-root user is allowed to grow the pipe. Can
29 * be set by root in /proc/sys/fs/pipe-max-pages
30 */
31unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
32
33/*
26 * We use a start+len construction, which provides full use of the 34 * We use a start+len construction, which provides full use of the
27 * allocated memory. 35 * allocated memory.
28 * -- Florian Coosmann (FGC) 36 * -- Florian Coosmann (FGC)
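pipe_max_pages caps how far a non-root user may grow a pipe (default 16 * PIPE_DEF_BUFFERS slots) and is writable by root via /proc/sys/fs/pipe-max-pages. The new <linux/fcntl.h> include points at the user-visible half of this series, fcntl() commands for reading and resizing pipe capacity; those hunks are not shown here, so the userspace sketch below assumes the F_GETPIPE_SZ/F_SETPIPE_SZ interface from the same series, with the argument counted in pages as in this version:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];

	if (pipe(fds))
		return 1;
	/* ask for 32 slots; an unprivileged caller gets EINVAL
	 * if 32 exceeds pipe-max-pages */
	if (fcntl(fds[1], F_SETPIPE_SZ, 32) < 0)
		perror("F_SETPIPE_SZ");
	printf("slots: %d\n", fcntl(fds[1], F_GETPIPE_SZ));
	return 0;
}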
@@ -222,6 +230,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
222 230
223 return kmap(buf->page); 231 return kmap(buf->page);
224} 232}
233EXPORT_SYMBOL(generic_pipe_buf_map);
225 234
226/** 235/**
227 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer 236 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -241,6 +250,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
241 } else 250 } else
242 kunmap(buf->page); 251 kunmap(buf->page);
243} 252}
253EXPORT_SYMBOL(generic_pipe_buf_unmap);
244 254
245/** 255/**
246 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer 256 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -271,6 +281,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
271 281
272 return 1; 282 return 1;
273} 283}
284EXPORT_SYMBOL(generic_pipe_buf_steal);
274 285
275/** 286/**
276 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer 287 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -286,6 +297,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
286{ 297{
287 page_cache_get(buf->page); 298 page_cache_get(buf->page);
288} 299}
300EXPORT_SYMBOL(generic_pipe_buf_get);
289 301
290/** 302/**
291 * generic_pipe_buf_confirm - verify contents of the pipe buffer 303 * generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -301,6 +313,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
301{ 313{
302 return 0; 314 return 0;
303} 315}
316EXPORT_SYMBOL(generic_pipe_buf_confirm);
304 317
305/** 318/**
306 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer 319 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -315,6 +328,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
315{ 328{
316 page_cache_release(buf->page); 329 page_cache_release(buf->page);
317} 330}
331EXPORT_SYMBOL(generic_pipe_buf_release);
318 332
319static const struct pipe_buf_operations anon_pipe_buf_ops = { 333static const struct pipe_buf_operations anon_pipe_buf_ops = {
320 .can_merge = 1, 334 .can_merge = 1,
@@ -390,7 +404,7 @@ redo:
390 if (!buf->len) { 404 if (!buf->len) {
391 buf->ops = NULL; 405 buf->ops = NULL;
392 ops->release(pipe, buf); 406 ops->release(pipe, buf);
393 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); 407 curbuf = (curbuf + 1) & (pipe->buffers - 1);
394 pipe->curbuf = curbuf; 408 pipe->curbuf = curbuf;
395 pipe->nrbufs = --bufs; 409 pipe->nrbufs = --bufs;
396 do_wakeup = 1; 410 do_wakeup = 1;
@@ -472,7 +486,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
472 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 486 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
473 if (pipe->nrbufs && chars != 0) { 487 if (pipe->nrbufs && chars != 0) {
474 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 488 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
475 (PIPE_BUFFERS-1); 489 (pipe->buffers - 1);
476 struct pipe_buffer *buf = pipe->bufs + lastbuf; 490 struct pipe_buffer *buf = pipe->bufs + lastbuf;
477 const struct pipe_buf_operations *ops = buf->ops; 491 const struct pipe_buf_operations *ops = buf->ops;
478 int offset = buf->offset + buf->len; 492 int offset = buf->offset + buf->len;
@@ -518,8 +532,8 @@ redo1:
518 break; 532 break;
519 } 533 }
520 bufs = pipe->nrbufs; 534 bufs = pipe->nrbufs;
521 if (bufs < PIPE_BUFFERS) { 535 if (bufs < pipe->buffers) {
522 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); 536 int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
523 struct pipe_buffer *buf = pipe->bufs + newbuf; 537 struct pipe_buffer *buf = pipe->bufs + newbuf;
524 struct page *page = pipe->tmp_page; 538 struct page *page = pipe->tmp_page;
525 char *src; 539 char *src;
@@ -580,7 +594,7 @@ redo2:
580 if (!total_len) 594 if (!total_len)
581 break; 595 break;
582 } 596 }
583 if (bufs < PIPE_BUFFERS) 597 if (bufs < pipe->buffers)
584 continue; 598 continue;
585 if (filp->f_flags & O_NONBLOCK) { 599 if (filp->f_flags & O_NONBLOCK) {
586 if (!ret) 600 if (!ret)
@@ -640,7 +654,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
640 nrbufs = pipe->nrbufs; 654 nrbufs = pipe->nrbufs;
641 while (--nrbufs >= 0) { 655 while (--nrbufs >= 0) {
642 count += pipe->bufs[buf].len; 656 count += pipe->bufs[buf].len;
643 buf = (buf+1) & (PIPE_BUFFERS-1); 657 buf = (buf+1) & (pipe->buffers - 1);
644 } 658 }
645 mutex_unlock(&inode->i_mutex); 659 mutex_unlock(&inode->i_mutex);
646 660
@@ -671,7 +685,7 @@ pipe_poll(struct file *filp, poll_table *wait)
671 } 685 }
672 686
673 if (filp->f_mode & FMODE_WRITE) { 687 if (filp->f_mode & FMODE_WRITE) {
674 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; 688 mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
675 /* 689 /*
676 * Most Unices do not set POLLERR for FIFOs but on Linux they 690 * Most Unices do not set POLLERR for FIFOs but on Linux they
677 * behave exactly like pipes for poll(). 691 * behave exactly like pipes for poll().
@@ -877,25 +891,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
877 891
878 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); 892 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
879 if (pipe) { 893 if (pipe) {
880 init_waitqueue_head(&pipe->wait); 894 pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
881 pipe->r_counter = pipe->w_counter = 1; 895 if (pipe->bufs) {
882 pipe->inode = inode; 896 init_waitqueue_head(&pipe->wait);
897 pipe->r_counter = pipe->w_counter = 1;
898 pipe->inode = inode;
899 pipe->buffers = PIPE_DEF_BUFFERS;
900 return pipe;
901 }
902 kfree(pipe);
883 } 903 }
884 904
885 return pipe; 905 return NULL;
886} 906}
887 907
888void __free_pipe_info(struct pipe_inode_info *pipe) 908void __free_pipe_info(struct pipe_inode_info *pipe)
889{ 909{
890 int i; 910 int i;
891 911
892 for (i = 0; i < PIPE_BUFFERS; i++) { 912 for (i = 0; i < pipe->buffers; i++) {
893 struct pipe_buffer *buf = pipe->bufs + i; 913 struct pipe_buffer *buf = pipe->bufs + i;
894 if (buf->ops) 914 if (buf->ops)
895 buf->ops->release(pipe, buf); 915 buf->ops->release(pipe, buf);
896 } 916 }
897 if (pipe->tmp_page) 917 if (pipe->tmp_page)
898 __free_page(pipe->tmp_page); 918 __free_page(pipe->tmp_page);
919 kfree(pipe->bufs);
899 kfree(pipe); 920 kfree(pipe);
900} 921}
901 922
@@ -1094,6 +1115,94 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
1094} 1115}
1095 1116
1096/* 1117/*
1118 * Allocate a new array of pipe buffers and copy the info over. Returns the
1119 * pipe size if successful, or -ERROR on error.
1120 */
1121static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1122{
1123 struct pipe_buffer *bufs;
1124
1125 /*
1126 * Must be a power-of-2 currently
1127 */
1128 if (!is_power_of_2(arg))
1129 return -EINVAL;
1130
1131 /*
1132 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
1133 * expect a lot of shrink+grow operations, just free and allocate
1134 * again like we would do for growing. If the pipe currently
1135 * contains more buffers than arg, then return busy.
1136 */
1137 if (arg < pipe->nrbufs)
1138 return -EBUSY;
1139
1140 bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
1141 if (unlikely(!bufs))
1142 return -ENOMEM;
1143
1144 /*
1145 * The pipe array wraps around, so just start the new one at zero
1146 * and adjust the indexes.
1147 */
1148 if (pipe->nrbufs) {
1149 const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
1150 const unsigned int head = pipe->nrbufs - tail;
1151
1152 if (head)
1153 memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
1154 if (tail)
1155 memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
1156 }
1157
1158 pipe->curbuf = 0;
1159 kfree(pipe->bufs);
1160 pipe->bufs = bufs;
1161 pipe->buffers = arg;
1162 return arg;
1163}
1164
1165long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1166{
1167 struct pipe_inode_info *pipe;
1168 long ret;
1169
1170 pipe = file->f_path.dentry->d_inode->i_pipe;
1171 if (!pipe)
1172 return -EBADF;
1173
1174 mutex_lock(&pipe->inode->i_mutex);
1175
1176 switch (cmd) {
1177 case F_SETPIPE_SZ:
1178 if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) {
1179 ret = -EINVAL;
1180 goto out;
1181 }
1182 /*
1183 * The pipe needs to be at least 2 pages large to
1184 * guarantee POSIX behaviour.
1185 */
1186 if (arg < 2) {
1187 ret = -EINVAL;
1188 goto out;
1189 }
1190 ret = pipe_set_size(pipe, arg);
1191 break;
1192 case F_GETPIPE_SZ:
1193 ret = pipe->buffers;
1194 break;
1195 default:
1196 ret = -EINVAL;
1197 break;
1198 }
1199
1200out:
1201 mutex_unlock(&pipe->inode->i_mutex);
1202 return ret;
1203}
1204
1205/*
1097 * pipefs should _never_ be mounted by userland - too much of security hassle, 1206 * pipefs should _never_ be mounted by userland - too much of security hassle,
1098 * no real gain from having the whole whorehouse mounted. So we don't need 1207 * no real gain from having the whole whorehouse mounted. So we don't need
1099 * any operations on the root directory. However, we need a non-trivial 1208 * any operations on the root directory. However, we need a non-trivial
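
Taken together, the pipe.c hunks replace the compile-time PIPE_BUFFERS ring with a per-pipe pipe->buffers array, allocated at PIPE_DEF_BUFFERS and resizable through two new fcntls. Note that pipe_set_size() restarts the ring at slot zero: the head run is copied from pipe->bufs + curbuf and the wrapped tail from the start of the old array, which is why the second memcpy reads from pipe->bufs rather than pipe->bufs + curbuf. A hedged userspace sketch of the interface as defined here, where the argument counts pipe buffers (pages), not bytes, and the fallback fcntl numbers are this series' F_LINUX_SPECIFIC_BASE assignments:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_SETPIPE_SZ
#define F_SETPIPE_SZ	1031	/* F_LINUX_SPECIFIC_BASE + 7 */
#define F_GETPIPE_SZ	1032	/* F_LINUX_SPECIFIC_BASE + 8 */
#endif

int main(void)
{
	int fds[2];

	if (pipe(fds) < 0)
		return 1;
	/* grow the ring to 64 buffers: must be a power of two, and
	 * anything above pipe_max_pages requires CAP_SYS_ADMIN */
	if (fcntl(fds[1], F_SETPIPE_SZ, 64) < 0)
		perror("F_SETPIPE_SZ");
	/* F_GETPIPE_SZ reports the current number of buffers */
	printf("pipe buffers: %ld\n", (long)fcntl(fds[1], F_GETPIPE_SZ));
	close(fds[0]);
	close(fds[1]);
	return 0;
}
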
diff --git a/fs/proc/array.c b/fs/proc/array.c
index aa8637b81028..9b58d38bc911 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
68#include <linux/hugetlb.h> 68#include <linux/hugetlb.h>
69#include <linux/pagemap.h> 69#include <linux/pagemap.h>
70#include <linux/swap.h> 70#include <linux/swap.h>
71#include <linux/slab.h>
72#include <linux/smp.h> 71#include <linux/smp.h>
73#include <linux/signal.h> 72#include <linux/signal.h>
74#include <linux/highmem.h> 73#include <linux/highmem.h>
@@ -82,7 +81,6 @@
82#include <linux/pid_namespace.h> 81#include <linux/pid_namespace.h>
83#include <linux/ptrace.h> 82#include <linux/ptrace.h>
84#include <linux/tracehook.h> 83#include <linux/tracehook.h>
85#include <linux/swapops.h>
86 84
87#include <asm/pgtable.h> 85#include <asm/pgtable.h>
88#include <asm/processor.h> 86#include <asm/processor.h>
@@ -269,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
269 shpending = p->signal->shared_pending.signal; 267 shpending = p->signal->shared_pending.signal;
270 blocked = p->blocked; 268 blocked = p->blocked;
271 collect_sigign_sigcatch(p, &ignored, &caught); 269 collect_sigign_sigcatch(p, &ignored, &caught);
272 num_threads = atomic_read(&p->signal->count); 270 num_threads = get_nr_threads(p);
273 rcu_read_lock(); /* FIXME: is this correct? */ 271 rcu_read_lock(); /* FIXME: is this correct? */
274 qsize = atomic_read(&__task_cred(p)->user->sigpending); 272 qsize = atomic_read(&__task_cred(p)->user->sigpending);
275 rcu_read_unlock(); 273 rcu_read_unlock();
@@ -412,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
412 tty_nr = new_encode_dev(tty_devnum(sig->tty)); 410 tty_nr = new_encode_dev(tty_devnum(sig->tty));
413 } 411 }
414 412
415 num_threads = atomic_read(&sig->count); 413 num_threads = get_nr_threads(task);
416 collect_sigign_sigcatch(task, &sigign, &sigcatch); 414 collect_sigign_sigcatch(task, &sigign, &sigcatch);
417 415
418 cmin_flt = sig->cmin_flt; 416 cmin_flt = sig->cmin_flt;
@@ -496,7 +494,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
496 rsslim, 494 rsslim,
497 mm ? mm->start_code : 0, 495 mm ? mm->start_code : 0,
498 mm ? mm->end_code : 0, 496 mm ? mm->end_code : 0,
499 (permitted && mm) ? task->stack_start : 0, 497 (permitted && mm) ? mm->start_stack : 0,
500 esp, 498 esp,
501 eip, 499 eip,
502 /* The signal information here is obsolete. 500 /* The signal information here is obsolete.
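
Both the task_sig() and do_task_stat() hunks above stop peeking at signal->count and call get_nr_threads() instead; the open-coded, siglock-taking copy of that helper is deleted from fs/proc/base.c just below. A minimal sketch of the shared helper, assuming it simply reads the atomic thread count (its exact header home is an assumption here):

static inline int get_nr_threads(struct task_struct *tsk)
{
	/* lock-free: an atomic read is enough for reporting purposes */
	return atomic_read(&tsk->signal->count);
}
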
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a7310841c831..acb7ef80ea4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
81#include <linux/elf.h> 81#include <linux/elf.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/fs_struct.h> 83#include <linux/fs_struct.h>
84#include <linux/slab.h>
84#include "internal.h" 85#include "internal.h"
85 86
86/* NOTE: 87/* NOTE:
@@ -165,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
165 return result; 166 return result;
166} 167}
167 168
168static int get_nr_threads(struct task_struct *tsk)
169{
170 unsigned long flags;
171 int count = 0;
172
173 if (lock_task_sighand(tsk, &flags)) {
174 count = atomic_read(&tsk->signal->count);
175 unlock_task_sighand(tsk, &flags);
176 }
177 return count;
178}
179
180static int proc_cwd_link(struct inode *inode, struct path *path) 169static int proc_cwd_link(struct inode *inode, struct path *path)
181{ 170{
182 struct task_struct *task = get_proc_task(inode); 171 struct task_struct *task = get_proc_task(inode);
@@ -442,12 +431,13 @@ static const struct file_operations proc_lstats_operations = {
442unsigned long badness(struct task_struct *p, unsigned long uptime); 431unsigned long badness(struct task_struct *p, unsigned long uptime);
443static int proc_oom_score(struct task_struct *task, char *buffer) 432static int proc_oom_score(struct task_struct *task, char *buffer)
444{ 433{
445 unsigned long points; 434 unsigned long points = 0;
446 struct timespec uptime; 435 struct timespec uptime;
447 436
448 do_posix_clock_monotonic_gettime(&uptime); 437 do_posix_clock_monotonic_gettime(&uptime);
449 read_lock(&tasklist_lock); 438 read_lock(&tasklist_lock);
450 points = badness(task->group_leader, uptime.tv_sec); 439 if (pid_alive(task))
440 points = badness(task, uptime.tv_sec);
451 read_unlock(&tasklist_lock); 441 read_unlock(&tasklist_lock);
452 return sprintf(buffer, "%lu\n", points); 442 return sprintf(buffer, "%lu\n", points);
453} 443}
@@ -728,6 +718,7 @@ out_no_task:
728 718
729static const struct file_operations proc_info_file_operations = { 719static const struct file_operations proc_info_file_operations = {
730 .read = proc_info_read, 720 .read = proc_info_read,
721 .llseek = generic_file_llseek,
731}; 722};
732 723
733static int proc_single_show(struct seq_file *m, void *v) 724static int proc_single_show(struct seq_file *m, void *v)
@@ -985,6 +976,7 @@ out_no_task:
985 976
986static const struct file_operations proc_environ_operations = { 977static const struct file_operations proc_environ_operations = {
987 .read = environ_read, 978 .read = environ_read,
979 .llseek = generic_file_llseek,
988}; 980};
989 981
990static ssize_t oom_adjust_read(struct file *file, char __user *buf, 982static ssize_t oom_adjust_read(struct file *file, char __user *buf,
@@ -1058,6 +1050,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1058static const struct file_operations proc_oom_adjust_operations = { 1050static const struct file_operations proc_oom_adjust_operations = {
1059 .read = oom_adjust_read, 1051 .read = oom_adjust_read,
1060 .write = oom_adjust_write, 1052 .write = oom_adjust_write,
1053 .llseek = generic_file_llseek,
1061}; 1054};
1062 1055
1063#ifdef CONFIG_AUDITSYSCALL 1056#ifdef CONFIG_AUDITSYSCALL
@@ -1129,6 +1122,7 @@ out_free_page:
1129static const struct file_operations proc_loginuid_operations = { 1122static const struct file_operations proc_loginuid_operations = {
1130 .read = proc_loginuid_read, 1123 .read = proc_loginuid_read,
1131 .write = proc_loginuid_write, 1124 .write = proc_loginuid_write,
1125 .llseek = generic_file_llseek,
1132}; 1126};
1133 1127
1134static ssize_t proc_sessionid_read(struct file * file, char __user * buf, 1128static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
@@ -1149,6 +1143,7 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
1149 1143
1150static const struct file_operations proc_sessionid_operations = { 1144static const struct file_operations proc_sessionid_operations = {
1151 .read = proc_sessionid_read, 1145 .read = proc_sessionid_read,
1146 .llseek = generic_file_llseek,
1152}; 1147};
1153#endif 1148#endif
1154 1149
@@ -1200,6 +1195,7 @@ static ssize_t proc_fault_inject_write(struct file * file,
1200static const struct file_operations proc_fault_inject_operations = { 1195static const struct file_operations proc_fault_inject_operations = {
1201 .read = proc_fault_inject_read, 1196 .read = proc_fault_inject_read,
1202 .write = proc_fault_inject_write, 1197 .write = proc_fault_inject_write,
1198 .llseek = generic_file_llseek,
1203}; 1199};
1204#endif 1200#endif
1205 1201
@@ -1941,7 +1937,7 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
1941} 1937}
1942 1938
1943static const struct file_operations proc_fdinfo_file_operations = { 1939static const struct file_operations proc_fdinfo_file_operations = {
1944 .open = nonseekable_open, 1940 .open = nonseekable_open,
1945 .read = proc_fdinfo_read, 1941 .read = proc_fdinfo_read,
1946}; 1942};
1947 1943
@@ -2225,6 +2221,7 @@ out_no_task:
2225static const struct file_operations proc_pid_attr_operations = { 2221static const struct file_operations proc_pid_attr_operations = {
2226 .read = proc_pid_attr_read, 2222 .read = proc_pid_attr_read,
2227 .write = proc_pid_attr_write, 2223 .write = proc_pid_attr_write,
2224 .llseek = generic_file_llseek,
2228}; 2225};
2229 2226
2230static const struct pid_entry attr_dir_stuff[] = { 2227static const struct pid_entry attr_dir_stuff[] = {
@@ -2345,6 +2342,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
2345static const struct file_operations proc_coredump_filter_operations = { 2342static const struct file_operations proc_coredump_filter_operations = {
2346 .read = proc_coredump_filter_read, 2343 .read = proc_coredump_filter_read,
2347 .write = proc_coredump_filter_write, 2344 .write = proc_coredump_filter_write,
2345 .llseek = generic_file_llseek,
2348}; 2346};
2349#endif 2347#endif
2350 2348
@@ -2434,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2434 const struct pid_entry *p = ptr; 2432 const struct pid_entry *p = ptr;
2435 struct inode *inode; 2433 struct inode *inode;
2436 struct proc_inode *ei; 2434 struct proc_inode *ei;
2437 struct dentry *error = ERR_PTR(-EINVAL); 2435 struct dentry *error;
2438 2436
2439 /* Allocate the inode */ 2437 /* Allocate the inode */
2440 error = ERR_PTR(-ENOMEM); 2438 error = ERR_PTR(-ENOMEM);
@@ -2784,7 +2782,7 @@ out:
2784 2782
2785struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 2783struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2786{ 2784{
2787 struct dentry *result = ERR_PTR(-ENOENT); 2785 struct dentry *result;
2788 struct task_struct *task; 2786 struct task_struct *task;
2789 unsigned tgid; 2787 unsigned tgid;
2790 struct pid_namespace *ns; 2788 struct pid_namespace *ns;
@@ -2907,7 +2905,7 @@ out_no_task:
2907 */ 2905 */
2908static const struct pid_entry tid_base_stuff[] = { 2906static const struct pid_entry tid_base_stuff[] = {
2909 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2907 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2910 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations), 2908 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2911 REG("environ", S_IRUSR, proc_environ_operations), 2909 REG("environ", S_IRUSR, proc_environ_operations),
2912 INF("auxv", S_IRUSR, proc_pid_auxv), 2910 INF("auxv", S_IRUSR, proc_pid_auxv),
2913 ONE("status", S_IRUGO, proc_pid_status), 2911 ONE("status", S_IRUGO, proc_pid_status),
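
The recurring .llseek = generic_file_llseek additions in this file (and in kcore, kmsg and vmcore below) give each file_operations an explicit seek method rather than the implicit, BKL-era default_llseek fallback; proc_fdinfo deliberately stays nonseekable. The pattern, sketched for a hypothetical read-only proc file (example_read is illustration only):

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	return 0;	/* hypothetical handler: always EOF */
}

static const struct file_operations example_proc_operations = {
	.read	= example_read,
	/* explicit seek semantics instead of a silent default_llseek */
	.llseek	= generic_file_llseek,
};
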
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 08f4d71dacd7..2791907744ed 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -13,6 +13,7 @@
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include <linux/mount.h> 17#include <linux/mount.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/idr.h> 19#include <linux/idr.h>
@@ -342,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
342/* 343/*
343 * Return an inode number between PROC_DYNAMIC_FIRST and 344 * Return an inode number between PROC_DYNAMIC_FIRST and
344 * 0xffffffff, or zero on failure. 345 * 0xffffffff, or zero on failure.
345 *
346 * Current inode allocations in the proc-fs (hex-numbers):
347 *
348 * 00000000 reserved
349 * 00000001-00000fff static entries (goners)
350 * 001 root-ino
351 *
352 * 00001000-00001fff unused
353 * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff
354 * 80000000-efffffff unused
355 * f0000000-ffffffff dynamic entries
356 *
357 * Goal:
358 * Once we split the thing into several virtual filesystems,
359 * we will get rid of magical ranges (and this comment, BTW).
360 */ 346 */
361static unsigned int get_inode_number(void) 347static unsigned int get_inode_number(void)
362{ 348{
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 445a02bcaab3..aea8502e58a3 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -18,6 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/slab.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
@@ -231,9 +232,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
231 if (rv == -ENOIOCTLCMD) 232 if (rv == -ENOIOCTLCMD)
232 rv = -EINVAL; 233 rv = -EINVAL;
233 } else if (ioctl) { 234 } else if (ioctl) {
234 lock_kernel(); 235 WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, "
236 "%pf will be called without the BKL held\n", ioctl);
235 rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg); 237 rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
236 unlock_kernel();
237 } 238 }
238 239
239 pde_users_dec(pde); 240 pde_users_dec(pde);
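
The WARN_ONCE above replaces the lock_kernel()/unlock_kernel() pair: procfs no longer wraps legacy ->ioctl handlers in the BKL, it just warns that they must convert. A hypothetical conversion sketch (the example_* names and private mutex are illustration only):

static DEFINE_MUTEX(example_mutex);

static long example_unlocked_ioctl(struct file *file, unsigned int cmd,
				   unsigned long arg)
{
	long ret = -ENOTTY;

	/* serialize with a subsystem lock instead of relying on the BKL */
	mutex_lock(&example_mutex);
	switch (cmd) {
	/* ... per-command handling ... */
	}
	mutex_unlock(&example_mutex);
	return ret;
}
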
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -19,6 +19,7 @@
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/bootmem.h> 20#include <linux/bootmem.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/slab.h>
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/io.h> 24#include <asm/io.h>
24#include <linux/list.h> 25#include <linux/list.h>
@@ -490,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
490 } 491 }
491 read_unlock(&kclist_lock); 492 read_unlock(&kclist_lock);
492 493
493 if (m == NULL) { 494 if (&m->list == &kclist_head) {
494 if (clear_user(buffer, tsz)) 495 if (clear_user(buffer, tsz))
495 return -EFAULT; 496 return -EFAULT;
496 } else if (is_vmalloc_or_module_addr((void *)start)) { 497 } else if (is_vmalloc_or_module_addr((void *)start)) {
@@ -557,6 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
557static const struct file_operations proc_kcore_operations = { 558static const struct file_operations proc_kcore_operations = {
558 .read = read_kcore, 559 .read = read_kcore,
559 .open = open_kcore, 560 .open = open_kcore,
561 .llseek = generic_file_llseek,
560}; 562};
561 563
562#ifdef CONFIG_MEMORY_HOTPLUG 564#ifdef CONFIG_MEMORY_HOTPLUG
@@ -586,7 +588,7 @@ static struct kcore_list kcore_text;
586 */ 588 */
587static void __init proc_kcore_text_init(void) 589static void __init proc_kcore_text_init(void)
588{ 590{
589 kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT); 591 kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
590} 592}
591#else 593#else
592static void __init proc_kcore_text_init(void) 594static void __init proc_kcore_text_init(void)
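
The read_kcore() fix above is the classic list_for_each_entry() pitfall: if the loop finishes without a break, the iterator is not NULL, it is the container_of() of the list head itself, so end-of-list must be detected by comparing positions. A sketch of the idiom (handle_unmapped_range() stands in for the clear_user() path):

struct kcore_list *m;

list_for_each_entry(m, &kclist_head, list) {
	if (start >= m->addr && start < m->addr + m->size)
		break;		/* found the entry covering this range */
}
/* no break taken: &m->list has wrapped around to the head, while
 * "m == NULL" would never trigger */
if (&m->list == &kclist_head)
	handle_unmapped_range();
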
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index cfe90a48a6e8..bd4b5a740ff1 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -53,6 +53,7 @@ static const struct file_operations proc_kmsg_operations = {
53 .poll = kmsg_poll, 53 .poll = kmsg_poll,
54 .open = kmsg_open, 54 .open = kmsg_open,
55 .release = kmsg_release, 55 .release = kmsg_release,
56 .llseek = generic_file_llseek,
56}; 57};
57 58
58static int __init proc_kmsg_init(void) 59static int __init proc_kmsg_init(void)
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe115..b1822dde55c2 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
21#include <linux/mmzone.h> 21#include <linux/mmzone.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/seq_file.h> 25#include <linux/seq_file.h>
27#include <linux/hugetlb.h> 26#include <linux/hugetlb.h>
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index f8650dce74fb..ce94801f48ca 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -12,6 +12,7 @@
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/of.h> 13#include <linux/of.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/slab.h>
15#include <asm/prom.h> 16#include <asm/prom.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include "internal.h" 18#include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c38..9020ac15baaa 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/proc_fs.h> 15#include <linux/proc_fs.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/module.h> 20#include <linux/module.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
110 if (err) 110 if (err)
111 return; 111 return;
112 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); 112 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
113 err = PTR_ERR(proc_mnt);
114 if (IS_ERR(proc_mnt)) { 113 if (IS_ERR(proc_mnt)) {
115 unregister_filesystem(&proc_fs_type); 114 unregister_filesystem(&proc_fs_type);
116 return; 115 return;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b9b7aad2003d..bf31b03fc275 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
1#include <linux/cpumask.h> 1#include <linux/cpumask.h>
2#include <linux/fs.h> 2#include <linux/fs.h>
3#include <linux/gfp.h>
4#include <linux/init.h> 3#include <linux/init.h>
5#include <linux/interrupt.h> 4#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 5#include <linux/kernel_stat.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 183f8ff5f400..aea1d3f1ffb5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
4#include <linux/seq_file.h> 4#include <linux/seq_file.h>
5#include <linux/highmem.h> 5#include <linux/highmem.h>
6#include <linux/ptrace.h> 6#include <linux/ptrace.h>
7#include <linux/slab.h>
7#include <linux/pagemap.h> 8#include <linux/pagemap.h>
8#include <linux/mempolicy.h> 9#include <linux/mempolicy.h>
9#include <linux/swap.h> 10#include <linux/swap.h>
@@ -246,25 +247,6 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
246 } else if (vma->vm_start <= mm->start_stack && 247 } else if (vma->vm_start <= mm->start_stack &&
247 vma->vm_end >= mm->start_stack) { 248 vma->vm_end >= mm->start_stack) {
248 name = "[stack]"; 249 name = "[stack]";
249 } else {
250 unsigned long stack_start;
251 struct proc_maps_private *pmp;
252
253 pmp = m->private;
254 stack_start = pmp->task->stack_start;
255
256 if (vma->vm_start <= stack_start &&
257 vma->vm_end >= stack_start) {
258 pad_len_spaces(m, len);
259 seq_printf(m,
260 "[threadstack:%08lx]",
261#ifdef CONFIG_STACK_GROWSUP
262 vma->vm_end - stack_start
263#else
264 stack_start - vma->vm_start
265#endif
266 );
267 }
268 } 250 }
269 } else { 251 } else {
270 name = "[vdso]"; 252 name = "[vdso]";
@@ -406,6 +388,7 @@ static int show_smap(struct seq_file *m, void *v)
406 388
407 memset(&mss, 0, sizeof mss); 389 memset(&mss, 0, sizeof mss);
408 mss.vma = vma; 390 mss.vma = vma;
391 /* mmap_sem is held in m_start */
409 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 392 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
410 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); 393 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
411 394
@@ -552,7 +535,8 @@ const struct file_operations proc_clear_refs_operations = {
552}; 535};
553 536
554struct pagemapread { 537struct pagemapread {
555 u64 __user *out, *end; 538 int pos, len;
539 u64 *buffer;
556}; 540};
557 541
558#define PM_ENTRY_BYTES sizeof(u64) 542#define PM_ENTRY_BYTES sizeof(u64)
@@ -575,10 +559,8 @@ struct pagemapread {
575static int add_to_pagemap(unsigned long addr, u64 pfn, 559static int add_to_pagemap(unsigned long addr, u64 pfn,
576 struct pagemapread *pm) 560 struct pagemapread *pm)
577{ 561{
578 if (put_user(pfn, pm->out)) 562 pm->buffer[pm->pos++] = pfn;
579 return -EFAULT; 563 if (pm->pos >= pm->len)
580 pm->out++;
581 if (pm->out >= pm->end)
582 return PM_END_OF_BUFFER; 564 return PM_END_OF_BUFFER;
583 return 0; 565 return 0;
584} 566}
@@ -652,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
652 return err; 634 return err;
653} 635}
654 636
637#ifdef CONFIG_HUGETLB_PAGE
655static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) 638static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
656{ 639{
657 u64 pme = 0; 640 u64 pme = 0;
@@ -661,31 +644,18 @@ static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
661 return pme; 644 return pme;
662} 645}
663 646
664static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr, 647/* This function walks within one hugetlb entry in a single call */
665 unsigned long end, struct mm_walk *walk) 648static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
649 unsigned long addr, unsigned long end,
650 struct mm_walk *walk)
666{ 651{
667 struct vm_area_struct *vma;
668 struct pagemapread *pm = walk->private; 652 struct pagemapread *pm = walk->private;
669 struct hstate *hs = NULL;
670 int err = 0; 653 int err = 0;
654 u64 pfn;
671 655
672 vma = find_vma(walk->mm, addr);
673 if (vma)
674 hs = hstate_vma(vma);
675 for (; addr != end; addr += PAGE_SIZE) { 656 for (; addr != end; addr += PAGE_SIZE) {
676 u64 pfn = PM_NOT_PRESENT; 657 int offset = (addr & ~hmask) >> PAGE_SHIFT;
677 658 pfn = huge_pte_to_pagemap_entry(*pte, offset);
678 if (vma && (addr >= vma->vm_end)) {
679 vma = find_vma(walk->mm, addr);
680 if (vma)
681 hs = hstate_vma(vma);
682 }
683
684 if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
685 /* calculate pfn of the "raw" page in the hugepage. */
686 int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
687 pfn = huge_pte_to_pagemap_entry(*pte, offset);
688 }
689 err = add_to_pagemap(addr, pfn, pm); 659 err = add_to_pagemap(addr, pfn, pm);
690 if (err) 660 if (err)
691 return err; 661 return err;
@@ -695,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
695 665
696 return err; 666 return err;
697} 667}
668#endif /* HUGETLB_PAGE */
698 669
699/* 670/*
700 * /proc/pid/pagemap - an array mapping virtual pages to pfns 671 * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -720,21 +691,20 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
720 * determine which areas of memory are actually mapped and llseek to 691 * determine which areas of memory are actually mapped and llseek to
721 * skip over unmapped regions. 692 * skip over unmapped regions.
722 */ 693 */
694#define PAGEMAP_WALK_SIZE (PMD_SIZE)
723static ssize_t pagemap_read(struct file *file, char __user *buf, 695static ssize_t pagemap_read(struct file *file, char __user *buf,
724 size_t count, loff_t *ppos) 696 size_t count, loff_t *ppos)
725{ 697{
726 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 698 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
727 struct page **pages, *page;
728 unsigned long uaddr, uend;
729 struct mm_struct *mm; 699 struct mm_struct *mm;
730 struct pagemapread pm; 700 struct pagemapread pm;
731 int pagecount;
732 int ret = -ESRCH; 701 int ret = -ESRCH;
733 struct mm_walk pagemap_walk = {}; 702 struct mm_walk pagemap_walk = {};
734 unsigned long src; 703 unsigned long src;
735 unsigned long svpfn; 704 unsigned long svpfn;
736 unsigned long start_vaddr; 705 unsigned long start_vaddr;
737 unsigned long end_vaddr; 706 unsigned long end_vaddr;
707 int copied = 0;
738 708
739 if (!task) 709 if (!task)
740 goto out; 710 goto out;
@@ -757,38 +727,17 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
757 if (!mm) 727 if (!mm)
758 goto out_task; 728 goto out_task;
759 729
760 730 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
761 uaddr = (unsigned long)buf & PAGE_MASK; 731 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
762 uend = (unsigned long)(buf + count);
763 pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
764 ret = 0;
765 if (pagecount == 0)
766 goto out_mm;
767 pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
768 ret = -ENOMEM; 732 ret = -ENOMEM;
769 if (!pages) 733 if (!pm.buffer)
770 goto out_mm; 734 goto out_mm;
771 735
772 down_read(&current->mm->mmap_sem);
773 ret = get_user_pages(current, current->mm, uaddr, pagecount,
774 1, 0, pages, NULL);
775 up_read(&current->mm->mmap_sem);
776
777 if (ret < 0)
778 goto out_free;
779
780 if (ret != pagecount) {
781 pagecount = ret;
782 ret = -EFAULT;
783 goto out_pages;
784 }
785
786 pm.out = (u64 __user *)buf;
787 pm.end = (u64 __user *)(buf + count);
788
789 pagemap_walk.pmd_entry = pagemap_pte_range; 736 pagemap_walk.pmd_entry = pagemap_pte_range;
790 pagemap_walk.pte_hole = pagemap_pte_hole; 737 pagemap_walk.pte_hole = pagemap_pte_hole;
738#ifdef CONFIG_HUGETLB_PAGE
791 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 739 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
740#endif
792 pagemap_walk.mm = mm; 741 pagemap_walk.mm = mm;
793 pagemap_walk.private = &pm; 742 pagemap_walk.private = &pm;
794 743
@@ -807,23 +756,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
807 * user buffer is tracked in "pm", and the walk 756 * user buffer is tracked in "pm", and the walk
808 * will stop when we hit the end of the buffer. 757 * will stop when we hit the end of the buffer.
809 */ 758 */
810 ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk); 759 ret = 0;
811 if (ret == PM_END_OF_BUFFER) 760 while (count && (start_vaddr < end_vaddr)) {
812 ret = 0; 761 int len;
813 /* don't need mmap_sem for these, but this looks cleaner */ 762 unsigned long end;
814 *ppos += (char __user *)pm.out - buf; 763
815 if (!ret) 764 pm.pos = 0;
816 ret = (char __user *)pm.out - buf; 765 end = start_vaddr + PAGEMAP_WALK_SIZE;
817 766 /* overflow ? */
818out_pages: 767 if (end < start_vaddr || end > end_vaddr)
819 for (; pagecount; pagecount--) { 768 end = end_vaddr;
820 page = pages[pagecount-1]; 769 down_read(&mm->mmap_sem);
821 if (!PageReserved(page)) 770 ret = walk_page_range(start_vaddr, end, &pagemap_walk);
822 SetPageDirty(page); 771 up_read(&mm->mmap_sem);
823 page_cache_release(page); 772 start_vaddr = end;
773
774 len = min(count, PM_ENTRY_BYTES * pm.pos);
775 if (copy_to_user(buf, pm.buffer, len)) {
776 ret = -EFAULT;
777 goto out_free;
778 }
779 copied += len;
780 buf += len;
781 count -= len;
824 } 782 }
783 *ppos += copied;
784 if (!ret || ret == PM_END_OF_BUFFER)
785 ret = copied;
786
825out_free: 787out_free:
826 kfree(pages); 788 kfree(pm.buffer);
827out_mm: 789out_mm:
828 mmput(mm); 790 mmput(mm);
829out_task: 791out_task:
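
After the rewrite, pagemap_read() walks the address space in PAGEMAP_WALK_SIZE (one PMD) chunks into a kmalloc'd staging buffer and copy_to_user()s each chunk, instead of pinning the caller's pages with get_user_pages() and writing entries via put_user(). The user-visible format is unchanged: one 64-bit entry per virtual page, addressable by offset. A hedged userspace sketch of fetching the entry for one address:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* look up the pagemap entry covering vaddr in the calling process;
 * 8 matches the kernel's PM_ENTRY_BYTES */
static int pagemap_entry(uintptr_t vaddr, uint64_t *entry)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return -1;
	if (pread(fd, entry, sizeof(*entry),
		  (off_t)(vaddr / pagesize) * 8) != sizeof(*entry)) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
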
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d9fd64ef81a..46d4b5d72bd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
5#include <linux/fs_struct.h> 5#include <linux/fs_struct.h>
6#include <linux/mount.h> 6#include <linux/mount.h>
7#include <linux/ptrace.h> 7#include <linux/ptrace.h>
8#include <linux/slab.h>
8#include <linux/seq_file.h> 9#include <linux/seq_file.h>
9#include "internal.h" 10#include "internal.h"
10 11
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d39..91c817ff02c3 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
12#include <linux/user.h> 12#include <linux/user.h>
13#include <linux/elf.h> 13#include <linux/elf.h>
14#include <linux/elfcore.h> 14#include <linux/elfcore.h>
15#include <linux/slab.h>
15#include <linux/highmem.h> 16#include <linux/highmem.h>
16#include <linux/bootmem.h> 17#include <linux/bootmem.h>
17#include <linux/init.h> 18#include <linux/init.h>
@@ -162,6 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
162 163
163static const struct file_operations proc_vmcore_operations = { 164static const struct file_operations proc_vmcore_operations = {
164 .read = read_vmcore, 165 .read = read_vmcore,
166 .llseek = generic_file_llseek,
165}; 167};
166 168
167static struct vmcore* __init get_new_element(void) 169static struct vmcore* __init get_new_element(void)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..6e8fc62b40a8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,9 +77,10 @@ out:
77 77
78const struct file_operations qnx4_dir_operations = 78const struct file_operations qnx4_dir_operations =
79{ 79{
80 .llseek = generic_file_llseek,
80 .read = generic_read_dir, 81 .read = generic_read_dir,
81 .readdir = qnx4_readdir, 82 .readdir = qnx4_readdir,
82 .fsync = simple_fsync, 83 .fsync = generic_file_fsync,
83}; 84};
84 85
85const struct inode_operations qnx4_dir_inode_operations = 86const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index dad7fb247ddc..3e21b1e2ad3a 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -33,6 +33,14 @@ config PRINT_QUOTA_WARNING
33 Note that this behavior is currently deprecated and may go away in 33 Note that this behavior is currently deprecated and may go away in
34 future. Please use notification via netlink socket instead. 34 future. Please use notification via netlink socket instead.
35 35
36config QUOTA_DEBUG
37 bool "Additional quota sanity checks"
38 depends on QUOTA
39 default n
40 help
41 If you say Y here, the quota subsystem will perform some additional
42 sanity checks of quota internal structures. If unsure, say N.
43
36# Generic support for tree structured quota files. Selected when needed. 44# Generic support for tree structured quota files. Selected when needed.
37config QUOTA_TREE 45config QUOTA_TREE
38 tristate 46 tristate
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index e0b870f4749f..12c233da1b6b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -80,11 +80,9 @@
80 80
81#include <asm/uaccess.h> 81#include <asm/uaccess.h>
82 82
83#define __DQUOT_PARANOIA
84
85/* 83/*
86 * There are three quota SMP locks. dq_list_lock protects all lists with quotas 84 * There are three quota SMP locks. dq_list_lock protects all lists with quotas
87 * and quota formats, dqstats structure containing statistics about the lists 85 * and quota formats.
88 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and 86 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
89 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. 87 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
90 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly 88 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
@@ -134,7 +132,9 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
134__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); 132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
135EXPORT_SYMBOL(dq_data_lock); 133EXPORT_SYMBOL(dq_data_lock);
136 134
135#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
137static char *quotatypes[] = INITQFNAMES; 136static char *quotatypes[] = INITQFNAMES;
137#endif
138static struct quota_format_type *quota_formats; /* List of registered formats */ 138static struct quota_format_type *quota_formats; /* List of registered formats */
139static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; 139static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
140 140
@@ -275,7 +275,7 @@ static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
275static inline void put_dquot_last(struct dquot *dquot) 275static inline void put_dquot_last(struct dquot *dquot)
276{ 276{
277 list_add_tail(&dquot->dq_free, &free_dquots); 277 list_add_tail(&dquot->dq_free, &free_dquots);
278 dqstats.free_dquots++; 278 dqstats_inc(DQST_FREE_DQUOTS);
279} 279}
280 280
281static inline void remove_free_dquot(struct dquot *dquot) 281static inline void remove_free_dquot(struct dquot *dquot)
@@ -283,7 +283,7 @@ static inline void remove_free_dquot(struct dquot *dquot)
283 if (list_empty(&dquot->dq_free)) 283 if (list_empty(&dquot->dq_free))
284 return; 284 return;
285 list_del_init(&dquot->dq_free); 285 list_del_init(&dquot->dq_free);
286 dqstats.free_dquots--; 286 dqstats_dec(DQST_FREE_DQUOTS);
287} 287}
288 288
289static inline void put_inuse(struct dquot *dquot) 289static inline void put_inuse(struct dquot *dquot)
@@ -291,12 +291,12 @@ static inline void put_inuse(struct dquot *dquot)
291 /* We add to the back of inuse list so we don't have to restart 291 /* We add to the back of inuse list so we don't have to restart
292 * when traversing this list and we block */ 292 * when traversing this list and we block */
293 list_add_tail(&dquot->dq_inuse, &inuse_list); 293 list_add_tail(&dquot->dq_inuse, &inuse_list);
294 dqstats.allocated_dquots++; 294 dqstats_inc(DQST_ALLOC_DQUOTS);
295} 295}
296 296
297static inline void remove_inuse(struct dquot *dquot) 297static inline void remove_inuse(struct dquot *dquot)
298{ 298{
299 dqstats.allocated_dquots--; 299 dqstats_dec(DQST_ALLOC_DQUOTS);
300 list_del(&dquot->dq_inuse); 300 list_del(&dquot->dq_inuse);
301} 301}
302/* 302/*
@@ -319,14 +319,23 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
319 return dquot->dq_sb->dq_op->mark_dirty(dquot); 319 return dquot->dq_sb->dq_op->mark_dirty(dquot);
320} 320}
321 321
322/* Mark dquot dirty in an atomic manner, and return its old dirty flag state */
322int dquot_mark_dquot_dirty(struct dquot *dquot) 323int dquot_mark_dquot_dirty(struct dquot *dquot)
323{ 324{
325 int ret = 1;
326
327 /* If quota is dirty already, we don't have to acquire dq_list_lock */
328 if (test_bit(DQ_MOD_B, &dquot->dq_flags))
329 return 1;
330
324 spin_lock(&dq_list_lock); 331 spin_lock(&dq_list_lock);
325 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) 332 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
326 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> 333 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
327 info[dquot->dq_type].dqi_dirty_list); 334 info[dquot->dq_type].dqi_dirty_list);
335 ret = 0;
336 }
328 spin_unlock(&dq_list_lock); 337 spin_unlock(&dq_list_lock);
329 return 0; 338 return ret;
330} 339}
331EXPORT_SYMBOL(dquot_mark_dquot_dirty); 340EXPORT_SYMBOL(dquot_mark_dquot_dirty);
332 341
@@ -552,8 +561,8 @@ int dquot_scan_active(struct super_block *sb,
552 continue; 561 continue;
553 /* Now we have active dquot so we can just increase use count */ 562 /* Now we have active dquot so we can just increase use count */
554 atomic_inc(&dquot->dq_count); 563 atomic_inc(&dquot->dq_count);
555 dqstats.lookups++;
556 spin_unlock(&dq_list_lock); 564 spin_unlock(&dq_list_lock);
565 dqstats_inc(DQST_LOOKUPS);
557 dqput(old_dquot); 566 dqput(old_dquot);
558 old_dquot = dquot; 567 old_dquot = dquot;
559 ret = fn(dquot, priv); 568 ret = fn(dquot, priv);
@@ -571,7 +580,7 @@ out:
571} 580}
572EXPORT_SYMBOL(dquot_scan_active); 581EXPORT_SYMBOL(dquot_scan_active);
573 582
574int vfs_quota_sync(struct super_block *sb, int type, int wait) 583int dquot_quota_sync(struct super_block *sb, int type, int wait)
575{ 584{
576 struct list_head *dirty; 585 struct list_head *dirty;
577 struct dquot *dquot; 586 struct dquot *dquot;
@@ -598,8 +607,8 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
598 * holding reference so we can safely just increase 607 * holding reference so we can safely just increase
599 * use count */ 608 * use count */
600 atomic_inc(&dquot->dq_count); 609 atomic_inc(&dquot->dq_count);
601 dqstats.lookups++;
602 spin_unlock(&dq_list_lock); 610 spin_unlock(&dq_list_lock);
611 dqstats_inc(DQST_LOOKUPS);
603 sb->dq_op->write_dquot(dquot); 612 sb->dq_op->write_dquot(dquot);
604 dqput(dquot); 613 dqput(dquot);
605 spin_lock(&dq_list_lock); 614 spin_lock(&dq_list_lock);
@@ -611,9 +620,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
611 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) 620 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
612 && info_dirty(&dqopt->info[cnt])) 621 && info_dirty(&dqopt->info[cnt]))
613 sb->dq_op->write_info(sb, cnt); 622 sb->dq_op->write_info(sb, cnt);
614 spin_lock(&dq_list_lock); 623 dqstats_inc(DQST_SYNCS);
615 dqstats.syncs++;
616 spin_unlock(&dq_list_lock);
617 mutex_unlock(&dqopt->dqonoff_mutex); 624 mutex_unlock(&dqopt->dqonoff_mutex);
618 625
619 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) 626 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
@@ -645,7 +652,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
645 652
646 return 0; 653 return 0;
647} 654}
648EXPORT_SYMBOL(vfs_quota_sync); 655EXPORT_SYMBOL(dquot_quota_sync);
649 656
650/* Free unused dquots from cache */ 657/* Free unused dquots from cache */
651static void prune_dqcache(int count) 658static void prune_dqcache(int count)
@@ -669,7 +676,6 @@ static void prune_dqcache(int count)
669 * This is called from kswapd when we think we need some 676 * This is called from kswapd when we think we need some
670 * more memory 677 * more memory
671 */ 678 */
672
673static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) 679static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
674{ 680{
675 if (nr) { 681 if (nr) {
@@ -677,7 +683,9 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
677 prune_dqcache(nr); 683 prune_dqcache(nr);
678 spin_unlock(&dq_list_lock); 684 spin_unlock(&dq_list_lock);
679 } 685 }
680 return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure; 686 return ((unsigned)
687 percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
688 /100) * sysctl_vfs_cache_pressure;
681} 689}
682 690
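
The scattered dqstats.xxx++ updates under dq_list_lock are replaced by dqstats_inc()/dqstats_dec(); as the shrinker hunk above shows, dqstats now wraps an array of percpu counters indexed by DQST_* constants. A minimal sketch of the accessors this diff relies on, assuming that layout:

/* sketch, assuming: struct dqstats { struct percpu_counter counter[...]; } */
static inline void dqstats_inc(unsigned int type)
{
	percpu_counter_inc(&dqstats.counter[type]);
}

static inline void dqstats_dec(unsigned int type)
{
	percpu_counter_dec(&dqstats.counter[type]);
}
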
683static struct shrinker dqcache_shrinker = { 691static struct shrinker dqcache_shrinker = {
@@ -695,7 +703,7 @@ void dqput(struct dquot *dquot)
695 703
696 if (!dquot) 704 if (!dquot)
697 return; 705 return;
698#ifdef __DQUOT_PARANOIA 706#ifdef CONFIG_QUOTA_DEBUG
699 if (!atomic_read(&dquot->dq_count)) { 707 if (!atomic_read(&dquot->dq_count)) {
700 printk("VFS: dqput: trying to free free dquot\n"); 708 printk("VFS: dqput: trying to free free dquot\n");
701 printk("VFS: device %s, dquot of %s %d\n", 709 printk("VFS: device %s, dquot of %s %d\n",
@@ -705,10 +713,7 @@ void dqput(struct dquot *dquot)
705 BUG(); 713 BUG();
706 } 714 }
707#endif 715#endif
708 716 dqstats_inc(DQST_DROPS);
709 spin_lock(&dq_list_lock);
710 dqstats.drops++;
711 spin_unlock(&dq_list_lock);
712we_slept: 717we_slept:
713 spin_lock(&dq_list_lock); 718 spin_lock(&dq_list_lock);
714 if (atomic_read(&dquot->dq_count) > 1) { 719 if (atomic_read(&dquot->dq_count) > 1) {
@@ -748,7 +753,7 @@ we_slept:
748 goto we_slept; 753 goto we_slept;
749 } 754 }
750 atomic_dec(&dquot->dq_count); 755 atomic_dec(&dquot->dq_count);
751#ifdef __DQUOT_PARANOIA 756#ifdef CONFIG_QUOTA_DEBUG
752 /* sanity check */ 757 /* sanity check */
753 BUG_ON(!list_empty(&dquot->dq_free)); 758 BUG_ON(!list_empty(&dquot->dq_free));
754#endif 759#endif
@@ -825,15 +830,15 @@ we_slept:
825 put_inuse(dquot); 830 put_inuse(dquot);
826 /* hash it first so it can be found */ 831 /* hash it first so it can be found */
827 insert_dquot_hash(dquot); 832 insert_dquot_hash(dquot);
828 dqstats.lookups++;
829 spin_unlock(&dq_list_lock); 833 spin_unlock(&dq_list_lock);
834 dqstats_inc(DQST_LOOKUPS);
830 } else { 835 } else {
831 if (!atomic_read(&dquot->dq_count)) 836 if (!atomic_read(&dquot->dq_count))
832 remove_free_dquot(dquot); 837 remove_free_dquot(dquot);
833 atomic_inc(&dquot->dq_count); 838 atomic_inc(&dquot->dq_count);
834 dqstats.cache_hits++;
835 dqstats.lookups++;
836 spin_unlock(&dq_list_lock); 839 spin_unlock(&dq_list_lock);
840 dqstats_inc(DQST_CACHE_HITS);
841 dqstats_inc(DQST_LOOKUPS);
837 } 842 }
838 /* Wait for dq_lock - after this we know that either dquot_release() is 843 /* Wait for dq_lock - after this we know that either dquot_release() is
839 * already finished or it will be canceled due to dq_count > 1 test */ 844 * already finished or it will be canceled due to dq_count > 1 test */
@@ -845,7 +850,7 @@ we_slept:
845 dquot = NULL; 850 dquot = NULL;
846 goto out; 851 goto out;
847 } 852 }
848#ifdef __DQUOT_PARANOIA 853#ifdef CONFIG_QUOTA_DEBUG
849 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ 854 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
850#endif 855#endif
851out: 856out:
@@ -874,14 +879,18 @@ static int dqinit_needed(struct inode *inode, int type)
874static void add_dquot_ref(struct super_block *sb, int type) 879static void add_dquot_ref(struct super_block *sb, int type)
875{ 880{
876 struct inode *inode, *old_inode = NULL; 881 struct inode *inode, *old_inode = NULL;
882#ifdef CONFIG_QUOTA_DEBUG
877 int reserved = 0; 883 int reserved = 0;
884#endif
878 885
879 spin_lock(&inode_lock); 886 spin_lock(&inode_lock);
880 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 887 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
881 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 888 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
882 continue; 889 continue;
890#ifdef CONFIG_QUOTA_DEBUG
883 if (unlikely(inode_get_rsv_space(inode) > 0)) 891 if (unlikely(inode_get_rsv_space(inode) > 0))
884 reserved = 1; 892 reserved = 1;
893#endif
885 if (!atomic_read(&inode->i_writecount)) 894 if (!atomic_read(&inode->i_writecount))
886 continue; 895 continue;
887 if (!dqinit_needed(inode, type)) 896 if (!dqinit_needed(inode, type))
@@ -903,11 +912,13 @@ static void add_dquot_ref(struct super_block *sb, int type)
903 spin_unlock(&inode_lock); 912 spin_unlock(&inode_lock);
904 iput(old_inode); 913 iput(old_inode);
905 914
915#ifdef CONFIG_QUOTA_DEBUG
906 if (reserved) { 916 if (reserved) {
907 printk(KERN_WARNING "VFS (%s): Writes happened before quota" 917 printk(KERN_WARNING "VFS (%s): Writes happened before quota"
908 " was turned on, thus quota information is probably " 918
909 "inconsistent. Please run quotacheck(8).\n", sb->s_id); 919 "inconsistent. Please run quotacheck(8).\n", sb->s_id);
910 } 920 }
921#endif
911} 922}
912 923
913/* 924/*
@@ -934,7 +945,7 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
934 inode->i_dquot[type] = NULL; 945 inode->i_dquot[type] = NULL;
935 if (dquot) { 946 if (dquot) {
936 if (dqput_blocks(dquot)) { 947 if (dqput_blocks(dquot)) {
937#ifdef __DQUOT_PARANOIA 948#ifdef CONFIG_QUOTA_DEBUG
938 if (atomic_read(&dquot->dq_count) != 1) 949 if (atomic_read(&dquot->dq_count) != 1)
939 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); 950 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
940#endif 951#endif
@@ -1484,11 +1495,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1484/* 1495/*
1485 * This operation can block, but only after everything is updated 1496 * This operation can block, but only after everything is updated
1486 */ 1497 */
1487int __dquot_alloc_space(struct inode *inode, qsize_t number, 1498int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1488 int warn, int reserve)
1489{ 1499{
1490 int cnt, ret = 0; 1500 int cnt, ret = 0;
1491 char warntype[MAXQUOTAS]; 1501 char warntype[MAXQUOTAS];
1502 int warn = flags & DQUOT_SPACE_WARN;
1503 int reserve = flags & DQUOT_SPACE_RESERVE;
1504 int nofail = flags & DQUOT_SPACE_NOFAIL;
1492 1505
1493 /* 1506 /*
1494 * First test before acquiring mutex - solves deadlocks when we 1507 * First test before acquiring mutex - solves deadlocks when we
@@ -1509,7 +1522,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1509 continue; 1522 continue;
1510 ret = check_bdq(inode->i_dquot[cnt], number, !warn, 1523 ret = check_bdq(inode->i_dquot[cnt], number, !warn,
1511 warntype+cnt); 1524 warntype+cnt);
1512 if (ret) { 1525 if (ret && !nofail) {
1513 spin_unlock(&dq_data_lock); 1526 spin_unlock(&dq_data_lock);
1514 goto out_flush_warn; 1527 goto out_flush_warn;
1515 } 1528 }
@@ -1608,10 +1621,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
1608/* 1621/*
1609 * This operation can block, but only after everything is updated 1622 * This operation can block, but only after everything is updated
1610 */ 1623 */
1611void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) 1624void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1612{ 1625{
1613 unsigned int cnt; 1626 unsigned int cnt;
1614 char warntype[MAXQUOTAS]; 1627 char warntype[MAXQUOTAS];
1628 int reserve = flags & DQUOT_SPACE_RESERVE;
1615 1629
1616 /* First test before acquiring mutex - solves deadlocks when we 1630 /* First test before acquiring mutex - solves deadlocks when we
1617 * re-enter the quota code and are already holding the mutex */ 1631 * re-enter the quota code and are already holding the mutex */
@@ -1673,16 +1687,19 @@ EXPORT_SYMBOL(dquot_free_inode);
1673 1687
1674/* 1688/*
1675 * Transfer the number of inodes and blocks from one diskquota to another. 1689
1690 * On success, dquot references in transfer_to are consumed and references
1691 * to original dquots that need to be released are placed there. On failure,
1692 * references are kept untouched.
1676 * 1693 *
1677 * This operation can block, but only after everything is updated 1694 * This operation can block, but only after everything is updated
1678 * A transaction must be started when entering this function. 1695 * A transaction must be started when entering this function.
1696 *
1679 */ 1697 */
1680static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) 1698int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1681{ 1699{
1682 qsize_t space, cur_space; 1700 qsize_t space, cur_space;
1683 qsize_t rsv_space = 0; 1701 qsize_t rsv_space = 0;
1684 struct dquot *transfer_from[MAXQUOTAS]; 1702 struct dquot *transfer_from[MAXQUOTAS] = {};
1685 struct dquot *transfer_to[MAXQUOTAS];
1686 int cnt, ret = 0; 1703 int cnt, ret = 0;
1687 char warntype_to[MAXQUOTAS]; 1704 char warntype_to[MAXQUOTAS];
1688 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1705 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
@@ -1692,19 +1709,12 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1692 if (IS_NOQUOTA(inode)) 1709 if (IS_NOQUOTA(inode))
1693 return 0; 1710 return 0;
1694 /* Initialize the arrays */ 1711 /* Initialize the arrays */
1695 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1712 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1696 transfer_from[cnt] = NULL;
1697 transfer_to[cnt] = NULL;
1698 warntype_to[cnt] = QUOTA_NL_NOWARN; 1713 warntype_to[cnt] = QUOTA_NL_NOWARN;
1699 }
1700 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1701 if (mask & (1 << cnt))
1702 transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
1703 }
1704 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1714 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1705 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1715 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1706 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1716 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1707 goto put_all; 1717 return 0;
1708 } 1718 }
1709 spin_lock(&dq_data_lock); 1719 spin_lock(&dq_data_lock);
1710 cur_space = inode_get_bytes(inode); 1720 cur_space = inode_get_bytes(inode);
@@ -1756,47 +1766,41 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1756 1766
1757 mark_all_dquot_dirty(transfer_from); 1767 mark_all_dquot_dirty(transfer_from);
1758 mark_all_dquot_dirty(transfer_to); 1768 mark_all_dquot_dirty(transfer_to);
1759 /* The reference we got is transferred to the inode */ 1769 /* Pass back references to put */
1760 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1770 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1761 transfer_to[cnt] = NULL; 1771 transfer_to[cnt] = transfer_from[cnt];
1762warn_put_all: 1772warn:
1763 flush_warnings(transfer_to, warntype_to); 1773 flush_warnings(transfer_to, warntype_to);
1764 flush_warnings(transfer_from, warntype_from_inodes); 1774 flush_warnings(transfer_from, warntype_from_inodes);
1765 flush_warnings(transfer_from, warntype_from_space); 1775 flush_warnings(transfer_from, warntype_from_space);
1766put_all:
1767 dqput_all(transfer_from);
1768 dqput_all(transfer_to);
1769 return ret; 1776 return ret;
1770over_quota: 1777over_quota:
1771 spin_unlock(&dq_data_lock); 1778 spin_unlock(&dq_data_lock);
1772 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1779 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1773 /* Clear dquot pointers we don't want to dqput() */ 1780 goto warn;
1774 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1775 transfer_from[cnt] = NULL;
1776 goto warn_put_all;
1777} 1781}
1782EXPORT_SYMBOL(__dquot_transfer);
1778 1783
1779/* Wrapper for transferring ownership of an inode for uid/gid only 1784/* Wrapper for transferring ownership of an inode for uid/gid only
1780 * Called from FSXXX_setattr() 1785 * Called from FSXXX_setattr()
1781 */ 1786 */
1782int dquot_transfer(struct inode *inode, struct iattr *iattr) 1787int dquot_transfer(struct inode *inode, struct iattr *iattr)
1783{ 1788{
1784 qid_t chid[MAXQUOTAS]; 1789 struct dquot *transfer_to[MAXQUOTAS] = {};
1785 unsigned long mask = 0; 1790 struct super_block *sb = inode->i_sb;
1791 int ret;
1786 1792
1787 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { 1793 if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode))
1788 mask |= 1 << USRQUOTA; 1794 return 0;
1789 chid[USRQUOTA] = iattr->ia_uid; 1795
1790 } 1796 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
1791 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { 1797 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
1792 mask |= 1 << GRPQUOTA; 1798 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
1793 chid[GRPQUOTA] = iattr->ia_gid; 1799 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
1794 } 1800
1795 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { 1801 ret = __dquot_transfer(inode, transfer_to);
1796 dquot_initialize(inode); 1802 dqput_all(transfer_to);
1797 return __dquot_transfer(inode, chid, mask); 1803 return ret;
1798 }
1799 return 0;
1800} 1804}
1801EXPORT_SYMBOL(dquot_transfer); 1805EXPORT_SYMBOL(dquot_transfer);
1802 1806
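For context, a minimal sketch of how a filesystem's setattr path might use the reworked interface; example_setattr() and its error handling are hypothetical, only dquot_transfer() comes from this patch:

	#include <linux/quotaops.h>

	static int example_setattr(struct dentry *dentry, struct iattr *iattr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		error = inode_change_ok(inode, iattr);
		if (error)
			return error;

		/*
		 * dquot_transfer() now checks sb_any_quota_active() and
		 * IS_NOQUOTA() itself, so it can be called unconditionally
		 * for uid/gid changes.
		 */
		error = dquot_transfer(inode, iattr);
		if (error)
			return error;

		return inode_setattr(inode, iattr);
	}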
@@ -1827,6 +1831,7 @@ const struct dquot_operations dquot_operations = {
1827 .alloc_dquot = dquot_alloc, 1831 .alloc_dquot = dquot_alloc,
1828 .destroy_dquot = dquot_destroy, 1832 .destroy_dquot = dquot_destroy,
1829}; 1833};
1834EXPORT_SYMBOL(dquot_operations);
1830 1835
1831/* 1836/*
1832 * Generic helper for ->open on filesystems supporting disk quotas. 1837 * Generic helper for ->open on filesystems supporting disk quotas.
@@ -1845,7 +1850,7 @@ EXPORT_SYMBOL(dquot_file_open);
1845/* 1850/*
1846 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1851 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1847 */ 1852 */
1848int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) 1853int dquot_disable(struct super_block *sb, int type, unsigned int flags)
1849{ 1854{
1850 int cnt, ret = 0; 1855 int cnt, ret = 0;
1851 struct quota_info *dqopt = sb_dqopt(sb); 1856 struct quota_info *dqopt = sb_dqopt(sb);
@@ -1975,14 +1980,15 @@ put_inodes:
1975 } 1980 }
1976 return ret; 1981 return ret;
1977} 1982}
1978EXPORT_SYMBOL(vfs_quota_disable); 1983EXPORT_SYMBOL(dquot_disable);
1979 1984
1980int vfs_quota_off(struct super_block *sb, int type, int remount) 1985int dquot_quota_off(struct super_block *sb, int type)
1981{ 1986{
1982 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED : 1987 return dquot_disable(sb, type,
1983 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); 1988 DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
1984} 1989}
1985EXPORT_SYMBOL(vfs_quota_off); 1990EXPORT_SYMBOL(dquot_quota_off);
1991
1986/* 1992/*
1987 * Turn on quotas on a device 1993
1988 */ 1994 */
@@ -2100,36 +2106,43 @@ out_fmt:
2100} 2106}
2101 2107
2102/* Reenable quotas on remount RW */ 2108/* Reenable quotas on remount RW */
2103static int vfs_quota_on_remount(struct super_block *sb, int type) 2109int dquot_resume(struct super_block *sb, int type)
2104{ 2110{
2105 struct quota_info *dqopt = sb_dqopt(sb); 2111 struct quota_info *dqopt = sb_dqopt(sb);
2106 struct inode *inode; 2112 struct inode *inode;
2107 int ret; 2113 int ret = 0, cnt;
2108 unsigned int flags; 2114 unsigned int flags;
2109 2115
2110 mutex_lock(&dqopt->dqonoff_mutex); 2116 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2111 if (!sb_has_quota_suspended(sb, type)) { 2117 if (type != -1 && cnt != type)
2118 continue;
2119
2120 mutex_lock(&dqopt->dqonoff_mutex);
2121 if (!sb_has_quota_suspended(sb, cnt)) {
2122 mutex_unlock(&dqopt->dqonoff_mutex);
2123 continue;
2124 }
2125 inode = dqopt->files[cnt];
2126 dqopt->files[cnt] = NULL;
2127 spin_lock(&dq_state_lock);
2128 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2129 DQUOT_LIMITS_ENABLED,
2130 cnt);
2131 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
2132 spin_unlock(&dq_state_lock);
2112 mutex_unlock(&dqopt->dqonoff_mutex); 2133 mutex_unlock(&dqopt->dqonoff_mutex);
2113 return 0;
2114 }
2115 inode = dqopt->files[type];
2116 dqopt->files[type] = NULL;
2117 spin_lock(&dq_state_lock);
2118 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2119 DQUOT_LIMITS_ENABLED, type);
2120 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
2121 spin_unlock(&dq_state_lock);
2122 mutex_unlock(&dqopt->dqonoff_mutex);
2123 2134
2124 flags = dquot_generic_flag(flags, type); 2135 flags = dquot_generic_flag(flags, cnt);
2125 ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id, 2136 ret = vfs_load_quota_inode(inode, cnt,
2126 flags); 2137 dqopt->info[cnt].dqi_fmt_id, flags);
2127 iput(inode); 2138 iput(inode);
2139 }
2128 2140
2129 return ret; 2141 return ret;
2130} 2142}
2143EXPORT_SYMBOL(dquot_resume);
2131 2144
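A sketch of the intended remount pairing, assuming quotas were suspended via dquot_disable(sb, -1, DQUOT_SUSPENDED) on the way to read-only; example_remount() is hypothetical:

	static int example_remount(struct super_block *sb, int *flags, char *data)
	{
		if (*flags & MS_RDONLY)
			/* going read-only: suspend all quota types */
			return dquot_disable(sb, -1, DQUOT_SUSPENDED);
		if (sb->s_flags & MS_RDONLY)
			/* back to read-write: reload what was suspended */
			return dquot_resume(sb, -1);
		return 0;
	}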
2132int vfs_quota_on_path(struct super_block *sb, int type, int format_id, 2145int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
2133 struct path *path) 2146 struct path *path)
2134{ 2147{
2135 int error = security_quota_on(path->dentry); 2148 int error = security_quota_on(path->dentry);
@@ -2144,40 +2157,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
2144 DQUOT_LIMITS_ENABLED); 2157 DQUOT_LIMITS_ENABLED);
2145 return error; 2158 return error;
2146} 2159}
2147EXPORT_SYMBOL(vfs_quota_on_path); 2160EXPORT_SYMBOL(dquot_quota_on_path);
2148 2161
2149int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, 2162int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
2150 int remount)
2151{ 2163{
2152 struct path path; 2164 struct path path;
2153 int error; 2165 int error;
2154 2166
2155 if (remount)
2156 return vfs_quota_on_remount(sb, type);
2157
2158 error = kern_path(name, LOOKUP_FOLLOW, &path); 2167 error = kern_path(name, LOOKUP_FOLLOW, &path);
2159 if (!error) { 2168 if (!error) {
2160 error = vfs_quota_on_path(sb, type, format_id, &path); 2169 error = dquot_quota_on_path(sb, type, format_id, &path);
2161 path_put(&path); 2170 path_put(&path);
2162 } 2171 }
2163 return error; 2172 return error;
2164} 2173}
2165EXPORT_SYMBOL(vfs_quota_on); 2174EXPORT_SYMBOL(dquot_quota_on);
2166 2175
2167/* 2176/*
2168 * More powerful function for turning on quotas allowing setting 2177 * More powerful function for turning on quotas allowing setting
2169 * of individual quota flags 2178 * of individual quota flags
2170 */ 2179 */
2171int vfs_quota_enable(struct inode *inode, int type, int format_id, 2180int dquot_enable(struct inode *inode, int type, int format_id,
2172 unsigned int flags) 2181 unsigned int flags)
2173{ 2182{
2174 int ret = 0; 2183 int ret = 0;
2175 struct super_block *sb = inode->i_sb; 2184 struct super_block *sb = inode->i_sb;
2176 struct quota_info *dqopt = sb_dqopt(sb); 2185 struct quota_info *dqopt = sb_dqopt(sb);
2177 2186
2178 /* Just unsuspend quotas? */ 2187 /* Just unsuspend quotas? */
2179 if (flags & DQUOT_SUSPENDED) 2188 BUG_ON(flags & DQUOT_SUSPENDED);
2180 return vfs_quota_on_remount(sb, type); 2189
2181 if (!flags) 2190 if (!flags)
2182 return 0; 2191 return 0;
2183 /* Just updating flags needed? */ 2192 /* Just updating flags needed? */
@@ -2209,13 +2218,13 @@ out_lock:
2209load_quota: 2218load_quota:
2210 return vfs_load_quota_inode(inode, type, format_id, flags); 2219 return vfs_load_quota_inode(inode, type, format_id, flags);
2211} 2220}
2212EXPORT_SYMBOL(vfs_quota_enable); 2221EXPORT_SYMBOL(dquot_enable);
2213 2222
2214/* 2223/*
2215 * This function is used when filesystem needs to initialize quotas 2224 * This function is used when filesystem needs to initialize quotas
2216 * during mount time. 2225 * during mount time.
2217 */ 2226 */
2218int vfs_quota_on_mount(struct super_block *sb, char *qf_name, 2227int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
2219 int format_id, int type) 2228 int format_id, int type)
2220{ 2229{
2221 struct dentry *dentry; 2230 struct dentry *dentry;
@@ -2241,24 +2250,7 @@ out:
2241 dput(dentry); 2250 dput(dentry);
2242 return error; 2251 return error;
2243} 2252}
2244EXPORT_SYMBOL(vfs_quota_on_mount); 2253EXPORT_SYMBOL(dquot_quota_on_mount);
2245
2246/* Wrapper to turn on quotas when remounting rw */
2247int vfs_dq_quota_on_remount(struct super_block *sb)
2248{
2249 int cnt;
2250 int ret = 0, err;
2251
2252 if (!sb->s_qcop || !sb->s_qcop->quota_on)
2253 return -ENOSYS;
2254 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2255 err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
2256 if (err < 0 && !ret)
2257 ret = err;
2258 }
2259 return ret;
2260}
2261EXPORT_SYMBOL(vfs_dq_quota_on_remount);
2262 2254
2263static inline qsize_t qbtos(qsize_t blocks) 2255static inline qsize_t qbtos(qsize_t blocks)
2264{ 2256{
@@ -2271,25 +2263,30 @@ static inline qsize_t stoqb(qsize_t space)
2271} 2263}
2272 2264
2273/* Generic routine for getting common part of quota structure */ 2265/* Generic routine for getting common part of quota structure */
2274static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) 2266static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2275{ 2267{
2276 struct mem_dqblk *dm = &dquot->dq_dqb; 2268 struct mem_dqblk *dm = &dquot->dq_dqb;
2277 2269
2270 memset(di, 0, sizeof(*di));
2271 di->d_version = FS_DQUOT_VERSION;
2272 di->d_flags = dquot->dq_type == USRQUOTA ?
2273 XFS_USER_QUOTA : XFS_GROUP_QUOTA;
2274 di->d_id = dquot->dq_id;
2275
2278 spin_lock(&dq_data_lock); 2276 spin_lock(&dq_data_lock);
2279 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); 2277 di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit);
2280 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); 2278 di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit);
2281 di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace; 2279 di->d_ino_hardlimit = dm->dqb_ihardlimit;
2282 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2280 di->d_ino_softlimit = dm->dqb_isoftlimit;
2283 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2281 di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace;
2284 di->dqb_curinodes = dm->dqb_curinodes; 2282 di->d_icount = dm->dqb_curinodes;
2285 di->dqb_btime = dm->dqb_btime; 2283 di->d_btimer = dm->dqb_btime;
2286 di->dqb_itime = dm->dqb_itime; 2284 di->d_itimer = dm->dqb_itime;
2287 di->dqb_valid = QIF_ALL;
2288 spin_unlock(&dq_data_lock); 2285 spin_unlock(&dq_data_lock);
2289} 2286}
2290 2287
2291int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, 2288int dquot_get_dqblk(struct super_block *sb, int type, qid_t id,
2292 struct if_dqblk *di) 2289 struct fs_disk_quota *di)
2293{ 2290{
2294 struct dquot *dquot; 2291 struct dquot *dquot;
2295 2292
@@ -2301,55 +2298,74 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2301 2298
2302 return 0; 2299 return 0;
2303} 2300}
2304EXPORT_SYMBOL(vfs_get_dqblk); 2301EXPORT_SYMBOL(dquot_get_dqblk);
2302
2303#define VFS_FS_DQ_MASK \
2304 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
2305 FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \
2306 FS_DQ_BTIMER | FS_DQ_ITIMER)
2305 2307
2306/* Generic routine for setting common part of quota structure */ 2308/* Generic routine for setting common part of quota structure */
2307static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) 2309static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2308{ 2310{
2309 struct mem_dqblk *dm = &dquot->dq_dqb; 2311 struct mem_dqblk *dm = &dquot->dq_dqb;
2310 int check_blim = 0, check_ilim = 0; 2312 int check_blim = 0, check_ilim = 0;
2311 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; 2313 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
2312 2314
2313 if ((di->dqb_valid & QIF_BLIMITS && 2315 if (di->d_fieldmask & ~VFS_FS_DQ_MASK)
2314 (di->dqb_bhardlimit > dqi->dqi_maxblimit || 2316 return -EINVAL;
2315 di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || 2317
2316 (di->dqb_valid & QIF_ILIMITS && 2318 if (((di->d_fieldmask & FS_DQ_BSOFT) &&
2317 (di->dqb_ihardlimit > dqi->dqi_maxilimit || 2319 (di->d_blk_softlimit > dqi->dqi_maxblimit)) ||
2318 di->dqb_isoftlimit > dqi->dqi_maxilimit))) 2320 ((di->d_fieldmask & FS_DQ_BHARD) &&
2321 (di->d_blk_hardlimit > dqi->dqi_maxblimit)) ||
2322 ((di->d_fieldmask & FS_DQ_ISOFT) &&
2323 (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
2324 ((di->d_fieldmask & FS_DQ_IHARD) &&
2325 (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
2319 return -ERANGE; 2326 return -ERANGE;
2320 2327
2321 spin_lock(&dq_data_lock); 2328 spin_lock(&dq_data_lock);
2322 if (di->dqb_valid & QIF_SPACE) { 2329 if (di->d_fieldmask & FS_DQ_BCOUNT) {
2323 dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace; 2330 dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace;
2324 check_blim = 1; 2331 check_blim = 1;
2325 __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); 2332 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
2326 } 2333 }
2327 if (di->dqb_valid & QIF_BLIMITS) { 2334
2328 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); 2335 if (di->d_fieldmask & FS_DQ_BSOFT)
2329 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); 2336 dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit);
2337 if (di->d_fieldmask & FS_DQ_BHARD)
2338 dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit);
2339 if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) {
2330 check_blim = 1; 2340 check_blim = 1;
2331 __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); 2341 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
2332 } 2342 }
2333 if (di->dqb_valid & QIF_INODES) { 2343
2334 dm->dqb_curinodes = di->dqb_curinodes; 2344 if (di->d_fieldmask & FS_DQ_ICOUNT) {
2345 dm->dqb_curinodes = di->d_icount;
2335 check_ilim = 1; 2346 check_ilim = 1;
2336 __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); 2347 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
2337 } 2348 }
2338 if (di->dqb_valid & QIF_ILIMITS) { 2349
2339 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2350 if (di->d_fieldmask & FS_DQ_ISOFT)
2340 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2351 dm->dqb_isoftlimit = di->d_ino_softlimit;
2352 if (di->d_fieldmask & FS_DQ_IHARD)
2353 dm->dqb_ihardlimit = di->d_ino_hardlimit;
2354 if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) {
2341 check_ilim = 1; 2355 check_ilim = 1;
2342 __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); 2356 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
2343 } 2357 }
2344 if (di->dqb_valid & QIF_BTIME) { 2358
2345 dm->dqb_btime = di->dqb_btime; 2359 if (di->d_fieldmask & FS_DQ_BTIMER) {
2360 dm->dqb_btime = di->d_btimer;
2346 check_blim = 1; 2361 check_blim = 1;
2347 __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); 2362 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2348 } 2363 }
2349 if (di->dqb_valid & QIF_ITIME) { 2364
2350 dm->dqb_itime = di->dqb_itime; 2365 if (di->d_fieldmask & FS_DQ_ITIMER) {
2366 dm->dqb_itime = di->d_itimer;
2351 check_ilim = 1; 2367 check_ilim = 1;
2352 __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); 2368 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2353 } 2369 }
2354 2370
2355 if (check_blim) { 2371 if (check_blim) {
@@ -2357,7 +2373,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2357 dm->dqb_curspace < dm->dqb_bsoftlimit) { 2373 dm->dqb_curspace < dm->dqb_bsoftlimit) {
2358 dm->dqb_btime = 0; 2374 dm->dqb_btime = 0;
2359 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2375 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
2360 } else if (!(di->dqb_valid & QIF_BTIME)) 2376 } else if (!(di->d_fieldmask & FS_DQ_BTIMER))
2361 /* Set grace only if user hasn't provided his own... */ 2377 /* Set grace only if user hasn't provided his own... */
2362 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; 2378 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
2363 } 2379 }
@@ -2366,7 +2382,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2366 dm->dqb_curinodes < dm->dqb_isoftlimit) { 2382 dm->dqb_curinodes < dm->dqb_isoftlimit) {
2367 dm->dqb_itime = 0; 2383 dm->dqb_itime = 0;
2368 clear_bit(DQ_INODES_B, &dquot->dq_flags); 2384 clear_bit(DQ_INODES_B, &dquot->dq_flags);
2369 } else if (!(di->dqb_valid & QIF_ITIME)) 2385 } else if (!(di->d_fieldmask & FS_DQ_ITIMER))
2370 /* Set grace only if user hasn't provided his own... */ 2386 /* Set grace only if user hasn't provided his own... */
2371 dm->dqb_itime = get_seconds() + dqi->dqi_igrace; 2387 dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
2372 } 2388 }
@@ -2381,8 +2397,8 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2381 return 0; 2397 return 0;
2382} 2398}
2383 2399
2384int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, 2400int dquot_set_dqblk(struct super_block *sb, int type, qid_t id,
2385 struct if_dqblk *di) 2401 struct fs_disk_quota *di)
2386{ 2402{
2387 struct dquot *dquot; 2403 struct dquot *dquot;
2388 int rc; 2404 int rc;
@@ -2397,10 +2413,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
2397out: 2413out:
2398 return rc; 2414 return rc;
2399} 2415}
2400EXPORT_SYMBOL(vfs_set_dqblk); 2416EXPORT_SYMBOL(dquot_set_dqblk);
2401 2417
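For illustration, setting only the block limits for one user through the new XFS-style structure; the values and example_uid are hypothetical, and d_fieldmask selects exactly which fields do_set_dqblk() applies:

	struct fs_disk_quota fdq = {
		.d_fieldmask	 = FS_DQ_BSOFT | FS_DQ_BHARD,
		.d_blk_softlimit = 9000,	/* qbtos() converts to bytes */
		.d_blk_hardlimit = 10000,
	};
	int err = dquot_set_dqblk(sb, USRQUOTA, example_uid, &fdq);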
2402/* Generic routine for getting common part of quota file information */ 2418/* Generic routine for getting common part of quota file information */
2403int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2419int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2404{ 2420{
2405 struct mem_dqinfo *mi; 2421 struct mem_dqinfo *mi;
2406 2422
@@ -2419,10 +2435,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2419 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2435 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2420 return 0; 2436 return 0;
2421} 2437}
2422EXPORT_SYMBOL(vfs_get_dqinfo); 2438EXPORT_SYMBOL(dquot_get_dqinfo);
2423 2439
2424/* Generic routine for setting common part of quota file information */ 2440/* Generic routine for setting common part of quota file information */
2425int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2441int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2426{ 2442{
2427 struct mem_dqinfo *mi; 2443 struct mem_dqinfo *mi;
2428 int err = 0; 2444 int err = 0;
@@ -2449,74 +2465,86 @@ out:
2449 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2465 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2450 return err; 2466 return err;
2451} 2467}
2452EXPORT_SYMBOL(vfs_set_dqinfo); 2468EXPORT_SYMBOL(dquot_set_dqinfo);
2453 2469
2454const struct quotactl_ops vfs_quotactl_ops = { 2470const struct quotactl_ops dquot_quotactl_ops = {
2455 .quota_on = vfs_quota_on, 2471 .quota_on = dquot_quota_on,
2456 .quota_off = vfs_quota_off, 2472 .quota_off = dquot_quota_off,
2457 .quota_sync = vfs_quota_sync, 2473 .quota_sync = dquot_quota_sync,
2458 .get_info = vfs_get_dqinfo, 2474 .get_info = dquot_get_dqinfo,
2459 .set_info = vfs_set_dqinfo, 2475 .set_info = dquot_set_dqinfo,
2460 .get_dqblk = vfs_get_dqblk, 2476 .get_dqblk = dquot_get_dqblk,
2461 .set_dqblk = vfs_set_dqblk 2477 .set_dqblk = dquot_set_dqblk
2462}; 2478};
2479EXPORT_SYMBOL(dquot_quotactl_ops);
2480
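Both operation tables are now exported so that filesystems using the generic quota code can hook them up directly; a two-line sketch of the expected wiring inside a filesystem's fill_super (the surrounding function is not shown here):

	sb->dq_op = &dquot_operations;		/* exported earlier in this patch */
	sb->s_qcop = &dquot_quotactl_ops;	/* generic quotactl(2) handlers */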
2481static int do_proc_dqstats(struct ctl_table *table, int write,
2482 void __user *buffer, size_t *lenp, loff_t *ppos)
2483{
2484 unsigned int type = (int *)table->data - dqstats.stat;
2485
2486 /* Update global table */
2487 dqstats.stat[type] =
2488 percpu_counter_sum_positive(&dqstats.counter[type]);
2489 return proc_dointvec(table, write, buffer, lenp, ppos);
2490}
2463 2491
2464static ctl_table fs_dqstats_table[] = { 2492static ctl_table fs_dqstats_table[] = {
2465 { 2493 {
2466 .procname = "lookups", 2494 .procname = "lookups",
2467 .data = &dqstats.lookups, 2495 .data = &dqstats.stat[DQST_LOOKUPS],
2468 .maxlen = sizeof(int), 2496 .maxlen = sizeof(int),
2469 .mode = 0444, 2497 .mode = 0444,
2470 .proc_handler = proc_dointvec, 2498 .proc_handler = do_proc_dqstats,
2471 }, 2499 },
2472 { 2500 {
2473 .procname = "drops", 2501 .procname = "drops",
2474 .data = &dqstats.drops, 2502 .data = &dqstats.stat[DQST_DROPS],
2475 .maxlen = sizeof(int), 2503 .maxlen = sizeof(int),
2476 .mode = 0444, 2504 .mode = 0444,
2477 .proc_handler = proc_dointvec, 2505 .proc_handler = do_proc_dqstats,
2478 }, 2506 },
2479 { 2507 {
2480 .procname = "reads", 2508 .procname = "reads",
2481 .data = &dqstats.reads, 2509 .data = &dqstats.stat[DQST_READS],
2482 .maxlen = sizeof(int), 2510 .maxlen = sizeof(int),
2483 .mode = 0444, 2511 .mode = 0444,
2484 .proc_handler = proc_dointvec, 2512 .proc_handler = do_proc_dqstats,
2485 }, 2513 },
2486 { 2514 {
2487 .procname = "writes", 2515 .procname = "writes",
2488 .data = &dqstats.writes, 2516 .data = &dqstats.stat[DQST_WRITES],
2489 .maxlen = sizeof(int), 2517 .maxlen = sizeof(int),
2490 .mode = 0444, 2518 .mode = 0444,
2491 .proc_handler = proc_dointvec, 2519 .proc_handler = do_proc_dqstats,
2492 }, 2520 },
2493 { 2521 {
2494 .procname = "cache_hits", 2522 .procname = "cache_hits",
2495 .data = &dqstats.cache_hits, 2523 .data = &dqstats.stat[DQST_CACHE_HITS],
2496 .maxlen = sizeof(int), 2524 .maxlen = sizeof(int),
2497 .mode = 0444, 2525 .mode = 0444,
2498 .proc_handler = proc_dointvec, 2526 .proc_handler = do_proc_dqstats,
2499 }, 2527 },
2500 { 2528 {
2501 .procname = "allocated_dquots", 2529 .procname = "allocated_dquots",
2502 .data = &dqstats.allocated_dquots, 2530 .data = &dqstats.stat[DQST_ALLOC_DQUOTS],
2503 .maxlen = sizeof(int), 2531 .maxlen = sizeof(int),
2504 .mode = 0444, 2532 .mode = 0444,
2505 .proc_handler = proc_dointvec, 2533 .proc_handler = do_proc_dqstats,
2506 }, 2534 },
2507 { 2535 {
2508 .procname = "free_dquots", 2536 .procname = "free_dquots",
2509 .data = &dqstats.free_dquots, 2537 .data = &dqstats.stat[DQST_FREE_DQUOTS],
2510 .maxlen = sizeof(int), 2538 .maxlen = sizeof(int),
2511 .mode = 0444, 2539 .mode = 0444,
2512 .proc_handler = proc_dointvec, 2540 .proc_handler = do_proc_dqstats,
2513 }, 2541 },
2514 { 2542 {
2515 .procname = "syncs", 2543 .procname = "syncs",
2516 .data = &dqstats.syncs, 2544 .data = &dqstats.stat[DQST_SYNCS],
2517 .maxlen = sizeof(int), 2545 .maxlen = sizeof(int),
2518 .mode = 0444, 2546 .mode = 0444,
2519 .proc_handler = proc_dointvec, 2547 .proc_handler = do_proc_dqstats,
2520 }, 2548 },
2521#ifdef CONFIG_PRINT_QUOTA_WARNING 2549#ifdef CONFIG_PRINT_QUOTA_WARNING
2522 { 2550 {
@@ -2550,7 +2578,7 @@ static ctl_table sys_table[] = {
2550 2578
2551static int __init dquot_init(void) 2579static int __init dquot_init(void)
2552{ 2580{
2553 int i; 2581 int i, ret;
2554 unsigned long nr_hash, order; 2582 unsigned long nr_hash, order;
2555 2583
2556 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); 2584 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -2568,6 +2596,12 @@ static int __init dquot_init(void)
2568 if (!dquot_hash) 2596 if (!dquot_hash)
2569 panic("Cannot create dquot hash table"); 2597 panic("Cannot create dquot hash table");
2570 2598
2599 for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
2600 ret = percpu_counter_init(&dqstats.counter[i], 0);
2601 if (ret)
2602 panic("Cannot create dquot stat counters");
2603 }
2604
2571 /* Find power-of-two hlist_heads which can fit into allocation */ 2605 /* Find power-of-two hlist_heads which can fit into allocation */
2572 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); 2606 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
2573 dq_hash_bits = 0; 2607 dq_hash_bits = 0;
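The dqstats_inc() calls used by the format drivers further down presumably reduce to a per-CPU increment over the counters initialized above, along these lines (a sketch, not the actual header definition):

	static inline void dqstats_inc(unsigned int type)
	{
		percpu_counter_inc(&dqstats.counter[type]);
	}

do_proc_dqstats() then folds the per-CPU values back into dqstats.stat[] only when /proc is read, keeping the hot paths off a shared cacheline.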
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 2663ed90fb03..d67908b407d9 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -5,6 +5,7 @@
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/quotaops.h> 6#include <linux/quotaops.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/slab.h>
8#include <net/netlink.h> 9#include <net/netlink.h>
9#include <net/genetlink.h> 10#include <net/genetlink.h>
10 11
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95388f9b7356..b299961e1edb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
45 return security_quotactl(cmd, type, id, sb); 45 return security_quotactl(cmd, type, id, sb);
46} 46}
47 47
48static void quota_sync_one(struct super_block *sb, void *arg)
49{
50 if (sb->s_qcop && sb->s_qcop->quota_sync)
51 sb->s_qcop->quota_sync(sb, *(int *)arg, 1);
52}
53
48static int quota_sync_all(int type) 54static int quota_sync_all(int type)
49{ 55{
50 struct super_block *sb;
51 int ret; 56 int ret;
52 57
53 if (type >= MAXQUOTAS) 58 if (type >= MAXQUOTAS)
54 return -EINVAL; 59 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL); 60 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret) 61 if (!ret)
57 return ret; 62 iterate_supers(quota_sync_one, &type);
58 63 return ret;
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
74 }
75 spin_unlock(&sb_lock);
76
77 return 0;
78} 64}
79 65
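The open-coded superblock walk is replaced by iterate_supers(), which takes a callback of the shape shown in quota_sync_one() above, holds sb->s_umount across each call and skips superblocks without a root. A small sketch of the pattern with a hypothetical callback:

	static void example_count_rw(struct super_block *sb, void *arg)
	{
		/* called with sb->s_umount held and sb->s_root valid */
		if (!(sb->s_flags & MS_RDONLY))
			(*(int *)arg)++;
	}

	int rw_supers = 0;
	iterate_supers(example_count_rw, &rw_supers);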
80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
@@ -87,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
87 if (IS_ERR(pathname)) 73 if (IS_ERR(pathname))
88 return PTR_ERR(pathname); 74 return PTR_ERR(pathname);
89 if (sb->s_qcop->quota_on) 75 if (sb->s_qcop->quota_on)
90 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); 76 ret = sb->s_qcop->quota_on(sb, type, id, pathname);
91 putname(pathname); 77 putname(pathname);
92 return ret; 78 return ret;
93} 79}
@@ -113,8 +99,6 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
113 struct if_dqinfo info; 99 struct if_dqinfo info;
114 int ret; 100 int ret;
115 101
116 if (!sb_has_quota_active(sb, type))
117 return -ESRCH;
118 if (!sb->s_qcop->get_info) 102 if (!sb->s_qcop->get_info)
119 return -ENOSYS; 103 return -ENOSYS;
120 ret = sb->s_qcop->get_info(sb, type, &info); 104 ret = sb->s_qcop->get_info(sb, type, &info);
@@ -129,43 +113,80 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
129 113
130 if (copy_from_user(&info, addr, sizeof(info))) 114 if (copy_from_user(&info, addr, sizeof(info)))
131 return -EFAULT; 115 return -EFAULT;
132 if (!sb_has_quota_active(sb, type))
133 return -ESRCH;
134 if (!sb->s_qcop->set_info) 116 if (!sb->s_qcop->set_info)
135 return -ENOSYS; 117 return -ENOSYS;
136 return sb->s_qcop->set_info(sb, type, &info); 118 return sb->s_qcop->set_info(sb, type, &info);
137} 119}
138 120
121static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
122{
123 dst->dqb_bhardlimit = src->d_blk_hardlimit;
124 dst->dqb_bsoftlimit = src->d_blk_softlimit;
125 dst->dqb_curspace = src->d_bcount;
126 dst->dqb_ihardlimit = src->d_ino_hardlimit;
127 dst->dqb_isoftlimit = src->d_ino_softlimit;
128 dst->dqb_curinodes = src->d_icount;
129 dst->dqb_btime = src->d_btimer;
130 dst->dqb_itime = src->d_itimer;
131 dst->dqb_valid = QIF_ALL;
132}
133
139static int quota_getquota(struct super_block *sb, int type, qid_t id, 134static int quota_getquota(struct super_block *sb, int type, qid_t id,
140 void __user *addr) 135 void __user *addr)
141{ 136{
137 struct fs_disk_quota fdq;
142 struct if_dqblk idq; 138 struct if_dqblk idq;
143 int ret; 139 int ret;
144 140
145 if (!sb_has_quota_active(sb, type))
146 return -ESRCH;
147 if (!sb->s_qcop->get_dqblk) 141 if (!sb->s_qcop->get_dqblk)
148 return -ENOSYS; 142 return -ENOSYS;
149 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); 143 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
150 if (ret) 144 if (ret)
151 return ret; 145 return ret;
146 copy_to_if_dqblk(&idq, &fdq);
152 if (copy_to_user(addr, &idq, sizeof(idq))) 147 if (copy_to_user(addr, &idq, sizeof(idq)))
153 return -EFAULT; 148 return -EFAULT;
154 return 0; 149 return 0;
155} 150}
156 151
152static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src)
153{
154 dst->d_blk_hardlimit = src->dqb_bhardlimit;
155 dst->d_blk_softlimit = src->dqb_bsoftlimit;
156 dst->d_bcount = src->dqb_curspace;
157 dst->d_ino_hardlimit = src->dqb_ihardlimit;
158 dst->d_ino_softlimit = src->dqb_isoftlimit;
159 dst->d_icount = src->dqb_curinodes;
160 dst->d_btimer = src->dqb_btime;
161 dst->d_itimer = src->dqb_itime;
162
163 dst->d_fieldmask = 0;
164 if (src->dqb_valid & QIF_BLIMITS)
165 dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD;
166 if (src->dqb_valid & QIF_SPACE)
167 dst->d_fieldmask |= FS_DQ_BCOUNT;
168 if (src->dqb_valid & QIF_ILIMITS)
169 dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD;
170 if (src->dqb_valid & QIF_INODES)
171 dst->d_fieldmask |= FS_DQ_ICOUNT;
172 if (src->dqb_valid & QIF_BTIME)
173 dst->d_fieldmask |= FS_DQ_BTIMER;
174 if (src->dqb_valid & QIF_ITIME)
175 dst->d_fieldmask |= FS_DQ_ITIMER;
176}
177
157static int quota_setquota(struct super_block *sb, int type, qid_t id, 178static int quota_setquota(struct super_block *sb, int type, qid_t id,
158 void __user *addr) 179 void __user *addr)
159{ 180{
181 struct fs_disk_quota fdq;
160 struct if_dqblk idq; 182 struct if_dqblk idq;
161 183
162 if (copy_from_user(&idq, addr, sizeof(idq))) 184 if (copy_from_user(&idq, addr, sizeof(idq)))
163 return -EFAULT; 185 return -EFAULT;
164 if (!sb_has_quota_active(sb, type))
165 return -ESRCH;
166 if (!sb->s_qcop->set_dqblk) 186 if (!sb->s_qcop->set_dqblk)
167 return -ENOSYS; 187 return -ENOSYS;
168 return sb->s_qcop->set_dqblk(sb, type, id, &idq); 188 copy_from_if_dqblk(&fdq, &idq);
189 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
169} 190}
170 191
171static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) 192static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
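A worked example of the translation done by the two copy helpers above: a Q_SETQUOTA caller passing dqb_valid = QIF_BLIMITS | QIF_ITIME ends up with

	fdq.d_fieldmask = FS_DQ_BSOFT | FS_DQ_BHARD | FS_DQ_ITIMER;

so dquot_set_dqblk() (or an XFS-style ->set_dqblk) updates exactly the soft/hard block limits and the inode grace time, leaving every other field untouched.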
@@ -199,9 +220,9 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
199 220
200 if (copy_from_user(&fdq, addr, sizeof(fdq))) 221 if (copy_from_user(&fdq, addr, sizeof(fdq)))
201 return -EFAULT; 222 return -EFAULT;
202 if (!sb->s_qcop->set_xquota) 223 if (!sb->s_qcop->set_dqblk)
203 return -ENOSYS; 224 return -ENOSYS;
204 return sb->s_qcop->set_xquota(sb, type, id, &fdq); 225 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
205} 226}
206 227
207static int quota_getxquota(struct super_block *sb, int type, qid_t id, 228static int quota_getxquota(struct super_block *sb, int type, qid_t id,
@@ -210,9 +231,9 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
210 struct fs_disk_quota fdq; 231 struct fs_disk_quota fdq;
211 int ret; 232 int ret;
212 233
213 if (!sb->s_qcop->get_xquota) 234 if (!sb->s_qcop->get_dqblk)
214 return -ENOSYS; 235 return -ENOSYS;
215 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); 236 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
216 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) 237 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
217 return -EFAULT; 238 return -EFAULT;
218 return ret; 239 return ret;
@@ -239,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
239 case Q_QUOTAOFF: 260 case Q_QUOTAOFF:
240 if (!sb->s_qcop->quota_off) 261 if (!sb->s_qcop->quota_off)
241 return -ENOSYS; 262 return -ENOSYS;
242 return sb->s_qcop->quota_off(sb, type, 0); 263 return sb->s_qcop->quota_off(sb, type);
243 case Q_GETFMT: 264 case Q_GETFMT:
244 return quota_getfmt(sb, type, addr); 265 return quota_getfmt(sb, type, addr);
245 case Q_GETINFO: 266 case Q_GETINFO:
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index f81f4bcfb178..24f03407eeb5 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -60,9 +60,17 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) 60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
61{ 61{
62 struct super_block *sb = info->dqi_sb; 62 struct super_block *sb = info->dqi_sb;
63 ssize_t ret;
63 64
64 return sb->s_op->quota_write(sb, info->dqi_type, buf, 65 ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
65 info->dqi_usable_bs, blk << info->dqi_blocksize_bits); 66 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
67 if (ret != info->dqi_usable_bs) {
68 q_warn(KERN_WARNING "VFS: dquota write failed on "
69 "dev %s\n", sb->s_id);
70 if (ret >= 0)
71 ret = -EIO;
72 }
73 return ret;
66} 74}
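Callers consequently only need the sign of the return value; short writes are already mapped to -EIO inside write_blk(), e.g.:

	ret = write_blk(info, blk, buf);
	if (ret < 0)	/* covers both I/O errors and short writes */
		goto out_buf;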
67 75
68/* Remove empty block from list and return it */ 76/* Remove empty block from list and return it */
@@ -152,7 +160,7 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
152 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); 160 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
153 /* No matter whether write succeeds block is out of list */ 161 /* No matter whether write succeeds block is out of list */
154 if (write_blk(info, blk, buf) < 0) 162 if (write_blk(info, blk, buf) < 0)
155 printk(KERN_ERR 163 q_warn(KERN_ERR
156 "VFS: Can't write block (%u) with free entries.\n", 164 "VFS: Can't write block (%u) with free entries.\n",
157 blk); 165 blk);
158 return 0; 166 return 0;
@@ -244,7 +252,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
244 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { 252 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
245 *err = remove_free_dqentry(info, buf, blk); 253 *err = remove_free_dqentry(info, buf, blk);
246 if (*err < 0) { 254 if (*err < 0) {
247 printk(KERN_ERR "VFS: find_free_dqentry(): Can't " 255 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't "
248 "remove block (%u) from entry free list.\n", 256 "remove block (%u) from entry free list.\n",
249 blk); 257 blk);
250 goto out_buf; 258 goto out_buf;
@@ -268,7 +276,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
268#endif 276#endif
269 *err = write_blk(info, blk, buf); 277 *err = write_blk(info, blk, buf);
270 if (*err < 0) { 278 if (*err < 0) {
271 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " 279 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
272 "data block %u.\n", blk); 280 "data block %u.\n", blk);
273 goto out_buf; 281 goto out_buf;
274 } 282 }
@@ -303,7 +311,7 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
303 } else { 311 } else {
304 ret = read_blk(info, *treeblk, buf); 312 ret = read_blk(info, *treeblk, buf);
305 if (ret < 0) { 313 if (ret < 0) {
306 printk(KERN_ERR "VFS: Can't read tree quota block " 314 q_warn(KERN_ERR "VFS: Can't read tree quota block "
307 "%u.\n", *treeblk); 315 "%u.\n", *treeblk);
308 goto out_buf; 316 goto out_buf;
309 } 317 }
@@ -365,7 +373,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
365 if (!dquot->dq_off) { 373 if (!dquot->dq_off) {
366 ret = dq_insert_tree(info, dquot); 374 ret = dq_insert_tree(info, dquot);
367 if (ret < 0) { 375 if (ret < 0) {
368 printk(KERN_ERR "VFS: Error %zd occurred while " 376 q_warn(KERN_ERR "VFS: Error %zd occurred while "
369 "creating quota.\n", ret); 377 "creating quota.\n", ret);
370 kfree(ddquot); 378 kfree(ddquot);
371 return ret; 379 return ret;
@@ -377,14 +385,14 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
377 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, 385 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
378 dquot->dq_off); 386 dquot->dq_off);
379 if (ret != info->dqi_entry_size) { 387 if (ret != info->dqi_entry_size) {
380 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 388 q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n",
381 sb->s_id); 389 sb->s_id);
382 if (ret >= 0) 390 if (ret >= 0)
383 ret = -ENOSPC; 391 ret = -ENOSPC;
384 } else { 392 } else {
385 ret = 0; 393 ret = 0;
386 } 394 }
387 dqstats.writes++; 395 dqstats_inc(DQST_WRITES);
388 kfree(ddquot); 396 kfree(ddquot);
389 397
390 return ret; 398 return ret;
@@ -402,14 +410,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
402 if (!buf) 410 if (!buf)
403 return -ENOMEM; 411 return -ENOMEM;
404 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { 412 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
405 printk(KERN_ERR "VFS: Quota structure has offset to other " 413 q_warn(KERN_ERR "VFS: Quota structure has offset to other "
406 "block (%u) than it should (%u).\n", blk, 414 "block (%u) than it should (%u).\n", blk,
407 (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); 415 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
408 goto out_buf; 416 goto out_buf;
409 } 417 }
410 ret = read_blk(info, blk, buf); 418 ret = read_blk(info, blk, buf);
411 if (ret < 0) { 419 if (ret < 0) {
412 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); 420 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
413 goto out_buf; 421 goto out_buf;
414 } 422 }
415 dh = (struct qt_disk_dqdbheader *)buf; 423 dh = (struct qt_disk_dqdbheader *)buf;
@@ -419,7 +427,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
419 if (ret >= 0) 427 if (ret >= 0)
420 ret = put_free_dqblk(info, buf, blk); 428 ret = put_free_dqblk(info, buf, blk);
421 if (ret < 0) { 429 if (ret < 0) {
422 printk(KERN_ERR "VFS: Can't move quota data block (%u) " 430 q_warn(KERN_ERR "VFS: Can't move quota data block (%u) "
423 "to free list.\n", blk); 431 "to free list.\n", blk);
424 goto out_buf; 432 goto out_buf;
425 } 433 }
@@ -432,14 +440,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
432 /* Insert will write block itself */ 440 /* Insert will write block itself */
433 ret = insert_free_dqentry(info, buf, blk); 441 ret = insert_free_dqentry(info, buf, blk);
434 if (ret < 0) { 442 if (ret < 0) {
435 printk(KERN_ERR "VFS: Can't insert quota data " 443 q_warn(KERN_ERR "VFS: Can't insert quota data "
436 "block (%u) to free entry list.\n", blk); 444 "block (%u) to free entry list.\n", blk);
437 goto out_buf; 445 goto out_buf;
438 } 446 }
439 } else { 447 } else {
440 ret = write_blk(info, blk, buf); 448 ret = write_blk(info, blk, buf);
441 if (ret < 0) { 449 if (ret < 0) {
442 printk(KERN_ERR "VFS: Can't write quota data " 450 q_warn(KERN_ERR "VFS: Can't write quota data "
443 "block %u\n", blk); 451 "block %u\n", blk);
444 goto out_buf; 452 goto out_buf;
445 } 453 }
@@ -464,7 +472,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
464 return -ENOMEM; 472 return -ENOMEM;
465 ret = read_blk(info, *blk, buf); 473 ret = read_blk(info, *blk, buf);
466 if (ret < 0) { 474 if (ret < 0) {
467 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); 475 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
468 goto out_buf; 476 goto out_buf;
469 } 477 }
470 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 478 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -488,7 +496,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
488 } else { 496 } else {
489 ret = write_blk(info, *blk, buf); 497 ret = write_blk(info, *blk, buf);
490 if (ret < 0) 498 if (ret < 0)
491 printk(KERN_ERR "VFS: Can't write quota tree " 499 q_warn(KERN_ERR "VFS: Can't write quota tree "
492 "block %u.\n", *blk); 500 "block %u.\n", *blk);
493 } 501 }
494 } 502 }
@@ -521,7 +529,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
521 return -ENOMEM; 529 return -ENOMEM;
522 ret = read_blk(info, blk, buf); 530 ret = read_blk(info, blk, buf);
523 if (ret < 0) { 531 if (ret < 0) {
524 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 532 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
525 goto out_buf; 533 goto out_buf;
526 } 534 }
527 ddquot = buf + sizeof(struct qt_disk_dqdbheader); 535 ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -531,7 +539,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
531 ddquot += info->dqi_entry_size; 539 ddquot += info->dqi_entry_size;
532 } 540 }
533 if (i == qtree_dqstr_in_blk(info)) { 541 if (i == qtree_dqstr_in_blk(info)) {
534 printk(KERN_ERR "VFS: Quota for id %u referenced " 542 q_warn(KERN_ERR "VFS: Quota for id %u referenced "
535 "but not present.\n", dquot->dq_id); 543 "but not present.\n", dquot->dq_id);
536 ret = -EIO; 544 ret = -EIO;
537 goto out_buf; 545 goto out_buf;
@@ -556,7 +564,7 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
556 return -ENOMEM; 564 return -ENOMEM;
557 ret = read_blk(info, blk, buf); 565 ret = read_blk(info, blk, buf);
558 if (ret < 0) { 566 if (ret < 0) {
559 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 567 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
560 goto out_buf; 568 goto out_buf;
561 } 569 }
562 ret = 0; 570 ret = 0;
@@ -599,7 +607,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
599 offset = find_dqentry(info, dquot); 607 offset = find_dqentry(info, dquot);
600 if (offset <= 0) { /* Entry not present? */ 608 if (offset <= 0) { /* Entry not present? */
601 if (offset < 0) 609 if (offset < 0)
602 printk(KERN_ERR "VFS: Can't read quota " 610 q_warn(KERN_ERR "VFS: Can't read quota "
603 "structure for id %u.\n", dquot->dq_id); 611 "structure for id %u.\n", dquot->dq_id);
604 dquot->dq_off = 0; 612 dquot->dq_off = 0;
605 set_bit(DQ_FAKE_B, &dquot->dq_flags); 613 set_bit(DQ_FAKE_B, &dquot->dq_flags);
@@ -617,7 +625,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
617 if (ret != info->dqi_entry_size) { 625 if (ret != info->dqi_entry_size) {
618 if (ret >= 0) 626 if (ret >= 0)
619 ret = -EIO; 627 ret = -EIO;
620 printk(KERN_ERR "VFS: Error while reading quota " 628 q_warn(KERN_ERR "VFS: Error while reading quota "
621 "structure for id %u.\n", dquot->dq_id); 629 "structure for id %u.\n", dquot->dq_id);
622 set_bit(DQ_FAKE_B, &dquot->dq_flags); 630 set_bit(DQ_FAKE_B, &dquot->dq_flags);
623 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 631 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -634,7 +642,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
634 spin_unlock(&dq_data_lock); 642 spin_unlock(&dq_data_lock);
635 kfree(ddquot); 643 kfree(ddquot);
636out: 644out:
637 dqstats.reads++; 645 dqstats_inc(DQST_READS);
638 return ret; 646 return ret;
639} 647}
640EXPORT_SYMBOL(qtree_read_dquot); 648EXPORT_SYMBOL(qtree_read_dquot);
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index a1ab8db81a51..ccc3e71fb1d8 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -22,4 +22,10 @@ struct qt_disk_dqdbheader {
22 22
23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */ 23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
24 24
25#define q_warn(fmt, args...) \
26do { \
27 if (printk_ratelimit()) \
28 printk(fmt, ## args); \
29} while (0)
30
25#endif /* _LINUX_QUOTAIO_TREE_H */ 31#endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 2ae757e9c008..4af344c5852a 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -71,7 +71,7 @@ static int v1_read_dqblk(struct dquot *dquot)
71 dquot->dq_dqb.dqb_ihardlimit == 0 && 71 dquot->dq_dqb.dqb_ihardlimit == 0 &&
72 dquot->dq_dqb.dqb_isoftlimit == 0) 72 dquot->dq_dqb.dqb_isoftlimit == 0)
73 set_bit(DQ_FAKE_B, &dquot->dq_flags); 73 set_bit(DQ_FAKE_B, &dquot->dq_flags);
74 dqstats.reads++; 74 dqstats_inc(DQST_READS);
75 75
76 return 0; 76 return 0;
77} 77}
@@ -104,7 +104,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
104 ret = 0; 104 ret = 0;
105 105
106out: 106out:
107 dqstats.writes++; 107 dqstats_inc(DQST_WRITES);
108 108
109 return ret; 109 return ret;
110} 110}
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index e3da02f4986f..135206af1458 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,7 +63,7 @@ static int v2_read_header(struct super_block *sb, int type,
63 size = sb->s_op->quota_read(sb, type, (char *)dqhead, 63 size = sb->s_op->quota_read(sb, type, (char *)dqhead,
64 sizeof(struct v2_disk_dqheader), 0); 64 sizeof(struct v2_disk_dqheader), 0);
65 if (size != sizeof(struct v2_disk_dqheader)) { 65 if (size != sizeof(struct v2_disk_dqheader)) {
66 printk(KERN_WARNING "quota_v2: Failed header read:" 66 q_warn(KERN_WARNING "quota_v2: Failed header read:"
67 " expected=%zd got=%zd\n", 67 " expected=%zd got=%zd\n",
68 sizeof(struct v2_disk_dqheader), size); 68 sizeof(struct v2_disk_dqheader), size);
69 return 0; 69 return 0;
@@ -106,7 +106,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
108 if (size != sizeof(struct v2_disk_dqinfo)) { 108 if (size != sizeof(struct v2_disk_dqinfo)) {
109 printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", 109 q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
110 sb->s_id); 110 sb->s_id);
111 return -1; 111 return -1;
112 } 112 }
@@ -167,7 +167,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo, 167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
169 if (size != sizeof(struct v2_disk_dqinfo)) { 169 if (size != sizeof(struct v2_disk_dqinfo)) {
170 printk(KERN_WARNING "Can't write info structure on device %s.\n", 170 q_warn(KERN_WARNING "Can't write info structure on device %s.\n",
171 sb->s_id); 171 sb->s_id);
172 return -1; 172 return -1;
173 } 173 }
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613cb9c76..4884ac5ae9be 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = {
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .mmap = generic_file_mmap, 45 .mmap = generic_file_mmap,
46 .fsync = simple_sync_file, 46 .fsync = noop_fsync,
47 .splice_read = generic_file_splice_read, 47 .splice_read = generic_file_splice_read,
48 .splice_write = generic_file_splice_write, 48 .splice_write = generic_file_splice_write,
49 .llseek = generic_file_llseek, 49 .llseek = generic_file_llseek,
50}; 50};
51 51
52const struct inode_operations ramfs_file_inode_operations = { 52const struct inode_operations ramfs_file_inode_operations = {
53 .setattr = simple_setattr,
53 .getattr = simple_getattr, 54 .getattr = simple_getattr,
54}; 55};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 1739a4aba25f..d532c20fc179 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -21,6 +21,7 @@
21#include <linux/pagevec.h> 21#include <linux/pagevec.h>
22#include <linux/mman.h> 22#include <linux/mman.h>
23#include <linux/sched.h> 23#include <linux/sched.h>
24#include <linux/slab.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include "internal.h" 27#include "internal.h"
@@ -41,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
41 .aio_read = generic_file_aio_read, 42 .aio_read = generic_file_aio_read,
42 .write = do_sync_write, 43 .write = do_sync_write,
43 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
44 .fsync = simple_sync_file, 45 .fsync = noop_fsync,
45 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
46 .splice_write = generic_file_splice_write, 47 .splice_write = generic_file_splice_write,
47 .llseek = generic_file_llseek, 48 .llseek = generic_file_llseek,
@@ -145,7 +146,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
145 return ret; 146 return ret;
146 } 147 }
147 148
148 ret = vmtruncate(inode, newsize); 149 ret = simple_setsize(inode, newsize);
149 150
150 return ret; 151 return ret;
151} 152}
@@ -168,7 +169,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
168 169
169 /* pick out size-changing events */ 170 /* pick out size-changing events */
170 if (ia->ia_valid & ATTR_SIZE) { 171 if (ia->ia_valid & ATTR_SIZE) {
171 loff_t size = i_size_read(inode); 172 loff_t size = inode->i_size;
173
172 if (ia->ia_size != size) { 174 if (ia->ia_size != size) {
173 ret = ramfs_nommu_resize(inode, ia->ia_size, size); 175 ret = ramfs_nommu_resize(inode, ia->ia_size, size);
174 if (ret < 0 || ia->ia_valid == ATTR_SIZE) 176 if (ret < 0 || ia->ia_valid == ATTR_SIZE)
@@ -181,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
181 } 183 }
182 } 184 }
183 185
184 ret = inode_setattr(inode, ia); 186 generic_setattr(inode, ia);
185 out: 187 out:
186 ia->ia_valid = old_ia_valid; 188 ia->ia_valid = old_ia_valid;
187 return ret; 189 return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a6090aa1a7c1..a5ebae70dc6d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,6 +35,7 @@
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/magic.h> 37#include <linux/magic.h>
38#include <linux/slab.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include "internal.h" 40#include "internal.h"
40 41
@@ -51,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = {
51 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, 52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
52}; 53};
53 54
54struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) 55struct inode *ramfs_get_inode(struct super_block *sb,
56 const struct inode *dir, int mode, dev_t dev)
55{ 57{
56 struct inode * inode = new_inode(sb); 58 struct inode * inode = new_inode(sb);
57 59
58 if (inode) { 60 if (inode) {
59 inode->i_mode = mode; 61 inode_init_owner(inode, dir, mode);
60 inode->i_uid = current_fsuid();
61 inode->i_gid = current_fsgid();
62 inode->i_mapping->a_ops = &ramfs_aops; 62 inode->i_mapping->a_ops = &ramfs_aops;
63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
64 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 64 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
@@ -94,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
94static int 94static int
95ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 95ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
96{ 96{
97 struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); 97 struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
98 int error = -ENOSPC; 98 int error = -ENOSPC;
99 99
100 if (inode) { 100 if (inode) {
101 if (dir->i_mode & S_ISGID) {
102 inode->i_gid = dir->i_gid;
103 if (S_ISDIR(mode))
104 inode->i_mode |= S_ISGID;
105 }
106 d_instantiate(dentry, inode); 101 d_instantiate(dentry, inode);
107 dget(dentry); /* Extra count - pin the dentry in core */ 102 dget(dentry); /* Extra count - pin the dentry in core */
108 error = 0; 103 error = 0;
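The removed S_ISGID handling moves into inode_init_owner(), which presumably behaves roughly like this sketch (uid from the caller, gid and setgid bit inherited from a setgid parent directory):

	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;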
@@ -129,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
129 struct inode *inode; 124 struct inode *inode;
130 int error = -ENOSPC; 125 int error = -ENOSPC;
131 126
132 inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); 127 inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
133 if (inode) { 128 if (inode) {
134 int l = strlen(symname)+1; 129 int l = strlen(symname)+1;
135 error = page_symlink(inode, symname, l); 130 error = page_symlink(inode, symname, l);
136 if (!error) { 131 if (!error) {
137 if (dir->i_mode & S_ISGID)
138 inode->i_gid = dir->i_gid;
139 d_instantiate(dentry, inode); 132 d_instantiate(dentry, inode);
140 dget(dentry); 133 dget(dentry);
141 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 134 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -213,7 +206,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
213 return 0; 206 return 0;
214} 207}
215 208
216static int ramfs_fill_super(struct super_block * sb, void * data, int silent) 209int ramfs_fill_super(struct super_block *sb, void *data, int silent)
217{ 210{
218 struct ramfs_fs_info *fsi; 211 struct ramfs_fs_info *fsi;
219 struct inode *inode = NULL; 212 struct inode *inode = NULL;
@@ -240,7 +233,7 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
240 sb->s_op = &ramfs_ops; 233 sb->s_op = &ramfs_ops;
241 sb->s_time_gran = 1; 234 sb->s_time_gran = 1;
242 235
243 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); 236 inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
244 if (!inode) { 237 if (!inode) {
245 err = -ENOMEM; 238 err = -ENOMEM;
246 goto fail; 239 goto fail;
diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d48..9c0485236e68 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
97} 97}
98EXPORT_SYMBOL(generic_file_llseek); 98EXPORT_SYMBOL(generic_file_llseek);
99 99
100/**
101 * noop_llseek - No Operation Performed llseek implementation
102 * @file: file structure to seek on
103 * @offset: file offset to seek to
104 * @origin: type of seek
105 *
106 * This is an implementation of ->llseek usable for the rare special case when
107 * userspace expects the seek to succeed but the (device) file is actually not
108 * able to perform the seek. In this case you use noop_llseek() instead of
109 * falling back to the default implementation of ->llseek.
110 */
111loff_t noop_llseek(struct file *file, loff_t offset, int origin)
112{
113 return file->f_pos;
114}
115EXPORT_SYMBOL(noop_llseek);
116
100loff_t no_llseek(struct file *file, loff_t offset, int origin) 117loff_t no_llseek(struct file *file, loff_t offset, int origin)
101{ 118{
102 return -ESPIPE; 119 return -ESPIPE;
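A sketch of where noop_llseek() is meant to be plugged in; example_dev_fops and its read/write handlers are hypothetical:

	static const struct file_operations example_dev_fops = {
		.owner	= THIS_MODULE,
		.read	= example_read,
		.write	= example_write,
		.llseek	= noop_llseek,	/* reports success, never moves f_pos */
	};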
@@ -258,6 +275,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
258 init_sync_kiocb(&kiocb, filp); 275 init_sync_kiocb(&kiocb, filp);
259 kiocb.ki_pos = *ppos; 276 kiocb.ki_pos = *ppos;
260 kiocb.ki_left = len; 277 kiocb.ki_left = len;
278 kiocb.ki_nbytes = len;
261 279
262 for (;;) { 280 for (;;) {
263 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 281 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +331,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
313 init_sync_kiocb(&kiocb, filp); 331 init_sync_kiocb(&kiocb, filp);
314 kiocb.ki_pos = *ppos; 332 kiocb.ki_pos = *ppos;
315 kiocb.ki_left = len; 333 kiocb.ki_left = len;
334 kiocb.ki_nbytes = len;
316 335
317 for (;;) { 336 for (;;) {
318 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 337 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index c094f58c7448..198dabf1b2bb 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -8,15 +8,16 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/stat.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 extern const struct reiserfs_key MIN_KEY;
 
 static int reiserfs_readdir(struct file *, void *, filldir_t);
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
-			      int datasync);
+static int reiserfs_dir_fsync(struct file *filp, int datasync);
 
 const struct file_operations reiserfs_dir_operations = {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
@@ -26,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = {
 #endif
 };
 
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
-			      int datasync)
+static int reiserfs_dir_fsync(struct file *filp, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int err;
 	reiserfs_write_lock(inode->i_sb);
 	err = reiserfs_commit_for_inode(inode);
@@ -45,8 +45,6 @@ static inline bool is_privroot_deh(struct dentry *dir,
 				   struct reiserfs_de_head *deh)
 {
 	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	if (reiserfs_expose_privroot(dir->d_sb))
-		return 0;
 	return (dir == dir->d_parent && privroot->d_inode &&
 	       deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
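
The two fsync hunks above follow the tree-wide change that drops the dentry argument from ->fsync(). A hedged sketch of the resulting method shape; example_fsync is a placeholder name, not from this patch:

	static int example_fsync(struct file *filp, int datasync)
	{
		/* the inode is now reached via the file's mapping */
		struct inode *inode = filp->f_mapping->host;

		/* commit per-inode state here; datasync still selects a
		 * data-only flush */
		return 0;
	}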
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..b82cdd8a45dd 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
  * be removed...
  */
 
-static int reiserfs_sync_file(struct file *filp,
-			      struct dentry *dentry, int datasync)
+static int reiserfs_sync_file(struct file *filp, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int err;
 	int barrier_done;
 
@@ -147,7 +146,8 @@ static int reiserfs_sync_file(struct file *filp,
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 	if (barrier_done < 0)
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6591cb21edf6..1e4250bc3a6f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -35,6 +35,7 @@
  **/
 
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d1da94b82d8f..0f22fdaf54ac 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,6 +11,7 @@
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <linux/buffer_head.h>
@@ -3075,9 +3076,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
 
 	depth = reiserfs_write_lock_once(inode->i_sb);
-	if (attr->ia_valid & ATTR_SIZE) {
+	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
 
+	if (attr->ia_valid & ATTR_SIZE) {
 		/* version 2 items will be caught by the s_maxbytes check
 		** done for us in vmtruncate
 		*/
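
The setattr hunk widens the quota-initialization trigger from size changes alone to any attribute change that can move accounted usage between owners. A condensed sketch of the intended ordering, assuming the is_quota_modification() helper from quotaops.h of this kernel generation:

	/* Sketch: initialize quota before any change that shifts accounted
	 * usage (ATTR_SIZE, ATTR_UID, ATTR_GID), not only before truncation. */
	if (is_quota_modification(inode, attr))
		dquot_initialize(inode);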
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ba98546fabbd..19fbc810e8e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -50,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 
@@ -2217,6 +2218,15 @@ static int journal_read_transaction(struct super_block *sb,
 		brelse(d_bh);
 		return 1;
 	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
 	trans_id = get_desc_trans_id(desc);
 	/* now we know we've got a good transaction, and it was inside the valid time ranges */
 	log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2459,12 +2469,6 @@ static int journal_read(struct super_block *sb)
 		goto start_log_replay;
 	}
 
-	if (continue_replay && bdev_read_only(sb->s_bdev)) {
-		reiserfs_warning(sb, "clm-2076",
-				 "device is readonly, unable to replay log");
-		return -1;
-	}
-
 	/* ok, there are transactions that need to be replayed. start with the first log block, find
 	** all the valid transactions, and pick out the oldest.
 	*/
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 96e4cbbfaa18..ee78d4a0086a 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -13,6 +13,7 @@
 
 #include <linux/time.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
@@ -560,23 +561,13 @@ static int drop_new_inode(struct inode *inode)
 */
 static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
 {
-
-	/* the quota init calls have to know who to charge the quota to, so
-	** we have to set uid and gid here
-	*/
-	inode->i_uid = current_fsuid();
-	inode->i_mode = mode;
 	/* Make inode invalid - just in case we are going to drop it before
 	 * the initialization happens */
 	INODE_PKEY(inode)->k_objectid = 0;
-
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			inode->i_mode |= S_ISGID;
-	} else {
-		inode->i_gid = current_fsgid();
-	}
+	/* the quota init calls have to know who to charge the quota to, so
+	** we have to set uid and gid here
+	*/
+	inode_init_owner(inode, dir, mode);
 	dquot_initialize(inode);
 	return 0;
 }
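
inode_init_owner() centralizes the owner setup that new_inode_init() used to open-code. Hedged as a paraphrase of the generic VFS helper rather than reiserfs code, it behaves roughly like:

	/* Paraphrase of inode_init_owner(inode, dir, mode): */
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;	/* inherit group from parent */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* propagate setgid to subdirs */
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;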
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 04bf5d791bda..9822fa15118b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
@@ -157,6 +158,7 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	int i;
 	int ms_active_set;
+	int quota_enabled[MAXQUOTAS];
 #endif
 
 	/* compose key to look for "save" links */
@@ -178,8 +180,15 @@ static int finish_unfinished(struct super_block *s)
 	}
 	/* Turn on quotas so that they are updated correctly */
 	for (i = 0; i < MAXQUOTAS; i++) {
+		quota_enabled[i] = 1;
 		if (REISERFS_SB(s)->s_qf_names[i]) {
-			int ret = reiserfs_quota_on_mount(s, i);
+			int ret;
+
+			if (sb_has_quota_active(s, i)) {
+				quota_enabled[i] = 0;
+				continue;
+			}
+			ret = reiserfs_quota_on_mount(s, i);
 			if (ret < 0)
 				reiserfs_warning(s, "reiserfs-2500",
 						 "cannot turn on journaled "
@@ -303,8 +312,8 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
-		if (sb_dqopt(s)->files[i])
-			vfs_quota_off(s, i, 0);
+		if (sb_dqopt(s)->files[i] && quota_enabled[i])
+			dquot_quota_off(s, i);
 	}
 	if (ms_active_set)
 		/* Restore the flag back */
@@ -465,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
+	dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
 	reiserfs_write_lock(s);
 
 	if (s->s_dirt)
@@ -619,7 +630,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
 static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
+static int reiserfs_quota_on(struct super_block *, int, int, char *);
 
 static const struct dquot_operations reiserfs_quota_operations = {
 	.write_dquot = reiserfs_write_dquot,
@@ -633,12 +644,12 @@ static const struct dquot_operations reiserfs_quota_operations = {
 
 static const struct quotactl_ops reiserfs_qctl_operations = {
 	.quota_on = reiserfs_quota_on,
-	.quota_off = vfs_quota_off,
-	.quota_sync = vfs_quota_sync,
-	.get_info = vfs_get_dqinfo,
-	.set_info = vfs_set_dqinfo,
-	.get_dqblk = vfs_get_dqblk,
-	.set_dqblk = vfs_set_dqblk,
+	.quota_off = dquot_quota_off,
+	.quota_sync = dquot_quota_sync,
+	.get_info = dquot_get_dqinfo,
+	.set_info = dquot_set_dqinfo,
+	.get_dqblk = dquot_get_dqblk,
+	.set_dqblk = dquot_set_dqblk,
 };
 #endif
 
@@ -1241,6 +1252,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		if (s->s_flags & MS_RDONLY)
 			/* it is read-only already */
 			goto out_ok;
+
+		err = dquot_suspend(s, -1);
+		if (err < 0)
+			goto out_err;
+
 		/* try to remount file system with read-only permissions */
 		if (sb_umount_state(rs) == REISERFS_VALID_FS
 		    || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
@@ -1294,6 +1310,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	s->s_dirt = 0;
 
 	if (!(*mount_flags & MS_RDONLY)) {
+		dquot_resume(s, -1);
 		finish_unfinished(s);
 		reiserfs_xattr_init(s, *mount_flags);
 	}
@@ -1618,10 +1635,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		errval = -ENOMEM;
-		goto error_alloc;
-	}
+	if (!sbi)
+		return -ENOMEM;
 	s->s_fs_info = sbi;
 	/* Set default values for options: non-aggressive tails, RO on errors */
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
@@ -1878,12 +1893,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return (0);
 
 error:
-	reiserfs_write_unlock(s);
-error_alloc:
 	if (jinit_done) {	/* kill the commit thread, free journal ram */
 		journal_release_error(NULL, s);
 	}
 
+	reiserfs_write_unlock(s);
+
 	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
@@ -2023,15 +2038,15 @@ static int reiserfs_write_info(struct super_block *sb, int type)
  */
 static int reiserfs_quota_on_mount(struct super_block *sb, int type)
 {
-	return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
+	return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
 					REISERFS_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
  * Standard function to be called on quota_on
  */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     char *name, int remount)
+			     char *name)
 {
 	int err;
 	struct path path;
@@ -2040,9 +2055,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 
 	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
 		return -EINVAL;
-	/* No more checks needed? Path and format_id are bogus anyway... */
-	if (remount)
-		return vfs_quota_on(sb, type, format_id, name, 1);
+
 	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
 		return err;
@@ -2086,7 +2099,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	if (err)
 		goto out;
 	}
-	err = vfs_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on_path(sb, type, format_id, &path);
 out:
 	path_put(&path);
 	return err;
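
The remount hunks above converge on the generic suspend/resume pattern for journaled quota. A condensed sketch; the error label is illustrative, not from this patch:

	/* Sketch: going read-only suspends quota accounting; returning to
	 * read-write resumes it and then replays unfinished work. */
	if (*mount_flags & MS_RDONLY) {
		err = dquot_suspend(s, -1);	/* -1 = every quota type */
		if (err < 0)
			goto out_err;
	} else {
		dquot_resume(s, -1);
		finish_unfinished(s);
	}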
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 37d034ca7d99..8c4cf273c672 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -38,6 +38,7 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/errno.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
@@ -553,7 +554,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 	if (!err && new_size < i_size_read(dentry->d_inode)) {
 		struct iattr newattrs = {
 			.ia_ctime = current_fs_time(inode->i_sb),
-			.ia_size = buffer_size,
+			.ia_size = new_size,
 			.ia_valid = ATTR_SIZE | ATTR_CTIME,
 		};
 
@@ -722,11 +723,11 @@ out:
 	     (handler) = *(handlers)++)
 
 /* This is the implementation for the xattr plugin infrastructure */
-static inline struct xattr_handler *
-find_xattr_handler_prefix(struct xattr_handler **handlers,
+static inline const struct xattr_handler *
+find_xattr_handler_prefix(const struct xattr_handler **handlers,
 			   const char *name)
 {
-	struct xattr_handler *xah;
+	const struct xattr_handler *xah;
 
 	if (!handlers)
 		return NULL;
@@ -747,7 +748,7 @@ ssize_t
 reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
 		  size_t size)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 
 	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
@@ -766,7 +767,7 @@ int
 reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		  size_t size, int flags)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 
 	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
@@ -783,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
  */
 int reiserfs_removexattr(struct dentry *dentry, const char *name)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
 	if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -806,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
 		size_t size;
 		if (name[0] != '.' ||
 		    (namelen != 1 && (name[1] != '.' || namelen != 2))) {
-			struct xattr_handler *handler;
+			const struct xattr_handler *handler;
 			handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
 							    name);
 			if (!handler)	/* Unsupported xattr name */
@@ -919,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
 #endif
 
 /* Actual operations that are exported to VFS-land */
-struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler *reiserfs_xattr_handlers[] = {
 #ifdef CONFIG_REISERFS_FS_XATTR
 	&reiserfs_xattr_user_handler,
 	&reiserfs_xattr_trusted_handler,
@@ -972,21 +973,13 @@ int reiserfs_permission(struct inode *inode, int mask)
 	return generic_permission(inode, mask, NULL);
 }
 
-/* This will catch lookups from the fs root to .reiserfs_priv */
-static int
-xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
+static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
-	if (container_of(q1, struct dentry, d_name) == priv_root)
-		return -ENOENT;
-	if (q1->len == name->len &&
-	    !memcmp(q1->name, name->name, name->len))
-		return 0;
-	return 1;
+	return -EPERM;
}
 
 static const struct dentry_operations xattr_lookup_poison_ops = {
-	.d_compare = xattr_lookup_poison,
+	.d_revalidate = xattr_hide_revalidate,
 };
 
 int reiserfs_lookup_privroot(struct super_block *s)
@@ -1000,8 +993,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 			strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
 		REISERFS_SB(s)->priv_root = dentry;
-		if (!reiserfs_expose_privroot(s))
-			s->s_root->d_op = &xattr_lookup_poison_ops;
+		dentry->d_op = &xattr_lookup_poison_ops;
 		if (dentry->d_inode)
 			dentry->d_inode->i_flags |= S_PRIVATE;
 	} else
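
Hiding .reiserfs_priv moves from poisoning ->d_compare on the superblock root to installing a ->d_revalidate on the private dentry itself, so any lookup that reaches it simply fails. The mechanism, reduced to a hedged standalone sketch with illustrative names:

	/* Sketch: a d_revalidate that makes a dentry unreachable. */
	static int hide_revalidate(struct dentry *dentry, struct nameidata *nd)
	{
		return -EPERM;		/* every lookup of this dentry fails */
	}

	static const struct dentry_operations hidden_ops = {
		.d_revalidate = hide_revalidate,
	};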
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index dd20a7883f0f..536d697a8a28 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/reiserfs_acl.h>
@@ -499,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list,
 	return size;
 }
 
-struct xattr_handler reiserfs_posix_acl_access_handler = {
+const struct xattr_handler reiserfs_posix_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags = ACL_TYPE_ACCESS,
 	.get = posix_acl_get,
@@ -519,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list,
 	return size;
 }
 
-struct xattr_handler reiserfs_posix_acl_default_handler = {
+const struct xattr_handler reiserfs_posix_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags = ACL_TYPE_DEFAULT,
 	.get = posix_acl_get,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd30..237c6928d3c6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/security.h>
 #include <asm/uaccess.h>
@@ -76,7 +77,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 		return error;
 	}
 
-	if (sec->length) {
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
 		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
 			 reiserfs_xattr_nblocks(inode, sec->length);
 		/* We don't want to count the directories twice if we have
@@ -110,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec)
 	sec->value = NULL;
 }
 
-struct xattr_handler reiserfs_xattr_security_handler = {
+const struct xattr_handler reiserfs_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.get = security_get,
 	.set = security_set,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5b08aaca3daf..9883736ce3ec 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
 	return len;
 }
 
-struct xattr_handler reiserfs_xattr_trusted_handler = {
+const struct xattr_handler reiserfs_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.get = trusted_get,
 	.set = trusted_set,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 75d59c49b911..45ae1a00013a 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
 	return len;
 }
 
-struct xattr_handler reiserfs_xattr_user_handler = {
+const struct xattr_handler reiserfs_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.get = user_get,
 	.set = user_set,
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1dabe4ee02fe..f329849ce3c0 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/list.h>
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427e..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,9 +37,10 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
 
 const struct file_operations smb_dir_operations =
 {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = smb_readdir,
-	.ioctl = smb_ioctl,
+	.unlocked_ioctl = smb_ioctl,
 	.open = smb_dir_open,
 };
 
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 92d5e8ffb639..8e187a0f94bb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -13,7 +13,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
@@ -29,8 +28,9 @@
 #include "proto.h"
 
 static int
-smb_fsync(struct file *file, struct dentry * dentry, int datasync)
+smb_fsync(struct file *file, int datasync)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct smb_sb_info *server = server_from_dentry(dentry);
 	int result;
 
@@ -438,7 +438,7 @@ const struct file_operations smb_file_operations =
 	.aio_read = smb_file_aio_read,
 	.write = do_sync_write,
 	.aio_write = smb_file_aio_write,
-	.ioctl = smb_ioctl,
+	.unlocked_ioctl = smb_ioctl,
 	.mmap = smb_file_mmap,
 	.open = smb_file_open,
 	.release = smb_file_release,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1c4c8f089970..9551cb6f7fe4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -479,6 +479,7 @@ smb_put_super(struct super_block *sb)
 	if (server->conn_pid)
 		kill_pid(server->conn_pid, SIGTERM, 1);
 
+	bdi_destroy(&server->bdi);
 	kfree(server->ops);
 	smb_unload_nls(server);
 	sb->s_fs_info = NULL;
@@ -525,6 +526,11 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (!server)
 		goto out_no_server;
 	sb->s_fs_info = server;
+
+	if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
+		goto out_bdi;
+
+	sb->s_bdi = &server->bdi;
 
 	server->super_block = sb;
 	server->mnt = NULL;
@@ -624,6 +630,8 @@ out_no_smbiod:
 out_bad_option:
 	kfree(mem);
 out_no_mem:
+	bdi_destroy(&server->bdi);
+out_bdi:
 	if (!server->mnt)
 		printk(KERN_ERR "smb_fill_super: allocation failure\n");
 	sb->s_fs_info = NULL;
@@ -706,7 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
 		error = server->ops->truncate(inode, attr->ia_size);
 		if (error)
 			goto out;
-		error = vmtruncate(inode, attr->ia_size);
+		error = simple_setsize(inode, attr->ia_size);
 		if (error)
 			goto out;
 		refresh = 1;
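
smbfs now owns a per-connection backing_dev_info. The pairing rule the hunks above implement, sketched with the same calls the patch uses:

	/* Sketch: every successful setup needs a matching destroy on both
	 * the fill_super error path and in put_super. */
	if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
		goto out_bdi;		/* nothing to tear down yet */
	sb->s_bdi = &server->bdi;	/* writeback now targets this bdi */
	/* ... on any later failure, and again in smb_put_super(): */
	bdi_destroy(&server->bdi);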
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26f..07215312ad39 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/highuid.h>
+#include <linux/smp_lock.h>
 #include <linux/net.h>
 
 #include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
 
 #include "proto.h"
 
-int
-smb_ioctl(struct inode *inode, struct file *filp,
-	  unsigned int cmd, unsigned long arg)
+long
+smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct smb_sb_info *server = server_from_inode(inode);
+	struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
 	struct smb_conn_opt opt;
 	int result = -EINVAL;
 
+	lock_kernel();
 	switch (cmd) {
 		uid16_t uid16;
 		uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
 	default:
 		break;
 	}
+	unlock_kernel();
 
 	return result;
 }
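
This is the standard BKL-pushdown shape: the VFS no longer takes the big kernel lock around ->unlocked_ioctl, so the handler takes it itself and derives the inode from the file. A generic sketch; example_ioctl is a placeholder name:

	static long example_ioctl(struct file *filp, unsigned int cmd,
				  unsigned long arg)
	{
		struct inode *inode = filp->f_path.dentry->d_inode;
		long result = -ENOTTY;

		lock_kernel();	/* only this handler still wants the BKL */
		/* cmd-specific work, using inode/filp as before */
		unlock_kernel();
		return result;
	}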
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d4..05939a6f43e6 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
 extern const struct file_operations smb_file_operations;
 extern const struct inode_operations smb_file_inode_operations;
 /* ioctl.c */
-extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
+extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 /* smbiod.c */
 extern void smbiod_wake_up(void);
 extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 6bd9b691a463..0e39a924f10a 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
diff --git a/fs/splice.c b/fs/splice.c
index 39208663aaf1..ac22b00d86c3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 #include <linux/security.h>
+#include <linux/gfp.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -192,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 			break;
 		}
 
-		if (pipe->nrbufs < PIPE_BUFFERS) {
-			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
+		if (pipe->nrbufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 
 			buf->page = spd->pages[page_nr];
@@ -213,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 
 		if (!--spd->nr_pages)
 			break;
-		if (pipe->nrbufs < PIPE_BUFFERS)
+		if (pipe->nrbufs < pipe->buffers)
 			continue;
 
 		break;
@@ -264,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
 	page_cache_release(spd->pages[i]);
 }
 
+/*
+ * Check if we need to grow the arrays holding pages and partial page
+ * descriptions.
+ */
+int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
+{
+	if (pipe->buffers <= PIPE_DEF_BUFFERS)
+		return 0;
+
+	spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
+	spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
+
+	if (spd->pages && spd->partial)
+		return 0;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+	return -ENOMEM;
+}
+
+void splice_shrink_spd(struct pipe_inode_info *pipe,
+		       struct splice_pipe_desc *spd)
+{
+	if (pipe->buffers <= PIPE_DEF_BUFFERS)
+		return;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+}
+
 static int
 __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   struct pipe_inode_info *pipe, size_t len,
@@ -271,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 {
 	struct address_space *mapping = in->f_mapping;
 	unsigned int loff, nr_pages, req_pages;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
@@ -285,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	loff = *ppos & ~PAGE_CACHE_MASK;
 	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
+	nr_pages = min(req_pages, pipe->buffers);
 
 	/*
 	 * Lookup the (hopefully) full range of pages we need.
 	 */
-	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
 	index += spd.nr_pages;
 
 	/*
@@ -334,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			unlock_page(page);
 		}
 
-		pages[spd.nr_pages++] = page;
+		spd.pages[spd.nr_pages++] = page;
 		index++;
 	}
 
@@ -355,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * this_len is the max we'll use from this page
 		 */
 		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
-		page = pages[page_nr];
+		page = spd.pages[page_nr];
 
 		if (PageReadahead(page))
 			page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -392,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 				error = -ENOMEM;
 				break;
 			}
-			page_cache_release(pages[page_nr]);
-			pages[page_nr] = page;
+			page_cache_release(spd.pages[page_nr]);
+			spd.pages[page_nr] = page;
 		}
 		/*
 		 * page was already under io and is now done, great
@@ -450,8 +484,8 @@ fill_it:
 			len = this_len;
 		}
 
-		partial[page_nr].offset = loff;
-		partial[page_nr].len = this_len;
+		spd.partial[page_nr].offset = loff;
+		spd.partial[page_nr].len = this_len;
 		len -= this_len;
 		loff = 0;
 		spd.nr_pages++;
@@ -463,12 +497,13 @@ fill_it:
 	 * we got, 'nr_pages' is how many pages are in the map.
 	 */
 	while (page_nr < nr_pages)
-		page_cache_release(pages[page_nr++]);
+		page_cache_release(spd.pages[page_nr++]);
 	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 
 	if (spd.nr_pages)
-		return splice_to_pipe(pipe, &spd);
+		error = splice_to_pipe(pipe, &spd);
 
+	splice_shrink_spd(pipe, &spd);
 	return error;
 }
 
@@ -559,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	unsigned int nr_pages;
 	unsigned int nr_freed;
 	size_t offset;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
-	struct iovec vec[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
 	pgoff_t index;
 	ssize_t res;
 	size_t this_len;
@@ -575,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	res = -ENOMEM;
+	vec = __vec;
+	if (pipe->buffers > PIPE_DEF_BUFFERS) {
+		vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
+		if (!vec)
+			goto shrink_ret;
+	}
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+	for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
 		struct page *page;
 
 		page = alloc_page(GFP_USER);
@@ -590,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
 		vec[i].iov_base = (void __user *) page_address(page);
 		vec[i].iov_len = this_len;
-		pages[i] = page;
+		spd.pages[i] = page;
 		spd.nr_pages++;
 		len -= this_len;
 		offset = 0;
@@ -609,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	nr_freed = 0;
 	for (i = 0; i < spd.nr_pages; i++) {
 		this_len = min_t(size_t, vec[i].iov_len, res);
-		partial[i].offset = 0;
-		partial[i].len = this_len;
+		spd.partial[i].offset = 0;
+		spd.partial[i].len = this_len;
 		if (!this_len) {
-			__free_page(pages[i]);
-			pages[i] = NULL;
+			__free_page(spd.pages[i]);
+			spd.pages[i] = NULL;
 			nr_freed++;
 		}
 		res -= this_len;
@@ -624,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	if (res > 0)
 		*ppos += res;
 
+shrink_ret:
+	if (vec != __vec)
+		kfree(vec);
+	splice_shrink_spd(pipe, &spd);
 	return res;
 
 err:
 	for (i = 0; i < spd.nr_pages; i++)
-		__free_page(pages[i]);
+		__free_page(spd.pages[i]);
 
-	return error;
+	res = error;
+	goto shrink_ret;
 }
 EXPORT_SYMBOL(default_file_splice_read);
 
@@ -783,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 		if (!buf->len) {
 			buf->ops = NULL;
 			ops->release(pipe, buf);
-			pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
+			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
 			if (pipe->inode)
 				sd->need_wakeup = true;
@@ -1210,7 +1261,7 @@ out_release:
 	 * If we did an incomplete transfer we must release
 	 * the pipe buffers in question:
 	 */
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 
 		if (buf->ops) {
@@ -1370,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
  */
 static int get_iovec_page_array(const struct iovec __user *iov,
 				unsigned int nr_vecs, struct page **pages,
-				struct partial_page *partial, int aligned)
+				struct partial_page *partial, int aligned,
+				unsigned int pipe_buffers)
 {
 	int buffers = 0, error = 0;
 
@@ -1413,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 			break;
 
 		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		if (npages > PIPE_BUFFERS - buffers)
-			npages = PIPE_BUFFERS - buffers;
+		if (npages > pipe_buffers - buffers)
+			npages = pipe_buffers - buffers;
 
 		error = get_user_pages_fast((unsigned long)base, npages,
 					0, &pages[buffers]);
@@ -1449,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		 * or if we mapped the max number of pages that we have
 		 * room for.
 		 */
-		if (error < npages || buffers == PIPE_BUFFERS)
+		if (error < npages || buffers == pipe_buffers)
 			break;
 
 		nr_vecs--;
@@ -1592,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 			     unsigned long nr_segs, unsigned int flags)
 {
 	struct pipe_inode_info *pipe;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
@@ -1601,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 		.ops = &user_page_pipe_buf_ops,
 		.spd_release = spd_release_page,
 	};
+	long ret;
 
 	pipe = pipe_info(file->f_path.dentry->d_inode);
 	if (!pipe)
 		return -EBADF;
 
-	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
-					    flags & SPLICE_F_GIFT);
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
+					    spd.partial, flags & SPLICE_F_GIFT,
+					    pipe->buffers);
 	if (spd.nr_pages <= 0)
-		return spd.nr_pages;
+		ret = spd.nr_pages;
+	else
+		ret = splice_to_pipe(pipe, &spd);
 
-	return splice_to_pipe(pipe, &spd);
+	splice_shrink_spd(pipe, &spd);
+	return ret;
 }
 
 /*
@@ -1737,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	 * Check ->nrbufs without the inode lock first. This function
 	 * is speculative anyways, so missing one is ok.
 	 */
-	if (pipe->nrbufs < PIPE_BUFFERS)
+	if (pipe->nrbufs < pipe->buffers)
 		return 0;
 
 	ret = 0;
 	pipe_lock(pipe);
 
-	while (pipe->nrbufs >= PIPE_BUFFERS) {
+	while (pipe->nrbufs >= pipe->buffers) {
 		if (!pipe->readers) {
 			send_sig(SIGPIPE, current, 0);
 			ret = -EPIPE;
@@ -1809,7 +1869,7 @@ retry:
 		 * Cannot make any progress, because either the input
 		 * pipe is empty or the output pipe is full.
 		 */
-		if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
 			/* Already processed some buffers, break */
 			if (ret)
 				break;
@@ -1830,7 +1890,7 @@ retry:
 		}
 
 		ibuf = ipipe->bufs + ipipe->curbuf;
-		nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 		obuf = opipe->bufs + nbuf;
 
 		if (len >= ibuf->len) {
@@ -1840,7 +1900,7 @@ retry:
 			*obuf = *ibuf;
 			ibuf->ops = NULL;
 			opipe->nrbufs++;
-			ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
 			ipipe->nrbufs--;
 			input_wakeup = true;
 		} else {
@@ -1913,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 * If we have iterated all input buffers or ran out of
 		 * output room, break.
 		 */
-		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
+		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
 			break;
 
-		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
-		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 
 		/*
 		 * Get a reference to this pipe buffer,
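
Two conventions run through every splice.c hunk. First, the on-stack page/partial arrays now cover only PIPE_DEF_BUFFERS; splice_grow_spd() kmallocs larger arrays when a pipe was resized, and splice_shrink_spd() frees them. Second, ring arithmetic switches from "% PIPE_BUFFERS" to "& (pipe->buffers - 1)", which is equivalent only while pipe->buffers stays a power of two. A condensed usage sketch of the pattern, not a complete caller:

	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = { .pages = pages, .partial = partial };
	ssize_t ret;

	if (splice_grow_spd(pipe, &spd))	/* no-op for default-sized pipes */
		return -ENOMEM;
	/* fill spd.pages[]/spd.partial[], at most pipe->buffers entries,
	 * indexing the ring with "& (pipe->buffers - 1)" */
	ret = splice_to_pipe(pipe, &spd);
	splice_shrink_spd(pipe, &spd);		/* frees only what grow allocated */
	return ret;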
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..cc6ce8a84c21 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
 
 	  If unsure, say N.
 
+config SQUASHFS_XATTRS
+	bool "Squashfs XATTR support"
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here includes support for extended attributes (xattrs).
+	  Xattrs are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page).
+
+	  If unsure, say N.
+
 config SQUASHFS_EMBEDDED
 
 	bool "Additional option for memory-constrained systems"
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..2cee3e9fa452 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
+
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 1cb0d81b164b..653c030eb840 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -87,9 +87,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	u64 cur_index = index >> msblk->devblksize_log2;
 	int bytes, compressed, b = 0, k = 0, page = 0, avail;
 
-
-	bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
-		sizeof(*bh), GFP_KERNEL);
+	bh = kcalloc(((srclength + msblk->devblksize - 1)
+		>> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
 	if (bh == NULL)
 		return -ENOMEM;
 
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/xattr.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h" 47#include "squashfs_fs_i.h"
47#include "squashfs.h" 48#include "squashfs.h"
49#include "xattr.h"
48 50
49/* 51/*
50 * Initialise VFS inode with the base inode information common to all 52 * Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
111 int err, type, offset = SQUASHFS_INODE_OFFSET(ino); 113 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
112 union squashfs_inode squashfs_ino; 114 union squashfs_inode squashfs_ino;
113 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; 115 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
116 int xattr_id = SQUASHFS_INVALID_XATTR;
114 117
115 TRACE("Entered squashfs_read_inode\n"); 118 TRACE("Entered squashfs_read_inode\n");
116 119
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
199 frag_offset = 0; 202 frag_offset = 0;
200 } 203 }
201 204
205 xattr_id = le32_to_cpu(sqsh_ino->xattr);
202 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 206 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
203 inode->i_size = le64_to_cpu(sqsh_ino->file_size); 207 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
208 inode->i_op = &squashfs_inode_ops;
204 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
205 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
206 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
251 if (err < 0) 256 if (err < 0)
252 goto failed_read; 257 goto failed_read;
253 258
259 xattr_id = le32_to_cpu(sqsh_ino->xattr);
254 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 260 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
255 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 261 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
256 inode->i_op = &squashfs_dir_inode_ops; 262 inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
280 286
281 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 287 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
282 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); 288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
283 inode->i_op = &page_symlink_inode_operations; 289 inode->i_op = &squashfs_symlink_inode_ops;
284 inode->i_data.a_ops = &squashfs_symlink_aops; 290 inode->i_data.a_ops = &squashfs_symlink_aops;
285 inode->i_mode |= S_IFLNK; 291 inode->i_mode |= S_IFLNK;
286 squashfs_i(inode)->start = block; 292 squashfs_i(inode)->start = block;
287 squashfs_i(inode)->offset = offset; 293 squashfs_i(inode)->offset = offset;
288 294
295 if (type == SQUASHFS_LSYMLINK_TYPE) {
296 __le32 xattr;
297
298 err = squashfs_read_metadata(sb, NULL, &block,
299 &offset, inode->i_size);
300 if (err < 0)
301 goto failed_read;
302 err = squashfs_read_metadata(sb, &xattr, &block,
303 &offset, sizeof(xattr));
304 if (err < 0)
305 goto failed_read;
306 xattr_id = le32_to_cpu(xattr);
307 }
308
289 TRACE("Symbolic link inode %x:%x, start_block %llx, offset " 309 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
290 "%x\n", SQUASHFS_INODE_BLK(ino), offset, 310 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
291 block, offset); 311 block, offset);
292 break; 312 break;
293 } 313 }
294 case SQUASHFS_BLKDEV_TYPE: 314 case SQUASHFS_BLKDEV_TYPE:
295 case SQUASHFS_CHRDEV_TYPE: 315 case SQUASHFS_CHRDEV_TYPE: {
296 case SQUASHFS_LBLKDEV_TYPE:
297 case SQUASHFS_LCHRDEV_TYPE: {
298 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; 316 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
299 unsigned int rdev; 317 unsigned int rdev;
300 318
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
315 SQUASHFS_INODE_BLK(ino), offset, rdev); 333 SQUASHFS_INODE_BLK(ino), offset, rdev);
316 break; 334 break;
317 } 335 }
336 case SQUASHFS_LBLKDEV_TYPE:
337 case SQUASHFS_LCHRDEV_TYPE: {
338 struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
339 unsigned int rdev;
340
341 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
342 sizeof(*sqsh_ino));
343 if (err < 0)
344 goto failed_read;
345
346 if (type == SQUASHFS_LCHRDEV_TYPE)
347 inode->i_mode |= S_IFCHR;
348 else
349 inode->i_mode |= S_IFBLK;
350 xattr_id = le32_to_cpu(sqsh_ino->xattr);
351 inode->i_op = &squashfs_inode_ops;
352 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
353 rdev = le32_to_cpu(sqsh_ino->rdev);
354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
355
356 TRACE("Device inode %x:%x, rdev %x\n",
357 SQUASHFS_INODE_BLK(ino), offset, rdev);
358 break;
359 }
318 case SQUASHFS_FIFO_TYPE: 360 case SQUASHFS_FIFO_TYPE:
319 case SQUASHFS_SOCKET_TYPE: 361 case SQUASHFS_SOCKET_TYPE: {
320 case SQUASHFS_LFIFO_TYPE:
321 case SQUASHFS_LSOCKET_TYPE: {
322 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; 362 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
323 363
324 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, 364 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
334 init_special_inode(inode, inode->i_mode, 0); 374 init_special_inode(inode, inode->i_mode, 0);
335 break; 375 break;
336 } 376 }
377 case SQUASHFS_LFIFO_TYPE:
378 case SQUASHFS_LSOCKET_TYPE: {
379 struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
380
381 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
382 sizeof(*sqsh_ino));
383 if (err < 0)
384 goto failed_read;
385
386 if (type == SQUASHFS_LFIFO_TYPE)
387 inode->i_mode |= S_IFIFO;
388 else
389 inode->i_mode |= S_IFSOCK;
390 xattr_id = le32_to_cpu(sqsh_ino->xattr);
391 inode->i_op = &squashfs_inode_ops;
392 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
393 init_special_inode(inode, inode->i_mode, 0);
394 break;
395 }
337 default: 396 default:
338 ERROR("Unknown inode type %d in squashfs_iget!\n", type); 397 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
339 return -EINVAL; 398 return -EINVAL;
340 } 399 }
341 400
401 if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
402 err = squashfs_xattr_lookup(sb, xattr_id,
403 &squashfs_i(inode)->xattr_count,
404 &squashfs_i(inode)->xattr_size,
405 &squashfs_i(inode)->xattr);
406 if (err < 0)
407 goto failed_read;
408 inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
409 + 1;
410 } else
411 squashfs_i(inode)->xattr_count = 0;
412
342 return 0; 413 return 0;
343 414
344failed_read: 415failed_read:
345 ERROR("Unable to read inode 0x%llx\n", ino); 416 ERROR("Unable to read inode 0x%llx\n", ino);
346 return err; 417 return err;
347} 418}
419
420
421const struct inode_operations squashfs_inode_ops = {
422 .getxattr = generic_getxattr,
423 .listxattr = squashfs_listxattr
424};
425
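Each inode variant above now carries a 32-bit xattr id; after the type switch, squashfs_xattr_lookup() resolves it to the on-disk xattr location and the xattr size is charged to i_blocks in 512-byte units. The expression ((size - 1) >> 9) + 1 is ceil(size / 512) for size >= 1; a minimal standalone check of that arithmetic (illustrative helper name, not part of the patch):

#include <assert.h>

static unsigned long xattr_blocks(unsigned long size)
{
	/* ((size - 1) >> 9) + 1 == ceil(size / 512) for size >= 1 */
	return ((size - 1) >> 9) + 1;
}

int main(void)
{
	assert(xattr_blocks(1) == 1);
	assert(xattr_blocks(512) == 1);
	assert(xattr_blocks(513) == 2);
	return 0;
}
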
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/xattr.h>
60 61
61#include "squashfs_fs.h" 62#include "squashfs_fs.h"
62#include "squashfs_fs_sb.h" 63#include "squashfs_fs_sb.h"
63#include "squashfs_fs_i.h" 64#include "squashfs_fs_i.h"
64#include "squashfs.h" 65#include "squashfs.h"
66#include "xattr.h"
65 67
66/* 68/*
67 * Lookup name in the directory index, returning the location of the metadata 69 * Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
237 239
238 240
239const struct inode_operations squashfs_dir_inode_ops = { 241const struct inode_operations squashfs_dir_inode_ops = {
240 .lookup = squashfs_lookup 242 .lookup = squashfs_lookup,
243 .getxattr = generic_getxattr,
244 .listxattr = squashfs_listxattr
241}; 245};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..733a17c42945 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
73 unsigned int); 73 unsigned int);
74extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
75 75
76/* xattr.c */
77extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
78
76/* 79/*
77 * Inodes, files and decompressor operations 80 * Inodes, files, decompressor and xattr operations
78 */ 81 */
79 82
80/* dir.c */ 83/* dir.c */
@@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops;
86/* file.c */ 89/* file.c */
87extern const struct address_space_operations squashfs_aops; 90extern const struct address_space_operations squashfs_aops;
88 91
92/* inode.c */
93extern const struct inode_operations squashfs_inode_ops;
94
89/* namei.c */ 95/* namei.c */
90extern const struct inode_operations squashfs_dir_inode_ops; 96extern const struct inode_operations squashfs_dir_inode_ops;
91 97
92/* symlink.c */ 98/* symlink.c */
93extern const struct address_space_operations squashfs_symlink_aops; 99extern const struct address_space_operations squashfs_symlink_aops;
100extern const struct inode_operations squashfs_symlink_inode_ops;
101
102/* xattr.c */
103extern const struct xattr_handler *squashfs_xattr_handlers[];
94 104
95/* zlib_wrapper.c */ 105/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 106extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..8eabb808b78d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
46#define SQUASHFS_NAME_LEN 256 46#define SQUASHFS_NAME_LEN 256
47 47
48#define SQUASHFS_INVALID_FRAG (0xffffffffU) 48#define SQUASHFS_INVALID_FRAG (0xffffffffU)
49#define SQUASHFS_INVALID_XATTR (0xffffffffU)
49#define SQUASHFS_INVALID_BLK (-1LL) 50#define SQUASHFS_INVALID_BLK (-1LL)
50 51
51/* Filesystem flags */ 52/* Filesystem flags */
@@ -96,6 +97,13 @@
96#define SQUASHFS_LFIFO_TYPE 13 97#define SQUASHFS_LFIFO_TYPE 13
97#define SQUASHFS_LSOCKET_TYPE 14 98#define SQUASHFS_LSOCKET_TYPE 14
98 99
100/* Xattr types */
101#define SQUASHFS_XATTR_USER 0
102#define SQUASHFS_XATTR_TRUSTED 1
103#define SQUASHFS_XATTR_SECURITY 2
104#define SQUASHFS_XATTR_VALUE_OOL 256
105#define SQUASHFS_XATTR_PREFIX_MASK 0xff
106
99/* Flag whether block is compressed or uncompressed, bit is set if block is 107/* Flag whether block is compressed or uncompressed, bit is set if block is
100 * uncompressed */ 108 * uncompressed */
101#define SQUASHFS_COMPRESSED_BIT (1 << 15) 109#define SQUASHFS_COMPRESSED_BIT (1 << 15)
@@ -174,6 +182,24 @@
174 182
175#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ 183#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
176 sizeof(u64)) 184 sizeof(u64))
185/* xattr id lookup table defines */
186#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
187
188#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
189 SQUASHFS_METADATA_SIZE)
190
191#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \
192 SQUASHFS_METADATA_SIZE)
193
194#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \
195 SQUASHFS_METADATA_SIZE - 1) / \
196 SQUASHFS_METADATA_SIZE)
197
198#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\
199 sizeof(u64))
200#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16))
201
202#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff))
177 203
178/* cached data constants for filesystem */ 204/* cached data constants for filesystem */
179#define SQUASHFS_CACHED_BLKS 8 205#define SQUASHFS_CACHED_BLKS 8
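SQUASHFS_XATTR_BLK() and SQUASHFS_XATTR_OFFSET() unpack a packed 64-bit on-disk xattr reference: the upper bits give the start of the metadata block relative to the xattr table, and the low 16 bits give the offset within the uncompressed block (metadata blocks hold at most 8 KiB, so 16 bits suffice). A standalone sketch of the split:

#include <stdio.h>
#include <stdint.h>

/* mirrors of the two macros above, for illustration only */
#define XATTR_BLK(a)	((unsigned int)((a) >> 16))
#define XATTR_OFFSET(a)	((unsigned int)((a) & 0xffff))

int main(void)
{
	uint64_t ref = ((uint64_t)0x1234 << 16) | 0x42;

	printf("block %#x, offset %#x\n", XATTR_BLK(ref), XATTR_OFFSET(ref));
	/* prints: block 0x1234, offset 0x42 */
	return 0;
}
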
@@ -228,7 +254,7 @@ struct squashfs_super_block {
228 __le64 root_inode; 254 __le64 root_inode;
229 __le64 bytes_used; 255 __le64 bytes_used;
230 __le64 id_table_start; 256 __le64 id_table_start;
231 __le64 xattr_table_start; 257 __le64 xattr_id_table_start;
232 __le64 inode_table_start; 258 __le64 inode_table_start;
233 __le64 directory_table_start; 259 __le64 directory_table_start;
234 __le64 fragment_table_start; 260 __le64 fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
261 __le32 nlink; 287 __le32 nlink;
262}; 288};
263 289
290struct squashfs_lipc_inode {
291 __le16 inode_type;
292 __le16 mode;
293 __le16 uid;
294 __le16 guid;
295 __le32 mtime;
296 __le32 inode_number;
297 __le32 nlink;
298 __le32 xattr;
299};
300
264struct squashfs_dev_inode { 301struct squashfs_dev_inode {
265 __le16 inode_type; 302 __le16 inode_type;
266 __le16 mode; 303 __le16 mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
272 __le32 rdev; 309 __le32 rdev;
273}; 310};
274 311
312struct squashfs_ldev_inode {
313 __le16 inode_type;
314 __le16 mode;
315 __le16 uid;
316 __le16 guid;
317 __le32 mtime;
318 __le32 inode_number;
319 __le32 nlink;
320 __le32 rdev;
321 __le32 xattr;
322};
323
275struct squashfs_symlink_inode { 324struct squashfs_symlink_inode {
276 __le16 inode_type; 325 __le16 inode_type;
277 __le16 mode; 326 __le16 mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
349union squashfs_inode { 398union squashfs_inode {
350 struct squashfs_base_inode base; 399 struct squashfs_base_inode base;
351 struct squashfs_dev_inode dev; 400 struct squashfs_dev_inode dev;
401 struct squashfs_ldev_inode ldev;
352 struct squashfs_symlink_inode symlink; 402 struct squashfs_symlink_inode symlink;
353 struct squashfs_reg_inode reg; 403 struct squashfs_reg_inode reg;
354 struct squashfs_lreg_inode lreg; 404 struct squashfs_lreg_inode lreg;
355 struct squashfs_dir_inode dir; 405 struct squashfs_dir_inode dir;
356 struct squashfs_ldir_inode ldir; 406 struct squashfs_ldir_inode ldir;
357 struct squashfs_ipc_inode ipc; 407 struct squashfs_ipc_inode ipc;
408 struct squashfs_lipc_inode lipc;
358}; 409};
359 410
360struct squashfs_dir_entry { 411struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
377 unsigned int unused; 428 unsigned int unused;
378}; 429};
379 430
431struct squashfs_xattr_entry {
432 __le16 type;
433 __le16 size;
434 char data[0];
435};
436
437struct squashfs_xattr_val {
438 __le32 vsize;
439 char value[0];
440};
441
442struct squashfs_xattr_id {
443 __le64 xattr;
444 __le32 count;
445 __le32 size;
446};
447
448struct squashfs_xattr_id_table {
449 __le64 xattr_table_start;
450 __le32 xattr_ids;
451 __le32 unused;
452};
453
380#endif 454#endif
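On disk each xattr is serialized as a squashfs_xattr_entry header (type and name size) followed by the name bytes, then a squashfs_xattr_val header (vsize) followed by the value bytes; when SQUASHFS_XATTR_VALUE_OOL is set in the type, the inline value is instead an 8-byte reference to the real, shared value elsewhere in the table. A host-endian sketch of that serialization order (stand-in structs, not the __le16/__le32 kernel ones):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct xattr_entry { uint16_t type; uint16_t size; };	/* then name bytes */
struct xattr_val { uint32_t vsize; };			/* then value bytes */

int main(void)
{
	unsigned char buf[64], *p = buf;
	struct xattr_entry entry = { 0 /* SQUASHFS_XATTR_USER */, 7 };
	struct xattr_val val = { 3 };

	memcpy(p, &entry, sizeof(entry));	p += sizeof(entry);
	memcpy(p, "comment", 7);		p += 7;
	memcpy(p, &val, sizeof(val));		p += sizeof(val);
	memcpy(p, "abc", 3);			p += 3;

	printf("one entry occupies %zu bytes\n", (size_t)(p - buf));
	return 0;
}
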
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
26struct squashfs_inode_info { 26struct squashfs_inode_info {
27 u64 start; 27 u64 start;
28 int offset; 28 int offset;
29 u64 xattr;
30 unsigned int xattr_size;
31 int xattr_count;
29 union { 32 union {
30 struct { 33 struct {
31 u64 fragment_block; 34 u64 fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
61 int next_meta_index; 61 int next_meta_index;
62 __le64 *id_table; 62 __le64 *id_table;
63 __le64 *fragment_index; 63 __le64 *fragment_index;
64 __le64 *xattr_id_table;
64 struct mutex read_data_mutex; 65 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 66 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 67 struct meta_index *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
68 __le64 *inode_lookup_table; 69 __le64 *inode_lookup_table;
69 u64 inode_table; 70 u64 inode_table;
70 u64 directory_table; 71 u64 directory_table;
72 u64 xattr_table;
71 unsigned int block_size; 73 unsigned int block_size;
72 unsigned short block_log; 74 unsigned short block_log;
73 long long bytes_used; 75 long long bytes_used;
74 unsigned int inodes; 76 unsigned int inodes;
77 int xattr_ids;
75}; 78};
76#endif 79#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 3550aec2f655..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/magic.h> 38#include <linux/magic.h>
39#include <linux/xattr.h>
39 40
40#include "squashfs_fs.h" 41#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 42#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h" 43#include "squashfs_fs_i.h"
43#include "squashfs.h" 44#include "squashfs.h"
44#include "decompressor.h" 45#include "decompressor.h"
46#include "xattr.h"
45 47
46static struct file_system_type squashfs_fs_type; 48static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 49static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
82 long long root_inode; 84 long long root_inode;
83 unsigned short flags; 85 unsigned short flags;
84 unsigned int fragments; 86 unsigned int fragments;
85 u64 lookup_table_start; 87 u64 lookup_table_start, xattr_id_table_start;
86 int err; 88 int err;
87 89
88 TRACE("Entered squashfs_fill_superblock\n"); 90 TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
139 if (msblk->decompressor == NULL) 141 if (msblk->decompressor == NULL)
140 goto failed_mount; 142 goto failed_mount;
141 143
142 /*
143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored.
145 */
146 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
147 ERROR("Xattrs in filesystem, these will be ignored\n");
148
149 /* Check the filesystem does not extend beyond the end of the 144 /* Check the filesystem does not extend beyond the end of the
150 block device */ 145 block device */
151 msblk->bytes_used = le64_to_cpu(sblk->bytes_used); 146 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
253allocate_lookup_table: 248allocate_lookup_table:
254 lookup_table_start = le64_to_cpu(sblk->lookup_table_start); 249 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
255 if (lookup_table_start == SQUASHFS_INVALID_BLK) 250 if (lookup_table_start == SQUASHFS_INVALID_BLK)
256 goto allocate_root; 251 goto allocate_xattr_table;
257 252
258 /* Allocate and read inode lookup table */ 253 /* Allocate and read inode lookup table */
259 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, 254 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
266 261
267 sb->s_export_op = &squashfs_export_ops; 262 sb->s_export_op = &squashfs_export_ops;
268 263
264allocate_xattr_table:
265 sb->s_xattr = squashfs_xattr_handlers;
266 xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
267 if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
268 goto allocate_root;
269
270 /* Allocate and read xattr id lookup table */
271 msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
272 xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
273 if (IS_ERR(msblk->xattr_id_table)) {
274 err = PTR_ERR(msblk->xattr_id_table);
275 msblk->xattr_id_table = NULL;
276 if (err != -ENOTSUPP)
277 goto failed_mount;
278 }
269allocate_root: 279allocate_root:
270 root = new_inode(sb); 280 root = new_inode(sb);
271 if (!root) { 281 if (!root) {
@@ -275,7 +285,8 @@ allocate_root:
275 285
276 err = squashfs_read_inode(root, root_inode); 286 err = squashfs_read_inode(root, root_inode);
277 if (err) { 287 if (err) {
278 iget_failed(root); 288 make_bad_inode(root);
289 iput(root);
279 goto failed_mount; 290 goto failed_mount;
280 } 291 }
281 insert_inode_hash(root); 292 insert_inode_hash(root);
@@ -300,6 +311,7 @@ failed_mount:
300 kfree(msblk->inode_lookup_table); 311 kfree(msblk->inode_lookup_table);
301 kfree(msblk->fragment_index); 312 kfree(msblk->fragment_index);
302 kfree(msblk->id_table); 313 kfree(msblk->id_table);
314 kfree(msblk->xattr_id_table);
303 kfree(sb->s_fs_info); 315 kfree(sb->s_fs_info);
304 sb->s_fs_info = NULL; 316 sb->s_fs_info = NULL;
305 kfree(sblk); 317 kfree(sblk);
@@ -353,6 +365,8 @@ static void squashfs_put_super(struct super_block *sb)
353 kfree(sbi->id_table); 365 kfree(sbi->id_table);
354 kfree(sbi->fragment_index); 366 kfree(sbi->fragment_index);
355 kfree(sbi->meta_index); 367 kfree(sbi->meta_index);
368 kfree(sbi->inode_lookup_table);
369 kfree(sbi->xattr_id_table);
356 kfree(sb->s_fs_info); 370 kfree(sb->s_fs_info);
357 sb->s_fs_info = NULL; 371 sb->s_fs_info = NULL;
358 } 372 }
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index e80be2022a7f..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -33,14 +33,15 @@
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/vfs.h> 34#include <linux/vfs.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/string.h> 36#include <linux/string.h>
38#include <linux/pagemap.h> 37#include <linux/pagemap.h>
38#include <linux/xattr.h>
39 39
40#include "squashfs_fs.h" 40#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
43#include "squashfs.h" 43#include "squashfs.h"
44#include "xattr.h"
44 45
45static int squashfs_symlink_readpage(struct file *file, struct page *page) 46static int squashfs_symlink_readpage(struct file *file, struct page *page)
46{ 47{
@@ -115,3 +116,12 @@ error_out:
115const struct address_space_operations squashfs_symlink_aops = { 116const struct address_space_operations squashfs_symlink_aops = {
116 .readpage = squashfs_symlink_readpage 117 .readpage = squashfs_symlink_readpage
117}; 118};
119
120const struct inode_operations squashfs_symlink_inode_ops = {
121 .readlink = generic_readlink,
122 .follow_link = page_follow_link_light,
123 .put_link = page_put_link,
124 .getxattr = generic_getxattr,
125 .listxattr = squashfs_listxattr
126};
127
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..c7655e8b31cd
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/string.h>
27#include <linux/fs.h>
28#include <linux/vfs.h>
29#include <linux/xattr.h>
30#include <linux/slab.h>
31
32#include "squashfs_fs.h"
33#include "squashfs_fs_sb.h"
34#include "squashfs_fs_i.h"
35#include "squashfs.h"
36
37static const struct xattr_handler *squashfs_xattr_handler(int);
38
39ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
40 size_t buffer_size)
41{
42 struct inode *inode = d->d_inode;
43 struct super_block *sb = inode->i_sb;
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
46 + msblk->xattr_table;
47 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
48 int count = squashfs_i(inode)->xattr_count;
49 size_t rest = buffer_size;
50 int err;
51
52 /* check that the file system has xattrs */
53 if (msblk->xattr_id_table == NULL)
54 return -EOPNOTSUPP;
55
56 /* loop reading each xattr name */
57 while (count--) {
58 struct squashfs_xattr_entry entry;
59 struct squashfs_xattr_val val;
60 const struct xattr_handler *handler;
61 int name_size, prefix_size = 0;
62
63 err = squashfs_read_metadata(sb, &entry, &start, &offset,
64 sizeof(entry));
65 if (err < 0)
66 goto failed;
67
68 name_size = le16_to_cpu(entry.size);
69 handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
70 if (handler)
71 prefix_size = handler->list(d, buffer, rest, NULL,
72 name_size, handler->flags);
73 if (prefix_size) {
74 if (buffer) {
75 if (prefix_size + name_size + 1 > rest) {
76 err = -ERANGE;
77 goto failed;
78 }
79 buffer += prefix_size;
80 }
81 err = squashfs_read_metadata(sb, buffer, &start,
82 &offset, name_size);
83 if (err < 0)
84 goto failed;
85 if (buffer) {
86 buffer[name_size] = '\0';
87 buffer += name_size + 1;
88 }
89 rest -= prefix_size + name_size + 1;
90 } else {
 91 /* no handler or insufficient privileges, so skip */
92 err = squashfs_read_metadata(sb, NULL, &start,
93 &offset, name_size);
94 if (err < 0)
95 goto failed;
96 }
97
98
99 /* skip remaining xattr entry */
100 err = squashfs_read_metadata(sb, &val, &start, &offset,
101 sizeof(val));
102 if (err < 0)
103 goto failed;
104
105 err = squashfs_read_metadata(sb, NULL, &start, &offset,
106 le32_to_cpu(val.vsize));
107 if (err < 0)
108 goto failed;
109 }
110 err = buffer_size - rest;
111
112failed:
113 return err;
114}
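squashfs_listxattr() follows the standard listxattr buffer protocol: with a NULL buffer it only sums prefix and name lengths, otherwise it copies NUL-terminated, namespace-prefixed names and fails with -ERANGE if they do not fit. From user space that becomes the usual two-call pattern:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	ssize_t len = listxattr(path, NULL, 0);		/* size query */
	char *buf, *p;

	if (len <= 0)
		return 0;		/* no xattrs, or unsupported */
	buf = malloc(len);
	if (buf == NULL)
		return 1;
	len = listxattr(path, buf, len);
	for (p = buf; p < buf + len; p += strlen(p) + 1)
		puts(p);		/* names are NUL-separated */
	free(buf);
	return 0;
}
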
115
116
117static int squashfs_xattr_get(struct inode *inode, int name_index,
118 const char *name, void *buffer, size_t buffer_size)
119{
120 struct super_block *sb = inode->i_sb;
121 struct squashfs_sb_info *msblk = sb->s_fs_info;
122 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
123 + msblk->xattr_table;
124 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
125 int count = squashfs_i(inode)->xattr_count;
126 int name_len = strlen(name);
127 int err, vsize;
128 char *target = kmalloc(name_len, GFP_KERNEL);
129
130 if (target == NULL)
131 return -ENOMEM;
132
133 /* loop reading each xattr name */
134 for (; count; count--) {
135 struct squashfs_xattr_entry entry;
136 struct squashfs_xattr_val val;
137 int type, prefix, name_size;
138
139 err = squashfs_read_metadata(sb, &entry, &start, &offset,
140 sizeof(entry));
141 if (err < 0)
142 goto failed;
143
144 name_size = le16_to_cpu(entry.size);
145 type = le16_to_cpu(entry.type);
146 prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
147
148 if (prefix == name_index && name_size == name_len)
149 err = squashfs_read_metadata(sb, target, &start,
150 &offset, name_size);
151 else
152 err = squashfs_read_metadata(sb, NULL, &start,
153 &offset, name_size);
154 if (err < 0)
155 goto failed;
156
157 if (prefix == name_index && name_size == name_len &&
158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr;
162 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val));
165 if (err < 0)
166 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start,
168 &offset, sizeof(xattr));
169 if (err < 0)
170 goto failed;
171 xattr = le64_to_cpu(xattr);
172 start = SQUASHFS_XATTR_BLK(xattr) +
173 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(xattr);
175 }
176 /* read xattr value */
177 err = squashfs_read_metadata(sb, &val, &start, &offset,
178 sizeof(val));
179 if (err < 0)
180 goto failed;
181
182 vsize = le32_to_cpu(val.vsize);
183 if (buffer) {
184 if (vsize > buffer_size) {
185 err = -ERANGE;
186 goto failed;
187 }
188 err = squashfs_read_metadata(sb, buffer, &start,
189 &offset, vsize);
190 if (err < 0)
191 goto failed;
192 }
193 break;
194 }
195
196 /* no match, skip remaining xattr entry */
197 err = squashfs_read_metadata(sb, &val, &start, &offset,
198 sizeof(val));
199 if (err < 0)
200 goto failed;
201 err = squashfs_read_metadata(sb, NULL, &start, &offset,
202 le32_to_cpu(val.vsize));
203 if (err < 0)
204 goto failed;
205 }
206 err = count ? vsize : -ENODATA;
207
208failed:
209 kfree(target);
210 return err;
211}
212
213
214/*
215 * User namespace support
216 */
217static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
218 const char *name, size_t name_len, int type)
219{
220 if (list && XATTR_USER_PREFIX_LEN <= list_size)
221 memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
222 return XATTR_USER_PREFIX_LEN;
223}
224
225static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
226 size_t size, int type)
227{
228 if (name[0] == '\0')
229 return -EINVAL;
230
231 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
232 buffer, size);
233}
234
235static const struct xattr_handler squashfs_xattr_user_handler = {
236 .prefix = XATTR_USER_PREFIX,
237 .list = squashfs_user_list,
238 .get = squashfs_user_get
239};
240
241/*
242 * Trusted namespace support
243 */
244static size_t squashfs_trusted_list(struct dentry *d, char *list,
245 size_t list_size, const char *name, size_t name_len, int type)
246{
247 if (!capable(CAP_SYS_ADMIN))
248 return 0;
249
250 if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
251 memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
252 return XATTR_TRUSTED_PREFIX_LEN;
253}
254
255static int squashfs_trusted_get(struct dentry *d, const char *name,
256 void *buffer, size_t size, int type)
257{
258 if (name[0] == '\0')
259 return -EINVAL;
260
261 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
262 buffer, size);
263}
264
265static const struct xattr_handler squashfs_xattr_trusted_handler = {
266 .prefix = XATTR_TRUSTED_PREFIX,
267 .list = squashfs_trusted_list,
268 .get = squashfs_trusted_get
269};
270
271/*
272 * Security namespace support
273 */
274static size_t squashfs_security_list(struct dentry *d, char *list,
275 size_t list_size, const char *name, size_t name_len, int type)
276{
277 if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
278 memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
279 return XATTR_SECURITY_PREFIX_LEN;
280}
281
282static int squashfs_security_get(struct dentry *d, const char *name,
283 void *buffer, size_t size, int type)
284{
285 if (name[0] == '\0')
286 return -EINVAL;
287
288 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
289 buffer, size);
290}
291
292static const struct xattr_handler squashfs_xattr_security_handler = {
293 .prefix = XATTR_SECURITY_PREFIX,
294 .list = squashfs_security_list,
295 .get = squashfs_security_get
296};
297
298static inline const struct xattr_handler *squashfs_xattr_handler(int type)
299{
300 if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
301 /* ignore unrecognised type */
302 return NULL;
303
304 switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
305 case SQUASHFS_XATTR_USER:
306 return &squashfs_xattr_user_handler;
307 case SQUASHFS_XATTR_TRUSTED:
308 return &squashfs_xattr_trusted_handler;
309 case SQUASHFS_XATTR_SECURITY:
310 return &squashfs_xattr_security_handler;
311 default:
312 /* ignore unrecognised type */
313 return NULL;
314 }
315}
316
317const struct xattr_handler *squashfs_xattr_handlers[] = {
318 &squashfs_xattr_user_handler,
319 &squashfs_xattr_trusted_handler,
320 &squashfs_xattr_security_handler,
321 NULL
322};
323
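Registering this table in sb->s_xattr is what makes generic_getxattr() usable as the ->getxattr method: the VFS picks the handler whose prefix matches, strips the prefix, and calls handler->get() with the bare name, which the squashfs_*_get() wrappers then re-qualify via the SQUASHFS_XATTR_* name index. A simplified model of that prefix resolution (illustrative only, not the kernel's xattr_resolve_name()):

#include <stdio.h>
#include <string.h>

struct handler { const char *prefix; };

static const struct handler handlers[] = {
	{ "user." }, { "trusted." }, { "security." }, { NULL }
};

static const struct handler *resolve(const char **name)
{
	const struct handler *h;

	for (h = handlers; h->prefix; h++) {
		size_t n = strlen(h->prefix);

		if (strncmp(*name, h->prefix, n) == 0) {
			*name += n;	/* handler sees the bare name */
			return h;
		}
	}
	return NULL;
}

int main(void)
{
	const char *name = "user.comment";
	const struct handler *h = resolve(&name);

	printf("%s -> %s\n", h ? h->prefix : "(none)", name);
	/* prints: user. -> comment */
	return 0;
}
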
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..9da071ae181c
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.h
22 */
23
24#ifdef CONFIG_SQUASHFS_XATTRS
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
 28 unsigned int *, unsigned long long *);
29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids)
32{
33 ERROR("Xattrs in filesystem, these will be ignored\n");
34 return ERR_PTR(-ENOTSUPP);
35}
36
37static inline int squashfs_xattr_lookup(struct super_block *sb,
 38 unsigned int index, int *count, unsigned int *size,
39 unsigned long long *xattr)
40{
41 return 0;
42}
43#define squashfs_listxattr NULL
44#define generic_getxattr NULL
45#define squashfs_xattr_handlers NULL
46#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24/*
25 * This file implements code to map the 32-bit xattr id stored in the inode
26 * into the on disk location of the xattr data.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32
33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h"
37
38/*
39 * Map xattr id using the xattr id look up table
40 */
41int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
42 int *count, unsigned int *size, unsigned long long *xattr)
43{
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 int block = SQUASHFS_XATTR_BLOCK(index);
46 int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
47 u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
48 struct squashfs_xattr_id id;
49 int err;
50
51 err = squashfs_read_metadata(sb, &id, &start_block, &offset,
52 sizeof(id));
53 if (err < 0)
54 return err;
55
56 *xattr = le64_to_cpu(id.xattr);
57 *size = le32_to_cpu(id.size);
58 *count = le32_to_cpu(id.count);
59 return 0;
60}
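The mapping is plain arithmetic: struct squashfs_xattr_id is 16 bytes and a metadata block holds SQUASHFS_METADATA_SIZE (8 KiB) of uncompressed data, so each block covers 512 ids; SQUASHFS_XATTR_BLOCK() and SQUASHFS_XATTR_BLOCK_OFFSET() are just the quotient and remainder. A worked example:

#include <stdio.h>

#define METADATA_SIZE	8192	/* SQUASHFS_METADATA_SIZE */
#define ID_SIZE		16	/* sizeof(struct squashfs_xattr_id) */

int main(void)
{
	unsigned int index = 1000;
	unsigned int bytes = index * ID_SIZE;

	printf("index %u -> block %u, offset %u\n", index,
	       bytes / METADATA_SIZE, bytes % METADATA_SIZE);
	/* prints: index 1000 -> block 1, offset 7808 */
	return 0;
}
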
61
62
63/*
64 * Read uncompressed xattr id lookup table indexes from disk into memory
65 */
66__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
67 u64 *xattr_table_start, int *xattr_ids)
68{
69 unsigned int len;
70 __le64 *xid_table;
71 struct squashfs_xattr_id_table id_table;
72 int err;
73
74 err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
75 if (err < 0) {
76 ERROR("unable to read xattr id table\n");
77 return ERR_PTR(err);
78 }
79 *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
80 *xattr_ids = le32_to_cpu(id_table.xattr_ids);
81 len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
82
83 TRACE("In read_xattr_index_table, length %d\n", len);
84
85 /* Allocate xattr id lookup table indexes */
86 xid_table = kmalloc(len, GFP_KERNEL);
87 if (xid_table == NULL) {
88 ERROR("Failed to allocate xattr id index table\n");
89 return ERR_PTR(-ENOMEM);
90 }
91
92 err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
93 if (err < 0) {
94 ERROR("unable to read xattr id index table\n");
95 kfree(xid_table);
96 return ERR_PTR(err);
97 }
98
99 return xid_table;
100}
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 4dd70e04333b..7a603874e483 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -24,6 +24,7 @@
24 24
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/slab.h>
27#include <linux/zlib.h> 28#include <linux/zlib.h>
28 29
29#include "squashfs_fs.h" 30#include "squashfs_fs.h"
@@ -127,8 +128,9 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
127 goto release_mutex; 128 goto release_mutex;
128 } 129 }
129 130
131 length = stream->total_out;
130 mutex_unlock(&msblk->read_data_mutex); 132 mutex_unlock(&msblk->read_data_mutex);
131 return stream->total_out; 133 return length;
132 134
133release_mutex: 135release_mutex:
134 mutex_unlock(&msblk->read_data_mutex); 136 mutex_unlock(&msblk->read_data_mutex);
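The zlib change fixes a small race: the z_stream is shared between readers and protected only by read_data_mutex, so reading stream->total_out after dropping the mutex could observe a concurrent decompression's value. The fix snapshots it into a local while the lock is still held; the generic pattern, sketched with pthreads:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long shared_total;	/* stands in for stream->total_out */

long finish_and_report(void)
{
	long length;

	pthread_mutex_lock(&lock);
	/* ... decompression updates shared_total ... */
	length = shared_total;	/* copy before unlock, as in the patch */
	pthread_mutex_unlock(&lock);
	return length;		/* not: return shared_total; */
}

int main(void)
{
	shared_total = 42;
	return finish_and_report() == 42 ? 0 : 1;
}
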
diff --git a/fs/statfs.c b/fs/statfs.c
new file mode 100644
index 000000000000..4ef021f3b612
--- /dev/null
+++ b/fs/statfs.c
@@ -0,0 +1,196 @@
1#include <linux/syscalls.h>
2#include <linux/module.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/namei.h>
6#include <linux/statfs.h>
7#include <linux/security.h>
8#include <linux/uaccess.h>
9
10int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
11{
12 int retval = -ENODEV;
13
14 if (dentry) {
15 retval = -ENOSYS;
16 if (dentry->d_sb->s_op->statfs) {
17 memset(buf, 0, sizeof(*buf));
18 retval = security_sb_statfs(dentry);
19 if (retval)
20 return retval;
21 retval = dentry->d_sb->s_op->statfs(dentry, buf);
22 if (retval == 0 && buf->f_frsize == 0)
23 buf->f_frsize = buf->f_bsize;
24 }
25 }
26 return retval;
27}
28
29EXPORT_SYMBOL(vfs_statfs);
30
31static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
32{
33 struct kstatfs st;
34 int retval;
35
36 retval = vfs_statfs(dentry, &st);
37 if (retval)
38 return retval;
39
40 if (sizeof(*buf) == sizeof(st))
41 memcpy(buf, &st, sizeof(st));
42 else {
43 if (sizeof buf->f_blocks == 4) {
44 if ((st.f_blocks | st.f_bfree | st.f_bavail |
45 st.f_bsize | st.f_frsize) &
46 0xffffffff00000000ULL)
47 return -EOVERFLOW;
48 /*
49 * f_files and f_ffree may be -1; it's okay to stuff
50 * that into 32 bits
51 */
52 if (st.f_files != -1 &&
53 (st.f_files & 0xffffffff00000000ULL))
54 return -EOVERFLOW;
55 if (st.f_ffree != -1 &&
56 (st.f_ffree & 0xffffffff00000000ULL))
57 return -EOVERFLOW;
58 }
59
60 buf->f_type = st.f_type;
61 buf->f_bsize = st.f_bsize;
62 buf->f_blocks = st.f_blocks;
63 buf->f_bfree = st.f_bfree;
64 buf->f_bavail = st.f_bavail;
65 buf->f_files = st.f_files;
66 buf->f_ffree = st.f_ffree;
67 buf->f_fsid = st.f_fsid;
68 buf->f_namelen = st.f_namelen;
69 buf->f_frsize = st.f_frsize;
70 memset(buf->f_spare, 0, sizeof(buf->f_spare));
71 }
72 return 0;
73}
74
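The 32-bit compatibility logic deserves a note: when the struct statfs fields are only 32 bits wide, any 64-bit count with bits set above bit 31 cannot be represented and must fail with -EOVERFLOW rather than be silently truncated; f_files/f_ffree of -1 mean "unknown" and are explicitly let through even though all their high bits are set. A standalone illustration of the mask test:

#include <stdio.h>
#include <stdint.h>

#define HIGH_BITS	0xffffffff00000000ULL

int main(void)
{
	uint64_t fits = 0xffffffffULL;		/* representable in 32 bits */
	uint64_t too_big = 1ULL << 32;		/* would be truncated */
	uint64_t unknown = (uint64_t)-1;	/* f_files "unknown" marker */

	printf("%d %d %d\n",
	       !!(fits & HIGH_BITS),	/* 0: ok */
	       !!(too_big & HIGH_BITS),	/* 1: -EOVERFLOW */
	       !!(unknown & HIGH_BITS));/* 1, but -1 is whitelisted */
	return 0;
}
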
75static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
76{
77 struct kstatfs st;
78 int retval;
79
80 retval = vfs_statfs(dentry, &st);
81 if (retval)
82 return retval;
83
84 if (sizeof(*buf) == sizeof(st))
85 memcpy(buf, &st, sizeof(st));
86 else {
87 buf->f_type = st.f_type;
88 buf->f_bsize = st.f_bsize;
89 buf->f_blocks = st.f_blocks;
90 buf->f_bfree = st.f_bfree;
91 buf->f_bavail = st.f_bavail;
92 buf->f_files = st.f_files;
93 buf->f_ffree = st.f_ffree;
94 buf->f_fsid = st.f_fsid;
95 buf->f_namelen = st.f_namelen;
96 buf->f_frsize = st.f_frsize;
97 memset(buf->f_spare, 0, sizeof(buf->f_spare));
98 }
99 return 0;
100}
101
102SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
103{
104 struct path path;
105 int error;
106
107 error = user_path(pathname, &path);
108 if (!error) {
109 struct statfs tmp;
110 error = vfs_statfs_native(path.dentry, &tmp);
111 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
112 error = -EFAULT;
113 path_put(&path);
114 }
115 return error;
116}
117
118SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
119{
120 struct path path;
121 long error;
122
123 if (sz != sizeof(*buf))
124 return -EINVAL;
125 error = user_path(pathname, &path);
126 if (!error) {
127 struct statfs64 tmp;
128 error = vfs_statfs64(path.dentry, &tmp);
129 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
130 error = -EFAULT;
131 path_put(&path);
132 }
133 return error;
134}
135
136SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
137{
138 struct file *file;
139 struct statfs tmp;
140 int error;
141
142 error = -EBADF;
143 file = fget(fd);
144 if (!file)
145 goto out;
146 error = vfs_statfs_native(file->f_path.dentry, &tmp);
147 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
148 error = -EFAULT;
149 fput(file);
150out:
151 return error;
152}
153
154SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
155{
156 struct file *file;
157 struct statfs64 tmp;
158 int error;
159
160 if (sz != sizeof(*buf))
161 return -EINVAL;
162
163 error = -EBADF;
164 file = fget(fd);
165 if (!file)
166 goto out;
167 error = vfs_statfs64(file->f_path.dentry, &tmp);
168 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
169 error = -EFAULT;
170 fput(file);
171out:
172 return error;
173}
174
175SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
176{
177 struct super_block *s;
178 struct ustat tmp;
179 struct kstatfs sbuf;
180 int err;
181
182 s = user_get_super(new_decode_dev(dev));
183 if (!s)
184 return -EINVAL;
185
186 err = vfs_statfs(s->s_root, &sbuf);
187 drop_super(s);
188 if (err)
189 return err;
190
191 memset(&tmp,0,sizeof(struct ustat));
192 tmp.f_tfree = sbuf.f_bfree;
193 tmp.f_tinode = sbuf.f_ffree;
194
195 return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
196}
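The four syscalls differ only in where they start (path versus file descriptor) and in the width of the user-visible structure; the *64 variants additionally insist that the caller's idea of sizeof(struct statfs64) matches the kernel's. Exercising the path-based variant from user space:

#include <stdio.h>
#include <sys/statfs.h>

int main(int argc, char **argv)
{
	struct statfs st;

	if (statfs(argc > 1 ? argv[1] : "/", &st) != 0) {
		perror("statfs");
		return 1;
	}
	printf("type %#lx, bsize %ld, blocks %llu, free %llu\n",
	       (unsigned long)st.f_type, (long)st.f_bsize,
	       (unsigned long long)st.f_blocks,
	       (unsigned long long)st.f_bfree);
	return 0;
}
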
diff --git a/fs/super.c b/fs/super.c
index f35ac6022109..5c35bc7a499e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,22 +22,14 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/init.h>
26#include <linux/smp_lock.h>
27#include <linux/acct.h> 25#include <linux/acct.h>
28#include <linux/blkdev.h> 26#include <linux/blkdev.h>
29#include <linux/quotaops.h>
30#include <linux/namei.h>
31#include <linux/mount.h> 27#include <linux/mount.h>
32#include <linux/security.h> 28#include <linux/security.h>
33#include <linux/syscalls.h>
34#include <linux/vfs.h>
35#include <linux/writeback.h> /* for the emergency remount stuff */ 29#include <linux/writeback.h> /* for the emergency remount stuff */
36#include <linux/idr.h> 30#include <linux/idr.h>
37#include <linux/kobject.h>
38#include <linux/mutex.h> 31#include <linux/mutex.h>
39#include <linux/file.h> 32#include <linux/backing-dev.h>
40#include <asm/uaccess.h>
41#include "internal.h" 33#include "internal.h"
42 34
43 35
@@ -92,16 +84,15 @@ static struct super_block *alloc_super(struct file_system_type *type)
92 * subclass. 84 * subclass.
93 */ 85 */
94 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); 86 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
95 s->s_count = S_BIAS; 87 s->s_count = 1;
96 atomic_set(&s->s_active, 1); 88 atomic_set(&s->s_active, 1);
97 mutex_init(&s->s_vfs_rename_mutex); 89 mutex_init(&s->s_vfs_rename_mutex);
90 lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
98 mutex_init(&s->s_dquot.dqio_mutex); 91 mutex_init(&s->s_dquot.dqio_mutex);
99 mutex_init(&s->s_dquot.dqonoff_mutex); 92 mutex_init(&s->s_dquot.dqonoff_mutex);
100 init_rwsem(&s->s_dquot.dqptr_sem); 93 init_rwsem(&s->s_dquot.dqptr_sem);
101 init_waitqueue_head(&s->s_wait_unfrozen); 94 init_waitqueue_head(&s->s_wait_unfrozen);
102 s->s_maxbytes = MAX_NON_LFS; 95 s->s_maxbytes = MAX_NON_LFS;
103 s->dq_op = sb_dquot_ops;
104 s->s_qcop = sb_quotactl_ops;
105 s->s_op = &default_op; 96 s->s_op = &default_op;
106 s->s_time_gran = 1000000000; 97 s->s_time_gran = 1000000000;
107 } 98 }
@@ -126,39 +117,14 @@ static inline void destroy_super(struct super_block *s)
126/* Superblock refcounting */ 117/* Superblock refcounting */
127 118
128/* 119/*
129 * Drop a superblock's refcount. Returns non-zero if the superblock was 120 * Drop a superblock's refcount. The caller must hold sb_lock.
130 * destroyed. The caller must hold sb_lock.
131 */ 121 */
132static int __put_super(struct super_block *sb) 122void __put_super(struct super_block *sb)
133{ 123{
134 int ret = 0;
135
136 if (!--sb->s_count) { 124 if (!--sb->s_count) {
125 list_del_init(&sb->s_list);
137 destroy_super(sb); 126 destroy_super(sb);
138 ret = 1;
139 }
140 return ret;
141}
142
143/*
144 * Drop a superblock's refcount.
145 * Returns non-zero if the superblock is about to be destroyed and
146 * at least is already removed from super_blocks list, so if we are
147 * making a loop through super blocks then we need to restart.
148 * The caller must hold sb_lock.
149 */
150int __put_super_and_need_restart(struct super_block *sb)
151{
152 /* check for race with generic_shutdown_super() */
153 if (list_empty(&sb->s_list)) {
154 /* super block is removed, need to restart... */
155 __put_super(sb);
156 return 1;
157 } 127 }
158 /* can't be the last, since s_list is still in use */
159 sb->s_count--;
160 BUG_ON(sb->s_count == 0);
161 return 0;
162} 128}
163 129
164/** 130/**
@@ -177,57 +143,47 @@ void put_super(struct super_block *sb)
177 143
178 144
179/** 145/**
180 * deactivate_super - drop an active reference to superblock 146 * deactivate_locked_super - drop an active reference to superblock
181 * @s: superblock to deactivate 147 * @s: superblock to deactivate
182 * 148 *
 183 * Drops an active reference to superblock, acquiring a temporary one if 149 * Drops an active reference to superblock, converting it into a temporary
 184 * there are no active references left. In that case we lock superblock, 150 * one if there are no other active references left. In that case we
185 * tell fs driver to shut it down and drop the temporary reference we 151 * tell fs driver to shut it down and drop the temporary reference we
186 * had just acquired. 152 * had just acquired.
153 *
154 * Caller holds exclusive lock on superblock; that lock is released.
187 */ 155 */
188void deactivate_super(struct super_block *s) 156void deactivate_locked_super(struct super_block *s)
189{ 157{
190 struct file_system_type *fs = s->s_type; 158 struct file_system_type *fs = s->s_type;
191 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 159 if (atomic_dec_and_test(&s->s_active)) {
192 s->s_count -= S_BIAS-1;
193 spin_unlock(&sb_lock);
194 vfs_dq_off(s, 0);
195 down_write(&s->s_umount);
196 fs->kill_sb(s); 160 fs->kill_sb(s);
197 put_filesystem(fs); 161 put_filesystem(fs);
198 put_super(s); 162 put_super(s);
163 } else {
164 up_write(&s->s_umount);
199 } 165 }
200} 166}
201 167
202EXPORT_SYMBOL(deactivate_super); 168EXPORT_SYMBOL(deactivate_locked_super);
203 169
204/** 170/**
205 * deactivate_locked_super - drop an active reference to superblock 171 * deactivate_super - drop an active reference to superblock
206 * @s: superblock to deactivate 172 * @s: superblock to deactivate
207 * 173 *
208 * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that 174 * Variant of deactivate_locked_super(), except that superblock is *not*
209 * it does not unlock it until it's all over. As the result, it's safe to 175 * locked by caller. If we are going to drop the final active reference,
210 * use to dispose of new superblock on ->get_sb() failure exits - nobody 176 * lock will be acquired prior to that.
211 * will see the sucker until it's all over. Equivalent using up_write +
212 * deactivate_super is safe for that purpose only if superblock is either
213 * safe to use or has NULL ->s_root when we unlock.
214 */ 177 */
215void deactivate_locked_super(struct super_block *s) 178void deactivate_super(struct super_block *s)
216{ 179{
217 struct file_system_type *fs = s->s_type; 180 if (!atomic_add_unless(&s->s_active, -1, 1)) {
218 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 181 down_write(&s->s_umount);
219 s->s_count -= S_BIAS-1; 182 deactivate_locked_super(s);
220 spin_unlock(&sb_lock);
221 vfs_dq_off(s, 0);
222 fs->kill_sb(s);
223 put_filesystem(fs);
224 put_super(s);
225 } else {
226 up_write(&s->s_umount);
227 } 183 }
228} 184}
229 185
230EXPORT_SYMBOL(deactivate_locked_super); 186EXPORT_SYMBOL(deactivate_super);
231 187
232/** 188/**
233 * grab_super - acquire an active reference 189 * grab_super - acquire an active reference
@@ -242,22 +198,17 @@ EXPORT_SYMBOL(deactivate_locked_super);
242 */ 198 */
243static int grab_super(struct super_block *s) __releases(sb_lock) 199static int grab_super(struct super_block *s) __releases(sb_lock)
244{ 200{
201 if (atomic_inc_not_zero(&s->s_active)) {
202 spin_unlock(&sb_lock);
203 return 1;
204 }
205 /* it's going away */
245 s->s_count++; 206 s->s_count++;
246 spin_unlock(&sb_lock); 207 spin_unlock(&sb_lock);
208 /* wait for it to die */
247 down_write(&s->s_umount); 209 down_write(&s->s_umount);
248 if (s->s_root) {
249 spin_lock(&sb_lock);
250 if (s->s_count > S_BIAS) {
251 atomic_inc(&s->s_active);
252 s->s_count--;
253 spin_unlock(&sb_lock);
254 return 1;
255 }
256 spin_unlock(&sb_lock);
257 }
258 up_write(&s->s_umount); 210 up_write(&s->s_umount);
259 put_super(s); 211 put_super(s);
260 yield();
261 return 0; 212 return 0;
262} 213}
263 214
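After this rework s_active is a plain count of active references (the old scheme folded S_BIAS into s_count): grab_super() can take an active reference only while the count is still non-zero, via atomic_inc_not_zero(); otherwise it takes a passive s_count reference and uses s_umount purely to wait for the umount in progress to finish. A userspace model of the inc-not-zero rule with C11 atomics (a sketch, not the kernel primitive):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int s_active = 1;	/* model: one active reference held */

/* succeed only while someone else still holds an active reference;
 * once the count hits zero the superblock is going away */
static bool grab_active(void)
{
	int v = atomic_load(&s_active);

	while (v != 0)
		if (atomic_compare_exchange_weak(&s_active, &v, v + 1))
			return true;
	return false;
}

int main(void)
{
	return grab_active() ? 0 : 1;
}
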
@@ -320,8 +271,7 @@ void generic_shutdown_super(struct super_block *sb)
320 } 271 }
321 spin_lock(&sb_lock); 272 spin_lock(&sb_lock);
322 /* should be initialized for __put_super_and_need_restart() */ 273 /* should be initialized for __put_super_and_need_restart() */
323 list_del_init(&sb->s_list); 274 list_del_init(&sb->s_instances);
324 list_del(&sb->s_instances);
325 spin_unlock(&sb_lock); 275 spin_unlock(&sb_lock);
326 up_write(&sb->s_umount); 276 up_write(&sb->s_umount);
327} 277}
@@ -356,6 +306,7 @@ retry:
356 up_write(&s->s_umount); 306 up_write(&s->s_umount);
357 destroy_super(s); 307 destroy_super(s);
358 } 308 }
309 down_write(&old->s_umount);
359 return old; 310 return old;
360 } 311 }
361 } 312 }
@@ -407,11 +358,12 @@ EXPORT_SYMBOL(drop_super);
407 */ 358 */
408void sync_supers(void) 359void sync_supers(void)
409{ 360{
410 struct super_block *sb; 361 struct super_block *sb, *n;
411 362
412 spin_lock(&sb_lock); 363 spin_lock(&sb_lock);
413restart: 364 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
414 list_for_each_entry(sb, &super_blocks, s_list) { 365 if (list_empty(&sb->s_instances))
366 continue;
415 if (sb->s_op->write_super && sb->s_dirt) { 367 if (sb->s_op->write_super && sb->s_dirt) {
416 sb->s_count++; 368 sb->s_count++;
417 spin_unlock(&sb_lock); 369 spin_unlock(&sb_lock);
@@ -422,14 +374,43 @@ restart:
422 up_read(&sb->s_umount); 374 up_read(&sb->s_umount);
423 375
424 spin_lock(&sb_lock); 376 spin_lock(&sb_lock);
425 if (__put_super_and_need_restart(sb)) 377 __put_super(sb);
426 goto restart;
427 } 378 }
428 } 379 }
429 spin_unlock(&sb_lock); 380 spin_unlock(&sb_lock);
430} 381}
431 382
432/** 383/**
384 * iterate_supers - call function for all active superblocks
385 * @f: function to call
386 * @arg: argument to pass to it
387 *
388 * Scans the superblock list and calls given function, passing it
389 * locked superblock and given argument.
390 */
391void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
392{
393 struct super_block *sb, *n;
394
395 spin_lock(&sb_lock);
396 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
397 if (list_empty(&sb->s_instances))
398 continue;
399 sb->s_count++;
400 spin_unlock(&sb_lock);
401
402 down_read(&sb->s_umount);
403 if (sb->s_root)
404 f(sb, arg);
405 up_read(&sb->s_umount);
406
407 spin_lock(&sb_lock);
408 __put_super(sb);
409 }
410 spin_unlock(&sb_lock);
411}
412
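iterate_supers() factors out the walk pattern used throughout this file: a passive s_count pin taken under sb_lock, s_umount held shared across the callback, and superblocks already unlinked from their type's s_instances list skipped. A hypothetical caller, kernel-style (count_dirty and dirty_supers are made up for illustration; sb->s_dirt exists at this point in the tree):

static void count_dirty(struct super_block *sb, void *arg)
{
	/* runs with s_umount held shared and sb pinned */
	if (sb->s_dirt)
		(*(int *)arg)++;
}

static int dirty_supers(void)
{
	int n = 0;

	iterate_supers(count_dirty, &n);
	return n;
}
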
413/**
433 * get_super - get the superblock of a device 414 * get_super - get the superblock of a device
434 * @bdev: device to get the superblock for 415 * @bdev: device to get the superblock for
435 * 416 *
@@ -437,7 +418,7 @@ restart:
437 * mounted on the device given. %NULL is returned if no match is found. 418 * mounted on the device given. %NULL is returned if no match is found.
438 */ 419 */
439 420
440struct super_block * get_super(struct block_device *bdev) 421struct super_block *get_super(struct block_device *bdev)
441{ 422{
442 struct super_block *sb; 423 struct super_block *sb;
443 424
@@ -447,17 +428,20 @@ struct super_block * get_super(struct block_device *bdev)
447 spin_lock(&sb_lock); 428 spin_lock(&sb_lock);
448rescan: 429rescan:
449 list_for_each_entry(sb, &super_blocks, s_list) { 430 list_for_each_entry(sb, &super_blocks, s_list) {
431 if (list_empty(&sb->s_instances))
432 continue;
450 if (sb->s_bdev == bdev) { 433 if (sb->s_bdev == bdev) {
451 sb->s_count++; 434 sb->s_count++;
452 spin_unlock(&sb_lock); 435 spin_unlock(&sb_lock);
453 down_read(&sb->s_umount); 436 down_read(&sb->s_umount);
437 /* still alive? */
454 if (sb->s_root) 438 if (sb->s_root)
455 return sb; 439 return sb;
456 up_read(&sb->s_umount); 440 up_read(&sb->s_umount);
457 /* restart only when sb is no longer on the list */ 441 /* nope, got unmounted */
458 spin_lock(&sb_lock); 442 spin_lock(&sb_lock);
459 if (__put_super_and_need_restart(sb)) 443 __put_super(sb);
460 goto rescan; 444 goto rescan;
461 } 445 }
462 } 446 }
463 spin_unlock(&sb_lock); 447 spin_unlock(&sb_lock);
@@ -472,7 +456,7 @@ EXPORT_SYMBOL(get_super);
472 * 456 *
473 * Scans the superblock list and finds the superblock of the file system 457 * Scans the superblock list and finds the superblock of the file system
474 * mounted on the device given. Returns the superblock with an active 458 * mounted on the device given. Returns the superblock with an active
475 * reference and s_umount held exclusively or %NULL if none was found. 459 * reference or %NULL if none was found.
476 */ 460 */
477struct super_block *get_active_super(struct block_device *bdev) 461struct super_block *get_active_super(struct block_device *bdev)
478{ 462{
@@ -481,81 +465,49 @@ struct super_block *get_active_super(struct block_device *bdev)
481 if (!bdev) 465 if (!bdev)
482 return NULL; 466 return NULL;
483 467
468restart:
484 spin_lock(&sb_lock); 469 spin_lock(&sb_lock);
485 list_for_each_entry(sb, &super_blocks, s_list) { 470 list_for_each_entry(sb, &super_blocks, s_list) {
486 if (sb->s_bdev != bdev) 471 if (list_empty(&sb->s_instances))
487 continue; 472 continue;
488 473 if (sb->s_bdev == bdev) {
489 sb->s_count++; 474 if (grab_super(sb)) /* drops sb_lock */
490 spin_unlock(&sb_lock);
491 down_write(&sb->s_umount);
492 if (sb->s_root) {
493 spin_lock(&sb_lock);
494 if (sb->s_count > S_BIAS) {
495 atomic_inc(&sb->s_active);
496 sb->s_count--;
497 spin_unlock(&sb_lock);
498 return sb; 475 return sb;
499 } 476 else
500 spin_unlock(&sb_lock); 477 goto restart;
501 } 478 }
502 up_write(&sb->s_umount);
503 put_super(sb);
504 yield();
505 spin_lock(&sb_lock);
506 } 479 }
507 spin_unlock(&sb_lock); 480 spin_unlock(&sb_lock);
508 return NULL; 481 return NULL;
509} 482}
510 483
511struct super_block * user_get_super(dev_t dev) 484struct super_block *user_get_super(dev_t dev)
512{ 485{
513 struct super_block *sb; 486 struct super_block *sb;
514 487
515 spin_lock(&sb_lock); 488 spin_lock(&sb_lock);
516rescan: 489rescan:
517 list_for_each_entry(sb, &super_blocks, s_list) { 490 list_for_each_entry(sb, &super_blocks, s_list) {
491 if (list_empty(&sb->s_instances))
492 continue;
518 if (sb->s_dev == dev) { 493 if (sb->s_dev == dev) {
519 sb->s_count++; 494 sb->s_count++;
520 spin_unlock(&sb_lock); 495 spin_unlock(&sb_lock);
521 down_read(&sb->s_umount); 496 down_read(&sb->s_umount);
497 /* still alive? */
522 if (sb->s_root) 498 if (sb->s_root)
523 return sb; 499 return sb;
524 up_read(&sb->s_umount); 500 up_read(&sb->s_umount);
525 /* restart only when sb is no longer on the list */ 501 /* nope, got unmounted */
526 spin_lock(&sb_lock); 502 spin_lock(&sb_lock);
527 if (__put_super_and_need_restart(sb)) 503 __put_super(sb);
528 goto rescan; 504 goto rescan;
529 } 505 }
530 } 506 }
531 spin_unlock(&sb_lock); 507 spin_unlock(&sb_lock);
532 return NULL; 508 return NULL;
533} 509}
534 510
535SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
536{
537 struct super_block *s;
538 struct ustat tmp;
539 struct kstatfs sbuf;
540 int err = -EINVAL;
541
542 s = user_get_super(new_decode_dev(dev));
543 if (s == NULL)
544 goto out;
545 err = vfs_statfs(s->s_root, &sbuf);
546 drop_super(s);
547 if (err)
548 goto out;
549
550 memset(&tmp,0,sizeof(struct ustat));
551 tmp.f_tfree = sbuf.f_bfree;
552 tmp.f_tinode = sbuf.f_ffree;
553
554 err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
555out:
556 return err;
557}
558
559/** 511/**
560 * do_remount_sb - asks filesystem to change mount options. 512 * do_remount_sb - asks filesystem to change mount options.
561 * @sb: superblock in question 513 * @sb: superblock in question
@@ -568,7 +520,7 @@ out:
568int do_remount_sb(struct super_block *sb, int flags, void *data, int force) 520int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
569{ 521{
570 int retval; 522 int retval;
571 int remount_rw, remount_ro; 523 int remount_ro;
572 524
573 if (sb->s_frozen != SB_UNFROZEN) 525 if (sb->s_frozen != SB_UNFROZEN)
574 return -EBUSY; 526 return -EBUSY;
@@ -584,7 +536,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
584 sync_filesystem(sb); 536 sync_filesystem(sb);
585 537
586 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 538 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
587 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
588 539
589 /* If we are remounting RDONLY and current sb is read/write, 540 /* If we are remounting RDONLY and current sb is read/write,
590 make sure there are no rw files opened */ 541 make sure there are no rw files opened */
@@ -593,9 +544,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
593 mark_files_ro(sb); 544 mark_files_ro(sb);
594 else if (!fs_may_remount_ro(sb)) 545 else if (!fs_may_remount_ro(sb))
595 return -EBUSY; 546 return -EBUSY;
596 retval = vfs_dq_off(sb, 1);
597 if (retval < 0 && retval != -ENOSYS)
598 return -EBUSY;
599 } 547 }
600 548
601 if (sb->s_op->remount_fs) { 549 if (sb->s_op->remount_fs) {
@@ -604,8 +552,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
604 return retval; 552 return retval;
605 } 553 }
606 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 554 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
607 if (remount_rw) 555
608 vfs_dq_quota_on_remount(sb);
609 /* 556 /*
610 * Some filesystems modify their metadata via some other path than the 557 * Some filesystems modify their metadata via some other path than the
611 * bdev buffer cache (eg. use a private mapping, or directories in 558 * bdev buffer cache (eg. use a private mapping, or directories in
@@ -621,24 +568,24 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
621 568
622static void do_emergency_remount(struct work_struct *work) 569static void do_emergency_remount(struct work_struct *work)
623{ 570{
624 struct super_block *sb; 571 struct super_block *sb, *n;
625 572
626 spin_lock(&sb_lock); 573 spin_lock(&sb_lock);
627 list_for_each_entry(sb, &super_blocks, s_list) { 574 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
575 if (list_empty(&sb->s_instances))
576 continue;
628 sb->s_count++; 577 sb->s_count++;
629 spin_unlock(&sb_lock); 578 spin_unlock(&sb_lock);
630 down_write(&sb->s_umount); 579 down_write(&sb->s_umount);
631 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { 580 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
632 /* 581 /*
633 * ->remount_fs needs lock_kernel().
634 *
635 * What lock protects sb->s_flags?? 582 * What lock protects sb->s_flags??
636 */ 583 */
637 do_remount_sb(sb, MS_RDONLY, NULL, 1); 584 do_remount_sb(sb, MS_RDONLY, NULL, 1);
638 } 585 }
639 up_write(&sb->s_umount); 586 up_write(&sb->s_umount);
640 put_super(sb);
641 spin_lock(&sb_lock); 587 spin_lock(&sb_lock);
588 __put_super(sb);
642 } 589 }
643 spin_unlock(&sb_lock); 590 spin_unlock(&sb_lock);
644 kfree(work); 591 kfree(work);
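The loop above pins each superblock with s_count before dropping sb_lock, and unpins with __put_super() once the lock is re-taken; list_for_each_entry_safe() keeps the cursor valid across the unlocked window. A sketch of that traversal pattern, assuming the same fs/super.c internals (sb_lock, super_blocks, __put_super) are in scope; for_each_super_locked() and do_something() are hypothetical names:

#include <linux/fs.h>
#include <linux/list.h>
#include <linux/spinlock.h>

static void for_each_super_locked(void (*do_something)(struct super_block *))
{
	struct super_block *sb, *n;

	spin_lock(&sb_lock);
	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
		if (list_empty(&sb->s_instances))
			continue;		/* already being torn down */
		sb->s_count++;			/* pin: sb outlives the unlock */
		spin_unlock(&sb_lock);

		down_write(&sb->s_umount);
		do_something(sb);
		up_write(&sb->s_umount);

		spin_lock(&sb_lock);
		__put_super(sb);		/* unpin under sb_lock */
	}
	spin_unlock(&sb_lock);
}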
@@ -693,6 +640,7 @@ int set_anon_super(struct super_block *s, void *data)
693 return -EMFILE; 640 return -EMFILE;
694 } 641 }
695 s->s_dev = MKDEV(0, dev & MINORMASK); 642 s->s_dev = MKDEV(0, dev & MINORMASK);
643 s->s_bdi = &noop_backing_dev_info;
696 return 0; 644 return 0;
697} 645}
698 646
@@ -954,10 +902,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
954 if (error < 0) 902 if (error < 0)
955 goto out_free_secdata; 903 goto out_free_secdata;
956 BUG_ON(!mnt->mnt_sb); 904 BUG_ON(!mnt->mnt_sb);
905 WARN_ON(!mnt->mnt_sb->s_bdi);
957 906
958 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); 907 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
959 if (error) 908 if (error)
960 goto out_sb; 909 goto out_sb;
961 910
962 /* 911 /*
963 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE 912 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
@@ -987,6 +936,96 @@ out:
987 936
988EXPORT_SYMBOL_GPL(vfs_kern_mount); 937EXPORT_SYMBOL_GPL(vfs_kern_mount);
989 938
939/**
940 * freeze_super - lock the filesystem and force it into a consistent state
941 * @sb: the super to lock
942 *
943 * Syncs the super to make sure the filesystem is consistent and calls the fs's
944 * freeze_fs. Subsequent calls to this without first thawing the fs will return
945 * -EBUSY.
946 */
947int freeze_super(struct super_block *sb)
948{
949 int ret;
950
951 atomic_inc(&sb->s_active);
952 down_write(&sb->s_umount);
953 if (sb->s_frozen) {
954 deactivate_locked_super(sb);
955 return -EBUSY;
956 }
957
958 if (sb->s_flags & MS_RDONLY) {
959 sb->s_frozen = SB_FREEZE_TRANS;
960 smp_wmb();
961 up_write(&sb->s_umount);
962 return 0;
963 }
964
965 sb->s_frozen = SB_FREEZE_WRITE;
966 smp_wmb();
967
968 sync_filesystem(sb);
969
970 sb->s_frozen = SB_FREEZE_TRANS;
971 smp_wmb();
972
973 sync_blockdev(sb->s_bdev);
974 if (sb->s_op->freeze_fs) {
975 ret = sb->s_op->freeze_fs(sb);
976 if (ret) {
977 printk(KERN_ERR
978 "VFS:Filesystem freeze failed\n");
979 sb->s_frozen = SB_UNFROZEN;
980 deactivate_locked_super(sb);
981 return ret;
982 }
983 }
984 up_write(&sb->s_umount);
985 return 0;
986}
987EXPORT_SYMBOL(freeze_super);
988
989/**
990 * thaw_super -- unlock filesystem
991 * @sb: the super to thaw
992 *
993 * Unlocks the filesystem and marks it writeable again after freeze_super().
994 */
995int thaw_super(struct super_block *sb)
996{
997 int error;
998
999 down_write(&sb->s_umount);
1000 if (sb->s_frozen == SB_UNFROZEN) {
1001 up_write(&sb->s_umount);
1002 return -EINVAL;
1003 }
1004
1005 if (sb->s_flags & MS_RDONLY)
1006 goto out;
1007
1008 if (sb->s_op->unfreeze_fs) {
1009 error = sb->s_op->unfreeze_fs(sb);
1010 if (error) {
1011 printk(KERN_ERR
1012 "VFS:Filesystem thaw failed\n");
1013 sb->s_frozen = SB_FREEZE_TRANS;
1014 up_write(&sb->s_umount);
1015 return error;
1016 }
1017 }
1018
1019out:
1020 sb->s_frozen = SB_UNFROZEN;
1021 smp_wmb();
1022 wake_up(&sb->s_wait_unfrozen);
1023 deactivate_locked_super(sb);
1024
1025 return 0;
1026}
1027EXPORT_SYMBOL(thaw_super);
1028
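A hedged usage sketch for the pair: this is roughly what an FIFREEZE/FITHAW ioctl handler would do with the helpers added above (example_freeze_ioctl() is illustrative, not a real call site in this commit):

#include <linux/fs.h>

static int example_freeze_ioctl(struct super_block *sb, unsigned int cmd)
{
	switch (cmd) {
	case FIFREEZE:
		return freeze_super(sb);	/* -EBUSY if already frozen */
	case FITHAW:
		return thaw_super(sb);		/* -EINVAL if not frozen */
	default:
		return -ENOTTY;
	}
}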
990static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) 1029static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
991{ 1030{
992 int err; 1031 int err;
diff --git a/fs/sync.c b/fs/sync.c
index f557d71cb097..c9f83f480ec5 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -5,6 +5,7 @@
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/file.h> 6#include <linux/file.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/slab.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/sched.h> 10#include <linux/sched.h>
10#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -13,6 +14,7 @@
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/quotaops.h> 15#include <linux/quotaops.h>
15#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include <linux/backing-dev.h>
16#include "internal.h" 18#include "internal.h"
17 19
18#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ 20#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
@@ -31,7 +33,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
31 * This should be safe, as we require bdi backing to actually 33 * This should be safe, as we require bdi backing to actually
32 * write out data in the first place 34 * write out data in the first place
33 */ 35 */
34 if (!sb->s_bdi) 36 if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
35 return 0; 37 return 0;
36 38
37 if (sb->s_qcop && sb->s_qcop->quota_sync) 39 if (sb->s_qcop && sb->s_qcop->quota_sync)
@@ -40,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
40 if (wait) 42 if (wait)
41 sync_inodes_sb(sb); 43 sync_inodes_sb(sb);
42 else 44 else
43 writeback_inodes_sb(sb); 45 writeback_inodes_sb_locked(sb);
44 46
45 if (sb->s_op->sync_fs) 47 if (sb->s_op->sync_fs)
46 sb->s_op->sync_fs(sb, wait); 48 sb->s_op->sync_fs(sb, wait);
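Since set_anon_super() now assigns &noop_backing_dev_info (see the fs/super.c hunk above), a bare NULL test no longer identifies superblocks without real writeback, hence the widened guard. Spelled out as a predicate (sb_supports_writeback() is an illustrative name):

#include <linux/backing-dev.h>
#include <linux/fs.h>

static bool sb_supports_writeback(struct super_block *sb)
{
	/* the noop bdi marks anonymous superblocks with nothing to write back */
	return sb->s_bdi && sb->s_bdi != &noop_backing_dev_info;
}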
@@ -75,50 +77,18 @@ int sync_filesystem(struct super_block *sb)
75} 77}
76EXPORT_SYMBOL_GPL(sync_filesystem); 78EXPORT_SYMBOL_GPL(sync_filesystem);
77 79
80static void sync_one_sb(struct super_block *sb, void *arg)
81{
82 if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
83 __sync_filesystem(sb, *(int *)arg);
84}
78/* 85/*
79 * Sync all the data for all the filesystems (called by sys_sync() and 86 * Sync all the data for all the filesystems (called by sys_sync() and
80 * emergency sync) 87 * emergency sync)
81 *
82 * This operation is careful to avoid the livelock which could easily happen
83 * if two or more filesystems are being continuously dirtied. s_need_sync
84 * is used only here. We set it against all filesystems and then clear it as
85 * we sync them. So redirtied filesystems are skipped.
86 *
87 * But if process A is currently running sync_filesystems and then process B
88 * calls sync_filesystems as well, process B will set all the s_need_sync
89 * flags again, which will cause process A to resync everything. Fix that with
90 * a local mutex.
91 */ 88 */
92static void sync_filesystems(int wait) 89static void sync_filesystems(int wait)
93{ 90{
94 struct super_block *sb; 91 iterate_supers(sync_one_sb, &wait);
95 static DEFINE_MUTEX(mutex);
96
97 mutex_lock(&mutex); /* Could be down_interruptible */
98 spin_lock(&sb_lock);
99 list_for_each_entry(sb, &super_blocks, s_list)
100 sb->s_need_sync = 1;
101
102restart:
103 list_for_each_entry(sb, &super_blocks, s_list) {
104 if (!sb->s_need_sync)
105 continue;
106 sb->s_need_sync = 0;
107 sb->s_count++;
108 spin_unlock(&sb_lock);
109
110 down_read(&sb->s_umount);
111 if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
112 __sync_filesystem(sb, wait);
113 up_read(&sb->s_umount);
114
115 /* restart only when sb is no longer on the list */
116 spin_lock(&sb_lock);
117 if (__put_super_and_need_restart(sb))
118 goto restart;
119 }
120 spin_unlock(&sb_lock);
121 mutex_unlock(&mutex);
122} 92}
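iterate_supers() now hides the pin/unpin and s_umount handling that the deleted loop open-coded, leaving callers with only a callback. A sketch of the pattern, assuming the 2.6.35 signature iterate_supers(void (*f)(struct super_block *, void *), void *arg); the counting callback is hypothetical:

#include <linux/fs.h>

/* hypothetical callback: count superblocks mounted read-write */
static void count_writable_sb(struct super_block *sb, void *arg)
{
	if (!(sb->s_flags & MS_RDONLY))
		(*(int *)arg)++;
}

static int count_writable_supers(void)
{
	int n = 0;

	iterate_supers(count_writable_sb, &n);	/* callback runs with s_umount held */
	return n;
}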
123 93
124/* 94/*
@@ -160,12 +130,10 @@ void emergency_sync(void)
160 130
161/* 131/*
162 * Generic function to fsync a file. 132 * Generic function to fsync a file.
163 *
164 * filp may be NULL if called via the msync of a vma.
165 */ 133 */
166int file_fsync(struct file *filp, struct dentry *dentry, int datasync) 134int file_fsync(struct file *filp, int datasync)
167{ 135{
168 struct inode * inode = dentry->d_inode; 136 struct inode *inode = filp->f_mapping->host;
169 struct super_block * sb; 137 struct super_block * sb;
170 int ret, err; 138 int ret, err;
171 139
@@ -188,7 +156,6 @@ EXPORT_SYMBOL(file_fsync);
188/** 156/**
189 * vfs_fsync_range - helper to sync a range of data & metadata to disk 157 * vfs_fsync_range - helper to sync a range of data & metadata to disk
190 * @file: file to sync 158 * @file: file to sync
191 * @dentry: dentry of @file
192 * @start: offset in bytes of the beginning of data range to sync 159 * @start: offset in bytes of the beginning of data range to sync
193 * @end: offset in bytes of the end of data range (inclusive) 160 * @end: offset in bytes of the end of data range (inclusive)
194 * @datasync: perform only datasync 161 * @datasync: perform only datasync
@@ -196,32 +163,13 @@ EXPORT_SYMBOL(file_fsync);
196 * Write back data in range @start..@end and metadata for @file to disk. If 163 * Write back data in range @start..@end and metadata for @file to disk. If
197 * @datasync is set only metadata needed to access modified file data is 164 * @datasync is set only metadata needed to access modified file data is
198 * written. 165 * written.
199 *
200 * In case this function is called from nfsd @file may be %NULL and
201 * only @dentry is set. This can only happen when the filesystem
202 * implements the export_operations API.
203 */ 166 */
204int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, 167int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
205 loff_t end, int datasync)
206{ 168{
207 const struct file_operations *fop; 169 struct address_space *mapping = file->f_mapping;
208 struct address_space *mapping;
209 int err, ret; 170 int err, ret;
210 171
211 /* 172 if (!file->f_op || !file->f_op->fsync) {
212 * Get mapping and operations from the file in case we have
213 * as file, or get the default values for them in case we
214 * don't have a struct file available. Damn nfsd..
215 */
216 if (file) {
217 mapping = file->f_mapping;
218 fop = file->f_op;
219 } else {
220 mapping = dentry->d_inode->i_mapping;
221 fop = dentry->d_inode->i_fop;
222 }
223
224 if (!fop || !fop->fsync) {
225 ret = -EINVAL; 173 ret = -EINVAL;
226 goto out; 174 goto out;
227 } 175 }
@@ -233,7 +181,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
233 * livelocks in fsync_buffers_list(). 181 * livelocks in fsync_buffers_list().
234 */ 182 */
235 mutex_lock(&mapping->host->i_mutex); 183 mutex_lock(&mapping->host->i_mutex);
236 err = fop->fsync(file, dentry, datasync); 184 err = file->f_op->fsync(file, datasync);
237 if (!ret) 185 if (!ret)
238 ret = err; 186 ret = err;
239 mutex_unlock(&mapping->host->i_mutex); 187 mutex_unlock(&mapping->host->i_mutex);
@@ -246,19 +194,14 @@ EXPORT_SYMBOL(vfs_fsync_range);
246/** 194/**
247 * vfs_fsync - perform a fsync or fdatasync on a file 195 * vfs_fsync - perform a fsync or fdatasync on a file
248 * @file: file to sync 196 * @file: file to sync
249 * @dentry: dentry of @file
250 * @datasync: only perform a fdatasync operation 197 * @datasync: only perform a fdatasync operation
251 * 198 *
252 * Write back data and metadata for @file to disk. If @datasync is 199 * Write back data and metadata for @file to disk. If @datasync is
253 * set only metadata needed to access modified file data is written. 200 * set only metadata needed to access modified file data is written.
254 *
255 * In case this function is called from nfsd @file may be %NULL and
256 * only @dentry is set. This can only happen when the filesystem
257 * implements the export_operations API.
258 */ 201 */
259int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) 202int vfs_fsync(struct file *file, int datasync)
260{ 203{
261 return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); 204 return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
262} 205}
263EXPORT_SYMBOL(vfs_fsync); 206EXPORT_SYMBOL(vfs_fsync);
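The conversion for callers is mechanical: the dentry argument disappears because the inode is now derived from file->f_mapping->host. A hedged before/after sketch (flush_one() is illustrative):

#include <linux/fs.h>

static int flush_one(struct file *filp)
{
	/* old: return vfs_fsync(filp, filp->f_path.dentry, 0); */
	return vfs_fsync(filp, 0);
}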
264 207
@@ -269,7 +212,7 @@ static int do_fsync(unsigned int fd, int datasync)
269 212
270 file = fget(fd); 213 file = fget(fd);
271 if (file) { 214 if (file) {
272 ret = vfs_fsync(file, file->f_path.dentry, datasync); 215 ret = vfs_fsync(file, datasync);
273 fput(file); 216 fput(file);
274 } 217 }
275 return ret; 218 return ret;
@@ -297,8 +240,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count)
297{ 240{
298 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) 241 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
299 return 0; 242 return 0;
300 return vfs_fsync_range(file, file->f_path.dentry, pos, 243 return vfs_fsync_range(file, pos, pos + count - 1,
301 pos + count - 1,
302 (file->f_flags & __O_SYNC) ? 0 : 1); 244 (file->f_flags & __O_SYNC) ? 0 : 1);
303} 245}
304EXPORT_SYMBOL(generic_write_sync); 246EXPORT_SYMBOL(generic_write_sync);
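generic_write_sync() is the usual tail of a buffered write path: a no-op unless O_DSYNC or a sync inode applies, otherwise a ranged fsync over exactly the just-written bytes. A sketch of the caller-side pattern, modeled on generic_file_aio_write() (example_finish_write() is illustrative):

#include <linux/fs.h>

static ssize_t example_finish_write(struct file *file, loff_t pos,
				    ssize_t written)
{
	if (written > 0) {
		int err = generic_write_sync(file, pos, written);
		if (err < 0)
			written = err;	/* surface the sync failure */
	}
	return written;
}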
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index e9d293593e52..4e321f7353fa 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -46,9 +46,9 @@ struct bin_buffer {
46}; 46};
47 47
48static int 48static int
49fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) 49fill_read(struct file *file, char *buffer, loff_t off, size_t count)
50{ 50{
51 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 51 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
52 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; 52 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
53 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 53 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
54 int rc; 54 int rc;
@@ -59,7 +59,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
59 59
60 rc = -EIO; 60 rc = -EIO;
61 if (attr->read) 61 if (attr->read)
62 rc = attr->read(kobj, attr, buffer, off, count); 62 rc = attr->read(file, kobj, attr, buffer, off, count);
63 63
64 sysfs_put_active(attr_sd); 64 sysfs_put_active(attr_sd);
65 65
@@ -70,8 +70,7 @@ static ssize_t
70read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) 70read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
71{ 71{
72 struct bin_buffer *bb = file->private_data; 72 struct bin_buffer *bb = file->private_data;
73 struct dentry *dentry = file->f_path.dentry; 73 int size = file->f_path.dentry->d_inode->i_size;
74 int size = dentry->d_inode->i_size;
75 loff_t offs = *off; 74 loff_t offs = *off;
76 int count = min_t(size_t, bytes, PAGE_SIZE); 75 int count = min_t(size_t, bytes, PAGE_SIZE);
77 char *temp; 76 char *temp;
@@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
92 91
93 mutex_lock(&bb->mutex); 92 mutex_lock(&bb->mutex);
94 93
95 count = fill_read(dentry, bb->buffer, offs, count); 94 count = fill_read(file, bb->buffer, offs, count);
96 if (count < 0) { 95 if (count < 0) {
97 mutex_unlock(&bb->mutex); 96 mutex_unlock(&bb->mutex);
98 goto out_free; 97 goto out_free;
@@ -117,9 +116,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
117} 116}
118 117
119static int 118static int
120flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) 119flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
121{ 120{
122 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 121 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
123 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; 122 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
124 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 123 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
125 int rc; 124 int rc;
@@ -130,7 +129,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
130 129
131 rc = -EIO; 130 rc = -EIO;
132 if (attr->write) 131 if (attr->write)
133 rc = attr->write(kobj, attr, buffer, offset, count); 132 rc = attr->write(file, kobj, attr, buffer, offset, count);
134 133
135 sysfs_put_active(attr_sd); 134 sysfs_put_active(attr_sd);
136 135
@@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
141 size_t bytes, loff_t *off) 140 size_t bytes, loff_t *off)
142{ 141{
143 struct bin_buffer *bb = file->private_data; 142 struct bin_buffer *bb = file->private_data;
144 struct dentry *dentry = file->f_path.dentry; 143 int size = file->f_path.dentry->d_inode->i_size;
145 int size = dentry->d_inode->i_size;
146 loff_t offs = *off; 144 loff_t offs = *off;
147 int count = min_t(size_t, bytes, PAGE_SIZE); 145 int count = min_t(size_t, bytes, PAGE_SIZE);
148 char *temp; 146 char *temp;
@@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
165 163
166 memcpy(bb->buffer, temp, count); 164 memcpy(bb->buffer, temp, count);
167 165
168 count = flush_write(dentry, bb->buffer, offs, count); 166 count = flush_write(file, bb->buffer, offs, count);
169 mutex_unlock(&bb->mutex); 167 mutex_unlock(&bb->mutex);
170 168
171 if (count > 0) 169 if (count > 0)
@@ -363,7 +361,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
363 if (!attr->mmap) 361 if (!attr->mmap)
364 goto out_put; 362 goto out_put;
365 363
366 rc = attr->mmap(kobj, attr, vma); 364 rc = attr->mmap(file, kobj, attr, vma);
367 if (rc) 365 if (rc)
368 goto out_put; 366 goto out_put;
369 367
@@ -501,7 +499,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
501void sysfs_remove_bin_file(struct kobject *kobj, 499void sysfs_remove_bin_file(struct kobject *kobj,
502 const struct bin_attribute *attr) 500 const struct bin_attribute *attr)
503{ 501{
504 sysfs_hash_and_remove(kobj->sd, attr->attr.name); 502 sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
505} 503}
506 504
507EXPORT_SYMBOL_GPL(sysfs_create_bin_file); 505EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
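With struct file threaded through, bin_attribute callbacks gain per-open context. A hedged sketch of an attribute written against the new read signature shown above (the "example" attribute is hypothetical):

#include <linux/sysfs.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/mm.h>

static ssize_t example_read(struct file *file, struct kobject *kobj,
			    struct bin_attribute *attr,
			    char *buf, loff_t off, size_t count)
{
	/* struct file is now available, e.g. file->private_data */
	memset(buf, 0, count);
	return count;
}

static struct bin_attribute example_attr = {
	.attr	= { .name = "example", .mode = S_IRUGO },
	.size	= PAGE_SIZE,
	.read	= example_read,
};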
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 590717861c7a..7e54bac8c4b0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -380,7 +380,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
380{ 380{
381 struct sysfs_inode_attrs *ps_iattr; 381 struct sysfs_inode_attrs *ps_iattr;
382 382
383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) 383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
384 return -EEXIST; 384 return -EEXIST;
385 385
386 sd->s_parent = sysfs_get(acxt->parent_sd); 386 sd->s_parent = sysfs_get(acxt->parent_sd);
@@ -533,13 +533,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
533 * Pointer to sysfs_dirent if found, NULL if not. 533 * Pointer to sysfs_dirent if found, NULL if not.
534 */ 534 */
535struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 535struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
536 const void *ns,
536 const unsigned char *name) 537 const unsigned char *name)
537{ 538{
538 struct sysfs_dirent *sd; 539 struct sysfs_dirent *sd;
539 540
540 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) 541 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
542 if (ns && sd->s_ns && (sd->s_ns != ns))
543 continue;
541 if (!strcmp(sd->s_name, name)) 544 if (!strcmp(sd->s_name, name))
542 return sd; 545 return sd;
546 }
543 return NULL; 547 return NULL;
544} 548}
545 549
@@ -558,12 +562,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
558 * Pointer to sysfs_dirent if found, NULL if not. 562 * Pointer to sysfs_dirent if found, NULL if not.
559 */ 563 */
560struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 564struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
565 const void *ns,
561 const unsigned char *name) 566 const unsigned char *name)
562{ 567{
563 struct sysfs_dirent *sd; 568 struct sysfs_dirent *sd;
564 569
565 mutex_lock(&sysfs_mutex); 570 mutex_lock(&sysfs_mutex);
566 sd = sysfs_find_dirent(parent_sd, name); 571 sd = sysfs_find_dirent(parent_sd, ns, name);
567 sysfs_get(sd); 572 sysfs_get(sd);
568 mutex_unlock(&sysfs_mutex); 573 mutex_unlock(&sysfs_mutex);
569 574
@@ -572,7 +577,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
572EXPORT_SYMBOL_GPL(sysfs_get_dirent); 577EXPORT_SYMBOL_GPL(sysfs_get_dirent);
573 578
574static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, 579static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
575 const char *name, struct sysfs_dirent **p_sd) 580 enum kobj_ns_type type, const void *ns, const char *name,
581 struct sysfs_dirent **p_sd)
576{ 582{
577 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; 583 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
578 struct sysfs_addrm_cxt acxt; 584 struct sysfs_addrm_cxt acxt;
@@ -583,6 +589,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
583 sd = sysfs_new_dirent(name, mode, SYSFS_DIR); 589 sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
584 if (!sd) 590 if (!sd)
585 return -ENOMEM; 591 return -ENOMEM;
592
593 sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
594 sd->s_ns = ns;
586 sd->s_dir.kobj = kobj; 595 sd->s_dir.kobj = kobj;
587 596
588 /* link in */ 597 /* link in */
@@ -601,7 +610,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
601int sysfs_create_subdir(struct kobject *kobj, const char *name, 610int sysfs_create_subdir(struct kobject *kobj, const char *name,
602 struct sysfs_dirent **p_sd) 611 struct sysfs_dirent **p_sd)
603{ 612{
604 return create_dir(kobj, kobj->sd, name, p_sd); 613 return create_dir(kobj, kobj->sd,
614 KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
615}
616
617/**
618 * sysfs_read_ns_type: return associated ns_type
619 * @kobj: the kobject being queried
620 *
621 * Each kobject can be tagged with exactly one namespace type
622 * (i.e. network or user). Return the ns_type associated with
 623 * this object, if any.
624 */
625static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
626{
627 const struct kobj_ns_type_operations *ops;
628 enum kobj_ns_type type;
629
630 ops = kobj_child_ns_ops(kobj);
631 if (!ops)
632 return KOBJ_NS_TYPE_NONE;
633
634 type = ops->type;
635 BUG_ON(type <= KOBJ_NS_TYPE_NONE);
636 BUG_ON(type >= KOBJ_NS_TYPES);
637 BUG_ON(!kobj_ns_type_registered(type));
638
639 return type;
605} 640}
606 641
607/** 642/**
@@ -610,7 +645,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
610 */ 645 */
611int sysfs_create_dir(struct kobject * kobj) 646int sysfs_create_dir(struct kobject * kobj)
612{ 647{
648 enum kobj_ns_type type;
613 struct sysfs_dirent *parent_sd, *sd; 649 struct sysfs_dirent *parent_sd, *sd;
650 const void *ns = NULL;
614 int error = 0; 651 int error = 0;
615 652
616 BUG_ON(!kobj); 653 BUG_ON(!kobj);
@@ -620,7 +657,11 @@ int sysfs_create_dir(struct kobject * kobj)
620 else 657 else
621 parent_sd = &sysfs_root; 658 parent_sd = &sysfs_root;
622 659
623 error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); 660 if (sysfs_ns_type(parent_sd))
661 ns = kobj->ktype->namespace(kobj);
662 type = sysfs_read_ns_type(kobj);
663
664 error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
624 if (!error) 665 if (!error)
625 kobj->sd = sd; 666 kobj->sd = sd;
626 return error; 667 return error;
@@ -630,13 +671,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
630 struct nameidata *nd) 671 struct nameidata *nd)
631{ 672{
632 struct dentry *ret = NULL; 673 struct dentry *ret = NULL;
633 struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata; 674 struct dentry *parent = dentry->d_parent;
675 struct sysfs_dirent *parent_sd = parent->d_fsdata;
634 struct sysfs_dirent *sd; 676 struct sysfs_dirent *sd;
635 struct inode *inode; 677 struct inode *inode;
678 enum kobj_ns_type type;
679 const void *ns;
636 680
637 mutex_lock(&sysfs_mutex); 681 mutex_lock(&sysfs_mutex);
638 682
639 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name); 683 type = sysfs_ns_type(parent_sd);
684 ns = sysfs_info(dir->i_sb)->ns[type];
685
686 sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
640 687
641 /* no such entry */ 688 /* no such entry */
642 if (!sd) { 689 if (!sd) {
@@ -735,7 +782,8 @@ void sysfs_remove_dir(struct kobject * kobj)
735} 782}
736 783
737int sysfs_rename(struct sysfs_dirent *sd, 784int sysfs_rename(struct sysfs_dirent *sd,
738 struct sysfs_dirent *new_parent_sd, const char *new_name) 785 struct sysfs_dirent *new_parent_sd, const void *new_ns,
786 const char *new_name)
739{ 787{
740 const char *dup_name = NULL; 788 const char *dup_name = NULL;
741 int error; 789 int error;
@@ -743,12 +791,12 @@ int sysfs_rename(struct sysfs_dirent *sd,
743 mutex_lock(&sysfs_mutex); 791 mutex_lock(&sysfs_mutex);
744 792
745 error = 0; 793 error = 0;
746 if ((sd->s_parent == new_parent_sd) && 794 if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
747 (strcmp(sd->s_name, new_name) == 0)) 795 (strcmp(sd->s_name, new_name) == 0))
748 goto out; /* nothing to rename */ 796 goto out; /* nothing to rename */
749 797
750 error = -EEXIST; 798 error = -EEXIST;
751 if (sysfs_find_dirent(new_parent_sd, new_name)) 799 if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
752 goto out; 800 goto out;
753 801
754 /* rename sysfs_dirent */ 802 /* rename sysfs_dirent */
@@ -770,6 +818,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
770 sd->s_parent = new_parent_sd; 818 sd->s_parent = new_parent_sd;
771 sysfs_link_sibling(sd); 819 sysfs_link_sibling(sd);
772 } 820 }
821 sd->s_ns = new_ns;
773 822
774 error = 0; 823 error = 0;
775 out: 824 out:
@@ -780,19 +829,28 @@ int sysfs_rename(struct sysfs_dirent *sd,
780 829
781int sysfs_rename_dir(struct kobject *kobj, const char *new_name) 830int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
782{ 831{
783 return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name); 832 struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
833 const void *new_ns = NULL;
834
835 if (sysfs_ns_type(parent_sd))
836 new_ns = kobj->ktype->namespace(kobj);
837
838 return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
784} 839}
785 840
786int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) 841int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
787{ 842{
788 struct sysfs_dirent *sd = kobj->sd; 843 struct sysfs_dirent *sd = kobj->sd;
789 struct sysfs_dirent *new_parent_sd; 844 struct sysfs_dirent *new_parent_sd;
845 const void *new_ns = NULL;
790 846
791 BUG_ON(!sd->s_parent); 847 BUG_ON(!sd->s_parent);
848 if (sysfs_ns_type(sd->s_parent))
849 new_ns = kobj->ktype->namespace(kobj);
792 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 850 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
793 new_parent_kobj->sd : &sysfs_root; 851 new_parent_kobj->sd : &sysfs_root;
794 852
795 return sysfs_rename(sd, new_parent_sd, sd->s_name); 853 return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
796} 854}
797 855
798/* Relationship between s_mode and the DT_xxx types */ 856/* Relationship between s_mode and the DT_xxx types */
@@ -807,32 +865,35 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)
807 return 0; 865 return 0;
808} 866}
809 867
810static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd, 868static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
811 ino_t ino, struct sysfs_dirent *pos) 869 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
812{ 870{
813 if (pos) { 871 if (pos) {
814 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && 872 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
815 pos->s_parent == parent_sd && 873 pos->s_parent == parent_sd &&
816 ino == pos->s_ino; 874 ino == pos->s_ino;
817 sysfs_put(pos); 875 sysfs_put(pos);
818 if (valid) 876 if (!valid)
819 return pos; 877 pos = NULL;
820 } 878 }
821 pos = NULL; 879 if (!pos && (ino > 1) && (ino < INT_MAX)) {
822 if ((ino > 1) && (ino < INT_MAX)) {
823 pos = parent_sd->s_dir.children; 880 pos = parent_sd->s_dir.children;
824 while (pos && (ino > pos->s_ino)) 881 while (pos && (ino > pos->s_ino))
825 pos = pos->s_sibling; 882 pos = pos->s_sibling;
826 } 883 }
884 while (pos && pos->s_ns && pos->s_ns != ns)
885 pos = pos->s_sibling;
827 return pos; 886 return pos;
828} 887}
829 888
830static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd, 889static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
831 ino_t ino, struct sysfs_dirent *pos) 890 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
832{ 891{
833 pos = sysfs_dir_pos(parent_sd, ino, pos); 892 pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
834 if (pos) 893 if (pos)
835 pos = pos->s_sibling; 894 pos = pos->s_sibling;
895 while (pos && pos->s_ns && pos->s_ns != ns)
896 pos = pos->s_sibling;
836 return pos; 897 return pos;
837} 898}
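Both position helpers end with the same skip loop, so readdir never surfaces entries tagged for a foreign namespace. Isolated, the loop is (skip_foreign_ns() is an illustrative name):

static struct sysfs_dirent *skip_foreign_ns(struct sysfs_dirent *pos,
					    const void *ns)
{
	while (pos && pos->s_ns && pos->s_ns != ns)
		pos = pos->s_sibling;
	return pos;
}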
838 899
@@ -841,8 +902,13 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
841 struct dentry *dentry = filp->f_path.dentry; 902 struct dentry *dentry = filp->f_path.dentry;
842 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 903 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
843 struct sysfs_dirent *pos = filp->private_data; 904 struct sysfs_dirent *pos = filp->private_data;
905 enum kobj_ns_type type;
906 const void *ns;
844 ino_t ino; 907 ino_t ino;
845 908
909 type = sysfs_ns_type(parent_sd);
910 ns = sysfs_info(dentry->d_sb)->ns[type];
911
846 if (filp->f_pos == 0) { 912 if (filp->f_pos == 0) {
847 ino = parent_sd->s_ino; 913 ino = parent_sd->s_ino;
848 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) 914 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
@@ -857,9 +923,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
857 filp->f_pos++; 923 filp->f_pos++;
858 } 924 }
859 mutex_lock(&sysfs_mutex); 925 mutex_lock(&sysfs_mutex);
860 for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos); 926 for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
861 pos; 927 pos;
862 pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) { 928 pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
863 const char * name; 929 const char * name;
864 unsigned int type; 930 unsigned int type;
865 int len, ret; 931 int len, ret;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e222b2582746..1beaa739d0a6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
478 mutex_lock(&sysfs_mutex); 478 mutex_lock(&sysfs_mutex);
479 479
480 if (sd && dir) 480 if (sd && dir)
481 sd = sysfs_find_dirent(sd, dir); 481 /* Only directories are tagged, so no need to pass
482 * a tag explicitly.
483 */
484 sd = sysfs_find_dirent(sd, NULL, dir);
482 if (sd && attr) 485 if (sd && attr)
483 sd = sysfs_find_dirent(sd, attr); 486 sd = sysfs_find_dirent(sd, NULL, attr);
484 if (sd) 487 if (sd)
485 sysfs_notify_dirent(sd); 488 sysfs_notify_dirent(sd);
486 489
@@ -569,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
569 int error; 572 int error;
570 573
571 if (group) 574 if (group)
572 dir_sd = sysfs_get_dirent(kobj->sd, group); 575 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
573 else 576 else
574 dir_sd = sysfs_get(kobj->sd); 577 dir_sd = sysfs_get(kobj->sd);
575 578
@@ -599,7 +602,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
599 mutex_lock(&sysfs_mutex); 602 mutex_lock(&sysfs_mutex);
600 603
601 rc = -ENOENT; 604 rc = -ENOENT;
602 sd = sysfs_find_dirent(kobj->sd, attr->name); 605 sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
603 if (!sd) 606 if (!sd)
604 goto out; 607 goto out;
605 608
@@ -624,7 +627,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
624 627
625void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) 628void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
626{ 629{
627 sysfs_hash_and_remove(kobj->sd, attr->name); 630 sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
628} 631}
629 632
630void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) 633void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
@@ -646,11 +649,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
646 struct sysfs_dirent *dir_sd; 649 struct sysfs_dirent *dir_sd;
647 650
648 if (group) 651 if (group)
649 dir_sd = sysfs_get_dirent(kobj->sd, group); 652 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
650 else 653 else
651 dir_sd = sysfs_get(kobj->sd); 654 dir_sd = sysfs_get(kobj->sd);
652 if (dir_sd) { 655 if (dir_sd) {
653 sysfs_hash_and_remove(dir_sd, attr->name); 656 sysfs_hash_and_remove(dir_sd, NULL, attr->name);
654 sysfs_put(dir_sd); 657 sysfs_put(dir_sd);
655 } 658 }
656} 659}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe611949a7f7..23c1e598792a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
23 int i; 23 int i;
24 24
25 for (i = 0, attr = grp->attrs; *attr; i++, attr++) 25 for (i = 0, attr = grp->attrs; *attr; i++, attr++)
26 sysfs_hash_and_remove(dir_sd, (*attr)->name); 26 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
27} 27}
28 28
29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
39 * visibility. Do this by first removing then 39 * visibility. Do this by first removing then
40 * re-adding (if required) the file */ 40 * re-adding (if required) the file */
41 if (update) 41 if (update)
42 sysfs_hash_and_remove(dir_sd, (*attr)->name); 42 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
43 if (grp->is_visible) { 43 if (grp->is_visible) {
44 mode = grp->is_visible(kobj, *attr, i); 44 mode = grp->is_visible(kobj, *attr, i);
45 if (!mode) 45 if (!mode)
@@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj,
132 struct sysfs_dirent *sd; 132 struct sysfs_dirent *sd;
133 133
134 if (grp->name) { 134 if (grp->name) {
135 sd = sysfs_get_dirent(dir_sd, grp->name); 135 sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
136 if (!sd) { 136 if (!sd) {
137 WARN(!sd, KERN_WARNING "sysfs group %p not found for " 137 WARN(!sd, KERN_WARNING "sysfs group %p not found for "
138 "kobject '%s'\n", grp, kobject_name(kobj)); 138 "kobject '%s'\n", grp, kobject_name(kobj));
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 082daaecac1b..bde1a4c3679a 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/xattr.h> 22#include <linux/xattr.h>
22#include <linux/security.h> 23#include <linux/security.h>
23#include "sysfs.h" 24#include "sysfs.h"
@@ -116,13 +117,11 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
116 if (error) 117 if (error)
117 goto out; 118 goto out;
118 119
119 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 /* this ignores size changes */
120 121 generic_setattr(inode, iattr);
121 error = inode_setattr(inode, iattr);
122 if (error)
123 goto out;
124 122
125 error = sysfs_sd_setattr(sd, iattr); 123 error = sysfs_sd_setattr(sd, iattr);
124
126out: 125out:
127 mutex_unlock(&sysfs_mutex); 126 mutex_unlock(&sysfs_mutex);
128 return error; 127 return error;
@@ -323,7 +322,7 @@ void sysfs_delete_inode(struct inode *inode)
323 sysfs_put(sd); 322 sysfs_put(sd);
324} 323}
325 324
326int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 325int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
327{ 326{
328 struct sysfs_addrm_cxt acxt; 327 struct sysfs_addrm_cxt acxt;
329 struct sysfs_dirent *sd; 328 struct sysfs_dirent *sd;
@@ -333,7 +332,9 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
333 332
334 sysfs_addrm_start(&acxt, dir_sd); 333 sysfs_addrm_start(&acxt, dir_sd);
335 334
336 sd = sysfs_find_dirent(dir_sd, name); 335 sd = sysfs_find_dirent(dir_sd, ns, name);
336 if (sd && (sd->s_ns != ns))
337 sd = NULL;
337 if (sd) 338 if (sd)
338 sysfs_remove_one(&acxt, sd); 339 sysfs_remove_one(&acxt, sd);
339 340
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 0cb10884a2fc..281c0c9bc39f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -18,6 +18,7 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/magic.h> 20#include <linux/magic.h>
21#include <linux/slab.h>
21 22
22#include "sysfs.h" 23#include "sysfs.h"
23 24
@@ -34,7 +35,7 @@ static const struct super_operations sysfs_ops = {
34struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
35 .s_name = "", 36 .s_name = "",
36 .s_count = ATOMIC_INIT(1), 37 .s_count = ATOMIC_INIT(1),
37 .s_flags = SYSFS_DIR, 38 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
38 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
39 .s_ino = 1, 40 .s_ino = 1,
40}; 41};
@@ -71,18 +72,107 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
71 return 0; 72 return 0;
72} 73}
73 74
75static int sysfs_test_super(struct super_block *sb, void *data)
76{
77 struct sysfs_super_info *sb_info = sysfs_info(sb);
78 struct sysfs_super_info *info = data;
79 enum kobj_ns_type type;
80 int found = 1;
81
82 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
83 if (sb_info->ns[type] != info->ns[type])
84 found = 0;
85 }
86 return found;
87}
88
89static int sysfs_set_super(struct super_block *sb, void *data)
90{
91 int error;
92 error = set_anon_super(sb, data);
93 if (!error)
94 sb->s_fs_info = data;
95 return error;
96}
97
74static int sysfs_get_sb(struct file_system_type *fs_type, 98static int sysfs_get_sb(struct file_system_type *fs_type,
75 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
76{ 100{
77 return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type;
103 struct super_block *sb;
104 int error;
105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info)
109 goto out;
110
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type);
113
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info);
117 if (IS_ERR(sb)) {
118 error = PTR_ERR(sb);
119 goto out;
120 }
121 if (!sb->s_root) {
122 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) {
125 deactivate_locked_super(sb);
126 goto out;
127 }
128 sb->s_flags |= MS_ACTIVE;
129 }
130
131 simple_set_mnt(mnt, sb);
132 error = 0;
133out:
134 return error;
135}
136
137static void sysfs_kill_sb(struct super_block *sb)
138{
139 struct sysfs_super_info *info = sysfs_info(sb);
140
141 /* Remove the superblock from fs_supers/s_instances
142 * so we can't find it, before freeing sysfs_super_info.
143 */
144 kill_anon_super(sb);
145 kfree(info);
78} 146}
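The switch from get_sb_single() to sget() is what allows one superblock per namespace-tag set: sget() reuses an existing sb only when the test callback matches, and initializes a fresh one through the set callback. A hedged sketch of that contract (get_or_create_sb() is illustrative; error handling trimmed):

#include <linux/fs.h>
#include <linux/err.h>

static struct super_block *get_or_create_sb(struct file_system_type *fs,
					    void *tag_set)
{
	struct super_block *sb;

	/* test(sb, data) != 0 -> reuse sb; set(sb, data) fills a new one */
	sb = sget(fs, sysfs_test_super, sysfs_set_super, tag_set);
	if (IS_ERR(sb))
		return sb;
	/* sb->s_root == NULL means sget() handed back a fresh, unfilled sb */
	return sb;
}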
79 147
80static struct file_system_type sysfs_fs_type = { 148static struct file_system_type sysfs_fs_type = {
81 .name = "sysfs", 149 .name = "sysfs",
82 .get_sb = sysfs_get_sb, 150 .get_sb = sysfs_get_sb,
83 .kill_sb = kill_anon_super, 151 .kill_sb = sysfs_kill_sb,
84}; 152};
85 153
154void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
155{
156 struct super_block *sb;
157
158 mutex_lock(&sysfs_mutex);
159 spin_lock(&sb_lock);
160 list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
161 struct sysfs_super_info *info = sysfs_info(sb);
162 /*
163 * If we see a superblock on the fs_supers/s_instances
164 * list the unmount has not completed and sb->s_fs_info
165 * points to a valid struct sysfs_super_info.
166 */
167 /* Ignore superblocks with the wrong ns */
168 if (info->ns[type] != ns)
169 continue;
170 info->ns[type] = NULL;
171 }
172 spin_unlock(&sb_lock);
173 mutex_unlock(&sysfs_mutex);
174}
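A teardown path for a tagged namespace would invalidate its sysfs tag through this hook; for the network case that is roughly the following (example_net_exit() is illustrative, and KOBJ_NS_TYPE_NET is the 2.6.35 enum value for network namespaces):

#include <linux/kobject.h>

static void example_net_exit(void *net_ns)
{
	/* NULL out the stale tag in every mounted sysfs superblock */
	sysfs_exit_ns(KOBJ_NS_TYPE_NET, net_ns);
}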
175
86int __init sysfs_init(void) 176int __init sysfs_init(void)
87{ 177{
88 int err = -ENOMEM; 178 int err = -ENOMEM;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1b9a3a1e8a17..f71246bebfe4 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/gfp.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/kobject.h> 17#include <linux/kobject.h>
@@ -57,6 +58,8 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
57 if (!sd) 58 if (!sd)
58 goto out_put; 59 goto out_put;
59 60
61 if (sysfs_ns_type(parent_sd))
62 sd->s_ns = target->ktype->namespace(target);
60 sd->s_symlink.target_sd = target_sd; 63 sd->s_symlink.target_sd = target_sd;
61 target_sd = NULL; /* reference is now owned by the symlink */ 64 target_sd = NULL; /* reference is now owned by the symlink */
62 65
@@ -106,6 +109,26 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
106} 109}
107 110
108/** 111/**
112 * sysfs_delete_link - remove symlink in object's directory.
113 * @kobj: object we're acting for.
114 * @targ: object we're pointing to.
115 * @name: name of the symlink to remove.
116 *
 117 * Unlike sysfs_remove_link, sysfs_delete_link has enough information
118 * to successfully delete symlinks in tagged directories.
119 */
120void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
121 const char *name)
122{
123 const void *ns = NULL;
124 spin_lock(&sysfs_assoc_lock);
125 if (targ->sd)
126 ns = targ->sd->s_ns;
127 spin_unlock(&sysfs_assoc_lock);
128 sysfs_hash_and_remove(kobj->sd, ns, name);
129}
130
131/**
109 * sysfs_remove_link - remove symlink in object's directory. 132 * sysfs_remove_link - remove symlink in object's directory.
110 * @kobj: object we're acting for. 133 * @kobj: object we're acting for.
111 * @name: name of the symlink to remove. 134 * @name: name of the symlink to remove.
@@ -120,7 +143,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
120 else 143 else
121 parent_sd = kobj->sd; 144 parent_sd = kobj->sd;
122 145
123 sysfs_hash_and_remove(parent_sd, name); 146 sysfs_hash_and_remove(parent_sd, NULL, name);
124} 147}
125 148
126/** 149/**
@@ -136,6 +159,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
136 const char *old, const char *new) 159 const char *old, const char *new)
137{ 160{
138 struct sysfs_dirent *parent_sd, *sd = NULL; 161 struct sysfs_dirent *parent_sd, *sd = NULL;
162 const void *old_ns = NULL, *new_ns = NULL;
139 int result; 163 int result;
140 164
141 if (!kobj) 165 if (!kobj)
@@ -143,8 +167,11 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
143 else 167 else
144 parent_sd = kobj->sd; 168 parent_sd = kobj->sd;
145 169
170 if (targ->sd)
171 old_ns = targ->sd->s_ns;
172
146 result = -ENOENT; 173 result = -ENOENT;
147 sd = sysfs_get_dirent(parent_sd, old); 174 sd = sysfs_get_dirent(parent_sd, old_ns, old);
148 if (!sd) 175 if (!sd)
149 goto out; 176 goto out;
150 177
@@ -154,7 +181,10 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
154 if (sd->s_symlink.target_sd->s_dir.kobj != targ) 181 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
155 goto out; 182 goto out;
156 183
157 result = sysfs_rename(sd, parent_sd, new); 184 if (sysfs_ns_type(parent_sd))
185 new_ns = targ->ktype->namespace(targ);
186
187 result = sysfs_rename(sd, parent_sd, new_ns, new);
158 188
159out: 189out:
160 sysfs_put(sd); 190 sysfs_put(sd);
@@ -260,3 +290,4 @@ const struct inode_operations sysfs_symlink_inode_operations = {
260 290
261EXPORT_SYMBOL_GPL(sysfs_create_link); 291EXPORT_SYMBOL_GPL(sysfs_create_link);
262EXPORT_SYMBOL_GPL(sysfs_remove_link); 292EXPORT_SYMBOL_GPL(sysfs_remove_link);
293EXPORT_SYMBOL_GPL(sysfs_rename_link);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 30f5a44fb5d3..6a13105b5594 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -58,6 +58,7 @@ struct sysfs_dirent {
58 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
59 const char *s_name; 59 const char *s_name;
60 60
61 const void *s_ns; /* namespace tag */
61 union { 62 union {
62 struct sysfs_elem_dir s_dir; 63 struct sysfs_elem_dir s_dir;
63 struct sysfs_elem_symlink s_symlink; 64 struct sysfs_elem_symlink s_symlink;
@@ -81,14 +82,27 @@ struct sysfs_dirent {
81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 82#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
82#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) 83#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
83 84
84#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 85/* identify any namespace tag on sysfs_dirents */
85#define SYSFS_FLAG_REMOVED 0x0200 86#define SYSFS_NS_TYPE_MASK 0xff00
87#define SYSFS_NS_TYPE_SHIFT 8
88
89#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
90#define SYSFS_FLAG_REMOVED 0x020000
86 91
87static inline unsigned int sysfs_type(struct sysfs_dirent *sd) 92static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
88{ 93{
89 return sd->s_flags & SYSFS_TYPE_MASK; 94 return sd->s_flags & SYSFS_TYPE_MASK;
90} 95}
91 96
97/*
98 * Return any namespace tags on this dirent.
99 * enum kobj_ns_type is defined in linux/kobject.h
100 */
101static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
102{
103 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
104}
105
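The namespace type rides in bits 8-15 of s_flags, next to the dirent type bits. Packing and unpacking with the masks defined above looks like this (helper names are illustrative):

static unsigned int pack_ns_type(unsigned int flags, enum kobj_ns_type type)
{
	return (flags & ~SYSFS_NS_TYPE_MASK) | (type << SYSFS_NS_TYPE_SHIFT);
}

static enum kobj_ns_type unpack_ns_type(unsigned int flags)
{
	return (flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
}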
92#ifdef CONFIG_DEBUG_LOCK_ALLOC 106#ifdef CONFIG_DEBUG_LOCK_ALLOC
93#define sysfs_dirent_init_lockdep(sd) \ 107#define sysfs_dirent_init_lockdep(sd) \
94do { \ 108do { \
@@ -114,6 +128,16 @@ struct sysfs_addrm_cxt {
114/* 128/*
115 * mount.c 129 * mount.c
116 */ 130 */
131
132/*
133 * Each sb is associated with a set of namespace tags (i.e.
134 * the network namespace of the task which mounted this sysfs
135 * instance).
136 */
137struct sysfs_super_info {
138 const void *ns[KOBJ_NS_TYPES];
139};
140#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
117extern struct sysfs_dirent sysfs_root; 141extern struct sysfs_dirent sysfs_root;
118extern struct kmem_cache *sysfs_dir_cachep; 142extern struct kmem_cache *sysfs_dir_cachep;
119 143
@@ -137,8 +161,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
137void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); 161void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
138 162
139struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 163struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
164 const void *ns,
140 const unsigned char *name); 165 const unsigned char *name);
141struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 166struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
167 const void *ns,
142 const unsigned char *name); 168 const unsigned char *name);
143struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); 169struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
144 170
@@ -149,7 +175,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
149void sysfs_remove_subdir(struct sysfs_dirent *sd); 175void sysfs_remove_subdir(struct sysfs_dirent *sd);
150 176
151int sysfs_rename(struct sysfs_dirent *sd, 177int sysfs_rename(struct sysfs_dirent *sd,
152 struct sysfs_dirent *new_parent_sd, const char *new_name); 178 struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
153 179
154static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) 180static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
155{ 181{
@@ -179,7 +205,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
179int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
180int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
181 size_t size, int flags); 207 size_t size, int flags);
182int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 208int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
183int sysfs_inode_init(void); 209int sysfs_inode_init(void);
184 210
185/* 211/*
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 4e50286a4cc3..79941e4964a4 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = {
24 .llseek = generic_file_llseek, 24 .llseek = generic_file_llseek,
25 .read = generic_read_dir, 25 .read = generic_read_dir,
26 .readdir = sysv_readdir, 26 .readdir = sysv_readdir,
27 .fsync = simple_fsync, 27 .fsync = generic_file_fsync,
28}; 28};
29 29
30static inline void dir_put_page(struct page *page) 30static inline void dir_put_page(struct page *page)
@@ -164,8 +164,8 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
164 name, de->name)) 164 name, de->name))
165 goto found; 165 goto found;
166 } 166 }
167 dir_put_page(page);
167 } 168 }
168 dir_put_page(page);
169 169
170 if (++n >= npages) 170 if (++n >= npages)
171 n = 0; 171 n = 0;
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c01f4a7..750cc22349bd 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
26 .write = do_sync_write, 26 .write = do_sync_write,
27 .aio_write = generic_file_aio_write, 27 .aio_write = generic_file_aio_write,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .splice_read = generic_file_splice_read, 30 .splice_read = generic_file_splice_read,
31}; 31};
32 32
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 241e9765cfad..bbd69bdb0fa8 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -159,15 +159,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); 159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); 160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
161 dirty_sb(sb); 161 dirty_sb(sb);
162 162 inode_init_owner(inode, dir, mode);
163 if (dir->i_mode & S_ISGID) {
164 inode->i_gid = dir->i_gid;
165 if (S_ISDIR(mode))
166 mode |= S_ISGID;
167 } else
168 inode->i_gid = current_fsgid();
169
170 inode->i_uid = current_fsuid();
171 inode->i_ino = fs16_to_cpu(sbi, ino); 163 inode->i_ino = fs16_to_cpu(sbi, ino);
172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 164 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
173 inode->i_blocks = 0; 165 inode->i_blocks = 0;
@@ -176,7 +168,6 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
176 insert_inode_hash(inode); 168 insert_inode_hash(inode);
177 mark_inode_dirty(inode); 169 mark_inode_dirty(inode);
178 170
179 inode->i_mode = mode; /* for sysv_write_inode() */
180 sysv_write_inode(inode, 0); /* ensure inode not allocated again */ 171 sysv_write_inode(inode, 0); /* ensure inode not allocated again */
181 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ 172 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
182 /* That's it. */ 173 /* That's it. */
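For comparison, this is roughly the open-coded owner setup that inode_init_owner() replaces here and in the ubifs hunk below: uid from the caller, gid inherited through setgid directories (init_owner_open_coded() is an illustrative reconstruction of the deleted lines):

#include <linux/fs.h>
#include <linux/cred.h>

static void init_owner_open_coded(struct inode *inode,
				  const struct inode *dir, mode_t mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* subdirs inherit setgid */
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}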
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 4573734d723d..d4a5380b5669 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
43 * then attach current time stamp. 43 * then attach current time stamp.
44 * But if the filesystem was marked clean, keep it clean. 44 * But if the filesystem was marked clean, keep it clean.
45 */ 45 */
46 sb->s_dirt = 0;
46 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); 47 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
47 if (sbi->s_type == FSTYPE_SYSV4) { 48 if (sbi->s_type == FSTYPE_SYSV4) {
48 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) 49 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 1bfc95ad5f71..b86ab8eff79a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -14,6 +14,7 @@
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
17#include <linux/list.h> 18#include <linux/list.h>
18#include <linux/spinlock.h> 19#include <linux/spinlock.h>
19#include <linux/time.h> 20#include <linux/time.h>
@@ -109,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
109 struct timerfd_ctx *ctx = file->private_data; 110 struct timerfd_ctx *ctx = file->private_data;
110 ssize_t res; 111 ssize_t res;
111 u64 ticks = 0; 112 u64 ticks = 0;
112 DECLARE_WAITQUEUE(wait, current);
113 113
114 if (count < sizeof(ticks)) 114 if (count < sizeof(ticks))
115 return -EINVAL; 115 return -EINVAL;
116 spin_lock_irq(&ctx->wqh.lock); 116 spin_lock_irq(&ctx->wqh.lock);
117 res = -EAGAIN; 117 if (file->f_flags & O_NONBLOCK)
118 if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) { 118 res = -EAGAIN;
119 __add_wait_queue(&ctx->wqh, &wait); 119 else
120 for (res = 0;;) { 120 res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
121 set_current_state(TASK_INTERRUPTIBLE);
122 if (ctx->ticks) {
123 res = 0;
124 break;
125 }
126 if (signal_pending(current)) {
127 res = -ERESTARTSYS;
128 break;
129 }
130 spin_unlock_irq(&ctx->wqh.lock);
131 schedule();
132 spin_lock_irq(&ctx->wqh.lock);
133 }
134 __remove_wait_queue(&ctx->wqh, &wait);
135 __set_current_state(TASK_RUNNING);
136 }
137 if (ctx->ticks) { 121 if (ctx->ticks) {
138 ticks = ctx->ticks; 122 ticks = ctx->ticks;
139 if (ctx->expired && ctx->tintv.tv64) { 123 if (ctx->expired && ctx->tintv.tv64) {
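wait_event_interruptible_locked_irq() folds the deleted DECLARE_WAITQUEUE/schedule() dance into one call: it expects the wait-queue lock held on entry, drops it across schedule(), and re-takes it before returning (0 once the condition holds, -ERESTARTSYS on a signal). A hedged sketch of the calling convention, assuming the file-local struct timerfd_ctx above (wait_for_ticks() is illustrative):

#include <linux/wait.h>
#include <linux/spinlock.h>

static ssize_t wait_for_ticks(struct timerfd_ctx *ctx)
{
	ssize_t res;

	spin_lock_irq(&ctx->wqh.lock);
	res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
	/* lock is held again here; ctx->ticks may be read safely */
	spin_unlock_irq(&ctx->wqh.lock);
	return res;
}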
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 4775af401167..37fa7ed062d8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -45,6 +45,7 @@
45 45
46#include <linux/freezer.h> 46#include <linux/freezer.h>
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/slab.h>
48#include "ubifs.h" 49#include "ubifs.h"
49 50
50/** 51/**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 90492327b383..c2a68baa782f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,6 +34,7 @@
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h> 35#include <linux/debugfs.h>
36#include <linux/math64.h> 36#include <linux/math64.h>
37#include <linux/slab.h>
37 38
38#ifdef CONFIG_UBIFS_FS_DEBUG 39#ifdef CONFIG_UBIFS_FS_DEBUG
39 40
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 401e503d44a1..87ebcce72213 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
104 */ 104 */
105 inode->i_flags |= (S_NOCMTIME); 105 inode->i_flags |= (S_NOCMTIME);
106 106
107 inode->i_uid = current_fsuid(); 107 inode_init_owner(inode, dir, mode);
108 if (dir->i_mode & S_ISGID) {
109 inode->i_gid = dir->i_gid;
110 if (S_ISDIR(mode))
111 mode |= S_ISGID;
112 } else
113 inode->i_gid = current_fsgid();
114 inode->i_mode = mode;
115 inode->i_mtime = inode->i_atime = inode->i_ctime = 108 inode->i_mtime = inode->i_atime = inode->i_ctime =
116 ubifs_current_time(inode); 109 ubifs_current_time(inode);
117 inode->i_mapping->nrpages = 0; 110 inode->i_mapping->nrpages = 0;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e26c02ab6cd5..12f445cee9f7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -52,6 +52,7 @@
52#include "ubifs.h" 52#include "ubifs.h"
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/slab.h>
55 56
56static int read_block(struct inode *inode, void *addr, unsigned int block, 57static int read_block(struct inode *inode, void *addr, unsigned int block,
57 struct ubifs_data_node *dn) 58 struct ubifs_data_node *dn)
@@ -966,12 +967,15 @@ static int do_writepage(struct page *page, int len)
966 * the page locked, and it locks @ui_mutex. However, write-back does take inode 967 * the page locked, and it locks @ui_mutex. However, write-back does take inode
967 * @i_mutex, which means other VFS operations may be run on this inode at the 968 * @i_mutex, which means other VFS operations may be run on this inode at the
968 * same time. And the problematic one is truncation to smaller size, from where 969 * same time. And the problematic one is truncation to smaller size, from where
969 * we have to call 'vmtruncate()', which first changes @inode->i_size, then 970 * we have to call 'simple_setsize()', which first changes @inode->i_size, then
970 * drops the truncated pages. And while dropping the pages, it takes the page 971 * drops the truncated pages. And while dropping the pages, it takes the page
971 * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with 972 * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with
972 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This 973 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
973 * means that @inode->i_size is changed while @ui_mutex is unlocked. 974 * means that @inode->i_size is changed while @ui_mutex is unlocked.
974 * 975 *
976 * XXX: with the new truncate the above is not true anymore; the simple_setsize
977 * calls can be replaced with their individual components.
978 *
975 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond 979 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
976 * inode size. How do we do this if @inode->i_size may become smaller while we 980 * inode size. How do we do this if @inode->i_size may become smaller while we
977 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the 981 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
@@ -1124,7 +1128,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
1124 budgeted = 0; 1128 budgeted = 0;
1125 } 1129 }
1126 1130
1127 err = vmtruncate(inode, new_size); 1131 err = simple_setsize(inode, new_size);
1128 if (err) 1132 if (err)
1129 goto out_budg; 1133 goto out_budg;
1130 1134
@@ -1213,7 +1217,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1213 1217
1214 if (attr->ia_valid & ATTR_SIZE) { 1218 if (attr->ia_valid & ATTR_SIZE) {
1215 dbg_gen("size %lld -> %lld", inode->i_size, new_size); 1219 dbg_gen("size %lld -> %lld", inode->i_size, new_size);
1216 err = vmtruncate(inode, new_size); 1220 err = simple_setsize(inode, new_size);
1217 if (err) 1221 if (err)
1218 goto out; 1222 goto out;
1219 } 1223 }
@@ -1222,7 +1226,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1222 if (attr->ia_valid & ATTR_SIZE) { 1226 if (attr->ia_valid & ATTR_SIZE) {
1223 /* Truncation changes inode [mc]time */ 1227 /* Truncation changes inode [mc]time */
1224 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 1228 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1225 /* 'vmtruncate()' changed @i_size, update @ui_size */ 1229 /* 'simple_setsize()' changed @i_size, update @ui_size */
1226 ui->ui_size = inode->i_size; 1230 ui->ui_size = inode->i_size;
1227 } 1231 }
1228 1232
@@ -1303,9 +1307,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
1303 return NULL; 1307 return NULL;
1304} 1308}
1305 1309
1306int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1310int ubifs_fsync(struct file *file, int datasync)
1307{ 1311{
1308 struct inode *inode = dentry->d_inode; 1312 struct inode *inode = file->f_mapping->host;
1309 struct ubifs_info *c = inode->i_sb->s_fs_info; 1313 struct ubifs_info *c = inode->i_sb->s_fs_info;
1310 int err; 1314 int err;
1311 1315
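Two API changes meet in fs/ubifs/file.c: vmtruncate() is replaced by
simple_setsize(), and ->fsync loses its dentry argument, taking the inode
from file->f_mapping->host instead. A rough reconstruction of what
simple_setsize() does at this point in time (assumed from its fs/libfs.c
definition, simplified):

	int simple_setsize(struct inode *inode, loff_t newsize)
	{
		loff_t oldsize;
		int error;

		error = inode_newsize_ok(inode, newsize);  /* limits check */
		if (error)
			return error;

		oldsize = inode->i_size;
		i_size_write(inode, newsize);	/* publish the new size */
		truncate_pagecache(inode, oldsize, newsize); /* drop pages */
		return 0;
	}

The ordering (size first, pages second) is exactly what the surrounding
UBIFS comment worries about with respect to @ui_mutex.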
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index e5a3d8e96bb7..918d1582ca05 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -53,6 +53,7 @@
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
54 */ 54 */
55 55
56#include <linux/slab.h>
56#include <linux/pagemap.h> 57#include <linux/pagemap.h>
57#include <linux/list_sort.h> 58#include <linux/list_sort.h>
58#include "ubifs.h" 59#include "ubifs.h"
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e589fedaf1ef..bcf5a16f30bb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -51,6 +51,7 @@
51 */ 51 */
52 52
53#include <linux/crc32.h> 53#include <linux/crc32.h>
54#include <linux/slab.h>
54#include "ubifs.h" 55#include "ubifs.h"
55 56
56/** 57/**
@@ -63,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
63 if (!c->ro_media) { 64 if (!c->ro_media) {
64 c->ro_media = 1; 65 c->ro_media = 1;
65 c->no_chk_data_crc = 0; 66 c->no_chk_data_crc = 0;
67 c->vfs_sb->s_flags |= MS_RDONLY;
66 ubifs_warn("switched to read-only mode, error %d", err); 68 ubifs_warn("switched to read-only mode, error %d", err);
67 dbg_dump_stack(); 69 dbg_dump_stack();
68 } 70 }
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index b2792e84d245..ad7f67b827ea 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -46,6 +46,7 @@
46#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h> 47#include <linux/crc16.h>
48#include <linux/math64.h> 48#include <linux/math64.h>
49#include <linux/slab.h>
49 50
50/** 51/**
51 * do_calc_lpt_geom - calculate sizes for the LPT area. 52 * do_calc_lpt_geom - calculate sizes for the LPT area.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8cbfb8248025..13cb7a4237bf 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -26,6 +26,7 @@
26 */ 26 */
27 27
28#include <linux/crc16.h> 28#include <linux/crc16.h>
29#include <linux/slab.h>
29#include "ubifs.h" 30#include "ubifs.h"
30 31
31/** 32/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 868a55ee080f..109c6ea03bb5 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/** 37/**
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 57085e43320f..96cb62c8a9dd 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -27,6 +27,7 @@
27 */ 27 */
28 28
29#include "ubifs.h" 29#include "ubifs.h"
30#include <linux/slab.h>
30#include <linux/random.h> 31#include <linux/random.h>
31#include <linux/math64.h> 32#include <linux/math64.h>
32 33
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e5b1a7d00fa0..2194915220e5 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/* 37/*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2d976366a46..2eef553d50c8 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -28,6 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/err.h> 29#include <linux/err.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/slab.h>
31#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
32#include <linux/spinlock.h> 33#include <linux/spinlock.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
@@ -378,7 +379,7 @@ struct ubifs_gced_idx_leb {
378 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses 379 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
379 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot 380 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
380 * make sure @inode->i_size is always changed under @ui_mutex, because it 381 * make sure @inode->i_size is always changed under @ui_mutex, because it
381 * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock 382 * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock
382 * with 'ubifs_writepage()' (see file.c). All the other inode fields are 383 * with 'ubifs_writepage()' (see file.c). All the other inode fields are
383 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one 384 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
384 * could consider reworking locking and basing it on "shadow" fields. 385 * could consider reworking locking and basing it on "shadow" fields.
@@ -1677,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
1677int ubifs_calc_dark(const struct ubifs_info *c, int spc); 1678int ubifs_calc_dark(const struct ubifs_info *c, int spc);
1678 1679
1679/* file.c */ 1680/* file.c */
1680int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); 1681int ubifs_fsync(struct file *file, int datasync);
1681int ubifs_setattr(struct dentry *dentry, struct iattr *attr); 1682int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
1682 1683
1683/* dir.c */ 1684/* dir.c */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 195830f47569..c74400f88fe0 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
56 */ 56 */
57 57
58#include "ubifs.h" 58#include "ubifs.h"
59#include <linux/slab.h>
59#include <linux/xattr.h> 60#include <linux/xattr.h>
60#include <linux/posix_acl_xattr.h> 61#include <linux/posix_acl_xattr.h>
61 62
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 19626e2491c4..b608efaa4cee 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
21 21
22#include "udfdecl.h" 22#include "udfdecl.h"
23 23
24#include <linux/quotaops.h>
25#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
26#include <linux/bitops.h> 25#include <linux/bitops.h>
27 26
@@ -125,9 +124,8 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
125 124
126 mutex_lock(&sbi->s_alloc_mutex); 125 mutex_lock(&sbi->s_alloc_mutex);
127 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; 126 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
128 if (bloc->logicalBlockNum < 0 || 127 if (bloc->logicalBlockNum + count < count ||
129 (bloc->logicalBlockNum + count) > 128 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
130 partmap->s_partition_len) {
131 udf_debug("%d < %d || %d + %d > %d\n", 129 udf_debug("%d < %d || %d + %d > %d\n",
132 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, 130 bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
133 count, partmap->s_partition_len); 131 count, partmap->s_partition_len);
@@ -160,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
160 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
161 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
162 } else { 160 } else {
163 if (inode)
164 dquot_free_block(inode, 1);
165 udf_add_free_space(sb, sbi->s_partition, 1); 161 udf_add_free_space(sb, sbi->s_partition, 1);
166 } 162 }
167 } 163 }
@@ -211,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
211 bit = block % (sb->s_blocksize << 3); 207 bit = block % (sb->s_blocksize << 3);
212 208
213 while (bit < (sb->s_blocksize << 3) && block_count > 0) { 209 while (bit < (sb->s_blocksize << 3) && block_count > 0) {
214 if (!udf_test_bit(bit, bh->b_data)) 210 if (!udf_clear_bit(bit, bh->b_data))
215 goto out; 211 goto out;
216 else if (dquot_prealloc_block(inode, 1))
217 goto out;
218 else if (!udf_clear_bit(bit, bh->b_data)) {
219 udf_debug("bit already cleared for block %d\n", bit);
220 dquot_free_block(inode, 1);
221 goto out;
222 }
223 block_count--; 212 block_count--;
224 alloc_count++; 213 alloc_count++;
225 bit++; 214 bit++;
@@ -339,20 +328,6 @@ search_back:
339 } 328 }
340 329
341got_block: 330got_block:
342
343 /*
344 * Check quota for allocation of this block.
345 */
346 if (inode) {
347 int ret = dquot_alloc_block(inode, 1);
348
349 if (ret) {
350 mutex_unlock(&sbi->s_alloc_mutex);
351 *err = ret;
352 return 0;
353 }
354 }
355
356 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - 331 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
357 (sizeof(struct spaceBitmapDesc) << 3); 332 (sizeof(struct spaceBitmapDesc) << 3);
358 333
@@ -393,9 +368,8 @@ static void udf_table_free_blocks(struct super_block *sb,
393 368
394 mutex_lock(&sbi->s_alloc_mutex); 369 mutex_lock(&sbi->s_alloc_mutex);
395 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; 370 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
396 if (bloc->logicalBlockNum < 0 || 371 if (bloc->logicalBlockNum + count < count ||
397 (bloc->logicalBlockNum + count) > 372 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
398 partmap->s_partition_len) {
399 udf_debug("%d < %d || %d + %d > %d\n", 373 udf_debug("%d < %d || %d + %d > %d\n",
400 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, 374 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
401 partmap->s_partition_len); 375 partmap->s_partition_len);
@@ -403,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb,
403 } 377 }
404 378
405 iinfo = UDF_I(table); 379 iinfo = UDF_I(table);
406 /* We do this up front - There are some error conditions that
407 could occure, but.. oh well */
408 if (inode)
409 dquot_free_block(inode, count);
410 udf_add_free_space(sb, sbi->s_partition, count); 380 udf_add_free_space(sb, sbi->s_partition, count);
411 381
412 start = bloc->logicalBlockNum + offset; 382 start = bloc->logicalBlockNum + offset;
@@ -651,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
651 epos.offset -= adsize; 621 epos.offset -= adsize;
652 622
653 alloc_count = (elen >> sb->s_blocksize_bits); 623 alloc_count = (elen >> sb->s_blocksize_bits);
654 if (inode && dquot_prealloc_block(inode, 624 if (alloc_count > block_count) {
655 alloc_count > block_count ? block_count : alloc_count))
656 alloc_count = 0;
657 else if (alloc_count > block_count) {
658 alloc_count = block_count; 625 alloc_count = block_count;
659 eloc.logicalBlockNum += alloc_count; 626 eloc.logicalBlockNum += alloc_count;
660 elen -= (alloc_count << sb->s_blocksize_bits); 627 elen -= (alloc_count << sb->s_blocksize_bits);
@@ -754,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb,
754 newblock = goal_eloc.logicalBlockNum; 721 newblock = goal_eloc.logicalBlockNum;
755 goal_eloc.logicalBlockNum++; 722 goal_eloc.logicalBlockNum++;
756 goal_elen -= sb->s_blocksize; 723 goal_elen -= sb->s_blocksize;
757 if (inode) {
758 *err = dquot_alloc_block(inode, 1);
759 if (*err) {
760 brelse(goal_epos.bh);
761 mutex_unlock(&sbi->s_alloc_mutex);
762 return 0;
763 }
764 }
765 724
766 if (goal_elen) 725 if (goal_elen)
767 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); 726 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
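Both rewritten bounds checks in fs/udf/balloc.c fix the same bug:
logicalBlockNum is unsigned, so the old `logicalBlockNum < 0` test could
never fire, while `logicalBlockNum + count` could silently wrap around. For
unsigned arithmetic, a + b wraps exactly when the sum is smaller than either
operand, which is what the new test exploits. A hypothetical helper stating
the idiom:

	/* Illustrative only: reject a block range that wraps or runs
	 * past the partition length. */
	static inline int udf_range_invalid(uint32_t start, uint32_t count,
					    uint32_t limit)
	{
		return start + count < count ||	/* unsigned wraparound */
		       start + count > limit;
	}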
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f0f2a436251e..51552bf50225 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
207 207
208/* readdir and lookup functions */ 208/* readdir and lookup functions */
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .llseek = generic_file_llseek,
210 .read = generic_read_dir, 211 .read = generic_read_dir,
211 .readdir = udf_readdir, 212 .readdir = udf_readdir,
212 .ioctl = udf_ioctl, 213 .unlocked_ioctl = udf_ioctl,
213 .fsync = simple_fsync, 214 .fsync = generic_file_fsync,
214}; 215};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1eb06774ed90..94e06d6bddbd 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,9 +34,9 @@
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h> 35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
38#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
39#include <linux/aio.h> 38#include <linux/aio.h>
39#include <linux/smp_lock.h>
40 40
41#include "udf_i.h" 41#include "udf_i.h"
42#include "udf_sb.h" 42#include "udf_sb.h"
@@ -144,50 +144,60 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
144 return retval; 144 return retval;
145} 145}
146 146
147int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 147long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
148 unsigned long arg)
149{ 148{
149 struct inode *inode = filp->f_dentry->d_inode;
150 long old_block, new_block; 150 long old_block, new_block;
151 int result = -EINVAL; 151 int result = -EINVAL;
152 152
153 lock_kernel();
154
153 if (file_permission(filp, MAY_READ) != 0) { 155 if (file_permission(filp, MAY_READ) != 0) {
154 udf_debug("no permission to access inode %lu\n", 156 udf_debug("no permission to access inode %lu\n", inode->i_ino);
155 inode->i_ino); 157 result = -EPERM;
156 return -EPERM; 158 goto out;
157 } 159 }
158 160
159 if (!arg) { 161 if (!arg) {
160 udf_debug("invalid argument to udf_ioctl\n"); 162 udf_debug("invalid argument to udf_ioctl\n");
161 return -EINVAL; 163 result = -EINVAL;
164 goto out;
162 } 165 }
163 166
164 switch (cmd) { 167 switch (cmd) {
165 case UDF_GETVOLIDENT: 168 case UDF_GETVOLIDENT:
166 if (copy_to_user((char __user *)arg, 169 if (copy_to_user((char __user *)arg,
167 UDF_SB(inode->i_sb)->s_volume_ident, 32)) 170 UDF_SB(inode->i_sb)->s_volume_ident, 32))
168 return -EFAULT; 171 result = -EFAULT;
169 else 172 else
170 return 0; 173 result = 0;
174 goto out;
171 case UDF_RELOCATE_BLOCKS: 175 case UDF_RELOCATE_BLOCKS:
172 if (!capable(CAP_SYS_ADMIN)) 176 if (!capable(CAP_SYS_ADMIN)) {
173 return -EACCES; 177 result = -EACCES;
174 if (get_user(old_block, (long __user *)arg)) 178 goto out;
175 return -EFAULT; 179 }
180 if (get_user(old_block, (long __user *)arg)) {
181 result = -EFAULT;
182 goto out;
183 }
176 result = udf_relocate_blocks(inode->i_sb, 184 result = udf_relocate_blocks(inode->i_sb,
177 old_block, &new_block); 185 old_block, &new_block);
178 if (result == 0) 186 if (result == 0)
179 result = put_user(new_block, (long __user *)arg); 187 result = put_user(new_block, (long __user *)arg);
180 return result; 188 goto out;
181 case UDF_GETEASIZE: 189 case UDF_GETEASIZE:
182 result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); 190 result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
183 break; 191 goto out;
184 case UDF_GETEABLOCK: 192 case UDF_GETEABLOCK:
185 result = copy_to_user((char __user *)arg, 193 result = copy_to_user((char __user *)arg,
186 UDF_I(inode)->i_ext.i_data, 194 UDF_I(inode)->i_ext.i_data,
187 UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; 195 UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
188 break; 196 goto out;
189 } 197 }
190 198
199out:
200 unlock_kernel();
191 return result; 201 return result;
192} 202}
193 203
@@ -207,40 +217,17 @@ static int udf_release_file(struct inode *inode, struct file *filp)
207const struct file_operations udf_file_operations = { 217const struct file_operations udf_file_operations = {
208 .read = do_sync_read, 218 .read = do_sync_read,
209 .aio_read = generic_file_aio_read, 219 .aio_read = generic_file_aio_read,
210 .ioctl = udf_ioctl, 220 .unlocked_ioctl = udf_ioctl,
211 .open = dquot_file_open, 221 .open = generic_file_open,
212 .mmap = generic_file_mmap, 222 .mmap = generic_file_mmap,
213 .write = do_sync_write, 223 .write = do_sync_write,
214 .aio_write = udf_file_aio_write, 224 .aio_write = udf_file_aio_write,
215 .release = udf_release_file, 225 .release = udf_release_file,
216 .fsync = simple_fsync, 226 .fsync = generic_file_fsync,
217 .splice_read = generic_file_splice_read, 227 .splice_read = generic_file_splice_read,
218 .llseek = generic_file_llseek, 228 .llseek = generic_file_llseek,
219}; 229};
220 230
221static int udf_setattr(struct dentry *dentry, struct iattr *iattr)
222{
223 struct inode *inode = dentry->d_inode;
224 int error;
225
226 error = inode_change_ok(inode, iattr);
227 if (error)
228 return error;
229
230 if (iattr->ia_valid & ATTR_SIZE)
231 dquot_initialize(inode);
232
233 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
234 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
235 error = dquot_transfer(inode, iattr);
236 if (error)
237 return error;
238 }
239
240 return inode_setattr(inode, iattr);
241}
242
243const struct inode_operations udf_file_inode_operations = { 231const struct inode_operations udf_file_inode_operations = {
244 .truncate = udf_truncate, 232 .truncate = udf_truncate,
245 .setattr = udf_setattr,
246}; 233};
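udf_ioctl() moves from the legacy ->ioctl hook (called with the BKL held and
the inode passed in by the VFS) to ->unlocked_ioctl, which receives only the
file and must handle its own locking; the patch pushes lock_kernel()/
unlock_kernel() into the handler and routes every early return through one
unlock path. The general shape of such a BKL-pushdown conversion, with
hypothetical names:

	long foo_unlocked_ioctl(struct file *filp, unsigned int cmd,
				unsigned long arg)
	{
		/* the old ->ioctl hook received the inode directly */
		struct inode *inode = filp->f_dentry->d_inode;
		long ret;

		lock_kernel();	/* keep the serialization ->ioctl provided */
		ret = foo_do_ioctl(inode, filp, cmd, arg);
		unlock_kernel();
		return ret;
	}

Once every handler is converted this way, the BKL can later be narrowed or
dropped per filesystem.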
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index fb68c9cd0c3e..18cd7111185d 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -20,7 +20,6 @@
20 20
21#include "udfdecl.h" 21#include "udfdecl.h"
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include <linux/sched.h> 23#include <linux/sched.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26 25
@@ -32,13 +31,6 @@ void udf_free_inode(struct inode *inode)
32 struct super_block *sb = inode->i_sb; 31 struct super_block *sb = inode->i_sb;
33 struct udf_sb_info *sbi = UDF_SB(sb); 32 struct udf_sb_info *sbi = UDF_SB(sb);
34 33
35 /*
36 * Note: we must free any quota before locking the superblock,
37 * as writing the quota to disk may need the lock as well.
38 */
39 dquot_free_inode(inode);
40 dquot_drop(inode);
41
42 clear_inode(inode); 34 clear_inode(inode);
43 35
44 mutex_lock(&sbi->s_alloc_mutex); 36 mutex_lock(&sbi->s_alloc_mutex);
@@ -61,7 +53,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
61 struct super_block *sb = dir->i_sb; 53 struct super_block *sb = dir->i_sb;
62 struct udf_sb_info *sbi = UDF_SB(sb); 54 struct udf_sb_info *sbi = UDF_SB(sb);
63 struct inode *inode; 55 struct inode *inode;
64 int block, ret; 56 int block;
65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; 57 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
66 struct udf_inode_info *iinfo; 58 struct udf_inode_info *iinfo;
67 struct udf_inode_info *dinfo = UDF_I(dir); 59 struct udf_inode_info *dinfo = UDF_I(dir);
@@ -124,15 +116,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
124 udf_updated_lvid(sb); 116 udf_updated_lvid(sb);
125 } 117 }
126 mutex_unlock(&sbi->s_alloc_mutex); 118 mutex_unlock(&sbi->s_alloc_mutex);
127 inode->i_mode = mode; 119
128 inode->i_uid = current_fsuid(); 120 inode_init_owner(inode, dir, mode);
129 if (dir->i_mode & S_ISGID) {
130 inode->i_gid = dir->i_gid;
131 if (S_ISDIR(mode))
132 mode |= S_ISGID;
133 } else {
134 inode->i_gid = current_fsgid();
135 }
136 121
137 iinfo->i_location.logicalBlockNum = block; 122 iinfo->i_location.logicalBlockNum = block;
138 iinfo->i_location.partitionReferenceNum = 123 iinfo->i_location.partitionReferenceNum =
@@ -153,17 +138,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
153 insert_inode_hash(inode); 138 insert_inode_hash(inode);
154 mark_inode_dirty(inode); 139 mark_inode_dirty(inode);
155 140
156 dquot_initialize(inode);
157 ret = dquot_alloc_inode(inode);
158 if (ret) {
159 dquot_drop(inode);
160 inode->i_flags |= S_NOQUOTA;
161 inode->i_nlink = 0;
162 iput(inode);
163 *err = ret;
164 return NULL;
165 }
166
167 *err = 0; 141 *err = 0;
168 return inode; 142 return inode;
169} 143}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index bb863fe579ac..124852bcf6fe 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,7 +36,6 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/quotaops.h>
40#include <linux/slab.h> 39#include <linux/slab.h>
41#include <linux/crc-itu-t.h> 40#include <linux/crc-itu-t.h>
42 41
@@ -71,9 +70,6 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
71 70
72void udf_delete_inode(struct inode *inode) 71void udf_delete_inode(struct inode *inode)
73{ 72{
74 if (!is_bad_inode(inode))
75 dquot_initialize(inode);
76
77 truncate_inode_pages(&inode->i_data, 0); 73 truncate_inode_pages(&inode->i_data, 0);
78 74
79 if (is_bad_inode(inode)) 75 if (is_bad_inode(inode))
@@ -113,7 +109,6 @@ void udf_clear_inode(struct inode *inode)
113 (unsigned long long)iinfo->i_lenExtents); 109 (unsigned long long)iinfo->i_lenExtents);
114 } 110 }
115 111
116 dquot_drop(inode);
117 kfree(iinfo->i_ext.i_data); 112 kfree(iinfo->i_ext.i_data);
118 iinfo->i_ext.i_data = NULL; 113 iinfo->i_ext.i_data = NULL;
119} 114}
@@ -1314,7 +1309,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1314 break; 1309 break;
1315 case ICBTAG_FILE_TYPE_SYMLINK: 1310 case ICBTAG_FILE_TYPE_SYMLINK:
1316 inode->i_data.a_ops = &udf_symlink_aops; 1311 inode->i_data.a_ops = &udf_symlink_aops;
1317 inode->i_op = &page_symlink_inode_operations; 1312 inode->i_op = &udf_symlink_inode_operations;
1318 inode->i_mode = S_IFLNK | S_IRWXUGO; 1313 inode->i_mode = S_IFLNK | S_IRWXUGO;
1319 break; 1314 break;
1320 case ICBTAG_FILE_TYPE_MAIN: 1315 case ICBTAG_FILE_TYPE_MAIN:
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index db423ab078b1..bf5fc674193c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/quotaops.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
33#include <linux/sched.h> 32#include <linux/sched.h>
@@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
563 int err; 562 int err;
564 struct udf_inode_info *iinfo; 563 struct udf_inode_info *iinfo;
565 564
566 dquot_initialize(dir);
567
568 lock_kernel(); 565 lock_kernel();
569 inode = udf_new_inode(dir, mode, &err); 566 inode = udf_new_inode(dir, mode, &err);
570 if (!inode) { 567 if (!inode) {
@@ -579,7 +576,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
579 inode->i_data.a_ops = &udf_aops; 576 inode->i_data.a_ops = &udf_aops;
580 inode->i_op = &udf_file_inode_operations; 577 inode->i_op = &udf_file_inode_operations;
581 inode->i_fop = &udf_file_operations; 578 inode->i_fop = &udf_file_operations;
582 inode->i_mode = mode;
583 mark_inode_dirty(inode); 579 mark_inode_dirty(inode);
584 580
585 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 581 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -618,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
618 if (!old_valid_dev(rdev)) 614 if (!old_valid_dev(rdev))
619 return -EINVAL; 615 return -EINVAL;
620 616
621 dquot_initialize(dir);
622
623 lock_kernel(); 617 lock_kernel();
624 err = -EIO; 618 err = -EIO;
625 inode = udf_new_inode(dir, mode, &err); 619 inode = udf_new_inode(dir, mode, &err);
@@ -627,7 +621,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 goto out; 621 goto out;
628 622
629 iinfo = UDF_I(inode); 623 iinfo = UDF_I(inode);
630 inode->i_uid = current_fsuid();
631 init_special_inode(inode, mode, rdev); 624 init_special_inode(inode, mode, rdev);
632 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 625 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
633 if (!fi) { 626 if (!fi) {
@@ -666,15 +659,13 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
666 struct udf_inode_info *dinfo = UDF_I(dir); 659 struct udf_inode_info *dinfo = UDF_I(dir);
667 struct udf_inode_info *iinfo; 660 struct udf_inode_info *iinfo;
668 661
669 dquot_initialize(dir);
670
671 lock_kernel(); 662 lock_kernel();
672 err = -EMLINK; 663 err = -EMLINK;
673 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
674 goto out; 665 goto out;
675 666
676 err = -EIO; 667 err = -EIO;
677 inode = udf_new_inode(dir, S_IFDIR, &err); 668 inode = udf_new_inode(dir, S_IFDIR | mode, &err);
678 if (!inode) 669 if (!inode)
679 goto out; 670 goto out;
680 671
@@ -697,9 +688,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
697 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; 688 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
698 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); 689 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
699 brelse(fibh.sbh); 690 brelse(fibh.sbh);
700 inode->i_mode = S_IFDIR | mode;
701 if (dir->i_mode & S_ISGID)
702 inode->i_mode |= S_ISGID;
703 mark_inode_dirty(inode); 691 mark_inode_dirty(inode);
704 692
705 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 693 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -805,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
805 struct fileIdentDesc *fi, cfi; 793 struct fileIdentDesc *fi, cfi;
806 struct kernel_lb_addr tloc; 794 struct kernel_lb_addr tloc;
807 795
808 dquot_initialize(dir);
809
810 retval = -ENOENT; 796 retval = -ENOENT;
811 lock_kernel(); 797 lock_kernel();
812 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -853,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
853 struct fileIdentDesc cfi; 839 struct fileIdentDesc cfi;
854 struct kernel_lb_addr tloc; 840 struct kernel_lb_addr tloc;
855 841
856 dquot_initialize(dir);
857
858 retval = -ENOENT; 842 retval = -ENOENT;
859 lock_kernel(); 843 lock_kernel();
860 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -909,10 +893,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
909 struct buffer_head *bh; 893 struct buffer_head *bh;
910 struct udf_inode_info *iinfo; 894 struct udf_inode_info *iinfo;
911 895
912 dquot_initialize(dir);
913
914 lock_kernel(); 896 lock_kernel();
915 inode = udf_new_inode(dir, S_IFLNK, &err); 897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
916 if (!inode) 898 if (!inode)
917 goto out; 899 goto out;
918 900
@@ -923,9 +905,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 } 905 }
924 906
925 iinfo = UDF_I(inode); 907 iinfo = UDF_I(inode);
926 inode->i_mode = S_IFLNK | S_IRWXUGO;
927 inode->i_data.a_ops = &udf_symlink_aops; 908 inode->i_data.a_ops = &udf_symlink_aops;
928 inode->i_op = &page_symlink_inode_operations; 909 inode->i_op = &udf_symlink_inode_operations;
929 910
930 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 911 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
931 struct kernel_lb_addr eloc; 912 struct kernel_lb_addr eloc;
@@ -1081,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1081 int err; 1062 int err;
1082 struct buffer_head *bh; 1063 struct buffer_head *bh;
1083 1064
1084 dquot_initialize(dir);
1085
1086 lock_kernel(); 1065 lock_kernel();
1087 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1088 unlock_kernel(); 1067 unlock_kernel();
@@ -1145,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1145 struct kernel_lb_addr tloc; 1124 struct kernel_lb_addr tloc;
1146 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1147 1126
1148 dquot_initialize(old_dir);
1149 dquot_initialize(new_dir);
1150
1151 lock_kernel(); 1127 lock_kernel();
1152 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1153 if (ofi) { 1129 if (ofi) {
@@ -1401,3 +1377,8 @@ const struct inode_operations udf_dir_inode_operations = {
1401 .mknod = udf_mknod, 1377 .mknod = udf_mknod,
1402 .rename = udf_rename, 1378 .rename = udf_rename,
1403}; 1379};
1380const struct inode_operations udf_symlink_inode_operations = {
1381 .readlink = generic_readlink,
1382 .follow_link = page_follow_link_light,
1383 .put_link = page_put_link,
1384};
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4b540ee632d5..745eb209be0c 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/slab.h>
28#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
29 28
30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1e4543cbcd27..612d1e2e285a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -557,6 +557,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
557{ 557{
558 struct udf_options uopt; 558 struct udf_options uopt;
559 struct udf_sb_info *sbi = UDF_SB(sb); 559 struct udf_sb_info *sbi = UDF_SB(sb);
560 int error = 0;
560 561
561 uopt.flags = sbi->s_flags; 562 uopt.flags = sbi->s_flags;
562 uopt.uid = sbi->s_uid; 563 uopt.uid = sbi->s_uid;
@@ -582,17 +583,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
582 *flags |= MS_RDONLY; 583 *flags |= MS_RDONLY;
583 } 584 }
584 585
585 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 586 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
586 unlock_kernel(); 587 goto out_unlock;
587 return 0; 588
588 }
589 if (*flags & MS_RDONLY) 589 if (*flags & MS_RDONLY)
590 udf_close_lvid(sb); 590 udf_close_lvid(sb);
591 else 591 else
592 udf_open_lvid(sb); 592 udf_open_lvid(sb);
593 593
594out_unlock:
594 unlock_kernel(); 595 unlock_kernel();
595 return 0; 596 return error;
596} 597}
597 598
598/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ 599/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
@@ -1939,7 +1940,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1939 /* Fill in the rest of the superblock */ 1940 /* Fill in the rest of the superblock */
1940 sb->s_op = &udf_sb_ops; 1941 sb->s_op = &udf_sb_ops;
1941 sb->s_export_op = &udf_export_ops; 1942 sb->s_export_op = &udf_export_ops;
1942 sb->dq_op = NULL; 1943
1943 sb->s_dirt = 0; 1944 sb->s_dirt = 0;
1944 sb->s_magic = UDF_SUPER_MAGIC; 1945 sb->s_magic = UDF_SUPER_MAGIC;
1945 sb->s_time_gran = 1000; 1946 sb->s_time_gran = 1000;
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 852e91845688..16064787d2b7 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -26,7 +26,6 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/slab.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 4223ac855da9..2bac0354891f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -76,6 +76,7 @@ extern const struct inode_operations udf_dir_inode_operations;
76extern const struct file_operations udf_dir_operations; 76extern const struct file_operations udf_dir_operations;
77extern const struct inode_operations udf_file_inode_operations; 77extern const struct inode_operations udf_file_inode_operations;
78extern const struct file_operations udf_file_operations; 78extern const struct file_operations udf_file_operations;
79extern const struct inode_operations udf_symlink_inode_operations;
79extern const struct address_space_operations udf_aops; 80extern const struct address_space_operations udf_aops;
80extern const struct address_space_operations udf_adinicb_aops; 81extern const struct address_space_operations udf_adinicb_aops;
81extern const struct address_space_operations udf_symlink_aops; 82extern const struct address_space_operations udf_symlink_aops;
@@ -129,9 +130,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
129 uint8_t *, uint8_t *); 130 uint8_t *, uint8_t *);
130 131
131/* file.c */ 132/* file.c */
132extern int udf_ioctl(struct inode *, struct file *, unsigned int, 133extern long udf_ioctl(struct file *, unsigned int, unsigned long);
133 unsigned long);
134
135/* inode.c */ 134/* inode.c */
136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
137extern int udf_sync_inode(struct inode *); 136extern int udf_sync_inode(struct inode *);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index cefa8c8913e6..d03a90b6ad69 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> /* for memset */ 24#include <linux/string.h> /* for memset */
25#include <linux/nls.h> 25#include <linux/nls.h>
26#include <linux/crc-itu-t.h> 26#include <linux/crc-itu-t.h>
27#include <linux/slab.h>
27 28
28#include "udf_sb.h" 29#include "udf_sb.h"
29 30
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 5cfa4d85ccf2..048484fb10d2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -12,7 +12,6 @@
12#include <linux/stat.h> 12#include <linux/stat.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/quotaops.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/bitops.h> 17#include <linux/bitops.h>
@@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 84 "bit already cleared for fragment %u", i);
86 } 85 }
87 86
88 dquot_free_block(inode, count);
89
90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 87 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
92 uspi->cs_total.cs_nffree += count; 88 uspi->cs_total.cs_nffree += count;
93 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 89 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -195,7 +191,6 @@ do_more:
195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 191 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 192 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
197 ufs_clusteracct (sb, ucpi, blkno, 1); 193 ufs_clusteracct (sb, ucpi, blkno, 1);
198 dquot_free_block(inode, uspi->s_fpb);
199 194
200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 195 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
201 uspi->cs_total.cs_nbfree++; 196 uspi->cs_total.cs_nbfree++;
@@ -511,7 +506,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
511 struct ufs_cg_private_info * ucpi; 506 struct ufs_cg_private_info * ucpi;
512 struct ufs_cylinder_group * ucg; 507 struct ufs_cylinder_group * ucg;
513 unsigned cgno, fragno, fragoff, count, fragsize, i; 508 unsigned cgno, fragno, fragoff, count, fragsize, i;
514 int ret;
515 509
516 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", 510 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
517 (unsigned long long)fragment, oldcount, newcount); 511 (unsigned long long)fragment, oldcount, newcount);
@@ -557,11 +551,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
557 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 551 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
558 for (i = oldcount; i < newcount; i++) 552 for (i = oldcount; i < newcount; i++)
559 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); 553 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
560 ret = dquot_alloc_block(inode, count);
561 if (ret) {
562 *err = ret;
563 return 0;
564 }
565 554
566 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); 555 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
567 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 556 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -598,7 +587,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
598 struct ufs_cylinder_group * ucg; 587 struct ufs_cylinder_group * ucg;
599 unsigned oldcg, i, j, k, allocsize; 588 unsigned oldcg, i, j, k, allocsize;
600 u64 result; 589 u64 result;
601 int ret;
602 590
603 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", 591 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
604 inode->i_ino, cgno, (unsigned long long)goal, count); 592 inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -667,7 +655,6 @@ cg_found:
667 for (i = count; i < uspi->s_fpb; i++) 655 for (i = count; i < uspi->s_fpb; i++)
668 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); 656 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
669 i = uspi->s_fpb - count; 657 i = uspi->s_fpb - count;
670 dquot_free_block(inode, i);
671 658
672 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 659 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
673 uspi->cs_total.cs_nffree += i; 660 uspi->cs_total.cs_nffree += i;
@@ -679,11 +666,6 @@ cg_found:
679 result = ufs_bitmap_search (sb, ucpi, goal, allocsize); 666 result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
680 if (result == INVBLOCK) 667 if (result == INVBLOCK)
681 return 0; 668 return 0;
682 ret = dquot_alloc_block(inode, count);
683 if (ret) {
684 *err = ret;
685 return 0;
686 }
687 for (i = 0; i < count; i++) 669 for (i = 0; i < count; i++)
688 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); 670 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
689 671
@@ -718,7 +700,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
718 struct ufs_super_block_first * usb1; 700 struct ufs_super_block_first * usb1;
719 struct ufs_cylinder_group * ucg; 701 struct ufs_cylinder_group * ucg;
720 u64 result, blkno; 702 u64 result, blkno;
721 int ret;
722 703
723 UFSD("ENTER, goal %llu\n", (unsigned long long)goal); 704 UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
724 705
@@ -752,11 +733,6 @@ gotit:
752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 733 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 734 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
754 ufs_clusteracct (sb, ucpi, blkno, -1); 735 ufs_clusteracct (sb, ucpi, blkno, -1);
755 ret = dquot_alloc_block(inode, uspi->s_fpb);
756 if (ret) {
757 *err = ret;
758 return INVBLOCK;
759 }
760 736
761 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); 737 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
762 uspi->cs_total.cs_nbfree--; 738 uspi->cs_total.cs_nbfree--;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 317a0d444f6b..ec784756dc65 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 666const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 667 .read = generic_read_dir,
668 .readdir = ufs_readdir, 668 .readdir = ufs_readdir,
669 .fsync = simple_fsync, 669 .fsync = generic_file_fsync,
670 .llseek = generic_file_llseek, 670 .llseek = generic_file_llseek,
671}; 671};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a8962cecde5b..33afa20d4509 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
28 27
29#include "ufs_fs.h" 28#include "ufs_fs.h"
30#include "ufs.h" 29#include "ufs.h"
@@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = {
41 .write = do_sync_write, 40 .write = do_sync_write,
42 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
43 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
44 .open = dquot_file_open, 43 .open = generic_file_open,
45 .fsync = simple_fsync, 44 .fsync = generic_file_fsync,
46 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
47}; 46};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 230ecf608026..594480e537d2 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -27,7 +27,6 @@
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/quotaops.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/bitops.h> 32#include <linux/bitops.h>
@@ -95,9 +94,6 @@ void ufs_free_inode (struct inode * inode)
95 94
96 is_directory = S_ISDIR(inode->i_mode); 95 is_directory = S_ISDIR(inode->i_mode);
97 96
98 dquot_free_inode(inode);
99 dquot_drop(inode);
100
101 clear_inode (inode); 97 clear_inode (inode);
102 98
103 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) 99 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
@@ -303,15 +299,7 @@ cg_found:
303 sb->s_dirt = 1; 299 sb->s_dirt = 1;
304 300
305 inode->i_ino = cg * uspi->s_ipg + bit; 301 inode->i_ino = cg * uspi->s_ipg + bit;
306 inode->i_mode = mode; 302 inode_init_owner(inode, dir, mode);
307 inode->i_uid = current_fsuid();
308 if (dir->i_mode & S_ISGID) {
309 inode->i_gid = dir->i_gid;
310 if (S_ISDIR(mode))
311 inode->i_mode |= S_ISGID;
312 } else
313 inode->i_gid = current_fsgid();
314
315 inode->i_blocks = 0; 303 inode->i_blocks = 0;
316 inode->i_generation = 0; 304 inode->i_generation = 0;
317 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 305 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
@@ -355,21 +343,12 @@ cg_found:
355 343
356 unlock_super (sb); 344 unlock_super (sb);
357 345
358 dquot_initialize(inode);
359 err = dquot_alloc_inode(inode);
360 if (err) {
361 dquot_drop(inode);
362 goto fail_without_unlock;
363 }
364
365 UFSD("allocating inode %lu\n", inode->i_ino); 346 UFSD("allocating inode %lu\n", inode->i_ino);
366 UFSD("EXIT\n"); 347 UFSD("EXIT\n");
367 return inode; 348 return inode;
368 349
369fail_remove_inode: 350fail_remove_inode:
370 unlock_super(sb); 351 unlock_super(sb);
371fail_without_unlock:
372 inode->i_flags |= S_NOQUOTA;
373 inode->i_nlink = 0; 352 inode->i_nlink = 0;
374 iput(inode); 353 iput(inode);
375 UFSD("EXIT (FAILED): err %d\n", err); 354 UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 80b68c3702d1..73fe773aa034 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -37,7 +37,6 @@
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40#include <linux/quotaops.h>
41 40
42#include "ufs_fs.h" 41#include "ufs_fs.h"
43#include "ufs.h" 42#include "ufs.h"
@@ -603,7 +602,7 @@ static void ufs_set_inode_ops(struct inode *inode)
603 if (!inode->i_blocks) 602 if (!inode->i_blocks)
604 inode->i_op = &ufs_fast_symlink_inode_operations; 603 inode->i_op = &ufs_fast_symlink_inode_operations;
605 else { 604 else {
606 inode->i_op = &page_symlink_inode_operations; 605 inode->i_op = &ufs_symlink_inode_operations;
607 inode->i_mapping->a_ops = &ufs_aops; 606 inode->i_mapping->a_ops = &ufs_aops;
608 } 607 }
609 } else 608 } else
@@ -910,9 +909,6 @@ void ufs_delete_inode (struct inode * inode)
910{ 909{
911 loff_t old_i_size; 910 loff_t old_i_size;
912 911
913 if (!is_bad_inode(inode))
914 dquot_initialize(inode);
915
916 truncate_inode_pages(&inode->i_data, 0); 912 truncate_inode_pages(&inode->i_data, 0);
917 if (is_bad_inode(inode)) 913 if (is_bad_inode(inode))
918 goto no_delete; 914 goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 118556243e7a..b056f02b1fb3 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,7 +30,6 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
34 33
35#include "ufs_fs.h" 34#include "ufs_fs.h"
36#include "ufs.h" 35#include "ufs.h"
@@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
86 85
87 UFSD("BEGIN\n"); 86 UFSD("BEGIN\n");
88 87
89 dquot_initialize(dir);
90
91 inode = ufs_new_inode(dir, mode); 88 inode = ufs_new_inode(dir, mode);
92 err = PTR_ERR(inode); 89 err = PTR_ERR(inode);
93 90
@@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
112 if (!old_valid_dev(rdev)) 109 if (!old_valid_dev(rdev))
113 return -EINVAL; 110 return -EINVAL;
114 111
115 dquot_initialize(dir);
116
117 inode = ufs_new_inode(dir, mode); 112 inode = ufs_new_inode(dir, mode);
118 err = PTR_ERR(inode); 113 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) { 114 if (!IS_ERR(inode)) {
@@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
138 if (l > sb->s_blocksize) 133 if (l > sb->s_blocksize)
139 goto out_notlocked; 134 goto out_notlocked;
140 135
141 dquot_initialize(dir);
142
143 lock_kernel(); 136 lock_kernel();
144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
145 err = PTR_ERR(inode); 138 err = PTR_ERR(inode);
@@ -148,7 +141,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
148 141
149 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { 142 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
150 /* slow symlink */ 143 /* slow symlink */
151 inode->i_op = &page_symlink_inode_operations; 144 inode->i_op = &ufs_symlink_inode_operations;
152 inode->i_mapping->a_ops = &ufs_aops; 145 inode->i_mapping->a_ops = &ufs_aops;
153 err = page_symlink(inode, symname, l); 146 err = page_symlink(inode, symname, l);
154 if (err) 147 if (err)
@@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
185 return -EMLINK; 178 return -EMLINK;
186 } 179 }
187 180
188 dquot_initialize(dir);
189
190 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
191 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
192 atomic_inc(&inode->i_count); 183 atomic_inc(&inode->i_count);
@@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
204 if (dir->i_nlink >= UFS_LINK_MAX) 195 if (dir->i_nlink >= UFS_LINK_MAX)
205 goto out; 196 goto out;
206 197
207 dquot_initialize(dir);
208
209 lock_kernel(); 198 lock_kernel();
210 inode_inc_link_count(dir); 199 inode_inc_link_count(dir);
211 200
@@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
250 struct page *page; 239 struct page *page;
251 int err = -ENOENT; 240 int err = -ENOENT;
252 241
253 dquot_initialize(dir);
254
255 de = ufs_find_entry(dir, &dentry->d_name, &page); 242 de = ufs_find_entry(dir, &dentry->d_name, &page);
256 if (!de) 243 if (!de)
257 goto out; 244 goto out;
@@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
296 struct ufs_dir_entry *old_de; 283 struct ufs_dir_entry *old_de;
297 int err = -ENOENT; 284 int err = -ENOENT;
298 285
299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 286 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
303 if (!old_de) 287 if (!old_de)
304 goto out; 288 goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..3ec5a9eb6efb 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -77,7 +77,6 @@
77 77
78#include <linux/errno.h> 78#include <linux/errno.h>
79#include <linux/fs.h> 79#include <linux/fs.h>
80#include <linux/quotaops.h>
81#include <linux/slab.h> 80#include <linux/slab.h>
82#include <linux/time.h> 81#include <linux/time.h>
83#include <linux/stat.h> 82#include <linux/stat.h>
@@ -918,6 +917,7 @@ again:
918 sbi->s_bytesex = BYTESEX_LE; 917 sbi->s_bytesex = BYTESEX_LE;
919 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 918 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
920 case UFS_MAGIC: 919 case UFS_MAGIC:
920 case UFS_MAGIC_BW:
921 case UFS2_MAGIC: 921 case UFS2_MAGIC:
922 case UFS_MAGIC_LFN: 922 case UFS_MAGIC_LFN:
923 case UFS_MAGIC_FEA: 923 case UFS_MAGIC_FEA:
@@ -927,6 +927,7 @@ again:
927 sbi->s_bytesex = BYTESEX_BE; 927 sbi->s_bytesex = BYTESEX_BE;
928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
929 case UFS_MAGIC: 929 case UFS_MAGIC:
930 case UFS_MAGIC_BW:
930 case UFS2_MAGIC: 931 case UFS2_MAGIC:
931 case UFS_MAGIC_LFN: 932 case UFS_MAGIC_LFN:
932 case UFS_MAGIC_FEA: 933 case UFS_MAGIC_FEA:
@@ -1045,7 +1046,7 @@ magic_found:
1045 */ 1046 */
1046 sb->s_op = &ufs_super_ops; 1047 sb->s_op = &ufs_super_ops;
1047 sb->s_export_op = &ufs_export_ops; 1048 sb->s_export_op = &ufs_export_ops;
1048 sb->dq_op = NULL; /***/ 1049
1049 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); 1050 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
1050 1051
1051 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); 1052 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
@@ -1435,126 +1436,19 @@ static void destroy_inodecache(void)
1435 kmem_cache_destroy(ufs_inode_cachep); 1436 kmem_cache_destroy(ufs_inode_cachep);
1436} 1437}
1437 1438
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1443#ifdef CONFIG_QUOTA
1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
1446#endif
1447
1448static const struct super_operations ufs_super_ops = { 1439static const struct super_operations ufs_super_ops = {
1449 .alloc_inode = ufs_alloc_inode, 1440 .alloc_inode = ufs_alloc_inode,
1450 .destroy_inode = ufs_destroy_inode, 1441 .destroy_inode = ufs_destroy_inode,
1451 .write_inode = ufs_write_inode, 1442 .write_inode = ufs_write_inode,
1452 .delete_inode = ufs_delete_inode, 1443 .delete_inode = ufs_delete_inode,
1453 .clear_inode = ufs_clear_inode,
1454 .put_super = ufs_put_super, 1444 .put_super = ufs_put_super,
1455 .write_super = ufs_write_super, 1445 .write_super = ufs_write_super,
1456 .sync_fs = ufs_sync_fs, 1446 .sync_fs = ufs_sync_fs,
1457 .statfs = ufs_statfs, 1447 .statfs = ufs_statfs,
1458 .remount_fs = ufs_remount, 1448 .remount_fs = ufs_remount,
1459 .show_options = ufs_show_options, 1449 .show_options = ufs_show_options,
1460#ifdef CONFIG_QUOTA
1461 .quota_read = ufs_quota_read,
1462 .quota_write = ufs_quota_write,
1463#endif
1464}; 1450};
1465 1451
1466#ifdef CONFIG_QUOTA
1467
1468/* Read data from quotafile - avoid pagecache and such because we cannot afford
1469 * acquiring the locks... As quota files are never truncated and quota code
1470 * itself serializes the operations (and noone else should touch the files)
1471 * we don't have to be afraid of races */
1472static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data,
1473 size_t len, loff_t off)
1474{
1475 struct inode *inode = sb_dqopt(sb)->files[type];
1476 sector_t blk = off >> sb->s_blocksize_bits;
1477 int err = 0;
1478 int offset = off & (sb->s_blocksize - 1);
1479 int tocopy;
1480 size_t toread;
1481 struct buffer_head *bh;
1482 loff_t i_size = i_size_read(inode);
1483
1484 if (off > i_size)
1485 return 0;
1486 if (off+len > i_size)
1487 len = i_size-off;
1488 toread = len;
1489 while (toread > 0) {
1490 tocopy = sb->s_blocksize - offset < toread ?
1491 sb->s_blocksize - offset : toread;
1492
1493 bh = ufs_bread(inode, blk, 0, &err);
1494 if (err)
1495 return err;
1496 if (!bh) /* A hole? */
1497 memset(data, 0, tocopy);
1498 else {
1499 memcpy(data, bh->b_data+offset, tocopy);
1500 brelse(bh);
1501 }
1502 offset = 0;
1503 toread -= tocopy;
1504 data += tocopy;
1505 blk++;
1506 }
1507 return len;
1508}
1509
1510/* Write to quotafile */
1511static ssize_t ufs_quota_write(struct super_block *sb, int type,
1512 const char *data, size_t len, loff_t off)
1513{
1514 struct inode *inode = sb_dqopt(sb)->files[type];
1515 sector_t blk = off >> sb->s_blocksize_bits;
1516 int err = 0;
1517 int offset = off & (sb->s_blocksize - 1);
1518 int tocopy;
1519 size_t towrite = len;
1520 struct buffer_head *bh;
1521
1522 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
1523 while (towrite > 0) {
1524 tocopy = sb->s_blocksize - offset < towrite ?
1525 sb->s_blocksize - offset : towrite;
1526
1527 bh = ufs_bread(inode, blk, 1, &err);
1528 if (!bh)
1529 goto out;
1530 lock_buffer(bh);
1531 memcpy(bh->b_data+offset, data, tocopy);
1532 flush_dcache_page(bh->b_page);
1533 set_buffer_uptodate(bh);
1534 mark_buffer_dirty(bh);
1535 unlock_buffer(bh);
1536 brelse(bh);
1537 offset = 0;
1538 towrite -= tocopy;
1539 data += tocopy;
1540 blk++;
1541 }
1542out:
1543 if (len == towrite) {
1544 mutex_unlock(&inode->i_mutex);
1545 return err;
1546 }
1547 if (inode->i_size < off+len-towrite)
1548 i_size_write(inode, off+len-towrite);
1549 inode->i_version++;
1550 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1551 mark_inode_dirty(inode);
1552 mutex_unlock(&inode->i_mutex);
1553 return len - towrite;
1554}
1555
1556#endif
1557
1558static int ufs_get_sb(struct file_system_type *fs_type, 1452static int ufs_get_sb(struct file_system_type *fs_type,
1559 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
1560{ 1454{
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
index c0156eda44bc..d283628b4778 100644
--- a/fs/ufs/symlink.c
+++ b/fs/ufs/symlink.c
@@ -42,4 +42,12 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
42const struct inode_operations ufs_fast_symlink_inode_operations = { 42const struct inode_operations ufs_fast_symlink_inode_operations = {
43 .readlink = generic_readlink, 43 .readlink = generic_readlink,
44 .follow_link = ufs_follow_link, 44 .follow_link = ufs_follow_link,
45 .setattr = ufs_setattr,
46};
47
48const struct inode_operations ufs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52 .setattr = ufs_setattr,
45}; 53};
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index d3b6270cb377..589e01a465ba 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,7 +44,6 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
48 47
49#include "ufs_fs.h" 48#include "ufs_fs.h"
50#include "ufs.h" 49#include "ufs.h"
@@ -501,14 +500,12 @@ out:
501 return err; 500 return err;
502} 501}
503 502
504
505/* 503/*
506 * We don't define our `inode->i_op->truncate', and call it here, 504 * TODO:
507 * because of: 505 * - truncate case should use proper ordering instead of using
508 * - there is no way to know old size 506 * simple_setsize
509 * - there is no way to inform the user about an error, if it happens in `truncate'
510 */ 507 */
511static int ufs_setattr(struct dentry *dentry, struct iattr *attr) 508int ufs_setattr(struct dentry *dentry, struct iattr *attr)
512{ 509{
513 struct inode *inode = dentry->d_inode; 510 struct inode *inode = dentry->d_inode;
514 unsigned int ia_valid = attr->ia_valid; 511 unsigned int ia_valid = attr->ia_valid;
@@ -518,19 +515,10 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (error) 515 if (error)
519 return error; 516 return error;
520 517
521 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 518 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
522 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
523 error = dquot_transfer(inode, attr);
524 if (error)
525 return error;
526 }
527 if (ia_valid & ATTR_SIZE &&
528 attr->ia_size != i_size_read(inode)) {
529 loff_t old_i_size = inode->i_size; 519 loff_t old_i_size = inode->i_size;
530 520
531 dquot_initialize(inode); 521 error = simple_setsize(inode, attr->ia_size);
532
533 error = vmtruncate(inode, attr->ia_size);
534 if (error) 522 if (error)
535 return error; 523 return error;
536 error = ufs_truncate(inode, old_i_size); 524 error = ufs_truncate(inode, old_i_size);
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 43f9f5d5670e..179ae6b3180a 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -122,9 +122,11 @@ extern void ufs_panic (struct super_block *, const char *, const char *, ...) __
122 122
123/* symlink.c */ 123/* symlink.c */
124extern const struct inode_operations ufs_fast_symlink_inode_operations; 124extern const struct inode_operations ufs_fast_symlink_inode_operations;
125extern const struct inode_operations ufs_symlink_inode_operations;
125 126
126/* truncate.c */ 127/* truncate.c */
127extern int ufs_truncate (struct inode *, loff_t); 128extern int ufs_truncate (struct inode *, loff_t);
129extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
128 130
129static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) 131static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
130{ 132{
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
48#define UFS_SECTOR_SIZE 512 48#define UFS_SECTOR_SIZE 512
49#define UFS_SECTOR_BITS 9 49#define UFS_SECTOR_BITS 9
50#define UFS_MAGIC 0x00011954 50#define UFS_MAGIC 0x00011954
51#define UFS_MAGIC_BW 0x0f242697
51#define UFS2_MAGIC 0x19540119 52#define UFS2_MAGIC 0x19540119
52#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ 53#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */
53 54
diff --git a/fs/xattr.c b/fs/xattr.c
index 46f87e828b48..01bb8135e14a 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix)
590/* 590/*
591 * Find the xattr_handler with the matching prefix. 591 * Find the xattr_handler with the matching prefix.
592 */ 592 */
593static struct xattr_handler * 593static const struct xattr_handler *
594xattr_resolve_name(struct xattr_handler **handlers, const char **name) 594xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
595{ 595{
596 struct xattr_handler *handler; 596 const struct xattr_handler *handler;
597 597
598 if (!*name) 598 if (!*name)
599 return NULL; 599 return NULL;
@@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name)
614ssize_t 614ssize_t
615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) 615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
616{ 616{
617 struct xattr_handler *handler; 617 const struct xattr_handler *handler;
618 618
619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
620 if (!handler) 620 if (!handler)
@@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
629ssize_t 629ssize_t
630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) 630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
631{ 631{
632 struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; 632 const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
633 unsigned int size = 0; 633 unsigned int size = 0;
634 634
635 if (!buffer) { 635 if (!buffer) {
@@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
659int 659int
660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) 660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
661{ 661{
662 struct xattr_handler *handler; 662 const struct xattr_handler *handler;
663 663
664 if (size == 0) 664 if (size == 0)
665 value = ""; /* empty EA, do not remove */ 665 value = ""; /* empty EA, do not remove */
@@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
676int 676int
677generic_removexattr(struct dentry *dentry, const char *name) 677generic_removexattr(struct dentry *dentry, const char *name)
678{ 678{
679 struct xattr_handler *handler; 679 const struct xattr_handler *handler;
680 680
681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
682 if (!handler) 682 if (!handler)
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 05ac0fe9c4d3..8d5a506c82eb 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -6,9 +6,9 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/slab.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include <linux/posix_acl_xattr.h> 10#include <linux/posix_acl_xattr.h>
11#include <linux/gfp.h>
12 12
13 13
14/* 14/*
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
77 xfs_itable.o \ 77 xfs_itable.o \
78 xfs_dfrag.o \ 78 xfs_dfrag.o \
79 xfs_log.o \ 79 xfs_log.o \
80 xfs_log_cil.o \
80 xfs_log_recover.o \ 81 xfs_log_recover.o \
81 xfs_mount.o \ 82 xfs_mount.o \
82 xfs_mru_cache.o \ 83 xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index bc7405585def..666c9db48eb6 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/slab.h>
20#include <linux/swap.h> 21#include <linux/swap.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index bf85bbe4a9ae..9f769b5b38fc 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -22,6 +22,7 @@
22#include "xfs_inode.h" 22#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 23#include "xfs_vnodeops.h"
24#include "xfs_trace.h" 24#include "xfs_trace.h"
25#include <linux/slab.h>
25#include <linux/xattr.h> 26#include <linux/xattr.h>
26#include <linux/posix_acl_xattr.h> 27#include <linux/posix_acl_xattr.h>
27 28
@@ -439,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
439 return error; 440 return error;
440} 441}
441 442
442struct xattr_handler xfs_xattr_acl_access_handler = { 443const struct xattr_handler xfs_xattr_acl_access_handler = {
443 .prefix = POSIX_ACL_XATTR_ACCESS, 444 .prefix = POSIX_ACL_XATTR_ACCESS,
444 .flags = ACL_TYPE_ACCESS, 445 .flags = ACL_TYPE_ACCESS,
445 .get = xfs_xattr_acl_get, 446 .get = xfs_xattr_acl_get,
446 .set = xfs_xattr_acl_set, 447 .set = xfs_xattr_acl_set,
447}; 448};
448 449
449struct xattr_handler xfs_xattr_acl_default_handler = { 450const struct xattr_handler xfs_xattr_acl_default_handler = {
450 .prefix = POSIX_ACL_XATTR_DEFAULT, 451 .prefix = POSIX_ACL_XATTR_DEFAULT,
451 .flags = ACL_TYPE_DEFAULT, 452 .flags = ACL_TYPE_DEFAULT,
452 .get = xfs_xattr_acl_get, 453 .get = xfs_xattr_acl_get,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 99628508cb11..089eaca860b4 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -40,10 +40,20 @@
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include <linux/gfp.h>
43#include <linux/mpage.h> 44#include <linux/mpage.h>
44#include <linux/pagevec.h> 45#include <linux/pagevec.h>
45#include <linux/writeback.h> 46#include <linux/writeback.h>
46 47
48/*
49 * Types of I/O for bmap clustering and I/O completion tracking.
50 */
51enum {
52 IO_READ, /* mapping for a read */
53 IO_DELAY, /* mapping covers delalloc region */
54 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
55 IO_NEW /* just allocated */
56};
47 57
48/* 58/*
49 * Prime number of hash buckets since address is used as the key. 59 * Prime number of hash buckets since address is used as the key.
@@ -102,8 +112,9 @@ xfs_count_page_state(
102 112
103STATIC struct block_device * 113STATIC struct block_device *
104xfs_find_bdev_for_inode( 114xfs_find_bdev_for_inode(
105 struct xfs_inode *ip) 115 struct inode *inode)
106{ 116{
117 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_mount *mp = ip->i_mount; 118 struct xfs_mount *mp = ip->i_mount;
108 119
109 if (XFS_IS_REALTIME_INODE(ip)) 120 if (XFS_IS_REALTIME_INODE(ip))
@@ -182,7 +193,7 @@ xfs_setfilesize(
182 xfs_fsize_t isize; 193 xfs_fsize_t isize;
183 194
184 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 195 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
185 ASSERT(ioend->io_type != IOMAP_READ); 196 ASSERT(ioend->io_type != IO_READ);
186 197
187 if (unlikely(ioend->io_error)) 198 if (unlikely(ioend->io_error))
188 return 0; 199 return 0;
@@ -213,7 +224,7 @@ xfs_finish_ioend(
213 if (atomic_dec_and_test(&ioend->io_remaining)) { 224 if (atomic_dec_and_test(&ioend->io_remaining)) {
214 struct workqueue_struct *wq; 225 struct workqueue_struct *wq;
215 226
216 wq = (ioend->io_type == IOMAP_UNWRITTEN) ? 227 wq = (ioend->io_type == IO_UNWRITTEN) ?
217 xfsconvertd_workqueue : xfsdatad_workqueue; 228 xfsconvertd_workqueue : xfsdatad_workqueue;
218 queue_work(wq, &ioend->io_work); 229 queue_work(wq, &ioend->io_work);
219 if (wait) 230 if (wait)
@@ -236,7 +247,7 @@ xfs_end_io(
236 * For unwritten extents we need to issue transactions to convert a 247 * For unwritten extents we need to issue transactions to convert a
237 * range to normal written extents after the data I/O has finished. 248 * range to normal written extents after the data I/O has finished.
238 */ 249 */
239 if (ioend->io_type == IOMAP_UNWRITTEN && 250 if (ioend->io_type == IO_UNWRITTEN &&
240 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 251 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
241 252
242 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 253 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
@@ -249,7 +260,7 @@ xfs_end_io(
249 * We might have to update the on-disk file size after extending 260 * We might have to update the on-disk file size after extending
250 * writes. 261 * writes.
251 */ 262 */
252 if (ioend->io_type != IOMAP_READ) { 263 if (ioend->io_type != IO_READ) {
253 error = xfs_setfilesize(ioend); 264 error = xfs_setfilesize(ioend);
254 ASSERT(!error || error == EAGAIN); 265 ASSERT(!error || error == EAGAIN);
255 } 266 }
@@ -308,21 +319,25 @@ xfs_map_blocks(
308 struct inode *inode, 319 struct inode *inode,
309 loff_t offset, 320 loff_t offset,
310 ssize_t count, 321 ssize_t count,
311 xfs_iomap_t *mapp, 322 struct xfs_bmbt_irec *imap,
312 int flags) 323 int flags)
313{ 324{
314 int nmaps = 1; 325 int nmaps = 1;
326 int new = 0;
315 327
316 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); 328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
317} 329}
318 330
319STATIC int 331STATIC int
320xfs_iomap_valid( 332xfs_imap_valid(
321 xfs_iomap_t *iomapp, 333 struct inode *inode,
322 loff_t offset) 334 struct xfs_bmbt_irec *imap,
335 xfs_off_t offset)
323{ 336{
324 return offset >= iomapp->iomap_offset && 337 offset >>= inode->i_blkbits;
325 offset < iomapp->iomap_offset + iomapp->iomap_bsize; 338
339 return offset >= imap->br_startoff &&
340 offset < imap->br_startoff + imap->br_blockcount;
326} 341}
327 342
328/* 343/*
@@ -553,19 +568,23 @@ xfs_add_to_ioend(
553 568
554STATIC void 569STATIC void
555xfs_map_buffer( 570xfs_map_buffer(
571 struct inode *inode,
556 struct buffer_head *bh, 572 struct buffer_head *bh,
557 xfs_iomap_t *mp, 573 struct xfs_bmbt_irec *imap,
558 xfs_off_t offset, 574 xfs_off_t offset)
559 uint block_bits)
560{ 575{
561 sector_t bn; 576 sector_t bn;
577 struct xfs_mount *m = XFS_I(inode)->i_mount;
578 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
579 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
562 580
563 ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); 581 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
564 583
565 bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + 584 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
566 ((offset - mp->iomap_offset) >> block_bits); 585 ((offset - iomap_offset) >> inode->i_blkbits);
567 586
568 ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); 587 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
569 588
570 bh->b_blocknr = bn; 589 bh->b_blocknr = bn;
571 set_buffer_mapped(bh); 590 set_buffer_mapped(bh);
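The new xfs_map_buffer() derives b_blocknr without the old iomap fields: iomap_bn is a disk address in 512-byte basic blocks (BBSHIFT is 9 in XFS), so shifting right by (i_blkbits - BBSHIFT) converts it to inode-block units, and the byte distance into the extent is shifted down by i_blkbits. A sketch of just that arithmetic (standalone C, values illustrative):

#include <stdio.h>

#define BBSHIFT 9	/* 512-byte basic blocks, as in XFS */

/*
 * Compute a buffer_head-style block number (in units of the inode
 * block size) from a 512-byte disk address plus a byte offset into
 * the mapped extent.  Mirrors the bn calculation in xfs_map_buffer().
 */
static long long map_blocknr(long long daddr_512,
			     long long extent_start_bytes,
			     long long byte_off, unsigned blkbits)
{
	return (daddr_512 >> (blkbits - BBSHIFT)) +
	       ((byte_off - extent_start_bytes) >> blkbits);
}

int main(void)
{
	/* 4k blocks: extent at daddr 8000, two blocks into the extent */
	printf("%lld\n", map_blocknr(8000, 40960, 49152, 12));	/* 1002 */
	return 0;
}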
@@ -573,17 +592,17 @@ xfs_map_buffer(
573 592
574STATIC void 593STATIC void
575xfs_map_at_offset( 594xfs_map_at_offset(
595 struct inode *inode,
576 struct buffer_head *bh, 596 struct buffer_head *bh,
577 loff_t offset, 597 struct xfs_bmbt_irec *imap,
578 int block_bits, 598 xfs_off_t offset)
579 xfs_iomap_t *iomapp)
580{ 599{
581 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); 600 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); 601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
583 602
584 lock_buffer(bh); 603 lock_buffer(bh);
585 xfs_map_buffer(bh, iomapp, offset, block_bits); 604 xfs_map_buffer(inode, bh, imap, offset);
586 bh->b_bdev = iomapp->iomap_target->bt_bdev; 605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
587 set_buffer_mapped(bh); 606 set_buffer_mapped(bh);
588 clear_buffer_delay(bh); 607 clear_buffer_delay(bh);
589 clear_buffer_unwritten(bh); 608 clear_buffer_unwritten(bh);
@@ -712,11 +731,11 @@ xfs_is_delayed_page(
712 bh = head = page_buffers(page); 731 bh = head = page_buffers(page);
713 do { 732 do {
714 if (buffer_unwritten(bh)) 733 if (buffer_unwritten(bh))
715 acceptable = (type == IOMAP_UNWRITTEN); 734 acceptable = (type == IO_UNWRITTEN);
716 else if (buffer_delay(bh)) 735 else if (buffer_delay(bh))
717 acceptable = (type == IOMAP_DELAY); 736 acceptable = (type == IO_DELAY);
718 else if (buffer_dirty(bh) && buffer_mapped(bh)) 737 else if (buffer_dirty(bh) && buffer_mapped(bh))
719 acceptable = (type == IOMAP_NEW); 738 acceptable = (type == IO_NEW);
720 else 739 else
721 break; 740 break;
722 } while ((bh = bh->b_this_page) != head); 741 } while ((bh = bh->b_this_page) != head);
@@ -739,7 +758,7 @@ xfs_convert_page(
739 struct inode *inode, 758 struct inode *inode,
740 struct page *page, 759 struct page *page,
741 loff_t tindex, 760 loff_t tindex,
742 xfs_iomap_t *mp, 761 struct xfs_bmbt_irec *imap,
743 xfs_ioend_t **ioendp, 762 xfs_ioend_t **ioendp,
744 struct writeback_control *wbc, 763 struct writeback_control *wbc,
745 int startio, 764 int startio,
@@ -749,7 +768,6 @@ xfs_convert_page(
749 xfs_off_t end_offset; 768 xfs_off_t end_offset;
750 unsigned long p_offset; 769 unsigned long p_offset;
751 unsigned int type; 770 unsigned int type;
752 int bbits = inode->i_blkbits;
753 int len, page_dirty; 771 int len, page_dirty;
754 int count = 0, done = 0, uptodate = 1; 772 int count = 0, done = 0, uptodate = 1;
755 xfs_off_t offset = page_offset(page); 773 xfs_off_t offset = page_offset(page);
@@ -801,19 +819,19 @@ xfs_convert_page(
801 819
802 if (buffer_unwritten(bh) || buffer_delay(bh)) { 820 if (buffer_unwritten(bh) || buffer_delay(bh)) {
803 if (buffer_unwritten(bh)) 821 if (buffer_unwritten(bh))
804 type = IOMAP_UNWRITTEN; 822 type = IO_UNWRITTEN;
805 else 823 else
806 type = IOMAP_DELAY; 824 type = IO_DELAY;
807 825
808 if (!xfs_iomap_valid(mp, offset)) { 826 if (!xfs_imap_valid(inode, imap, offset)) {
809 done = 1; 827 done = 1;
810 continue; 828 continue;
811 } 829 }
812 830
813 ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); 831 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
814 ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); 832 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
815 833
816 xfs_map_at_offset(bh, offset, bbits, mp); 834 xfs_map_at_offset(inode, bh, imap, offset);
817 if (startio) { 835 if (startio) {
818 xfs_add_to_ioend(inode, bh, offset, 836 xfs_add_to_ioend(inode, bh, offset,
819 type, ioendp, done); 837 type, ioendp, done);
@@ -825,7 +843,7 @@ xfs_convert_page(
825 page_dirty--; 843 page_dirty--;
826 count++; 844 count++;
827 } else { 845 } else {
828 type = IOMAP_NEW; 846 type = IO_NEW;
829 if (buffer_mapped(bh) && all_bh && startio) { 847 if (buffer_mapped(bh) && all_bh && startio) {
830 lock_buffer(bh); 848 lock_buffer(bh);
831 xfs_add_to_ioend(inode, bh, offset, 849 xfs_add_to_ioend(inode, bh, offset,
@@ -865,7 +883,7 @@ STATIC void
865xfs_cluster_write( 883xfs_cluster_write(
866 struct inode *inode, 884 struct inode *inode,
867 pgoff_t tindex, 885 pgoff_t tindex,
868 xfs_iomap_t *iomapp, 886 struct xfs_bmbt_irec *imap,
869 xfs_ioend_t **ioendp, 887 xfs_ioend_t **ioendp,
870 struct writeback_control *wbc, 888 struct writeback_control *wbc,
871 int startio, 889 int startio,
@@ -884,7 +902,7 @@ xfs_cluster_write(
884 902
885 for (i = 0; i < pagevec_count(&pvec); i++) { 903 for (i = 0; i < pagevec_count(&pvec); i++) {
886 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 904 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
887 iomapp, ioendp, wbc, startio, all_bh); 905 imap, ioendp, wbc, startio, all_bh);
888 if (done) 906 if (done)
889 break; 907 break;
890 } 908 }
@@ -929,7 +947,7 @@ xfs_aops_discard_page(
929 loff_t offset = page_offset(page); 947 loff_t offset = page_offset(page);
930 ssize_t len = 1 << inode->i_blkbits; 948 ssize_t len = 1 << inode->i_blkbits;
931 949
932 if (!xfs_is_delayed_page(page, IOMAP_DELAY)) 950 if (!xfs_is_delayed_page(page, IO_DELAY))
933 goto out_invalidate; 951 goto out_invalidate;
934 952
935 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 953 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1041,15 +1059,15 @@ xfs_page_state_convert(
1041 int unmapped) /* also implies page uptodate */ 1059 int unmapped) /* also implies page uptodate */
1042{ 1060{
1043 struct buffer_head *bh, *head; 1061 struct buffer_head *bh, *head;
1044 xfs_iomap_t iomap; 1062 struct xfs_bmbt_irec imap;
1045 xfs_ioend_t *ioend = NULL, *iohead = NULL; 1063 xfs_ioend_t *ioend = NULL, *iohead = NULL;
1046 loff_t offset; 1064 loff_t offset;
1047 unsigned long p_offset = 0; 1065 unsigned long p_offset = 0;
1048 unsigned int type; 1066 unsigned int type;
1049 __uint64_t end_offset; 1067 __uint64_t end_offset;
1050 pgoff_t end_index, last_index, tlast; 1068 pgoff_t end_index, last_index;
1051 ssize_t size, len; 1069 ssize_t size, len;
1052 int flags, err, iomap_valid = 0, uptodate = 1; 1070 int flags, err, imap_valid = 0, uptodate = 1;
1053 int page_dirty, count = 0; 1071 int page_dirty, count = 0;
1054 int trylock = 0; 1072 int trylock = 0;
1055 int all_bh = unmapped; 1073 int all_bh = unmapped;
@@ -1096,7 +1114,7 @@ xfs_page_state_convert(
1096 bh = head = page_buffers(page); 1114 bh = head = page_buffers(page);
1097 offset = page_offset(page); 1115 offset = page_offset(page);
1098 flags = BMAPI_READ; 1116 flags = BMAPI_READ;
1099 type = IOMAP_NEW; 1117 type = IO_NEW;
1100 1118
1101 /* TODO: cleanup count and page_dirty */ 1119 /* TODO: cleanup count and page_dirty */
1102 1120
@@ -1110,12 +1128,12 @@ xfs_page_state_convert(
1110 * the iomap is actually still valid, but the ioend 1128 * the iomap is actually still valid, but the ioend
1111 * isn't. This shouldn't happen too often. 1129 * isn't. This shouldn't happen too often.
1112 */ 1130 */
1113 iomap_valid = 0; 1131 imap_valid = 0;
1114 continue; 1132 continue;
1115 } 1133 }
1116 1134
1117 if (iomap_valid) 1135 if (imap_valid)
1118 iomap_valid = xfs_iomap_valid(&iomap, offset); 1136 imap_valid = xfs_imap_valid(inode, &imap, offset);
1119 1137
1120 /* 1138 /*
1121 * First case, map an unwritten extent and prepare for 1139 * First case, map an unwritten extent and prepare for
@@ -1136,20 +1154,20 @@ xfs_page_state_convert(
1136 * Make sure we don't use a read-only iomap 1154 * Make sure we don't use a read-only iomap
1137 */ 1155 */
1138 if (flags == BMAPI_READ) 1156 if (flags == BMAPI_READ)
1139 iomap_valid = 0; 1157 imap_valid = 0;
1140 1158
1141 if (buffer_unwritten(bh)) { 1159 if (buffer_unwritten(bh)) {
1142 type = IOMAP_UNWRITTEN; 1160 type = IO_UNWRITTEN;
1143 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 1161 flags = BMAPI_WRITE | BMAPI_IGNSTATE;
1144 } else if (buffer_delay(bh)) { 1162 } else if (buffer_delay(bh)) {
1145 type = IOMAP_DELAY; 1163 type = IO_DELAY;
1146 flags = BMAPI_ALLOCATE | trylock; 1164 flags = BMAPI_ALLOCATE | trylock;
1147 } else { 1165 } else {
1148 type = IOMAP_NEW; 1166 type = IO_NEW;
1149 flags = BMAPI_WRITE | BMAPI_MMAP; 1167 flags = BMAPI_WRITE | BMAPI_MMAP;
1150 } 1168 }
1151 1169
1152 if (!iomap_valid) { 1170 if (!imap_valid) {
1153 /* 1171 /*
1154 * if we didn't have a valid mapping then we 1172 * if we didn't have a valid mapping then we
1155 * need to ensure that we put the new mapping 1173 * need to ensure that we put the new mapping
@@ -1159,7 +1177,7 @@ xfs_page_state_convert(
1159 * for unwritten extent conversion. 1177 * for unwritten extent conversion.
1160 */ 1178 */
1161 new_ioend = 1; 1179 new_ioend = 1;
1162 if (type == IOMAP_NEW) { 1180 if (type == IO_NEW) {
1163 size = xfs_probe_cluster(inode, 1181 size = xfs_probe_cluster(inode,
1164 page, bh, head, 0); 1182 page, bh, head, 0);
1165 } else { 1183 } else {
@@ -1167,14 +1185,14 @@ xfs_page_state_convert(
1167 } 1185 }
1168 1186
1169 err = xfs_map_blocks(inode, offset, size, 1187 err = xfs_map_blocks(inode, offset, size,
1170 &iomap, flags); 1188 &imap, flags);
1171 if (err) 1189 if (err)
1172 goto error; 1190 goto error;
1173 iomap_valid = xfs_iomap_valid(&iomap, offset); 1191 imap_valid = xfs_imap_valid(inode, &imap,
1192 offset);
1174 } 1193 }
1175 if (iomap_valid) { 1194 if (imap_valid) {
1176 xfs_map_at_offset(bh, offset, 1195 xfs_map_at_offset(inode, bh, &imap, offset);
1177 inode->i_blkbits, &iomap);
1178 if (startio) { 1196 if (startio) {
1179 xfs_add_to_ioend(inode, bh, offset, 1197 xfs_add_to_ioend(inode, bh, offset,
1180 type, &ioend, 1198 type, &ioend,
@@ -1193,40 +1211,41 @@ xfs_page_state_convert(
1193 * That means it must already have extents allocated 1211 * That means it must already have extents allocated
1194 * underneath it. Map the extent by reading it. 1212 * underneath it. Map the extent by reading it.
1195 */ 1213 */
1196 if (!iomap_valid || flags != BMAPI_READ) { 1214 if (!imap_valid || flags != BMAPI_READ) {
1197 flags = BMAPI_READ; 1215 flags = BMAPI_READ;
1198 size = xfs_probe_cluster(inode, page, bh, 1216 size = xfs_probe_cluster(inode, page, bh,
1199 head, 1); 1217 head, 1);
1200 err = xfs_map_blocks(inode, offset, size, 1218 err = xfs_map_blocks(inode, offset, size,
1201 &iomap, flags); 1219 &imap, flags);
1202 if (err) 1220 if (err)
1203 goto error; 1221 goto error;
1204 iomap_valid = xfs_iomap_valid(&iomap, offset); 1222 imap_valid = xfs_imap_valid(inode, &imap,
1223 offset);
1205 } 1224 }
1206 1225
1207 /* 1226 /*
1208 * We set the type to IOMAP_NEW in case we are doing a 1227 * We set the type to IO_NEW in case we are doing a
1209 * small write at EOF that is extending the file but 1228 * small write at EOF that is extending the file but
1210 * without needing an allocation. We need to update the 1229 * without needing an allocation. We need to update the
1211 * file size on I/O completion in this case so it is 1230 * file size on I/O completion in this case so it is
1212 * the same case as having just allocated a new extent 1231 * the same case as having just allocated a new extent
1213 * that we are writing into for the first time. 1232 * that we are writing into for the first time.
1214 */ 1233 */
1215 type = IOMAP_NEW; 1234 type = IO_NEW;
1216 if (trylock_buffer(bh)) { 1235 if (trylock_buffer(bh)) {
1217 ASSERT(buffer_mapped(bh)); 1236 ASSERT(buffer_mapped(bh));
1218 if (iomap_valid) 1237 if (imap_valid)
1219 all_bh = 1; 1238 all_bh = 1;
1220 xfs_add_to_ioend(inode, bh, offset, type, 1239 xfs_add_to_ioend(inode, bh, offset, type,
1221 &ioend, !iomap_valid); 1240 &ioend, !imap_valid);
1222 page_dirty--; 1241 page_dirty--;
1223 count++; 1242 count++;
1224 } else { 1243 } else {
1225 iomap_valid = 0; 1244 imap_valid = 0;
1226 } 1245 }
1227 } else if ((buffer_uptodate(bh) || PageUptodate(page)) && 1246 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
1228 (unmapped || startio)) { 1247 (unmapped || startio)) {
1229 iomap_valid = 0; 1248 imap_valid = 0;
1230 } 1249 }
1231 1250
1232 if (!iohead) 1251 if (!iohead)
@@ -1240,12 +1259,23 @@ xfs_page_state_convert(
1240 if (startio) 1259 if (startio)
1241 xfs_start_page_writeback(page, 1, count); 1260 xfs_start_page_writeback(page, 1, count);
1242 1261
1243 if (ioend && iomap_valid) { 1262 if (ioend && imap_valid) {
1244 offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> 1263 xfs_off_t end_index;
1245 PAGE_CACHE_SHIFT; 1264
1246 tlast = min_t(pgoff_t, offset, last_index); 1265 end_index = imap.br_startoff + imap.br_blockcount;
1247 xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, 1266
1248 wbc, startio, all_bh, tlast); 1267 /* to bytes */
1268 end_index <<= inode->i_blkbits;
1269
1270 /* to pages */
1271 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1272
1273 /* check against file size */
1274 if (end_index > last_index)
1275 end_index = last_index;
1276
1277 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1278 wbc, startio, all_bh, end_index);
1249 } 1279 }
1250 1280
1251 if (iohead) 1281 if (iohead)
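The replacement for the old tlast computation converts the extent end from filesystem blocks to bytes to a page index, then clamps it against the file's last page before calling xfs_cluster_write(). The same unit conversion as standalone C (4k pages assumed):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* 4k pages assumed */

/* Last page index covered by an extent, clamped to the file's last page. */
static long long extent_end_page(long long br_startoff,
				 long long br_blockcount,
				 unsigned blkbits, long long last_index)
{
	long long end = br_startoff + br_blockcount;	/* fs blocks */

	end <<= blkbits;				/* -> bytes */
	end = (end - 1) >> PAGE_CACHE_SHIFT;		/* -> last page */

	return end > last_index ? last_index : end;
}

int main(void)
{
	/* an extent of 4 x 4k blocks starting at block 10 ends in page 13 */
	printf("%lld\n", extent_end_page(10, 4, 12, 100));
	return 0;
}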
@@ -1447,10 +1477,11 @@ __xfs_get_blocks(
1447 int direct, 1477 int direct,
1448 bmapi_flags_t flags) 1478 bmapi_flags_t flags)
1449{ 1479{
1450 xfs_iomap_t iomap; 1480 struct xfs_bmbt_irec imap;
1451 xfs_off_t offset; 1481 xfs_off_t offset;
1452 ssize_t size; 1482 ssize_t size;
1453 int niomap = 1; 1483 int nimap = 1;
1484 int new = 0;
1454 int error; 1485 int error;
1455 1486
1456 offset = (xfs_off_t)iblock << inode->i_blkbits; 1487 offset = (xfs_off_t)iblock << inode->i_blkbits;
@@ -1461,22 +1492,21 @@ __xfs_get_blocks(
1461 return 0; 1492 return 0;
1462 1493
1463 error = xfs_iomap(XFS_I(inode), offset, size, 1494 error = xfs_iomap(XFS_I(inode), offset, size,
1464 create ? flags : BMAPI_READ, &iomap, &niomap); 1495 create ? flags : BMAPI_READ, &imap, &nimap, &new);
1465 if (error) 1496 if (error)
1466 return -error; 1497 return -error;
1467 if (niomap == 0) 1498 if (nimap == 0)
1468 return 0; 1499 return 0;
1469 1500
1470 if (iomap.iomap_bn != IOMAP_DADDR_NULL) { 1501 if (imap.br_startblock != HOLESTARTBLOCK &&
1502 imap.br_startblock != DELAYSTARTBLOCK) {
1471 /* 1503 /*
1472 * For unwritten extents do not report a disk address on 1504 * For unwritten extents do not report a disk address on
1473 * the read case (treat as if we're reading into a hole). 1505 * the read case (treat as if we're reading into a hole).
1474 */ 1506 */
1475 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { 1507 if (create || !ISUNWRITTEN(&imap))
1476 xfs_map_buffer(bh_result, &iomap, offset, 1508 xfs_map_buffer(inode, bh_result, &imap, offset);
1477 inode->i_blkbits); 1509 if (create && ISUNWRITTEN(&imap)) {
1478 }
1479 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1480 if (direct) 1510 if (direct)
1481 bh_result->b_private = inode; 1511 bh_result->b_private = inode;
1482 set_buffer_unwritten(bh_result); 1512 set_buffer_unwritten(bh_result);
@@ -1487,7 +1517,7 @@ __xfs_get_blocks(
1487 * If this is a realtime file, data may be on a different device. 1517 * If this is a realtime file, data may be on a different device.
1488 * to that pointed to from the buffer_head b_bdev currently. 1518 * to that pointed to from the buffer_head b_bdev currently.
1489 */ 1519 */
1490 bh_result->b_bdev = iomap.iomap_target->bt_bdev; 1520 bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1491 1521
1492 /* 1522 /*
1493 * If we previously allocated a block out beyond eof and we are now 1523 * If we previously allocated a block out beyond eof and we are now
@@ -1501,10 +1531,10 @@ __xfs_get_blocks(
1501 if (create && 1531 if (create &&
1502 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1532 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1503 (offset >= i_size_read(inode)) || 1533 (offset >= i_size_read(inode)) ||
1504 (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) 1534 (new || ISUNWRITTEN(&imap))))
1505 set_buffer_new(bh_result); 1535 set_buffer_new(bh_result);
1506 1536
1507 if (iomap.iomap_flags & IOMAP_DELAY) { 1537 if (imap.br_startblock == DELAYSTARTBLOCK) {
1508 BUG_ON(direct); 1538 BUG_ON(direct);
1509 if (create) { 1539 if (create) {
1510 set_buffer_uptodate(bh_result); 1540 set_buffer_uptodate(bh_result);
@@ -1513,11 +1543,23 @@ __xfs_get_blocks(
1513 } 1543 }
1514 } 1544 }
1515 1545
1546 /*
1547 * If this is O_DIRECT or the mpage code calling, tell them how large
1548 * the mapping is, so that we can avoid repeated get_blocks calls.
1549 */
1516 if (direct || size > (1 << inode->i_blkbits)) { 1550 if (direct || size > (1 << inode->i_blkbits)) {
1517 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); 1551 xfs_off_t mapping_size;
1518 offset = min_t(xfs_off_t, 1552
1519 iomap.iomap_bsize - iomap.iomap_delta, size); 1553 mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1520 bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); 1554 mapping_size <<= inode->i_blkbits;
1555
1556 ASSERT(mapping_size > 0);
1557 if (mapping_size > size)
1558 mapping_size = size;
1559 if (mapping_size > LONG_MAX)
1560 mapping_size = LONG_MAX;
1561
1562 bh_result->b_size = mapping_size;
1521 } 1563 }
1522 1564
1523 return 0; 1565 return 0;
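As the new comment says, direct I/O and the mpage code are told how much mapping remains past the requested block so they can avoid repeated get_blocks calls; the value is clamped to the request size and to LONG_MAX because b_size is a signed long. A sketch of the clamping (standalone C):

#include <stdio.h>
#include <limits.h>

/*
 * Bytes of mapping available from iblock onward, clamped to the
 * request size and to LONG_MAX -- the logic of the b_size hunk above.
 */
static long long clamp_mapping(long long br_startoff, long long br_blockcount,
			       long long iblock, unsigned blkbits,
			       long long req_size)
{
	long long mapping_size =
		(br_startoff + br_blockcount - iblock) << blkbits;

	if (mapping_size > req_size)
		mapping_size = req_size;
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;
	return mapping_size;
}

int main(void)
{
	/* 4 blocks mapped from block 10; 64k requested at block 11 */
	printf("%lld\n", clamp_mapping(10, 4, 11, 12, 65536));	/* 12288 */
	return 0;
}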
@@ -1575,7 +1617,7 @@ xfs_end_io_direct(
1575 */ 1617 */
1576 ioend->io_offset = offset; 1618 ioend->io_offset = offset;
1577 ioend->io_size = size; 1619 ioend->io_size = size;
1578 if (ioend->io_type == IOMAP_READ) { 1620 if (ioend->io_type == IO_READ) {
1579 xfs_finish_ioend(ioend, 0); 1621 xfs_finish_ioend(ioend, 0);
1580 } else if (private && size > 0) { 1622 } else if (private && size > 0) {
1581 xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); 1623 xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
@@ -1586,7 +1628,7 @@ xfs_end_io_direct(
1586 * didn't map an unwritten extent so switch its completion 1628 * didn't map an unwritten extent so switch its completion
1587 * handler. 1629 * handler.
1588 */ 1630 */
1589 ioend->io_type = IOMAP_NEW; 1631 ioend->io_type = IO_NEW;
1590 xfs_finish_ioend(ioend, 0); 1632 xfs_finish_ioend(ioend, 0);
1591 } 1633 }
1592 1634
@@ -1611,10 +1653,10 @@ xfs_vm_direct_IO(
1611 struct block_device *bdev; 1653 struct block_device *bdev;
1612 ssize_t ret; 1654 ssize_t ret;
1613 1655
1614 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1656 bdev = xfs_find_bdev_for_inode(inode);
1615 1657
1616 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? 1658 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1617 IOMAP_UNWRITTEN : IOMAP_READ); 1659 IO_UNWRITTEN : IO_READ);
1618 1660
1619 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, 1661 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1620 offset, nr_segs, 1662 offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index bd111b7e1daa..649ade8ef598 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/slab.h> 21#include <linux/gfp.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
@@ -37,6 +37,7 @@
37 37
38#include "xfs_sb.h" 38#include "xfs_sb.h"
39#include "xfs_inum.h" 39#include "xfs_inum.h"
40#include "xfs_log.h"
40#include "xfs_ag.h" 41#include "xfs_ag.h"
41#include "xfs_dmapi.h" 42#include "xfs_dmapi.h"
42#include "xfs_mount.h" 43#include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
850 * Note that this in no way locks the underlying pages, so it is only 851 * Note that this in no way locks the underlying pages, so it is only
851 * useful for synchronizing concurrent use of buffer objects, not for 852 * useful for synchronizing concurrent use of buffer objects, not for
852 * synchronizing independent access to the underlying pages. 853 * synchronizing independent access to the underlying pages.
854 *
855 * If we come across a stale, pinned, locked buffer, we know that we
856 * are being asked to lock a buffer that has been reallocated. Because
857 * it is pinned, we know that the log has not been pushed to disk and
858 * hence it will still be locked. Rather than sleeping until someone
859 * else pushes the log, push it ourselves before trying to get the lock.
853 */ 860 */
854void 861void
855xfs_buf_lock( 862xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
857{ 864{
858 trace_xfs_buf_lock(bp, _RET_IP_); 865 trace_xfs_buf_lock(bp, _RET_IP_);
859 866
867 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
868 xfs_log_force(bp->b_mount, 0);
860 if (atomic_read(&bp->b_io_remaining)) 869 if (atomic_read(&bp->b_io_remaining))
861 blk_run_address_space(bp->b_target->bt_mapping); 870 blk_run_address_space(bp->b_target->bt_mapping);
862 down(&bp->b_sema); 871 down(&bp->b_sema);
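The new check in xfs_buf_lock() encodes the rule in the comment above: a pinned, stale buffer means the log transaction that frees it has not been written yet, so push the log before blocking rather than waiting for someone else to. A toy model of that control flow (standalone C; types and names are stand-ins, and the semaphore is reduced to a printf):

#include <stdio.h>

#define XBF_STALE 0x1

struct buf { int pin_count; unsigned flags; };

static void force_log(void) { printf("pushing log before blocking\n"); }
static void take_sema(struct buf *bp) { (void)bp; printf("got lock\n"); }

/* Pinned + stale: the log hasn't gone to disk, so push it ourselves. */
static void buf_lock(struct buf *bp)
{
	if (bp->pin_count && (bp->flags & XBF_STALE))
		force_log();
	take_sema(bp);
}

int main(void)
{
	struct buf bp = { .pin_count = 1, .flags = XBF_STALE };

	buf_lock(&bp);
	return 0;
}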
@@ -1007,25 +1016,20 @@ xfs_bwrite(
1007 struct xfs_mount *mp, 1016 struct xfs_mount *mp,
1008 struct xfs_buf *bp) 1017 struct xfs_buf *bp)
1009{ 1018{
1010 int iowait = (bp->b_flags & XBF_ASYNC) == 0; 1019 int error;
1011 int error = 0;
1012 1020
1013 bp->b_strat = xfs_bdstrat_cb; 1021 bp->b_strat = xfs_bdstrat_cb;
1014 bp->b_mount = mp; 1022 bp->b_mount = mp;
1015 bp->b_flags |= XBF_WRITE; 1023 bp->b_flags |= XBF_WRITE;
1016 if (!iowait) 1024 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
1017 bp->b_flags |= _XBF_RUN_QUEUES;
1018 1025
1019 xfs_buf_delwri_dequeue(bp); 1026 xfs_buf_delwri_dequeue(bp);
1020 xfs_buf_iostrategy(bp); 1027 xfs_buf_iostrategy(bp);
1021 1028
1022 if (iowait) { 1029 error = xfs_buf_iowait(bp);
1023 error = xfs_buf_iowait(bp); 1030 if (error)
1024 if (error) 1031 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1032 xfs_buf_relse(bp);
1026 xfs_buf_relse(bp);
1027 }
1028
1029 return error; 1033 return error;
1030} 1034}
1031 1035
@@ -1614,7 +1618,8 @@ xfs_mapping_buftarg(
1614 1618
1615STATIC int 1619STATIC int
1616xfs_alloc_delwrite_queue( 1620xfs_alloc_delwrite_queue(
1617 xfs_buftarg_t *btp) 1621 xfs_buftarg_t *btp,
1622 const char *fsname)
1618{ 1623{
1619 int error = 0; 1624 int error = 0;
1620 1625
@@ -1622,7 +1627,7 @@ xfs_alloc_delwrite_queue(
1622 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1627 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1623 spin_lock_init(&btp->bt_delwrite_lock); 1628 spin_lock_init(&btp->bt_delwrite_lock);
1624 btp->bt_flags = 0; 1629 btp->bt_flags = 0;
1625 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1630 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1626 if (IS_ERR(btp->bt_task)) { 1631 if (IS_ERR(btp->bt_task)) {
1627 error = PTR_ERR(btp->bt_task); 1632 error = PTR_ERR(btp->bt_task);
1628 goto out_error; 1633 goto out_error;
@@ -1635,7 +1640,8 @@ out_error:
1635xfs_buftarg_t * 1640xfs_buftarg_t *
1636xfs_alloc_buftarg( 1641xfs_alloc_buftarg(
1637 struct block_device *bdev, 1642 struct block_device *bdev,
1638 int external) 1643 int external,
1644 const char *fsname)
1639{ 1645{
1640 xfs_buftarg_t *btp; 1646 xfs_buftarg_t *btp;
1641 1647
@@ -1647,7 +1653,7 @@ xfs_alloc_buftarg(
1647 goto error; 1653 goto error;
1648 if (xfs_mapping_buftarg(btp, bdev)) 1654 if (xfs_mapping_buftarg(btp, bdev))
1649 goto error; 1655 goto error;
1650 if (xfs_alloc_delwrite_queue(btp)) 1656 if (xfs_alloc_delwrite_queue(btp, fsname))
1651 goto error; 1657 goto error;
1652 xfs_alloc_bufhash(btp, external); 1658 xfs_alloc_bufhash(btp, external);
1653 return btp; 1659 return btp;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 386e7361e50e..5fbecefa5dfd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -390,7 +390,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
390/* 390/*
391 * Handling of buftargs. 391 * Handling of buftargs.
392 */ 392 */
393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
395extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 42dd3bcfba6b..257a56b127cf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -100,10 +100,10 @@ xfs_iozero(
100STATIC int 100STATIC int
101xfs_file_fsync( 101xfs_file_fsync(
102 struct file *file, 102 struct file *file,
103 struct dentry *dentry,
104 int datasync) 103 int datasync)
105{ 104{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode); 105 struct inode *inode = file->f_mapping->host;
106 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_trans *tp; 107 struct xfs_trans *tp;
108 int error = 0; 108 int error = 0;
109 int log_flushed = 0; 109 int log_flushed = 0;
@@ -115,6 +115,8 @@ xfs_file_fsync(
115 115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED); 116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117 117
118 xfs_ioend_wait(ip);
119
118 /* 120 /*
119 * We always need to make sure that the required inode state is safe on 121 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the 122 * disk. The inode might be clean but we still might need to force the
@@ -138,8 +140,8 @@ xfs_file_fsync(
138 * might get cleared when the inode gets written out via the AIL 140 * might get cleared when the inode gets written out via the AIL
139 * or xfs_iflush_cluster. 141 * or xfs_iflush_cluster.
140 */ 142 */
141 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || 143 if (((inode->i_state & I_DIRTY_DATASYNC) ||
142 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && 144 ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
143 ip->i_update_core) { 145 ip->i_update_core) {
144 /* 146 /*
145 * Kick off a transaction to log the inode core to get the 147 * Kick off a transaction to log the inode core to get the
@@ -866,7 +868,7 @@ write_retry:
866 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
867 xfs_ilock(ip, iolock); 869 xfs_ilock(ip, iolock);
868 870
869 error2 = -xfs_file_fsync(file, file->f_path.dentry, 871 error2 = -xfs_file_fsync(file,
870 (file->f_flags & __O_SYNC) ? 0 : 1); 872 (file->f_flags & __O_SYNC) ? 0 : 1);
871 if (!error) 873 if (!error)
872 error = error2; 874 error = error2;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4ea1ee18aded..699b60cbab9c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -58,6 +58,7 @@
58#include <linux/mount.h> 58#include <linux/mount.h>
59#include <linux/namei.h> 59#include <linux/namei.h>
60#include <linux/pagemap.h> 60#include <linux/pagemap.h>
61#include <linux/slab.h>
61#include <linux/exportfs.h> 62#include <linux/exportfs.h>
62 63
63/* 64/*
@@ -526,6 +527,10 @@ xfs_attrmulti_by_handle(
526 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 527 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
527 return -XFS_ERROR(EFAULT); 528 return -XFS_ERROR(EFAULT);
528 529
530 /* overflow check */
531 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
532 return -E2BIG;
533
529 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); 534 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
530 if (IS_ERR(dentry)) 535 if (IS_ERR(dentry))
531 return PTR_ERR(dentry); 536 return PTR_ERR(dentry);
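The added opcount test guards the multiplication in the later buffer allocation: without it, a huge opcount could make opcount * sizeof(xfs_attr_multiop_t) wrap and pair a short allocation with a long loop. A standalone demonstration of the guard (struct size illustrative):

#include <stdio.h>
#include <limits.h>

struct op { char pad[32]; };	/* stand-in for xfs_attr_multiop_t */

/* Reject counts whose total byte size would overflow the allocation. */
static int opcount_ok(unsigned int opcount)
{
	return opcount < INT_MAX / sizeof(struct op);
}

int main(void)
{
	unsigned int huge = INT_MAX / sizeof(struct op) + 1;

	printf("%d %d\n", opcount_ok(16), opcount_ok(huge));	/* 1 0 */
	return 0;
}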
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0bf6d61f0528..9287135e9bfc 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -18,6 +18,7 @@
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/ioctl.h> 19#include <linux/ioctl.h>
20#include <linux/mount.h> 20#include <linux/mount.h>
21#include <linux/slab.h>
21#include <asm/uaccess.h> 22#include <asm/uaccess.h>
22#include "xfs.h" 23#include "xfs.h"
23#include "xfs_fs.h" 24#include "xfs_fs.h"
@@ -419,6 +420,10 @@ xfs_compat_attrmulti_by_handle(
419 sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) 420 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
420 return -XFS_ERROR(EFAULT); 421 return -XFS_ERROR(EFAULT);
421 422
423 /* overflow check */
424 if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
425 return -E2BIG;
426
422 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); 427 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
423 if (IS_ERR(dentry)) 428 if (IS_ERR(dentry))
424 return PTR_ERR(dentry); 429 return PTR_ERR(dentry);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 61a99608731e..9c8019c78c92 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -56,6 +56,7 @@
56#include <linux/security.h> 56#include <linux/security.h>
57#include <linux/falloc.h> 57#include <linux/falloc.h>
58#include <linux/fiemap.h> 58#include <linux/fiemap.h>
59#include <linux/slab.h>
59 60
60/* 61/*
61 * Bring the timestamps in the XFS inode uptodate. 62 * Bring the timestamps in the XFS inode uptodate.
@@ -672,7 +673,10 @@ xfs_vn_fiemap(
672 bm.bmv_length = BTOBB(length); 673 bm.bmv_length = BTOBB(length);
673 674
674 /* We add one because in getbmap world count includes the header */ 675 /* We add one because in getbmap world count includes the header */
675 bm.bmv_count = fieinfo->fi_extents_max + 1; 676 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
677 fieinfo->fi_extents_max + 1;
678 bm.bmv_count = min_t(__s32, bm.bmv_count,
679 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
676 bm.bmv_iflags = BMV_IF_PREALLOC; 680 bm.bmv_iflags = BMV_IF_PREALLOC;
677 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 681 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
678 bm.bmv_iflags |= BMV_IF_ATTRFORK; 682 bm.bmv_iflags |= BMV_IF_ATTRFORK;
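The fiemap fix bounds bmv_count from both sides: a caller that passes fi_extents_max == 0 gets a large default (MAXEXTNUM), and the result is then capped so the getbmapx array stays within a 16-page allocation. A sketch of the double clamp (standalone C; the struct size and MAXEXTNUM value here are assumptions):

#include <stdio.h>

#define PAGE_SIZE 4096
#define MAXEXTNUM 32768			/* assumed large default */

struct getbmapx_stub { char pad[56]; };	/* sized stand-in */

static int clamp_count(int extents_max)
{
	int count = !extents_max ? MAXEXTNUM : extents_max + 1;
	int cap = (int)(PAGE_SIZE * 16 / sizeof(struct getbmapx_stub));

	return count < cap ? count : cap;
}

int main(void)
{
	printf("%d %d\n", clamp_count(0), clamp_count(100));	/* 1170 101 */
	return 0;
}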
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 1947514ce1ad..9ac8aea91529 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,6 +19,7 @@
19#include "xfs_dmapi.h" 19#include "xfs_dmapi.h"
20#include "xfs_sb.h" 20#include "xfs_sb.h"
21#include "xfs_inum.h" 21#include "xfs_inum.h"
22#include "xfs_log.h"
22#include "xfs_ag.h" 23#include "xfs_ag.h"
23#include "xfs_mount.h" 24#include "xfs_mount.h"
24#include "xfs_quota.h" 25#include "xfs_quota.h"
@@ -97,7 +98,7 @@ xfs_fs_set_xstate(
97} 98}
98 99
99STATIC int 100STATIC int
100xfs_fs_get_xquota( 101xfs_fs_get_dqblk(
101 struct super_block *sb, 102 struct super_block *sb,
102 int type, 103 int type,
103 qid_t id, 104 qid_t id,
@@ -114,7 +115,7 @@ xfs_fs_get_xquota(
114} 115}
115 116
116STATIC int 117STATIC int
117xfs_fs_set_xquota( 118xfs_fs_set_dqblk(
118 struct super_block *sb, 119 struct super_block *sb,
119 int type, 120 int type,
120 qid_t id, 121 qid_t id,
@@ -135,6 +136,6 @@ xfs_fs_set_xquota(
135const struct quotactl_ops xfs_quotactl_operations = { 136const struct quotactl_ops xfs_quotactl_operations = {
136 .get_xstate = xfs_fs_get_xstate, 137 .get_xstate = xfs_fs_get_xstate,
137 .set_xstate = xfs_fs_set_xstate, 138 .set_xstate = xfs_fs_set_xstate,
138 .get_xquota = xfs_fs_get_xquota, 139 .get_dqblk = xfs_fs_get_dqblk,
139 .set_xquota = xfs_fs_set_xquota, 140 .set_dqblk = xfs_fs_set_dqblk,
140}; 141};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 71345a370d9f..f2d1718c9165 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -61,6 +61,7 @@
61 61
62#include <linux/namei.h> 62#include <linux/namei.h>
63#include <linux/init.h> 63#include <linux/init.h>
64#include <linux/slab.h>
64#include <linux/mount.h> 65#include <linux/mount.h>
65#include <linux/mempool.h> 66#include <linux/mempool.h>
66#include <linux/writeback.h> 67#include <linux/writeback.h>
@@ -118,6 +119,8 @@ mempool_t *xfs_ioend_pool;
118#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ 119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
119#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ 120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
120#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ 121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
122#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
123#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
121 124
122/* 125/*
123 * Table driven mount option parser. 126 * Table driven mount option parser.
@@ -373,6 +376,13 @@ xfs_parseargs(
373 mp->m_flags |= XFS_MOUNT_DMAPI; 376 mp->m_flags |= XFS_MOUNT_DMAPI;
374 } else if (!strcmp(this_char, MNTOPT_DMI)) { 377 } else if (!strcmp(this_char, MNTOPT_DMI)) {
375 mp->m_flags |= XFS_MOUNT_DMAPI; 378 mp->m_flags |= XFS_MOUNT_DMAPI;
379 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
380 mp->m_flags |= XFS_MOUNT_DELAYLOG;
381 cmn_err(CE_WARN,
382 "Enabling EXPERIMENTAL delayed logging feature "
383 "- use at your own risk.\n");
384 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
385 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
376 } else if (!strcmp(this_char, "ihashsize")) { 386 } else if (!strcmp(this_char, "ihashsize")) {
377 cmn_err(CE_WARN, 387 cmn_err(CE_WARN,
378 "XFS: ihashsize no longer used, option is deprecated."); 388 "XFS: ihashsize no longer used, option is deprecated.");
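delaylog/nodelaylog join the existing strcmp chain: each comma-separated token either sets or clears a bit in m_flags, which is also how the paired entry in xfs_showargs() can print the option back. A standalone sketch of the pattern (the kernel walks tokens with strsep(); strtok() is used here so the example is plain ISO C, and the flag value is made up):

#include <stdio.h>
#include <string.h>

#define MOUNT_DELAYLOG 0x1	/* stand-in for XFS_MOUNT_DELAYLOG */

/* Parse a comma-separated option string, mirroring xfs_parseargs(). */
static unsigned int parse_opts(char *options)
{
	unsigned int flags = 0;
	char *this_char;

	for (this_char = strtok(options, ","); this_char;
	     this_char = strtok(NULL, ",")) {
		if (!strcmp(this_char, "delaylog"))
			flags |= MOUNT_DELAYLOG;
		else if (!strcmp(this_char, "nodelaylog"))
			flags &= ~MOUNT_DELAYLOG;
	}
	return flags;
}

int main(void)
{
	char opts[] = "nodelaylog,delaylog";

	printf("flags=%#x\n", parse_opts(opts));	/* flags=0x1 */
	return 0;
}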
@@ -534,6 +544,7 @@ xfs_showargs(
534 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 544 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
535 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, 545 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
536 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 546 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
547 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
537 { 0, NULL } 548 { 0, NULL }
538 }; 549 };
539 static struct proc_xfs_info xfs_info_unset[] = { 550 static struct proc_xfs_info xfs_info_unset[] = {
@@ -724,7 +735,8 @@ void
724xfs_blkdev_issue_flush( 735xfs_blkdev_issue_flush(
725 xfs_buftarg_t *buftarg) 736 xfs_buftarg_t *buftarg)
726{ 737{
727 blkdev_issue_flush(buftarg->bt_bdev, NULL); 738 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
739 BLKDEV_IFL_WAIT);
728} 740}
729 741
730STATIC void 742STATIC void
@@ -788,18 +800,18 @@ xfs_open_devices(
788 * Setup xfs_mount buffer target pointers 800 * Setup xfs_mount buffer target pointers
789 */ 801 */
790 error = ENOMEM; 802 error = ENOMEM;
791 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); 803 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
792 if (!mp->m_ddev_targp) 804 if (!mp->m_ddev_targp)
793 goto out_close_rtdev; 805 goto out_close_rtdev;
794 806
795 if (rtdev) { 807 if (rtdev) {
796 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); 808 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
797 if (!mp->m_rtdev_targp) 809 if (!mp->m_rtdev_targp)
798 goto out_free_ddev_targ; 810 goto out_free_ddev_targ;
799 } 811 }
800 812
801 if (logdev && logdev != ddev) { 813 if (logdev && logdev != ddev) {
802 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); 814 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
803 if (!mp->m_logdev_targp) 815 if (!mp->m_logdev_targp)
804 goto out_free_rtdev_targ; 816 goto out_free_rtdev_targ;
805 } else { 817 } else {
@@ -901,7 +913,8 @@ xfsaild_start(
901 struct xfs_ail *ailp) 913 struct xfs_ail *ailp)
902{ 914{
903 ailp->xa_target = 0; 915 ailp->xa_target = 0;
904 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); 916 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
917 ailp->xa_mount->m_fsname);
905 if (IS_ERR(ailp->xa_task)) 918 if (IS_ERR(ailp->xa_task))
906 return -PTR_ERR(ailp->xa_task); 919 return -PTR_ERR(ailp->xa_task);
907 return 0; 920 return 0;
@@ -1091,6 +1104,7 @@ xfs_fs_write_inode(
1091 * the code will only flush the inode if it isn't already 1104 * the code will only flush the inode if it isn't already
1092 * being flushed. 1105 * being flushed.
1093 */ 1106 */
1107 xfs_ioend_wait(ip);
1094 xfs_ilock(ip, XFS_ILOCK_SHARED); 1108 xfs_ilock(ip, XFS_ILOCK_SHARED);
1095 if (ip->i_update_core) { 1109 if (ip->i_update_core) {
1096 error = xfs_log_inode(ip); 1110 error = xfs_log_inode(ip);
@@ -1208,6 +1222,7 @@ xfs_fs_put_super(
1208 1222
1209 xfs_unmountfs(mp); 1223 xfs_unmountfs(mp);
1210 xfs_freesb(mp); 1224 xfs_freesb(mp);
1225 xfs_inode_shrinker_unregister(mp);
1211 xfs_icsb_destroy_counters(mp); 1226 xfs_icsb_destroy_counters(mp);
1212 xfs_close_devices(mp); 1227 xfs_close_devices(mp);
1213 xfs_dmops_put(mp); 1228 xfs_dmops_put(mp);
@@ -1621,6 +1636,8 @@ xfs_fs_fill_super(
1621 if (error) 1636 if (error)
1622 goto fail_vnrele; 1637 goto fail_vnrele;
1623 1638
1639 xfs_inode_shrinker_register(mp);
1640
1624 kfree(mtpt); 1641 kfree(mtpt);
1625 return 0; 1642 return 0;
1626 1643
@@ -1748,7 +1765,7 @@ xfs_init_zones(void)
1748 * but it is much faster. 1765 * but it is much faster.
1749 */ 1766 */
1750 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + 1767 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
1751 (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / 1768 (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
1752 NBWORD) * sizeof(int))), "xfs_buf_item"); 1769 NBWORD) * sizeof(int))), "xfs_buf_item");
1753 if (!xfs_buf_item_zone) 1770 if (!xfs_buf_item_zone)
1754 goto out_destroy_trans_zone; 1771 goto out_destroy_trans_zone;
@@ -1866,6 +1883,7 @@ init_xfs_fs(void)
1866 goto out_cleanup_procfs; 1883 goto out_cleanup_procfs;
1867 1884
1868 vfs_initquota(); 1885 vfs_initquota();
1886 xfs_inode_shrinker_init();
1869 1887
1870 error = register_filesystem(&xfs_fs_type); 1888 error = register_filesystem(&xfs_fs_type);
1871 if (error) 1889 if (error)
@@ -1893,6 +1911,7 @@ exit_xfs_fs(void)
1893{ 1911{
1894 vfs_exitquota(); 1912 vfs_exitquota();
1895 unregister_filesystem(&xfs_fs_type); 1913 unregister_filesystem(&xfs_fs_type);
1914 xfs_inode_shrinker_destroy();
1896 xfs_sysctl_unregister(); 1915 xfs_sysctl_unregister();
1897 xfs_cleanup_procfs(); 1916 xfs_cleanup_procfs();
1898 xfs_buf_terminate(); 1917 xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 233d4b9881b1..519618e9279e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
86 86
87extern const struct export_operations xfs_export_operations; 87extern const struct export_operations xfs_export_operations;
88extern struct xattr_handler *xfs_xattr_handlers[]; 88extern const struct xattr_handler *xfs_xattr_handlers[];
89extern const struct quotactl_ops xfs_quotactl_operations; 89extern const struct quotactl_ops xfs_quotactl_operations;
90 90
91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 05cd85317f6f..3884e20bc14e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -95,7 +95,8 @@ xfs_inode_ag_walk(
95 struct xfs_perag *pag, int flags), 95 struct xfs_perag *pag, int flags),
96 int flags, 96 int flags,
97 int tag, 97 int tag,
98 int exclusive) 98 int exclusive,
99 int *nr_to_scan)
99{ 100{
100 uint32_t first_index; 101 uint32_t first_index;
101 int last_error = 0; 102 int last_error = 0;
@@ -134,7 +135,7 @@ restart:
134 if (error == EFSCORRUPTED) 135 if (error == EFSCORRUPTED)
135 break; 136 break;
136 137
137 } while (1); 138 } while ((*nr_to_scan)--);
138 139
139 if (skipped) { 140 if (skipped) {
140 delay(1); 141 delay(1);
@@ -150,12 +151,15 @@ xfs_inode_ag_iterator(
150 struct xfs_perag *pag, int flags), 151 struct xfs_perag *pag, int flags),
151 int flags, 152 int flags,
152 int tag, 153 int tag,
153 int exclusive) 154 int exclusive,
155 int *nr_to_scan)
154{ 156{
155 int error = 0; 157 int error = 0;
156 int last_error = 0; 158 int last_error = 0;
157 xfs_agnumber_t ag; 159 xfs_agnumber_t ag;
160 int nr;
158 161
162 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
159 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 163 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
160 struct xfs_perag *pag; 164 struct xfs_perag *pag;
161 165
@@ -165,14 +169,18 @@ xfs_inode_ag_iterator(
165 continue; 169 continue;
166 } 170 }
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 171 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
168 exclusive); 172 exclusive, &nr);
169 xfs_perag_put(pag); 173 xfs_perag_put(pag);
170 if (error) { 174 if (error) {
171 last_error = error; 175 last_error = error;
172 if (error == EFSCORRUPTED) 176 if (error == EFSCORRUPTED)
173 break; 177 break;
174 } 178 }
179 if (nr <= 0)
180 break;
175 } 181 }
182 if (nr_to_scan)
183 *nr_to_scan = nr;
176 return XFS_ERROR(last_error); 184 return XFS_ERROR(last_error);
177} 185}
178 186
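Threading nr_to_scan through xfs_inode_ag_iterator() gives the new inode shrinker a scan budget: each allocation group consumes part of it and the walk stops once it is spent, while callers passing NULL get INT_MAX and keep the old unbounded behaviour. A compact model of the budgeted walk (standalone C; the per-inode work is a printf):

#include <stdio.h>
#include <limits.h>

/* Visit up to *nr items in one group, decrementing the shared budget. */
static void walk_group(int group, int items, int *nr)
{
	while (items-- > 0 && (*nr)-- > 0)
		printf("group %d: scanned one inode\n", group);
}

/* Walk all groups until the budget runs out, as the iterator now does. */
static void walk_all(int groups, int items_per_group, int *nr_to_scan)
{
	int nr = nr_to_scan ? *nr_to_scan : INT_MAX;

	for (int g = 0; g < groups; g++) {
		walk_group(g, items_per_group, &nr);
		if (nr <= 0)
			break;
	}
	if (nr_to_scan)
		*nr_to_scan = nr;
}

int main(void)
{
	int budget = 5;

	walk_all(4, 3, &budget);	/* stops after 5 of 12 inodes */
	return 0;
}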
@@ -291,7 +299,7 @@ xfs_sync_data(
291 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 299 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
292 300
293 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 301 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
294 XFS_ICI_NO_TAG, 0); 302 XFS_ICI_NO_TAG, 0, NULL);
295 if (error) 303 if (error)
296 return XFS_ERROR(error); 304 return XFS_ERROR(error);
297 305
@@ -310,7 +318,7 @@ xfs_sync_attr(
310 ASSERT((flags & ~SYNC_WAIT) == 0); 318 ASSERT((flags & ~SYNC_WAIT) == 0);
311 319
312 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 320 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
313 XFS_ICI_NO_TAG, 0); 321 XFS_ICI_NO_TAG, 0, NULL);
314} 322}
315 323
316STATIC int 324STATIC int
@@ -348,68 +356,23 @@ xfs_commit_dummy_trans(
348 356
349STATIC int 357STATIC int
350xfs_sync_fsdata( 358xfs_sync_fsdata(
351 struct xfs_mount *mp, 359 struct xfs_mount *mp)
352 int flags)
353{ 360{
354 struct xfs_buf *bp; 361 struct xfs_buf *bp;
355 struct xfs_buf_log_item *bip;
356 int error = 0;
357
358 /*
359 * If this is xfssyncd() then only sync the superblock if we can
360 * lock it without sleeping and it is not pinned.
361 */
362 if (flags & SYNC_TRYLOCK) {
363 ASSERT(!(flags & SYNC_WAIT));
364
365 bp = xfs_getsb(mp, XBF_TRYLOCK);
366 if (!bp)
367 goto out;
368
369 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
370 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
371 goto out_brelse;
372 } else {
373 bp = xfs_getsb(mp, 0);
374
375 /*
376 * If the buffer is pinned then push on the log so we won't
377 * get stuck waiting in the write for someone, maybe
378 * ourselves, to flush the log.
379 *
380 * Even though we just pushed the log above, we did not have
381 * the superblock buffer locked at that point so it can
382 * become pinned in between there and here.
383 */
384 if (XFS_BUF_ISPINNED(bp))
385 xfs_log_force(mp, 0);
386 }
387
388
389 if (flags & SYNC_WAIT)
390 XFS_BUF_UNASYNC(bp);
391 else
392 XFS_BUF_ASYNC(bp);
393
394 error = xfs_bwrite(mp, bp);
395 if (error)
396 return error;
397 362
398 /* 363 /*
399 * If this is a data integrity sync make sure all pending buffers 364 * If the buffer is pinned then push on the log so we won't get stuck
400 * are flushed out for the log coverage check below. 365 * waiting in the write for someone, maybe ourselves, to flush the log.
366 *
367 * Even though we just pushed the log above, we did not have the
368 * superblock buffer locked at that point so it can become pinned in
369 * between there and here.
401 */ 370 */
402 if (flags & SYNC_WAIT) 371 bp = xfs_getsb(mp, 0);
403 xfs_flush_buftarg(mp->m_ddev_targp, 1); 372 if (XFS_BUF_ISPINNED(bp))
404 373 xfs_log_force(mp, 0);
405 if (xfs_log_need_covered(mp))
406 error = xfs_commit_dummy_trans(mp, flags);
407 return error;
408 374
409 out_brelse: 375 return xfs_bwrite(mp, bp);
410 xfs_buf_relse(bp);
411 out:
412 return error;
413} 376}
414 377
415/* 378/*
@@ -433,7 +396,7 @@ int
433xfs_quiesce_data( 396xfs_quiesce_data(
434 struct xfs_mount *mp) 397 struct xfs_mount *mp)
435{ 398{
436 int error; 399 int error, error2 = 0;
437 400
438 /* push non-blocking */ 401 /* push non-blocking */
439 xfs_sync_data(mp, 0); 402 xfs_sync_data(mp, 0);
@@ -444,13 +407,20 @@ xfs_quiesce_data(
444 xfs_qm_sync(mp, SYNC_WAIT); 407 xfs_qm_sync(mp, SYNC_WAIT);
445 408
446 /* write superblock and hoover up shutdown errors */ 409 /* write superblock and hoover up shutdown errors */
447 error = xfs_sync_fsdata(mp, SYNC_WAIT); 410 error = xfs_sync_fsdata(mp);
411
412 /* make sure all delwri buffers are written out */
413 xfs_flush_buftarg(mp->m_ddev_targp, 1);
414
415 /* mark the log as covered if needed */
416 if (xfs_log_need_covered(mp))
417 error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
448 418
449 /* flush data-only devices */ 419 /* flush data-only devices */
450 if (mp->m_rtdev_targp) 420 if (mp->m_rtdev_targp)
451 XFS_bflush(mp->m_rtdev_targp); 421 XFS_bflush(mp->m_rtdev_targp);
452 422
453 return error; 423 return error ? error : error2;
454} 424}
455 425
456STATIC void 426STATIC void
@@ -573,9 +543,9 @@ xfs_flush_inodes(
573} 543}
574 544
575/* 545/*
576 * Every sync period we need to unpin all items, reclaim inodes, sync 546 * Every sync period we need to unpin all items, reclaim inodes and sync
577 * quota and write out the superblock. We might need to cover the log 547 * disk quotas. We might need to cover the log to indicate that the
578 * to indicate it is idle. 548 * filesystem is idle.
579 */ 549 */
580STATIC void 550STATIC void
581xfs_sync_worker( 551xfs_sync_worker(
@@ -589,7 +559,8 @@ xfs_sync_worker(
589 xfs_reclaim_inodes(mp, 0); 559 xfs_reclaim_inodes(mp, 0);
590 /* dgc: errors ignored here */ 560 /* dgc: errors ignored here */
591 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 561 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
592 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 562 if (xfs_log_need_covered(mp))
563 error = xfs_commit_dummy_trans(mp, 0);
593 } 564 }
594 mp->m_sync_seq++; 565 mp->m_sync_seq++;
595 wake_up(&mp->m_wait_single_sync_task); 566 wake_up(&mp->m_wait_single_sync_task);
@@ -652,7 +623,7 @@ xfs_syncd_init(
652 mp->m_sync_work.w_syncer = xfs_sync_worker; 623 mp->m_sync_work.w_syncer = xfs_sync_worker;
653 mp->m_sync_work.w_mount = mp; 624 mp->m_sync_work.w_mount = mp;
654 mp->m_sync_work.w_completion = NULL; 625 mp->m_sync_work.w_completion = NULL;
655 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); 626 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
656 if (IS_ERR(mp->m_sync_task)) 627 if (IS_ERR(mp->m_sync_task))
657 return -PTR_ERR(mp->m_sync_task); 628 return -PTR_ERR(mp->m_sync_task);
658 return 0; 629 return 0;
@@ -673,6 +644,7 @@ __xfs_inode_set_reclaim_tag(
673 radix_tree_tag_set(&pag->pag_ici_root, 644 radix_tree_tag_set(&pag->pag_ici_root,
674 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 645 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
675 XFS_ICI_RECLAIM_TAG); 646 XFS_ICI_RECLAIM_TAG);
647 pag->pag_ici_reclaimable++;
676} 648}
677 649
678/* 650/*
@@ -705,6 +677,7 @@ __xfs_inode_clear_reclaim_tag(
705{ 677{
706 radix_tree_tag_clear(&pag->pag_ici_root, 678 radix_tree_tag_clear(&pag->pag_ici_root,
707 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 679 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
680 pag->pag_ici_reclaimable--;
708} 681}
709 682
710/* 683/*
@@ -820,10 +793,10 @@ xfs_reclaim_inode(
820 * call into reclaim to find it in a clean state instead of waiting for 793 * call into reclaim to find it in a clean state instead of waiting for
821 * it now. We also don't return errors here - if the error is transient 794 * it now. We also don't return errors here - if the error is transient
822 * then the next reclaim pass will flush the inode, and if the error 795 * then the next reclaim pass will flush the inode, and if the error
823 * is permanent then the next sync reclaim will relcaim the inode and 796 * is permanent then the next sync reclaim will reclaim the inode and
824 * pass on the error. 797 * pass on the error.
825 */ 798 */
826 if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 799 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
827 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 800 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
828 "inode 0x%llx background reclaim flush failed with %d", 801 "inode 0x%llx background reclaim flush failed with %d",
829 (long long)ip->i_ino, error); 802 (long long)ip->i_ino, error);
@@ -854,5 +827,93 @@ xfs_reclaim_inodes(
854 int mode) 827 int mode)
855{ 828{
856 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, 829 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
857 XFS_ICI_RECLAIM_TAG, 1); 830 XFS_ICI_RECLAIM_TAG, 1, NULL);
831}
832
833/*
834 * Shrinker infrastructure.
835 *
836 * This is all far more complex than it needs to be. It adds a global list of
837 * mounts because the shrinkers can only call a global context. We need to make
838 * the shrinkers pass a context to avoid the need for global state.
839 */
840static LIST_HEAD(xfs_mount_list);
841static struct rw_semaphore xfs_mount_list_lock;
842
843static int
844xfs_reclaim_inode_shrink(
845 int nr_to_scan,
846 gfp_t gfp_mask)
847{
848 struct xfs_mount *mp;
849 struct xfs_perag *pag;
850 xfs_agnumber_t ag;
851 int reclaimable = 0;
852
853 if (nr_to_scan) {
854 if (!(gfp_mask & __GFP_FS))
855 return -1;
856
857 down_read(&xfs_mount_list_lock);
858 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
859 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
860 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
861 if (nr_to_scan <= 0)
862 break;
863 }
864 up_read(&xfs_mount_list_lock);
865 }
866
867 down_read(&xfs_mount_list_lock);
868 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
869 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
870
871 pag = xfs_perag_get(mp, ag);
872 if (!pag->pag_ici_init) {
873 xfs_perag_put(pag);
874 continue;
875 }
876 reclaimable += pag->pag_ici_reclaimable;
877 xfs_perag_put(pag);
878 }
879 }
880 up_read(&xfs_mount_list_lock);
881 return reclaimable;
882}
883
884static struct shrinker xfs_inode_shrinker = {
885 .shrink = xfs_reclaim_inode_shrink,
886 .seeks = DEFAULT_SEEKS,
887};
888
889void __init
890xfs_inode_shrinker_init(void)
891{
892 init_rwsem(&xfs_mount_list_lock);
893 register_shrinker(&xfs_inode_shrinker);
894}
895
896void
897xfs_inode_shrinker_destroy(void)
898{
899 ASSERT(list_empty(&xfs_mount_list));
900 unregister_shrinker(&xfs_inode_shrinker);
901}
902
903void
904xfs_inode_shrinker_register(
905 struct xfs_mount *mp)
906{
907 down_write(&xfs_mount_list_lock);
908 list_add_tail(&mp->m_mplist, &xfs_mount_list);
909 up_write(&xfs_mount_list_lock);
910}
911
912void
913xfs_inode_shrinker_unregister(
914 struct xfs_mount *mp)
915{
916 down_write(&xfs_mount_list_lock);
917 list_del(&mp->m_mplist);
918 up_write(&xfs_mount_list_lock);
858} 919}
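The xfs_sync.c changes above thread an optional scan budget through the per-AG
inode walk: callers pass NULL for an unbounded walk, or a pointer whose value
is decremented as items are visited and written back with the unused remainder
so the shrinker can stop early. A minimal standalone sketch of that
budget-threading pattern, with illustrative names only (this is not the kernel
API), and with the diff's post-decrement quirk simplified away:

#include <limits.h>
#include <stdio.h>

/* Walk one group, consuming the shared budget as items are visited. */
static void walk_group(int group, int *budget)
{
	int item;

	(void)group;
	for (item = 0; item < 4 && *budget > 0; item++)
		(*budget)--;		/* "process" one cached object */
}

/* Iterate all groups; a NULL budget means "scan everything". */
static void walk_all(int ngroups, int *nr_to_scan)
{
	int nr = nr_to_scan ? *nr_to_scan : INT_MAX;
	int g;

	for (g = 0; g < ngroups; g++) {
		walk_group(g, &nr);
		if (nr <= 0)
			break;		/* budget exhausted: stop early */
	}
	if (nr_to_scan)
		*nr_to_scan = nr;	/* report the unused remainder */
}

int main(void)
{
	int budget = 10;

	walk_all(8, &budget);		/* bounded, as the shrinker calls it */
	walk_all(8, NULL);		/* unbounded, as sync/reclaim call it */
	printf("budget left: %d\n", budget);
	return 0;
}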
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index d480c346cabb..cdcbaaca9880 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -53,6 +53,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
54int xfs_inode_ag_iterator(struct xfs_mount *mp, 54int xfs_inode_ag_iterator(struct xfs_mount *mp,
55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
56 int flags, int tag, int write_lock); 56 int flags, int tag, int write_lock, int *nr_to_scan);
57
58void xfs_inode_shrinker_init(void);
59void xfs_inode_shrinker_destroy(void);
60void xfs_inode_shrinker_register(struct xfs_mount *mp);
61void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
57 62
58#endif 63#endif
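The new xfs_inode_shrinker_* hooks plug into the shrinker interface of this
kernel generation: a callback taking (nr_to_scan, gfp_mask) that frees up to
nr_to_scan cached objects and returns how many remain reclaimable, or -1 when
the allocation context cannot recurse into filesystem code. A compressed
sketch of that contract; the callback body is illustrative, only the structure
mirrors the code above:

#include <linux/module.h>
#include <linux/mm.h>

static int demo_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan) {
		/* Asked to reclaim: refuse if the caller must not
		 * re-enter filesystem code. */
		if (!(gfp_mask & __GFP_FS))
			return -1;
		/* ... free up to nr_to_scan cached objects here ... */
	}
	/* Always finish by reporting how many objects remain. */
	return 0;
}

static struct shrinker demo_shrinker = {
	.shrink	= demo_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* Paired at init/teardown, as xfs_inode_shrinker_{init,destroy} are. */
static int __init demo_init(void)
{
	register_shrinker(&demo_shrinker);
	return 0;
}
static void __exit demo_exit(void)
{
	unregister_shrinker(&demo_shrinker);
}
module_init(demo_init);
module_exit(demo_exit);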
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 5a107601e969..207fa77f63ae 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -41,7 +41,6 @@
41#include "xfs_alloc.h" 41#include "xfs_alloc.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include "xfs_attr.h" 43#include "xfs_attr.h"
44#include "xfs_attr_sf.h"
45#include "xfs_attr_leaf.h" 44#include "xfs_attr_leaf.h"
46#include "xfs_log_priv.h" 45#include "xfs_log_priv.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
@@ -50,6 +49,9 @@
50#include "xfs_aops.h" 49#include "xfs_aops.h"
51#include "quota/xfs_dquot_item.h" 50#include "quota/xfs_dquot_item.h"
52#include "quota/xfs_dquot.h" 51#include "quota/xfs_dquot.h"
52#include "xfs_log_recover.h"
53#include "xfs_buf_item.h"
54#include "xfs_inode_item.h"
53 55
54/* 56/*
55 * We include this last to have the helpers above available for the trace 57 * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index fcaa62f0799e..ff6bc797baf2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -32,6 +32,10 @@ struct xfs_da_node_entry;
32struct xfs_dquot; 32struct xfs_dquot;
33struct xlog_ticket; 33struct xlog_ticket;
34struct log; 34struct log;
35struct xlog_recover;
36struct xlog_recover_item;
37struct xfs_buf_log_format;
38struct xfs_inode_log_format;
35 39
36DECLARE_EVENT_CLASS(xfs_attr_list_class, 40DECLARE_EVENT_CLASS(xfs_attr_list_class,
37 TP_PROTO(struct xfs_attr_list_context *ctx), 41 TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -562,18 +566,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
562 __field(dev_t, dev) 566 __field(dev_t, dev)
563 __field(xfs_ino_t, ino) 567 __field(xfs_ino_t, ino)
564 __field(int, count) 568 __field(int, count)
569 __field(int, pincount)
565 __field(unsigned long, caller_ip) 570 __field(unsigned long, caller_ip)
566 ), 571 ),
567 TP_fast_assign( 572 TP_fast_assign(
568 __entry->dev = VFS_I(ip)->i_sb->s_dev; 573 __entry->dev = VFS_I(ip)->i_sb->s_dev;
569 __entry->ino = ip->i_ino; 574 __entry->ino = ip->i_ino;
570 __entry->count = atomic_read(&VFS_I(ip)->i_count); 575 __entry->count = atomic_read(&VFS_I(ip)->i_count);
576 __entry->pincount = atomic_read(&ip->i_pincount);
571 __entry->caller_ip = caller_ip; 577 __entry->caller_ip = caller_ip;
572 ), 578 ),
573 TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", 579 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
574 MAJOR(__entry->dev), MINOR(__entry->dev), 580 MAJOR(__entry->dev), MINOR(__entry->dev),
575 __entry->ino, 581 __entry->ino,
576 __entry->count, 582 __entry->count,
583 __entry->pincount,
577 (char *)__entry->caller_ip) 584 (char *)__entry->caller_ip)
578) 585)
579 586
@@ -583,6 +590,10 @@ DEFINE_EVENT(xfs_inode_class, name, \
583 TP_ARGS(ip, caller_ip)) 590 TP_ARGS(ip, caller_ip))
584DEFINE_INODE_EVENT(xfs_ihold); 591DEFINE_INODE_EVENT(xfs_ihold);
585DEFINE_INODE_EVENT(xfs_irele); 592DEFINE_INODE_EVENT(xfs_irele);
593DEFINE_INODE_EVENT(xfs_inode_pin);
594DEFINE_INODE_EVENT(xfs_inode_unpin);
595DEFINE_INODE_EVENT(xfs_inode_unpin_nowait);
596
586/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ 597/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
587DEFINE_INODE_EVENT(xfs_inode); 598DEFINE_INODE_EVENT(xfs_inode);
588#define xfs_itrace_entry(ip) \ 599#define xfs_itrace_entry(ip) \
@@ -642,8 +653,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
642 TP_PROTO(struct xfs_dquot *dqp), \ 653 TP_PROTO(struct xfs_dquot *dqp), \
643 TP_ARGS(dqp)) 654 TP_ARGS(dqp))
644DEFINE_DQUOT_EVENT(xfs_dqadjust); 655DEFINE_DQUOT_EVENT(xfs_dqadjust);
645DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
646DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
647DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); 656DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
648DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); 657DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
649DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); 658DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
@@ -658,7 +667,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail);
658DEFINE_DQUOT_EVENT(xfs_dqlookup_found); 667DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
659DEFINE_DQUOT_EVENT(xfs_dqlookup_want); 668DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
660DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); 669DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
661DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
662DEFINE_DQUOT_EVENT(xfs_dqlookup_done); 670DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
663DEFINE_DQUOT_EVENT(xfs_dqget_hit); 671DEFINE_DQUOT_EVENT(xfs_dqget_hit);
664DEFINE_DQUOT_EVENT(xfs_dqget_miss); 672DEFINE_DQUOT_EVENT(xfs_dqget_miss);
@@ -1051,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
1051 1059
1052); 1060);
1053 1061
1062#define XFS_BUSY_SYNC \
1063 { 0, "async" }, \
1064 { 1, "sync" }
1065
1054TRACE_EVENT(xfs_alloc_busy, 1066TRACE_EVENT(xfs_alloc_busy,
1055 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1067 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1056 xfs_extlen_t len, int slot), 1068 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1057 TP_ARGS(mp, agno, agbno, len, slot), 1069 TP_ARGS(trans, agno, agbno, len, sync),
1058 TP_STRUCT__entry( 1070 TP_STRUCT__entry(
1059 __field(dev_t, dev) 1071 __field(dev_t, dev)
1072 __field(struct xfs_trans *, tp)
1073 __field(int, tid)
1060 __field(xfs_agnumber_t, agno) 1074 __field(xfs_agnumber_t, agno)
1061 __field(xfs_agblock_t, agbno) 1075 __field(xfs_agblock_t, agbno)
1062 __field(xfs_extlen_t, len) 1076 __field(xfs_extlen_t, len)
1063 __field(int, slot) 1077 __field(int, sync)
1064 ), 1078 ),
1065 TP_fast_assign( 1079 TP_fast_assign(
1066 __entry->dev = mp->m_super->s_dev; 1080 __entry->dev = trans->t_mountp->m_super->s_dev;
1081 __entry->tp = trans;
1082 __entry->tid = trans->t_ticket->t_tid;
1067 __entry->agno = agno; 1083 __entry->agno = agno;
1068 __entry->agbno = agbno; 1084 __entry->agbno = agbno;
1069 __entry->len = len; 1085 __entry->len = len;
1070 __entry->slot = slot; 1086 __entry->sync = sync;
1071 ), 1087 ),
1072 TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", 1088 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1073 MAJOR(__entry->dev), MINOR(__entry->dev), 1089 MAJOR(__entry->dev), MINOR(__entry->dev),
1090 __entry->tp,
1091 __entry->tid,
1074 __entry->agno, 1092 __entry->agno,
1075 __entry->agbno, 1093 __entry->agbno,
1076 __entry->len, 1094 __entry->len,
1077 __entry->slot) 1095 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1078 1096
1079); 1097);
1080 1098
1081#define XFS_BUSY_STATES \
1082 { 0, "found" }, \
1083 { 1, "missing" }
1084
1085TRACE_EVENT(xfs_alloc_unbusy, 1099TRACE_EVENT(xfs_alloc_unbusy,
1086 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1100 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1087 int slot, int found), 1101 xfs_agblock_t agbno, xfs_extlen_t len),
1088 TP_ARGS(mp, agno, slot, found), 1102 TP_ARGS(mp, agno, agbno, len),
1089 TP_STRUCT__entry( 1103 TP_STRUCT__entry(
1090 __field(dev_t, dev) 1104 __field(dev_t, dev)
1091 __field(xfs_agnumber_t, agno) 1105 __field(xfs_agnumber_t, agno)
1092 __field(int, slot) 1106 __field(xfs_agblock_t, agbno)
1093 __field(int, found) 1107 __field(xfs_extlen_t, len)
1094 ), 1108 ),
1095 TP_fast_assign( 1109 TP_fast_assign(
1096 __entry->dev = mp->m_super->s_dev; 1110 __entry->dev = mp->m_super->s_dev;
1097 __entry->agno = agno; 1111 __entry->agno = agno;
1098 __entry->slot = slot; 1112 __entry->agbno = agbno;
1099 __entry->found = found; 1113 __entry->len = len;
1100 ), 1114 ),
1101 TP_printk("dev %d:%d agno %u slot %d %s", 1115 TP_printk("dev %d:%d agno %u agbno %u len %u",
1102 MAJOR(__entry->dev), MINOR(__entry->dev), 1116 MAJOR(__entry->dev), MINOR(__entry->dev),
1103 __entry->agno, 1117 __entry->agno,
1104 __entry->slot, 1118 __entry->agbno,
1105 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1119 __entry->len)
1106); 1120);
1107 1121
1122#define XFS_BUSY_STATES \
1123 { 0, "missing" }, \
1124 { 1, "found" }
1125
1108TRACE_EVENT(xfs_alloc_busysearch, 1126TRACE_EVENT(xfs_alloc_busysearch,
1109 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1127 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1110 xfs_extlen_t len, xfs_lsn_t lsn), 1128 xfs_agblock_t agbno, xfs_extlen_t len, int found),
1111 TP_ARGS(mp, agno, agbno, len, lsn), 1129 TP_ARGS(mp, agno, agbno, len, found),
1112 TP_STRUCT__entry( 1130 TP_STRUCT__entry(
1113 __field(dev_t, dev) 1131 __field(dev_t, dev)
1114 __field(xfs_agnumber_t, agno) 1132 __field(xfs_agnumber_t, agno)
1115 __field(xfs_agblock_t, agbno) 1133 __field(xfs_agblock_t, agbno)
1116 __field(xfs_extlen_t, len) 1134 __field(xfs_extlen_t, len)
1117 __field(xfs_lsn_t, lsn) 1135 __field(int, found)
1118 ), 1136 ),
1119 TP_fast_assign( 1137 TP_fast_assign(
1120 __entry->dev = mp->m_super->s_dev; 1138 __entry->dev = mp->m_super->s_dev;
1121 __entry->agno = agno; 1139 __entry->agno = agno;
1122 __entry->agbno = agbno; 1140 __entry->agbno = agbno;
1123 __entry->len = len; 1141 __entry->len = len;
1124 __entry->lsn = lsn; 1142 __entry->found = found;
1125 ), 1143 ),
1126 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", 1144 TP_printk("dev %d:%d agno %u agbno %u len %u %s",
1127 MAJOR(__entry->dev), MINOR(__entry->dev), 1145 MAJOR(__entry->dev), MINOR(__entry->dev),
1128 __entry->agno, 1146 __entry->agno,
1129 __entry->agbno, 1147 __entry->agbno,
1130 __entry->len, 1148 __entry->len,
1149 __print_symbolic(__entry->found, XFS_BUSY_STATES))
1150);
1151
1152TRACE_EVENT(xfs_trans_commit_lsn,
1153 TP_PROTO(struct xfs_trans *trans),
1154 TP_ARGS(trans),
1155 TP_STRUCT__entry(
1156 __field(dev_t, dev)
1157 __field(struct xfs_trans *, tp)
1158 __field(xfs_lsn_t, lsn)
1159 ),
1160 TP_fast_assign(
1161 __entry->dev = trans->t_mountp->m_super->s_dev;
1162 __entry->tp = trans;
1163 __entry->lsn = trans->t_commit_lsn;
1164 ),
1165 TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
1166 MAJOR(__entry->dev), MINOR(__entry->dev),
1167 __entry->tp,
1131 __entry->lsn) 1168 __entry->lsn)
1132); 1169);
1133 1170
@@ -1495,6 +1532,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); 1532DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); 1533DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497 1534
1535DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
1536 TP_PROTO(struct log *log, struct xlog_recover *trans,
1537 struct xlog_recover_item *item, int pass),
1538 TP_ARGS(log, trans, item, pass),
1539 TP_STRUCT__entry(
1540 __field(dev_t, dev)
1541 __field(unsigned long, item)
1542 __field(xlog_tid_t, tid)
1543 __field(int, type)
1544 __field(int, pass)
1545 __field(int, count)
1546 __field(int, total)
1547 ),
1548 TP_fast_assign(
1549 __entry->dev = log->l_mp->m_super->s_dev;
1550 __entry->item = (unsigned long)item;
1551 __entry->tid = trans->r_log_tid;
1552 __entry->type = ITEM_TYPE(item);
1553 __entry->pass = pass;
1554 __entry->count = item->ri_cnt;
1555 __entry->total = item->ri_total;
1556 ),
1557 TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
1558 "item region count/total %d/%d",
1559 MAJOR(__entry->dev), MINOR(__entry->dev),
1560 __entry->tid,
1561 __entry->pass,
1562 (void *)__entry->item,
1563 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
1564 __entry->count,
1565 __entry->total)
1566)
1567
1568#define DEFINE_LOG_RECOVER_ITEM(name) \
1569DEFINE_EVENT(xfs_log_recover_item_class, name, \
1570 TP_PROTO(struct log *log, struct xlog_recover *trans, \
1571 struct xlog_recover_item *item, int pass), \
1572 TP_ARGS(log, trans, item, pass))
1573
1574DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
1575DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
1576DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
1577DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
1578DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
1579
1580DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
1581 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
1582 TP_ARGS(log, buf_f),
1583 TP_STRUCT__entry(
1584 __field(dev_t, dev)
1585 __field(__int64_t, blkno)
1586 __field(unsigned short, len)
1587 __field(unsigned short, flags)
1588 __field(unsigned short, size)
1589 __field(unsigned int, map_size)
1590 ),
1591 TP_fast_assign(
1592 __entry->dev = log->l_mp->m_super->s_dev;
1593 __entry->blkno = buf_f->blf_blkno;
1594 __entry->len = buf_f->blf_len;
1595 __entry->flags = buf_f->blf_flags;
1596 __entry->size = buf_f->blf_size;
1597 __entry->map_size = buf_f->blf_map_size;
1598 ),
1599 TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
1600 "map_size %d",
1601 MAJOR(__entry->dev), MINOR(__entry->dev),
1602 __entry->blkno,
1603 __entry->len,
1604 __entry->flags,
1605 __entry->size,
1606 __entry->map_size)
1607)
1608
1609#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
1610DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
1611 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
1612 TP_ARGS(log, buf_f))
1613
1614DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
1615DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
1616DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
1617DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
1618DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
1619DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
1620DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
1621DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
1622
1623DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
1624 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
1625 TP_ARGS(log, in_f),
1626 TP_STRUCT__entry(
1627 __field(dev_t, dev)
1628 __field(xfs_ino_t, ino)
1629 __field(unsigned short, size)
1630 __field(int, fields)
1631 __field(unsigned short, asize)
1632 __field(unsigned short, dsize)
1633 __field(__int64_t, blkno)
1634 __field(int, len)
1635 __field(int, boffset)
1636 ),
1637 TP_fast_assign(
1638 __entry->dev = log->l_mp->m_super->s_dev;
1639 __entry->ino = in_f->ilf_ino;
1640 __entry->size = in_f->ilf_size;
1641 __entry->fields = in_f->ilf_fields;
1642 __entry->asize = in_f->ilf_asize;
1643 __entry->dsize = in_f->ilf_dsize;
1644 __entry->blkno = in_f->ilf_blkno;
1645 __entry->len = in_f->ilf_len;
1646 __entry->boffset = in_f->ilf_boffset;
1647 ),
1648 TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
1649 "dsize %d, blkno 0x%llx, len %d, boffset %d",
1650 MAJOR(__entry->dev), MINOR(__entry->dev),
1651 __entry->ino,
1652 __entry->size,
1653 __entry->fields,
1654 __entry->asize,
1655 __entry->dsize,
1656 __entry->blkno,
1657 __entry->len,
1658 __entry->boffset)
1659)
1660#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
1661DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
1662 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
1663 TP_ARGS(log, in_f))
1664
1665DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1666DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1667DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1668
1498#endif /* _TRACE_XFS_H */ 1669#endif /* _TRACE_XFS_H */
1499 1670
1500#undef TRACE_INCLUDE_PATH 1671#undef TRACE_INCLUDE_PATH
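The new trace entries above all use the ftrace event-class macros:
DECLARE_EVENT_CLASS fixes the record layout, the assignment logic and the
format string once, and each DEFINE_EVENT stamps out a named tracepoint that
reuses them. A minimal sketch of the pattern with made-up event names,
omitting the TRACE_SYSTEM/include boilerplate a real trace header needs; the
macros themselves are the standard ones used in the diff:

DECLARE_EVENT_CLASS(demo_lookup_class,
	TP_PROTO(int id, int state),
	TP_ARGS(id, state),
	TP_STRUCT__entry(
		__field(int, id)
		__field(int, state)
	),
	TP_fast_assign(
		__entry->id = id;
		__entry->state = state;
	),
	TP_printk("id %d %s", __entry->id,
		  __print_symbolic(__entry->state, { 0, "missing" },
						   { 1, "found" }))
)

#define DEFINE_DEMO_LOOKUP_EVENT(name) \
DEFINE_EVENT(demo_lookup_class, name, \
	TP_PROTO(int id, int state), \
	TP_ARGS(id, state))

DEFINE_DEMO_LOOKUP_EVENT(demo_lookup_hit);
DEFINE_DEMO_LOOKUP_EVENT(demo_lookup_miss);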
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index fa01b9daba6b..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
72 (void *)value, size, xflags); 72 (void *)value, size, xflags);
73} 73}
74 74
75static struct xattr_handler xfs_xattr_user_handler = { 75static const struct xattr_handler xfs_xattr_user_handler = {
76 .prefix = XATTR_USER_PREFIX, 76 .prefix = XATTR_USER_PREFIX,
77 .flags = 0, /* no flags implies user namespace */ 77 .flags = 0, /* no flags implies user namespace */
78 .get = xfs_xattr_get, 78 .get = xfs_xattr_get,
79 .set = xfs_xattr_set, 79 .set = xfs_xattr_set,
80}; 80};
81 81
82static struct xattr_handler xfs_xattr_trusted_handler = { 82static const struct xattr_handler xfs_xattr_trusted_handler = {
83 .prefix = XATTR_TRUSTED_PREFIX, 83 .prefix = XATTR_TRUSTED_PREFIX,
84 .flags = ATTR_ROOT, 84 .flags = ATTR_ROOT,
85 .get = xfs_xattr_get, 85 .get = xfs_xattr_get,
86 .set = xfs_xattr_set, 86 .set = xfs_xattr_set,
87}; 87};
88 88
89static struct xattr_handler xfs_xattr_security_handler = { 89static const struct xattr_handler xfs_xattr_security_handler = {
90 .prefix = XATTR_SECURITY_PREFIX, 90 .prefix = XATTR_SECURITY_PREFIX,
91 .flags = ATTR_SECURE, 91 .flags = ATTR_SECURE,
92 .get = xfs_xattr_get, 92 .get = xfs_xattr_get,
93 .set = xfs_xattr_set, 93 .set = xfs_xattr_set,
94}; 94};
95 95
96struct xattr_handler *xfs_xattr_handlers[] = { 96const struct xattr_handler *xfs_xattr_handlers[] = {
97 &xfs_xattr_user_handler, 97 &xfs_xattr_user_handler,
98 &xfs_xattr_trusted_handler, 98 &xfs_xattr_trusted_handler,
99 &xfs_xattr_security_handler, 99 &xfs_xattr_security_handler,
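The xfs_xattr.c change is a pure constification: the handler objects and the
table of pointers to them both become const, letting them live in read-only
data. A generic sketch of the same shape with hypothetical names (not the real
xattr_handler API); the NULL-terminated table mirrors how such handler arrays
are conventionally ended:

#include <stddef.h>

struct demo_handler {
	const char *prefix;
	int (*get)(const char *name, void *buf, int size);
};

static int demo_get(const char *name, void *buf, int size)
{
	return 0;	/* illustrative stub */
}

static const struct demo_handler demo_user_handler = {
	.prefix	= "user.",
	.get	= demo_get,
};

/* Both the array and the objects it points at are const. */
static const struct demo_handler *demo_handlers[] = {
	&demo_user_handler,
	NULL,		/* sentinel */
};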
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 5f79dd78626b..585e7633dfc7 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
101 * No need to re-initialize these if this is a reclaimed dquot. 101 * No need to re-initialize these if this is a reclaimed dquot.
102 */ 102 */
103 if (brandnewdquot) { 103 if (brandnewdquot) {
104 dqp->dq_flnext = dqp->dq_flprev = dqp; 104 INIT_LIST_HEAD(&dqp->q_freelist);
105 mutex_init(&dqp->q_qlock); 105 mutex_init(&dqp->q_qlock);
106 init_waitqueue_head(&dqp->q_pinwait); 106 init_waitqueue_head(&dqp->q_pinwait);
107 107
@@ -119,20 +119,20 @@ xfs_qm_dqinit(
119 * Only the q_core portion was zeroed in dqreclaim_one(). 119 * Only the q_core portion was zeroed in dqreclaim_one().
120 * So, we need to reset others. 120 * So, we need to reset others.
121 */ 121 */
122 dqp->q_nrefs = 0; 122 dqp->q_nrefs = 0;
123 dqp->q_blkno = 0; 123 dqp->q_blkno = 0;
124 dqp->MPL_NEXT = dqp->HL_NEXT = NULL; 124 INIT_LIST_HEAD(&dqp->q_mplist);
125 dqp->HL_PREVP = dqp->MPL_PREVP = NULL; 125 INIT_LIST_HEAD(&dqp->q_hashlist);
126 dqp->q_bufoffset = 0; 126 dqp->q_bufoffset = 0;
127 dqp->q_fileoffset = 0; 127 dqp->q_fileoffset = 0;
128 dqp->q_transp = NULL; 128 dqp->q_transp = NULL;
129 dqp->q_gdquot = NULL; 129 dqp->q_gdquot = NULL;
130 dqp->q_res_bcount = 0; 130 dqp->q_res_bcount = 0;
131 dqp->q_res_icount = 0; 131 dqp->q_res_icount = 0;
132 dqp->q_res_rtbcount = 0; 132 dqp->q_res_rtbcount = 0;
133 atomic_set(&dqp->q_pincount, 0); 133 atomic_set(&dqp->q_pincount, 0);
134 dqp->q_hash = NULL; 134 dqp->q_hash = NULL;
135 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 135 ASSERT(list_empty(&dqp->q_freelist));
136 136
137 trace_xfs_dqreuse(dqp); 137 trace_xfs_dqreuse(dqp);
138 } 138 }
@@ -158,7 +158,7 @@ void
158xfs_qm_dqdestroy( 158xfs_qm_dqdestroy(
159 xfs_dquot_t *dqp) 159 xfs_dquot_t *dqp)
160{ 160{
161 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); 161 ASSERT(list_empty(&dqp->q_freelist));
162 162
163 mutex_destroy(&dqp->q_qlock); 163 mutex_destroy(&dqp->q_qlock);
164 sv_destroy(&dqp->q_pinwait); 164 sv_destroy(&dqp->q_pinwait);
@@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers(
252 (be64_to_cpu(d->d_bcount) >= 252 (be64_to_cpu(d->d_bcount) >=
253 be64_to_cpu(d->d_blk_hardlimit)))) { 253 be64_to_cpu(d->d_blk_hardlimit)))) {
254 d->d_btimer = cpu_to_be32(get_seconds() + 254 d->d_btimer = cpu_to_be32(get_seconds() +
255 XFS_QI_BTIMELIMIT(mp)); 255 mp->m_quotainfo->qi_btimelimit);
256 } else { 256 } else {
257 d->d_bwarns = 0; 257 d->d_bwarns = 0;
258 } 258 }
@@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers(
275 (be64_to_cpu(d->d_icount) >= 275 (be64_to_cpu(d->d_icount) >=
276 be64_to_cpu(d->d_ino_hardlimit)))) { 276 be64_to_cpu(d->d_ino_hardlimit)))) {
277 d->d_itimer = cpu_to_be32(get_seconds() + 277 d->d_itimer = cpu_to_be32(get_seconds() +
278 XFS_QI_ITIMELIMIT(mp)); 278 mp->m_quotainfo->qi_itimelimit);
279 } else { 279 } else {
280 d->d_iwarns = 0; 280 d->d_iwarns = 0;
281 } 281 }
@@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers(
298 (be64_to_cpu(d->d_rtbcount) >= 298 (be64_to_cpu(d->d_rtbcount) >=
299 be64_to_cpu(d->d_rtb_hardlimit)))) { 299 be64_to_cpu(d->d_rtb_hardlimit)))) {
300 d->d_rtbtimer = cpu_to_be32(get_seconds() + 300 d->d_rtbtimer = cpu_to_be32(get_seconds() +
301 XFS_QI_RTBTIMELIMIT(mp)); 301 mp->m_quotainfo->qi_rtbtimelimit);
302 } else { 302 } else {
303 d->d_rtbwarns = 0; 303 d->d_rtbwarns = 0;
304 } 304 }
@@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk(
325 uint type, 325 uint type,
326 xfs_buf_t *bp) 326 xfs_buf_t *bp)
327{ 327{
328 struct xfs_quotainfo *q = mp->m_quotainfo;
328 xfs_dqblk_t *d; 329 xfs_dqblk_t *d;
329 int curid, i; 330 int curid, i;
330 331
@@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk(
337 /* 338 /*
338 * ID of the first dquot in the block - id's are zero based. 339 * ID of the first dquot in the block - id's are zero based.
339 */ 340 */
340 curid = id - (id % XFS_QM_DQPERBLK(mp)); 341 curid = id - (id % q->qi_dqperchunk);
341 ASSERT(curid >= 0); 342 ASSERT(curid >= 0);
342 memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); 343 memset(d, 0, BBTOB(q->qi_dqchunklen));
343 for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) 344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
344 xfs_qm_dqinit_core(curid, type, d); 345 xfs_qm_dqinit_core(curid, type, d);
345 xfs_trans_dquot_buf(tp, bp, 346 xfs_trans_dquot_buf(tp, bp,
346 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 347 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
347 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 348 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
348 XFS_BLI_GDQUOT_BUF))); 349 XFS_BLF_GDQUOT_BUF)));
349 xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); 350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
350} 351}
351 352
352 353
@@ -419,7 +420,7 @@ xfs_qm_dqalloc(
419 /* now we can just get the buffer (there's nothing to read yet) */ 420 /* now we can just get the buffer (there's nothing to read yet) */
420 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, 421 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
421 dqp->q_blkno, 422 dqp->q_blkno,
422 XFS_QI_DQCHUNKLEN(mp), 423 mp->m_quotainfo->qi_dqchunklen,
423 0); 424 0);
424 if (!bp || (error = XFS_BUF_GETERROR(bp))) 425 if (!bp || (error = XFS_BUF_GETERROR(bp)))
425 goto error1; 426 goto error1;
@@ -500,7 +501,8 @@ xfs_qm_dqtobp(
500 */ 501 */
501 if (dqp->q_blkno == (xfs_daddr_t) 0) { 502 if (dqp->q_blkno == (xfs_daddr_t) 0) {
502 /* We use the id as an index */ 503 /* We use the id as an index */
503 dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); 504 dqp->q_fileoffset = (xfs_fileoff_t)id /
505 mp->m_quotainfo->qi_dqperchunk;
504 nmaps = 1; 506 nmaps = 1;
505 quotip = XFS_DQ_TO_QIP(dqp); 507 quotip = XFS_DQ_TO_QIP(dqp);
506 xfs_ilock(quotip, XFS_ILOCK_SHARED); 508 xfs_ilock(quotip, XFS_ILOCK_SHARED);
@@ -529,7 +531,7 @@ xfs_qm_dqtobp(
529 /* 531 /*
530 * offset of dquot in the (fixed sized) dquot chunk. 532 * offset of dquot in the (fixed sized) dquot chunk.
531 */ 533 */
532 dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * 534 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
533 sizeof(xfs_dqblk_t); 535 sizeof(xfs_dqblk_t);
534 if (map.br_startblock == HOLESTARTBLOCK) { 536 if (map.br_startblock == HOLESTARTBLOCK) {
535 /* 537 /*
@@ -559,15 +561,13 @@ xfs_qm_dqtobp(
559 * Read in the buffer, unless we've just done the allocation 561 * Read in the buffer, unless we've just done the allocation
560 * (in which case we already have the buf). 562 * (in which case we already have the buf).
561 */ 563 */
562 if (! newdquot) { 564 if (!newdquot) {
563 trace_xfs_dqtobp_read(dqp); 565 trace_xfs_dqtobp_read(dqp);
564 566
565 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 567 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
566 dqp->q_blkno, 568 dqp->q_blkno,
567 XFS_QI_DQCHUNKLEN(mp), 569 mp->m_quotainfo->qi_dqchunklen,
568 0, &bp))) { 570 0, &bp);
569 return (error);
570 }
571 if (error || !bp) 571 if (error || !bp)
572 return XFS_ERROR(error); 572 return XFS_ERROR(error);
573 } 573 }
@@ -689,14 +689,14 @@ xfs_qm_idtodq(
689 tp = NULL; 689 tp = NULL;
690 if (flags & XFS_QMOPT_DQALLOC) { 690 if (flags & XFS_QMOPT_DQALLOC) {
691 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 691 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
692 if ((error = xfs_trans_reserve(tp, 692 error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
693 XFS_QM_DQALLOC_SPACE_RES(mp), 693 XFS_WRITE_LOG_RES(mp) +
694 XFS_WRITE_LOG_RES(mp) + 694 BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
695 BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + 695 128,
696 128, 696 0,
697 0, 697 XFS_TRANS_PERM_LOG_RES,
698 XFS_TRANS_PERM_LOG_RES, 698 XFS_WRITE_LOG_COUNT);
699 XFS_WRITE_LOG_COUNT))) { 699 if (error) {
700 cancelflags = 0; 700 cancelflags = 0;
701 goto error0; 701 goto error0;
702 } 702 }
@@ -751,7 +751,6 @@ xfs_qm_dqlookup(
751{ 751{
752 xfs_dquot_t *dqp; 752 xfs_dquot_t *dqp;
753 uint flist_locked; 753 uint flist_locked;
754 xfs_dquot_t *d;
755 754
756 ASSERT(mutex_is_locked(&qh->qh_lock)); 755 ASSERT(mutex_is_locked(&qh->qh_lock));
757 756
@@ -760,7 +759,7 @@ xfs_qm_dqlookup(
760 /* 759 /*
761 * Traverse the hashchain looking for a match 760 * Traverse the hashchain looking for a match
762 */ 761 */
763 for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { 762 list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
764 /* 763 /*
765 * We already have the hashlock. We don't need the 764 * We already have the hashlock. We don't need the
766 * dqlock to look at the id field of the dquot, since the 765 * dqlock to look at the id field of the dquot, since the
@@ -772,12 +771,12 @@ xfs_qm_dqlookup(
772 /* 771 /*
773 * All in core dquots must be on the dqlist of mp 772 * All in core dquots must be on the dqlist of mp
774 */ 773 */
775 ASSERT(dqp->MPL_PREVP != NULL); 774 ASSERT(!list_empty(&dqp->q_mplist));
776 775
777 xfs_dqlock(dqp); 776 xfs_dqlock(dqp);
778 if (dqp->q_nrefs == 0) { 777 if (dqp->q_nrefs == 0) {
779 ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); 778 ASSERT(!list_empty(&dqp->q_freelist));
780 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 779 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
781 trace_xfs_dqlookup_want(dqp); 780 trace_xfs_dqlookup_want(dqp);
782 781
783 /* 782 /*
@@ -787,7 +786,7 @@ xfs_qm_dqlookup(
787 */ 786 */
788 dqp->dq_flags |= XFS_DQ_WANT; 787 dqp->dq_flags |= XFS_DQ_WANT;
789 xfs_dqunlock(dqp); 788 xfs_dqunlock(dqp);
790 xfs_qm_freelist_lock(xfs_Gqm); 789 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
791 xfs_dqlock(dqp); 790 xfs_dqlock(dqp);
792 dqp->dq_flags &= ~(XFS_DQ_WANT); 791 dqp->dq_flags &= ~(XFS_DQ_WANT);
793 } 792 }
@@ -802,46 +801,28 @@ xfs_qm_dqlookup(
802 801
803 if (flist_locked) { 802 if (flist_locked) {
804 if (dqp->q_nrefs != 0) { 803 if (dqp->q_nrefs != 0) {
805 xfs_qm_freelist_unlock(xfs_Gqm); 804 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
806 flist_locked = B_FALSE; 805 flist_locked = B_FALSE;
807 } else { 806 } else {
808 /* 807 /* take it off the freelist */
809 * take it off the freelist
810 */
811 trace_xfs_dqlookup_freelist(dqp); 808 trace_xfs_dqlookup_freelist(dqp);
812 XQM_FREELIST_REMOVE(dqp); 809 list_del_init(&dqp->q_freelist);
813 /* xfs_qm_freelist_print(&(xfs_Gqm-> 810 xfs_Gqm->qm_dqfrlist_cnt--;
814 qm_dqfreelist),
815 "after removal"); */
816 } 811 }
817 } 812 }
818 813
819 /*
820 * grab a reference
821 */
822 XFS_DQHOLD(dqp); 814 XFS_DQHOLD(dqp);
823 815
824 if (flist_locked) 816 if (flist_locked)
825 xfs_qm_freelist_unlock(xfs_Gqm); 817 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
826 /* 818 /*
827 * move the dquot to the front of the hashchain 819 * move the dquot to the front of the hashchain
828 */ 820 */
829 ASSERT(mutex_is_locked(&qh->qh_lock)); 821 ASSERT(mutex_is_locked(&qh->qh_lock));
830 if (dqp->HL_PREVP != &qh->qh_next) { 822 list_move(&dqp->q_hashlist, &qh->qh_list);
831 trace_xfs_dqlookup_move(dqp);
832 if ((d = dqp->HL_NEXT))
833 d->HL_PREVP = dqp->HL_PREVP;
834 *(dqp->HL_PREVP) = d;
835 d = qh->qh_next;
836 d->HL_PREVP = &dqp->HL_NEXT;
837 dqp->HL_NEXT = d;
838 dqp->HL_PREVP = &qh->qh_next;
839 qh->qh_next = dqp;
840 }
841 trace_xfs_dqlookup_done(dqp); 823 trace_xfs_dqlookup_done(dqp);
842 *O_dqpp = dqp; 824 *O_dqpp = dqp;
843 ASSERT(mutex_is_locked(&qh->qh_lock)); 825 return 0;
844 return (0);
845 } 826 }
846 } 827 }
847 828
@@ -975,16 +956,17 @@ xfs_qm_dqget(
975 */ 956 */
976 if (ip) { 957 if (ip) {
977 xfs_ilock(ip, XFS_ILOCK_EXCL); 958 xfs_ilock(ip, XFS_ILOCK_EXCL);
978 if (! XFS_IS_DQTYPE_ON(mp, type)) { 959
979 /* inode stays locked on return */
980 xfs_qm_dqdestroy(dqp);
981 return XFS_ERROR(ESRCH);
982 }
983 /* 960 /*
984 * A dquot could be attached to this inode by now, since 961 * A dquot could be attached to this inode by now, since
985 * we had dropped the ilock. 962 * we had dropped the ilock.
986 */ 963 */
987 if (type == XFS_DQ_USER) { 964 if (type == XFS_DQ_USER) {
965 if (!XFS_IS_UQUOTA_ON(mp)) {
966 /* inode stays locked on return */
967 xfs_qm_dqdestroy(dqp);
968 return XFS_ERROR(ESRCH);
969 }
988 if (ip->i_udquot) { 970 if (ip->i_udquot) {
989 xfs_qm_dqdestroy(dqp); 971 xfs_qm_dqdestroy(dqp);
990 dqp = ip->i_udquot; 972 dqp = ip->i_udquot;
@@ -992,6 +974,11 @@ xfs_qm_dqget(
992 goto dqret; 974 goto dqret;
993 } 975 }
994 } else { 976 } else {
977 if (!XFS_IS_OQUOTA_ON(mp)) {
978 /* inode stays locked on return */
979 xfs_qm_dqdestroy(dqp);
980 return XFS_ERROR(ESRCH);
981 }
995 if (ip->i_gdquot) { 982 if (ip->i_gdquot) {
996 xfs_qm_dqdestroy(dqp); 983 xfs_qm_dqdestroy(dqp);
997 dqp = ip->i_gdquot; 984 dqp = ip->i_gdquot;
@@ -1033,13 +1020,14 @@ xfs_qm_dqget(
1033 */ 1020 */
1034 ASSERT(mutex_is_locked(&h->qh_lock)); 1021 ASSERT(mutex_is_locked(&h->qh_lock));
1035 dqp->q_hash = h; 1022 dqp->q_hash = h;
1036 XQM_HASHLIST_INSERT(h, dqp); 1023 list_add(&dqp->q_hashlist, &h->qh_list);
1024 h->qh_version++;
1037 1025
1038 /* 1026 /*
1039 * Attach this dquot to this filesystem's list of all dquots, 1027 * Attach this dquot to this filesystem's list of all dquots,
1040 * kept inside the mount structure in m_quotainfo field 1028 * kept inside the mount structure in m_quotainfo field
1041 */ 1029 */
1042 xfs_qm_mplist_lock(mp); 1030 mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
1043 1031
1044 /* 1032 /*
1045 * We return a locked dquot to the caller, with a reference taken 1033 * We return a locked dquot to the caller, with a reference taken
@@ -1047,9 +1035,9 @@ xfs_qm_dqget(
1047 xfs_dqlock(dqp); 1035 xfs_dqlock(dqp);
1048 dqp->q_nrefs = 1; 1036 dqp->q_nrefs = 1;
1049 1037
1050 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); 1038 list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
1051 1039 mp->m_quotainfo->qi_dquots++;
1052 xfs_qm_mplist_unlock(mp); 1040 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1053 mutex_unlock(&h->qh_lock); 1041 mutex_unlock(&h->qh_lock);
1054 dqret: 1042 dqret:
1055 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1043 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -1086,10 +1074,10 @@ xfs_qm_dqput(
1086 * drop the dqlock and acquire the freelist and dqlock 1074 * drop the dqlock and acquire the freelist and dqlock
1087 * in the right order; but try to get it out-of-order first 1075 * in the right order; but try to get it out-of-order first
1088 */ 1076 */
1089 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 1077 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
1090 trace_xfs_dqput_wait(dqp); 1078 trace_xfs_dqput_wait(dqp);
1091 xfs_dqunlock(dqp); 1079 xfs_dqunlock(dqp);
1092 xfs_qm_freelist_lock(xfs_Gqm); 1080 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1093 xfs_dqlock(dqp); 1081 xfs_dqlock(dqp);
1094 } 1082 }
1095 1083
@@ -1100,10 +1088,8 @@ xfs_qm_dqput(
1100 if (--dqp->q_nrefs == 0) { 1088 if (--dqp->q_nrefs == 0) {
1101 trace_xfs_dqput_free(dqp); 1089 trace_xfs_dqput_free(dqp);
1102 1090
1103 /* 1091 list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
1104 * insert at end of the freelist. 1092 xfs_Gqm->qm_dqfrlist_cnt++;
1105 */
1106 XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
1107 1093
1108 /* 1094 /*
1109 * If we just added a udquot to the freelist, then 1095 * If we just added a udquot to the freelist, then
@@ -1118,10 +1104,6 @@ xfs_qm_dqput(
1118 xfs_dqlock(gdqp); 1104 xfs_dqlock(gdqp);
1119 dqp->q_gdquot = NULL; 1105 dqp->q_gdquot = NULL;
1120 } 1106 }
1121
1122 /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
1123 "@@@@@++ Free list (after append) @@@@@+");
1124 */
1125 } 1107 }
1126 xfs_dqunlock(dqp); 1108 xfs_dqunlock(dqp);
1127 1109
@@ -1133,7 +1115,7 @@ xfs_qm_dqput(
1133 break; 1115 break;
1134 dqp = gdqp; 1116 dqp = gdqp;
1135 } 1117 }
1136 xfs_qm_freelist_unlock(xfs_Gqm); 1118 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1137} 1119}
1138 1120
1139/* 1121/*
@@ -1386,10 +1368,10 @@ int
1386xfs_qm_dqpurge( 1368xfs_qm_dqpurge(
1387 xfs_dquot_t *dqp) 1369 xfs_dquot_t *dqp)
1388{ 1370{
1389 xfs_dqhash_t *thishash; 1371 xfs_dqhash_t *qh = dqp->q_hash;
1390 xfs_mount_t *mp = dqp->q_mount; 1372 xfs_mount_t *mp = dqp->q_mount;
1391 1373
1392 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 1374 ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
1393 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); 1375 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
1394 1376
1395 xfs_dqlock(dqp); 1377 xfs_dqlock(dqp);
@@ -1407,7 +1389,7 @@ xfs_qm_dqpurge(
1407 return (1); 1389 return (1);
1408 } 1390 }
1409 1391
1410 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1392 ASSERT(!list_empty(&dqp->q_freelist));
1411 1393
1412 /* 1394 /*
1413 * If we're turning off quotas, we have to make sure that, for 1395 * If we're turning off quotas, we have to make sure that, for
@@ -1452,14 +1434,16 @@ xfs_qm_dqpurge(
1452 ASSERT(XFS_FORCED_SHUTDOWN(mp) || 1434 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1453 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); 1435 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1454 1436
1455 thishash = dqp->q_hash; 1437 list_del_init(&dqp->q_hashlist);
1456 XQM_HASHLIST_REMOVE(thishash, dqp); 1438 qh->qh_version++;
1457 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); 1439 list_del_init(&dqp->q_mplist);
1440 mp->m_quotainfo->qi_dqreclaims++;
1441 mp->m_quotainfo->qi_dquots--;
1458 /* 1442 /*
1459 * XXX Move this to the front of the freelist, if we can get the 1443 * XXX Move this to the front of the freelist, if we can get the
1460 * freelist lock. 1444 * freelist lock.
1461 */ 1445 */
1462 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1446 ASSERT(!list_empty(&dqp->q_freelist));
1463 1447
1464 dqp->q_mount = NULL; 1448 dqp->q_mount = NULL;
1465 dqp->q_hash = NULL; 1449 dqp->q_hash = NULL;
@@ -1467,7 +1451,7 @@ xfs_qm_dqpurge(
1467 memset(&dqp->q_core, 0, sizeof(dqp->q_core)); 1451 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
1468 xfs_dqfunlock(dqp); 1452 xfs_dqfunlock(dqp);
1469 xfs_dqunlock(dqp); 1453 xfs_dqunlock(dqp);
1470 mutex_unlock(&thishash->qh_lock); 1454 mutex_unlock(&qh->qh_lock);
1471 return (0); 1455 return (0);
1472} 1456}
1473 1457
@@ -1517,6 +1501,7 @@ void
1517xfs_qm_dqflock_pushbuf_wait( 1501xfs_qm_dqflock_pushbuf_wait(
1518 xfs_dquot_t *dqp) 1502 xfs_dquot_t *dqp)
1519{ 1503{
1504 xfs_mount_t *mp = dqp->q_mount;
1520 xfs_buf_t *bp; 1505 xfs_buf_t *bp;
1521 1506
1522 /* 1507 /*
@@ -1525,14 +1510,14 @@ xfs_qm_dqflock_pushbuf_wait(
1525 * out immediately. We'll be able to acquire 1510 * out immediately. We'll be able to acquire
1526 * the flush lock when the I/O completes. 1511 * the flush lock when the I/O completes.
1527 */ 1512 */
1528 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, 1513 bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
1529 XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK); 1514 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
1530 if (!bp) 1515 if (!bp)
1531 goto out_lock; 1516 goto out_lock;
1532 1517
1533 if (XFS_BUF_ISDELAYWRITE(bp)) { 1518 if (XFS_BUF_ISDELAYWRITE(bp)) {
1534 if (XFS_BUF_ISPINNED(bp)) 1519 if (XFS_BUF_ISPINNED(bp))
1535 xfs_log_force(dqp->q_mount, 0); 1520 xfs_log_force(mp, 0);
1536 xfs_buf_delwri_promote(bp); 1521 xfs_buf_delwri_promote(bp);
1537 wake_up_process(bp->b_target->bt_task); 1522 wake_up_process(bp->b_target->bt_task);
1538 } 1523 }
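A recurring shape in the dqlookup/dqput hunks above is out-of-order lock
acquisition: the global freelist lock ranks above the per-dquot lock, so code
already holding a dquot lock first tries the freelist lock opportunistically
and only drops and reacquires in hierarchy order when that fails. A sketch of
just this pattern, with generic lock names:

#include <linux/mutex.h>

static DEFINE_MUTEX(lock_a);	/* outer lock, e.g. the freelist lock */
static DEFINE_MUTEX(lock_b);	/* inner lock, e.g. the per-object lock */

/* Called with lock_b held; returns with both locks held. */
static void take_both_holding_b(void)
{
	if (!mutex_trylock(&lock_a)) {
		/* Contended: honour the hierarchy by releasing the inner
		 * lock, then take both in the right order. */
		mutex_unlock(&lock_b);
		mutex_lock(&lock_a);
		mutex_lock(&lock_b);
		/* Any state guarded by lock_b must be revalidated here,
		 * since it was briefly dropped -- the XFS_DQ_WANT dance
		 * in the diff exists for exactly this reason. */
	}
}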
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index a0f7da586d1b..5da3a23b820d 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -33,40 +33,23 @@
33 * The hash chain headers (hash buckets) 33 * The hash chain headers (hash buckets)
34 */ 34 */
35typedef struct xfs_dqhash { 35typedef struct xfs_dqhash {
36 struct xfs_dquot *qh_next; 36 struct list_head qh_list;
37 struct mutex qh_lock; 37 struct mutex qh_lock;
38 uint qh_version; /* ever increasing version */ 38 uint qh_version; /* ever increasing version */
39 uint qh_nelems; /* number of dquots on the list */ 39 uint qh_nelems; /* number of dquots on the list */
40} xfs_dqhash_t; 40} xfs_dqhash_t;
41 41
42typedef struct xfs_dqlink {
43 struct xfs_dquot *ql_next; /* forward link */
44 struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */
45} xfs_dqlink_t;
46
47struct xfs_mount; 42struct xfs_mount;
48struct xfs_trans; 43struct xfs_trans;
49 44
50/* 45/*
51 * This is the marker which is designed to occupy the first few
52 * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
53 * must come first.
54 * This serves as the marker ("sentinel") when we have to restart list
55 * iterations because of locking considerations.
56 */
57typedef struct xfs_dqmarker {
58 struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */
59 struct xfs_dquot*dqm_flprev;
60 xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */
61 xfs_dqlink_t dqm_hashlist; /* link to the hash chain */
62 uint dqm_flags; /* various flags (XFS_DQ_*) */
63} xfs_dqmarker_t;
64
65/*
66 * The incore dquot structure 46 * The incore dquot structure
67 */ 47 */
68typedef struct xfs_dquot { 48typedef struct xfs_dquot {
69 xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ 49 uint dq_flags; /* various flags (XFS_DQ_*) */
50 struct list_head q_freelist; /* global free list of dquots */
51 struct list_head q_mplist; /* mount's list of dquots */
 52	struct list_head q_hashlist;	/* global hash list of dquots */
70 xfs_dqhash_t *q_hash; /* the hashchain header */ 53 xfs_dqhash_t *q_hash; /* the hashchain header */
71 struct xfs_mount*q_mount; /* filesystem this relates to */ 54 struct xfs_mount*q_mount; /* filesystem this relates to */
72 struct xfs_trans*q_transp; /* trans this belongs to currently */ 55 struct xfs_trans*q_transp; /* trans this belongs to currently */
@@ -87,13 +70,6 @@ typedef struct xfs_dquot {
87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ 70 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88} xfs_dquot_t; 71} xfs_dquot_t;
89 72
90
91#define dq_flnext q_lists.dqm_flnext
92#define dq_flprev q_lists.dqm_flprev
93#define dq_mplist q_lists.dqm_mplist
94#define dq_hashlist q_lists.dqm_hashlist
95#define dq_flags q_lists.dqm_flags
96
97/* 73/*
98 * Lock hierarchy for q_qlock: 74 * Lock hierarchy for q_qlock:
99 * XFS_QLOCK_NORMAL is the implicit default, 75 * XFS_QLOCK_NORMAL is the implicit default,
@@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
127} 103}
128 104
129#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) 105#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
130#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
131#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 106#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
132#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 107#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
133#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) 108#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
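The header change replaces the hand-rolled next/prev pointer pairs (and the
XFS_DQ_IS_ON_FREELIST self-pointer test) with embedded struct list_head nodes,
so membership checks become list_empty() after a list_del_init(). A
self-contained userspace sketch of the embedded-node idiom, reimplementing
just enough of the kernel list primitives to run:

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	INIT_LIST_HEAD(n);	/* re-init so list_empty() holds again */
}

/* One embedded node per list the object can be on, as in xfs_dquot. */
struct demo_dquot {
	int id;
	struct list_head freelist;
	struct list_head hashlist;
};

int main(void)
{
	struct list_head freelist;
	struct demo_dquot dq = { .id = 7 };

	INIT_LIST_HEAD(&freelist);
	INIT_LIST_HEAD(&dq.freelist);
	INIT_LIST_HEAD(&dq.hashlist);

	list_add(&dq.freelist, &freelist);
	printf("on freelist: %d\n", !list_empty(&dq.freelist));
	list_del_init(&dq.freelist);
	printf("on freelist: %d\n", !list_empty(&dq.freelist));
	return 0;
}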
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 4e4ee9a57194..8d89a24ae324 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin(
107/* ARGSUSED */ 107/* ARGSUSED */
108STATIC void 108STATIC void
109xfs_qm_dquot_logitem_unpin( 109xfs_qm_dquot_logitem_unpin(
110 xfs_dq_logitem_t *logitem, 110 xfs_dq_logitem_t *logitem)
111 int stale)
112{ 111{
113 xfs_dquot_t *dqp = logitem->qli_dquot; 112 xfs_dquot_t *dqp = logitem->qli_dquot;
114 113
@@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove(
123 xfs_dq_logitem_t *logitem, 122 xfs_dq_logitem_t *logitem,
124 xfs_trans_t *tp) 123 xfs_trans_t *tp)
125{ 124{
126 xfs_qm_dquot_logitem_unpin(logitem, 0); 125 xfs_qm_dquot_logitem_unpin(logitem);
127} 126}
128 127
129/* 128/*
@@ -228,7 +227,7 @@ xfs_qm_dquot_logitem_pushbuf(
228 } 227 }
229 mp = dqp->q_mount; 228 mp = dqp->q_mount;
230 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 229 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
231 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); 230 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
232 xfs_dqunlock(dqp); 231 xfs_dqunlock(dqp);
233 if (!bp) 232 if (!bp)
234 return; 233 return;
@@ -329,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
329 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 328 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
330 xfs_qm_dquot_logitem_format, 329 xfs_qm_dquot_logitem_format,
331 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, 330 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
332 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 331 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
333 xfs_qm_dquot_logitem_unpin,
334 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 332 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
335 xfs_qm_dquot_logitem_unpin_remove, 333 xfs_qm_dquot_logitem_unpin_remove,
336 .iop_trylock = (uint(*)(xfs_log_item_t*)) 334 .iop_trylock = (uint(*)(xfs_log_item_t*))
@@ -357,9 +355,8 @@ xfs_qm_dquot_logitem_init(
357 xfs_dq_logitem_t *lp; 355 xfs_dq_logitem_t *lp;
358 lp = &dqp->q_logitem; 356 lp = &dqp->q_logitem;
359 357
360 lp->qli_item.li_type = XFS_LI_DQUOT; 358 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
361 lp->qli_item.li_ops = &xfs_dquot_item_ops; 359 &xfs_dquot_item_ops);
362 lp->qli_item.li_mountp = dqp->q_mount;
363 lp->qli_dquot = dqp; 360 lp->qli_dquot = dqp;
364 lp->qli_format.qlf_type = XFS_LI_DQUOT; 361 lp->qli_format.qlf_type = XFS_LI_DQUOT;
365 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); 362 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
@@ -426,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
426 */ 423 */
427/*ARGSUSED*/ 424/*ARGSUSED*/
428STATIC void 425STATIC void
429xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) 426xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
430{ 427{
431 return; 428 return;
432} 429}
@@ -537,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
537 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 534 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
538 xfs_qm_qoff_logitem_format, 535 xfs_qm_qoff_logitem_format,
539 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 536 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
540 .iop_unpin = (void(*)(xfs_log_item_t* ,int)) 537 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
541 xfs_qm_qoff_logitem_unpin,
542 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 538 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
543 xfs_qm_qoff_logitem_unpin_remove, 539 xfs_qm_qoff_logitem_unpin_remove,
544 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 540 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -559,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
559 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 555 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
560 xfs_qm_qoff_logitem_format, 556 xfs_qm_qoff_logitem_format,
561 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 557 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
562 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 558 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
563 xfs_qm_qoff_logitem_unpin,
564 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 559 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
565 xfs_qm_qoff_logitem_unpin_remove, 560 xfs_qm_qoff_logitem_unpin_remove,
566 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 561 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -586,11 +581,8 @@ xfs_qm_qoff_logitem_init(
586 581
587 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); 582 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
588 583
589 qf->qql_item.li_type = XFS_LI_QUOTAOFF; 584 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
590 if (start) 585 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
591 qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
592 else
593 qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
594 qf->qql_item.li_mountp = mp; 586 qf->qql_item.li_mountp = mp;
595 qf->qql_format.qf_type = XFS_LI_QUOTAOFF; 587 qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
596 qf->qql_format.qf_flags = flags; 588 qf->qql_format.qf_flags = flags;
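Both log-item constructors above collapse three open-coded field assignments
into a single xfs_log_item_init(mount, item, type, ops) call. A plausible
shape for such a helper, inferred from the call sites in this diff rather than
from its actual definition elsewhere in the series:

struct demo_mount;
struct demo_item_ops;

struct demo_log_item {
	struct demo_mount		*li_mountp;
	int				 li_type;
	const struct demo_item_ops	*li_ops;
};

static void demo_log_item_init(struct demo_mount *mp,
			       struct demo_log_item *item,
			       int type,
			       const struct demo_item_ops *ops)
{
	item->li_mountp = mp;
	item->li_type = type;
	item->li_ops = ops;
	/* The real helper may also set up list linkage and other common
	 * state; only the call signature is taken from the diff. */
}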
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 417e61e3d9dd..38e764146644 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -67,9 +67,6 @@ static cred_t xfs_zerocr;
67STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 67STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
68STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 68STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
69 69
70STATIC void xfs_qm_freelist_init(xfs_frlist_t *);
71STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *);
72
73STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 70STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
74STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 71STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
75STATIC int xfs_qm_shake(int, gfp_t); 72STATIC int xfs_qm_shake(int, gfp_t);
@@ -84,21 +81,25 @@ extern struct mutex qcheck_lock;
84#endif 81#endif
85 82
86#ifdef QUOTADEBUG 83#ifdef QUOTADEBUG
87#define XQM_LIST_PRINT(l, NXT, title) \ 84static void
88{ \ 85xfs_qm_dquot_list_print(
89 xfs_dquot_t *dqp; int i = 0; \ 86 struct xfs_mount *mp)
90 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 87{
91 for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ 88 xfs_dquot_t *dqp;
92 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ 89 int i = 0;
93 "bcnt = %d, icnt = %d, refs = %d", \ 90
94 ++i, (int) be32_to_cpu(dqp->q_core.d_id), \ 91	list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist, q_mplist) {
95 DQFLAGTO_TYPESTR(dqp), \ 92 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" "
96 (int) be64_to_cpu(dqp->q_core.d_bcount), \ 93 "bcnt = %lld, icnt = %lld, refs = %d",
97 (int) be64_to_cpu(dqp->q_core.d_icount), \ 94 i++, be32_to_cpu(dqp->q_core.d_id),
98 (int) dqp->q_nrefs); } \ 95 DQFLAGTO_TYPESTR(dqp),
96 (long long)be64_to_cpu(dqp->q_core.d_bcount),
97 (long long)be64_to_cpu(dqp->q_core.d_icount),
98 dqp->q_nrefs);
99 }
99} 100}
100#else 101#else
101#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) 102static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { }
102#endif 103#endif
103 104
104/* 105/*
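[Editor's note: under QUOTADEBUG the XQM_LIST_PRINT() macro becomes a real static function that walks the per-mount dquot list with list_for_each_entry(). The following self-contained sketch shows that iteration pattern over a list_head embedded in each entry, using cut-down stand-ins for the <linux/list.h> helpers; GNU C __typeof__ is assumed.]

#include <stdio.h>
#include <stddef.h>

/* Trimmed-down circular list in the style of <linux/list.h>. */
struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
}

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Same shape as the kernel macro: walk the entries embedding 'member'. */
#define list_for_each_entry(pos, head, member)                            \
        for (pos = container_of((head)->next, __typeof__(*pos), member); \
             &pos->member != (head);                                      \
             pos = container_of(pos->member.next, __typeof__(*pos), member))

struct dquot { int id; struct list_head q_mplist; };

int main(void)
{
        struct list_head qi_dqlist;
        struct dquot a = { .id = 1 }, b = { .id = 2 }, *dqp;
        int i = 0;

        INIT_LIST_HEAD(&qi_dqlist);
        list_add_tail(&a.q_mplist, &qi_dqlist);
        list_add_tail(&b.q_mplist, &qi_dqlist);

        list_for_each_entry(dqp, &qi_dqlist, q_mplist)
                printf("%d. id=%d\n", ++i, dqp->id);
        return 0;
}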
@@ -144,7 +145,9 @@ xfs_Gqm_init(void)
144 /* 145 /*
145 * Freelist of all dquots of all file systems 146 * Freelist of all dquots of all file systems
146 */ 147 */
147 xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); 148 INIT_LIST_HEAD(&xqm->qm_dqfrlist);
149 xqm->qm_dqfrlist_cnt = 0;
150 mutex_init(&xqm->qm_dqfrlist_lock);
148 151
149 /* 152 /*
150 * dquot zone. we register our own low-memory callback. 153 * dquot zone. we register our own low-memory callback.
@@ -189,6 +192,7 @@ STATIC void
189xfs_qm_destroy( 192xfs_qm_destroy(
190 struct xfs_qm *xqm) 193 struct xfs_qm *xqm)
191{ 194{
195 struct xfs_dquot *dqp, *n;
192 int hsize, i; 196 int hsize, i;
193 197
194 ASSERT(xqm != NULL); 198 ASSERT(xqm != NULL);
@@ -204,7 +208,21 @@ xfs_qm_destroy(
204 xqm->qm_usr_dqhtable = NULL; 208 xqm->qm_usr_dqhtable = NULL;
205 xqm->qm_grp_dqhtable = NULL; 209 xqm->qm_grp_dqhtable = NULL;
206 xqm->qm_dqhashmask = 0; 210 xqm->qm_dqhashmask = 0;
207 xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); 211
212 /* frlist cleanup */
213 mutex_lock(&xqm->qm_dqfrlist_lock);
214 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
215 xfs_dqlock(dqp);
216#ifdef QUOTADEBUG
217 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
218#endif
219 list_del_init(&dqp->q_freelist);
220 xfs_Gqm->qm_dqfrlist_cnt--;
221 xfs_dqunlock(dqp);
222 xfs_qm_dqdestroy(dqp);
223 }
224 mutex_unlock(&xqm->qm_dqfrlist_lock);
225 mutex_destroy(&xqm->qm_dqfrlist_lock);
208#ifdef DEBUG 226#ifdef DEBUG
209 mutex_destroy(&qcheck_lock); 227 mutex_destroy(&qcheck_lock);
210#endif 228#endif
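[Editor's note: freelist teardown above uses list_for_each_entry_safe(), whose extra cursor caches the successor so the current dquot can be unlinked and destroyed mid-walk, replacing the old manual "nextdqp = dqp->dq_flnext" bookkeeping. This sketch open-codes the same two-cursor walk the macro expands to; the minimal list helpers are included so it compiles standalone, and the names are illustrative.]

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *n, struct list_head *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}

static void list_del_init(struct list_head *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        n->next = n->prev = n;
}

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct dquot { struct list_head q_freelist; };

int main(void)
{
        struct list_head frlist;
        int cnt = 0;

        INIT_LIST_HEAD(&frlist);
        for (int i = 0; i < 3; i++) {
                struct dquot *dqp = malloc(sizeof(*dqp));
                list_add(&dqp->q_freelist, &frlist);
                cnt++;
        }

        /* The "safe" walk caches the next node up front, so the current
         * entry can be unlinked and freed without losing the cursor. */
        for (struct list_head *p = frlist.next, *n = p->next;
             p != &frlist; p = n, n = p->next) {
                struct dquot *dqp = container_of(p, struct dquot, q_freelist);

                list_del_init(&dqp->q_freelist);
                cnt--;
                free(dqp);      /* stands in for xfs_qm_dqdestroy() */
        }
        printf("remaining: %d\n", cnt);
        return 0;
}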
@@ -256,7 +274,7 @@ STATIC void
256xfs_qm_rele_quotafs_ref( 274xfs_qm_rele_quotafs_ref(
257 struct xfs_mount *mp) 275 struct xfs_mount *mp)
258{ 276{
259 xfs_dquot_t *dqp, *nextdqp; 277 xfs_dquot_t *dqp, *n;
260 278
261 ASSERT(xfs_Gqm); 279 ASSERT(xfs_Gqm);
262 ASSERT(xfs_Gqm->qm_nrefs > 0); 280 ASSERT(xfs_Gqm->qm_nrefs > 0);
@@ -264,26 +282,24 @@ xfs_qm_rele_quotafs_ref(
264 /* 282 /*
265 * Go thru the freelist and destroy all inactive dquots. 283 * Go thru the freelist and destroy all inactive dquots.
266 */ 284 */
267 xfs_qm_freelist_lock(xfs_Gqm); 285 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
268 286
269 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 287 list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
270 dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
271 xfs_dqlock(dqp); 288 xfs_dqlock(dqp);
272 nextdqp = dqp->dq_flnext;
273 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 289 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
274 ASSERT(dqp->q_mount == NULL); 290 ASSERT(dqp->q_mount == NULL);
275 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 291 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
276 ASSERT(dqp->HL_PREVP == NULL); 292 ASSERT(list_empty(&dqp->q_hashlist));
277 ASSERT(dqp->MPL_PREVP == NULL); 293 ASSERT(list_empty(&dqp->q_mplist));
278 XQM_FREELIST_REMOVE(dqp); 294 list_del_init(&dqp->q_freelist);
295 xfs_Gqm->qm_dqfrlist_cnt--;
279 xfs_dqunlock(dqp); 296 xfs_dqunlock(dqp);
280 xfs_qm_dqdestroy(dqp); 297 xfs_qm_dqdestroy(dqp);
281 } else { 298 } else {
282 xfs_dqunlock(dqp); 299 xfs_dqunlock(dqp);
283 } 300 }
284 dqp = nextdqp;
285 } 301 }
286 xfs_qm_freelist_unlock(xfs_Gqm); 302 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
287 303
288 /* 304 /*
289 * Destroy the entire XQM. If somebody mounts with quotaon, this'll 305 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
@@ -305,7 +321,7 @@ xfs_qm_unmount(
305 struct xfs_mount *mp) 321 struct xfs_mount *mp)
306{ 322{
307 if (mp->m_quotainfo) { 323 if (mp->m_quotainfo) {
308 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); 324 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
309 xfs_qm_destroy_quotainfo(mp); 325 xfs_qm_destroy_quotainfo(mp);
310 } 326 }
311} 327}
@@ -449,20 +465,21 @@ xfs_qm_unmount_quotas(
449 */ 465 */
450STATIC int 466STATIC int
451xfs_qm_dqflush_all( 467xfs_qm_dqflush_all(
452 xfs_mount_t *mp, 468 struct xfs_mount *mp,
453 int sync_mode) 469 int sync_mode)
454{ 470{
455 int recl; 471 struct xfs_quotainfo *q = mp->m_quotainfo;
456 xfs_dquot_t *dqp; 472 int recl;
457 int niters; 473 struct xfs_dquot *dqp;
458 int error; 474 int niters;
475 int error;
459 476
460 if (mp->m_quotainfo == NULL) 477 if (!q)
461 return 0; 478 return 0;
462 niters = 0; 479 niters = 0;
463again: 480again:
464 xfs_qm_mplist_lock(mp); 481 mutex_lock(&q->qi_dqlist_lock);
465 FOREACH_DQUOT_IN_MP(dqp, mp) { 482 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
466 xfs_dqlock(dqp); 483 xfs_dqlock(dqp);
467 if (! XFS_DQ_IS_DIRTY(dqp)) { 484 if (! XFS_DQ_IS_DIRTY(dqp)) {
468 xfs_dqunlock(dqp); 485 xfs_dqunlock(dqp);
@@ -470,7 +487,7 @@ again:
470 } 487 }
471 488
472 /* XXX a sentinel would be better */ 489 /* XXX a sentinel would be better */
473 recl = XFS_QI_MPLRECLAIMS(mp); 490 recl = q->qi_dqreclaims;
474 if (!xfs_dqflock_nowait(dqp)) { 491 if (!xfs_dqflock_nowait(dqp)) {
475 /* 492 /*
476 * If we can't grab the flush lock then check 493 * If we can't grab the flush lock then check
@@ -485,21 +502,21 @@ again:
485 * Let go of the mplist lock. We don't want to hold it 502 * Let go of the mplist lock. We don't want to hold it
486 * across a disk write. 503 * across a disk write.
487 */ 504 */
488 xfs_qm_mplist_unlock(mp); 505 mutex_unlock(&q->qi_dqlist_lock);
489 error = xfs_qm_dqflush(dqp, sync_mode); 506 error = xfs_qm_dqflush(dqp, sync_mode);
490 xfs_dqunlock(dqp); 507 xfs_dqunlock(dqp);
491 if (error) 508 if (error)
492 return error; 509 return error;
493 510
494 xfs_qm_mplist_lock(mp); 511 mutex_lock(&q->qi_dqlist_lock);
495 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 512 if (recl != q->qi_dqreclaims) {
496 xfs_qm_mplist_unlock(mp); 513 mutex_unlock(&q->qi_dqlist_lock);
497 /* XXX restart limit */ 514 /* XXX restart limit */
498 goto again; 515 goto again;
499 } 516 }
500 } 517 }
501 518
502 xfs_qm_mplist_unlock(mp); 519 mutex_unlock(&q->qi_dqlist_lock);
503 /* return ! busy */ 520 /* return ! busy */
504 return 0; 521 return 0;
505} 522}
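[Editor's note: every mplist walk in this file follows the same revalidation idiom: sample q->qi_dqreclaims under qi_dqlist_lock, drop the lock for the blocking flush, relock, and restart the walk if the counter moved, since that means entries were removed underneath the iterator. A minimal pthread sketch of that generation-counter pattern follows; the names are hypothetical.]

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dqlist_lock = PTHREAD_MUTEX_INITIALIZER;
static int dqreclaims;  /* bumped whenever an entry leaves the list */
static int nentries = 3;

static void flush_entry(void) { /* blocking I/O happens here */ }

static int flush_all(void)
{
        int done;
again:
        pthread_mutex_lock(&dqlist_lock);
        for (done = 0; done < nentries; done++) {
                int recl = dqreclaims;  /* sample while still locked */

                /* Can't hold the list lock across a disk write. */
                pthread_mutex_unlock(&dqlist_lock);
                flush_entry();
                pthread_mutex_lock(&dqlist_lock);

                if (recl != dqreclaims) {
                        /* The list changed while we slept; the cursor
                         * may be stale, so restart from the top. */
                        pthread_mutex_unlock(&dqlist_lock);
                        goto again;
                }
        }
        pthread_mutex_unlock(&dqlist_lock);
        return 0;
}

int main(void)
{
        printf("flush_all -> %d\n", flush_all());
        return 0;
}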
@@ -509,15 +526,15 @@ again:
509 */ 526 */
510STATIC void 527STATIC void
511xfs_qm_detach_gdquots( 528xfs_qm_detach_gdquots(
512 xfs_mount_t *mp) 529 struct xfs_mount *mp)
513{ 530{
514 xfs_dquot_t *dqp, *gdqp; 531 struct xfs_quotainfo *q = mp->m_quotainfo;
515 int nrecl; 532 struct xfs_dquot *dqp, *gdqp;
533 int nrecl;
516 534
517 again: 535 again:
518 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 536 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
519 dqp = XFS_QI_MPLNEXT(mp); 537 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
520 while (dqp) {
521 xfs_dqlock(dqp); 538 xfs_dqlock(dqp);
522 if ((gdqp = dqp->q_gdquot)) { 539 if ((gdqp = dqp->q_gdquot)) {
523 xfs_dqlock(gdqp); 540 xfs_dqlock(gdqp);
@@ -530,15 +547,14 @@ xfs_qm_detach_gdquots(
530 * Can't hold the mplist lock across a dqput. 547 * Can't hold the mplist lock across a dqput.
531 * XXXmust convert to marker based iterations here. 548 * XXXmust convert to marker based iterations here.
532 */ 549 */
533 nrecl = XFS_QI_MPLRECLAIMS(mp); 550 nrecl = q->qi_dqreclaims;
534 xfs_qm_mplist_unlock(mp); 551 mutex_unlock(&q->qi_dqlist_lock);
535 xfs_qm_dqput(gdqp); 552 xfs_qm_dqput(gdqp);
536 553
537 xfs_qm_mplist_lock(mp); 554 mutex_lock(&q->qi_dqlist_lock);
538 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) 555 if (nrecl != q->qi_dqreclaims)
539 goto again; 556 goto again;
540 } 557 }
541 dqp = dqp->MPL_NEXT;
542 } 558 }
543} 559}
544 560
@@ -550,23 +566,23 @@ xfs_qm_detach_gdquots(
550 */ 566 */
551STATIC int 567STATIC int
552xfs_qm_dqpurge_int( 568xfs_qm_dqpurge_int(
553 xfs_mount_t *mp, 569 struct xfs_mount *mp,
554 uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ 570 uint flags)
555{ 571{
556 xfs_dquot_t *dqp; 572 struct xfs_quotainfo *q = mp->m_quotainfo;
557 uint dqtype; 573 struct xfs_dquot *dqp, *n;
558 int nrecl; 574 uint dqtype;
559 xfs_dquot_t *nextdqp; 575 int nrecl;
560 int nmisses; 576 int nmisses;
561 577
562 if (mp->m_quotainfo == NULL) 578 if (!q)
563 return 0; 579 return 0;
564 580
565 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; 581 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
566 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; 582 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
567 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; 583 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
568 584
569 xfs_qm_mplist_lock(mp); 585 mutex_lock(&q->qi_dqlist_lock);
570 586
571 /* 587 /*
572 * In the first pass through all incore dquots of this filesystem, 588 * In the first pass through all incore dquots of this filesystem,
@@ -578,28 +594,25 @@ xfs_qm_dqpurge_int(
578 594
579 again: 595 again:
580 nmisses = 0; 596 nmisses = 0;
581 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 597 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
582 /* 598 /*
583 * Try to get rid of all of the unwanted dquots. The idea is to 599 * Try to get rid of all of the unwanted dquots. The idea is to
584 * get them off mplist and hashlist, but leave them on freelist. 600 * get them off mplist and hashlist, but leave them on freelist.
585 */ 601 */
586 dqp = XFS_QI_MPLNEXT(mp); 602 list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
587 while (dqp) {
588 /* 603 /*
589 * It's OK to look at the type without taking dqlock here. 604 * It's OK to look at the type without taking dqlock here.
590 * We're holding the mplist lock here, and that's needed for 605 * We're holding the mplist lock here, and that's needed for
591 * a dqreclaim. 606 * a dqreclaim.
592 */ 607 */
593 if ((dqp->dq_flags & dqtype) == 0) { 608 if ((dqp->dq_flags & dqtype) == 0)
594 dqp = dqp->MPL_NEXT;
595 continue; 609 continue;
596 }
597 610
598 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 611 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
599 nrecl = XFS_QI_MPLRECLAIMS(mp); 612 nrecl = q->qi_dqreclaims;
600 xfs_qm_mplist_unlock(mp); 613 mutex_unlock(&q->qi_dqlist_lock);
601 mutex_lock(&dqp->q_hash->qh_lock); 614 mutex_lock(&dqp->q_hash->qh_lock);
602 xfs_qm_mplist_lock(mp); 615 mutex_lock(&q->qi_dqlist_lock);
603 616
604 /* 617 /*
605 * XXXTheoretically, we can get into a very long 618 * XXXTheoretically, we can get into a very long
@@ -607,7 +620,7 @@ xfs_qm_dqpurge_int(
607 * No one can be adding dquots to the mplist at 620 * No one can be adding dquots to the mplist at
608 * this point, but somebody might be taking things off. 621 * this point, but somebody might be taking things off.
609 */ 622 */
610 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { 623 if (nrecl != q->qi_dqreclaims) {
611 mutex_unlock(&dqp->q_hash->qh_lock); 624 mutex_unlock(&dqp->q_hash->qh_lock);
612 goto again; 625 goto again;
613 } 626 }
@@ -617,11 +630,9 @@ xfs_qm_dqpurge_int(
617 * Take the dquot off the mplist and hashlist. It may remain on 630 * Take the dquot off the mplist and hashlist. It may remain on
618 * freelist in INACTIVE state. 631 * freelist in INACTIVE state.
619 */ 632 */
620 nextdqp = dqp->MPL_NEXT;
621 nmisses += xfs_qm_dqpurge(dqp); 633 nmisses += xfs_qm_dqpurge(dqp);
622 dqp = nextdqp;
623 } 634 }
624 xfs_qm_mplist_unlock(mp); 635 mutex_unlock(&q->qi_dqlist_lock);
625 return nmisses; 636 return nmisses;
626} 637}
627 638
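[Editor's note: dqpurge_int needs a per-dquot hash lock while already holding the mplist lock, which is the wrong acquisition order. It first tries mutex_trylock(); only on failure does it drop the list lock, take both locks in the documented order, and revalidate through qi_dqreclaims. A hedged pthread sketch of that trylock-to-avoid-inversion idiom, under simplified assumptions:]

#include <pthread.h>

static pthread_mutex_t hash_lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dqlist_lock = PTHREAD_MUTEX_INITIALIZER;
static int dqreclaims;

/* Documented order is hash_lock before dqlist_lock, but the purge
 * loop arrives already holding dqlist_lock. */
static void purge_one(void)
{
again:
        pthread_mutex_lock(&dqlist_lock);
        if (pthread_mutex_trylock(&hash_lock) != 0) {
                int recl = dqreclaims;  /* sample before dropping out */

                /* Out-of-order attempt failed: back off and reacquire
                 * both locks in the documented order. */
                pthread_mutex_unlock(&dqlist_lock);
                pthread_mutex_lock(&hash_lock);
                pthread_mutex_lock(&dqlist_lock);

                if (recl != dqreclaims) {
                        /* Entries were removed while unlocked: restart. */
                        pthread_mutex_unlock(&dqlist_lock);
                        pthread_mutex_unlock(&hash_lock);
                        goto again;
                }
        }
        /* ... both locks held: unhash and unlist the entry ... */
        pthread_mutex_unlock(&dqlist_lock);
        pthread_mutex_unlock(&hash_lock);
}

int main(void) { purge_one(); return 0; }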
@@ -921,12 +932,13 @@ xfs_qm_dqdetach(
921 932
922int 933int
923xfs_qm_sync( 934xfs_qm_sync(
924 xfs_mount_t *mp, 935 struct xfs_mount *mp,
925 int flags) 936 int flags)
926{ 937{
927 int recl, restarts; 938 struct xfs_quotainfo *q = mp->m_quotainfo;
928 xfs_dquot_t *dqp; 939 int recl, restarts;
929 int error; 940 struct xfs_dquot *dqp;
941 int error;
930 942
931 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 943 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
932 return 0; 944 return 0;
@@ -934,18 +946,19 @@ xfs_qm_sync(
934 restarts = 0; 946 restarts = 0;
935 947
936 again: 948 again:
937 xfs_qm_mplist_lock(mp); 949 mutex_lock(&q->qi_dqlist_lock);
938 /* 950 /*
939 * dqpurge_all() also takes the mplist lock and iterate thru all dquots 951 * dqpurge_all() also takes the mplist lock and iterate thru all dquots
940 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared 952 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
941 * when we have the mplist lock, we know that dquots will be consistent 953 * when we have the mplist lock, we know that dquots will be consistent
942 * as long as we have it locked. 954 * as long as we have it locked.
943 */ 955 */
944 if (! XFS_IS_QUOTA_ON(mp)) { 956 if (!XFS_IS_QUOTA_ON(mp)) {
945 xfs_qm_mplist_unlock(mp); 957 mutex_unlock(&q->qi_dqlist_lock);
946 return 0; 958 return 0;
947 } 959 }
948 FOREACH_DQUOT_IN_MP(dqp, mp) { 960 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
961 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
949 /* 962 /*
950 * If this is vfs_sync calling, then skip the dquots that 963 * If this is vfs_sync calling, then skip the dquots that
951 * don't 'seem' to be dirty. ie. don't acquire dqlock. 964 * don't 'seem' to be dirty. ie. don't acquire dqlock.
@@ -969,7 +982,7 @@ xfs_qm_sync(
969 } 982 }
970 983
971 /* XXX a sentinel would be better */ 984 /* XXX a sentinel would be better */
972 recl = XFS_QI_MPLRECLAIMS(mp); 985 recl = q->qi_dqreclaims;
973 if (!xfs_dqflock_nowait(dqp)) { 986 if (!xfs_dqflock_nowait(dqp)) {
974 if (flags & SYNC_TRYLOCK) { 987 if (flags & SYNC_TRYLOCK) {
975 xfs_dqunlock(dqp); 988 xfs_dqunlock(dqp);
@@ -989,7 +1002,7 @@ xfs_qm_sync(
989 * Let go of the mplist lock. We don't want to hold it 1002 * Let go of the mplist lock. We don't want to hold it
990 * across a disk write 1003 * across a disk write
991 */ 1004 */
992 xfs_qm_mplist_unlock(mp); 1005 mutex_unlock(&q->qi_dqlist_lock);
993 error = xfs_qm_dqflush(dqp, flags); 1006 error = xfs_qm_dqflush(dqp, flags);
994 xfs_dqunlock(dqp); 1007 xfs_dqunlock(dqp);
995 if (error && XFS_FORCED_SHUTDOWN(mp)) 1008 if (error && XFS_FORCED_SHUTDOWN(mp))
@@ -997,17 +1010,17 @@ xfs_qm_sync(
997 else if (error) 1010 else if (error)
998 return error; 1011 return error;
999 1012
1000 xfs_qm_mplist_lock(mp); 1013 mutex_lock(&q->qi_dqlist_lock);
1001 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 1014 if (recl != q->qi_dqreclaims) {
1002 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) 1015 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
1003 break; 1016 break;
1004 1017
1005 xfs_qm_mplist_unlock(mp); 1018 mutex_unlock(&q->qi_dqlist_lock);
1006 goto again; 1019 goto again;
1007 } 1020 }
1008 } 1021 }
1009 1022
1010 xfs_qm_mplist_unlock(mp); 1023 mutex_unlock(&q->qi_dqlist_lock);
1011 return 0; 1024 return 0;
1012} 1025}
1013 1026
@@ -1052,8 +1065,9 @@ xfs_qm_init_quotainfo(
1052 return error; 1065 return error;
1053 } 1066 }
1054 1067
1055 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1068 INIT_LIST_HEAD(&qinf->qi_dqlist);
1056 lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); 1069 mutex_init(&qinf->qi_dqlist_lock);
1070 lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
1057 1071
1058 qinf->qi_dqreclaims = 0; 1072 qinf->qi_dqreclaims = 0;
1059 1073
@@ -1150,7 +1164,8 @@ xfs_qm_destroy_quotainfo(
1150 */ 1164 */
1151 xfs_qm_rele_quotafs_ref(mp); 1165 xfs_qm_rele_quotafs_ref(mp);
1152 1166
1153 xfs_qm_list_destroy(&qi->qi_dqlist); 1167 ASSERT(list_empty(&qi->qi_dqlist));
1168 mutex_destroy(&qi->qi_dqlist_lock);
1154 1169
1155 if (qi->qi_uquotaip) { 1170 if (qi->qi_uquotaip) {
1156 IRELE(qi->qi_uquotaip); 1171 IRELE(qi->qi_uquotaip);
@@ -1177,7 +1192,7 @@ xfs_qm_list_init(
1177 int n) 1192 int n)
1178{ 1193{
1179 mutex_init(&list->qh_lock); 1194 mutex_init(&list->qh_lock);
1180 list->qh_next = NULL; 1195 INIT_LIST_HEAD(&list->qh_list);
1181 list->qh_version = 0; 1196 list->qh_version = 0;
1182 list->qh_nelems = 0; 1197 list->qh_nelems = 0;
1183} 1198}
@@ -1316,9 +1331,6 @@ xfs_qm_qino_alloc(
1316 */ 1331 */
1317 spin_lock(&mp->m_sb_lock); 1332 spin_lock(&mp->m_sb_lock);
1318 if (flags & XFS_QMOPT_SBVERSION) { 1333 if (flags & XFS_QMOPT_SBVERSION) {
1319#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1320 unsigned oldv = mp->m_sb.sb_versionnum;
1321#endif
1322 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 1334 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
1323 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 1335 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1324 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == 1336 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
@@ -1331,11 +1343,6 @@ xfs_qm_qino_alloc(
1331 1343
1332 /* qflags will get updated _after_ quotacheck */ 1344 /* qflags will get updated _after_ quotacheck */
1333 mp->m_sb.sb_qflags = 0; 1345 mp->m_sb.sb_qflags = 0;
1334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1335 cmn_err(CE_NOTE,
1336 "Old superblock version %x, converting to %x.",
1337 oldv, mp->m_sb.sb_versionnum);
1338#endif
1339 } 1346 }
1340 if (flags & XFS_QMOPT_UQUOTA) 1347 if (flags & XFS_QMOPT_UQUOTA)
1341 mp->m_sb.sb_uquotino = (*ip)->i_ino; 1348 mp->m_sb.sb_uquotino = (*ip)->i_ino;
@@ -1371,10 +1378,10 @@ xfs_qm_reset_dqcounts(
1371#ifdef DEBUG 1378#ifdef DEBUG
1372 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); 1379 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
1373 do_div(j, sizeof(xfs_dqblk_t)); 1380 do_div(j, sizeof(xfs_dqblk_t));
1374 ASSERT(XFS_QM_DQPERBLK(mp) == j); 1381 ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
1375#endif 1382#endif
1376 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); 1383 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
1377 for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { 1384 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
1378 /* 1385 /*
1379 * Do a sanity check, and if needed, repair the dqblk. Don't 1386 * Do a sanity check, and if needed, repair the dqblk. Don't
1380 * output any warnings because it's perfectly possible to 1387 * output any warnings because it's perfectly possible to
@@ -1429,7 +1436,7 @@ xfs_qm_dqiter_bufs(
1429 while (blkcnt--) { 1436 while (blkcnt--) {
1430 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 1437 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1431 XFS_FSB_TO_DADDR(mp, bno), 1438 XFS_FSB_TO_DADDR(mp, bno),
1432 (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); 1439 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1433 if (error) 1440 if (error)
1434 break; 1441 break;
1435 1442
@@ -1439,7 +1446,7 @@ xfs_qm_dqiter_bufs(
1439 * goto the next block. 1446 * goto the next block.
1440 */ 1447 */
1441 bno++; 1448 bno++;
1442 firstid += XFS_QM_DQPERBLK(mp); 1449 firstid += mp->m_quotainfo->qi_dqperchunk;
1443 } 1450 }
1444 return error; 1451 return error;
1445} 1452}
@@ -1505,7 +1512,7 @@ xfs_qm_dqiterate(
1505 continue; 1512 continue;
1506 1513
1507 firstid = (xfs_dqid_t) map[i].br_startoff * 1514 firstid = (xfs_dqid_t) map[i].br_startoff *
1508 XFS_QM_DQPERBLK(mp); 1515 mp->m_quotainfo->qi_dqperchunk;
1509 /* 1516 /*
1510 * Do a read-ahead on the next extent. 1517 * Do a read-ahead on the next extent.
1511 */ 1518 */
@@ -1516,7 +1523,7 @@ xfs_qm_dqiterate(
1516 while (rablkcnt--) { 1523 while (rablkcnt--) {
1517 xfs_baread(mp->m_ddev_targp, 1524 xfs_baread(mp->m_ddev_targp,
1518 XFS_FSB_TO_DADDR(mp, rablkno), 1525 XFS_FSB_TO_DADDR(mp, rablkno),
1519 (int)XFS_QI_DQCHUNKLEN(mp)); 1526 mp->m_quotainfo->qi_dqchunklen);
1520 rablkno++; 1527 rablkno++;
1521 } 1528 }
1522 } 1529 }
@@ -1576,8 +1583,10 @@ xfs_qm_quotacheck_dqadjust(
1576 1583
1577 /* 1584 /*
1578 * Set default limits, adjust timers (since we changed usages) 1585 * Set default limits, adjust timers (since we changed usages)
1586 *
1587 * There are no timers for the default values set in the root dquot.
1579 */ 1588 */
1580 if (! XFS_IS_SUSER_DQUOT(dqp)) { 1589 if (dqp->q_core.d_id) {
1581 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1590 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
1582 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1591 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
1583 } 1592 }
@@ -1747,14 +1756,14 @@ xfs_qm_quotacheck(
1747 lastino = 0; 1756 lastino = 0;
1748 flags = 0; 1757 flags = 0;
1749 1758
1750 ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); 1759 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
1751 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1760 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1752 1761
1753 /* 1762 /*
1754 * There should be no cached dquots. The (simplistic) quotacheck 1763 * There should be no cached dquots. The (simplistic) quotacheck
1755 * algorithm doesn't like that. 1764 * algorithm doesn't like that.
1756 */ 1765 */
1757 ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); 1766 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1758 1767
1759 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1768 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
1760 1769
@@ -1763,15 +1772,19 @@ xfs_qm_quotacheck(
1763 * their counters to zero. We need a clean slate. 1772 * their counters to zero. We need a clean slate.
1764 * We don't log our changes till later. 1773 * We don't log our changes till later.
1765 */ 1774 */
1766 if ((uip = XFS_QI_UQIP(mp))) { 1775 uip = mp->m_quotainfo->qi_uquotaip;
1767 if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) 1776 if (uip) {
1777 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
1778 if (error)
1768 goto error_return; 1779 goto error_return;
1769 flags |= XFS_UQUOTA_CHKD; 1780 flags |= XFS_UQUOTA_CHKD;
1770 } 1781 }
1771 1782
1772 if ((gip = XFS_QI_GQIP(mp))) { 1783 gip = mp->m_quotainfo->qi_gquotaip;
1773 if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1784 if (gip) {
1774 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) 1785 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
1786 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
1787 if (error)
1775 goto error_return; 1788 goto error_return;
1776 flags |= XFS_OQUOTA_CHKD; 1789 flags |= XFS_OQUOTA_CHKD;
1777 } 1790 }
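[Editor's note: the quotacheck hunk above also unpicks "if ((error = xfs_qm_dqiterate(...)))" into a plain assignment followed by a separate test, the form kernel style prefers. A tiny standalone illustration, with a hypothetical helper:]

#include <stdio.h>

static int dqiterate(void) { return 0; }        /* 0 == success, kernel-style */

int main(void)
{
        int error;

        /* Old form folded the assignment into the condition:
         *      if ((error = dqiterate()))
         *              goto error_return;
         * The rewrite separates the two steps: */
        error = dqiterate();
        if (error)
                goto error_return;

        printf("quotacheck ok\n");
error_return:
        return error;
}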
@@ -1804,7 +1817,7 @@ xfs_qm_quotacheck(
1804 * at this point (because we intentionally didn't in dqget_noattach). 1817 * at this point (because we intentionally didn't in dqget_noattach).
1805 */ 1818 */
1806 if (error) { 1819 if (error) {
1807 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); 1820 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
1808 goto error_return; 1821 goto error_return;
1809 } 1822 }
1810 1823
@@ -1825,7 +1838,7 @@ xfs_qm_quotacheck(
1825 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); 1838 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
1826 mp->m_qflags |= flags; 1839 mp->m_qflags |= flags;
1827 1840
1828 XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); 1841 xfs_qm_dquot_list_print(mp);
1829 1842
1830 error_return: 1843 error_return:
1831 if (error) { 1844 if (error) {
@@ -1920,59 +1933,53 @@ xfs_qm_init_quotainos(
1920 } 1933 }
1921 } 1934 }
1922 1935
1923 XFS_QI_UQIP(mp) = uip; 1936 mp->m_quotainfo->qi_uquotaip = uip;
1924 XFS_QI_GQIP(mp) = gip; 1937 mp->m_quotainfo->qi_gquotaip = gip;
1925 1938
1926 return 0; 1939 return 0;
1927} 1940}
1928 1941
1929 1942
1943
1930/* 1944/*
1931 * Traverse the freelist of dquots and attempt to reclaim a maximum of 1945 * Just pop the least recently used dquot off the freelist and
1932 * 'howmany' dquots. This operation races with dqlookup(), and attempts to 1946 * recycle it. The returned dquot is locked.
1933 * favor the lookup function ...
1934 * XXXsup merge this with qm_reclaim_one().
1935 */ 1947 */
1936STATIC int 1948STATIC xfs_dquot_t *
1937xfs_qm_shake_freelist( 1949xfs_qm_dqreclaim_one(void)
1938 int howmany)
1939{ 1950{
1940 int nreclaimed; 1951 xfs_dquot_t *dqpout;
1941 xfs_dqhash_t *hash; 1952 xfs_dquot_t *dqp;
1942 xfs_dquot_t *dqp, *nextdqp;
1943 int restarts; 1953 int restarts;
1944 int nflushes;
1945
1946 if (howmany <= 0)
1947 return 0;
1948 1954
1949 nreclaimed = 0;
1950 restarts = 0; 1955 restarts = 0;
1951 nflushes = 0; 1956 dqpout = NULL;
1952 1957
1953#ifdef QUOTADEBUG 1958 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1954 cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); 1959startagain:
1955#endif 1960 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1956 /* lock order is : hashchainlock, freelistlock, mplistlock */
1957 tryagain:
1958 xfs_qm_freelist_lock(xfs_Gqm);
1959 1961
1960 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 1962 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
1961 ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && 1963 struct xfs_mount *mp = dqp->q_mount;
1962 nreclaimed < howmany); ) {
1963 xfs_dqlock(dqp); 1964 xfs_dqlock(dqp);
1964 1965
1965 /* 1966 /*
1966 * We are racing with dqlookup here. Naturally we don't 1967 * We are racing with dqlookup here. Naturally we don't
1967 * want to reclaim a dquot that lookup wants. 1968 * want to reclaim a dquot that lookup wants. We release the
1969 * freelist lock and start over, so that lookup will grab
1970 * both the dquot and the freelistlock.
1968 */ 1971 */
1969 if (dqp->dq_flags & XFS_DQ_WANT) { 1972 if (dqp->dq_flags & XFS_DQ_WANT) {
1973 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1974
1975 trace_xfs_dqreclaim_want(dqp);
1976
1970 xfs_dqunlock(dqp); 1977 xfs_dqunlock(dqp);
1971 xfs_qm_freelist_unlock(xfs_Gqm); 1978 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1972 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1979 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1973 return nreclaimed; 1980 return NULL;
1974 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1981 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1975 goto tryagain; 1982 goto startagain;
1976 } 1983 }
1977 1984
1978 /* 1985 /*
@@ -1981,23 +1988,27 @@ xfs_qm_shake_freelist(
1981 * life easier. 1988 * life easier.
1982 */ 1989 */
1983 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 1990 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
1984 ASSERT(dqp->q_mount == NULL); 1991 ASSERT(mp == NULL);
1985 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 1992 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
1986 ASSERT(dqp->HL_PREVP == NULL); 1993 ASSERT(list_empty(&dqp->q_hashlist));
1987 ASSERT(dqp->MPL_PREVP == NULL); 1994 ASSERT(list_empty(&dqp->q_mplist));
1995 list_del_init(&dqp->q_freelist);
1996 xfs_Gqm->qm_dqfrlist_cnt--;
1997 xfs_dqunlock(dqp);
1998 dqpout = dqp;
1988 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1999 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1989 nextdqp = dqp->dq_flnext; 2000 break;
1990 goto off_freelist;
1991 } 2001 }
1992 2002
1993 ASSERT(dqp->MPL_PREVP); 2003 ASSERT(dqp->q_hash);
2004 ASSERT(!list_empty(&dqp->q_mplist));
2005
1994 /* 2006 /*
1995 * Try to grab the flush lock. If this dquot is in the process of 2007 * Try to grab the flush lock. If this dquot is in the process of
1996 * getting flushed to disk, we don't want to reclaim it. 2008 * getting flushed to disk, we don't want to reclaim it.
1997 */ 2009 */
1998 if (!xfs_dqflock_nowait(dqp)) { 2010 if (!xfs_dqflock_nowait(dqp)) {
1999 xfs_dqunlock(dqp); 2011 xfs_dqunlock(dqp);
2000 dqp = dqp->dq_flnext;
2001 continue; 2012 continue;
2002 } 2013 }
2003 2014
@@ -2010,21 +2021,21 @@ xfs_qm_shake_freelist(
2010 if (XFS_DQ_IS_DIRTY(dqp)) { 2021 if (XFS_DQ_IS_DIRTY(dqp)) {
2011 int error; 2022 int error;
2012 2023
2013 trace_xfs_dqshake_dirty(dqp); 2024 trace_xfs_dqreclaim_dirty(dqp);
2014 2025
2015 /* 2026 /*
2016 * We flush it delayed write, so don't bother 2027 * We flush it delayed write, so don't bother
2017 * releasing the mplock. 2028 * releasing the freelist lock.
2018 */ 2029 */
2019 error = xfs_qm_dqflush(dqp, 0); 2030 error = xfs_qm_dqflush(dqp, 0);
2020 if (error) { 2031 if (error) {
2021 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2032 xfs_fs_cmn_err(CE_WARN, mp,
2022 "xfs_qm_dqflush_all: dquot %p flush failed", dqp); 2033 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2023 } 2034 }
2024 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 2035 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2025 dqp = dqp->dq_flnext;
2026 continue; 2036 continue;
2027 } 2037 }
2038
2028 /* 2039 /*
2029 * We're trying to get the hashlock out of order. This races 2040 * We're trying to get the hashlock out of order. This races
2030 * with dqlookup; so, we giveup and goto the next dquot if 2041 * with dqlookup; so, we giveup and goto the next dquot if
@@ -2033,56 +2044,74 @@ xfs_qm_shake_freelist(
2033 * waiting for the freelist lock. 2044 * waiting for the freelist lock.
2034 */ 2045 */
2035 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 2046 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
2036 xfs_dqfunlock(dqp); 2047 restarts++;
2037 xfs_dqunlock(dqp); 2048 goto dqfunlock;
2038 dqp = dqp->dq_flnext;
2039 continue;
2040 } 2049 }
2050
2041 /* 2051 /*
2042 * This races with dquot allocation code as well as dqflush_all 2052 * This races with dquot allocation code as well as dqflush_all
2043 * and reclaim code. So, if we failed to grab the mplist lock, 2053 * and reclaim code. So, if we failed to grab the mplist lock,
2044 * giveup everything and start over. 2054 * giveup everything and start over.
2045 */ 2055 */
2046 hash = dqp->q_hash; 2056 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
2047 ASSERT(hash); 2057 restarts++;
2048 if (! xfs_qm_mplist_nowait(dqp->q_mount)) { 2058 mutex_unlock(&dqp->q_hash->qh_lock);
2049 /* XXX put a sentinel so that we can come back here */
2050 xfs_dqfunlock(dqp); 2059 xfs_dqfunlock(dqp);
2051 xfs_dqunlock(dqp); 2060 xfs_dqunlock(dqp);
2052 mutex_unlock(&hash->qh_lock); 2061 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2053 xfs_qm_freelist_unlock(xfs_Gqm); 2062 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
2054 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2063 return NULL;
2055 return nreclaimed; 2064 goto startagain;
2056 goto tryagain;
2057 } 2065 }
2058 2066
2059 trace_xfs_dqshake_unlink(dqp);
2060
2061#ifdef QUOTADEBUG
2062 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
2063 dqp, be32_to_cpu(dqp->q_core.d_id));
2064#endif
2065 ASSERT(dqp->q_nrefs == 0); 2067 ASSERT(dqp->q_nrefs == 0);
2066 nextdqp = dqp->dq_flnext; 2068 list_del_init(&dqp->q_mplist);
2067 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); 2069 mp->m_quotainfo->qi_dquots--;
2068 XQM_HASHLIST_REMOVE(hash, dqp); 2070 mp->m_quotainfo->qi_dqreclaims++;
2071 list_del_init(&dqp->q_hashlist);
2072 dqp->q_hash->qh_version++;
2073 list_del_init(&dqp->q_freelist);
2074 xfs_Gqm->qm_dqfrlist_cnt--;
2075 dqpout = dqp;
2076 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
2077 mutex_unlock(&dqp->q_hash->qh_lock);
2078dqfunlock:
2069 xfs_dqfunlock(dqp); 2079 xfs_dqfunlock(dqp);
2070 xfs_qm_mplist_unlock(dqp->q_mount);
2071 mutex_unlock(&hash->qh_lock);
2072
2073 off_freelist:
2074 XQM_FREELIST_REMOVE(dqp);
2075 xfs_dqunlock(dqp); 2080 xfs_dqunlock(dqp);
2076 nreclaimed++; 2081 if (dqpout)
2077 XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); 2082 break;
2083 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2084 return NULL;
2085 }
2086 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2087 return dqpout;
2088}
2089
2090/*
2091 * Traverse the freelist of dquots and attempt to reclaim a maximum of
2092 * 'howmany' dquots. This operation races with dqlookup(), and attempts to
2093 * favor the lookup function ...
2094 */
2095STATIC int
2096xfs_qm_shake_freelist(
2097 int howmany)
2098{
2099 int nreclaimed = 0;
2100 xfs_dquot_t *dqp;
2101
2102 if (howmany <= 0)
2103 return 0;
2104
2105 while (nreclaimed < howmany) {
2106 dqp = xfs_qm_dqreclaim_one();
2107 if (!dqp)
2108 return nreclaimed;
2078 xfs_qm_dqdestroy(dqp); 2109 xfs_qm_dqdestroy(dqp);
2079 dqp = nextdqp; 2110 nreclaimed++;
2080 } 2111 }
2081 xfs_qm_freelist_unlock(xfs_Gqm);
2082 return nreclaimed; 2112 return nreclaimed;
2083} 2113}
2084 2114
2085
2086/* 2115/*
2087 * The kmem_shake interface is invoked when memory is running low. 2116 * The kmem_shake interface is invoked when memory is running low.
2088 */ 2117 */
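[Editor's note: the reclaim rework leaves a single xfs_qm_dqreclaim_one() that pops one least-recently-used dquot off the freelist and shrinks xfs_qm_shake_freelist() to a counting loop around it; the block removed further down is the old duplicated version. A toy sketch of the helper-plus-loop split, where a bare singly-linked stack stands in for the locked freelist:]

#include <stdio.h>
#include <stdlib.h>

struct dquot { struct dquot *next; };
static struct dquot *freelist;

/* Pop one entry, or NULL if nothing is reclaimable. */
static struct dquot *dqreclaim_one(void)
{
        struct dquot *dqp = freelist;

        if (dqp)
                freelist = dqp->next;
        return dqp;
}

/* The shaker is reduced to a bounded loop around the helper. */
static int shake_freelist(int howmany)
{
        int nreclaimed = 0;

        while (nreclaimed < howmany) {
                struct dquot *dqp = dqreclaim_one();

                if (!dqp)
                        break;
                free(dqp);      /* stands in for xfs_qm_dqdestroy() */
                nreclaimed++;
        }
        return nreclaimed;
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct dquot *dqp = malloc(sizeof(*dqp));

                dqp->next = freelist;
                freelist = dqp;
        }
        printf("reclaimed %d\n", shake_freelist(8));
        return 0;
}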
@@ -2097,7 +2126,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2097 if (!xfs_Gqm) 2126 if (!xfs_Gqm)
2098 return 0; 2127 return 0;
2099 2128
2100 nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ 2129 nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
2101 /* incore dquots in all f/s's */ 2130 /* incore dquots in all f/s's */
2102 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; 2131 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
2103 2132
@@ -2113,131 +2142,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2113} 2142}
2114 2143
2115 2144
2116/*
2117 * Just pop the least recently used dquot off the freelist and
2118 * recycle it. The returned dquot is locked.
2119 */
2120STATIC xfs_dquot_t *
2121xfs_qm_dqreclaim_one(void)
2122{
2123 xfs_dquot_t *dqpout;
2124 xfs_dquot_t *dqp;
2125 int restarts;
2126 int nflushes;
2127
2128 restarts = 0;
2129 dqpout = NULL;
2130 nflushes = 0;
2131
2132 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
2133 startagain:
2134 xfs_qm_freelist_lock(xfs_Gqm);
2135
2136 FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
2137 xfs_dqlock(dqp);
2138
2139 /*
2140 * We are racing with dqlookup here. Naturally we don't
2141 * want to reclaim a dquot that lookup wants. We release the
2142 * freelist lock and start over, so that lookup will grab
2143 * both the dquot and the freelistlock.
2144 */
2145 if (dqp->dq_flags & XFS_DQ_WANT) {
2146 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
2147
2148 trace_xfs_dqreclaim_want(dqp);
2149
2150 xfs_dqunlock(dqp);
2151 xfs_qm_freelist_unlock(xfs_Gqm);
2152 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2153 return NULL;
2154 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2155 goto startagain;
2156 }
2157
2158 /*
2159 * If the dquot is inactive, we are assured that it is
2160 * not on the mplist or the hashlist, and that makes our
2161 * life easier.
2162 */
2163 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
2164 ASSERT(dqp->q_mount == NULL);
2165 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
2166 ASSERT(dqp->HL_PREVP == NULL);
2167 ASSERT(dqp->MPL_PREVP == NULL);
2168 XQM_FREELIST_REMOVE(dqp);
2169 xfs_dqunlock(dqp);
2170 dqpout = dqp;
2171 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
2172 break;
2173 }
2174
2175 ASSERT(dqp->q_hash);
2176 ASSERT(dqp->MPL_PREVP);
2177
2178 /*
2179 * Try to grab the flush lock. If this dquot is in the process of
2180 * getting flushed to disk, we don't want to reclaim it.
2181 */
2182 if (!xfs_dqflock_nowait(dqp)) {
2183 xfs_dqunlock(dqp);
2184 continue;
2185 }
2186
2187 /*
2188 * We have the flush lock so we know that this is not in the
2189 * process of being flushed. So, if this is dirty, flush it
2190 * DELWRI so that we don't get a freelist infested with
2191 * dirty dquots.
2192 */
2193 if (XFS_DQ_IS_DIRTY(dqp)) {
2194 int error;
2195
2196 trace_xfs_dqreclaim_dirty(dqp);
2197
2198 /*
2199 * We flush it delayed write, so don't bother
2200 * releasing the freelist lock.
2201 */
2202 error = xfs_qm_dqflush(dqp, 0);
2203 if (error) {
2204 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2205 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2206 }
2207 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2208 continue;
2209 }
2210
2211 if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
2212 xfs_dqfunlock(dqp);
2213 xfs_dqunlock(dqp);
2214 continue;
2215 }
2216
2217 if (!mutex_trylock(&dqp->q_hash->qh_lock))
2218 goto mplistunlock;
2219
2220 trace_xfs_dqreclaim_unlink(dqp);
2221
2222 ASSERT(dqp->q_nrefs == 0);
2223 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2224 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2225 XQM_FREELIST_REMOVE(dqp);
2226 dqpout = dqp;
2227 mutex_unlock(&dqp->q_hash->qh_lock);
2228 mplistunlock:
2229 xfs_qm_mplist_unlock(dqp->q_mount);
2230 xfs_dqfunlock(dqp);
2231 xfs_dqunlock(dqp);
2232 if (dqpout)
2233 break;
2234 }
2235
2236 xfs_qm_freelist_unlock(xfs_Gqm);
2237 return dqpout;
2238}
2239
2240
2241/*------------------------------------------------------------------*/ 2145/*------------------------------------------------------------------*/
2242 2146
2243/* 2147/*
@@ -2662,66 +2566,3 @@ xfs_qm_vop_create_dqattach(
2662 } 2566 }
2663} 2567}
2664 2568
2665/* ------------- list stuff -----------------*/
2666STATIC void
2667xfs_qm_freelist_init(xfs_frlist_t *ql)
2668{
2669 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
2670 mutex_init(&ql->qh_lock);
2671 ql->qh_version = 0;
2672 ql->qh_nelems = 0;
2673}
2674
2675STATIC void
2676xfs_qm_freelist_destroy(xfs_frlist_t *ql)
2677{
2678 xfs_dquot_t *dqp, *nextdqp;
2679
2680 mutex_lock(&ql->qh_lock);
2681 for (dqp = ql->qh_next;
2682 dqp != (xfs_dquot_t *)ql; ) {
2683 xfs_dqlock(dqp);
2684 nextdqp = dqp->dq_flnext;
2685#ifdef QUOTADEBUG
2686 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
2687#endif
2688 XQM_FREELIST_REMOVE(dqp);
2689 xfs_dqunlock(dqp);
2690 xfs_qm_dqdestroy(dqp);
2691 dqp = nextdqp;
2692 }
2693 mutex_unlock(&ql->qh_lock);
2694 mutex_destroy(&ql->qh_lock);
2695
2696 ASSERT(ql->qh_nelems == 0);
2697}
2698
2699STATIC void
2700xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
2701{
2702 dq->dq_flnext = ql->qh_next;
2703 dq->dq_flprev = (xfs_dquot_t *)ql;
2704 ql->qh_next = dq;
2705 dq->dq_flnext->dq_flprev = dq;
2706 xfs_Gqm->qm_dqfreelist.qh_nelems++;
2707 xfs_Gqm->qm_dqfreelist.qh_version++;
2708}
2709
2710void
2711xfs_qm_freelist_unlink(xfs_dquot_t *dq)
2712{
2713 xfs_dquot_t *next = dq->dq_flnext;
2714 xfs_dquot_t *prev = dq->dq_flprev;
2715
2716 next->dq_flprev = prev;
2717 prev->dq_flnext = next;
2718 dq->dq_flnext = dq->dq_flprev = dq;
2719 xfs_Gqm->qm_dqfreelist.qh_nelems--;
2720 xfs_Gqm->qm_dqfreelist.qh_version++;
2721}
2722
2723void
2724xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2725{
2726 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2727}
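[Editor's note: the block deleted above is a hand-rolled circular list whose head is cast to a fake xfs_dquot_t so the sentinel can pose as a node. list.h makes the sentinel an explicit struct list_head, so insert and unlink collapse to list_add()/list_del_init() with no casts. A compilable sketch of the equivalent helpers, simplified from <linux/list.h>:]

#include <assert.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *n, struct list_head *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}

/* What xfs_qm_freelist_unlink() open-coded: splice the node out,
 * then point it at itself so list_empty(node) is true afterwards. */
static void list_del_init(struct list_head *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        n->next = n->prev = n;
}

static int list_empty(const struct list_head *h) { return h->next == h; }

int main(void)
{
        struct list_head head, node;

        INIT_LIST_HEAD(&head);
        list_add(&node, &head);
        assert(!list_empty(&head));
        list_del_init(&node);
        assert(list_empty(&head) && list_empty(&node));
        return 0;
}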
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 495564b8af38..c9446f1c726d 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone;
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 73
74typedef xfs_dqhash_t xfs_dqlist_t; 74typedef xfs_dqhash_t xfs_dqlist_t;
75/*
76 * The freelist head. The first two fields match the first two in the
77 * xfs_dquot_t structure (in xfs_dqmarker_t)
78 */
79typedef struct xfs_frlist {
80 struct xfs_dquot *qh_next;
81 struct xfs_dquot *qh_prev;
82 struct mutex qh_lock;
83 uint qh_version;
84 uint qh_nelems;
85} xfs_frlist_t;
86 75
87/* 76/*
88 * Quota Manager (global) structure. Lives only in core. 77 * Quota Manager (global) structure. Lives only in core.
@@ -91,7 +80,9 @@ typedef struct xfs_qm {
91 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ 80 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
92 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ 81 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
93 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ 82 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
94 xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ 83 struct list_head qm_dqfrlist; /* freelist of dquots */
84 struct mutex qm_dqfrlist_lock;
85 int qm_dqfrlist_cnt;
95 atomic_t qm_totaldquots; /* total incore dquots */ 86 atomic_t qm_totaldquots; /* total incore dquots */
96 uint qm_nrefs; /* file systems with quota on */ 87 uint qm_nrefs; /* file systems with quota on */
97 int qm_dqfree_ratio;/* ratio of free to inuse dquots */ 88 int qm_dqfree_ratio;/* ratio of free to inuse dquots */
@@ -106,7 +97,9 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 97typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 98 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 99 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 100 struct list_head qi_dqlist; /* all dquots in filesys */
101 struct mutex qi_dqlist_lock;
102 int qi_dquots;
110 int qi_dqreclaims; /* a change here indicates 103 int qi_dqreclaims; /* a change here indicates
111 a removal in the dqlist */ 104 a removal in the dqlist */
112 time_t qi_btimelimit; /* limit for blks timer */ 105 time_t qi_btimelimit; /* limit for blks timer */
@@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
175extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 168extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
176extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 169extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
177 170
178/* list stuff */
179extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
180extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
181
182#ifdef DEBUG 171#ifdef DEBUG
183extern int xfs_qm_internalqcheck(xfs_mount_t *); 172extern int xfs_qm_internalqcheck(xfs_mount_t *);
184#else 173#else
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 83e7ea3e25fa..3d1fc79532e2 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v)
55 ndquot, 55 ndquot,
56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 58 xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
59 return 0; 59 return 0;
60} 60}
61 61
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 5d0ee8d492db..92b002f1805f 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff(
79 xfs_mount_t *mp, 79 xfs_mount_t *mp,
80 uint flags) 80 uint flags)
81{ 81{
82 struct xfs_quotainfo *q = mp->m_quotainfo;
82 uint dqtype; 83 uint dqtype;
83 int error; 84 int error;
84 uint inactivate_flags; 85 uint inactivate_flags;
@@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff(
102 * critical thing. 103 * critical thing.
103 * If quotaoff, then we must be dealing with the root filesystem. 104 * If quotaoff, then we must be dealing with the root filesystem.
104 */ 105 */
105 ASSERT(mp->m_quotainfo); 106 ASSERT(q);
106 if (mp->m_quotainfo) 107 mutex_lock(&q->qi_quotaofflock);
107 mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
108
109 ASSERT(mp->m_quotainfo);
110 108
111 /* 109 /*
112 * If we're just turning off quota enforcement, change mp and go. 110 * If we're just turning off quota enforcement, change mp and go.
@@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff(
117 spin_lock(&mp->m_sb_lock); 115 spin_lock(&mp->m_sb_lock);
118 mp->m_sb.sb_qflags = mp->m_qflags; 116 mp->m_sb.sb_qflags = mp->m_qflags;
119 spin_unlock(&mp->m_sb_lock); 117 spin_unlock(&mp->m_sb_lock);
120 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 118 mutex_unlock(&q->qi_quotaofflock);
121 119
122 /* XXX what to do if error ? Revert back to old vals incore ? */ 120 /* XXX what to do if error ? Revert back to old vals incore ? */
123 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); 121 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
@@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff(
150 * Nothing to do? Don't complain. This happens when we're just 148 * Nothing to do? Don't complain. This happens when we're just
151 * turning off quota enforcement. 149 * turning off quota enforcement.
152 */ 150 */
153 if ((mp->m_qflags & flags) == 0) { 151 if ((mp->m_qflags & flags) == 0)
154 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 152 goto out_unlock;
155 return (0);
156 }
157 153
158 /* 154 /*
159 * Write the LI_QUOTAOFF log record, and do SB changes atomically, 155 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
@@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff(
162 */ 158 */
163 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); 159 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
164 if (error) 160 if (error)
165 goto out_error; 161 goto out_unlock;
166 162
167 /* 163 /*
168 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct 164 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff(
204 * So, if we couldn't purge all the dquots from the filesystem, 200 * So, if we couldn't purge all the dquots from the filesystem,
205 * we can't get rid of the incore data structures. 201 * we can't get rid of the incore data structures.
206 */ 202 */
207 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) 203 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
208 delay(10 * nculprits); 204 delay(10 * nculprits);
209 205
210 /* 206 /*
@@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff(
222 if (error) { 218 if (error) {
223 /* We're screwed now. Shutdown is the only option. */ 219 /* We're screwed now. Shutdown is the only option. */
224 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 220 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
225 goto out_error; 221 goto out_unlock;
226 } 222 }
227 223
228 /* 224 /*
@@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff(
230 */ 226 */
231 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || 227 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
232 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { 228 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
233 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 229 mutex_unlock(&q->qi_quotaofflock);
234 xfs_qm_destroy_quotainfo(mp); 230 xfs_qm_destroy_quotainfo(mp);
235 return (0); 231 return (0);
236 } 232 }
237 233
238 /* 234 /*
239 * Release our quotainode references, and vn_purge them, 235 * Release our quotainode references if we don't need them anymore.
240 * if we don't need them anymore.
241 */ 236 */
242 if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { 237 if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
243 IRELE(XFS_QI_UQIP(mp)); 238 IRELE(q->qi_uquotaip);
244 XFS_QI_UQIP(mp) = NULL; 239 q->qi_uquotaip = NULL;
245 } 240 }
246 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { 241 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
247 IRELE(XFS_QI_GQIP(mp)); 242 IRELE(q->qi_gquotaip);
248 XFS_QI_GQIP(mp) = NULL; 243 q->qi_gquotaip = NULL;
249 } 244 }
250out_error:
251 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
252 245
253 return (error); 246out_unlock:
247 mutex_unlock(&q->qi_quotaofflock);
248 return error;
254} 249}
255 250
256int 251int
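[Editor's note: xfs_qm_scall_quotaoff() used to release qi_quotaofflock at several scattered returns, one of them through the misleadingly named out_error label; the rewrite funnels every exit, success or failure, through a single out_unlock label. A minimal sketch of that single-exit locking shape, with a pthread mutex as a stand-in:]

#include <pthread.h>

static pthread_mutex_t qi_quotaofflock = PTHREAD_MUTEX_INITIALIZER;

static int quotaoff(int flags)
{
        int error = 0;

        pthread_mutex_lock(&qi_quotaofflock);

        if (flags == 0)                 /* nothing to do: still one exit */
                goto out_unlock;
        if (flags < 0) {                /* a failure path */
                error = -1;
                goto out_unlock;
        }
        /* ... the actual quota-off work ... */

out_unlock:
        pthread_mutex_unlock(&qi_quotaofflock); /* the only unlock site */
        return error;
}

int main(void)
{
        return quotaoff(1);
}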
@@ -379,9 +374,9 @@ xfs_qm_scall_quotaon(
379 /* 374 /*
380 * Switch on quota enforcement in core. 375 * Switch on quota enforcement in core.
381 */ 376 */
382 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 377 mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
383 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); 378 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
384 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 379 mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
385 380
386 return (0); 381 return (0);
387} 382}
@@ -392,11 +387,12 @@ xfs_qm_scall_quotaon(
392 */ 387 */
393int 388int
394xfs_qm_scall_getqstat( 389xfs_qm_scall_getqstat(
395 xfs_mount_t *mp, 390 struct xfs_mount *mp,
396 fs_quota_stat_t *out) 391 struct fs_quota_stat *out)
397{ 392{
398 xfs_inode_t *uip, *gip; 393 struct xfs_quotainfo *q = mp->m_quotainfo;
399 boolean_t tempuqip, tempgqip; 394 struct xfs_inode *uip, *gip;
395 boolean_t tempuqip, tempgqip;
400 396
401 uip = gip = NULL; 397 uip = gip = NULL;
402 tempuqip = tempgqip = B_FALSE; 398 tempuqip = tempgqip = B_FALSE;
@@ -415,9 +411,9 @@ xfs_qm_scall_getqstat(
415 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; 411 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
416 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 412 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
417 413
418 if (mp->m_quotainfo) { 414 if (q) {
419 uip = mp->m_quotainfo->qi_uquotaip; 415 uip = q->qi_uquotaip;
420 gip = mp->m_quotainfo->qi_gquotaip; 416 gip = q->qi_gquotaip;
421 } 417 }
422 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 418 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
423 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 419 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -441,17 +437,20 @@ xfs_qm_scall_getqstat(
441 if (tempgqip) 437 if (tempgqip)
442 IRELE(gip); 438 IRELE(gip);
443 } 439 }
444 if (mp->m_quotainfo) { 440 if (q) {
445 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); 441 out->qs_incoredqs = q->qi_dquots;
446 out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); 442 out->qs_btimelimit = q->qi_btimelimit;
447 out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); 443 out->qs_itimelimit = q->qi_itimelimit;
448 out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); 444 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
449 out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); 445 out->qs_bwarnlimit = q->qi_bwarnlimit;
450 out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); 446 out->qs_iwarnlimit = q->qi_iwarnlimit;
451 } 447 }
452 return (0); 448 return 0;
453} 449}
454 450
451#define XFS_DQ_MASK \
452 (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
453
455/* 454/*
456 * Adjust quota limits, and start/stop timers accordingly. 455 * Adjust quota limits, and start/stop timers accordingly.
457 */ 456 */
@@ -462,15 +461,17 @@ xfs_qm_scall_setqlim(
462 uint type, 461 uint type,
463 fs_disk_quota_t *newlim) 462 fs_disk_quota_t *newlim)
464{ 463{
464 struct xfs_quotainfo *q = mp->m_quotainfo;
465 xfs_disk_dquot_t *ddq; 465 xfs_disk_dquot_t *ddq;
466 xfs_dquot_t *dqp; 466 xfs_dquot_t *dqp;
467 xfs_trans_t *tp; 467 xfs_trans_t *tp;
468 int error; 468 int error;
469 xfs_qcnt_t hard, soft; 469 xfs_qcnt_t hard, soft;
470 470
471 if ((newlim->d_fieldmask & 471 if (newlim->d_fieldmask & ~XFS_DQ_MASK)
472 (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) 472 return EINVAL;
473 return (0); 473 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
474 return 0;
474 475
475 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 476 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
476 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, 477 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
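[Editor's note: the new XFS_DQ_MASK check above splits what the old single test conflated: a fieldmask carrying bits setqlim doesn't understand is rejected with EINVAL (positive errno, XFS convention), while a mask carrying none of the known bits remains a successful no-op. A standalone sketch of that validation; the mask values here are made up, not the real fs_quota header constants:]

#include <errno.h>
#include <stdio.h>

#define FS_DQ_LIMIT_MASK        0x1u
#define FS_DQ_TIMER_MASK        0x2u
#define FS_DQ_WARNS_MASK        0x4u
#define XFS_DQ_MASK \
        (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)

static int setqlim(unsigned int fieldmask)
{
        if (fieldmask & ~XFS_DQ_MASK)
                return EINVAL;  /* bits we don't understand: refuse */
        if ((fieldmask & XFS_DQ_MASK) == 0)
                return 0;       /* nothing requested: succeed as a no-op */
        /* ... apply the requested limit/timer/warning changes ... */
        return 0;
}

int main(void)
{
        printf("%d %d %d\n",
               setqlim(0x80),                   /* unknown bit -> EINVAL */
               setqlim(0),                      /* empty mask  -> 0 */
               setqlim(FS_DQ_TIMER_MASK));      /* valid bit   -> 0 */
        return 0;
}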
@@ -485,7 +486,7 @@ xfs_qm_scall_setqlim(
485 * a quotaoff from happening). (XXXThis doesn't currently happen 486 * a quotaoff from happening). (XXXThis doesn't currently happen
486 * because we take the vfslock before calling xfs_qm_sysent). 487 * because we take the vfslock before calling xfs_qm_sysent).
487 */ 488 */
488 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 489 mutex_lock(&q->qi_quotaofflock);
489 490
490 /* 491 /*
491 * Get the dquot (locked), and join it to the transaction. 492 * Get the dquot (locked), and join it to the transaction.
@@ -493,9 +494,8 @@ xfs_qm_scall_setqlim(
493 */ 494 */
494 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { 495 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
495 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 496 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
496 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
497 ASSERT(error != ENOENT); 497 ASSERT(error != ENOENT);
498 return (error); 498 goto out_unlock;
499 } 499 }
500 xfs_trans_dqjoin(tp, dqp); 500 xfs_trans_dqjoin(tp, dqp);
501 ddq = &dqp->q_core; 501 ddq = &dqp->q_core;
@@ -513,8 +513,8 @@ xfs_qm_scall_setqlim(
513 ddq->d_blk_hardlimit = cpu_to_be64(hard); 513 ddq->d_blk_hardlimit = cpu_to_be64(hard);
514 ddq->d_blk_softlimit = cpu_to_be64(soft); 514 ddq->d_blk_softlimit = cpu_to_be64(soft);
515 if (id == 0) { 515 if (id == 0) {
516 mp->m_quotainfo->qi_bhardlimit = hard; 516 q->qi_bhardlimit = hard;
517 mp->m_quotainfo->qi_bsoftlimit = soft; 517 q->qi_bsoftlimit = soft;
518 } 518 }
519 } else { 519 } else {
520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
@@ -529,8 +529,8 @@ xfs_qm_scall_setqlim(
529 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 529 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
530 ddq->d_rtb_softlimit = cpu_to_be64(soft); 530 ddq->d_rtb_softlimit = cpu_to_be64(soft);
531 if (id == 0) { 531 if (id == 0) {
532 mp->m_quotainfo->qi_rtbhardlimit = hard; 532 q->qi_rtbhardlimit = hard;
533 mp->m_quotainfo->qi_rtbsoftlimit = soft; 533 q->qi_rtbsoftlimit = soft;
534 } 534 }
535 } else { 535 } else {
536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
@@ -546,8 +546,8 @@ xfs_qm_scall_setqlim(
546 ddq->d_ino_hardlimit = cpu_to_be64(hard); 546 ddq->d_ino_hardlimit = cpu_to_be64(hard);
547 ddq->d_ino_softlimit = cpu_to_be64(soft); 547 ddq->d_ino_softlimit = cpu_to_be64(soft);
548 if (id == 0) { 548 if (id == 0) {
549 mp->m_quotainfo->qi_ihardlimit = hard; 549 q->qi_ihardlimit = hard;
550 mp->m_quotainfo->qi_isoftlimit = soft; 550 q->qi_isoftlimit = soft;
551 } 551 }
552 } else { 552 } else {
553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
@@ -572,23 +572,23 @@ xfs_qm_scall_setqlim(
572 * for warnings. 572 * for warnings.
573 */ 573 */
574 if (newlim->d_fieldmask & FS_DQ_BTIMER) { 574 if (newlim->d_fieldmask & FS_DQ_BTIMER) {
575 mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; 575 q->qi_btimelimit = newlim->d_btimer;
576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer); 576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
577 } 577 }
578 if (newlim->d_fieldmask & FS_DQ_ITIMER) { 578 if (newlim->d_fieldmask & FS_DQ_ITIMER) {
579 mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; 579 q->qi_itimelimit = newlim->d_itimer;
580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer); 580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
581 } 581 }
582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { 582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
583 mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; 583 q->qi_rtbtimelimit = newlim->d_rtbtimer;
584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); 584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
585 } 585 }
586 if (newlim->d_fieldmask & FS_DQ_BWARNS) 586 if (newlim->d_fieldmask & FS_DQ_BWARNS)
587 mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; 587 q->qi_bwarnlimit = newlim->d_bwarns;
588 if (newlim->d_fieldmask & FS_DQ_IWARNS) 588 if (newlim->d_fieldmask & FS_DQ_IWARNS)
589 mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; 589 q->qi_iwarnlimit = newlim->d_iwarns;
590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
591 mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; 591 q->qi_rtbwarnlimit = newlim->d_rtbwarns;
592 } else { 592 } else {
593 /* 593 /*
594 * If the user is now over quota, start the timelimit. 594 * If the user is now over quota, start the timelimit.
@@ -605,8 +605,9 @@ xfs_qm_scall_setqlim(
605 error = xfs_trans_commit(tp, 0); 605 error = xfs_trans_commit(tp, 0);
606 xfs_qm_dqprint(dqp); 606 xfs_qm_dqprint(dqp);
607 xfs_qm_dqrele(dqp); 607 xfs_qm_dqrele(dqp);
608 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
609 608
609 out_unlock:
610 mutex_unlock(&q->qi_quotaofflock);
610 return error; 611 return error;
611} 612}
612 613
@@ -853,7 +854,8 @@ xfs_dqrele_inode(
853 int error; 854 int error;
854 855
855 /* skip quota inodes */ 856 /* skip quota inodes */
856 if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { 857 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
858 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
857 ASSERT(ip->i_udquot == NULL); 859 ASSERT(ip->i_udquot == NULL);
858 ASSERT(ip->i_gdquot == NULL); 860 ASSERT(ip->i_gdquot == NULL);
859 read_unlock(&pag->pag_ici_lock); 861 read_unlock(&pag->pag_ici_lock);
@@ -891,7 +893,8 @@ xfs_qm_dqrele_all_inodes(
891 uint flags) 893 uint flags)
892{ 894{
893 ASSERT(mp->m_quotainfo); 895 ASSERT(mp->m_quotainfo);
894 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); 896 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
897 XFS_ICI_NO_TAG, 0, NULL);
895} 898}
896 899
897/*------------------------------------------------------------------------*/ 900/*------------------------------------------------------------------------*/
@@ -930,7 +933,8 @@ struct mutex qcheck_lock;
930} 933}
931 934
932typedef struct dqtest { 935typedef struct dqtest {
933 xfs_dqmarker_t q_lists; 936 uint dq_flags; /* various flags (XFS_DQ_*) */
937 struct list_head q_hashlist;
934 xfs_dqhash_t *q_hash; /* the hashchain header */ 938 xfs_dqhash_t *q_hash; /* the hashchain header */
935 xfs_mount_t *q_mount; /* filesystem this relates to */ 939 xfs_mount_t *q_mount; /* filesystem this relates to */
936 xfs_dqid_t d_id; /* user id or group id */ 940 xfs_dqid_t d_id; /* user id or group id */
@@ -941,14 +945,9 @@ typedef struct dqtest {
941STATIC void 945STATIC void
942xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) 946xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
943{ 947{
944 xfs_dquot_t *d; 948 list_add(&dqp->q_hashlist, &h->qh_list);
945 if (((d) = (h)->qh_next)) 949 h->qh_version++;
946 (d)->HL_PREVP = &((dqp)->HL_NEXT); 950 h->qh_nelems++;
947 (dqp)->HL_NEXT = d;
948 (dqp)->HL_PREVP = &((h)->qh_next);
949 (h)->qh_next = (xfs_dquot_t *)dqp;
950 (h)->qh_version++;
951 (h)->qh_nelems++;
952} 951}
953STATIC void 952STATIC void
954xfs_qm_dqtest_print( 953xfs_qm_dqtest_print(
@@ -1060,9 +1059,7 @@ xfs_qm_internalqcheck_dqget(
1060 xfs_dqhash_t *h; 1059 xfs_dqhash_t *h;
1061 1060
1062 h = DQTEST_HASH(mp, id, type); 1061 h = DQTEST_HASH(mp, id, type);
1063 for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; 1062 list_for_each_entry(d, &h->qh_list, q_hashlist) {
1064 d = (xfs_dqtest_t *) d->HL_NEXT) {
1065 /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
1066 if (d->d_id == id && mp == d->q_mount) { 1063 if (d->d_id == id && mp == d->q_mount) {
1067 *O_dq = d; 1064 *O_dq = d;
1068 return (0); 1065 return (0);
@@ -1073,6 +1070,7 @@ xfs_qm_internalqcheck_dqget(
1073 d->d_id = id; 1070 d->d_id = id;
1074 d->q_mount = mp; 1071 d->q_mount = mp;
1075 d->q_hash = h; 1072 d->q_hash = h;
1073 INIT_LIST_HEAD(&d->q_hashlist);
1076 xfs_qm_hashinsert(h, d); 1074 xfs_qm_hashinsert(h, d);
1077 *O_dq = d; 1075 *O_dq = d;
1078 return (0); 1076 return (0);
@@ -1179,8 +1177,6 @@ xfs_qm_internalqcheck(
1179 xfs_ino_t lastino; 1177 xfs_ino_t lastino;
1180 int done, count; 1178 int done, count;
1181 int i; 1179 int i;
1182 xfs_dqtest_t *d, *e;
1183 xfs_dqhash_t *h1;
1184 int error; 1180 int error;
1185 1181
1186 lastino = 0; 1182 lastino = 0;
@@ -1220,19 +1216,18 @@ xfs_qm_internalqcheck(
1220 } 1216 }
1221 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1217 cmn_err(CE_DEBUG, "Checking results against system dquots");
1222 for (i = 0; i < qmtest_hashmask; i++) { 1218 for (i = 0; i < qmtest_hashmask; i++) {
1223 h1 = &qmtest_udqtab[i]; 1219 xfs_dqtest_t *d, *n;
1224 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1220 xfs_dqhash_t *h;
1221
1222 h = &qmtest_udqtab[i];
1223 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1225 xfs_dqtest_cmp(d); 1224 xfs_dqtest_cmp(d);
1226 e = (xfs_dqtest_t *) d->HL_NEXT;
1227 kmem_free(d); 1225 kmem_free(d);
1228 d = e;
1229 } 1226 }
1230 h1 = &qmtest_gdqtab[i]; 1227 h = &qmtest_gdqtab[i];
1231 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1228 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1232 xfs_dqtest_cmp(d); 1229 xfs_dqtest_cmp(d);
1233 e = (xfs_dqtest_t *) d->HL_NEXT;
1234 kmem_free(d); 1230 kmem_free(d);
1235 d = e;
1236 } 1231 }
1237 } 1232 }
1238 1233
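Editor's note: the qcheck teardown above is the visible payoff of switching the dqtest hash chains from the open-coded HL_PREVP/HL_NEXT pointers to the generic list_head API — insertion becomes a one-line list_add(), and the free loop becomes list_for_each_entry_safe(), which caches the next pointer so the current entry can be freed mid-walk. A minimal userspace sketch of the same pattern follows; the list primitives are stripped-down stand-ins for the kernel's <linux/list.h>, and the struct is an illustrative reduction of xfs_dqtest_t, not the kernel definition.

#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-ins for the kernel's <linux/list.h> primitives. */
struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Illustrative reduction of xfs_dqtest_t: linkage is an embedded list_head. */
struct dqtest {
	unsigned int d_id;
	struct list_head q_hashlist;
};

int main(void)
{
	struct list_head hash = { &hash, &hash };	/* empty bucket */
	struct list_head *pos, *n;
	unsigned int id;

	for (id = 0; id < 4; id++) {
		struct dqtest *d = calloc(1, sizeof(*d));
		d->d_id = id;
		INIT_LIST_HEAD(&d->q_hashlist);
		list_add(&d->q_hashlist, &hash);	/* xfs_qm_hashinsert() */
	}

	/* Expansion of list_for_each_entry_safe(): safe to free the cursor. */
	for (pos = hash.next; pos != &hash; pos = n) {
		struct dqtest *d = container_of(pos, struct dqtest, q_hashlist);
		n = pos->next;
		printf("freeing dqtest id %u\n", d->d_id);
		free(d);
	}
	return 0;
}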
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 8286b2842b6b..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -24,43 +24,6 @@
24 */ 24 */
25#define XFS_DQITER_MAP_SIZE 10 25#define XFS_DQITER_MAP_SIZE 10
26 26
27/* Number of dquots that fit in to a dquot block */
28#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
29
30#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
31
32#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
33#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
34#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
35#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
36#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
37#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
38#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
39#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
40#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit)
41#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
47
48#define xfs_qm_mplist_lock(mp) \
49 mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
50#define xfs_qm_mplist_nowait(mp) \
51 mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
52#define xfs_qm_mplist_unlock(mp) \
53 mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
55 mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
56
57#define xfs_qm_freelist_lock(qm) \
58 mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
59#define xfs_qm_freelist_lock_nowait(qm) \
60 mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
61#define xfs_qm_freelist_unlock(qm) \
62 mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
63
64/* 27/*
65 * Hash into a bucket in the dquot hash table, based on <mp, id>. 28 * Hash into a bucket in the dquot hash table, based on <mp, id>.
66 */ 29 */
@@ -72,9 +35,6 @@
72 XFS_DQ_HASHVAL(mp, id)) : \ 35 XFS_DQ_HASHVAL(mp, id)) : \
73 (xfs_Gqm->qm_grp_dqhtable + \ 36 (xfs_Gqm->qm_grp_dqhtable + \
74 XFS_DQ_HASHVAL(mp, id))) 37 XFS_DQ_HASHVAL(mp, id)))
75#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
76 XFS_IS_UQUOTA_ON(mp) : \
77 XFS_IS_OQUOTA_ON(mp))
78#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ 38#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
79 !dqp->q_core.d_blk_hardlimit && \ 39 !dqp->q_core.d_blk_hardlimit && \
80 !dqp->q_core.d_blk_softlimit && \ 40 !dqp->q_core.d_blk_softlimit && \
@@ -86,68 +46,6 @@
86 !dqp->q_core.d_rtbcount && \ 46 !dqp->q_core.d_rtbcount && \
87 !dqp->q_core.d_icount) 47 !dqp->q_core.d_icount)
88 48
89#define HL_PREVP dq_hashlist.ql_prevp
90#define HL_NEXT dq_hashlist.ql_next
91#define MPL_PREVP dq_mplist.ql_prevp
92#define MPL_NEXT dq_mplist.ql_next
93
94
95#define _LIST_REMOVE(h, dqp, PVP, NXT) \
96 { \
97 xfs_dquot_t *d; \
98 if (((d) = (dqp)->NXT)) \
99 (d)->PVP = (dqp)->PVP; \
100 *((dqp)->PVP) = d; \
101 (dqp)->NXT = NULL; \
102 (dqp)->PVP = NULL; \
103 (h)->qh_version++; \
104 (h)->qh_nelems--; \
105 }
106
107#define _LIST_INSERT(h, dqp, PVP, NXT) \
108 { \
109 xfs_dquot_t *d; \
110 if (((d) = (h)->qh_next)) \
111 (d)->PVP = &((dqp)->NXT); \
112 (dqp)->NXT = d; \
113 (dqp)->PVP = &((h)->qh_next); \
114 (h)->qh_next = dqp; \
115 (h)->qh_version++; \
116 (h)->qh_nelems++; \
117 }
118
119#define FOREACH_DQUOT_IN_MP(dqp, mp) \
120 for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
121
122#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
123for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
124 (dqp) = (dqp)->dq_flnext)
125
126#define XQM_HASHLIST_INSERT(h, dqp) \
127 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
128
129#define XQM_FREELIST_INSERT(h, dqp) \
130 xfs_qm_freelist_append(h, dqp)
131
132#define XQM_MPLIST_INSERT(h, dqp) \
133 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
134
135#define XQM_HASHLIST_REMOVE(h, dqp) \
136 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
137#define XQM_FREELIST_REMOVE(dqp) \
138 xfs_qm_freelist_unlink(dqp)
139#define XQM_MPLIST_REMOVE(h, dqp) \
140 { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
141 XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
142
143#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
144
145#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
146 (tp)->t_dqinfo->dqa_usrdquots : \
147 (tp)->t_dqinfo->dqa_grpdquots)
148#define XFS_IS_SUSER_DQUOT(dqp) \
149 (!((dqp)->q_core.d_id))
150
151#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ 49#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
152 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ 50 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
153 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) 51 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index c3ab75cb1d9a..061d827da33c 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -59,12 +59,11 @@ xfs_trans_dqjoin(
59 xfs_trans_t *tp, 59 xfs_trans_t *tp,
60 xfs_dquot_t *dqp) 60 xfs_dquot_t *dqp)
61{ 61{
62 xfs_dq_logitem_t *lp; 62 xfs_dq_logitem_t *lp = &dqp->q_logitem;
63 63
64 ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 64 ASSERT(dqp->q_transp != tp);
65 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 65 ASSERT(XFS_DQ_IS_LOCKED(dqp));
66 ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); 66 ASSERT(lp->qli_dquot == dqp);
67 lp = &dqp->q_logitem;
68 67
69 /* 68 /*
70 * Get a log_item_desc to point at the new item. 69 * Get a log_item_desc to point at the new item.
@@ -96,7 +95,7 @@ xfs_trans_log_dquot(
96{ 95{
97 xfs_log_item_desc_t *lidp; 96 xfs_log_item_desc_t *lidp;
98 97
99 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 98 ASSERT(dqp->q_transp == tp);
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 99 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 100
102 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); 101 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
@@ -198,16 +197,16 @@ xfs_trans_get_dqtrx(
198 int i; 197 int i;
199 xfs_dqtrx_t *qa; 198 xfs_dqtrx_t *qa;
200 199
201 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 200 qa = XFS_QM_ISUDQ(dqp) ?
202 qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); 201 tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
203 202
203 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
204 if (qa[i].qt_dquot == NULL || 204 if (qa[i].qt_dquot == NULL ||
205 qa[i].qt_dquot == dqp) { 205 qa[i].qt_dquot == dqp)
206 return (&qa[i]); 206 return &qa[i];
207 }
208 } 207 }
209 208
210 return (NULL); 209 return NULL;
211} 210}
212 211
213/* 212/*
@@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas(
381 break; 380 break;
382 381
383 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 382 ASSERT(XFS_DQ_IS_LOCKED(dqp));
384 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 383 ASSERT(dqp->q_transp == tp);
385 384
386 /* 385 /*
387 * adjust the actual number of blocks used 386 * adjust the actual number of blocks used
@@ -639,7 +638,7 @@ xfs_trans_dqresv(
639 softlimit = q->qi_bsoftlimit; 638 softlimit = q->qi_bsoftlimit;
640 timer = be32_to_cpu(dqp->q_core.d_btimer); 639 timer = be32_to_cpu(dqp->q_core.d_btimer);
641 warns = be16_to_cpu(dqp->q_core.d_bwarns); 640 warns = be16_to_cpu(dqp->q_core.d_bwarns);
642 warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); 641 warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
643 resbcountp = &dqp->q_res_bcount; 642 resbcountp = &dqp->q_res_bcount;
644 } else { 643 } else {
645 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); 644 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
@@ -651,7 +650,7 @@ xfs_trans_dqresv(
651 softlimit = q->qi_rtbsoftlimit; 650 softlimit = q->qi_rtbsoftlimit;
652 timer = be32_to_cpu(dqp->q_core.d_rtbtimer); 651 timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
653 warns = be16_to_cpu(dqp->q_core.d_rtbwarns); 652 warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 653 warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
655 resbcountp = &dqp->q_res_rtbcount; 654 resbcountp = &dqp->q_res_rtbcount;
656 } 655 }
657 656
@@ -691,7 +690,7 @@ xfs_trans_dqresv(
691 count = be64_to_cpu(dqp->q_core.d_icount); 690 count = be64_to_cpu(dqp->q_core.d_icount);
692 timer = be32_to_cpu(dqp->q_core.d_itimer); 691 timer = be32_to_cpu(dqp->q_core.d_itimer);
693 warns = be16_to_cpu(dqp->q_core.d_iwarns); 692 warns = be16_to_cpu(dqp->q_core.d_iwarns);
694 warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); 693 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
695 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); 694 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
696 if (!hardlimit) 695 if (!hardlimit)
697 hardlimit = q->qi_ihardlimit; 696 hardlimit = q->qi_ihardlimit;
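Editor's note: the xfs_trans_get_dqtrx() hunk above hoists the user-vs-group array choice out of the loop, leaving a plain linear scan for an existing entry or the first free slot. A standalone sketch of that lookup shape; TRANS_MAXDQS and the struct fields are illustrative stand-ins, not the kernel's definitions.

#include <stddef.h>

#define TRANS_MAXDQS	2		/* mirrors XFS_QM_TRANS_MAXDQS */

struct dquot;				/* opaque for this sketch */

struct dqtrx {
	struct dquot	*qt_dquot;	/* NULL means the slot is free */
	long		qt_blk_res;
};

struct dqinfo {
	struct dqtrx	dqa_usrdquots[TRANS_MAXDQS];
	struct dqtrx	dqa_grpdquots[TRANS_MAXDQS];
};

/*
 * Return the accounting slot for @dqp: an existing match or the first
 * free slot, or NULL if every slot is taken by other dquots.
 */
static struct dqtrx *
get_dqtrx(struct dqinfo *info, struct dquot *dqp, int is_user)
{
	struct dqtrx *qa = is_user ? info->dqa_usrdquots
				   : info->dqa_grpdquots;
	int i;

	for (i = 0; i < TRANS_MAXDQS; i++) {
		if (qa[i].qt_dquot == NULL || qa[i].qt_dquot == dqp)
			return &qa[i];
	}
	return NULL;
}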
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index d13eeba2c8f8..0135e2a669d7 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
49extern int posix_acl_access_exists(struct inode *inode); 49extern int posix_acl_access_exists(struct inode *inode);
50extern int posix_acl_default_exists(struct inode *inode); 50extern int posix_acl_default_exists(struct inode *inode);
51 51
52extern struct xattr_handler xfs_xattr_acl_access_handler; 52extern const struct xattr_handler xfs_xattr_acl_access_handler;
53extern struct xattr_handler xfs_xattr_acl_default_handler; 53extern const struct xattr_handler xfs_xattr_acl_default_handler;
54#else 54#else
55# define xfs_check_acl NULL 55# define xfs_check_acl NULL
56# define xfs_get_acl(inode, type) NULL 56# define xfs_get_acl(inode, type) NULL
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index b1a5a1ff88ea..401f364ad36c 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
175} xfs_agfl_t; 175} xfs_agfl_t;
176 176
177/* 177/*
178 * Busy block/extent entry. Used in perag to mark blocks that have been freed 178 * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
179 * but whose transactions aren't committed to disk yet. 179 * have been freed but whose transactions aren't committed to disk yet.
180 *
181 * Note that we use the transaction ID to record the transaction, not the
182 * transaction structure itself. See xfs_alloc_busy_insert() for details.
180 */ 183 */
181typedef struct xfs_perag_busy { 184struct xfs_busy_extent {
182 xfs_agblock_t busy_start; 185 struct rb_node rb_node; /* ag by-bno indexed search tree */
183 xfs_extlen_t busy_length; 186 struct list_head list; /* transaction busy extent list */
184 struct xfs_trans *busy_tp; /* transaction that did the free */ 187 xfs_agnumber_t agno;
185} xfs_perag_busy_t; 188 xfs_agblock_t bno;
189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */
191};
186 192
187/* 193/*
188 * Per-ag incore structure, copies of information in agf and agi, 194 * Per-ag incore structure, copies of information in agf and agi,
@@ -216,16 +222,17 @@ typedef struct xfs_perag {
216 xfs_agino_t pagl_leftrec; 222 xfs_agino_t pagl_leftrec;
217 xfs_agino_t pagl_rightrec; 223 xfs_agino_t pagl_rightrec;
218#ifdef __KERNEL__ 224#ifdef __KERNEL__
219 spinlock_t pagb_lock; /* lock for pagb_list */ 225 spinlock_t pagb_lock; /* lock for pagb_tree */
226 struct rb_root pagb_tree; /* ordered tree of busy extents */
220 227
221 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
222 229
223 int pag_ici_init; /* incore inode cache initialised */ 230 int pag_ici_init; /* incore inode cache initialised */
224 rwlock_t pag_ici_lock; /* incore inode lock */ 231 rwlock_t pag_ici_lock; /* incore inode lock */
225 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 232 struct radix_tree_root pag_ici_root; /* incore inode cache root */
233 int pag_ici_reclaimable; /* reclaimable inodes */
226#endif 234#endif
227 int pagb_count; /* pagb slots in use */ 235 int pagb_count; /* pagb slots in use */
228 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
229} xfs_perag_t; 236} xfs_perag_t;
230 237
231/* 238/*
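Editor's note: the new xfs_busy_extent sits on two containers at once — the rb_node threads it into the per-AG pagb_tree keyed by bno, while the list_head threads the same object onto the owning transaction's t_busy list so commit completion can find and clear it. A sketch of that dual membership; the list/rbtree types are trimmed stand-ins for the kernel's, and both lookups reduce to container_of().

#include <stddef.h>

struct rb_node   { struct rb_node *rb_left, *rb_right; };	/* trimmed */
struct list_head { struct list_head *next, *prev; };		/* trimmed */

struct busy_extent {
	struct rb_node	rb_node;	/* per-AG tree, keyed by bno */
	struct list_head list;		/* owning transaction's busy list */
	unsigned int	agno;
	unsigned int	bno;
	unsigned int	length;
	unsigned int	tid;		/* transaction that freed the extent */
};

/* rb_entry()/list_entry() are both container_of(): offset subtraction. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct busy_extent *from_tree(struct rb_node *n)
{
	return container_of(n, struct busy_extent, rb_node);
}

static struct busy_extent *from_trans(struct list_head *l)
{
	return container_of(l, struct busy_extent, list);
}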
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb2560..a7fbe8a99b12 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
46#define XFSA_FIXUP_BNO_OK 1 46#define XFSA_FIXUP_BNO_OK 1
47#define XFSA_FIXUP_CNT_OK 2 47#define XFSA_FIXUP_CNT_OK 2
48 48
49STATIC void 49static int
50xfs_alloc_search_busy(xfs_trans_t *tp, 50xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
51 xfs_agnumber_t agno, 51 xfs_agblock_t bno, xfs_extlen_t len);
52 xfs_agblock_t bno,
53 xfs_extlen_t len);
54 52
55/* 53/*
56 * Prototypes for per-ag allocation routines 54 * Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
540 be32_to_cpu(agf->agf_length)); 538 be32_to_cpu(agf->agf_length));
541 xfs_alloc_log_agf(args->tp, args->agbp, 539 xfs_alloc_log_agf(args->tp, args->agbp,
542 XFS_AGF_FREEBLKS); 540 XFS_AGF_FREEBLKS);
543 /* search the busylist for these blocks */ 541 /*
544 xfs_alloc_search_busy(args->tp, args->agno, 542 * Search the busylist for these blocks and mark the
545 args->agbno, args->len); 543 * transaction as synchronous if blocks are found. This
544 * avoids the need to block due to a synchronous log
545 * force to ensure correct ordering as the synchronous
546 * transaction will guarantee that for us.
547 */
548 if (xfs_alloc_busy_search(args->mp, args->agno,
549 args->agbno, args->len))
550 xfs_trans_set_sync(args->tp);
546 } 551 }
547 if (!args->isfl) 552 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp, 553 xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
1693 * when the iclog commits to disk. If a busy block is allocated, 1698 * when the iclog commits to disk. If a busy block is allocated,
1694 * the iclog is pushed up to the LSN that freed the block. 1699 * the iclog is pushed up to the LSN that freed the block.
1695 */ 1700 */
1696 xfs_alloc_mark_busy(tp, agno, bno, len); 1701 xfs_alloc_busy_insert(tp, agno, bno, len);
1697 return 0; 1702 return 0;
1698 1703
1699 error0: 1704 error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
1989 *bnop = bno; 1994 *bnop = bno;
1990 1995
1991 /* 1996 /*
1992 * As blocks are freed, they are added to the per-ag busy list 1997 * As blocks are freed, they are added to the per-ag busy list and
1993 * and remain there until the freeing transaction is committed to 1998 * remain there until the freeing transaction is committed to disk.
1994 * disk. Now that we have allocated blocks, this list must be 1999 * Now that we have allocated blocks, this list must be searched to see
1995 * searched to see if a block is being reused. If one is, then 2000 * if a block is being reused. If one is, then the freeing transaction
1996 * the freeing transaction must be pushed to disk NOW by forcing 2001 * must be pushed to disk before this transaction.
1997 * to disk all iclogs up that transaction's LSN. 2002 *
2003 * We do this by setting the current transaction to a sync transaction
2004 * which guarantees that the freeing transaction is on disk before this
2005 * transaction. This is done instead of a synchronous log force here so
2006 * that we don't sit and wait with the AGF locked in the transaction
2007 * during the log force.
1998 */ 2008 */
1999 xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); 2009 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
2010 xfs_trans_set_sync(tp);
2000 return 0; 2011 return 0;
2001} 2012}
2002 2013
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2212 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2202 spin_lock_init(&pag->pagb_lock); 2213 spin_lock_init(&pag->pagb_lock);
2203 pag->pagb_count = 0; 2214 pag->pagb_count = 0;
2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); 2215 pag->pagb_tree = RB_ROOT;
2205 pag->pagf_init = 1; 2216 pag->pagf_init = 1;
2206 } 2217 }
2207#ifdef DEBUG 2218#ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
2479 * list is reused, the transaction that freed it must be forced to disk 2490 * list is reused, the transaction that freed it must be forced to disk
2480 * before continuing to use the block. 2491 * before continuing to use the block.
2481 * 2492 *
2482 * xfs_alloc_mark_busy - add to the per-ag busy list 2493 * xfs_alloc_busy_insert - add to the per-ag busy list
2483 * xfs_alloc_clear_busy - remove an item from the per-ag busy list 2494 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2495 * xfs_alloc_busy_search - search for a busy extent
2496 */
2497
2498/*
2499 * Insert a new extent into the busy tree.
2500 *
2501 * The busy extent tree is indexed by the start block of the busy extent.
2502 * There can be multiple overlapping ranges in the busy extent tree but only
2503 * ever one entry at a given start block. The reason for this is that
2504 * multi-block extents can be freed, then smaller chunks of that extent
2505 * allocated and freed again before the first transaction commit is on disk.
2506 * If the exact same start block is freed a second time, we have to wait for
2507 * that busy extent to pass out of the tree before the new extent is inserted.
2508 * There are two main cases we have to handle here.
2509 *
2510 * The first case is a transaction that triggers a "free - allocate - free"
2511 * cycle. This can occur during btree manipulations as a btree block is freed
2512 * to the freelist, then allocated from the free list, then freed again. In
2513 * this case, the second extent free is what triggers the duplicate and as
2514 * such the transaction IDs should match. Because the extent was allocated in
2515 * this transaction, the transaction must be marked as synchronous. This is
2516 * true for all cases where the free/alloc/free occurs in the one transaction,
2517 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2518 * This serves to catch violations of the second case quite effectively.
2519 *
2520 * The second case is where the free/alloc/free occur in different
2521 * transactions. In this case, the thread freeing the extent the second time
2522 * can't mark the extent busy immediately because it is already tracked in a
2523 * transaction that may be committing. When the log commit for the existing
2524 * busy extent completes, the busy extent will be removed from the tree. If we
2525 * allow the second busy insert to continue using that busy extent structure,
2526 * it can be freed before this transaction is safely in the log. Hence our
2527 * only option in this case is to force the log to remove the existing busy
2528 * extent from the list before we insert the new one with the current
2529 * transaction ID.
2530 *
2531 * The problem we are trying to avoid in the free-alloc-free in separate
2532 * transactions is most easily described with a timeline:
2533 *
2534 * Thread 1 Thread 2 Thread 3 xfslogd
2535 * xact alloc
2536 * free X
2537 * mark busy
2538 * commit xact
2539 * free xact
2540 * xact alloc
2541 * alloc X
2542 * busy search
2543 * mark xact sync
2544 * commit xact
2545 * free xact
2546 * force log
2547 * checkpoint starts
2548 * ....
2549 * xact alloc
2550 * free X
2551 * mark busy
2552 * finds match
2553 * *** KABOOM! ***
2554 * ....
2555 * log IO completes
2556 * unbusy X
2557 * checkpoint completes
2558 *
2559 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2560 * the checkpoint completes, and the busy extent it matched will have been
2561 * removed from the tree when it is woken. Hence it can then continue safely.
2562 *
2563 * However, to ensure this matching process is robust, we need to use the
2564 * transaction ID to identify the transaction, as delayed logging results in
2565 * the busy extent and transaction lifecycles being different; i.e. the busy
2566 * extent is active for a lot longer than the transaction. Hence the
2567 * transaction structure can be freed and reallocated, and then used to mark
2568 * the same extent busy again in the new transaction. In this case the new transaction
2569 * will have a different tid but can have the same address, and hence we need
2570 * to check against the tid.
2571 *
2572 * Future: for delayed logging, we could avoid the log force if the extent was
2573 * first freed in the current checkpoint sequence. This, however, requires the
2574 * ability to pin the current checkpoint in memory until this transaction
2575 * commits to ensure that both the original free and the current one combine
2576 * logically into the one checkpoint. If the checkpoint sequences are
2577 * different, however, we still need to wait on a log force.
2484 */ 2578 */
2485void 2579void
2486xfs_alloc_mark_busy(xfs_trans_t *tp, 2580xfs_alloc_busy_insert(
2487 xfs_agnumber_t agno, 2581 struct xfs_trans *tp,
2488 xfs_agblock_t bno, 2582 xfs_agnumber_t agno,
2489 xfs_extlen_t len) 2583 xfs_agblock_t bno,
2584 xfs_extlen_t len)
2490{ 2585{
2491 xfs_perag_busy_t *bsy; 2586 struct xfs_busy_extent *new;
2587 struct xfs_busy_extent *busyp;
2492 struct xfs_perag *pag; 2588 struct xfs_perag *pag;
2493 int n; 2589 struct rb_node **rbp;
2590 struct rb_node *parent;
2591 int match;
2494 2592
2495 pag = xfs_perag_get(tp->t_mountp, agno);
2496 spin_lock(&pag->pagb_lock);
2497 2593
2498 /* search pagb_list for an open slot */ 2594 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2499 for (bsy = pag->pagb_list, n = 0; 2595 if (!new) {
2500 n < XFS_PAGB_NUM_SLOTS; 2596 /*
2501 bsy++, n++) { 2597 * No Memory! Since it is now not possible to track the free
2502 if (bsy->busy_tp == NULL) { 2598 * block, make this a synchronous transaction to ensure that
2503 break; 2599 * the block is not reused before this transaction commits.
2504 } 2600 */
2601 trace_xfs_alloc_busy(tp, agno, bno, len, 1);
2602 xfs_trans_set_sync(tp);
2603 return;
2505 } 2604 }
2506 2605
2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); 2606 new->agno = agno;
2607 new->bno = bno;
2608 new->length = len;
2609 new->tid = xfs_log_get_trans_ident(tp);
2508 2610
2509 if (n < XFS_PAGB_NUM_SLOTS) { 2611 INIT_LIST_HEAD(&new->list);
2510 bsy = &pag->pagb_list[n]; 2612
2511 pag->pagb_count++; 2613 /* trace before insert to be able to see failed inserts */
2512 bsy->busy_start = bno; 2614 trace_xfs_alloc_busy(tp, agno, bno, len, 0);
2513 bsy->busy_length = len; 2615
2514 bsy->busy_tp = tp; 2616 pag = xfs_perag_get(tp->t_mountp, new->agno);
2515 xfs_trans_add_busy(tp, agno, n); 2617restart:
2516 } else { 2618 spin_lock(&pag->pagb_lock);
2619 rbp = &pag->pagb_tree.rb_node;
2620 parent = NULL;
2621 busyp = NULL;
2622 match = 0;
2623 while (*rbp && match >= 0) {
2624 parent = *rbp;
2625 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2626
2627 if (new->bno < busyp->bno) {
2628 /* may overlap, but exact start block is lower */
2629 rbp = &(*rbp)->rb_left;
2630 if (new->bno + new->length > busyp->bno)
2631 match = busyp->tid == new->tid ? 1 : -1;
2632 } else if (new->bno > busyp->bno) {
2633 /* may overlap, but exact start block is higher */
2634 rbp = &(*rbp)->rb_right;
2635 if (bno < busyp->bno + busyp->length)
2636 match = busyp->tid == new->tid ? 1 : -1;
2637 } else {
2638 match = busyp->tid == new->tid ? 1 : -1;
2639 break;
2640 }
2641 }
2642 if (match < 0) {
2643 /* overlap marked busy in different transaction */
2644 spin_unlock(&pag->pagb_lock);
2645 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2646 goto restart;
2647 }
2648 if (match > 0) {
2517 /* 2649 /*
2518 * The busy list is full! Since it is now not possible to 2650 * overlap marked busy in same transaction. Update if exact
2519 * track the free block, make this a synchronous transaction 2651 * start block match, otherwise combine the busy extents into
2520 * to ensure that the block is not reused before this 2652 * a single range.
2521 * transaction commits.
2522 */ 2653 */
2523 xfs_trans_set_sync(tp); 2654 if (busyp->bno == new->bno) {
2524 } 2655 busyp->length = max(busyp->length, new->length);
2656 spin_unlock(&pag->pagb_lock);
2657 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2658 xfs_perag_put(pag);
2659 kmem_free(new);
2660 return;
2661 }
2662 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2663 new->length = max(busyp->bno + busyp->length,
2664 new->bno + new->length) -
2665 min(busyp->bno, new->bno);
2666 new->bno = min(busyp->bno, new->bno);
2667 } else
2668 busyp = NULL;
2525 2669
2670 rb_link_node(&new->rb_node, parent, rbp);
2671 rb_insert_color(&new->rb_node, &pag->pagb_tree);
2672
2673 list_add(&new->list, &tp->t_busy);
2526 spin_unlock(&pag->pagb_lock); 2674 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag); 2675 xfs_perag_put(pag);
2676 kmem_free(busyp);
2528} 2677}
2529 2678
2530void 2679/*
2531xfs_alloc_clear_busy(xfs_trans_t *tp, 2680 * Search for a busy extent within the range of the extent we are about to
2532 xfs_agnumber_t agno, 2681 * allocate. You need to be holding the busy extent tree lock when calling
2533 int idx) 2682 * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
2683 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
2684 * match. This is done so that a non-zero return indicates an overlap that
2685 * will require a synchronous transaction, but it can still be
2686 * used to distinguish between a partial and an exact match.
2687 */
2688static int
2689xfs_alloc_busy_search(
2690 struct xfs_mount *mp,
2691 xfs_agnumber_t agno,
2692 xfs_agblock_t bno,
2693 xfs_extlen_t len)
2534{ 2694{
2535 struct xfs_perag *pag; 2695 struct xfs_perag *pag;
2536 xfs_perag_busy_t *list; 2696 struct rb_node *rbp;
2697 struct xfs_busy_extent *busyp;
2698 int match = 0;
2537 2699
2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2700 pag = xfs_perag_get(mp, agno);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock); 2701 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2542 2702
2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); 2703 rbp = pag->pagb_tree.rb_node;
2544 2704
2545 if (list[idx].busy_tp == tp) { 2705 /* find closest start bno overlap */
2546 list[idx].busy_tp = NULL; 2706 while (rbp) {
2547 pag->pagb_count--; 2707 busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
2708 if (bno < busyp->bno) {
2709 /* may overlap, but exact start block is lower */
2710 if (bno + len > busyp->bno)
2711 match = -1;
2712 rbp = rbp->rb_left;
2713 } else if (bno > busyp->bno) {
2714 /* may overlap, but exact start block is higher */
2715 if (bno < busyp->bno + busyp->length)
2716 match = -1;
2717 rbp = rbp->rb_right;
2718 } else {
2719 /* bno matches busyp, length determines exact match */
2720 match = (busyp->length == len) ? 1 : -1;
2721 break;
2722 }
2548 } 2723 }
2549
2550 spin_unlock(&pag->pagb_lock); 2724 spin_unlock(&pag->pagb_lock);
2725 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2551 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
2727 return match;
2552} 2728}
2553 2729
2554 2730void
2555/* 2731xfs_alloc_busy_clear(
2556 * If we find the extent in the busy list, force the log out to get the 2732 struct xfs_mount *mp,
2557 * extent out of the busy list so the caller can use it straight away. 2733 struct xfs_busy_extent *busyp)
2558 */
2559STATIC void
2560xfs_alloc_search_busy(xfs_trans_t *tp,
2561 xfs_agnumber_t agno,
2562 xfs_agblock_t bno,
2563 xfs_extlen_t len)
2564{ 2734{
2565 struct xfs_perag *pag; 2735 struct xfs_perag *pag;
2566 xfs_perag_busy_t *bsy;
2567 xfs_agblock_t uend, bend;
2568 xfs_lsn_t lsn = 0;
2569 int cnt;
2570 2736
2571 pag = xfs_perag_get(tp->t_mountp, agno); 2737 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
2572 spin_lock(&pag->pagb_lock); 2738 busyp->length);
2573 cnt = pag->pagb_count;
2574 2739
2575 /* 2740 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
2576 * search pagb_list for this slot, skipping open slots. We have to 2741 busyp->length) == 1);
2577 * search the entire array as there may be multiple overlaps and
2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range.
2580 */
2581 uend = bno + len - 1;
2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2583 bsy = &pag->pagb_list[cnt];
2584 if (!bsy->busy_tp)
2585 continue;
2586 2742
2587 bend = bsy->busy_start + bsy->busy_length - 1; 2743 list_del_init(&busyp->list);
2588 if (bno > bend || uend < bsy->busy_start)
2589 continue;
2590 2744
2591 /* (start1,length1) within (start2, length2) */ 2745 pag = xfs_perag_get(mp, busyp->agno);
2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) 2746 spin_lock(&pag->pagb_lock);
2593 lsn = bsy->busy_tp->t_commit_lsn; 2747 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2594 }
2595 spin_unlock(&pag->pagb_lock); 2748 spin_unlock(&pag->pagb_lock);
2596 xfs_perag_put(pag); 2749 xfs_perag_put(pag);
2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2598 2750
2599 /* 2751 kmem_free(busyp);
2600 * If a block was found, force the log through the LSN of the
2601 * transaction that freed the block
2602 */
2603 if (lsn)
2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2605} 2752}
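Editor's note: the descent in xfs_alloc_busy_search() above is an ordinary by-key rbtree walk with one twist — overlaps that miss the exact start block are noted as -1 and the walk continues toward the key. A compiling userspace model of the same decision structure follows; the node layout is trimmed to what the walk needs, and the kernel version uses <linux/rbtree.h> with pagb_lock held around the walk.

#include <stddef.h>

struct rb_node { struct rb_node *rb_left, *rb_right; };	/* trimmed */

#define rb_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct busy_extent {
	struct rb_node	rb_node;
	unsigned int	bno;		/* start block: the tree key */
	unsigned int	length;
};

/*
 * Mirrors the return contract of xfs_alloc_busy_search(): 0 for no
 * overlap, -1 for an overlapping but inexact busy extent, 1 for an
 * exact (bno, length) match.
 */
static int busy_search(struct rb_node *root, unsigned int bno,
		       unsigned int len)
{
	struct rb_node *rbp = root;
	int match = 0;

	while (rbp) {
		struct busy_extent *busyp =
			rb_entry(rbp, struct busy_extent, rb_node);

		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* bno matches, length decides exact vs partial */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	return match;
}

Callers in the hunks above only care that the result is nonzero: any overlap, exact or partial, maps to xfs_trans_set_sync() on the allocating transaction.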
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..6d05199b667c 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
22struct xfs_mount; 22struct xfs_mount;
23struct xfs_perag; 23struct xfs_perag;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_busy_extent;
25 26
26/* 27/*
27 * Freespace allocation types. Argument to xfs_alloc_[v]extent. 28 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
119#ifdef __KERNEL__ 120#ifdef __KERNEL__
120 121
121void 122void
122xfs_alloc_mark_busy(xfs_trans_t *tp, 123xfs_alloc_busy_insert(xfs_trans_t *tp,
123 xfs_agnumber_t agno, 124 xfs_agnumber_t agno,
124 xfs_agblock_t bno, 125 xfs_agblock_t bno,
125 xfs_extlen_t len); 126 xfs_extlen_t len);
126 127
127void 128void
128xfs_alloc_clear_busy(xfs_trans_t *tp, 129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
129 xfs_agnumber_t ag,
130 int idx);
131 130
132#endif /* __KERNEL__ */ 131#endif /* __KERNEL__ */
133 132
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1c..83f494218759 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
134 * disk. If a busy block is allocated, the iclog is pushed up to the 134 * disk. If a busy block is allocated, the iclog is pushed up to the
135 * LSN that freed the block. 135 * LSN that freed the block.
136 */ 136 */
137 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 137 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
138 xfs_trans_agbtree_delta(cur->bc_tp, -1); 138 xfs_trans_agbtree_delta(cur->bc_tp, -1);
139 return 0; 139 return 0;
140} 140}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5c11e4d17010..99587ded043f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3829,7 +3829,7 @@ xfs_bmap_add_attrfork(
3829 } 3829 }
3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed))) 3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
3831 goto error2; 3831 goto error2;
3832 error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES); 3832 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3833 ASSERT(ip->i_df.if_ext_max == 3833 ASSERT(ip->i_df.if_ext_max ==
3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); 3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3835 return error; 3835 return error;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f3c49e69eab9..02a80984aa05 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
64 nbytes = last - first + 1; 64 nbytes = last - first + 1;
65 bfset(bip->bli_logged, first, nbytes); 65 bfset(bip->bli_logged, first, nbytes);
66 for (x = 0; x < nbytes; x++) { 66 for (x = 0; x < nbytes; x++) {
67 chunk_num = byte >> XFS_BLI_SHIFT; 67 chunk_num = byte >> XFS_BLF_SHIFT;
68 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 68 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
69 bit_num = chunk_num & (NBWORD - 1); 69 bit_num = chunk_num & (NBWORD - 1);
70 wordp = &(bip->bli_format.blf_data_map[word_num]); 70 wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
166 * cancel flag in it. 166 * cancel flag in it.
167 */ 167 */
168 trace_xfs_buf_item_size_stale(bip); 168 trace_xfs_buf_item_size_stale(bip);
169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 169 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
170 return 1; 170 return 1;
171 } 171 }
172 172
@@ -197,9 +197,9 @@ xfs_buf_item_size(
197 } else if (next_bit != last_bit + 1) { 197 } else if (next_bit != last_bit + 1) {
198 last_bit = next_bit; 198 last_bit = next_bit;
199 nvecs++; 199 nvecs++;
200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
201 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 201 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
202 XFS_BLI_CHUNK)) { 202 XFS_BLF_CHUNK)) {
203 last_bit = next_bit; 203 last_bit = next_bit;
204 nvecs++; 204 nvecs++;
205 } else { 205 } else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
254 vecp++; 254 vecp++;
255 nvecs = 1; 255 nvecs = 1;
256 256
257 /*
258 * If it is an inode buffer, transfer the in-memory state to the
259 * format flags and clear the in-memory state. We do not transfer
260 * this state if the inode buffer allocation has not yet been committed
261 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
262 * correct replay of the inode allocation.
263 */
264 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
265 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
266 xfs_log_item_in_current_chkpt(&bip->bli_item)))
267 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
268 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
269 }
270
257 if (bip->bli_flags & XFS_BLI_STALE) { 271 if (bip->bli_flags & XFS_BLI_STALE) {
258 /* 272 /*
259 * The buffer is stale, so all we need to log 273 * The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
261 * cancel flag in it. 275 * cancel flag in it.
262 */ 276 */
263 trace_xfs_buf_item_format_stale(bip); 277 trace_xfs_buf_item_format_stale(bip);
264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 278 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
265 bip->bli_format.blf_size = nvecs; 279 bip->bli_format.blf_size = nvecs;
266 return; 280 return;
267 } 281 }
@@ -294,28 +308,28 @@ xfs_buf_item_format(
294 * keep counting and scanning. 308 * keep counting and scanning.
295 */ 309 */
296 if (next_bit == -1) { 310 if (next_bit == -1) {
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 311 buffer_offset = first_bit * XFS_BLF_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 312 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 313 vecp->i_len = nbits * XFS_BLF_CHUNK;
300 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 314 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 315 nvecs++;
302 break; 316 break;
303 } else if (next_bit != last_bit + 1) { 317 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 318 buffer_offset = first_bit * XFS_BLF_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 319 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 320 vecp->i_len = nbits * XFS_BLF_CHUNK;
307 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 321 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 322 nvecs++;
309 vecp++; 323 vecp++;
310 first_bit = next_bit; 324 first_bit = next_bit;
311 last_bit = next_bit; 325 last_bit = next_bit;
312 nbits = 1; 326 nbits = 1;
313 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 327 } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
314 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 328 (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
315 XFS_BLI_CHUNK)) { 329 XFS_BLF_CHUNK)) {
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 330 buffer_offset = first_bit * XFS_BLF_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 331 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 332 vecp->i_len = nbits * XFS_BLF_CHUNK;
319 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 333 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
320/* You would think we need to bump the nvecs here too, but we do not: 334/* You would think we need to bump the nvecs here too, but we do not:
321 * this number is used by recovery, and it gets confused by the boundary 335 * this number is used by recovery, and it gets confused by the boundary
@@ -341,10 +355,15 @@ xfs_buf_item_format(
341} 355}
342 356
343/* 357/*
344 * This is called to pin the buffer associated with the buf log 358 * This is called to pin the buffer associated with the buf log item in memory
345 * item in memory so it cannot be written out. Simply call bpin() 359 * so it cannot be written out. Simply call bpin() on the buffer to do this.
346 * on the buffer to do this. 360 *
361 * We also always take a reference to the buffer log item here so that the bli
362 * is held while the item is pinned in memory. This means that we can
363 * unconditionally drop the reference count a transaction holds when the
364 * transaction is completed.
347 */ 365 */
366
348STATIC void 367STATIC void
349xfs_buf_item_pin( 368xfs_buf_item_pin(
350 xfs_buf_log_item_t *bip) 369 xfs_buf_log_item_t *bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
356 ASSERT(atomic_read(&bip->bli_refcount) > 0); 375 ASSERT(atomic_read(&bip->bli_refcount) > 0);
357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
358 (bip->bli_flags & XFS_BLI_STALE)); 377 (bip->bli_flags & XFS_BLI_STALE));
378 atomic_inc(&bip->bli_refcount);
359 trace_xfs_buf_item_pin(bip); 379 trace_xfs_buf_item_pin(bip);
360 xfs_bpin(bp); 380 xfs_bpin(bp);
361} 381}
@@ -372,12 +392,12 @@ xfs_buf_item_pin(
372 */ 392 */
373STATIC void 393STATIC void
374xfs_buf_item_unpin( 394xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 395 xfs_buf_log_item_t *bip)
376 int stale)
377{ 396{
378 struct xfs_ail *ailp; 397 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 398 xfs_buf_t *bp;
380 int freed; 399 int freed;
400 int stale = bip->bli_flags & XFS_BLI_STALE;
381 401
382 bp = bip->bli_buf; 402 bp = bip->bli_buf;
383 ASSERT(bp != NULL); 403 ASSERT(bp != NULL);
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
393 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 413 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 414 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
395 ASSERT(XFS_BUF_ISSTALE(bp)); 415 ASSERT(XFS_BUF_ISSTALE(bp));
396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 416 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
397 trace_xfs_buf_item_unpin_stale(bip); 417 trace_xfs_buf_item_unpin_stale(bip);
398 418
399 /* 419 /*
@@ -428,40 +448,34 @@ xfs_buf_item_unpin_remove(
428 xfs_buf_log_item_t *bip, 448 xfs_buf_log_item_t *bip,
429 xfs_trans_t *tp) 449 xfs_trans_t *tp)
430{ 450{
431 xfs_buf_t *bp; 451 /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
432 xfs_log_item_desc_t *lidp;
433 int stale = 0;
434
435 bp = bip->bli_buf;
436 /*
437 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
438 */
439 if ((atomic_read(&bip->bli_refcount) == 1) && 452 if ((atomic_read(&bip->bli_refcount) == 1) &&
440 (bip->bli_flags & XFS_BLI_STALE)) { 453 (bip->bli_flags & XFS_BLI_STALE)) {
454 /*
455 * yes -- We can safely do some work here and then call
456 * buf_item_unpin to do the rest because we are
457 * are holding the buffer locked so no one else will be
458 * able to bump up the refcount. We have to remove the
459 * log item from the transaction as we are about to release
460 * our reference to the buffer. If we don't, the unlock that
461 * occurs later in the xfs_trans_uncommit() will try to
462 * reference the buffer which we no longer have a hold on.
463 */
464 struct xfs_log_item_desc *lidp;
465
441 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); 466 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
442 trace_xfs_buf_item_unpin_stale(bip); 467 trace_xfs_buf_item_unpin_stale(bip);
443 468
444 /* 469 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
445 * yes -- clear the xaction descriptor in-use flag
446 * and free the chunk if required. We can safely
447 * do some work here and then call buf_item_unpin
448 * to do the rest because if the if is true, then
449 * we are holding the buffer locked so no one else
450 * will be able to bump up the refcount.
451 */
452 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
453 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
454 xfs_trans_free_item(tp, lidp); 470 xfs_trans_free_item(tp, lidp);
471
455 /* 472 /*
456 * Since the transaction no longer refers to the buffer, 473 * Since the transaction no longer refers to the buffer, the
457 * the buffer should no longer refer to the transaction. 474 * buffer should no longer refer to the transaction.
458 */ 475 */
459 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 476 XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
460 } 477 }
461 478 xfs_buf_item_unpin(bip);
462 xfs_buf_item_unpin(bip, stale);
463
464 return;
465} 479}
466 480
467/* 481/*
@@ -495,20 +509,23 @@ xfs_buf_item_trylock(
495} 509}
496 510
497/* 511/*
498 * Release the buffer associated with the buf log item. 512 * Release the buffer associated with the buf log item. If there is no dirty
499 * If there is no dirty logged data associated with the 513 * logged data associated with the buffer recorded in the buf log item, then
500 * buffer recorded in the buf log item, then free the 514 * free the buf log item and remove the reference to it in the buffer.
501 * buf log item and remove the reference to it in the 515 *
502 * buffer. 516 * This call ignores the recursion count. It is only called when the buffer
517 * should REALLY be unlocked, regardless of the recursion count.
503 * 518 *
504 * This call ignores the recursion count. It is only called 519 * We unconditionally drop the transaction's reference to the log item. If the
505 * when the buffer should REALLY be unlocked, regardless 520 * item was logged, then another reference was taken when it was pinned, so we
506 * of the recursion count. 521 * can safely drop the transaction reference now. This also allows us to avoid
522 * potential races with the unpin code freeing the bli by not referencing the
523 * bli after we've dropped the reference count.
507 * 524 *
508 * If the XFS_BLI_HOLD flag is set in the buf log item, then 525 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
509 * free the log item if necessary but do not unlock the buffer. 526 * if necessary but do not unlock the buffer. This is for support of
510 * This is for support of xfs_trans_bhold(). Make sure the 527 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
511 * XFS_BLI_HOLD field is cleared if we don't free the item. 528 * free the item.
512 */ 529 */
513STATIC void 530STATIC void
514xfs_buf_item_unlock( 531xfs_buf_item_unlock(
@@ -520,73 +537,54 @@ xfs_buf_item_unlock(
520 537
521 bp = bip->bli_buf; 538 bp = bip->bli_buf;
522 539
523 /* 540 /* Clear the buffer's association with this transaction. */
524 * Clear the buffer's association with this transaction.
525 */
526 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 541 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
527 542
528 /* 543 /*
529 * If this is a transaction abort, don't return early. 544 * If this is a transaction abort, don't return early. Instead, allow
530 * Instead, allow the brelse to happen. 545 * the brelse to happen. Normally it would be done for stale
531 * Normally it would be done for stale (cancelled) buffers 546 * (cancelled) buffers at unpin time, but we'll never go through the
532 * at unpin time, but we'll never go through the pin/unpin 547 * pin/unpin cycle if we abort inside commit.
533 * cycle if we abort inside commit.
534 */ 548 */
535 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 549 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
536 550
537 /* 551 /*
538 * If the buf item is marked stale, then don't do anything. 552 * Before possibly freeing the buf item, determine if we should
539 * We'll unlock the buffer and free the buf item when the 553 * release the buffer at the end of this routine.
540 * buffer is unpinned for the last time.
541 */ 554 */
542 if (bip->bli_flags & XFS_BLI_STALE) { 555 hold = bip->bli_flags & XFS_BLI_HOLD;
543 bip->bli_flags &= ~XFS_BLI_LOGGED; 556
544 trace_xfs_buf_item_unlock_stale(bip); 557 /* Clear the per transaction state. */
545 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 558 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
546 if (!aborted)
547 return;
548 }
549 559
550 /* 560 /*
551 * Drop the transaction's reference to the log item if 561 * If the buf item is marked stale, then don't do anything. We'll
552 * it was not logged as part of the transaction. Otherwise 562 * unlock the buffer and free the buf item when the buffer is unpinned
553 * we'll drop the reference in xfs_buf_item_unpin() when 563 * for the last time.
554 * the transaction is really through with the buffer.
555 */ 564 */
556 if (!(bip->bli_flags & XFS_BLI_LOGGED)) { 565 if (bip->bli_flags & XFS_BLI_STALE) {
557 atomic_dec(&bip->bli_refcount); 566 trace_xfs_buf_item_unlock_stale(bip);
558 } else { 567 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
559 /* 568 if (!aborted) {
560 * Clear the logged flag since this is per 569 atomic_dec(&bip->bli_refcount);
561 * transaction state. 570 return;
562 */ 571 }
563 bip->bli_flags &= ~XFS_BLI_LOGGED;
564 } 572 }
565 573
566 /*
567 * Before possibly freeing the buf item, determine if we should
568 * release the buffer at the end of this routine.
569 */
570 hold = bip->bli_flags & XFS_BLI_HOLD;
571 trace_xfs_buf_item_unlock(bip); 574 trace_xfs_buf_item_unlock(bip);
572 575
573 /* 576 /*
574 * If the buf item isn't tracking any data, free it. 577 * If the buf item isn't tracking any data, free it, otherwise drop the
575 * Otherwise, if XFS_BLI_HOLD is set clear it. 578 * reference we hold to it.
576 */ 579 */
577 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 580 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
578 bip->bli_format.blf_map_size)) { 581 bip->bli_format.blf_map_size))
579 xfs_buf_item_relse(bp); 582 xfs_buf_item_relse(bp);
580 } else if (hold) { 583 else
581 bip->bli_flags &= ~XFS_BLI_HOLD; 584 atomic_dec(&bip->bli_refcount);
582 }
583 585
584 /* 586 if (!hold)
585 * Release the buffer if XFS_BLI_HOLD was not set.
586 */
587 if (!hold) {
588 xfs_buf_relse(bp); 587 xfs_buf_relse(bp);
589 }
590} 588}
591 589
592/* 590/*
@@ -675,7 +673,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
675 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 673 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
676 xfs_buf_item_format, 674 xfs_buf_item_format,
677 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, 675 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
678 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, 676 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
679 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 677 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
680 xfs_buf_item_unpin_remove, 678 xfs_buf_item_unpin_remove,
681 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, 679 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
@@ -723,20 +721,17 @@ xfs_buf_item_init(
723 } 721 }
724 722
725 /* 723 /*
726 * chunks is the number of XFS_BLI_CHUNK size pieces 724 * chunks is the number of XFS_BLF_CHUNK size pieces
727 * the buffer can be divided into. Make sure not to 725 * the buffer can be divided into. Make sure not to
728 * truncate any pieces. map_size is the size of the 726 * truncate any pieces. map_size is the size of the
729 * bitmap needed to describe the chunks of the buffer. 727 * bitmap needed to describe the chunks of the buffer.
730 */ 728 */
731 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 729 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
732 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 730 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
733 731
734 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 732 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
735 KM_SLEEP); 733 KM_SLEEP);
736 bip->bli_item.li_type = XFS_LI_BUF; 734 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
737 bip->bli_item.li_ops = &xfs_buf_item_ops;
738 bip->bli_item.li_mountp = mp;
739 bip->bli_item.li_ailp = mp->m_ail;
740 bip->bli_buf = bp; 735 bip->bli_buf = bp;
741 xfs_buf_hold(bp); 736 xfs_buf_hold(bp);
742 bip->bli_format.blf_type = XFS_LI_BUF; 737 bip->bli_format.blf_type = XFS_LI_BUF;
@@ -799,8 +794,8 @@ xfs_buf_item_log(
799 /* 794 /*
800 * Convert byte offsets to bit numbers. 795 * Convert byte offsets to bit numbers.
801 */ 796 */
802 first_bit = first >> XFS_BLI_SHIFT; 797 first_bit = first >> XFS_BLF_SHIFT;
803 last_bit = last >> XFS_BLI_SHIFT; 798 last_bit = last >> XFS_BLF_SHIFT;
804 799
805 /* 800 /*
806 * Calculate the total number of bits to be set. 801 * Calculate the total number of bits to be set.
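Editor's note: the bli lifecycle change running through the hunks above is plain reference counting — ->iop_pin now takes an extra reference on the log item, so transaction completion can drop its reference unconditionally without racing the unpin path freeing the bli. An illustrative reduction of that rule, with hypothetical names standing in for the real atomic_t handling:

#include <assert.h>
#include <stdlib.h>

/* Illustrative reduction of xfs_buf_log_item_t reference counting. */
struct bli {
	int	refcount;	/* stands in for atomic_t bli_refcount */
};

/* Drop one reference; free the item when the last one goes away. */
static void bli_put(struct bli *bip)
{
	assert(bip->refcount > 0);
	if (--bip->refcount == 0)
		free(bip);	/* xfs_buf_item_relse() territory */
}

/* ->iop_pin: always take a bli reference while the item is pinned. */
static void bli_pin(struct bli *bip)
{
	bip->refcount++;
}

/* ->iop_unpin: drop the reference the pin took. */
static void bli_unpin(struct bli *bip)
{
	bli_put(bip);
}

/*
 * Transaction completion (xfs_buf_item_unlock) can now drop its
 * reference unconditionally: if the item was logged and pinned, the
 * pin's reference keeps the bli alive until bli_unpin() runs, so
 * unlock never touches the bli after its own reference is gone.
 */
static void bli_unlock(struct bli *bip)
{
	bli_put(bip);
}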
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 217f34af00cb..f20bb472d582 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone;
26 * have been logged. 26 * have been logged.
27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. 27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
28 */ 28 */
29typedef struct xfs_buf_log_format_t { 29typedef struct xfs_buf_log_format {
30 unsigned short blf_type; /* buf log item type indicator */ 30 unsigned short blf_type; /* buf log item type indicator */
31 unsigned short blf_size; /* size of this item */ 31 unsigned short blf_size; /* size of this item */
32 ushort blf_flags; /* misc state */ 32 ushort blf_flags; /* misc state */
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format_t {
41 * This flag indicates that the buffer contains on disk inodes 41 * This flag indicates that the buffer contains on disk inodes
42 * and requires special recovery handling. 42 * and requires special recovery handling.
43 */ 43 */
44#define XFS_BLI_INODE_BUF 0x1 44#define XFS_BLF_INODE_BUF 0x1
45/* 45/*
46 * This flag indicates that the buffer should not be replayed 46 * This flag indicates that the buffer should not be replayed
47 * during recovery because its blocks are being freed. 47 * during recovery because its blocks are being freed.
48 */ 48 */
49#define XFS_BLI_CANCEL 0x2 49#define XFS_BLF_CANCEL 0x2
50/* 50/*
51 * This flag indicates that the buffer contains on disk 51 * This flag indicates that the buffer contains on disk
52 * user or group dquots and may require special recovery handling. 52 * user or group dquots and may require special recovery handling.
53 */ 53 */
54#define XFS_BLI_UDQUOT_BUF 0x4 54#define XFS_BLF_UDQUOT_BUF 0x4
55#define XFS_BLI_PDQUOT_BUF 0x8 55#define XFS_BLF_PDQUOT_BUF 0x8
56#define XFS_BLI_GDQUOT_BUF 0x10 56#define XFS_BLF_GDQUOT_BUF 0x10
57 57
58#define XFS_BLI_CHUNK 128 58#define XFS_BLF_CHUNK 128
59#define XFS_BLI_SHIFT 7 59#define XFS_BLF_SHIFT 7
60#define BIT_TO_WORD_SHIFT 5 60#define BIT_TO_WORD_SHIFT 5
61#define NBWORD (NBBY * sizeof(unsigned int)) 61#define NBWORD (NBBY * sizeof(unsigned int))
62 62
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format_t {
69#define XFS_BLI_LOGGED 0x08 69#define XFS_BLI_LOGGED 0x08
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72#define XFS_BLI_INODE_BUF 0x40
72 73
73#define XFS_BLI_FLAGS \ 74#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \ 75 { XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format_t {
76 { XFS_BLI_STALE, "STALE" }, \ 77 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \ 78 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 79 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" } 80 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81 { XFS_BLI_INODE_BUF, "INODE_BUF" }
80 82
81 83
82#ifdef __KERNEL__ 84#ifdef __KERNEL__
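Editor's note: the rename draws a line between two flag namespaces — XFS_BLF_* live in the on-disk log format (blf_flags, seen by log recovery), while XFS_BLI_* are in-memory item state (bli_flags). The new XFS_BLI_INODE_BUF is in-memory only and is translated into its format counterpart at ->iop_format time, as in the xfs_buf_item.c hunk earlier. A sketch of that translation step; struct buf_log_item here is a reduction, and in_current_chkpt() is a stand-in for xfs_log_item_in_current_chkpt().

/* On-disk log format flags (subset; match the header above). */
#define XFS_BLF_INODE_BUF		0x1

/* In-memory buf log item flags (subset). */
#define XFS_BLI_INODE_ALLOC_BUF		0x10
#define XFS_BLI_INODE_BUF		0x40

struct buf_log_item {
	unsigned int	bli_flags;	/* in-memory state */
	unsigned short	blf_flags;	/* goes into the log */
};

/* Illustrative stand-in for xfs_log_item_in_current_chkpt(). */
static int in_current_chkpt(struct buf_log_item *bip)
{
	(void)bip;
	return 0;
}

static void transfer_inode_buf_flag(struct buf_log_item *bip)
{
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      in_current_chkpt(bip)))
			bip->blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}
}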
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index cd27c9d6c71f..5bba29a07812 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -177,16 +177,26 @@ xfs_swap_extents_check_format(
177 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) 177 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
178 return EINVAL; 178 return EINVAL;
179 179
180 /* Check root block of temp in btree form to max in target */ 180 /*
181 * If we are in a btree format, check that the temp root block will fit
182 * in the target and that it has enough extents to be in btree format
183 * in the target.
184 *
185 * Note that we have to be careful to allow btree->extent conversions
186 * (a common defrag case) which will occur when the temp inode is in
187 * extent format...
188 */
181 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && 189 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
182 XFS_IFORK_BOFF(ip) && 190 ((XFS_IFORK_BOFF(ip) &&
183 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) 191 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
192 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
184 return EINVAL; 193 return EINVAL;
185 194
186 /* Check root block of target in btree form to max in temp */ 195 /* Reciprocal target->temp btree format checks */
187 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && 196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
188 XFS_IFORK_BOFF(tip) && 197 ((XFS_IFORK_BOFF(tip) &&
189 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) 198 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
199 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
190 return EINVAL; 200 return EINVAL;
191 201
192 return 0; 202 return 0;
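Editor's note: the reworked check above allows the common defrag result — a btree-format inode swapping with an extent-format temporary — while rejecting swaps where the btree root would not fit in the target fork, or where the extent count could not remain in btree form there. A standalone predicate with the same shape; the field names mirror the hunk conceptually but the fork-offset/extent-max plumbing is simplified and hypothetical.

#include <stdbool.h>

#define FMT_BTREE	1

struct swap_inode {
	int	format;		/* data fork format */
	int	broot_bytes;	/* btree root size, if btree format */
	int	fork_boff;	/* data fork offset of this inode */
	int	nextents;	/* extent count in the data fork */
	int	ext_max;	/* max extents that fit inline in the fork */
};

/* Returns true when a btree-format @src cannot be swapped into @dst. */
static bool btree_swap_invalid(const struct swap_inode *src,
			       const struct swap_inode *dst)
{
	if (src->format != FMT_BTREE)
		return false;	/* extent format always fits */
	/* root block must fit below the target's fork offset ... */
	if (dst->fork_boff && src->broot_bytes > dst->fork_boff)
		return true;
	/* ... and src must still have too many extents for extent form */
	if (src->nextents <= dst->ext_max)
		return true;
	return false;
}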
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 92d5cd5bf4f2..047b8a8e5c29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
170 va_list ap; 170 va_list ap;
171 171
172#ifdef DEBUG 172#ifdef DEBUG
173 xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; 173 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
174#endif 174#endif
175 175
176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag) 176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
@@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
186 186
187void 187void
188xfs_error_report( 188xfs_error_report(
189 char *tag, 189 const char *tag,
190 int level, 190 int level,
191 xfs_mount_t *mp, 191 struct xfs_mount *mp,
192 char *fname, 192 const char *filename,
193 int linenum, 193 int linenum,
194 inst_t *ra) 194 inst_t *ra)
195{ 195{
196 if (level <= xfs_error_level) { 196 if (level <= xfs_error_level) {
197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
198 CE_ALERT, mp, 198 CE_ALERT, mp,
199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n", 199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
200 tag, linenum, fname, ra); 200 tag, linenum, filename, ra);
201 201
202 xfs_stack_trace(); 202 xfs_stack_trace();
203 } 203 }
@@ -205,15 +205,15 @@ xfs_error_report(
205 205
206void 206void
207xfs_corruption_error( 207xfs_corruption_error(
208 char *tag, 208 const char *tag,
209 int level, 209 int level,
210 xfs_mount_t *mp, 210 struct xfs_mount *mp,
211 void *p, 211 void *p,
212 char *fname, 212 const char *filename,
213 int linenum, 213 int linenum,
214 inst_t *ra) 214 inst_t *ra)
215{ 215{
216 if (level <= xfs_error_level) 216 if (level <= xfs_error_level)
217 xfs_hex_dump(p, 16); 217 xfs_hex_dump(p, 16);
218 xfs_error_report(tag, level, mp, fname, linenum, ra); 218 xfs_error_report(tag, level, mp, filename, linenum, ra);
219} 219}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0c93051c4651..c2c1a072bb82 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -29,10 +29,11 @@ extern int xfs_error_trap(int);
29 29
30struct xfs_mount; 30struct xfs_mount;
31 31
32extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp, 32extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
33 char *fname, int linenum, inst_t *ra); 33 const char *filename, int linenum, inst_t *ra);
34extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, 34extern void xfs_corruption_error(const char *tag, int level,
35 void *p, char *fname, int linenum, inst_t *ra); 35 struct xfs_mount *mp, void *p, const char *filename,
36 int linenum, inst_t *ra);
36 37
37#define XFS_ERROR_REPORT(e, lvl, mp) \ 38#define XFS_ERROR_REPORT(e, lvl, mp) \
38 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) 39 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
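The constification above leaves the wrapper pattern untouched: XFS_ERROR_REPORT captures __FILE__, __LINE__ and the return address at the call site and forwards them to the reporting function. A small standalone sketch of that pattern, assuming a GCC/Clang toolchain for __builtin_return_address; error_report here is an illustrative stand-in, not the kernel symbol:

#include <stdio.h>

/* stand-in for xfs_error_report(); the kernel version also takes a
 * level and a struct xfs_mount pointer */
static void error_report(const char *tag, const char *filename, int linenum,
                         void *ra)
{
        fprintf(stderr, "internal error %s at line %d of file %s. Caller %p\n",
                tag, linenum, filename, ra);
}

/* capture call-site information, as XFS_ERROR_REPORT does */
#define ERROR_REPORT(tag) \
        error_report((tag), __FILE__, __LINE__, __builtin_return_address(0))

int main(void)
{
        ERROR_REPORT("example-tag");
        return 0;
}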
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6f35ed1b39b9..409fe81585fd 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
106 */ 106 */
107/*ARGSUSED*/ 107/*ARGSUSED*/
108STATIC void 108STATIC void
109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) 109xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
110{ 110{
111 struct xfs_ail *ailp = efip->efi_item.li_ailp; 111 struct xfs_ail *ailp = efip->efi_item.li_ailp;
112 112
@@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = {
224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
225 xfs_efi_item_format, 225 xfs_efi_item_format,
226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, 226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
227 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, 227 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
229 xfs_efi_item_unpin_remove, 229 xfs_efi_item_unpin_remove,
230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, 230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
@@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t *mp,
259 KM_SLEEP); 259 KM_SLEEP);
260 } 260 }
261 261
262 efip->efi_item.li_type = XFS_LI_EFI; 262 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
263 efip->efi_item.li_ops = &xfs_efi_item_ops;
264 efip->efi_item.li_mountp = mp;
265 efip->efi_item.li_ailp = mp->m_ail;
266 efip->efi_format.efi_nextents = nextents; 263 efip->efi_format.efi_nextents = nextents;
267 efip->efi_format.efi_id = (__psint_t)(void*)efip; 264 efip->efi_format.efi_id = (__psint_t)(void*)efip;
268 265
@@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
428 */ 425 */
429/*ARGSUSED*/ 426/*ARGSUSED*/
430STATIC void 427STATIC void
431xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) 428xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
432{ 429{
433 return; 430 return;
434} 431}
@@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = {
518 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 515 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
519 xfs_efd_item_format, 516 xfs_efd_item_format,
520 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, 517 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
521 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, 518 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
522 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 519 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
523 xfs_efd_item_unpin_remove, 520 xfs_efd_item_unpin_remove,
524 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, 521 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
@@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t *mp,
554 KM_SLEEP); 551 KM_SLEEP);
555 } 552 }
556 553
557 efdp->efd_item.li_type = XFS_LI_EFD; 554 xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
558 efdp->efd_item.li_ops = &xfs_efd_item_ops;
559 efdp->efd_item.li_mountp = mp;
560 efdp->efd_item.li_ailp = mp->m_ail;
561 efdp->efd_efip = efip; 555 efdp->efd_efip = efip;
562 efdp->efd_format.efd_nextents = nextents; 556 efdp->efd_format.efd_nextents = nextents;
563 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; 557 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
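Both EFI and EFD setup now funnel their boilerplate through xfs_log_item_init(), whose body appears in the xfs_log.c hunk further down. A standalone sketch of the same consolidation, with trimmed stand-in types in place of the xfs_mount/xfs_ail/xfs_log_item structures:

#include <stdio.h>

struct ail { int dummy; };
struct item_ops { const char *name; };
struct mount { struct ail *m_ail; };

struct log_item {
        struct mount    *li_mountp;
        struct ail      *li_ailp;
        int              li_type;
        struct item_ops *li_ops;
};

enum { LI_EFI = 1, LI_EFD = 2 };

/* one helper replaces the four open-coded assignments in every *_init() */
static void log_item_init(struct mount *mp, struct log_item *item,
                          int type, struct item_ops *ops)
{
        item->li_mountp = mp;
        item->li_ailp   = mp->m_ail;
        item->li_type   = type;
        item->li_ops    = ops;
}

int main(void)
{
        struct ail ail;
        struct mount m = { &ail };
        struct item_ops efi_ops = { "efi" };
        struct log_item efi;

        log_item_init(&m, &efi, LI_EFI, &efi_ops);
        printf("type=%d ops=%s\n", efi.li_type, efi.li_ops->name);
        return 0;
}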
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0ffd56447045..8cd6e8d8fe9c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2449,6 +2449,8 @@ xfs_iunpin_nowait(
2449{ 2449{
2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2451 2451
2452 trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2453
2452 /* Give the log a push to start the unpinning I/O */ 2454 /* Give the log a push to start the unpinning I/O */
2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2455 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2454 2456
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7bfea8540159..cf8249a60004 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -543,6 +543,7 @@ xfs_inode_item_pin(
543{ 543{
544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
545 545
546 trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
546 atomic_inc(&iip->ili_inode->i_pincount); 547 atomic_inc(&iip->ili_inode->i_pincount);
547} 548}
548 549
@@ -556,11 +557,11 @@ xfs_inode_item_pin(
556/* ARGSUSED */ 557/* ARGSUSED */
557STATIC void 558STATIC void
558xfs_inode_item_unpin( 559xfs_inode_item_unpin(
559 xfs_inode_log_item_t *iip, 560 xfs_inode_log_item_t *iip)
560 int stale)
561{ 561{
562 struct xfs_inode *ip = iip->ili_inode; 562 struct xfs_inode *ip = iip->ili_inode;
563 563
564 trace_xfs_inode_unpin(ip, _RET_IP_);
564 ASSERT(atomic_read(&ip->i_pincount) > 0); 565 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount)) 566 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait); 567 wake_up(&ip->i_ipin_wait);
@@ -572,7 +573,7 @@ xfs_inode_item_unpin_remove(
572 xfs_inode_log_item_t *iip, 573 xfs_inode_log_item_t *iip,
573 xfs_trans_t *tp) 574 xfs_trans_t *tp)
574{ 575{
575 xfs_inode_item_unpin(iip, 0); 576 xfs_inode_item_unpin(iip);
576} 577}
577 578
578/* 579/*
@@ -838,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = {
838 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 839 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
839 xfs_inode_item_format, 840 xfs_inode_item_format,
840 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, 841 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
841 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, 842 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
842 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 843 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
843 xfs_inode_item_unpin_remove, 844 xfs_inode_item_unpin_remove,
844 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, 845 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
@@ -865,17 +866,9 @@ xfs_inode_item_init(
865 ASSERT(ip->i_itemp == NULL); 866 ASSERT(ip->i_itemp == NULL);
866 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 867 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
867 868
868 iip->ili_item.li_type = XFS_LI_INODE;
869 iip->ili_item.li_ops = &xfs_inode_item_ops;
870 iip->ili_item.li_mountp = mp;
871 iip->ili_item.li_ailp = mp->m_ail;
872 iip->ili_inode = ip; 869 iip->ili_inode = ip;
873 870 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
874 /* 871 &xfs_inode_item_ops);
875 We have zeroed memory. No need ...
876 iip->ili_extents_buf = NULL;
877 */
878
879 iip->ili_format.ilf_type = XFS_LI_INODE; 872 iip->ili_format.ilf_type = XFS_LI_INODE;
880 iip->ili_format.ilf_ino = ip->i_ino; 873 iip->ili_format.ilf_ino = ip->i_ino;
881 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; 874 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0b65039951a0..ef14943829da 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -55,71 +55,33 @@
55#define XFS_STRAT_WRITE_IMAPS 2 55#define XFS_STRAT_WRITE_IMAPS 2
56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
57 57
58STATIC int 58STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
59xfs_imap_to_bmap( 59 int, struct xfs_bmbt_irec *, int *);
60 xfs_inode_t *ip, 60STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
61 xfs_off_t offset, 61 struct xfs_bmbt_irec *, int *);
62 xfs_bmbt_irec_t *imap, 62STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
63 xfs_iomap_t *iomapp, 63 struct xfs_bmbt_irec *, int *);
64 int imaps, /* Number of imap entries */
65 int iomaps, /* Number of iomap entries */
66 int flags)
67{
68 xfs_mount_t *mp = ip->i_mount;
69 int pbm;
70 xfs_fsblock_t start_block;
71
72
73 for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
74 iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
75 iomapp->iomap_delta = offset - iomapp->iomap_offset;
76 iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
77 iomapp->iomap_flags = flags;
78
79 if (XFS_IS_REALTIME_INODE(ip)) {
80 iomapp->iomap_flags |= IOMAP_REALTIME;
81 iomapp->iomap_target = mp->m_rtdev_targp;
82 } else {
83 iomapp->iomap_target = mp->m_ddev_targp;
84 }
85 start_block = imap->br_startblock;
86 if (start_block == HOLESTARTBLOCK) {
87 iomapp->iomap_bn = IOMAP_DADDR_NULL;
88 iomapp->iomap_flags |= IOMAP_HOLE;
89 } else if (start_block == DELAYSTARTBLOCK) {
90 iomapp->iomap_bn = IOMAP_DADDR_NULL;
91 iomapp->iomap_flags |= IOMAP_DELAY;
92 } else {
93 iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
94 if (ISUNWRITTEN(imap))
95 iomapp->iomap_flags |= IOMAP_UNWRITTEN;
96 }
97
98 offset += iomapp->iomap_bsize - iomapp->iomap_delta;
99 }
100 return pbm; /* Return the number filled */
101}
102 64
103int 65int
104xfs_iomap( 66xfs_iomap(
105 xfs_inode_t *ip, 67 struct xfs_inode *ip,
106 xfs_off_t offset, 68 xfs_off_t offset,
107 ssize_t count, 69 ssize_t count,
108 int flags, 70 int flags,
109 xfs_iomap_t *iomapp, 71 struct xfs_bmbt_irec *imap,
110 int *niomaps) 72 int *nimaps,
73 int *new)
111{ 74{
112 xfs_mount_t *mp = ip->i_mount; 75 struct xfs_mount *mp = ip->i_mount;
113 xfs_fileoff_t offset_fsb, end_fsb; 76 xfs_fileoff_t offset_fsb, end_fsb;
114 int error = 0; 77 int error = 0;
115 int lockmode = 0; 78 int lockmode = 0;
116 xfs_bmbt_irec_t imap; 79 int bmapi_flags = 0;
117 int nimaps = 1;
118 int bmapi_flags = 0;
119 int iomap_flags = 0;
120 80
121 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 81 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
122 82
83 *new = 0;
84
123 if (XFS_FORCED_SHUTDOWN(mp)) 85 if (XFS_FORCED_SHUTDOWN(mp))
124 return XFS_ERROR(EIO); 86 return XFS_ERROR(EIO);
125 87
@@ -160,8 +122,8 @@ xfs_iomap(
160 122
161 error = xfs_bmapi(NULL, ip, offset_fsb, 123 error = xfs_bmapi(NULL, ip, offset_fsb,
162 (xfs_filblks_t)(end_fsb - offset_fsb), 124 (xfs_filblks_t)(end_fsb - offset_fsb),
163 bmapi_flags, NULL, 0, &imap, 125 bmapi_flags, NULL, 0, imap,
164 &nimaps, NULL, NULL); 126 nimaps, NULL, NULL);
165 127
166 if (error) 128 if (error)
167 goto out; 129 goto out;
@@ -169,46 +131,41 @@ xfs_iomap(
169 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { 131 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
170 case BMAPI_WRITE: 132 case BMAPI_WRITE:
171 /* If we found an extent, return it */ 133 /* If we found an extent, return it */
172 if (nimaps && 134 if (*nimaps &&
173 (imap.br_startblock != HOLESTARTBLOCK) && 135 (imap->br_startblock != HOLESTARTBLOCK) &&
174 (imap.br_startblock != DELAYSTARTBLOCK)) { 136 (imap->br_startblock != DELAYSTARTBLOCK)) {
175 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 137 trace_xfs_iomap_found(ip, offset, count, flags, imap);
176 break; 138 break;
177 } 139 }
178 140
179 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { 141 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
180 error = xfs_iomap_write_direct(ip, offset, count, flags, 142 error = xfs_iomap_write_direct(ip, offset, count, flags,
181 &imap, &nimaps, nimaps); 143 imap, nimaps);
182 } else { 144 } else {
183 error = xfs_iomap_write_delay(ip, offset, count, flags, 145 error = xfs_iomap_write_delay(ip, offset, count, flags,
184 &imap, &nimaps); 146 imap, nimaps);
185 } 147 }
186 if (!error) { 148 if (!error) {
187 trace_xfs_iomap_alloc(ip, offset, count, flags, &imap); 149 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
188 } 150 }
189 iomap_flags = IOMAP_NEW; 151 *new = 1;
190 break; 152 break;
191 case BMAPI_ALLOCATE: 153 case BMAPI_ALLOCATE:
192 /* If we found an extent, return it */ 154 /* If we found an extent, return it */
193 xfs_iunlock(ip, lockmode); 155 xfs_iunlock(ip, lockmode);
194 lockmode = 0; 156 lockmode = 0;
195 157
196 if (nimaps && !isnullstartblock(imap.br_startblock)) { 158 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
197 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 159 trace_xfs_iomap_found(ip, offset, count, flags, imap);
198 break; 160 break;
199 } 161 }
200 162
201 error = xfs_iomap_write_allocate(ip, offset, count, 163 error = xfs_iomap_write_allocate(ip, offset, count,
202 &imap, &nimaps); 164 imap, nimaps);
203 break; 165 break;
204 } 166 }
205 167
206 if (nimaps) { 168 ASSERT(*nimaps <= 1);
207 *niomaps = xfs_imap_to_bmap(ip, offset, &imap,
208 iomapp, nimaps, *niomaps, iomap_flags);
209 } else if (niomaps) {
210 *niomaps = 0;
211 }
212 169
213out: 170out:
214 if (lockmode) 171 if (lockmode)
@@ -216,7 +173,6 @@ out:
216 return XFS_ERROR(error); 173 return XFS_ERROR(error);
217} 174}
218 175
219
220STATIC int 176STATIC int
221xfs_iomap_eof_align_last_fsb( 177xfs_iomap_eof_align_last_fsb(
222 xfs_mount_t *mp, 178 xfs_mount_t *mp,
@@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero(
285 return EFSCORRUPTED; 241 return EFSCORRUPTED;
286} 242}
287 243
288int 244STATIC int
289xfs_iomap_write_direct( 245xfs_iomap_write_direct(
290 xfs_inode_t *ip, 246 xfs_inode_t *ip,
291 xfs_off_t offset, 247 xfs_off_t offset,
292 size_t count, 248 size_t count,
293 int flags, 249 int flags,
294 xfs_bmbt_irec_t *ret_imap, 250 xfs_bmbt_irec_t *ret_imap,
295 int *nmaps, 251 int *nmaps)
296 int found)
297{ 252{
298 xfs_mount_t *mp = ip->i_mount; 253 xfs_mount_t *mp = ip->i_mount;
299 xfs_fileoff_t offset_fsb; 254 xfs_fileoff_t offset_fsb;
@@ -330,7 +285,7 @@ xfs_iomap_write_direct(
330 if (error) 285 if (error)
331 goto error_out; 286 goto error_out;
332 } else { 287 } else {
333 if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) 288 if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
334 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 289 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
335 ret_imap->br_blockcount + 290 ret_imap->br_blockcount +
336 ret_imap->br_startoff); 291 ret_imap->br_startoff);
@@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate(
485 return 0; 440 return 0;
486} 441}
487 442
488int 443STATIC int
489xfs_iomap_write_delay( 444xfs_iomap_write_delay(
490 xfs_inode_t *ip, 445 xfs_inode_t *ip,
491 xfs_off_t offset, 446 xfs_off_t offset,
@@ -588,7 +543,7 @@ retry:
588 * We no longer bother to look at the incoming map - all we have to 543 * We no longer bother to look at the incoming map - all we have to
589 * guarantee is that whatever we allocate fills the required range. 544 * guarantee is that whatever we allocate fills the required range.
590 */ 545 */
591int 546STATIC int
592xfs_iomap_write_allocate( 547xfs_iomap_write_allocate(
593 xfs_inode_t *ip, 548 xfs_inode_t *ip,
594 xfs_off_t offset, 549 xfs_off_t offset,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 174f29990991..81ac4afd45b3 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,19 +18,6 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
22
23
24typedef enum { /* iomap_flags values */
25 IOMAP_READ = 0, /* mapping for a read */
26 IOMAP_HOLE = 0x02, /* mapping covers a hole */
27 IOMAP_DELAY = 0x04, /* mapping covers delalloc region */
28 IOMAP_REALTIME = 0x10, /* mapping on the realtime device */
29 IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */
30 /* but uninitialized file data */
31 IOMAP_NEW = 0x40 /* just allocate */
32} iomap_flags_t;
33
34typedef enum { 21typedef enum {
35 /* base extent manipulation calls */ 22 /* base extent manipulation calls */
36 BMAPI_READ = (1 << 0), /* read extents */ 23 BMAPI_READ = (1 << 0), /* read extents */
@@ -52,43 +39,11 @@ typedef enum {
52 { BMAPI_MMAP, "MMAP" }, \ 39 { BMAPI_MMAP, "MMAP" }, \
53 { BMAPI_TRYLOCK, "TRYLOCK" } 40 { BMAPI_TRYLOCK, "TRYLOCK" }
54 41
55/*
56 * xfs_iomap_t: File system I/O map
57 *
58 * The iomap_bn field is expressed in 512-byte blocks, and is where the
59 * mapping starts on disk.
60 *
61 * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
62 * iomap_offset is the offset of the mapping in the file itself.
63 * iomap_bsize is the size of the mapping, iomap_delta is the
64 * desired data's offset into the mapping, given the offset supplied
65 * to the file I/O map routine.
66 *
67 * When a request is made to read beyond the logical end of the object,
68 * iomap_size may be set to 0, but iomap_offset and iomap_length should be set
69 * to the actual amount of underlying storage that has been allocated, if any.
70 */
71
72typedef struct xfs_iomap {
73 xfs_daddr_t iomap_bn; /* first 512B blk of mapping */
74 xfs_buftarg_t *iomap_target;
75 xfs_off_t iomap_offset; /* offset of mapping, bytes */
76 xfs_off_t iomap_bsize; /* size of mapping, bytes */
77 xfs_off_t iomap_delta; /* offset into mapping, bytes */
78 iomap_flags_t iomap_flags;
79} xfs_iomap_t;
80
81struct xfs_inode; 42struct xfs_inode;
82struct xfs_bmbt_irec; 43struct xfs_bmbt_irec;
83 44
84extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 45extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
85 struct xfs_iomap *, int *); 46 struct xfs_bmbt_irec *, int *, int *);
86extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
87 int, struct xfs_bmbt_irec *, int *, int);
88extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
89 struct xfs_bmbt_irec *, int *);
90extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
91 struct xfs_bmbt_irec *, int *);
92extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 47extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
93 48
94#endif /* __XFS_IOMAP_H__*/ 49#endif /* __XFS_IOMAP_H__*/
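With xfs_iomap_t gone, callers of xfs_iomap() receive a raw struct xfs_bmbt_irec plus a separate "new" flag and must classify the mapping themselves, the job the removed xfs_imap_to_bmap() did with IOMAP_HOLE/IOMAP_DELAY. A standalone sketch of that classification; the sentinel values and struct fields below are simplified stand-ins for the XFS definitions:

#include <stdio.h>

#define HOLESTARTBLOCK          (-2LL)
#define DELAYSTARTBLOCK         (-1LL)

struct bmbt_irec {
        long long br_startoff;          /* file offset, in fs blocks */
        long long br_startblock;        /* fs block, or a sentinel above */
        long long br_blockcount;        /* length, in fs blocks */
};

static const char *classify(const struct bmbt_irec *imap)
{
        if (imap->br_startblock == HOLESTARTBLOCK)
                return "hole";
        if (imap->br_startblock == DELAYSTARTBLOCK)
                return "delalloc";
        return "real extent";
}

int main(void)
{
        struct bmbt_irec maps[] = {
                { 0, HOLESTARTBLOCK, 16 },
                { 16, DELAYSTARTBLOCK, 8 },
                { 24, 1024, 8 },
        };
        for (int i = 0; i < 3; i++)
                printf("off %lld: %s\n", maps[i].br_startoff,
                       classify(&maps[i]));
        return 0;
}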
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e8fba92d7cd9..5215abc8023a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -44,13 +44,8 @@
44 44
45kmem_zone_t *xfs_log_ticket_zone; 45kmem_zone_t *xfs_log_ticket_zone;
46 46
47#define xlog_write_adv_cnt(ptr, len, off, bytes) \
48 { (ptr) += (bytes); \
49 (len) -= (bytes); \
50 (off) += (bytes);}
51
52/* Local miscellaneous function prototypes */ 47/* Local miscellaneous function prototypes */
53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, 48STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
54 xlog_in_core_t **, xfs_lsn_t *); 49 xlog_in_core_t **, xfs_lsn_t *);
55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, 50STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
56 xfs_buftarg_t *log_target, 51 xfs_buftarg_t *log_target,
@@ -59,11 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
59STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
61STATIC void xlog_dealloc_log(xlog_t *log); 56STATIC void xlog_dealloc_log(xlog_t *log);
62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
63 int nentries, struct xlog_ticket *tic,
64 xfs_lsn_t *start_lsn,
65 xlog_in_core_t **commit_iclog,
66 uint flags);
67 57
68/* local state machine functions */ 58/* local state machine functions */
69STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); 59STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -93,16 +83,8 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
93STATIC void xlog_ungrant_log_space(xlog_t *log, 83STATIC void xlog_ungrant_log_space(xlog_t *log,
94 xlog_ticket_t *ticket); 84 xlog_ticket_t *ticket);
95 85
96
97/* local ticket functions */
98STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
99 int unit_bytes,
100 int count,
101 char clientid,
102 uint flags);
103
104#if defined(DEBUG) 86#if defined(DEBUG)
105STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); 87STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
106STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 88STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
107STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 89STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
108 int count, boolean_t syncing); 90 int count, boolean_t syncing);
@@ -258,7 +240,7 @@ xfs_log_done(
258 * If we get an error, just continue and give back the log ticket. 240 * If we get an error, just continue and give back the log ticket.
259 */ 241 */
260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 242 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
261 (xlog_commit_record(mp, ticket, iclog, &lsn)))) { 243 (xlog_commit_record(log, ticket, iclog, &lsn)))) {
262 lsn = (xfs_lsn_t) -1; 244 lsn = (xfs_lsn_t) -1;
263 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 245 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
264 flags |= XFS_LOG_REL_PERM_RESERV; 246 flags |= XFS_LOG_REL_PERM_RESERV;
@@ -367,6 +349,15 @@ xfs_log_reserve(
367 ASSERT(flags & XFS_LOG_PERM_RESERV); 349 ASSERT(flags & XFS_LOG_PERM_RESERV);
368 internal_ticket = *ticket; 350 internal_ticket = *ticket;
369 351
352 /*
353 * this is a new transaction on the ticket, so we need to
354 * change the transaction ID so that the next transaction has a
355 * different TID in the log. Just add one to the existing tid
356 * so that we can see chains of rolling transactions in the log
357 * easily.
358 */
359 internal_ticket->t_tid++;
360
370 trace_xfs_log_reserve(log, internal_ticket); 361 trace_xfs_log_reserve(log, internal_ticket);
371 362
372 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 363 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -374,7 +365,8 @@ xfs_log_reserve(
374 } else { 365 } else {
375 /* may sleep if need to allocate more tickets */ 366 /* may sleep if need to allocate more tickets */
376 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, 367 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
377 client, flags); 368 client, flags,
369 KM_SLEEP|KM_MAYFAIL);
378 if (!internal_ticket) 370 if (!internal_ticket)
379 return XFS_ERROR(ENOMEM); 371 return XFS_ERROR(ENOMEM);
380 internal_ticket->t_trans_type = t_type; 372 internal_ticket->t_trans_type = t_type;
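A tiny standalone model of the TID bump above: a permanent ticket is reused across a chain of rolling transactions, and incrementing t_tid on each re-reservation keeps the chain members distinguishable in the log. Types and field names are stand-ins:

#include <stdio.h>
#include <stdint.h>

struct ticket {
        uint32_t t_tid;         /* transaction ID written to the log */
        int      t_perm;        /* permanent reservation, reused by rolls */
};

/* on re-reservation of a permanent ticket, bump the TID so every
 * transaction in the rolling chain gets a distinct ID in the log */
static void log_reserve(struct ticket *tic)
{
        if (tic->t_perm)
                tic->t_tid++;
}

int main(void)
{
        struct ticket tic = { 42, 1 };
        for (int i = 0; i < 3; i++) {
                log_reserve(&tic);
                printf("rolled transaction, tid=%u\n", tic.t_tid);
        }
        return 0;
}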
@@ -459,6 +451,13 @@ xfs_log_mount(
459 /* Normal transactions can now occur */ 451 /* Normal transactions can now occur */
460 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 452 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
461 453
454 /*
455 * Now the log has been fully initialised and we know where our
456 * space grant counters are, we can initialise the permanent ticket
457 * needed for delayed logging to work.
458 */
459 xlog_cil_init_post_recovery(mp->m_log);
460
462 return 0; 461 return 0;
463 462
464out_destroy_ail: 463out_destroy_ail:
@@ -516,18 +515,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
516#ifdef DEBUG 515#ifdef DEBUG
517 xlog_in_core_t *first_iclog; 516 xlog_in_core_t *first_iclog;
518#endif 517#endif
519 xfs_log_iovec_t reg[1];
520 xlog_ticket_t *tic = NULL; 518 xlog_ticket_t *tic = NULL;
521 xfs_lsn_t lsn; 519 xfs_lsn_t lsn;
522 int error; 520 int error;
523 521
524 /* the data section must be 32 bit size aligned */
525 struct {
526 __uint16_t magic;
527 __uint16_t pad1;
528 __uint32_t pad2; /* may as well make it 64 bits */
529 } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
530
531 /* 522 /*
532 * Don't write out unmount record on read-only mounts. 523 * Don't write out unmount record on read-only mounts.
533 * Or, if we are doing a forced umount (typically because of IO errors). 524 * Or, if we are doing a forced umount (typically because of IO errors).
@@ -549,16 +540,30 @@ xfs_log_unmount_write(xfs_mount_t *mp)
549 } while (iclog != first_iclog); 540 } while (iclog != first_iclog);
550#endif 541#endif
551 if (! (XLOG_FORCED_SHUTDOWN(log))) { 542 if (! (XLOG_FORCED_SHUTDOWN(log))) {
552 reg[0].i_addr = (void*)&magic;
553 reg[0].i_len = sizeof(magic);
554 reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
555
556 error = xfs_log_reserve(mp, 600, 1, &tic, 543 error = xfs_log_reserve(mp, 600, 1, &tic,
557 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); 544 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
558 if (!error) { 545 if (!error) {
546 /* the data section must be 32 bit size aligned */
547 struct {
548 __uint16_t magic;
549 __uint16_t pad1;
550 __uint32_t pad2; /* may as well make it 64 bits */
551 } magic = {
552 .magic = XLOG_UNMOUNT_TYPE,
553 };
554 struct xfs_log_iovec reg = {
555 .i_addr = (void *)&magic,
556 .i_len = sizeof(magic),
557 .i_type = XLOG_REG_TYPE_UNMOUNT,
558 };
559 struct xfs_log_vec vec = {
560 .lv_niovecs = 1,
561 .lv_iovecp = &reg,
562 };
563
559 /* remove inited flag */ 564 /* remove inited flag */
560 ((xlog_ticket_t *)tic)->t_flags = 0; 565 tic->t_flags = 0;
561 error = xlog_write(mp, reg, 1, tic, &lsn, 566 error = xlog_write(log, &vec, tic, &lsn,
562 NULL, XLOG_UNMOUNT_TRANS); 567 NULL, XLOG_UNMOUNT_TRANS);
563 /* 568 /*
564 * At this point, we're umounting anyway, 569 * At this point, we're umounting anyway,
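The on-stack unmount record above is padded so its size stays 32-bit aligned, per the comment carried over from the old code. A standalone check of that layout; the magic value is a stand-in for XLOG_UNMOUNT_TYPE:

#include <stdint.h>
#include <stdio.h>

#define UNMOUNT_TYPE 0x556e             /* assumed stand-in magic */

struct unmount_rec {
        uint16_t magic;
        uint16_t pad1;
        uint32_t pad2;                  /* may as well make it 64 bits */
};

/* the data section must be 32 bit size aligned */
_Static_assert(sizeof(struct unmount_rec) % sizeof(uint32_t) == 0,
               "unmount record not 32-bit size aligned");

int main(void)
{
        struct unmount_rec rec = { .magic = UNMOUNT_TYPE };
        printf("magic=0x%x size=%zu\n", rec.magic, sizeof(rec));
        return 0;
}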
@@ -648,10 +653,30 @@ xfs_log_unmount(xfs_mount_t *mp)
648 xlog_dealloc_log(mp->m_log); 653 xlog_dealloc_log(mp->m_log);
649} 654}
650 655
656void
657xfs_log_item_init(
658 struct xfs_mount *mp,
659 struct xfs_log_item *item,
660 int type,
661 struct xfs_item_ops *ops)
662{
663 item->li_mountp = mp;
664 item->li_ailp = mp->m_ail;
665 item->li_type = type;
666 item->li_ops = ops;
667 item->li_lv = NULL;
668
669 INIT_LIST_HEAD(&item->li_ail);
670 INIT_LIST_HEAD(&item->li_cil);
671}
672
651/* 673/*
652 * Write region vectors to log. The write happens using the space reservation 674 * Write region vectors to log. The write happens using the space reservation
653 * of the ticket (tic). It is not a requirement that all writes for a given 675 * of the ticket (tic). It is not a requirement that all writes for a given
654 * transaction occur with one call to xfs_log_write(). 676 * transaction occur with one call to xfs_log_write(). However, it is important
677 * to note that the transaction reservation code makes an assumption about the
678 * number of log headers a transaction requires that may be violated if you
679 * don't pass all the transaction vectors in one call....
655 */ 680 */
656int 681int
657xfs_log_write( 682xfs_log_write(
@@ -663,11 +688,15 @@ xfs_log_write(
663{ 688{
664 struct log *log = mp->m_log; 689 struct log *log = mp->m_log;
665 int error; 690 int error;
691 struct xfs_log_vec vec = {
692 .lv_niovecs = nentries,
693 .lv_iovecp = reg,
694 };
666 695
667 if (XLOG_FORCED_SHUTDOWN(log)) 696 if (XLOG_FORCED_SHUTDOWN(log))
668 return XFS_ERROR(EIO); 697 return XFS_ERROR(EIO);
669 698
670 error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0); 699 error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
671 if (error) 700 if (error)
672 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 701 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
673 return error; 702 return error;
@@ -745,9 +774,16 @@ xfs_log_move_tail(xfs_mount_t *mp,
745 774
746/* 775/*
747 * Determine if we have a transaction that has gone to disk 776 * Determine if we have a transaction that has gone to disk
748 * that needs to be covered. Log activity needs to be idle (no AIL and 777 * that needs to be covered. To begin the transition to the idle state
749 * nothing in the iclogs). And, we need to be in the right state indicating 778 * firstly the log needs to be idle (no AIL and nothing in the iclogs).
750 * something has gone out. 779 * If we are then in a state where covering is needed, the caller is informed
780 * that dummy transactions are required to move the log into the idle state.
781 *
782 * Because this is called as part of the sync process, we should also indicate
783 * that dummy transactions should be issued in anything but the covered or
784 * idle states. This ensures that the log tail is accurately reflected in
785 * the log at the end of the sync, hence if a crash occurs this avoids replay
786 * of transactions where the metadata is already on disk.
751 */ 787 */
752int 788int
753xfs_log_need_covered(xfs_mount_t *mp) 789xfs_log_need_covered(xfs_mount_t *mp)
@@ -759,17 +795,24 @@ xfs_log_need_covered(xfs_mount_t *mp)
759 return 0; 795 return 0;
760 796
761 spin_lock(&log->l_icloglock); 797 spin_lock(&log->l_icloglock);
762 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || 798 switch (log->l_covered_state) {
763 (log->l_covered_state == XLOG_STATE_COVER_NEED2)) 799 case XLOG_STATE_COVER_DONE:
764 && !xfs_trans_ail_tail(log->l_ailp) 800 case XLOG_STATE_COVER_DONE2:
765 && xlog_iclogs_empty(log)) { 801 case XLOG_STATE_COVER_IDLE:
766 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 802 break;
767 log->l_covered_state = XLOG_STATE_COVER_DONE; 803 case XLOG_STATE_COVER_NEED:
768 else { 804 case XLOG_STATE_COVER_NEED2:
769 ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2); 805 if (!xfs_trans_ail_tail(log->l_ailp) &&
770 log->l_covered_state = XLOG_STATE_COVER_DONE2; 806 xlog_iclogs_empty(log)) {
807 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
808 log->l_covered_state = XLOG_STATE_COVER_DONE;
809 else
810 log->l_covered_state = XLOG_STATE_COVER_DONE2;
771 } 811 }
812 /* FALLTHRU */
813 default:
772 needed = 1; 814 needed = 1;
815 break;
773 } 816 }
774 spin_unlock(&log->l_icloglock); 817 spin_unlock(&log->l_icloglock);
775 return needed; 818 return needed;
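A standalone restatement of the reworked decision above: the DONE/DONE2/IDLE states suppress dummy transactions, NEED/NEED2 advance when the log is idle and then fall through, and any other state also reports that covering work remains. Enum names and values are local stand-ins:

#include <stdio.h>

enum cover_state {
        COVER_IDLE, COVER_NEED, COVER_DONE, COVER_NEED2, COVER_DONE2,
};

/* log_is_idle stands in for "AIL empty and iclogs empty" */
static int need_covered(enum cover_state *state, int log_is_idle)
{
        int needed = 0;

        switch (*state) {
        case COVER_DONE:
        case COVER_DONE2:
        case COVER_IDLE:
                break;
        case COVER_NEED:
        case COVER_NEED2:
                if (log_is_idle)
                        *state = (*state == COVER_NEED) ? COVER_DONE
                                                        : COVER_DONE2;
                /* fall through: a dummy transaction is still needed */
        default:
                needed = 1;
                break;
        }
        return needed;
}

int main(void)
{
        enum cover_state s = COVER_NEED;

        /* first call advances NEED -> DONE, second call is a no-op */
        printf("needed=%d state=%d\n", need_covered(&s, 1), s);
        printf("needed=%d state=%d\n", need_covered(&s, 1), s);
        return 0;
}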
@@ -1006,6 +1049,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1006 int i; 1049 int i;
1007 int iclogsize; 1050 int iclogsize;
1008 int error = ENOMEM; 1051 int error = ENOMEM;
1052 uint log2_size = 0;
1009 1053
1010 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1054 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1011 if (!log) { 1055 if (!log) {
@@ -1031,29 +1075,30 @@ xlog_alloc_log(xfs_mount_t *mp,
1031 1075
1032 error = EFSCORRUPTED; 1076 error = EFSCORRUPTED;
1033 if (xfs_sb_version_hassector(&mp->m_sb)) { 1077 if (xfs_sb_version_hassector(&mp->m_sb)) {
1034 log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; 1078 log2_size = mp->m_sb.sb_logsectlog;
1035 if (log->l_sectbb_log < 0 || 1079 if (log2_size < BBSHIFT) {
1036 log->l_sectbb_log > mp->m_sectbb_log) { 1080 xlog_warn("XFS: Log sector size too small "
1037 xlog_warn("XFS: Log sector size (0x%x) out of range.", 1081 "(0x%x < 0x%x)", log2_size, BBSHIFT);
1038 log->l_sectbb_log);
1039 goto out_free_log; 1082 goto out_free_log;
1040 } 1083 }
1041 1084
1042 /* for larger sector sizes, must have v2 or external log */ 1085 log2_size -= BBSHIFT;
1043 if (log->l_sectbb_log != 0 && 1086 if (log2_size > mp->m_sectbb_log) {
1044 (log->l_logBBstart != 0 && 1087 xlog_warn("XFS: Log sector size too large "
1045 !xfs_sb_version_haslogv2(&mp->m_sb))) { 1088 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log);
1046 xlog_warn("XFS: log sector size (0x%x) invalid "
1047 "for configuration.", log->l_sectbb_log);
1048 goto out_free_log; 1089 goto out_free_log;
1049 } 1090 }
1050 if (mp->m_sb.sb_logsectlog < BBSHIFT) { 1091
1051 xlog_warn("XFS: Log sector log (0x%x) too small.", 1092 /* for larger sector sizes, must have v2 or external log */
1052 mp->m_sb.sb_logsectlog); 1093 if (log2_size && log->l_logBBstart > 0 &&
1094 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1095
1096 xlog_warn("XFS: log sector size (0x%x) invalid "
1097 "for configuration.", log2_size);
1053 goto out_free_log; 1098 goto out_free_log;
1054 } 1099 }
1055 } 1100 }
1056 log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; 1101 log->l_sectBBsize = 1 << log2_size;
1057 1102
1058 xlog_get_iclog_buffer_size(mp, log); 1103 xlog_get_iclog_buffer_size(mp, log);
1059 1104
@@ -1133,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1133 *iclogp = log->l_iclog; /* complete ring */ 1178 *iclogp = log->l_iclog; /* complete ring */
1134 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1179 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
1135 1180
1181 error = xlog_cil_init(log);
1182 if (error)
1183 goto out_free_iclog;
1136 return log; 1184 return log;
1137 1185
1138out_free_iclog: 1186out_free_iclog:
@@ -1160,26 +1208,31 @@ out:
1160 * ticket. Return the lsn of the commit record. 1208 * ticket. Return the lsn of the commit record.
1161 */ 1209 */
1162STATIC int 1210STATIC int
1163xlog_commit_record(xfs_mount_t *mp, 1211xlog_commit_record(
1164 xlog_ticket_t *ticket, 1212 struct log *log,
1165 xlog_in_core_t **iclog, 1213 struct xlog_ticket *ticket,
1166 xfs_lsn_t *commitlsnp) 1214 struct xlog_in_core **iclog,
1215 xfs_lsn_t *commitlsnp)
1167{ 1216{
1168 int error; 1217 struct xfs_mount *mp = log->l_mp;
1169 xfs_log_iovec_t reg[1]; 1218 int error;
1170 1219 struct xfs_log_iovec reg = {
1171 reg[0].i_addr = NULL; 1220 .i_addr = NULL,
1172 reg[0].i_len = 0; 1221 .i_len = 0,
1173 reg[0].i_type = XLOG_REG_TYPE_COMMIT; 1222 .i_type = XLOG_REG_TYPE_COMMIT,
1223 };
1224 struct xfs_log_vec vec = {
1225 .lv_niovecs = 1,
1226 .lv_iovecp = &reg,
1227 };
1174 1228
1175 ASSERT_ALWAYS(iclog); 1229 ASSERT_ALWAYS(iclog);
1176 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, 1230 error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
1177 iclog, XLOG_COMMIT_TRANS))) { 1231 XLOG_COMMIT_TRANS);
1232 if (error)
1178 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1233 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1179 }
1180 return error; 1234 return error;
1181} /* xlog_commit_record */ 1235}
1182
1183 1236
1184/* 1237/*
1185 * Push on the buffer cache code if we ever use more than 75% of the on-disk 1238 * Push on the buffer cache code if we ever use more than 75% of the on-disk
@@ -1454,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
1454 xlog_in_core_t *iclog, *next_iclog; 1507 xlog_in_core_t *iclog, *next_iclog;
1455 int i; 1508 int i;
1456 1509
1510 xlog_cil_destroy(log);
1511
1457 iclog = log->l_iclog; 1512 iclog = log->l_iclog;
1458 for (i=0; i<log->l_iclog_bufs; i++) { 1513 for (i=0; i<log->l_iclog_bufs; i++) {
1459 sv_destroy(&iclog->ic_force_wait); 1514 sv_destroy(&iclog->ic_force_wait);
@@ -1496,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log,
1496 * print out info relating to regions written which consume 1551 * print out info relating to regions written which consume
1497 * the reservation 1552 * the reservation
1498 */ 1553 */
1499STATIC void 1554void
1500xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1555xlog_print_tic_res(
1556 struct xfs_mount *mp,
1557 struct xlog_ticket *ticket)
1501{ 1558{
1502 uint i; 1559 uint i;
1503 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1560 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1597,6 +1654,196 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1597 "bad-rtype" : res_type_str[r_type-1]), 1654 "bad-rtype" : res_type_str[r_type-1]),
1598 ticket->t_res_arr[i].r_len); 1655 ticket->t_res_arr[i].r_len);
1599 } 1656 }
1657
1658 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1659 "xfs_log_write: reservation ran out. Need to up reservation");
1660 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1661}
1662
1663/*
1664 * Calculate the potential space needed by the log vector. Each region gets
1665 * its own xlog_op_header_t and may need to be double word aligned.
1666 */
1667static int
1668xlog_write_calc_vec_length(
1669 struct xlog_ticket *ticket,
1670 struct xfs_log_vec *log_vector)
1671{
1672 struct xfs_log_vec *lv;
1673 int headers = 0;
1674 int len = 0;
1675 int i;
1676
1677 /* acct for start rec of xact */
1678 if (ticket->t_flags & XLOG_TIC_INITED)
1679 headers++;
1680
1681 for (lv = log_vector; lv; lv = lv->lv_next) {
1682 headers += lv->lv_niovecs;
1683
1684 for (i = 0; i < lv->lv_niovecs; i++) {
1685 struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
1686
1687 len += vecp->i_len;
1688 xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
1689 }
1690 }
1691
1692 ticket->t_res_num_ophdrs += headers;
1693 len += headers * sizeof(struct xlog_op_header);
1694
1695 return len;
1696}
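A standalone worked version of the arithmetic in xlog_write_calc_vec_length(): one op header per region, plus one for the start record when the ticket is still INITED. The op header size is illustrative and the xfs-specific t_res_num_ophdrs accounting is dropped:

#include <stdio.h>

#define OPHDR_SIZE 12   /* assumed stand-in for sizeof(xlog_op_header) */

struct iovec_sketch { int i_len; };
struct log_vec_sketch {
        struct log_vec_sketch *lv_next;
        int                    lv_niovecs;
        struct iovec_sketch   *lv_iovecp;
};

static int calc_vec_length(struct log_vec_sketch *log_vector, int inited)
{
        int headers = inited ? 1 : 0;   /* start record of the transaction */
        int len = 0;

        for (struct log_vec_sketch *lv = log_vector; lv; lv = lv->lv_next) {
                headers += lv->lv_niovecs;      /* one op header per region */
                for (int i = 0; i < lv->lv_niovecs; i++)
                        len += lv->lv_iovecp[i].i_len;
        }
        return len + headers * OPHDR_SIZE;
}

int main(void)
{
        struct iovec_sketch regs[2] = { { 64 }, { 128 } };
        struct log_vec_sketch vec = { NULL, 2, regs };

        /* 64 + 128 payload, 2 region headers + 1 start record header:
         * 192 + 3 * 12 = 228 */
        printf("len=%d\n", calc_vec_length(&vec, 1));
        return 0;
}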
1697
1698/*
1699 * If first write for transaction, insert start record. We can't be trying to
1700 * commit if we are inited. We can't have any "partial_copy" if we are inited.
1701 */
1702static int
1703xlog_write_start_rec(
1704 struct xlog_op_header *ophdr,
1705 struct xlog_ticket *ticket)
1706{
1707 if (!(ticket->t_flags & XLOG_TIC_INITED))
1708 return 0;
1709
1710 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1711 ophdr->oh_clientid = ticket->t_clientid;
1712 ophdr->oh_len = 0;
1713 ophdr->oh_flags = XLOG_START_TRANS;
1714 ophdr->oh_res2 = 0;
1715
1716 ticket->t_flags &= ~XLOG_TIC_INITED;
1717
1718 return sizeof(struct xlog_op_header);
1719}
1720
1721static xlog_op_header_t *
1722xlog_write_setup_ophdr(
1723 struct log *log,
1724 struct xlog_op_header *ophdr,
1725 struct xlog_ticket *ticket,
1726 uint flags)
1727{
1728 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1729 ophdr->oh_clientid = ticket->t_clientid;
1730 ophdr->oh_res2 = 0;
1731
1732 /* are we copying a commit or unmount record? */
1733 ophdr->oh_flags = flags;
1734
1735 /*
1736 * We've seen logs corrupted with bad transaction client ids. This
1737 * makes sure that XFS doesn't generate them. Turn this into an EIO
1738 * and shut down the filesystem.
1739 */
1740 switch (ophdr->oh_clientid) {
1741 case XFS_TRANSACTION:
1742 case XFS_VOLUME:
1743 case XFS_LOG:
1744 break;
1745 default:
1746 xfs_fs_cmn_err(CE_WARN, log->l_mp,
1747 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1748 ophdr->oh_clientid, ticket);
1749 return NULL;
1750 }
1751
1752 return ophdr;
1753}
1754
1755/*
1756 * Set up the parameters of the region copy into the log. This has
1757 * to handle region write split across multiple log buffers - this
1758 * state is kept external to this function so that this code can
1759 * be written in an obvious, self-documenting manner.
1760 */
1761static int
1762xlog_write_setup_copy(
1763 struct xlog_ticket *ticket,
1764 struct xlog_op_header *ophdr,
1765 int space_available,
1766 int space_required,
1767 int *copy_off,
1768 int *copy_len,
1769 int *last_was_partial_copy,
1770 int *bytes_consumed)
1771{
1772 int still_to_copy;
1773
1774 still_to_copy = space_required - *bytes_consumed;
1775 *copy_off = *bytes_consumed;
1776
1777 if (still_to_copy <= space_available) {
1778 /* write of region completes here */
1779 *copy_len = still_to_copy;
1780 ophdr->oh_len = cpu_to_be32(*copy_len);
1781 if (*last_was_partial_copy)
1782 ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1783 *last_was_partial_copy = 0;
1784 *bytes_consumed = 0;
1785 return 0;
1786 }
1787
1788 /* partial write of region, needs extra log op header reservation */
1789 *copy_len = space_available;
1790 ophdr->oh_len = cpu_to_be32(*copy_len);
1791 ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
1792 if (*last_was_partial_copy)
1793 ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
1794 *bytes_consumed += *copy_len;
1795 (*last_was_partial_copy)++;
1796
1797 /* account for new log op header */
1798 ticket->t_curr_res -= sizeof(struct xlog_op_header);
1799 ticket->t_res_num_ophdrs++;
1800
1801 return sizeof(struct xlog_op_header);
1802}
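A standalone simulation of the split-copy state kept external to xlog_write_setup_copy(): a region larger than the space available in one iclog is written in pieces, with bytes_consumed carrying progress between iterations. Buffer sizes are arbitrary and the op header bookkeeping is omitted:

#include <stdio.h>

/*
 * Decide how much of a region fits this time round: a full copy resets
 * the split state, a partial copy records progress and signals that
 * another pass (and another op header) will be needed.
 */
static int setup_copy(int space_available, int space_required,
                      int *copy_len, int *partial, int *bytes_consumed)
{
        int still_to_copy = space_required - *bytes_consumed;

        if (still_to_copy <= space_available) {
                *copy_len = still_to_copy;      /* region completes here */
                *partial = 0;
                *bytes_consumed = 0;
                return 0;
        }
        *copy_len = space_available;            /* partial write */
        *bytes_consumed += *copy_len;
        (*partial)++;
        return 1;
}

int main(void)
{
        int partial = 0, consumed = 0, copy_len;
        int region = 250;                  /* bytes in the region */
        int space[] = { 100, 100, 100 };   /* space left in successive iclogs */

        for (int i = 0; i < 3; i++) {
                int more = setup_copy(space[i], region, &copy_len,
                                      &partial, &consumed);
                printf("chunk %d: copied %d, %s\n", i, copy_len,
                       more ? "continues" : "done");
                if (!more)
                        break;
        }
        return 0;
}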
1803
1804static int
1805xlog_write_copy_finish(
1806 struct log *log,
1807 struct xlog_in_core *iclog,
1808 uint flags,
1809 int *record_cnt,
1810 int *data_cnt,
1811 int *partial_copy,
1812 int *partial_copy_len,
1813 int log_offset,
1814 struct xlog_in_core **commit_iclog)
1815{
1816 if (*partial_copy) {
1817 /*
1818 * This iclog has already been marked WANT_SYNC by
1819 * xlog_state_get_iclog_space.
1820 */
1821 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1822 *record_cnt = 0;
1823 *data_cnt = 0;
1824 return xlog_state_release_iclog(log, iclog);
1825 }
1826
1827 *partial_copy = 0;
1828 *partial_copy_len = 0;
1829
1830 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1831 /* no more space in this iclog - push it. */
1832 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1833 *record_cnt = 0;
1834 *data_cnt = 0;
1835
1836 spin_lock(&log->l_icloglock);
1837 xlog_state_want_sync(log, iclog);
1838 spin_unlock(&log->l_icloglock);
1839
1840 if (!commit_iclog)
1841 return xlog_state_release_iclog(log, iclog);
1842 ASSERT(flags & XLOG_COMMIT_TRANS);
1843 *commit_iclog = iclog;
1844 }
1845
1846 return 0;
1600} 1847}
1601 1848
1602/* 1849/*
@@ -1639,211 +1886,163 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1639 * we don't update ic_offset until the end when we know exactly how many 1886 * we don't update ic_offset until the end when we know exactly how many
1640 * bytes have been written out. 1887 * bytes have been written out.
1641 */ 1888 */
1642STATIC int 1889int
1643xlog_write( 1890xlog_write(
1644 struct xfs_mount *mp, 1891 struct log *log,
1645 struct xfs_log_iovec reg[], 1892 struct xfs_log_vec *log_vector,
1646 int nentries,
1647 struct xlog_ticket *ticket, 1893 struct xlog_ticket *ticket,
1648 xfs_lsn_t *start_lsn, 1894 xfs_lsn_t *start_lsn,
1649 struct xlog_in_core **commit_iclog, 1895 struct xlog_in_core **commit_iclog,
1650 uint flags) 1896 uint flags)
1651{ 1897{
1652 xlog_t *log = mp->m_log; 1898 struct xlog_in_core *iclog = NULL;
1653 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ 1899 struct xfs_log_iovec *vecp;
1654 xlog_op_header_t *logop_head; /* ptr to log operation header */ 1900 struct xfs_log_vec *lv;
1655 __psint_t ptr; /* copy address into data region */ 1901 int len;
1656 int len; /* # xlog_write() bytes 2 still copy */ 1902 int index;
1657 int index; /* region index currently copying */ 1903 int partial_copy = 0;
1658 int log_offset; /* offset (from 0) into data region */ 1904 int partial_copy_len = 0;
1659 int start_rec_copy; /* # bytes to copy for start record */ 1905 int contwr = 0;
1660 int partial_copy; /* did we split a region? */ 1906 int record_cnt = 0;
1661 int partial_copy_len;/* # bytes copied if split region */ 1907 int data_cnt = 0;
1662 int need_copy; /* # bytes need to memcpy this region */ 1908 int error;
1663 int copy_len; /* # bytes actually memcpy'ing */
1664 int copy_off; /* # bytes from entry start */
1665 int contwr; /* continued write of in-core log? */
1666 int error;
1667 int record_cnt = 0, data_cnt = 0;
1668
1669 partial_copy_len = partial_copy = 0;
1670
1671 /* Calculate potential maximum space. Each region gets its own
1672 * xlog_op_header_t and may need to be double word aligned.
1673 */
1674 len = 0;
1675 if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */
1676 len += sizeof(xlog_op_header_t);
1677 ticket->t_res_num_ophdrs++;
1678 }
1679
1680 for (index = 0; index < nentries; index++) {
1681 len += sizeof(xlog_op_header_t); /* each region gets >= 1 */
1682 ticket->t_res_num_ophdrs++;
1683 len += reg[index].i_len;
1684 xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type);
1685 }
1686 contwr = *start_lsn = 0;
1687 1909
1688 if (ticket->t_curr_res < len) { 1910 *start_lsn = 0;
1689 xlog_print_tic_res(mp, ticket);
1690#ifdef DEBUG
1691 xlog_panic(
1692 "xfs_log_write: reservation ran out. Need to up reservation");
1693#else
1694 /* Customer configurable panic */
1695 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1696 "xfs_log_write: reservation ran out. Need to up reservation");
1697 /* If we did not panic, shutdown the filesystem */
1698 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1699#endif
1700 } else
1701 ticket->t_curr_res -= len;
1702 1911
1703 for (index = 0; index < nentries; ) { 1912 len = xlog_write_calc_vec_length(ticket, log_vector);
1704 if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, 1913 if (log->l_cilp) {
1705 &contwr, &log_offset))) 1914 /*
1706 return error; 1915 * Region headers and bytes are already accounted for.
1916 * We only need to take into account start records and
1917 * split regions in this function.
1918 */
1919 if (ticket->t_flags & XLOG_TIC_INITED)
1920 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1707 1921
1708 ASSERT(log_offset <= iclog->ic_size - 1); 1922 /*
1709 ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); 1923 * Commit record headers need to be accounted for. These
1924 * come in as separate writes so are easy to detect.
1925 */
1926 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
1927 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1928 } else
1929 ticket->t_curr_res -= len;
1930
1931 if (ticket->t_curr_res < 0)
1932 xlog_print_tic_res(log->l_mp, ticket);
1933
1934 index = 0;
1935 lv = log_vector;
1936 vecp = lv->lv_iovecp;
1937 while (lv && index < lv->lv_niovecs) {
1938 void *ptr;
1939 int log_offset;
1940
1941 error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
1942 &contwr, &log_offset);
1943 if (error)
1944 return error;
1710 1945
1711 /* start_lsn is the first lsn written to. That's all we need. */ 1946 ASSERT(log_offset <= iclog->ic_size - 1);
1712 if (! *start_lsn) 1947 ptr = iclog->ic_datap + log_offset;
1713 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1714 1948
1715 /* This loop writes out as many regions as can fit in the amount 1949 /* start_lsn is the first lsn written to. That's all we need. */
1716 * of space which was allocated by xlog_state_get_iclog_space(). 1950 if (!*start_lsn)
1717 */ 1951 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1718 while (index < nentries) {
1719 ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
1720 ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
1721 start_rec_copy = 0;
1722
1723 /* If first write for transaction, insert start record.
1724 * We can't be trying to commit if we are inited. We can't
1725 * have any "partial_copy" if we are inited.
1726 */
1727 if (ticket->t_flags & XLOG_TIC_INITED) {
1728 logop_head = (xlog_op_header_t *)ptr;
1729 logop_head->oh_tid = cpu_to_be32(ticket->t_tid);
1730 logop_head->oh_clientid = ticket->t_clientid;
1731 logop_head->oh_len = 0;
1732 logop_head->oh_flags = XLOG_START_TRANS;
1733 logop_head->oh_res2 = 0;
1734 ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */
1735 record_cnt++;
1736
1737 start_rec_copy = sizeof(xlog_op_header_t);
1738 xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
1739 }
1740 1952
1741 /* Copy log operation header directly into data section */ 1953 /*
1742 logop_head = (xlog_op_header_t *)ptr; 1954 * This loop writes out as many regions as can fit in the amount
1743 logop_head->oh_tid = cpu_to_be32(ticket->t_tid); 1955 * of space which was allocated by xlog_state_get_iclog_space().
1744 logop_head->oh_clientid = ticket->t_clientid; 1956 */
1745 logop_head->oh_res2 = 0; 1957 while (lv && index < lv->lv_niovecs) {
1958 struct xfs_log_iovec *reg = &vecp[index];
1959 struct xlog_op_header *ophdr;
1960 int start_rec_copy;
1961 int copy_len;
1962 int copy_off;
1963
1964 ASSERT(reg->i_len % sizeof(__int32_t) == 0);
1965 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
1966
1967 start_rec_copy = xlog_write_start_rec(ptr, ticket);
1968 if (start_rec_copy) {
1969 record_cnt++;
1970 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1971 start_rec_copy);
1972 }
1746 1973
1747 /* header copied directly */ 1974 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
1748 xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); 1975 if (!ophdr)
1976 return XFS_ERROR(EIO);
1749 1977
1750 /* are we copying a commit or unmount record? */ 1978 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1751 logop_head->oh_flags = flags; 1979 sizeof(struct xlog_op_header));
1980
1981 len += xlog_write_setup_copy(ticket, ophdr,
1982 iclog->ic_size-log_offset,
1983 reg->i_len,
1984 &copy_off, &copy_len,
1985 &partial_copy,
1986 &partial_copy_len);
1987 xlog_verify_dest_ptr(log, ptr);
1988
1989 /* copy region */
1990 ASSERT(copy_len >= 0);
1991 memcpy(ptr, reg->i_addr + copy_off, copy_len);
1992 xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
1993
1994 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1995 record_cnt++;
1996 data_cnt += contwr ? copy_len : 0;
1997
1998 error = xlog_write_copy_finish(log, iclog, flags,
1999 &record_cnt, &data_cnt,
2000 &partial_copy,
2001 &partial_copy_len,
2002 log_offset,
2003 commit_iclog);
2004 if (error)
2005 return error;
1752 2006
1753 /* 2007 /*
1754 * We've seen logs corrupted with bad transaction client 2008 * if we had a partial copy, we need to get more iclog
1755 * ids. This makes sure that XFS doesn't generate them on. 2009 * space but we don't want to increment the region
1756 * Turn this into an EIO and shut down the filesystem. 2010 * index because there is still more in this region to
1757 */ 2011 * write.
1758 switch (logop_head->oh_clientid) { 2012 *
1759 case XFS_TRANSACTION: 2013 * If we completed writing this region, and we flushed
1760 case XFS_VOLUME: 2014 * the iclog (indicated by resetting of the record
1761 case XFS_LOG: 2015 * count), then we also need to get more log space. If
1762 break; 2016 * this was the last record, though, we are done and
1763 default: 2017 * can just return.
1764 xfs_fs_cmn_err(CE_WARN, mp, 2018 */
1765 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 2019 if (partial_copy)
1766 logop_head->oh_clientid, ticket); 2020 break;
1767 return XFS_ERROR(EIO);
1768 }
1769 2021
1770 /* Partial write last time? => (partial_copy != 0) 2022 if (++index == lv->lv_niovecs) {
1771 * need_copy is the amount we'd like to copy if everything could 2023 lv = lv->lv_next;
1772 * fit in the current memcpy. 2024 index = 0;
1773 */ 2025 if (lv)
1774 need_copy = reg[index].i_len - partial_copy_len; 2026 vecp = lv->lv_iovecp;
1775 2027 }
1776 copy_off = partial_copy_len; 2028 if (record_cnt == 0) {
1777 if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ 2029 if (!lv)
1778 copy_len = need_copy; 2030 return 0;
1779 logop_head->oh_len = cpu_to_be32(copy_len); 2031 break;
1780 if (partial_copy) 2032 }
1781 logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1782 partial_copy_len = partial_copy = 0;
1783 } else { /* partial write */
1784 copy_len = iclog->ic_size - log_offset;
1785 logop_head->oh_len = cpu_to_be32(copy_len);
1786 logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
1787 if (partial_copy)
1788 logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
1789 partial_copy_len += copy_len;
1790 partial_copy++;
1791 len += sizeof(xlog_op_header_t); /* from splitting of region */
1792 /* account for new log op header */
1793 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1794 ticket->t_res_num_ophdrs++;
1795 }
1796 xlog_verify_dest_ptr(log, ptr);
1797
1798 /* copy region */
1799 ASSERT(copy_len >= 0);
1800 memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
1801 xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
1802
1803 /* make copy_len total bytes copied, including headers */
1804 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1805 record_cnt++;
1806 data_cnt += contwr ? copy_len : 0;
1807 if (partial_copy) { /* copied partial region */
1808 /* already marked WANT_SYNC by xlog_state_get_iclog_space */
1809 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1810 record_cnt = data_cnt = 0;
1811 if ((error = xlog_state_release_iclog(log, iclog)))
1812 return error;
1813 break; /* don't increment index */
1814 } else { /* copied entire region */
1815 index++;
1816 partial_copy_len = partial_copy = 0;
1817
1818 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1819 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1820 record_cnt = data_cnt = 0;
1821 spin_lock(&log->l_icloglock);
1822 xlog_state_want_sync(log, iclog);
1823 spin_unlock(&log->l_icloglock);
1824 if (commit_iclog) {
1825 ASSERT(flags & XLOG_COMMIT_TRANS);
1826 *commit_iclog = iclog;
1827 } else if ((error = xlog_state_release_iclog(log, iclog)))
1828 return error;
1829 if (index == nentries)
1830 return 0; /* we are done */
1831 else
1832 break;
1833 } 2033 }
1834 } /* if (partial_copy) */ 2034 }
1835 } /* while (index < nentries) */ 2035
1836 } /* for (index = 0; index < nentries; ) */ 2036 ASSERT(len == 0);
1837 ASSERT(len == 0); 2037
2038 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
2039 if (!commit_iclog)
2040 return xlog_state_release_iclog(log, iclog);
1838 2041
1839 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1840 if (commit_iclog) {
1841 ASSERT(flags & XLOG_COMMIT_TRANS); 2042 ASSERT(flags & XLOG_COMMIT_TRANS);
1842 *commit_iclog = iclog; 2043 *commit_iclog = iclog;
1843 return 0; 2044 return 0;
1844 } 2045}
1845 return xlog_state_release_iclog(log, iclog);
1846} /* xlog_write */
1847 2046
1848 2047
1849/***************************************************************************** 2048/*****************************************************************************
@@ -2826,6 +3025,8 @@ _xfs_log_force(
2826 3025
2827 XFS_STATS_INC(xs_log_force); 3026 XFS_STATS_INC(xs_log_force);
2828 3027
3028 xlog_cil_push(log, 1);
3029
2829 spin_lock(&log->l_icloglock); 3030 spin_lock(&log->l_icloglock);
2830 3031
2831 iclog = log->l_iclog; 3032 iclog = log->l_iclog;
@@ -2975,6 +3176,12 @@ _xfs_log_force_lsn(
2975 3176
2976 XFS_STATS_INC(xs_log_force); 3177 XFS_STATS_INC(xs_log_force);
2977 3178
3179 if (log->l_cilp) {
3180 lsn = xlog_cil_push_lsn(log, lsn);
3181 if (lsn == NULLCOMMITLSN)
3182 return 0;
3183 }
3184
2978try_again: 3185try_again:
2979 spin_lock(&log->l_icloglock); 3186 spin_lock(&log->l_icloglock);
2980 iclog = log->l_iclog; 3187 iclog = log->l_iclog;
@@ -3139,20 +3346,30 @@ xfs_log_ticket_get(
3139 return ticket; 3346 return ticket;
3140} 3347}
3141 3348
3349xlog_tid_t
3350xfs_log_get_trans_ident(
3351 struct xfs_trans *tp)
3352{
3353 return tp->t_ticket->t_tid;
3354}
3355
3142/* 3356/*
3143 * Allocate and initialise a new log ticket. 3357 * Allocate and initialise a new log ticket.
3144 */ 3358 */
3145STATIC xlog_ticket_t * 3359xlog_ticket_t *
3146xlog_ticket_alloc(xlog_t *log, 3360xlog_ticket_alloc(
3147 int unit_bytes, 3361 struct log *log,
3148 int cnt, 3362 int unit_bytes,
3149 char client, 3363 int cnt,
3150 uint xflags) 3364 char client,
3365 uint xflags,
3366 int alloc_flags)
3151{ 3367{
3152 xlog_ticket_t *tic; 3368 struct xlog_ticket *tic;
3153 uint num_headers; 3369 uint num_headers;
3370 int iclog_space;
3154 3371
3155 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); 3372 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3156 if (!tic) 3373 if (!tic)
3157 return NULL; 3374 return NULL;
3158 3375
@@ -3194,16 +3411,40 @@ xlog_ticket_alloc(xlog_t *log,
3194 /* for start-rec */ 3411 /* for start-rec */
3195 unit_bytes += sizeof(xlog_op_header_t); 3412 unit_bytes += sizeof(xlog_op_header_t);
3196 3413
3197 /* for LR headers */ 3414 /*
3198 num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); 3415 * for LR headers - the space for data in an iclog is the size minus
3416 * the space used for the headers. If we use the iclog size, then we
3417 * undercalculate the number of headers required.
3418 *
3419 * Furthermore - the addition of op headers for split-recs might
3420 * increase the space required enough to require more log and op
3421 * headers, so take that into account too.
3422 *
3423 * IMPORTANT: This reservation makes the assumption that if this
3424 * transaction is the first in an iclog and hence has the LR headers
3425 * accounted to it, then the remaining space in the iclog is
3426 * exclusively for this transaction. i.e. if the transaction is larger
3427 * than the iclog, it will be the only thing in that iclog.
3428 * Fundamentally, this means we must pass the entire log vector to
3429 * xlog_write to guarantee this.
3430 */
3431 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
3432 num_headers = howmany(unit_bytes, iclog_space);
3433
3434 /* for split-recs - ophdrs added when data split over LRs */
3435 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3436
3437 /* add extra header reservations if we overrun */
3438 while (!num_headers ||
3439 howmany(unit_bytes, iclog_space) > num_headers) {
3440 unit_bytes += sizeof(xlog_op_header_t);
3441 num_headers++;
3442 }
3199 unit_bytes += log->l_iclog_hsize * num_headers; 3443 unit_bytes += log->l_iclog_hsize * num_headers;
3200 3444
3201 /* for commit-rec LR header - note: padding will subsume the ophdr */ 3445 /* for commit-rec LR header - note: padding will subsume the ophdr */
3202 unit_bytes += log->l_iclog_hsize; 3446 unit_bytes += log->l_iclog_hsize;
3203 3447
3204 /* for split-recs - ophdrs added when data split over LRs */
3205 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3206
3207 /* for roundoff padding for transaction data and one for commit record */ 3448 /* for roundoff padding for transaction data and one for commit record */
3208 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && 3449 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
3209 log->l_mp->m_sb.sb_logsunit > 1) { 3450 log->l_mp->m_sb.sb_logsunit > 1) {
@@ -3219,13 +3460,13 @@ xlog_ticket_alloc(xlog_t *log,
3219 tic->t_curr_res = unit_bytes; 3460 tic->t_curr_res = unit_bytes;
3220 tic->t_cnt = cnt; 3461 tic->t_cnt = cnt;
3221 tic->t_ocnt = cnt; 3462 tic->t_ocnt = cnt;
3222 tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); 3463 tic->t_tid = random32();
3223 tic->t_clientid = client; 3464 tic->t_clientid = client;
3224 tic->t_flags = XLOG_TIC_INITED; 3465 tic->t_flags = XLOG_TIC_INITED;
3225 tic->t_trans_type = 0; 3466 tic->t_trans_type = 0;
3226 if (xflags & XFS_LOG_PERM_RESERV) 3467 if (xflags & XFS_LOG_PERM_RESERV)
3227 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3468 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3228 sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); 3469 sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
3229 3470
3230 xlog_tic_reset_res(tic); 3471 xlog_tic_reset_res(tic);
3231 3472
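To make the header arithmetic above concrete, here is a worked model with invented sizes (32k iclogs with a 512 byte header, 12 byte op headers):

	int iclog_space = 32768 - 512;	/* 32256 usable bytes per iclog */
	int unit_bytes = 70000;		/* transaction reservation */
	int num_headers = howmany(unit_bytes, iclog_space);	/* == 3 */

	/* ophdrs for split regions could push us over another boundary */
	unit_bytes += 12 * num_headers;	/* 70036: still fits in 3 iclogs */
	while (!num_headers ||
	       howmany(unit_bytes, iclog_space) > num_headers) {
		unit_bytes += 12;	/* not taken in this example */
		num_headers++;
	}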
@@ -3246,20 +3487,22 @@ xlog_ticket_alloc(xlog_t *log,
3246 * part of the log in case we trash the log structure. 3487 * part of the log in case we trash the log structure.
3247 */ 3488 */
3248void 3489void
3249xlog_verify_dest_ptr(xlog_t *log, 3490xlog_verify_dest_ptr(
3250 __psint_t ptr) 3491 struct log *log,
3492 char *ptr)
3251{ 3493{
3252 int i; 3494 int i;
3253 int good_ptr = 0; 3495 int good_ptr = 0;
3254 3496
3255 for (i=0; i < log->l_iclog_bufs; i++) { 3497 for (i = 0; i < log->l_iclog_bufs; i++) {
3256 if (ptr >= (__psint_t)log->l_iclog_bak[i] && 3498 if (ptr >= log->l_iclog_bak[i] &&
3257 ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) 3499 ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
3258 good_ptr++; 3500 good_ptr++;
3259 } 3501 }
3260 if (! good_ptr) 3502
3503 if (!good_ptr)
3261 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3504 xlog_panic("xlog_verify_dest_ptr: invalid ptr");
3262} /* xlog_verify_dest_ptr */ 3505}
3263 3506
3264STATIC void 3507STATIC void
3265xlog_verify_grant_head(xlog_t *log, int equals) 3508xlog_verify_grant_head(xlog_t *log, int equals)
@@ -3445,6 +3688,11 @@ xlog_state_ioerror(
3445 * c. nothing new gets queued up after (a) and (b) are done. 3688 * c. nothing new gets queued up after (a) and (b) are done.
3446 * d. if !logerror, flush the iclogs to disk, then seal them off 3689 * d. if !logerror, flush the iclogs to disk, then seal them off
3447 * for business. 3690 * for business.
3691 *
3692 * Note: for delayed logging the !logerror case needs to flush the regions
3693 * held in memory out to the iclogs before flushing them to disk. This needs
3694 * to be done before the log is marked as shutdown, otherwise the flush to the
3695 * iclogs will fail.
3448 */ 3696 */
3449int 3697int
3450xfs_log_force_umount( 3698xfs_log_force_umount(
@@ -3478,6 +3726,16 @@ xfs_log_force_umount(
3478 return 1; 3726 return 1;
3479 } 3727 }
3480 retval = 0; 3728 retval = 0;
3729
3730 /*
3731 * Flush the in memory commit item list before marking the log as
3732 * being shut down. We need to do it in this order to ensure all the
3733 * completed transactions are flushed to disk with the xfs_log_force()
3734 * call below.
3735 */
3736 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
3737 xlog_cil_push(log, 1);
3738
3481 /* 3739 /*
3482 * We must hold both the GRANT lock and the LOG lock, 3740 * We must hold both the GRANT lock and the LOG lock,
3483 * before we mark the filesystem SHUTDOWN and wake 3741 * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 97a24c7795a4..04c78e642cc8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
19#define __XFS_LOG_H__ 19#define __XFS_LOG_H__
20 20
21/* get lsn fields */ 21/* get lsn fields */
22
23#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) 22#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
24#define BLOCK_LSN(lsn) ((uint)(lsn)) 23#define BLOCK_LSN(lsn) ((uint)(lsn))
25 24
@@ -110,6 +109,15 @@ typedef struct xfs_log_iovec {
110 uint i_type; /* type of region */ 109 uint i_type; /* type of region */
111} xfs_log_iovec_t; 110} xfs_log_iovec_t;
112 111
112struct xfs_log_vec {
113 struct xfs_log_vec *lv_next; /* next lv in build list */
114 int lv_niovecs; /* number of iovecs in lv */
115 struct xfs_log_iovec *lv_iovecp; /* iovec array */
116 struct xfs_log_item *lv_item; /* owner */
117 char *lv_buf; /* formatted buffer */
118 int lv_buf_len; /* size of formatted buffer */
119};
120
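A hedged sketch of how a log item might build one of these vectors with two regions; the tacked-on iovec array layout and the elided error handling are assumptions, not the kernel's code:

	struct xfs_log_vec *
	build_lv(struct xfs_log_item *item, void *hdr, int hdr_len,
		 void *data, int data_len)
	{
		struct xfs_log_vec *lv;

		lv = kmem_zalloc(sizeof(*lv) +
				 2 * sizeof(struct xfs_log_iovec), KM_SLEEP);
		lv->lv_item = item;
		lv->lv_niovecs = 2;
		lv->lv_iovecp = (struct xfs_log_iovec *)(lv + 1);
		lv->lv_iovecp[0].i_addr = (xfs_caddr_t)hdr;
		lv->lv_iovecp[0].i_len = hdr_len;
		lv->lv_iovecp[1].i_addr = (xfs_caddr_t)data;
		lv->lv_iovecp[1].i_len = data_len;
		return lv;		/* callers chain these via lv_next */
	}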
113/* 121/*
114 * Structure used to pass callback function and the function's argument 122 * Structure used to pass callback function and the function's argument
115 * to the log manager. 123 * to the log manager.
@@ -126,6 +134,14 @@ typedef struct xfs_log_callback {
126struct xfs_mount; 134struct xfs_mount;
127struct xlog_in_core; 135struct xlog_in_core;
128struct xlog_ticket; 136struct xlog_ticket;
137struct xfs_log_item;
138struct xfs_item_ops;
139struct xfs_trans;
140
141void xfs_log_item_init(struct xfs_mount *mp,
142 struct xfs_log_item *item,
143 int type,
144 struct xfs_item_ops *ops);
129 145
130xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 146xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
131 struct xlog_ticket *ticket, 147 struct xlog_ticket *ticket,
@@ -174,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp);
174 190
175void xlog_iodone(struct xfs_buf *); 191void xlog_iodone(struct xfs_buf *);
176 192
177struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); 193struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
178void xfs_log_ticket_put(struct xlog_ticket *ticket); 194void xfs_log_ticket_put(struct xlog_ticket *ticket);
179 195
196xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
197
198int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
199 struct xfs_log_vec *log_vector,
200 xfs_lsn_t *commit_lsn, int flags);
201bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
202
180#endif 203#endif
181 204
182 205
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..bb17cc044bf3
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
1/*
2 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_log_priv.h"
27#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_dir2.h"
30#include "xfs_dmapi.h"
31#include "xfs_mount.h"
32#include "xfs_error.h"
33#include "xfs_alloc.h"
34
35/*
36 * Perform initial CIL structure initialisation. If the CIL is not
37 * enabled in this filesystem, ensure the log->l_cilp is null so
38 * we can check this conditional to determine if we are doing delayed
39 * logging or not.
40 */
41int
42xlog_cil_init(
43 struct log *log)
44{
45 struct xfs_cil *cil;
46 struct xfs_cil_ctx *ctx;
47
48 log->l_cilp = NULL;
49 if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
50 return 0;
51
52 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
53 if (!cil)
54 return ENOMEM;
55
56 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
57 if (!ctx) {
58 kmem_free(cil);
59 return ENOMEM;
60 }
61
62 INIT_LIST_HEAD(&cil->xc_cil);
63 INIT_LIST_HEAD(&cil->xc_committing);
64 spin_lock_init(&cil->xc_cil_lock);
65 init_rwsem(&cil->xc_ctx_lock);
66 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
67
68 INIT_LIST_HEAD(&ctx->committing);
69 INIT_LIST_HEAD(&ctx->busy_extents);
70 ctx->sequence = 1;
71 ctx->cil = cil;
72 cil->xc_ctx = ctx;
73
74 cil->xc_log = log;
75 log->l_cilp = cil;
76 return 0;
77}
78
79void
80xlog_cil_destroy(
81 struct log *log)
82{
83 if (!log->l_cilp)
84 return;
85
86 if (log->l_cilp->xc_ctx) {
87 if (log->l_cilp->xc_ctx->ticket)
88 xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
89 kmem_free(log->l_cilp->xc_ctx);
90 }
91
92 ASSERT(list_empty(&log->l_cilp->xc_cil));
93 kmem_free(log->l_cilp);
94}
95
96/*
97 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
98 * recover, so we don't allow failure here. Also, we allocate in a context that
99 * we don't want to be issuing transactions from, so we need to tell the
100 * allocation code this as well.
101 *
102 * We don't reserve any space for the ticket - we are going to steal whatever
103 * space we require from transactions as they commit. To ensure we reserve all
104 * the space required, we need to set the current reservation of the ticket to
105 * zero so that we know to steal the initial transaction overhead from the
106 * first transaction commit.
107 */
108static struct xlog_ticket *
109xlog_cil_ticket_alloc(
110 struct log *log)
111{
112 struct xlog_ticket *tic;
113
114 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
115 KM_SLEEP|KM_NOFS);
116 tic->t_trans_type = XFS_TRANS_CHECKPOINT;
117
118 /*
119 * set the current reservation to zero so we know to steal the basic
120 * transaction overhead reservation from the first transaction commit.
121 */
122 tic->t_curr_res = 0;
123 return tic;
124}
125
126/*
127 * After the first stage of log recovery is done, we know where the head and
128 * tail of the log are. We need this log initialisation done before we can
129 * initialise the first CIL checkpoint context.
130 *
131 * Here we allocate a log ticket to track space usage during a CIL push. This
132 * ticket is passed to xlog_write() directly so that we don't slowly leak log
133 * space by failing to account for space used by log headers and additional
134 * region headers for split regions.
135 */
136void
137xlog_cil_init_post_recovery(
138 struct log *log)
139{
140 if (!log->l_cilp)
141 return;
142
143 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
144 log->l_cilp->xc_ctx->sequence = 1;
145 log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
146 log->l_curr_block);
147}
148
149/*
150 * Insert the log item into the CIL and calculate the difference in space
151 * consumed by the item. Add the space to the checkpoint ticket and calculate
152 * if the change requires additional log metadata. If it does, take that space
153 * as well. Remove the amount of space we added to the checkpoint ticket from
154 * the current transaction ticket so that the accounting works out correctly.
155 *
156 * If this is the first time the item is being placed into the CIL in this
157 * context, pin it so it can't be written to disk until the CIL is flushed to
158 * the iclog and the iclog written to disk.
159 */
160static void
161xlog_cil_insert(
162 struct log *log,
163 struct xlog_ticket *ticket,
164 struct xfs_log_item *item,
165 struct xfs_log_vec *lv)
166{
167 struct xfs_cil *cil = log->l_cilp;
168 struct xfs_log_vec *old = lv->lv_item->li_lv;
169 struct xfs_cil_ctx *ctx = cil->xc_ctx;
170 int len;
171 int diff_iovecs;
172 int iclog_space;
173
174 if (old) {
175 /* existing lv on log item, space used is a delta */
176 ASSERT(!list_empty(&item->li_cil));
177 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
178
179 len = lv->lv_buf_len - old->lv_buf_len;
180 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
181 kmem_free(old->lv_buf);
182 kmem_free(old);
183 } else {
184 /* new lv, must pin the log item */
185 ASSERT(!lv->lv_item->li_lv);
186 ASSERT(list_empty(&item->li_cil));
187
188 len = lv->lv_buf_len;
189 diff_iovecs = lv->lv_niovecs;
190 IOP_PIN(lv->lv_item);
191
192 }
193 len += diff_iovecs * sizeof(xlog_op_header_t);
194
195 /* attach new log vector to log item */
196 lv->lv_item->li_lv = lv;
197
198 spin_lock(&cil->xc_cil_lock);
199 list_move_tail(&item->li_cil, &cil->xc_cil);
200 ctx->nvecs += diff_iovecs;
201
202 /*
203 * If this is the first time the item is being committed to the CIL,
204 * store the sequence number on the log item so we can tell
205 * in future commits whether this is the first checkpoint the item is
206 * being committed into.
207 */
208 if (!item->li_seq)
209 item->li_seq = ctx->sequence;
210
211 /*
212 * Now transfer enough transaction reservation to the context ticket
213 * for the checkpoint. The context ticket is special - the unit
214 * reservation has to grow as well as the current reservation as we
215 * steal from tickets so we can correctly determine the space used
216 * during the transaction commit.
217 */
218 if (ctx->ticket->t_curr_res == 0) {
219 /* first commit in checkpoint, steal the header reservation */
220 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
221 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
222 ticket->t_curr_res -= ctx->ticket->t_unit_res;
223 }
224
225 /* do we need space for more log record headers? */
226 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
227 if (len > 0 && (ctx->space_used / iclog_space !=
228 (ctx->space_used + len) / iclog_space)) {
229 int hdrs;
230
231 hdrs = (len + iclog_space - 1) / iclog_space;
232 /* need to take into account split region headers, too */
233 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
234 ctx->ticket->t_unit_res += hdrs;
235 ctx->ticket->t_curr_res += hdrs;
236 ticket->t_curr_res -= hdrs;
237 ASSERT(ticket->t_curr_res >= len);
238 }
239 ticket->t_curr_res -= len;
240 ctx->space_used += len;
241
242 spin_unlock(&cil->xc_cil_lock);
243}
244
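A numeric walk-through of the header top-up above, with invented sizes; crossing an iclog boundary costs one log record header plus one split-region ophdr:

	int iclog_space = 32768 - 512;	/* 32256 usable bytes */
	int space_used = 31000;		/* already in this checkpoint */
	int len = 4000;			/* delta for the new item */

	/* 31000/32256 != 35000/32256, so we cross an iclog boundary */
	int hdrs = (len + iclog_space - 1) / iclog_space;	/* == 1 */
	hdrs *= 512 + 12;	/* LR header + split-region ophdr = 524 bytes */
	/* ctx ticket grows by 524; the committing ticket shrinks by 524 */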
245/*
246 * Format log items into flat buffers
247 *
248 * For delayed logging, we need to hold a formatted buffer containing all the
249 * changes on the log item. This enables us to relog the item in memory and
250 * write it out asynchronously without needing to relock the object that was
251 * modified at the time it gets written into the iclog.
252 *
253 * This function builds a vector for the changes in each log item in the
254 * transaction. It then works out the length of the buffer needed for each log
255 * item, allocates them and formats the vector for the item into the buffer.
256 * The buffers are then attached to the log items and inserted into the
257 * Committed Item List for tracking until the next checkpoint is written out.
258 *
259 * We don't set up region headers during this process; we simply copy the
260 * regions into the flat buffer. We can do this because we still have to do a
261 * formatting step to write the regions into the iclog buffer. Writing the
262 * ophdrs during the iclog write means that we can support splitting large
263 * regions across iclog boundaries without needing a change in the format of the
264 * item/region encapsulation.
265 *
266 * Hence what we need to do now is rewrite the vector array to point
267 * to the copied region inside the buffer we just allocated. This allows us to
268 * format the regions into the iclog as though they are being formatted
269 * directly out of the objects themselves.
270 */
271static void
272xlog_cil_format_items(
273 struct log *log,
274 struct xfs_log_vec *log_vector,
275 struct xlog_ticket *ticket,
276 xfs_lsn_t *start_lsn)
277{
278 struct xfs_log_vec *lv;
279
280 if (start_lsn)
281 *start_lsn = log->l_cilp->xc_ctx->sequence;
282
283 ASSERT(log_vector);
284 for (lv = log_vector; lv; lv = lv->lv_next) {
285 void *ptr;
286 int index;
287 int len = 0;
288
289 /* build the vector array and calculate its length */
290 IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
291 for (index = 0; index < lv->lv_niovecs; index++)
292 len += lv->lv_iovecp[index].i_len;
293
294 lv->lv_buf_len = len;
295 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
296 ptr = lv->lv_buf;
297
298 for (index = 0; index < lv->lv_niovecs; index++) {
299 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
300
301 memcpy(ptr, vec->i_addr, vec->i_len);
302 vec->i_addr = ptr;
303 ptr += vec->i_len;
304 }
305 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
306
307 xlog_cil_insert(log, ticket, lv->lv_item, lv);
308 }
309}
310
311static void
312xlog_cil_free_logvec(
313 struct xfs_log_vec *log_vector)
314{
315 struct xfs_log_vec *lv;
316
317 for (lv = log_vector; lv; ) {
318 struct xfs_log_vec *next = lv->lv_next;
319 kmem_free(lv->lv_buf);
320 kmem_free(lv);
321 lv = next;
322 }
323}
324
325/*
326 * Commit a transaction with the given vector to the Committed Item List.
327 *
328 * To do this, we need to format the item, pin it in memory if required and
329 * account for the space used by the transaction. Once we have done that we
330 * need to release the unused reservation for the transaction, attach the
331 * transaction to the checkpoint context so we carry the busy extents through
332 * to checkpoint completion, and then unlock all the items in the transaction.
333 *
334 * For more specific information about the order of operations in
335 * xfs_log_commit_cil() please refer to the comments in
336 * xfs_trans_commit_iclog().
337 *
338 * Called with the context lock already held in read mode to lock out
339 * background commit, returns without it held once background commits are
340 * allowed again.
341 */
342int
343xfs_log_commit_cil(
344 struct xfs_mount *mp,
345 struct xfs_trans *tp,
346 struct xfs_log_vec *log_vector,
347 xfs_lsn_t *commit_lsn,
348 int flags)
349{
350 struct log *log = mp->m_log;
351 int log_flags = 0;
352 int push = 0;
353
354 if (flags & XFS_TRANS_RELEASE_LOG_RES)
355 log_flags = XFS_LOG_REL_PERM_RESERV;
356
357 if (XLOG_FORCED_SHUTDOWN(log)) {
358 xlog_cil_free_logvec(log_vector);
359 return XFS_ERROR(EIO);
360 }
361
362 /* lock out background commit */
363 down_read(&log->l_cilp->xc_ctx_lock);
364 xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
365
366 /* check we didn't blow the reservation */
367 if (tp->t_ticket->t_curr_res < 0)
368 xlog_print_tic_res(log->l_mp, tp->t_ticket);
369
370 /* attach the transaction to the CIL if it has any busy extents */
371 if (!list_empty(&tp->t_busy)) {
372 spin_lock(&log->l_cilp->xc_cil_lock);
373 list_splice_init(&tp->t_busy,
374 &log->l_cilp->xc_ctx->busy_extents);
375 spin_unlock(&log->l_cilp->xc_cil_lock);
376 }
377
378 tp->t_commit_lsn = *commit_lsn;
379 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
380 xfs_trans_unreserve_and_mod_sb(tp);
381
382 /* check for background commit before unlock */
383 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
384 push = 1;
385 up_read(&log->l_cilp->xc_ctx_lock);
386
387 /*
388 * We need to push CIL every so often so we don't cache more than we
389 * can fit in the log. The limit really is that a checkpoint can't be
390 * more than half the log (the current checkpoint is not allowed to
391 * overwrite the previous checkpoint), but commit latency and memory
392 * usage limit this to a smaller size in most cases.
393 */
394 if (push)
395 xlog_cil_push(log, 0);
396 return 0;
397}
398
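A rough sketch of how a delayed-logging commit path might drive this function; build_log_vectors() is an assumed helper that formats each dirty log item into an xfs_log_vec chain, not a real kernel function:

	static int
	commit_delayed(struct xfs_mount *mp, struct xfs_trans *tp, int flags)
	{
		struct xfs_log_vec *lv_chain = build_log_vectors(tp);
		xfs_lsn_t commit_lsn;

		return xfs_log_commit_cil(mp, tp, lv_chain, &commit_lsn, flags);
	}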
399/*
400 * Mark all items committed and clear busy extents. We free the log vector
401 * chains in a separate pass so that we unpin the log items as quickly as
402 * possible.
403 */
404static void
405xlog_cil_committed(
406 void *args,
407 int abort)
408{
409 struct xfs_cil_ctx *ctx = args;
410 struct xfs_log_vec *lv;
411 int abortflag = abort ? XFS_LI_ABORTED : 0;
412 struct xfs_busy_extent *busyp, *n;
413
414 /* unpin all the log items */
415 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
416 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
417 abortflag);
418 }
419
420 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
421 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
422
423 spin_lock(&ctx->cil->xc_cil_lock);
424 list_del(&ctx->committing);
425 spin_unlock(&ctx->cil->xc_cil_lock);
426
427 xlog_cil_free_logvec(ctx->lv_chain);
428 kmem_free(ctx);
429}
430
431/*
432 * Push the Committed Item List to the log. If the push_now flag is not set,
432 * then it is a background flush and so we can choose to ignore it.
434 */
435int
436xlog_cil_push(
437 struct log *log,
438 int push_now)
439{
440 struct xfs_cil *cil = log->l_cilp;
441 struct xfs_log_vec *lv;
442 struct xfs_cil_ctx *ctx;
443 struct xfs_cil_ctx *new_ctx;
444 struct xlog_in_core *commit_iclog;
445 struct xlog_ticket *tic;
446 int num_lv;
447 int num_iovecs;
448 int len;
449 int error = 0;
450 struct xfs_trans_header thdr;
451 struct xfs_log_iovec lhdr;
452 struct xfs_log_vec lvhdr = { NULL };
453 xfs_lsn_t commit_lsn;
454
455 if (!cil)
456 return 0;
457
458 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
459 new_ctx->ticket = xlog_cil_ticket_alloc(log);
460
461 /* lock out transaction commit, but don't block on background push */
462 if (!down_write_trylock(&cil->xc_ctx_lock)) {
463 if (!push_now)
464 goto out_free_ticket;
465 down_write(&cil->xc_ctx_lock);
466 }
467 ctx = cil->xc_ctx;
468
469 /* check if we've anything to push */
470 if (list_empty(&cil->xc_cil))
471 goto out_skip;
472
473 /* check for spurious background flush */
474 if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
475 goto out_skip;
476
477 /*
478 * pull all the log vectors off the items in the CIL, and
479 * remove the items from the CIL. We don't need the CIL lock
480 * here because it's only needed on the transaction commit
481 * side which is currently locked out by the flush lock.
482 */
483 lv = NULL;
484 num_lv = 0;
485 num_iovecs = 0;
486 len = 0;
487 while (!list_empty(&cil->xc_cil)) {
488 struct xfs_log_item *item;
489 int i;
490
491 item = list_first_entry(&cil->xc_cil,
492 struct xfs_log_item, li_cil);
493 list_del_init(&item->li_cil);
494 if (!ctx->lv_chain)
495 ctx->lv_chain = item->li_lv;
496 else
497 lv->lv_next = item->li_lv;
498 lv = item->li_lv;
499 item->li_lv = NULL;
500
501 num_lv++;
502 num_iovecs += lv->lv_niovecs;
503 for (i = 0; i < lv->lv_niovecs; i++)
504 len += lv->lv_iovecp[i].i_len;
505 }
506
507 /*
508 * initialise the new context and attach it to the CIL. Then attach
509 * the current context to the CIL committing list so it can be found
510 * during log forces to extract the commit lsn of the sequence that
511 * needs to be forced.
512 */
513 INIT_LIST_HEAD(&new_ctx->committing);
514 INIT_LIST_HEAD(&new_ctx->busy_extents);
515 new_ctx->sequence = ctx->sequence + 1;
516 new_ctx->cil = cil;
517 cil->xc_ctx = new_ctx;
518
519 /*
520 * The switch is now done, so we can drop the context lock and move out
521 * of a shared context. We can't just go straight to the commit record,
522 * though - we need to synchronise with previous and future commits so
523 * that the commit records are correctly ordered in the log to ensure
524 * that we process items during log IO completion in the correct order.
525 *
526 * For example, if we get an EFI in one checkpoint and the EFD in the
527 * next (e.g. due to log forces), we do not want the checkpoint with
528 * the EFD to be committed before the checkpoint with the EFI. Hence
529 * we must strictly order the commit records of the checkpoints so
530 * that: a) the checkpoint callbacks are attached to the iclogs in the
531 * correct order; and b) the checkpoints are replayed in correct order
532 * in log recovery.
533 *
534 * Hence we need to add this context to the committing context list so
535 * that higher sequences will wait for us to write out a commit record
536 * before they do.
537 */
538 spin_lock(&cil->xc_cil_lock);
539 list_add(&ctx->committing, &cil->xc_committing);
540 spin_unlock(&cil->xc_cil_lock);
541 up_write(&cil->xc_ctx_lock);
542
543 /*
544 * Build a checkpoint transaction header and write it to the log to
545 * begin the transaction. We need to account for the space used by the
546 * transaction header here as it is not accounted for in xlog_write().
547 *
548 * The LSN we need to pass to the log items on transaction commit is
549 * the LSN reported by the first log vector write. If we use the commit
550 * record lsn then we can move the tail beyond the grant write head.
551 */
552 tic = ctx->ticket;
553 thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
554 thdr.th_type = XFS_TRANS_CHECKPOINT;
555 thdr.th_tid = tic->t_tid;
556 thdr.th_num_items = num_iovecs;
557 lhdr.i_addr = (xfs_caddr_t)&thdr;
558 lhdr.i_len = sizeof(xfs_trans_header_t);
559 lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
560 tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
561
562 lvhdr.lv_niovecs = 1;
563 lvhdr.lv_iovecp = &lhdr;
564 lvhdr.lv_next = ctx->lv_chain;
565
566 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
567 if (error)
568 goto out_abort;
569
570 /*
571 * now that we've written the checkpoint into the log, strictly
572 * order the commit records so replay will get them in the right order.
573 */
574restart:
575 spin_lock(&cil->xc_cil_lock);
576 list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
577 /*
578 * Higher sequences will wait for this one so skip them.
579 * Don't wait for our own sequence, either.
580 */
581 if (new_ctx->sequence >= ctx->sequence)
582 continue;
583 if (!new_ctx->commit_lsn) {
584 /*
585 * It is still being pushed! Wait for the push to
586 * complete, then start again from the beginning.
587 */
588 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
589 goto restart;
590 }
591 }
592 spin_unlock(&cil->xc_cil_lock);
593
594 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
595 if (error || commit_lsn == -1)
596 goto out_abort;
597
598 /* attach all the transactions w/ busy extents to iclog */
599 ctx->log_cb.cb_func = xlog_cil_committed;
600 ctx->log_cb.cb_arg = ctx;
601 error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
602 if (error)
603 goto out_abort;
604
605 /*
606 * now the checkpoint commit is complete and we've attached the
607 * callbacks to the iclog we can assign the commit LSN to the context
608 * and wake up anyone who is waiting for the commit to complete.
609 */
610 spin_lock(&cil->xc_cil_lock);
611 ctx->commit_lsn = commit_lsn;
612 sv_broadcast(&cil->xc_commit_wait);
613 spin_unlock(&cil->xc_cil_lock);
614
615 /* release the hounds! */
616 return xfs_log_release_iclog(log->l_mp, commit_iclog);
617
618out_skip:
619 up_write(&cil->xc_ctx_lock);
620out_free_ticket:
621 xfs_log_ticket_put(new_ctx->ticket);
622 kmem_free(new_ctx);
623 return 0;
624
625out_abort:
626 xlog_cil_committed(ctx, XFS_LI_ABORTED);
627 return XFS_ERROR(EIO);
628}
629
630/*
631 * Conditionally push the CIL based on the sequence passed in.
632 *
633 * We only need to push if we haven't already pushed the sequence
634 * number given. Hence the only time we will trigger a push here is
635 * if the push sequence is the same as the current context.
636 *
637 * We return the current commit lsn to allow the callers to determine if an
638 * iclog flush is necessary following this call.
639 *
640 * XXX: Initially, just push the CIL unconditionally and return whatever
641 * commit lsn is there. It'll be empty, so this is broken for now.
642 */
643xfs_lsn_t
644xlog_cil_push_lsn(
645 struct log *log,
646 xfs_lsn_t push_seq)
647{
648 struct xfs_cil *cil = log->l_cilp;
649 struct xfs_cil_ctx *ctx;
650 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
651
652restart:
653 down_write(&cil->xc_ctx_lock);
654 ASSERT(push_seq <= cil->xc_ctx->sequence);
655
656 /* check to see if we need to force out the current context */
657 if (push_seq == cil->xc_ctx->sequence) {
658 up_write(&cil->xc_ctx_lock);
659 xlog_cil_push(log, 1);
660 goto restart;
661 }
662
663 /*
664 * See if we can find a previous sequence still committing.
665 * We can drop the flush lock as soon as we have the cil lock
666 * because we are now only comparing contexts protected by
667 * the cil lock.
668 *
669 * We need to wait for all previous sequence commits to complete
670 * before allowing the force of push_seq to go ahead. Hence block
671 * on commits for those as well.
672 */
673 spin_lock(&cil->xc_cil_lock);
674 up_write(&cil->xc_ctx_lock);
675 list_for_each_entry(ctx, &cil->xc_committing, committing) {
676 if (ctx->sequence > push_seq)
677 continue;
678 if (!ctx->commit_lsn) {
679 /*
680 * It is still being pushed! Wait for the push to
681 * complete, then start again from the beginning.
682 */
683 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
684 goto restart;
685 }
686 if (ctx->sequence != push_seq)
687 continue;
688 /* found it! */
689 commit_lsn = ctx->commit_lsn;
690 }
691 spin_unlock(&cil->xc_cil_lock);
692 return commit_lsn;
693}
694
695/*
696 * Check if the current log item was first committed in this sequence.
697 * We can't rely on just the log item being in the CIL, we have to check
698 * the recorded commit sequence number.
699 *
700 * Note: for this to be used in a non-racy manner, it has to be called with
701 * CIL flushing locked out. As a result, it should only be used during the
702 * transaction commit process when deciding what to format into the item.
703 */
704bool
705xfs_log_item_in_current_chkpt(
706 struct xfs_log_item *lip)
707{
708 struct xfs_cil_ctx *ctx;
709
710 if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
711 return false;
712 if (list_empty(&lip->li_cil))
713 return false;
714
715 ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
716
717 /*
718 * li_seq is written on the first commit of a log item to record the
719 * first checkpoint it is written to. Hence if it is different to the
720 * current sequence, we're in a new checkpoint.
721 */
722 if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
723 return false;
724 return true;
725}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fd02a18facd5..8c072618965c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
154 shutdown */ 154 shutdown */
155typedef __uint32_t xlog_tid_t;
156
157 155
158#ifdef __KERNEL__ 156#ifdef __KERNEL__
159/* 157/*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
379} xlog_in_core_t; 377} xlog_in_core_t;
380 378
381/* 379/*
380 * The CIL context is used to aggregate per-transaction details as well as to
381 * be passed to the iclog for checkpoint post-commit processing. After being
382 * passed to the iclog, another context needs to be allocated for tracking the
383 * next set of transactions to be aggregated into a checkpoint.
384 */
385struct xfs_cil;
386
387struct xfs_cil_ctx {
388 struct xfs_cil *cil;
389 xfs_lsn_t sequence; /* chkpt sequence # */
390 xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
391 xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
392 struct xlog_ticket *ticket; /* chkpt ticket */
393 int nvecs; /* number of regions */
394 int space_used; /* aggregate size of regions */
395 struct list_head busy_extents; /* busy extents in chkpt */
396 struct xfs_log_vec *lv_chain; /* logvecs being pushed */
397 xfs_log_callback_t log_cb; /* completion callback hook. */
398 struct list_head committing; /* ctx committing list */
399};
400
401/*
402 * Committed Item List structure
403 *
404 * This structure is used to track log items that have been committed but not
405 * yet written into the log. It is used only when the delayed logging mount
406 * option is enabled.
407 *
408 * This structure tracks the list of committing checkpoint contexts so
409 * we can avoid the problem of having to hold out new transactions during a
410 * flush until we have the commit record LSN of the checkpoint. We can
411 * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
412 * sequence match and extract the commit LSN directly from there. If the
413 * checkpoint is still in the process of committing, we can block waiting for
414 * the commit LSN to be determined as well. This should make synchronous
415 * operations almost as efficient as the old logging methods.
416 */
417struct xfs_cil {
418 struct log *xc_log;
419 struct list_head xc_cil;
420 spinlock_t xc_cil_lock;
421 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing;
424 sv_t xc_commit_wait;
425};
426
427/*
428 * The amount of log space we allow the CIL to aggregate is difficult to size.
429 * Whatever we choose, we have to make sure we can get a reservation for the log
430 * space effectively, that it is large enough to capture sufficient relogging to
431 * reduce log buffer IO significantly, but not so large that it overruns the log
432 * or induces too much latency when writing out through the iclogs. We track both
433 * space consumed and the number of vectors in the checkpoint context, so we
434 * need to decide which to use for limiting.
435 *
436 * Every log buffer we write out during a push needs a header reserved, which
437 * is at least one sector and more for v2 logs. Hence we need a reservation of
438 * at least 512 bytes per 32k of log space just for the LR headers. That means
439 * 16KB of reservation per megabyte of delayed logging space we will consume,
440 * plus various headers. The number of headers will vary based on the num of
441 * io vectors, so limiting on a specific number of vectors is going to result
442 * in transactions of varying size. IOWs, it is more consistent to track and
443 * limit space consumed in the log rather than by the number of objects being
444 * logged in order to prevent checkpoint ticket overruns.
445 *
446 * Further, use of static reservations through the log grant mechanism is
447 * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
448 * grant) and a significant deadlock potential because regranting write space
449 * can block on log pushes. Hence if we have to regrant log space during a log
450 * push, we can deadlock.
451 *
452 * However, we can avoid this by use of a dynamic "reservation stealing"
453 * technique during transaction commit whereby unused reservation space in the
454 * transaction ticket is transferred to the CIL ctx commit ticket to cover the
455 * space needed by the checkpoint transaction. This means that we never need to
456 * specifically reserve space for the CIL checkpoint transaction, nor do we
457 * need to regrant space once the checkpoint completes. This also means the
458 * checkpoint transaction ticket is specific to the checkpoint context, rather
459 * than the CIL itself.
460 *
461 * With dynamic reservations, we can basically make up arbitrary limits for the
462 * checkpoint size so long as they don't violate any other size rules. Hence
463 * the initial maximum size for the checkpoint transaction will be set to a
464 * quarter of the log or 8MB, whichever is smaller. 8MB is an arbitrary limit
465 * right now based on the latency of writing out a large amount of data through
466 * the circular iclog buffers.
467 */
468
469#define XLOG_CIL_SPACE_LIMIT(log) \
470 (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
471
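For concreteness, with invented log sizes:

	16 MB log:  min(16 MB >> 2, 8 MB) = min(4 MB, 8 MB)  = 4 MB
	64 MB log:  min(64 MB >> 2, 8 MB) = min(16 MB, 8 MB) = 8 MB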
472/*
382 * The reservation head lsn is not made up of a cycle number and block number. 473 * The reservation head lsn is not made up of a cycle number and block number.
383 * Instead, it uses a cycle number and byte number. Logs don't expect to 474 * Instead, it uses a cycle number and byte number. Logs don't expect to
384 * overflow 31 bits worth of byte offset, so using a byte number will mean 475 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
388 /* The following fields don't need locking */ 479 /* The following fields don't need locking */
389 struct xfs_mount *l_mp; /* mount point */ 480 struct xfs_mount *l_mp; /* mount point */
390 struct xfs_ail *l_ailp; /* AIL log is working with */ 481 struct xfs_ail *l_ailp; /* AIL log is working with */
482 struct xfs_cil *l_cilp; /* CIL log is working with */
391 struct xfs_buf *l_xbuf; /* extra buffer for log 483 struct xfs_buf *l_xbuf; /* extra buffer for log
392 * wrapping */ 484 * wrapping */
393 struct xfs_buftarg *l_targ; /* buftarg of log */ 485 struct xfs_buftarg *l_targ; /* buftarg of log */
@@ -396,9 +488,7 @@ typedef struct log {
396 struct xfs_buf_cancel **l_buf_cancel_table; 488 struct xfs_buf_cancel **l_buf_cancel_table;
397 int l_iclog_hsize; /* size of iclog header */ 489 int l_iclog_hsize; /* size of iclog header */
398 int l_iclog_heads; /* # of iclog header sectors */ 490 int l_iclog_heads; /* # of iclog header sectors */
399 uint l_sectbb_log; /* log2 of sector size in BBs */ 491 uint l_sectBBsize; /* sector size in BBs (2^n) */
400 uint l_sectbb_mask; /* sector size (in BBs)
401 * alignment mask */
402 int l_iclog_size; /* size of log in bytes */ 492 int l_iclog_size; /* size of log in bytes */
403 int l_iclog_size_log; /* log power size of log */ 493 int l_iclog_size_log; /* log power size of log */
404 int l_iclog_bufs; /* number of iclog buffers */ 494 int l_iclog_bufs; /* number of iclog buffers */
@@ -440,14 +530,40 @@ typedef struct log {
440 530
441#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 531#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
442 532
443
444/* common routines */ 533/* common routines */
445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 534extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
446extern int xlog_recover(xlog_t *log); 535extern int xlog_recover(xlog_t *log);
447extern int xlog_recover_finish(xlog_t *log); 536extern int xlog_recover_finish(xlog_t *log);
448extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 537extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
449 538
450extern kmem_zone_t *xfs_log_ticket_zone; 539extern kmem_zone_t *xfs_log_ticket_zone;
540struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
541 int count, char client, uint xflags,
542 int alloc_flags);
543
544
545static inline void
546xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
547{
548 *ptr += bytes;
549 *len -= bytes;
550 *off += bytes;
551}
552
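A small usage sketch for the helper above; the local variables are invented, but this is the pattern xlog_write() follows when copying an ophdr and advancing through the iclog data region:

	void *ptr = iclog_dataptr;	/* assumed: current write position */
	int len = write_len;		/* assumed: bytes left to write */
	int log_offset = 0;

	memcpy(ptr, ophdr, sizeof(struct xlog_op_header));
	xlog_write_adv_cnt(&ptr, &len, &log_offset,
			   sizeof(struct xlog_op_header));
	/* ptr advanced, len reduced, log_offset grown by the ophdr size */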
553void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
554int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
555 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
556 xlog_in_core_t **commit_iclog, uint flags);
557
558/*
559 * Committed Item List interfaces
560 */
561int xlog_cil_init(struct log *log);
562void xlog_cil_init_post_recovery(struct log *log);
563void xlog_cil_destroy(struct log *log);
564
565int xlog_cil_push(struct log *log, int push_now);
566xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
451 567
452/* 568/*
453 * Unmount record type is used as a pseudo transaction type for the ticket. 569 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 22e6efdc17ea..14a69aec2c0b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -56,33 +56,61 @@ STATIC void xlog_recover_check_summary(xlog_t *);
56#define xlog_recover_check_summary(log) 56#define xlog_recover_check_summary(log)
57#endif 57#endif
58 58
59
60/* 59/*
61 * Sector aligned buffer routines for buffer create/read/write/access 60 * Sector aligned buffer routines for buffer create/read/write/access
62 */ 61 */
63 62
64#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ 63/*
65 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ 64 * Verify the given count of basic blocks is valid number of blocks
66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 65 * to specify for an operation involving the given XFS log buffer.
67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 66 * Returns nonzero if the count is valid, 0 otherwise.
67 */
68 68
69static inline int
70xlog_buf_bbcount_valid(
71 xlog_t *log,
72 int bbcount)
73{
74 return bbcount > 0 && bbcount <= log->l_logBBsize;
75}
76
77/*
78 * Allocate a buffer to hold log data. The buffer needs to be able
79 * to map to a range of nbblks basic blocks at any valid (basic
80 * block) offset within the log.
81 */
69STATIC xfs_buf_t * 82STATIC xfs_buf_t *
70xlog_get_bp( 83xlog_get_bp(
71 xlog_t *log, 84 xlog_t *log,
72 int nbblks) 85 int nbblks)
73{ 86{
74 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 87 if (!xlog_buf_bbcount_valid(log, nbblks)) {
75 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 88 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
76 XFS_ERROR_REPORT("xlog_get_bp(1)", 89 nbblks);
77 XFS_ERRLEVEL_HIGH, log->l_mp); 90 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
78 return NULL; 91 return NULL;
79 } 92 }
80 93
81 if (log->l_sectbb_log) { 94 /*
82 if (nbblks > 1) 95 * We do log I/O in units of log sectors (a power-of-2
83 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 96 * multiple of the basic block size), so we round up the
84 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 97 * requested size to accommodate the basic blocks required
85 } 98 * for complete log sectors.
99 *
100 * In addition, the buffer may be used for a non-sector-
101 * aligned block offset, in which case an I/O of the
102 * requested size could extend beyond the end of the
103 * buffer. If the requested size is only 1 basic block it
104 * will never straddle a sector boundary, so this won't be
105 * an issue. Nor will this be a problem if the log I/O is
106 * done in basic blocks (sector size 1). But otherwise we
107 * extend the buffer by one extra log sector to ensure
108 * there's space to accommodate this possibility.
109 */
110 if (nbblks > 1 && log->l_sectBBsize > 1)
111 nbblks += log->l_sectBBsize;
112 nbblks = round_up(nbblks, log->l_sectBBsize);
113
86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 114 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
87} 115}
88 116
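A quick worked example of the rounding above, assuming a log sector of 8 basic blocks (l_sectBBsize == 8) and a 5 block request:

	nbblks = 5;
	nbblks += 8;			/* nbblks > 1: guard for misalignment, now 13 */
	nbblks = round_up(nbblks, 8);	/* 16 basic blocks allocated */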
@@ -93,6 +121,10 @@ xlog_put_bp(
93 xfs_buf_free(bp); 121 xfs_buf_free(bp);
94} 122}
95 123
124/*
125 * Return the address of the start of the given block number's data
126 * in a log buffer. The buffer covers a log sector-aligned region.
127 */
96STATIC xfs_caddr_t 128STATIC xfs_caddr_t
97xlog_align( 129xlog_align(
98 xlog_t *log, 130 xlog_t *log,
@@ -100,14 +132,14 @@ xlog_align(
100 int nbblks, 132 int nbblks,
101 xfs_buf_t *bp) 133 xfs_buf_t *bp)
102{ 134{
135 xfs_daddr_t offset;
103 xfs_caddr_t ptr; 136 xfs_caddr_t ptr;
104 137
105 if (!log->l_sectbb_log) 138 offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1);
106 return XFS_BUF_PTR(bp); 139 ptr = XFS_BUF_PTR(bp) + BBTOB(offset);
140
141 ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
107 142
108 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
109 ASSERT(XFS_BUF_SIZE(bp) >=
110 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
111 return ptr; 143 return ptr;
112} 144}
113 145
@@ -124,21 +156,18 @@ xlog_bread_noalign(
124{ 156{
125 int error; 157 int error;
126 158
127 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 159 if (!xlog_buf_bbcount_valid(log, nbblks)) {
128 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 160 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
129 XFS_ERROR_REPORT("xlog_bread(1)", 161 nbblks);
130 XFS_ERRLEVEL_HIGH, log->l_mp); 162 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
131 return EFSCORRUPTED; 163 return EFSCORRUPTED;
132 } 164 }
133 165
134 if (log->l_sectbb_log) { 166 blk_no = round_down(blk_no, log->l_sectBBsize);
135 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 167 nbblks = round_up(nbblks, log->l_sectBBsize);
136 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
137 }
138 168
139 ASSERT(nbblks > 0); 169 ASSERT(nbblks > 0);
140 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 170 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
141 ASSERT(bp);
142 171
143 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 172 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
144 XFS_BUF_READ(bp); 173 XFS_BUF_READ(bp);
@@ -186,17 +215,15 @@ xlog_bwrite(
186{ 215{
187 int error; 216 int error;
188 217
189 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 218 if (!xlog_buf_bbcount_valid(log, nbblks)) {
190 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 219 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
191 XFS_ERROR_REPORT("xlog_bwrite(1)", 220 nbblks);
192 XFS_ERRLEVEL_HIGH, log->l_mp); 221 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
193 return EFSCORRUPTED; 222 return EFSCORRUPTED;
194 } 223 }
195 224
196 if (log->l_sectbb_log) { 225 blk_no = round_down(blk_no, log->l_sectBBsize);
197 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 226 nbblks = round_up(nbblks, log->l_sectBBsize);
198 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
199 }
200 227
201 ASSERT(nbblks > 0); 228 ASSERT(nbblks > 0);
202 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 229 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
@@ -327,39 +354,38 @@ xlog_find_cycle_start(
327{ 354{
328 xfs_caddr_t offset; 355 xfs_caddr_t offset;
329 xfs_daddr_t mid_blk; 356 xfs_daddr_t mid_blk;
357 xfs_daddr_t end_blk;
330 uint mid_cycle; 358 uint mid_cycle;
331 int error; 359 int error;
332 360
333 mid_blk = BLK_AVG(first_blk, *last_blk); 361 end_blk = *last_blk;
334 while (mid_blk != first_blk && mid_blk != *last_blk) { 362 mid_blk = BLK_AVG(first_blk, end_blk);
363 while (mid_blk != first_blk && mid_blk != end_blk) {
335 error = xlog_bread(log, mid_blk, 1, bp, &offset); 364 error = xlog_bread(log, mid_blk, 1, bp, &offset);
336 if (error) 365 if (error)
337 return error; 366 return error;
338 mid_cycle = xlog_get_cycle(offset); 367 mid_cycle = xlog_get_cycle(offset);
339 if (mid_cycle == cycle) { 368 if (mid_cycle == cycle)
340 *last_blk = mid_blk; 369 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
341 /* last_half_cycle == mid_cycle */ 370 else
342 } else { 371 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
343 first_blk = mid_blk; 372 mid_blk = BLK_AVG(first_blk, end_blk);
344 /* first_half_cycle == mid_cycle */
345 }
346 mid_blk = BLK_AVG(first_blk, *last_blk);
347 } 373 }
348 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || 374 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
349 (mid_blk == *last_blk && mid_blk-1 == first_blk)); 375 (mid_blk == end_blk && mid_blk-1 == first_blk));
376
377 *last_blk = end_blk;
350 378
351 return 0; 379 return 0;
352} 380}
353 381
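A hedged walk-through of the search invariant with invented cycle numbers: first_blk always has a cycle other than the target, end_blk always has the target cycle, so the loop converges on the first block of the target cycle:

	/* cycles on disk: [9, 9, 9, 8, 8], target cycle = 8 */
	/* first_blk=0, end_blk=4 -> mid=2 (cycle 9) -> first_blk=2 */
	/* first_blk=2, end_blk=4 -> mid=3 (cycle 8) -> end_blk=3   */
	/* mid recomputes to 2 == first_blk, loop ends: *last_blk = 3 */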
354/* 382/*
355 * Check that the range of blocks does not contain the cycle number 383 * Check that a range of blocks does not contain stop_on_cycle_no.
356 * given. The scan needs to occur from front to back and the ptr into the 384 * Fill in *new_blk with the block offset where such a block is
357 * region must be updated since a later routine will need to perform another 385 * found, or with -1 (an invalid block number) if there is no such
358 * test. If the region is completely good, we end up returning the same 386 * block in the range. The scan needs to occur from front to back
359 * last block number. 387 * and the pointer into the region must be updated since a later
360 * 388 * routine will need to perform another test.
361 * Set blkno to -1 if we encounter no errors. This is an invalid block number
362 * since we don't ever expect logs to get this large.
363 */ 389 */
364STATIC int 390STATIC int
365xlog_find_verify_cycle( 391xlog_find_verify_cycle(
@@ -376,12 +402,16 @@ xlog_find_verify_cycle(
376 xfs_caddr_t buf = NULL; 402 xfs_caddr_t buf = NULL;
377 int error = 0; 403 int error = 0;
378 404
405 /*
406 * Greedily allocate a buffer big enough to handle the full
407 * range of basic blocks we'll be examining. If that fails,
408 * try a smaller size. We need to be able to read at least
409 * a log sector, or we're out of luck.
410 */
379 bufblks = 1 << ffs(nbblks); 411 bufblks = 1 << ffs(nbblks);
380
381 while (!(bp = xlog_get_bp(log, bufblks))) { 412 while (!(bp = xlog_get_bp(log, bufblks))) {
382 /* can't get enough memory to do everything in one big buffer */
383 bufblks >>= 1; 413 bufblks >>= 1;
384 if (bufblks <= log->l_sectbb_log) 414 if (bufblks < log->l_sectBBsize)
385 return ENOMEM; 415 return ENOMEM;
386 } 416 }
387 417
@@ -629,7 +659,7 @@ xlog_find_head(
629 * In this case we want to find the first block with cycle 659 * In this case we want to find the first block with cycle
630 * number matching last_half_cycle. We expect the log to be 660 * number matching last_half_cycle. We expect the log to be
631 * some variation on 661 * some variation on
632 * x + 1 ... | x ... 662 * x + 1 ... | x ... | x
633 * The first block with cycle number x (last_half_cycle) will 663 * The first block with cycle number x (last_half_cycle) will
634 * be where the new head belongs. First we do a binary search 664 * be where the new head belongs. First we do a binary search
635 * for the first occurrence of last_half_cycle. The binary 665 * for the first occurrence of last_half_cycle. The binary
@@ -639,11 +669,13 @@ xlog_find_head(
639 * the log, then we look for occurrences of last_half_cycle - 1 669 * the log, then we look for occurrences of last_half_cycle - 1
640 * at the end of the log. The cases we're looking for look 670 * at the end of the log. The cases we're looking for look
641 * like 671 * like
642 * x + 1 ... | x | x + 1 | x ... 672 * v binary search stopped here
643 * ^ binary search stopped here 673 * x + 1 ... | x | x + 1 | x ... | x
674 * ^ but we want to locate this spot
644 * or 675 * or
645 * x + 1 ... | x ... | x - 1 | x
646 * <---------> less than scan distance 676 * <---------> less than scan distance
677 * x + 1 ... | x ... | x - 1 | x
678 * ^ we want to locate this spot
647 */ 679 */
648 stop_on_cycle = last_half_cycle; 680 stop_on_cycle = last_half_cycle;
649 if ((error = xlog_find_cycle_start(log, bp, first_blk, 681 if ((error = xlog_find_cycle_start(log, bp, first_blk,
@@ -699,16 +731,16 @@ xlog_find_head(
699 * certainly not the head of the log. By searching for 731 * certainly not the head of the log. By searching for
700 * last_half_cycle-1 we accomplish that. 732 * last_half_cycle-1 we accomplish that.
701 */ 733 */
702 start_blk = log_bbnum - num_scan_bblks + head_blk;
703 ASSERT(head_blk <= INT_MAX && 734 ASSERT(head_blk <= INT_MAX &&
704 (xfs_daddr_t) num_scan_bblks - head_blk >= 0); 735 (xfs_daddr_t) num_scan_bblks >= head_blk);
736 start_blk = log_bbnum - (num_scan_bblks - head_blk);
705 if ((error = xlog_find_verify_cycle(log, start_blk, 737 if ((error = xlog_find_verify_cycle(log, start_blk,
706 num_scan_bblks - (int)head_blk, 738 num_scan_bblks - (int)head_blk,
707 (stop_on_cycle - 1), &new_blk))) 739 (stop_on_cycle - 1), &new_blk)))
708 goto bp_err; 740 goto bp_err;
709 if (new_blk != -1) { 741 if (new_blk != -1) {
710 head_blk = new_blk; 742 head_blk = new_blk;
711 goto bad_blk; 743 goto validate_head;
712 } 744 }
713 745
714 /* 746 /*
@@ -726,7 +758,7 @@ xlog_find_head(
726 head_blk = new_blk; 758 head_blk = new_blk;
727 } 759 }
728 760
729 bad_blk: 761validate_head:
730 /* 762 /*
731 * Now we need to make sure head_blk is not pointing to a block in 763 * Now we need to make sure head_blk is not pointing to a block in
732 * the middle of a log record. 764 * the middle of a log record.
@@ -748,7 +780,7 @@ xlog_find_head(
748 if ((error = xlog_find_verify_log_record(log, start_blk, 780 if ((error = xlog_find_verify_log_record(log, start_blk,
749 &head_blk, 0)) == -1) { 781 &head_blk, 0)) == -1) {
750 /* We hit the beginning of the log during our search */ 782 /* We hit the beginning of the log during our search */
751 start_blk = log_bbnum - num_scan_bblks + head_blk; 783 start_blk = log_bbnum - (num_scan_bblks - head_blk);
752 new_blk = log_bbnum; 784 new_blk = log_bbnum;
753 ASSERT(start_blk <= INT_MAX && 785 ASSERT(start_blk <= INT_MAX &&
754 (xfs_daddr_t) log_bbnum-start_blk >= 0); 786 (xfs_daddr_t) log_bbnum-start_blk >= 0);
@@ -833,12 +865,12 @@ xlog_find_tail(
833 if (*head_blk == 0) { /* special case */ 865 if (*head_blk == 0) { /* special case */
834 error = xlog_bread(log, 0, 1, bp, &offset); 866 error = xlog_bread(log, 0, 1, bp, &offset);
835 if (error) 867 if (error)
836 goto bread_err; 868 goto done;
837 869
838 if (xlog_get_cycle(offset) == 0) { 870 if (xlog_get_cycle(offset) == 0) {
839 *tail_blk = 0; 871 *tail_blk = 0;
840 /* leave all other log inited values alone */ 872 /* leave all other log inited values alone */
841 goto exit; 873 goto done;
842 } 874 }
843 } 875 }
844 876
@@ -849,7 +881,7 @@ xlog_find_tail(
849 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 881 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
850 error = xlog_bread(log, i, 1, bp, &offset); 882 error = xlog_bread(log, i, 1, bp, &offset);
851 if (error) 883 if (error)
852 goto bread_err; 884 goto done;
853 885
854 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 886 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
855 found = 1; 887 found = 1;
@@ -866,7 +898,7 @@ xlog_find_tail(
866 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 898 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
867 error = xlog_bread(log, i, 1, bp, &offset); 899 error = xlog_bread(log, i, 1, bp, &offset);
868 if (error) 900 if (error)
869 goto bread_err; 901 goto done;
870 902
871 if (XLOG_HEADER_MAGIC_NUM == 903 if (XLOG_HEADER_MAGIC_NUM ==
872 be32_to_cpu(*(__be32 *)offset)) { 904 be32_to_cpu(*(__be32 *)offset)) {
@@ -941,7 +973,7 @@ xlog_find_tail(
941 umount_data_blk = (i + hblks) % log->l_logBBsize; 973 umount_data_blk = (i + hblks) % log->l_logBBsize;
942 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 974 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
943 if (error) 975 if (error)
944 goto bread_err; 976 goto done;
945 977
946 op_head = (xlog_op_header_t *)offset; 978 op_head = (xlog_op_header_t *)offset;
947 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 979 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
@@ -987,12 +1019,10 @@ xlog_find_tail(
987 * But... if the -device- itself is readonly, just skip this. 1019 * But... if the -device- itself is readonly, just skip this.
988 * We can't recover this device anyway, so it won't matter. 1020 * We can't recover this device anyway, so it won't matter.
989 */ 1021 */
990 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { 1022 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
991 error = xlog_clear_stale_blocks(log, tail_lsn); 1023 error = xlog_clear_stale_blocks(log, tail_lsn);
992 }
993 1024
994bread_err: 1025done:
995exit:
996 xlog_put_bp(bp); 1026 xlog_put_bp(bp);
997 1027
998 if (error) 1028 if (error)
@@ -1152,16 +1182,22 @@ xlog_write_log_records(
1152 xfs_caddr_t offset; 1182 xfs_caddr_t offset;
1153 xfs_buf_t *bp; 1183 xfs_buf_t *bp;
1154 int balign, ealign; 1184 int balign, ealign;
1155 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 1185 int sectbb = log->l_sectBBsize;
1156 int end_block = start_block + blocks; 1186 int end_block = start_block + blocks;
1157 int bufblks; 1187 int bufblks;
1158 int error = 0; 1188 int error = 0;
1159 int i, j = 0; 1189 int i, j = 0;
1160 1190
1191 /*
1192 * Greedily allocate a buffer big enough to handle the full
1193 * range of basic blocks to be written. If that fails, try
1194 * a smaller size. We need to be able to write at least a
1195 * log sector, or we're out of luck.
1196 */
1161 bufblks = 1 << ffs(blocks); 1197 bufblks = 1 << ffs(blocks);
1162 while (!(bp = xlog_get_bp(log, bufblks))) { 1198 while (!(bp = xlog_get_bp(log, bufblks))) {
1163 bufblks >>= 1; 1199 bufblks >>= 1;
1164 if (bufblks <= log->l_sectbb_log) 1200 if (bufblks < sectbb)
1165 return ENOMEM; 1201 return ENOMEM;
1166 } 1202 }
1167 1203
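The loop above is a compact fallback-allocation pattern: start from a generous power-of-two buffer and halve it on each failure, giving up once the buffer can no longer hold a single log sector. A minimal user-space sketch of the same idea, assuming 512-byte basic blocks and using malloc() as a stand-in for xlog_get_bp():

    #include <stdlib.h>
    #include <strings.h>    /* ffs() */

    /*
     * Try to allocate a buffer of up to "blocks" basic blocks
     * (512 bytes each).  Halve the request on failure; fail
     * once even one log sector (sectbb blocks) no longer fits,
     * which is where the kernel code returns ENOMEM.
     */
    static void *get_log_buffer(int blocks, int sectbb, int *out_blocks)
    {
            int bufblks = 1 << ffs(blocks);
            void *bp;

            while (!(bp = malloc((size_t)bufblks * 512))) {
                    bufblks >>= 1;
                    if (bufblks < sectbb)
                            return NULL;
            }
            *out_blocks = bufblks;
            return bp;
    }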
@@ -1169,7 +1205,7 @@ xlog_write_log_records(
1169 * the buffer in the starting sector not covered by the first 1205 * the buffer in the starting sector not covered by the first
1170 * write below. 1206 * write below.
1171 */ 1207 */
1172 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1208 balign = round_down(start_block, sectbb);
1173 if (balign != start_block) { 1209 if (balign != start_block) {
1174 error = xlog_bread_noalign(log, start_block, 1, bp); 1210 error = xlog_bread_noalign(log, start_block, 1, bp);
1175 if (error) 1211 if (error)
@@ -1188,7 +1224,7 @@ xlog_write_log_records(
1188 * the buffer in the final sector not covered by the write. 1224 * the buffer in the final sector not covered by the write.
1189 * If this is the same sector as the above read, skip it. 1225 * If this is the same sector as the above read, skip it.
1190 */ 1226 */
1191 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); 1227 ealign = round_down(end_block, sectbb);
1192 if (j == 0 && (start_block + endcount > ealign)) { 1228 if (j == 0 && (start_block + endcount > ealign)) {
1193 offset = XFS_BUF_PTR(bp); 1229 offset = XFS_BUF_PTR(bp);
1194 balign = BBTOB(ealign - start_block); 1230 balign = BBTOB(ealign - start_block);
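Both alignment sites now use the generic round_down() helper, which (like the old XLOG_SECTOR_ROUNDDOWN_BLKNO macro) assumes a power-of-two sector size. A minimal equivalent and a worked value, as a sketch:

    /* power-of-two round down: clear the low bits of x */
    #define round_down(x, y)        ((x) & ~((y) - 1))

    /*
     * With an assumed 8-block sector: round_down(1037, 8) == 1032,
     * so the read-modify-write only has to preserve the
     * 1032..1036 prefix of that sector.
     */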
@@ -1408,6 +1444,7 @@ xlog_recover_add_item(
1408 1444
1409STATIC int 1445STATIC int
1410xlog_recover_add_to_cont_trans( 1446xlog_recover_add_to_cont_trans(
1447 struct log *log,
1411 xlog_recover_t *trans, 1448 xlog_recover_t *trans,
1412 xfs_caddr_t dp, 1449 xfs_caddr_t dp,
1413 int len) 1450 int len)
@@ -1434,6 +1471,7 @@ xlog_recover_add_to_cont_trans(
1434 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1471 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1435 item->ri_buf[item->ri_cnt-1].i_len += len; 1472 item->ri_buf[item->ri_cnt-1].i_len += len;
1436 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1473 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1474 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1437 return 0; 1475 return 0;
1438} 1476}
1439 1477
@@ -1452,6 +1490,7 @@ xlog_recover_add_to_cont_trans(
1452 */ 1490 */
1453STATIC int 1491STATIC int
1454xlog_recover_add_to_trans( 1492xlog_recover_add_to_trans(
1493 struct log *log,
1455 xlog_recover_t *trans, 1494 xlog_recover_t *trans,
1456 xfs_caddr_t dp, 1495 xfs_caddr_t dp,
1457 int len) 1496 int len)
@@ -1510,6 +1549,7 @@ xlog_recover_add_to_trans(
1510 item->ri_buf[item->ri_cnt].i_addr = ptr; 1549 item->ri_buf[item->ri_cnt].i_addr = ptr;
1511 item->ri_buf[item->ri_cnt].i_len = len; 1550 item->ri_buf[item->ri_cnt].i_len = len;
1512 item->ri_cnt++; 1551 item->ri_cnt++;
1552 trace_xfs_log_recover_item_add(log, trans, item, 0);
1513 return 0; 1553 return 0;
1514} 1554}
1515 1555
@@ -1521,7 +1561,9 @@ xlog_recover_add_to_trans(
1521 */ 1561 */
1522STATIC int 1562STATIC int
1523xlog_recover_reorder_trans( 1563xlog_recover_reorder_trans(
1524 xlog_recover_t *trans) 1564 struct log *log,
1565 xlog_recover_t *trans,
1566 int pass)
1525{ 1567{
1526 xlog_recover_item_t *item, *n; 1568 xlog_recover_item_t *item, *n;
1527 LIST_HEAD(sort_list); 1569 LIST_HEAD(sort_list);
@@ -1534,7 +1576,9 @@ xlog_recover_reorder_trans(
1534 1576
1535 switch (ITEM_TYPE(item)) { 1577 switch (ITEM_TYPE(item)) {
1536 case XFS_LI_BUF: 1578 case XFS_LI_BUF:
1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1579 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1580 trace_xfs_log_recover_item_reorder_head(log,
1581 trans, item, pass);
1538 list_move(&item->ri_list, &trans->r_itemq); 1582 list_move(&item->ri_list, &trans->r_itemq);
1539 break; 1583 break;
1540 } 1584 }
@@ -1543,6 +1587,8 @@ xlog_recover_reorder_trans(
1543 case XFS_LI_QUOTAOFF: 1587 case XFS_LI_QUOTAOFF:
1544 case XFS_LI_EFD: 1588 case XFS_LI_EFD:
1545 case XFS_LI_EFI: 1589 case XFS_LI_EFI:
1590 trace_xfs_log_recover_item_reorder_tail(log,
1591 trans, item, pass);
1546 list_move_tail(&item->ri_list, &trans->r_itemq); 1592 list_move_tail(&item->ri_list, &trans->r_itemq);
1547 break; 1593 break;
1548 default: 1594 default:
@@ -1592,8 +1638,10 @@ xlog_recover_do_buffer_pass1(
1592 /* 1638 /*
1593 * If this isn't a cancel buffer item, then just return. 1639 * If this isn't a cancel buffer item, then just return.
1594 */ 1640 */
1595 if (!(flags & XFS_BLI_CANCEL)) 1641 if (!(flags & XFS_BLF_CANCEL)) {
1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1596 return; 1643 return;
1644 }
1597 1645
1598 /* 1646 /*
1599 * Insert an xfs_buf_cancel record into the hash table of 1647 * Insert an xfs_buf_cancel record into the hash table of
@@ -1627,6 +1675,7 @@ xlog_recover_do_buffer_pass1(
1627 while (nextp != NULL) { 1675 while (nextp != NULL) {
1628 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1676 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1629 nextp->bc_refcount++; 1677 nextp->bc_refcount++;
1678 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1630 return; 1679 return;
1631 } 1680 }
1632 prevp = nextp; 1681 prevp = nextp;
@@ -1640,13 +1689,14 @@ xlog_recover_do_buffer_pass1(
1640 bcp->bc_refcount = 1; 1689 bcp->bc_refcount = 1;
1641 bcp->bc_next = NULL; 1690 bcp->bc_next = NULL;
1642 prevp->bc_next = bcp; 1691 prevp->bc_next = bcp;
1692 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1643} 1693}
1644 1694
1645/* 1695/*
1646 * Check to see whether the buffer being recovered has a corresponding 1696 * Check to see whether the buffer being recovered has a corresponding
1647 * entry in the buffer cancel record table. If it does then return 1 1697 * entry in the buffer cancel record table. If it does then return 1
1648 * so that it will be cancelled, otherwise return 0. If the buffer is 1698 * so that it will be cancelled, otherwise return 0. If the buffer is
1649 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1699 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1650 * the refcount on the entry in the table and remove it from the table 1700 * the refcount on the entry in the table and remove it from the table
1651 * if this is the last reference. 1701 * if this is the last reference.
1652 * 1702 *
@@ -1671,7 +1721,7 @@ xlog_check_buffer_cancelled(
1671 * There is nothing in the table built in pass one, 1721 * There is nothing in the table built in pass one,
1672 * so this buffer must not be cancelled. 1722 * so this buffer must not be cancelled.
1673 */ 1723 */
1674 ASSERT(!(flags & XFS_BLI_CANCEL)); 1724 ASSERT(!(flags & XFS_BLF_CANCEL));
1675 return 0; 1725 return 0;
1676 } 1726 }
1677 1727
@@ -1683,7 +1733,7 @@ xlog_check_buffer_cancelled(
1683 * There is no corresponding entry in the table built 1733 * There is no corresponding entry in the table built
1684 * in pass one, so this buffer has not been cancelled. 1734 * in pass one, so this buffer has not been cancelled.
1685 */ 1735 */
1686 ASSERT(!(flags & XFS_BLI_CANCEL)); 1736 ASSERT(!(flags & XFS_BLF_CANCEL));
1687 return 0; 1737 return 0;
1688 } 1738 }
1689 1739
@@ -1702,7 +1752,7 @@ xlog_check_buffer_cancelled(
1702 * one in the table and remove it if this is the 1752 * one in the table and remove it if this is the
1703 * last reference. 1753 * last reference.
1704 */ 1754 */
1705 if (flags & XFS_BLI_CANCEL) { 1755 if (flags & XFS_BLF_CANCEL) {
1706 bcp->bc_refcount--; 1756 bcp->bc_refcount--;
1707 if (bcp->bc_refcount == 0) { 1757 if (bcp->bc_refcount == 0) {
1708 if (prevp == NULL) { 1758 if (prevp == NULL) {
@@ -1722,7 +1772,7 @@ xlog_check_buffer_cancelled(
1722 * We didn't find a corresponding entry in the table, so 1772 * We didn't find a corresponding entry in the table, so
1723 * return 0 so that the buffer is NOT cancelled. 1773 * return 0 so that the buffer is NOT cancelled.
1724 */ 1774 */
1725 ASSERT(!(flags & XFS_BLI_CANCEL)); 1775 ASSERT(!(flags & XFS_BLF_CANCEL));
1726 return 0; 1776 return 0;
1727} 1777}
1728 1778
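Stripped of the XFS types, the cancel-table protocol is: pass 1 inserts one refcounted node per cancelled (blkno, len) pair, and pass 2 looks a pair up, dropping a reference (and unlinking on zero) only when the query comes from the cancel record itself. A minimal single-bucket sketch, with hypothetical names:

    #include <stdlib.h>

    struct cancel_rec {
            long long               blkno;
            int                     len;
            int                     refcount;
            struct cancel_rec       *next;
    };

    /* pass 1: bump an existing record or prepend a new one */
    static void cancel_add(struct cancel_rec **bucket, long long blkno, int len)
    {
            struct cancel_rec *p;

            for (p = *bucket; p; p = p->next) {
                    if (p->blkno == blkno && p->len == len) {
                            p->refcount++;
                            return;
                    }
            }
            p = malloc(sizeof(*p));
            if (!p)
                    return;
            p->blkno = blkno;
            p->len = len;
            p->refcount = 1;
            p->next = *bucket;
            *bucket = p;
    }

    /*
     * pass 2: return 1 if the buffer is cancelled; when the query is
     * the cancel item itself, drop a reference and unlink on zero.
     */
    static int cancel_lookup(struct cancel_rec **bucket, long long blkno,
                             int len, int is_cancel_item)
    {
            struct cancel_rec **pp, *p;

            for (pp = bucket; (p = *pp) != NULL; pp = &p->next) {
                    if (p->blkno != blkno || p->len != len)
                            continue;
                    if (is_cancel_item && --p->refcount == 0) {
                            *pp = p->next;
                            free(p);
                    }
                    return 1;
            }
            return 0;
    }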
@@ -1779,6 +1829,8 @@ xlog_recover_do_inode_buffer(
1779 unsigned int *data_map = NULL; 1829 unsigned int *data_map = NULL;
1780 unsigned int map_size = 0; 1830 unsigned int map_size = 0;
1781 1831
1832 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1833
1782 switch (buf_f->blf_type) { 1834 switch (buf_f->blf_type) {
1783 case XFS_LI_BUF: 1835 case XFS_LI_BUF:
1784 data_map = buf_f->blf_data_map; 1836 data_map = buf_f->blf_data_map;
@@ -1822,8 +1874,8 @@ xlog_recover_do_inode_buffer(
1822 nbits = xfs_contig_bits(data_map, map_size, 1874 nbits = xfs_contig_bits(data_map, map_size,
1823 bit); 1875 bit);
1824 ASSERT(nbits > 0); 1876 ASSERT(nbits > 0);
1825 reg_buf_offset = bit << XFS_BLI_SHIFT; 1877 reg_buf_offset = bit << XFS_BLF_SHIFT;
1826 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1878 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1827 item_index++; 1879 item_index++;
1828 } 1880 }
1829 1881
@@ -1837,7 +1889,7 @@ xlog_recover_do_inode_buffer(
1837 } 1889 }
1838 1890
1839 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1891 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1840 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1892 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1841 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1893 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1842 1894
1843 /* 1895 /*
@@ -1874,6 +1926,7 @@ xlog_recover_do_inode_buffer(
1874/*ARGSUSED*/ 1926/*ARGSUSED*/
1875STATIC void 1927STATIC void
1876xlog_recover_do_reg_buffer( 1928xlog_recover_do_reg_buffer(
1929 struct xfs_mount *mp,
1877 xlog_recover_item_t *item, 1930 xlog_recover_item_t *item,
1878 xfs_buf_t *bp, 1931 xfs_buf_t *bp,
1879 xfs_buf_log_format_t *buf_f) 1932 xfs_buf_log_format_t *buf_f)
@@ -1885,6 +1938,8 @@ xlog_recover_do_reg_buffer(
1885 unsigned int map_size = 0; 1938 unsigned int map_size = 0;
1886 int error; 1939 int error;
1887 1940
1941 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1942
1888 switch (buf_f->blf_type) { 1943 switch (buf_f->blf_type) {
1889 case XFS_LI_BUF: 1944 case XFS_LI_BUF:
1890 data_map = buf_f->blf_data_map; 1945 data_map = buf_f->blf_data_map;
@@ -1900,9 +1955,9 @@ xlog_recover_do_reg_buffer(
1900 nbits = xfs_contig_bits(data_map, map_size, bit); 1955 nbits = xfs_contig_bits(data_map, map_size, bit);
1901 ASSERT(nbits > 0); 1956 ASSERT(nbits > 0);
1902 ASSERT(item->ri_buf[i].i_addr != NULL); 1957 ASSERT(item->ri_buf[i].i_addr != NULL);
1903 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1958 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1904 ASSERT(XFS_BUF_COUNT(bp) >= 1959 ASSERT(XFS_BUF_COUNT(bp) >=
1905 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1960 ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
1906 1961
1907 /* 1962 /*
1908 * Do a sanity check if this is a dquot buffer. Just checking 1963 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1911,7 +1966,7 @@ xlog_recover_do_reg_buffer(
1911 */ 1966 */
1912 error = 0; 1967 error = 0;
1913 if (buf_f->blf_flags & 1968 if (buf_f->blf_flags &
1914 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1969 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1915 if (item->ri_buf[i].i_addr == NULL) { 1970 if (item->ri_buf[i].i_addr == NULL) {
1916 cmn_err(CE_ALERT, 1971 cmn_err(CE_ALERT,
1917 "XFS: NULL dquot in %s.", __func__); 1972 "XFS: NULL dquot in %s.", __func__);
@@ -1932,9 +1987,9 @@ xlog_recover_do_reg_buffer(
1932 } 1987 }
1933 1988
1934 memcpy(xfs_buf_offset(bp, 1989 memcpy(xfs_buf_offset(bp,
1935 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1990 (uint)bit << XFS_BLF_SHIFT), /* dest */
1936 item->ri_buf[i].i_addr, /* source */ 1991 item->ri_buf[i].i_addr, /* source */
1937 nbits<<XFS_BLI_SHIFT); /* length */ 1992 nbits<<XFS_BLF_SHIFT); /* length */
1938 next: 1993 next:
1939 i++; 1994 i++;
1940 bit += nbits; 1995 bit += nbits;
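The replay loop just shown walks the item's dirty bitmap one run of set bits at a time, copying one logged region per run at chunk granularity. A user-space sketch of the same walk, assuming 128-byte chunks (mirroring XFS_BLF_SHIFT == 7) and using stand-ins for xfs_next_bit()/xfs_contig_bits():

    #include <stdint.h>
    #include <string.h>

    #define CHUNK_SHIFT     7       /* 128-byte chunks, as in XFS_BLF_SHIFT */

    /* index of the next set bit at or after start, or -1 */
    static int next_bit(const uint32_t *map, int size, int start)
    {
            for (int i = start; i < size; i++)
                    if (map[i / 32] & (1u << (i % 32)))
                            return i;
            return -1;
    }

    /* length of the run of set bits beginning at start */
    static int contig_bits(const uint32_t *map, int size, int start)
    {
            int n = 0;

            while (start + n < size &&
                   (map[(start + n) / 32] & (1u << ((start + n) % 32))))
                    n++;
            return n;
    }

    /* copy one logged region per run of set bits into the buffer */
    static void replay_regions(char *buf, char *const regions[],
                               const uint32_t *map, int map_bits)
    {
            int i = 0, bit = 0, nbits;

            while ((bit = next_bit(map, map_bits, bit)) >= 0) {
                    nbits = contig_bits(map, map_bits, bit);
                    memcpy(buf + ((size_t)bit << CHUNK_SHIFT),
                           regions[i++], (size_t)nbits << CHUNK_SHIFT);
                    bit += nbits;
            }
    }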
@@ -2083,6 +2138,8 @@ xlog_recover_do_dquot_buffer(
2083{ 2138{
2084 uint type; 2139 uint type;
2085 2140
2141 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2142
2086 /* 2143 /*
2087 * Filesystems are required to send in quota flags at mount time. 2144 * Filesystems are required to send in quota flags at mount time.
2088 */ 2145 */
@@ -2091,11 +2148,11 @@ xlog_recover_do_dquot_buffer(
2091 } 2148 }
2092 2149
2093 type = 0; 2150 type = 0;
2094 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2151 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2095 type |= XFS_DQ_USER; 2152 type |= XFS_DQ_USER;
2096 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2153 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2097 type |= XFS_DQ_PROJ; 2154 type |= XFS_DQ_PROJ;
2098 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2155 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2099 type |= XFS_DQ_GROUP; 2156 type |= XFS_DQ_GROUP;
2100 /* 2157 /*
2101 * This type of quotas was turned off, so ignore this buffer 2158 * This type of quotas was turned off, so ignore this buffer
@@ -2103,7 +2160,7 @@ xlog_recover_do_dquot_buffer(
2103 if (log->l_quotaoffs_flag & type) 2160 if (log->l_quotaoffs_flag & type)
2104 return; 2161 return;
2105 2162
2106 xlog_recover_do_reg_buffer(item, bp, buf_f); 2163 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2107} 2164}
2108 2165
2109/* 2166/*
@@ -2116,7 +2173,7 @@ xlog_recover_do_dquot_buffer(
2116 * here which overlaps that may be stale. 2173 * here which overlaps that may be stale.
2117 * 2174 *
2118 * When meta-data buffers are freed at run time we log a buffer item 2175 * When meta-data buffers are freed at run time we log a buffer item
2119 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2176 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2120 * of the buffer in the log should not be replayed at recovery time. 2177 * of the buffer in the log should not be replayed at recovery time.
2121 * This is so that if the blocks covered by the buffer are reused for 2178 * This is so that if the blocks covered by the buffer are reused for
2122 * file data before we crash we don't end up replaying old, freed 2179 * file data before we crash we don't end up replaying old, freed
@@ -2150,7 +2207,7 @@ xlog_recover_do_buffer_trans(
2150 if (pass == XLOG_RECOVER_PASS1) { 2207 if (pass == XLOG_RECOVER_PASS1) {
2151 /* 2208 /*
2152 * In this pass we're only looking for buf items 2209 * In this pass we're only looking for buf items
2153 * with the XFS_BLI_CANCEL bit set. 2210 * with the XFS_BLF_CANCEL bit set.
2154 */ 2211 */
2155 xlog_recover_do_buffer_pass1(log, buf_f); 2212 xlog_recover_do_buffer_pass1(log, buf_f);
2156 return 0; 2213 return 0;
@@ -2164,9 +2221,11 @@ xlog_recover_do_buffer_trans(
2164 */ 2221 */
2165 cancel = xlog_recover_do_buffer_pass2(log, buf_f); 2222 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2166 if (cancel) { 2223 if (cancel) {
2224 trace_xfs_log_recover_buf_cancel(log, buf_f);
2167 return 0; 2225 return 0;
2168 } 2226 }
2169 } 2227 }
2228 trace_xfs_log_recover_buf_recover(log, buf_f);
2170 switch (buf_f->blf_type) { 2229 switch (buf_f->blf_type) {
2171 case XFS_LI_BUF: 2230 case XFS_LI_BUF:
2172 blkno = buf_f->blf_blkno; 2231 blkno = buf_f->blf_blkno;
@@ -2185,7 +2244,7 @@ xlog_recover_do_buffer_trans(
2185 2244
2186 mp = log->l_mp; 2245 mp = log->l_mp;
2187 buf_flags = XBF_LOCK; 2246 buf_flags = XBF_LOCK;
2188 if (!(flags & XFS_BLI_INODE_BUF)) 2247 if (!(flags & XFS_BLF_INODE_BUF))
2189 buf_flags |= XBF_MAPPED; 2248 buf_flags |= XBF_MAPPED;
2190 2249
2191 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2250 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2198,13 +2257,13 @@ xlog_recover_do_buffer_trans(
2198 } 2257 }
2199 2258
2200 error = 0; 2259 error = 0;
2201 if (flags & XFS_BLI_INODE_BUF) { 2260 if (flags & XFS_BLF_INODE_BUF) {
2202 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2261 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2203 } else if (flags & 2262 } else if (flags &
2204 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2263 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2205 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2206 } else { 2265 } else {
2207 xlog_recover_do_reg_buffer(item, bp, buf_f); 2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2208 } 2267 }
2209 if (error) 2268 if (error)
2210 return XFS_ERROR(error); 2269 return XFS_ERROR(error);
@@ -2284,8 +2343,10 @@ xlog_recover_do_inode_trans(
2284 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2343 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2285 in_f->ilf_len, 0)) { 2344 in_f->ilf_len, 0)) {
2286 error = 0; 2345 error = 0;
2346 trace_xfs_log_recover_inode_cancel(log, in_f);
2287 goto error; 2347 goto error;
2288 } 2348 }
2349 trace_xfs_log_recover_inode_recover(log, in_f);
2289 2350
2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2351 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2291 XBF_LOCK); 2352 XBF_LOCK);
@@ -2337,6 +2398,7 @@ xlog_recover_do_inode_trans(
2337 /* do nothing */ 2398 /* do nothing */
2338 } else { 2399 } else {
2339 xfs_buf_relse(bp); 2400 xfs_buf_relse(bp);
2401 trace_xfs_log_recover_inode_skip(log, in_f);
2340 error = 0; 2402 error = 0;
2341 goto error; 2403 goto error;
2342 } 2404 }
@@ -2758,11 +2820,12 @@ xlog_recover_do_trans(
2758 int error = 0; 2820 int error = 0;
2759 xlog_recover_item_t *item; 2821 xlog_recover_item_t *item;
2760 2822
2761 error = xlog_recover_reorder_trans(trans); 2823 error = xlog_recover_reorder_trans(log, trans, pass);
2762 if (error) 2824 if (error)
2763 return error; 2825 return error;
2764 2826
2765 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2827 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2828 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2766 switch (ITEM_TYPE(item)) { 2829 switch (ITEM_TYPE(item)) {
2767 case XFS_LI_BUF: 2830 case XFS_LI_BUF:
2768 error = xlog_recover_do_buffer_trans(log, item, pass); 2831 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2919,8 +2982,9 @@ xlog_recover_process_data(
2919 error = xlog_recover_unmount_trans(trans); 2982 error = xlog_recover_unmount_trans(trans);
2920 break; 2983 break;
2921 case XLOG_WAS_CONT_TRANS: 2984 case XLOG_WAS_CONT_TRANS:
2922 error = xlog_recover_add_to_cont_trans(trans, 2985 error = xlog_recover_add_to_cont_trans(log,
2923 dp, be32_to_cpu(ohead->oh_len)); 2986 trans, dp,
2987 be32_to_cpu(ohead->oh_len));
2924 break; 2988 break;
2925 case XLOG_START_TRANS: 2989 case XLOG_START_TRANS:
2926 xlog_warn( 2990 xlog_warn(
@@ -2930,7 +2994,7 @@ xlog_recover_process_data(
2930 break; 2994 break;
2931 case 0: 2995 case 0:
2932 case XLOG_CONTINUE_TRANS: 2996 case XLOG_CONTINUE_TRANS:
2933 error = xlog_recover_add_to_trans(trans, 2997 error = xlog_recover_add_to_trans(log, trans,
2934 dp, be32_to_cpu(ohead->oh_len)); 2998 dp, be32_to_cpu(ohead->oh_len));
2935 break; 2999 break;
2936 default: 3000 default:
@@ -3331,42 +3395,6 @@ xlog_pack_data(
3331 } 3395 }
3332} 3396}
3333 3397
3334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3335STATIC void
3336xlog_unpack_data_checksum(
3337 xlog_rec_header_t *rhead,
3338 xfs_caddr_t dp,
3339 xlog_t *log)
3340{
3341 __be32 *up = (__be32 *)dp;
3342 uint chksum = 0;
3343 int i;
3344
3345 /* divide length by 4 to get # words */
3346 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
3347 chksum ^= be32_to_cpu(*up);
3348 up++;
3349 }
3350 if (chksum != be32_to_cpu(rhead->h_chksum)) {
3351 if (rhead->h_chksum ||
3352 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3353 cmn_err(CE_DEBUG,
3354 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3355 be32_to_cpu(rhead->h_chksum), chksum);
3356 cmn_err(CE_DEBUG,
3357"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3358 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3359 cmn_err(CE_DEBUG,
3360 "XFS: LogR this is a LogV2 filesystem\n");
3361 }
3362 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3363 }
3364 }
3365}
3366#else
3367#define xlog_unpack_data_checksum(rhead, dp, log)
3368#endif
3369
3370STATIC void 3398STATIC void
3371xlog_unpack_data( 3399xlog_unpack_data(
3372 xlog_rec_header_t *rhead, 3400 xlog_rec_header_t *rhead,
@@ -3390,8 +3418,6 @@ xlog_unpack_data(
3390 dp += BBSIZE; 3418 dp += BBSIZE;
3391 } 3419 }
3392 } 3420 }
3393
3394 xlog_unpack_data_checksum(rhead, dp, log);
3395} 3421}
3396 3422
3397STATIC int 3423STATIC int
@@ -3490,7 +3516,7 @@ xlog_do_recovery_pass(
3490 hblks = 1; 3516 hblks = 1;
3491 } 3517 }
3492 } else { 3518 } else {
3493 ASSERT(log->l_sectbb_log == 0); 3519 ASSERT(log->l_sectBBsize == 1);
3494 hblks = 1; 3520 hblks = 1;
3495 hbp = xlog_get_bp(log, 1); 3521 hbp = xlog_get_bp(log, 1);
3496 h_size = XLOG_BIG_RECORD_BSIZE; 3522 h_size = XLOG_BIG_RECORD_BSIZE;
@@ -3946,10 +3972,6 @@ xlog_recover_check_summary(
3946 xfs_agf_t *agfp; 3972 xfs_agf_t *agfp;
3947 xfs_buf_t *agfbp; 3973 xfs_buf_t *agfbp;
3948 xfs_buf_t *agibp; 3974 xfs_buf_t *agibp;
3949 xfs_buf_t *sbbp;
3950#ifdef XFS_LOUD_RECOVERY
3951 xfs_sb_t *sbp;
3952#endif
3953 xfs_agnumber_t agno; 3975 xfs_agnumber_t agno;
3954 __uint64_t freeblks; 3976 __uint64_t freeblks;
3955 __uint64_t itotal; 3977 __uint64_t itotal;
@@ -3984,30 +4006,5 @@ xlog_recover_check_summary(
3984 xfs_buf_relse(agibp); 4006 xfs_buf_relse(agibp);
3985 } 4007 }
3986 } 4008 }
3987
3988 sbbp = xfs_getsb(mp, 0);
3989#ifdef XFS_LOUD_RECOVERY
3990 sbp = &mp->m_sb;
3991 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
3992 cmn_err(CE_NOTE,
3993 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
3994 sbp->sb_icount, itotal);
3995 cmn_err(CE_NOTE,
3996 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
3997 sbp->sb_ifree, ifree);
3998 cmn_err(CE_NOTE,
3999 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4000 sbp->sb_fdblocks, freeblks);
4001#if 0
4002 /*
4003 * This is turned off until I account for the allocation
4004 * btree blocks which live in free space.
4005 */
4006 ASSERT(sbp->sb_icount == itotal);
4007 ASSERT(sbp->sb_ifree == ifree);
4008 ASSERT(sbp->sb_fdblocks == freeblks);
4009#endif
4010#endif
4011 xfs_buf_relse(sbbp);
4012} 4009}
4013#endif /* DEBUG */ 4010#endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d749207258..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
28#define XLOG_RHASH(tid) \ 28#define XLOG_RHASH(tid) \
29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) 29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
30 30
31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) 31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
32 32
33 33
34/* 34/*
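The rename on line 31 does not change the value of the constant: assuming the usual tree values of XFS_MAX_BLOCKSIZE = 65536 and XFS_BLF_CHUNK = 128 (formerly XFS_BLI_CHUNK), XLOG_MAX_REGIONS_IN_ITEM = 65536 / 128 / 2 + 1 = 257.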
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e79b56b4bca6..d7bf38c8cd1c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1405,13 +1405,6 @@ xfs_mountfs(
1405 xfs_qm_mount_quotas(mp); 1405 xfs_qm_mount_quotas(mp);
1406 } 1406 }
1407 1407
1408#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1409 if (XFS_IS_QUOTA_ON(mp))
1410 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
1411 else
1412 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
1413#endif
1414
1415 /* 1408 /*
1416 * Now we are mounted, reserve a small amount of unused space for 1409 * Now we are mounted, reserve a small amount of unused space for
1417 * privileged transactions. This is needed so that transaction 1410 * privileged transactions. This is needed so that transaction
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4fa0bc7b983e..1d2c7eed4eda 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -259,6 +259,7 @@ typedef struct xfs_mount {
259 wait_queue_head_t m_wait_single_sync_task; 259 wait_queue_head_t m_wait_single_sync_task;
260 __int64_t m_update_flags; /* sb flags we need to update 260 __int64_t m_update_flags; /* sb flags we need to update
261 on the next remount,rw */ 261 on the next remount,rw */
262 struct list_head m_mplist; /* inode shrinker mount list */
262} xfs_mount_t; 263} xfs_mount_t;
263 264
264/* 265/*
@@ -267,6 +268,7 @@ typedef struct xfs_mount {
267#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
268 must be synchronous except 269 must be synchronous except
269 for space allocations */ 270 for space allocations */
271#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
270#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 272#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
271#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 273#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
272#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 274#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fdcab3f81dde..e0e64b113bd6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat {
201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ 201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ 202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ 203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
204#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */
205#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */
206#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */
207#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ 204#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
208#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ 205#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
209#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ 206#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f73e358bae8d..ce558efa2ea0 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,24 +44,14 @@
44#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
47 47#include "xfs_trace.h"
48
49STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
50STATIC uint xfs_trans_count_vecs(xfs_trans_t *);
51STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
52STATIC void xfs_trans_uncommit(xfs_trans_t *, uint);
53STATIC void xfs_trans_committed(xfs_trans_t *, int);
54STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
55STATIC void xfs_trans_free(xfs_trans_t *);
56 48
57kmem_zone_t *xfs_trans_zone; 49kmem_zone_t *xfs_trans_zone;
58 50
59
60/* 51/*
61 * Reservation functions here avoid a huge stack in xfs_trans_init 52 * Reservation functions here avoid a huge stack in xfs_trans_init
62 * due to register overflow from temporaries in the calculations. 53 * due to register overflow from temporaries in the calculations.
63 */ 54 */
64
65STATIC uint 55STATIC uint
66xfs_calc_write_reservation(xfs_mount_t *mp) 56xfs_calc_write_reservation(xfs_mount_t *mp)
67{ 57{
@@ -254,13 +244,30 @@ _xfs_trans_alloc(
254 tp->t_type = type; 244 tp->t_type = type;
255 tp->t_mountp = mp; 245 tp->t_mountp = mp;
256 tp->t_items_free = XFS_LIC_NUM_SLOTS; 246 tp->t_items_free = XFS_LIC_NUM_SLOTS;
257 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
258 xfs_lic_init(&(tp->t_items)); 247 xfs_lic_init(&(tp->t_items));
259 XFS_LBC_INIT(&(tp->t_busy)); 248 INIT_LIST_HEAD(&tp->t_busy);
260 return tp; 249 return tp;
261} 250}
262 251
263/* 252/*
253 * Free the transaction structure. If there is more clean up
254 * to do when the structure is freed, add it here.
255 */
256STATIC void
257xfs_trans_free(
258 struct xfs_trans *tp)
259{
260 struct xfs_busy_extent *busyp, *n;
261
262 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
263 xfs_alloc_busy_clear(tp->t_mountp, busyp);
264
265 atomic_dec(&tp->t_mountp->m_active_trans);
266 xfs_trans_free_dqinfo(tp);
267 kmem_zone_free(xfs_trans_zone, tp);
268}
269
270/*
264 * This is called to create a new transaction which will share the 271 * This is called to create a new transaction which will share the
265 * permanent log reservation of the given transaction. The remaining 272 * permanent log reservation of the given transaction. The remaining
266 * unused block and rt extent reservations are also inherited. This 273 * unused block and rt extent reservations are also inherited. This
@@ -283,9 +290,8 @@ xfs_trans_dup(
283 ntp->t_type = tp->t_type; 290 ntp->t_type = tp->t_type;
284 ntp->t_mountp = tp->t_mountp; 291 ntp->t_mountp = tp->t_mountp;
285 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 292 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
286 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
287 xfs_lic_init(&(ntp->t_items)); 293 xfs_lic_init(&(ntp->t_items));
288 XFS_LBC_INIT(&(ntp->t_busy)); 294 INIT_LIST_HEAD(&ntp->t_busy);
289 295
290 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 296 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
291 ASSERT(tp->t_ticket != NULL); 297 ASSERT(tp->t_ticket != NULL);
@@ -421,7 +427,6 @@ undo_blocks:
421 return error; 427 return error;
422} 428}
423 429
424
425/* 430/*
426 * Record the indicated change to the given field for application 431 * Record the indicated change to the given field for application
427 * to the file system's superblock when the transaction commits. 432 * to the file system's superblock when the transaction commits.
@@ -650,7 +655,7 @@ xfs_trans_apply_sb_deltas(
650 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 655 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
651 * still need to update the incore superblock with the changes. 656 * still need to update the incore superblock with the changes.
652 */ 657 */
653STATIC void 658void
654xfs_trans_unreserve_and_mod_sb( 659xfs_trans_unreserve_and_mod_sb(
655 xfs_trans_t *tp) 660 xfs_trans_t *tp)
656{ 661{
@@ -764,94 +769,256 @@ xfs_trans_unreserve_and_mod_sb(
764 } 769 }
765} 770}
766 771
772/*
773 * Total up the number of log iovecs needed to commit this
774 * transaction. The transaction itself needs one for the
775 * transaction header. Ask each dirty item in turn how many
776 * it needs to get the total.
777 */
778static uint
779xfs_trans_count_vecs(
780 struct xfs_trans *tp)
781{
782 int nvecs;
783 xfs_log_item_desc_t *lidp;
784
785 nvecs = 1;
786 lidp = xfs_trans_first_item(tp);
787 ASSERT(lidp != NULL);
788
789 /* In the non-debug case we need to start bailing out if we
790 * didn't find a log_item here, return zero and let trans_commit
791 * deal with it.
792 */
793 if (lidp == NULL)
794 return 0;
795
796 while (lidp != NULL) {
797 /*
798 * Skip items which aren't dirty in this transaction.
799 */
800 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
801 lidp = xfs_trans_next_item(tp, lidp);
802 continue;
803 }
804 lidp->lid_size = IOP_SIZE(lidp->lid_item);
805 nvecs += lidp->lid_size;
806 lidp = xfs_trans_next_item(tp, lidp);
807 }
808
809 return nvecs;
810}
767 811
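As the comment on the relocated helper says, the count is one iovec for the transaction header plus whatever each dirty item reports via IOP_SIZE(); for a hypothetical transaction with two dirty items reporting three and two regions, nvecs = 1 + 3 + 2 = 6.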
768/* 812/*
769 * xfs_trans_commit 813 * Fill in the vector with pointers to data to be logged
814 * by this transaction. The transaction header takes
815 * the first vector, and then each dirty item takes the
816 * number of vectors it indicated it needed in xfs_trans_count_vecs().
770 * 817 *
771 * Commit the given transaction to the log a/synchronously. 818 * As each item fills in the entries it needs, also pin the item
819 * so that it cannot be flushed out until the log write completes.
820 */
821static void
822xfs_trans_fill_vecs(
823 struct xfs_trans *tp,
824 struct xfs_log_iovec *log_vector)
825{
826 xfs_log_item_desc_t *lidp;
827 struct xfs_log_iovec *vecp;
828 uint nitems;
829
830 /*
831 * Skip over the entry for the transaction header, we'll
832 * fill that in at the end.
833 */
834 vecp = log_vector + 1;
835
836 nitems = 0;
837 lidp = xfs_trans_first_item(tp);
838 ASSERT(lidp);
839 while (lidp) {
840 /* Skip items which aren't dirty in this transaction. */
841 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
842 lidp = xfs_trans_next_item(tp, lidp);
843 continue;
844 }
845
846 /*
847 * The item may be marked dirty but not log anything. This can
848 * be used to get called when a transaction is committed.
849 */
850 if (lidp->lid_size)
851 nitems++;
852 IOP_FORMAT(lidp->lid_item, vecp);
853 vecp += lidp->lid_size;
854 IOP_PIN(lidp->lid_item);
855 lidp = xfs_trans_next_item(tp, lidp);
856 }
857
858 /*
859 * Now that we've counted the number of items in this transaction, fill
860 * in the transaction header. Note that the transaction header does not
861 * have a log item.
862 */
863 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
864 tp->t_header.th_type = tp->t_type;
865 tp->t_header.th_num_items = nitems;
866 log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
867 log_vector->i_len = sizeof(xfs_trans_header_t);
868 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
869}
870
871/*
872 * The committed item processing consists of calling the committed routine of
873 * each logged item, updating the item's position in the AIL if necessary, and
874 * unpinning each item. If the committed routine returns -1, then do nothing
875 * further with the item because it may have been freed.
772 * 876 *
773 * XFS disk error handling mechanism is not based on a typical 877 * Since items are unlocked when they are copied to the incore log, it is
774 * transaction abort mechanism. Logically after the filesystem 878 * possible for two transactions to be completing and manipulating the same
775 * gets marked 'SHUTDOWN', we can't let any new transactions 879 * item simultaneously. The AIL lock will protect the lsn field of each item.
776 * be durable - ie. committed to disk - because some metadata might 880 * The value of this field can never go backwards.
777 * be inconsistent. In such cases, this returns an error, and the 881 *
778 * caller may assume that all locked objects joined to the transaction 882 * We unpin the items after repositioning them in the AIL, because otherwise
779 * have already been unlocked as if the commit had succeeded. 883 * they could be immediately flushed and we'd have to race with the flusher
780 * Do not reference the transaction structure after this call. 884 * trying to pull the item from the AIL as we add it.
781 */ 885 */
782 /*ARGSUSED*/ 886void
783int 887xfs_trans_item_committed(
784_xfs_trans_commit( 888 struct xfs_log_item *lip,
785 xfs_trans_t *tp, 889 xfs_lsn_t commit_lsn,
786 uint flags, 890 int aborted)
787 int *log_flushed)
788{ 891{
789 xfs_log_iovec_t *log_vector; 892 xfs_lsn_t item_lsn;
790 int nvec; 893 struct xfs_ail *ailp;
791 xfs_mount_t *mp;
792 xfs_lsn_t commit_lsn;
793 /* REFERENCED */
794 int error;
795 int log_flags;
796 int sync;
797#define XFS_TRANS_LOGVEC_COUNT 16
798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
799 struct xlog_in_core *commit_iclog;
800 int shutdown;
801 894
802 commit_lsn = -1; 895 if (aborted)
896 lip->li_flags |= XFS_LI_ABORTED;
897 item_lsn = IOP_COMMITTED(lip, commit_lsn);
898
899 /* If the committed routine returns -1, item has been freed. */
900 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
901 return;
803 902
804 /* 903 /*
805 * Determine whether this commit is releasing a permanent 904 * If the returned lsn is greater than what it contained before, update
806 * log reservation or not. 905 * the location of the item in the AIL. If it is not, then do nothing.
906 * Items can never move backwards in the AIL.
907 *
908 * While the new lsn should usually be greater, it is possible that a
909 * later transaction completing simultaneously with an earlier one
910 * using the same item could complete first with a higher lsn. This
911 * would cause the earlier transaction to fail the test below.
807 */ 912 */
808 if (flags & XFS_TRANS_RELEASE_LOG_RES) { 913 ailp = lip->li_ailp;
809 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 914 spin_lock(&ailp->xa_lock);
810 log_flags = XFS_LOG_REL_PERM_RESERV; 915 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
916 /*
917 * This will set the item's lsn to item_lsn and update the
918 * position of the item in the AIL.
919 *
920 * xfs_trans_ail_update() drops the AIL lock.
921 */
922 xfs_trans_ail_update(ailp, lip, item_lsn);
811 } else { 923 } else {
812 log_flags = 0; 924 spin_unlock(&ailp->xa_lock);
813 } 925 }
814 mp = tp->t_mountp;
815 926
816 /* 927 /*
817 * If there is nothing to be logged by the transaction, 928 * Now that we've repositioned the item in the AIL, unpin it so it can
818 * then unlock all of the items associated with the 929 * be flushed. Pass information about buffer stale state down from the
819 * transaction and free the transaction structure. 930 * log item flags, if anyone else stales the buffer we do not want to
820 * Also make sure to return any reserved blocks to 931 * pay any attention to it.
821 * the free pool.
822 */ 932 */
823shut_us_down: 933 IOP_UNPIN(lip);
824 shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; 934}
825 if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { 935
826 xfs_trans_unreserve_and_mod_sb(tp); 936/*
937 * This is typically called by the LM when a transaction has been fully
938 * committed to disk. It needs to unpin the items which have
939 * been logged by the transaction and update their positions
940 * in the AIL if necessary.
941 *
942 * This also gets called when the transactions didn't get written out
943 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
944 */
945STATIC void
946xfs_trans_committed(
947 struct xfs_trans *tp,
948 int abortflag)
949{
950 xfs_log_item_desc_t *lidp;
951 xfs_log_item_chunk_t *licp;
952 xfs_log_item_chunk_t *next_licp;
953
954 /* Call the transaction's completion callback if there is one. */
955 if (tp->t_callback != NULL)
956 tp->t_callback(tp, tp->t_callarg);
957
958 for (lidp = xfs_trans_first_item(tp);
959 lidp != NULL;
960 lidp = xfs_trans_next_item(tp, lidp)) {
961 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
962 }
963
964 /* free the item chunks, ignoring the embedded chunk */
965 for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
966 next_licp = licp->lic_next;
967 kmem_free(licp);
968 }
969
970 xfs_trans_free(tp);
971}
972
973/*
974 * Called from the trans_commit code when we notice that
975 * the filesystem is in the middle of a forced shutdown.
976 */
977STATIC void
978xfs_trans_uncommit(
979 struct xfs_trans *tp,
980 uint flags)
981{
982 xfs_log_item_desc_t *lidp;
983
984 for (lidp = xfs_trans_first_item(tp);
985 lidp != NULL;
986 lidp = xfs_trans_next_item(tp, lidp)) {
827 /* 987 /*
828 * It is indeed possible for the transaction to be 988 * Unpin all but those that aren't dirty.
829 * not dirty but the dqinfo portion to be. All that
830 * means is that we have some (non-persistent) quota
831 * reservations that need to be unreserved.
832 */ 989 */
833 xfs_trans_unreserve_and_mod_dquots(tp); 990 if (lidp->lid_flags & XFS_LID_DIRTY)
834 if (tp->t_ticket) { 991 IOP_UNPIN_REMOVE(lidp->lid_item, tp);
835 commit_lsn = xfs_log_done(mp, tp->t_ticket,
836 NULL, log_flags);
837 if (commit_lsn == -1 && !shutdown)
838 shutdown = XFS_ERROR(EIO);
839 }
840 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
841 xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
842 xfs_trans_free_busy(tp);
843 xfs_trans_free(tp);
844 XFS_STATS_INC(xs_trans_empty);
845 return (shutdown);
846 } 992 }
847 ASSERT(tp->t_ticket != NULL);
848 993
849 /* 994 xfs_trans_unreserve_and_mod_sb(tp);
850 * If we need to update the superblock, then do it now. 995 xfs_trans_unreserve_and_mod_dquots(tp);
851 */ 996
852 if (tp->t_flags & XFS_TRANS_SB_DIRTY) 997 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
853 xfs_trans_apply_sb_deltas(tp); 998 xfs_trans_free(tp);
854 xfs_trans_apply_dquot_deltas(tp); 999}
1000
1001/*
1002 * Format the transaction direct to the iclog. This isolates the physical
1003 * transaction commit operation from the logical operation and hence allows
1004 * other methods to be introduced without affecting the existing commit path.
1005 */
1006static int
1007xfs_trans_commit_iclog(
1008 struct xfs_mount *mp,
1009 struct xfs_trans *tp,
1010 xfs_lsn_t *commit_lsn,
1011 int flags)
1012{
1013 int shutdown;
1014 int error;
1015 int log_flags = 0;
1016 struct xlog_in_core *commit_iclog;
1017#define XFS_TRANS_LOGVEC_COUNT 16
1018 struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
1019 struct xfs_log_iovec *log_vector;
1020 uint nvec;
1021
855 1022
856 /* 1023 /*
857 * Ask each log item how many log_vector entries it will 1024 * Ask each log item how many log_vector entries it will
@@ -861,8 +1028,7 @@ shut_us_down:
861 */ 1028 */
862 nvec = xfs_trans_count_vecs(tp); 1029 nvec = xfs_trans_count_vecs(tp);
863 if (nvec == 0) { 1030 if (nvec == 0) {
864 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1031 return ENOMEM; /* triggers a shutdown! */
865 goto shut_us_down;
866 } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { 1032 } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
867 log_vector = log_vector_fast; 1033 log_vector = log_vector_fast;
868 } else { 1034 } else {
@@ -877,6 +1043,9 @@ shut_us_down:
877 */ 1043 */
878 xfs_trans_fill_vecs(tp, log_vector); 1044 xfs_trans_fill_vecs(tp, log_vector);
879 1045
1046 if (flags & XFS_TRANS_RELEASE_LOG_RES)
1047 log_flags = XFS_LOG_REL_PERM_RESERV;
1048
880 error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); 1049 error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
881 1050
882 /* 1051 /*
@@ -884,18 +1053,19 @@ shut_us_down:
884 * at any time after this call. However, all the items associated 1053 * at any time after this call. However, all the items associated
885 * with the transaction are still locked and pinned in memory. 1054 * with the transaction are still locked and pinned in memory.
886 */ 1055 */
887 commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); 1056 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
888 1057
889 tp->t_commit_lsn = commit_lsn; 1058 tp->t_commit_lsn = *commit_lsn;
890 if (nvec > XFS_TRANS_LOGVEC_COUNT) { 1059 trace_xfs_trans_commit_lsn(tp);
1060
1061 if (nvec > XFS_TRANS_LOGVEC_COUNT)
891 kmem_free(log_vector); 1062 kmem_free(log_vector);
892 }
893 1063
894 /* 1064 /*
895 * If we got a log write error. Unpin the logitems that we 1065 * If we got a log write error. Unpin the logitems that we
896 * had pinned, clean up, free trans structure, and return error. 1066 * had pinned, clean up, free trans structure, and return error.
897 */ 1067 */
898 if (error || commit_lsn == -1) { 1068 if (error || *commit_lsn == -1) {
899 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1069 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
900 xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); 1070 xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
901 return XFS_ERROR(EIO); 1071 return XFS_ERROR(EIO);
@@ -909,8 +1079,6 @@ shut_us_down:
909 */ 1079 */
910 xfs_trans_unreserve_and_mod_sb(tp); 1080 xfs_trans_unreserve_and_mod_sb(tp);
911 1081
912 sync = tp->t_flags & XFS_TRANS_SYNC;
913
914 /* 1082 /*
915 * Tell the LM to call the transaction completion routine 1083 * Tell the LM to call the transaction completion routine
916 * when the log write with LSN commit_lsn completes (e.g. 1084 * when the log write with LSN commit_lsn completes (e.g.
@@ -953,7 +1121,7 @@ shut_us_down:
953 * the commit lsn of this transaction for dependency tracking 1121 * the commit lsn of this transaction for dependency tracking
954 * purposes. 1122 * purposes.
955 */ 1123 */
956 xfs_trans_unlock_items(tp, commit_lsn); 1124 xfs_trans_unlock_items(tp, *commit_lsn);
957 1125
958 /* 1126 /*
959 * If we detected a log error earlier, finish committing 1127 * If we detected a log error earlier, finish committing
@@ -973,156 +1141,204 @@ shut_us_down:
973 * and the items are released we can finally allow the iclog to 1141 * and the items are released we can finally allow the iclog to
974 * go to disk. 1142 * go to disk.
975 */ 1143 */
976 error = xfs_log_release_iclog(mp, commit_iclog); 1144 return xfs_log_release_iclog(mp, commit_iclog);
977
978 /*
979 * If the transaction needs to be synchronous, then force the
980 * log out now and wait for it.
981 */
982 if (sync) {
983 if (!error) {
984 error = _xfs_log_force_lsn(mp, commit_lsn,
985 XFS_LOG_SYNC, log_flushed);
986 }
987 XFS_STATS_INC(xs_trans_sync);
988 } else {
989 XFS_STATS_INC(xs_trans_async);
990 }
991
992 return (error);
993} 1145}
994 1146
995
996/* 1147/*
997 * Total up the number of log iovecs needed to commit this 1148 * Walk the log items and allocate log vector structures for
998 * transaction. The transaction itself needs one for the 1149 * each item large enough to fit all the vectors they require.
999 * transaction header. Ask each dirty item in turn how many 1150 * Note that this format differs from the old log vector format in
1000 * it needs to get the total. 1151 * that there is no transaction header in these log vectors.
1001 */ 1152 */
1002STATIC uint 1153STATIC struct xfs_log_vec *
1003xfs_trans_count_vecs( 1154xfs_trans_alloc_log_vecs(
1004 xfs_trans_t *tp) 1155 xfs_trans_t *tp)
1005{ 1156{
1006 int nvecs;
1007 xfs_log_item_desc_t *lidp; 1157 xfs_log_item_desc_t *lidp;
1158 struct xfs_log_vec *lv = NULL;
1159 struct xfs_log_vec *ret_lv = NULL;
1008 1160
1009 nvecs = 1;
1010 lidp = xfs_trans_first_item(tp); 1161 lidp = xfs_trans_first_item(tp);
1011 ASSERT(lidp != NULL);
1012 1162
1013 /* In the non-debug case we need to start bailing out if we 1163 /* Bail out if we didn't find a log item. */
1014 * didn't find a log_item here, return zero and let trans_commit 1164 if (!lidp) {
1015 * deal with it. 1165 ASSERT(0);
1016 */ 1166 return NULL;
1017 if (lidp == NULL) 1167 }
1018 return 0;
1019 1168
1020 while (lidp != NULL) { 1169 while (lidp != NULL) {
1021 /* 1170 struct xfs_log_vec *new_lv;
1022 * Skip items which aren't dirty in this transaction. 1171
1023 */ 1172 /* Skip items which aren't dirty in this transaction. */
1024 if (!(lidp->lid_flags & XFS_LID_DIRTY)) { 1173 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1025 lidp = xfs_trans_next_item(tp, lidp); 1174 lidp = xfs_trans_next_item(tp, lidp);
1026 continue; 1175 continue;
1027 } 1176 }
1177
1178 /* Skip items that do not have any vectors for writing */
1028 lidp->lid_size = IOP_SIZE(lidp->lid_item); 1179 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1029 nvecs += lidp->lid_size; 1180 if (!lidp->lid_size) {
1181 lidp = xfs_trans_next_item(tp, lidp);
1182 continue;
1183 }
1184
1185 new_lv = kmem_zalloc(sizeof(*new_lv) +
1186 lidp->lid_size * sizeof(struct xfs_log_iovec),
1187 KM_SLEEP);
1188
1189 /* The allocated iovec region lies beyond the log vector. */
1190 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
1191 new_lv->lv_niovecs = lidp->lid_size;
1192 new_lv->lv_item = lidp->lid_item;
1193 if (!ret_lv)
1194 ret_lv = new_lv;
1195 else
1196 lv->lv_next = new_lv;
1197 lv = new_lv;
1030 lidp = xfs_trans_next_item(tp, lidp); 1198 lidp = xfs_trans_next_item(tp, lidp);
1031 } 1199 }
1032 1200
1033 return nvecs; 1201 return ret_lv;
1034} 1202}
1035 1203
1036/* 1204static int
1037 * Called from the trans_commit code when we notice that 1205xfs_trans_commit_cil(
1038 * the filesystem is in the middle of a forced shutdown. 1206 struct xfs_mount *mp,
1039 */ 1207 struct xfs_trans *tp,
1040STATIC void 1208 xfs_lsn_t *commit_lsn,
1041xfs_trans_uncommit( 1209 int flags)
1042 xfs_trans_t *tp,
1043 uint flags)
1044{ 1210{
1045 xfs_log_item_desc_t *lidp; 1211 struct xfs_log_vec *log_vector;
1212 int error;
1046 1213
1047 for (lidp = xfs_trans_first_item(tp); 1214 /*
1048 lidp != NULL; 1215 * Get each log item to allocate a vector structure for
1049 lidp = xfs_trans_next_item(tp, lidp)) { 1216 * the log item to pass to the log write code. The

1050 /* 1217 * CIL commit code will format the vector and save it away.
1051 * Unpin all but those that aren't dirty. 1218 */
1052 */ 1219 log_vector = xfs_trans_alloc_log_vecs(tp);
1053 if (lidp->lid_flags & XFS_LID_DIRTY) 1220 if (!log_vector)
1054 IOP_UNPIN_REMOVE(lidp->lid_item, tp); 1221 return ENOMEM;
1055 }
1056 1222
1057 xfs_trans_unreserve_and_mod_sb(tp); 1223 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1058 xfs_trans_unreserve_and_mod_dquots(tp); 1224 if (error)
1225 return error;
1059 1226
1060 xfs_trans_free_items(tp, flags); 1227 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1061 xfs_trans_free_busy(tp); 1228
1229 /* xfs_trans_free_items() unlocks them first */
1230 xfs_trans_free_items(tp, *commit_lsn, 0);
1062 xfs_trans_free(tp); 1231 xfs_trans_free(tp);
1232 return 0;
1063} 1233}
1064 1234
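xfs_trans_alloc_log_vecs() above uses one allocation per item, carving the iovec array out of the bytes immediately after the vector header (the lv_iovecp = &new_lv[1] assignment). A generic user-space sketch of that layout trick, with hypothetical type names:

    #include <stdlib.h>

    struct log_iovec {
            void    *base;
            int     len;
    };

    struct log_vec {
            struct log_vec          *next;
            int                     niovecs;
            struct log_iovec        *iovecp;  /* points just past this header */
    };

    /* one zeroed allocation holds the header and its iovec array */
    static struct log_vec *alloc_log_vec(int niovecs)
    {
            struct log_vec *lv;

            lv = calloc(1, sizeof(*lv) + niovecs * sizeof(struct log_iovec));
            if (!lv)
                    return NULL;
            lv->iovecp = (struct log_iovec *)&lv[1];
            lv->niovecs = niovecs;
            return lv;
    }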
1065/* 1235/*
1066 * Fill in the vector with pointers to data to be logged 1236 * xfs_trans_commit
1067 * by this transaction. The transaction header takes
1068 * the first vector, and then each dirty item takes the
1069 * number of vectors it indicated it needed in xfs_trans_count_vecs().
1070 * 1237 *
1071 * As each item fills in the entries it needs, also pin the item 1238 * Commit the given transaction to the log a/synchronously.
1072 * so that it cannot be flushed out until the log write completes. 1239 *
1240 * XFS disk error handling mechanism is not based on a typical
1241 * transaction abort mechanism. Logically after the filesystem
1242 * gets marked 'SHUTDOWN', we can't let any new transactions
1243 * be durable - ie. committed to disk - because some metadata might
1244 * be inconsistent. In such cases, this returns an error, and the
1245 * caller may assume that all locked objects joined to the transaction
1246 * have already been unlocked as if the commit had succeeded.
1247 * Do not reference the transaction structure after this call.
1073 */ 1248 */
1074STATIC void 1249int
1075xfs_trans_fill_vecs( 1250_xfs_trans_commit(
1076 xfs_trans_t *tp, 1251 struct xfs_trans *tp,
1077 xfs_log_iovec_t *log_vector) 1252 uint flags,
1253 int *log_flushed)
1078{ 1254{
1079 xfs_log_item_desc_t *lidp; 1255 struct xfs_mount *mp = tp->t_mountp;
1080 xfs_log_iovec_t *vecp; 1256 xfs_lsn_t commit_lsn = -1;
1081 uint nitems; 1257 int error = 0;
1258 int log_flags = 0;
1259 int sync = tp->t_flags & XFS_TRANS_SYNC;
1082 1260
1083 /* 1261 /*
1084 * Skip over the entry for the transaction header, we'll 1262 * Determine whether this commit is releasing a permanent
1085 * fill that in at the end. 1263 * log reservation or not.
1086 */ 1264 */
1087 vecp = log_vector + 1; /* pointer arithmetic */ 1265 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
1266 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1267 log_flags = XFS_LOG_REL_PERM_RESERV;
1268 }
1088 1269
1089 nitems = 0; 1270 /*
1090 lidp = xfs_trans_first_item(tp); 1271 * If there is nothing to be logged by the transaction,
1091 ASSERT(lidp != NULL); 1272 * then unlock all of the items associated with the
1092 while (lidp != NULL) { 1273 * transaction and free the transaction structure.
1093 /* 1274 * Also make sure to return any reserved blocks to
1094 * Skip items which aren't dirty in this transaction. 1275 * the free pool.
1095 */ 1276 */
1096 if (!(lidp->lid_flags & XFS_LID_DIRTY)) { 1277 if (!(tp->t_flags & XFS_TRANS_DIRTY))
1097 lidp = xfs_trans_next_item(tp, lidp); 1278 goto out_unreserve;
1098 continue; 1279
1099 } 1280 if (XFS_FORCED_SHUTDOWN(mp)) {
1100 /* 1281 error = XFS_ERROR(EIO);
1101 * The item may be marked dirty but not log anything. 1282 goto out_unreserve;
1102 * This can be used to get called when a transaction 1283 }
1103 * is committed. 1284
1104 */ 1285 ASSERT(tp->t_ticket != NULL);
1105 if (lidp->lid_size) { 1286
1106 nitems++; 1287 /*
1288 * If we need to update the superblock, then do it now.
1289 */
1290 if (tp->t_flags & XFS_TRANS_SB_DIRTY)
1291 xfs_trans_apply_sb_deltas(tp);
1292 xfs_trans_apply_dquot_deltas(tp);
1293
1294 if (mp->m_flags & XFS_MOUNT_DELAYLOG)
1295 error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
1296 else
1297 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
1298
1299 if (error == ENOMEM) {
1300 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1301 error = XFS_ERROR(EIO);
1302 goto out_unreserve;
1303 }
1304
1305 /*
1306 * If the transaction needs to be synchronous, then force the
1307 * log out now and wait for it.
1308 */
1309 if (sync) {
1310 if (!error) {
1311 error = _xfs_log_force_lsn(mp, commit_lsn,
1312 XFS_LOG_SYNC, log_flushed);
1107 } 1313 }
1108 IOP_FORMAT(lidp->lid_item, vecp); 1314 XFS_STATS_INC(xs_trans_sync);
1109 vecp += lidp->lid_size; /* pointer arithmetic */ 1315 } else {
1110 IOP_PIN(lidp->lid_item); 1316 XFS_STATS_INC(xs_trans_async);
1111 lidp = xfs_trans_next_item(tp, lidp);
1112 } 1317 }
1113 1318
1319 return error;
1320
1321out_unreserve:
1322 xfs_trans_unreserve_and_mod_sb(tp);
1323
1114 /* 1324 /*
1115 * Now that we've counted the number of items in this 1325 * It is indeed possible for the transaction to be not dirty but
1116 * transaction, fill in the transaction header. 1326 * the dqinfo portion to be. All that means is that we have some
1327 * (non-persistent) quota reservations that need to be unreserved.
1117 */ 1328 */
1118 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; 1329 xfs_trans_unreserve_and_mod_dquots(tp);
1119 tp->t_header.th_type = tp->t_type; 1330 if (tp->t_ticket) {
1120 tp->t_header.th_num_items = nitems; 1331 commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
1121 log_vector->i_addr = (xfs_caddr_t)&tp->t_header; 1332 if (commit_lsn == -1 && !error)
1122 log_vector->i_len = sizeof(xfs_trans_header_t); 1333 error = XFS_ERROR(EIO);
1123 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; 1334 }
1124} 1335 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1336 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
1337 xfs_trans_free(tp);
1125 1338
1339 XFS_STATS_INC(xs_trans_empty);
1340 return error;
1341}
1126 1342
1127/* 1343/*
1128 * Unlock all of the transaction's items and free the transaction. 1344 * Unlock all of the transaction's items and free the transaction.
@@ -1195,25 +1411,10 @@ xfs_trans_cancel(
1195 /* mark this thread as no longer being in a transaction */ 1411 /* mark this thread as no longer being in a transaction */
1196 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1412 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1197 1413
1198 xfs_trans_free_items(tp, flags); 1414 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1199 xfs_trans_free_busy(tp);
1200 xfs_trans_free(tp); 1415 xfs_trans_free(tp);
1201} 1416}
1202 1417
1203
1204/*
1205 * Free the transaction structure. If there is more clean up
1206 * to do when the structure is freed, add it here.
1207 */
1208STATIC void
1209xfs_trans_free(
1210 xfs_trans_t *tp)
1211{
1212 atomic_dec(&tp->t_mountp->m_active_trans);
1213 xfs_trans_free_dqinfo(tp);
1214 kmem_zone_free(xfs_trans_zone, tp);
1215}
1216
1217/* 1418/*
1218 * Roll from one trans in the sequence of PERMANENT transactions to 1419 * Roll from one trans in the sequence of PERMANENT transactions to
1219 * the next: permanent transactions are only flushed out when 1420 * the next: permanent transactions are only flushed out when
@@ -1283,174 +1484,3 @@ xfs_trans_roll(
1283 1484 	xfs_trans_ihold(trans, dp);
1284 1485 	return 0;
1285 1486 }
1286
1287/*
1288 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
1289 *
1290 * This is typically called by the LM when a transaction has been fully
1291 * committed to disk. It needs to unpin the items which have
1292 * been logged by the transaction and update their positions
1293 * in the AIL if necessary.
1294 * This also gets called when the transaction didn't get written out
1295 * because of an I/O error; in that case the abort flag XFS_LI_ABORTED is set.
1296 *
1297 * Call xfs_trans_chunk_committed() to process the items in
1298 * each chunk.
1299 */
1300STATIC void
1301xfs_trans_committed(
1302 xfs_trans_t *tp,
1303 int abortflag)
1304{
1305 xfs_log_item_chunk_t *licp;
1306 xfs_log_item_chunk_t *next_licp;
1307 xfs_log_busy_chunk_t *lbcp;
1308 xfs_log_busy_slot_t *lbsp;
1309 int i;
1310
1311 /*
1312 * Call the transaction's completion callback if there
1313 * is one.
1314 */
1315 if (tp->t_callback != NULL) {
1316 tp->t_callback(tp, tp->t_callarg);
1317 }
1318
1319 /*
1320 * Special case the chunk embedded in the transaction.
1321 */
1322 licp = &(tp->t_items);
1323 if (!(xfs_lic_are_all_free(licp))) {
1324 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1325 }
1326
1327 /*
1328 * Process the items in each chunk in turn.
1329 */
1330 licp = licp->lic_next;
1331 while (licp != NULL) {
1332 ASSERT(!xfs_lic_are_all_free(licp));
1333 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1334 next_licp = licp->lic_next;
1335 kmem_free(licp);
1336 licp = next_licp;
1337 }
1338
1339 /*
1340 * Clear all the per-AG busy list items listed in this transaction
1341 */
1342 lbcp = &tp->t_busy;
1343 while (lbcp != NULL) {
1344 for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
1345 if (!XFS_LBC_ISFREE(lbcp, i)) {
1346 xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
1347 lbsp->lbc_idx);
1348 }
1349 }
1350 lbcp = lbcp->lbc_next;
1351 }
1352 xfs_trans_free_busy(tp);
1353
1354 /*
1355 * That's it for the transaction structure. Free it.
1356 */
1357 xfs_trans_free(tp);
1358}
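The removed xfs_trans_committed() walks the chunk embedded in the transaction first, then each heap-allocated overflow chunk, freeing the overflow chunks as it goes. A self-contained sketch of that embedded-chunk-plus-overflow-list pattern, with illustrative stand-in types:

#include <stdlib.h>

/* Illustrative stand-ins for xfs_log_item_chunk_t and xfs_trans_t. */
struct chunk_sketch {
	struct chunk_sketch	*next;
	int			nused;
	int			items[15];
};

struct trans_sketch {
	struct chunk_sketch	first;	/* embedded: lives inside the trans */
};

static void free_item_chunks(struct trans_sketch *tp)
{
	struct chunk_sketch *cp = tp->first.next;

	/* Reset, but never free, the embedded chunk... */
	tp->first.next = NULL;
	tp->first.nused = 0;

	/* ...then walk and free every overflow chunk. */
	while (cp != NULL) {
		struct chunk_sketch *next = cp->next;
		free(cp);
		cp = next;
	}
}

int main(void)
{
	struct trans_sketch tp = { { 0 } };

	tp.first.next = calloc(1, sizeof(struct chunk_sketch));
	free_item_chunks(&tp);
	return 0;
}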
1359
1360/*
1361 * This is called to perform the commit processing for each
1362 * item described by the given chunk.
1363 *
1364 * The commit processing consists of unlocking items which were
1365 * held locked with the SYNC_UNLOCK attribute, calling the committed
1366 * routine of each logged item, updating the item's position in the AIL
1367 * if necessary, and unpinning each item. If the committed routine
1368 * returns -1, then do nothing further with the item because it
1369 * may have been freed.
1370 *
1371 * Since items are unlocked when they are copied to the incore
1372 * log, it is possible for two transactions to be completing
1373 * and manipulating the same item simultaneously. The AIL lock
1374 * will protect the lsn field of each item. The value of this
1375 * field can never go backwards.
1376 *
1377 * We unpin the items after repositioning them in the AIL, because
1378 * otherwise they could be immediately flushed and we'd have to race
1379 * with the flusher trying to pull the item from the AIL as we add it.
1380 */
1381STATIC void
1382xfs_trans_chunk_committed(
1383 xfs_log_item_chunk_t *licp,
1384 xfs_lsn_t lsn,
1385 int aborted)
1386{
1387 xfs_log_item_desc_t *lidp;
1388 xfs_log_item_t *lip;
1389 xfs_lsn_t item_lsn;
1390 int i;
1391
1392 lidp = licp->lic_descs;
1393 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1394 struct xfs_ail *ailp;
1395
1396 if (xfs_lic_isfree(licp, i)) {
1397 continue;
1398 }
1399
1400 lip = lidp->lid_item;
1401 if (aborted)
1402 lip->li_flags |= XFS_LI_ABORTED;
1403
1404 /*
1405 * Send in the ABORTED flag to the COMMITTED routine
1406 * so that it knows whether the transaction was aborted
1407 * or not.
1408 */
1409 item_lsn = IOP_COMMITTED(lip, lsn);
1410
1411 /*
1412 * If the committed routine returns -1, make
1413 * no more references to the item.
1414 */
1415 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
1416 continue;
1417 }
1418
1419 /*
1420 * If the returned lsn is greater than what it
1421 * contained before, update the location of the
1422 * item in the AIL. If it is not, then do nothing.
1423 * Items can never move backwards in the AIL.
1424 *
1425 * While the new lsn should usually be greater, it
1426 * is possible that a later transaction completing
1427 * simultaneously with an earlier one using the
1428 * same item could complete first with a higher lsn.
1429 * This would cause the earlier transaction to fail
1430 * the test below.
1431 */
1432 ailp = lip->li_ailp;
1433 spin_lock(&ailp->xa_lock);
1434 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1435 /*
1436 * This will set the item's lsn to item_lsn
1437 * and update the position of the item in
1438 * the AIL.
1439 *
1440 * xfs_trans_ail_update() drops the AIL lock.
1441 */
1442 xfs_trans_ail_update(ailp, lip, item_lsn);
1443 } else {
1444 spin_unlock(&ailp->xa_lock);
1445 }
1446
1447 /*
1448 * Now that we've repositioned the item in the AIL,
1449 * unpin it so it can be flushed. Pass information
1450 * about buffer stale state down from the log item
1451 * flags; if anyone else stales the buffer we do not
1452 * want to pay any attention to it.
1453 */
1454 IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
1455 }
1456}
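The key invariant in the removed code above is that an item's LSN never moves backwards in the AIL, even when two completions race on the same item. A hedged sketch of that forward-only update under a lock (a real AIL would also re-sort the item):

#include <pthread.h>
#include <stdio.h>

typedef unsigned long long lsn_t;

struct item_sketch {
	lsn_t lsn;
};

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;

/* An item's LSN may only move forward: when two completions race on
 * the same item, the higher LSN wins regardless of finishing order. */
static void ail_update(struct item_sketch *ip, lsn_t item_lsn)
{
	pthread_mutex_lock(&ail_lock);
	if (item_lsn > ip->lsn)
		ip->lsn = item_lsn;	/* a real AIL would re-sort here */
	pthread_mutex_unlock(&ail_lock);
}

int main(void)
{
	struct item_sketch it = { 0 };

	ail_update(&it, 200);		/* later transaction completes first */
	ail_update(&it, 100);		/* earlier one loses the race... */
	printf("%llu\n", it.lsn);	/* ...and the LSN stays at 200 */
	return 0;
}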
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 79c8bab9dfff..8c69e7824f68 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,15 @@ typedef struct xfs_trans_header {
49 49 #define XFS_LI_DQUOT		0x123d
50 50 #define XFS_LI_QUOTAOFF	0x123e
51 51 
52#define XFS_LI_TYPE_DESC \
53 { XFS_LI_EFI, "XFS_LI_EFI" }, \
54 { XFS_LI_EFD, "XFS_LI_EFD" }, \
55 { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
56 { XFS_LI_INODE, "XFS_LI_INODE" }, \
57 { XFS_LI_BUF, "XFS_LI_BUF" }, \
58 { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
59 { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }
60
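XFS_LI_TYPE_DESC is a value/name pair list meant to be pasted into a symbolic decoder (trace formatting, for example). The same pair-list shape can also seed an ordinary lookup table; a userspace sketch restricted to the two values visible above:

#include <stdio.h>

/* Same shape as XFS_LI_TYPE_DESC, using only the values shown in the hunk. */
#define LI_TYPE_DESC \
	{ 0x123d, "XFS_LI_DQUOT" }, \
	{ 0x123e, "XFS_LI_QUOTAOFF" }

static const struct {
	int		val;
	const char	*name;
} li_names[] = { LI_TYPE_DESC };

static const char *li_type_name(int type)
{
	for (size_t i = 0; i < sizeof(li_names) / sizeof(li_names[0]); i++)
		if (li_names[i].val == type)
			return li_names[i].name;
	return "unknown";
}

int main(void)
{
	printf("%s\n", li_type_name(0x123d));	/* prints XFS_LI_DQUOT */
	return 0;
}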
52 61 /*
53 62  * Transaction types.  Used to distinguish types of buffers.
54 63  */
@@ -97,7 +106,8 @@ typedef struct xfs_trans_header {
97 106 #define XFS_TRANS_GROWFSRT_FREE		39
98 107 #define XFS_TRANS_SWAPEXT		40
99 108 #define XFS_TRANS_SB_COUNT		41
100 #define XFS_TRANS_TYPE_MAX		41
109 #define XFS_TRANS_CHECKPOINT		42
110 #define XFS_TRANS_TYPE_MAX		42
101 111 /* new transaction types need to be reflected in xfs_logprint(8) */
102 112 
103 113 #define XFS_TRANS_TYPES \
@@ -139,6 +149,7 @@ typedef struct xfs_trans_header {
139 149 	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
140 150 	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
141 151 	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
152 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
142 153 	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
143 154 	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
144 155 	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
@@ -159,7 +170,6 @@ typedef struct xfs_log_item_desc {
159 170 
160 171 #define XFS_LID_DIRTY		0x1
161 172 #define XFS_LID_PINNED		0x2
162#define XFS_LID_BUF_STALE 0x8
163 173 
164 174 /*
165 175  * This structure is used to maintain a chunk list of log_item_desc
@@ -805,6 +815,7 @@ struct xfs_log_item_desc;
805 815 struct xfs_mount;
806 816 struct xfs_trans;
807 817 struct xfs_dquot_acct;
818struct xfs_busy_extent;
808 819 
809 820 typedef struct xfs_log_item {
810 821 	struct list_head	li_ail;		/* AIL pointers */
@@ -820,6 +831,11 @@ typedef struct xfs_log_item {
820 831 						/* buffer item iodone */
821 832 						/* callback func */
822 833 	struct xfs_item_ops	*li_ops;	/* function list */
834
835 /* delayed logging */
836 struct list_head li_cil; /* CIL pointers */
837 struct xfs_log_vec *li_lv; /* active log vector */
838 xfs_lsn_t li_seq; /* CIL commit seq */
823 839 } xfs_log_item_t;
824 840 
825 841 #define XFS_LI_IN_AIL	0x1
@@ -833,7 +849,7 @@ typedef struct xfs_item_ops {
833 849 	uint (*iop_size)(xfs_log_item_t *);
834 850 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
835 851 	void (*iop_pin)(xfs_log_item_t *);
836 	void (*iop_unpin)(xfs_log_item_t *, int);
852 	void (*iop_unpin)(xfs_log_item_t *);
837 853 	void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
838 854 	uint (*iop_trylock)(xfs_log_item_t *);
839 855 	void (*iop_unlock)(xfs_log_item_t *);
@@ -846,7 +862,7 @@ typedef struct xfs_item_ops {
846 862 #define IOP_SIZE(ip)		(*(ip)->li_ops->iop_size)(ip)
847 863 #define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
848 864 #define IOP_PIN(ip)		(*(ip)->li_ops->iop_pin)(ip)
849 #define IOP_UNPIN(ip, flags)	(*(ip)->li_ops->iop_unpin)(ip, flags)
865 #define IOP_UNPIN(ip)		(*(ip)->li_ops->iop_unpin)(ip)
850 866 #define IOP_UNPIN_REMOVE(ip,tp)	(*(ip)->li_ops->iop_unpin_remove)(ip, tp)
851 867 #define IOP_TRYLOCK(ip)		(*(ip)->li_ops->iop_trylock)(ip)
852 868 #define IOP_UNLOCK(ip)		(*(ip)->li_ops->iop_unlock)(ip)
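The IOP_* wrappers dispatch through the per-item-type function list, and the new iop_unpin takes only the item now that the stale-buffer flag no longer rides along. A minimal sketch of the ops-vector-plus-macro technique, with hypothetical pin/unpin operations:

#include <stdio.h>

struct log_item;

struct item_ops {
	void (*iop_pin)(struct log_item *);
	void (*iop_unpin)(struct log_item *);
};

struct log_item {
	const struct item_ops	*li_ops;	/* per-type function list */
	int			pins;
};

#define IOP_PIN(ip)	((ip)->li_ops->iop_pin)(ip)
#define IOP_UNPIN(ip)	((ip)->li_ops->iop_unpin)(ip)

static void buf_pin(struct log_item *ip)   { ip->pins++; }
static void buf_unpin(struct log_item *ip) { ip->pins--; }

static const struct item_ops buf_item_ops = { buf_pin, buf_unpin };

int main(void)
{
	struct log_item bli = { &buf_item_ops, 0 };

	IOP_PIN(&bli);		/* dispatches to buf_pin() */
	IOP_UNPIN(&bli);	/* one argument, as in the new macro */
	printf("%d\n", bli.pins);
	return 0;
}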
@@ -864,34 +880,6 @@ typedef struct xfs_item_ops {
864 880 #define XFS_ITEM_PUSHBUF	3
865 881 
866 882 /*
867 * This structure is used to maintain a list of block ranges that have been
868 * freed in the transaction. The ranges are listed in the perag[] busy list
869 * between when they're freed and the transaction is committed to disk.
870 */
871
872typedef struct xfs_log_busy_slot {
873 xfs_agnumber_t lbc_ag;
874 ushort lbc_idx; /* index in perag.busy[] */
875} xfs_log_busy_slot_t;
876
877#define XFS_LBC_NUM_SLOTS 31
878typedef struct xfs_log_busy_chunk {
879 struct xfs_log_busy_chunk *lbc_next;
880 uint lbc_free; /* free slots bitmask */
881 ushort lbc_unused; /* first unused */
882 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
883} xfs_log_busy_chunk_t;
884
885#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
886#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
887
888#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
889#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
890#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
891#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
892#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
893
894 /*
895 883  * This is the type of function which can be given to xfs_trans_callback()
896 884  * to be called upon the transaction's commit to disk.
897 885  */
@@ -942,8 +930,7 @@ typedef struct xfs_trans {
942 930 	unsigned int		t_items_free;	/* log item descs free */
943 931 	xfs_log_item_chunk_t	t_items;	/* first log item desc chunk */
944 932 	xfs_trans_header_t	t_header;	/* header for in-log trans */
945 	unsigned int		t_busy_free;	/* busy descs free */
946 	xfs_log_busy_chunk_t	t_busy;		/* busy/async free blocks */
933 	struct list_head	t_busy;		/* list of busy extents */
947 934 	unsigned long		t_pflags;	/* saved process flags state */
948 935 } xfs_trans_t;
949 936 
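The hunk above swaps the fixed-size busy chunk machinery for a struct list_head, the kernel's intrusive doubly-linked list: each busy extent carries its own link and is threaded onto t_busy. A userspace sketch of the technique, with a simplified list_add() and a hypothetical busy-extent payload:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(h)	{ &(h), &(h) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Simplified list_add(): insert right after the head. */
static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

/* Hypothetical payload: each busy extent carries its own link. */
struct busy_extent {
	struct list_head	list;	/* threads onto trans->t_busy */
	unsigned int		agno, bno, len;
};

int main(void)
{
	struct list_head t_busy = LIST_HEAD_INIT(t_busy);
	struct busy_extent be = { .agno = 1, .bno = 100, .len = 8 };
	struct busy_extent *p;

	list_add(&be.list, &t_busy);
	p = container_of(t_busy.next, struct busy_extent, list);
	printf("ag %u bno %u len %u\n", p->agno, p->bno, p->len);
	return 0;
}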
@@ -1017,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *,
1017 1004 void		xfs_trans_cancel(xfs_trans_t *, int);
1018 1005 int		xfs_trans_ail_init(struct xfs_mount *);
1019 1006 void		xfs_trans_ail_destroy(struct xfs_mount *);
1020xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
1021 xfs_agnumber_t ag,
1022 xfs_extlen_t idx);
1023 1007 
1024 1008 extern kmem_zone_t	*xfs_trans_zone;
1025 1009 
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index fb586360d1c9..63d81a22f4fd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -40,11 +40,51 @@
40 40 #include "xfs_rw.h"
41 41 #include "xfs_trace.h"
42 42 
43/*
44 * Check to see if a buffer matching the given parameters is already
45 * a part of the given transaction.
46 */
47STATIC struct xfs_buf *
48xfs_trans_buf_item_match(
49 struct xfs_trans *tp,
50 struct xfs_buftarg *target,
51 xfs_daddr_t blkno,
52 int len)
53{
54 xfs_log_item_chunk_t *licp;
55 xfs_log_item_desc_t *lidp;
56 xfs_buf_log_item_t *blip;
57 int i;
58
59 len = BBTOB(len);
60 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
61 if (xfs_lic_are_all_free(licp)) {
62 ASSERT(licp == &tp->t_items);
63 ASSERT(licp->lic_next == NULL);
64 return NULL;
65 }
66
67 for (i = 0; i < licp->lic_unused; i++) {
68 /*
69 * Skip unoccupied slots.
70 */
71 if (xfs_lic_isfree(licp, i))
72 continue;
73
74 lidp = xfs_lic_slot(licp, i);
75 blip = (xfs_buf_log_item_t *)lidp->lid_item;
76 if (blip->bli_item.li_type != XFS_LI_BUF)
77 continue;
78
79 if (XFS_BUF_TARGET(blip->bli_buf) == target &&
80 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
81 XFS_BUF_COUNT(blip->bli_buf) == len)
82 return blip->bli_buf;
83 }
84 }
43 85 
86 	return NULL;
87 }
44 STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
45 		xfs_daddr_t, int);
46 STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
47 		xfs_daddr_t, int);
48 88 
49 89 /*
50 90  * Add the locked buffer to the transaction.
@@ -74,7 +114,7 @@ _xfs_trans_bjoin(
74 114 	xfs_buf_item_init(bp, tp->t_mountp);
75 115 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
76 116 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
77 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
117 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
78 118 	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
79 119 	if (reset_recur)
80 120 		bip->bli_recur = 0;
@@ -112,14 +152,6 @@ xfs_trans_bjoin(
112 152  * within the transaction, just increment its lock recursion count
113 153  * and return a pointer to it.
114 154  *
115 * Use the fast path function xfs_trans_buf_item_match() or the buffer
116 * cache routine incore_match() to find the buffer
117 * if it is already owned by this transaction.
118 *
119 * If we don't already own the buffer, use get_buf() to get it.
120 * If it doesn't yet have an associated xfs_buf_log_item structure,
121 * then allocate one and add the item to this transaction.
122 *
123 155  * If the transaction pointer is NULL, make this just a normal
124 156  * get_buf() call.
125 157  */
@@ -149,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
149 181 	 * have it locked.  In this case we just increment the lock
150 182 	 * recursion count and return the buffer to the caller.
151 183 	 */
152 	if (tp->t_items.lic_next == NULL) {
153 		bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
154 	} else {
155 		bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
156 	}
184 	bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
157 185 	if (bp != NULL) {
158 186 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
159 187 		if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
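As the comment above says, re-locking a buffer the transaction already owns just bumps a recursion count rather than taking the lock again. A hedged userspace sketch of that trick (the unlocked owner check is safe here only because a transaction is single-threaded, as in XFS):

#include <pthread.h>

struct buf_sketch {
	pthread_mutex_t	lock;
	void		*owner;	/* transaction holding the lock, or NULL */
	int		recur;	/* lock recursion count */
};

static void trans_lock_buf(struct buf_sketch *bp, void *tp)
{
	if (bp->owner == tp) {		/* already ours: recurse */
		bp->recur++;
		return;
	}
	pthread_mutex_lock(&bp->lock);
	bp->owner = tp;
	bp->recur = 0;
}

static void trans_unlock_buf(struct buf_sketch *bp, void *tp)
{
	(void)tp;			/* identity implied by ownership */
	if (bp->recur > 0) {		/* undo one recursion level */
		bp->recur--;
		return;
	}
	bp->owner = NULL;
	pthread_mutex_unlock(&bp->lock);
}

int main(void)
{
	struct buf_sketch b = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
	int tp;				/* stand-in transaction identity */

	trans_lock_buf(&b, &tp);
	trans_lock_buf(&b, &tp);	/* recursive: no deadlock */
	trans_unlock_buf(&b, &tp);
	trans_unlock_buf(&b, &tp);
	return 0;
}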
@@ -259,14 +287,6 @@ int xfs_error_mod = 33;
259 287  * within the transaction and already read in, just increment its
260 288  * lock recursion count and return a pointer to it.
261 289  *
262 * Use the fast path function xfs_trans_buf_item_match() or the buffer
263 * cache routine incore_match() to find the buffer
264 * if it is already owned by this transaction.
265 *
266 * If we don't already own the buffer, use read_buf() to get it.
267 * If it doesn't yet have an associated xfs_buf_log_item structure,
268 * then allocate one and add the item to this transaction.
269 *
270 290  * If the transaction pointer is NULL, make this just a normal
271 291  * read_buf() call.
272 292  */
@@ -328,11 +348,7 @@ xfs_trans_read_buf(
328 348 	 * If the buffer is not yet read in, then we read it in, increment
329 349 	 * the lock recursion count, and return it to the caller.
330 350 	 */
331 	if (tp->t_items.lic_next == NULL) {
332 		bp = xfs_trans_buf_item_match(tp, target, blkno, len);
333 	} else {
334 		bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
335 	}
351 	bp = xfs_trans_buf_item_match(tp, target, blkno, len);
336 352 	if (bp != NULL) {
337 353 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
338 354 		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -495,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
495 511 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
496 512 	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
497 513 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
498 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
514 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
499 515 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
500 516 
501 517 	/*
@@ -603,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
603 619 
604 620 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
605 621 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
606 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
622 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
607 623 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
608 624 	bip->bli_flags |= XFS_BLI_HOLD;
609 625 	trace_xfs_trans_bhold(bip);
@@ -625,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
625 641 
626 642 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
627 643 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
628 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
644 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
629 645 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
630 646 	ASSERT(bip->bli_flags & XFS_BLI_HOLD);
631 647 	bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -688,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
688 704 		bip->bli_flags &= ~XFS_BLI_STALE;
689 705 		ASSERT(XFS_BUF_ISSTALE(bp));
690 706 		XFS_BUF_UNSTALE(bp);
691 		bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
707 		bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
692 708 	}
693 709 
694 710 	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -696,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
696 712 
697 713 	tp->t_flags |= XFS_TRANS_DIRTY;
698 714 	lidp->lid_flags |= XFS_LID_DIRTY;
699 	lidp->lid_flags &= ~XFS_LID_BUF_STALE;
700 715 	bip->bli_flags |= XFS_BLI_LOGGED;
701 716 	xfs_buf_item_log(bip, first, last);
702 717 }
@@ -747,8 +762,8 @@ xfs_trans_binval(
747 762 		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
748 763 		ASSERT(XFS_BUF_ISSTALE(bp));
749 764 		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
750 		ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
751 		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
765 		ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
766 		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
752 767 		ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
753 768 		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
754 769 		return;
@@ -759,7 +774,7 @@ xfs_trans_binval(
759 774 	 * in the buf log item.  The STALE flag will be used in
760 775 	 * xfs_buf_item_unpin() to determine if it should clean up
761 776 	 * when the last reference to the buf item is given up.
762 	 * We set the XFS_BLI_CANCEL flag in the buf log format structure
777 	 * We set the XFS_BLF_CANCEL flag in the buf log format structure
763 778 	 * and log the buf item.  This will be used at recovery time
764 779 	 * to determine that copies of the buffer in the log before
765 780 	 * this should not be replayed.
@@ -777,26 +792,26 @@ xfs_trans_binval(
777 792 	XFS_BUF_UNDELAYWRITE(bp);
778 793 	XFS_BUF_STALE(bp);
779 794 	bip->bli_flags |= XFS_BLI_STALE;
780 	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
781 	bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
782 	bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
795 	bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
796 	bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
797 	bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
783 798 	memset((char *)(bip->bli_format.blf_data_map), 0,
784 799 	       (bip->bli_format.blf_map_size * sizeof(uint)));
785 	lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
800 	lidp->lid_flags |= XFS_LID_DIRTY;
786 801 	tp->t_flags |= XFS_TRANS_DIRTY;
787 802 }
788 803 
789 804 /*
790  * This call is used to indicate that the buffer contains on-disk
791  * inodes which must be handled specially during recovery.  They
792  * require special handling because only the di_next_unlinked from
793  * the inodes in the buffer should be recovered.  The rest of the
794  * data in the buffer is logged via the inodes themselves.
805  * This call is used to indicate that the buffer contains on-disk inodes which
806  * must be handled specially during recovery.  They require special handling
807  * because only the di_next_unlinked from the inodes in the buffer should be
808  * recovered.  The rest of the data in the buffer is logged via the inodes
809  * themselves.
795 810  *
796  * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
797  * format structure so that we'll know what to do at recovery time.
811  * All we do is set the XFS_BLI_INODE_BUF flag in the item's flags so it can be
812  * transferred to the buffer's log format structure so that we'll know what to
813  * do at recovery time.
798 814  */
799 /* ARGSUSED */
800 815 void
801 816 xfs_trans_inode_buf(
802 817 	xfs_trans_t	*tp,
@@ -811,7 +826,7 @@ xfs_trans_inode_buf(
811 826 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
812 827 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
813 828 
814 	bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
829 	bip->bli_flags |= XFS_BLI_INODE_BUF;
815 830 }
816 831 
817 832 /*
@@ -893,120 +908,12 @@ xfs_trans_dquot_buf(
893 908 	ASSERT(XFS_BUF_ISBUSY(bp));
894 909 	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
895 910 	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
896 	ASSERT(type == XFS_BLI_UDQUOT_BUF ||
897 	       type == XFS_BLI_PDQUOT_BUF ||
898 	       type == XFS_BLI_GDQUOT_BUF);
911 	ASSERT(type == XFS_BLF_UDQUOT_BUF ||
912 	       type == XFS_BLF_PDQUOT_BUF ||
913 	       type == XFS_BLF_GDQUOT_BUF);
899 914 
900 915 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
901 916 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
902 917 
903 918 	bip->bli_format.blf_flags |= type;
904 919 }
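xfs_trans_dquot_buf() asserts that the caller passed exactly one of the three dquot buffer types before OR-ing it into the format flags. A tiny sketch of that validate-then-set pattern; the flag values here are illustrative, not the real XFS_BLF_* definitions:

#include <assert.h>

/* Illustrative values only; the real flags live in the XFS headers. */
#define BLF_UDQUOT_BUF	0x4
#define BLF_PDQUOT_BUF	0x8
#define BLF_GDQUOT_BUF	0x10

struct buf_log_format_sketch {
	unsigned int	blf_flags;
};

static void mark_dquot_buf(struct buf_log_format_sketch *f, unsigned int type)
{
	/* Exactly one known type may be passed in... */
	assert(type == BLF_UDQUOT_BUF || type == BLF_PDQUOT_BUF ||
	       type == BLF_GDQUOT_BUF);
	/* ...and it is simply OR-ed into the format flags. */
	f->blf_flags |= type;
}

int main(void)
{
	struct buf_log_format_sketch f = { 0 };

	mark_dquot_buf(&f, BLF_GDQUOT_BUF);
	return f.blf_flags == BLF_GDQUOT_BUF ? 0 : 1;
}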
905
906/*
907 * Check to see if a buffer matching the given parameters is already
908 * a part of the given transaction. Only check the first, embedded
909 * chunk, since we don't want to spend all day scanning large transactions.
910 */
911STATIC xfs_buf_t *
912xfs_trans_buf_item_match(
913 xfs_trans_t *tp,
914 xfs_buftarg_t *target,
915 xfs_daddr_t blkno,
916 int len)
917{
918 xfs_log_item_chunk_t *licp;
919 xfs_log_item_desc_t *lidp;
920 xfs_buf_log_item_t *blip;
921 xfs_buf_t *bp;
922 int i;
923
924 bp = NULL;
925 len = BBTOB(len);
926 licp = &tp->t_items;
927 if (!xfs_lic_are_all_free(licp)) {
928 for (i = 0; i < licp->lic_unused; i++) {
929 /*
930 * Skip unoccupied slots.
931 */
932 if (xfs_lic_isfree(licp, i)) {
933 continue;
934 }
935
936 lidp = xfs_lic_slot(licp, i);
937 blip = (xfs_buf_log_item_t *)lidp->lid_item;
938 if (blip->bli_item.li_type != XFS_LI_BUF) {
939 continue;
940 }
941
942 bp = blip->bli_buf;
943 if ((XFS_BUF_TARGET(bp) == target) &&
944 (XFS_BUF_ADDR(bp) == blkno) &&
945 (XFS_BUF_COUNT(bp) == len)) {
946 /*
947 * We found it. Break out and
948 * return the pointer to the buffer.
949 */
950 break;
951 } else {
952 bp = NULL;
953 }
954 }
955 }
956 return bp;
957}
958
959/*
960 * Check to see if a buffer matching the given parameters is already
961 * a part of the given transaction. Check all the chunks, we
962 * want to be thorough.
963 */
964STATIC xfs_buf_t *
965xfs_trans_buf_item_match_all(
966 xfs_trans_t *tp,
967 xfs_buftarg_t *target,
968 xfs_daddr_t blkno,
969 int len)
970{
971 xfs_log_item_chunk_t *licp;
972 xfs_log_item_desc_t *lidp;
973 xfs_buf_log_item_t *blip;
974 xfs_buf_t *bp;
975 int i;
976
977 bp = NULL;
978 len = BBTOB(len);
979 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
980 if (xfs_lic_are_all_free(licp)) {
981 ASSERT(licp == &tp->t_items);
982 ASSERT(licp->lic_next == NULL);
983 return NULL;
984 }
985 for (i = 0; i < licp->lic_unused; i++) {
986 /*
987 * Skip unoccupied slots.
988 */
989 if (xfs_lic_isfree(licp, i)) {
990 continue;
991 }
992
993 lidp = xfs_lic_slot(licp, i);
994 blip = (xfs_buf_log_item_t *)lidp->lid_item;
995 if (blip->bli_item.li_type != XFS_LI_BUF) {
996 continue;
997 }
998
999 bp = blip->bli_buf;
1000 if ((XFS_BUF_TARGET(bp) == target) &&
1001 (XFS_BUF_ADDR(bp) == blkno) &&
1002 (XFS_BUF_COUNT(bp) == len)) {
1003 /*
1004 * We found it. Break out and
1005 * return the pointer to the buffer.
1006 */
1007 return bp;
1008 }
1009 }
1010 }
1011 return NULL;
1012}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9eef..f11d37d06dcc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
299 299 void
300 300 xfs_trans_free_items(
301 301 	xfs_trans_t	*tp,
302 xfs_lsn_t commit_lsn,
302 303 	int		flags)
303 304 {
304 305 	xfs_log_item_chunk_t	*licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
311 312 	 * Special case the embedded chunk so we don't free it below.
312 313 	 */
313 314 	if (!xfs_lic_are_all_free(licp)) {
314 		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
315 		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
315 316 		xfs_lic_all_free(licp);
316 317 		licp->lic_unused = 0;
317 318 	}
@@ -322,7 +323,7 @@ xfs_trans_free_items(
322 323 	 */
323 324 	while (licp != NULL) {
324 325 		ASSERT(!xfs_lic_are_all_free(licp));
325 		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
326 		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
326 327 		next_licp = licp->lic_next;
327 328 		kmem_free(licp);
328 329 		licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
438 439 
439 440 	return freed;
440 441 }
441
442
443/*
444 * This is called to add the given busy item to the transaction's
445 * list of busy items. It must find a free busy item descriptor
446 * or allocate a new one and add the item to that descriptor.
447 * The function returns a pointer to the busy descriptor used for
448 * the new busy entry. The log busy entry will now point to its new
449 * descriptor with its ???? field.
450 */
451xfs_log_busy_slot_t *
452xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
453{
454 xfs_log_busy_chunk_t *lbcp;
455 xfs_log_busy_slot_t *lbsp;
456 int i=0;
457
458 /*
459 * If there are no free descriptors, allocate a new chunk
460 * of them and put it at the front of the chunk list.
461 */
462 if (tp->t_busy_free == 0) {
463 lbcp = (xfs_log_busy_chunk_t*)
464 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
465 ASSERT(lbcp != NULL);
466 /*
467 * Initialize the chunk, and then
468 * claim the first slot in the newly allocated chunk.
469 */
470 XFS_LBC_INIT(lbcp);
471 XFS_LBC_CLAIM(lbcp, 0);
472 lbcp->lbc_unused = 1;
473 lbsp = XFS_LBC_SLOT(lbcp, 0);
474
475 /*
476 * Link in the new chunk and update the free count.
477 */
478 lbcp->lbc_next = tp->t_busy.lbc_next;
479 tp->t_busy.lbc_next = lbcp;
480 tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
481
482 /*
483 * Initialize the descriptor and the generic portion
484 * of the log item.
485 *
486 * Point the new slot at this item and return it.
487 * Also point the log item at its currently active
488 * descriptor and set the item's mount pointer.
489 */
490 lbsp->lbc_ag = ag;
491 lbsp->lbc_idx = idx;
492 return lbsp;
493 }
494
495 /*
496 * Find the free descriptor. It is somewhere in the chunklist
497 * of descriptors.
498 */
499 lbcp = &tp->t_busy;
500 while (lbcp != NULL) {
501 if (XFS_LBC_VACANCY(lbcp)) {
502 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
503 i = lbcp->lbc_unused;
504 break;
505 } else {
506 /* out-of-order vacancy */
507 cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
508 ASSERT(0);
509 }
510 }
511 lbcp = lbcp->lbc_next;
512 }
513 ASSERT(lbcp != NULL);
514 /*
515 * If we find a free descriptor, claim it,
516 * initialize it, and return it.
517 */
518 XFS_LBC_CLAIM(lbcp, i);
519 if (lbcp->lbc_unused <= i) {
520 lbcp->lbc_unused = i + 1;
521 }
522 lbsp = XFS_LBC_SLOT(lbcp, i);
523 tp->t_busy_free--;
524 lbsp->lbc_ag = ag;
525 lbsp->lbc_idx = idx;
526 return lbsp;
527}
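The removed allocator above keeps a bitmask of free slots per chunk; when every slot is claimed it links a fresh chunk at the front of the list and takes slot 0 from it. A self-contained sketch of that chunked slot allocator, with simplified bookkeeping:

#include <stdlib.h>

#define NSLOTS		31
#define FREEMASK	((1U << NSLOTS) - 1)

struct busy_chunk {
	struct busy_chunk	*next;
	unsigned int		free;		/* bit set == slot free */
	int			slot[NSLOTS];
};

struct trans_sketch {
	struct busy_chunk	first;		/* embedded in the trans */
	int			nfree;		/* free slots, all chunks */
};

static void trans_init(struct trans_sketch *tp)
{
	tp->first.next = NULL;
	tp->first.free = FREEMASK;
	tp->nfree = NSLOTS;
}

static int *claim_slot(struct trans_sketch *tp)
{
	struct busy_chunk *cp;

	if (tp->nfree == 0) {
		/* No vacancies anywhere: link a new chunk at the front. */
		cp = calloc(1, sizeof(*cp));
		if (!cp)
			return NULL;
		cp->free = FREEMASK & ~1U;	/* claim slot 0 immediately */
		cp->next = tp->first.next;
		tp->first.next = cp;
		tp->nfree = NSLOTS - 1;
		return &cp->slot[0];
	}
	/* Otherwise scan the chunk list for the first free bit. */
	for (cp = &tp->first; cp != NULL; cp = cp->next) {
		for (int i = 0; i < NSLOTS; i++) {
			if (cp->free & (1U << i)) {
				cp->free &= ~(1U << i);
				tp->nfree--;
				return &cp->slot[i];
			}
		}
	}
	return NULL;		/* unreachable while nfree is accurate */
}

int main(void)
{
	struct trans_sketch tp;

	trans_init(&tp);
	for (int n = 0; n < 40; n++) {	/* 40 claims spill into a 2nd chunk */
		int *s = claim_slot(&tp);
		if (s)
			*s = n;
	}
	return 0;
}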
528
529
530/*
531 * xfs_trans_free_busy
532 * Free all of the busy lists from a transaction
533 */
534void
535xfs_trans_free_busy(xfs_trans_t *tp)
536{
537 xfs_log_busy_chunk_t *lbcp;
538 xfs_log_busy_chunk_t *lbcq;
539
540 lbcp = tp->t_busy.lbc_next;
541 while (lbcp != NULL) {
542 lbcq = lbcp->lbc_next;
543 kmem_free(lbcp);
544 lbcp = lbcq;
545 }
546
547 XFS_LBC_INIT(&tp->t_busy);
548 tp->t_busy.lbc_unused = 0;
549}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..c6e4f2c8de6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
35 35 struct xfs_log_item_desc	*xfs_trans_first_item(struct xfs_trans *);
36 36 struct xfs_log_item_desc	*xfs_trans_next_item(struct xfs_trans *,
37 37 					     struct xfs_log_item_desc *);
38 void	xfs_trans_free_items(struct xfs_trans *, int);
39 void	xfs_trans_unlock_items(struct xfs_trans *,
40 					xfs_lsn_t);
41 void	xfs_trans_free_busy(xfs_trans_t *tp);
42 xfs_log_busy_slot_t	*xfs_trans_add_busy(xfs_trans_t *tp,
43 					xfs_agnumber_t ag,
44 					xfs_extlen_t idx);
38 
39 void	xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
40 void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
41 				int flags);
42 
43 void	xfs_trans_item_committed(struct xfs_log_item *lip,
44 				xfs_lsn_t commit_lsn, int aborted);
45 void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
45 46 
46 47 /*
47 48  * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d07..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75 
76 76 typedef __uint16_t	xfs_prid_t;	/* prid_t truncated to 16bits in XFS */
77 77 
78typedef __uint32_t xlog_tid_t; /* transaction ID type */
79
78 80 /*
79 81  * These types are 64 bits on disk but are either 32 or 64 bits in memory.
80 82  * Disk based types:
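The new xlog_tid_t gives the log's transaction ID an explicit 32-bit width, in line with the comment above about types with fixed on-disk sizes. A sketch of the same fixed-width typedef convention using <stdint.h> stand-ins for the kernel's __uintNN_t types:

#include <assert.h>
#include <stdint.h>

/* On-disk formats need exact sizes regardless of the host ABI. */
typedef uint32_t xlog_tid_sketch_t;	/* transaction ID: 32 bits on disk */
typedef uint64_t xfs_lsn_sketch_t;	/* LSNs: 64 bits on disk */

int main(void)
{
	/* The whole point of the typedefs: sizes are fixed by contract. */
	assert(sizeof(xlog_tid_sketch_t) == 4);
	assert(sizeof(xfs_lsn_sketch_t) == 8);
	return 0;
}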